diff options
Diffstat (limited to 'zen/string_tools.h')
-rw-r--r-- | zen/string_tools.h | 569 |
1 files changed, 569 insertions, 0 deletions
diff --git a/zen/string_tools.h b/zen/string_tools.h new file mode 100644 index 00000000..8cafad07 --- /dev/null +++ b/zen/string_tools.h @@ -0,0 +1,569 @@ +// ************************************************************************** +// * This file is part of the zenXML project. It is distributed under the * +// * Boost Software License, Version 1.0. See accompanying file * +// * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt. * +// * Copyright (C) 2011 ZenJu (zhnmju123 AT gmx.de) * +// ************************************************************************** + +#ifndef STRING_TOOLS_HEADER_213458973046 +#define STRING_TOOLS_HEADER_213458973046 + +#include <cctype> //isspace +#include <cwctype> //iswspace +#include <cstdio> //sprintf +#include <cwchar> //swprintf +#include <algorithm> +#include <cassert> +#include <sstream> +#include <vector> +#include "string_traits.h" +#include "type_traits.h" + + +//enhance arbitray string class with useful non-member functions: +namespace zen +{ +template <class C> bool cStringIsWhiteSpace(C ch); +template <class C> bool cStringIsDigit(C ch); + +template <class S, class T> bool startsWith(const S& str, const T& prefix); //both S and T can be strings or char/wchar_t arrays or simple char/wchar_t +template <class S, class T> bool endsWith (const S& str, const T& postfix); // + +template <class S, class T> S afterLast (const S& str, const T& ch); //returns the whole string if ch not found +template <class S, class T> S beforeLast (const S& str, const T& ch); //returns empty string if ch not found +template <class S, class T> S afterFirst (const S& str, const T& ch); //returns empty string if ch not found +template <class S, class T> S beforeFirst(const S& str, const T& ch); //returns the whole string if ch not found + +template <class S, class T> std::vector<S> split(const S& str, const T& delimiter); +template <class S> void truncate(S& str, size_t newLen); +template <class S> void trim(S& str, bool fromLeft = true, bool fromRight = true); +template <class S, class T, class U> void replace ( S& str, const T& oldOne, const U& newOne, bool replaceAll = true); +template <class S, class T, class U> S replaceCpy(const S& str, const T& oldOne, const U& newOne, bool replaceAll = true); + +//high-performance conversion from numbers to strings +template <class S, class T, class Num> S printNumber(const T& format, const Num& number); //format a single number using ::sprintf + +template <class S, class Num> S toString(const Num& number); +template <class Num, class S > Num toNumber(const S& str); + +//string to string conversion: converst string-like type into char-compatible target string class +template <class T, class S> T cvrtString(const S& str); + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +//---------------------- implementation ---------------------- +template <> inline +bool cStringIsWhiteSpace(char ch) +{ + //caveat 1: std::isspace() takes an int, but expects an unsigned char + //caveat 2: some parts of UTF-8 chars are erroneously seen as whitespace, e.g. the a0 from "\xec\x8b\xa0" (MSVC) + return static_cast<unsigned char>(ch) < 128 && + std::isspace(static_cast<unsigned char>(ch)) != 0; +} + +//template <> inline bool cStringIsWhiteSpace(unsigned char ch) { return cStringIsWhiteSpace<char>(ch); } -> not character types! +//template <> inline bool cStringIsWhiteSpace(signed char ch) { return cStringIsWhiteSpace<char>(ch); } +template <> inline bool cStringIsWhiteSpace(wchar_t ch) { return std::iswspace(ch) != 0; } + +template <> inline +bool cStringIsDigit(char ch) +{ + return std::isdigit(static_cast<unsigned char>(ch)) != 0; //caveat: takes an int, but expects an unsigned char +} + + +template <> inline +bool cStringIsDigit(wchar_t ch) +{ + return std::iswdigit(ch) != 0; +} + + +template <class S, class T> inline +bool startsWith(const S& str, const T& prefix) +{ + assert_static(StringTraits<S>::isStringLike); + assert_static(StringTraits<T>::isStringLike); + + const size_t pfLength = strLength(prefix); + if (strLength(str) < pfLength) + return false; + + return std::equal(strBegin(str), strBegin(str) + pfLength, + strBegin(prefix)); +} + + +template <class S, class T> inline +bool endsWith(const S& str, const T& postfix) +{ + assert_static(StringTraits<S>::isStringLike); + assert_static(StringTraits<T>::isStringLike); + + size_t strLen = strLength(str); + size_t pfLen = strLength(postfix); + if (strLen < pfLen) + return false; + + typedef typename StringTraits<S>::CharType CharType; + + const CharType* cmpBegin = strBegin(str) + strLen - pfLen; + return std::equal(cmpBegin, cmpBegin + pfLen, + strBegin(postfix)); +} + + +//returns the whole string if ch not found +template <class S, class T> inline +S afterLast(const S& str, const T& ch) +{ + assert_static(StringTraits<T>::isStringLike); + + const size_t pos = str.rfind(ch); + if (pos != S::npos) + { + size_t chLen = strLength(ch); + return S(str.c_str() + pos + chLen, str.length() - pos - chLen); + } + else + return str; +} + + +//returns empty string if ch not found +template <class S, class T> inline +S beforeLast(const S& str, const T& ch) +{ + assert_static(StringTraits<T>::isStringLike); + + const size_t pos = str.rfind(ch); + if (pos != S::npos) + return S(str.c_str(), pos); //data is non-empty string in this context: else ch would not have been found! + else + return S(); +} + + +//returns empty string if ch not found +template <class S, class T> inline +S afterFirst(const S& str, const T& ch) +{ + assert_static(StringTraits<T>::isStringLike); + + const size_t pos = str.find(ch); + if (pos != S::npos) + { + size_t chLen = strLength(ch); + return S(str.c_str() + pos + chLen, str.length() - pos - chLen); + } + else + return S(); + +} + + +//returns the whole string if ch not found +template <class S, class T> inline +S beforeFirst(const S& str, const T& ch) +{ + assert_static(StringTraits<T>::isStringLike); + + const size_t pos = str.find(ch); + if (pos != S::npos) + return S(str.c_str(), pos); //data is non-empty string in this context: else ch would not have been found! + else + return str; +} + + +template <class S, class T> inline +std::vector<S> split(const S& str, const T& delimiter) +{ + assert_static(StringTraits<T>::isStringLike); + + std::vector<S> output; + size_t bockStart = 0; + size_t delimLen = strLength(delimiter); + if (delimLen != 0) + { + for (size_t blockEnd = str.find(delimiter, bockStart); + blockEnd != S::npos; + bockStart = blockEnd + delimLen, blockEnd = str.find(delimiter, bockStart)) + { + output.push_back(S(str.c_str() + bockStart, blockEnd - bockStart)); + } + } + output.push_back(S(str.c_str() + bockStart, str.length() - bockStart)); + return output; +} + + +template <class S> inline +void truncate(S& str, size_t newLen) +{ + if (newLen < str.length()) + str.resize(newLen); +} + + +template <class S, class T, class U> inline +S replaceCpy(const S& str, const T& oldOne, const U& newOne, bool replaceAll) +{ + assert_static(StringTraits<T>::isStringLike); + assert_static(StringTraits<U>::isStringLike); + + typedef typename StringTraits<S>::CharType CharType; + + const size_t oldLen = strLength(oldOne); + const size_t newLen = strLength(newOne); + + S output; + + const CharType* strPos = strBegin(str); + const CharType* strEnd = strPos + strLength(str); + + for (;;) + { + const CharType* ptr = std::search(strPos, strEnd, + strBegin(oldOne), strBegin(oldOne) + oldLen); + if (ptr == strEnd) + break; + + output += S(strPos, ptr - strPos); + output += S(strBegin(newOne), newLen); + + strPos = ptr + oldLen; + + if (!replaceAll) + break; + } + output += S(strPos, strEnd - strPos); + + return output; +} + + +template <class S, class T, class U> inline +void replace(S& str, const T& oldOne, const U& newOne, bool replaceAll) +{ + str = replaceCpy(str, oldOne, newOne, replaceAll); +} + + +template <class S> inline +void trim(S& str, bool fromLeft, bool fromRight) +{ + assert(fromLeft || fromRight); + + typedef typename S::value_type CharType; + + const CharType* newBegin = str.c_str(); + const CharType* newEnd = str.c_str() + str.length(); + + if (fromRight) + while (newBegin != newEnd && cStringIsWhiteSpace(newEnd[-1])) + --newEnd; + + if (fromLeft) + while (newBegin != newEnd && cStringIsWhiteSpace(*newBegin)) + ++newBegin; + + const size_t newLength = newEnd - newBegin; + if (newLength != str.length()) + { + if (newBegin != str.c_str()) + str = S(newBegin, newLength); //minor inefficiency: in case "str" is not shared, we could save an allocation and do a memory move only + else + str.resize(newLength); + } +} + + +namespace implementation +{ +template <class S, class T> +struct CnvtStringToString +{ + T convert(const S& src) const { return T(strBegin(src), strLength(src)); } +}; + +template <class S> +struct CnvtStringToString<S, S> //perf: we don't need a deep copy if string types match +{ + const S& convert(const S& src) const { return src; } +}; +} + +template <class T, class S> inline +T cvrtString(const S& str) { return implementation::CnvtStringToString<S, T>().convert(str); } + + +namespace implementation +{ +template <class Num> inline +int saferPrintf(char* buffer, size_t bufferSize, const char* format, const Num& number) //there is no such thing as a "safe" printf ;) +{ +#ifdef _MSC_VER + return ::_snprintf(buffer, bufferSize, format, number); //VS2010 doesn't respect ISO C +#else + return std::snprintf(buffer, bufferSize, format, number); //C99 +#endif +} + +template <class Num> inline +int saferPrintf(wchar_t* buffer, size_t bufferSize, const wchar_t* format, const Num& number) +{ +#ifdef __MINGW32__ //MinGW doesn't respect ISO C + return ::snwprintf(buffer, bufferSize, format, number); +#else + return std::swprintf(buffer, bufferSize, format, number); //C99 +#endif +} +} + +template <class S, class T, class Num> inline +S printNumber(const T& format, const Num& number) //format a single number using ::sprintf +{ + assert_static(StringTraits<T>::isStringLike); + assert_static((IsSameType< + typename StringTraits<S>::CharType, + typename StringTraits<T>::CharType>::result)); + + typedef typename StringTraits<S>::CharType CharType; + + const int BUFFER_SIZE = 128; + CharType buffer[BUFFER_SIZE]; + const int charsWritten = implementation::saferPrintf(buffer, BUFFER_SIZE, format, number); + + return charsWritten > 0 ? S(buffer, charsWritten) : S(); +} + + +namespace implementation +{ +enum NumberType +{ + NUM_TYPE_SIGNED_INT, + NUM_TYPE_UNSIGNED_INT, + NUM_TYPE_FLOATING_POINT, + NUM_TYPE_OTHER, +}; + + +template <class S, class Num, NumberType> +struct CvrtNumberToString +{ + S convert(const Num& number) const //default number to string conversion using streams: convenient, but SLOW, SLOW, SLOW!!!! (~ factor of 20) + { + typedef typename StringTraits<S>::CharType CharType; + + std::basic_ostringstream<CharType> ss; + ss << number; + return cvrtString<S>(ss.str()); + } +}; + + +template <class S, class Num> +struct CvrtNumberToString<S, Num, NUM_TYPE_FLOATING_POINT> +{ + S convert(const Num& number) const { return convertFloat(number, typename StringTraits<S>::CharType()); } + +private: + S convertFloat(const Num& number, char ) const { return printNumber<S>( "%g", static_cast<double>(number)); } + S convertFloat(const Num& number, wchar_t) const { return printNumber<S>(L"%g", static_cast<double>(number)); } +}; + +/* +perf: integer to string: (executed 10 mio. times) + std::stringstream - 14796 ms + std::sprintf - 3086 ms + formatInteger - 778 ms +*/ + +template <class S, class Num> inline +S formatInteger(Num n, bool hasMinus) +{ + assert(n >= 0); + S output; + do + { + output += '0' + n % 10; + n /= 10; + } + while (n != 0); + if (hasMinus) + output += '-'; + + std::reverse(output.begin(), output.end()); + return output; +} + +template <class S, class Num> +struct CvrtNumberToString<S, Num, NUM_TYPE_SIGNED_INT> +{ + S convert(const Num& number) const { return formatInteger<S>(number < 0 ? -number : number, number < 0); } +}; + +template <class S, class Num> +struct CvrtNumberToString<S, Num, NUM_TYPE_UNSIGNED_INT> +{ + S convert(const Num& number) const { return formatInteger<S>(number, false); } +}; + +//-------------------------------------------------------------------------------- + +template <class S, class Num, NumberType> +struct CvrtStringToNumber +{ + Num convert(const S& str) const //default string to number conversion using streams: convenient, but SLOW + { + typedef typename StringTraits<S>::CharType CharType; + Num number = 0; + std::basic_istringstream<CharType>(cvrtString<std::basic_string<CharType> >(str)) >> number; + return number; + } +}; + + +template <class S, class Num> +struct CvrtStringToNumber<S, Num, NUM_TYPE_FLOATING_POINT> +{ + Num convert(const S& str) const { return convertFloat(strBegin(str)); } + +private: + Num convertFloat(const char* str) const { return std::strtod(str, NULL); } + Num convertFloat(const wchar_t* str) const { return std::wcstod(str, NULL); } +}; + +template <class Num, class S> +Num extractInteger(const S& str, bool& hasMinusSign) //very fast conversion to integers: slightly faster than std::atoi, but more importantly: generic +{ + typedef typename StringTraits<S>::CharType CharType; + + const CharType* first = strBegin(str); + const CharType* last = first + strLength(str); + + while (first != last && cStringIsWhiteSpace(*first)) //skip leading whitespace + ++first; + + hasMinusSign = false; //handle minus sign + if (first != last) + { + if (*first == '-') + { + hasMinusSign = true; + ++first; + } + else if (*first == '+') + ++first; + } + + Num number = 0; + for (const CharType* iter = first; iter != last; ++iter) + { + const CharType c = *iter; + if ('0' <= c && c <= '9') + { + number *= 10; + number += c - '0'; + } + else + { + assert(std::find_if(iter, last, std::not1(std::ptr_fun(&cStringIsWhiteSpace<CharType>))) == last); //rest of string should contain whitespace only + break; + } + } + return number; +} + + +template <class S, class Num> +struct CvrtStringToNumber<S, Num, NUM_TYPE_SIGNED_INT> +{ + Num convert(const S& str) const + { + bool hasMinusSign = false; //handle minus sign + const Num number = extractInteger<Num>(str, hasMinusSign); + return hasMinusSign ? -number : number; + } +}; + + +template <class S, class Num> +struct CvrtStringToNumber<S, Num, NUM_TYPE_UNSIGNED_INT> +{ + Num convert(const S& str) const //very fast conversion to integers: slightly faster than std::atoi, but more importantly: generic + { + bool hasMinusSign = false; //handle minus sign + const Num number = extractInteger<Num>(str, hasMinusSign); + if (hasMinusSign) + { + assert(false); + return 0U; + } + return number; + } +}; +} + + +template <class S, class Num> +inline +S toString(const Num& number) //convert number to string the C++ way +{ + using namespace implementation; + return CvrtNumberToString<S, Num, + IsSignedInt <Num>::result ? NUM_TYPE_SIGNED_INT : + IsUnsignedInt<Num>::result ? NUM_TYPE_UNSIGNED_INT : + IsFloat <Num>::result ? NUM_TYPE_FLOATING_POINT : + NUM_TYPE_OTHER + >().convert(number); +} + + +template <class Num, class S> +inline +Num toNumber(const S& str) //convert string to number the C++ way +{ + using namespace implementation; + return CvrtStringToNumber<S, Num, + IsSignedInt <Num>::result ? NUM_TYPE_SIGNED_INT : + IsUnsignedInt<Num>::result ? NUM_TYPE_UNSIGNED_INT : + IsFloat <Num>::result ? NUM_TYPE_FLOATING_POINT : + NUM_TYPE_OTHER + >().convert(str); +} + +} + +#endif //STRING_TOOLS_HEADER_213458973046 |