// ***************************************************************************** // * This file is part of the FreeFileSync project. It is distributed under * // * GNU General Public License: http://www.gnu.org/licenses/gpl-3.0 * // * Copyright (C) Zenju (zenju AT freefilesync DOT org) - All Rights Reserved * // ***************************************************************************** #ifndef STRING_TOOLS_H_213458973046 #define STRING_TOOLS_H_213458973046 #include //isspace #include //iswspace #include //sprintf #include //swprintf #include #include #include #include #include "stl_tools.h" #include "string_traits.h" //enhance arbitray string class with useful non-member functions: namespace zen { template bool isWhiteSpace(Char ch); template bool isDigit (Char ch); //not exactly the same as "std::isdigit" -> we consider '0'-'9' only! template bool isHexDigit (Char ch); template bool isAsciiAlpha(Char ch); //case-sensitive comparison (compile-time correctness: use different number of arguments as STL comparison predicates!) struct CmpBinary { template int operator()(const Char* lhs, size_t lhsLen, const Char* rhs, size_t rhsLen) const; }; //basic case-insensitive comparison (considering A-Z only!) struct CmpAsciiNoCase { template int operator()(const Char* lhs, size_t lhsLen, const Char* rhs, size_t rhsLen) const; }; struct LessAsciiNoCase { template //don't support heterogenous input! => use as container predicate only! bool operator()(const S& lhs, const S& rhs) const { return CmpAsciiNoCase()(strBegin(lhs), strLength(lhs), strBegin(rhs), strLength(rhs)) < 0; } }; //both S and T can be strings or char/wchar_t arrays or simple char/wchar_t template bool contains(const S& str, const T& term); template bool startsWith(const S& str, const T& prefix); template bool startsWith(const S& str, const T& prefix, Function cmpStringFun); template bool endsWith (const S& str, const T& postfix); template bool endsWith (const S& str, const T& postfix, Function cmpStringFun); template bool strEqual(const S& lhs, const T& rhs); template bool strEqual(const S& lhs, const T& rhs, Function cmpStringFun); enum FailureReturnVal { IF_MISSING_RETURN_ALL, IF_MISSING_RETURN_NONE }; template S afterLast (const S& str, const T& term, FailureReturnVal rv); template S beforeLast (const S& str, const T& term, FailureReturnVal rv); template S afterFirst (const S& str, const T& term, FailureReturnVal rv); template S beforeFirst(const S& str, const T& term, FailureReturnVal rv); enum class SplitType { ALLOW_EMPTY, SKIP_EMPTY }; template std::vector split(const S& str, const T& delimiter, SplitType st); template S trimCpy(S str, bool fromLeft = true, bool fromRight = true); template void trim (S& str, bool fromLeft = true, bool fromRight = true); template void trim(S& str, bool fromLeft, bool fromRight, Function trimThisChar); template void replace ( S& str, const T& oldTerm, const U& newTerm, bool replaceAll = true); template S replaceCpy(const S& str, const T& oldTerm, const U& newTerm, bool replaceAll = true); //high-performance conversion between numbers and strings template S numberTo(const Num& number); template Num stringTo(const S& str); std::pair hexify (unsigned char c, bool upperCase = true); char unhexify(char high, char low); template S printNumber(const T& format, const Num& number); //format a single number using std::snprintf() //string to string conversion: converts string-like type into char-compatible target string class template T copyStringTo(S&& str); //---------------------- implementation ---------------------- template <> inline bool isWhiteSpace(char ch) { assert(ch != 0); //std C++ does not consider 0 as white space //caveat 1: std::isspace() takes an int, but expects an unsigned char //caveat 2: some parts of UTF-8 chars are erroneously seen as whitespace, e.g. the a0 from "\xec\x8b\xa0" (MSVC) return static_cast(ch) < 128 && std::isspace(static_cast(ch)) != 0; } template <> inline bool isWhiteSpace(wchar_t ch) { assert(ch != 0); //std C++ does not consider 0 as white space return std::iswspace(ch) != 0; } template inline bool isDigit(Char ch) //similar to implmenetation of std::isdigit()! { static_assert(IsSameType::value || IsSameType::value, ""); return static_cast('0') <= ch && ch <= static_cast('9'); } template inline bool isHexDigit(Char c) { static_assert(IsSameType::value || IsSameType::value, ""); return (static_cast('0') <= c && c <= static_cast('9')) || (static_cast('A') <= c && c <= static_cast('F')) || (static_cast('a') <= c && c <= static_cast('f')); } template inline bool isAsciiAlpha(Char c) { static_assert(IsSameType::value || IsSameType::value, ""); return (static_cast('A') <= c && c <= static_cast('Z')) || (static_cast('a') <= c && c <= static_cast('z')); } template inline bool startsWith(const S& str, const T& prefix, Function cmpStringFun) { const size_t pfLen = strLength(prefix); if (strLength(str) < pfLen) return false; return cmpStringFun(strBegin(str), pfLen, strBegin(prefix), pfLen) == 0; } template inline bool endsWith(const S& str, const T& postfix, Function cmpStringFun) { const size_t strLen = strLength(str); const size_t pfLen = strLength(postfix); if (strLen < pfLen) return false; return cmpStringFun(strBegin(str) + strLen - pfLen, pfLen, strBegin(postfix), pfLen) == 0; } template inline bool strEqual(const S& lhs, const T& rhs, Function cmpStringFun) { return cmpStringFun(strBegin(lhs), strLength(lhs), strBegin(rhs), strLength(rhs)) == 0; } template inline bool startsWith(const S& str, const T& prefix ) { return startsWith(str, prefix, CmpBinary()); } template inline bool endsWith (const S& str, const T& postfix) { return endsWith (str, postfix, CmpBinary()); } template inline bool strEqual (const S& lhs, const T& rhs ) { return strEqual (lhs, rhs, CmpBinary()); } template inline bool contains(const S& str, const T& term) { static_assert(IsSameType::Type, typename GetCharType::Type>::value, ""); const size_t strLen = strLength(str); const size_t termLen = strLength(term); if (strLen < termLen) return false; const auto* const strFirst = strBegin(str); const auto* const strLast = strFirst + strLen; const auto* const termFirst = strBegin(term); return std::search(strFirst, strLast, termFirst, termFirst + termLen) != strLast; } template inline S afterLast(const S& str, const T& term, FailureReturnVal rv) { static_assert(IsSameType::Type, typename GetCharType::Type>::value, ""); const size_t termLen = strLength(term); assert(termLen > 0); const auto* const strFirst = strBegin(str); const auto* const strLast = strFirst + strLength(str); const auto* const termFirst = strBegin(term); const auto* it = search_last(strFirst, strLast, termFirst, termFirst + termLen); if (it == strLast) return rv == IF_MISSING_RETURN_ALL ? str : S(); it += termLen; return S(it, strLast - it); } template inline S beforeLast(const S& str, const T& term, FailureReturnVal rv) { static_assert(IsSameType::Type, typename GetCharType::Type>::value, ""); const size_t termLen = strLength(term); assert(termLen > 0); const auto* const strFirst = strBegin(str); const auto* const strLast = strFirst + strLength(str); const auto* const termFirst = strBegin(term); const auto* it = search_last(strFirst, strLast, termFirst, termFirst + termLen); if (it == strLast) return rv == IF_MISSING_RETURN_ALL ? str : S(); return S(strFirst, it - strFirst); } template inline S afterFirst(const S& str, const T& term, FailureReturnVal rv) { static_assert(IsSameType::Type, typename GetCharType::Type>::value, ""); const size_t termLen = strLength(term); assert(termLen > 0); const auto* const strFirst = strBegin(str); const auto* const strLast = strFirst + strLength(str); const auto* const termFirst = strBegin(term); const auto* it = std::search(strFirst, strLast, termFirst, termFirst + termLen); if (it == strLast) return rv == IF_MISSING_RETURN_ALL ? str : S(); it += termLen; return S(it, strLast - it); } template inline S beforeFirst(const S& str, const T& term, FailureReturnVal rv) { static_assert(IsSameType::Type, typename GetCharType::Type>::value, ""); const size_t termLen = strLength(term); assert(termLen > 0); const auto* const strFirst = strBegin(str); const auto* const strLast = strFirst + strLength(str); const auto* const termFirst = strBegin(term); auto it = std::search(strFirst, strLast, termFirst, termFirst + termLen); if (it == strLast) return rv == IF_MISSING_RETURN_ALL ? str : S(); return S(strFirst, it - strFirst); } template inline std::vector split(const S& str, const T& delimiter, SplitType st) { static_assert(IsSameType::Type, typename GetCharType::Type>::value, ""); const size_t delimLen = strLength(delimiter); assert(delimLen > 0); if (delimLen == 0) { if (str.empty() && st == SplitType::SKIP_EMPTY) return {}; return { str }; } const auto* const delimFirst = strBegin(delimiter); const auto* const delimLast = delimFirst + delimLen; const auto* blockStart = strBegin(str); const auto* const strLast = blockStart + strLength(str); std::vector output; for (;;) { const auto* const blockEnd = std::search(blockStart, strLast, delimFirst, delimLast); if (blockStart != blockEnd || st == SplitType::ALLOW_EMPTY) output.emplace_back(blockStart, blockEnd - blockStart); if (blockEnd == strLast) return output; blockStart = blockEnd + delimLen; } } namespace impl { ZEN_INIT_DETECT_MEMBER(append); //either call operator+=(S(str, len)) or append(str, len) template inline typename EnableIf::value>::Type stringAppend(S& str, InputIterator first, InputIterator last) { str.append(first, last); } template inline typename EnableIf::value>::Type stringAppend(S& str, InputIterator first, InputIterator last) { str += S(first, last); } } template inline S replaceCpy(const S& str, const T& oldTerm, const U& newTerm, bool replaceAll) { static_assert(IsSameType::Type, typename GetCharType::Type>::value, ""); static_assert(IsSameType::Type, typename GetCharType::Type>::value, ""); const size_t oldLen = strLength(oldTerm); if (oldLen == 0) return str; const auto* const oldBegin = strBegin(oldTerm); const auto* const oldEnd = oldBegin + oldLen; const auto* const newBegin = strBegin(newTerm); const auto* const newEnd = newBegin + strLength(newTerm); S output; for (auto it = str.begin();;) { const auto itFound = std::search(it, str.end(), oldBegin, oldEnd); if (itFound == str.end() && it == str.begin()) return str; //optimize "oldTerm not found": return ref-counted copy impl::stringAppend(output, it, itFound); if (itFound == str.end()) return output; impl::stringAppend(output, newBegin, newEnd); it = itFound + oldLen; if (!replaceAll) { impl::stringAppend(output, it, str.end()); return output; } } } template inline void replace(S& str, const T& oldTerm, const U& newTerm, bool replaceAll) { str = replaceCpy(str, oldTerm, newTerm, replaceAll); } template inline void trim(S& str, bool fromLeft, bool fromRight, Function trimThisChar) { assert(fromLeft || fromRight); const auto* const oldBegin = strBegin(str); const auto* newBegin = oldBegin; const auto* newEnd = oldBegin + strLength(str); if (fromRight) while (newBegin != newEnd && trimThisChar(newEnd[-1])) --newEnd; if (fromLeft) while (newBegin != newEnd && trimThisChar(*newBegin)) ++newBegin; if (newBegin != oldBegin) str = S(newBegin, newEnd - newBegin); //minor inefficiency: in case "str" is not shared, we could save an allocation and do a memory move only else str.resize(newEnd - newBegin); } template inline void trim(S& str, bool fromLeft, bool fromRight) { using CharType = typename GetCharType::Type; trim(str, fromLeft, fromRight, [](CharType c) { return isWhiteSpace(c); }); } template inline S trimCpy(S str, bool fromLeft, bool fromRight) { //implementing trimCpy() in terms of trim(), instead of the other way round, avoids memory allocations when trimming from right! trim(str, fromLeft, fromRight); return std::move(str); //"str" is an l-value parameter => no copy elision! } namespace impl { template struct CopyStringToString { T copy(const S& src) const { return T(strBegin(src), strLength(src)); } }; template struct CopyStringToString //perf: we don't need a deep copy if string types match { template T copy(S&& str) const { return std::forward(str); } }; inline int strcmpWithNulls(const char* ptr1, const char* ptr2, size_t num) { return std::memcmp (ptr1, ptr2, num); } inline int strcmpWithNulls(const wchar_t* ptr1, const wchar_t* ptr2, size_t num) { return std::wmemcmp(ptr1, ptr2, num); } } template inline T copyStringTo(S&& str) { return impl::CopyStringToString, T>().copy(std::forward(str)); } template inline int CmpBinary::operator()(const Char* lhs, size_t lhsLen, const Char* rhs, size_t rhsLen) const { //support embedded 0, unlike strncmp/wcsncmp! const int rv = impl::strcmpWithNulls(lhs, rhs, std::min(lhsLen, rhsLen)); if (rv != 0) return rv; return static_cast(lhsLen) - static_cast(rhsLen); } template inline int CmpAsciiNoCase::operator()(const Char* lhs, size_t lhsLen, const Char* rhs, size_t rhsLen) const { auto asciiToLower = [](Char c) //ordering: lower-case chars have higher code points than uppper-case { if (static_cast('A') <= c && c <= static_cast('Z')) return static_cast(c - static_cast('A') + static_cast('a')); return c; }; const auto* const lhsLast = lhs + std::min(lhsLen, rhsLen); while (lhs != lhsLast) { const Char charL = asciiToLower(*lhs++); const Char charR = asciiToLower(*rhs++); if (charL != charR) return static_cast(charL) - static_cast(charR); //unsigned char-comparison is the convention! //unsigned underflow is well-defined! } return static_cast(lhsLen) - static_cast(rhsLen); } namespace impl { template inline int saferPrintf(char* buffer, size_t bufferSize, const char* format, const Num& number) //there is no such thing as a "safe" printf ;) { return std::snprintf(buffer, bufferSize, format, number); //C99: returns number of chars written if successful, < 0 or >= bufferSize on failure } template inline int saferPrintf(wchar_t* buffer, size_t bufferSize, const wchar_t* format, const Num& number) { return std::swprintf(buffer, bufferSize, format, number); //C99: returns number of chars written if successful, < 0 on failure (including buffer too small) } } template inline S printNumber(const T& format, const Num& number) //format a single number using ::sprintf { static_assert(IsSameType::Type, typename GetCharType::Type>::value, ""); using CharType = typename GetCharType::Type; const int BUFFER_SIZE = 128; CharType buffer[BUFFER_SIZE]; //zero-initialize? const int charsWritten = impl::saferPrintf(buffer, BUFFER_SIZE, strBegin(format), number); return 0 < charsWritten && charsWritten < BUFFER_SIZE ? S(buffer, charsWritten) : S(); } namespace impl { enum NumberType { NUM_TYPE_SIGNED_INT, NUM_TYPE_UNSIGNED_INT, NUM_TYPE_FLOATING_POINT, NUM_TYPE_OTHER, }; template inline S numberTo(const Num& number, Int2Type) //default number to string conversion using streams: convenient, but SLOW, SLOW, SLOW!!!! (~ factor of 20) { using CharType = typename GetCharType::Type; std::basic_ostringstream ss; ss << number; return copyStringTo(ss.str()); } template inline S floatToString(const Num& number, char ) { return printNumber( "%g", static_cast(number)); } template inline S floatToString(const Num& number, wchar_t) { return printNumber(L"%g", static_cast(number)); } template inline S numberTo(const Num& number, Int2Type) { return floatToString(number, typename GetCharType::Type()); } /* perf: integer to string: (executed 10 mio. times) std::stringstream - 14796 ms std::sprintf - 3086 ms formatInteger - 778 ms */ template inline void formatNegativeInteger(Num n, OutputIterator& it) { assert(n < 0); using CharType = typename std::iterator_traits::value_type; do { const Num tmp = n / 10; *--it = static_cast('0' + (tmp * 10 - n)); //8% faster than using modulus operator! n = tmp; } while (n != 0); *--it = static_cast('-'); } template inline void formatPositiveInteger(Num n, OutputIterator& it) { assert(n >= 0); using CharType = typename std::iterator_traits::value_type; do { const Num tmp = n / 10; *--it = static_cast('0' + (n - tmp * 10)); //8% faster than using modulus operator! n = tmp; } while (n != 0); } template inline S numberTo(const Num& number, Int2Type) { using CharType = typename GetCharType::Type; CharType buffer[2 + sizeof(Num) * 241 / 100]; //zero-initialize? //it's generally faster to use a buffer than to rely on String::operator+=() (in)efficiency //required chars (+ sign char): 1 + ceil(ln_10(256^sizeof(n) / 2 + 1)) -> divide by 2 for signed half-range; second +1 since one half starts with 1! // <= 1 + ceil(ln_10(256^sizeof(n))) =~ 1 + ceil(sizeof(n) * 2.4082) <= 2 + floor(sizeof(n) * 2.41) //caveat: consider INT_MIN: technically -INT_MIN == INT_MIN auto it = std::end(buffer); if (number < 0) formatNegativeInteger(number, it); else formatPositiveInteger(number, it); assert(it >= std::begin(buffer)); return S(&*it, std::end(buffer) - it); } template inline S numberTo(const Num& number, Int2Type) { using CharType = typename GetCharType::Type; CharType buffer[1 + sizeof(Num) * 241 / 100]; //zero-initialize? //required chars: ceil(ln_10(256^sizeof(n))) =~ ceil(sizeof(n) * 2.4082) <= 1 + floor(sizeof(n) * 2.41) auto it = std::end(buffer); formatPositiveInteger(number, it); assert(it >= std::begin(buffer)); return S(&*it, std::end(buffer) - it); } //-------------------------------------------------------------------------------- template inline Num stringTo(const S& str, Int2Type) //default string to number conversion using streams: convenient, but SLOW { using CharType = typename GetCharType::Type; Num number = 0; std::basic_istringstream(copyStringTo>(str)) >> number; return number; } template inline Num stringToFloat(const char* str) { return std::strtod(str, nullptr); } template inline Num stringToFloat(const wchar_t* str) { return std::wcstod(str, nullptr); } template inline Num stringTo(const S& str, Int2Type) { return stringToFloat(strBegin(str)); } template Num extractInteger(const S& str, bool& hasMinusSign) //very fast conversion to integers: slightly faster than std::atoi, but more importantly: generic { using CharType = typename GetCharType::Type; const CharType* first = strBegin(str); const CharType* last = first + strLength(str); while (first != last && isWhiteSpace(*first)) //skip leading whitespace ++first; //handle minus sign hasMinusSign = false; if (first != last) { if (*first == static_cast('-')) { hasMinusSign = true; ++first; } else if (*first == static_cast('+')) ++first; } Num number = 0; for (const CharType* it = first; it != last; ++it) { const CharType c = *it; if (static_cast('0') <= c && c <= static_cast('9')) { number *= 10; number += c - static_cast('0'); } else //rest of string should contain whitespace only, it's NOT a bug if there is something else! break; //assert(std::all_of(iter, last, &isWhiteSpace)); -> this is NO assert situation } return number; } template inline Num stringTo(const S& str, Int2Type) { bool hasMinusSign = false; //handle minus sign const Num number = extractInteger(str, hasMinusSign); return hasMinusSign ? -number : number; } template inline Num stringTo(const S& str, Int2Type) //very fast conversion to integers: slightly faster than std::atoi, but more importantly: generic { bool hasMinusSign = false; //handle minus sign const Num number = extractInteger(str, hasMinusSign); if (hasMinusSign) { assert(false); return 0U; } return number; } } template inline S numberTo(const Num& number) { using TypeTag = Int2Type< IsSignedInt ::value ? impl::NUM_TYPE_SIGNED_INT : IsUnsignedInt::value ? impl::NUM_TYPE_UNSIGNED_INT : IsFloat ::value ? impl::NUM_TYPE_FLOATING_POINT : impl::NUM_TYPE_OTHER>; return impl::numberTo(number, TypeTag()); } template inline Num stringTo(const S& str) { using TypeTag = Int2Type< IsSignedInt ::value ? impl::NUM_TYPE_SIGNED_INT : IsUnsignedInt::value ? impl::NUM_TYPE_UNSIGNED_INT : IsFloat ::value ? impl::NUM_TYPE_FLOATING_POINT : impl::NUM_TYPE_OTHER>; return impl::stringTo(str, TypeTag()); } inline //hexify beats "printNumber("%02X", c)" by a nice factor of 3! std::pair hexify(unsigned char c, bool upperCase) { auto hexifyDigit = [upperCase](int num) -> char //input [0, 15], output 0-9, A-F { assert(0 <= num&& num <= 15); //guaranteed by design below! if (num <= 9) return static_cast('0' + num); //no signed/unsigned char problem here! if (upperCase) return static_cast('A' + (num - 10)); else return static_cast('a' + (num - 10)); }; return std::make_pair(hexifyDigit(c / 16), hexifyDigit(c % 16)); } inline //unhexify beats "::sscanf(&it[3], "%02X", &tmp)" by a factor of 3000 for ~250000 calls!!! char unhexify(char high, char low) { auto unhexifyDigit = [](char hex) -> int //input 0-9, a-f, A-F; output range: [0, 15] { if ('0' <= hex && hex <= '9') //no signed/unsigned char problem here! return hex - '0'; else if ('A' <= hex && hex <= 'F') return (hex - 'A') + 10; else if ('a' <= hex && hex <= 'f') return (hex - 'a') + 10; assert(false); return 0; }; return static_cast(16 * unhexifyDigit(high) + unhexifyDigit(low)); //[!] convert to unsigned char first, then to char (which may be signed) } } #endif //STRING_TOOLS_H_213458973046