From 3ba62ef1de77153e5a8c7bad4451b96f6a1678b0 Mon Sep 17 00:00:00 2001 From: Daniel Wilhelm Date: Sun, 12 Mar 2017 22:00:35 -0600 Subject: 8.10 --- zen/file_access.cpp | 14 +- zen/file_access.h | 6 +- zen/file_error.h | 2 +- zen/file_io.cpp | 5 +- zen/file_io.h | 2 +- zen/format_unit.cpp | 4 +- zen/globals.h | 10 +- zen/recycler.cpp | 2 +- zen/scope_guard.h | 2 +- zen/serialize.h | 1 + zen/shell_execute.h | 2 +- zen/string_base.h | 15 +- zen/string_tools.h | 196 +++++++++++++++--------- zen/sys_error.h | 2 +- zen/thread.h | 18 +-- zen/utf.h | 421 +++++++++++++++++++++------------------------------- zen/zstring.cpp | 117 +++++++++++++++ zen/zstring.h | 111 +++----------- 18 files changed, 480 insertions(+), 450 deletions(-) (limited to 'zen') diff --git a/zen/file_access.cpp b/zen/file_access.cpp index 61a003bb..71d00386 100755 --- a/zen/file_access.cpp +++ b/zen/file_access.cpp @@ -27,7 +27,7 @@ using namespace zen; -Opt zen::getPathComponents(const Zstring& itemPath) +Opt zen::parsePathComponents(const Zstring& itemPath) { if (startsWith(itemPath, "/")) { @@ -44,7 +44,7 @@ Opt zen::getPathComponents(const Zstring& itemPath) Opt zen::getParentFolderPath(const Zstring& itemPath) { - if (const Opt comp = getPathComponents(itemPath)) + if (const Opt comp = parsePathComponents(itemPath)) { if (comp->relPath.empty()) return NoValue(); @@ -73,7 +73,7 @@ ItemType zen::getItemType(const Zstring& itemPath) //throw FileError } -PathDetails zen::getPathDetails(const Zstring& itemPath) //throw FileError +PathStatus zen::getPathStatus(const Zstring& itemPath) //throw FileError { const Opt parentPath = getParentFolderPath(itemPath); try @@ -91,7 +91,7 @@ PathDetails zen::getPathDetails(const Zstring& itemPath) //throw FileError const Zstring itemName = afterLast(itemPath, FILE_NAME_SEPARATOR, IF_MISSING_RETURN_ALL); assert(!itemName.empty()); - PathDetails pd = getPathDetails(*parentPath); //throw FileError + PathStatus pd = getPathStatus(*parentPath); //throw FileError if (!pd.relPath.empty()) { pd.relPath.push_back(itemName); @@ -115,7 +115,7 @@ PathDetails zen::getPathDetails(const Zstring& itemPath) //throw FileError Opt zen::getItemTypeIfExists(const Zstring& itemPath) //throw FileError { - const PathDetails pd = getPathDetails(itemPath); //throw FileError + const PathStatus pd = getPathStatus(itemPath); //throw FileError if (pd.relPath.empty()) return pd.existingType; return NoValue(); @@ -502,8 +502,8 @@ void zen::createDirectoryIfMissingRecursion(const Zstring& dirPath) //throw File } catch (FileError&) { - Opt pd; - try { pd = getPathDetails(dirPath); /*throw FileError*/ } + Opt pd; + try { pd = getPathStatus(dirPath); /*throw FileError*/ } catch (FileError&) {} //previous exception is more relevant if (pd && pd->existingType != ItemType::FILE) diff --git a/zen/file_access.h b/zen/file_access.h index c3a52f8a..a6b221e5 100755 --- a/zen/file_access.h +++ b/zen/file_access.h @@ -22,7 +22,7 @@ struct PathComponents Zstring rootPath; //itemPath = rootPath + (FILE_NAME_SEPARATOR?) + relPath Zstring relPath; // }; -Opt getPathComponents(const Zstring& itemPath); //no value on failure +Opt parsePathComponents(const Zstring& itemPath); //no value on failure Opt getParentFolderPath(const Zstring& itemPath); @@ -43,13 +43,13 @@ ItemType getItemType (const Zstring& itemPath); //throw FileError //execute potentially SLOW folder traversal but distinguish error/not existing Opt getItemTypeIfExists(const Zstring& itemPath); //throw FileError -struct PathDetails +struct PathStatus { ItemType existingType; Zstring existingPath; //itemPath =: existingPath + relPath std::vector relPath; // }; -PathDetails getPathDetails(const Zstring& itemPath); //throw FileError +PathStatus getPathStatus(const Zstring& itemPath); //throw FileError enum class ProcSymlink { diff --git a/zen/file_error.h b/zen/file_error.h index 87f9525b..949c644f 100755 --- a/zen/file_error.h +++ b/zen/file_error.h @@ -46,7 +46,7 @@ DEFINE_NEW_FILE_ERROR(ErrorDifferentVolume); //----------- facilitate usage of std::wstring for error messages -------------------- inline std::wstring fmtPath(const std::wstring& displayPath) { return L'\"' + displayPath + L'\"'; } -inline std::wstring fmtPath(const Zstring& displayPath) { return fmtPath(utfCvrtTo(displayPath)); } +inline std::wstring fmtPath(const Zstring& displayPath) { return fmtPath(utfTo(displayPath)); } inline std::wstring fmtPath(const wchar_t* displayPath) { return fmtPath(std::wstring(displayPath)); } //resolve overload ambiguity } diff --git a/zen/file_io.cpp b/zen/file_io.cpp index b4affd37..0c5ff490 100755 --- a/zen/file_io.cpp +++ b/zen/file_io.cpp @@ -140,7 +140,7 @@ size_t FileInput::read(void* buffer, size_t bytesToRead) //throw FileError, X; r if (notifyUnbufferedIO_) notifyUnbufferedIO_(bytesRead); //throw X if (bytesRead == 0) //end of file - bytesToRead = memBuf_.size(); + bytesToRead = std::min(bytesToRead, memBuf_.size()); } std::copy(memBuf_.begin(), memBuf_.begin() + bytesToRead, static_cast(buffer)); @@ -185,9 +185,10 @@ FileOutput::FileOutput(const Zstring& filePath, AccessFlag access, const IOCallb FileOutput::~FileOutput() { + notifyUnbufferedIO_ = nullptr; //no call-backs during destruction!!! try { - flushBuffers(); //throw FileError, X + flushBuffers(); //throw FileError, (X) } catch (...) { assert(false); } } diff --git a/zen/file_io.h b/zen/file_io.h index 8a5e0f7f..827abd9e 100755 --- a/zen/file_io.h +++ b/zen/file_io.h @@ -90,7 +90,7 @@ private: size_t tryWrite(const void* buffer, size_t bytesToWrite); //throw FileError; may return short! CONTRACT: bytesToWrite > 0 std::vector memBuf_; - const IOCallback notifyUnbufferedIO_; //throw X + IOCallback notifyUnbufferedIO_; //throw X }; //----------------------------------------------------------------------------------------------- diff --git a/zen/format_unit.cpp b/zen/format_unit.cpp index cf17c8d4..a2208b3e 100755 --- a/zen/format_unit.cpp +++ b/zen/format_unit.cpp @@ -5,7 +5,7 @@ // ***************************************************************************** #include "format_unit.h" -#include //swprintf +//#include //swprintf #include #include #include "basic_math.h" @@ -168,7 +168,7 @@ std::wstring zen::ffs_Impl::includeNumberSeparator(const std::wstring& number) //::setlocale (LC_ALL, ""); -> implicitly called by wxLocale const lconv* localInfo = ::localeconv(); //always bound according to doc - const std::wstring& thousandSep = utfCvrtTo(localInfo->thousands_sep); + const std::wstring& thousandSep = utfTo(localInfo->thousands_sep); // THOUSANDS_SEPARATOR = std::use_facet>(std::locale("")).thousands_sep(); - why not working? // DECIMAL_POINT = std::use_facet>(std::locale("")).decimal_point(); diff --git a/zen/globals.h b/zen/globals.h index a1fd2764..b6c5dd28 100755 --- a/zen/globals.h +++ b/zen/globals.h @@ -18,7 +18,11 @@ template class Global { public: - Global() { static_assert(std::is_trivially_destructible::value, "this memory needs to live forever"); } + Global() + { + static_assert(std::is_trivially_destructible::value, "this memory needs to live forever"); + assert(!pod.inst && !pod.spinLock); //we depend on static zero-initialization! + } explicit Global(std::unique_ptr&& newInst) { set(std::move(newInst)); } ~Global() { set(nullptr); } @@ -50,9 +54,9 @@ private: //=> use trivially-destructible POD only!!! struct Pod { - std::shared_ptr* inst = nullptr; + std::shared_ptr* inst; // = nullptr; + std::atomic spinLock; // { false }; rely entirely on static zero-initialization! => avoid potential contention with worker thread during Global<> construction! //serialize access; can't use std::mutex: has non-trival destructor - std::atomic spinLock { false }; } pod; }; diff --git a/zen/recycler.cpp b/zen/recycler.cpp index 02ea026a..0c71bf3b 100755 --- a/zen/recycler.cpp +++ b/zen/recycler.cpp @@ -45,7 +45,7 @@ bool zen::recycleOrDeleteIfExists(const Zstring& itemPath) //throw FileError return true; } - throw FileError(errorMsg, replaceCpy(L"Glib Error Code %x:", L"%x", numberTo(error->code)) + L" " + utfCvrtTo(error->message)); + throw FileError(errorMsg, formatSystemError(L"g_file_trash", L"Glib Error Code " + numberTo(error->code), utfTo(error->message))); //g_quark_to_string(error->domain) } return true; diff --git a/zen/scope_guard.h b/zen/scope_guard.h index 09a7fbdb..62552f7b 100755 --- a/zen/scope_guard.h +++ b/zen/scope_guard.h @@ -13,7 +13,7 @@ //std::uncaught_exceptions() currently unsupported on GCC and Clang => clean up ASAP - static_assert(__GNUC__ < 6 || (__GNUC__ == 6 && (__GNUC_MINOR__ < 2 || (__GNUC_MINOR__ == 2 && __GNUC_PATCHLEVEL__ <= 1))), "check std::uncaught_exceptions support"); + static_assert(__GNUC__ < 6 || (__GNUC__ == 6 && (__GNUC_MINOR__ < 3 || (__GNUC_MINOR__ == 3 && __GNUC_PATCHLEVEL__ <= 1))), "check std::uncaught_exceptions support"); namespace __cxxabiv1 { diff --git a/zen/serialize.h b/zen/serialize.h index bb2f7a45..c8dfb96d 100755 --- a/zen/serialize.h +++ b/zen/serialize.h @@ -241,6 +241,7 @@ template inline void readArray(BufferedInputStream& stream, void* buffer, size_t len) //throw UnexpectedEndOfStreamError { const size_t bytesRead = stream.read(buffer, len); + assert(bytesRead <= len); //buffer overflow otherwise not always detected! if (bytesRead < len) throw UnexpectedEndOfStreamError(); } diff --git a/zen/shell_execute.h b/zen/shell_execute.h index 9ba0aef0..5e4ddf1a 100755 --- a/zen/shell_execute.h +++ b/zen/shell_execute.h @@ -41,7 +41,7 @@ void shellExecute(const Zstring& command, ExecutionType type) //throw FileError //Posix::system - execute a shell command int rv = ::system(command.c_str()); //do NOT use std::system as its documentation says nothing about "WEXITSTATUS(rv)", ect... if (rv == -1 || WEXITSTATUS(rv) == 127) //http://linux.die.net/man/3/system "In case /bin/sh could not be executed, the exit status will be that of a command that does exit(127)" - throw FileError(_("Incorrect command line:") + L"\n" + utfCvrtTo(command)); + throw FileError(_("Incorrect command line:") + L"\n" + utfTo(command)); } else runAsync([=] { int rv = ::system(command.c_str()); (void)rv; }); diff --git a/zen/string_base.h b/zen/string_base.h index 3afa66c6..b5e45c0e 100755 --- a/zen/string_base.h +++ b/zen/string_base.h @@ -264,8 +264,8 @@ public: void push_back(Char val) { operator+=(val); } //STL access void pop_back(); - Zbase& operator=(const Zbase& str); Zbase& operator=(Zbase&& tmp) noexcept; + Zbase& operator=(const Zbase& str); Zbase& operator=(const Char* str) { return assign(str, strLength(str)); } Zbase& operator=(Char ch) { return assign(&ch, 1); } Zbase& operator+=(const Zbase& str) { return append(str.c_str(), str.length()); } @@ -573,11 +573,14 @@ template inline Zbase& Zbase::append(InputIterator first, InputIterator last) { const size_t len = std::distance(first, last); - const size_t thisLen = length(); - reserve(thisLen + len); //make unshared and check capacity - - *std::copy(first, last, rawStr_ + thisLen) = 0; - this->setLength(rawStr_, thisLen + len); + if (len > 0) //avoid making this string unshared for no reason + { + const size_t thisLen = length(); + reserve(thisLen + len); //make unshared and check capacity + + *std::copy(first, last, rawStr_ + thisLen) = 0; + this->setLength(rawStr_, thisLen + len); + } return *this; } diff --git a/zen/string_tools.h b/zen/string_tools.h index 5a82e0ed..236f8df6 100755 --- a/zen/string_tools.h +++ b/zen/string_tools.h @@ -25,11 +25,31 @@ namespace zen template bool isWhiteSpace(Char ch); template bool isDigit (Char ch); //not exactly the same as "std::isdigit" -> we consider '0'-'9' only! template bool isHexDigit (Char ch); -template bool isAlpha (Char ch); +template bool isAsciiAlpha(Char ch); -template bool startsWith(const S& str, const T& prefix); // -template bool endsWith (const S& str, const T& postfix); //both S and T can be strings or char/wchar_t arrays or simple char/wchar_t -template bool contains (const S& str, const T& term); // +//case-sensitive comparison (compile-time correctness: use different number of arguments as STL comparison predicates!) +struct CmpBinary { template int operator()(const Char* lhs, size_t lhsLen, const Char* rhs, size_t rhsLen) const; }; + +//basic case-insensitive comparison (considering A-Z only!) +struct CmpAsciiNoCase { template int operator()(const Char* lhs, size_t lhsLen, const Char* rhs, size_t rhsLen) const; }; + +struct LessAsciiNoCase +{ + template //don't support heterogenous input! => use as container predicate only! + bool operator()(const S& lhs, const S& rhs) const { return CmpAsciiNoCase()(strBegin(lhs), strLength(lhs), strBegin(rhs), strLength(rhs)) < 0; } +}; + +//both S and T can be strings or char/wchar_t arrays or simple char/wchar_t +template bool contains(const S& str, const T& term); + +template bool startsWith(const S& str, const T& prefix); +template bool startsWith(const S& str, const T& prefix, Function cmpStringFun); + +template bool endsWith (const S& str, const T& postfix); +template bool endsWith (const S& str, const T& postfix, Function cmpStringFun); + +template bool strEqual(const S& lhs, const T& rhs); +template bool strEqual(const S& lhs, const T& rhs, Function cmpStringFun); enum FailureReturnVal { @@ -42,16 +62,23 @@ template S beforeLast (const S& str, const T& term, FailureRe template S afterFirst (const S& str, const T& term, FailureReturnVal rv); template S beforeFirst(const S& str, const T& term, FailureReturnVal rv); -template std::vector split(const S& str, const T& delimiter); -template S trimCpy(S str, bool fromLeft = true, bool fromRight = true); -template void trim (S& str, bool fromLeft = true, bool fromRight = true); +enum class SplitType +{ + ALLOW_EMPTY, + SKIP_EMPTY +}; +template std::vector split(const S& str, const T& delimiter, SplitType st); + +template S trimCpy(S str, bool fromLeft = true, bool fromRight = true); +template void trim (S& str, bool fromLeft = true, bool fromRight = true); template void trim(S& str, bool fromLeft, bool fromRight, Function trimThisChar); + template void replace ( S& str, const T& oldTerm, const U& newTerm, bool replaceAll = true); template S replaceCpy(const S& str, const T& oldTerm, const U& newTerm, bool replaceAll = true); //high-performance conversion between numbers and strings template S numberTo(const Num& number); -template Num stringTo(const S& str); +template Num stringTo(const S& str); std::pair hexify (unsigned char c, bool upperCase = true); char unhexify(char high, char low); @@ -61,9 +88,6 @@ template S printNumber(const T& format, const Num& //string to string conversion: converts string-like type into char-compatible target string class template T copyStringTo(S&& str); -//case-sensitive comparison -template int cmpString(const S& lhs, const T& rhs); - @@ -99,7 +123,7 @@ bool isWhiteSpace(wchar_t ch) template inline -bool isDigit(Char ch) //similar to implmenetation of std::::isdigit()! +bool isDigit(Char ch) //similar to implmenetation of std::isdigit()! { static_assert(IsSameType::value || IsSameType::value, ""); return static_cast('0') <= ch && ch <= static_cast('9'); @@ -116,40 +140,52 @@ bool isHexDigit(Char c) } -template <> bool isAlpha(char ch) = delete; //probably not a good idea with UTF-8 anyway... - -template <> inline bool isAlpha(wchar_t ch) { return std::iswalpha(ch) != 0; } +template inline +bool isAsciiAlpha(Char c) +{ + static_assert(IsSameType::value || IsSameType::value, ""); + return (static_cast('A') <= c && c <= static_cast('Z')) || + (static_cast('a') <= c && c <= static_cast('z')); +} -template inline -bool startsWith(const S& str, const T& prefix) +template inline +bool startsWith(const S& str, const T& prefix, Function cmpStringFun) { - static_assert(IsSameType::Type, typename GetCharType::Type>::value, ""); const size_t pfLen = strLength(prefix); if (strLength(str) < pfLen) return false; - const auto* const cmpFirst = strBegin(str); - return std::equal(cmpFirst, cmpFirst + pfLen, - strBegin(prefix)); + return cmpStringFun(strBegin(str), pfLen, + strBegin(prefix), pfLen) == 0; } -template inline -bool endsWith(const S& str, const T& postfix) +template inline +bool endsWith(const S& str, const T& postfix, Function cmpStringFun) { - static_assert(IsSameType::Type, typename GetCharType::Type>::value, ""); const size_t strLen = strLength(str); const size_t pfLen = strLength(postfix); if (strLen < pfLen) return false; - const auto* const cmpFirst = strBegin(str) + strLen - pfLen; - return std::equal(cmpFirst, cmpFirst + pfLen, - strBegin(postfix)); + return cmpStringFun(strBegin(str) + strLen - pfLen, pfLen, + strBegin(postfix), pfLen) == 0; } +template inline +bool strEqual(const S& lhs, const T& rhs, Function cmpStringFun) +{ + return cmpStringFun(strBegin(lhs), strLength(lhs), strBegin(rhs), strLength(rhs)) == 0; +} + + +template inline bool startsWith(const S& str, const T& prefix ) { return startsWith(str, prefix, CmpBinary()); } +template inline bool endsWith (const S& str, const T& postfix) { return endsWith (str, postfix, CmpBinary()); } +template inline bool strEqual (const S& lhs, const T& rhs ) { return strEqual (lhs, rhs, CmpBinary()); } + + template inline bool contains(const S& str, const T& term) { @@ -173,6 +209,7 @@ S afterLast(const S& str, const T& term, FailureReturnVal rv) { static_assert(IsSameType::Type, typename GetCharType::Type>::value, ""); const size_t termLen = strLength(term); + assert(termLen > 0); const auto* const strFirst = strBegin(str); const auto* const strLast = strFirst + strLength(str); @@ -192,12 +229,15 @@ template inline S beforeLast(const S& str, const T& term, FailureReturnVal rv) { static_assert(IsSameType::Type, typename GetCharType::Type>::value, ""); + const size_t termLen = strLength(term); + assert(termLen > 0); + const auto* const strFirst = strBegin(str); const auto* const strLast = strFirst + strLength(str); const auto* const termFirst = strBegin(term); const auto* it = search_last(strFirst, strLast, - termFirst, termFirst + strLength(term)); + termFirst, termFirst + termLen); if (it == strLast) return rv == IF_MISSING_RETURN_ALL ? str : S(); @@ -210,6 +250,8 @@ S afterFirst(const S& str, const T& term, FailureReturnVal rv) { static_assert(IsSameType::Type, typename GetCharType::Type>::value, ""); const size_t termLen = strLength(term); + assert(termLen > 0); + const auto* const strFirst = strBegin(str); const auto* const strLast = strFirst + strLength(str); const auto* const termFirst = strBegin(term); @@ -228,12 +270,15 @@ template inline S beforeFirst(const S& str, const T& term, FailureReturnVal rv) { static_assert(IsSameType::Type, typename GetCharType::Type>::value, ""); + const size_t termLen = strLength(term); + assert(termLen > 0); + const auto* const strFirst = strBegin(str); const auto* const strLast = strFirst + strLength(str); const auto* const termFirst = strBegin(term); auto it = std::search(strFirst, strLast, - termFirst, termFirst + strLength(term)); + termFirst, termFirst + termLen); if (it == strLast) return rv == IF_MISSING_RETURN_ALL ? str : S(); @@ -242,34 +287,35 @@ S beforeFirst(const S& str, const T& term, FailureReturnVal rv) template inline -std::vector split(const S& str, const T& delimiter) +std::vector split(const S& str, const T& delimiter, SplitType st) { static_assert(IsSameType::Type, typename GetCharType::Type>::value, ""); - const size_t delimLen = strLength(delimiter); - + assert(delimLen > 0); if (delimLen == 0) - return { str }; - else { - const auto* const delimFirst = strBegin(delimiter); - const auto* const delimLast = delimFirst + delimLen; + if (str.empty() && st == SplitType::SKIP_EMPTY) + return {}; + return { str }; + } - const auto* blockStart = strBegin(str); - const auto* const strLast = blockStart + strLength(str); + const auto* const delimFirst = strBegin(delimiter); + const auto* const delimLast = delimFirst + delimLen; - std::vector output; - - for (;;) - { - const auto* const blockEnd = std::search(blockStart, strLast, - delimFirst, delimLast); + const auto* blockStart = strBegin(str); + const auto* const strLast = blockStart + strLength(str); + std::vector output; + for (;;) + { + const auto* const blockEnd = std::search(blockStart, strLast, + delimFirst, delimLast); + if (blockStart != blockEnd || st == SplitType::ALLOW_EMPTY) output.emplace_back(blockStart, blockEnd - blockStart); - if (blockEnd == strLast) //clients expect: if delimiter not found, return str - return output; - blockStart = blockEnd + delimLen; - } + + if (blockEnd == strLast) + return output; + blockStart = blockEnd + delimLen; } } @@ -389,33 +435,47 @@ struct CopyStringToString //perf: we don't need a deep copy if string type template T copy(S&& str) const { return std::forward(str); } }; + +inline int strcmpWithNulls(const char* ptr1, const char* ptr2, size_t num) { return std::memcmp (ptr1, ptr2, num); } +inline int strcmpWithNulls(const wchar_t* ptr1, const wchar_t* ptr2, size_t num) { return std::wmemcmp(ptr1, ptr2, num); } } template inline T copyStringTo(S&& str) { return impl::CopyStringToString, T>().copy(std::forward(str)); } -template inline -int cmpString(const S& lhs, const T& rhs) +template inline +int CmpBinary::operator()(const Char* lhs, size_t lhsLen, const Char* rhs, size_t rhsLen) const { - static_assert(IsSameType::Type, typename GetCharType::Type>::value, ""); + //support embedded 0, unlike strncmp/wcsncmp! + const int rv = impl::strcmpWithNulls(lhs, rhs, std::min(lhsLen, rhsLen)); + if (rv != 0) + return rv; + return static_cast(lhsLen) - static_cast(rhsLen); +} - const size_t lenL = strLength(lhs); - const size_t lenR = strLength(rhs); - const auto* strPosL = strBegin(lhs); - const auto* strPosR = strBegin(rhs); +template inline +int CmpAsciiNoCase::operator()(const Char* lhs, size_t lhsLen, const Char* rhs, size_t rhsLen) const +{ + auto asciiToLower = [](Char c) //ordering: lower-case chars have higher code points than uppper-case + { + if (static_cast('A') <= c && c <= static_cast('Z')) + return static_cast(c - static_cast('A') + static_cast('a')); + return c; + }; - const auto* const strPosLLast = strPosL + std::min(lenL, lenR); + const auto* const lhsLast = lhs + std::min(lhsLen, rhsLen); - while (strPosL != strPosLLast) + while (lhs != lhsLast) { - const auto charL = static_cast(*strPosL++); //unsigned char-comparison is the convention! - const auto charR = static_cast(*strPosR++); + const Char charL = asciiToLower(*lhs++); + const Char charR = asciiToLower(*rhs++); if (charL != charR) - return static_cast(charL) - static_cast(charR); + return static_cast(charL) - static_cast(charR); //unsigned char-comparison is the convention! + //unsigned underflow is well-defined! } - return static_cast(lenL) - static_cast(lenR); + return static_cast(lhsLen) - static_cast(rhsLen); } @@ -424,13 +484,13 @@ namespace impl template inline int saferPrintf(char* buffer, size_t bufferSize, const char* format, const Num& number) //there is no such thing as a "safe" printf ;) { - return std::snprintf(buffer, bufferSize, format, number); //C99 + return std::snprintf(buffer, bufferSize, format, number); //C99: returns number of chars written if successful, < 0 or >= bufferSize on failure } template inline int saferPrintf(wchar_t* buffer, size_t bufferSize, const wchar_t* format, const Num& number) { - return std::swprintf(buffer, bufferSize, format, number); //C99 + return std::swprintf(buffer, bufferSize, format, number); //C99: returns number of chars written if successful, < 0 on failure (including buffer too small) } } @@ -444,7 +504,7 @@ S printNumber(const T& format, const Num& number) //format a single number using CharType buffer[BUFFER_SIZE]; //zero-initialize? const int charsWritten = impl::saferPrintf(buffer, BUFFER_SIZE, strBegin(format), number); - return charsWritten > 0 ? S(buffer, charsWritten) : S(); + return 0 < charsWritten && charsWritten < BUFFER_SIZE ? S(buffer, charsWritten) : S(); } @@ -607,12 +667,8 @@ Num extractInteger(const S& str, bool& hasMinusSign) //very fast conversion to i number *= 10; number += c - static_cast('0'); } - else - { - //rest of string should contain whitespace only, it's NOT a bug if there is something else! - //assert(std::all_of(iter, last, &isWhiteSpace)); -> this is NO assert situation - break; - } + else //rest of string should contain whitespace only, it's NOT a bug if there is something else! + break; //assert(std::all_of(iter, last, &isWhiteSpace)); -> this is NO assert situation } return number; } diff --git a/zen/sys_error.h b/zen/sys_error.h index a19409ab..f7c128ef 100755 --- a/zen/sys_error.h +++ b/zen/sys_error.h @@ -67,7 +67,7 @@ std::wstring formatSystemErrorRaw(ErrorCode ec) //return empty string on error std::wstring errorMsg; ZEN_ON_SCOPE_EXIT(errno = currentError); - errorMsg = utfCvrtTo(::strerror(ec)); + errorMsg = utfTo(::strerror(ec)); trim(errorMsg); //Windows messages seem to end with a blank... return errorMsg; diff --git a/zen/thread.h b/zen/thread.h index a59f3807..ae4c347e 100755 --- a/zen/thread.h +++ b/zen/thread.h @@ -28,26 +28,26 @@ public: template InterruptibleThread(Function&& f); - bool joinable () const { return stdThread.joinable(); } + bool joinable () const { return stdThread_.joinable(); } void interrupt(); - void join () { stdThread.join(); } - void detach () { stdThread.detach(); } + void join () { stdThread_.join(); } + void detach () { stdThread_.detach(); } template bool tryJoinFor(const std::chrono::duration& relTime) { - if (threadCompleted.wait_for(relTime) == std::future_status::ready) + if (threadCompleted_.wait_for(relTime) == std::future_status::ready) { - stdThread.join(); //runs thread-local destructors => this better be fast!!! + stdThread_.join(); //runs thread-local destructors => this better be fast!!! return true; } return false; } private: - std::thread stdThread; + std::thread stdThread_; std::shared_ptr intStatus_; - std::future threadCompleted; + std::future threadCompleted_; }; //context of worker thread: @@ -376,9 +376,9 @@ template inline InterruptibleThread::InterruptibleThread(Function&& f) : intStatus_(std::make_shared()) { std::promise pFinished; - threadCompleted = pFinished.get_future(); + threadCompleted_ = pFinished.get_future(); - stdThread = std::thread([f = std::forward(f), + stdThread_ = std::thread([f = std::forward(f), intStatus = this->intStatus_, pFinished = std::move(pFinished)]() mutable { diff --git a/zen/utf.h b/zen/utf.h index 41fdf58c..ab8fda50 100755 --- a/zen/utf.h +++ b/zen/utf.h @@ -10,40 +10,25 @@ #include #include #include "string_tools.h" //copyStringTo +#include "optional.h" namespace zen { //convert all(!) char- and wchar_t-based "string-like" objects applying a UTF8 conversions (but only if necessary!) template -TargetString utfCvrtTo(const SourceString& str); +TargetString utfTo(const SourceString& str); const char BYTE_ORDER_MARK_UTF8[] = "\xEF\xBB\xBF"; -template -bool isValidUtf8(const CharString& str); //check for UTF-8 encoding errors - -//---- explicit conversion: wide <-> utf8 ---- -template -CharString wideToUtf8(const WideString& str); //example: std::string tmp = wideToUtf8(L"abc"); - -template -WideString utf8ToWide(const CharString& str); //std::wstring tmp = utf8ToWide("abc"); +template +bool isValidUtf(const UtfString& str); //check for UTF-8 encoding errors //access unicode characters in UTF-encoded string (char- or wchar_t-based) template size_t unicodeLength(const UtfString& str); //return number of code points for UTF-encoded string template -size_t findUnicodePos(const UtfString& str, size_t unicodePos); //return position of unicode char in UTF-encoded string - - - - - - - - - +UtfString getUnicodeSubstring(const UtfString& str, size_t uniPosFirst, size_t uniPosLast); @@ -58,7 +43,7 @@ namespace implementation { using CodePoint = uint32_t; using Char16 = uint16_t; -using Char8 = unsigned char; +using Char8 = uint8_t; const CodePoint LEAD_SURROGATE = 0xd800; const CodePoint TRAIL_SURROGATE = 0xdc00; //== LEAD_SURROGATE_MAX + 1 @@ -72,7 +57,6 @@ template inline void codePointToUtf16(CodePoint cp, Function writeOutput) //"writeOutput" is a unary function taking a Char16 { //http://en.wikipedia.org/wiki/UTF-16 - if (cp < LEAD_SURROGATE) writeOutput(static_cast(cp)); else if (cp <= TRAIL_SURROGATE_MAX) //invalid code point @@ -82,8 +66,8 @@ void codePointToUtf16(CodePoint cp, Function writeOutput) //"writeOutput" is a u else if (cp <= CODE_POINT_MAX) { cp -= 0x10000; - writeOutput(LEAD_SURROGATE + static_cast(cp >> 10)); - writeOutput(TRAIL_SURROGATE + static_cast(cp & 0x3ff)); + writeOutput(static_cast( LEAD_SURROGATE + (cp >> 10))); + writeOutput(static_cast(TRAIL_SURROGATE + (cp & 0x3ff))); } else //invalid code point codePointToUtf16(REPLACEMENT_CHAR, writeOutput); //resolves to 1-character utf16 @@ -104,15 +88,19 @@ size_t getUtf16Len(Char16 ch) //ch must be first code unit! returns 0 on error! } -template inline -void utf16ToCodePoint(CharIterator first, CharIterator last, Function writeOutput) //"writeOutput" is a unary function taking a CodePoint +class Utf16Decoder { - static_assert(sizeof(typename std::iterator_traits::value_type) == 2, ""); +public: + Utf16Decoder(const Char16* str, size_t len) : it_(str), last_(str + len) {} - for ( ; first != last; ++first) + Opt getNext() { - CodePoint cp = static_cast(*first); - switch (getUtf16Len(static_cast(cp))) + if (it_ == last_) + return NoValue(); + + const Char16 ch = *it_++; + CodePoint cp = ch; + switch (getUtf16Len(ch)) { case 0: //invalid utf16 character cp = REPLACEMENT_CHAR; @@ -120,23 +108,33 @@ void utf16ToCodePoint(CharIterator first, CharIterator last, Function writeOutpu case 1: break; case 2: - if (++first != last) //trail surrogate expected! - { - const Char16 ch = static_cast(*first); - if (TRAIL_SURROGATE <= ch && ch <= TRAIL_SURROGATE_MAX) //trail surrogate expected! - { - cp = ((cp - LEAD_SURROGATE) << 10) + (ch - TRAIL_SURROGATE) + 0x10000; - break; - } - } - --first; - cp = REPLACEMENT_CHAR; + decodeTrail(cp); break; } - writeOutput(cp); + return cp; + } + +private: + void decodeTrail(CodePoint& cp) + { + if (it_ != last_) //trail surrogate expected! + { + const Char16 ch = *it_; + if (TRAIL_SURROGATE <= ch && ch <= TRAIL_SURROGATE_MAX) //trail surrogate expected! + { + cp = ((cp - LEAD_SURROGATE) << 10) + (ch - TRAIL_SURROGATE) + 0x10000; + ++it_; + return; + } + } + cp = REPLACEMENT_CHAR; } -} + const Char16* it_; + const Char16* const last_; +}; + +//---------------------------------------------------------------------------------------------------------------- template inline void codePointToUtf8(CodePoint cp, Function writeOutput) //"writeOutput" is a unary function taking a Char8 @@ -155,14 +153,14 @@ void codePointToUtf8(CodePoint cp, Function writeOutput) //"writeOutput" is a un { writeOutput(static_cast( (cp >> 12 ) | 0xe0)); writeOutput(static_cast(((cp >> 6) & 0x3f) | 0x80)); - writeOutput(static_cast( (cp & 0x3f ) | 0x80)); + writeOutput(static_cast( (cp & 0x3f) | 0x80)); } else if (cp <= CODE_POINT_MAX) { writeOutput(static_cast( (cp >> 18 ) | 0xf0)); writeOutput(static_cast(((cp >> 12) & 0x3f) | 0x80)); writeOutput(static_cast(((cp >> 6) & 0x3f) | 0x80)); - writeOutput(static_cast( (cp & 0x3f ) | 0x80)); + writeOutput(static_cast( (cp & 0x3f) | 0x80)); } else //invalid code point codePointToUtf8(REPLACEMENT_CHAR, writeOutput); //resolves to 3-byte utf8 @@ -170,7 +168,7 @@ void codePointToUtf8(CodePoint cp, Function writeOutput) //"writeOutput" is a un inline -size_t getUtf8Len(unsigned char ch) //ch must be first code unit! returns 0 on error! +size_t getUtf8Len(Char8 ch) //ch must be first code unit! returns 0 on error! { if (ch < 0x80) return 1; @@ -184,32 +182,19 @@ size_t getUtf8Len(unsigned char ch) //ch must be first code unit! returns 0 on e } -template inline -bool decodeTrail(CharIterator& first, CharIterator last, CodePoint& cp) //decode trailing surrogate byte -{ - if (++first != last) //trail surrogate expected! - { - const Char8 ch = static_cast(*first); - if (ch >> 6 == 0x2) //trail surrogate expected! - { - cp = (cp << 6) + (ch & 0x3f); - return true; - } - } - --first; - cp = REPLACEMENT_CHAR; - return false; -} - -template inline -void utf8ToCodePoint(CharIterator first, CharIterator last, Function writeOutput) //"writeOutput" is a unary function taking a CodePoint +class Utf8Decoder { - static_assert(sizeof(typename std::iterator_traits::value_type) == 1, ""); +public: + Utf8Decoder(const Char8* str, size_t len) : it_(str), last_(str + len) {} - for ( ; first != last; ++first) + Opt getNext() { - CodePoint cp = static_cast(*first); - switch (getUtf8Len(static_cast(cp))) + if (it_ == last_) + return NoValue(); + + const Char8 ch = *it_++; + CodePoint cp = ch; + switch (getUtf8Len(ch)) { case 0: //invalid utf8 character cp = REPLACEMENT_CHAR; @@ -218,258 +203,184 @@ void utf8ToCodePoint(CharIterator first, CharIterator last, Function writeOutput break; case 2: cp &= 0x1f; - decodeTrail(first, last, cp); + decodeTrail(cp); break; case 3: cp &= 0xf; - if (decodeTrail(first, last, cp)) - decodeTrail(first, last, cp); + if (decodeTrail(cp)) + decodeTrail(cp); break; case 4: cp &= 0x7; - if (decodeTrail(first, last, cp)) - if (decodeTrail(first, last, cp)) - decodeTrail(first, last, cp); + if (decodeTrail(cp)) + if (decodeTrail(cp)) + decodeTrail(cp); if (cp > CODE_POINT_MAX) cp = REPLACEMENT_CHAR; break; } - writeOutput(cp); + return cp; } -} - - -template inline -size_t unicodeLength(const CharString& str, char) //utf8 -{ - using CharType = typename GetCharType::Type; - const CharType* strFirst = strBegin(str); - const CharType* const strLast = strFirst + strLength(str); - - size_t len = 0; - while (strFirst < strLast) //[!] +private: + bool decodeTrail(CodePoint& cp) { - ++len; - size_t utf8len = getUtf8Len(*strFirst); - if (utf8len == 0) ++utf8len; //invalid utf8 character - strFirst += utf8len; + if (it_ != last_) //trail surrogate expected! + { + const Char8 ch = *it_; + if (ch >> 6 == 0x2) //trail surrogate expected! + { + cp = (cp << 6) + (ch & 0x3f); + ++it_; + return true; + } + } + cp = REPLACEMENT_CHAR; + return false; } - return len; -} + const Char8* it_; + const Char8* const last_; +}; -template inline -size_t unicodeLengthWide(const WideString& str, Int2Type<2>) //windows: utf16-wchar_t -{ - using CharType = typename GetCharType::Type; +//---------------------------------------------------------------------------------------------------------------- - const CharType* strFirst = strBegin(str); - const CharType* const strLast = strFirst + strLength(str); +template inline void codePointToUtf(CodePoint cp, Function writeOutput, Int2Type<1>) { codePointToUtf8 (cp, writeOutput); } //UTF8-char +template inline void codePointToUtf(CodePoint cp, Function writeOutput, Int2Type<2>) { codePointToUtf16(cp, writeOutput); } //Windows: UTF16-wchar_t +template inline void codePointToUtf(CodePoint cp, Function writeOutput, Int2Type<4>) { writeOutput(cp); } //other OS: UTF32-wchar_t - size_t len = 0; - while (strFirst < strLast) //[!] - { - ++len; - size_t utf16len = getUtf16Len(*strFirst); - if (utf16len == 0) ++utf16len; //invalid utf16 character - strFirst += utf16len; - } - return len; -} - - -template inline -size_t unicodeLengthWide(const WideString& str, Int2Type<4>) //other OS: utf32-wchar_t +template inline +void codePointToUtf(CodePoint cp, Function writeOutput) //"writeOutput" is a unary function taking a CharType { - return strLength(str); + return codePointToUtf(cp, writeOutput, Int2Type()); } +//---------------------------------------------------------------------------------------------------------------- -template inline -size_t unicodeLength(const WideString& str, wchar_t) -{ - return unicodeLengthWide(str, Int2Type()); -} -} +template +class UtfDecoderImpl; -template inline -size_t unicodeLength(const UtfString& str) //return number of code points +template +class UtfDecoderImpl //UTF8-char { - return implementation::unicodeLength(str, typename GetCharType::Type()); -} +public: + UtfDecoderImpl(const CharType* str, size_t len) : decoder_(reinterpret_cast(str), len) {} + Opt getNext() { return decoder_.getNext(); } +private: + Utf8Decoder decoder_; +}; -namespace implementation -{ -template inline -size_t findUnicodePos(const CharString& str, size_t unicodePos, char) //utf8-char +template +class UtfDecoderImpl //Windows: UTF16-wchar_t { - using CharType = typename GetCharType::Type; - - const CharType* strFirst = strBegin(str); - const size_t strLen = strLength(str); - - size_t utfPos = 0; - while (unicodePos-- > 0) - { - if (utfPos >= strLen) - return strLen; +public: + UtfDecoderImpl(const CharType* str, size_t len) : decoder_(reinterpret_cast(str), len) {} + Opt getNext() { return decoder_.getNext(); } +private: + Utf16Decoder decoder_; +}; - size_t utf8len = getUtf8Len(strFirst[utfPos]); - if (utf8len == 0) ++utf8len; //invalid utf8 character - utfPos += utf8len; - } - if (utfPos >= strLen) - return strLen; - return utfPos; -} - -template inline -size_t findUnicodePosWide(const WideString& str, size_t unicodePos, Int2Type<2>) //windows: utf16-wchar_t +template +class UtfDecoderImpl //other OS: UTF32-wchar_t { - using CharType = typename GetCharType::Type; - - const CharType* strFirst = strBegin(str); - const size_t strLen = strLength(str); - - size_t utfPos = 0; - while (unicodePos-- > 0) +public: + UtfDecoderImpl(const CharType* str, size_t len) : it_(reinterpret_cast(str)), last_(it_ + len) {} + Opt getNext() { - if (utfPos >= strLen) - return strLen; - - size_t utf16len = getUtf16Len(strFirst[utfPos]); - if (utf16len == 0) ++utf16len; //invalid utf16 character - utfPos += utf16len; + if (it_ == last_) + return NoValue(); + return *it_++; } - if (utfPos >= strLen) - return strLen; - return utfPos; -} +private: + const CodePoint* it_; + const CodePoint* last_; +}; -template inline -size_t findUnicodePosWide(const WideString& str, size_t unicodePos, Int2Type<4>) //other OS: utf32-wchar_t -{ - return std::min(strLength(str), unicodePos); -} - - -template inline -size_t findUnicodePos(const UtfString& str, size_t unicodePos, wchar_t) -{ - return findUnicodePosWide(str, unicodePos, Int2Type()); -} -} - - -template inline -size_t findUnicodePos(const UtfString& str, size_t unicodePos) //return position of unicode char in UTF-encoded string -{ - return implementation::findUnicodePos(str, unicodePos, typename GetCharType::Type()); +template +using UtfDecoder = UtfDecoderImpl; } //------------------------------------------------------------------------------------------- -namespace implementation -{ -template inline -WideString utf8ToWide(const CharString& str, Int2Type<2>) //windows: convert utf8 to utf16-wchar_t +template inline +bool isValidUtf(const UtfString& str) { - WideString output; - utf8ToCodePoint(strBegin(str), strBegin(str) + strLength(str), - [&](CodePoint cp) { codePointToUtf16(cp, [&](Char16 c) { output += static_cast(c); }); }); - return output; -} + using namespace implementation; + UtfDecoder::Type> decoder(strBegin(str), strLength(str)); + while (Opt cp = decoder.getNext()) + if (*cp == REPLACEMENT_CHAR) + return false; -template inline -WideString utf8ToWide(const CharString& str, Int2Type<4>) //other OS: convert utf8 to utf32-wchar_t -{ - WideString output; - utf8ToCodePoint(strBegin(str), strBegin(str) + strLength(str), - [&](CodePoint cp) { output += static_cast(cp); }); - return output; + return true; } -template inline -CharString wideToUtf8(const WideString& str, Int2Type<2>) //windows: convert utf16-wchar_t to utf8 +template inline +size_t unicodeLength(const UtfString& str) //return number of code points (+ correctly handle broken UTF encoding) { - CharString output; - utf16ToCodePoint(strBegin(str), strBegin(str) + strLength(str), - [&](CodePoint cp) { codePointToUtf8(cp, [&](char c) { output += c; }); }); - return output; + size_t uniLen = 0; + implementation::UtfDecoder::Type> decoder(strBegin(str), strLength(str)); + while (decoder.getNext()) + ++uniLen; + return uniLen; } -template inline -CharString wideToUtf8(const WideString& str, Int2Type<4>) //other OS: convert utf32-wchar_t to utf8 +template inline +UtfString getUnicodeSubstring(const UtfString& str, size_t uniPosFirst, size_t uniPosLast) //return position of unicode char in UTF-encoded string { - CharString output; - std::for_each(strBegin(str), strBegin(str) + strLength(str), - [&](CodePoint cp) { codePointToUtf8(cp, [&](char c) { output += c; }); }); + assert(uniPosFirst <= uniPosLast && uniPosLast <= unicodeLength(str)); + using namespace implementation; + using CharType = typename GetCharType::Type; + UtfString output; + if (uniPosFirst >= uniPosLast) //optimize for empty range + return output; + + UtfDecoder decoder(strBegin(str), strLength(str)); + for (size_t uniPos = 0; Opt cp = decoder.getNext(); ++uniPos) //[!] declaration in condition part of the for-loop + if (uniPosFirst <= uniPos) + { + if (uniPos >= uniPosLast) + break; + codePointToUtf(*cp, [&](CharType c) { output += c; }); + } return output; } -} +//------------------------------------------------------------------------------------------- -template inline -bool isValidUtf8(const CharString& str) +namespace implementation { - using namespace implementation; - bool valid = true; - utf8ToCodePoint(strBegin(str), strBegin(str) + strLength(str), - [&](CodePoint cp) - { - if (cp == REPLACEMENT_CHAR) - valid = false; //perf: should we use an (expensive) exception for iteration break? - }); - return valid; -} - - -template inline -WideString utf8ToWide(const CharString& str) +template inline +TargetString utfTo(const SourceString& str, FalseType) { - static_assert(IsSameType::Type, char >::value, ""); - static_assert(IsSameType::Type, wchar_t>::value, ""); + using CharSrc = typename GetCharType::Type; + using CharTrg = typename GetCharType::Type; + static_assert(sizeof(CharSrc) != sizeof(CharTrg), "no UTF-conversion needed"); - return implementation::utf8ToWide(str, Int2Type()); -} + TargetString output; + UtfDecoder decoder(strBegin(str), strLength(str)); + while (Opt cp = decoder.getNext()) + codePointToUtf(*cp, [&](CharTrg c) { output += c; }); -template inline -CharString wideToUtf8(const WideString& str) -{ - static_assert(IsSameType::Type, char >::value, ""); - static_assert(IsSameType::Type, wchar_t>::value, ""); - - return implementation::wideToUtf8(str, Int2Type()); + return output; } -//------------------------------------------------------------------------------------------- template inline -TargetString utfCvrtTo(const SourceString& str, char, wchar_t) { return utf8ToWide(str); } - -template inline -TargetString utfCvrtTo(const SourceString& str, wchar_t, char) { return wideToUtf8(str); } - -template inline -TargetString utfCvrtTo(const SourceString& str, char, char) { return copyStringTo(str); } +TargetString utfTo(const SourceString& str, TrueType) { return copyStringTo(str); } +} -template inline -TargetString utfCvrtTo(const SourceString& str, wchar_t, wchar_t) { return copyStringTo(str); } template inline -TargetString utfCvrtTo(const SourceString& str) +TargetString utfTo(const SourceString& str) { - return utfCvrtTo(str, - typename GetCharType::Type(), - typename GetCharType::Type()); + return implementation::utfTo(str, StaticBool::Type) == sizeof(typename GetCharType::Type)>()); } } diff --git a/zen/zstring.cpp b/zen/zstring.cpp index 5f5b1ec8..a936efb5 100755 --- a/zen/zstring.cpp +++ b/zen/zstring.cpp @@ -6,11 +6,14 @@ #include "zstring.h" #include +#include "utf.h" using namespace zen; /* +MSDN "Handling Sorting in Your Applications": https://msdn.microsoft.com/en-us/library/windows/desktop/dd318144 + Perf test: compare strings 10 mio times; 64 bit build ----------------------------------------------------- string a = "Fjk84$%kgfj$%T\\\\Gffg\\gsdgf\\fgsx----------d-" @@ -32,3 +35,117 @@ time per call | function */ + + +namespace +{ +int compareNoCaseUtf8(const char* lhs, size_t lhsLen, const char* rhs, size_t rhsLen) +{ + //- strncasecmp implements ASCII CI-comparsion only! => signature is broken for UTF8-input; toupper() similarly doesn't support Unicode + //- wcsncasecmp: https://opensource.apple.com/source/Libc/Libc-763.12/string/wcsncasecmp-fbsd.c + // => re-implement comparison based on towlower() to avoid memory allocations + using namespace zen::implementation; + + UtfDecoder decL(lhs, lhsLen); + UtfDecoder decR(rhs, rhsLen); + for (;;) + { + const Opt cpL = decL.getNext(); + const Opt cpR = decR.getNext(); + if (!cpL || !cpR) + return static_cast(!cpR) - static_cast(!cpL); + + static_assert(sizeof(wchar_t) == sizeof(CodePoint), ""); + const wchar_t charL = ::towlower(static_cast(*cpL)); //ordering: towlower() converts to higher code points than towupper() + const wchar_t charR = ::towlower(static_cast(*cpR)); //uses LC_CTYPE category of current locale + if (charL != charR) + return static_cast(charL) - static_cast(charR); //unsigned char-comparison is the convention! + //unsigned underflow is well-defined! + } +} +} + + +int cmpStringNaturalLinux(const char* lhs, size_t lhsLen, const char* rhs, size_t rhsLen) +{ + const char* const lhsEnd = lhs + lhsLen; + const char* const rhsEnd = rhs + rhsLen; + /* + - compare strings after conceptually creating blocks of whitespace/numbers/text + - implement strict weak ordering! + - don't follow broken "strnatcasecmp": https://github.com/php/php-src/blob/master/ext/standard/strnatcmp.c + 1. incorrect non-ASCII CI-comparison 2. incorrect bounds checks + 3. incorrect trimming of *all* whitespace 4. arbitrary handling of leading 0 only at string begin + 5. incorrect handling of whitespace following a number 6. code is a mess + */ + for (;;) + { + if (lhs == lhsEnd || rhs == rhsEnd) + return static_cast(lhs != lhsEnd) - static_cast(rhs != rhsEnd); //"nothing" before "something" + //note: "something" never would have been condensed to "nothing" further below => can finish evaluation here + + const bool wsL = isWhiteSpace(*lhs); + const bool wsR = isWhiteSpace(*rhs); + if (wsL != wsR) + return static_cast(!wsL) - static_cast(!wsR); //whitespace before non-ws! + if (wsL) + { + ++lhs, ++rhs; + while (lhs != lhsEnd && isWhiteSpace(*lhs)) ++lhs; + while (rhs != rhsEnd && isWhiteSpace(*rhs)) ++rhs; + continue; + } + + const bool digitL = isDigit(*lhs); + const bool digitR = isDigit(*rhs); + if (digitL != digitR) + return static_cast(!digitL) - static_cast(!digitR); //number before chars! + if (digitL) + { + while (lhs != lhsEnd && *lhs == '0') ++lhs; + while (rhs != rhsEnd && *rhs == '0') ++rhs; + + int rv = 0; + for (;; ++lhs, ++rhs) + { + const bool endL = lhs == lhsEnd || !isDigit(*lhs); + const bool endR = rhs == rhsEnd || !isDigit(*rhs); + if (endL != endR) + return static_cast(!endL) - static_cast(!endR); //more digits means bigger number + if (endL) + break; //same number of digits + + if (rv == 0 && *lhs != *rhs) + rv = *lhs - *rhs; //found first digit difference comparing from left + } + if (rv != 0) + return rv; + continue; + } + + //compare full junks of text: consider unicode encoding! + const char* textBeginL = lhs++; + const char* textBeginR = rhs++; //current char is neither white space nor digit at this point! + while (lhs != lhsEnd && !isWhiteSpace(*lhs) && !isDigit(*lhs)) ++lhs; + while (rhs != rhsEnd && !isWhiteSpace(*rhs) && !isDigit(*rhs)) ++rhs; + + const int rv = compareNoCaseUtf8(textBeginL, lhs - textBeginL, textBeginR, rhs - textBeginR); + if (rv != 0) + return rv; + } +} + + +namespace +{ +} + + +int CmpNaturalSort::operator()(const Zchar* lhs, size_t lhsLen, const Zchar* rhs, size_t rhsLen) const +{ + //auto strL = utfTo(Zstring(lhs, lhsLen)); + //auto strR = utfTo(Zstring(rhs, rhsLen)); + //return cmpStringNaturalLinux(strL.c_str(), strL.size(), strR.c_str(), strR.size()); + return cmpStringNaturalLinux(lhs, lhsLen, rhs, rhsLen); + +} \ No newline at end of file diff --git a/zen/zstring.h b/zen/zstring.h index 12bda29f..fdb71da0 100755 --- a/zen/zstring.h +++ b/zen/zstring.h @@ -19,35 +19,39 @@ using Zstring = zen::Zbase; -int cmpStringNoCase(const wchar_t* lhs, size_t lhsLen, const wchar_t* rhs, size_t rhsLen); - int cmpStringNoCase(const char* lhs, size_t lhsLen, const char* rhs, size_t rhsLen); - -template -S makeUpperCopy(S str); - - //Compare filepaths: Windows/OS X does NOT distinguish between upper/lower-case, while Linux DOES -int cmpFilePath(const wchar_t* lhs, size_t lhsLen, const wchar_t* rhs, size_t rhsLen); - int cmpFilePath(const char* lhs, size_t lhsLen, const char* rhs, size_t rhsLen); +struct CmpFilePath +{ + int operator()(const Zchar* lhs, size_t lhsLen, const Zchar* rhs, size_t rhsLen) const; +}; +struct CmpNaturalSort +{ + int operator()(const Zchar* lhs, size_t lhsLen, const Zchar* rhs, size_t rhsLen) const; +}; -template inline -bool equalFilePath(const S& lhs, const T& rhs) { using namespace zen; return cmpFilePath(strBegin(lhs), strLength(lhs), strBegin(rhs), strLength(rhs)) == 0; } struct LessFilePath { - template - bool operator()(const S& lhs, const T& rhs) const { using namespace zen; return cmpFilePath(strBegin(lhs), strLength(lhs), strBegin(rhs), strLength(rhs)) < 0; } + template //don't support heterogenous input! => use as container predicate only! + bool operator()(const S& lhs, const S& rhs) const { using namespace zen; return CmpFilePath()(strBegin(lhs), strLength(lhs), strBegin(rhs), strLength(rhs)) < 0; } }; - -struct LessNoCase +struct LessNaturalSort { - template - bool operator()(const S& lhs, const T& rhs) const { using namespace zen; return cmpStringNoCase(strBegin(lhs), strLength(lhs), strBegin(rhs), strLength(rhs)) < 0; } + template //don't support heterogenous input! => use as container predicate only! + bool operator()(const S& lhs, const S& rhs) const { using namespace zen; return CmpNaturalSort()(strBegin(lhs), strLength(lhs), strBegin(rhs), strLength(rhs)) < 0; } }; +template +S makeUpperCopy(S str); + + +template inline +bool equalFilePath(const S& lhs, const T& rhs) { using namespace zen; return strEqual(lhs, rhs, CmpFilePath()); } + + inline Zstring appendSeparator(Zstring path) //support rvalue references! { @@ -63,35 +67,6 @@ Zstring getFileExtension(const Zstring& filePath) } -template inline -bool ciEqual(const S& lhs, const T& rhs) { using namespace zen; return cmpStringNoCase(strBegin(lhs), strLength(lhs), strBegin(rhs), strLength(rhs)) == 0; } - - -template inline -bool ciStartsWith(const S& str, const T& prefix) -{ - using namespace zen; - const size_t pfLen = strLength(prefix); - if (strLength(str) < pfLen) - return false; - - return cmpStringNoCase(strBegin(str), pfLen, strBegin(prefix), pfLen) == 0; -} - - -template inline -bool ciEndsWith(const S& str, const T& postfix) -{ - using namespace zen; - const size_t strLen = strLength(str); - const size_t pfLen = strLength(postfix); - if (strLen < pfLen) - return false; - - return cmpStringNoCase(strBegin(str) + strLen - pfLen, pfLen, strBegin(postfix), pfLen) == 0; -} - - template S ciReplaceCpy(const S& str, const T& oldTerm, const U& newTerm); @@ -110,37 +85,11 @@ inline void makeUpperInPlace(char* str, size_t strLen) { std::for_each(str, str + strLen, [](char& c) { c = std::toupper(static_cast(c)); }); //locale-dependent! - //result of toupper() is an unsigned char mapped to int range, so the char representation is in the last 8 bits and we need not care about signedness! + //result of toupper() is an unsigned char mapped to int range: the char representation is in the last 8 bits and we need not care about signedness! //this should work for UTF-8, too: all chars >= 128 are mapped upon themselves! } -inline -int cmpStringNoCase(const wchar_t* lhs, size_t lhsLen, const wchar_t* rhs, size_t rhsLen) -{ - assert(std::find(lhs, lhs + lhsLen, 0) == lhs + lhsLen); //don't expect embedded nulls! - assert(std::find(rhs, rhs + rhsLen, 0) == rhs + rhsLen); // - - const int rv = ::wcsncasecmp(lhs, rhs, std::min(lhsLen, rhsLen)); //locale-dependent! - if (rv != 0) - return rv; - return static_cast(lhsLen) - static_cast(rhsLen); -} - - -inline -int cmpStringNoCase(const char* lhs, size_t lhsLen, const char* rhs, size_t rhsLen) -{ - assert(std::find(lhs, lhs + lhsLen, 0) == lhs + lhsLen); //don't expect embedded nulls! - assert(std::find(rhs, rhs + rhsLen, 0) == rhs + rhsLen); // - - const int rv = ::strncasecmp(lhs, rhs, std::min(lhsLen, rhsLen)); //locale-dependent! - if (rv != 0) - return rv; - return static_cast(lhsLen) - static_cast(rhsLen); -} - - template inline S makeUpperCopy(S str) { @@ -153,20 +102,7 @@ S makeUpperCopy(S str) inline -int cmpFilePath(const wchar_t* lhs, size_t lhsLen, const wchar_t* rhs, size_t rhsLen) -{ - assert(std::find(lhs, lhs + lhsLen, 0) == lhs + lhsLen); //don't expect embedded nulls! - assert(std::find(rhs, rhs + rhsLen, 0) == rhs + rhsLen); // - - const int rv = std::wcsncmp(lhs, rhs, std::min(lhsLen, rhsLen)); - if (rv != 0) - return rv; - return static_cast(lhsLen) - static_cast(rhsLen); -} - - -inline -int cmpFilePath(const char* lhs, size_t lhsLen, const char* rhs, size_t rhsLen) +int CmpFilePath::operator()(const Zchar* lhs, size_t lhsLen, const Zchar* rhs, size_t rhsLen) const { assert(std::find(lhs, lhs + lhsLen, 0) == lhs + lhsLen); //don't expect embedded nulls! assert(std::find(rhs, rhs + rhsLen, 0) == rhs + rhsLen); // @@ -214,6 +150,7 @@ S ciReplaceCpy(const S& str, const T& oldTerm, const U& newTerm) } } + int cmpStringNaturalLinux(const char* lhs, size_t lhsLen, const char* rhs, size_t rhsLen); //--------------------------------------------------------------------------- //ZEN macro consistency checks: -- cgit