diff options
Diffstat (limited to 'zen')
-rw-r--r-- | zen/build_info.h | 1 | ||||
-rw-r--r-- | zen/file_access.cpp | 17 | ||||
-rw-r--r-- | zen/file_access.h | 14 | ||||
-rw-r--r-- | zen/file_path.cpp | 7 | ||||
-rw-r--r-- | zen/file_path.h | 2 | ||||
-rw-r--r-- | zen/file_traverser.h | 4 | ||||
-rw-r--r-- | zen/format_unit.cpp | 27 | ||||
-rw-r--r-- | zen/process_exec.cpp | 3 | ||||
-rw-r--r-- | zen/resolve_path.cpp | 12 | ||||
-rw-r--r-- | zen/socket.h | 10 | ||||
-rw-r--r-- | zen/stl_tools.h | 37 | ||||
-rw-r--r-- | zen/string_base.h | 31 | ||||
-rw-r--r-- | zen/string_tools.h | 26 | ||||
-rw-r--r-- | zen/string_traits.h | 4 | ||||
-rw-r--r-- | zen/sys_info.cpp | 12 | ||||
-rw-r--r-- | zen/thread.h | 2 | ||||
-rw-r--r-- | zen/time.h | 44 | ||||
-rw-r--r-- | zen/utf.h | 160 | ||||
-rw-r--r-- | zen/zstring.cpp | 222 | ||||
-rw-r--r-- | zen/zstring.h | 26 |
20 files changed, 345 insertions, 316 deletions
diff --git a/zen/build_info.h b/zen/build_info.h index b06c1302..86ff303c 100644 --- a/zen/build_info.h +++ b/zen/build_info.h @@ -26,6 +26,7 @@ enum class BuildArch static_assert((BuildArch::program == BuildArch::bit32 ? 32 : 64) == sizeof(void*) * 8); +//harmonize with os_arch enum in update_checks table: constexpr const char* cpuArchName = BuildArch::program == BuildArch::bit32 ? "i686": "x86-64"; } diff --git a/zen/file_access.cpp b/zen/file_access.cpp index 6a62f671..2e119e87 100644 --- a/zen/file_access.cpp +++ b/zen/file_access.cpp @@ -70,7 +70,7 @@ std::optional<ItemType> zen::itemStillExists(const Zstring& itemPath) //throw Fi try { traverseFolder(*parentPath, - [&](const FileInfo& fi) { if (fi.itemName == itemName) throw ItemType::file; }, + [&](const FileInfo& fi) { if (fi.itemName == itemName) throw ItemType::file; }, //case-sensitive! itemPath must be normalized! [&](const FolderInfo& fi) { if (fi.itemName == itemName) throw ItemType::folder; }, [&](const SymlinkInfo& si) { if (si.itemName == itemName) throw ItemType::symlink; }, [](const std::wstring& errorMsg) { throw FileError(errorMsg); }); @@ -233,7 +233,6 @@ void zen::removeDirectoryPlainRecursion(const Zstring& dirPath) //throw FileErro namespace { - /* Usage overview: (avoid circular pattern!) moveAndRenameItem() --> moveAndRenameFileSub() @@ -319,18 +318,20 @@ void setWriteTimeNative(const Zstring& itemPath, const timespec& modTime, ProcSy => utimens: https://github.com/coreutils/gnulib/blob/master/lib/utimens.c touch: https://github.com/coreutils/coreutils/blob/master/src/touch.c => fdutimensat: https://github.com/coreutils/gnulib/blob/master/lib/fdutimensat.c */ - timespec newTimes[2] = {}; - newTimes[0].tv_sec = ::time(nullptr); //access time; don't use UTIME_NOW/UTIME_OMIT: more bugs! https://freefilesync.org/forum/viewtopic.php?t=1701 - newTimes[1] = modTime; //modification time + const timespec newTimes[2] + { + {.tv_sec = ::time(nullptr)}, //access time; don't use UTIME_NOW/UTIME_OMIT: more bugs! https://freefilesync.org/forum/viewtopic.php?t=1701 + modTime, + }; //test: even modTime == 0 is correctly applied (no NOOP!) test2: same behavior for "utime()" //hell knows why files on gvfs-mounted Samba shares fail to open(O_WRONLY) returning EOPNOTSUPP: //https://freefilesync.org/forum/viewtopic.php?t=2803 => utimensat() works (but not for gvfs SFTP) - if (::utimensat(AT_FDCWD, itemPath.c_str(), newTimes, procSl == ProcSymlink::direct ? AT_SYMLINK_NOFOLLOW : 0) == 0) + if (::utimensat(AT_FDCWD, itemPath.c_str(), newTimes, procSl == ProcSymlink::asLink ? AT_SYMLINK_NOFOLLOW : 0) == 0) return; try { - if (procSl == ProcSymlink::direct) + if (procSl == ProcSymlink::asLink) try { if (getItemType(itemPath) == ItemType::symlink) //throw FileError @@ -554,7 +555,7 @@ void zen::copySymlink(const Zstring& sourcePath, const Zstring& targetPath) //th if (::lstat(sourcePath.c_str(), &sourceInfo) != 0) THROW_LAST_FILE_ERROR(replaceCpy(_("Cannot read file attributes of %x."), L"%x", fmtPath(sourcePath)), "lstat"); - setWriteTimeNative(targetPath, sourceInfo.st_mtim, ProcSymlink::direct); //throw FileError + setWriteTimeNative(targetPath, sourceInfo.st_mtim, ProcSymlink::asLink); //throw FileError } diff --git a/zen/file_access.h b/zen/file_access.h index 17c47731..f6a02edc 100644 --- a/zen/file_access.h +++ b/zen/file_access.h @@ -29,12 +29,7 @@ using FileIndex = ino_t; using FileTimeNative = timespec; inline time_t nativeFileTimeToTimeT(const timespec& ft) { return ft.tv_sec; } //follow Windows Explorer and always round down! -inline timespec timetToNativeFileTime(time_t utcTime) -{ - timespec natTime = {}; - natTime.tv_sec = utcTime; - return natTime; -} +inline timespec timetToNativeFileTime(time_t utcTime) { return {.tv_sec = utcTime}; } enum class ItemType { @@ -44,15 +39,14 @@ enum class ItemType }; //(hopefully) fast: does not distinguish between error/not existing ItemType getItemType(const Zstring& itemPath); //throw FileError -//execute potentially SLOW folder traversal but distinguish error/not existing -// assumes: - base path still exists -// - all child item path parts must correspond to folder traversal +//execute potentially SLOW folder traversal but distinguish error/not existing: +// - all child item path parts must correspond to folder traversal // => we can conclude whether an item is *not* existing anymore by doing a *case-sensitive* name search => potentially SLOW! std::optional<ItemType> itemStillExists(const Zstring& itemPath); //throw FileError enum class ProcSymlink { - direct, + asLink, follow }; void setFileTime(const Zstring& filePath, time_t modTime, ProcSymlink procSl); //throw FileError diff --git a/zen/file_path.cpp b/zen/file_path.cpp index 716dd8de..f5c207f3 100644 --- a/zen/file_path.cpp +++ b/zen/file_path.cpp @@ -13,11 +13,12 @@ std::optional<PathComponents> zen::parsePathComponents(const Zstring& itemPath) { auto doParse = [&](int sepCountVolumeRoot, bool rootWithSep) -> std::optional<PathComponents> { + assert(sepCountVolumeRoot > 0); const Zstring itemPathPf = appendSeparator(itemPath); //simplify analysis of root without separator, e.g. \\server-name\share - int sepCount = 0; + for (auto it = itemPathPf.begin(); it != itemPathPf.end(); ++it) if (*it == FILE_NAME_SEPARATOR) - if (++sepCount == sepCountVolumeRoot) + if (--sepCountVolumeRoot == 0) { Zstring rootPath(itemPathPf.begin(), rootWithSep ? it + 1 : it); @@ -89,7 +90,7 @@ bool zen::isValidRelPath(const Zstring& relPath) if constexpr (FILE_NAME_SEPARATOR != Zstr('\\')) if (contains(relPath, Zstr('\\'))) return false; const Zchar doubleSep[] = {FILE_NAME_SEPARATOR, FILE_NAME_SEPARATOR, 0}; - return !startsWith(relPath, FILE_NAME_SEPARATOR)&& !endsWith(relPath, FILE_NAME_SEPARATOR)&& + return !startsWith(relPath, FILE_NAME_SEPARATOR) && !endsWith(relPath, FILE_NAME_SEPARATOR) && !contains(relPath, doubleSep); } diff --git a/zen/file_path.h b/zen/file_path.h index 4a85514b..85af251d 100644 --- a/zen/file_path.h +++ b/zen/file_path.h @@ -40,7 +40,7 @@ std::weak_ordering compareNativePath(const Zstring& lhs, const Zstring& rhs); inline bool equalNativePath(const Zstring& lhs, const Zstring& rhs) { return compareNativePath(lhs, rhs) == std::weak_ordering::equivalent; } -struct LessNativePath { bool operator()(const Zstring& lhs, const Zstring& rhs) const { return std::is_lt(compareNativePath(lhs, rhs)); } }; +struct LessNativePath { bool operator()(const Zstring& lhs, const Zstring& rhs) const { return compareNativePath(lhs, rhs) < 0; } }; //------------------------------------------------------------------------------------------ diff --git a/zen/file_traverser.h b/zen/file_traverser.h index cb7782d6..11c3eaa0 100644 --- a/zen/file_traverser.h +++ b/zen/file_traverser.h @@ -17,7 +17,7 @@ struct FileInfo Zstring itemName; Zstring fullPath; uint64_t fileSize = 0; //[bytes] - time_t modTime = 0; //number of seconds since Jan. 1st 1970 UTC + time_t modTime = 0; //number of seconds since Jan. 1st 1970 GMT }; struct FolderInfo @@ -30,7 +30,7 @@ struct SymlinkInfo { Zstring itemName; Zstring fullPath; - time_t modTime = 0; //number of seconds since Jan. 1st 1970 UTC + time_t modTime = 0; //number of seconds since Jan. 1st 1970 GMT }; //- non-recursive diff --git a/zen/format_unit.cpp b/zen/format_unit.cpp index 2aa6e094..8b3fccfe 100644 --- a/zen/format_unit.cpp +++ b/zen/format_unit.cpp @@ -168,12 +168,27 @@ std::wstring zen::formatNumber(int64_t n) std::wstring zen::formatUtcToLocalTime(time_t utcTime) { - auto errorMsg = [&] { return _("Error") + L" (time_t: " + numberTo<std::wstring>(utcTime) + L')'; }; + auto fmtFallback = [utcTime] //don't take "no" for an answer! + { + if (const TimeComp tc = getUtcTime(utcTime); + tc != TimeComp()) + { + wchar_t buf[128] = {}; //the only way to format abnormally large or invalid modTime: std::strftime() will fail! + if (const int rv = std::swprintf(buf, std::size(buf), L"%d-%02d-%02d %02d:%02d:%02d GMT", tc.year, tc.month, tc.day, tc.hour, tc.minute, tc.second); + 0 < rv && rv < std::ssize(buf)) + return std::wstring(buf, rv); + } + + return L"time_t = " + numberTo<std::wstring>(utcTime); + }; const TimeComp& loc = getLocalTime(utcTime); //returns TimeComp() on error - std::wstring dateString = utfTo<std::wstring>(formatTime(Zstr("%x %X"), loc)); - return !dateString.empty() ? dateString : errorMsg(); + /*const*/ std::wstring dateTimeFmt = utfTo<std::wstring>(formatTime(Zstr("%x %X"), loc)); + if (dateTimeFmt.empty()) + return fmtFallback(); + + return dateTimeFmt; } @@ -188,9 +203,9 @@ WeekDay impl::getFirstDayOfWeekImpl() //throw SysError const char* firstDay = ::nl_langinfo(_NL_TIME_FIRST_WEEKDAY); //[1-Sunday, 7-Saturday] ASSERT_SYSERROR(firstDay && 1 <= *firstDay && *firstDay <= 7); - const int weekDayStartSunday = *firstDay; - const int weekDayStartMonday = (weekDayStartSunday - 1 + 6) % 7; //+6 == -1 in Z_7 - // [0-Monday, 6-Sunday] + const int weekDayStartSunday = *firstDay; //[1-Sunday, 7-Saturday] + const int weekDayStartMonday = (weekDayStartSunday - 2 + 7) % 7; //[0-Monday, 6-Sunday] 7 == 0 in Z_7 + return static_cast<WeekDay>(weekDayStartMonday); } diff --git a/zen/process_exec.cpp b/zen/process_exec.cpp index 6b670508..df41a627 100644 --- a/zen/process_exec.cpp +++ b/zen/process_exec.cpp @@ -176,8 +176,7 @@ std::pair<int /*exit code*/, std::string> processExecuteImpl(const Zstring& file const auto waitTimeMs = std::chrono::duration_cast<std::chrono::milliseconds>(endTime - now).count(); - timeval tv = {}; - tv.tv_sec = static_cast<long>(waitTimeMs / 1000); + timeval tv{.tv_sec = static_cast<long>(waitTimeMs / 1000)}; tv.tv_usec = static_cast<long>(waitTimeMs - tv.tv_sec * 1000) * 1000; fd_set rfd = {}; //includes FD_ZERO diff --git a/zen/resolve_path.cpp b/zen/resolve_path.cpp index 357dab6a..99e2f6c6 100644 --- a/zen/resolve_path.cpp +++ b/zen/resolve_path.cpp @@ -9,7 +9,7 @@ #include "thread.h" #include "file_access.h" -#include <zen/sys_info.h> + #include <zen/sys_info.h> // #include <stdlib.h> //getenv() #include <unistd.h> //getuid() #include <pwd.h> //getpwuid_r() @@ -63,16 +63,16 @@ Zstring resolveRelativePath(const Zstring& relativePath) https://www.gnu.org/software/bash/manual/html_node/Tilde-Expansion.html */ if (startsWith(pathTmp, "~/") || pathTmp == "~") { - try - { - const Zstring& homePath = getUserHome(); //throw FileError + try + { + const Zstring& homePath = getUserHome(); //throw FileError if (startsWith(pathTmp, "~/")) pathTmp = appendPath(homePath, pathTmp.c_str() + 2); else //pathTmp == "~" pathTmp = homePath; - } - catch (FileError&) {} + } + catch (FileError&) {} //else: error! no further processing! } else diff --git a/zen/socket.h b/zen/socket.h index 5ece29f8..d9517bd8 100644 --- a/zen/socket.h +++ b/zen/socket.h @@ -33,11 +33,13 @@ class Socket //throw SysError public: Socket(const Zstring& server, const Zstring& serviceName) //throw SysError { - ::addrinfo hints = {}; - hints.ai_socktype = SOCK_STREAM; //we *do* care about this one! - hints.ai_flags = AI_ADDRCONFIG; //save a AAAA lookup on machines that can't use the returned data anyhow + const addrinfo hints + { + .ai_flags = AI_ADDRCONFIG, //save a AAAA lookup on machines that can't use the returned data anyhow + .ai_socktype = SOCK_STREAM, //we *do* care about this one! + }; - ::addrinfo* servinfo = nullptr; + addrinfo* servinfo = nullptr; ZEN_ON_SCOPE_EXIT(if (servinfo) ::freeaddrinfo(servinfo)); const int rcGai = ::getaddrinfo(server.c_str(), serviceName.c_str(), &hints, &servinfo); diff --git a/zen/stl_tools.h b/zen/stl_tools.h index 2726a09d..66af8551 100644 --- a/zen/stl_tools.h +++ b/zen/stl_tools.h @@ -68,10 +68,10 @@ template <class Iterator, class T, class CompLess> Iterator binarySearch(Iterator first, Iterator last, const T& value, CompLess less); //read-only variant of std::merge; input: two sorted ranges -template <class Iterator, class FunctionLeftOnly, class FunctionBoth, class FunctionRightOnly> +template <class Iterator, class FunctionLeftOnly, class FunctionBoth, class FunctionRightOnly, class Compare> void mergeTraversal(Iterator first1, Iterator last1, Iterator first2, Iterator last2, - FunctionLeftOnly lo, FunctionBoth bo, FunctionRightOnly ro); + FunctionLeftOnly lo, FunctionBoth bo, FunctionRightOnly ro, Compare compare); //why, oh why is there no std::optional<T>::get()??? template <class T> inline T* get( std::optional<T>& opt) { return opt ? &*opt : nullptr; } @@ -255,31 +255,32 @@ BidirectionalIterator1 searchLast(const BidirectionalIterator1 first1, Bid //--------------------------------------------------------------------------------------- //read-only variant of std::merge; input: two sorted ranges -template <class Iterator, class FunctionLeftOnly, class FunctionBoth, class FunctionRightOnly> inline -void mergeTraversal(Iterator first1, Iterator last1, - Iterator first2, Iterator last2, - FunctionLeftOnly lo, FunctionBoth bo, FunctionRightOnly ro) +template <class Iterator, class FunctionLeftOnly, class FunctionBoth, class FunctionRightOnly, class Compare> inline +void mergeTraversal(Iterator firstL, Iterator lastL, + Iterator firstR, Iterator lastR, + FunctionLeftOnly lo, FunctionBoth bo, FunctionRightOnly ro, Compare compare) { - auto itL = first1; - auto itR = first2; + auto itL = firstL; + auto itR = firstR; - auto finishLeft = [&] { std::for_each(itL, last1, lo); }; - auto finishRight = [&] { std::for_each(itR, last2, ro); }; + auto finishLeft = [&] { std::for_each(itL, lastL, lo); }; + auto finishRight = [&] { std::for_each(itR, lastR, ro); }; - if (itL == last1) return finishRight(); - if (itR == last2) return finishLeft (); + if (itL == lastL) return finishRight(); + if (itR == lastR) return finishLeft (); for (;;) - if (itL->first < itR->first) + if (const std::weak_ordering cmp = compare(*itL, *itR); + cmp < 0) { lo(*itL); - if (++itL == last1) + if (++itL == lastL) return finishRight(); } - else if (itR->first < itL->first) + else if (cmp > 0) { ro(*itR); - if (++itR == last2) + if (++itR == lastR) return finishLeft(); } else @@ -287,8 +288,8 @@ void mergeTraversal(Iterator first1, Iterator last1, bo(*itL, *itR); ++itL; // ++itR; //increment BOTH before checking for end of range! - if (itL == last1) return finishRight(); - if (itR == last2) return finishLeft (); + if (itL == lastL) return finishRight(); + if (itR == lastR) return finishLeft (); //simplify loop by placing both EOB checks at the beginning? => slightly slower } } diff --git a/zen/string_base.h b/zen/string_base.h index ace870b9..e18a0f16 100644 --- a/zen/string_base.h +++ b/zen/string_base.h @@ -312,9 +312,10 @@ template <class Char, template <class> class SP> bool operator==(const Zb template <class Char, template <class> class SP> bool operator==(const Zbase<Char, SP>& lhs, const Char* rhs); template <class Char, template <class> class SP> inline bool operator==(const Char* lhs, const Zbase<Char, SP>& rhs) { return operator==(rhs, lhs); } -template <class Char, template <class> class SP> std::strong_ordering operator<=>(const Zbase<Char, SP>& lhs, const Zbase<Char, SP>& rhs); -template <class Char, template <class> class SP> std::strong_ordering operator<=>(const Zbase<Char, SP>& lhs, const Char* rhs); -template <class Char, template <class> class SP> std::strong_ordering operator<=>(const Char* lhs, const Zbase<Char, SP>& rhs); +//follow convention + compare by unsigned char; alternative: std::lexicographical_compare_three_way + reinterpret_cast<const std::make_unsigned_t<Char>*>() +template <class Char, template <class> class SP> std::strong_ordering operator<=>(const Zbase<Char, SP>& lhs, const Zbase<Char, SP>& rhs) { return compareString(lhs, rhs); } +template <class Char, template <class> class SP> std::strong_ordering operator<=>(const Zbase<Char, SP>& lhs, const Char* rhs) { return compareString(lhs, rhs); } +template <class Char, template <class> class SP> std::strong_ordering operator<=>(const Char* lhs, const Zbase<Char, SP>& rhs) { return compareString(lhs, rhs); } template <class Char, template <class> class SP> inline Zbase<Char, SP> operator+(const Zbase<Char, SP>& lhs, const Zbase<Char, SP>& rhs) { return Zbase<Char, SP>(lhs) += rhs; } template <class Char, template <class> class SP> inline Zbase<Char, SP> operator+(const Zbase<Char, SP>& lhs, const Char* rhs) { return Zbase<Char, SP>(lhs) += rhs; } @@ -495,30 +496,6 @@ bool operator==(const Zbase<Char, SP>& lhs, const Char* rhs) template <class Char, template <class> class SP> inline -std::strong_ordering operator<=>(const Zbase<Char, SP>& lhs, const Zbase<Char, SP>& rhs) -{ - return std::lexicographical_compare_three_way(lhs.begin(), lhs.end(), //respect embedded 0 - rhs.begin(), rhs.end()); // -} - - -template <class Char, template <class> class SP> inline -std::strong_ordering operator<=>(const Zbase<Char, SP>& lhs, const Char* rhs) -{ - return std::lexicographical_compare_three_way(lhs.begin(), lhs.end(), //respect embedded 0 - rhs, rhs + strLength(rhs)); -} - - -template <class Char, template <class> class SP> inline -std::strong_ordering operator<=>(const Char* lhs, const Zbase<Char, SP>& rhs) -{ - return std::lexicographical_compare_three_way(lhs, lhs + strLength(lhs), - rhs.begin(), rhs.end()); //respect embedded 0 -} - - -template <class Char, template <class> class SP> inline size_t Zbase<Char, SP>::length() const { return SP<Char>::length(rawStr_); diff --git a/zen/string_tools.h b/zen/string_tools.h index d3f35ce8..cafff3d5 100644 --- a/zen/string_tools.h +++ b/zen/string_tools.h @@ -41,7 +41,7 @@ template <class S, class T> bool endsWithAsciiNoCase(const S& str, const T& post template <class S, class T> bool equalString (const S& lhs, const T& rhs); template <class S, class T> bool equalAsciiNoCase(const S& lhs, const T& rhs); -//template <class S, class T> std::strong_ordering compareString(const S& lhs, const T& rhs); +template <class S, class T> std::strong_ordering compareString(const S& lhs, const T& rhs); template <class S, class T> std::weak_ordering compareAsciiNoCase(const S& lhs, const T& rhs); //basic case-insensitive comparison (considering A-Z only!) //STL container predicates for std::map, std::unordered_set/map @@ -269,10 +269,12 @@ bool equalAsciiNoCase(const S& lhs, const T& rhs) } -#if 0 -//support embedded 0, unlike strncmp/wcsncmp: +namespace impl +{ +//support embedded 0 (unlike strncmp/wcsncmp) + compare unsigned[!] char inline std::strong_ordering strcmpWithNulls(const char* ptr1, const char* ptr2, size_t num) { return std:: memcmp(ptr1, ptr2, num) <=> 0; } inline std::strong_ordering strcmpWithNulls(const wchar_t* ptr1, const wchar_t* ptr2, size_t num) { return std::wmemcmp(ptr1, ptr2, num) <=> 0; } +} template <class S, class T> inline std::strong_ordering compareString(const S& lhs, const T& rhs) @@ -280,13 +282,12 @@ std::strong_ordering compareString(const S& lhs, const T& rhs) const size_t lhsLen = strLength(lhs); const size_t rhsLen = strLength(rhs); - //length check *after* strcmpWithNulls(): we DO care about natural ordering: e.g. for "compareString(getUpperCase(lhs), getUpperCase(rhs))" + //length check *after* strcmpWithNulls(): we DO care about natural ordering if (const std::strong_ordering cmp = impl::strcmpWithNulls(strBegin(lhs), strBegin(rhs), std::min(lhsLen, rhsLen)); cmp != std::strong_ordering::equal) return cmp; return lhsLen <=> rhsLen; } -#endif template <class S, class T> inline @@ -587,7 +588,7 @@ struct CopyStringToString T copy(const S& src) const { static_assert(!std::is_same_v<std::decay_t<S>, std::decay_t<T>>); - return T(strBegin(src), strLength(src)); + return {strBegin(src), strLength(src)}; } }; @@ -626,11 +627,10 @@ S printNumber(const T& format, const Num& number) //format a single number using #endif static_assert(std::is_same_v<GetCharTypeT<S>, GetCharTypeT<T>>); - const int BUFFER_SIZE = 128; - GetCharTypeT<S> buffer[BUFFER_SIZE]; //zero-initialize? - const int charsWritten = impl::saferPrintf(buffer, BUFFER_SIZE, strBegin(format), number); + GetCharTypeT<S> buf[128]; //zero-initialize? + const int charsWritten = impl::saferPrintf(buf, std::size(buf), strBegin(format), number); - return 0 < charsWritten && charsWritten < BUFFER_SIZE ? S(buffer, charsWritten) : S(); + return 0 < charsWritten && charsWritten < std::ssize(buf) ? S(buf, charsWritten) : S(); } @@ -944,7 +944,7 @@ Num hashString(const S& str) struct StringHash { - using is_transparent = int; //allow heterogenous lookup! + using is_transparent = int; //enable heterogenous lookup! template <class String> size_t operator()(const String& str) const { return hashString<size_t>(str); } @@ -953,7 +953,7 @@ struct StringHash struct StringEqual { - using is_transparent = int; //allow heterogenous lookup! + using is_transparent = int; //enable heterogenous lookup! template <class String1, class String2> bool operator()(const String1& lhs, const String2& rhs) const { return equalString(lhs, rhs); } @@ -963,7 +963,7 @@ struct StringEqual struct LessAsciiNoCase { template <class String> - bool operator()(const String& lhs, const String& rhs) const { return std::is_lt(compareAsciiNoCase(lhs, rhs)); } + bool operator()(const String& lhs, const String& rhs) const { return compareAsciiNoCase(lhs, rhs) < 0; } }; diff --git a/zen/string_traits.h b/zen/string_traits.h index 1a4f4740..31c8c12c 100644 --- a/zen/string_traits.h +++ b/zen/string_traits.h @@ -105,8 +105,8 @@ class StringTraits public: enum { - isStringClass = hasMemberType_value_type<CleanType> && - hasMember_c_str <CleanType> && + isStringClass = hasMemberType_value_type<CleanType>&& + hasMember_c_str <CleanType>&& hasMember_length <CleanType> }; diff --git a/zen/sys_info.cpp b/zen/sys_info.cpp index bc1bfe62..c57464bc 100644 --- a/zen/sys_info.cpp +++ b/zen/sys_info.cpp @@ -111,16 +111,20 @@ ComputerModel zen::getComputerModel() //throw FileError { auto tryGetInfo = [](const Zstring& filePath) { - if (!fileAvailable(filePath)) - return std::wstring(); try { const std::string stream = getFileContent(filePath, nullptr /*notifyUnbufferedIO*/); //throw FileError return utfTo<std::wstring>(trimCpy(stream)); } - catch (const FileError& e) { throw SysError(replaceCpy(e.toString(), L"\n\n", L'\n')); } //errors should be further enriched by context info => SysError + catch (FileError&) + { + if (!itemStillExists(filePath)) //throw FileError + return std::wstring(); + + throw; + } }; - cm.model = tryGetInfo("/sys/devices/virtual/dmi/id/product_name"); //throw SysError + cm.model = tryGetInfo("/sys/devices/virtual/dmi/id/product_name"); //throw FileError cm.vendor = tryGetInfo("/sys/devices/virtual/dmi/id/sys_vendor"); // //clean up: diff --git a/zen/thread.h b/zen/thread.h index 42fba281..abdc6da0 100644 --- a/zen/thread.h +++ b/zen/thread.h @@ -445,7 +445,7 @@ private: activeCondition_ = cv; } - std::atomic<bool> stopRequested_{false}; //std:atomic is uninitialized by default!!! + std::atomic<bool> stopRequested_{false}; //std::atomic is uninitialized by default!!! //"The default constructor is trivial: no initialization takes place other than zero initialization of static and thread-local objects." std::condition_variable* activeCondition_ = nullptr; @@ -83,30 +83,32 @@ std::tm toClibTimeComponents(const TimeComp& tc) 0 <= tc.minute && tc.minute <= 59 && 0 <= tc.second && tc.second <= 61); - std::tm ctc = {}; - ctc.tm_year = tc.year - 1900; //years since 1900 - ctc.tm_mon = tc.month - 1; //0-11 - ctc.tm_mday = tc.day; //1-31 - ctc.tm_hour = tc.hour; //0-23 - ctc.tm_min = tc.minute; //0-59 - ctc.tm_sec = tc.second; //0-60 (including leap second) - ctc.tm_isdst = -1; //> 0 if DST is active, == 0 if DST is not active, < 0 if the information is not available - //ctc.tm_wday - //ctc.tm_yday - return ctc; + return + { + .tm_sec = tc.second, //0-60 (including leap second) + .tm_min = tc.minute, //0-59 + .tm_hour = tc.hour, //0-23 + .tm_mday = tc.day, //1-31 + .tm_mon = tc.month - 1, //0-11 + .tm_year = tc.year - 1900, //years since 1900 + .tm_isdst = -1, //> 0 if DST is active, == 0 if DST is not active, < 0 if the information is not available + //.tm_wday + //.tm_yday + }; } inline TimeComp toZenTimeComponents(const std::tm& ctc) { - TimeComp tc; - tc.year = ctc.tm_year + 1900; - tc.month = ctc.tm_mon + 1; - tc.day = ctc.tm_mday; - tc.hour = ctc.tm_hour; - tc.minute = ctc.tm_min; - tc.second = ctc.tm_sec; - return tc; + return + { + .year = ctc.tm_year + 1900, + .month = ctc.tm_mon + 1, + .day = ctc.tm_mday, + .hour = ctc.tm_hour, + .minute = ctc.tm_min, + .second = ctc.tm_sec, + }; } @@ -235,12 +237,12 @@ std::pair<time_t, bool /*success*/> localToTimeT(const TimeComp& tc) //convert l const int cycles400 = numeric::intDivFloor(ctc.tm_year + 1900 - 1971/*[!]*/, 400); //see utcToTimeT() //1971: ensures resulting time_t >= 0 after time zone, DST adaption, or std::mktime will fail on Windows! - ctc.tm_year -= 400 * cycles400; + ctc.tm_year -= 400 * cycles400; const time_t locTime = std::mktime(&ctc); if (locTime == -1) return {}; - + assert(locTime > 0); return {locTime + secsPer400Years * cycles400, true}; } @@ -7,8 +7,6 @@ #ifndef UTF_H_01832479146991573473545 #define UTF_H_01832479146991573473545 -//#include <cstdint> -//#include <iterator> #include "string_tools.h" //copyStringTo @@ -45,8 +43,8 @@ using CodePoint = uint32_t; using Char16 = uint16_t; using Char8 = uint8_t; -const CodePoint LEAD_SURROGATE = 0xd800; -const CodePoint TRAIL_SURROGATE = 0xdc00; //== LEAD_SURROGATE_MAX + 1 +const CodePoint LEAD_SURROGATE = 0xd800; //1101 1000 0000 0000 LEAD_SURROGATE_MAX = TRAIL_SURROGATE - 1 +const CodePoint TRAIL_SURROGATE = 0xdc00; //1101 1100 0000 0000 const CodePoint TRAIL_SURROGATE_MAX = 0xdfff; const CodePoint REPLACEMENT_CHAR = 0xfffd; @@ -62,31 +60,17 @@ void codePointToUtf16(CodePoint cp, Function writeOutput) //"writeOutput" is a u if (cp < LEAD_SURROGATE) writeOutput(static_cast<Char16>(cp)); else if (cp <= TRAIL_SURROGATE_MAX) //invalid code point - codePointToUtf16(REPLACEMENT_CHAR, writeOutput); //resolves to 1-character utf16 - else if (cp < 0x10000) + writeOutput(static_cast<Char16>(REPLACEMENT_CHAR)); + else if (cp <= 0xffff) writeOutput(static_cast<Char16>(cp)); else if (cp <= CODE_POINT_MAX) { cp -= 0x10000; writeOutput(static_cast<Char16>( LEAD_SURROGATE + (cp >> 10))); - writeOutput(static_cast<Char16>(TRAIL_SURROGATE + (cp & 0x3ff))); + writeOutput(static_cast<Char16>(TRAIL_SURROGATE + (cp & 0b11'1111'1111))); } else //invalid code point - codePointToUtf16(REPLACEMENT_CHAR, writeOutput); //resolves to 1-character utf16 -} - - -inline -size_t getUtf16Len(Char16 ch) //ch must be first code unit! returns 0 on error! -{ - if (ch < LEAD_SURROGATE) - return 1; - else if (ch < TRAIL_SURROGATE) - return 2; - else if (ch <= TRAIL_SURROGATE_MAX) - return 0; //unexpected trail surrogate! - else - return 1; + writeOutput(static_cast<Char16>(REPLACEMENT_CHAR)); } @@ -102,17 +86,14 @@ public: const Char16 ch = *it_++; CodePoint cp = ch; - switch (getUtf16Len(ch)) - { - case 0: //invalid utf16 character - cp = REPLACEMENT_CHAR; - break; - case 1: - break; - case 2: - decodeTrail(cp); - break; - } + + if (ch < LEAD_SURROGATE || ch > TRAIL_SURROGATE_MAX) //single Char16, no surrogates + ; + else if (ch < TRAIL_SURROGATE) //two Char16: lead and trail surrogates + decodeTrail(cp); //no range check needed: cp is inside [U+010000, U+10FFFF] by construction + else //unexpected trail surrogate + cp = REPLACEMENT_CHAR; + return cp; } @@ -141,46 +122,37 @@ private: template <class Function> inline void codePointToUtf8(CodePoint cp, Function writeOutput) //"writeOutput" is a unary function taking a Char8 { - //https://en.wikipedia.org/wiki/UTF-8 - //assert(cp < LEAD_SURROGATE || TRAIL_SURROGATE_MAX < cp); //code points [0xd800, 0xdfff] are reserved for UTF-16 and *should* not be encoded in UTF-8 + /* https://en.wikipedia.org/wiki/UTF-8 + "high and low surrogate halves used by UTF-16 (U+D800 through U+DFFF) and + code points not encodable by UTF-16 (those after U+10FFFF) [...] must be treated as an invalid byte sequence" */ - if (cp < 0x80) + if (cp <= 0b111'1111) writeOutput(static_cast<Char8>(cp)); - else if (cp < 0x800) + else if (cp <= 0b0111'1111'1111) { - writeOutput(static_cast<Char8>((cp >> 6 ) | 0xc0)); - writeOutput(static_cast<Char8>((cp & 0x3f) | 0x80)); + writeOutput(static_cast<Char8>((cp >> 6) | 0b1100'0000)); //110x xxxx + writeOutput(static_cast<Char8>((cp & 0b11'1111) | 0b1000'0000)); //10xx xxxx } - else if (cp < 0x10000) + else if (cp <= 0b1111'1111'1111'1111) { - writeOutput(static_cast<Char8>( (cp >> 12 ) | 0xe0)); - writeOutput(static_cast<Char8>(((cp >> 6) & 0x3f) | 0x80)); - writeOutput(static_cast<Char8>( (cp & 0x3f) | 0x80)); + if (LEAD_SURROGATE <= cp && cp <= TRAIL_SURROGATE_MAX) //[0xd800, 0xdfff] + codePointToUtf8(REPLACEMENT_CHAR, writeOutput); + else + { + writeOutput(static_cast<Char8>( (cp >> 12) | 0b1110'0000)); //1110 xxxx + writeOutput(static_cast<Char8>(((cp >> 6) & 0b11'1111) | 0b1000'0000)); //10xx xxxx + writeOutput(static_cast<Char8>( (cp & 0b11'1111) | 0b1000'0000)); //10xx xxxx + } } else if (cp <= CODE_POINT_MAX) { - writeOutput(static_cast<Char8>( (cp >> 18 ) | 0xf0)); - writeOutput(static_cast<Char8>(((cp >> 12) & 0x3f) | 0x80)); - writeOutput(static_cast<Char8>(((cp >> 6) & 0x3f) | 0x80)); - writeOutput(static_cast<Char8>( (cp & 0x3f) | 0x80)); + writeOutput(static_cast<Char8>( (cp >> 18) | 0b1111'0000)); //1111 0xxx + writeOutput(static_cast<Char8>(((cp >> 12) & 0b11'1111) | 0b1000'0000)); //10xx xxxx + writeOutput(static_cast<Char8>(((cp >> 6) & 0b11'1111) | 0b1000'0000)); //10xx xxxx + writeOutput(static_cast<Char8>( (cp & 0b11'1111) | 0b1000'0000)); //10xx xxxx } else //invalid code point - codePointToUtf8(REPLACEMENT_CHAR, writeOutput); //resolves to 3-byte utf8 -} - - -inline -size_t getUtf8Len(Char8 ch) //ch must be first code unit! returns 0 on error! -{ - if (ch < 0x80) - return 1; - if (ch >> 5 == 0x6) - return 2; - if (ch >> 4 == 0xe) - return 3; - if (ch >> 3 == 0x1e) - return 4; - return 0; //invalid begin of UTF8 encoding + codePointToUtf8(REPLACEMENT_CHAR, writeOutput); //resolves to 3-byte UTF8 } @@ -196,30 +168,34 @@ public: const Char8 ch = *it_++; CodePoint cp = ch; - switch (getUtf8Len(ch)) + + if (ch < 0x80) //1 byte + ; + else if (ch >> 5 == 0b110) //2 bytes { - case 0: //invalid utf8 character - cp = REPLACEMENT_CHAR; - break; - case 1: - break; - case 2: - cp &= 0x1f; - decodeTrail(cp); - break; - case 3: - cp &= 0xf; - if (decodeTrail(cp)) - decodeTrail(cp); - break; - case 4: - cp &= 0x7; - if (decodeTrail(cp)) - if (decodeTrail(cp)) - decodeTrail(cp); - if (cp > CODE_POINT_MAX) cp = REPLACEMENT_CHAR; - break; + cp &= 0b1'1111; + if (decodeTrail(cp)) + if (cp <= 0b111'1111) //overlong encoding: "correct encoding of a code point uses only the minimum number of bytes required" + cp = REPLACEMENT_CHAR; } + else if (ch >> 4 == 0b1110) //3 bytes + { + cp &= 0b1111; + if (decodeTrail(cp) && decodeTrail(cp)) + if (cp <= 0b0111'1111'1111 || + (LEAD_SURROGATE <= cp && cp <= TRAIL_SURROGATE_MAX)) //[0xd800, 0xdfff] are invalid code points + cp = REPLACEMENT_CHAR; + } + else if (ch >> 3 == 0b11110) //4 bytes + { + cp &= 0b111; + if (decodeTrail(cp) && decodeTrail(cp) && decodeTrail(cp)) + if (cp <= 0b1111'1111'1111'1111 || cp > CODE_POINT_MAX) + cp = REPLACEMENT_CHAR; + } + else //invalid begin of UTF8 encoding + cp = REPLACEMENT_CHAR; + return cp; } @@ -229,9 +205,9 @@ private: if (it_ != last_) //trail surrogate expected! { const Char8 ch = *it_; - if (ch >> 6 == 0x2) //trail surrogate expected! + if (ch >> 6 == 0b10) //trail surrogate expected! { - cp = (cp << 6) + (ch & 0x3f); + cp = (cp << 6) + (ch & 0b11'1111); ++it_; return true; } @@ -337,7 +313,9 @@ UtfString getUnicodeSubstring(const UtfString& str, size_t uniPosFirst, size_t u assert(uniPosFirst <= uniPosLast && uniPosLast <= unicodeLength(str)); using namespace impl; using CharType = GetCharTypeT<UtfString>; + UtfString output; + assert(uniPosFirst <= uniPosLast); if (uniPosFirst >= uniPosLast) //optimize for empty range return output; @@ -357,6 +335,10 @@ UtfString getUnicodeSubstring(const UtfString& str, size_t uniPosFirst, size_t u namespace impl { template <class TargetString, class SourceString> inline +TargetString utfTo(const SourceString& str, std::true_type) { return copyStringTo<TargetString>(str); } + + +template <class TargetString, class SourceString> inline TargetString utfTo(const SourceString& str, std::false_type) { using CharSrc = GetCharTypeT<SourceString>; @@ -371,10 +353,6 @@ TargetString utfTo(const SourceString& str, std::false_type) return output; } - - -template <class TargetString, class SourceString> inline -TargetString utfTo(const SourceString& str, std::true_type) { return copyStringTo<TargetString>(str); } } diff --git a/zen/zstring.cpp b/zen/zstring.cpp index 76c0a81f..1e29e461 100644 --- a/zen/zstring.cpp +++ b/zen/zstring.cpp @@ -11,46 +11,44 @@ using namespace zen; -Zstring getUnicodeNormalForm(const Zstring& str) +Zstring getUnicodeNormalFormNonAscii(const Zstring& str) { - //fast pre-check: - if (isAsciiString(str)) //perf: in the range of 3.5ns - return str; - static_assert(std::is_same_v<decltype(str), const Zbase<Zchar>&>, "god bless our ref-counting! => save output string memory consumption!"); - //Example: const char* decomposed = "\x6f\xcc\x81"; // const char* precomposed = "\xc3\xb3"; + assert(!isAsciiString(str)); + assert(str.find(Zchar('\0')) == Zstring::npos); //don't expect embedded nulls! + try { gchar* outStr = ::g_utf8_normalize(str.c_str(), str.length(), G_NORMALIZE_DEFAULT_COMPOSE); if (!outStr) - throw SysError(formatSystemError("g_utf8_normalize(" + utfTo<std::string>(str) + ')', L"", L"Conversion failed.")); + throw SysError(formatSystemError("g_utf8_normalize", L"", L"Conversion failed.")); ZEN_ON_SCOPE_EXIT(::g_free(outStr)); return outStr; } - catch ([[maybe_unused]] const SysError& e) + catch (const SysError& e) { - assert(false); - return str; + throw std::runtime_error(std::string(__FILE__) + '[' + numberTo<std::string>(__LINE__) + "] Error normalizing string:" + + '\n' + utfTo<std::string>(str) + "\n\n" + utfTo<std::string>(e.toString())); } } -Zstring getUpperCase(const Zstring& str) +Zstring getUnicodeNormalForm(const Zstring& str) { - assert(str.find(Zchar('\0')) == Zstring::npos); //don't expect embedded nulls! - //fast pre-check: if (isAsciiString(str)) //perf: in the range of 3.5ns - { - Zstring output = str; - for (Zchar& c : output) - c = asciiToUpper(c); - return output; - } + return str; + static_assert(std::is_same_v<decltype(str), const Zbase<Zchar>&>, "god bless our ref-counting! => save output string memory consumption!"); - Zstring strNorm = getUnicodeNormalForm(str); + return getUnicodeNormalFormNonAscii(str); +} + + +Zstring getUpperCaseNonAscii(const Zstring& str) +{ + Zstring strNorm = getUnicodeNormalFormNonAscii(str); try { static_assert(sizeof(impl::CodePoint) == sizeof(gunichar)); @@ -64,11 +62,26 @@ Zstring getUpperCase(const Zstring& str) return output; } - catch (SysError&) + catch (const SysError& e) { - assert(false); - return str; + throw std::runtime_error(std::string(__FILE__) + '[' + numberTo<std::string>(__LINE__) + "] Error converting string to upper case:" + + '\n' + utfTo<std::string>(str) + "\n\n" + utfTo<std::string>(e.toString())); + } +} + + +Zstring getUpperCase(const Zstring& str) +{ + if (isAsciiString(str)) //fast path: in the range of 3.5ns + { + Zstring output = str; + for (Zchar& c : output) //identical to LCMapStringEx(), g_unichar_toupper(), CFStringUppercase() [verified!] + c = asciiToUpper(c); // + return output; } + //else: slow path -------------------------------------- + + return getUpperCaseNonAscii(str); } @@ -91,10 +104,10 @@ std::weak_ordering compareNoCaseUtf8(const char* lhs, size_t lhsLen, const char* static_assert(sizeof(gunichar) == sizeof(impl::CodePoint)); + //ordering: "to lower" converts to higher code points than "to upper" const gunichar charL = ::g_unichar_toupper(*cpL); //note: tolower can be ambiguous, so don't use: const gunichar charR = ::g_unichar_toupper(*cpR); //e.g. "Σ" (upper case) can be lower-case "ς" in the end of the word or "σ" in the middle. if (charL != charR) - //ordering: "to lower" converts to higher code points than "to upper" return makeUnsigned(charL) <=> makeUnsigned(charR); //unsigned char-comparison is the convention! } } @@ -107,78 +120,111 @@ std::weak_ordering compareNatural(const Zstring& lhs, const Zstring& rhs) Windows: CompareString() already ignores NFD/NFC differences: nice... Linux: g_unichar_toupper() can't ignore differences macOS: CFStringCompare() considers differences */ - - const Zstring& lhsNorm = getUnicodeNormalForm(lhs); - const Zstring& rhsNorm = getUnicodeNormalForm(rhs); - - const char* strL = lhsNorm.c_str(); - const char* strR = rhsNorm.c_str(); - - const char* const strEndL = strL + lhsNorm.size(); - const char* const strEndR = strR + rhsNorm.size(); - /* - compare strings after conceptually creating blocks of whitespace/numbers/text - - implement strict weak ordering! - - don't follow broken "strnatcasecmp": https://github.com/php/php-src/blob/master/ext/standard/strnatcmp.c - 1. incorrect non-ASCII CI-comparison - 2. incorrect bounds checks - 3. incorrect trimming of *all* whitespace - 4. arbitrary handling of leading 0 only at string begin - 5. incorrect handling of whitespace following a number - 6. code is a mess */ - for (;;) + try { - if (strL == strEndL || strR == strEndR) - return (strL != strEndL) <=> (strR != strEndR); //"nothing" before "something" - //note: "something" never would have been condensed to "nothing" further below => can finish evaluation here - - const bool wsL = isWhiteSpace(*strL); - const bool wsR = isWhiteSpace(*strR); - if (wsL != wsR) - return !wsL <=> !wsR; //whitespace before non-ws! - if (wsL) - { - ++strL, ++strR; - while (strL != strEndL && isWhiteSpace(*strL)) ++strL; - while (strR != strEndR && isWhiteSpace(*strR)) ++strR; - continue; - } - - const bool digitL = isDigit(*strL); - const bool digitR = isDigit(*strR); - if (digitL != digitR) - return !digitL <=> !digitR; //numbers before chars! - if (digitL) + const Zstring& lhsNorm = getUnicodeNormalForm(lhs); + const Zstring& rhsNorm = getUnicodeNormalForm(rhs); + + const char* strL = lhsNorm.c_str(); + const char* strR = rhsNorm.c_str(); + + const char* const strEndL = strL + lhsNorm.size(); + const char* const strEndR = strR + rhsNorm.size(); + /* - compare strings after conceptually creating blocks of whitespace/numbers/text + - implement strict weak ordering! + - don't follow broken "strnatcasecmp": https://github.com/php/php-src/blob/master/ext/standard/strnatcmp.c + 1. incorrect non-ASCII CI-comparison + 2. incorrect bounds checks + 3. incorrect trimming of *all* whitespace + 4. arbitrary handling of leading 0 only at string begin + 5. incorrect handling of whitespace following a number + 6. code is a mess */ + for (;;) { - while (strL != strEndL && *strL == '0') ++strL; - while (strR != strEndR && *strR == '0') ++strR; + if (strL == strEndL || strR == strEndR) + return (strL != strEndL) <=> (strR != strEndR); //"nothing" before "something" + //note: "something" never would have been condensed to "nothing" further below => can finish evaluation here + + const bool wsL = isWhiteSpace(*strL); + const bool wsR = isWhiteSpace(*strR); + if (wsL != wsR) + return !wsL <=> !wsR; //whitespace before non-ws! + if (wsL) + { + ++strL, ++strR; + while (strL != strEndL && isWhiteSpace(*strL)) ++strL; + while (strR != strEndR && isWhiteSpace(*strR)) ++strR; + continue; + } - int rv = 0; - for (;; ++strL, ++strR) + const bool digitL = isDigit(*strL); + const bool digitR = isDigit(*strR); + if (digitL != digitR) + return !digitL <=> !digitR; //numbers before chars! + if (digitL) { - const bool endL = strL == strEndL || !isDigit(*strL); - const bool endR = strR == strEndR || !isDigit(*strR); - if (endL != endR) - return !endL <=> !endR; //more digits means bigger number - if (endL) - break; //same number of digits - - if (rv == 0 && *strL != *strR) - rv = *strL - *strR; //found first digit difference comparing from left + while (strL != strEndL && *strL == '0') ++strL; + while (strR != strEndR && *strR == '0') ++strR; + + int rv = 0; + for (;; ++strL, ++strR) + { + const bool endL = strL == strEndL || !isDigit(*strL); + const bool endR = strR == strEndR || !isDigit(*strR); + if (endL != endR) + return !endL <=> !endR; //more digits means bigger number + if (endL) + break; //same number of digits + + if (rv == 0 && *strL != *strR) + rv = *strL - *strR; //found first digit difference comparing from left + } + if (rv != 0) + return rv <=> 0; + continue; } - if (rv != 0) - return rv <=> 0; - continue; + + //compare full junks of text: consider unicode encoding! + const char* textBeginL = strL++; + const char* textBeginR = strR++; //current char is neither white space nor digit at this point! + while (strL != strEndL && !isWhiteSpace(*strL) && !isDigit(*strL)) ++strL; + while (strR != strEndR && !isWhiteSpace(*strR) && !isDigit(*strR)) ++strR; + + if (const std::weak_ordering cmp = compareNoCaseUtf8(textBeginL, strL - textBeginL, textBeginR, strR - textBeginR); + cmp != std::weak_ordering::equivalent) + return cmp; } - //compare full junks of text: consider unicode encoding! - const char* textBeginL = strL++; - const char* textBeginR = strR++; //current char is neither white space nor digit at this point! - while (strL != strEndL && !isWhiteSpace(*strL) && !isDigit(*strL)) ++strL; - while (strR != strEndR && !isWhiteSpace(*strR) && !isDigit(*strR)) ++strR; + } + catch (const SysError& e) + { + throw std::runtime_error(std::string(__FILE__) + '[' + numberTo<std::string>(__LINE__) + "] Error comparing strings:" + '\n' + + utfTo<std::string>(lhs) + '\n' + utfTo<std::string>(rhs) + "\n\n" + utfTo<std::string>(e.toString())); + } +} + - if (const std::weak_ordering cmp = compareNoCaseUtf8(textBeginL, strL - textBeginL, textBeginR, strR - textBeginR); - cmp != std::weak_ordering::equivalent) - return cmp; +std::weak_ordering compareNoCase(const Zstring& lhs, const Zstring& rhs) +{ + //fast path: no need for extra memory allocations => ~ 6x speedup + const size_t minSize = std::min(lhs.size(), rhs.size()); + + size_t i = 0; + for (; i < minSize; ++i) + { + const Zchar l = lhs[i]; + const Zchar r = rhs[i]; + if (!isAsciiChar(l) || !isAsciiChar(r)) + goto slowPath; //=> let's NOT make assumptions how getUpperCase() compares "ASCII <=> non-ASCII" + + const Zchar lUp = asciiToUpper(l); // + const Zchar rUp = asciiToUpper(r); //no surprises: emulate getUpperCase() [verified!] + if (lUp != rUp) // + return lUp <=> rUp; // } + return lhs.size() <=> rhs.size(); +slowPath: //-------------------------------------- + return compareNoCaseUtf8(lhs.c_str() + i, lhs.size() - i, + rhs.c_str() + i, rhs.size() - i); } diff --git a/zen/zstring.h b/zen/zstring.h index bc7cfb06..70b9f448 100644 --- a/zen/zstring.h +++ b/zen/zstring.h @@ -39,7 +39,7 @@ Zstring getUnicodeNormalForm(const Zstring& str); Zstring getUpperCase(const Zstring& str); //------------------------------------------------------------------------------------------ -struct ZstringNorm //use as STL container key: avoid needless Unicode normalizations during std::map<>::find() +struct ZstringNorm //use as STL container key: better than repeated Unicode normalizations during std::map<>::find() { /*explicit*/ ZstringNorm(const Zstring& str) : normStr(getUnicodeNormalForm(str)) {} Zstring normStr; @@ -51,7 +51,7 @@ template<> struct std::hash<ZstringNorm> { size_t operator()(const ZstringNorm& //struct LessUnicodeNormal { bool operator()(const Zstring& lhs, const Zstring& rhs) const { return getUnicodeNormalForm(lhs) < getUnicodeNormalForm(rhs); } }; //------------------------------------------------------------------------------------------ -struct ZstringNoCase //use as STL container key: avoid needless upper-case conversions during std::map<>::find() +struct ZstringNoCase //use as STL container key: better than repeated upper-case conversions during std::map<>::find() { /*explicit*/ ZstringNoCase(const Zstring& str) : upperCase(getUpperCase(str)) {} Zstring upperCase; @@ -60,12 +60,18 @@ struct ZstringNoCase //use as STL container key: avoid needless upper-case conve }; template<> struct std::hash<ZstringNoCase> { size_t operator()(const ZstringNoCase& str) const { return std::hash<Zstring>()(str.upperCase); } }; -inline bool equalNoCase(const Zstring& lhs, const Zstring& rhs) { return getUpperCase(lhs) == getUpperCase(rhs); } + +std::weak_ordering compareNoCase(const Zstring& lhs, const Zstring& rhs); + +inline +bool equalNoCase(const Zstring& lhs, const Zstring& rhs) { return compareNoCase(lhs, rhs) == std::weak_ordering::equivalent; } +//note: the "lhs.size() != rhs.size()" short-cut would require two isAsciiString() checks +//=> generally SLOWER than starting comparison directly during first pass and breaking on first difference! //------------------------------------------------------------------------------------------ std::weak_ordering compareNatural(const Zstring& lhs, const Zstring& rhs); -struct LessNaturalSort { bool operator()(const Zstring& lhs, const Zstring& rhs) const { return std::is_lt(compareNatural(lhs, rhs)); } }; +struct LessNaturalSort { bool operator()(const Zstring& lhs, const Zstring& rhs) const { return compareNatural(lhs, rhs) < 0; } }; //------------------------------------------------------------------------------------------ @@ -73,16 +79,18 @@ struct LessNaturalSort { bool operator()(const Zstring& lhs, const Zstring& rhs) const wchar_t EN_DASH = L'\u2013'; const wchar_t EM_DASH = L'\u2014'; const wchar_t* const SPACED_DASH = L" \u2014 "; //using 'EM DASH' -const wchar_t LTR_MARK = L'\u200E'; //UTF-8: E2 80 8E const wchar_t* const ELLIPSIS = L"\u2026"; //"..." const wchar_t MULT_SIGN = L'\u00D7'; //fancy "x" //const wchar_t NOBREAK_SPACE = L'\u00A0'; const wchar_t ZERO_WIDTH_SPACE = L'\u200B'; +const wchar_t LTR_MARK = L'\u200E'; //UTF-8: E2 80 8E const wchar_t RTL_MARK = L'\u200F'; //UTF-8: E2 80 8F https://www.w3.org/International/questions/qa-bidi-unicode-controls -const wchar_t BIDI_DIR_ISOLATE_RTL = L'\u2067'; //UTF-8: E2 81 A7 => not working on Win 10 -const wchar_t BIDI_POP_DIR_ISOLATE = L'\u2069'; //UTF-8: E2 81 A9 => not working on Win 10 -const wchar_t BIDI_DIR_EMBEDDING_RTL = L'\u202B'; //UTF-8: E2 80 AB => not working on Win 10 -const wchar_t BIDI_POP_DIR_FORMATTING = L'\u202C'; //UTF-8: E2 80 AC => not working on Win 10 +//const wchar_t BIDI_DIR_ISOLATE_RTL = L'\u2067'; //=> not working on Win 10 +//const wchar_t BIDI_POP_DIR_ISOLATE = L'\u2069'; //=> not working on Win 10 +//const wchar_t BIDI_DIR_EMBEDDING_RTL = L'\u202B'; //=> not working on Win 10 +//const wchar_t BIDI_POP_DIR_FORMATTING = L'\u202C'; //=> not working on Win 10 + +const wchar_t* const TAB_SPACE = L" "; //4: the only sensible space count for tabs #endif //ZSTRING_H_73425873425789 |