diff options
Diffstat (limited to 'zen/zstring.cpp')
-rwxr-xr-x | zen/zstring.cpp | 184 |
1 files changed, 143 insertions, 41 deletions
diff --git a/zen/zstring.cpp b/zen/zstring.cpp index 8bf77a0b..68609030 100755 --- a/zen/zstring.cpp +++ b/zen/zstring.cpp @@ -8,9 +8,102 @@ #include <stdexcept> #include "utf.h" + #include <gtk/gtk.h> + #include "sys_error.h" using namespace zen; + +Zstring makeUpperCopy(const Zstring& str) +{ + //fast pre-check: + if (isAsciiString(str.c_str())) //perf: in the range of 3.5ns + { + Zstring output = str; + for (Zchar& c : output) c = asciiToUpper(c); + return output; + } + + Zstring strNorm = getUnicodeNormalForm(str); + try + { + static_assert(sizeof(impl::CodePoint) == sizeof(gunichar)); + Zstring output; + output.reserve(strNorm.size()); + + impl::UtfDecoder<char> decoder(strNorm.c_str(), strNorm.size()); + while (const std::optional<impl::CodePoint> cp = decoder.getNext()) + impl::codePointToUtf<char>(::g_unichar_toupper(*cp), [&](char c) { output += c; }); //don't use std::towupper: *incomplete* and locale-dependent! + + return output; + + } + catch (const SysError& e) + { + (void)e; + assert(false); + return str; + } +} + + +Zstring getUnicodeNormalForm(const Zstring& str) +{ + //fast pre-check: + if (isAsciiString(str.c_str())) //perf: in the range of 3.5ns + return str; //god bless our ref-counting! => save output string memory consumption! + + //Example: const char* decomposed = "\x6f\xcc\x81"; + // const char* precomposed = "\xc3\xb3"; + try + { + gchar* outStr = ::g_utf8_normalize (str.c_str(), str.length(), G_NORMALIZE_DEFAULT_COMPOSE); + if (!outStr) + throw SysError(L"g_utf8_normalize: conversion failed. (" + utfTo<std::wstring>(str) + L")"); + ZEN_ON_SCOPE_EXIT(::g_free(outStr)); + return outStr; + + } + catch (const SysError& e) + { + (void)e; + assert(false); + return str; + } +} + + +Zstring replaceCpyAsciiNoCase(const Zstring& str, const Zstring& oldTerm, const Zstring& newTerm) +{ + if (oldTerm.empty()) + return str; + + Zstring strU = str; + Zstring oldU = oldTerm; + + for (Zchar& c : strU) c = asciiToUpper(c); //can't use makeUpperCopy(): input/output sizes may differ! + for (Zchar& c : oldU) c = asciiToUpper(c); // + + Zstring output; + + for (size_t pos = 0;;) + { + const size_t posFound = strU.find(oldU, pos); + if (posFound == Zstring::npos) + { + if (pos == 0) //optimize "oldTerm not found": return ref-counted copy + return str; + output.append(str.begin() + pos, str.end()); + return output; + } + + output.append(str.begin() + pos, str.begin() + posFound); + output += newTerm; + pos = posFound + oldTerm.size(); + } +} + + /* MSDN "Handling Sorting in Your Applications": https://msdn.microsoft.com/en-us/library/windows/desktop/dd318144 @@ -33,8 +126,14 @@ OS X (UTF8 char) ________________________ time per call | function */ +int compareLocalPath(const Zstring& lhs, const Zstring& rhs) +{ + assert(lhs.find(Zchar('\0')) == Zstring::npos); //don't expect embedded nulls! + assert(rhs.find(Zchar('\0')) == Zstring::npos); // + return compareString(lhs, rhs); +} namespace @@ -43,7 +142,7 @@ int compareNoCaseUtf8(const char* lhs, size_t lhsLen, const char* rhs, size_t rh { //- strncasecmp implements ASCII CI-comparsion only! => signature is broken for UTF8-input; toupper() similarly doesn't support Unicode //- wcsncasecmp: https://opensource.apple.com/source/Libc/Libc-763.12/string/wcsncasecmp-fbsd.c - // => re-implement comparison based on towlower() to avoid memory allocations + // => re-implement comparison based on g_unichar_tolower() to avoid memory allocations impl::UtfDecoder<char> decL(lhs, lhsLen); impl::UtfDecoder<char> decR(rhs, rhsLen); @@ -54,23 +153,35 @@ int compareNoCaseUtf8(const char* lhs, size_t lhsLen, const char* rhs, size_t rh if (!cpL || !cpR) return static_cast<int>(!cpR) - static_cast<int>(!cpL); - //support unit-testing on Windows: CodePoint is truncated to wchar_t - static_assert(sizeof(wchar_t) == sizeof(impl::CodePoint)); + static_assert(sizeof(gunichar) == sizeof(impl::CodePoint)); - const wchar_t charL = ::towlower(static_cast<wchar_t>(*cpL)); //ordering: towlower() converts to higher code points than towupper() - const wchar_t charR = ::towlower(static_cast<wchar_t>(*cpR)); //uses LC_CTYPE category of current locale + const gunichar charL = ::g_unichar_toupper(*cpL); //note: tolower can be ambiguous, so don't use: + const gunichar charR = ::g_unichar_toupper(*cpR); //e.g. "Σ" (upper case) can be lower-case "ς" in the end of the word or "σ" in the middle. if (charL != charR) + //ordering: "to lower" converts to higher code points than "to upper" return static_cast<unsigned int>(charL) - static_cast<unsigned int>(charR); //unsigned char-comparison is the convention! //unsigned underflow is well-defined! } } + } -int cmpStringNaturalLinuxTest(const char* lhs, size_t lhsLen, const char* rhs, size_t rhsLen) +int compareNatural(const Zstring& lhs, const Zstring& rhs) { - const char* const lhsEnd = lhs + lhsLen; - const char* const rhsEnd = rhs + rhsLen; + //Unicode normal forms: + // Windows: CompareString() already ignores NFD/NFC differences: nice... + // Linux: g_unichar_toupper() can't ignore differences + // macOS: CFStringCompare() considers differences + + const Zstring& lhsNorm = getUnicodeNormalForm(lhs); + const Zstring& rhsNorm = getUnicodeNormalForm(rhs); + + const char* strL = lhsNorm.c_str(); + const char* strR = rhsNorm.c_str(); + + const char* const strEndL = strL + lhsNorm.size(); + const char* const strEndR = strR + rhsNorm.size(); /* - compare strings after conceptually creating blocks of whitespace/numbers/text - implement strict weak ordering! @@ -84,43 +195,43 @@ int cmpStringNaturalLinuxTest(const char* lhs, size_t lhsLen, const char* rhs, s */ for (;;) { - if (lhs == lhsEnd || rhs == rhsEnd) - return static_cast<int>(lhs != lhsEnd) - static_cast<int>(rhs != rhsEnd); //"nothing" before "something" + if (strL == strEndL || strR == strEndR) + return static_cast<int>(strL != strEndL) - static_cast<int>(strR != strEndR); //"nothing" before "something" //note: "something" never would have been condensed to "nothing" further below => can finish evaluation here - const bool wsL = isWhiteSpace(*lhs); - const bool wsR = isWhiteSpace(*rhs); + const bool wsL = isWhiteSpace(*strL); + const bool wsR = isWhiteSpace(*strR); if (wsL != wsR) return static_cast<int>(!wsL) - static_cast<int>(!wsR); //whitespace before non-ws! if (wsL) { - ++lhs, ++rhs; - while (lhs != lhsEnd && isWhiteSpace(*lhs)) ++lhs; - while (rhs != rhsEnd && isWhiteSpace(*rhs)) ++rhs; + ++strL, ++strR; + while (strL != strEndL && isWhiteSpace(*strL)) ++strL; + while (strR != strEndR && isWhiteSpace(*strR)) ++strR; continue; } - const bool digitL = isDigit(*lhs); - const bool digitR = isDigit(*rhs); + const bool digitL = isDigit(*strL); + const bool digitR = isDigit(*strR); if (digitL != digitR) return static_cast<int>(!digitL) - static_cast<int>(!digitR); //number before chars! if (digitL) { - while (lhs != lhsEnd && *lhs == '0') ++lhs; - while (rhs != rhsEnd && *rhs == '0') ++rhs; + while (strL != strEndL && *strL == '0') ++strL; + while (strR != strEndR && *strR == '0') ++strR; int rv = 0; - for (;; ++lhs, ++rhs) + for (;; ++strL, ++strR) { - const bool endL = lhs == lhsEnd || !isDigit(*lhs); - const bool endR = rhs == rhsEnd || !isDigit(*rhs); + const bool endL = strL == strEndL || !isDigit(*strL); + const bool endR = strR == strEndR || !isDigit(*strR); if (endL != endR) return static_cast<int>(!endL) - static_cast<int>(!endR); //more digits means bigger number if (endL) break; //same number of digits - if (rv == 0 && *lhs != *rhs) - rv = *lhs - *rhs; //found first digit difference comparing from left + if (rv == 0 && *strL != *strR) + rv = *strL - *strR; //found first digit difference comparing from left } if (rv != 0) return rv; @@ -128,28 +239,19 @@ int cmpStringNaturalLinuxTest(const char* lhs, size_t lhsLen, const char* rhs, s } //compare full junks of text: consider unicode encoding! - const char* textBeginL = lhs++; - const char* textBeginR = rhs++; //current char is neither white space nor digit at this point! - while (lhs != lhsEnd && !isWhiteSpace(*lhs) && !isDigit(*lhs)) ++lhs; - while (rhs != rhsEnd && !isWhiteSpace(*rhs) && !isDigit(*rhs)) ++rhs; + const char* textBeginL = strL++; + const char* textBeginR = strR++; //current char is neither white space nor digit at this point! + while (strL != strEndL && !isWhiteSpace(*strL) && !isDigit(*strL)) ++strL; + while (strR != strEndR && !isWhiteSpace(*strR) && !isDigit(*strR)) ++strR; - const int rv = compareNoCaseUtf8(textBeginL, lhs - textBeginL, textBeginR, rhs - textBeginR); + const int rv = compareNoCaseUtf8(textBeginL, strL - textBeginL, textBeginR, strR - textBeginR); if (rv != 0) return rv; } -} - -namespace -{ } -int CmpNaturalSort::operator()(const Zchar* lhs, size_t lhsLen, const Zchar* rhs, size_t rhsLen) const -{ - //auto strL = utfTo<std::string>(Zstring(lhs, lhsLen)); - //auto strR = utfTo<std::string>(Zstring(rhs, rhsLen)); - //return cmpStringNaturalLinux(strL.c_str(), strL.size(), strR.c_str(), strR.size()); - return cmpStringNaturalLinux(lhs, lhsLen, rhs, rhsLen); - -}
\ No newline at end of file +warn_static("clean up implementation of these two:") +//template <> inline bool isWhiteSpace(char c) +//template <> inline bool isWhiteSpace(wchar_t c) |