summaryrefslogtreecommitdiff
path: root/zen/zstring.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'zen/zstring.cpp')
-rwxr-xr-xzen/zstring.cpp184
1 files changed, 143 insertions, 41 deletions
diff --git a/zen/zstring.cpp b/zen/zstring.cpp
index 8bf77a0b..68609030 100755
--- a/zen/zstring.cpp
+++ b/zen/zstring.cpp
@@ -8,9 +8,102 @@
#include <stdexcept>
#include "utf.h"
+ #include <gtk/gtk.h>
+ #include "sys_error.h"
using namespace zen;
+
+Zstring makeUpperCopy(const Zstring& str) //returns an upper-case copy of str; the input itself is not modified
+{
+ //fast pre-check:
+ if (isAsciiString(str.c_str())) //perf: in the range of 3.5ns
+ {
+ Zstring output = str;
+ for (Zchar& c : output) c = asciiToUpper(c); //ASCII-only input: convert char by char, output size equals input size
+ return output;
+ }
+ //non-ASCII input: normalize first, then convert code point by code point
+ Zstring strNorm = getUnicodeNormalForm(str);
+ try
+ {
+ static_assert(sizeof(impl::CodePoint) == sizeof(gunichar)); //decoded code points are passed to g_unichar_toupper() as is
+ Zstring output;
+ output.reserve(strNorm.size()); //initial capacity only: output is grown by "+=" below
+ //decode strNorm, upper-case each code point and append the chars emitted by codePointToUtf<char>() to output
+ impl::UtfDecoder<char> decoder(strNorm.c_str(), strNorm.size());
+ while (const std::optional<impl::CodePoint> cp = decoder.getNext())
+ impl::codePointToUtf<char>(::g_unichar_toupper(*cp), [&](char c) { output += c; }); //don't use std::towupper: *incomplete* and locale-dependent!
+
+ return output;
+
+ }
+ catch (const SysError& e) //failure is not expected (see assert): fall back to returning the unmodified input
+ {
+ (void)e;
+ assert(false);
+ return str;
+ }
+}
+
+
+Zstring getUnicodeNormalForm(const Zstring& str) //returns str normalized via g_utf8_normalize(G_NORMALIZE_DEFAULT_COMPOSE); ASCII-only input is returned as is
+{
+ //fast pre-check:
+ if (isAsciiString(str.c_str())) //perf: in the range of 3.5ns
+ return str; //god bless our ref-counting! => save output string memory consumption!
+
+ //Example: const char* decomposed = "\x6f\xcc\x81";
+ // const char* precomposed = "\xc3\xb3";
+ try
+ {
+ gchar* outStr = ::g_utf8_normalize (str.c_str(), str.length(), G_NORMALIZE_DEFAULT_COMPOSE);
+ if (!outStr) //GLib documents a NULL result for input that is not valid UTF-8
+ throw SysError(L"g_utf8_normalize: conversion failed. (" + utfTo<std::wstring>(str) + L")");
+ ZEN_ON_SCOPE_EXIT(::g_free(outStr)); //outStr is newly allocated by GLib; presumably the macro runs g_free() when this scope is left
+ return outStr; //the returned Zstring is constructed from outStr here
+
+ }
+ catch (const SysError& e) //normalization failed: assert, then fall back to returning the input without normalization
+ {
+ (void)e;
+ assert(false);
+ return str;
+ }
+}
+
+
+Zstring replaceCpyAsciiNoCase(const Zstring& str, const Zstring& oldTerm, const Zstring& newTerm) //returns a copy of str with each occurrence of oldTerm replaced by newTerm; matching ignores ASCII case only
+{
+ if (oldTerm.empty()) //nothing to search for
+ return str;
+ //search in upper-cased copies: asciiToUpper() is applied char by char, so positions found in strU are valid for str, too
+ Zstring strU = str;
+ Zstring oldU = oldTerm;
+
+ for (Zchar& c : strU) c = asciiToUpper(c); //can't use makeUpperCopy(): input/output sizes may differ!
+ for (Zchar& c : oldU) c = asciiToUpper(c); //
+
+ Zstring output;
+
+ for (size_t pos = 0;;) //pos: start of the part of str not yet copied to output
+ {
+ const size_t posFound = strU.find(oldU, pos);
+ if (posFound == Zstring::npos)
+ {
+ if (pos == 0) //optimize "oldTerm not found": return ref-counted copy
+ return str;
+ output.append(str.begin() + pos, str.end()); //no further match: append the remaining tail
+ return output;
+ }
+
+ output.append(str.begin() + pos, str.begin() + posFound); //text before the match is taken from str, i.e. in its original case
+ output += newTerm;
+ pos = posFound + oldTerm.size(); //continue searching directly after the match
+ }
+}
+
+
/*
MSDN "Handling Sorting in Your Applications": https://msdn.microsoft.com/en-us/library/windows/desktop/dd318144
@@ -33,8 +126,14 @@ OS X (UTF8 char)
________________________
time per call | function
*/
+int compareLocalPath(const Zstring& lhs, const Zstring& rhs) //compares two paths by delegating to compareString()
+{
+ assert(lhs.find(Zchar('\0')) == Zstring::npos); //don't expect embedded nulls!
+ assert(rhs.find(Zchar('\0')) == Zstring::npos); //
+ return compareString(lhs, rhs); //result is passed through unchanged
+}
namespace
@@ -43,7 +142,7 @@ int compareNoCaseUtf8(const char* lhs, size_t lhsLen, const char* rhs, size_t rh
{
//- strncasecmp implements ASCII CI-comparsion only! => signature is broken for UTF8-input; toupper() similarly doesn't support Unicode
//- wcsncasecmp: https://opensource.apple.com/source/Libc/Libc-763.12/string/wcsncasecmp-fbsd.c
- // => re-implement comparison based on towlower() to avoid memory allocations
+ // => re-implement comparison based on g_unichar_toupper() to avoid memory allocations
impl::UtfDecoder<char> decL(lhs, lhsLen);
impl::UtfDecoder<char> decR(rhs, rhsLen);
@@ -54,23 +153,35 @@ int compareNoCaseUtf8(const char* lhs, size_t lhsLen, const char* rhs, size_t rh
if (!cpL || !cpR)
return static_cast<int>(!cpR) - static_cast<int>(!cpL);
- //support unit-testing on Windows: CodePoint is truncated to wchar_t
- static_assert(sizeof(wchar_t) == sizeof(impl::CodePoint));
+ static_assert(sizeof(gunichar) == sizeof(impl::CodePoint));
- const wchar_t charL = ::towlower(static_cast<wchar_t>(*cpL)); //ordering: towlower() converts to higher code points than towupper()
- const wchar_t charR = ::towlower(static_cast<wchar_t>(*cpR)); //uses LC_CTYPE category of current locale
+ const gunichar charL = ::g_unichar_toupper(*cpL); //note: tolower can be ambiguous, so don't use:
+ const gunichar charR = ::g_unichar_toupper(*cpR); //e.g. "Σ" (upper case) can be lower-case "ς" in the end of the word or "σ" in the middle.
if (charL != charR)
+ //ordering: "to lower" converts to higher code points than "to upper"
return static_cast<unsigned int>(charL) - static_cast<unsigned int>(charR); //unsigned char-comparison is the convention!
//unsigned underflow is well-defined!
}
}
+
}
-int cmpStringNaturalLinuxTest(const char* lhs, size_t lhsLen, const char* rhs, size_t rhsLen)
+int compareNatural(const Zstring& lhs, const Zstring& rhs)
{
- const char* const lhsEnd = lhs + lhsLen;
- const char* const rhsEnd = rhs + rhsLen;
+ //Unicode normal forms:
+ // Windows: CompareString() already ignores NFD/NFC differences: nice...
+ // Linux: g_unichar_toupper() can't ignore differences
+ // macOS: CFStringCompare() considers differences
+
+ const Zstring& lhsNorm = getUnicodeNormalForm(lhs);
+ const Zstring& rhsNorm = getUnicodeNormalForm(rhs);
+
+ const char* strL = lhsNorm.c_str();
+ const char* strR = rhsNorm.c_str();
+
+ const char* const strEndL = strL + lhsNorm.size();
+ const char* const strEndR = strR + rhsNorm.size();
/*
- compare strings after conceptually creating blocks of whitespace/numbers/text
- implement strict weak ordering!
@@ -84,43 +195,43 @@ int cmpStringNaturalLinuxTest(const char* lhs, size_t lhsLen, const char* rhs, s
*/
for (;;)
{
- if (lhs == lhsEnd || rhs == rhsEnd)
- return static_cast<int>(lhs != lhsEnd) - static_cast<int>(rhs != rhsEnd); //"nothing" before "something"
+ if (strL == strEndL || strR == strEndR)
+ return static_cast<int>(strL != strEndL) - static_cast<int>(strR != strEndR); //"nothing" before "something"
//note: "something" never would have been condensed to "nothing" further below => can finish evaluation here
- const bool wsL = isWhiteSpace(*lhs);
- const bool wsR = isWhiteSpace(*rhs);
+ const bool wsL = isWhiteSpace(*strL);
+ const bool wsR = isWhiteSpace(*strR);
if (wsL != wsR)
return static_cast<int>(!wsL) - static_cast<int>(!wsR); //whitespace before non-ws!
if (wsL)
{
- ++lhs, ++rhs;
- while (lhs != lhsEnd && isWhiteSpace(*lhs)) ++lhs;
- while (rhs != rhsEnd && isWhiteSpace(*rhs)) ++rhs;
+ ++strL, ++strR;
+ while (strL != strEndL && isWhiteSpace(*strL)) ++strL;
+ while (strR != strEndR && isWhiteSpace(*strR)) ++strR;
continue;
}
- const bool digitL = isDigit(*lhs);
- const bool digitR = isDigit(*rhs);
+ const bool digitL = isDigit(*strL);
+ const bool digitR = isDigit(*strR);
if (digitL != digitR)
return static_cast<int>(!digitL) - static_cast<int>(!digitR); //number before chars!
if (digitL)
{
- while (lhs != lhsEnd && *lhs == '0') ++lhs;
- while (rhs != rhsEnd && *rhs == '0') ++rhs;
+ while (strL != strEndL && *strL == '0') ++strL;
+ while (strR != strEndR && *strR == '0') ++strR;
int rv = 0;
- for (;; ++lhs, ++rhs)
+ for (;; ++strL, ++strR)
{
- const bool endL = lhs == lhsEnd || !isDigit(*lhs);
- const bool endR = rhs == rhsEnd || !isDigit(*rhs);
+ const bool endL = strL == strEndL || !isDigit(*strL);
+ const bool endR = strR == strEndR || !isDigit(*strR);
if (endL != endR)
return static_cast<int>(!endL) - static_cast<int>(!endR); //more digits means bigger number
if (endL)
break; //same number of digits
- if (rv == 0 && *lhs != *rhs)
- rv = *lhs - *rhs; //found first digit difference comparing from left
+ if (rv == 0 && *strL != *strR)
+ rv = *strL - *strR; //found first digit difference comparing from left
}
if (rv != 0)
return rv;
@@ -128,28 +239,19 @@ int cmpStringNaturalLinuxTest(const char* lhs, size_t lhsLen, const char* rhs, s
}
//compare full junks of text: consider unicode encoding!
- const char* textBeginL = lhs++;
- const char* textBeginR = rhs++; //current char is neither white space nor digit at this point!
- while (lhs != lhsEnd && !isWhiteSpace(*lhs) && !isDigit(*lhs)) ++lhs;
- while (rhs != rhsEnd && !isWhiteSpace(*rhs) && !isDigit(*rhs)) ++rhs;
+ const char* textBeginL = strL++;
+ const char* textBeginR = strR++; //current char is neither white space nor digit at this point!
+ while (strL != strEndL && !isWhiteSpace(*strL) && !isDigit(*strL)) ++strL;
+ while (strR != strEndR && !isWhiteSpace(*strR) && !isDigit(*strR)) ++strR;
- const int rv = compareNoCaseUtf8(textBeginL, lhs - textBeginL, textBeginR, rhs - textBeginR);
+ const int rv = compareNoCaseUtf8(textBeginL, strL - textBeginL, textBeginR, strR - textBeginR);
if (rv != 0)
return rv;
}
-}
-
-namespace
-{
}
-int CmpNaturalSort::operator()(const Zchar* lhs, size_t lhsLen, const Zchar* rhs, size_t rhsLen) const
-{
- //auto strL = utfTo<std::string>(Zstring(lhs, lhsLen));
- //auto strR = utfTo<std::string>(Zstring(rhs, rhsLen));
- //return cmpStringNaturalLinux(strL.c_str(), strL.size(), strR.c_str(), strR.size());
- return cmpStringNaturalLinux(lhs, lhsLen, rhs, rhsLen);
-
-} \ No newline at end of file
+warn_static("clean up implementation of these two:")
+//template <> inline bool isWhiteSpace(char c)
+//template <> inline bool isWhiteSpace(wchar_t c)
bgstack15