summary refs log tree commit diff
path: root/zen/zstring.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'zen/zstring.cpp')
-rw-r--r-- zen/zstring.cpp 222
1 files changed, 134 insertions, 88 deletions
diff --git a/zen/zstring.cpp b/zen/zstring.cpp
index 76c0a81f..1e29e461 100644
--- a/zen/zstring.cpp
+++ b/zen/zstring.cpp
@@ -11,46 +11,44 @@
using namespace zen;
-Zstring getUnicodeNormalForm(const Zstring& str)
+Zstring getUnicodeNormalFormNonAscii(const Zstring& str) //returns the g_utf8_normalize(G_NORMALIZE_DEFAULT_COMPOSE) result; expects non-ASCII input (asserted below); throws std::runtime_error on failure
{
- //fast pre-check:
- if (isAsciiString(str)) //perf: in the range of 3.5ns
- return str;
- static_assert(std::is_same_v<decltype(str), const Zbase<Zchar>&>, "god bless our ref-counting! => save output string memory consumption!");
-
//Example: const char* decomposed = "\x6f\xcc\x81";
// const char* precomposed = "\xc3\xb3";
+ assert(!isAsciiString(str));
+ assert(str.find(Zchar('\0')) == Zstring::npos); //don't expect embedded nulls!
+
try
{
gchar* outStr = ::g_utf8_normalize(str.c_str(), str.length(), G_NORMALIZE_DEFAULT_COMPOSE);
if (!outStr)
- throw SysError(formatSystemError("g_utf8_normalize(" + utfTo<std::string>(str) + ')', L"", L"Conversion failed."));
+ throw SysError(formatSystemError("g_utf8_normalize", L"", L"Conversion failed.")); //input string is no longer embedded here: the catch block below adds it to the final message
ZEN_ON_SCOPE_EXIT(::g_free(outStr));
return outStr;
}
- catch ([[maybe_unused]] const SysError& e)
+ catch (const SysError& e) //failure is escalated as std::runtime_error (previously: assert(false) and return the input unchanged)
{
- assert(false);
- return str;
+ throw std::runtime_error(std::string(__FILE__) + '[' + numberTo<std::string>(__LINE__) + "] Error normalizing string:" +
+ '\n' + utfTo<std::string>(str) + "\n\n" + utfTo<std::string>(e.toString()));
}
}
-Zstring getUpperCase(const Zstring& str)
+Zstring getUnicodeNormalForm(const Zstring& str) //public entry point: ASCII fast path, otherwise delegates to getUnicodeNormalFormNonAscii()
{
- assert(str.find(Zchar('\0')) == Zstring::npos); //don't expect embedded nulls!
-
//fast pre-check:
if (isAsciiString(str)) //perf: in the range of 3.5ns
- {
- Zstring output = str;
- for (Zchar& c : output)
- c = asciiToUpper(c);
- return output;
- }
+ return str; //ASCII-only input is returned as-is
+ static_assert(std::is_same_v<decltype(str), const Zbase<Zchar>&>, "god bless our ref-counting! => save output string memory consumption!");
- Zstring strNorm = getUnicodeNormalForm(str);
+ return getUnicodeNormalFormNonAscii(str); //non-ASCII: the ASCII check above satisfies the callee's assert(!isAsciiString(str))
+}
+
+
+Zstring getUpperCaseNonAscii(const Zstring& str)
+{
+ Zstring strNorm = getUnicodeNormalFormNonAscii(str);
try
{
static_assert(sizeof(impl::CodePoint) == sizeof(gunichar));
@@ -64,11 +62,26 @@ Zstring getUpperCase(const Zstring& str)
return output;
}
- catch (SysError&)
+ catch (const SysError& e)
{
- assert(false);
- return str;
+ throw std::runtime_error(std::string(__FILE__) + '[' + numberTo<std::string>(__LINE__) + "] Error converting string to upper case:" +
+ '\n' + utfTo<std::string>(str) + "\n\n" + utfTo<std::string>(e.toString()));
+ }
+}
+
+
+Zstring getUpperCase(const Zstring& str) //upper-case conversion: ASCII-only strings are converted inline, everything else is forwarded to getUpperCaseNonAscii()
+{
+ if (isAsciiString(str)) //fast path: in the range of 3.5ns
+ {
+ Zstring output = str; //work on a copy; each character is upper-cased in place below
+ for (Zchar& c : output) //identical to LCMapStringEx(), g_unichar_toupper(), CFStringUppercase() [verified!]
+ c = asciiToUpper(c); //
+ return output;
 }
+ //else: slow path --------------------------------------
+
+ return getUpperCaseNonAscii(str);
}
@@ -91,10 +104,10 @@ std::weak_ordering compareNoCaseUtf8(const char* lhs, size_t lhsLen, const char*
static_assert(sizeof(gunichar) == sizeof(impl::CodePoint));
+ //ordering: "to lower" converts to higher code points than "to upper"
const gunichar charL = ::g_unichar_toupper(*cpL); //note: tolower can be ambiguous, so don't use:
const gunichar charR = ::g_unichar_toupper(*cpR); //e.g. "Σ" (upper case) can be lower-case "ς" in the end of the word or "σ" in the middle.
if (charL != charR)
- //ordering: "to lower" converts to higher code points than "to upper"
return makeUnsigned(charL) <=> makeUnsigned(charR); //unsigned char-comparison is the convention!
}
}
@@ -107,78 +120,111 @@ std::weak_ordering compareNatural(const Zstring& lhs, const Zstring& rhs)
Windows: CompareString() already ignores NFD/NFC differences: nice...
Linux: g_unichar_toupper() can't ignore differences
macOS: CFStringCompare() considers differences */
-
- const Zstring& lhsNorm = getUnicodeNormalForm(lhs);
- const Zstring& rhsNorm = getUnicodeNormalForm(rhs);
-
- const char* strL = lhsNorm.c_str();
- const char* strR = rhsNorm.c_str();
-
- const char* const strEndL = strL + lhsNorm.size();
- const char* const strEndR = strR + rhsNorm.size();
- /* - compare strings after conceptually creating blocks of whitespace/numbers/text
- - implement strict weak ordering!
- - don't follow broken "strnatcasecmp": https://github.com/php/php-src/blob/master/ext/standard/strnatcmp.c
- 1. incorrect non-ASCII CI-comparison
- 2. incorrect bounds checks
- 3. incorrect trimming of *all* whitespace
- 4. arbitrary handling of leading 0 only at string begin
- 5. incorrect handling of whitespace following a number
- 6. code is a mess */
- for (;;)
+ try
{
- if (strL == strEndL || strR == strEndR)
- return (strL != strEndL) <=> (strR != strEndR); //"nothing" before "something"
- //note: "something" never would have been condensed to "nothing" further below => can finish evaluation here
-
- const bool wsL = isWhiteSpace(*strL);
- const bool wsR = isWhiteSpace(*strR);
- if (wsL != wsR)
- return !wsL <=> !wsR; //whitespace before non-ws!
- if (wsL)
- {
- ++strL, ++strR;
- while (strL != strEndL && isWhiteSpace(*strL)) ++strL;
- while (strR != strEndR && isWhiteSpace(*strR)) ++strR;
- continue;
- }
-
- const bool digitL = isDigit(*strL);
- const bool digitR = isDigit(*strR);
- if (digitL != digitR)
- return !digitL <=> !digitR; //numbers before chars!
- if (digitL)
+ const Zstring& lhsNorm = getUnicodeNormalForm(lhs);
+ const Zstring& rhsNorm = getUnicodeNormalForm(rhs);
+
+ const char* strL = lhsNorm.c_str();
+ const char* strR = rhsNorm.c_str();
+
+ const char* const strEndL = strL + lhsNorm.size();
+ const char* const strEndR = strR + rhsNorm.size();
+ /* - compare strings after conceptually creating blocks of whitespace/numbers/text
+ - implement strict weak ordering!
+ - don't follow broken "strnatcasecmp": https://github.com/php/php-src/blob/master/ext/standard/strnatcmp.c
+ 1. incorrect non-ASCII CI-comparison
+ 2. incorrect bounds checks
+ 3. incorrect trimming of *all* whitespace
+ 4. arbitrary handling of leading 0 only at string begin
+ 5. incorrect handling of whitespace following a number
+ 6. code is a mess */
+ for (;;)
{
- while (strL != strEndL && *strL == '0') ++strL;
- while (strR != strEndR && *strR == '0') ++strR;
+ if (strL == strEndL || strR == strEndR)
+ return (strL != strEndL) <=> (strR != strEndR); //"nothing" before "something"
+ //note: "something" never would have been condensed to "nothing" further below => can finish evaluation here
+
+ const bool wsL = isWhiteSpace(*strL);
+ const bool wsR = isWhiteSpace(*strR);
+ if (wsL != wsR)
+ return !wsL <=> !wsR; //whitespace before non-ws!
+ if (wsL)
+ {
+ ++strL, ++strR;
+ while (strL != strEndL && isWhiteSpace(*strL)) ++strL;
+ while (strR != strEndR && isWhiteSpace(*strR)) ++strR;
+ continue;
+ }
- int rv = 0;
- for (;; ++strL, ++strR)
+ const bool digitL = isDigit(*strL);
+ const bool digitR = isDigit(*strR);
+ if (digitL != digitR)
+ return !digitL <=> !digitR; //numbers before chars!
+ if (digitL)
{
- const bool endL = strL == strEndL || !isDigit(*strL);
- const bool endR = strR == strEndR || !isDigit(*strR);
- if (endL != endR)
- return !endL <=> !endR; //more digits means bigger number
- if (endL)
- break; //same number of digits
-
- if (rv == 0 && *strL != *strR)
- rv = *strL - *strR; //found first digit difference comparing from left
+ while (strL != strEndL && *strL == '0') ++strL;
+ while (strR != strEndR && *strR == '0') ++strR;
+
+ int rv = 0;
+ for (;; ++strL, ++strR)
+ {
+ const bool endL = strL == strEndL || !isDigit(*strL);
+ const bool endR = strR == strEndR || !isDigit(*strR);
+ if (endL != endR)
+ return !endL <=> !endR; //more digits means bigger number
+ if (endL)
+ break; //same number of digits
+
+ if (rv == 0 && *strL != *strR)
+ rv = *strL - *strR; //found first digit difference comparing from left
+ }
+ if (rv != 0)
+ return rv <=> 0;
+ continue;
}
- if (rv != 0)
- return rv <=> 0;
- continue;
+
+ //compare full chunks of text: consider unicode encoding!
+ const char* textBeginL = strL++;
+ const char* textBeginR = strR++; //current char is neither white space nor digit at this point!
+ while (strL != strEndL && !isWhiteSpace(*strL) && !isDigit(*strL)) ++strL;
+ while (strR != strEndR && !isWhiteSpace(*strR) && !isDigit(*strR)) ++strR;
+
+ if (const std::weak_ordering cmp = compareNoCaseUtf8(textBeginL, strL - textBeginL, textBeginR, strR - textBeginR);
+ cmp != std::weak_ordering::equivalent)
+ return cmp;
}
- //compare full junks of text: consider unicode encoding!
- const char* textBeginL = strL++;
- const char* textBeginR = strR++; //current char is neither white space nor digit at this point!
- while (strL != strEndL && !isWhiteSpace(*strL) && !isDigit(*strL)) ++strL;
- while (strR != strEndR && !isWhiteSpace(*strR) && !isDigit(*strR)) ++strR;
+ }
+ catch (const SysError& e)
+ {
+ throw std::runtime_error(std::string(__FILE__) + '[' + numberTo<std::string>(__LINE__) + "] Error comparing strings:" + '\n' +
+ utfTo<std::string>(lhs) + '\n' + utfTo<std::string>(rhs) + "\n\n" + utfTo<std::string>(e.toString()));
+ }
+}
+
- if (const std::weak_ordering cmp = compareNoCaseUtf8(textBeginL, strL - textBeginL, textBeginR, strR - textBeginR);
- cmp != std::weak_ordering::equivalent)
- return cmp;
+std::weak_ordering compareNoCase(const Zstring& lhs, const Zstring& rhs) //case-insensitive comparison: ASCII prefix is compared inline via asciiToUpper(); from the first position where either side is non-ASCII the rest goes to compareNoCaseUtf8()
+{
+ //fast path: no need for extra memory allocations => ~ 6x speedup
+ const size_t minSize = std::min(lhs.size(), rhs.size());
+
+ size_t i = 0; //declared outside the loop: the slow path resumes at this index
+ for (; i < minSize; ++i)
+ {
+ const Zchar l = lhs[i];
+ const Zchar r = rhs[i];
+ if (!isAsciiChar(l) || !isAsciiChar(r))
+ goto slowPath; //=> let's NOT make assumptions how getUpperCase() compares "ASCII <=> non-ASCII"
+
+ const Zchar lUp = asciiToUpper(l); //
+ const Zchar rUp = asciiToUpper(r); //no surprises: emulate getUpperCase() [verified!]
+ if (lUp != rUp) //
+ return lUp <=> rUp; //
 }
+ return lhs.size() <=> rhs.size(); //first minSize characters are equal ignoring ASCII case => the shorter string orders first
+slowPath: //--------------------------------------
+ return compareNoCaseUtf8(lhs.c_str() + i, lhs.size() - i, //only the tails starting at index i are compared; the prefix before i was already found equal
+ rhs.c_str() + i, rhs.size() - i);
}
bgstack15