summary | refs | log | tree | commit | diff
path: root/zen/zstring.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'zen/zstring.cpp')
-rw-r--r-- zen/zstring.cpp 70
1 files changed, 49 insertions, 21 deletions
diff --git a/zen/zstring.cpp b/zen/zstring.cpp
index 1e29e461..3f5328f7 100644
--- a/zen/zstring.cpp
+++ b/zen/zstring.cpp
@@ -15,7 +15,7 @@ Zstring getUnicodeNormalFormNonAscii(const Zstring& str)
{
//Example: const char* decomposed = "\x6f\xcc\x81";
// const char* precomposed = "\xc3\xb3";
- assert(!isAsciiString(str));
+ assert(!isAsciiString(str)); //includes "not-empty" check
assert(str.find(Zchar('\0')) == Zstring::npos); //don't expect embedded nulls!
try
@@ -51,14 +51,14 @@ Zstring getUpperCaseNonAscii(const Zstring& str)
Zstring strNorm = getUnicodeNormalFormNonAscii(str);
try
{
- static_assert(sizeof(impl::CodePoint) == sizeof(gunichar));
Zstring output;
output.reserve(strNorm.size());
UtfDecoder<char> decoder(strNorm.c_str(), strNorm.size());
while (const std::optional<impl::CodePoint> cp = decoder.getNext())
- impl::codePointToUtf<char>(::g_unichar_toupper(*cp), [&](char c) { output += c; }); //don't use std::towupper: *incomplete* and locale-dependent!
+ codePointToUtf<char>(::g_unichar_toupper(*cp), [&](char c) { output += c; }); //don't use std::towupper: *incomplete* and locale-dependent!
+ static_assert(sizeof(impl::CodePoint) == sizeof(gunichar));
return output;
}
@@ -89,6 +89,10 @@ namespace
{
std::weak_ordering compareNoCaseUtf8(const char* lhs, size_t lhsLen, const char* rhs, size_t rhsLen)
{
+ //expect Unicode normalized strings!
+ assert(std::string(lhs, lhsLen) == getUnicodeNormalForm(std::string(lhs, lhsLen)));
+ assert(std::string(rhs, rhsLen) == getUnicodeNormalForm(std::string(rhs, rhsLen)));
+
//- strncasecmp implements ASCII CI-comparison only! => signature is broken for UTF8-input; toupper() similarly doesn't support Unicode
//- wcsncasecmp: https://opensource.apple.com/source/Libc/Libc-763.12/string/wcsncasecmp-fbsd.c
// => re-implement comparison based on g_unichar_tolower() to avoid memory allocations
@@ -103,12 +107,13 @@ std::weak_ordering compareNoCaseUtf8(const char* lhs, size_t lhsLen, const char*
return !cpR <=> !cpL;
static_assert(sizeof(gunichar) == sizeof(impl::CodePoint));
+ static_assert(std::is_unsigned_v<gunichar>, "unsigned char-comparison is the convention!");
//ordering: "to lower" converts to higher code points than "to upper"
const gunichar charL = ::g_unichar_toupper(*cpL); //note: tolower can be ambiguous, so don't use:
const gunichar charR = ::g_unichar_toupper(*cpR); //e.g. "Σ" (upper case) can be lower-case "ς" in the end of the word or "σ" in the middle.
if (charL != charR)
- return makeUnsigned(charL) <=> makeUnsigned(charR); //unsigned char-comparison is the convention!
+ return charL <=> charR;
}
}
}
@@ -206,25 +211,48 @@ std::weak_ordering compareNatural(const Zstring& lhs, const Zstring& rhs)
std::weak_ordering compareNoCase(const Zstring& lhs, const Zstring& rhs)
{
- //fast path: no need for extra memory allocations => ~ 6x speedup
- const size_t minSize = std::min(lhs.size(), rhs.size());
+ //fast path: no memory allocations => ~ 6x speedup
+ if (isAsciiString(lhs) && isAsciiString(rhs))
+ {
+ const size_t minSize = std::min(lhs.size(), rhs.size());
+ for (size_t i = 0; i < minSize; ++i)
+ {
+ //ordering: do NOT call compareAsciiNoCase(), which uses asciiToLower()!
+ const Zchar lUp = asciiToUpper(lhs[i]); //
+ const Zchar rUp = asciiToUpper(rhs[i]); //no surprises: emulate getUpperCase() [verified!]
+ if (lUp != rUp) //
+ return lUp <=> rUp; //
+ }
+ return lhs.size() <=> rhs.size();
+ }
+ //--------------------------------------
+
+ //can't we instead skip isAsciiString() and compare chars as long as isAsciiChar()?
+ // => NOPE! e.g. decomposed Unicode! A seemingly single isAsciiChar() might be followed by a combining character!!!
+
+ return getUpperCase(lhs) <=> getUpperCase(rhs);
+}
+
+
+bool equalNoCase(const Zstring& lhs, const Zstring& rhs)
+{
+ //fast-path: no need for extra memory allocations
+ const bool isAsciiL = isAsciiString(lhs);
+ const bool isAsciiR = isAsciiString(rhs);
+ if (isAsciiL != isAsciiR)
+ return false;
- size_t i = 0;
- for (; i < minSize; ++i)
+ if (isAsciiL)
{
- const Zchar l = lhs[i];
- const Zchar r = rhs[i];
- if (!isAsciiChar(l) || !isAsciiChar(r))
- goto slowPath; //=> let's NOT make assumptions how getUpperCase() compares "ASCII <=> non-ASCII"
-
- const Zchar lUp = asciiToUpper(l); //
- const Zchar rUp = asciiToUpper(r); //no surprises: emulate getUpperCase() [verified!]
- if (lUp != rUp) //
- return lUp <=> rUp; //
+ if (lhs.size() != rhs.size())
+ return false;
+
+ for (size_t i = 0; i < lhs.size(); ++i)
+ if (asciiToUpper(lhs[i]) !=
+ asciiToUpper(rhs[i]))
+ return false;
+ return true;
}
- return lhs.size() <=> rhs.size();
-slowPath: //--------------------------------------
- return compareNoCaseUtf8(lhs.c_str() + i, lhs.size() - i,
- rhs.c_str() + i, rhs.size() - i);
+ return getUpperCaseNonAscii(lhs) == getUpperCaseNonAscii(rhs);
}
bgstack15