From 3ba62ef1de77153e5a8c7bad4451b96f6a1678b0 Mon Sep 17 00:00:00 2001
From: Daniel Wilhelm
Date: Sun, 12 Mar 2017 22:00:35 -0600
Subject: 8.10

---
 zen/zstring.cpp | 117 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 117 insertions(+)

(limited to 'zen/zstring.cpp')

diff --git a/zen/zstring.cpp b/zen/zstring.cpp
index 5f5b1ec8..a936efb5 100755
--- a/zen/zstring.cpp
+++ b/zen/zstring.cpp
@@ -6,11 +6,14 @@
 
 #include "zstring.h"
 #include
+#include "utf.h"
 
 using namespace zen;
 
 
 /*
+MSDN "Handling Sorting in Your Applications": https://msdn.microsoft.com/en-us/library/windows/desktop/dd318144
+
 Perf test: compare strings 10 mio times; 64 bit build
 -----------------------------------------------------
     string a = "Fjk84$%kgfj$%T\\\\Gffg\\gsdgf\\fgsx----------d-"
@@ -32,3 +35,117 @@ time per call | function
 */
 
 
+
+
+namespace
+{
+int compareNoCaseUtf8(const char* lhs, size_t lhsLen, const char* rhs, size_t rhsLen) //compares two char ranges code point by code point after towlower(); 0 means equal, the sign gives the order
+{
+    //- strncasecmp implements ASCII CI-comparison only! => signature is broken for UTF8-input; toupper() similarly doesn't support Unicode
+    //- wcsncasecmp: https://opensource.apple.com/source/Libc/Libc-763.12/string/wcsncasecmp-fbsd.c
+    // => re-implement comparison based on towlower() to avoid memory allocations
+    using namespace zen::implementation;
+
+    UtfDecoder decL(lhs, lhsLen);
+    UtfDecoder decR(rhs, rhsLen);
+    for (;;) //consume one decoded code point from each side per iteration
+    {
+        const Opt cpL = decL.getNext();
+        const Opt cpR = decR.getNext();
+        if (!cpL || !cpR) //at least one decoder returned no further code point
+            return static_cast(!cpR) - static_cast(!cpL); //only lhs finished => negative, only rhs finished => positive, both => 0
+
+        static_assert(sizeof(wchar_t) == sizeof(CodePoint), "");
+        const wchar_t charL = ::towlower(static_cast(*cpL)); //ordering: towlower() converts to higher code points than towupper()
+        const wchar_t charR = ::towlower(static_cast(*cpR)); //uses LC_CTYPE category of current locale
+        if (charL != charR)
+            return static_cast(charL) - static_cast(charR); //unsigned char-comparison is the convention!
+        //unsigned underflow is well-defined!
+    }
+}
+}
+
+
+int cmpStringNaturalLinux(const char* lhs, size_t lhsLen, const char* rhs, size_t rhsLen) //natural-order compare: whitespace runs before digit runs before text; numbers by value ignoring leading zeros; text via compareNoCaseUtf8()
+{
+    const char* const lhsEnd = lhs + lhsLen;
+    const char* const rhsEnd = rhs + rhsLen;
+    /*
+    - compare strings after conceptually creating blocks of whitespace/numbers/text
+    - implement strict weak ordering!
+    - don't follow broken "strnatcasecmp": https://github.com/php/php-src/blob/master/ext/standard/strnatcmp.c
+            1. incorrect non-ASCII CI-comparison                    2. incorrect bounds checks
+            3. incorrect trimming of *all* whitespace               4. arbitrary handling of leading 0 only at string begin
+            5. incorrect handling of whitespace following a number  6. code is a mess
+    */
+    for (;;) //each iteration compares one block (whitespace, number or text) from both sides
+    {
+        if (lhs == lhsEnd || rhs == rhsEnd)
+            return static_cast(lhs != lhsEnd) - static_cast(rhs != rhsEnd); //"nothing" before "something"
+        //note: "something" never would have been condensed to "nothing" further below => can finish evaluation here
+
+        const bool wsL = isWhiteSpace(*lhs);
+        const bool wsR = isWhiteSpace(*rhs);
+        if (wsL != wsR)
+            return static_cast(!wsL) - static_cast(!wsR); //whitespace before non-ws!
+        if (wsL) //both sides start with whitespace: runs of any length compare equal
+        {
+            ++lhs, ++rhs;
+            while (lhs != lhsEnd && isWhiteSpace(*lhs)) ++lhs;
+            while (rhs != rhsEnd && isWhiteSpace(*rhs)) ++rhs;
+            continue;
+        }
+
+        const bool digitL = isDigit(*lhs);
+        const bool digitR = isDigit(*rhs);
+        if (digitL != digitR)
+            return static_cast(!digitL) - static_cast(!digitR); //number before chars!
+        if (digitL) //both sides start with a digit: compare the two digit runs as numbers
+        {
+            while (lhs != lhsEnd && *lhs == '0') ++lhs; //leading zeros do not affect the value
+            while (rhs != rhsEnd && *rhs == '0') ++rhs;
+
+            int rv = 0; //remembers the first differing digit; only decisive if both runs have the same length
+            for (;; ++lhs, ++rhs)
+            {
+                const bool endL = lhs == lhsEnd || !isDigit(*lhs);
+                const bool endR = rhs == rhsEnd || !isDigit(*rhs);
+                if (endL != endR)
+                    return static_cast(!endL) - static_cast(!endR); //more digits means bigger number
+                if (endL)
+                    break; //same number of digits
+
+                if (rv == 0 && *lhs != *rhs)
+                    rv = *lhs - *rhs; //found first digit difference comparing from left
+            }
+            if (rv != 0)
+                return rv;
+            continue;
+        }
+
+        //compare full chunks of text: consider unicode encoding!
+        const char* textBeginL = lhs++;
+        const char* textBeginR = rhs++; //current char is neither white space nor digit at this point!
+        while (lhs != lhsEnd && !isWhiteSpace(*lhs) && !isDigit(*lhs)) ++lhs; //extend the text block up to the next whitespace/digit or the end
+        while (rhs != rhsEnd && !isWhiteSpace(*rhs) && !isDigit(*rhs)) ++rhs;
+
+        const int rv = compareNoCaseUtf8(textBeginL, lhs - textBeginL, textBeginR, rhs - textBeginR);
+        if (rv != 0)
+            return rv;
+    }
+}
+
+
+namespace //NOTE(review): this anonymous namespace is empty in this patch
+{
+}
+
+
+int CmpNaturalSort::operator()(const Zchar* lhs, size_t lhsLen, const Zchar* rhs, size_t rhsLen) const //forwards its arguments unchanged to cmpStringNaturalLinux(); NOTE(review): passes Zchar* where const char* is expected - confirm Zchar is char here
+{
+    //auto strL = utfTo(Zstring(lhs, lhsLen));
+    //auto strR = utfTo(Zstring(rhs, rhsLen));
+    //return cmpStringNaturalLinux(strL.c_str(), strL.size(), strR.c_str(), strR.size());
+    return cmpStringNaturalLinux(lhs, lhsLen, rhs, rhsLen);
+
+}
\ No newline at end of file
-- 
cgit