// ***************************************************************************** // * This file is part of the FreeFileSync project. It is distributed under * // * GNU General Public License: http://www.gnu.org/licenses/gpl-3.0 * // * Copyright (C) Zenju (zenju AT freefilesync DOT org) - All Rights Reserved * // ***************************************************************************** #include "zstring.h" #include #include "utf.h" using namespace zen; /* MSDN "Handling Sorting in Your Applications": https://msdn.microsoft.com/en-us/library/windows/desktop/dd318144 Perf test: compare strings 10 mio times; 64 bit build ----------------------------------------------------- string a = "Fjk84$%kgfj$%T\\\\Gffg\\gsdgf\\fgsx----------d-" string b = "fjK84$%kgfj$%T\\\\gfFg\\gsdgf\\fgSy----------dfdf" Windows (UTF16 wchar_t) 4 ns | wcscmp 67 ns | CompareStringOrdinalFunc+ + bIgnoreCase 314 ns | LCMapString + wmemcmp OS X (UTF8 char) 6 ns | strcmp 98 ns | strcasecmp 120 ns | strncasecmp + std::min(sizeLhs, sizeRhs); 856 ns | CFStringCreateWithCString + CFStringCompare(kCFCompareCaseInsensitive) 1110 ns | CFStringCreateWithCStringNoCopy + CFStringCompare(kCFCompareCaseInsensitive) ________________________ time per call | function */ namespace { int compareNoCaseUtf8(const char* lhs, size_t lhsLen, const char* rhs, size_t rhsLen) { //- strncasecmp implements ASCII CI-comparsion only! => signature is broken for UTF8-input; toupper() similarly doesn't support Unicode //- wcsncasecmp: https://opensource.apple.com/source/Libc/Libc-763.12/string/wcsncasecmp-fbsd.c // => re-implement comparison based on towlower() to avoid memory allocations using namespace zen::implementation; UtfDecoder decL(lhs, lhsLen); UtfDecoder decR(rhs, rhsLen); for (;;) { const Opt cpL = decL.getNext(); const Opt cpR = decR.getNext(); if (!cpL || !cpR) return static_cast(!cpR) - static_cast(!cpL); static_assert(sizeof(wchar_t) == sizeof(CodePoint), ""); const wchar_t charL = ::towlower(static_cast(*cpL)); //ordering: towlower() converts to higher code points than towupper() const wchar_t charR = ::towlower(static_cast(*cpR)); //uses LC_CTYPE category of current locale if (charL != charR) return static_cast(charL) - static_cast(charR); //unsigned char-comparison is the convention! //unsigned underflow is well-defined! } } } int cmpStringNaturalLinux(const char* lhs, size_t lhsLen, const char* rhs, size_t rhsLen) { const char* const lhsEnd = lhs + lhsLen; const char* const rhsEnd = rhs + rhsLen; /* - compare strings after conceptually creating blocks of whitespace/numbers/text - implement strict weak ordering! - don't follow broken "strnatcasecmp": https://github.com/php/php-src/blob/master/ext/standard/strnatcmp.c 1. incorrect non-ASCII CI-comparison 2. incorrect bounds checks 3. incorrect trimming of *all* whitespace 4. arbitrary handling of leading 0 only at string begin 5. incorrect handling of whitespace following a number 6. code is a mess */ for (;;) { if (lhs == lhsEnd || rhs == rhsEnd) return static_cast(lhs != lhsEnd) - static_cast(rhs != rhsEnd); //"nothing" before "something" //note: "something" never would have been condensed to "nothing" further below => can finish evaluation here const bool wsL = isWhiteSpace(*lhs); const bool wsR = isWhiteSpace(*rhs); if (wsL != wsR) return static_cast(!wsL) - static_cast(!wsR); //whitespace before non-ws! if (wsL) { ++lhs, ++rhs; while (lhs != lhsEnd && isWhiteSpace(*lhs)) ++lhs; while (rhs != rhsEnd && isWhiteSpace(*rhs)) ++rhs; continue; } const bool digitL = isDigit(*lhs); const bool digitR = isDigit(*rhs); if (digitL != digitR) return static_cast(!digitL) - static_cast(!digitR); //number before chars! if (digitL) { while (lhs != lhsEnd && *lhs == '0') ++lhs; while (rhs != rhsEnd && *rhs == '0') ++rhs; int rv = 0; for (;; ++lhs, ++rhs) { const bool endL = lhs == lhsEnd || !isDigit(*lhs); const bool endR = rhs == rhsEnd || !isDigit(*rhs); if (endL != endR) return static_cast(!endL) - static_cast(!endR); //more digits means bigger number if (endL) break; //same number of digits if (rv == 0 && *lhs != *rhs) rv = *lhs - *rhs; //found first digit difference comparing from left } if (rv != 0) return rv; continue; } //compare full junks of text: consider unicode encoding! const char* textBeginL = lhs++; const char* textBeginR = rhs++; //current char is neither white space nor digit at this point! while (lhs != lhsEnd && !isWhiteSpace(*lhs) && !isDigit(*lhs)) ++lhs; while (rhs != rhsEnd && !isWhiteSpace(*rhs) && !isDigit(*rhs)) ++rhs; const int rv = compareNoCaseUtf8(textBeginL, lhs - textBeginL, textBeginR, rhs - textBeginR); if (rv != 0) return rv; } } namespace { } int CmpNaturalSort::operator()(const Zchar* lhs, size_t lhsLen, const Zchar* rhs, size_t rhsLen) const { //auto strL = utfTo(Zstring(lhs, lhsLen)); //auto strR = utfTo(Zstring(rhs, rhsLen)); //return cmpStringNaturalLinux(strL.c_str(), strL.size(), strR.c_str(), strR.size()); return cmpStringNaturalLinux(lhs, lhsLen, rhs, rhsLen); }