10.5

author: B Stack <bgstack15@gmail.com> 2018-10-16 17:33:51 -0400
committer: B Stack <bgstack15@gmail.com> 2018-10-16 17:33:51 -0400
commit: 878a41d3be13da2a654df74f2a35ea8b295c8a13 (patch)
tree: 89b2a018482c164bdd8ecac5c76b19a08f420dec /zen/zstring.cpp
parent: Merge branch '10.4' into 'master' (diff)
download: FreeFileSync-878a41d3be13da2a654df74f2a35ea8b295c8a13.tar.gz
FreeFileSync-878a41d3be13da2a654df74f2a35ea8b295c8a13.tar.bz2
FreeFileSync-878a41d3be13da2a654df74f2a35ea8b295c8a13.zip
1 files changed, 143 insertions, 41 deletions
diff --git a/zen/zstring.cpp b/zen/zstring.cpp
index 8bf77a0b..68609030 100755
--- a/zen/zstring.cpp
+++ b/zen/zstring.cpp
@@ -8,9 +8,102 @@
 #include <stdexcept>
 #include "utf.h"
 
+    #include <gtk/gtk.h>
+    #include "sys_error.h"
 
 using namespace zen;
 
+//upper-case copy of str: ASCII-only input is converted per char via asciiToUpper(), anything else per code point via g_unichar_toupper()
+Zstring makeUpperCopy(const Zstring& str)
+{
+    //fast pre-check:
+    if (isAsciiString(str.c_str())) //perf: in the range of 3.5ns
+    {
+        Zstring output = str;
+        for (Zchar& c : output) c = asciiToUpper(c);
+        return output;
+    }
+
+    Zstring strNorm = getUnicodeNormalForm(str); //the non-ASCII path converts the normalized string, not str itself
+    try
+    {
+        static_assert(sizeof(impl::CodePoint) == sizeof(gunichar)); //code points are passed to/from g_unichar_toupper() without casts
+        Zstring output;
+        output.reserve(strNorm.size()); //initial guess only: output is appended to char by char below
+
+        impl::UtfDecoder<char> decoder(strNorm.c_str(), strNorm.size());
+        while (const std::optional<impl::CodePoint> cp = decoder.getNext())
+            impl::codePointToUtf<char>(::g_unichar_toupper(*cp), [&](char c) { output += c; }); //don't use std::towupper: *incomplete* and locale-dependent!
+
+        return output;
+
+    }
+    catch (const SysError& e) //NOTE(review): not obvious from here which call in the try-block throws SysError - confirm the handler is still needed
+    {
+        (void)e;
+        assert(false);
+        return str; //fallback: hand back the unmodified input
+    }
+}
+
+//returns str normalized via g_utf8_normalize(G_NORMALIZE_DEFAULT_COMPOSE); ASCII-only input is passed through unchanged
+Zstring getUnicodeNormalForm(const Zstring& str)
+{
+    //fast pre-check:
+    if (isAsciiString(str.c_str())) //perf: in the range of 3.5ns
+        return str; //god bless our ref-counting! => save output string memory consumption!
+
+    //Example: const char* decomposed  = "\x6f\xcc\x81";
+    //         const char* precomposed = "\xc3\xb3";
+    try
+    {
+        gchar* outStr = ::g_utf8_normalize (str.c_str(), str.length(), G_NORMALIZE_DEFAULT_COMPOSE);
+        if (!outStr) //null result is treated as conversion failure
+            throw SysError(L"g_utf8_normalize: conversion failed. (" + utfTo<std::wstring>(str) + L")");
+        ZEN_ON_SCOPE_EXIT(::g_free(outStr)); //buffer is released with g_free(); presumably executed when the scope is left - see ZEN_ON_SCOPE_EXIT
+        return outStr; //converted into the returned Zstring
+
+    }
+    catch (const SysError& e) //failure is not propagated to the caller
+    {
+        (void)e;
+        assert(false);
+        return str; //fallback: hand back the input without normalization
+    }
+}
+
+//copy of str with each occurrence of oldTerm replaced by newTerm; matching is done on asciiToUpper()-converted copies of str and oldTerm
+Zstring replaceCpyAsciiNoCase(const Zstring& str, const Zstring& oldTerm, const Zstring& newTerm)
+{
+    if (oldTerm.empty())
+        return str; //nothing to search for
+    
+    Zstring strU = str;    
+    Zstring oldU = oldTerm;
+          
+    for (Zchar& c : strU) c = asciiToUpper(c); //can't use makeUpperCopy(): input/output sizes may differ!
+    for (Zchar& c : oldU) c = asciiToUpper(c); //
+
+    Zstring output;
+
+    for (size_t pos = 0;;) //pos: start of the not yet copied part; positions found in strU are applied to str (same length)
+    {
+        const size_t posFound = strU.find(oldU, pos);
+        if (posFound == Zstring::npos)
+        {
+            if (pos == 0) //optimize "oldTerm not found": return ref-counted copy
+                return str; 
+            output.append(str.begin() + pos, str.end()); //copy the rest after the last match
+            return output;
+        }
+
+        output.append(str.begin() + pos, str.begin() + posFound); //copy the unchanged part in front of the match
+        output += newTerm;
+        pos = posFound + oldTerm.size(); //continue right after the match
+    }
+}
+
+
 /*
 MSDN "Handling Sorting in Your Applications": https://msdn.microsoft.com/en-us/library/windows/desktop/dd318144
 
@@ -33,8 +126,14 @@ OS X (UTF8 char)
 ________________________
 time per call | function
 */
+int compareLocalPath(const Zstring& lhs, const Zstring& rhs) //passes through the result of compareString(lhs, rhs)
+{
+    assert(lhs.find(Zchar('\0')) == Zstring::npos); //don't expect embedded nulls!
+    assert(rhs.find(Zchar('\0')) == Zstring::npos); //
 
+    return compareString(lhs, rhs); //no path-specific processing is done here
 
+}
 
 
 namespace
@@ -43,7 +142,7 @@ int compareNoCaseUtf8(const char* lhs, size_t lhsLen, const char* rhs, size_t rh
 {
     //- strncasecmp implements ASCII CI-comparsion only! => signature is broken for UTF8-input; toupper() similarly doesn't support Unicode
     //- wcsncasecmp: https://opensource.apple.com/source/Libc/Libc-763.12/string/wcsncasecmp-fbsd.c
-    // => re-implement comparison based on towlower() to avoid memory allocations
+    // => re-implement comparison based on g_unichar_toupper() to avoid memory allocations
 
     impl::UtfDecoder<char> decL(lhs, lhsLen);
     impl::UtfDecoder<char> decR(rhs, rhsLen);
@@ -54,23 +153,35 @@ int compareNoCaseUtf8(const char* lhs, size_t lhsLen, const char* rhs, size_t rh
         if (!cpL || !cpR)
             return static_cast<int>(!cpR) - static_cast<int>(!cpL);
 
-        //support unit-testing on Windows: CodePoint is truncated to wchar_t
-        static_assert(sizeof(wchar_t) == sizeof(impl::CodePoint));
+        static_assert(sizeof(gunichar) == sizeof(impl::CodePoint));
 
-        const wchar_t charL = ::towlower(static_cast<wchar_t>(*cpL)); //ordering: towlower() converts to higher code points than towupper()
-        const wchar_t charR = ::towlower(static_cast<wchar_t>(*cpR)); //uses LC_CTYPE category of current locale
+        const gunichar charL = ::g_unichar_toupper(*cpL); //note: tolower can be ambiguous, so don't use:
+        const gunichar charR = ::g_unichar_toupper(*cpR); //e.g. "Σ" (upper case) can be lower-case "ς" in the end of the word or "σ" in the middle.
         if (charL != charR)
+            //ordering: "to lower" converts to higher code points than "to upper"
             return static_cast<unsigned int>(charL) - static_cast<unsigned int>(charR); //unsigned char-comparison is the convention!
         //unsigned underflow is well-defined!
     }
 }
+
 }
 
 
-int cmpStringNaturalLinuxTest(const char* lhs, size_t lhsLen, const char* rhs, size_t rhsLen)
+int compareNatural(const Zstring& lhs, const Zstring& rhs)
 {
-    const char* const lhsEnd = lhs + lhsLen;
-    const char* const rhsEnd = rhs + rhsLen;
+    //Unicode normal forms:
+    //      Windows: CompareString() already ignores NFD/NFC differences: nice...
+    //      Linux:  g_unichar_toupper() can't ignore differences
+    //      macOS:  CFStringCompare() considers differences
+
+    const Zstring& lhsNorm = getUnicodeNormalForm(lhs);
+    const Zstring& rhsNorm = getUnicodeNormalForm(rhs);
+
+    const char* strL = lhsNorm.c_str();
+    const char* strR = rhsNorm.c_str();
+
+    const char* const strEndL = strL + lhsNorm.size();
+    const char* const strEndR = strR + rhsNorm.size();
     /*
         - compare strings after conceptually creating blocks of whitespace/numbers/text
         - implement strict weak ordering!
@@ -84,43 +195,43 @@ int cmpStringNaturalLinuxTest(const char* lhs, size_t lhsLen, const char* rhs, s
     */
     for (;;)
     {
-        if (lhs == lhsEnd || rhs == rhsEnd)
-            return static_cast<int>(lhs != lhsEnd) - static_cast<int>(rhs != rhsEnd); //"nothing" before "something"
+        if (strL == strEndL || strR == strEndR)
+            return static_cast<int>(strL != strEndL) - static_cast<int>(strR != strEndR); //"nothing" before "something"
         //note: "something" never would have been condensed to "nothing" further below => can finish evaluation here
 
-        const bool wsL = isWhiteSpace(*lhs);
-        const bool wsR = isWhiteSpace(*rhs);
+        const bool wsL = isWhiteSpace(*strL);
+        const bool wsR = isWhiteSpace(*strR);
         if (wsL != wsR)
             return static_cast<int>(!wsL) - static_cast<int>(!wsR); //whitespace before non-ws!
         if (wsL)
         {
-            ++lhs, ++rhs;
-            while (lhs != lhsEnd && isWhiteSpace(*lhs)) ++lhs;
-            while (rhs != rhsEnd && isWhiteSpace(*rhs)) ++rhs;
+            ++strL, ++strR;
+            while (strL != strEndL && isWhiteSpace(*strL)) ++strL;
+            while (strR != strEndR && isWhiteSpace(*strR)) ++strR;
             continue;
         }
 
-        const bool digitL = isDigit(*lhs);
-        const bool digitR = isDigit(*rhs);
+        const bool digitL = isDigit(*strL);
+        const bool digitR = isDigit(*strR);
         if (digitL != digitR)
             return static_cast<int>(!digitL) - static_cast<int>(!digitR); //number before chars!
         if (digitL)
         {
-            while (lhs != lhsEnd && *lhs == '0') ++lhs;
-            while (rhs != rhsEnd && *rhs == '0') ++rhs;
+            while (strL != strEndL && *strL == '0') ++strL;
+            while (strR != strEndR && *strR == '0') ++strR;
 
             int rv = 0;
-            for (;; ++lhs, ++rhs)
+            for (;; ++strL, ++strR)
             {
-                const bool endL = lhs == lhsEnd || !isDigit(*lhs);
-                const bool endR = rhs == rhsEnd || !isDigit(*rhs);
+                const bool endL = strL == strEndL || !isDigit(*strL);
+                const bool endR = strR == strEndR || !isDigit(*strR);
                 if (endL != endR)
                     return static_cast<int>(!endL) - static_cast<int>(!endR); //more digits means bigger number
                 if (endL)
                     break; //same number of digits
 
-                if (rv == 0 && *lhs != *rhs)
-                    rv = *lhs - *rhs; //found first digit difference comparing from left
+                if (rv == 0 && *strL != *strR)
+                    rv = *strL - *strR; //found first digit difference comparing from left
             }
             if (rv != 0)
                 return rv;
@@ -128,28 +239,19 @@ int cmpStringNaturalLinuxTest(const char* lhs, size_t lhsLen, const char* rhs, s
         }
 
         //compare full junks of text: consider unicode encoding!
-        const char* textBeginL = lhs++;
-        const char* textBeginR = rhs++; //current char is neither white space nor digit at this point!
-        while (lhs != lhsEnd && !isWhiteSpace(*lhs) && !isDigit(*lhs)) ++lhs;
-        while (rhs != rhsEnd && !isWhiteSpace(*rhs) && !isDigit(*rhs)) ++rhs;
+        const char* textBeginL = strL++;
+        const char* textBeginR = strR++; //current char is neither white space nor digit at this point!
+        while (strL != strEndL && !isWhiteSpace(*strL) && !isDigit(*strL)) ++strL;
+        while (strR != strEndR && !isWhiteSpace(*strR) && !isDigit(*strR)) ++strR;
 
-        const int rv = compareNoCaseUtf8(textBeginL, lhs - textBeginL, textBeginR, rhs - textBeginR);
+        const int rv = compareNoCaseUtf8(textBeginL, strL - textBeginL, textBeginR, strR - textBeginR);
         if (rv != 0)
             return rv;
     }
-}
-
 
-namespace
-{
 }
 
 
-int CmpNaturalSort::operator()(const Zchar* lhs, size_t lhsLen, const Zchar* rhs, size_t rhsLen) const
-{
-    //auto strL = utfTo<std::string>(Zstring(lhs, lhsLen));
-    //auto strR = utfTo<std::string>(Zstring(rhs, rhsLen));
-    //return cmpStringNaturalLinux(strL.c_str(), strL.size(), strR.c_str(), strR.size());
-    return cmpStringNaturalLinux(lhs, lhsLen, rhs, rhsLen);
-
-}
-\ No newline at end of file
+warn_static("clean up implementation of these two:")
+//template <> inline bool isWhiteSpace(char c)
+//template <> inline bool isWhiteSpace(wchar_t c)
author	B Stack <bgstack15@gmail.com>	2018-10-16 17:33:51 -0400
committer	B Stack <bgstack15@gmail.com>	2018-10-16 17:33:51 -0400
commit	878a41d3be13da2a654df74f2a35ea8b295c8a13 (patch)
tree	89b2a018482c164bdd8ecac5c76b19a08f420dec /zen/zstring.cpp
parent	Merge branch '10.4' into 'master' (diff)
download	FreeFileSync-878a41d3be13da2a654df74f2a35ea8b295c8a13.tar.gz FreeFileSync-878a41d3be13da2a654df74f2a35ea8b295c8a13.tar.bz2 FreeFileSync-878a41d3be13da2a654df74f2a35ea8b295c8a13.zip