Merge branch 'b11.25' into 'master' (ref: 11.25)

add upstream 11.25

See merge request opensource-tracking/FreeFileSync!48
author: B. Stack <bgstack15@gmail.com> 2022-09-07 18:55:24 +0000
committer: B. Stack <bgstack15@gmail.com> 2022-09-07 18:55:24 +0000
commit: 1e582c4e99fe08c70c75fef7cd8ed22343253297 (patch)
tree: b0047c655d52e4e479ceb73c713414f8d0744c38 /zen
parent: Merge branch 'b11.24' into 'master' (diff)
parent: add upstream 11.25 (diff)
download: FreeFileSync-1e582c4e99fe08c70c75fef7cd8ed22343253297.tar.gz
FreeFileSync-1e582c4e99fe08c70c75fef7cd8ed22343253297.tar.bz2
FreeFileSync-1e582c4e99fe08c70c75fef7cd8ed22343253297.zip
7 files changed, 109 insertions, 81 deletions
diff --git a/zen/json.h b/zen/json.h
index 3a9d73f3..be2cfbab 100644
--- a/zen/json.h
+++ b/zen/json.h
@@ -140,7 +140,7 @@ namespace
         {
             UtfDecoder<impl::Char16> decoder(utf16Buf.c_str(), utf16Buf.size());
             while (std::optional<impl::CodePoint> cp = decoder.getNext())
-                impl::codePointToUtf<char>(*cp, [&](char c) { output += c; });
+                codePointToUtf<char>(*cp, [&](char c) { output += c; });
             utf16Buf.clear();
         }
     };
diff --git a/zen/process_exec.cpp b/zen/process_exec.cpp
index df41a627..fb691151 100644
--- a/zen/process_exec.cpp
+++ b/zen/process_exec.cpp
@@ -19,7 +19,7 @@ using namespace zen;
 
 Zstring zen::escapeCommandArg(const Zstring& arg)
 {
-//*INDENT-OFF*
+//*INDENT-OFF*    if not put exactly here, Astyle will seriously mess this .cpp file up!
     Zstring output;
     for (const Zchar c : arg)
         switch (c)
@@ -27,7 +27,7 @@ Zstring zen::escapeCommandArg(const Zstring& arg)
             case  '"': output += "\\\""; break; //Windows: not needed; " cannot be used as file name
             case '\\': output += "\\\\"; break; //Windows: path separator! => don't escape
             case '`':  output += "\\`";  break; //yes, used in some paths => Windows: no escaping required
-            default: output += c; break;
+            default:   output += c; break;
         }
 //*INDENT-ON*
     if (contains(output, Zstr(' ')))
diff --git a/zen/serialize.h b/zen/serialize.h
index b2561808..26202d96 100644
--- a/zen/serialize.h
+++ b/zen/serialize.h
@@ -8,9 +8,6 @@
 #define SERIALIZE_H_839405783574356
 
 #include <functional>
-//#include <cstdint>
-//#include <stdexcept>
-//#include "string_base.h"
 #include "sys_error.h"
 //keep header clean from specific stream implementations! (e.g.file_io.h)! used by abstract.h!
 
@@ -19,36 +16,35 @@ namespace zen
 {
 /* high-performance unformatted serialization (avoiding wxMemoryOutputStream/wxMemoryInputStream inefficiencies)
 
---------------------------
-|Binary Container Concept|
---------------------------
-binary container for data storage: must support "basic" std::vector interface (e.g. std::vector<std::byte>, std::string, Zbase<char>)
+    ----------------------------
+    | Binary Container Concept |
+    ----------------------------
+        binary container for data storage: must support "basic" std::vector interface (e.g. std::vector<std::byte>, std::string, Zbase<char>)
 
+    ---------------------------------
+    | Buffered Input Stream Concept |
+    ---------------------------------
+        struct BufferedInputStream
+        {
+            size_t read(void* buffer, size_t bytesToRead); //throw X; return "bytesToRead" bytes unless end of stream!
+
+            Optional: support stream-copying
+            --------------------------------
+            size_t getBlockSize() const;
+            const IoCallback& notifyUnbufferedIO
+        };
+
+    ----------------------------------
+    | Buffered Output Stream Concept |
+    ----------------------------------
+        struct BufferedOutputStream
+        {
+            void write(const void* buffer, size_t bytesToWrite); //throw X
 
--------------------------------
-|Buffered Input Stream Concept|
--------------------------------
-struct BufferedInputStream
-{
-    size_t read(void* buffer, size_t bytesToRead); //throw X; return "bytesToRead" bytes unless end of stream!
-
-Optional: support stream-copying
---------------------------------
-    size_t getBlockSize() const;
-    const IoCallback& notifyUnbufferedIO
-};
-
---------------------------------
-|Buffered Output Stream Concept|
---------------------------------
-struct BufferedOutputStream
-{
-    void write(const void* buffer, size_t bytesToWrite); //throw X
-
-Optional: support stream-copying
---------------------------------
-    const IoCallback& notifyUnbufferedIO
-};                                                                           */
+            Optional: support stream-copying
+            --------------------------------
+            const IoCallback& notifyUnbufferedIO
+        };                                                                           */
 using IoCallback = std::function<void(int64_t bytesDelta)>; //throw X
 
 
@@ -116,6 +112,7 @@ private:
     size_t pos_ = 0;
 };
 
+
 template <class BinContainer>
 struct MemoryStreamOut
 {
@@ -144,9 +141,6 @@ private:
 
 
 
-
-
-
 //-----------------------implementation-------------------------------
 template <class BufferedInputStream, class BufferedOutputStream> inline
 void bufferedStreamCopy(BufferedInputStream& streamIn,   //throw X
@@ -214,10 +208,13 @@ void writeNumber(BufferedOutputStream& stream, const N& num)
 template <class C, class BufferedOutputStream> inline
 void writeContainer(BufferedOutputStream& stream, const C& cont) //don't even consider UTF8 conversions here, we're handling arbitrary binary data!
 {
-    const auto len = cont.size();
-    writeNumber(stream, static_cast<uint32_t>(len));
-    if (len > 0)
-        writeArray(stream, &cont[0], sizeof(typename C::value_type) * len); //don't use c_str(), but access uniformly via STL interface
+    const auto size = cont.size();
+
+    assert(size <= INT32_MAX);
+    writeNumber(stream, static_cast<int32_t>(size)); //use *signed* integer to help catch data corruption
+
+    if (size > 0)
+        writeArray(stream, &cont[0], sizeof(typename C::value_type) * size); //don't use c_str(), but access uniformly via STL interface
 }
 
 
@@ -244,18 +241,21 @@ N readNumber(BufferedInputStream& stream) //throw SysErrorUnexpectedEos
 template <class C, class BufferedInputStream> inline
 C readContainer(BufferedInputStream& stream) //throw SysErrorUnexpectedEos
 {
+    const auto size = readNumber<int32_t>(stream); //throw SysErrorUnexpectedEos
+    if (size < 0) //most likely due to data corruption!
+        throw SysErrorUnexpectedEos();
+
     C cont;
-    auto strLength = readNumber<uint32_t>(stream); //throw SysErrorUnexpectedEos
-    if (strLength > 0)
+    if (size > 0)
     {
         try
         {
-            cont.resize(strLength); //throw std::length_error, std::bad_alloc
+            cont.resize(size); //throw std::length_error, std::bad_alloc
         }
-        catch (std::length_error&) { throw SysErrorUnexpectedEos(); } //most likely this is due to data corruption!
+        catch (std::length_error&) { throw SysErrorUnexpectedEos(); } //most likely due to data corruption!
         catch (   std::bad_alloc&) { throw SysErrorUnexpectedEos(); } //
 
-        readArray(stream, &cont[0], sizeof(typename C::value_type) * strLength); //throw SysErrorUnexpectedEos
+        readArray(stream, &cont[0], sizeof(typename C::value_type) * size); //throw SysErrorUnexpectedEos
     }
     return cont;
 }
diff --git a/zen/string_tools.h b/zen/string_tools.h
index cafff3d5..181a3951 100644
--- a/zen/string_tools.h
+++ b/zen/string_tools.h
@@ -263,7 +263,7 @@ bool equalString(const S& lhs, const T& rhs)
 template <class S, class T> inline
 bool equalAsciiNoCase(const S& lhs, const T& rhs)
 {
-    assert(isAsciiString(lhs) || isAsciiString(rhs));
+    //assert(isAsciiString(lhs) || isAsciiString(rhs));
     const size_t lhsLen = strLength(lhs);
     return lhsLen == strLength(rhs) && impl::strcmpAsciiNoCase(strBegin(lhs), strBegin(rhs), lhsLen) == std::weak_ordering::equivalent;
 }
diff --git a/zen/utf.h b/zen/utf.h
index ca231602..56b1ff55 100644
--- a/zen/utf.h
+++ b/zen/utf.h
@@ -222,15 +222,9 @@ private:
 
 //----------------------------------------------------------------------------------------------------------------
 
-template <class Function> inline void codePointToUtf(CodePoint cp, Function writeOutput, std::integral_constant<int, 1>) { codePointToUtf8 (cp, writeOutput); } //UTF8-char
-template <class Function> inline void codePointToUtf(CodePoint cp, Function writeOutput, std::integral_constant<int, 2>) { codePointToUtf16(cp, writeOutput); } //Windows: UTF16-wchar_t
-template <class Function> inline void codePointToUtf(CodePoint cp, Function writeOutput, std::integral_constant<int, 4>) { writeOutput(cp); } //other OS: UTF32-wchar_t
-
-template <class CharType, class Function> inline
-void codePointToUtf(CodePoint cp, Function writeOutput) //"writeOutput" is a unary function taking a CharType
-{
-    return codePointToUtf(cp, writeOutput, std::integral_constant<int, sizeof(CharType)>());
-}
+template <class Function> inline void codePointToUtfImpl(CodePoint cp, Function writeOutput, std::integral_constant<int, 1>) { codePointToUtf8 (cp, writeOutput); } //UTF8-char
+template <class Function> inline void codePointToUtfImpl(CodePoint cp, Function writeOutput, std::integral_constant<int, 2>) { codePointToUtf16(cp, writeOutput); } //Windows: UTF16-wchar_t
+template <class Function> inline void codePointToUtfImpl(CodePoint cp, Function writeOutput, std::integral_constant<int, 4>) { writeOutput(cp); } //other OS: UTF32-wchar_t
 
 //----------------------------------------------------------------------------------------------------------------
 
@@ -277,9 +271,18 @@ private:
 };
 }
 
+
 template <class CharType>
 using UtfDecoder = impl::UtfDecoderImpl<CharType, sizeof(CharType)>;
 
+
+template <class CharType, class Function> inline
+void codePointToUtf(impl::CodePoint cp, Function writeOutput) //"writeOutput" is a unary function taking a CharType
+{
+    return impl::codePointToUtfImpl(cp, writeOutput, std::integral_constant<int, sizeof(CharType)>());
+}
+
+
 //-------------------------------------------------------------------------------------------
 
 template <class UtfString> inline
diff --git a/zen/zstring.cpp b/zen/zstring.cpp
index 1e29e461..3f5328f7 100644
--- a/zen/zstring.cpp
+++ b/zen/zstring.cpp
@@ -15,7 +15,7 @@ Zstring getUnicodeNormalFormNonAscii(const Zstring& str)
 {
     //Example: const char* decomposed  = "\x6f\xcc\x81";
     //         const char* precomposed = "\xc3\xb3";
-    assert(!isAsciiString(str));
+    assert(!isAsciiString(str)); //includes "not-empty" check
     assert(str.find(Zchar('\0')) == Zstring::npos); //don't expect embedded nulls!
 
     try
@@ -51,14 +51,14 @@ Zstring getUpperCaseNonAscii(const Zstring& str)
     Zstring strNorm = getUnicodeNormalFormNonAscii(str);
     try
     {
-        static_assert(sizeof(impl::CodePoint) == sizeof(gunichar));
         Zstring output;
         output.reserve(strNorm.size());
 
         UtfDecoder<char> decoder(strNorm.c_str(), strNorm.size());
         while (const std::optional<impl::CodePoint> cp = decoder.getNext())
-            impl::codePointToUtf<char>(::g_unichar_toupper(*cp), [&](char c) { output += c; }); //don't use std::towupper: *incomplete* and locale-dependent!
+            codePointToUtf<char>(::g_unichar_toupper(*cp), [&](char c) { output += c; }); //don't use std::towupper: *incomplete* and locale-dependent!
 
+        static_assert(sizeof(impl::CodePoint) == sizeof(gunichar));
         return output;
 
     }
@@ -89,6 +89,10 @@ namespace
 {
 std::weak_ordering compareNoCaseUtf8(const char* lhs, size_t lhsLen, const char* rhs, size_t rhsLen)
 {
+    //expect Unicode normalized strings!
+    assert(std::string(lhs, lhsLen) == getUnicodeNormalForm(std::string(lhs, lhsLen)));
+    assert(std::string(rhs, rhsLen) == getUnicodeNormalForm(std::string(rhs, rhsLen)));
+
     //- strncasecmp implements ASCII CI-comparsion only! => signature is broken for UTF8-input; toupper() similarly doesn't support Unicode
     //- wcsncasecmp: https://opensource.apple.com/source/Libc/Libc-763.12/string/wcsncasecmp-fbsd.c
     // => re-implement comparison based on g_unichar_tolower() to avoid memory allocations
@@ -103,12 +107,13 @@ std::weak_ordering compareNoCaseUtf8(const char* lhs, size_t lhsLen, const char*
             return !cpR <=> !cpL;
 
         static_assert(sizeof(gunichar) == sizeof(impl::CodePoint));
+        static_assert(std::is_unsigned_v<gunichar>, "unsigned char-comparison is the convention!");
 
         //ordering: "to lower" converts to higher code points than "to upper"
         const gunichar charL = ::g_unichar_toupper(*cpL); //note: tolower can be ambiguous, so don't use:
         const gunichar charR = ::g_unichar_toupper(*cpR); //e.g. "Σ" (upper case) can be lower-case "ς" in the end of the word or "σ" in the middle.
         if (charL != charR)
-            return makeUnsigned(charL) <=> makeUnsigned(charR); //unsigned char-comparison is the convention!
+            return charL <=> charR;
     }
 }
 }
@@ -206,25 +211,48 @@ std::weak_ordering compareNatural(const Zstring& lhs, const Zstring& rhs)
 
 std::weak_ordering compareNoCase(const Zstring& lhs, const Zstring& rhs)
 {
-    //fast path: no need for extra memory allocations => ~ 6x speedup
-    const size_t minSize = std::min(lhs.size(), rhs.size());
+    //fast path: no memory allocations => ~ 6x speedup
+    if (isAsciiString(lhs) && isAsciiString(rhs))
+    {
+        const size_t minSize = std::min(lhs.size(), rhs.size());
+        for (size_t i = 0; i < minSize; ++i)
+        {
+            //ordering: do NOT call compareAsciiNoCase(), which uses asciiToLower()!
+            const Zchar lUp = asciiToUpper(lhs[i]); //
+            const Zchar rUp = asciiToUpper(rhs[i]); //no surprises: emulate getUpperCase() [verified!]
+            if (lUp != rUp)                         //
+                return lUp <=> rUp;                 //
+        }
+        return lhs.size() <=> rhs.size();
+    }
+    //--------------------------------------
+
+    //can't we instead skip isAsciiString() and compare chars as long as isAsciiChar()?
+    // => NOPE! e.g. decomposed Unicode! A seemingly single isAsciiChar() might be followed by a combining character!!!
+
+    return getUpperCase(lhs) <=> getUpperCase(rhs);
+}
+
+
+bool equalNoCase(const Zstring& lhs, const Zstring& rhs)
+{
+    //fast-path: no need for extra memory allocations
+    const bool isAsciiL = isAsciiString(lhs);
+    const bool isAsciiR = isAsciiString(rhs);
+    if (isAsciiL != isAsciiR)
+        return false;
 
-    size_t i = 0;
-    for (; i < minSize; ++i)
+    if (isAsciiL)
     {
-        const Zchar l = lhs[i];
-        const Zchar r = rhs[i];
-        if (!isAsciiChar(l) || !isAsciiChar(r))
-            goto slowPath; //=> let's NOT make assumptions how getUpperCase() compares "ASCII <=> non-ASCII"
-
-        const Zchar lUp = asciiToUpper(l); //
-        const Zchar rUp = asciiToUpper(r); //no surprises: emulate getUpperCase() [verified!]
-        if (lUp != rUp)                    //
-            return lUp <=> rUp;            //
+        if (lhs.size() != rhs.size())
+            return false;
+
+        for (size_t i = 0; i < lhs.size(); ++i)
+            if (asciiToUpper(lhs[i]) !=
+                asciiToUpper(rhs[i]))
+                return false;
+        return true;
     }
-    return lhs.size() <=> rhs.size();
-slowPath: //--------------------------------------
 
-    return compareNoCaseUtf8(lhs.c_str() + i, lhs.size() - i, 
-                             rhs.c_str() + i, rhs.size() - i);
+    return getUpperCaseNonAscii(lhs) == getUpperCaseNonAscii(rhs);
 }
diff --git a/zen/zstring.h b/zen/zstring.h
index 70b9f448..692217c1 100644
--- a/zen/zstring.h
+++ b/zen/zstring.h
@@ -63,10 +63,7 @@ template<> struct std::hash<ZstringNoCase> { size_t operator()(const ZstringNoCa
 
 std::weak_ordering compareNoCase(const Zstring& lhs, const Zstring& rhs);
 
-inline
-bool equalNoCase(const Zstring& lhs, const Zstring& rhs) { return compareNoCase(lhs, rhs) == std::weak_ordering::equivalent;  }
-//note: the "lhs.size() != rhs.size()" short-cut would require two isAsciiString() checks
-//=> generally SLOWER than starting comparison directly during first pass and breaking on first difference!
+bool equalNoCase(const Zstring& lhs, const Zstring& rhs);
 
 //------------------------------------------------------------------------------------------
 std::weak_ordering compareNatural(const Zstring& lhs, const Zstring& rhs);
author	B. Stack <bgstack15@gmail.com>	2022-09-07 18:55:24 +0000
committer	B. Stack <bgstack15@gmail.com>	2022-09-07 18:55:24 +0000
commit	1e582c4e99fe08c70c75fef7cd8ed22343253297 (patch)
tree	b0047c655d52e4e479ceb73c713414f8d0744c38 /zen
parent	Merge branch 'b11.24' into 'master' (diff)
parent	add upstream 11.25 (diff)
download	FreeFileSync-1e582c4e99fe08c70c75fef7cd8ed22343253297.tar.gz FreeFileSync-1e582c4e99fe08c70c75fef7cd8ed22343253297.tar.bz2 FreeFileSync-1e582c4e99fe08c70c75fef7cd8ed22343253297.zip