diff options
author | B. Stack <bgstack15@gmail.com> | 2022-09-07 18:55:24 +0000 |
---|---|---|
committer | B. Stack <bgstack15@gmail.com> | 2022-09-07 18:55:24 +0000 |
commit | 1e582c4e99fe08c70c75fef7cd8ed22343253297 (patch) | |
tree | b0047c655d52e4e479ceb73c713414f8d0744c38 /zen | |
parent | Merge branch 'b11.24' into 'master' (diff) | |
parent | add upstream 11.25 (diff) | |
download | FreeFileSync-1e582c4e99fe08c70c75fef7cd8ed22343253297.tar.gz FreeFileSync-1e582c4e99fe08c70c75fef7cd8ed22343253297.tar.bz2 FreeFileSync-1e582c4e99fe08c70c75fef7cd8ed22343253297.zip |
Merge branch 'b11.25' into 'master' [11.25]
add upstream 11.25
See merge request opensource-tracking/FreeFileSync!48
Diffstat (limited to 'zen')
-rw-r--r-- | zen/json.h | 2 | ||||
-rw-r--r-- | zen/process_exec.cpp | 4 | ||||
-rw-r--r-- | zen/serialize.h | 86 | ||||
-rw-r--r-- | zen/string_tools.h | 2 | ||||
-rw-r--r-- | zen/utf.h | 21 | ||||
-rw-r--r-- | zen/zstring.cpp | 70 | ||||
-rw-r--r-- | zen/zstring.h | 5 |
7 files changed, 109 insertions, 81 deletions
@@ -140,7 +140,7 @@ namespace { UtfDecoder<impl::Char16> decoder(utf16Buf.c_str(), utf16Buf.size()); while (std::optional<impl::CodePoint> cp = decoder.getNext()) - impl::codePointToUtf<char>(*cp, [&](char c) { output += c; }); + codePointToUtf<char>(*cp, [&](char c) { output += c; }); utf16Buf.clear(); } }; diff --git a/zen/process_exec.cpp b/zen/process_exec.cpp index df41a627..fb691151 100644 --- a/zen/process_exec.cpp +++ b/zen/process_exec.cpp @@ -19,7 +19,7 @@ using namespace zen; Zstring zen::escapeCommandArg(const Zstring& arg) { -//*INDENT-OFF* +//*INDENT-OFF* if not put exactly here, Astyle will seriously mess this .cpp file up! Zstring output; for (const Zchar c : arg) switch (c) @@ -27,7 +27,7 @@ Zstring zen::escapeCommandArg(const Zstring& arg) case '"': output += "\\\""; break; //Windows: not needed; " cannot be used as file name case '\\': output += "\\\\"; break; //Windows: path separator! => don't escape case '`': output += "\\`"; break; //yes, used in some paths => Windows: no escaping required - default: output += c; break; + default: output += c; break; } //*INDENT-ON* if (contains(output, Zstr(' '))) diff --git a/zen/serialize.h b/zen/serialize.h index b2561808..26202d96 100644 --- a/zen/serialize.h +++ b/zen/serialize.h @@ -8,9 +8,6 @@ #define SERIALIZE_H_839405783574356 #include <functional> -//#include <cstdint> -//#include <stdexcept> -//#include "string_base.h" #include "sys_error.h" //keep header clean from specific stream implementations! (e.g.file_io.h)! used by abstract.h! @@ -19,36 +16,35 @@ namespace zen { /* high-performance unformatted serialization (avoiding wxMemoryOutputStream/wxMemoryInputStream inefficiencies) --------------------------- -|Binary Container Concept| --------------------------- -binary container for data storage: must support "basic" std::vector interface (e.g. 
std::vector<std::byte>, std::string, Zbase<char>) + ---------------------------- + | Binary Container Concept | + ---------------------------- + binary container for data storage: must support "basic" std::vector interface (e.g. std::vector<std::byte>, std::string, Zbase<char>) + --------------------------------- + | Buffered Input Stream Concept | + --------------------------------- + struct BufferedInputStream + { + size_t read(void* buffer, size_t bytesToRead); //throw X; return "bytesToRead" bytes unless end of stream! + + Optional: support stream-copying + -------------------------------- + size_t getBlockSize() const; + const IoCallback& notifyUnbufferedIO + }; + + ---------------------------------- + | Buffered Output Stream Concept | + ---------------------------------- + struct BufferedOutputStream + { + void write(const void* buffer, size_t bytesToWrite); //throw X -------------------------------- -|Buffered Input Stream Concept| -------------------------------- -struct BufferedInputStream -{ - size_t read(void* buffer, size_t bytesToRead); //throw X; return "bytesToRead" bytes unless end of stream! 
- -Optional: support stream-copying --------------------------------- - size_t getBlockSize() const; - const IoCallback& notifyUnbufferedIO -}; - --------------------------------- -|Buffered Output Stream Concept| --------------------------------- -struct BufferedOutputStream -{ - void write(const void* buffer, size_t bytesToWrite); //throw X - -Optional: support stream-copying --------------------------------- - const IoCallback& notifyUnbufferedIO -}; */ + Optional: support stream-copying + -------------------------------- + const IoCallback& notifyUnbufferedIO + }; */ using IoCallback = std::function<void(int64_t bytesDelta)>; //throw X @@ -116,6 +112,7 @@ private: size_t pos_ = 0; }; + template <class BinContainer> struct MemoryStreamOut { @@ -144,9 +141,6 @@ private: - - - //-----------------------implementation------------------------------- template <class BufferedInputStream, class BufferedOutputStream> inline void bufferedStreamCopy(BufferedInputStream& streamIn, //throw X @@ -214,10 +208,13 @@ void writeNumber(BufferedOutputStream& stream, const N& num) template <class C, class BufferedOutputStream> inline void writeContainer(BufferedOutputStream& stream, const C& cont) //don't even consider UTF8 conversions here, we're handling arbitrary binary data! 
{ - const auto len = cont.size(); - writeNumber(stream, static_cast<uint32_t>(len)); - if (len > 0) - writeArray(stream, &cont[0], sizeof(typename C::value_type) * len); //don't use c_str(), but access uniformly via STL interface + const auto size = cont.size(); + + assert(size <= INT32_MAX); + writeNumber(stream, static_cast<int32_t>(size)); //use *signed* integer to help catch data corruption + + if (size > 0) + writeArray(stream, &cont[0], sizeof(typename C::value_type) * size); //don't use c_str(), but access uniformly via STL interface } @@ -244,18 +241,21 @@ N readNumber(BufferedInputStream& stream) //throw SysErrorUnexpectedEos template <class C, class BufferedInputStream> inline C readContainer(BufferedInputStream& stream) //throw SysErrorUnexpectedEos { + const auto size = readNumber<int32_t>(stream); //throw SysErrorUnexpectedEos + if (size < 0) //most likely due to data corruption! + throw SysErrorUnexpectedEos(); + C cont; - auto strLength = readNumber<uint32_t>(stream); //throw SysErrorUnexpectedEos - if (strLength > 0) + if (size > 0) { try { - cont.resize(strLength); //throw std::length_error, std::bad_alloc + cont.resize(size); //throw std::length_error, std::bad_alloc } - catch (std::length_error&) { throw SysErrorUnexpectedEos(); } //most likely this is due to data corruption! + catch (std::length_error&) { throw SysErrorUnexpectedEos(); } //most likely due to data corruption! 
catch ( std::bad_alloc&) { throw SysErrorUnexpectedEos(); } // - readArray(stream, &cont[0], sizeof(typename C::value_type) * strLength); //throw SysErrorUnexpectedEos + readArray(stream, &cont[0], sizeof(typename C::value_type) * size); //throw SysErrorUnexpectedEos } return cont; } diff --git a/zen/string_tools.h b/zen/string_tools.h index cafff3d5..181a3951 100644 --- a/zen/string_tools.h +++ b/zen/string_tools.h @@ -263,7 +263,7 @@ bool equalString(const S& lhs, const T& rhs) template <class S, class T> inline bool equalAsciiNoCase(const S& lhs, const T& rhs) { - assert(isAsciiString(lhs) || isAsciiString(rhs)); + //assert(isAsciiString(lhs) || isAsciiString(rhs)); const size_t lhsLen = strLength(lhs); return lhsLen == strLength(rhs) && impl::strcmpAsciiNoCase(strBegin(lhs), strBegin(rhs), lhsLen) == std::weak_ordering::equivalent; } @@ -222,15 +222,9 @@ private: //---------------------------------------------------------------------------------------------------------------- -template <class Function> inline void codePointToUtf(CodePoint cp, Function writeOutput, std::integral_constant<int, 1>) { codePointToUtf8 (cp, writeOutput); } //UTF8-char -template <class Function> inline void codePointToUtf(CodePoint cp, Function writeOutput, std::integral_constant<int, 2>) { codePointToUtf16(cp, writeOutput); } //Windows: UTF16-wchar_t -template <class Function> inline void codePointToUtf(CodePoint cp, Function writeOutput, std::integral_constant<int, 4>) { writeOutput(cp); } //other OS: UTF32-wchar_t - -template <class CharType, class Function> inline -void codePointToUtf(CodePoint cp, Function writeOutput) //"writeOutput" is a unary function taking a CharType -{ - return codePointToUtf(cp, writeOutput, std::integral_constant<int, sizeof(CharType)>()); -} +template <class Function> inline void codePointToUtfImpl(CodePoint cp, Function writeOutput, std::integral_constant<int, 1>) { codePointToUtf8 (cp, writeOutput); } //UTF8-char +template <class Function> inline void 
codePointToUtfImpl(CodePoint cp, Function writeOutput, std::integral_constant<int, 2>) { codePointToUtf16(cp, writeOutput); } //Windows: UTF16-wchar_t +template <class Function> inline void codePointToUtfImpl(CodePoint cp, Function writeOutput, std::integral_constant<int, 4>) { writeOutput(cp); } //other OS: UTF32-wchar_t //---------------------------------------------------------------------------------------------------------------- @@ -277,9 +271,18 @@ private: }; } + template <class CharType> using UtfDecoder = impl::UtfDecoderImpl<CharType, sizeof(CharType)>; + +template <class CharType, class Function> inline +void codePointToUtf(impl::CodePoint cp, Function writeOutput) //"writeOutput" is a unary function taking a CharType +{ + return impl::codePointToUtfImpl(cp, writeOutput, std::integral_constant<int, sizeof(CharType)>()); +} + + //------------------------------------------------------------------------------------------- template <class UtfString> inline diff --git a/zen/zstring.cpp b/zen/zstring.cpp index 1e29e461..3f5328f7 100644 --- a/zen/zstring.cpp +++ b/zen/zstring.cpp @@ -15,7 +15,7 @@ Zstring getUnicodeNormalFormNonAscii(const Zstring& str) { //Example: const char* decomposed = "\x6f\xcc\x81"; // const char* precomposed = "\xc3\xb3"; - assert(!isAsciiString(str)); + assert(!isAsciiString(str)); //includes "not-empty" check assert(str.find(Zchar('\0')) == Zstring::npos); //don't expect embedded nulls! try @@ -51,14 +51,14 @@ Zstring getUpperCaseNonAscii(const Zstring& str) Zstring strNorm = getUnicodeNormalFormNonAscii(str); try { - static_assert(sizeof(impl::CodePoint) == sizeof(gunichar)); Zstring output; output.reserve(strNorm.size()); UtfDecoder<char> decoder(strNorm.c_str(), strNorm.size()); while (const std::optional<impl::CodePoint> cp = decoder.getNext()) - impl::codePointToUtf<char>(::g_unichar_toupper(*cp), [&](char c) { output += c; }); //don't use std::towupper: *incomplete* and locale-dependent! 
+ codePointToUtf<char>(::g_unichar_toupper(*cp), [&](char c) { output += c; }); //don't use std::towupper: *incomplete* and locale-dependent! + static_assert(sizeof(impl::CodePoint) == sizeof(gunichar)); return output; } @@ -89,6 +89,10 @@ namespace { std::weak_ordering compareNoCaseUtf8(const char* lhs, size_t lhsLen, const char* rhs, size_t rhsLen) { + //expect Unicode normalized strings! + assert(std::string(lhs, lhsLen) == getUnicodeNormalForm(std::string(lhs, lhsLen))); + assert(std::string(rhs, rhsLen) == getUnicodeNormalForm(std::string(rhs, rhsLen))); + //- strncasecmp implements ASCII CI-comparsion only! => signature is broken for UTF8-input; toupper() similarly doesn't support Unicode //- wcsncasecmp: https://opensource.apple.com/source/Libc/Libc-763.12/string/wcsncasecmp-fbsd.c // => re-implement comparison based on g_unichar_tolower() to avoid memory allocations @@ -103,12 +107,13 @@ std::weak_ordering compareNoCaseUtf8(const char* lhs, size_t lhsLen, const char* return !cpR <=> !cpL; static_assert(sizeof(gunichar) == sizeof(impl::CodePoint)); + static_assert(std::is_unsigned_v<gunichar>, "unsigned char-comparison is the convention!"); //ordering: "to lower" converts to higher code points than "to upper" const gunichar charL = ::g_unichar_toupper(*cpL); //note: tolower can be ambiguous, so don't use: const gunichar charR = ::g_unichar_toupper(*cpR); //e.g. "Σ" (upper case) can be lower-case "ς" in the end of the word or "σ" in the middle. if (charL != charR) - return makeUnsigned(charL) <=> makeUnsigned(charR); //unsigned char-comparison is the convention! 
+ return charL <=> charR; } } } @@ -206,25 +211,48 @@ std::weak_ordering compareNatural(const Zstring& lhs, const Zstring& rhs) std::weak_ordering compareNoCase(const Zstring& lhs, const Zstring& rhs) { - //fast path: no need for extra memory allocations => ~ 6x speedup - const size_t minSize = std::min(lhs.size(), rhs.size()); + //fast path: no memory allocations => ~ 6x speedup + if (isAsciiString(lhs) && isAsciiString(rhs)) + { + const size_t minSize = std::min(lhs.size(), rhs.size()); + for (size_t i = 0; i < minSize; ++i) + { + //ordering: do NOT call compareAsciiNoCase(), which uses asciiToLower()! + const Zchar lUp = asciiToUpper(lhs[i]); // + const Zchar rUp = asciiToUpper(rhs[i]); //no surprises: emulate getUpperCase() [verified!] + if (lUp != rUp) // + return lUp <=> rUp; // + } + return lhs.size() <=> rhs.size(); + } + //-------------------------------------- + + //can't we instead skip isAsciiString() and compare chars as long as isAsciiChar()? + // => NOPE! e.g. decomposed Unicode! A seemingly single isAsciiChar() might be followed by a combining character!!! + + return getUpperCase(lhs) <=> getUpperCase(rhs); +} + + +bool equalNoCase(const Zstring& lhs, const Zstring& rhs) +{ + //fast-path: no need for extra memory allocations + const bool isAsciiL = isAsciiString(lhs); + const bool isAsciiR = isAsciiString(rhs); + if (isAsciiL != isAsciiR) + return false; - size_t i = 0; - for (; i < minSize; ++i) + if (isAsciiL) { - const Zchar l = lhs[i]; - const Zchar r = rhs[i]; - if (!isAsciiChar(l) || !isAsciiChar(r)) - goto slowPath; //=> let's NOT make assumptions how getUpperCase() compares "ASCII <=> non-ASCII" - - const Zchar lUp = asciiToUpper(l); // - const Zchar rUp = asciiToUpper(r); //no surprises: emulate getUpperCase() [verified!] 
- if (lUp != rUp) // - return lUp <=> rUp; // + if (lhs.size() != rhs.size()) + return false; + + for (size_t i = 0; i < lhs.size(); ++i) + if (asciiToUpper(lhs[i]) != + asciiToUpper(rhs[i])) + return false; + return true; } - return lhs.size() <=> rhs.size(); -slowPath: //-------------------------------------- - return compareNoCaseUtf8(lhs.c_str() + i, lhs.size() - i, - rhs.c_str() + i, rhs.size() - i); + return getUpperCaseNonAscii(lhs) == getUpperCaseNonAscii(rhs); } diff --git a/zen/zstring.h b/zen/zstring.h index 70b9f448..692217c1 100644 --- a/zen/zstring.h +++ b/zen/zstring.h @@ -63,10 +63,7 @@ template<> struct std::hash<ZstringNoCase> { size_t operator()(const ZstringNoCa std::weak_ordering compareNoCase(const Zstring& lhs, const Zstring& rhs); -inline -bool equalNoCase(const Zstring& lhs, const Zstring& rhs) { return compareNoCase(lhs, rhs) == std::weak_ordering::equivalent; } -//note: the "lhs.size() != rhs.size()" short-cut would require two isAsciiString() checks -//=> generally SLOWER than starting comparison directly during first pass and breaking on first difference! +bool equalNoCase(const Zstring& lhs, const Zstring& rhs); //------------------------------------------------------------------------------------------ std::weak_ordering compareNatural(const Zstring& lhs, const Zstring& rhs); |