summary | refs | log | tree | commit | diff
path: root/zen
diff options
context:
space:
mode:
Diffstat (limited to 'zen')
-rwxr-xr-x zen/file_access.cpp 14
-rwxr-xr-x zen/file_access.h 6
-rwxr-xr-x zen/file_error.h 2
-rwxr-xr-x zen/file_io.cpp 5
-rwxr-xr-x zen/file_io.h 2
-rwxr-xr-x zen/format_unit.cpp 4
-rwxr-xr-x zen/globals.h 10
-rwxr-xr-x zen/recycler.cpp 2
-rwxr-xr-x zen/scope_guard.h 2
-rwxr-xr-x zen/serialize.h 1
-rwxr-xr-x zen/shell_execute.h 2
-rwxr-xr-x zen/string_base.h 15
-rwxr-xr-x zen/string_tools.h 196
-rwxr-xr-x zen/sys_error.h 2
-rwxr-xr-x zen/thread.h 18
-rwxr-xr-x zen/utf.h 421
-rwxr-xr-x zen/zstring.cpp 117
-rwxr-xr-x zen/zstring.h 111
18 files changed, 480 insertions, 450 deletions
diff --git a/zen/file_access.cpp b/zen/file_access.cpp
index 61a003bb..71d00386 100755
--- a/zen/file_access.cpp
+++ b/zen/file_access.cpp
@@ -27,7 +27,7 @@
using namespace zen;
-Opt<PathComponents> zen::getPathComponents(const Zstring& itemPath)
+Opt<PathComponents> zen::parsePathComponents(const Zstring& itemPath)
{
if (startsWith(itemPath, "/"))
{
@@ -44,7 +44,7 @@ Opt<PathComponents> zen::getPathComponents(const Zstring& itemPath)
Opt<Zstring> zen::getParentFolderPath(const Zstring& itemPath)
{
- if (const Opt<PathComponents> comp = getPathComponents(itemPath))
+ if (const Opt<PathComponents> comp = parsePathComponents(itemPath))
{
if (comp->relPath.empty())
return NoValue();
@@ -73,7 +73,7 @@ ItemType zen::getItemType(const Zstring& itemPath) //throw FileError
}
-PathDetails zen::getPathDetails(const Zstring& itemPath) //throw FileError
+PathStatus zen::getPathStatus(const Zstring& itemPath) //throw FileError
{
const Opt<Zstring> parentPath = getParentFolderPath(itemPath);
try
@@ -91,7 +91,7 @@ PathDetails zen::getPathDetails(const Zstring& itemPath) //throw FileError
const Zstring itemName = afterLast(itemPath, FILE_NAME_SEPARATOR, IF_MISSING_RETURN_ALL);
assert(!itemName.empty());
- PathDetails pd = getPathDetails(*parentPath); //throw FileError
+ PathStatus pd = getPathStatus(*parentPath); //throw FileError
if (!pd.relPath.empty())
{
pd.relPath.push_back(itemName);
@@ -115,7 +115,7 @@ PathDetails zen::getPathDetails(const Zstring& itemPath) //throw FileError
Opt<ItemType> zen::getItemTypeIfExists(const Zstring& itemPath) //throw FileError
{
- const PathDetails pd = getPathDetails(itemPath); //throw FileError
+ const PathStatus pd = getPathStatus(itemPath); //throw FileError
if (pd.relPath.empty())
return pd.existingType;
return NoValue();
@@ -502,8 +502,8 @@ void zen::createDirectoryIfMissingRecursion(const Zstring& dirPath) //throw File
}
catch (FileError&)
{
- Opt<PathDetails> pd;
- try { pd = getPathDetails(dirPath); /*throw FileError*/ }
+ Opt<PathStatus> pd;
+ try { pd = getPathStatus(dirPath); /*throw FileError*/ }
catch (FileError&) {} //previous exception is more relevant
if (pd && pd->existingType != ItemType::FILE)
diff --git a/zen/file_access.h b/zen/file_access.h
index c3a52f8a..a6b221e5 100755
--- a/zen/file_access.h
+++ b/zen/file_access.h
@@ -22,7 +22,7 @@ struct PathComponents
Zstring rootPath; //itemPath = rootPath + (FILE_NAME_SEPARATOR?) + relPath
Zstring relPath; //
};
-Opt<PathComponents> getPathComponents(const Zstring& itemPath); //no value on failure
+Opt<PathComponents> parsePathComponents(const Zstring& itemPath); //no value on failure
Opt<Zstring> getParentFolderPath(const Zstring& itemPath);
@@ -43,13 +43,13 @@ ItemType getItemType (const Zstring& itemPath); //throw FileError
//execute potentially SLOW folder traversal but distinguish error/not existing
Opt<ItemType> getItemTypeIfExists(const Zstring& itemPath); //throw FileError
-struct PathDetails
+struct PathStatus
{
ItemType existingType;
Zstring existingPath; //itemPath =: existingPath + relPath
std::vector<Zstring> relPath; //
};
-PathDetails getPathDetails(const Zstring& itemPath); //throw FileError
+PathStatus getPathStatus(const Zstring& itemPath); //throw FileError
enum class ProcSymlink
{
diff --git a/zen/file_error.h b/zen/file_error.h
index 87f9525b..949c644f 100755
--- a/zen/file_error.h
+++ b/zen/file_error.h
@@ -46,7 +46,7 @@ DEFINE_NEW_FILE_ERROR(ErrorDifferentVolume);
//----------- facilitate usage of std::wstring for error messages --------------------
inline std::wstring fmtPath(const std::wstring& displayPath) { return L'\"' + displayPath + L'\"'; }
-inline std::wstring fmtPath(const Zstring& displayPath) { return fmtPath(utfCvrtTo<std::wstring>(displayPath)); }
+inline std::wstring fmtPath(const Zstring& displayPath) { return fmtPath(utfTo<std::wstring>(displayPath)); }
inline std::wstring fmtPath(const wchar_t* displayPath) { return fmtPath(std::wstring(displayPath)); } //resolve overload ambiguity
}
diff --git a/zen/file_io.cpp b/zen/file_io.cpp
index b4affd37..0c5ff490 100755
--- a/zen/file_io.cpp
+++ b/zen/file_io.cpp
@@ -140,7 +140,7 @@ size_t FileInput::read(void* buffer, size_t bytesToRead) //throw FileError, X; r
if (notifyUnbufferedIO_) notifyUnbufferedIO_(bytesRead); //throw X
if (bytesRead == 0) //end of file
- bytesToRead = memBuf_.size();
+ bytesToRead = std::min(bytesToRead, memBuf_.size());
}
std::copy(memBuf_.begin(), memBuf_.begin() + bytesToRead, static_cast<char*>(buffer));
@@ -185,9 +185,10 @@ FileOutput::FileOutput(const Zstring& filePath, AccessFlag access, const IOCallb
FileOutput::~FileOutput()
{
+ notifyUnbufferedIO_ = nullptr; //no call-backs during destruction!!!
try
{
- flushBuffers(); //throw FileError, X
+ flushBuffers(); //throw FileError, (X)
}
catch (...) { assert(false); }
}
diff --git a/zen/file_io.h b/zen/file_io.h
index 8a5e0f7f..827abd9e 100755
--- a/zen/file_io.h
+++ b/zen/file_io.h
@@ -90,7 +90,7 @@ private:
size_t tryWrite(const void* buffer, size_t bytesToWrite); //throw FileError; may return short! CONTRACT: bytesToWrite > 0
std::vector<char> memBuf_;
- const IOCallback notifyUnbufferedIO_; //throw X
+ IOCallback notifyUnbufferedIO_; //throw X
};
//-----------------------------------------------------------------------------------------------
diff --git a/zen/format_unit.cpp b/zen/format_unit.cpp
index cf17c8d4..a2208b3e 100755
--- a/zen/format_unit.cpp
+++ b/zen/format_unit.cpp
@@ -5,7 +5,7 @@
// *****************************************************************************
#include "format_unit.h"
-#include <cwchar> //swprintf
+//#include <cwchar> //swprintf
#include <ctime>
#include <cstdio>
#include "basic_math.h"
@@ -168,7 +168,7 @@ std::wstring zen::ffs_Impl::includeNumberSeparator(const std::wstring& number)
//::setlocale (LC_ALL, ""); -> implicitly called by wxLocale
const lconv* localInfo = ::localeconv(); //always bound according to doc
- const std::wstring& thousandSep = utfCvrtTo<std::wstring>(localInfo->thousands_sep);
+ const std::wstring& thousandSep = utfTo<std::wstring>(localInfo->thousands_sep);
// THOUSANDS_SEPARATOR = std::use_facet<std::numpunct<wchar_t>>(std::locale("")).thousands_sep(); - why not working?
// DECIMAL_POINT = std::use_facet<std::numpunct<wchar_t>>(std::locale("")).decimal_point();
diff --git a/zen/globals.h b/zen/globals.h
index a1fd2764..b6c5dd28 100755
--- a/zen/globals.h
+++ b/zen/globals.h
@@ -18,7 +18,11 @@ template <class T>
class Global
{
public:
- Global() { static_assert(std::is_trivially_destructible<Pod>::value, "this memory needs to live forever"); }
+ Global()
+ {
+ static_assert(std::is_trivially_destructible<Pod>::value, "this memory needs to live forever");
+ assert(!pod.inst && !pod.spinLock); //we depend on static zero-initialization!
+ }
explicit Global(std::unique_ptr<T>&& newInst) { set(std::move(newInst)); }
~Global() { set(nullptr); }
@@ -50,9 +54,9 @@ private:
//=> use trivially-destructible POD only!!!
struct Pod
{
- std::shared_ptr<T>* inst = nullptr;
+ std::shared_ptr<T>* inst; // = nullptr;
+ std::atomic<bool> spinLock; // { false }; rely entirely on static zero-initialization! => avoid potential contention with worker thread during Global<> construction!
//serialize access; can't use std::mutex: has non-trival destructor
- std::atomic<bool> spinLock { false };
} pod;
};
diff --git a/zen/recycler.cpp b/zen/recycler.cpp
index 02ea026a..0c71bf3b 100755
--- a/zen/recycler.cpp
+++ b/zen/recycler.cpp
@@ -45,7 +45,7 @@ bool zen::recycleOrDeleteIfExists(const Zstring& itemPath) //throw FileError
return true;
}
- throw FileError(errorMsg, replaceCpy<std::wstring>(L"Glib Error Code %x:", L"%x", numberTo<std::wstring>(error->code)) + L" " + utfCvrtTo<std::wstring>(error->message));
+ throw FileError(errorMsg, formatSystemError(L"g_file_trash", L"Glib Error Code " + numberTo<std::wstring>(error->code), utfTo<std::wstring>(error->message)));
//g_quark_to_string(error->domain)
}
return true;
diff --git a/zen/scope_guard.h b/zen/scope_guard.h
index 09a7fbdb..62552f7b 100755
--- a/zen/scope_guard.h
+++ b/zen/scope_guard.h
@@ -13,7 +13,7 @@
//std::uncaught_exceptions() currently unsupported on GCC and Clang => clean up ASAP
- static_assert(__GNUC__ < 6 || (__GNUC__ == 6 && (__GNUC_MINOR__ < 2 || (__GNUC_MINOR__ == 2 && __GNUC_PATCHLEVEL__ <= 1))), "check std::uncaught_exceptions support");
+ static_assert(__GNUC__ < 6 || (__GNUC__ == 6 && (__GNUC_MINOR__ < 3 || (__GNUC_MINOR__ == 3 && __GNUC_PATCHLEVEL__ <= 1))), "check std::uncaught_exceptions support");
namespace __cxxabiv1
{
diff --git a/zen/serialize.h b/zen/serialize.h
index bb2f7a45..c8dfb96d 100755
--- a/zen/serialize.h
+++ b/zen/serialize.h
@@ -241,6 +241,7 @@ template <class BufferedInputStream> inline
void readArray(BufferedInputStream& stream, void* buffer, size_t len) //throw UnexpectedEndOfStreamError
{
const size_t bytesRead = stream.read(buffer, len);
+ assert(bytesRead <= len); //buffer overflow otherwise not always detected!
if (bytesRead < len)
throw UnexpectedEndOfStreamError();
}
diff --git a/zen/shell_execute.h b/zen/shell_execute.h
index 9ba0aef0..5e4ddf1a 100755
--- a/zen/shell_execute.h
+++ b/zen/shell_execute.h
@@ -41,7 +41,7 @@ void shellExecute(const Zstring& command, ExecutionType type) //throw FileError
//Posix::system - execute a shell command
int rv = ::system(command.c_str()); //do NOT use std::system as its documentation says nothing about "WEXITSTATUS(rv)", ect...
if (rv == -1 || WEXITSTATUS(rv) == 127) //http://linux.die.net/man/3/system "In case /bin/sh could not be executed, the exit status will be that of a command that does exit(127)"
- throw FileError(_("Incorrect command line:") + L"\n" + utfCvrtTo<std::wstring>(command));
+ throw FileError(_("Incorrect command line:") + L"\n" + utfTo<std::wstring>(command));
}
else
runAsync([=] { int rv = ::system(command.c_str()); (void)rv; });
diff --git a/zen/string_base.h b/zen/string_base.h
index 3afa66c6..b5e45c0e 100755
--- a/zen/string_base.h
+++ b/zen/string_base.h
@@ -264,8 +264,8 @@ public:
void push_back(Char val) { operator+=(val); } //STL access
void pop_back();
- Zbase& operator=(const Zbase& str);
Zbase& operator=(Zbase&& tmp) noexcept;
+ Zbase& operator=(const Zbase& str);
Zbase& operator=(const Char* str) { return assign(str, strLength(str)); }
Zbase& operator=(Char ch) { return assign(&ch, 1); }
Zbase& operator+=(const Zbase& str) { return append(str.c_str(), str.length()); }
@@ -573,11 +573,14 @@ template <class InputIterator> inline
Zbase<Char, SP>& Zbase<Char, SP>::append(InputIterator first, InputIterator last)
{
const size_t len = std::distance(first, last);
- const size_t thisLen = length();
- reserve(thisLen + len); //make unshared and check capacity
-
- *std::copy(first, last, rawStr_ + thisLen) = 0;
- this->setLength(rawStr_, thisLen + len);
+ if (len > 0) //avoid making this string unshared for no reason
+ {
+ const size_t thisLen = length();
+ reserve(thisLen + len); //make unshared and check capacity
+
+ *std::copy(first, last, rawStr_ + thisLen) = 0;
+ this->setLength(rawStr_, thisLen + len);
+ }
return *this;
}
diff --git a/zen/string_tools.h b/zen/string_tools.h
index 5a82e0ed..236f8df6 100755
--- a/zen/string_tools.h
+++ b/zen/string_tools.h
@@ -25,11 +25,31 @@ namespace zen
template <class Char> bool isWhiteSpace(Char ch);
template <class Char> bool isDigit (Char ch); //not exactly the same as "std::isdigit" -> we consider '0'-'9' only!
template <class Char> bool isHexDigit (Char ch);
-template <class Char> bool isAlpha (Char ch);
+template <class Char> bool isAsciiAlpha(Char ch);
-template <class S, class T> bool startsWith(const S& str, const T& prefix); //
-template <class S, class T> bool endsWith (const S& str, const T& postfix); //both S and T can be strings or char/wchar_t arrays or simple char/wchar_t
-template <class S, class T> bool contains (const S& str, const T& term); //
+//case-sensitive comparison (compile-time correctness: use different number of arguments as STL comparison predicates!)
+struct CmpBinary { template <class Char> int operator()(const Char* lhs, size_t lhsLen, const Char* rhs, size_t rhsLen) const; };
+
+//basic case-insensitive comparison (considering A-Z only!)
+struct CmpAsciiNoCase { template <class Char> int operator()(const Char* lhs, size_t lhsLen, const Char* rhs, size_t rhsLen) const; };
+
+struct LessAsciiNoCase
+{
+ template <class S> //don't support heterogenous input! => use as container predicate only!
+ bool operator()(const S& lhs, const S& rhs) const { return CmpAsciiNoCase()(strBegin(lhs), strLength(lhs), strBegin(rhs), strLength(rhs)) < 0; }
+};
+
+//both S and T can be strings or char/wchar_t arrays or simple char/wchar_t
+template <class S, class T> bool contains(const S& str, const T& term);
+
+template <class S, class T> bool startsWith(const S& str, const T& prefix);
+template <class S, class T, class Function> bool startsWith(const S& str, const T& prefix, Function cmpStringFun);
+
+template <class S, class T> bool endsWith (const S& str, const T& postfix);
+template <class S, class T, class Function> bool endsWith (const S& str, const T& postfix, Function cmpStringFun);
+
+template <class S, class T> bool strEqual(const S& lhs, const T& rhs);
+template <class S, class T, class Function> bool strEqual(const S& lhs, const T& rhs, Function cmpStringFun);
enum FailureReturnVal
{
@@ -42,16 +62,23 @@ template <class S, class T> S beforeLast (const S& str, const T& term, FailureRe
template <class S, class T> S afterFirst (const S& str, const T& term, FailureReturnVal rv);
template <class S, class T> S beforeFirst(const S& str, const T& term, FailureReturnVal rv);
-template <class S, class T> std::vector<S> split(const S& str, const T& delimiter);
-template <class S> S trimCpy(S str, bool fromLeft = true, bool fromRight = true);
-template <class S> void trim (S& str, bool fromLeft = true, bool fromRight = true);
+enum class SplitType
+{
+ ALLOW_EMPTY,
+ SKIP_EMPTY
+};
+template <class S, class T> std::vector<S> split(const S& str, const T& delimiter, SplitType st);
+
+template <class S> S trimCpy(S str, bool fromLeft = true, bool fromRight = true);
+template <class S> void trim (S& str, bool fromLeft = true, bool fromRight = true);
template <class S, class Function> void trim(S& str, bool fromLeft, bool fromRight, Function trimThisChar);
+
template <class S, class T, class U> void replace ( S& str, const T& oldTerm, const U& newTerm, bool replaceAll = true);
template <class S, class T, class U> S replaceCpy(const S& str, const T& oldTerm, const U& newTerm, bool replaceAll = true);
//high-performance conversion between numbers and strings
template <class S, class Num> S numberTo(const Num& number);
-template <class Num, class S > Num stringTo(const S& str);
+template <class Num, class S> Num stringTo(const S& str);
std::pair<char, char> hexify (unsigned char c, bool upperCase = true);
char unhexify(char high, char low);
@@ -61,9 +88,6 @@ template <class S, class T, class Num> S printNumber(const T& format, const Num&
//string to string conversion: converts string-like type into char-compatible target string class
template <class T, class S> T copyStringTo(S&& str);
-//case-sensitive comparison
-template <class S, class T> int cmpString(const S& lhs, const T& rhs);
-
@@ -99,7 +123,7 @@ bool isWhiteSpace(wchar_t ch)
template <class Char> inline
-bool isDigit(Char ch) //similar to implmenetation of std::::isdigit()!
+bool isDigit(Char ch) //similar to implmenetation of std::isdigit()!
{
static_assert(IsSameType<Char, char>::value || IsSameType<Char, wchar_t>::value, "");
return static_cast<Char>('0') <= ch && ch <= static_cast<Char>('9');
@@ -116,40 +140,52 @@ bool isHexDigit(Char c)
}
-template <> bool isAlpha(char ch) = delete; //probably not a good idea with UTF-8 anyway...
-
-template <> inline bool isAlpha(wchar_t ch) { return std::iswalpha(ch) != 0; }
+template <class Char> inline
+bool isAsciiAlpha(Char c)
+{
+ static_assert(IsSameType<Char, char>::value || IsSameType<Char, wchar_t>::value, "");
+ return (static_cast<Char>('A') <= c && c <= static_cast<Char>('Z')) ||
+ (static_cast<Char>('a') <= c && c <= static_cast<Char>('z'));
+}
-template <class S, class T> inline
-bool startsWith(const S& str, const T& prefix)
+template <class S, class T, class Function> inline
+bool startsWith(const S& str, const T& prefix, Function cmpStringFun)
{
- static_assert(IsSameType<typename GetCharType<S>::Type, typename GetCharType<T>::Type>::value, "");
const size_t pfLen = strLength(prefix);
if (strLength(str) < pfLen)
return false;
- const auto* const cmpFirst = strBegin(str);
- return std::equal(cmpFirst, cmpFirst + pfLen,
- strBegin(prefix));
+ return cmpStringFun(strBegin(str), pfLen,
+ strBegin(prefix), pfLen) == 0;
}
-template <class S, class T> inline
-bool endsWith(const S& str, const T& postfix)
+template <class S, class T, class Function> inline
+bool endsWith(const S& str, const T& postfix, Function cmpStringFun)
{
- static_assert(IsSameType<typename GetCharType<S>::Type, typename GetCharType<T>::Type>::value, "");
const size_t strLen = strLength(str);
const size_t pfLen = strLength(postfix);
if (strLen < pfLen)
return false;
- const auto* const cmpFirst = strBegin(str) + strLen - pfLen;
- return std::equal(cmpFirst, cmpFirst + pfLen,
- strBegin(postfix));
+ return cmpStringFun(strBegin(str) + strLen - pfLen, pfLen,
+ strBegin(postfix), pfLen) == 0;
}
+template <class S, class T, class Function> inline
+bool strEqual(const S& lhs, const T& rhs, Function cmpStringFun)
+{
+ return cmpStringFun(strBegin(lhs), strLength(lhs), strBegin(rhs), strLength(rhs)) == 0;
+}
+
+
+template <class S, class T> inline bool startsWith(const S& str, const T& prefix ) { return startsWith(str, prefix, CmpBinary()); }
+template <class S, class T> inline bool endsWith (const S& str, const T& postfix) { return endsWith (str, postfix, CmpBinary()); }
+template <class S, class T> inline bool strEqual (const S& lhs, const T& rhs ) { return strEqual (lhs, rhs, CmpBinary()); }
+
+
template <class S, class T> inline
bool contains(const S& str, const T& term)
{
@@ -173,6 +209,7 @@ S afterLast(const S& str, const T& term, FailureReturnVal rv)
{
static_assert(IsSameType<typename GetCharType<S>::Type, typename GetCharType<T>::Type>::value, "");
const size_t termLen = strLength(term);
+ assert(termLen > 0);
const auto* const strFirst = strBegin(str);
const auto* const strLast = strFirst + strLength(str);
@@ -192,12 +229,15 @@ template <class S, class T> inline
S beforeLast(const S& str, const T& term, FailureReturnVal rv)
{
static_assert(IsSameType<typename GetCharType<S>::Type, typename GetCharType<T>::Type>::value, "");
+ const size_t termLen = strLength(term);
+ assert(termLen > 0);
+
const auto* const strFirst = strBegin(str);
const auto* const strLast = strFirst + strLength(str);
const auto* const termFirst = strBegin(term);
const auto* it = search_last(strFirst, strLast,
- termFirst, termFirst + strLength(term));
+ termFirst, termFirst + termLen);
if (it == strLast)
return rv == IF_MISSING_RETURN_ALL ? str : S();
@@ -210,6 +250,8 @@ S afterFirst(const S& str, const T& term, FailureReturnVal rv)
{
static_assert(IsSameType<typename GetCharType<S>::Type, typename GetCharType<T>::Type>::value, "");
const size_t termLen = strLength(term);
+ assert(termLen > 0);
+
const auto* const strFirst = strBegin(str);
const auto* const strLast = strFirst + strLength(str);
const auto* const termFirst = strBegin(term);
@@ -228,12 +270,15 @@ template <class S, class T> inline
S beforeFirst(const S& str, const T& term, FailureReturnVal rv)
{
static_assert(IsSameType<typename GetCharType<S>::Type, typename GetCharType<T>::Type>::value, "");
+ const size_t termLen = strLength(term);
+ assert(termLen > 0);
+
const auto* const strFirst = strBegin(str);
const auto* const strLast = strFirst + strLength(str);
const auto* const termFirst = strBegin(term);
auto it = std::search(strFirst, strLast,
- termFirst, termFirst + strLength(term));
+ termFirst, termFirst + termLen);
if (it == strLast)
return rv == IF_MISSING_RETURN_ALL ? str : S();
@@ -242,34 +287,35 @@ S beforeFirst(const S& str, const T& term, FailureReturnVal rv)
template <class S, class T> inline
-std::vector<S> split(const S& str, const T& delimiter)
+std::vector<S> split(const S& str, const T& delimiter, SplitType st)
{
static_assert(IsSameType<typename GetCharType<S>::Type, typename GetCharType<T>::Type>::value, "");
-
const size_t delimLen = strLength(delimiter);
-
+ assert(delimLen > 0);
if (delimLen == 0)
- return { str };
- else
{
- const auto* const delimFirst = strBegin(delimiter);
- const auto* const delimLast = delimFirst + delimLen;
+ if (str.empty() && st == SplitType::SKIP_EMPTY)
+ return {};
+ return { str };
+ }
- const auto* blockStart = strBegin(str);
- const auto* const strLast = blockStart + strLength(str);
+ const auto* const delimFirst = strBegin(delimiter);
+ const auto* const delimLast = delimFirst + delimLen;
- std::vector<S> output;
-
- for (;;)
- {
- const auto* const blockEnd = std::search(blockStart, strLast,
- delimFirst, delimLast);
+ const auto* blockStart = strBegin(str);
+ const auto* const strLast = blockStart + strLength(str);
+ std::vector<S> output;
+ for (;;)
+ {
+ const auto* const blockEnd = std::search(blockStart, strLast,
+ delimFirst, delimLast);
+ if (blockStart != blockEnd || st == SplitType::ALLOW_EMPTY)
output.emplace_back(blockStart, blockEnd - blockStart);
- if (blockEnd == strLast) //clients expect: if delimiter not found, return str
- return output;
- blockStart = blockEnd + delimLen;
- }
+
+ if (blockEnd == strLast)
+ return output;
+ blockStart = blockEnd + delimLen;
}
}
@@ -389,33 +435,47 @@ struct CopyStringToString<T, T> //perf: we don't need a deep copy if string type
template <class S>
T copy(S&& str) const { return std::forward<S>(str); }
};
+
+inline int strcmpWithNulls(const char* ptr1, const char* ptr2, size_t num) { return std::memcmp (ptr1, ptr2, num); }
+inline int strcmpWithNulls(const wchar_t* ptr1, const wchar_t* ptr2, size_t num) { return std::wmemcmp(ptr1, ptr2, num); }
}
template <class T, class S> inline
T copyStringTo(S&& str) { return impl::CopyStringToString<std::decay_t<S>, T>().copy(std::forward<S>(str)); }
-template <class S, class T> inline
-int cmpString(const S& lhs, const T& rhs)
+template <class Char> inline
+int CmpBinary::operator()(const Char* lhs, size_t lhsLen, const Char* rhs, size_t rhsLen) const
{
- static_assert(IsSameType<typename GetCharType<S>::Type, typename GetCharType<T>::Type>::value, "");
+ //support embedded 0, unlike strncmp/wcsncmp!
+ const int rv = impl::strcmpWithNulls(lhs, rhs, std::min(lhsLen, rhsLen));
+ if (rv != 0)
+ return rv;
+ return static_cast<int>(lhsLen) - static_cast<int>(rhsLen);
+}
- const size_t lenL = strLength(lhs);
- const size_t lenR = strLength(rhs);
- const auto* strPosL = strBegin(lhs);
- const auto* strPosR = strBegin(rhs);
+template <class Char> inline
+int CmpAsciiNoCase::operator()(const Char* lhs, size_t lhsLen, const Char* rhs, size_t rhsLen) const
+{
+ auto asciiToLower = [](Char c) //ordering: lower-case chars have higher code points than uppper-case
+ {
+ if (static_cast<Char>('A') <= c && c <= static_cast<Char>('Z'))
+ return static_cast<Char>(c - static_cast<Char>('A') + static_cast<Char>('a'));
+ return c;
+ };
- const auto* const strPosLLast = strPosL + std::min(lenL, lenR);
+ const auto* const lhsLast = lhs + std::min(lhsLen, rhsLen);
- while (strPosL != strPosLLast)
+ while (lhs != lhsLast)
{
- const auto charL = static_cast<unsigned int>(*strPosL++); //unsigned char-comparison is the convention!
- const auto charR = static_cast<unsigned int>(*strPosR++);
+ const Char charL = asciiToLower(*lhs++);
+ const Char charR = asciiToLower(*rhs++);
if (charL != charR)
- return static_cast<int>(charL) - static_cast<int>(charR);
+ return static_cast<unsigned int>(charL) - static_cast<unsigned int>(charR); //unsigned char-comparison is the convention!
+ //unsigned underflow is well-defined!
}
- return static_cast<int>(lenL) - static_cast<int>(lenR);
+ return static_cast<int>(lhsLen) - static_cast<int>(rhsLen);
}
@@ -424,13 +484,13 @@ namespace impl
template <class Num> inline
int saferPrintf(char* buffer, size_t bufferSize, const char* format, const Num& number) //there is no such thing as a "safe" printf ;)
{
- return std::snprintf(buffer, bufferSize, format, number); //C99
+ return std::snprintf(buffer, bufferSize, format, number); //C99: returns number of chars written if successful, < 0 or >= bufferSize on failure
}
template <class Num> inline
int saferPrintf(wchar_t* buffer, size_t bufferSize, const wchar_t* format, const Num& number)
{
- return std::swprintf(buffer, bufferSize, format, number); //C99
+ return std::swprintf(buffer, bufferSize, format, number); //C99: returns number of chars written if successful, < 0 on failure (including buffer too small)
}
}
@@ -444,7 +504,7 @@ S printNumber(const T& format, const Num& number) //format a single number using
CharType buffer[BUFFER_SIZE]; //zero-initialize?
const int charsWritten = impl::saferPrintf(buffer, BUFFER_SIZE, strBegin(format), number);
- return charsWritten > 0 ? S(buffer, charsWritten) : S();
+ return 0 < charsWritten && charsWritten < BUFFER_SIZE ? S(buffer, charsWritten) : S();
}
@@ -607,12 +667,8 @@ Num extractInteger(const S& str, bool& hasMinusSign) //very fast conversion to i
number *= 10;
number += c - static_cast<CharType>('0');
}
- else
- {
- //rest of string should contain whitespace only, it's NOT a bug if there is something else!
- //assert(std::all_of(iter, last, &isWhiteSpace<CharType>)); -> this is NO assert situation
- break;
- }
+ else //rest of string should contain whitespace only, it's NOT a bug if there is something else!
+ break; //assert(std::all_of(iter, last, &isWhiteSpace<CharType>)); -> this is NO assert situation
}
return number;
}
diff --git a/zen/sys_error.h b/zen/sys_error.h
index a19409ab..f7c128ef 100755
--- a/zen/sys_error.h
+++ b/zen/sys_error.h
@@ -67,7 +67,7 @@ std::wstring formatSystemErrorRaw(ErrorCode ec) //return empty string on error
std::wstring errorMsg;
ZEN_ON_SCOPE_EXIT(errno = currentError);
- errorMsg = utfCvrtTo<std::wstring>(::strerror(ec));
+ errorMsg = utfTo<std::wstring>(::strerror(ec));
trim(errorMsg); //Windows messages seem to end with a blank...
return errorMsg;
diff --git a/zen/thread.h b/zen/thread.h
index a59f3807..ae4c347e 100755
--- a/zen/thread.h
+++ b/zen/thread.h
@@ -28,26 +28,26 @@ public:
template <class Function>
InterruptibleThread(Function&& f);
- bool joinable () const { return stdThread.joinable(); }
+ bool joinable () const { return stdThread_.joinable(); }
void interrupt();
- void join () { stdThread.join(); }
- void detach () { stdThread.detach(); }
+ void join () { stdThread_.join(); }
+ void detach () { stdThread_.detach(); }
template <class Rep, class Period>
bool tryJoinFor(const std::chrono::duration<Rep, Period>& relTime)
{
- if (threadCompleted.wait_for(relTime) == std::future_status::ready)
+ if (threadCompleted_.wait_for(relTime) == std::future_status::ready)
{
- stdThread.join(); //runs thread-local destructors => this better be fast!!!
+ stdThread_.join(); //runs thread-local destructors => this better be fast!!!
return true;
}
return false;
}
private:
- std::thread stdThread;
+ std::thread stdThread_;
std::shared_ptr<InterruptionStatus> intStatus_;
- std::future<void> threadCompleted;
+ std::future<void> threadCompleted_;
};
//context of worker thread:
@@ -376,9 +376,9 @@ template <class Function> inline
InterruptibleThread::InterruptibleThread(Function&& f) : intStatus_(std::make_shared<InterruptionStatus>())
{
std::promise<void> pFinished;
- threadCompleted = pFinished.get_future();
+ threadCompleted_ = pFinished.get_future();
- stdThread = std::thread([f = std::forward<Function>(f),
+ stdThread_ = std::thread([f = std::forward<Function>(f),
intStatus = this->intStatus_,
pFinished = std::move(pFinished)]() mutable
{
diff --git a/zen/utf.h b/zen/utf.h
index 41fdf58c..ab8fda50 100755
--- a/zen/utf.h
+++ b/zen/utf.h
@@ -10,40 +10,25 @@
#include <cstdint>
#include <iterator>
#include "string_tools.h" //copyStringTo
+#include "optional.h"
namespace zen
{
//convert all(!) char- and wchar_t-based "string-like" objects applying a UTF8 conversions (but only if necessary!)
template <class TargetString, class SourceString>
-TargetString utfCvrtTo(const SourceString& str);
+TargetString utfTo(const SourceString& str);
const char BYTE_ORDER_MARK_UTF8[] = "\xEF\xBB\xBF";
-template <class CharString>
-bool isValidUtf8(const CharString& str); //check for UTF-8 encoding errors
-
-//---- explicit conversion: wide <-> utf8 ----
-template <class CharString, class WideString>
-CharString wideToUtf8(const WideString& str); //example: std::string tmp = wideToUtf8<std::string>(L"abc");
-
-template <class WideString, class CharString>
-WideString utf8ToWide(const CharString& str); //std::wstring tmp = utf8ToWide<std::wstring>("abc");
+template <class UtfString>
+bool isValidUtf(const UtfString& str); //check for UTF-8 encoding errors
//access unicode characters in UTF-encoded string (char- or wchar_t-based)
template <class UtfString>
size_t unicodeLength(const UtfString& str); //return number of code points for UTF-encoded string
template <class UtfString>
-size_t findUnicodePos(const UtfString& str, size_t unicodePos); //return position of unicode char in UTF-encoded string
-
-
-
-
-
-
-
-
-
+UtfString getUnicodeSubstring(const UtfString& str, size_t uniPosFirst, size_t uniPosLast);
@@ -58,7 +43,7 @@ namespace implementation
{
using CodePoint = uint32_t;
using Char16 = uint16_t;
-using Char8 = unsigned char;
+using Char8 = uint8_t;
const CodePoint LEAD_SURROGATE = 0xd800;
const CodePoint TRAIL_SURROGATE = 0xdc00; //== LEAD_SURROGATE_MAX + 1
@@ -72,7 +57,6 @@ template <class Function> inline
void codePointToUtf16(CodePoint cp, Function writeOutput) //"writeOutput" is a unary function taking a Char16
{
//http://en.wikipedia.org/wiki/UTF-16
-
if (cp < LEAD_SURROGATE)
writeOutput(static_cast<Char16>(cp));
else if (cp <= TRAIL_SURROGATE_MAX) //invalid code point
@@ -82,8 +66,8 @@ void codePointToUtf16(CodePoint cp, Function writeOutput) //"writeOutput" is a u
else if (cp <= CODE_POINT_MAX)
{
cp -= 0x10000;
- writeOutput(LEAD_SURROGATE + static_cast<Char16>(cp >> 10));
- writeOutput(TRAIL_SURROGATE + static_cast<Char16>(cp & 0x3ff));
+ writeOutput(static_cast<Char16>( LEAD_SURROGATE + (cp >> 10)));
+ writeOutput(static_cast<Char16>(TRAIL_SURROGATE + (cp & 0x3ff)));
}
else //invalid code point
codePointToUtf16(REPLACEMENT_CHAR, writeOutput); //resolves to 1-character utf16
@@ -104,15 +88,19 @@ size_t getUtf16Len(Char16 ch) //ch must be first code unit! returns 0 on error!
}
-template <class CharIterator, class Function> inline
-void utf16ToCodePoint(CharIterator first, CharIterator last, Function writeOutput) //"writeOutput" is a unary function taking a CodePoint
+class Utf16Decoder
{
- static_assert(sizeof(typename std::iterator_traits<CharIterator>::value_type) == 2, "");
+public:
+ Utf16Decoder(const Char16* str, size_t len) : it_(str), last_(str + len) {}
- for ( ; first != last; ++first)
+ Opt<CodePoint> getNext()
{
- CodePoint cp = static_cast<Char16>(*first);
- switch (getUtf16Len(static_cast<Char16>(cp)))
+ if (it_ == last_)
+ return NoValue();
+
+ const Char16 ch = *it_++;
+ CodePoint cp = ch;
+ switch (getUtf16Len(ch))
{
case 0: //invalid utf16 character
cp = REPLACEMENT_CHAR;
@@ -120,23 +108,33 @@ void utf16ToCodePoint(CharIterator first, CharIterator last, Function writeOutpu
case 1:
break;
case 2:
- if (++first != last) //trail surrogate expected!
- {
- const Char16 ch = static_cast<Char16>(*first);
- if (TRAIL_SURROGATE <= ch && ch <= TRAIL_SURROGATE_MAX) //trail surrogate expected!
- {
- cp = ((cp - LEAD_SURROGATE) << 10) + (ch - TRAIL_SURROGATE) + 0x10000;
- break;
- }
- }
- --first;
- cp = REPLACEMENT_CHAR;
+ decodeTrail(cp);
break;
}
- writeOutput(cp);
+ return cp;
+ }
+
+private:
+ void decodeTrail(CodePoint& cp)
+ {
+ if (it_ != last_) //trail surrogate expected!
+ {
+ const Char16 ch = *it_;
+ if (TRAIL_SURROGATE <= ch && ch <= TRAIL_SURROGATE_MAX) //trail surrogate expected!
+ {
+ cp = ((cp - LEAD_SURROGATE) << 10) + (ch - TRAIL_SURROGATE) + 0x10000;
+ ++it_;
+ return;
+ }
+ }
+ cp = REPLACEMENT_CHAR;
}
-}
+ const Char16* it_;
+ const Char16* const last_;
+};
+
+//----------------------------------------------------------------------------------------------------------------
template <class Function> inline
void codePointToUtf8(CodePoint cp, Function writeOutput) //"writeOutput" is a unary function taking a Char8
@@ -155,14 +153,14 @@ void codePointToUtf8(CodePoint cp, Function writeOutput) //"writeOutput" is a un
{
writeOutput(static_cast<Char8>( (cp >> 12 ) | 0xe0));
writeOutput(static_cast<Char8>(((cp >> 6) & 0x3f) | 0x80));
- writeOutput(static_cast<Char8>( (cp & 0x3f ) | 0x80));
+ writeOutput(static_cast<Char8>( (cp & 0x3f) | 0x80));
}
else if (cp <= CODE_POINT_MAX)
{
writeOutput(static_cast<Char8>( (cp >> 18 ) | 0xf0));
writeOutput(static_cast<Char8>(((cp >> 12) & 0x3f) | 0x80));
writeOutput(static_cast<Char8>(((cp >> 6) & 0x3f) | 0x80));
- writeOutput(static_cast<Char8>( (cp & 0x3f ) | 0x80));
+ writeOutput(static_cast<Char8>( (cp & 0x3f) | 0x80));
}
else //invalid code point
codePointToUtf8(REPLACEMENT_CHAR, writeOutput); //resolves to 3-byte utf8
@@ -170,7 +168,7 @@ void codePointToUtf8(CodePoint cp, Function writeOutput) //"writeOutput" is a un
inline
-size_t getUtf8Len(unsigned char ch) //ch must be first code unit! returns 0 on error!
+size_t getUtf8Len(Char8 ch) //ch must be first code unit! returns 0 on error!
{
if (ch < 0x80)
return 1;
@@ -184,32 +182,19 @@ size_t getUtf8Len(unsigned char ch) //ch must be first code unit! returns 0 on e
}
-template <class CharIterator> inline
-bool decodeTrail(CharIterator& first, CharIterator last, CodePoint& cp) //decode trailing surrogate byte
-{
- if (++first != last) //trail surrogate expected!
- {
- const Char8 ch = static_cast<Char8>(*first);
- if (ch >> 6 == 0x2) //trail surrogate expected!
- {
- cp = (cp << 6) + (ch & 0x3f);
- return true;
- }
- }
- --first;
- cp = REPLACEMENT_CHAR;
- return false;
-}
-
-template <class CharIterator, class Function> inline
-void utf8ToCodePoint(CharIterator first, CharIterator last, Function writeOutput) //"writeOutput" is a unary function taking a CodePoint
+class Utf8Decoder
{
- static_assert(sizeof(typename std::iterator_traits<CharIterator>::value_type) == 1, "");
+public:
+ Utf8Decoder(const Char8* str, size_t len) : it_(str), last_(str + len) {}
- for ( ; first != last; ++first)
+ Opt<CodePoint> getNext()
{
- CodePoint cp = static_cast<Char8>(*first);
- switch (getUtf8Len(static_cast<Char8>(cp)))
+ if (it_ == last_)
+ return NoValue();
+
+ const Char8 ch = *it_++;
+ CodePoint cp = ch;
+ switch (getUtf8Len(ch))
{
case 0: //invalid utf8 character
cp = REPLACEMENT_CHAR;
@@ -218,258 +203,184 @@ void utf8ToCodePoint(CharIterator first, CharIterator last, Function writeOutput
break;
case 2:
cp &= 0x1f;
- decodeTrail(first, last, cp);
+ decodeTrail(cp);
break;
case 3:
cp &= 0xf;
- if (decodeTrail(first, last, cp))
- decodeTrail(first, last, cp);
+ if (decodeTrail(cp))
+ decodeTrail(cp);
break;
case 4:
cp &= 0x7;
- if (decodeTrail(first, last, cp))
- if (decodeTrail(first, last, cp))
- decodeTrail(first, last, cp);
+ if (decodeTrail(cp))
+ if (decodeTrail(cp))
+ decodeTrail(cp);
if (cp > CODE_POINT_MAX) cp = REPLACEMENT_CHAR;
break;
}
- writeOutput(cp);
+ return cp;
}
-}
-
-
-template <class CharString> inline
-size_t unicodeLength(const CharString& str, char) //utf8
-{
- using CharType = typename GetCharType<CharString>::Type;
- const CharType* strFirst = strBegin(str);
- const CharType* const strLast = strFirst + strLength(str);
-
- size_t len = 0;
- while (strFirst < strLast) //[!]
+private:
+ bool decodeTrail(CodePoint& cp)
{
- ++len;
- size_t utf8len = getUtf8Len(*strFirst);
- if (utf8len == 0) ++utf8len; //invalid utf8 character
- strFirst += utf8len;
+ if (it_ != last_) //continuation byte expected!
+ {
+ const Char8 ch = *it_;
+ if (ch >> 6 == 0x2) //continuation byte (bit pattern 10xxxxxx) expected!
+ {
+ cp = (cp << 6) + (ch & 0x3f);
+ ++it_;
+ return true;
+ }
+ }
+ cp = REPLACEMENT_CHAR;
+ return false;
}
- return len;
-}
+ const Char8* it_;
+ const Char8* const last_;
+};
-template <class WideString> inline
-size_t unicodeLengthWide(const WideString& str, Int2Type<2>) //windows: utf16-wchar_t
-{
- using CharType = typename GetCharType<WideString>::Type;
+//----------------------------------------------------------------------------------------------------------------
- const CharType* strFirst = strBegin(str);
- const CharType* const strLast = strFirst + strLength(str);
+template <class Function> inline void codePointToUtf(CodePoint cp, Function writeOutput, Int2Type<1>) { codePointToUtf8 (cp, writeOutput); } //UTF8-char
+template <class Function> inline void codePointToUtf(CodePoint cp, Function writeOutput, Int2Type<2>) { codePointToUtf16(cp, writeOutput); } //Windows: UTF16-wchar_t
+template <class Function> inline void codePointToUtf(CodePoint cp, Function writeOutput, Int2Type<4>) { writeOutput(cp); } //other OS: UTF32-wchar_t
- size_t len = 0;
- while (strFirst < strLast) //[!]
- {
- ++len;
- size_t utf16len = getUtf16Len(*strFirst);
- if (utf16len == 0) ++utf16len; //invalid utf16 character
- strFirst += utf16len;
- }
- return len;
-}
-
-
-template <class WideString> inline
-size_t unicodeLengthWide(const WideString& str, Int2Type<4>) //other OS: utf32-wchar_t
+template <class CharType, class Function> inline
+void codePointToUtf(CodePoint cp, Function writeOutput) //"writeOutput" is a unary function taking a CharType
{
- return strLength(str);
+ return codePointToUtf(cp, writeOutput, Int2Type<sizeof(CharType)>());
}
+//----------------------------------------------------------------------------------------------------------------
-template <class WideString> inline
-size_t unicodeLength(const WideString& str, wchar_t)
-{
- return unicodeLengthWide(str, Int2Type<sizeof(wchar_t)>());
-}
-}
+template <class CharType, int charSize>
+class UtfDecoderImpl;
-template <class UtfString> inline
-size_t unicodeLength(const UtfString& str) //return number of code points
+template <class CharType>
+class UtfDecoderImpl<CharType, 1> //UTF8-char
{
- return implementation::unicodeLength(str, typename GetCharType<UtfString>::Type());
-}
+public:
+ UtfDecoderImpl(const CharType* str, size_t len) : decoder_(reinterpret_cast<const Char8*>(str), len) {}
+ Opt<CodePoint> getNext() { return decoder_.getNext(); }
+private:
+ Utf8Decoder decoder_;
+};
-namespace implementation
-{
-template <class CharString> inline
-size_t findUnicodePos(const CharString& str, size_t unicodePos, char) //utf8-char
+template <class CharType>
+class UtfDecoderImpl<CharType, 2> //Windows: UTF16-wchar_t
{
- using CharType = typename GetCharType<CharString>::Type;
-
- const CharType* strFirst = strBegin(str);
- const size_t strLen = strLength(str);
-
- size_t utfPos = 0;
- while (unicodePos-- > 0)
- {
- if (utfPos >= strLen)
- return strLen;
+public:
+ UtfDecoderImpl(const CharType* str, size_t len) : decoder_(reinterpret_cast<const Char16*>(str), len) {}
+ Opt<CodePoint> getNext() { return decoder_.getNext(); }
+private:
+ Utf16Decoder decoder_;
+};
- size_t utf8len = getUtf8Len(strFirst[utfPos]);
- if (utf8len == 0) ++utf8len; //invalid utf8 character
- utfPos += utf8len;
- }
- if (utfPos >= strLen)
- return strLen;
- return utfPos;
-}
-
-template <class WideString> inline
-size_t findUnicodePosWide(const WideString& str, size_t unicodePos, Int2Type<2>) //windows: utf16-wchar_t
+template <class CharType>
+class UtfDecoderImpl<CharType, 4> //other OS: UTF32-wchar_t
{
- using CharType = typename GetCharType<WideString>::Type;
-
- const CharType* strFirst = strBegin(str);
- const size_t strLen = strLength(str);
-
- size_t utfPos = 0;
- while (unicodePos-- > 0)
+public:
+ UtfDecoderImpl(const CharType* str, size_t len) : it_(reinterpret_cast<const CodePoint*>(str)), last_(it_ + len) {}
+ Opt<CodePoint> getNext()
{
- if (utfPos >= strLen)
- return strLen;
-
- size_t utf16len = getUtf16Len(strFirst[utfPos]);
- if (utf16len == 0) ++utf16len; //invalid utf16 character
- utfPos += utf16len;
+ if (it_ == last_)
+ return NoValue();
+ return *it_++;
}
- if (utfPos >= strLen)
- return strLen;
- return utfPos;
-}
+private:
+ const CodePoint* it_;
+ const CodePoint* last_;
+};
-template <class WideString> inline
-size_t findUnicodePosWide(const WideString& str, size_t unicodePos, Int2Type<4>) //other OS: utf32-wchar_t
-{
- return std::min(strLength(str), unicodePos);
-}
-
-
-template <class UtfString> inline
-size_t findUnicodePos(const UtfString& str, size_t unicodePos, wchar_t)
-{
- return findUnicodePosWide(str, unicodePos, Int2Type<sizeof(wchar_t)>());
-}
-}
-
-
-template <class UtfString> inline
-size_t findUnicodePos(const UtfString& str, size_t unicodePos) //return position of unicode char in UTF-encoded string
-{
- return implementation::findUnicodePos(str, unicodePos, typename GetCharType<UtfString>::Type());
+template <class CharType>
+using UtfDecoder = UtfDecoderImpl<CharType, sizeof(CharType)>;
}
//-------------------------------------------------------------------------------------------
-namespace implementation
-{
-template <class WideString, class CharString> inline
-WideString utf8ToWide(const CharString& str, Int2Type<2>) //windows: convert utf8 to utf16-wchar_t
+template <class UtfString> inline
+bool isValidUtf(const UtfString& str)
{
- WideString output;
- utf8ToCodePoint(strBegin(str), strBegin(str) + strLength(str),
- [&](CodePoint cp) { codePointToUtf16(cp, [&](Char16 c) { output += static_cast<wchar_t>(c); }); });
- return output;
-}
+ using namespace implementation;
+ UtfDecoder<typename GetCharType<UtfString>::Type> decoder(strBegin(str), strLength(str));
+ while (Opt<CodePoint> cp = decoder.getNext())
+ if (*cp == REPLACEMENT_CHAR)
+ return false;
-template <class WideString, class CharString> inline
-WideString utf8ToWide(const CharString& str, Int2Type<4>) //other OS: convert utf8 to utf32-wchar_t
-{
- WideString output;
- utf8ToCodePoint(strBegin(str), strBegin(str) + strLength(str),
- [&](CodePoint cp) { output += static_cast<wchar_t>(cp); });
- return output;
+ return true;
}
-template <class CharString, class WideString> inline
-CharString wideToUtf8(const WideString& str, Int2Type<2>) //windows: convert utf16-wchar_t to utf8
+template <class UtfString> inline
+size_t unicodeLength(const UtfString& str) //return number of code points (+ correctly handle broken UTF encoding)
{
- CharString output;
- utf16ToCodePoint(strBegin(str), strBegin(str) + strLength(str),
- [&](CodePoint cp) { codePointToUtf8(cp, [&](char c) { output += c; }); });
- return output;
+ size_t uniLen = 0;
+ implementation::UtfDecoder<typename GetCharType<UtfString>::Type> decoder(strBegin(str), strLength(str));
+ while (decoder.getNext())
+ ++uniLen;
+ return uniLen;
}
-template <class CharString, class WideString> inline
-CharString wideToUtf8(const WideString& str, Int2Type<4>) //other OS: convert utf32-wchar_t to utf8
+template <class UtfString> inline
+UtfString getUnicodeSubstring(const UtfString& str, size_t uniPosFirst, size_t uniPosLast) //return the code points in range [uniPosFirst, uniPosLast), re-encoded as UtfString
{
- CharString output;
- std::for_each(strBegin(str), strBegin(str) + strLength(str),
- [&](CodePoint cp) { codePointToUtf8(cp, [&](char c) { output += c; }); });
+ assert(uniPosFirst <= uniPosLast && uniPosLast <= unicodeLength(str));
+ using namespace implementation;
+ using CharType = typename GetCharType<UtfString>::Type;
+ UtfString output;
+ if (uniPosFirst >= uniPosLast) //optimize for empty range
+ return output;
+
+ UtfDecoder<CharType> decoder(strBegin(str), strLength(str));
+ for (size_t uniPos = 0; Opt<CodePoint> cp = decoder.getNext(); ++uniPos) //[!] declaration in condition part of the for-loop
+ if (uniPosFirst <= uniPos)
+ {
+ if (uniPos >= uniPosLast)
+ break;
+ codePointToUtf<CharType>(*cp, [&](CharType c) { output += c; });
+ }
return output;
}
-}
+//-------------------------------------------------------------------------------------------
-template <class CharString> inline
-bool isValidUtf8(const CharString& str)
+namespace implementation
{
- using namespace implementation;
- bool valid = true;
- utf8ToCodePoint(strBegin(str), strBegin(str) + strLength(str),
- [&](CodePoint cp)
- {
- if (cp == REPLACEMENT_CHAR)
- valid = false; //perf: should we use an (expensive) exception for iteration break?
- });
- return valid;
-}
-
-
-template <class WideString, class CharString> inline
-WideString utf8ToWide(const CharString& str)
+template <class TargetString, class SourceString> inline
+TargetString utfTo(const SourceString& str, FalseType)
{
- static_assert(IsSameType<typename GetCharType<CharString>::Type, char >::value, "");
- static_assert(IsSameType<typename GetCharType<WideString>::Type, wchar_t>::value, "");
+ using CharSrc = typename GetCharType<SourceString>::Type;
+ using CharTrg = typename GetCharType<TargetString>::Type;
+ static_assert(sizeof(CharSrc) != sizeof(CharTrg), "no UTF-conversion needed");
- return implementation::utf8ToWide<WideString>(str, Int2Type<sizeof(wchar_t)>());
-}
+ TargetString output;
+ UtfDecoder<CharSrc> decoder(strBegin(str), strLength(str));
+ while (Opt<CodePoint> cp = decoder.getNext())
+ codePointToUtf<CharTrg>(*cp, [&](CharTrg c) { output += c; });
-template <class CharString, class WideString> inline
-CharString wideToUtf8(const WideString& str)
-{
- static_assert(IsSameType<typename GetCharType<CharString>::Type, char >::value, "");
- static_assert(IsSameType<typename GetCharType<WideString>::Type, wchar_t>::value, "");
-
- return implementation::wideToUtf8<CharString>(str, Int2Type<sizeof(wchar_t)>());
+ return output;
}
-//-------------------------------------------------------------------------------------------
template <class TargetString, class SourceString> inline
-TargetString utfCvrtTo(const SourceString& str, char, wchar_t) { return utf8ToWide<TargetString>(str); }
-
-template <class TargetString, class SourceString> inline
-TargetString utfCvrtTo(const SourceString& str, wchar_t, char) { return wideToUtf8<TargetString>(str); }
-
-template <class TargetString, class SourceString> inline
-TargetString utfCvrtTo(const SourceString& str, char, char) { return copyStringTo<TargetString>(str); }
+TargetString utfTo(const SourceString& str, TrueType) { return copyStringTo<TargetString>(str); }
+}
-template <class TargetString, class SourceString> inline
-TargetString utfCvrtTo(const SourceString& str, wchar_t, wchar_t) { return copyStringTo<TargetString>(str); }
template <class TargetString, class SourceString> inline
-TargetString utfCvrtTo(const SourceString& str)
+TargetString utfTo(const SourceString& str)
{
- return utfCvrtTo<TargetString>(str,
- typename GetCharType<SourceString>::Type(),
- typename GetCharType<TargetString>::Type());
+ return implementation::utfTo<TargetString>(str, StaticBool<sizeof(typename GetCharType<SourceString>::Type) == sizeof(typename GetCharType<TargetString>::Type)>());
}
}
diff --git a/zen/zstring.cpp b/zen/zstring.cpp
index 5f5b1ec8..a936efb5 100755
--- a/zen/zstring.cpp
+++ b/zen/zstring.cpp
@@ -6,11 +6,14 @@
#include "zstring.h"
#include <stdexcept>
+#include "utf.h"
using namespace zen;
/*
+MSDN "Handling Sorting in Your Applications": https://msdn.microsoft.com/en-us/library/windows/desktop/dd318144
+
Perf test: compare strings 10 mio times; 64 bit build
-----------------------------------------------------
string a = "Fjk84$%kgfj$%T\\\\Gffg\\gsdgf\\fgsx----------d-"
@@ -32,3 +35,117 @@ time per call | function
*/
+
+
+namespace
+{
+int compareNoCaseUtf8(const char* lhs, size_t lhsLen, const char* rhs, size_t rhsLen)
+{
+ //- strncasecmp implements ASCII CI-comparison only! => signature is broken for UTF8-input; toupper() similarly doesn't support Unicode
+ //- wcsncasecmp: https://opensource.apple.com/source/Libc/Libc-763.12/string/wcsncasecmp-fbsd.c
+ // => re-implement comparison based on towlower() to avoid memory allocations
+ using namespace zen::implementation;
+
+ UtfDecoder<char> decL(lhs, lhsLen);
+ UtfDecoder<char> decR(rhs, rhsLen);
+ for (;;)
+ {
+ const Opt<CodePoint> cpL = decL.getNext();
+ const Opt<CodePoint> cpR = decR.getNext();
+ if (!cpL || !cpR)
+ return static_cast<int>(!cpR) - static_cast<int>(!cpL);
+
+ static_assert(sizeof(wchar_t) == sizeof(CodePoint), "");
+ const wchar_t charL = ::towlower(static_cast<wchar_t>(*cpL)); //ordering: towlower() converts to higher code points than towupper()
+ const wchar_t charR = ::towlower(static_cast<wchar_t>(*cpR)); //uses LC_CTYPE category of current locale
+ if (charL != charR)
+ return static_cast<unsigned int>(charL) - static_cast<unsigned int>(charR); //unsigned char-comparison is the convention!
+ //unsigned underflow is well-defined!
+ }
+}
+}
+
+
+int cmpStringNaturalLinux(const char* lhs, size_t lhsLen, const char* rhs, size_t rhsLen)
+{
+ const char* const lhsEnd = lhs + lhsLen;
+ const char* const rhsEnd = rhs + rhsLen;
+ /*
+ - compare strings after conceptually creating blocks of whitespace/numbers/text
+ - implement strict weak ordering!
+ - don't follow broken "strnatcasecmp": https://github.com/php/php-src/blob/master/ext/standard/strnatcmp.c
+ 1. incorrect non-ASCII CI-comparison 2. incorrect bounds checks
+ 3. incorrect trimming of *all* whitespace 4. arbitrary handling of leading 0 only at string begin
+ 5. incorrect handling of whitespace following a number 6. code is a mess
+ */
+ for (;;)
+ {
+ if (lhs == lhsEnd || rhs == rhsEnd)
+ return static_cast<int>(lhs != lhsEnd) - static_cast<int>(rhs != rhsEnd); //"nothing" before "something"
+ //note: "something" never would have been condensed to "nothing" further below => can finish evaluation here
+
+ const bool wsL = isWhiteSpace(*lhs);
+ const bool wsR = isWhiteSpace(*rhs);
+ if (wsL != wsR)
+ return static_cast<int>(!wsL) - static_cast<int>(!wsR); //whitespace before non-ws!
+ if (wsL)
+ {
+ ++lhs, ++rhs;
+ while (lhs != lhsEnd && isWhiteSpace(*lhs)) ++lhs;
+ while (rhs != rhsEnd && isWhiteSpace(*rhs)) ++rhs;
+ continue;
+ }
+
+ const bool digitL = isDigit(*lhs);
+ const bool digitR = isDigit(*rhs);
+ if (digitL != digitR)
+ return static_cast<int>(!digitL) - static_cast<int>(!digitR); //number before chars!
+ if (digitL)
+ {
+ while (lhs != lhsEnd && *lhs == '0') ++lhs;
+ while (rhs != rhsEnd && *rhs == '0') ++rhs;
+
+ int rv = 0;
+ for (;; ++lhs, ++rhs)
+ {
+ const bool endL = lhs == lhsEnd || !isDigit(*lhs);
+ const bool endR = rhs == rhsEnd || !isDigit(*rhs);
+ if (endL != endR)
+ return static_cast<int>(!endL) - static_cast<int>(!endR); //more digits means bigger number
+ if (endL)
+ break; //same number of digits
+
+ if (rv == 0 && *lhs != *rhs)
+ rv = *lhs - *rhs; //found first digit difference comparing from left
+ }
+ if (rv != 0)
+ return rv;
+ continue;
+ }
+
+ //compare full chunks of text: consider unicode encoding!
+ const char* textBeginL = lhs++;
+ const char* textBeginR = rhs++; //current char is neither white space nor digit at this point!
+ while (lhs != lhsEnd && !isWhiteSpace(*lhs) && !isDigit(*lhs)) ++lhs;
+ while (rhs != rhsEnd && !isWhiteSpace(*rhs) && !isDigit(*rhs)) ++rhs;
+
+ const int rv = compareNoCaseUtf8(textBeginL, lhs - textBeginL, textBeginR, rhs - textBeginR);
+ if (rv != 0)
+ return rv;
+ }
+}
+
+
+namespace
+{
+}
+
+
+int CmpNaturalSort::operator()(const Zchar* lhs, size_t lhsLen, const Zchar* rhs, size_t rhsLen) const
+{
+ //auto strL = utfTo<std::string>(Zstring(lhs, lhsLen));
+ //auto strR = utfTo<std::string>(Zstring(rhs, rhsLen));
+ //return cmpStringNaturalLinux(strL.c_str(), strL.size(), strR.c_str(), strR.size());
+ return cmpStringNaturalLinux(lhs, lhsLen, rhs, rhsLen);
+
+} \ No newline at end of file
diff --git a/zen/zstring.h b/zen/zstring.h
index 12bda29f..fdb71da0 100755
--- a/zen/zstring.h
+++ b/zen/zstring.h
@@ -19,35 +19,39 @@
using Zstring = zen::Zbase<Zchar>;
-int cmpStringNoCase(const wchar_t* lhs, size_t lhsLen, const wchar_t* rhs, size_t rhsLen);
- int cmpStringNoCase(const char* lhs, size_t lhsLen, const char* rhs, size_t rhsLen);
-
-template <class S>
-S makeUpperCopy(S str);
-
-
//Compare filepaths: Windows/OS X does NOT distinguish between upper/lower-case, while Linux DOES
-int cmpFilePath(const wchar_t* lhs, size_t lhsLen, const wchar_t* rhs, size_t rhsLen);
- int cmpFilePath(const char* lhs, size_t lhsLen, const char* rhs, size_t rhsLen);
+struct CmpFilePath
+{
+ int operator()(const Zchar* lhs, size_t lhsLen, const Zchar* rhs, size_t rhsLen) const;
+};
+struct CmpNaturalSort
+{
+ int operator()(const Zchar* lhs, size_t lhsLen, const Zchar* rhs, size_t rhsLen) const;
+};
-template <class S, class T> inline
-bool equalFilePath(const S& lhs, const T& rhs) { using namespace zen; return cmpFilePath(strBegin(lhs), strLength(lhs), strBegin(rhs), strLength(rhs)) == 0; }
struct LessFilePath
{
- template <class S, class T>
- bool operator()(const S& lhs, const T& rhs) const { using namespace zen; return cmpFilePath(strBegin(lhs), strLength(lhs), strBegin(rhs), strLength(rhs)) < 0; }
+ template <class S> //don't support heterogeneous input! => use as container predicate only!
+ bool operator()(const S& lhs, const S& rhs) const { using namespace zen; return CmpFilePath()(strBegin(lhs), strLength(lhs), strBegin(rhs), strLength(rhs)) < 0; }
};
-
-struct LessNoCase
+struct LessNaturalSort
{
- template <class S, class T>
- bool operator()(const S& lhs, const T& rhs) const { using namespace zen; return cmpStringNoCase(strBegin(lhs), strLength(lhs), strBegin(rhs), strLength(rhs)) < 0; }
+ template <class S> //don't support heterogeneous input! => use as container predicate only!
+ bool operator()(const S& lhs, const S& rhs) const { using namespace zen; return CmpNaturalSort()(strBegin(lhs), strLength(lhs), strBegin(rhs), strLength(rhs)) < 0; }
};
+template <class S>
+S makeUpperCopy(S str);
+
+
+template <class S, class T> inline
+bool equalFilePath(const S& lhs, const T& rhs) { using namespace zen; return strEqual(lhs, rhs, CmpFilePath()); }
+
+
inline
Zstring appendSeparator(Zstring path) //support rvalue references!
{
@@ -63,35 +67,6 @@ Zstring getFileExtension(const Zstring& filePath)
}
-template <class S, class T> inline
-bool ciEqual(const S& lhs, const T& rhs) { using namespace zen; return cmpStringNoCase(strBegin(lhs), strLength(lhs), strBegin(rhs), strLength(rhs)) == 0; }
-
-
-template <class S, class T> inline
-bool ciStartsWith(const S& str, const T& prefix)
-{
- using namespace zen;
- const size_t pfLen = strLength(prefix);
- if (strLength(str) < pfLen)
- return false;
-
- return cmpStringNoCase(strBegin(str), pfLen, strBegin(prefix), pfLen) == 0;
-}
-
-
-template <class S, class T> inline
-bool ciEndsWith(const S& str, const T& postfix)
-{
- using namespace zen;
- const size_t strLen = strLength(str);
- const size_t pfLen = strLength(postfix);
- if (strLen < pfLen)
- return false;
-
- return cmpStringNoCase(strBegin(str) + strLen - pfLen, pfLen, strBegin(postfix), pfLen) == 0;
-}
-
-
template <class S, class T, class U>
S ciReplaceCpy(const S& str, const T& oldTerm, const U& newTerm);
@@ -110,37 +85,11 @@ inline
void makeUpperInPlace(char* str, size_t strLen)
{
std::for_each(str, str + strLen, [](char& c) { c = std::toupper(static_cast<unsigned char>(c)); }); //locale-dependent!
- //result of toupper() is an unsigned char mapped to int range, so the char representation is in the last 8 bits and we need not care about signedness!
+ //result of toupper() is an unsigned char mapped to int range: the char representation is in the last 8 bits and we need not care about signedness!
//this should work for UTF-8, too: all chars >= 128 are mapped upon themselves!
}
-inline
-int cmpStringNoCase(const wchar_t* lhs, size_t lhsLen, const wchar_t* rhs, size_t rhsLen)
-{
- assert(std::find(lhs, lhs + lhsLen, 0) == lhs + lhsLen); //don't expect embedded nulls!
- assert(std::find(rhs, rhs + rhsLen, 0) == rhs + rhsLen); //
-
- const int rv = ::wcsncasecmp(lhs, rhs, std::min(lhsLen, rhsLen)); //locale-dependent!
- if (rv != 0)
- return rv;
- return static_cast<int>(lhsLen) - static_cast<int>(rhsLen);
-}
-
-
-inline
-int cmpStringNoCase(const char* lhs, size_t lhsLen, const char* rhs, size_t rhsLen)
-{
- assert(std::find(lhs, lhs + lhsLen, 0) == lhs + lhsLen); //don't expect embedded nulls!
- assert(std::find(rhs, rhs + rhsLen, 0) == rhs + rhsLen); //
-
- const int rv = ::strncasecmp(lhs, rhs, std::min(lhsLen, rhsLen)); //locale-dependent!
- if (rv != 0)
- return rv;
- return static_cast<int>(lhsLen) - static_cast<int>(rhsLen);
-}
-
-
template <class S> inline
S makeUpperCopy(S str)
{
@@ -153,20 +102,7 @@ S makeUpperCopy(S str)
inline
-int cmpFilePath(const wchar_t* lhs, size_t lhsLen, const wchar_t* rhs, size_t rhsLen)
-{
- assert(std::find(lhs, lhs + lhsLen, 0) == lhs + lhsLen); //don't expect embedded nulls!
- assert(std::find(rhs, rhs + rhsLen, 0) == rhs + rhsLen); //
-
- const int rv = std::wcsncmp(lhs, rhs, std::min(lhsLen, rhsLen));
- if (rv != 0)
- return rv;
- return static_cast<int>(lhsLen) - static_cast<int>(rhsLen);
-}
-
-
-inline
-int cmpFilePath(const char* lhs, size_t lhsLen, const char* rhs, size_t rhsLen)
+int CmpFilePath::operator()(const Zchar* lhs, size_t lhsLen, const Zchar* rhs, size_t rhsLen) const
{
assert(std::find(lhs, lhs + lhsLen, 0) == lhs + lhsLen); //don't expect embedded nulls!
assert(std::find(rhs, rhs + rhsLen, 0) == rhs + rhsLen); //
@@ -214,6 +150,7 @@ S ciReplaceCpy(const S& str, const T& oldTerm, const U& newTerm)
}
}
+ int cmpStringNaturalLinux(const char* lhs, size_t lhsLen, const char* rhs, size_t rhsLen);
//---------------------------------------------------------------------------
//ZEN macro consistency checks:
bgstack15