Fixes to locale/unicode handling

- Fixes crash on opening spellbook (recent regression) - Fix crash if `haystack` string is shorter than `needle` - Levenstein distance computation is now case-insensitive - Text similarity computation now assumes utf-8 strings, not ascii - Fix corruption of utf-8 string on life drain
2025-11-25 22:42:04 +02:00 · 2025-04-10 21:22:46 +03:00
parent 0b437e8b1d
commit dcb7a9e9f3
4 changed files with 113 additions and 80 deletions
--- a/lib/texts/TextOperations.cpp
+++ b/lib/texts/TextOperations.cpp
@@ -17,7 +17,7 @@

 #include <vstd/DateUtils.h>

-#include <boost/locale.hpp>
+#include <boost/locale/encoding.hpp>

 VCMI_LIB_NAMESPACE_BEGIN

@@ -97,7 +97,7 @@ bool TextOperations::isValidASCII(const char * data, size_t size)
 	return true;
 }

-bool TextOperations::isValidUnicodeString(const std::string & text)
+bool TextOperations::isValidUnicodeString(std::string_view text)
 {
 	for (size_t i=0; i<text.size(); i += getUnicodeCharacterSize(text[i]))
 	{
@@ -210,7 +210,7 @@ void TextOperations::trimRightUnicode(std::string & text, const size_t amount)
 	}
 }

-size_t TextOperations::getUnicodeCharactersCount(const std::string & text)
+size_t TextOperations::getUnicodeCharactersCount(std::string_view text)
 {
 	size_t charactersCount = 0;

@@ -253,10 +253,65 @@ std::string TextOperations::getCurrentFormattedDateTimeLocal(std::chrono::second
 	return TextOperations::getFormattedDateTimeLocal(std::chrono::system_clock::to_time_t(timepoint));
 }

+static const std::locale & getLocale()
+{
+	auto getLocale = []() -> std::locale
+	{
+		const std::string & baseLocaleName = Languages::getLanguageOptions(LIBRARY->generaltexth->getPreferredLanguage()).localeName;
+		const std::string fallbackLocale = Languages::getLanguageOptions(Languages::ELanguages::ENGLISH).localeName;
+
+		for (const auto & localeName : { baseLocaleName + ".UTF-8", baseLocaleName, fallbackLocale + ".UTF-8", fallbackLocale })
+		{
+			try
+			{
+				// Locale generation may fail (and throw an exception) in two cases:
+				// - if the corresponding locale is not installed on the system
+				// - on Android named locales are not supported at all and always throw an exception
+				return std::locale(localeName);
+			}
+			catch (const std::exception & e)
+			{
+				logGlobal->warn("Failed to set locale '%s'", localeName);
+			}
+		}
+		return std::locale();
+	};
+
+	static const std::locale locale = getLocale();
+	return locale;
+}
+
 int TextOperations::getLevenshteinDistance(std::string_view s, std::string_view t)
 {
-	int n = t.size();
-	int m = s.size();
+	assert(isValidUnicodeString(s));
+	assert(isValidUnicodeString(t));
+
+	auto charactersEqual = [&s, &t](int sPoint, int tPoint)
+	{
+		uint32_t sUTF32 = getUnicodeCodepoint(s.data() + sPoint, s.size() - sPoint);
+		uint32_t tUTF32 = getUnicodeCodepoint(t.data() + tPoint, t.size() - tPoint);
+
+		if (sUTF32 == tUTF32)
+			return true;
+
+		// Windows - wchar_t represents UTF-16 symbol that does not cover entire Unicode
+		// In UTF-16 such characters can only be represented as 2 wchar_t's, but toupper can only operate on single wchar
+		// Assume symbols are different if one of them cannot be represented as a single UTF-16 symbol
+		if constexpr (sizeof(wchar_t) == 2)
+		{
+			if (sUTF32 > 0xFFFF || (sUTF32 >= 0xD800 && sUTF32 <= 0xDFFF ))
+				return false;
+
+			if (tUTF32 > 0xFFFF || (tUTF32 >= 0xD800 && tUTF32 <= 0xDFFF ))
+				return false;
+		}
+
+		const auto & facet = std::use_facet<std::ctype<wchar_t>>(getLocale());
+		return facet.toupper(sUTF32) == facet.toupper(tUTF32);
+	};
+
+	int n = getUnicodeCharactersCount(t);
+	int m = getUnicodeCharactersCount(s);

 	// create two work vectors of integer distances
 	std::vector<int> v0(n+1, 0);
@@ -268,8 +323,9 @@ int TextOperations::getLevenshteinDistance(std::string_view s, std::string_view
 	for (int i = 0; i < n; ++i)
 		v0[i] = i;

-	for (int i = 0; i < m; ++i)
+	for (int i = 0, iPoint = 0; i < m; ++i, iPoint += getUnicodeCharacterSize(s[iPoint]))
 	{
+
 		// calculate v1 (current row distances) from the previous row v0

 		// first element of v1 is A[i + 1][0]
@@ -277,14 +333,14 @@ int TextOperations::getLevenshteinDistance(std::string_view s, std::string_view
 		v1[0] = i + 1;

 		// use formula to fill in the rest of the row
-		for (int j = 0; j < n; ++j)
+		for (int j = 0, jPoint = 0; j < n; ++j, jPoint += getUnicodeCharacterSize(t[jPoint]))
 		{
 			// calculating costs for A[i + 1][j + 1]
 			int deletionCost = v0[j + 1] + 1;
 			int insertionCost = v1[j] + 1;
 			int substitutionCost;

-			if (s[i] == t[j])
+			if (charactersEqual(iPoint, jPoint))
 				substitutionCost = v0[j];
 			else
 				substitutionCost = v0[j] + 1;
@@ -301,45 +357,17 @@ int TextOperations::getLevenshteinDistance(std::string_view s, std::string_view
 	return v0[n];
 }

-DLL_LINKAGE const std::locale & TextOperations::getLocale()
-{
-    static std::locale loc;
-
-    const std::string & localeName = Languages::getLanguageOptions(LIBRARY->generaltexth->getPreferredLanguage()).localeName;
-    try
-    {
-        loc = std::locale(localeName); // might fail on Android
-    }
-    catch (const std::exception & e)
-    {
-        logGlobal->warn("Failed to set locale '%s'. Falling back to 'en_US.UTF-8'", localeName);
-        try
-        {
-            loc = std::locale("en_US.UTF-8");
-        }
-        catch (...)
-        {
-            logGlobal->warn("Fallback locale 'en_US.UTF-8' failed. Using default 'C' locale.");
-            loc = std::locale::classic();
-        }
-    }
-    return loc;
-}
-
 DLL_LINKAGE bool TextOperations::compareLocalizedStrings(std::string_view str1, std::string_view str2)
 {
-    static const std::collate<char> & col = std::use_facet<std::collate<char>>(getLocale());
-	return col.compare(str1.data(), str1.data() + str1.size(),
-					   str2.data(), str2.data() + str2.size()) < 0;
+	const std::collate<char> & col = std::use_facet<std::collate<char>>(getLocale());
+	return col.compare(
+		str1.data(), str1.data() + str1.size(),
+		str2.data(), str2.data() + str2.size()
+	) < 0;
 }

-std::optional<int> TextOperations::textSearchSimilarityScore(const std::string & s, const std::string & t)
+std::optional<int> TextOperations::textSearchSimilarityScore(const std::string & needle, const std::string & haystack)
 {
-    static const std::collate<char> & col = std::use_facet<std::collate<char>>(getLocale());
-
-	auto haystack = col.transform(t.data(), t.data() + t.size());
-	auto needle = col.transform(s.data(), s.data() + s.size());
-
 	// 0 - Best possible match: text starts with the search string
 	if(haystack.rfind(needle, 0) == 0)
 		return 0;
@@ -349,13 +377,24 @@ std::optional<int> TextOperations::textSearchSimilarityScore(const std::string &
 		return 1;

 	// Dynamic threshold: Reject if too many typos based on word length
-	int maxAllowedDistance = std::max(2, static_cast<int>(needle.size() / 2));
+	int haystackCodepoints = getUnicodeCharactersCount(haystack);
+	int needleCodepoints = getUnicodeCharactersCount(needle);
+	int maxAllowedDistance = needleCodepoints / 2;

 	// Compute Levenshtein distance for fuzzy similarity
 	int minDist = std::numeric_limits<int>::max();
-	for(size_t i = 0; i <= haystack.size() - needle.size(); i++)
+
+	for(int i = 0; i <= haystackCodepoints - needleCodepoints; ++i)
 	{
-		int dist = getLevenshteinDistance(haystack.substr(i, needle.size()), needle);
+		int haystackBegin = 0;
+		for(int j = 0; j < i; ++j)
+			haystackBegin += getUnicodeCharacterSize(haystack[haystackBegin]);
+
+		int haystackEnd = haystackBegin;
+		for(int j = 0; j < needleCodepoints; ++j)
+			haystackEnd += getUnicodeCharacterSize(haystack[haystackEnd]);
+
+		int dist = getLevenshteinDistance(haystack.substr(haystackBegin, haystackEnd - haystackBegin), needle);
 		minDist = std::min(minDist, dist);
 	}