mirror of
https://github.com/vcmi/vcmi.git
synced 2025-08-10 22:31:40 +02:00
Merge pull request #5615 from IvanSavenko/locale_fix
Fixes to locale/unicode handling
This commit is contained in:
@@ -67,7 +67,7 @@ struct Options
|
||||
/// encoding that is used by H3 for this language
|
||||
std::string encoding;
|
||||
|
||||
/// proper locale name, e.g. "en_US.UTF-8"
|
||||
/// proper locale name, e.g. "en_US"
|
||||
std::string localeName;
|
||||
|
||||
/// primary IETF language tag
|
||||
@@ -90,29 +90,29 @@ inline const auto & getLanguageList()
|
||||
{
|
||||
static const std::array<Options, 23> languages
|
||||
{ {
|
||||
{ "belarusian", "Belarusian", "беларускі", "CP1251", "be_BY.UTF-8", "be", "bel", "%d.%m.%Y %H:%M", EPluralForms::UK_3, true },
|
||||
{ "bulgarian", "Bulgarian", "Български", "CP1251", "bg_BG.UTF-8", "bg", "bul", "%d.%m.%Y %H:%M", EPluralForms::EN_2, true },
|
||||
{ "czech", "Czech", "Čeština", "CP1250", "cs_CZ.UTF-8", "cs", "cze", "%d.%m.%Y %H:%M", EPluralForms::CZ_3, true },
|
||||
{ "chinese", "Chinese", "简体中文", "GBK", "zh_CN.UTF-8", "zh", "chi", "%Y-%m-%d %H:%M", EPluralForms::VI_1, true }, // Note: actually Simplified Chinese
|
||||
{ "english", "English", "English", "CP1252", "en_US.UTF-8", "en", "eng", "%Y-%m-%d %H:%M", EPluralForms::EN_2, true }, // English uses international date/time format here
|
||||
{ "finnish", "Finnish", "Suomi", "CP1252", "fi_FI.UTF-8", "fi", "fin", "%d.%m.%Y %H:%M", EPluralForms::EN_2, true },
|
||||
{ "french", "French", "Français", "CP1252", "fr_FR.UTF-8", "fr", "fre", "%d/%m/%Y %H:%M", EPluralForms::FR_2, true },
|
||||
{ "german", "German", "Deutsch", "CP1252", "de_DE.UTF-8", "de", "ger", "%d.%m.%Y %H:%M", EPluralForms::EN_2, true },
|
||||
{ "greek", "Greek", "ελληνικά", "CP1253", "el_GR.UTF-8", "el", "ell", "%d/%m/%Y %H:%M", EPluralForms::EN_2, false },
|
||||
{ "hungarian", "Hungarian", "Magyar", "CP1250", "hu_HU.UTF-8", "hu", "hun", "%Y. %m. %d. %H:%M", EPluralForms::EN_2, true },
|
||||
{ "italian", "Italian", "Italiano", "CP1250", "it_IT.UTF-8", "it", "ita", "%d/%m/%Y %H:%M", EPluralForms::EN_2, true },
|
||||
{ "japanese", "Japanese", "日本語", "JIS", "ja_JP.UTF-8", "ja", "jpn", "%Y年%m月%d日 %H:%M", EPluralForms::NONE, false },
|
||||
{ "korean", "Korean", "한국어", "CP949", "ko_KR.UTF-8", "ko", "kor", "%Y-%m-%d %H:%M", EPluralForms::VI_1, true },
|
||||
{ "polish", "Polish", "Polski", "CP1250", "pl_PL.UTF-8", "pl", "pol", "%d.%m.%Y %H:%M", EPluralForms::PL_3, true },
|
||||
{ "portuguese", "Portuguese", "Português", "CP1252", "pt_BR.UTF-8", "pt", "por", "%d/%m/%Y %H:%M", EPluralForms::EN_2, true }, // Note: actually Brazilian Portuguese
|
||||
{ "romanian", "Romanian", "Română", "CP28606", "ro_RO.UTF-8", "ro", "rum", "%Y-%m-%d %H:%M", EPluralForms::RO_3, false },
|
||||
{ "russian", "Russian", "Русский", "CP1251", "ru_RU.UTF-8", "ru", "rus", "%d.%m.%Y %H:%M", EPluralForms::UK_3, true },
|
||||
{ "spanish", "Spanish", "Español", "CP1252", "es_ES.UTF-8", "es", "spa", "%d/%m/%Y %H:%M", EPluralForms::EN_2, true },
|
||||
{ "swedish", "Swedish", "Svenska", "CP1252", "sv_SE.UTF-8", "sv", "swe", "%Y-%m-%d %H:%M", EPluralForms::EN_2, true },
|
||||
{ "norwegian", "Norwegian", "Norsk Bokmål", "UTF-8", "nb_NO.UTF-8", "nb", "nor", "%d/%m/%Y %H:%M", EPluralForms::EN_2, false },
|
||||
{ "turkish", "Turkish", "Türkçe", "CP1254", "tr_TR.UTF-8", "tr", "tur", "%d.%m.%Y %H:%M", EPluralForms::EN_2, true },
|
||||
{ "ukrainian", "Ukrainian", "Українська", "CP1251", "uk_UA.UTF-8", "uk", "ukr", "%d.%m.%Y %H:%M", EPluralForms::UK_3, true },
|
||||
{ "vietnamese", "Vietnamese", "Tiếng Việt", "UTF-8", "vi_VN.UTF-8", "vi", "vie", "%d/%m/%Y %H:%M", EPluralForms::VI_1, true }, // Fan translation uses special encoding
|
||||
{ "belarusian", "Belarusian", "Беларускі", "CP1251", "be_BY", "be", "bel", "%d.%m.%Y %H:%M", EPluralForms::UK_3, true },
|
||||
{ "bulgarian", "Bulgarian", "Български", "CP1251", "bg_BG", "bg", "bul", "%d.%m.%Y %H:%M", EPluralForms::EN_2, true },
|
||||
{ "czech", "Czech", "Čeština", "CP1250", "cs_CZ", "cs", "cze", "%d.%m.%Y %H:%M", EPluralForms::CZ_3, true },
|
||||
{ "chinese", "Chinese", "简体中文", "GBK", "zh_CN", "zh", "chi", "%Y-%m-%d %H:%M", EPluralForms::VI_1, true }, // Note: actually Simplified Chinese
|
||||
{ "english", "English", "English", "CP1252", "en_US", "en", "eng", "%Y-%m-%d %H:%M", EPluralForms::EN_2, true }, // English uses international date/time format here
|
||||
{ "finnish", "Finnish", "Suomi", "CP1252", "fi_FI", "fi", "fin", "%d.%m.%Y %H:%M", EPluralForms::EN_2, true },
|
||||
{ "french", "French", "Français", "CP1252", "fr_FR", "fr", "fre", "%d/%m/%Y %H:%M", EPluralForms::FR_2, true },
|
||||
{ "german", "German", "Deutsch", "CP1252", "de_DE", "de", "ger", "%d.%m.%Y %H:%M", EPluralForms::EN_2, true },
|
||||
{ "greek", "Greek", "ελληνικά", "CP1253", "el_GR", "el", "ell", "%d/%m/%Y %H:%M", EPluralForms::EN_2, false },
|
||||
{ "hungarian", "Hungarian", "Magyar", "CP1250", "hu_HU", "hu", "hun", "%Y. %m. %d. %H:%M", EPluralForms::EN_2, true },
|
||||
{ "italian", "Italian", "Italiano", "CP1250", "it_IT", "it", "ita", "%d/%m/%Y %H:%M", EPluralForms::EN_2, true },
|
||||
{ "japanese", "Japanese", "日本語", "JIS", "ja_JP", "ja", "jpn", "%Y年%m月%d日 %H:%M", EPluralForms::VI_1, false },
|
||||
{ "korean", "Korean", "한국어", "CP949", "ko_KR", "ko", "kor", "%Y-%m-%d %H:%M", EPluralForms::VI_1, true },
|
||||
{ "polish", "Polish", "Polski", "CP1250", "pl_PL", "pl", "pol", "%d.%m.%Y %H:%M", EPluralForms::PL_3, true },
|
||||
{ "portuguese", "Portuguese", "Português", "CP1252", "pt_BR", "pt", "por", "%d/%m/%Y %H:%M", EPluralForms::EN_2, true }, // Note: actually Brazilian Portuguese
|
||||
{ "romanian", "Romanian", "Română", "CP28606", "ro_RO", "ro", "rum", "%Y-%m-%d %H:%M", EPluralForms::RO_3, false },
|
||||
{ "russian", "Russian", "Русский", "CP1251", "ru_RU", "ru", "rus", "%d.%m.%Y %H:%M", EPluralForms::UK_3, true },
|
||||
{ "spanish", "Spanish", "Español", "CP1252", "es_ES", "es", "spa", "%d/%m/%Y %H:%M", EPluralForms::EN_2, true },
|
||||
{ "swedish", "Swedish", "Svenska", "CP1252", "sv_SE", "sv", "swe", "%Y-%m-%d %H:%M", EPluralForms::EN_2, true },
|
||||
{ "norwegian", "Norwegian", "Norsk Bokmål", "UTF-8", "nb_NO", "nb", "nor", "%d/%m/%Y %H:%M", EPluralForms::EN_2, false },
|
||||
{ "turkish", "Turkish", "Türkçe", "CP1254", "tr_TR", "tr", "tur", "%d.%m.%Y %H:%M", EPluralForms::EN_2, true },
|
||||
{ "ukrainian", "Ukrainian", "Українська", "CP1251", "uk_UA", "uk", "ukr", "%d.%m.%Y %H:%M", EPluralForms::UK_3, true },
|
||||
{ "vietnamese", "Vietnamese", "Tiếng Việt", "UTF-8", "vi_VN", "vi", "vie", "%d/%m/%Y %H:%M", EPluralForms::VI_1, true }, // Fan translation uses special encoding
|
||||
} };
|
||||
static_assert(languages.size() == static_cast<size_t>(ELanguages::COUNT), "Languages array is missing a value!");
|
||||
|
||||
|
@@ -17,7 +17,7 @@
|
||||
|
||||
#include <vstd/DateUtils.h>
|
||||
|
||||
#include <boost/locale.hpp>
|
||||
#include <boost/locale/encoding.hpp>
|
||||
|
||||
VCMI_LIB_NAMESPACE_BEGIN
|
||||
|
||||
@@ -97,7 +97,7 @@ bool TextOperations::isValidASCII(const char * data, size_t size)
|
||||
return true;
|
||||
}
|
||||
|
||||
bool TextOperations::isValidUnicodeString(const std::string & text)
|
||||
bool TextOperations::isValidUnicodeString(std::string_view text)
|
||||
{
|
||||
for (size_t i=0; i<text.size(); i += getUnicodeCharacterSize(text[i]))
|
||||
{
|
||||
@@ -210,7 +210,7 @@ void TextOperations::trimRightUnicode(std::string & text, const size_t amount)
|
||||
}
|
||||
}
|
||||
|
||||
size_t TextOperations::getUnicodeCharactersCount(const std::string & text)
|
||||
size_t TextOperations::getUnicodeCharactersCount(std::string_view text)
|
||||
{
|
||||
size_t charactersCount = 0;
|
||||
|
||||
@@ -253,10 +253,65 @@ std::string TextOperations::getCurrentFormattedDateTimeLocal(std::chrono::second
|
||||
return TextOperations::getFormattedDateTimeLocal(std::chrono::system_clock::to_time_t(timepoint));
|
||||
}
|
||||
|
||||
static const std::locale & getLocale()
|
||||
{
|
||||
auto getLocale = []() -> std::locale
|
||||
{
|
||||
const std::string & baseLocaleName = Languages::getLanguageOptions(LIBRARY->generaltexth->getPreferredLanguage()).localeName;
|
||||
const std::string fallbackLocale = Languages::getLanguageOptions(Languages::ELanguages::ENGLISH).localeName;
|
||||
|
||||
for (const auto & localeName : { baseLocaleName + ".UTF-8", baseLocaleName, fallbackLocale + ".UTF-8", fallbackLocale })
|
||||
{
|
||||
try
|
||||
{
|
||||
// Locale generation may fail (and throw an exception) in two cases:
|
||||
// - if the corresponding locale is not installed on the system
|
||||
// - on Android named locales are not supported at all and always throw an exception
|
||||
return std::locale(localeName);
|
||||
}
|
||||
catch (const std::exception & e)
|
||||
{
|
||||
logGlobal->warn("Failed to set locale '%s'", localeName);
|
||||
}
|
||||
}
|
||||
return std::locale();
|
||||
};
|
||||
|
||||
static const std::locale locale = getLocale();
|
||||
return locale;
|
||||
}
|
||||
|
||||
int TextOperations::getLevenshteinDistance(std::string_view s, std::string_view t)
|
||||
{
|
||||
int n = t.size();
|
||||
int m = s.size();
|
||||
assert(isValidUnicodeString(s));
|
||||
assert(isValidUnicodeString(t));
|
||||
|
||||
auto charactersEqual = [&s, &t](int sPoint, int tPoint)
|
||||
{
|
||||
uint32_t sUTF32 = getUnicodeCodepoint(s.data() + sPoint, s.size() - sPoint);
|
||||
uint32_t tUTF32 = getUnicodeCodepoint(t.data() + tPoint, t.size() - tPoint);
|
||||
|
||||
if (sUTF32 == tUTF32)
|
||||
return true;
|
||||
|
||||
// Windows - wchar_t represents UTF-16 symbol that does not cover entire Unicode
|
||||
// In UTF-16 such characters can only be represented as 2 wchar_t's, but toupper can only operate on single wchar
|
||||
// Assume symbols are different if one of them cannot be represented as a single UTF-16 symbol
|
||||
if constexpr (sizeof(wchar_t) == 2)
|
||||
{
|
||||
if (sUTF32 > 0xFFFF || (sUTF32 >= 0xD800 && sUTF32 <= 0xDFFF ))
|
||||
return false;
|
||||
|
||||
if (tUTF32 > 0xFFFF || (tUTF32 >= 0xD800 && tUTF32 <= 0xDFFF ))
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto & facet = std::use_facet<std::ctype<wchar_t>>(getLocale());
|
||||
return facet.toupper(sUTF32) == facet.toupper(tUTF32);
|
||||
};
|
||||
|
||||
int n = getUnicodeCharactersCount(t);
|
||||
int m = getUnicodeCharactersCount(s);
|
||||
|
||||
// create two work vectors of integer distances
|
||||
std::vector<int> v0(n+1, 0);
|
||||
@@ -268,8 +323,9 @@ int TextOperations::getLevenshteinDistance(std::string_view s, std::string_view
|
||||
for (int i = 0; i < n; ++i)
|
||||
v0[i] = i;
|
||||
|
||||
for (int i = 0; i < m; ++i)
|
||||
for (int i = 0, iPoint = 0; i < m; ++i, iPoint += getUnicodeCharacterSize(s[iPoint]))
|
||||
{
|
||||
|
||||
// calculate v1 (current row distances) from the previous row v0
|
||||
|
||||
// first element of v1 is A[i + 1][0]
|
||||
@@ -277,14 +333,14 @@ int TextOperations::getLevenshteinDistance(std::string_view s, std::string_view
|
||||
v1[0] = i + 1;
|
||||
|
||||
// use formula to fill in the rest of the row
|
||||
for (int j = 0; j < n; ++j)
|
||||
for (int j = 0, jPoint = 0; j < n; ++j, jPoint += getUnicodeCharacterSize(t[jPoint]))
|
||||
{
|
||||
// calculating costs for A[i + 1][j + 1]
|
||||
int deletionCost = v0[j + 1] + 1;
|
||||
int insertionCost = v1[j] + 1;
|
||||
int substitutionCost;
|
||||
|
||||
if (s[i] == t[j])
|
||||
if (charactersEqual(iPoint, jPoint))
|
||||
substitutionCost = v0[j];
|
||||
else
|
||||
substitutionCost = v0[j] + 1;
|
||||
@@ -301,45 +357,17 @@ int TextOperations::getLevenshteinDistance(std::string_view s, std::string_view
|
||||
return v0[n];
|
||||
}
|
||||
|
||||
/// Retrieves the locale based on the selected (in config) game language.
///
/// The locale is built exactly once, on the first call, and the same object is returned
/// on every later call. Previously the function-local static was re-assigned on each call,
/// which (a) raced when called from several threads and (b) destroyed the previous locale
/// object while callers could still hold references to its facets obtained via std::use_facet.
///
/// Fallback order: preferred language locale -> "en_US.UTF-8" -> classic "C" locale.
/// Note: a language change after the first call is not picked up by this function.
DLL_LINKAGE const std::locale & TextOperations::getLocale()
{
	// Initialization of a function-local static is thread-safe and happens only once,
	// so the returned reference (and any facet obtained from it) stays valid afterwards.
	static const std::locale loc = []() -> std::locale
	{
		const std::string & localeName = Languages::getLanguageOptions(LIBRARY->generaltexth->getPreferredLanguage()).localeName;

		try
		{
			return std::locale(localeName); // might fail on Android
		}
		catch (const std::exception &)
		{
			logGlobal->warn("Failed to set locale '%s'. Falling back to 'en_US.UTF-8'", localeName);
		}

		try
		{
			return std::locale("en_US.UTF-8");
		}
		catch (...)
		{
			// deliberate best-effort: any failure here ends in the always-available classic locale
			logGlobal->warn("Fallback locale 'en_US.UTF-8' failed. Using default 'C' locale.");
		}

		return std::locale::classic();
	}();

	return loc;
}
|
||||
|
||||
/// Compares two strings using the std::collate<char> facet of the locale returned by getLocale().
/// @param str1 left-hand string
/// @param str2 right-hand string
/// @return true if str1 orders strictly before str2 according to that facet
DLL_LINKAGE bool TextOperations::compareLocalizedStrings(std::string_view str1, std::string_view str2)
{
	// The facet is looked up on every call rather than cached in a static reference:
	// a reference returned by std::use_facet is only guaranteed to stay valid while
	// a locale object holding that facet exists.
	const std::collate<char> & col = std::use_facet<std::collate<char>>(getLocale());

	return col.compare(
		str1.data(), str1.data() + str1.size(),
		str2.data(), str2.data() + str2.size()
	) < 0;
}
|
||||
|
||||
std::optional<int> TextOperations::textSearchSimilarityScore(const std::string & s, const std::string & t)
|
||||
std::optional<int> TextOperations::textSearchSimilarityScore(const std::string & needle, const std::string & haystack)
|
||||
{
|
||||
static const std::collate<char> & col = std::use_facet<std::collate<char>>(getLocale());
|
||||
|
||||
auto haystack = col.transform(t.data(), t.data() + t.size());
|
||||
auto needle = col.transform(s.data(), s.data() + s.size());
|
||||
|
||||
// 0 - Best possible match: text starts with the search string
|
||||
if(haystack.rfind(needle, 0) == 0)
|
||||
return 0;
|
||||
@@ -349,13 +377,24 @@ std::optional<int> TextOperations::textSearchSimilarityScore(const std::string &
|
||||
return 1;
|
||||
|
||||
// Dynamic threshold: Reject if too many typos based on word length
|
||||
int maxAllowedDistance = std::max(2, static_cast<int>(needle.size() / 2));
|
||||
int haystackCodepoints = getUnicodeCharactersCount(haystack);
|
||||
int needleCodepoints = getUnicodeCharactersCount(needle);
|
||||
int maxAllowedDistance = needleCodepoints / 2;
|
||||
|
||||
// Compute Levenshtein distance for fuzzy similarity
|
||||
int minDist = std::numeric_limits<int>::max();
|
||||
for(size_t i = 0; i <= haystack.size() - needle.size(); i++)
|
||||
|
||||
for(int i = 0; i <= haystackCodepoints - needleCodepoints; ++i)
|
||||
{
|
||||
int dist = getLevenshteinDistance(haystack.substr(i, needle.size()), needle);
|
||||
int haystackBegin = 0;
|
||||
for(int j = 0; j < i; ++j)
|
||||
haystackBegin += getUnicodeCharacterSize(haystack[haystackBegin]);
|
||||
|
||||
int haystackEnd = haystackBegin;
|
||||
for(int j = 0; j < needleCodepoints; ++j)
|
||||
haystackEnd += getUnicodeCharacterSize(haystack[haystackEnd]);
|
||||
|
||||
int dist = getLevenshteinDistance(haystack.substr(haystackBegin, haystackEnd - haystackBegin), needle);
|
||||
minDist = std::min(minDist, dist);
|
||||
}
|
||||
|
||||
|
@@ -33,7 +33,7 @@ namespace TextOperations
|
||||
bool DLL_LINKAGE isValidASCII(const char * data, size_t size);
|
||||
|
||||
/// test if text contains valid UTF-8 sequence
|
||||
bool DLL_LINKAGE isValidUnicodeString(const std::string & text);
|
||||
bool DLL_LINKAGE isValidUnicodeString(std::string_view text);
|
||||
bool DLL_LINKAGE isValidUnicodeString(const char * data, size_t size);
|
||||
|
||||
/// converts text to UTF-8 from specified encoding or from one specified in settings
|
||||
@@ -47,7 +47,7 @@ namespace TextOperations
|
||||
DLL_LINKAGE void trimRightUnicode(std::string & text, size_t amount = 1);
|
||||
|
||||
/// returns the number of unicode characters in the text
|
||||
size_t DLL_LINKAGE getUnicodeCharactersCount(const std::string & text);
|
||||
size_t DLL_LINKAGE getUnicodeCharactersCount(std::string_view text);
|
||||
|
||||
/// converts number into string using metric system prefixes, e.g. 'k' or 'M' to keep resulting strings within specified size
|
||||
/// Note that resulting string may have more symbols than digits: minus sign and prefix symbol
|
||||
@@ -76,9 +76,6 @@ namespace TextOperations
|
||||
/// https://en.wikipedia.org/wiki/Levenshtein_distance#Iterative_with_two_matrix_rows
|
||||
DLL_LINKAGE int getLevenshteinDistance(std::string_view s, std::string_view t);
|
||||
|
||||
/// Retrieves the locale based on the selected (in config) game language.
|
||||
DLL_LINKAGE const std::locale & getLocale();
|
||||
|
||||
/// Compares two strings using locale-aware collation based on the selected game language.
|
||||
DLL_LINKAGE bool compareLocalizedStrings(std::string_view str1, std::string_view str2);
|
||||
|
||||
|
@@ -1565,9 +1565,8 @@ void BattleActionProcessor::addGenericResurrectedLog(BattleLogMessage& blm, cons
|
||||
{
|
||||
if (resurrected > 0)
|
||||
{
|
||||
auto text = blm.lines.back().toString();
|
||||
text.pop_back(); // erase '.' at the end of line with life drain info
|
||||
MetaString ms = MetaString::createFromRawString(text);
|
||||
MetaString & ms = blm.lines.back();
|
||||
|
||||
if (resurrected == 1)
|
||||
{
|
||||
ms.appendLocalString(EMetaText::GENERAL_TXT, 363); // "\n and one rises from the dead."
|
||||
@@ -1577,9 +1576,7 @@ void BattleActionProcessor::addGenericResurrectedLog(BattleLogMessage& blm, cons
|
||||
ms.appendLocalString(EMetaText::GENERAL_TXT, 364); // "\n and %d rise from the dead."
|
||||
ms.replaceNumber(resurrected);
|
||||
}
|
||||
blm.lines[blm.lines.size() - 1] = std::move(ms);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
bool BattleActionProcessor::makeAutomaticBattleAction(const CBattleInfoCallback & battle, const BattleAction & ba)
|
||||
|
Reference in New Issue
Block a user