Renamed Unicode -> TextOperations so that it can be used for all text processing

2025-11-23 22:37:55 +02:00 · 2023-02-12 23:52:35 +02:00
parent 65c020ef34
commit acdb8d6e06
19 changed files with 142 additions and 123 deletions
--- a/lib/TextOperations.cpp
+++ b/lib/TextOperations.cpp
@@ -16,39 +16,52 @@

 VCMI_LIB_NAMESPACE_BEGIN

-size_t Unicode::getCharacterSize(char firstByte)
+size_t TextOperations::getUnicodeCharacterSize(char firstByte)
 {
 	// length of utf-8 character can be determined from 1st byte by counting number of highest bits set to 1:
 	// 0xxxxxxx -> 1 -  ASCII chars
 	// 110xxxxx -> 2
+	// 1110xxxx -> 3
 	// 11110xxx -> 4 - last allowed in current standard
-	// 1111110x -> 6 - last allowed in original standard

-	if ((ui8)firstByte < 0x80)
+	auto value = static_cast<uint8_t>(firstByte); // look at the byte as an unsigned value before testing its high bits
+
+	if ((value & 0b10000000) == 0)
 		return 1; // ASCII

-	size_t ret = 0;
+	if ((value & 0b11100000) == 0b11000000)
+		return 2;

-	for (size_t i=0; i<8; i++)
-	{
-		if (((ui8)firstByte & (0x80 >> i)) != 0)
-			ret++;
-		else
-			break;
-	}
-	return ret;
+	if ((value & 0b11110000) == 0b11100000)
+		return 3;
+
+	if ((value & 0b11111000) == 0b11110000)
+		return 4;
+
+	assert(0);// invalid unicode sequence: byte is 10xxxxxx (continuation byte) or 11111xxx
+	return 4; // reached only when the assert above is compiled out
 }

-bool Unicode::isValidCharacter(const char * character, size_t maxSize)
+bool TextOperations::isValidUnicodeCharacter(const char * character, size_t maxSize) // true if a well-formed UTF-8 character starts at 'character' and fits into maxSize bytes
 {
-	// can't be first byte in UTF8
-	if ((ui8)character[0] >= 0x80 && (ui8)character[0] < 0xC0)
-		return false;
-	// first character must follow rules checked in getCharacterSize
-	size_t size = getCharacterSize((ui8)character[0]);
+	assert(maxSize > 0);

-	if ((ui8)character[0] > 0xF4)
-		return false; // above maximum allowed in standard (UTF codepoints are capped at 0x0010FFFF)
+	auto value = static_cast<uint8_t>(character[0]);
+
+	// ASCII
+	if ( value < 0b10000000)
+		return maxSize > 0;
+
+	// can't be first byte in UTF8
+	if (value < 0b11000000)
+		return false;
+
+	// above maximum allowed in standard (UTF codepoints are capped at 0x0010FFFF, so the highest lead byte is 0xF4)
+	if (value > 0b11110100) // 0xF4, same bound as before the rename; 0xF0 would reject valid lead bytes 0xF1..0xF4
+		return false;
+
+	// first character must follow rules checked in getUnicodeCharacterSize
+	size_t size = getUnicodeCharacterSize(character[0]);

 	if (size > maxSize)
 		return false;
@@ -56,69 +69,70 @@ bool Unicode::isValidCharacter(const char * character, size_t maxSize)
 	// remaining characters must have highest bit set to 1
 	for (size_t i = 1; i < size; i++)
 	{
-		if (((ui8)character[i] & 0x80) == 0)
+		auto characterValue = static_cast<uint8_t>(character[i]);
+		if ((characterValue & 0b11000000) != 0b10000000) // continuation bytes are exactly 10xxxxxx; testing only the high bit would accept lead bytes here
 			return false;
 	}
 	return true;
 }

-bool Unicode::isValidASCII(const std::string & text)
+bool TextOperations::isValidASCII(const std::string & text) // true if no byte of text has its high bit set
 {
 	for (const char & ch : text)
-		if (ui8(ch) >= 0x80 )
+		if (static_cast<uint8_t>(ch) >= 0x80 ) // compare as unsigned: values 0x80 and above are outside 7-bit ASCII
 			return false;
 	return true;
 }

-bool Unicode::isValidASCII(const char * data, size_t size)
+bool TextOperations::isValidASCII(const char * data, size_t size) // true if none of the first 'size' bytes of data has its high bit set
 {
 	for (size_t i=0; i<size; i++)
-		if (ui8(data[i]) >= 0x80 )
+		if (static_cast<uint8_t>(data[i]) >= 0x80 ) // compare as unsigned: values 0x80 and above are outside 7-bit ASCII
 			return false;
 	return true;
 }

-bool Unicode::isValidString(const std::string & text)
+bool TextOperations::isValidUnicodeString(const std::string & text) // true if every character in text passes isValidUnicodeCharacter
 {
-	for (size_t i=0; i<text.size(); i += getCharacterSize(text[i]))
+	for (size_t i=0; i<text.size(); i += getUnicodeCharacterSize(text[i])) // step by the size announced by the first byte of the character just checked
 	{
-		if (!isValidCharacter(text.data() + i, text.size() - i))
+		if (!isValidUnicodeCharacter(text.data() + i, text.size() - i)) // second argument: number of bytes left in the string
 			return false;
 	}
 	return true;
 }

-bool Unicode::isValidString(const char * data, size_t size)
+bool TextOperations::isValidUnicodeString(const char * data, size_t size) // true if every character in the first 'size' bytes of data passes isValidUnicodeCharacter
 {
-	for (size_t i=0; i<size; i += getCharacterSize(data[i]))
+	for (size_t i=0; i<size; i += getUnicodeCharacterSize(data[i])) // step by the size announced by the first byte of the character just checked
 	{
-		if (!isValidCharacter(data + i, size - i))
+		if (!isValidUnicodeCharacter(data + i, size - i)) // second argument: number of bytes left in the buffer
 			return false;
 	}
 	return true;
 }

-std::string Unicode::toUnicode(const std::string &text)
+std::string TextOperations::toUnicode(const std::string &text) // converts using the encoding returned by CGeneralTextHandler::getInstalledEncoding()
 {
 	return toUnicode(text, CGeneralTextHandler::getInstalledEncoding());
 }

-std::string Unicode::toUnicode(const std::string &text, const std::string &encoding)
+std::string TextOperations::toUnicode(const std::string &text, const std::string &encoding) // thin wrapper over boost::locale::conv::to_utf<char>
 {
 	return boost::locale::conv::to_utf<char>(text, encoding);
 }

-std::string Unicode::fromUnicode(const std::string & text)
+std::string TextOperations::fromUnicode(const std::string & text) // converts using the encoding returned by CGeneralTextHandler::getInstalledEncoding()
 {
 	return fromUnicode(text, CGeneralTextHandler::getInstalledEncoding());
 }

-std::string Unicode::fromUnicode(const std::string &text, const std::string &encoding)
+std::string TextOperations::fromUnicode(const std::string &text, const std::string &encoding) // thin wrapper over boost::locale::conv::from_utf<char>
 {
 	return boost::locale::conv::from_utf<char>(text, encoding);
 }

-void Unicode::trimRight(std::string & text, const size_t amount)
+void TextOperations::trimRightUnicode(std::string & text, const size_t amount)
 {
 	if(text.empty())
 		return;
@@ -130,9 +144,9 @@ void Unicode::trimRight(std::string & text, const size_t amount)
 		size_t len = 0;
 		while (b != e) {
 			lastLen = len;
-			size_t n = getCharacterSize(*b);
+			size_t n = getUnicodeCharacterSize(*b);

-			if(!isValidCharacter(&(*b),e-b))
+			if(!isValidUnicodeCharacter(&(*b),e-b))
 			{
 				logGlobal->error("Invalid UTF8 sequence");
 				break;//invalid sequence will be trimmed
@@ -146,4 +160,15 @@ void Unicode::trimRight(std::string & text, const size_t amount)
 	}
 }

+std::string TextOperations::escapeString(std::string input) // input is taken by value: the copy is modified in place and returned
+{
+	boost::replace_all(input, "\\", "\\\\"); // backslashes must be handled first, otherwise the ones inserted below would be doubled
+	boost::replace_all(input, "\n", "\\n"); // line feed -> backslash + 'n'
+	boost::replace_all(input, "\r", "\\r"); // carriage return -> backslash + 'r'
+	boost::replace_all(input, "\t", "\\t"); // tab -> backslash + 't'
+	boost::replace_all(input, "\"", "\\\""); // double quote -> backslash + double quote
+
+	return input;
+}
+
 VCMI_LIB_NAMESPACE_END