vcmi/lib/TextOperations.cpp

/*
 * TextOperations.cpp, part of VCMI engine
 *
 * Authors: listed in file AUTHORS in main folder
 *
 * License: GNU General Public License v2.0 or later
 * Full text of license available in license.txt file, in main folder
 *
 */
#include "StdInc.h"
#include "TextOperations.h"

#include "CGeneralTextHandler.h"
#include "Languages.h"
#include "CConfigHandler.h"

#include <vstd/DateUtils.h>

#include <boost/locale.hpp>

VCMI_LIB_NAMESPACE_BEGIN

// Returns the number of bytes occupied by a UTF-8 character, judging only by its first byte.
// A byte that cannot start a character triggers an assertion; when assertions are disabled 4 is returned for it.
size_t TextOperations::getUnicodeCharacterSize(char firstByte)
{
	// the run of 1-bits at the top of the first byte encodes the total length of the sequence:
	// 0xxxxxxx -> 1 byte (ASCII)
	// 110xxxxx -> 2 bytes
	// 1110xxxx -> 3 bytes
	// 11110xxx -> 4 bytes, longest form permitted by the current standard

	const auto lead = static_cast<uint8_t>(firstByte);

	// compare the prefix bits by shifting the payload bits out
	if ((lead >> 7) == 0b0)
		return 1;

	if ((lead >> 5) == 0b110)
		return 2;

	if ((lead >> 4) == 0b1110)
		return 3;

	if ((lead >> 3) == 0b11110)
		return 4;

	// continuation byte (10xxxxxx) or 11111xxx: not a valid first byte
	assert(0);
	return 4;
}

// Checks whether the buffer starts with one structurally well-formed UTF-8 character.
//  character - pointer to the first byte of the candidate character
//  maxSize   - number of readable bytes starting at 'character'; expected to be > 0
// Returns true if the leading byte is allowed, the whole sequence fits into maxSize bytes,
// every following byte is a continuation byte and the encoded value does not exceed 0x0010FFFF.
// NOTE: overlong encodings and UTF-16 surrogate codepoints are not rejected by this check.
bool TextOperations::isValidUnicodeCharacter(const char * character, size_t maxSize)
{
	assert(maxSize > 0);

	// never read from an empty buffer, even when assertions are compiled out
	if (maxSize == 0)
		return false;

	auto value = static_cast<uint8_t>(character[0]);

	// ASCII
	if (value < 0b10000000)
		return true;

	// continuation byte - can't be first byte in UTF8
	if (value < 0b11000000)
		return false;

	// above maximum allowed in standard: UTF codepoints are capped at 0x0010FFFF,
	// and the encoding of that codepoint starts with 0b11110100
	if (value > 0b11110100)
		return false;

	// first character must follow rules checked in getUnicodeCharacterSize
	size_t size = getUnicodeCharacterSize(character[0]);

	if (size > maxSize)
		return false;

	// remaining bytes must be continuation bytes, i.e. match 10xxxxxx
	for (size_t i = 1; i < size; i++)
	{
		auto characterValue = static_cast<uint8_t>(character[i]);
		if ((characterValue & 0b11000000) != 0b10000000)
			return false;
	}

	// leading byte 0b11110100 reaches past 0x0010FFFF once the second byte exceeds 0b10001111
	if (value == 0b11110100 && static_cast<uint8_t>(character[1]) > 0b10001111)
		return false;

	return true;
}

// Returns true when every byte of the string is below 0x80, i.e. plain 7-bit ASCII.
// An empty string is reported as valid.
bool TextOperations::isValidASCII(const std::string & text)
{
	for (auto it = text.begin(); it != text.end(); ++it)
	{
		// any byte with the top bit set lies outside of the ASCII range
		const bool highBitSet = (static_cast<uint8_t>(*it) & 0x80) != 0;
		if (highBitSet)
			return false;
	}
	return true;
}

// Returns true when each of the first 'size' bytes at 'data' is below 0x80, i.e. plain 7-bit ASCII.
// A size of 0 is reported as valid without reading from 'data'.
bool TextOperations::isValidASCII(const char * data, size_t size)
{
	const char * cursor = data;
	size_t remaining = size;

	while (remaining != 0)
	{
		// any byte with the top bit set lies outside of the ASCII range
		if ((static_cast<uint8_t>(*cursor) & 0x80) != 0)
			return false;

		++cursor;
		--remaining;
	}
	return true;
}

// Returns true when the whole string can be split into characters accepted by isValidUnicodeCharacter.
// An empty string is reported as valid.
bool TextOperations::isValidUnicodeString(const std::string & text)
{
	size_t offset = 0;

	while (offset < text.size())
	{
		const size_t bytesLeft = text.size() - offset;

		if (!isValidUnicodeCharacter(text.data() + offset, bytesLeft))
			return false;

		// character is known to be well-formed at this point, so its size can be taken from the first byte
		offset += getUnicodeCharacterSize(text[offset]);
	}
	return true;
}

// Returns true when the first 'size' bytes at 'data' can be split into characters accepted by isValidUnicodeCharacter.
// A size of 0 is reported as valid.
bool TextOperations::isValidUnicodeString(const char * data, size_t size)
{
	size_t offset = 0;

	while (offset < size)
	{
		const size_t bytesLeft = size - offset;

		if (!isValidUnicodeCharacter(data + offset, bytesLeft))
			return false;

		// character is known to be well-formed at this point, so its size can be taken from the first byte
		offset += getUnicodeCharacterSize(data[offset]);
	}
	return true;
}

// Decodes the UTF-8 character located at 'data' into its 32-bit codepoint.
//  data    - pointer to the first byte of the character
//  maxSize - number of readable bytes starting at 'data'
// Returns 0 (after tripping an assertion) when isValidUnicodeCharacter rejects the input.
uint32_t TextOperations::getUnicodeCodepoint(const char * data, size_t maxSize)
{
	const bool wellFormed = isValidUnicodeCharacter(data, maxSize);
	assert(wellFormed);
	if (!wellFormed)
		return 0;

	// https://en.wikipedia.org/wiki/UTF-8#Encoding
	const size_t length = getUnicodeCharacterSize(data[0]);
	const uint32_t leadByte = static_cast<uint8_t>(data[0]);
	uint32_t codepoint = 0;

	// strip the length prefix from the first byte, leaving only its payload bits
	switch (length)
	{
		case 1:
			codepoint = leadByte & 0b1111111;
			break;
		case 2:
			codepoint = leadByte & 0b11111;
			break;
		case 3:
			codepoint = leadByte & 0b1111;
			break;
		case 4:
			codepoint = leadByte & 0b111;
			break;
		default:
			assert(0);
			return 0;
	}

	// every following byte contributes its 6 lowest bits, most significant group first
	for (size_t index = 1; index < length; ++index)
	{
		const uint32_t payload = static_cast<uint8_t>(data[index]) & 0b111111;
		codepoint = (codepoint << 6) | payload;
	}

	return codepoint;
}

uint32_t TextOperations::getUnicodeCodepoint(char data, const std::string & encoding )
{
	std::string stringNative(1, data);
	std::string stringUnicode = toUnicode(stringNative, encoding);

	if (stringUnicode.empty())
		return 0;

	return getUnicodeCodepoint(stringUnicode.data(), stringUnicode.size());
}

std::string TextOperations::toUnicode(const std::string &text, const std::string &encoding)
{
	return boost::locale::conv::to_utf<char>(text, encoding);
}

std::string TextOperations::fromUnicode(const std::string &text, const std::string &encoding)
{
	return boost::locale::conv::from_utf<char>(text, encoding);
}

// Removes up to 'amount' UTF-8 characters from the end of 'text', in place.
//  text   - string to shorten; left untouched when it is empty or when amount is 0
//  amount - number of characters to remove; removing more characters than present leaves an empty string
// If the string contains a sequence rejected by isValidUnicodeCharacter, an error is logged once and
// that sequence together with everything after it is dropped, which counts as one removed character.
void TextOperations::trimRightUnicode(std::string & text, const size_t amount)
{
	if(text.empty() || amount == 0)
		return;

	// first pass: count well-formed characters and find where the well-formed prefix ends
	size_t validEnd = 0;
	size_t validCharacters = 0;
	bool malformedTail = false;

	while(validEnd < text.size())
	{
		// validate before asking for the size: getUnicodeCharacterSize asserts on bytes that can't start a character
		if(!isValidUnicodeCharacter(text.data() + validEnd, text.size() - validEnd))
		{
			logGlobal->error("Invalid UTF8 sequence");
			malformedTail = true;
			break;//invalid sequence will be trimmed
		}

		validEnd += getUnicodeCharacterSize(text[validEnd]);
		++validCharacters;
	}

	// dropping the malformed tail uses up one unit of the requested amount
	const size_t toTrim = malformedTail ? amount - 1 : amount;
	const size_t toKeep = validCharacters > toTrim ? validCharacters - toTrim : 0;

	// second pass: byte length of the characters that stay in the string
	size_t newLength = 0;
	for(size_t kept = 0; kept < toKeep; ++kept)
		newLength += getUnicodeCharacterSize(text[newLength]);

	text.resize(newLength);
}

// Returns the number of UTF-8 characters (codepoints) stored in 'text'.
// Every character contains exactly one byte that is not a continuation byte (10xxxxxx),
// so counting those bytes gives the character count without building a converted copy of the string.
// This avoids std::wstring_convert / std::codecvt_utf8, which are deprecated since C++17.
// Malformed input does not throw: each byte that is not a continuation byte counts as one character.
size_t TextOperations::getUnicodeCharactersCount(const std::string & text)
{
	size_t charactersCount = 0;

	for (const char byte : text)
	{
		const bool isContinuation = (static_cast<uint8_t>(byte) & 0b11000000) == 0b10000000;
		if (!isContinuation)
			++charactersCount;
	}

	return charactersCount;
}

// Returns a copy of 'input' in which backslash, line feed, carriage return, tab and double quote
// are replaced by their two-character backslash escapes; all other bytes are copied unchanged.
std::string TextOperations::escapeString(std::string input)
{
	std::string escaped;
	escaped.reserve(input.size());

	// single pass: each source byte maps to its own output, so backslashes added here are never escaped again
	for (const char symbol : input)
	{
		switch (symbol)
		{
			case '\\':
				escaped += "\\\\";
				break;
			case '\n':
				escaped += "\\n";
				break;
			case '\r':
				escaped += "\\r";
				break;
			case '\t':
				escaped += "\\t";
				break;
			case '\"':
				escaped += "\\\"";
				break;
			default:
				escaped += symbol;
				break;
		}
	}

	return escaped;
}

// Formats the timestamp with vstd::getFormattedDateTime, using the dateTimeFormat of the language
// selected in settings["general"]["language"].
std::string TextOperations::getFormattedDateTimeLocal(std::time_t dt)
{
	const auto & languageName = settings["general"]["language"].String();
	const auto & languageOptions = Languages::getLanguageOptions(languageName);

	return vstd::getFormattedDateTime(dt, languageOptions.dateTimeFormat);
}

VCMI_LIB_NAMESPACE_END
convert line endings from CRLF (Windows) to LF (Linux/Unix) Mixed line endings cause problems when exporting patches with git-format-patch and then trying to "git am" a patch with mixed and non-matching line endings. In such a situation git will fail to apply the patch. This commit runs the dos2unix tools on the remaining files with CRLF (\r\n) line endings to convert them to line-feeds (\n) only. Files that are Windows specific like .vcxproj and .props files were not converted. Closes: #3073 2023-10-19 16:19:09 +02:00			`/*`
			`* TextOperations.cpp, part of VCMI engine`
			`*`
			`* Authors: listed in file AUTHORS in main folder`
			`*`
			`* License: GNU General Public License v2.0 or later`
			`* Full text of license available in license.txt file, in main folder`
			`*`
			`*/`
			`#include "StdInc.h"`
			`#include "TextOperations.h"`

			`#include "CGeneralTextHandler.h"`
code review 2023-12-16 22:10:27 +02:00			`#include "Languages.h"`
			`#include "CConfigHandler.h"`

			`#include <vstd/DateUtils.h>`
convert line endings from CRLF (Windows) to LF (Linux/Unix) Mixed line endings cause problems when exporting patches with git-format-patch and then trying to "git am" a patch with mixed and non-matching line endings. In such a situation git will fail to apply the patch. This commit runs the dos2unix tools on the remaining files with CRLF (\r\n) line endings to convert them to line-feeds (\n) only. Files that are Windows specific like .vcxproj and .props files were not converted. Closes: #3073 2023-10-19 16:19:09 +02:00
			`#include <boost/locale.hpp>`

			`VCMI_LIB_NAMESPACE_BEGIN`

			`size_t TextOperations::getUnicodeCharacterSize(char firstByte)`
			`{`
			`// length of utf-8 character can be determined from 1st byte by counting number of highest bits set to 1:`
			`// 0xxxxxxx -> 1 - ASCII chars`
			`// 110xxxxx -> 2`
			`// 1110xxxx -> 3`
			`// 11110xxx -> 4 - last allowed in current standard`

			`auto value = static_cast<uint8_t>(firstByte);`

			`if ((value & 0b10000000) == 0)`
			`return 1; // ASCII`

			`if ((value & 0b11100000) == 0b11000000)`
			`return 2;`

			`if ((value & 0b11110000) == 0b11100000)`
			`return 3;`

			`if ((value & 0b11111000) == 0b11110000)`
			`return 4;`

			`assert(0);// invalid unicode sequence`
			`return 4;`
			`}`

			`bool TextOperations::isValidUnicodeCharacter(const char * character, size_t maxSize)`
			`{`
			`assert(maxSize > 0);`

			`auto value = static_cast<uint8_t>(character[0]);`

			`// ASCII`
			`if ( value < 0b10000000)`
			`return maxSize > 0;`

			`// can't be first byte in UTF8`
			`if (value < 0b11000000)`
			`return false;`

			`// above maximum allowed in standard (UTF codepoints are capped at 0x0010FFFF)`
			`if (value > 0b11110000)`
			`return false;`

			`// first character must follow rules checked in getUnicodeCharacterSize`
			`size_t size = getUnicodeCharacterSize(character[0]);`

			`if (size > maxSize)`
			`return false;`

			`// remaining characters must have highest bit set to 1`
			`for (size_t i = 1; i < size; i++)`
			`{`
			`auto characterValue = static_cast<uint8_t>(character[i]);`
			`if (characterValue < 0b10000000)`
			`return false;`
			`}`
			`return true;`
			`}`

			`bool TextOperations::isValidASCII(const std::string & text)`
			`{`
			`for (const char & ch : text)`
			`if (static_cast<uint8_t>(ch) >= 0x80 )`
			`return false;`
			`return true;`
			`}`

			`bool TextOperations::isValidASCII(const char * data, size_t size)`
			`{`
			`for (size_t i=0; i<size; i++)`
			`if (static_cast<uint8_t>(data[i]) >= 0x80 )`
			`return false;`
			`return true;`
			`}`

			`bool TextOperations::isValidUnicodeString(const std::string & text)`
			`{`
			`for (size_t i=0; i<text.size(); i += getUnicodeCharacterSize(text[i]))`
			`{`
			`if (!isValidUnicodeCharacter(text.data() + i, text.size() - i))`
			`return false;`
			`}`
			`return true;`
			`}`

			`bool TextOperations::isValidUnicodeString(const char * data, size_t size)`
			`{`
			`for (size_t i=0; i<size; i += getUnicodeCharacterSize(data[i]))`
			`{`
			`if (!isValidUnicodeCharacter(data + i, size - i))`
			`return false;`
			`}`
			`return true;`
			`}`

			`uint32_t TextOperations::getUnicodeCodepoint(const char * data, size_t maxSize)`
			`{`
			`assert(isValidUnicodeCharacter(data, maxSize));`
			`if (!isValidUnicodeCharacter(data, maxSize))`
			`return 0;`

			`// https://en.wikipedia.org/wiki/UTF-8#Encoding`
			`switch (getUnicodeCharacterSize(data[0]))`
			`{`
			`case 1:`
			`return static_cast<uint8_t>(data[0]) & 0b1111111;`
			`case 2:`
			`return`
			`((static_cast<uint8_t>(data[0]) & 0b11111 ) << 6) +`
			`((static_cast<uint8_t>(data[1]) & 0b111111) << 0) ;`
			`case 3:`
			`return`
			`((static_cast<uint8_t>(data[0]) & 0b1111 ) << 12) +`
			`((static_cast<uint8_t>(data[1]) & 0b111111) << 6) +`
			`((static_cast<uint8_t>(data[2]) & 0b111111) << 0) ;`
			`case 4:`
			`return`
			`((static_cast<uint8_t>(data[0]) & 0b111 ) << 18) +`
			`((static_cast<uint8_t>(data[1]) & 0b111111) << 12) +`
			`((static_cast<uint8_t>(data[2]) & 0b111111) << 6) +`
			`((static_cast<uint8_t>(data[3]) & 0b111111) << 0) ;`
			`}`

			`assert(0);`
			`return 0;`
			`}`

			`uint32_t TextOperations::getUnicodeCodepoint(char data, const std::string & encoding )`
			`{`
			`std::string stringNative(1, data);`
			`std::string stringUnicode = toUnicode(stringNative, encoding);`

			`if (stringUnicode.empty())`
			`return 0;`

			`return getUnicodeCodepoint(stringUnicode.data(), stringUnicode.size());`
			`}`

			`std::string TextOperations::toUnicode(const std::string &text, const std::string &encoding)`
			`{`
			`return boost::locale::conv::to_utf<char>(text, encoding);`
			`}`

			`std::string TextOperations::fromUnicode(const std::string &text, const std::string &encoding)`
			`{`
			`return boost::locale::conv::from_utf<char>(text, encoding);`
			`}`

			`void TextOperations::trimRightUnicode(std::string & text, const size_t amount)`
			`{`
			`if(text.empty())`
			`return;`
			`//todo: more efficient algorithm`
			`for(int i = 0; i< amount; i++){`
			`auto b = text.begin();`
			`auto e = text.end();`
			`size_t lastLen = 0;`
			`size_t len = 0;`
			`while (b != e) {`
			`lastLen = len;`
			`size_t n = getUnicodeCharacterSize(*b);`

			`if(!isValidUnicodeCharacter(&(*b),e-b))`
			`{`
			`logGlobal->error("Invalid UTF8 sequence");`
			`break;//invalid sequence will be trimmed`
			`}`

			`len += n;`
			`b += n;`
			`}`

			`text.resize(lastLen);`
			`}`
			`}`

			`size_t TextOperations::getUnicodeCharactersCount(const std::string & text)`
			`{`
			`std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;`
			`return conv.from_bytes(text).size();`
			`}`

			`std::string TextOperations::escapeString(std::string input)`
			`{`
			`boost::replace_all(input, "\\", "\\\\");`
			`boost::replace_all(input, "\n", "\\n");`
			`boost::replace_all(input, "\r", "\\r");`
			`boost::replace_all(input, "\t", "\\t");`
			`boost::replace_all(input, "\"", "\\\"");`

			`return input;`
			`}`

code review 2023-12-16 22:10:27 +02:00			`std::string TextOperations::getFormattedDateTimeLocal(std::time_t dt)`
			`{`
			`return vstd::getFormattedDateTime(dt, Languages::getLanguageOptions(settings["general"]["language"].String()).dateTimeFormat);`
			`}`

convert line endings from CRLF (Windows) to LF (Linux/Unix) Mixed line endings cause problems when exporting patches with git-format-patch and then trying to "git am" a patch with mixed and non-matching line endings. In such a situation git will fail to apply the patch. This commit runs the dos2unix tools on the remaining files with CRLF (\r\n) line endings to convert them to line-feeds (\n) only. Files that are Windows specific like .vcxproj and .props files were not converted. Closes: #3073 2023-10-19 16:19:09 +02:00			`VCMI_LIB_NAMESPACE_END`