Unicode support.

- boost-locale library is now required (boost 1.48 or higher)
- Unicode namespace that contains UTF-8 handling
- All non-ASCII strings from H3 data will be converted to UTF-8 during loading
- All JSON files MUST use UTF-8.
- H3 data encoding can be selected via launcher or directly in config file
2024-12-24 22:14:36 +02:00 · 2013-10-25 21:45:14 +00:00 · 2013-10-25 21:45:14 +00:00 · e2c037402c
commit e2c037402c
parent f6a3d6770f
12 changed files with 246 additions and 87 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -52,7 +52,7 @@ if (APPLE)
 	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ftemplate-depth=256")
 endif()

-find_package(Boost 1.46.0 COMPONENTS program_options filesystem system thread REQUIRED)
+find_package(Boost 1.48.0 COMPONENTS program_options filesystem system thread locale REQUIRED)
 find_package(SDL REQUIRED)
 find_package(SDL_image REQUIRED)
 find_package(SDL_mixer REQUIRED)
--- a/README.linux
+++ b/README.linux
@ -20,11 +20,12 @@ To compile, the following packages (and their development counterparts) are need
 	* zlib and zlib-devel
 	* (optional) Qt 5, widget and network modules
 	* the ffmpeg libraries (libavformat and libswscale). Their name could be libavformat-devel and libswscale-devel, or ffmpeg-libs-devel or similar names.
-	* boost c++ libraries v1.46+ (www.boost.org):
+	* boost c++ libraries v1.48+ (www.boost.org):
 		- program-options
 		- filesystem
 		- system
 		- thread
+		- locale

 On Debian-based systems (e.g. Ubuntu) run:
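- sudo apt-get install cmake g++ libsdl1.2debian libsdl-image1.2-dev libsdl-ttf2.0-dev libsdl-mixer1.2-dev zlib1g-dev libavformat-dev libswscale-dev libboost-dev libboost-filesystem-dev libboost-system-dev libboost-thread-dev libboost-program-options-dev
+ sudo apt-get install cmake g++ libsdl1.2debian libsdl-image1.2-dev libsdl-ttf2.0-dev libsdl-mixer1.2-dev zlib1g-dev libavformat-dev libswscale-dev libboost-dev libboost-filesystem-dev libboost-system-dev libboost-thread-dev libboost-program-options-dev libboost-locale-dev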
--- a/client/CMessage.cpp
+++ b/client/CMessage.cpp
@ -149,7 +149,7 @@ std::vector<std::string> CMessage::breakText( std::string text, size_t maxLineWi
 		// loops till line is full or end of text reached
 		while(currPos < text.length()  &&  text[currPos] != 0x0a  &&  lineWidth < maxLineWidth)
 		{
-			symbolSize = graphics->fonts[font]->getCharacterSize(text[currPos]);
+			symbolSize = Unicode::getCharacterSize(text[currPos]);
 			glyphWidth = graphics->fonts[font]->getGlyphWidth(text.data() + currPos);

 			// candidate for line break
--- a/client/gui/Fonts.cpp
+++ b/client/gui/Fonts.cpp
@ -7,6 +7,7 @@
 #include "../../lib/JsonNode.h"
 #include "../../lib/vcmi_endian.h"
 #include "../../lib/filesystem/Filesystem.h"
+#include "../../lib/CGeneralTextHandler.h"

 /*
 * Fonts.cpp, part of VCMI engine
@ -22,7 +23,7 @@ size_t IFont::getStringWidth(const std::string & data) const
 {
 	size_t width = 0;

-	for(size_t i=0; i<data.size(); i += getCharacterSize(data[i]))
+	for(size_t i=0; i<data.size(); i += Unicode::getCharacterSize(data[i]))
 	{
 		width += getGlyphWidth(data.data() + i);
 	}
@ -81,9 +82,9 @@ void IFont::renderTextLinesCenter(SDL_Surface * surface, const std::vector<std::
 	}
 }

-std::array<CBitmapFont::Char, CBitmapFont::totalChars> CBitmapFont::loadChars() const
+std::array<CBitmapFont::BitmapChar, CBitmapFont::totalChars> CBitmapFont::loadChars() const
 {
-	std::array<Char, totalChars> ret;
+	std::array<BitmapChar, totalChars> ret;

 	size_t offset = 32;

@ -117,11 +118,17 @@ size_t CBitmapFont::getLineHeight() const

 size_t CBitmapFont::getGlyphWidth(const char * data) const
 {
-	const Char & ch = chars[ui8(*data)];
-	return ch.leftOffset + ch.width + ch.rightOffset;
+	std::string localChar = Unicode::fromUnicode(std::string(data, Unicode::getCharacterSize(data[0])));
+
+	if (localChar.size() == 1)
+	{
+		const BitmapChar & ch = chars[ui8(localChar[0])];
+		return ch.leftOffset + ch.width + ch.rightOffset;
+	}
+	return 0;
 }

-void CBitmapFont::renderCharacter(SDL_Surface * surface, const Char & character, const SDL_Color & color, int &posX, int &posY) const
+void CBitmapFont::renderCharacter(SDL_Surface * surface, const BitmapChar & character, const SDL_Color & color, int &posX, int &posY) const
 {
 	Rect clipRect;
 	SDL_GetClipRect(surface, &clipRect);
@ -186,19 +193,17 @@ void CBitmapFont::renderText(SDL_Surface * surface, const std::string & data, co
 	//assert(data[data.size()-1] != '}');

 	SDL_LockSurface(surface);
-	// for each symbol
-	for(auto & elem : data)
+
+	for(size_t i=0; i<data.size(); i += Unicode::getCharacterSize(data[i]))
 	{
-		renderCharacter(surface, chars[ui8(elem)], color, posX, posY);
+		std::string localChar = Unicode::fromUnicode(data.substr(i, Unicode::getCharacterSize(data[i])));
+
+		if (localChar.size() == 1)
+			renderCharacter(surface, chars[ui8(localChar[0])], color, posX, posY);
 	}
 	SDL_UnlockSurface(surface);
 }

-size_t CBitmapFont::getCharacterSize(char data) const
-{
-	return 1;
-}
-
 std::pair<std::unique_ptr<ui8[]>, ui64> CTrueTypeFont::loadData(const JsonNode & config)
 {
 	std::string filename = "Data/" + config["file"].String();
@ -246,15 +251,18 @@ size_t CTrueTypeFont::getLineHeight() const

 size_t CTrueTypeFont::getGlyphWidth(const char *data) const
 {
+	return getStringWidth(std::string(data, Unicode::getCharacterSize(*data)));
+	/*
 	int advance;
 	TTF_GlyphMetrics(font.get(), *data, nullptr, nullptr, nullptr, nullptr, &advance);
 	return advance;
+	*/
 }

 size_t CTrueTypeFont::getStringWidth(const std::string & data) const
 {
-	int width;
-	TTF_SizeText(font.get(), data.c_str(), &width, nullptr);
+	// data is passed to SDL_ttf as UTF-8; result is the width of the rendered text
+	int width = 0;
+	// TTF_SizeUTF8 reports failure through a non-zero return value - width is not reliable then
+	if (TTF_SizeUTF8(font.get(), data.c_str(), &width, nullptr) != 0)
+		return 0;
 	return width;
 }

@ -282,11 +290,6 @@ void CTrueTypeFont::renderText(SDL_Surface * surface, const std::string & data,
 	}
 }

-size_t CTrueTypeFont::getCharacterSize(char data) const
-{
-	return 1;
-}
-
 size_t CBitmapHanFont::getCharacterDataOffset(size_t index) const
 {
 	size_t rowSize  = (size + 7) / 8; // 1 bit per pixel, rounded up
@ -350,12 +353,15 @@ void CBitmapHanFont::renderText(SDL_Surface * surface, const std::string & data,

 	SDL_LockSurface(surface);

-	for(size_t i=0; i<data.size(); i += getCharacterSize(data[i]))
+	for(size_t i=0; i<data.size(); i += Unicode::getCharacterSize(data[i]))
 	{
-		if (ui8(data[i]) < 0x80)
-			fallback->renderCharacter(surface, fallback->chars[data[i]], color, posX, posY);
-		else
-			renderCharacter(surface, getCharacterIndex(data[i], data[i+1]), color, posX, posY);
+		std::string localChar = Unicode::fromUnicode(data.substr(i, Unicode::getCharacterSize(data[i])));
+
+		if (localChar.size() == 1)
+			fallback->renderCharacter(surface, fallback->chars[ui8(localChar[0])], color, posX, posY);
+
+		if (localChar.size() == 2)
+			renderCharacter(surface, getCharacterIndex(localChar[0], localChar[1]), color, posX, posY);
 	}
 	SDL_UnlockSurface(surface);
 }
@ -368,25 +374,24 @@ CBitmapHanFont::CBitmapHanFont(const JsonNode &config):
 	// basic tests to make sure that fonts are OK
 	// 1) fonts must contain 190 "sections", 126 symbols each.
 	assert(getCharacterIndex(0xfe, 0xff) == 190*126);
-	// ensure that font size is correct - enough to fit all possible symbols
+	// 2) ensure that font size is correct - enough to fit all possible symbols
 	assert(getCharacterDataOffset(getCharacterIndex(0xfe, 0xff)) == data.second);
 }

 size_t CBitmapHanFont::getLineHeight() const
 {
-	return size + 1;
+	return std::max(size + 1, fallback->getLineHeight());
 }

 size_t CBitmapHanFont::getGlyphWidth(const char * data) const
 {
-	if (ui8(data[0]) < 0x80)
-		return fallback->getGlyphWidth(data);
-	return size + 1;
-}
+	std::string localChar = Unicode::fromUnicode(std::string(data, Unicode::getCharacterSize(data[0])));

-size_t CBitmapHanFont::getCharacterSize(char data) const
-{
-	if (ui8(data) < 0x80)
-		return 1;
-	return 2;
+	if (localChar.size() == 1)
+		return fallback->getGlyphWidth(data);
+
+	if (localChar.size() == 2)
+		return size + 1;
+
+	return 0;
 }
--- a/client/gui/Fonts.h
+++ b/client/gui/Fonts.h
@ -35,9 +35,6 @@ public:
 	virtual size_t getLineHeight() const = 0;
 	/// Returns width, in pixels of a character glyph. Pointer must contain at least characterSize valid bytes
 	virtual size_t getGlyphWidth(const char * data) const = 0;
-	/// Returns size (in bytes) of one char in current encoding, may be bigger than one for non-ascii
-	/// TODO: move it out of this class. Separate entity for handling localization/different encodings?
-	virtual size_t getCharacterSize(char data) const = 0;
 	/// Return width of the string
 	virtual size_t getStringWidth(const std::string & data) const;

@ -66,7 +63,7 @@ class CBitmapFont : public IFont
 {
 	static const size_t totalChars = 256;

-	struct Char
+	struct BitmapChar
 	{
 		si32 leftOffset;
 		ui32 width;
@ -76,12 +73,12 @@ class CBitmapFont : public IFont

 	const std::pair<std::unique_ptr<ui8[]>, ui64> data;

-	const std::array<Char, totalChars> chars;
+	const std::array<BitmapChar, totalChars> chars;
 	const ui8 height;

-	std::array<Char, totalChars> loadChars() const;
+	std::array<BitmapChar, totalChars> loadChars() const;

-	void renderCharacter(SDL_Surface * surface, const Char & character, const SDL_Color & color, int &posX, int &posY) const;
+	void renderCharacter(SDL_Surface * surface, const BitmapChar & character, const SDL_Color & color, int &posX, int &posY) const;

 	void renderText(SDL_Surface * surface, const std::string & data, const SDL_Color & color, const Point & pos) const override;
 public:
@ -89,7 +86,6 @@ public:

 	size_t getLineHeight() const override;
 	size_t getGlyphWidth(const char * data) const override;
-	size_t getCharacterSize(char data) const override;

 	friend class CBitmapHanFont;
 };
@ -114,7 +110,6 @@ public:

 	size_t getLineHeight() const override;
 	size_t getGlyphWidth(const char * data) const override;
-	size_t getCharacterSize(char data) const override;
 };

 class CTrueTypeFont : public IFont
@ -134,6 +129,5 @@ public:

 	size_t getLineHeight() const override;
 	size_t getGlyphWidth(const char * data) const override;
-	size_t getCharacterSize(char data) const override;
 	size_t getStringWidth(const std::string & data) const override;
 };
--- a/config/schemas/settings.json
+++ b/config/schemas/settings.json
@ -41,7 +41,7 @@
 				},
 				"encoding" : {
 					"type" : "string",
-					"default" : "native"
+					"default" : "CP1252"
 				}
 			}
 		},
--- a/launcher/settingsView/csettingsview_moc.cpp
+++ b/launcher/settingsView/csettingsview_moc.cpp
@ -5,6 +5,19 @@
 #include "../../lib/CConfigHandler.h"
 #include "../../lib/VCMIDirs.h"

+/// List of encodings which can be selected from Launcher. Position in this list is used as index of comboBoxEncoding item, so order must match the .ui file.
+/// Note that it is possible to specify encoding manually in settings.json
+static const std::string knownEncodingsList[] = //TODO: remove hardcode
+{
+    // European Windows-125X encodings
+    "CP1250", // Central European, covers mostly Slavic languages that use latin script
+    "CP1251", // Covers languages that use cyrillic script
+    "CP1252", // West European, covers most of latin languages
+    // Chinese encodings
+    "GBK",    // extension of GB2312, also known as CP936
+    "GB2312"  // basic set for Simplified Chinese. Separate from GBK to allow proper detection of H3 fonts
+};
+
 void CSettingsView::loadSettings()
 {
 	int resX = settings["video"]["screenRes"]["width"].Float();
@ -37,6 +50,11 @@ void CSettingsView::loadSettings()
 	for (auto string : VCMIDirs::get().dataPaths())
 		dataDirs += QString::fromUtf8(string.c_str());
 	ui->lineEditGameDir->setText(dataDirs.join(':'));
+
+	std::string encoding = settings["general"]["encoding"].String();
+	size_t encodingIndex = boost::range::find(knownEncodingsList, encoding) - knownEncodingsList;
+	if (encodingIndex < ui->comboBoxEncoding->count())
+		ui->comboBoxEncoding->setCurrentIndex(encodingIndex);
 }

 CSettingsView::CSettingsView(QWidget *parent) :
@ -112,13 +130,6 @@ void CSettingsView::on_plainTextEditRepos_textChanged()

 void CSettingsView::on_comboBoxEncoding_currentIndexChanged(int index)
 {
-	std::string encodings[] =
-	{
-	    "native", // right now indicates disabled unicode, may be removed in future
-	    "CP1250", "CP1251", "CP1252", // european Windows-125X encoding
-	    "GBK", "gb2312"  // chinese, aka CP936. Same encoding rules but different font files.
-	};
-
+	// Stores encoding that matches selected combo box item into settings (general/encoding).
+	// Qt passes -1 when the combo box has no current item; also guard against
+	// the .ui file listing more items than knownEncodingsList provides.
+	const size_t knownEncodingsCount = sizeof(knownEncodingsList) / sizeof(knownEncodingsList[0]);
+	if (index < 0 || static_cast<size_t>(index) >= knownEncodingsCount)
+		return;
+
 	Settings node = settings.write["general"]["encoding"];
-	node->String() = encodings[index];
+	node->String() = knownEncodingsList[index];
 }
--- a/launcher/settingsView/csettingsview_moc.ui
+++ b/launcher/settingsView/csettingsview_moc.ui
@ -341,11 +341,6 @@
   </item>
   <item row="6" column="4">
    <widget class="QComboBox" name="comboBoxEncoding">
-     <item>
-      <property name="text">
-       <string>Native (unicode disabled)</string>
-      </property>
-     </item>
     <item>
      <property name="text">
       <string>Central European (Windows 1250)</string>
--- a/lib/CGeneralTextHandler.cpp
+++ b/lib/CGeneralTextHandler.cpp
@ -1,12 +1,13 @@
 #include "StdInc.h"
 #include "CGeneralTextHandler.h"

-#include "filesystem/Filesystem.h"
-#include "GameConstants.h"
-#include "CModHandler.h"
-#include "VCMI_Lib.h"
+#include <boost/locale.hpp>

-// #include <locale> //needed?
+#include "filesystem/Filesystem.h"
+#include "CConfigHandler.h"
+#include "CModHandler.h"
+#include "GameConstants.h"
+#include "VCMI_Lib.h"

 /*
 * CGeneralTextHandler.cpp, part of VCMI engine
@ -18,6 +19,110 @@
 *
 */

+size_t Unicode::getCharacterSize(ui8 firstByte) // returns byte length announced by the first byte of a UTF-8 character; result is always >= 1
+{
+	// length of utf-8 character can be determined from 1st byte by counting number of highest bits set to 1:
+	// 0xxxxxxx -> 1 -  ASCII chars
+	// 110xxxxx -> 2 (and, by the same rule, 1110xxxx -> 3)
+	// 11110xxx -> 4 - last allowed in current standard
+	// 1111110x -> 6 - last allowed in original standard
+
+	if (firstByte < 0x80)
+		return 1; // ASCII
+
+	size_t ret = 0;
+
+	for (size_t i=0; i<8; i++) // scan bits from most significant to least significant
+	{
+		if ((firstByte & (0x80 >> i)) != 0)
+			ret++;
+		else
+			break; // first zero bit ends the run of leading ones
+	}
+	return ret; // note: a stray continuation byte (10xxxxxx) yields 1 and 0xFF yields 8; no validation happens here
+}
+
+/// Tests whether the bytes at character form one well-formed UTF-8 sequence.
+/// character - pointer to the first (lead) byte; maxSize - number of bytes that may be read from it.
+/// Returns false for stray continuation bytes, lead bytes 0xC0/0xC1, lead bytes above 0xF4,
+/// sequences longer than maxSize and sequences with malformed continuation bytes.
+/// Overlong 3/4-byte forms and surrogate code points are not detected.
+bool Unicode::isValidCharacter(const ui8 *character, size_t maxSize)
+{
+	if (maxSize == 0)
+		return false; // nothing to read, do not touch character[0]
+
+	const ui8 leadByte = character[0];
+
+	if (leadByte >= 0x80 && leadByte < 0xC2)
+		return false; // 0x80-0xBF are continuation bytes, 0xC0/0xC1 can only start overlong encodings
+
+	if (leadByte > 0xF4)
+		return false; // above maximum allowed in standard (UTF codepoints are capped at 0x0010FFFF)
+
+	// first character must follow rules checked in getCharacterSize
+	const size_t size = getCharacterSize(leadByte);
+
+	if (size > maxSize)
+		return false;
+
+	// remaining bytes must be continuation bytes: 10xxxxxx
+	for (size_t i = 1; i < size; i++)
+	{
+		if ((character[i] & 0xC0) != 0x80)
+			return false;
+	}
+	return true;
+}
+
+bool Unicode::isValidASCII(const std::string & text) // true if no byte of text has the highest bit set (empty text is accepted)
+{
+	for (const char & ch : text)
+		if (ui8(ch) >= 0x80 ) // compare as unsigned byte so the test does not depend on signedness of char
+			return false;
+	return true;
+}
+
+bool Unicode::isValidASCII(const char * data, size_t size) // same test as the std::string overload, applied to the first size bytes of data
+{
+	for (size_t i=0; i<size; i++)
+		if (ui8(data[i]) >= 0x80 ) // compare as unsigned byte so the test does not depend on signedness of char
+			return false;
+	return true;
+}
+
+bool Unicode::isValidString(const std::string & text) // true if every character of text passes isValidCharacter (empty text is accepted)
+{
+	for (size_t i=0; i<text.size(); i += getCharacterSize(text[i])) // advance by the size announced by the first byte of each character
+	{
+		if (!isValidCharacter(reinterpret_cast<const ui8*>(text.data() + i), text.size() - i)) // second argument = bytes left, so a character cut off by end of text is rejected
+			return false;
+	}
+	return true;
+}
+
+bool Unicode::isValidString(const char * data, size_t size) // same test as the std::string overload, applied to the first size bytes of data
+{
+	for (size_t i=0; i<size; i += getCharacterSize(data[i])) // advance by the size announced by the first byte of each character
+	{
+		if (!isValidCharacter(reinterpret_cast<const ui8*>(data + i), size - i)) // second argument = bytes left, so a character cut off by end of buffer is rejected
+			return false;
+	}
+	return true;
+}
+
+static std::string getSelectedEncoding() // name of the encoding stored in settings under general/encoding
+{
+	return settings["general"]["encoding"].String();
+}
+
+std::string Unicode::toUnicode(const std::string &text) // converts text to UTF-8, assuming it is in the encoding selected in settings
+{
+	return toUnicode(text, getSelectedEncoding());
+}
+
+std::string Unicode::toUnicode(const std::string &text, const std::string &encoding) // converts text from the named encoding to UTF-8
+{
+	return boost::locale::conv::to_utf<char>(text, encoding); // NOTE(review): presumably throws on unknown encoding name - confirm against Boost.Locale docs
+}
+
+std::string Unicode::fromUnicode(const std::string & text) // converts UTF-8 text to the encoding selected in settings
+{
+	return fromUnicode(text, getSelectedEncoding());
+}
+
+std::string Unicode::fromUnicode(const std::string &text, const std::string &encoding) // converts UTF-8 text to the named encoding
+{
+	return boost::locale::conv::from_utf<char>(text, encoding); // NOTE(review): handling of characters missing in target encoding depends on Boost.Locale defaults - confirm
+}
+
 //Helper for string -> float conversion
 class LocaleWithComma: public std::numpunct<char>
 {
@ -90,7 +195,7 @@ std::string CLegacyConfigParser::extractNormalString()
 	return std::string(begin, curr);
 }

-std::string CLegacyConfigParser::readString()
+std::string CLegacyConfigParser::readRawString()
 {
 	if (curr >= end || *curr == '\n')
 		return "";
@ -106,9 +211,18 @@ std::string CLegacyConfigParser::readString()
 	return ret;
 }

+std::string CLegacyConfigParser::readString() // reads one entry via readRawString and returns it as UTF-8
+{
+	// pure ASCII strings are returned untouched - converting them would only slow down loading process
+	std::string str = readRawString();
+	if (Unicode::isValidASCII(str))
+		return str;
+	return Unicode::toUnicode(str); // non-ASCII data: convert from the encoding selected in settings
+}
+
 float CLegacyConfigParser::readNumber()
 {
-	std::string input = readString();
+	std::string input = readRawString();

 	std::istringstream stream(input);

--- a/lib/CGeneralTextHandler.h
+++ b/lib/CGeneralTextHandler.h
@ -10,6 +10,34 @@
 *
 */

+/// Namespace that provides utilities for unicode support (UTF-8)
+namespace Unicode
+{
+	/// evaluates size (in bytes) of UTF-8 character from its first byte
+	size_t getCharacterSize(ui8 firstByte);
+
+	/// test if character is a valid UTF-8 symbol
+	/// maxSize - maximum number of bytes this symbol may consist of ( = remainder of string)
+	bool isValidCharacter(const ui8 *character, size_t maxSize);
+
+	/// test if text consists only of ASCII bytes (no need for unicode conversion)
+	bool isValidASCII(const std::string & text);
+	bool isValidASCII(const char * data, size_t size);
+
+	/// test if text contains valid UTF-8 sequence
+	bool isValidString(const std::string & text);
+	bool isValidString(const char * data, size_t size);
+
+	/// converts text to unicode (UTF-8) from specified encoding or from one specified in settings
+	std::string toUnicode(const std::string & text);
+	std::string toUnicode(const std::string & text, const std::string & encoding);
+
+	/// converts text from unicode (UTF-8) to specified encoding or to one specified in settings
+	/// NOTE: usage of these functions should be avoided if possible
+	std::string fromUnicode(const std::string & text);
+	std::string fromUnicode(const std::string & text, const std::string & encoding);
+};
+
 class CInputStream;

 /// Parser for any text files from H3
@ -30,6 +58,8 @@ class CLegacyConfigParser
 	/// extracts non-quoted string
 	std::string extractNormalString();

+	/// reads "raw" string without encoding conversion
+	std::string readRawString();
 public:
 	/// read one entry from current line. Return ""/0 if end of line reached
 	std::string readString();
--- a/lib/JsonNode.cpp
+++ b/lib/JsonNode.cpp
@ -17,6 +17,7 @@
 #include "filesystem/Filesystem.h"
 #include "VCMI_Lib.h" //for identifier resolution
 #include "CModHandler.h"
+#include "CGeneralTextHandler.h"

 using namespace JsonDetail;

@ -417,6 +418,9 @@ JsonNode JsonParser::parse(std::string fileName)
 {
 	JsonNode root;

+	if (!Unicode::isValidString(&input[0], input.size()))
+		error("Not a valid UTF-8 file", false);
+
 	extractValue(root);
 	extractWhitespace(false);

@ -426,8 +430,8 @@ JsonNode JsonParser::parse(std::string fileName)

 	if (!errors.empty())
 	{
-        logGlobal->warnStream()<<"File " << fileName << " is not a valid JSON file!";
-        logGlobal->warnStream()<<errors;
+		logGlobal->warnStream()<<"File " << fileName << " is not a valid JSON file!";
+		logGlobal->warnStream()<<errors;
 	}
 	return root;
 }
--- a/lib/filesystem/CBinaryReader.cpp
+++ b/lib/filesystem/CBinaryReader.cpp
@ -3,7 +3,7 @@

 #include <SDL_endian.h>
 #include "CInputStream.h"
-
+#include "../CGeneralTextHandler.h"

 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
 template <typename CData>
@ -41,19 +41,19 @@ void CBinaryReader::setStream(CInputStream * stream)

 si64 CBinaryReader::read(ui8 * data, si64 size)
 {
-	return stream->read(data, size);
+	si64 bytesRead = stream->read(data, size); // number of bytes the underlying stream actually delivered
+	if(bytesRead != size) // anything but a complete read is reported as end of stream
+	{
+		throw std::runtime_error(getEndOfStreamExceptionMsg(size));
+	}
+	return bytesRead; // equals size whenever this line is reached
 }

 template <typename CData>
 CData CBinaryReader::readInteger()
 {
 	CData val;
-	si64 b = stream->read(reinterpret_cast<unsigned char *>(&val), sizeof(val));
-	if(b < sizeof(val))
-	{
-		throw std::runtime_error(getEndOfStreamExceptionMsg(sizeof(val)));
-	}
-
+	// Reads sizeof(CData) bytes and converts them from little-endian to native order.
+	// Go through CBinaryReader::read so that a truncated stream still throws
+	// std::runtime_error instead of returning an uninitialized value.
+	read(reinterpret_cast<ui8 *>(&val), sizeof(val));
 	return readLE(val);
 }

@ -78,15 +78,20 @@ INSTANTIATE(si64, readInt64)

 std::string CBinaryReader::readString()
 {
-    int len = readUInt32();
-	assert(len >= 0 && len <= 500000); //not too long
-    std::string ret;
-    ret.reserve(len);
-    for(int gg = 0; gg < len; ++gg)
+	unsigned int len = readUInt32(); // string is stored as 32-bit length followed by that many bytes
+	assert(len <= 500000); // sanity limit on length - checked only when assertions are enabled
+	if (len > 0)
 	{
-        ret += readInt8();
+		std::string ret;
+		ret.resize(len);
+		read(reinterpret_cast<ui8*>(&ret[0]), len); // read whole string at once; read() throws if stream ends early
+		//FIXME: any need to move this into separate "read localized string" method?
+		if (Unicode::isValidASCII(ret))
+			return ret;
+		return Unicode::toUnicode(ret); // non-ASCII data: convert from the encoding selected in settings
 	}
-	return ret;
+	return ""; // zero length - nothing to read or convert
+
 }

 void CBinaryReader::skip(int count)