1
0
mirror of https://github.com/vcmi/vcmi.git synced 2025-01-12 02:28:11 +02:00

Unicode support.

- boost-locale library is now required (boost 1.48 or higher)
- Unicode namespace that contains UTF-8 handling
- All non-ASCII strings from H3 data will be converted to UTF-8 during loading
- All JSON files MUST use UTF-8. 
- H3 data encoding can be selected via launcher or directly in config file
This commit is contained in:
Ivan Savenko 2013-10-25 21:45:14 +00:00
parent f6a3d6770f
commit e2c037402c
12 changed files with 246 additions and 87 deletions

View File

@ -52,7 +52,7 @@ if (APPLE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ftemplate-depth=256")
endif()
find_package(Boost 1.46.0 COMPONENTS program_options filesystem system thread REQUIRED)
find_package(Boost 1.48.0 COMPONENTS program_options filesystem system thread locale REQUIRED)
find_package(SDL REQUIRED)
find_package(SDL_image REQUIRED)
find_package(SDL_mixer REQUIRED)

View File

@ -20,11 +20,12 @@ To compile, the following packages (and their development counterparts) are need
* zlib and zlib-devel
* (optional) Qt 5, widget and network modules
* the ffmpeg libraries (libavformat and libswscale). Their name could be libavformat-devel and libswscale-devel, or ffmpeg-libs-devel or similar names.
* boost c++ libraries v1.46+ (www.boost.org):
* boost c++ libraries v1.48+ (www.boost.org):
- program-options
- filesystem
- system
- thread
- locale
On Debian-based systems (e.g. Ubuntu) run:
sudo apt-get install cmake g++ libsdl1.2debian libsdl-image1.2-dev libsdl-ttf2.0-dev libsdl-mixer1.2-dev zlib1g-dev libavformat-dev libswscale-dev libboost-dev libboost-filesystem-dev libboost-system-dev libboost-thread-dev libboost-program-options-dev libboost-locale-dev

View File

@ -149,7 +149,7 @@ std::vector<std::string> CMessage::breakText( std::string text, size_t maxLineWi
// loops till line is full or end of text reached
while(currPos < text.length() && text[currPos] != 0x0a && lineWidth < maxLineWidth)
{
symbolSize = graphics->fonts[font]->getCharacterSize(text[currPos]);
symbolSize = Unicode::getCharacterSize(text[currPos]);
glyphWidth = graphics->fonts[font]->getGlyphWidth(text.data() + currPos);
// candidate for line break

View File

@ -7,6 +7,7 @@
#include "../../lib/JsonNode.h"
#include "../../lib/vcmi_endian.h"
#include "../../lib/filesystem/Filesystem.h"
#include "../../lib/CGeneralTextHandler.h"
/*
* Fonts.cpp, part of VCMI engine
@ -22,7 +23,7 @@ size_t IFont::getStringWidth(const std::string & data) const
{
size_t width = 0;
for(size_t i=0; i<data.size(); i += getCharacterSize(data[i]))
for(size_t i=0; i<data.size(); i += Unicode::getCharacterSize(data[i]))
{
width += getGlyphWidth(data.data() + i);
}
@ -81,9 +82,9 @@ void IFont::renderTextLinesCenter(SDL_Surface * surface, const std::vector<std::
}
}
std::array<CBitmapFont::Char, CBitmapFont::totalChars> CBitmapFont::loadChars() const
std::array<CBitmapFont::BitmapChar, CBitmapFont::totalChars> CBitmapFont::loadChars() const
{
std::array<Char, totalChars> ret;
std::array<BitmapChar, totalChars> ret;
size_t offset = 32;
@ -117,11 +118,17 @@ size_t CBitmapFont::getLineHeight() const
size_t CBitmapFont::getGlyphWidth(const char * data) const
{
const Char & ch = chars[ui8(*data)];
std::string localChar = Unicode::fromUnicode(std::string(data, Unicode::getCharacterSize(data[0])));
if (localChar.size() == 1)
{
const BitmapChar & ch = chars[ui8(localChar[0])];
return ch.leftOffset + ch.width + ch.rightOffset;
}
return 0;
}
void CBitmapFont::renderCharacter(SDL_Surface * surface, const Char & character, const SDL_Color & color, int &posX, int &posY) const
void CBitmapFont::renderCharacter(SDL_Surface * surface, const BitmapChar & character, const SDL_Color & color, int &posX, int &posY) const
{
Rect clipRect;
SDL_GetClipRect(surface, &clipRect);
@ -186,19 +193,17 @@ void CBitmapFont::renderText(SDL_Surface * surface, const std::string & data, co
//assert(data[data.size()-1] != '}');
SDL_LockSurface(surface);
// for each symbol
for(auto & elem : data)
for(size_t i=0; i<data.size(); i += Unicode::getCharacterSize(data[i]))
{
renderCharacter(surface, chars[ui8(elem)], color, posX, posY);
std::string localChar = Unicode::fromUnicode(data.substr(i, Unicode::getCharacterSize(data[i])));
if (localChar.size() == 1)
renderCharacter(surface, chars[ui8(localChar[0])], color, posX, posY);
}
SDL_UnlockSurface(surface);
}
size_t CBitmapFont::getCharacterSize(char data) const
{
return 1;
}
std::pair<std::unique_ptr<ui8[]>, ui64> CTrueTypeFont::loadData(const JsonNode & config)
{
std::string filename = "Data/" + config["file"].String();
@ -246,15 +251,18 @@ size_t CTrueTypeFont::getLineHeight() const
size_t CTrueTypeFont::getGlyphWidth(const char *data) const
{
return getStringWidth(std::string(data, Unicode::getCharacterSize(*data)));
/*
int advance;
TTF_GlyphMetrics(font.get(), *data, nullptr, nullptr, nullptr, nullptr, &advance);
return advance;
*/
}
size_t CTrueTypeFont::getStringWidth(const std::string & data) const
{
int width;
TTF_SizeText(font.get(), data.c_str(), &width, nullptr);
TTF_SizeUTF8(font.get(), data.c_str(), &width, nullptr);
return width;
}
@ -282,11 +290,6 @@ void CTrueTypeFont::renderText(SDL_Surface * surface, const std::string & data,
}
}
size_t CTrueTypeFont::getCharacterSize(char data) const
{
return 1;
}
size_t CBitmapHanFont::getCharacterDataOffset(size_t index) const
{
size_t rowSize = (size + 7) / 8; // 1 bit per pixel, rounded up
@ -350,12 +353,15 @@ void CBitmapHanFont::renderText(SDL_Surface * surface, const std::string & data,
SDL_LockSurface(surface);
for(size_t i=0; i<data.size(); i += getCharacterSize(data[i]))
for(size_t i=0; i<data.size(); i += Unicode::getCharacterSize(data[i]))
{
if (ui8(data[i]) < 0x80)
fallback->renderCharacter(surface, fallback->chars[data[i]], color, posX, posY);
else
renderCharacter(surface, getCharacterIndex(data[i], data[i+1]), color, posX, posY);
std::string localChar = Unicode::fromUnicode(data.substr(i, Unicode::getCharacterSize(data[i])));
if (localChar.size() == 1)
fallback->renderCharacter(surface, fallback->chars[ui8(localChar[0])], color, posX, posY);
if (localChar.size() == 2)
renderCharacter(surface, getCharacterIndex(localChar[0], localChar[1]), color, posX, posY);
}
SDL_UnlockSurface(surface);
}
@ -368,25 +374,24 @@ CBitmapHanFont::CBitmapHanFont(const JsonNode &config):
// basic tests to make sure that fonts are OK
// 1) fonts must contain 190 "sections", 126 symbols each.
assert(getCharacterIndex(0xfe, 0xff) == 190*126);
// ensure that font size is correct - enough to fit all possible symbols
// 2) ensure that font size is correct - enough to fit all possible symbols
assert(getCharacterDataOffset(getCharacterIndex(0xfe, 0xff)) == data.second);
}
size_t CBitmapHanFont::getLineHeight() const
{
return size + 1;
return std::max(size + 1, fallback->getLineHeight());
}
size_t CBitmapHanFont::getGlyphWidth(const char * data) const
{
if (ui8(data[0]) < 0x80)
return fallback->getGlyphWidth(data);
return size + 1;
}
std::string localChar = Unicode::fromUnicode(std::string(data, Unicode::getCharacterSize(data[0])));
size_t CBitmapHanFont::getCharacterSize(char data) const
{
if (ui8(data) < 0x80)
return 1;
return 2;
if (localChar.size() == 1)
return fallback->getGlyphWidth(data);
if (localChar.size() == 2)
return size + 1;
return 0;
}

View File

@ -35,9 +35,6 @@ public:
virtual size_t getLineHeight() const = 0;
/// Returns width, in pixels of a character glyph. Pointer must contain at least characterSize valid bytes
virtual size_t getGlyphWidth(const char * data) const = 0;
/// Returns size (in bytes) of one char in current encoding, may be bigger than one for non-ascii
/// TODO: move it out of this class. Separate entity for handling localization/different encodings?
virtual size_t getCharacterSize(char data) const = 0;
/// Return width of the string
virtual size_t getStringWidth(const std::string & data) const;
@ -66,7 +63,7 @@ class CBitmapFont : public IFont
{
static const size_t totalChars = 256;
struct Char
struct BitmapChar
{
si32 leftOffset;
ui32 width;
@ -76,12 +73,12 @@ class CBitmapFont : public IFont
const std::pair<std::unique_ptr<ui8[]>, ui64> data;
const std::array<Char, totalChars> chars;
const std::array<BitmapChar, totalChars> chars;
const ui8 height;
std::array<Char, totalChars> loadChars() const;
std::array<BitmapChar, totalChars> loadChars() const;
void renderCharacter(SDL_Surface * surface, const Char & character, const SDL_Color & color, int &posX, int &posY) const;
void renderCharacter(SDL_Surface * surface, const BitmapChar & character, const SDL_Color & color, int &posX, int &posY) const;
void renderText(SDL_Surface * surface, const std::string & data, const SDL_Color & color, const Point & pos) const override;
public:
@ -89,7 +86,6 @@ public:
size_t getLineHeight() const override;
size_t getGlyphWidth(const char * data) const override;
size_t getCharacterSize(char data) const override;
friend class CBitmapHanFont;
};
@ -114,7 +110,6 @@ public:
size_t getLineHeight() const override;
size_t getGlyphWidth(const char * data) const override;
size_t getCharacterSize(char data) const override;
};
class CTrueTypeFont : public IFont
@ -134,6 +129,5 @@ public:
size_t getLineHeight() const override;
size_t getGlyphWidth(const char * data) const override;
size_t getCharacterSize(char data) const override;
size_t getStringWidth(const std::string & data) const override;
};

View File

@ -41,7 +41,7 @@
},
"encoding" : {
"type" : "string",
"default" : "native"
"default" : "CP1252"
}
}
},

View File

@ -5,6 +5,19 @@
#include "../../lib/CConfigHandler.h"
#include "../../lib/VCMIDirs.h"
/// List of encodings which can be selected from Launcher.
/// Note that it is possible to specify encoding manually in settings.json
/// Entries are addressed by the index of the encoding combo box in this file,
/// so the order here has to match the order of items in that combo box.
static const std::string knownEncodingsList[] = //TODO: remove hardcode
{
// European Windows-125X encodings
"CP1250", // Central European, covers mostly Slavic languages that use latin script
"CP1251", // Covers languages that use cyrillic script
"CP1252", // Western European, covers most of languages that use latin script
// Chinese encodings
"GBK", // extension of GB2312, also known as CP936
"GB2312" // basic set for Simplified Chinese. Separate from GBK to allow proper detection of H3 fonts
};
void CSettingsView::loadSettings()
{
int resX = settings["video"]["screenRes"]["width"].Float();
@ -37,6 +50,11 @@ void CSettingsView::loadSettings()
for (auto string : VCMIDirs::get().dataPaths())
dataDirs += QString::fromUtf8(string.c_str());
ui->lineEditGameDir->setText(dataDirs.join(':'));
std::string encoding = settings["general"]["encoding"].String();
size_t encodingIndex = boost::range::find(knownEncodingsList, encoding) - knownEncodingsList;
if (encodingIndex < ui->comboBoxEncoding->count())
ui->comboBoxEncoding->setCurrentIndex(encodingIndex);
}
CSettingsView::CSettingsView(QWidget *parent) :
@ -112,13 +130,6 @@ void CSettingsView::on_plainTextEditRepos_textChanged()
void CSettingsView::on_comboBoxEncoding_currentIndexChanged(int index)
{
std::string encodings[] =
{
"native", // right now indicates disabled unicode, may be removed in future
"CP1250", "CP1251", "CP1252", // european Windows-125X encoding
"GBK", "gb2312" // chinese, aka CP936. Same encoding rules but different font files.
};
Settings node = settings.write["general"]["encoding"];
node->String() = encodings[index];
node->String() = knownEncodingsList[index];
}

View File

@ -341,11 +341,6 @@
</item>
<item row="6" column="4">
<widget class="QComboBox" name="comboBoxEncoding">
<item>
<property name="text">
<string>Native (unicode disabled)</string>
</property>
</item>
<item>
<property name="text">
<string>Central European (Windows 1250)</string>

View File

@ -1,12 +1,13 @@
#include "StdInc.h"
#include "CGeneralTextHandler.h"
#include "filesystem/Filesystem.h"
#include "GameConstants.h"
#include "CModHandler.h"
#include "VCMI_Lib.h"
#include <boost/locale.hpp>
// #include <locale> //needed?
#include "filesystem/Filesystem.h"
#include "CConfigHandler.h"
#include "CModHandler.h"
#include "GameConstants.h"
#include "VCMI_Lib.h"
/*
* CGeneralTextHandler.cpp, part of VCMI engine
@ -18,6 +19,110 @@
*
*/
/// Evaluates size (in bytes) of UTF-8 character from its first byte.
/// The size equals the number of consecutive highest bits set to 1:
/// 0xxxxxxx -> 1 - ASCII chars
/// 110xxxxx -> 2
/// 1110xxxx -> 3
/// 11110xxx -> 4 - last allowed in current standard
/// 1111110x -> 6 - last allowed in original standard
/// No validation is done here: a byte of form 10xxxxxx yields 1,
/// 0xFE yields 7 and 0xFF yields 8.
size_t Unicode::getCharacterSize(ui8 firstByte)
{
	if ((firstByte & 0x80) == 0)
		return 1; // ASCII

	// count leading 1-bits, starting from the highest one
	size_t leadingOnes = 0;
	for (ui8 mask = 0x80; mask != 0 && (firstByte & mask) != 0; mask >>= 1)
		++leadingOnes;

	return leadingOnes;
}
bool Unicode::isValidCharacter(const ui8 *character, size_t maxSize)
{
// first character must follow rules checked in getCharacterSize
size_t size = getCharacterSize(character[0]);
if (character[0] > 0xF4)
return false; // above maximum allowed in standard (UTF codepoints are capped at 0x0010FFFF)
if (size > maxSize)
return false;
// remaining characters must have highest bit set to 1
for (size_t i = 1; i < size; i++)
{
if ((character[i] & 0x80) == 0)
return false;
}
return true;
}
bool Unicode::isValidASCII(const std::string & text)
{
for (const char & ch : text)
if (ui8(ch) >= 0x80 )
return false;
return true;
}
/// Returns true if each of the first `size` bytes at `data` is below 0x80 (pure ASCII);
/// a range of size 0 is reported as valid.
bool Unicode::isValidASCII(const char * data, size_t size)
{
	const char * const stop = data + size;
	for (const char * cursor = data; cursor != stop; ++cursor)
	{
		// highest bit set means the byte is outside of ASCII range
		if ((ui8(*cursor) & 0x80) != 0)
			return false;
	}
	return true;
}
bool Unicode::isValidString(const std::string & text)
{
for (size_t i=0; i<text.size(); i += getCharacterSize(text[i]))
{
if (!isValidCharacter(reinterpret_cast<const ui8*>(text.data() + i), text.size() - i))
return false;
}
return true;
}
bool Unicode::isValidString(const char * data, size_t size)
{
for (size_t i=0; i<size; i += getCharacterSize(data[i]))
{
if (!isValidCharacter(reinterpret_cast<const ui8*>(data + i), size - i))
return false;
}
return true;
}
/// Returns the string stored in settings under "general" -> "encoding"
static std::string getSelectedEncoding()
{
	std::string selected = settings["general"]["encoding"].String();
	return selected;
}
std::string Unicode::toUnicode(const std::string &text)
{
return toUnicode(text, getSelectedEncoding());
}
/// Converts text from the given encoding to unicode;
/// the conversion itself is done by boost::locale::conv::to_utf
std::string Unicode::toUnicode(const std::string &text, const std::string &encoding)
{
	std::string converted = boost::locale::conv::to_utf<char>(text, encoding);
	return converted;
}
std::string Unicode::fromUnicode(const std::string & text)
{
return fromUnicode(text, getSelectedEncoding());
}
/// Converts text from unicode to the given encoding;
/// the conversion itself is done by boost::locale::conv::from_utf
std::string Unicode::fromUnicode(const std::string &text, const std::string &encoding)
{
	std::string converted = boost::locale::conv::from_utf<char>(text, encoding);
	return converted;
}
//Helper for string -> float conversion
class LocaleWithComma: public std::numpunct<char>
{
@ -90,7 +195,7 @@ std::string CLegacyConfigParser::extractNormalString()
return std::string(begin, curr);
}
std::string CLegacyConfigParser::readString()
std::string CLegacyConfigParser::readRawString()
{
if (curr >= end || *curr == '\n')
return "";
@ -106,9 +211,18 @@ std::string CLegacyConfigParser::readString()
return ret;
}
/// Reads one entry via readRawString() and returns it as unicode text.
std::string CLegacyConfigParser::readString()
{
	const std::string raw = readRawString();

	// strings that are already in ASCII are returned untouched -
	// converting them would only slow down loading process
	return Unicode::isValidASCII(raw) ? raw : Unicode::toUnicode(raw);
}
float CLegacyConfigParser::readNumber()
{
std::string input = readString();
std::string input = readRawString();
std::istringstream stream(input);

View File

@ -10,6 +10,34 @@
*
*/
/// Namespace that provides utilities for unicode support (UTF-8)
namespace Unicode
{
/// evaluates size (in bytes) of UTF-8 character from its first byte
size_t getCharacterSize(ui8 firstByte);
/// test if character is a valid UTF-8 symbol
/// maxSize - maximum number of bytes this symbol may consist of ( = remainder of string)
bool isValidCharacter(const ui8 *character, size_t maxSize);
/// test if text contains only ASCII characters (no need for unicode conversion)
bool isValidASCII(const std::string & text);
bool isValidASCII(const char * data, size_t size);
/// test if text contains valid UTF-8 sequence
bool isValidString(const std::string & text);
bool isValidString(const char * data, size_t size);
/// converts text to unicode from specified encoding or from one specified in settings
std::string toUnicode(const std::string & text);
std::string toUnicode(const std::string & text, const std::string & encoding);
/// converts text from unicode to specified encoding or to one specified in settings
/// NOTE: usage of these functions should be avoided if possible
std::string fromUnicode(const std::string & text);
std::string fromUnicode(const std::string & text, const std::string & encoding);
};
class CInputStream;
/// Parser for any text files from H3
@ -30,6 +58,8 @@ class CLegacyConfigParser
/// extracts non-quoted string
std::string extractNormalString();
/// reads "raw" string without encoding conversion
std::string readRawString();
public:
/// read one entry from current line. Return ""/0 if end of line reached
std::string readString();

View File

@ -17,6 +17,7 @@
#include "filesystem/Filesystem.h"
#include "VCMI_Lib.h" //for identifier resolution
#include "CModHandler.h"
#include "CGeneralTextHandler.h"
using namespace JsonDetail;
@ -417,6 +418,9 @@ JsonNode JsonParser::parse(std::string fileName)
{
JsonNode root;
if (!Unicode::isValidString(&input[0], input.size()))
error("Not a valid UTF-8 file", false);
extractValue(root);
extractWhitespace(false);

View File

@ -3,7 +3,7 @@
#include <SDL_endian.h>
#include "CInputStream.h"
#include "../CGeneralTextHandler.h"
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
template <typename CData>
@ -41,19 +41,19 @@ void CBinaryReader::setStream(CInputStream * stream)
si64 CBinaryReader::read(ui8 * data, si64 size)
{
return stream->read(data, size);
si64 bytesRead = stream->read(data, size);
if(bytesRead != size)
{
throw std::runtime_error(getEndOfStreamExceptionMsg(size));
}
return bytesRead;
}
template <typename CData>
CData CBinaryReader::readInteger()
{
CData val;
si64 b = stream->read(reinterpret_cast<unsigned char *>(&val), sizeof(val));
if(b < sizeof(val))
{
throw std::runtime_error(getEndOfStreamExceptionMsg(sizeof(val)));
}
stream->read(reinterpret_cast<unsigned char *>(&val), sizeof(val));
return readLE(val);
}
@ -78,15 +78,20 @@ INSTANTIATE(si64, readInt64)
std::string CBinaryReader::readString()
{
int len = readUInt32();
assert(len >= 0 && len <= 500000); //not too long
std::string ret;
ret.reserve(len);
for(int gg = 0; gg < len; ++gg)
unsigned int len = readUInt32();
assert(len <= 500000); //not too long
if (len > 0)
{
ret += readInt8();
}
std::string ret;
ret.resize(len);
read(reinterpret_cast<ui8*>(&ret[0]), len);
//FIXME: any need to move this into separate "read localized string" method?
if (Unicode::isValidASCII(ret))
return ret;
return Unicode::toUnicode(ret);
}
return "";
}
void CBinaryReader::skip(int count)