Browse Source

Remove all usage of 'word' type, replace by CodePoint

Michael Ragazzon 6 years ago
parent
commit
ab2d4210c7

+ 6 - 4
Include/RmlUi/Core/Context.h

@@ -173,12 +173,14 @@ public:
 	/// @return True if the event was not consumed (ie, was prevented from propagating by an element), false if it was.
 	bool ProcessKeyUp(Input::KeyIdentifier key_identifier, int key_modifier_state);
 
-	/// Sends a single character of text as text input into this context.
-	/// @param[in] character The UCS-2 character to send into this context.
+	/// Sends a single unicode character as text input into this context.
+	/// @param[in] character The unicode code point to send into this context.
 	/// @return True if the event was not consumed (ie, was prevented from propagating by an element), false if it was.
-	bool ProcessTextInput(word character);
+	bool ProcessTextInput(CodePoint character);
+	/// Sends a single ascii character as text input into this context.
+	bool ProcessTextInput(char character);
 	/// Sends a string of text as text input into this context.
-	/// @param[in] string The UCS-2 string to send into this context.
+	/// @param[in] string The UTF8 string to send into this context.
 	/// @return True if the event was not consumed (ie, was prevented from propagating by an element), false if it was.
 	bool ProcessTextInput(const String& string);
 

+ 2 - 2
Include/RmlUi/Core/FontGlyph.h

@@ -43,13 +43,13 @@ namespace Core {
 class FontGlyph
 {
 public:
-	FontGlyph() : character(0), dimensions(0,0), bearing(0,0), advance(0), bitmap_data(nullptr),
+	FontGlyph() : character(CodePoint::Null), dimensions(0,0), bearing(0,0), advance(0), bitmap_data(nullptr),
 		bitmap_dimensions(0,0)
 	{
 	}
 
 	/// The unicode code point for this glyph.
-	word character;
+	CodePoint character;
 
 	/// The glyph's bounding box. Not to be confused with the dimensions of the glyph's bitmap!
 	Vector2i dimensions;

+ 98 - 3
Include/RmlUi/Core/StringUtilities.h

@@ -29,6 +29,8 @@
 #ifndef RMLUICORESTRINGUTILITIES_H
 #define RMLUICORESTRINGUTILITIES_H
 
+#include <algorithm>
+#include <stddef.h>
 #include "Header.h"
 #include "Types.h"
 
@@ -84,14 +86,13 @@ namespace StringUtilities
 	/// Convert UTF8 string to UTF16.
 	RMLUICORE_API WString ToUTF16(const String& str);
 
-	/// Converts a wide string in UCS-2 encoding into a string in UTF-8 encoding. This
+	/// Converts a wide string in UTF16 encoding into a string in UTF8 encoding. This
 	/// function assumes the endianness of the input words to be the same as the host processor.
 	/// Reports a warning if the conversion fails.
-	/// TODO: Convert from UTF-16 instead.
 	RMLUICORE_API String ToUTF8(const WString& wstr);
 
 	/// Returns number of characters in UTF8 string.
-	RMLUICORE_API int LengthUTF8(const String& str);
+	RMLUICORE_API size_t LengthU8(const String& str);
 
 	/// Converts upper-case characters in string to lower-case.
 	RMLUICORE_API String ToLower(const String& string);
@@ -120,8 +121,102 @@ namespace StringUtilities
 	{
 		bool operator()(const String& lhs, const String& rhs) const;
 	};
+
+	RMLUICORE_API CodePoint ToCodePoint(const char* p);
+	RMLUICORE_API String ToUTF8(CodePoint code_point);
+
+
+	//struct StringView {
+	//	const char* p_begin;
+	//	const char* p_end;
+
+	//	StringView(const String& string) : p_begin(string.data()), p_end(string.data() + string.size()) { }
+	//	StringView(const String& string, size_t offset) : p_begin(string.data() + offset), p_end(string.data() + string.size()) { }
+	//	StringView(const String& string, size_t offset, size_t count) : p_begin(string.data() + offset), p_end(string.data() + std::min(offset + count, string.size())) { }
+	//};
+
+	/// Skips forward over UTF-8 continuation bytes (bit pattern 10xxxxxx).
+	/// @param[in] p The position to start from.
+	/// @param[in] p_end One past the last readable byte; this position is never dereferenced.
+	/// @return The first position at or after p that does not hold a continuation byte, or p_end if none is found.
+	inline const char* SeekForwardU8(const char* p, const char* p_end)
+	{
+		for (; p != p_end; ++p)
+		{
+			const bool is_continuation_byte = ((*p & 0xC0) == 0x80);
+			if (!is_continuation_byte)
+				break;
+		}
+		return p;
+	}
+	/// Steps backwards over UTF-8 continuation bytes (bit pattern 10xxxxxx).
+	/// @param[in] p The position to start from.
+	/// @param[in] p_begin The first byte of the range. The search stops, without reading, once p is one position before p_begin.
+	/// @return The first position at or before p that does not hold a continuation byte, or the position one before p_begin.
+	inline const char* SeekBackU8(const char* p, const char* p_begin)
+	{
+		for (;;)
+		{
+			// Already one step before the range: nothing left to inspect.
+			if (p + 1 == p_begin)
+				break;
+			if ((*p & 0xC0) != 0x80)
+				break;
+			--p;
+		}
+		return p;
+	}
+
+
+
 }
 
+	/// Bidirectional iterator over the code points of a UTF-8 encoded byte range.
+	/// Only pointers into the range are stored, so the underlying string must outlive the iterator.
+	/// Characters live in [p_begin, p_end); p_end is the past-the-end position and p_begin - 1 is the
+	/// before-the-beginning position reached by decrementing from the first character.
+	class UTF8Iterator {
+		const char* p_begin;
+		const char* p;
+		const char* p_end;
+
+		// Moves p back to the leading byte of its character, bounded below by p_begin.
+		inline void SeekBack() {
+			p = StringUtilities::SeekBackU8(p, p_begin);
+		}
+
+		// Moves p forward past any continuation bytes, bounded above by p_end.
+		inline void SeekForward() {
+			p = StringUtilities::SeekForwardU8(p, p_end);
+		}
+
+	public:
+		// Every constructor seeks forward so that the iterator starts on a character boundary.
+		UTF8Iterator(const char* p_begin, const char* p, const char* p_end) : p_begin(p_begin), p(p), p_end(p_end) { SeekForward(); }
+		UTF8Iterator(const String& string) : p_begin(string.data()), p(string.data()), p_end(string.data() + string.size()) { SeekForward(); }
+		UTF8Iterator(const String& string, size_t offset) : p_begin(string.data()), p(string.data() + offset), p_end(string.data() + string.size()) { SeekForward(); }
+		//UTF8Iterator(const String& string, size_t offset, size_t count) : p_begin(string.data()), p(string.data() + offset), p_end(string.data() + std::min(offset + count, string.size())) { SeekForward(); }
+
+		/// Advances to the next character. Must not be called on a past-the-end iterator.
+		UTF8Iterator& operator++() {
+			RMLUI_ASSERT(p != p_end);
+			++p;
+			SeekForward();
+			return *this;
+		}
+		/// Steps back to the previous character. Must not be called on an iterator that is already before the beginning.
+		UTF8Iterator& operator--() {
+			RMLUI_ASSERT(p + 1 != p_begin);
+			--p;
+			SeekBack();
+			return *this;
+		}
+
+		/// Decodes the code point at the current position.
+		CodePoint operator*() const { return StringUtilities::ToCodePoint(p); }
+
+		/// True while the iterator is neither before the beginning nor at the end of the range.
+		operator bool() const { return (p != p_begin - 1) && (p != p_end); }
+
+		// Comparison considers the current position only, not the range bounds.
+		bool operator==(const UTF8Iterator& other) const { return p == other.p; }
+		bool operator!=(const UTF8Iterator& other) const { return !(*this == other); }
+	};
+
+
+
+	/// Exposes begin()/end() iterators over the code points of a UTF-8 string.
+	/// Only pointers into the string are stored, so the string must outlive the parser.
+	class UTF8Parser {
+		UTF8Iterator _begin;
+		UTF8Iterator _end;
+
+	public:
+
+		// The three-pointer iterator constructor is used deliberately. UTF8Iterator has no constructor taking a
+		// single const char*; such a call could at best bind to UTF8Iterator(const String&) through a temporary
+		// String, leaving both iterators pointing into storage that is destroyed right after construction.
+		UTF8Parser(const String& string) : _begin(string.data(), string.data(), string.data() + string.size()), _end(string.data(), string.data() + string.size(), string.data() + string.size()) {}
+		UTF8Parser(const String& string, size_t offset) : _begin(string.data(), string.data() + offset, string.data() + string.size()), _end(string.data(), string.data() + string.size(), string.data() + string.size()) {}
+		//UTF8Parser(const String& string, size_t offset, size_t count) : _begin(string.data() + offset), _end(string.data() + std::min(offset + count, string.size())) {}
+
+		/// Iterator positioned at the first character (or at the given offset, moved forward to a character boundary).
+		UTF8Iterator begin() const {
+			return _begin;
+		}
+
+		/// Past-the-end iterator, positioned at string.data() + string.size().
+		UTF8Iterator end() const {
+			return _end;
+		}
+	};
+
+
+
+
 }
 }
 

+ 2 - 2
Include/RmlUi/Core/TypeConverter.inl

@@ -105,7 +105,7 @@ PASS_THROUGH(unsigned int);
 PASS_THROUGH(float);
 PASS_THROUGH(bool);
 PASS_THROUGH(char);
-PASS_THROUGH(word);
+PASS_THROUGH(CodePoint);
 PASS_THROUGH(Vector2i);
 PASS_THROUGH(Vector2f);
 PASS_THROUGH(Vector3i);
@@ -148,7 +148,7 @@ BASIC_CONVERTER_BOOL(float, bool);
 BASIC_CONVERTER(float, int);
 BASIC_CONVERTER(float, unsigned int);
 
-BASIC_CONVERTER(char, word);
+BASIC_CONVERTER(char, CodePoint);
 
 /////////////////////////////////////////////////
 // From string converters

+ 1 - 1
Include/RmlUi/Core/Types.h

@@ -58,9 +58,9 @@ namespace Core {
 
 // Commonly used basic types
 typedef unsigned char byte;
-typedef wchar_t word;
 typedef double Time;
 typedef void* ScriptObject;
+// Strongly typed unicode code point stored as an unsigned int. Null is the only named enumerator and has the value 0.
+enum class CodePoint : unsigned int { Null };
 
 }
 }

+ 1 - 1
Include/RmlUi/Core/Variant.h

@@ -125,7 +125,7 @@ private:
 	void Set(const char value);
 	void Set(const float value);
 	void Set(const int value);
-	void Set(const word value);
+	void Set(const CodePoint value);
 	void Set(const char* value);
 	void Set(void* value);
 	void Set(const Vector2f& value);

+ 1 - 1
Include/RmlUi/Core/Variant.inl

@@ -75,7 +75,7 @@ bool Variant::GetInto(T& value) const
 		break;
 
 	case WORD:
-		return TypeConverter< word, T >::Convert(*(word*)data, value);
+		return TypeConverter< CodePoint, T >::Convert(*(CodePoint*)data, value);
 		break;
 
 	case VECTOR2:

+ 1 - 1
Samples/basic/demo/data/demo.rml

@@ -155,7 +155,7 @@ button:focus {
 </panel>
 <tab>Controls</tab>
 <panel>
-	<div>Type something here: <input style="vertical-align: -7px;" size="10" type="text" value="Sample text"/></div>
+	<div>Type something here: <input style="vertical-align: -7px;" size="10" type="text" maxlength="12" value="Sample text"/></div>
 	<textarea cols="30" rows="5" wrap="nowrap">Hello World!</textarea>
 </panel>
 </tabset>

+ 1 - 1
Samples/shell/include/Input.h

@@ -53,7 +53,7 @@ public:
 	/// @param[in] key_identifier The key to generate a character code for.
 	/// @param[in] key_modifier_state The configuration of the key modifiers.
 	/// @return The character code.
-	static Rml::Core::word GetCharacterCode(Rml::Core::Input::KeyIdentifier key_identifier, int key_modifier_state);
+	static Rml::Core::CodePoint GetCharacterCode(Rml::Core::Input::KeyIdentifier key_identifier, int key_modifier_state);
 
 protected:
 	static Rml::Core::Context* context;

+ 11 - 9
Samples/shell/src/Input.cpp

@@ -316,8 +316,10 @@ void Input::SetContext(Rml::Core::Context* _context)
 
 
 // Returns the character code for a key identifier / key modifier combination.
-Rml::Core::word Input::GetCharacterCode(Rml::Core::Input::KeyIdentifier key_identifier, int key_modifier_state)
+Rml::Core::CodePoint Input::GetCharacterCode(Rml::Core::Input::KeyIdentifier key_identifier, int key_modifier_state)
 {
+	using Rml::Core::CodePoint;
+
 	// Check if we have a keycode capable of generating characters on the main keyboard (ie, not on the numeric
 	// keypad; that is dealt with below).
 	if (key_identifier <= Rml::Core::Input::KI_OEM_102)
@@ -328,28 +330,28 @@ Rml::Core::word Input::GetCharacterCode(Rml::Core::Input::KeyIdentifier key_iden
 
 		// Return character code based on identifier and modifiers
 		if (shift && !capslock)
-			return ascii_map[1][key_identifier];
+			return (CodePoint)ascii_map[1][key_identifier];
 
 		if (shift && capslock)
-			return ascii_map[2][key_identifier];	
+			return (CodePoint)ascii_map[2][key_identifier];
 
 		if (!shift && capslock)
-			return ascii_map[3][key_identifier];
+			return (CodePoint)ascii_map[3][key_identifier];
 
-		return ascii_map[0][key_identifier];
+		return (CodePoint)ascii_map[0][key_identifier];
 	}
 
 	// Check if we have a keycode from the numeric keypad.
 	else if (key_identifier <= Rml::Core::Input::KI_OEM_NEC_EQUAL)
 	{
 		if (key_modifier_state & Rml::Core::Input::KM_NUMLOCK)
-			return keypad_map[0][key_identifier - Rml::Core::Input::KI_NUMPAD0];
+			return (CodePoint)keypad_map[0][key_identifier - Rml::Core::Input::KI_NUMPAD0];
 		else
-			return keypad_map[1][key_identifier - Rml::Core::Input::KI_NUMPAD0];
+			return (CodePoint)keypad_map[1][key_identifier - Rml::Core::Input::KI_NUMPAD0];
 	}
 
 	else if (key_identifier == Rml::Core::Input::KI_RETURN)
-		return '\n';
+		return (CodePoint)'\n';
 
-	return 0;
+	return CodePoint::Null;
 }

+ 3 - 2
Samples/shell/src/win32/InputWin32.cpp

@@ -107,11 +107,12 @@ void InputWin32::ProcessWindowsEvent(UINT message, WPARAM w_param, LPARAM l_para
 		case WM_CHAR:
 		{
 			// Only send through printable characters.
+			// TODO: Convert utf16 character to codepoint
 			if (w_param >= 32)
-				context->ProcessTextInput((Rml::Core::word) w_param);
+				context->ProcessTextInput((Rml::Core::CodePoint) w_param);
 			// Or endlines - Windows sends them through as carriage returns.
 			else if (w_param == '\r')
-				context->ProcessTextInput((Rml::Core::word) '\n');
+				context->ProcessTextInput((Rml::Core::CodePoint)'\n');
 		}
 		break;
 

+ 32 - 19
Source/Controls/WidgetTextInput.cpp

@@ -40,6 +40,8 @@ static const float CURSOR_BLINK_TIME = 0.7f;
 
 WidgetTextInput::WidgetTextInput(ElementFormControl* _parent) : internal_dimensions(0, 0), scroll_offset(0, 0), selection_geometry(_parent), cursor_position(0, 0), cursor_size(0, 0), cursor_geometry(_parent)
 {
+	// TODO: Check all usage of .size(), they are not the same as characters anymore
+
 	keyboard_showed = false;
 	
 	parent = _parent;
@@ -146,6 +148,13 @@ int WidgetTextInput::GetMaxLength() const
 	return max_length;
 }
 
+// Returns the number of UTF-8 characters in the element's current "value" attribute.
+int WidgetTextInput::GetLength() const
+{
+	const Core::String current_value = GetElement()->GetAttribute< Core::String >("value", "");
+	return static_cast<int>(Core::StringUtilities::LengthU8(current_value));
+}
+
 // Update the colours of the selected text.
 void WidgetTextInput::UpdateSelectionColours()
 {
@@ -247,7 +256,7 @@ const Rml::Core::Vector2f& WidgetTextInput::GetTextDimensions() const
 }
 
 // Gets the parent element containing the widget.
-Core::Element* WidgetTextInput::GetElement()
+Core::Element* WidgetTextInput::GetElement() const
 {
 	return parent;
 }
@@ -355,12 +364,11 @@ void WidgetTextInput::ProcessEvent(Core::Event& event)
 				Core::String clipboard_text;
 				Core::GetSystemInterface()->GetClipboardText(clipboard_text);
 
-				for (size_t i = 0; i < clipboard_text.size(); ++i)
+				// @performance: Can be made heaps faster.
+				for (auto it = Core::UTF8Iterator(clipboard_text); it; ++it)
 				{
-					if (max_length > 0 && Core::StringUtilities::LengthUTF8(GetElement()->GetAttribute< Rml::Core::String >("value", "")) > max_length)
-						break;
-
-					AddCharacter(clipboard_text[i]);
+					Core::CodePoint code = *it;
+					AddCharacter(code);
 				}
 			}
 		}
@@ -385,9 +393,8 @@ void WidgetTextInput::ProcessEvent(Core::Event& event)
 			event.GetParameter< int >("alt_key", 0) == 0 &&
 			event.GetParameter< int >("meta_key", 0) == 0)
 		{
-			Rml::Core::word character = event.GetParameter< Rml::Core::word >("data", 0);
-			if (max_length < 0 || (int)Core::String(GetElement()->GetAttribute< Rml::Core::String >("value", "")).size() < max_length)
-				AddCharacter(character);
+			Rml::Core::CodePoint character = event.GetParameter("data", Rml::Core::CodePoint::Null);
+			AddCharacter(character);
 		}
 
 		ShowCursor(true);
@@ -441,21 +448,25 @@ void WidgetTextInput::ProcessEvent(Core::Event& event)
 }
 
 // Adds a new character to the string at the cursor position.
-bool WidgetTextInput::AddCharacter(Rml::Core::word character)
+bool WidgetTextInput::AddCharacter(Rml::Core::CodePoint character)
 {
-	if (!IsCharacterValid(character))
+	if (!IsCharacterValid(static_cast<char>(character)))
 		return false;
 
 	if (selection_length > 0)
 		DeleteSelection();
 
-	Core::WString value = Core::StringUtilities::ToUCS2(GetElement()->GetAttribute< Rml::Core::String >("value", ""));
-	value.insert(GetCursorIndex(), 1, character);
+	if (max_length >= 0 && GetLength() >= max_length)
+		return false;
+
+	Core::String value = GetElement()->GetAttribute< Rml::Core::String >("value", "");
+	
+	Core::String insert = Core::StringUtilities::ToUTF8(character);
+	value.insert(GetCursorIndex(), insert);
 
 	edit_index += 1;
 
-	Rml::Core::String utf8_value = Core::StringUtilities::ToUTF8(value);
-	GetElement()->SetAttribute("value", utf8_value);
+	GetElement()->SetAttribute("value", value);
 	DispatchChangeEvent();
 
 	UpdateSelection(false);
@@ -478,8 +489,8 @@ bool WidgetTextInput::DeleteCharacter(bool back)
 		return true;
 	}
 
-	Core::WString value = Core::StringUtilities::ToUCS2(GetElement()->GetAttribute< Rml::Core::String >("value", ""));
-
+	Core::String value = GetElement()->GetAttribute< Rml::Core::String >("value", "");
+	
 	if (back)
 	{
 		if (GetCursorIndex() == 0)
@@ -496,8 +507,7 @@ bool WidgetTextInput::DeleteCharacter(bool back)
 		value.erase(GetCursorIndex(), 1);
 	}
 
-	Rml::Core::String utf8_value = Core::StringUtilities::ToUTF8(value);
-	GetElement()->SetAttribute("value", utf8_value);
+	GetElement()->SetAttribute("value", value);
 	DispatchChangeEvent();
 
 	UpdateSelection(false);
@@ -516,12 +526,14 @@ void WidgetTextInput::CopySelection()
 // Returns the absolute index of the cursor.
 int WidgetTextInput::GetCursorIndex() const
 {
+	// TODO: Sanitize cursor index ?
 	return edit_index;
 }
 
 // Moves the cursor along the current line.
 void WidgetTextInput::MoveCursorHorizontal(int distance, bool select)
 {
+	// Todo, move properly across multibyte characters
 	absolute_cursor_index += distance;
 	absolute_cursor_index = Rml::Core::Math::Max(0, absolute_cursor_index);
 
@@ -625,6 +637,7 @@ int WidgetTextInput::CalculateLineIndex(float position)
 // Calculates the character index along a line under a specific horizontal position.
 int WidgetTextInput::CalculateCharacterIndex(int line_index, float position)
 {
+	// Todo, move properly across multibyte characters
 	int character_index = 0;
 	float line_width = 0;
 

+ 5 - 3
Source/Controls/WidgetTextInput.h

@@ -66,6 +66,8 @@ public:
 	/// Returns the maximum length (in characters) of this text field.
 	/// @return The maximum number of characters allowed in this text field.
 	int GetMaxLength() const;
+	/// Returns the current length (in characters) of this text field.
+	int GetLength() const;
 
 	/// Update the colours of the selected text.
 	void UpdateSelectionColours();
@@ -92,7 +94,7 @@ protected:
 	/// Adds a new character to the string at the cursor position.
 	/// @param[in] character The character to add to the string.
 	/// @return True if the character was successfully added, false otherwise.
-	bool AddCharacter(Rml::Core::word character);
+	bool AddCharacter(Rml::Core::CodePoint character);
 	/// Deletes a character from the string.
 	/// @param[in] backward True to delete a character behind the cursor, false for in front of the cursor.
 	/// @return True if a character was deleted, false otherwise.
@@ -100,7 +102,7 @@ protected:
 	/// Returns true if the given character is permitted in the input field, false if not.
 	/// @param[in] character The character to validate.
 	/// @return True if the character is allowed, false if not.
-	virtual bool IsCharacterValid(Rml::Core::word character) = 0;
+	virtual bool IsCharacterValid(char character) = 0;
 	/// Called when the user pressed enter.
 	virtual void LineBreak() = 0;
 
@@ -108,7 +110,7 @@ protected:
 	int GetCursorIndex() const;
 
 	/// Gets the parent element containing the widget.
-	Core::Element* GetElement();
+	Core::Element* GetElement() const;
 
 	/// Dispatches a change event to the widget's element.
 	void DispatchChangeEvent(bool linebreak = false);

+ 1 - 1
Source/Controls/WidgetTextInputMultiLine.cpp

@@ -42,7 +42,7 @@ WidgetTextInputMultiLine::~WidgetTextInputMultiLine()
 }
 
 // Returns true if the given character is permitted in the input field, false if not.
-bool WidgetTextInputMultiLine::IsCharacterValid(Rml::Core::word character)
+bool WidgetTextInputMultiLine::IsCharacterValid(char character)
 {
 	return character != '\t';
 }

+ 1 - 1
Source/Controls/WidgetTextInputMultiLine.h

@@ -50,7 +50,7 @@ protected:
 	/// Returns true if the given character is permitted in the input field, false if not.
 	/// @param[in] character The character to validate.
 	/// @return True if the character is allowed, false if not.
-	bool IsCharacterValid(Rml::Core::word character) override;
+	bool IsCharacterValid(char character) override;
 	/// Called when the user pressed enter.
 	void LineBreak() override;		
 };

+ 1 - 1
Source/Controls/WidgetTextInputSingleLine.cpp

@@ -51,7 +51,7 @@ void WidgetTextInputSingleLine::SetValue(const Core::String& value)
 }
 
 // Returns true if the given character is permitted in the input field, false if not.
-bool WidgetTextInputSingleLine::IsCharacterValid(Rml::Core::word character)
+bool WidgetTextInputSingleLine::IsCharacterValid(char character)
 {
 	return character != '\t' && character != '\n' && character != '\r';
 }

+ 1 - 1
Source/Controls/WidgetTextInputSingleLine.h

@@ -54,7 +54,7 @@ protected:
 	/// Returns true if the given character is permitted in the input field, false if not.
 	/// @param[in] character The character to validate.
 	/// @return True if the character is allowed, false if not.
-	bool IsCharacterValid(Rml::Core::word character) override;
+	bool IsCharacterValid(char character) override;
 	/// Called when the user pressed enter.
 	void LineBreak() override;
 

+ 1 - 1
Source/Controls/WidgetTextInputSingleLinePassword.cpp

@@ -45,7 +45,7 @@ void WidgetTextInputSingleLinePassword::SetValue(const Core::String& value)
 {
 	Core::String sanitised_value(value);
 	SanitiseValue(sanitised_value);
-	WidgetTextInput::SetValue(Core::String(sanitised_value.size(), (Rml::Core::word) '*'));
+	WidgetTextInput::SetValue(Core::String(sanitised_value.size(), '*'));
 }
 
 }

+ 6 - 1
Source/Core/Context.cpp

@@ -498,8 +498,13 @@ bool Context::ProcessKeyUp(Input::KeyIdentifier key_identifier, int key_modifier
 		return root->DispatchEvent(EventId::Keyup, parameters);
 }
 
+// Sends a single ascii character as text input into RmlUi.
+bool Context::ProcessTextInput(char character)
+{
+	// Convert through unsigned char first: where plain char is signed, a byte above 0x7F would
+	// otherwise be sign-extended into a very large, invalid code point value.
+	return ProcessTextInput(static_cast<CodePoint>(static_cast<unsigned char>(character)));
+}
+
 // Sends a single character of text as text input into RmlUi.
-bool Context::ProcessTextInput(word character)
+bool Context::ProcessTextInput(CodePoint character)
 {
 	// Generate the parameters for the key event.
 	Dictionary parameters;

+ 11 - 9
Source/Core/ElementTextDefault.cpp

@@ -169,7 +169,7 @@ bool ElementTextDefault::GenerateToken(float& token_width, int line_begin)
 	String token;
 
 	BuildToken(token, token_begin, text.c_str() + text.size(), true, collapse_white_space, break_at_endline, computed.text_transform);
-	token_width = (float) font_face_handle->GetStringWidth(token, 0);
+	token_width = (float) font_face_handle->GetStringWidth(token);
 
 	return LastToken(token_begin, text.c_str() + text.size(), collapse_white_space, break_at_endline);
 }
@@ -211,17 +211,19 @@ bool ElementTextDefault::GenerateLine(String& line, int& line_length, float& lin
 	// Starting at the line_begin character, we generate sections of the text (we'll call them tokens) depending on the
 	// white-space parsing parameters. Each section is then appended to the line if it can fit. If not, or if an
 	// endline is found (and we're processing them), then the line is ended. kthxbai!
-
 	const char* token_begin = text.c_str() + line_begin;
 	const char* string_end = text.c_str() + text.size();
 	while (token_begin != string_end)
 	{
 		String token;
 		const char* next_token_begin = token_begin;
+		CodePoint previous_codepoint = CodePoint::Null;
+		if (!line.empty())
+			previous_codepoint = StringUtilities::ToCodePoint(StringUtilities::SeekBackU8(&line.back(), line.data()));
 
 		// Generate the next token and determine its pixel-length.
 		bool break_line = BuildToken(token, next_token_begin, string_end, line.empty() && trim_whitespace_prefix, collapse_white_space, break_at_endline, text_transform_property);
-		int token_width = font_face_handle->GetStringWidth(token, line.empty() ? 0 : line[line.size() - 1]);
+		int token_width = font_face_handle->GetStringWidth(token, previous_codepoint);
 
 		// If we're breaking to fit a line box, check if the token can fit on the line before we add it.
 		if (break_at_line)
@@ -468,17 +470,17 @@ static bool BuildToken(String& token, const char*& token_begin, const char* stri
 			// is not recognised, print the token like normal text.
 			else
 			{
-				String ucs2_escape_code(escape_begin + 1, token_begin);
+				String escape_code(escape_begin + 1, token_begin);
 
-				if (ucs2_escape_code == "lt")
+				if (escape_code == "lt")
 					character = '<';
-				else if (ucs2_escape_code == "gt")
+				else if (escape_code == "gt")
 					character = '>';
-				else if (ucs2_escape_code == "amp")
+				else if (escape_code == "amp")
 					character = '&';
-				else if (ucs2_escape_code == "quot")
+				else if (escape_code == "quot")
 					character = '"';
-				else if (ucs2_escape_code == "nbsp")
+				else if (escape_code == "nbsp")
 				{
 					character = ' ';
 					force_non_whitespace = true;

+ 18 - 17
Source/Core/FontFaceHandle.cpp

@@ -96,25 +96,24 @@ const FontGlyphList& FontFaceHandle::GetGlyphs() const
 }
 
 // Returns the width a string will take up if rendered with this handle.
-int FontFaceHandle::GetStringWidth(const String& string, word prior_character) const
+int FontFaceHandle::GetStringWidth(const String& string, CodePoint prior_character) const
 {
 	int width = 0;
-
-	for (size_t i = 0; i < string.size(); i++)
+	for (auto it = UTF8Iterator(string); it; ++it)
 	{
-		word character_code = string[i];
+		CodePoint code_point = *it;
 
-		if (character_code >= glyphs.size())
+		if ((size_t)code_point >= glyphs.size())
 			continue;
-		const FontGlyph &glyph = glyphs[character_code];
+		const FontGlyph &glyph = glyphs[(size_t)code_point];
 
 		// Adjust the cursor for the kerning between this character and the previous one.
-		if (prior_character != 0)
-			width += GetKerning(prior_character, string[i]);
+		if (prior_character != CodePoint::Null)
+			width += GetKerning(prior_character, code_point);
 		// Adjust the cursor for this character's advance.
 		width += glyph.advance;
 
-		prior_character = character_code;
+		prior_character = code_point;
 	}
 
 	return width;
@@ -225,25 +224,27 @@ int FontFaceHandle::GenerateString(GeometryList& geometry, const String& string,
 			geometry[geometry_index + i].SetTexture(layer->GetTexture(i));
 
 		line_width = 0;
-		word prior_character = 0;
+		CodePoint prior_character = CodePoint::Null;
 
 		geometry[geometry_index].GetIndices().reserve(string.size() * 6);
 		geometry[geometry_index].GetVertices().reserve(string.size() * 4);
 
-		for (const word character : string)
+		for (auto it = UTF8Iterator(string); it; ++it)
 		{
-			if (character >= glyphs.size())
+			CodePoint code_point = *it;
+
+			if ((size_t)code_point >= glyphs.size())
 				continue;
-			const FontGlyph &glyph = glyphs[character];
+			const FontGlyph &glyph = glyphs[(size_t)code_point];
 
 			// Adjust the cursor for the kerning between this character and the previous one.
-			if (prior_character != 0)
-				line_width += GetKerning(prior_character, character);
+			if (prior_character != CodePoint::Null)
+				line_width += GetKerning(prior_character, code_point);
 
-			layer->GenerateGeometry(&geometry[geometry_index], character, Vector2f(position.x + line_width, position.y), layer_colour);
+			layer->GenerateGeometry(&geometry[geometry_index], code_point, Vector2f(position.x + line_width, position.y), layer_colour);
 
 			line_width += glyph.advance;
-			prior_character = character;
+			prior_character = code_point;
 		}
 
 		geometry_index += layer->GetNumTextures();

+ 2 - 2
Source/Core/FontFaceHandle.h

@@ -77,7 +77,7 @@ public:
 	/// @param[in] string The string to measure.
 	/// @param[in] prior_character The optionally-specified character that immediately precedes the string. This may have an impact on the string width due to kerning.
 	/// @return The width, in pixels, this string will occupy if rendered with this handle.
-	int GetStringWidth(const String& string, word prior_character = 0) const;
+	int GetStringWidth(const String& string, CodePoint prior_character = CodePoint::Null) const;
 
 	/// Generates, if required, the layer configuration for a given array of font effects.
 	/// @param[in] font_effects The list of font effects to generate the configuration for.
@@ -114,7 +114,7 @@ public:
 
 protected:
 
-	virtual int GetKerning(word lhs, word rhs) const = 0;
+	virtual int GetKerning(CodePoint lhs, CodePoint rhs) const = 0;
 	virtual FontFaceLayer* CreateNewLayer();
 
 	FontFaceLayer* GenerateLayer(const SharedPtr<const FontEffect>& font_effect);

+ 7 - 6
Source/Core/FontFaceLayer.cpp

@@ -71,10 +71,11 @@ bool FontFaceLayer::Initialise(const FontFaceHandle* _handle, SharedPtr<const Fo
 			{
 				const FontGlyph& glyph = *i;
 
-				if (glyph.character >= characters.size())
+				if ((size_t)glyph.character >= characters.size())
 					continue;
 
-				Character& character = characters[glyph.character];
+				// TODO: Use a look-up map instead (codepoints can get large!)
+				Character& character = characters[(size_t)glyph.character];
 
 				Vector2i glyph_origin(Math::RealToInteger(character.origin.x), Math::RealToInteger(character.origin.y));
 				Vector2i glyph_dimensions(Math::RealToInteger(character.dimensions.x), Math::RealToInteger(character.dimensions.y));
@@ -110,10 +111,10 @@ bool FontFaceLayer::Initialise(const FontFaceHandle* _handle, SharedPtr<const Fo
 			Character character;
 			character.origin = Vector2f((float) (glyph_origin.x + glyph.bearing.x), (float) (glyph_origin.y - glyph.bearing.y));
 			character.dimensions = Vector2f((float) glyph_dimensions.x - glyph_origin.x, (float) glyph_dimensions.y - glyph_origin.y);
-			characters[glyph.character] = character;
+			characters[(size_t)glyph.character] = character;
 
 			// Add the character's dimensions into the texture layout engine.
-			texture_layout.AddRectangle(glyph.character, glyph_dimensions - glyph_origin);
+			texture_layout.AddRectangle((int)glyph.character, glyph_dimensions - glyph_origin);
 		}
 
 		// Generate the texture layout; this will position the glyph rectangles efficiently and
@@ -128,7 +129,7 @@ bool FontFaceLayer::Initialise(const FontFaceHandle* _handle, SharedPtr<const Fo
 		{
 			TextureLayoutRectangle& rectangle = texture_layout.GetRectangle(i);
 			const TextureLayoutTexture& texture = texture_layout.GetTexture(rectangle.GetTextureIndex());
-			Character& character = characters[(word) rectangle.GetId()];
+			Character& character = characters[rectangle.GetId()];
 
 			// Set the character's texture index.
 			character.texture_index = rectangle.GetTextureIndex();
@@ -172,7 +173,7 @@ bool FontFaceLayer::GenerateTexture(const byte*& texture_data, Vector2i& texture
 	for (int i = 0; i < texture_layout.GetNumRectangles(); ++i)
 	{
 		TextureLayoutRectangle& rectangle = texture_layout.GetRectangle(i);
-		Character& character = characters[(word) rectangle.GetId()];	
+		Character& character = characters[rectangle.GetId()];	
 
 		if (character.texture_index != texture_id)
 			continue;

+ 3 - 3
Source/Core/FontFaceLayer.h

@@ -73,12 +73,12 @@ public:
 	/// @param[in] character_code The character to generate geometry for.
 	/// @param[in] position The position of the baseline.
 	/// @param[in] colour The colour of the string.
-	inline void GenerateGeometry(Geometry* geometry, const word character_code, const Vector2f& position, const Colourb& colour) const
+	inline void GenerateGeometry(Geometry* geometry, const CodePoint character_code, const Vector2f& position, const Colourb& colour) const
 	{
-		if (character_code >= characters.size())
+		if ((size_t)character_code >= characters.size())
 			return;
 
-		const Character& character = characters[character_code];
+		const Character& character = characters[(size_t)character_code];
 		if (character.texture_index < 0)
 			return;
 

+ 11 - 8
Source/Core/FreeType/FontFaceHandle.cpp

@@ -116,8 +116,7 @@ void FontFaceHandle::GenerateMetrics()
 		average_advance = Math::RealToInteger((float) average_advance / (num_visible_glyphs * 0.9f));
 
 	// Determine the x-height of this font face.
-	word x = (word) 'x';
-	int index = FT_Get_Char_Index(ft_face, x);
+	int index = FT_Get_Char_Index(ft_face, 'x');
 	if (FT_Load_Glyph(ft_face, index, 0) == 0)
 		x_height = ft_face->glyph->metrics.height >> 6;
 	else
@@ -126,7 +125,7 @@ void FontFaceHandle::GenerateMetrics()
 
 void FontFaceHandle::BuildGlyphMap(const UnicodeRange& unicode_range)
 {
-	for (word character_code = (word) (Math::Max< unsigned int >(unicode_range.min_codepoint, 32)); character_code <= unicode_range.max_codepoint; ++character_code)
+	for (FT_ULong character_code = (FT_ULong)(Math::Max< unsigned int >(unicode_range.min_codepoint, 32)); character_code <= unicode_range.max_codepoint; ++character_code)
 	{
 		int index = FT_Get_Char_Index(ft_face, character_code);
 		if (index != 0)
@@ -146,7 +145,7 @@ void FontFaceHandle::BuildGlyphMap(const UnicodeRange& unicode_range)
 			}
 
 			FontGlyph glyph;
-			glyph.character = character_code;
+			glyph.character = (CodePoint)character_code;
 			BuildGlyph(glyph, ft_face->glyph);
 			glyphs[character_code] = glyph;
 		}
@@ -236,16 +235,20 @@ void FontFaceHandle::BuildGlyph(FontGlyph& glyph, FT_GlyphSlot ft_glyph)
 		glyph.bitmap_data = nullptr;
 }
 
-int FontFaceHandle::GetKerning(word lhs, word rhs) const
+int FontFaceHandle::GetKerning(CodePoint lhs, CodePoint rhs) const
 {
 	if (!FT_HAS_KERNING(ft_face))
 		return 0;
 
 	FT_Vector ft_kerning;
 
-	FT_Error ft_error = FT_Get_Kerning(ft_face,
-		FT_Get_Char_Index(ft_face, lhs), FT_Get_Char_Index(ft_face, rhs),
-		FT_KERNING_DEFAULT, &ft_kerning);
+	FT_Error ft_error = FT_Get_Kerning(
+		ft_face,
+		FT_Get_Char_Index(ft_face, (FT_ULong)lhs), 
+		FT_Get_Char_Index(ft_face, (FT_ULong)rhs), 
+		FT_KERNING_DEFAULT, 
+		&ft_kerning
+	);
 
 	if (ft_error != 0)
 		return 0;

+ 1 - 1
Source/Core/FreeType/FontFaceHandle.h

@@ -61,7 +61,7 @@ public:
 	bool Initialise(FT_Face ft_face, const String& charset, int size);
 
 protected:
-	int GetKerning(word lhs, word rhs) const override;
+	int GetKerning(CodePoint lhs, CodePoint rhs) const override;
 
 private:
 	void GenerateMetrics(void);

+ 105 - 21
Source/Core/StringUtilities.cpp

@@ -1,5 +1,6 @@
 #include "..\..\Include\RmlUi\Core\StringUtilities.h"
 #include "..\..\Include\RmlUi\Core\StringUtilities.h"
+#include "..\..\Include\RmlUi\Core\StringUtilities.h"
 /*
  * This source file is part of RmlUi, the HTML/CSS Interface Middleware
  *
@@ -109,16 +110,28 @@ WString StringUtilities::ToUTF16(const String& str)
 
 String StringUtilities::ToUTF8(const WString& wstr)
 {
+	// TODO: Convert from UTF-16 instead; the input is currently passed to the UCS-2 converter as-is.
 	String result;
 	if(!UCS2toUTF8(wstr, result))
 		Log::Message(Log::LT_WARNING, "Failed to convert UCS2 string to UTF8.");
 	return result;
 }
 
-int StringUtilities::LengthUTF8(const String& str)
+size_t StringUtilities::LengthU8(const String& str)
 {
-	// TODO: Actually consider multibyte characters
-	return (int)str.size();
+	// Counts characters rather than bytes. Each encoded character has exactly
+	// one byte that is not a continuation byte (10xxxxxx), so the number of
+	// such bytes equals the string size minus the continuation bytes.
+	size_t num_characters = 0;
+
+	for (const char byte : str)
+	{
+		const bool is_continuation = ((byte & 0b1100'0000) == 0b1000'0000);
+		if (!is_continuation)
+			++num_characters;
+	}
+
+	return num_characters;
 }
 
 String StringUtilities::Replace(String subject, const String& search, const String& replace)
@@ -278,6 +291,77 @@ String StringUtilities::StripWhitespace(const String& string)
 	return String();
 }
 
+// Decodes the UTF-8 sequence starting at 'p'. Returns CodePoint::Null on an invalid begin or continuation byte.
+CodePoint StringUtilities::ToCodePoint(const char* p)
+{
+	if ((*p & (1 << 7)) == 0)
+		return static_cast<CodePoint>(*p);
+
+	int num_bytes = 0;
+	int code = 0;
+
+	if ((*p & 0b1110'0000) == 0b1100'0000)
+	{
+		num_bytes = 2;
+		code = (*p & 0b0001'1111);
+	}
+	else if ((*p & 0b1111'0000) == 0b1110'0000)
+	{
+		num_bytes = 3;
+		code = (*p & 0b0000'1111);
+	}
+	else if ((*p & 0b1111'1000) == 0b1111'0000)
+	{
+		num_bytes = 4;
+		code = (*p & 0b0000'0111);
+	}
+	else
+	{
+		// Invalid begin byte
+		return CodePoint::Null;
+	}
+
+	for (int i = 1; i < num_bytes; i++)
+	{
+		const char byte = *(p + i);
+		if ((byte & 0b1100'0000) != 0b1000'0000)
+		{
+			// Invalid continuation byte
+			return CodePoint::Null;
+		}
+
+		code = (code << 6) | (byte & 0b0011'1111); // Append the six payload bits, most significant group first.
+	}
+
+	return static_cast<CodePoint>(code);
+}
+
+// Encodes a single code point as a UTF-8 string of one to four bytes. Returns an empty string above U+10FFFF.
+String StringUtilities::ToUTF8(CodePoint code_point)
+{
+	unsigned int c = (unsigned int)code_point;
+
+	constexpr int l3 = 0b0000'0111;	// lN: mask keeping the low N payload bits of a byte.
+	constexpr int l4 = 0b0000'1111;
+	constexpr int l5 = 0b0001'1111;
+	constexpr int l6 = 0b0011'1111;
+	constexpr int h1 = 0b1000'0000;	// hN: tag made of N leading one-bits.
+	constexpr int h2 = 0b1100'0000;
+	constexpr int h3 = 0b1110'0000;
+	constexpr int h4 = 0b1111'0000;
+
+	if (c < 0x80)
+		return String(1, (char)c);
+	else if(c < 0x800)
+		return { char(((c >> 6) & l5) | h2), char((c & l6) | h1) };
+	else if (c < 0x10000)
+		return { char(((c >> 12) & l4) | h3), char(((c >> 6) & l6) | h1), char((c & l6) | h1) };
+	else if (c <= 0x10FFFF)
+		return { char(((c >> 18) & l3) | h4), char(((c >> 12) & l6) | h1), char(((c >> 6) & l6) | h1), char((c & l6) | h1) };
+
+	return String(); // Invalid code point (above U+10FFFF)
+}
+
 // Operators for STL containers using strings.
 bool StringUtilities::StringComparei::operator()(const String& lhs, const String& rhs) const
 {
@@ -286,14 +370,14 @@ bool StringUtilities::StringComparei::operator()(const String& lhs, const String
 
 
 // Defines, helper functions for the UTF8 / UCS2 conversion functions.
-#define _NXT	0x80
-#define _SEQ2	0xc0
-#define _SEQ3	0xe0
-#define _SEQ4	0xf0
-#define _SEQ5	0xf8
-#define _SEQ6	0xfc
-	
-#define _BOM	0xfeff
+constexpr int _NXT = 0x80;	// Tag of a continuation byte; UTF8toUCS2 compares it against (byte & 0b1100'0000).
+constexpr int _SEQ2 = 0xc0;	// 110xxxxx - presumably the lead-byte tag of a 2-byte sequence (usage not visible here).
+constexpr int _SEQ3 = 0xe0;	// 1110xxxx - presumably the lead-byte tag of a 3-byte sequence.
+constexpr int _SEQ4 = 0xf0;	// 11110xxx - presumably the lead-byte tag of a 4-byte sequence.
+constexpr int _SEQ5 = 0xf8;	// 111110xx - presumably the lead-byte tag of a legacy 5-byte sequence.
+constexpr int _SEQ6 = 0xfc;	// 1111110x - presumably the lead-byte tag of a legacy 6-byte sequence.
+
+constexpr int _BOM = 0xfeff;	// Code point U+FEFF, the Unicode byte order mark.
 	
 static int __wchar_forbidden(unsigned int sym)
 {
@@ -330,20 +414,20 @@ static bool UTF8toUCS2(const String& input, WString& output)
 	output.reserve(input.size());
 	
 	unsigned char* p = (unsigned char*) input.c_str();
-	unsigned char* lim = p + input.size();
+	unsigned char* end = p + input.size();
 	
 	// Skip the UTF-8 byte order marker if it exists.
 	if (input.substr(0, 3) == "\xEF\xBB\xBF")
 		p += 3;
 	
 	int num_bytes;
-	for (; p < lim; p += num_bytes)
+	for (; p < end; p += num_bytes)
 	{
 		if (__utf8_forbidden(*p) != 0)
 			return false;
 		
 		// Get number of bytes for one wide character.
-		word high;
+		wchar_t high;
 		num_bytes = 1;
 		
 		if ((*p & 0x80) == 0)
@@ -381,7 +465,7 @@ static bool UTF8toUCS2(const String& input, WString& output)
 		}
 		
 		// Does the sequence header tell us the truth about length?
-		if (lim - p <= num_bytes - 1)
+		if (end - p <= num_bytes - 1)
 		{
 			return false;
 		}
@@ -392,7 +476,7 @@ static bool UTF8toUCS2(const String& input, WString& output)
 			int i;
 			for (i = 1; i < num_bytes; i++)
 			{
-				if ((p[i] & 0xc0) != _NXT)
+				if ((p[i] & 0b1100'0000) != _NXT)
 					break;
 			}
 			
@@ -409,7 +493,7 @@ static bool UTF8toUCS2(const String& input, WString& output)
 		int num_bits = 0;
 		for (int i = 1; i < num_bytes; i++)
 		{
-			ucs4_char |= (word)(p[num_bytes - i] & 0x3f) << num_bits;
+			ucs4_char |= (wchar_t)(p[num_bytes - i] & 0x3f) << num_bits;
 			num_bits += 6;
 		}
 		ucs4_char |= high << num_bits;
@@ -423,7 +507,7 @@ static bool UTF8toUCS2(const String& input, WString& output)
 		// Only add the character to the output if it exists in the Basic Multilingual Plane (ie, fits in a single
 		// word).
 		if (ucs4_char <= 0xffff)
-			output.push_back((word) ucs4_char);
+			output.push_back((wchar_t) ucs4_char);
 	}
 	
 	return true;
@@ -437,8 +521,8 @@ static bool UCS2toUTF8(const WString& input, String& output)
 
 	output.reserve(input.size());
 	
-	const word* w = input.data();
-	const word* wlim = w + input.size();
+	const wchar_t* w = input.data();
+	const wchar_t* wlim = w + input.size();
 	
 	//Log::Message(LC_CORE, Log::LT_ALWAYS, "UCS2TOUTF8 size: %d", input_size);
 	for (; w < wlim; w++)
@@ -465,7 +549,7 @@ static bool UCS2toUTF8(const WString& input, String& output)
 		 n = 6;*/
 		
 		// Convert to little endian.
-		word ch = (*w >> 8) & 0x00FF;
+		wchar_t ch = (*w >> 8) & 0x00FF;
 		ch |= (*w << 8) & 0xFF00;
 		//		word ch = EMPConvertEndian(*w, RMLUI_ENDIAN_BIG);
 		

+ 3 - 3
Source/Core/Variant.cpp

@@ -215,10 +215,10 @@ void Variant::Set(const int value)
 	type = INT;
 	SET_VARIANT(int);
 }
-void Variant::Set(const word value)
+void Variant::Set(const CodePoint value)
 {
 	type = WORD;
-	SET_VARIANT(word);  
+	SET_VARIANT(CodePoint);	// NOTE(review): payload is now a CodePoint but the type tag above is still named WORD.
 }
 
 void Variant::Set(const char* value) 
@@ -455,7 +455,7 @@ bool Variant::operator==(const Variant & other) const
 	case STRING:
 		return DEFAULT_VARIANT_COMPARE(String);
 	case WORD:
-		return DEFAULT_VARIANT_COMPARE(word);
+		return DEFAULT_VARIANT_COMPARE(CodePoint);
 	case VECTOR2:
 		return DEFAULT_VARIANT_COMPARE(Vector2f);
 	case VECTOR3: