Browse Source

Convert UTF-16 to UTF-8

Michael Ragazzon 6 years ago
parent
commit
1c00fea5aa
2 changed files with 72 additions and 86 deletions
  1. 1 0
      Include/RmlUi/Core/StringUtilities.h
  2. 71 86
      Source/Core/StringUtilities.cpp

+ 1 - 0
Include/RmlUi/Core/StringUtilities.h

@@ -122,6 +122,7 @@ namespace StringUtilities
 
 
 	RMLUICORE_API CodePoint ToCodePoint(const char* p);
 	RMLUICORE_API CodePoint ToCodePoint(const char* p);
 	RMLUICORE_API String ToUTF8(CodePoint code_point);
 	RMLUICORE_API String ToUTF8(CodePoint code_point);
+	RMLUICORE_API String ToUTF8(const CodePoint* code_points, int num_code_points);
 
 
 	inline const char* SeekForwardU8(const char* p, const char* p_end)
 	inline const char* SeekForwardU8(const char* p, const char* p_end)
 	{
 	{

+ 71 - 86
Source/Core/StringUtilities.cpp

@@ -49,7 +49,7 @@ namespace Rml {
 namespace Core {
 namespace Core {
 
 
 static bool UTF8toUCS2(const String& input, WString& output);
 static bool UTF8toUCS2(const String& input, WString& output);
-static bool UCS2toUTF8(const WString& input, String& output);
+static bool UTF16toUTF8(const WString& input, String& output);
 
 
 
 
 static int FormatString(String& string, size_t max_size, const char* format, va_list argument_list)
 static int FormatString(String& string, size_t max_size, const char* format, va_list argument_list)
@@ -117,7 +117,7 @@ String StringUtilities::ToUTF8(const WString& wstr)
 {
 {
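 	/// Converts from UTF-16; surrogate pairs are combined into single code points before UTF-8 encoding.
 	/// Converts from UTF-16; surrogate pairs are combined into single code points before UTF-8 encoding.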
 	String result;
 	String result;
-	if(!UCS2toUTF8(wstr, result))
+	if(!UTF16toUTF8(wstr, result))
 		Log::Message(Log::LT_WARNING, "Failed to convert UCS2 string to UTF8.");
 		Log::Message(Log::LT_WARNING, "Failed to convert UCS2 string to UTF8.");
 	return result;
 	return result;
 }
 }
@@ -345,28 +345,44 @@ CodePoint StringUtilities::ToCodePoint(const char* p)
 
 
 String StringUtilities::ToUTF8(CodePoint code_point)
 String StringUtilities::ToUTF8(CodePoint code_point)
 {
 {
-	unsigned int c = (unsigned int)code_point;
-
-	constexpr int l3 = 0b0000'0111;
-	constexpr int l4 = 0b0000'1111;
-	constexpr int l5 = 0b0001'1111;
-	constexpr int l6 = 0b0011'1111;
-	constexpr int h1 = 0b1000'0000;
-	constexpr int h2 = 0b1100'0000;
-	constexpr int h3 = 0b1110'0000;
-	constexpr int h4 = 0b1111'0000;
-
-	if (c < 0x80)
-		return String(1, (char)c);
-	else if(c < 0x800)
-		return { char(((c >> 6) & l5) | h2), char((c & l6) | h1) };
-	else if (c < 0x10000)
-		return { char(((c >> 12) & l4) | h3), char(((c >> 6) & l6) | h1), char((c & l6) | h1) };
-	else if (c < 0x10000)
-		return { char(((c >> 18) & l3) | h4), char(((c >> 12) & l6) | h1), char(((c >> 6) & l6) | h1), char((c & l6) | h1) };
-
-	// Invalid code point
-	return String();
+	return ToUTF8(&code_point, 1);
+}
+
+String StringUtilities::ToUTF8(const CodePoint* code_points, int num_code_points)
+{
+	String result;
+
+	bool invalid_code_point = false;
+
+	for (int i = 0; i < num_code_points; i++)
+	{
+		unsigned int c = (unsigned int)code_points[i];
+
+		constexpr int l3 = 0b0000'0111;
+		constexpr int l4 = 0b0000'1111;
+		constexpr int l5 = 0b0001'1111;
+		constexpr int l6 = 0b0011'1111;
+		constexpr int h1 = 0b1000'0000;
+		constexpr int h2 = 0b1100'0000;
+		constexpr int h3 = 0b1110'0000;
+		constexpr int h4 = 0b1111'0000;
+
+		if (c < 0x80)
+			result += (char)c;
+		else if (c < 0x800)
+			result += { char(((c >> 6)& l5) | h2), char((c& l6) | h1) };
+		else if (c < 0x10000)
+			result += { char(((c >> 12)& l4) | h3), char(((c >> 6)& l6) | h1), char((c& l6) | h1) };
+		else if (c <= 0x10FFFF)
+			result += { char(((c >> 18)& l3) | h4), char(((c >> 12)& l6) | h1), char(((c >> 6)& l6) | h1), char((c& l6) | h1) };
+		else
+			invalid_code_point = true;
+	}
+
+	if (invalid_code_point)
+		Log::Message(Log::LT_WARNING, "One or more invalid code points encountered while encoding to UTF-8.");
+
+	return result;
 }
 }
 
 
 // Operators for STL containers using strings.
 // Operators for STL containers using strings.
@@ -521,77 +537,46 @@ static bool UTF8toUCS2(const String& input, WString& output)
 }
 }
 
 
 // Converts a string of UTF-16 code units (surrogate pairs included) into a character array in UTF-8 encoding.
 // Converts a string of UTF-16 code units (surrogate pairs included) into a character array in UTF-8 encoding.
-static bool UCS2toUTF8(const WString& input, String& output)
+static bool UTF16toUTF8(const WString& input, String& output)
 {
 {
-	unsigned char *oc;
-	size_t n;
+	std::vector<CodePoint> code_points;
+	code_points.reserve(input.size());
+
+	bool valid_input = true;
+	wchar_t w1 = 0;
 
 
-	output.reserve(input.size());
-	
 	const wchar_t* w = input.data();
 	const wchar_t* w = input.data();
 	const wchar_t* wlim = w + input.size();
 	const wchar_t* wlim = w + input.size();
-	
-	//Log::Message(LC_CORE, Log::LT_ALWAYS, "UCS2TOUTF8 size: %d", input_size);
 	for (; w < wlim; w++)
 	for (; w < wlim; w++)
 	{
 	{
-		if (__wchar_forbidden(*w) != 0)
-			return false;
-		
-		if (*w == _BOM)
-			continue;
-		
-		//if (*w < 0)
-		//	return false;
-		if (*w <= 0x007f)
-			n = 1;
-		else if (*w <= 0x07ff)
-			n = 2;
-		else //if (*w <= 0x0000ffff)
-			n = 3;
-		/*else if (*w <= 0x001fffff)
-		 n = 4;
-		 else if (*w <= 0x03ffffff)
-		 n = 5;
-		 else // if (*w <= 0x7fffffff)
-		 n = 6;*/
-		
-		// Convert to little endian.
-		wchar_t ch = (*w >> 8) & 0x00FF;
-		ch |= (*w << 8) & 0xFF00;
-		//		word ch = EMPConvertEndian(*w, RMLUI_ENDIAN_BIG);
-		
-		oc = (unsigned char *)&ch;
-		switch (n)
+		if (*w <= 0xD7FF || *w >= 0xE000)
 		{
 		{
-			case 1:
-				output += oc[1];
-				break;
-				
-			case 2:
-				output += (_SEQ2 | (oc[1] >> 6) | ((oc[0] & 0x07) << 2));
-				output += (_NXT | (oc[1] & 0x3f));
-				break;
-				
-			case 3:
-				output += (_SEQ3 | ((oc[0] & 0xf0) >> 4));
-				output += (_NXT | (oc[1] >> 6) | ((oc[0] & 0x0f) << 2));
-				output += (_NXT | (oc[1] & 0x3f));
-				break;
-				
-			case 4:
-				break;
-				
-			case 5:
-				break;
-				
-			case 6:
-				break;
+			// Single 16-bit code unit.
+			code_points.push_back((CodePoint)(*w));
+		}
+		else 
+		{
+			// Two 16-bit code units.
+			if (!w1 && *w < 0xDC00)
+			{
+				w1 = *w;
+			}
+			else if (w1 && *w >= 0xDC00)
+			{
+				code_points.push_back((CodePoint)(((((unsigned int)w1 & 0x3FF) << 10) | ((unsigned int)(*w) & 0x3FF)) + 0x10000u));
+				w1 = 0;
+			}
+			else
+			{
+				valid_input = false;
+			}
 		}
 		}
-		
-		//Log::Message(LC_CORE, Log::LT_ALWAYS, "Converting...%c(%d) %d -> %d", *w, *w, w - input, output.size());
 	}
 	}
-	
-	return true;
+
+	if(code_points.size() > 0)
+		output = StringUtilities::ToUTF8(code_points.data(), (int)code_points.size());
+
+	return valid_input;
 }
 }
 
 
 StringView::StringView(const char* p_begin, const char* p_end) : p_begin(p_begin), p_end(p_end)
 StringView::StringView(const char* p_begin, const char* p_end) : p_begin(p_begin), p_end(p_end)