2 years ago · 999f3e2c13
--- a/core/string/ustring.cpp
+++ b/core/string/ustring.cpp
@@ -62,6 +62,7 @@ static _FORCE_INLINE_ char32_t lower_case(char32_t c) {
 
				 const char CharString::_null = 0;
			
 
				 const char16_t Char16String::_null = 0;
			
 
				 const char32_t String::_null = 0;
			
 
				+const char32_t String::_replacement_char = 0xfffd;
			
 
				 
			
 
				 bool select_word(const String &p_s, int p_col, int &r_beg, int &r_end) {
			
 
				 	const String &s = p_s;
			
@@ -307,7 +308,7 @@ void String::copy_from(const char *p_cstr) {
 
				 		uint8_t c = p_cstr[i] >= 0 ? p_cstr[i] : uint8_t(256 + p_cstr[i]);
			
 
				 		if (c == 0 && i < len) {
			
 
				 			print_unicode_error("NUL character", true);
			
 
				-			dst[i] = 0x20;
			
 
				+			dst[i] = _replacement_char;
			
 
				 		} else {
			
 
				 			dst[i] = c;
			
 
				 		}
			
@@ -340,7 +341,7 @@ void String::copy_from(const char *p_cstr, const int p_clip_to) {
 
				 		uint8_t c = p_cstr[i] >= 0 ? p_cstr[i] : uint8_t(256 + p_cstr[i]);
			
 
				 		if (c == 0) {
			
 
				 			print_unicode_error("NUL character", true);
			
 
				-			dst[i] = 0x20;
			
 
				+			dst[i] = _replacement_char;
			
 
				 		} else {
			
 
				 			dst[i] = c;
			
 
				 		}
			
@@ -373,17 +374,21 @@ void String::copy_from(const char32_t &p_char) {
 
				 		print_unicode_error("NUL character", true);
			
 
				 		return;
			
 
				 	}
			
 
				+
			
 
				+	resize(2);
			
 
				+
			
 
				+	char32_t *dst = ptrw();
			
 
				+
			
 
				 	if ((p_char & 0xfffff800) == 0xd800) {
			
 
				 		print_unicode_error(vformat("Unpaired surrogate (%x)", (uint32_t)p_char));
			
 
				-	}
			
 
				-	if (p_char > 0x10ffff) {
			
 
				+		dst[0] = _replacement_char;
			
 
				+	} else if (p_char > 0x10ffff) {
			
 
				 		print_unicode_error(vformat("Invalid unicode codepoint (%x)", (uint32_t)p_char));
			
 
				+		dst[0] = _replacement_char;
			
 
				+	} else {
			
 
				+		dst[0] = p_char;
			
 
				 	}
			
 
				 
			
 
				-	resize(2);
			
 
				-
			
 
				-	char32_t *dst = ptrw();
			
 
				-	dst[0] = p_char;
			
 
				 	dst[1] = 0;
			
 
				 }
			
 
				 
			
@@ -439,14 +444,18 @@ void String::copy_from_unchecked(const char32_t *p_char, const int p_length) {
 
				 	for (int i = 0; i < p_length; i++) {
			
 
				 		if (p_char[i] == 0) {
			
 
				 			print_unicode_error("NUL character", true);
			
 
				-			dst[i] = 0x20;
			
 
				+			dst[i] = _replacement_char;
			
 
				 			continue;
			
 
				 		}
			
 
				 		if ((p_char[i] & 0xfffff800) == 0xd800) {
			
 
				 			print_unicode_error(vformat("Unpaired surrogate (%x)", (uint32_t)p_char[i]));
			
 
				+			dst[i] = _replacement_char;
			
 
				+			continue;
			
 
				 		}
			
 
				 		if (p_char[i] > 0x10ffff) {
			
 
				 			print_unicode_error(vformat("Invalid unicode codepoint (%x)", (uint32_t)p_char[i]));
			
 
				+			dst[i] = _replacement_char;
			
 
				+			continue;
			
 
				 		}
			
 
				 		dst[i] = p_char[i];
			
 
				 	}
			
@@ -538,7 +547,7 @@ String &String::operator+=(const char *p_str) {
 
				 		uint8_t c = p_str[i] >= 0 ? p_str[i] : uint8_t(256 + p_str[i]);
			
 
				 		if (c == 0 && i < rhs_len) {
			
 
				 			print_unicode_error("NUL character", true);
			
 
				-			dst[i] = 0x20;
			
 
				+			dst[i] = _replacement_char;
			
 
				 		} else {
			
 
				 			dst[i] = c;
			
 
				 		}
			
@@ -568,17 +577,21 @@ String &String::operator+=(char32_t p_char) {
 
				 		print_unicode_error("NUL character", true);
			
 
				 		return *this;
			
 
				 	}
			
 
				+
			
 
				+	const int lhs_len = length();
			
 
				+	resize(lhs_len + 2);
			
 
				+	char32_t *dst = ptrw();
			
 
				+
			
 
				 	if ((p_char & 0xfffff800) == 0xd800) {
			
 
				 		print_unicode_error(vformat("Unpaired surrogate (%x)", (uint32_t)p_char));
			
 
				-	}
			
 
				-	if (p_char > 0x10ffff) {
			
 
				+		dst[lhs_len] = _replacement_char;
			
 
				+	} else if (p_char > 0x10ffff) {
			
 
				 		print_unicode_error(vformat("Invalid unicode codepoint (%x)", (uint32_t)p_char));
			
 
				+		dst[lhs_len] = _replacement_char;
			
 
				+	} else {
			
 
				+		dst[lhs_len] = p_char;
			
 
				 	}
			
 
				 
			
 
				-	const int lhs_len = length();
			
 
				-	resize(lhs_len + 2);
			
 
				-	char32_t *dst = ptrw();
			
 
				-	dst[lhs_len] = p_char;
			
 
				 	dst[lhs_len + 1] = 0;
			
 
				 
			
 
				 	return *this;
			
@@ -1646,7 +1659,7 @@ String String::hex_encode_buffer(const uint8_t *p_buffer, int p_len) {
 
				 
			
 
				 void String::print_unicode_error(const String &p_message, bool p_critical) const {
			
 
				 	if (p_critical) {
			
 
				-		print_error(vformat("Unicode parsing error, some characters were replaced with spaces: %s", p_message));
			
 
				+		print_error(vformat(U"Unicode parsing error, some characters were replaced with \uFFFD (U+FFFD): %s", p_message)); // U"" literal: a narrow literal would go through the byte-wise copy_from(const char *) and garble U+FFFD.
			
 
				 	} else {
			
 
				 		print_error(vformat("Unicode parsing error: %s", p_message));
			
 
				 	}
			
@@ -1666,7 +1679,7 @@ CharString String::ascii(bool p_allow_extended) const {
 
				 			cs[i] = c;
			
 
				 		} else {
			
 
				 			print_unicode_error(vformat("Invalid unicode codepoint (%x), cannot represent as ASCII/Latin-1", (uint32_t)c));
			
 
				-			cs[i] = 0x20;
			
 
				+			cs[i] = 0x20; // ASCII has no replacement character like Unicode's U+FFFD; 0x1a (SUB) is sometimes used but is obscure, so keep a space.
			
 
				 		}
			
 
				 	}
			
 
				 
			
@@ -1806,13 +1819,13 @@ Error String::parse_utf8(const char *p_utf8, int p_len, bool p_skip_cr) {
 
				 				unichar = (0xff >> 7) & c;
			
 
				 				skip = 5;
			
 
				 			} else {
			
 
				-				*(dst++) = 0x20;
			
 
				+				*(dst++) = _replacement_char;
			
 
				 				unichar = 0;
			
 
				 				skip = 0;
			
 
				 			}
			
 
				 		} else {
			
 
				 			if (c < 0x80 || c > 0xbf) {
			
 
				-				*(dst++) = 0x20;
			
 
				+				*(dst++) = _replacement_char;
			
 
				 				skip = 0;
			
 
				 			} else {
			
 
				 				unichar = (unichar << 6) | (c & 0x3f);
			
@@ -1821,15 +1834,15 @@ Error String::parse_utf8(const char *p_utf8, int p_len, bool p_skip_cr) {
 
				 					if (unichar == 0) {
			
 
				 						print_unicode_error("NUL character", true);
			
 
				 						decode_failed = true;
			
 
				-						unichar = 0x20;
			
 
				-					}
			
 
				-					if ((unichar & 0xfffff800) == 0xd800) {
			
 
				-						print_unicode_error(vformat("Unpaired surrogate (%x)", unichar));
			
 
				-						decode_error = true;
			
 
				-					}
			
 
				-					if (unichar > 0x10ffff) {
			
 
				-						print_unicode_error(vformat("Invalid unicode codepoint (%x)", unichar));
			
 
				-						decode_error = true;
			
 
				+						unichar = _replacement_char;
			
 
				+					} else if ((unichar & 0xfffff800) == 0xd800) {
			
 
				+						print_unicode_error(vformat("Unpaired surrogate (%x)", unichar), true);
			
 
				+						decode_failed = true;
			
 
				+						unichar = _replacement_char;
			
 
				+					} else if (unichar > 0x10ffff) {
			
 
				+						print_unicode_error(vformat("Invalid unicode codepoint (%x)", unichar), true);
			
 
				+						decode_failed = true;
			
 
				+						unichar = _replacement_char;
			
 
				 					}
			
 
				 					*(dst++) = unichar;
			
 
				 				}
			
@@ -1923,7 +1936,11 @@ CharString String::utf8() const {
 
				 			APPEND_CHAR(uint32_t(0x80 | ((c >> 6) & 0x3f))); // Lower lower middle 6 bits.
			
 
				 			APPEND_CHAR(uint32_t(0x80 | (c & 0x3f))); // Bottom 6 bits.
			
 
				 		} else {
			
 
				-			APPEND_CHAR(0x20);
			
 
				+			// Expected to be unreachable for valid UTF-32. NOTE(review): this fallback now emits 3 bytes where the replaced one emitted 1 - confirm the length pass reserves 3.
			
 
				+			print_unicode_error(vformat("Non scalar value (%x)", c), true);
			
 
				+			APPEND_CHAR(uint32_t(0xe0 | ((_replacement_char >> 12) & 0x0f))); // Top 4 bits.
			
 
				+			APPEND_CHAR(uint32_t(0x80 | ((_replacement_char >> 6) & 0x3f))); // Middle 6 bits.
			
 
				+			APPEND_CHAR(uint32_t(0x80 | (_replacement_char & 0x3f))); // Bottom 6 bits.
			
 
				 		}
			
 
				 	}
			
 
				 #undef APPEND_CHAR
			
@@ -2096,7 +2113,9 @@ Char16String String::utf16() const {
 
				 			APPEND_CHAR(uint32_t((c >> 10) + 0xd7c0)); // lead surrogate.
			
 
				 			APPEND_CHAR(uint32_t((c & 0x3ff) | 0xdc00)); // trail surrogate.
			
 
				 		} else {
			
 
				-			APPEND_CHAR(0x20);
			
 
				+			// The string is expected to hold valid UTF-32, so this branch should be unreachable.
			
 
				+			// U+FFFD is in the BMP: emit it as a single code unit, not a surrogate pair.
			
 
				+			APPEND_CHAR(uint32_t(_replacement_char));
			
 
				 		}
			
 
				 	}
			
 
				 #undef APPEND_CHAR
			
--- a/core/string/ustring.h
+++ b/core/string/ustring.h
@@ -183,6 +183,7 @@ struct StrRange {
 
				 class String {
			
 
				 	CowData<char32_t> _cowdata;
			
 
				 	static const char32_t _null;
			
 
				+	static const char32_t _replacement_char;
			
 
				 
			
 
				 	void copy_from(const char *p_cstr);
			
 
				 	void copy_from(const char *p_cstr, const int p_clip_to);
			
--- a/tests/core/string/test_string.h
+++ b/tests/core/string/test_string.h
@@ -170,10 +170,10 @@ TEST_CASE("[String] Invalid UTF8 (non-standard)") {
 
				 	ERR_PRINT_OFF
			
 
				 	static const uint8_t u8str[] = { 0x45, 0xE3, 0x81, 0x8A, 0xE3, 0x82, 0x88, 0xE3, 0x81, 0x86, 0xF0, 0x9F, 0x8E, 0xA4, 0xF0, 0x82, 0x82, 0xAC, 0xED, 0xA0, 0x81, 0 };
			
 
				 	//                               +     +2                +2                +2                +3                      overlong +3             unpaired +2
			
 
				-	static const char32_t u32str[] = { 0x45, 0x304A, 0x3088, 0x3046, 0x1F3A4, 0x20AC, 0xD801, 0 };
			
 
				+	static const char32_t u32str[] = { 0x45, 0x304A, 0x3088, 0x3046, 0x1F3A4, 0x20AC, 0xFFFD, 0 };
			
 
				 	String s;
			
 
				 	Error err = s.parse_utf8((const char *)u8str);
			
 
				-	CHECK(err == ERR_PARSE_ERROR);
			
 
				+	CHECK(err == ERR_INVALID_DATA);
			
 
				 	CHECK(s == u32str);
			
 
				 
			
 
				 	CharString cs = (const char *)u8str;
			
@@ -185,7 +185,7 @@ TEST_CASE("[String] Invalid UTF8 (unrecoverable)") {
 
				 	ERR_PRINT_OFF
			
 
				 	static const uint8_t u8str[] = { 0x45, 0xE3, 0x81, 0x8A, 0x8F, 0xE3, 0xE3, 0x98, 0x8F, 0xE3, 0x82, 0x88, 0xE3, 0x81, 0x86, 0xC0, 0x80, 0xF0, 0x9F, 0x8E, 0xA4, 0xF0, 0x82, 0x82, 0xAC, 0xED, 0xA0, 0x81, 0 };
			
 
				 	//                               +     +2                inv   +2    inv   inv   inv   +2                +2                ovl NUL +1  +3                      overlong +3             unpaired +2
			
 
				-	static const char32_t u32str[] = { 0x45, 0x304A, 0x20, 0x20, 0x20, 0x20, 0x3088, 0x3046, 0x20, 0x1F3A4, 0x20AC, 0xD801, 0 };
			
 
				+	static const char32_t u32str[] = { 0x45, 0x304A, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x3088, 0x3046, 0xFFFD, 0x1F3A4, 0x20AC, 0xFFFD, 0 };
			
 
				 	String s;
			
 
				 	Error err = s.parse_utf8((const char *)u8str);
			
 
				 	CHECK(err == ERR_INVALID_DATA);
			
@@ -301,8 +301,8 @@ TEST_CASE("[String] Test chr") {
 
				 	CHECK(String::chr('H') == "H");
			
 
				 	CHECK(String::chr(0x3012)[0] == 0x3012);
			
 
				 	ERR_PRINT_OFF
			
 
				-	CHECK(String::chr(0xd812)[0] == 0xd812); // Unpaired UTF-16 surrogate
			
 
				-	CHECK(String::chr(0x20d812)[0] == 0x20d812); // Outside UTF-32 range
			
 
				+	CHECK(String::chr(0xd812)[0] == 0xfffd); // Unpaired UTF-16 surrogate
			
 
				+	CHECK(String::chr(0x20d812)[0] == 0xfffd); // Outside UTF-32 range
			
 
				 	ERR_PRINT_ON
			
 
				 }