|  | @@ -62,6 +62,7 @@ static _FORCE_INLINE_ char32_t lower_case(char32_t c) {
 | 
											
												
													
														|  |  const char CharString::_null = 0;
 |  |  const char CharString::_null = 0;
 | 
											
												
													
														|  |  const char16_t Char16String::_null = 0;
 |  |  const char16_t Char16String::_null = 0;
 | 
											
												
													
														|  |  const char32_t String::_null = 0;
 |  |  const char32_t String::_null = 0;
 | 
											
												
													
														|  | 
 |  | +const char32_t String::_replacement_char = 0xfffd;
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  |  bool select_word(const String &p_s, int p_col, int &r_beg, int &r_end) {
 |  |  bool select_word(const String &p_s, int p_col, int &r_beg, int &r_end) {
 | 
											
												
													
														|  |  	const String &s = p_s;
 |  |  	const String &s = p_s;
 | 
											
										
											
												
													
														|  | @@ -307,7 +308,7 @@ void String::copy_from(const char *p_cstr) {
 | 
											
												
													
														|  |  		uint8_t c = p_cstr[i] >= 0 ? p_cstr[i] : uint8_t(256 + p_cstr[i]);
 |  |  		uint8_t c = p_cstr[i] >= 0 ? p_cstr[i] : uint8_t(256 + p_cstr[i]);
 | 
											
												
													
														|  |  		if (c == 0 && i < len) {
 |  |  		if (c == 0 && i < len) {
 | 
											
												
													
														|  |  			print_unicode_error("NUL character", true);
 |  |  			print_unicode_error("NUL character", true);
 | 
											
												
													
														|  | -			dst[i] = 0x20;
 |  | 
 | 
											
												
													
														|  | 
 |  | +			dst[i] = _replacement_char;
 | 
											
												
													
														|  |  		} else {
 |  |  		} else {
 | 
											
												
													
														|  |  			dst[i] = c;
 |  |  			dst[i] = c;
 | 
											
												
													
														|  |  		}
 |  |  		}
 | 
											
										
											
												
													
														|  | @@ -340,7 +341,7 @@ void String::copy_from(const char *p_cstr, const int p_clip_to) {
 | 
											
												
													
														|  |  		uint8_t c = p_cstr[i] >= 0 ? p_cstr[i] : uint8_t(256 + p_cstr[i]);
 |  |  		uint8_t c = p_cstr[i] >= 0 ? p_cstr[i] : uint8_t(256 + p_cstr[i]);
 | 
											
												
													
														|  |  		if (c == 0) {
 |  |  		if (c == 0) {
 | 
											
												
													
														|  |  			print_unicode_error("NUL character", true);
 |  |  			print_unicode_error("NUL character", true);
 | 
											
												
													
														|  | -			dst[i] = 0x20;
 |  | 
 | 
											
												
													
														|  | 
 |  | +			dst[i] = _replacement_char;
 | 
											
												
													
														|  |  		} else {
 |  |  		} else {
 | 
											
												
													
														|  |  			dst[i] = c;
 |  |  			dst[i] = c;
 | 
											
												
													
														|  |  		}
 |  |  		}
 | 
											
										
											
												
													
														|  | @@ -373,17 +374,21 @@ void String::copy_from(const char32_t &p_char) {
 | 
											
												
													
														|  |  		print_unicode_error("NUL character", true);
 |  |  		print_unicode_error("NUL character", true);
 | 
											
												
													
														|  |  		return;
 |  |  		return;
 | 
											
												
													
														|  |  	}
 |  |  	}
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +	resize(2);
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +	char32_t *dst = ptrw();
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  |  	if ((p_char & 0xfffff800) == 0xd800) {
 |  |  	if ((p_char & 0xfffff800) == 0xd800) {
 | 
											
												
													
														|  |  		print_unicode_error(vformat("Unpaired surrogate (%x)", (uint32_t)p_char));
 |  |  		print_unicode_error(vformat("Unpaired surrogate (%x)", (uint32_t)p_char));
 | 
											
												
													
														|  | -	}
 |  | 
 | 
											
												
													
														|  | -	if (p_char > 0x10ffff) {
 |  | 
 | 
											
												
													
														|  | 
 |  | +		dst[0] = _replacement_char;
 | 
											
												
													
														|  | 
 |  | +	} else if (p_char > 0x10ffff) {
 | 
											
												
													
														|  |  		print_unicode_error(vformat("Invalid unicode codepoint (%x)", (uint32_t)p_char));
 |  |  		print_unicode_error(vformat("Invalid unicode codepoint (%x)", (uint32_t)p_char));
 | 
											
												
													
														|  | 
 |  | +		dst[0] = _replacement_char;
 | 
											
												
													
														|  | 
 |  | +	} else {
 | 
											
												
													
														|  | 
 |  | +		dst[0] = p_char;
 | 
											
												
													
														|  |  	}
 |  |  	}
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  | -	resize(2);
 |  | 
 | 
											
												
													
														|  | -
 |  | 
 | 
											
												
													
														|  | -	char32_t *dst = ptrw();
 |  | 
 | 
											
												
													
														|  | -	dst[0] = p_char;
 |  | 
 | 
											
												
													
														|  |  	dst[1] = 0;
 |  |  	dst[1] = 0;
 | 
											
												
													
														|  |  }
 |  |  }
 | 
											
												
													
														|  |  
 |  |  
 | 
											
										
											
												
													
														|  | @@ -439,14 +444,18 @@ void String::copy_from_unchecked(const char32_t *p_char, const int p_length) {
 | 
											
												
													
														|  |  	for (int i = 0; i < p_length; i++) {
 |  |  	for (int i = 0; i < p_length; i++) {
 | 
											
												
													
														|  |  		if (p_char[i] == 0) {
 |  |  		if (p_char[i] == 0) {
 | 
											
												
													
														|  |  			print_unicode_error("NUL character", true);
 |  |  			print_unicode_error("NUL character", true);
 | 
											
												
													
														|  | -			dst[i] = 0x20;
 |  | 
 | 
											
												
													
														|  | 
 |  | +			dst[i] = _replacement_char;
 | 
											
												
													
														|  |  			continue;
 |  |  			continue;
 | 
											
												
													
														|  |  		}
 |  |  		}
 | 
											
												
													
														|  |  		if ((p_char[i] & 0xfffff800) == 0xd800) {
 |  |  		if ((p_char[i] & 0xfffff800) == 0xd800) {
 | 
											
												
													
														|  |  			print_unicode_error(vformat("Unpaired surrogate (%x)", (uint32_t)p_char[i]));
 |  |  			print_unicode_error(vformat("Unpaired surrogate (%x)", (uint32_t)p_char[i]));
 | 
											
												
													
														|  | 
 |  | +			dst[i] = _replacement_char;
 | 
											
												
													
														|  | 
 |  | +			continue;
 | 
											
												
													
														|  |  		}
 |  |  		}
 | 
											
												
													
														|  |  		if (p_char[i] > 0x10ffff) {
 |  |  		if (p_char[i] > 0x10ffff) {
 | 
											
												
													
														|  |  			print_unicode_error(vformat("Invalid unicode codepoint (%x)", (uint32_t)p_char[i]));
 |  |  			print_unicode_error(vformat("Invalid unicode codepoint (%x)", (uint32_t)p_char[i]));
 | 
											
												
													
														|  | 
 |  | +			dst[i] = _replacement_char;
 | 
											
												
													
														|  | 
 |  | +			continue;
 | 
											
												
													
														|  |  		}
 |  |  		}
 | 
											
												
													
														|  |  		dst[i] = p_char[i];
 |  |  		dst[i] = p_char[i];
 | 
											
												
													
														|  |  	}
 |  |  	}
 | 
											
										
											
												
													
														|  | @@ -538,7 +547,7 @@ String &String::operator+=(const char *p_str) {
 | 
											
												
													
														|  |  		uint8_t c = p_str[i] >= 0 ? p_str[i] : uint8_t(256 + p_str[i]);
 |  |  		uint8_t c = p_str[i] >= 0 ? p_str[i] : uint8_t(256 + p_str[i]);
 | 
											
												
													
														|  |  		if (c == 0 && i < rhs_len) {
 |  |  		if (c == 0 && i < rhs_len) {
 | 
											
												
													
														|  |  			print_unicode_error("NUL character", true);
 |  |  			print_unicode_error("NUL character", true);
 | 
											
												
													
														|  | -			dst[i] = 0x20;
 |  | 
 | 
											
												
													
														|  | 
 |  | +			dst[i] = _replacement_char;
 | 
											
												
													
														|  |  		} else {
 |  |  		} else {
 | 
											
												
													
														|  |  			dst[i] = c;
 |  |  			dst[i] = c;
 | 
											
												
													
														|  |  		}
 |  |  		}
 | 
											
										
											
												
													
														|  | @@ -568,17 +577,21 @@ String &String::operator+=(char32_t p_char) {
 | 
											
												
													
														|  |  		print_unicode_error("NUL character", true);
 |  |  		print_unicode_error("NUL character", true);
 | 
											
												
													
														|  |  		return *this;
 |  |  		return *this;
 | 
											
												
													
														|  |  	}
 |  |  	}
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +	const int lhs_len = length();
 | 
											
												
													
														|  | 
 |  | +	resize(lhs_len + 2);
 | 
											
												
													
														|  | 
 |  | +	char32_t *dst = ptrw();
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  |  	if ((p_char & 0xfffff800) == 0xd800) {
 |  |  	if ((p_char & 0xfffff800) == 0xd800) {
 | 
											
												
													
														|  |  		print_unicode_error(vformat("Unpaired surrogate (%x)", (uint32_t)p_char));
 |  |  		print_unicode_error(vformat("Unpaired surrogate (%x)", (uint32_t)p_char));
 | 
											
												
													
														|  | -	}
 |  | 
 | 
											
												
													
														|  | -	if (p_char > 0x10ffff) {
 |  | 
 | 
											
												
													
														|  | 
 |  | +		dst[lhs_len] = _replacement_char;
 | 
											
												
													
														|  | 
 |  | +	} else if (p_char > 0x10ffff) {
 | 
											
												
													
														|  |  		print_unicode_error(vformat("Invalid unicode codepoint (%x)", (uint32_t)p_char));
 |  |  		print_unicode_error(vformat("Invalid unicode codepoint (%x)", (uint32_t)p_char));
 | 
											
												
													
														|  | 
 |  | +		dst[lhs_len] = _replacement_char;
 | 
											
												
													
														|  | 
 |  | +	} else {
 | 
											
												
													
														|  | 
 |  | +		dst[lhs_len] = p_char;
 | 
											
												
													
														|  |  	}
 |  |  	}
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  | -	const int lhs_len = length();
 |  | 
 | 
											
												
													
														|  | -	resize(lhs_len + 2);
 |  | 
 | 
											
												
													
														|  | -	char32_t *dst = ptrw();
 |  | 
 | 
											
												
													
														|  | -	dst[lhs_len] = p_char;
 |  | 
 | 
											
												
													
														|  |  	dst[lhs_len + 1] = 0;
 |  |  	dst[lhs_len + 1] = 0;
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  |  	return *this;
 |  |  	return *this;
 | 
											
										
											
												
													
														|  | @@ -1646,7 +1659,7 @@ String String::hex_encode_buffer(const uint8_t *p_buffer, int p_len) {
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  |  void String::print_unicode_error(const String &p_message, bool p_critical) const {
 |  |  void String::print_unicode_error(const String &p_message, bool p_critical) const {
 | 
											
												
													
														|  |  	if (p_critical) {
 |  |  	if (p_critical) {
 | 
											
												
													
														|  | -		print_error(vformat("Unicode parsing error, some characters were replaced with spaces: %s", p_message));
 |  | 
 | 
											
												
													
														|  | 
 |  | +		print_error(vformat("Unicode parsing error, some characters were replaced with � (U+FFFD): %s", p_message));
 | 
											
												
													
														|  |  	} else {
 |  |  	} else {
 | 
											
												
													
														|  |  		print_error(vformat("Unicode parsing error: %s", p_message));
 |  |  		print_error(vformat("Unicode parsing error: %s", p_message));
 | 
											
												
													
														|  |  	}
 |  |  	}
 | 
											
										
											
												
													
														|  | @@ -1666,7 +1679,7 @@ CharString String::ascii(bool p_allow_extended) const {
 | 
											
												
													
														|  |  			cs[i] = c;
 |  |  			cs[i] = c;
 | 
											
												
													
														|  |  		} else {
 |  |  		} else {
 | 
											
												
													
														|  |  			print_unicode_error(vformat("Invalid unicode codepoint (%x), cannot represent as ASCII/Latin-1", (uint32_t)c));
 |  |  			print_unicode_error(vformat("Invalid unicode codepoint (%x), cannot represent as ASCII/Latin-1", (uint32_t)c));
 | 
											
												
													
														|  | -			cs[i] = 0x20;
 |  | 
 | 
											
												
													
														|  | 
 |  | +			cs[i] = 0x20; // ascii doesn't have a replacement character like unicode, 0x1a is sometimes used but is kinda arcane
 | 
											
												
													
														|  |  		}
 |  |  		}
 | 
											
												
													
														|  |  	}
 |  |  	}
 | 
											
												
													
														|  |  
 |  |  
 | 
											
										
											
												
													
														|  | @@ -1806,13 +1819,13 @@ Error String::parse_utf8(const char *p_utf8, int p_len, bool p_skip_cr) {
 | 
											
												
													
														|  |  				unichar = (0xff >> 7) & c;
 |  |  				unichar = (0xff >> 7) & c;
 | 
											
												
													
														|  |  				skip = 5;
 |  |  				skip = 5;
 | 
											
												
													
														|  |  			} else {
 |  |  			} else {
 | 
											
												
													
														|  | -				*(dst++) = 0x20;
 |  | 
 | 
											
												
													
														|  | 
 |  | +				*(dst++) = _replacement_char;
 | 
											
												
													
														|  |  				unichar = 0;
 |  |  				unichar = 0;
 | 
											
												
													
														|  |  				skip = 0;
 |  |  				skip = 0;
 | 
											
												
													
														|  |  			}
 |  |  			}
 | 
											
												
													
														|  |  		} else {
 |  |  		} else {
 | 
											
												
													
														|  |  			if (c < 0x80 || c > 0xbf) {
 |  |  			if (c < 0x80 || c > 0xbf) {
 | 
											
												
													
														|  | -				*(dst++) = 0x20;
 |  | 
 | 
											
												
													
														|  | 
 |  | +				*(dst++) = _replacement_char;
 | 
											
												
													
														|  |  				skip = 0;
 |  |  				skip = 0;
 | 
											
												
													
														|  |  			} else {
 |  |  			} else {
 | 
											
												
													
														|  |  				unichar = (unichar << 6) | (c & 0x3f);
 |  |  				unichar = (unichar << 6) | (c & 0x3f);
 | 
											
										
											
												
													
														|  | @@ -1821,15 +1834,15 @@ Error String::parse_utf8(const char *p_utf8, int p_len, bool p_skip_cr) {
 | 
											
												
													
														|  |  					if (unichar == 0) {
 |  |  					if (unichar == 0) {
 | 
											
												
													
														|  |  						print_unicode_error("NUL character", true);
 |  |  						print_unicode_error("NUL character", true);
 | 
											
												
													
														|  |  						decode_failed = true;
 |  |  						decode_failed = true;
 | 
											
												
													
														|  | -						unichar = 0x20;
 |  | 
 | 
											
												
													
														|  | -					}
 |  | 
 | 
											
												
													
														|  | -					if ((unichar & 0xfffff800) == 0xd800) {
 |  | 
 | 
											
												
													
														|  | -						print_unicode_error(vformat("Unpaired surrogate (%x)", unichar));
 |  | 
 | 
											
												
													
														|  | -						decode_error = true;
 |  | 
 | 
											
												
													
														|  | -					}
 |  | 
 | 
											
												
													
														|  | -					if (unichar > 0x10ffff) {
 |  | 
 | 
											
												
													
														|  | -						print_unicode_error(vformat("Invalid unicode codepoint (%x)", unichar));
 |  | 
 | 
											
												
													
														|  | -						decode_error = true;
 |  | 
 | 
											
												
													
														|  | 
 |  | +						unichar = _replacement_char;
 | 
											
												
													
														|  | 
 |  | +					} else if ((unichar & 0xfffff800) == 0xd800) {
 | 
											
												
													
														|  | 
 |  | +						print_unicode_error(vformat("Unpaired surrogate (%x)", unichar), true);
 | 
											
												
													
														|  | 
 |  | +						decode_failed = true;
 | 
											
												
													
														|  | 
 |  | +						unichar = _replacement_char;
 | 
											
												
													
														|  | 
 |  | +					} else if (unichar > 0x10ffff) {
 | 
											
												
													
														|  | 
 |  | +						print_unicode_error(vformat("Invalid unicode codepoint (%x)", unichar), true);
 | 
											
												
													
														|  | 
 |  | +						decode_failed = true;
 | 
											
												
													
														|  | 
 |  | +						unichar = _replacement_char;
 | 
											
												
													
														|  |  					}
 |  |  					}
 | 
											
												
													
														|  |  					*(dst++) = unichar;
 |  |  					*(dst++) = unichar;
 | 
											
												
													
														|  |  				}
 |  |  				}
 | 
											
										
											
												
													
														|  | @@ -1923,7 +1936,11 @@ CharString String::utf8() const {
 | 
											
												
													
														|  |  			APPEND_CHAR(uint32_t(0x80 | ((c >> 6) & 0x3f))); // Lower lower middle 6 bits.
 |  |  			APPEND_CHAR(uint32_t(0x80 | ((c >> 6) & 0x3f))); // Lower lower middle 6 bits.
 | 
											
												
													
														|  |  			APPEND_CHAR(uint32_t(0x80 | (c & 0x3f))); // Bottom 6 bits.
 |  |  			APPEND_CHAR(uint32_t(0x80 | (c & 0x3f))); // Bottom 6 bits.
 | 
											
												
													
														|  |  		} else {
 |  |  		} else {
 | 
											
												
													
														|  | -			APPEND_CHAR(0x20);
 |  | 
 | 
											
												
													
														|  | 
 |  | +			// the string is a valid UTF32, so it should never happen ...
 | 
											
												
													
														|  | 
 |  | +			print_unicode_error(vformat("Non scalar value (%x)", c), true);
 | 
											
												
													
														|  | 
 |  | +			APPEND_CHAR(uint32_t(0xe0 | ((_replacement_char >> 12) & 0x0f))); // Top 4 bits.
 | 
											
												
													
														|  | 
 |  | +			APPEND_CHAR(uint32_t(0x80 | ((_replacement_char >> 6) & 0x3f))); // Middle 6 bits.
 | 
											
												
													
														|  | 
 |  | +			APPEND_CHAR(uint32_t(0x80 | (_replacement_char & 0x3f))); // Bottom 6 bits.
 | 
											
												
													
														|  |  		}
 |  |  		}
 | 
											
												
													
														|  |  	}
 |  |  	}
 | 
											
												
													
														|  |  #undef APPEND_CHAR
 |  |  #undef APPEND_CHAR
 | 
											
										
											
												
													
														|  | @@ -2096,7 +2113,9 @@ Char16String String::utf16() const {
 | 
											
												
													
														|  |  			APPEND_CHAR(uint32_t((c >> 10) + 0xd7c0)); // lead surrogate.
 |  |  			APPEND_CHAR(uint32_t((c >> 10) + 0xd7c0)); // lead surrogate.
 | 
											
												
													
														|  |  			APPEND_CHAR(uint32_t((c & 0x3ff) | 0xdc00)); // trail surrogate.
 |  |  			APPEND_CHAR(uint32_t((c & 0x3ff) | 0xdc00)); // trail surrogate.
 | 
											
												
													
														|  |  		} else {
 |  |  		} else {
 | 
											
												
													
														|  | -			APPEND_CHAR(0x20);
 |  | 
 | 
											
												
													
														|  | 
 |  | +			// the string is a valid UTF32, so it should never happen ...
 | 
											
												
													
														|  | 
 |  | +			APPEND_CHAR(uint32_t((_replacement_char >> 10) + 0xd7c0));
 | 
											
												
													
														|  | 
 |  | +			APPEND_CHAR(uint32_t((_replacement_char & 0x3ff) | 0xdc00));
 | 
											
												
													
														|  |  		}
 |  |  		}
 | 
											
												
													
														|  |  	}
 |  |  	}
 | 
											
												
													
														|  |  #undef APPEND_CHAR
 |  |  #undef APPEND_CHAR
 |