Преглед на файлове

Add support for escaped UTF-16 (`\uXXXX`, including surrogate pairs) and UTF-32 (`\UXXXXXX`) Unicode characters in scripts and expressions.

bruvzg преди 3 години
родител
ревизия
8e79c5fb8d
променени са 4 файла, в които са добавени 184 реда и са изтрити 12 реда
  1. 40 3
      core/math/expression.cpp
  2. 41 3
      core/variant/variant_parser.cpp
  3. 63 3
      modules/gdscript/gdscript_tokenizer.cpp
  4. 40 3
      modules/visual_script/visual_script_expression.cpp

+ 40 - 3
core/math/expression.cpp

@@ -197,6 +197,7 @@ Error Expression::_get_token(Token &r_token) {
 			case '\'':
 			case '"': {
 				String str;
+				char32_t prev = 0;
 				while (true) {
 					char32_t ch = GET_CHAR();
 
@@ -234,9 +235,11 @@ Error Expression::_get_token(Token &r_token) {
 							case 'r':
 								res = 13;
 								break;
+							case 'U':
 							case 'u': {
-								// hex number
-								for (int j = 0; j < 4; j++) {
+								// Hexadecimal sequence.
+								int hex_len = (next == 'U') ? 6 : 4;
+								for (int j = 0; j < hex_len; j++) {
 									char32_t c = GET_CHAR();
 
 									if (c == 0) {
@@ -273,12 +276,46 @@ Error Expression::_get_token(Token &r_token) {
 							} break;
 						}
 
+						// Parse UTF-16 pair.
+						if ((res & 0xfffffc00) == 0xd800) {
+							if (prev == 0) {
+								prev = res;
+								continue;
+							} else {
+								_set_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
+								r_token.type = TK_ERROR;
+								return ERR_PARSE_ERROR;
+							}
+						} else if ((res & 0xfffffc00) == 0xdc00) {
+							if (prev == 0) {
+								_set_error("Invalid UTF-16 sequence in string, unpaired trail surrogate");
+								r_token.type = TK_ERROR;
+								return ERR_PARSE_ERROR;
+							} else {
+								res = (prev << 10UL) + res - ((0xd800 << 10UL) + 0xdc00 - 0x10000);
+								prev = 0;
+							}
+						}
+						if (prev != 0) {
+							_set_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
+							r_token.type = TK_ERROR;
+							return ERR_PARSE_ERROR;
+						}
 						str += res;
-
 					} else {
+						if (prev != 0) {
+							_set_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
+							r_token.type = TK_ERROR;
+							return ERR_PARSE_ERROR;
+						}
 						str += ch;
 					}
 				}
+				if (prev != 0) {
+					_set_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
+					r_token.type = TK_ERROR;
+					return ERR_PARSE_ERROR;
+				}
 
 				r_token.type = TK_CONSTANT;
 				r_token.value = str;

+ 41 - 3
core/variant/variant_parser.cpp

@@ -217,6 +217,7 @@ Error VariantParser::get_token(Stream *p_stream, Token &r_token, int &line, Stri
 			}
 			case '"': {
 				String str;
+				char32_t prev = 0;
 				while (true) {
 					char32_t ch = p_stream->get_char();
 
@@ -252,10 +253,13 @@ Error VariantParser::get_token(Stream *p_stream, Token &r_token, int &line, Stri
 							case 'r':
 								res = 13;
 								break;
+							case 'U':
 							case 'u': {
-								//hex number
-								for (int j = 0; j < 4; j++) {
+								// Hexadecimal sequence.
+								int hex_len = (next == 'U') ? 6 : 4;
+								for (int j = 0; j < hex_len; j++) {
 									char32_t c = p_stream->get_char();
+
 									if (c == 0) {
 										r_err_str = "Unterminated String";
 										r_token.type = TK_ERROR;
@@ -290,15 +294,49 @@ Error VariantParser::get_token(Stream *p_stream, Token &r_token, int &line, Stri
 							} break;
 						}
 
+						// Parse UTF-16 pair.
+						if ((res & 0xfffffc00) == 0xd800) {
+							if (prev == 0) {
+								prev = res;
+								continue;
+							} else {
+								r_err_str = "Invalid UTF-16 sequence in string, unpaired lead surrogate";
+								r_token.type = TK_ERROR;
+								return ERR_PARSE_ERROR;
+							}
+						} else if ((res & 0xfffffc00) == 0xdc00) {
+							if (prev == 0) {
+								r_err_str = "Invalid UTF-16 sequence in string, unpaired trail surrogate";
+								r_token.type = TK_ERROR;
+								return ERR_PARSE_ERROR;
+							} else {
+								res = (prev << 10UL) + res - ((0xd800 << 10UL) + 0xdc00 - 0x10000);
+								prev = 0;
+							}
+						}
+						if (prev != 0) {
+							r_err_str = "Invalid UTF-16 sequence in string, unpaired lead surrogate";
+							r_token.type = TK_ERROR;
+							return ERR_PARSE_ERROR;
+						}
 						str += res;
-
 					} else {
+						if (prev != 0) {
+							r_err_str = "Invalid UTF-16 sequence in string, unpaired lead surrogate";
+							r_token.type = TK_ERROR;
+							return ERR_PARSE_ERROR;
+						}
 						if (ch == '\n') {
 							line++;
 						}
 						str += ch;
 					}
 				}
+				if (prev != 0) {
+					r_err_str = "Invalid UTF-16 sequence in string, unpaired lead surrogate";
+					r_token.type = TK_ERROR;
+					return ERR_PARSE_ERROR;
+				}
 
 				if (p_stream->is_utf8()) {
 					str.parse_utf8(str.ascii(true).get_data());

+ 63 - 3
modules/gdscript/gdscript_tokenizer.cpp

@@ -786,6 +786,8 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() {
 	}
 
 	String result;
+	char32_t prev = 0;
+	int prev_pos = 0;
 
 	for (;;) {
 		// Consume actual string.
@@ -852,9 +854,11 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() {
 				case '\\':
 					escaped = '\\';
 					break;
-				case 'u':
+				case 'U':
+				case 'u': {
 					// Hexadecimal sequence.
-					for (int i = 0; i < 4; i++) {
+					int hex_len = (code == 'U') ? 6 : 4;
+					for (int j = 0; j < hex_len; j++) {
 						if (_is_at_end()) {
 							return make_error("Unterminated string.");
 						}
@@ -886,7 +890,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() {
 
 						_advance();
 					}
-					break;
+				} break;
 				case '\r':
 					if (_peek() != '\n') {
 						// Carriage return without newline in string. (???)
@@ -909,11 +913,53 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() {
 					valid_escape = false;
 					break;
 			}
+			// Parse UTF-16 pair.
+			if (valid_escape) {
+				if ((escaped & 0xfffffc00) == 0xd800) {
+					if (prev == 0) {
+						prev = escaped;
+						prev_pos = column - 2;
+						continue;
+					} else {
+						Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
+						error.start_column = column - 2;
+						error.leftmost_column = error.start_column;
+						push_error(error);
+						valid_escape = false;
+						prev = 0;
+					}
+				} else if ((escaped & 0xfffffc00) == 0xdc00) {
+					if (prev == 0) {
+						Token error = make_error("Invalid UTF-16 sequence in string, unpaired trail surrogate");
+						error.start_column = column - 2;
+						error.leftmost_column = error.start_column;
+						push_error(error);
+						valid_escape = false;
+					} else {
+						escaped = (prev << 10UL) + escaped - ((0xd800 << 10UL) + 0xdc00 - 0x10000);
+						prev = 0;
+					}
+				}
+				if (prev != 0) {
+					Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
+					error.start_column = prev_pos;
+					error.leftmost_column = error.start_column;
+					push_error(error);
+					prev = 0;
+				}
+			}
 
 			if (valid_escape) {
 				result += escaped;
 			}
 		} else if (ch == quote_char) {
+			if (prev != 0) {
+				Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
+				error.start_column = prev_pos;
+				error.leftmost_column = error.start_column;
+				push_error(error);
+				prev = 0;
+			}
 			_advance();
 			if (is_multiline) {
 				if (_peek() == quote_char && _peek(1) == quote_char) {
@@ -930,6 +976,13 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() {
 				break;
 			}
 		} else {
+			if (prev != 0) {
+				Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
+				error.start_column = prev_pos;
+				error.leftmost_column = error.start_column;
+				push_error(error);
+				prev = 0;
+			}
 			result += ch;
 			_advance();
 			if (ch == '\n') {
@@ -937,6 +990,13 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() {
 			}
 		}
 	}
+	if (prev != 0) {
+		Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
+		error.start_column = prev_pos;
+		error.leftmost_column = error.start_column;
+		push_error(error);
+		prev = 0;
+	}
 
 	// Make the literal.
 	Variant string;

+ 40 - 3
modules/visual_script/visual_script_expression.cpp

@@ -328,6 +328,7 @@ Error VisualScriptExpression::_get_token(Token &r_token) {
 			};
 			case '"': {
 				String str;
+				char32_t prev = 0;
 				while (true) {
 					char32_t ch = GET_CHAR();
 
@@ -364,9 +365,11 @@ Error VisualScriptExpression::_get_token(Token &r_token) {
 							case 'r':
 								res = 13;
 								break;
+							case 'U':
 							case 'u': {
-								// hex number
-								for (int j = 0; j < 4; j++) {
+								// Hexadecimal sequence.
+								int hex_len = (next == 'U') ? 6 : 4;
+								for (int j = 0; j < hex_len; j++) {
 									char32_t c = GET_CHAR();
 
 									if (c == 0) {
@@ -403,12 +406,46 @@ Error VisualScriptExpression::_get_token(Token &r_token) {
 							} break;
 						}
 
+						// Parse UTF-16 pair.
+						if ((res & 0xfffffc00) == 0xd800) {
+							if (prev == 0) {
+								prev = res;
+								continue;
+							} else {
+								_set_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
+								r_token.type = TK_ERROR;
+								return ERR_PARSE_ERROR;
+							}
+						} else if ((res & 0xfffffc00) == 0xdc00) {
+							if (prev == 0) {
+								_set_error("Invalid UTF-16 sequence in string, unpaired trail surrogate");
+								r_token.type = TK_ERROR;
+								return ERR_PARSE_ERROR;
+							} else {
+								res = (prev << 10UL) + res - ((0xd800 << 10UL) + 0xdc00 - 0x10000);
+								prev = 0;
+							}
+						}
+						if (prev != 0) {
+							_set_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
+							r_token.type = TK_ERROR;
+							return ERR_PARSE_ERROR;
+						}
 						str += res;
-
 					} else {
+						if (prev != 0) {
+							_set_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
+							r_token.type = TK_ERROR;
+							return ERR_PARSE_ERROR;
+						}
 						str += ch;
 					}
 				}
+				if (prev != 0) {
+					_set_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
+					r_token.type = TK_ERROR;
+					return ERR_PARSE_ERROR;
+				}
 
 				r_token.type = TK_CONSTANT;
 				r_token.value = str;