Browse Source

Merge pull request #47978 from HenryWConklin/3.x-xml-numeric-entities

Add support for numeric XML entities to XMLParser
Rémi Verschelde 4 years ago
parent
commit
1cd14e38cb

+ 3 - 66
core/io/xml_parser.cpp

@@ -36,63 +36,6 @@
 
 VARIANT_ENUM_CAST(XMLParser::NodeType);
 
-static bool _equalsn(const CharType *str1, const CharType *str2, int len) {
-	int i;
-	for (i = 0; i < len && str1[i] && str2[i]; ++i) {
-		if (str1[i] != str2[i]) {
-			return false;
-		}
-	}
-
-	// if one (or both) of the strings was smaller then they
-	// are only equal if they have the same length
-	return (i == len) || (str1[i] == 0 && str2[i] == 0);
-}
-
-String XMLParser::_replace_special_characters(const String &origstr) {
-	int pos = origstr.find("&");
-	int oldPos = 0;
-
-	if (pos == -1) {
-		return origstr;
-	}
-
-	String newstr;
-
-	while (pos != -1 && pos < origstr.length() - 2) {
-		// check if it is one of the special characters
-
-		int specialChar = -1;
-		for (int i = 0; i < (int)special_characters.size(); ++i) {
-			const CharType *p = &origstr[pos] + 1;
-
-			if (_equalsn(&special_characters[i][1], p, special_characters[i].length() - 1)) {
-				specialChar = i;
-				break;
-			}
-		}
-
-		if (specialChar != -1) {
-			newstr += (origstr.substr(oldPos, pos - oldPos));
-			newstr += (special_characters[specialChar][0]);
-			pos += special_characters[specialChar].length();
-		} else {
-			newstr += (origstr.substr(oldPos, pos - oldPos + 1));
-			pos += 1;
-		}
-
-		// find next &
-		oldPos = pos;
-		pos = origstr.find("&", pos);
-	}
-
-	if (oldPos < origstr.length() - 1) {
-		newstr += (origstr.substr(oldPos, origstr.length() - oldPos));
-	}
-
-	return newstr;
-}
-
 static inline bool _is_white_space(char c) {
 	return (c == ' ' || c == '\t' || c == '\n' || c == '\r');
 }
@@ -116,7 +59,7 @@ bool XMLParser::_set_text(char *start, char *end) {
 
 	// set current text to the parsed text, and replace xml special characters
 	String s = String::utf8(start, (int)(end - start));
-	node_name = _replace_special_characters(s);
+	node_name = s.xml_unescape();
 
 	// current XML node type is text
 	node_type = NODE_TEXT;
@@ -321,7 +264,7 @@ void XMLParser::_parse_opening_xml_element() {
 				String s = String::utf8(attributeValueBegin,
 						(int)(attributeValueEnd - attributeValueBegin));
 
-				attr.value = _replace_special_characters(s);
+				attr.value = s.xml_unescape();
 				attributes.push_back(attr);
 			} else {
 				// tag is closed directly
@@ -579,14 +522,8 @@ int XMLParser::get_current_line() const {
 }
 
 XMLParser::XMLParser() {
-	data = nullptr;
-	close();
-	special_characters.push_back("&amp;");
-	special_characters.push_back("<lt;");
-	special_characters.push_back(">gt;");
-	special_characters.push_back("\"quot;");
-	special_characters.push_back("'apos;");
 }
+
 XMLParser::~XMLParser() {
 	if (data) {
 		memdelete_arr(data);

+ 6 - 9
core/io/xml_parser.h

@@ -65,15 +65,13 @@ public:
 	};
 
 private:
-	char *data;
-	char *P;
-	uint64_t length;
-	void unescape(String &p_str);
-	Vector<String> special_characters;
+	char *data = nullptr;
+	char *P = nullptr;
+	uint64_t length = 0;
 	String node_name;
-	bool node_empty;
-	NodeType node_type;
-	uint64_t node_offset;
+	bool node_empty = false;
+	NodeType node_type = NODE_NONE;
+	uint64_t node_offset = 0;
 
 	struct Attribute {
 		String name;
@@ -82,7 +80,6 @@ private:
 
 	Vector<Attribute> attributes;
 
-	String _replace_special_characters(const String &origstr);
 	bool _set_text(char *start, char *end);
 	void _parse_closing_xml_element();
 	void _ignore_definition();

+ 46 - 17
core/ustring.cpp

@@ -3549,29 +3549,58 @@ static _FORCE_INLINE_ int _xml_unescape(const CharType *p_src, int p_src_len, Ch
 
 			if (p_src_len >= 4 && p_src[1] == '#') {
 				CharType c = 0;
-
-				for (int i = 2; i < p_src_len; i++) {
-					eat = i + 1;
-					CharType ct = p_src[i];
-					if (ct == ';') {
-						break;
-					} else if (ct >= '0' && ct <= '9') {
-						ct = ct - '0';
-					} else if (ct >= 'a' && ct <= 'f') {
-						ct = (ct - 'a') + 10;
-					} else if (ct >= 'A' && ct <= 'F') {
-						ct = (ct - 'A') + 10;
-					} else {
-						continue;
+				bool overflow = false;
+				if (p_src[2] == 'x') {
+					// Hex entity &#x<num>;
+					for (int i = 3; i < p_src_len; i++) {
+						eat = i + 1;
+						CharType ct = p_src[i];
+						if (ct == ';') {
+							break;
+						} else if (ct >= '0' && ct <= '9') {
+							ct = ct - '0';
+						} else if (ct >= 'a' && ct <= 'f') {
+							ct = (ct - 'a') + 10;
+						} else if (ct >= 'A' && ct <= 'F') {
+							ct = (ct - 'A') + 10;
+						} else {
+							break;
+						}
+						if (c > (WCHAR_MAX >> 4)) {
+							overflow = true;
+							break;
+						}
+						c <<= 4;
+						c |= ct;
+					}
+				} else {
+					// Decimal entity &#<num>;
+					for (int i = 2; i < p_src_len; i++) {
+						eat = i + 1;
+						CharType ct = p_src[i];
+						if (ct == ';' || ct < '0' || ct > '9') {
+							break;
+						}
+					}
+					if (p_src[eat - 1] == ';') {
+						int64_t val = String::to_int(p_src + 2, eat - 3);
+						if (val > 0 && val <= WCHAR_MAX) {
+							c = (CharType)val;
+						} else {
+							overflow = true;
+						}
 					}
-					c <<= 4;
-					c |= ct;
 				}
 
+				// Value must be non-zero, in the range of char32_t,
+				// actually end with ';'. If invalid, leave the entity as-is
+				if (c == '\0' || overflow || p_src[eat - 1] != ';') {
+					eat = 1;
+					c = *p_src;
+				}
 				if (p_dst) {
 					*p_dst = c;
 				}
-
 			} else if (p_src_len >= 4 && p_src[1] == 'g' && p_src[2] == 't' && p_src[3] == ';') {
 				if (p_dst) {
 					*p_dst = '>';

+ 6 - 0
main/tests/test_main.cpp

@@ -47,6 +47,7 @@
 #include "test_render.h"
 #include "test_shader_lang.h"
 #include "test_string.h"
+#include "test_xml_parser.h"
 
 const char **tests_get_names() {
 	static const char *test_names[] = {
@@ -65,6 +66,7 @@ const char **tests_get_names() {
 		"gd_bytecode",
 		"ordered_hash_map",
 		"astar",
+		"xml_parser",
 		nullptr
 	};
 
@@ -138,6 +140,10 @@ MainLoop *test_main(String p_test, const List<String> &p_args) {
 		return TestAStar::test();
 	}
 
+	if (p_test == "xml_parser") {
+		return TestXMLParser::test();
+	}
+
 	print_line("Unknown test: " + p_test);
 	return nullptr;
 }

+ 40 - 0
main/tests/test_string.cpp

@@ -1123,6 +1123,45 @@ bool test_35() {
 	return state;
 }
 
+bool test_36() {
+#define CHECK(X)                                          \
+	if (!(X)) {                                           \
+		OS::get_singleton()->print("\tFAIL at %s\n", #X); \
+		return false;                                     \
+	} else {                                              \
+		OS::get_singleton()->print("\tPASS\n");           \
+	}
+	OS::get_singleton()->print("\n\nTest 36: xml unescape\n");
+	// Named entities
+	String input = "&quot;&amp;&apos;&lt;&gt;";
+	CHECK(input.xml_unescape() == "\"&\'<>");
+
+	// Numeric entities
+	input = "&#x41;&#66;";
+	CHECK(input.xml_unescape() == "AB");
+
+	input = "&#0;&x#0;More text";
+	String result = input.xml_unescape();
+	// Didn't put in a leading NUL and terminate the string
+	CHECK(input.length() > 0);
+	CHECK(input[0] != '\0');
+	// Entity should be left as-is if invalid
+	CHECK(input.xml_unescape() == input);
+
+	// Shouldn't consume without ending in a ';'
+	input = "&#66";
+	CHECK(input.xml_unescape() == input);
+	input = "&#x41";
+	CHECK(input.xml_unescape() == input);
+
+	// Invalid characters should make the entity ignored
+	input = "&#x41SomeIrrelevantText;";
+	CHECK(input.xml_unescape() == input);
+	input = "&#66SomeIrrelevantText;";
+	CHECK(input.xml_unescape() == input);
+	return true;
+}
+
 typedef bool (*TestFunc)();
 
 TestFunc test_funcs[] = {
@@ -1162,6 +1201,7 @@ TestFunc test_funcs[] = {
 	test_33,
 	test_34,
 	test_35,
+	test_36,
 	nullptr
 
 };

+ 109 - 0
main/tests/test_xml_parser.cpp

@@ -0,0 +1,109 @@
+/*************************************************************************/
+/*  test_xml_parser.cpp                                                  */
+/*************************************************************************/
+/*                       This file is part of:                           */
+/*                           GODOT ENGINE                                */
+/*                      https://godotengine.org                          */
+/*************************************************************************/
+/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur.                 */
+/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md).   */
+/*                                                                       */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the       */
+/* "Software"), to deal in the Software without restriction, including   */
+/* without limitation the rights to use, copy, modify, merge, publish,   */
+/* distribute, sublicense, and/or sell copies of the Software, and to    */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions:                                             */
+/*                                                                       */
+/* The above copyright notice and this permission notice shall be        */
+/* included in all copies or substantial portions of the Software.       */
+/*                                                                       */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
+/*************************************************************************/
+
+#include "test_xml_parser.h"
+
+#include "core/os/os.h"
+
+namespace TestXMLParser {
+#define CHECK(X)                                          \
+	if (!(X)) {                                           \
+		OS::get_singleton()->print("\tFAIL at %s\n", #X); \
+		return false;                                     \
+	} else {                                              \
+		OS::get_singleton()->print("\tPASS\n");           \
+	}
+bool test_1() {
+	String source = "<?xml version = \"1.0\" encoding=\"UTF-8\" ?>\
+<top attr=\"attr value\">\
+  Text&lt;&#65;&#x42;&gt;\
+</top>";
+	Vector<uint8_t> buff;
+	for (int i = 0; i < source.length(); i++) {
+		buff.push_back(source.get(i));
+	}
+	XMLParser parser;
+	parser.open_buffer(buff);
+
+	// <?xml ...?> gets parsed as NODE_UNKNOWN
+	CHECK(parser.read() == OK);
+	CHECK(parser.get_node_type() == XMLParser::NodeType::NODE_UNKNOWN);
+
+	CHECK(parser.read() == OK);
+	CHECK(parser.get_node_type() == XMLParser::NodeType::NODE_ELEMENT);
+	CHECK(parser.get_node_name() == "top");
+	CHECK(parser.has_attribute("attr"));
+	CHECK(parser.get_attribute_value("attr") == "attr value");
+
+	CHECK(parser.read() == OK);
+	CHECK(parser.get_node_type() == XMLParser::NodeType::NODE_TEXT);
+	CHECK(parser.get_node_data().lstrip(" \t") == "Text<AB>");
+
+	CHECK(parser.read() == OK);
+	CHECK(parser.get_node_type() == XMLParser::NodeType::NODE_ELEMENT_END);
+	CHECK(parser.get_node_name() == "top");
+
+	parser.close();
+	return true;
+}
+
+typedef bool (*TestFunc)();
+TestFunc test_funcs[] = {
+	test_1,
+	nullptr
+};
+
+MainLoop *test() {
+	int count = 0;
+	int passed = 0;
+
+	while (true) {
+		if (!test_funcs[count]) {
+			break;
+		}
+		bool pass = test_funcs[count]();
+		if (pass) {
+			passed++;
+		}
+		OS::get_singleton()->print("\t%s\n", pass ? "PASS" : "FAILED");
+
+		count++;
+	}
+
+	OS::get_singleton()->print("\n\n\n");
+	OS::get_singleton()->print("*************\n");
+	OS::get_singleton()->print("***TOTALS!***\n");
+	OS::get_singleton()->print("*************\n");
+
+	OS::get_singleton()->print("Passed %i of %i tests\n", passed, count);
+
+	return nullptr;
+}
+} // namespace TestXMLParser

+ 45 - 0
main/tests/test_xml_parser.h

@@ -0,0 +1,45 @@
+/*************************************************************************/
+/*  test_xml_parser.h                                                    */
+/*************************************************************************/
+/*                       This file is part of:                           */
+/*                           GODOT ENGINE                                */
+/*                      https://godotengine.org                          */
+/*************************************************************************/
+/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur.                 */
+/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md).   */
+/*                                                                       */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the       */
+/* "Software"), to deal in the Software without restriction, including   */
+/* without limitation the rights to use, copy, modify, merge, publish,   */
+/* distribute, sublicense, and/or sell copies of the Software, and to    */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions:                                             */
+/*                                                                       */
+/* The above copyright notice and this permission notice shall be        */
+/* included in all copies or substantial portions of the Software.       */
+/*                                                                       */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
+/*************************************************************************/
+
+#ifndef TEST_XML_PARSER_H
+#define TEST_XML_PARSER_H
+
+#include <inttypes.h>
+
+#include "core/io/xml_parser.h"
+#include "core/os/main_loop.h"
+#include "core/ustring.h"
+#include "core/vector.h"
+
+namespace TestXMLParser {
+
+MainLoop *test();
+}
+#endif // TEST_XML_PARSER_H