Selaa lähdekoodia

[php] improve haxe.xml.Parser performance (#8083)

Alexander Kuzmenko 6 vuotta sitten
vanhempi
commit
7070e6a6b7
2 muutettua tiedostoa jossa 456 lisäystä ja 1 poistoa
  1. 455 0
      std/php/_std/haxe/xml/Parser.hx
  2. 1 1
      tests/unit/src/unit/TestXML.hx

+ 455 - 0
std/php/_std/haxe/xml/Parser.hx

@@ -0,0 +1,455 @@
+/*
+ * Copyright (C)2005-2019 Haxe Foundation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+package haxe.xml;
+
+import php.Global;
+import php.Syntax;
+import php.NativeString;
+
+using haxe.xml.Parser;
+
+/**
+	States of the parsing state machine in `Parser.doParse`.
+	The numeric values are the ones automatic numbering would assign.
+**/
+private enum abstract S(Int) {
+	var IGNORE_SPACES = 0; // skip '\n', '\r', '\t' and ' ', then switch to the pending `next` state
+	var BEGIN = 1;         // start of content: '<' opens a node, anything else starts text
+	var BEGIN_NODE = 2;    // just after '<': dispatch on '!', '?', '/' or a tag name
+	var TAG_NAME = 3;      // reading an element name
+	var BODY = 4;          // inside a start tag: expects '/', '>' or an attribute name
+	var ATTRIB_NAME = 5;   // reading an attribute name
+	var EQUALS = 6;        // expects '=' after an attribute name
+	var ATTVAL_BEGIN = 7;  // expects the quote that opens an attribute value
+	var ATTRIB_VAL = 8;    // reading an attribute value up to the matching quote
+	var CHILDS = 9;        // start tag closed with '>': children are parsed recursively
+	var CLOSE = 10;        // reading the name of a closing tag
+	var WAIT_END = 11;     // after '/' in a start tag: expects '>'
+	var WAIT_END_RET = 12; // after a closing tag name: expects '>' and then returns
+	var PCDATA = 13;       // reading character data
+	var HEADER = 14;       // inside "<? ... ?>"
+	var COMMENT = 15;      // inside "<!-- ... -->"
+	var DOCTYPE = 16;      // inside "<!DOCTYPE ... >"
+	var CDATA = 17;        // inside "<![CDATA[ ... ]]>"
+	var ESCAPE = 18;       // reading an entity reference after '&'
+}
+
+/**
+	Error thrown by `haxe.xml.Parser` when the input cannot be parsed.
+	Besides the message it records where in the input the problem was detected.
+**/
+class XmlParserException
+{
+	/**
+		description of the parsing problem
+	**/
+	public var message:String;
+
+	/**
+		1-based number of the line containing `position`
+	**/
+	public var lineNumber:Int;
+
+	/**
+		offset of `position` within its line; '\r' characters are not counted
+	**/
+	public var positionAtLine:Int;
+
+	/**
+		offset into `xml` at which the problem was detected
+	**/
+	public var position:Int;
+
+	/**
+		the input that failed to parse
+	**/
+	public var xml:String;
+
+	/**
+		Stores the arguments and derives `lineNumber` / `positionAtLine`
+		by scanning `xml` from its beginning up to (not including) `position`.
+	**/
+	public function new(message:String, xml:String, position:Int)
+	{
+		this.message = message;
+		this.xml = xml;
+		this.position = position;
+
+		var line = 1;
+		var column = 0;
+		var source:NativeString = xml;
+		var offset = 0;
+		while (offset < position)
+		{
+			switch (source.fastCodeAt(offset))
+			{
+				case '\n'.code:
+					// a new line starts: reset the column counter
+					line++;
+					column = 0;
+				case '\r'.code:
+					// carriage returns do not advance the column
+				default:
+					column++;
+			}
+			offset++;
+		}
+		lineNumber = line;
+		positionAtLine = column;
+	}
+
+	/**
+		Formats the error as "<class name>: <message> at line <n> char <m>".
+	**/
+	public function toString():String
+	{
+		var className = Type.getClassName(Type.getClass(this));
+		return '$className: $message at line $lineNumber char $positionAtLine';
+	}
+}
+
+class Parser
+{
+	/**
+		Named entities the parser resolves itself, keyed by the name
+		that appears between '&' and ';'.
+	**/
+	static var escapes = [
+		"lt" => "<",
+		"gt" => ">",
+		"amp" => "&",
+		"quot" => '"',
+		"apos" => "'"
+	];
+
+	/**
+		Parses `str` and returns a new XML document node holding the result.
+
+		With `strict` set to true the parser additionally rejects unescaped
+		'<' or '>' inside attribute values, undefined entities and invalid
+		characters inside an entity reference.
+
+		@throws haxe.xml.XmlParserException
+	**/
+	static public function parse(str:String, strict = false)
+	{
+		var root = Xml.createDocument();
+		// doParse() attaches everything it reads to `root`; its return value is not needed here
+		doParse(str, strict, 0, root);
+		return root;
+	}
+
+	static function doParse(str:NativeString, strict:Bool, p:Int = 0, ?parent:Xml):Int // state machine: reads `str` from offset `p`, adds the nodes it finds to `parent`, returns the offset where it stopped; throws XmlParserException on malformed input
+	{
+		var xml:Xml = null; // element created from the most recently read start tag
+		var state = S.BEGIN; // current state of the machine
+		var next = S.BEGIN; // state entered when IGNORE_SPACES meets a non-space character
+		var aname = null; // name of the attribute whose value is being read
+		var start = 0; // offset at which the current token (name, text run, ...) begins
+		var nsubs = 0; // number of children added to `parent` through addChild() during this call
+		var nbrackets = 0; // '[' / ']' nesting depth inside a DOCTYPE
+		var c = str.fastCodeAt(p); // value fastCodeAt() returns for offset `p`
+		var buf:NativeString = ''; // text gathered so far for the current PCDATA node or attribute value
+		// need extra state because next is in use: this is the state ESCAPE hands control back to
+		var escapeNext = S.BEGIN;
+		var attrValQuote = -1; // code of the quote character that opened the current attribute value
+		inline function addChild(xml:Xml) {
+			parent.addChild(xml);
+			nsubs++;
+		}
+		while (!StringTools.isEof(c))
+		{
+			switch(state)
+			{
+				case S.IGNORE_SPACES:
+					switch(c)
+					{
+						case
+							'\n'.code,
+							'\r'.code,
+							'\t'.code,
+							' '.code:
+						default:
+							state = next;
+							continue; // handle this same character in the new state (the advance at the end of the loop is skipped)
+					}
+				case S.BEGIN:
+					switch(c)
+					{
+						case '<'.code:
+							state = S.IGNORE_SPACES;
+							next = S.BEGIN_NODE;
+						default:
+							start = p;
+							state = S.PCDATA;
+							continue; // handle this same character as PCDATA
+					}
+				case S.PCDATA:
+					if (c == '<'.code)
+					{
+						buf = buf.addSub(str, start, p - start); // append the text since `start`
+						var child = Xml.createPCData(buf);
+						buf = '';
+						addChild(child);
+						state = S.IGNORE_SPACES;
+						next = S.BEGIN_NODE;
+					} else if (c == '&'.code) {
+						buf = buf.addSub(str, start, p - start); // keep the text read before the '&'
+						state = S.ESCAPE;
+						escapeNext = S.PCDATA;
+						start = p + 1; // the entity name begins after the '&'
+					}
+				case S.CDATA:
+					if (c == ']'.code && str.fastCodeAt(p + 1) == ']'.code && str.fastCodeAt(p + 2) == '>'.code)
+					{
+						var child = Xml.createCData(str.substr(start, p - start));
+						addChild(child);
+						p += 2; // move to the '>' of "]]>"
+						state = S.BEGIN;
+					}
+				case S.BEGIN_NODE:
+					switch(c)
+					{
+						case '!'.code:
+							if (str.fastCodeAt(p + 1) == '['.code)
+							{
+								p += 2; // skip "!["
+								if (Global.strtoupper(str.substr(p, 6)) != "CDATA[") // case-insensitive match
+									throw new XmlParserException("Expected <![CDATA[", str, p);
+								p += 5; // move to the '[' that ends "CDATA["
+								state = S.CDATA;
+								start = p + 1;
+							}
+							else if (str.fastCodeAt(p + 1) == 'D'.code || str.fastCodeAt(p + 1) == 'd'.code)
+							{
+								if(Global.strtoupper(str.substr(p + 2, 6)) != "OCTYPE") // case-insensitive match
+									throw new XmlParserException("Expected <!DOCTYPE", str, p);
+								p += 8; // move to the character that follows "!DOCTYPE"
+								state = S.DOCTYPE;
+								start = p + 1;
+							}
+							else if( str.fastCodeAt(p + 1) != '-'.code || str.fastCodeAt(p + 2) != '-'.code )
+								throw new XmlParserException("Expected <!--", str, p);
+							else
+							{
+								p += 2; // move to the second '-' of "<!--"
+								state = S.COMMENT;
+								start = p + 1;
+							}
+						case '?'.code:
+							state = S.HEADER;
+							start = p; // offset of the '?'
+						case '/'.code:
+							if( parent == null )
+								throw new XmlParserException("Expected node name", str, p);
+							start = p + 1;
+							state = S.IGNORE_SPACES;
+							next = S.CLOSE;
+						default:
+							state = S.TAG_NAME;
+							start = p;
+							continue; // handle this same character as the first character of the tag name
+					}
+				case S.TAG_NAME:
+					if (!isValidChar(c)) // first character that cannot belong to a name ends it
+					{
+						if( p == start )
+							throw new XmlParserException("Expected node name", str, p);
+						xml = Xml.createElement(str.substr(start, p - start));
+						addChild(xml);
+						state = S.IGNORE_SPACES;
+						next = S.BODY;
+						continue; // the terminating character still has to be handled
+					}
+				case S.BODY:
+					switch(c)
+					{
+						case '/'.code:
+							state = S.WAIT_END;
+						case '>'.code:
+							state = S.CHILDS;
+						default:
+							state = S.ATTRIB_NAME;
+							start = p;
+							continue; // handle this same character as the first character of an attribute name
+					}
+				case S.ATTRIB_NAME:
+					if (!isValidChar(c)) // first character that cannot belong to a name ends it
+					{
+						var tmp;
+						if( start == p )
+							throw new XmlParserException("Expected attribute name", str, p);
+						tmp = str.substr(start,p-start);
+						aname = tmp;
+						if( xml.exists(aname) )
+							throw new XmlParserException("Duplicate attribute [" + aname + "]", str, p);
+						state = S.IGNORE_SPACES;
+						next = S.EQUALS;
+						continue; // the terminating character still has to be handled
+					}
+				case S.EQUALS:
+					switch(c)
+					{
+						case '='.code:
+							state = S.IGNORE_SPACES;
+							next = S.ATTVAL_BEGIN;
+						default:
+							throw new XmlParserException("Expected =", str, p);
+					}
+				case S.ATTVAL_BEGIN:
+					switch(c)
+					{
+						case '"'.code | '\''.code:
+							buf = '';
+							state = S.ATTRIB_VAL;
+							start = p + 1;
+							attrValQuote = c; // the value ends at the same kind of quote
+						default:
+							throw new XmlParserException("Expected \"", str, p);
+					}
+				case S.ATTRIB_VAL:
+					switch (c) {
+						case '&'.code:
+							buf = buf.addSub(str, start, p - start); // keep the text read before the '&'
+							state = S.ESCAPE;
+							escapeNext = S.ATTRIB_VAL;
+							start = p + 1;
+						case '>'.code | '<'.code if( strict ):
+							// HTML allows these in attributes values, so they are rejected in strict mode only
+							throw new XmlParserException("Invalid unescaped " + String.fromCharCode(c) + " in attribute value", str, p);
+						case _ if (c == attrValQuote):
+							buf = buf.addSub(str, start, p - start);
+							var val = buf;
+							buf = '';
+							xml.set(aname, val);
+							state = S.IGNORE_SPACES;
+							next = S.BODY;
+					}
+				case S.CHILDS:
+					p = doParse(str, strict, p, xml); // recursive call adds the children to `xml` and returns where it stopped
+					start = p;
+					state = S.BEGIN;
+				case S.WAIT_END:
+					switch(c)
+					{
+						case '>'.code:
+							state = S.BEGIN;
+						default :
+							throw new XmlParserException("Expected >", str, p);
+					}
+				case S.WAIT_END_RET:
+					switch(c)
+					{
+						case '>'.code:
+							if( nsubs == 0 )
+								parent.addChild(Xml.createPCData("")); // an element without content gets one empty text child
+							return p; // offset of the '>' that ends the closing tag
+						default :
+							throw new XmlParserException("Expected >", str, p);
+					}
+				case S.CLOSE:
+					if (!isValidChar(c)) // first character that cannot belong to a name ends it
+					{
+						if( start == p )
+							throw new XmlParserException("Expected node name", str, p);
+
+						var v = str.substr(start,p - start);
+						if (parent == null || parent.nodeType != Element) {
+							throw new XmlParserException('Unexpected </$v>, tag is not open', str, p);
+						}
+						if (v != parent.nodeName)
+							throw new XmlParserException("Expected </" +parent.nodeName + ">", str, p);
+
+						state = S.IGNORE_SPACES;
+						next = S.WAIT_END_RET;
+						continue; // the terminating character still has to be handled
+					}
+				case S.COMMENT:
+					if (c == '-'.code && str.fastCodeAt(p +1) == '-'.code && str.fastCodeAt(p + 2) == '>'.code)
+					{
+						addChild(Xml.createComment(str.substr(start, p - start)));
+						p += 2; // move to the '>' of "-->"
+						state = S.BEGIN;
+					}
+				case S.DOCTYPE:
+					if(c == '['.code)
+						nbrackets++;
+					else if(c == ']'.code)
+						nbrackets--;
+					else if (c == '>'.code && nbrackets == 0) // a '>' between brackets does not end the DOCTYPE
+					{
+						addChild(Xml.createDocType(str.substr(start, p - start)));
+						state = S.BEGIN;
+					}
+				case S.HEADER:
+					if (c == '?'.code && str.fastCodeAt(p + 1) == '>'.code)
+					{
+						p++; // move to the '>' of "?>"
+						var str = str.substr(start + 1, p - start - 2); // text between the opening '?' and the closing "?>"; shadows the `str` argument inside this block
+						addChild(Xml.createProcessingInstruction(str));
+						state = S.BEGIN;
+					}
+				case S.ESCAPE:
+					if (c == ';'.code)
+					{
+						var s = str.substr(start, p - start); // text between '&' and ';'
+						if (s.fastCodeAt(0) == '#'.code) {
+							var c = s.fastCodeAt(1) == 'x'.code
+								? Std.parseInt("0" +s.substr(1, Global.strlen(s) - 1)) // "#x41" is parsed as "0x41"
+								: Std.parseInt(s.substr(1, Global.strlen(s) - 1)); // "#65" is parsed as "65"
+							buf = Syntax.concat(buf, Global.mb_chr(c));
+						} else if (!escapes.exists(s)) {
+							if( strict )
+								throw new XmlParserException("Undefined entity: " + s, str, p);
+							buf = Syntax.concat(buf, '&$s;'); // non-strict: keep the unknown entity as written
+						} else {
+							buf = Syntax.concat(buf, escapes.get(s));
+						}
+						start = p + 1;
+						state = escapeNext;
+					} else if (!isValidChar(c) && c != "#".code) {
+						if( strict )
+							throw new XmlParserException("Invalid character in entity: " + String.fromCharCode(c), str, p);
+						buf = Syntax.concat(buf, "&"); // non-strict: keep the '&' and what followed it as plain text
+						buf = buf.addSub(str, start, p - start);
+						p--; // step back so the advance below lands on this same character again
+						start = p + 1;
+						state = escapeNext;
+					}
+			}
+			c = str.fastCodeAt(++p); // advance to the next character
+		}
+
+		if (state == S.BEGIN) // input ended between nodes: continue as for an empty text run
+		{
+			start = p;
+			state = S.PCDATA;
+		}
+
+		if (state == S.PCDATA)
+		{
+			if (parent.nodeType == Element) { // input ended while an element was still open
+				throw new XmlParserException("Unclosed node <" + parent.nodeName + ">", str, p);
+			}
+			if (p != start || nsubs == 0) { // trailing text exists, or nothing was added at all
+				buf = buf.addSub(str, start, p-start);
+				addChild(Xml.createPCData(buf));
+			}
+			return p;
+		}
+
+		if( !strict && state == S.ESCAPE && escapeNext == S.PCDATA ) { // input ended inside an entity within text: keep it as plain text
+			buf = Syntax.concat(buf, "&");
+			buf = buf.addSub(str, start, p - start);
+			addChild(Xml.createPCData(buf));
+			return p;
+		}
+
+		throw new XmlParserException("Unexpected end", str, p); // input ended in any other state
+	}
+
+	/**
+		Tells whether the character code `c` may appear in a tag, attribute or entity name:
+		ASCII letters, ASCII digits, ':', '.', '_' and '-'.
+	**/
+	static inline function isValidChar(c) {
+		return
+			if (c >= 'a'.code && c <= 'z'.code) true
+			else if (c >= 'A'.code && c <= 'Z'.code) true
+			else if (c >= '0'.code && c <= '9'.code) true
+			else switch (c) {
+				case ':'.code | '.'.code | '_'.code | '-'.code: true;
+				default: false;
+			};
+	}
+
+// TODO: rewrite the parser using a buffer instead of a string as the data source
+	/**
+		Returns `Global.ord` of the element of `s` at index `pos`,
+		or 0 when `pos` is at or beyond `Global.strlen(s)`.
+	**/
+	@:allow(haxe.xml.XmlParserException)
+	static inline function fastCodeAt(s:NativeString, pos:Int):Int {
+		return pos < Global.strlen(s) ? Global.ord(s[pos]) : 0;
+	}
+
+	/**
+		Returns the part of `s` that starts at `pos` and spans `length` units,
+		or everything from `pos` to the end of `s` when `length` is omitted.
+
+		The omitted case uses the two-argument call on purpose: PHP versions
+		before 8.0 treat an explicit `null` length as 0 and return an empty string.
+	**/
+	static inline function substr(s:NativeString, pos:Int, ?length:Int):NativeString {
+		return length == null ? Global.substr(s, pos) : Global.substr(s, pos, length);
+	}
+
+	/**
+		Returns `buf` followed by the `length` units of `s` that start at `pos`.
+		Neither argument is modified; the caller stores the result.
+	**/
+	static inline function addSub(buf:NativeString, s:NativeString, pos:Int, length:Int):NativeString {
+		var slice = Global.substr(s, pos, length);
+		return Syntax.concat(buf, slice);
+	}
+}

+ 1 - 1
tests/unit/src/unit/TestXML.hx

@@ -164,7 +164,7 @@ class TestXML extends Test {
 		var values = ['<', '>', '"', '&', "'", '&euro;', '@', "ô", String.fromCharCode(0x3F), "ÿ"];
 
 		for( i in 0...entities.length) {
-			eq( haxe.xml.Parser.parse(entities[i], false).firstChild().nodeValue, values[i] );
+			eq( values[i], haxe.xml.Parser.parse(entities[i], false).firstChild().nodeValue );
 		}
 
 		var s = "<a>&gt;<b>&lt;</b>&lt;&gt;<b>&gt;&lt;</b>\"</a>";