Explorar o código

use new haxe.xml.Parser for JS : faster, not based on Regexp but state machine

Nicolas Cannasse hai 13 anos
pai
achega
4839d4a651
Modificáronse 3 ficheiros con 286 adicións e 111 borrados
  1. 1 0
      doc/CHANGES.txt
  2. 284 0
      std/haxe/xml/Parser.hx
  3. 1 111
      std/js/_std/Xml.hx

+ 1 - 0
doc/CHANGES.txt

@@ -8,6 +8,7 @@
 	js : forbid static 'length' (issue since object is a Function)
 	all : does not allow overriding var/prop
 	flash : removed wrapping for Xml nodes, use instead specific compare when comparing two typed nodes
+	js : use new haxe.xml.Parser (faster, not based on Regexp)
 
 2012-04-14: 2.09
 	all : optimized const == const and const != const (with different const types)

+ 284 - 0
std/haxe/xml/Parser.hx

@@ -0,0 +1,284 @@
+package haxe.xml;
+
+using StringTools;
+
+/*
+ * Parser state identifiers, declared as inline integer constants instead of
+ * a real enum: the values are inlined at each use site, which keeps the
+ * generated code smaller and slightly faster.
+ */
+extern private class S {
+	public static inline var IGNORE_SPACES = 0;
+	public static inline var BEGIN = 1;
+	public static inline var BEGIN_NODE = 2;
+	public static inline var TAG_NAME = 3;
+	public static inline var BODY = 4;
+	public static inline var ATTRIB_NAME = 5;
+	public static inline var EQUALS = 6;
+	public static inline var ATTVAL_BEGIN = 7;
+	public static inline var ATTRIB_VAL = 8;
+	public static inline var CHILDS = 9;
+	public static inline var CLOSE = 10;
+	public static inline var WAIT_END = 11;
+	public static inline var WAIT_END_RET = 12;
+	public static inline var PCDATA = 13;
+	public static inline var HEADER = 14;
+	public static inline var COMMENT = 15;
+	public static inline var DOCTYPE = 16;
+	public static inline var CDATA = 17;
+}
+
+class Parser
+{
+	/**
+	 * Parses an XML string and returns a new document node holding
+	 * everything that was parsed from `str`.
+	 */
+	static public function parse(str:String)
+	{
+		var root = Xml.createDocument();
+		// Start at offset 0 with the fresh document as the parent of all top-level nodes.
+		doParse(str, 0, root);
+		return root;
+	}
+	
+	/**
+	 * Character-driven state machine that parses `str` from index `p` and
+	 * appends every node it finds to `parent`.
+	 *
+	 * The function calls itself for the children of each opened element and
+	 * returns when the closing tag of `parent` has been consumed, or when the
+	 * end of the input is reached.
+	 *
+	 * @param str    the complete XML source text
+	 * @param p      index of the first character to examine
+	 * @param parent node receiving the parsed children
+	 * @return the index of the last consumed character: the '>' of the closing
+	 *         tag of `parent`, or the end-of-input position
+	 * @throws String describing the problem when the input is malformed,
+	 *         including an element that is still open at the end of the input
+	 */
+	static function doParse(str:String, ?p:Int = 0, ?parent:Xml):Int
+	{
+		var xml:Xml = null;
+		var state = S.BEGIN;
+		// state to enter once IGNORE_SPACES has skipped the whitespace
+		var next = S.BEGIN;
+		var aname = null;
+		// start index of the token currently being scanned
+		var start = 0;
+		// number of child elements / text / CDATA nodes added to `parent` by this call
+		var nsubs = 0;
+		// nesting depth of '[' ... ']' inside a DOCTYPE declaration
+		var nbrackets = 0;
+		var c = str.fastCodeAt(p);
+
+		while (!c.isEOF())
+		{
+			switch(state)
+			{
+				case S.IGNORE_SPACES:
+					switch(c)
+					{
+						case
+							'\n'.code,
+							'\r'.code,
+							'\t'.code,
+							' '.code:
+						default:
+							// re-examine the same character in the pending state
+							state = next;
+							continue;
+					}
+				case S.BEGIN:
+					switch(c)
+					{
+						case '<'.code:
+							state = S.IGNORE_SPACES;
+							next = S.BEGIN_NODE;
+						default:
+							start = p;
+							state = S.PCDATA;
+							continue;
+					}
+				case S.PCDATA:
+					if (c == '<'.code)
+					{
+						var child = Xml.createPCData(str.substr(start, p - start));
+						parent.addChild(child);
+						nsubs++;
+						state = S.IGNORE_SPACES;
+						next = S.BEGIN_NODE;
+					}
+				case S.CDATA:
+					if (c == ']'.code && str.fastCodeAt(p + 1) == ']'.code && str.fastCodeAt(p + 2) == '>'.code)
+					{
+						var child = Xml.createCData(str.substr(start, p - start));
+						parent.addChild(child);
+						nsubs++;
+						// skip the remaining "]>" of the terminator
+						p += 2;
+						state = S.BEGIN;
+					}
+				case S.BEGIN_NODE:
+					switch(c)
+					{
+						case '!'.code:
+							if (str.fastCodeAt(p + 1) == '['.code)
+							{
+								p += 2;
+								if (str.substr(p, 6).toUpperCase() != "CDATA[")
+									throw("Expected <![CDATA[");
+								p += 5;
+								state = S.CDATA;
+								start = p + 1;
+							}
+							else if (str.fastCodeAt(p + 1) == 'D'.code || str.fastCodeAt(p + 1) == 'd'.code)
+							{
+								if(str.substr(p + 2, 6).toUpperCase() != "OCTYPE")
+									throw("Expected <!DOCTYPE");
+								p += 8;
+								state = S.DOCTYPE;
+								start = p + 1;
+							}
+							else if( str.fastCodeAt(p + 1) != '-'.code || str.fastCodeAt(p + 2) != '-'.code )
+								throw("Expected <!--");
+							else
+							{
+								p += 2;
+								state = S.COMMENT;
+								start = p + 1;
+							}
+						case '?'.code:
+							state = S.HEADER;
+							start = p;
+						case '/'.code:
+							if( parent == null )
+								throw("Expected node name");
+							start = p + 1;
+							state = S.IGNORE_SPACES;
+							next = S.CLOSE;
+						default:
+							state = S.TAG_NAME;
+							start = p;
+							continue;
+					}
+				case S.TAG_NAME:
+					if (!isValidChar(c))
+					{
+						if( p == start )
+							throw("Expected node name");
+						xml = Xml.createElement(str.substr(start, p - start));
+						parent.addChild(xml);
+						state = S.IGNORE_SPACES;
+						next = S.BODY;
+						continue;
+					}
+				case S.BODY:
+					switch(c)
+					{
+						case '/'.code:
+							state = S.WAIT_END;
+							nsubs++;
+						case '>'.code:
+							state = S.CHILDS;
+							nsubs++;
+						default:
+							state = S.ATTRIB_NAME;
+							start = p;
+							continue;
+					}
+				case S.ATTRIB_NAME:
+					if (!isValidChar(c))
+					{
+						if( start == p )
+							throw("Expected attribute name");
+						aname = str.substr(start, p - start);
+						if( xml.exists(aname) )
+							throw("Duplicate attribute");
+						state = S.IGNORE_SPACES;
+						next = S.EQUALS;
+						continue;
+					}
+				case S.EQUALS:
+					switch(c)
+					{
+						case '='.code:
+							state = S.IGNORE_SPACES;
+							next = S.ATTVAL_BEGIN;
+						default:
+							throw("Expected =");
+					}
+				case S.ATTVAL_BEGIN:
+					switch(c)
+					{
+						case '"'.code, '\''.code:
+							state = S.ATTRIB_VAL;
+							// remember the opening quote so the matching one ends the value
+							start = p;
+						default:
+							throw("Expected \"");
+					}
+				case S.ATTRIB_VAL:
+					if (c == str.fastCodeAt(start))
+					{
+						var val = str.substr(start+1,p-start-1);
+						xml.set(aname, val);
+						state = S.IGNORE_SPACES;
+						next = S.BODY;
+					}
+				case S.CHILDS:
+					// parse the content of the element that was just opened
+					p = doParse(str, p, xml);
+					start = p;
+					state = S.BEGIN;
+				case S.WAIT_END:
+					switch(c)
+					{
+						case '>'.code:
+							state = S.BEGIN;
+						default :
+							throw("Expected >");
+					}
+				case S.WAIT_END_RET:
+					switch(c)
+					{
+						case '>'.code:
+							// an element closed without any content gets one empty text child
+							if( nsubs == 0 )
+								parent.addChild(Xml.createPCData(""));
+							return p;
+						default :
+							throw("Expected >");
+					}
+				case S.CLOSE:
+					if (!isValidChar(c))
+					{
+						if( start == p )
+							throw("Expected node name");
+
+						var v = str.substr(start,p - start);
+						if (v != parent.nodeName)
+							throw "Expected </" +parent.nodeName + ">";
+
+						state = S.IGNORE_SPACES;
+						next = S.WAIT_END_RET;
+						continue;
+					}
+				case S.COMMENT:
+					if (c == '-'.code && str.fastCodeAt(p +1) == '-'.code && str.fastCodeAt(p + 2) == '>'.code)
+					{
+						parent.addChild(Xml.createComment(str.substr(start, p - start)));
+						// skip the remaining "->" of the terminator
+						p += 2;
+						state = S.BEGIN;
+					}
+				case S.DOCTYPE:
+					if(c == '['.code)
+						nbrackets++;
+					else if(c == ']'.code)
+						nbrackets--;
+					else if (c == '>'.code && nbrackets == 0)
+					{
+						parent.addChild(Xml.createDocType(str.substr(start, p - start)));
+						state = S.BEGIN;
+					}
+				case S.HEADER:
+					if (c == '?'.code && str.fastCodeAt(p + 1) == '>'.code)
+					{
+						p++;
+						// text between "<?" and "?>"
+						var prolog = str.substr(start + 1, p - start - 2);
+						parent.addChild(Xml.createProlog(prolog));
+						state = S.BEGIN;
+					}
+			}
+			c = str.fastCodeAt(++p);
+		}
+
+		if (state == S.BEGIN)
+		{
+			start = p;
+			state = S.PCDATA;
+		}
+
+		if (state == S.PCDATA)
+		{
+			// Reaching the end of the input while still inside an element means
+			// its closing tag is missing; report it instead of accepting the input.
+			if (parent != null && parent.nodeType == Xml.Element)
+				throw "Unclosed node <" + parent.nodeName + ">";
+			if (p != start || nsubs == 0)
+				parent.addChild(Xml.createPCData(str.substr(start, p - start)));
+			return p;
+		}
+
+		throw "Unexpected end";
+	}
+	
+	/**
+	 * Tells whether character code `c` may appear in a node or attribute name:
+	 * an ASCII letter, an ASCII digit, or one of ':', '.', '_' and '-'.
+	 */
+	static inline function isValidChar(c) {
+		return (c >= '0'.code && c <= '9'.code)
+			|| (c >= 'A'.code && c <= 'Z'.code)
+			|| (c >= 'a'.code && c <= 'z'.code)
+			|| c == '_'.code
+			|| c == '-'.code
+			|| c == '.'.code
+			|| c == ':'.code;
+	}
+}

+ 1 - 111
std/js/_std/Xml.hx

@@ -36,20 +36,6 @@ enum XmlType {
 	public static var Prolog(default,null) : XmlType;
 	public static var Document(default,null) : XmlType;
 
-	static var enode = ~/^<([a-zA-Z0-9:._-]+)/;
-	static var ecdata = ~/^<!\[CDATA\[/i;
-	static var edoctype = ~/^<!DOCTYPE /i;
-	static var eend = ~/^<\/([a-zA-Z0-9:._-]+)>/;
-	static var epcdata = ~/^[^<]+/;
-	static var ecomment = ~/^<!--/;
-	static var eprolog = ~/^<\?[^\?]+\?>/;
-
-	static var eattribute = ~/^\s*([a-zA-Z0-9:_-]+)\s*=\s*(["'])([^\2]*?)\2/; //"
-	static var eclose = ~/^[ \r\n\t]*(>|(\/>))/;
-	static var ecdata_end = ~/\]\]>/;
-	static var edoctype_elt = ~/[\[|\]>]/;
-	static var ecomment_end = ~/-->/;
-
 	public var nodeType(default,null) : XmlType;
 	public var nodeName(getNodeName,setNodeName) : String;
 	public var nodeValue(getNodeValue,setNodeValue) : String;
@@ -62,103 +48,7 @@ enum XmlType {
 	var _parent : Xml;
 
 	public static function parse( str : String ) : Xml {
-		var rules = [enode,epcdata,eend,ecdata,edoctype,ecomment,eprolog];
-		var nrules = rules.length;
-		var current = Xml.createDocument();
-
-		var stack = new List();
-		while( str.length > 0 ) {
-			var i = 0;
-			while( i < nrules ) {
-				var r = rules[i];
-				if( r.match(str) ) {
-					switch( i ) {
-					case 0: // Node
-						var x = Xml.createElement(r.matched(1));
-						current.addChild(x);
-						str = r.matchedRight();
-						while( eattribute.match(str) ) {
-							x.set(eattribute.matched(1),eattribute.matched(3));
-							str = eattribute.matchedRight();
-						}
-						if( !eclose.match(str) ) {
-							i = nrules;
-							break;
-						}
-						if( eclose.matched(1) == ">" ) {
-							stack.push(current);
-							current = x;
-						}
-						str = eclose.matchedRight();
-					case 1: // PCData
-						var x = Xml.createPCData(r.matched(0));
-						current.addChild(x);
-						str = r.matchedRight();
-					case 2: // End Node
-						untyped if( current._children != null && current._children.length == 0 ) {
-							var e = Xml.createPCData("");
-							current.addChild(e);
-						}
-						untyped if( r.matched(1) != current._nodeName || stack.isEmpty() ) {
-							i = nrules;
-							break;
-						}
-						current = stack.pop();
-						str = r.matchedRight();
-					case 3: // CData
-						str = r.matchedRight();
-						if( !ecdata_end.match(str) )
-							throw "End of CDATA section not found";
-						var x = Xml.createCData(ecdata_end.matchedLeft());
-						current.addChild(x);
-						str = ecdata_end.matchedRight();
-					case 4: // DocType
-						var pos = 0;
-						var count = 0;
-						var old = str;
-						while( true ) {
-							if( !edoctype_elt.match(str) )
-								throw "End of DOCTYPE section not found";
-							var p = edoctype_elt.matchedPos();
-							pos += p.pos + p.len;
-							str = edoctype_elt.matchedRight();
-							switch( edoctype_elt.matched(0) ) {
-							case "[": count++;
-							case "]": count--; if( count < 0 ) throw "Invalid ] found in DOCTYPE declaration";
-							default:
-								if( count == 0 )
-									break;
-							}
-						}
-						var x = Xml.createDocType(old.substr(10,pos-11));
-						current.addChild(x);
-					case 5: // Comment
-						if( !ecomment_end.match(str) )
-							throw "Unclosed Comment";
-						var p = ecomment_end.matchedPos();
-						var x = Xml.createComment(str.substr(4,p.pos+p.len-7));
-						current.addChild(x);
-						str = ecomment_end.matchedRight();
-					case 6: // Prolog
-						var prolog = r.matched(0);
-						var x = Xml.createProlog(prolog.substr(2,prolog.length - 4));
-						current.addChild(x);
-						str = r.matchedRight();
-					}
-					break;
-				}
-				i += 1;
-			}
-			if( i == nrules ) {
-				if( str.length > 10 )
-					throw ("Xml parse error : Unexpected "+str.substr(0,10)+"...");
-				else
-					throw ("Xml parse error : Unexpected "+str);
-			}
-		}
-		if( !stack.isEmpty() )
-			throw "Xml parse error : Unclosed "+stack.last().nodeName;
-		untyped return current;
+		return haxe.xml.Parser.parse(str);
 	}
 
 	private function new() : Void {