9 éve · 0d8147eae6
--- a/std/lua/_std/haxe/Utf8.hx
+++ b/std/lua/_std/haxe/Utf8.hx
@@ -25,154 +25,261 @@ import lua.NativeStringTools;
 
				 
			
 
				 /**
			
 
				   A Lua-specific implementation of Utf8, using a helper library.
			
 
				-**/
			
 
				+ **/
			
 
				 
			
 
				 class Utf8 {
			
 
				 
			
 
				-	var __b : String;
			
 
				+    var __b : String;
			
 
				 
			
 
				-	/**
			
 
				-		Allocate a new Utf8 buffer using an optional bytes size.
			
 
				-	**/
			
 
				-	public function new( ?size : Int ) {
			
 
				-		__b = "";
			
 
				-	}
			
 
				+    /**
				+      Create a Utf8 buffer that starts out empty.
				+      The optional `size` argument is not used by this implementation.
				+     **/
				+    public function new( ?size : Int ) {
				+	this.__b = "";
				+    }
			
 
				 
			
 
				-	/**
			
 
				-		Add the given UTF8 character code to the buffer.
			
 
				-	**/
			
 
				-	public inline function addChar( c : Int ) : Void {
			
 
				-		__b += char(c);
			
 
				-	}
			
 
				+    /**
				+      Append the string produced by `char(c)` for the character
				+      code `c` to the end of the buffer.
				+     **/
				+    public inline function addChar( c : Int ) : Void {
				+	__b = __b + char(c);
				+    }
			
 
				 
			
 
				-	/**
			
 
				-		Returns the buffer converted to a String;
			
 
				-	**/
			
 
				-	public inline function toString() : String {
			
 
				-		return __b;
			
 
				-	}
			
 
				+    /**
				+      Return the accumulated buffer contents as a String.
				+     **/
				+    public inline function toString() : String {
				+	return this.__b;
				+    }
			
 
				 
			
 
				-	/**
			
 
				-		Call the `chars` function for each UTF8 char of the string.
			
 
				-	**/
			
 
				-	public static function iter( s : String, chars : Int -> Void ) {
			
 
				-		for( i in 0...s.length ) chars(s.charCodeAt(i));
			
 
				+    /**
				+      Call the `chars` function for each UTF8 char of the string.
				+
				+      Each value returned by `s.charCodeAt` is treated as one UTF8 byte.
				+      A sequence of 1 to 4 bytes is decoded into its code point (the
				+      length prefix of the leading byte and the `10` marker bits of the
				+      continuation bytes are masked off) before it is passed to `chars`.
				+
				+      Throws a String if a leading byte has no known width or if a
				+      sequence would run past the end of `s`.
				+     **/
				+    public static function iter( s : String, chars : Int -> Void ) {
				+	var cur = 0;
				+	while (cur < s.length){
				+	    var code = s.charCodeAt(cur);
				+	    var width = charWidth(code);
				+	    // charWidth yields null for bytes that cannot start a sequence;
				+	    // fail explicitly instead of adding null to the cursor.
				+	    if (width == null)
				+		throw 'Utf8::iter invalid leading byte ($code) at ' + cur;
				+	    // never read continuation bytes beyond the end of the string
				+	    if (cur + width > s.length)
				+		throw 'Utf8::iter truncated sequence at ' + cur;
				+	    switch(width){
				+		case 1 : chars(code);
				+		case 2 : chars(((code & 0x1F) << 6) | (s.charCodeAt(cur+1) & 0x3F));
				+		case 3 : chars(((code & 0x0F) << 12) | ((s.charCodeAt(cur+1) & 0x3F) << 6) | (s.charCodeAt(cur+2) & 0x3F));
				+		case 4 : chars(((code & 0x07) << 18) | ((s.charCodeAt(cur+1) & 0x3F) << 12) | ((s.charCodeAt(cur+2) & 0x3F) << 6) | (s.charCodeAt(cur+3) & 0x3F));
				+	    }
				+	    cur += width;
				 	}
				+    }
			
 
				 
			
 
				-	/**
			
 
				-		Encode the input ISO string into the corresponding UTF8 one.
			
 
				-	**/
			
 
				-	public static function encode( s : String ) : String {
			
 
				-		throw "Unimplemented";
			
 
				+    /**
				+      Encode the input ISO string into the corresponding UTF8 one.
				+
				+      Returns '' when `s` is null. Character codes below 128 are copied
				+      through unchanged; codes 128..2047 become two bytes; other codes
				+      outside the surrogate range become three bytes; a lead/trail
				+      surrogate pair is combined and written as four bytes.
				+
				+      Throws a String if a surrogate is found without its partner.
				+     **/
				+    public static function encode( s : String ) : String {
				+	// ported from : http://phpjs.org/functions/utf8_encode/
				+	if (s == null ) {
				+	    return '';
				 	}
				+	var string = (s + ''); // .replace(/\r\n/g, "\n").replace(/\r/g, "\n");
				+	var utftext = ''; // output accumulated so far
				+	var start = 0; // start of the pending run of codes < 128 not yet copied
				+	var end = 0; // end (exclusive) of that pending run
				+	var n = 0; // index of the code currently being examined
				+	while (n < s.length) {
				+	    var c1 = string.charCodeAt(n);
				+	    var enc = null; // stays null when c1 needs no re-encoding
				 
				-	/**
				-		Decode an UTF8 string back to an ISO string.
				-		Throw an exception if a given UTF8 character is not supported by the decoder.
				-	**/
				-	public static function decode( s : String ) : String {
				-		throw "Unimplemented";
				-	}
				-
				-	/**
				-		Similar to `String.charCodeAt` but uses the UTF8 character position.
				-	**/
				-	public static inline function charCodeAt( s : String, index : Int ) : Int {
				-		var cur_idx = 0;
				-		var pos = 0;
				-		for (i in 0...index){
				-			pos += charWidth(s.charCodeAt(pos));
				+	    if (c1 < 128) {
				+		end++; // extend the pending run; it is copied lazily below
				+	    } else if (c1 > 127 && c1 < 2048) {
				+		enc = String.fromCharCode( (c1 >> 6) | 192) 
				+		    + String.fromCharCode( (c1 & 63) | 128);
				+	    } else if ((c1 & 0xF800) != 0xD800) {
				+		enc = String.fromCharCode( (c1 >> 12) | 224)
				+		    + String.fromCharCode( ((c1 >> 6) & 63) | 128)
				+		    + String.fromCharCode( (c1 & 63) | 128);
				+	    } else { // surrogate pairs
				+		if ((c1 & 0xFC00) != 0xD800) {
				+		    throw 'Unmatched trail surrogate at ' + n;
				+		}
				+		var c2 = string.charCodeAt(++n); // consumes the second half of the pair
				+		if ((c2 & 0xFC00) != 0xDC00) {
				+		    throw 'Unmatched lead surrogate at ' + (n - 1);
				 		}
				-		var ret = 0;
				-		var code = s.charCodeAt(pos);
				-		var bytes = charWidth(code);
				-		if (bytes == 1){
				-			return code;
				-		} else if (bytes == 2){
				-			return ((code & 0x1F) << 6) | (s.charCodeAt(pos+1) & 0x3F);
				-		} else if (bytes == 3){
				-			return ((code & 0x0F) << 12) | (((s.charCodeAt(pos+1) & 0x3F) << 6) | (s.charCodeAt(pos+2) & 0x3F));
				-		} else {
				-			return null;
				+		c1 = ((c1 & 0x3FF) << 10) + (c2 & 0x3FF) + 0x10000; // combine both halves into one code point
				+		enc = String.fromCharCode( (c1 >> 18) | 240)
				+		    + String.fromCharCode( ((c1 >> 12) & 63) | 128)
				+		    + String.fromCharCode(((c1 >> 6) & 63) | 128)
				+		    + String.fromCharCode((c1 & 63) | 128);
				+	    }
				+	    if (enc != null) {
				+		if (end > start) { // copy the pending run before the encoded bytes
				+		    utftext += string.substring(start, end);
				 		}
				+		utftext += enc;
				+		start = end = n + 1; // the next run begins after the code(s) just encoded
				+	    }
				+	    n++;
				 	}
				 
				-	/**
				-		Tells if the String is correctly encoded as UTF8.
				-	**/
				-	public static inline function validate( s : String ) : Bool {
				-		throw "Unimplemented";
				+	if (end > start) { // copy whatever run is still pending at the end
				+	    utftext += string.substring(start, s.length);
				 	}
				 
				-	/**
				-		Returns the number of UTF8 chars of the String.
				-	**/
				-	public static inline function length( s : String ) : Int {
				-		var pos = 0;
				-		var len = 0;
				-		while (pos < s.length){
				-			pos += charWidth(s.charCodeAt(pos));
				-			len++;
				-		}
				-		return len;
				-	}
				+	return utftext;
				+
				+    }
			
 
				+
			
 
				+    /**
				+      Decode an UTF8 string back to an ISO string.
				+
				+      Every code point reported by `iter` is written out as a single
				+      character. The byte order mark (0xFEFF) is skipped and the euro
				+      sign (8364) is replaced by 164.
				+
				+      Throw an exception if a given UTF8 character is not supported by
				+      the decoder, i.e. any other code point above 255.
				+     **/
				+    public static function decode( s : String ) : String {
				+	var ret = new StringBuf();
				+	iter(s, function(c){
				+	    // The BOM has to be filtered before the range check below,
				+	    // otherwise it is rejected as an unsupported character.
				+	    if (c == 0xFEFF) return;
				+	    if( c == 8364 ) // euro symbol
				+		c = 164;
				+	    else if( c > 255 ){
				+		throw 'Utf8::decode invalid character ($c)';
				+	    }
				 
				-	/**
				-		Compare two UTF8 strings, character by character.
				-	**/
				-	public static function compare( a : String, b : String ) : Int {
				-		return a > b ? 1 : (a == b ? 0 : -1);
				+	    ret.add(String.fromCharCode(c));
				+	});
				+	return ret.toString();
				+    }
			
 
				+
			
 
				+    /**
				+      Similar to `String.charCodeAt` but uses the UTF8 character position.
				+
				+      Skips `index` characters from the start of `s` (advancing by the
				+      width of each leading byte), then decodes the 1 to 4 byte
				+      sequence found there into its code point.
				+
				+      Returns null when the leading byte at that position has no
				+      known width.
				+     **/
				+    public static inline function charCodeAt( s : String, index : Int ) : Int {
				+	var pos = 0;
				+	for (i in 0...index){
				+	    pos += charWidth(s.charCodeAt(pos));
				+	}
				+	var code = s.charCodeAt(pos);
				+	var bytes = charWidth(code);
				+	if (bytes == 1){
				+	    return code;
				+	} else if (bytes == 2){
				+	    return ((code & 0x1F) << 6) | (s.charCodeAt(pos+1) & 0x3F);
				+	} else if (bytes == 3){
				+	    return ((code & 0x0F) << 12) | (((s.charCodeAt(pos+1) & 0x3F) << 6) | (s.charCodeAt(pos+2) & 0x3F));
				+	} else if (bytes == 4){
				+	    // three payload bits in the leading byte, six in each continuation byte
				+	    return ((code & 0x07) << 18) | ((s.charCodeAt(pos+1) & 0x3F) << 12) | ((s.charCodeAt(pos+2) & 0x3F) << 6) | (s.charCodeAt(pos+3) & 0x3F);
				+	} else {
				+	    return null;
				 	}
				+    }
			
 
				 
			
 
				-	/**
			
 
				-		This is similar to `String.substr` but the `pos` and `len` parts are considering UTF8 characters.
			
 
				-	**/
			
 
				-	public static inline function sub( s : String, pos : Int, len : Int ) : String {
			
 
				-		var startpos = 0;
			
 
				-		var ret = new StringBuf();
			
 
				-		for (i in 0...pos){
			
 
				-			startpos += charWidth(s.charCodeAt(startpos));
			
 
				-		}
			
 
				-		var endpos = startpos;
			
 
				-		for (i in 0...len){
			
 
				-			endpos += charWidth(s.charCodeAt(endpos));
			
 
				+    /**
				+      Tells if the String is correctly encoded as UTF8.
				+
				+      Returns false for null. Otherwise every sequence must start with
				+      a leading byte in 0x00-0x7F (1 byte), 0xC2-0xDF (2 bytes),
				+      0xE0-0xEF (3 bytes) or 0xF0-0xF4 (4 bytes), must fit inside the
				+      string, and every following byte must have the form 10xxxxxx.
				+
				+      This is a structural check only: overlong 3 and 4 byte forms and
				+      surrogate code points are not rejected.
				+     **/
				+    public static function validate( s : String ) : Bool {
				+	if (s == null) return false;
				+	var cur = 0;
				+	while (cur < s.length){
				+	    var code = s.charCodeAt(cur);
				+	    var expectedLen = 0;
				+
				+	    if (code >= 0x00 && code <= 0x7F) expectedLen = 1;
				+	    else if (code >= 0xC2 && code <= 0xDF) expectedLen = 2;
				+	    else if (code >= 0xE0 && code <= 0xEF) expectedLen = 3;
				+	    else if (code >= 0xF0 && code <= 0xF4) expectedLen = 4;
				+	    else return false;
				+
				+	    if (cur + expectedLen > s.length) return false;
				+
				+	    // continuation bytes sit right after the leading byte at `cur`
				+	    for (i in (cur + 1)...(cur + expectedLen)) {
				+		if ((s.charCodeAt(i) & 0xC0) != 0x80) {
				+		    return false;
				 		}
				-		return s.substring(startpos, endpos);
				+	    }
				+
				+	    // advance by the length that was just checked
				+	    cur += expectedLen;
				 	}
				+	return true;
				+    }
			
 
				 
			
 
				-	private static function charWidth(c:Int) : Int {
			
 
				-		return   if (c >  0   && c <= 127) 1;
			
 
				-			else if (c >= 194 && c <= 223) 2;
			
 
				-			else if (c >= 224 && c <= 239) 3;
			
 
				-			else if (c >= 240 && c <= 244) 4;
			
 
				-			else null;
			
 
				+    /**
				+      Count the UTF8 characters in `s` by hopping from leading byte to
				+      leading byte, using `charWidth` to find the size of each step.
				+     **/
				+    public static inline function length( s : String ) : Int {
				+	var count = 0;
				+	var offset = 0;
				+	while (offset < s.length){
				+	    offset = offset + charWidth(s.charCodeAt(offset));
				+	    count += 1;
				 	}
				+	return count;
				+    }
			
 
				 
			
 
				-	private static function char( unicode : Int ) : String {
			
 
				-		if (unicode <= 0x7F) {
			
 
				-			return String.fromCharCode(unicode);
			
 
				-		} else if (unicode <= 0x7FF) {
			
 
				-			var b0 = 0xC0 + Math.floor(unicode / 0x40);
			
 
				-			var b1 = 0x80 + (unicode % 0x40);
			
 
				-			return NativeStringTools.char(b0, b1);
			
 
				-		} else if (unicode <= 0xFFFF) {
			
 
				-			var b0 = 0xE0 +  Math.floor(unicode / 0x1000);
			
 
				-			var b1 = 0x80 + (Math.floor(unicode / 0x40) % 0x40);
			
 
				-			var b2 = 0x80 + (unicode % 0x40);
			
 
				-			return NativeStringTools.char(b0, b1, b2);
			
 
				-		} else if (unicode <= 0x10FFFF) {
			
 
				-			var code = unicode;
			
 
				-			var b3   = 0x80 + (code % 0x40);
			
 
				-			code     = Math.floor(code / 0x40);
			
 
				-			var b2   = 0x80 + (code % 0x40);
			
 
				-			code     = Math.floor(code / 0x40);
			
 
				-			var b1   = 0x80 + (code % 0x40);
			
 
				-			code     = Math.floor(code / 0x40);
			
 
				-			var b0   = 0xF0 + code;
			
 
				-
			
 
				-			return NativeStringTools.char(b0, b1, b2, b3);
			
 
				-		} else {
			
 
				-			throw 'Unicode greater than U+10FFFF';
			
 
				-		}
			
 
				+    /**
				+      Order two UTF8 strings using the native `>` and `==` string
				+      operators. Returns 1 when `a` is greater than `b`, 0 when they
				+      are equal, and -1 otherwise.
				+     **/
				+    public static function compare( a : String, b : String ) : Int {
				+	if (a > b) return 1;
				+	if (a == b) return 0;
				+	return -1;
				+    }
			
 
				+
			
 
				+    /**
				+      This is similar to `String.substr` but the `pos` and `len` parts are considering UTF8 characters.
				+
				+      Walks `pos` characters from the start of `s` to find the first
				+      byte, then `len` more characters to find the end, and returns
				+      the bytes in between. Both walks stop at the end of `s`, so a
				+      `pos` or `len` larger than the string yields a shorter result.
				+     **/
				+    public static inline function sub( s : String, pos : Int, len : Int ) : String {
				+	var startpos = 0;
				+	for (i in 0...pos){
				+	    // nothing left to skip once the end of the string is reached
				+	    if (startpos >= s.length) break;
				+	    startpos += charWidth(s.charCodeAt(startpos));
				 	}
				+	var endpos = startpos;
				+	for (i in 0...len){
				+	    if (endpos >= s.length) break;
				+	    endpos += charWidth(s.charCodeAt(endpos));
				+	}
				+	return s.substring(startpos, endpos);
				+    }
			
 
				+
			
 
				+    /**
				+      Determines the expected character width of the utf8 codepoint
				+      from its leading byte `c`.
				+
				+      Returns 1 for 0-127 (including the NUL byte), 2 for 194-223,
				+      3 for 224-239 and 4 for 240-244. Any other value cannot start a
				+      sequence and yields null.
				+     **/
				+    static function charWidth(c:Int) : Int {
				+	if (c >= 0   && c <= 127) return 1;
				+	if (c >= 194 && c <= 223) return 2;
				+	if (c >= 224 && c <= 239) return 3;
				+	if (c >= 240 && c <= 244) return 4;
				+	return null;
				+    }
			
 
				+
			
 
				+    /**
				+      Build the UTF8 byte sequence for the code point `unicode` and
				+      return it as a String: one byte up to 0x7F, two up to 0x7FF,
				+      three up to 0xFFFF and four up to 0x10FFFF.
				+
				+      Throws a String for values above 0x10FFFF.
				+     **/
				+    public static function char( unicode : Int ) : String {
				+	if (unicode > 0x10FFFF) throw 'Unicode greater than U+10FFFF';
				+	if (unicode <= 0x7F) return String.fromCharCode(unicode);
				+
				+	// Each continuation byte carries six payload bits, lowest bits last.
				+	var last = 0x80 | (unicode & 0x3F);
				+	if (unicode <= 0x7FF) {
				+	    return NativeStringTools.char(0xC0 | (unicode >> 6), last);
				+	}
				+
				+	var middle = 0x80 | ((unicode >> 6) & 0x3F);
				+	if (unicode <= 0xFFFF) {
				+	    return NativeStringTools.char(0xE0 | (unicode >> 12), middle, last);
				+	}
				+
				+	var second = 0x80 | ((unicode >> 12) & 0x3F);
				+	return NativeStringTools.char(0xF0 | (unicode >> 18), second, middle, last);
				+    }
			
 
				 }
			
 
				+