Forráskód Böngészése

Lua: Overhaul Utf8 class

Justin Donaldson 9 éve
szülő
commit
0d8147eae6
1 módosított fájl, 231 hozzáadás és 124 törlés
  1. 231 124
      std/lua/_std/haxe/Utf8.hx

+ 231 - 124
std/lua/_std/haxe/Utf8.hx

@@ -25,154 +25,261 @@ import lua.NativeStringTools;
 
 /**
   A Lua-specific implementation of Utf8, using a helper library.
-**/
+ **/
 
 class Utf8 {
 
-	var __b : String;
+    var __b : String;
 
-	/**
-		Allocate a new Utf8 buffer using an optional bytes size.
-	**/
-	public function new( ?size : Int ) {
-		__b = "";
-	}
+    /**
+      Allocate a new Utf8 buffer using an optional bytes size.
+     **/
+    public function new( ?size : Int ) {
+	__b = "";
+    }
 
-	/**
-		Add the given UTF8 character code to the buffer.
-	**/
-	public inline function addChar( c : Int ) : Void {
-		__b += char(c);
-	}
+    /**
+      Add the given UTF8 character code to the buffer.
+     **/
+    public inline function addChar( c : Int ) : Void {
+	__b += char(c);
+    }
 
-	/**
-		Returns the buffer converted to a String;
-	**/
-	public inline function toString() : String {
-		return __b;
-	}
+    /**
+      Returns the buffer converted to a String.
+     **/
+    public inline function toString() : String {
+	return __b;
+    }
 
-	/**
-		Call the `chars` function for each UTF8 char of the string.
-	**/
-	public static function iter( s : String, chars : Int -> Void ) {
-		for( i in 0...s.length ) chars(s.charCodeAt(i));
+    /**
+      Call the `chars` function for each UTF8 char of the string.
+     **/
+    public static function iter( s : String, chars : Int -> Void ) {
+	var cur = 0;
+	while (cur < s.length){
+	    var code = s.charCodeAt(cur);
+	    var width = charWidth(code);
+	    var l = (code << 6)  | s.charCodeAt(cur+1);
+	    trace(l + " is the value for l");
+	    switch(width){
+		case 1 : chars(code);
+		case 2 : chars((code << 6)  | s.charCodeAt(cur+1));
+		case 3 : chars((code << 12) | (s.charCodeAt(cur+1) << 6) | s.charCodeAt(cur+2));
+	    }
+	    cur += width;
 	}
+    }
 
-	/**
-		Encode the input ISO string into the corresponding UTF8 one.
-	**/
-	public static function encode( s : String ) : String {
-		throw "Unimplemented";
+    /**
+      Encode the input ISO string into the corresponding UTF8 one.
+     **/
+    public static function encode( s : String ) : String {
+	// ported from : http://phpjs.org/functions/utf8_encode/
+	if (s == null ) {
+	    return '';
 	}
+	var string = (s + ''); // .replace(/\r\n/g, "\n").replace(/\r/g, "\n");
+	var utftext = '';
+	var start = 0;
+	var end = 0;
+	var n = 0;
+	while (n < s.length) {
+	    var c1 = string.charCodeAt(n);
+	    var enc = null;
 
-	/**
-		Decode an UTF8 string back to an ISO string.
-		Throw an exception if a given UTF8 character is not supported by the decoder.
-	**/
-	public static function decode( s : String ) : String {
-		throw "Unimplemented";
-	}
-
-	/**
-		Similar to `String.charCodeAt` but uses the UTF8 character position.
-	**/
-	public static inline function charCodeAt( s : String, index : Int ) : Int {
-		var cur_idx = 0;
-		var pos = 0;
-		for (i in 0...index){
-			pos += charWidth(s.charCodeAt(pos));
+	    if (c1 < 128) {
+		end++;
+	    } else if (c1 > 127 && c1 < 2048) {
+		enc = String.fromCharCode( (c1 >> 6) | 192) 
+		    + String.fromCharCode( (c1 & 63) | 128);
+	    } else if ((c1 & 0xF800) != 0xD800) {
+		enc = String.fromCharCode( (c1 >> 12) | 224)
+		    + String.fromCharCode( ((c1 >> 6) & 63) | 128)
+		    + String.fromCharCode( (c1 & 63) | 128);
+	    } else { // surrogate pairs
+		if ((c1 & 0xFC00) != 0xD800) {
+		    throw 'Unmatched trail surrogate at ' + n;
+		}
+		var c2 = string.charCodeAt(++n);
+		if ((c2 & 0xFC00) != 0xDC00) {
+		    throw 'Unmatched lead surrogate at ' + (n - 1);
 		}
-		var ret = 0;
-		var code = s.charCodeAt(pos);
-		var bytes = charWidth(code);
-		if (bytes == 1){
-			return code;
-		} else if (bytes == 2){
-			return ((code & 0x1F) << 6) | (s.charCodeAt(pos+1) & 0x3F);
-		} else if (bytes == 3){
-			return ((code & 0x0F) << 12) | (((s.charCodeAt(pos+1) & 0x3F) << 6) | (s.charCodeAt(pos+2) & 0x3F));
-		} else {
-			return null;
+		c1 = ((c1 & 0x3FF) << 10) + (c2 & 0x3FF) + 0x10000;
+		enc = String.fromCharCode( (c1 >> 18) | 240)
+		    + String.fromCharCode( ((c1 >> 12) & 63) | 128)
+		    + String.fromCharCode(((c1 >> 6) & 63) | 128)
+		    + String.fromCharCode((c1 & 63) | 128);
+	    }
+	    if (enc != null) {
+		if (end > start) {
+		    utftext += string.substring(start, end);
 		}
+		utftext += enc;
+		start = end = n + 1;
+	    }
+	    n++;
 	}
 
-	/**
-		Tells if the String is correctly encoded as UTF8.
-	**/
-	public static inline function validate( s : String ) : Bool {
-		throw "Unimplemented";
+	if (end > start) {
+	    utftext += string.substring(start, s.length);
 	}
 
-	/**
-		Returns the number of UTF8 chars of the String.
-	**/
-	public static inline function length( s : String ) : Int {
-		var pos = 0;
-		var len = 0;
-		while (pos < s.length){
-			pos += charWidth(s.charCodeAt(pos));
-			len++;
-		}
-		return len;
-	}
+	return utftext;
+
+    }
+
+    /**
+      Decode a UTF8 string back to an ISO string.
+      Throws an exception if a given UTF8 character is not supported by the decoder.
+     **/
+    public static function decode( s : String ) : String {
+	var ret = new StringBuf();
+	iter(s, function(c){
+	    if( c == 8364 ) // euro symbol
+		c = 164;
+	    else if( c > 255 ){
+		// throw new RangeError('Utf8 decode invalid character ($c)');
+		throw 'Utf8::decode invalid character ($c)';
+	    }
 
-	/**
-		Compare two UTF8 strings, character by character.
-	**/
-	public static function compare( a : String, b : String ) : Int {
-		return a > b ? 1 : (a == b ? 0 : -1);
+	    if (c != 0xFEFF) // BOM
+		ret.add(String.fromCharCode(c));
+	});
+	return ret.toString();
+    }
+
+    /**
+      Similar to `String.charCodeAt` but uses the UTF8 character position.
+     **/
+    public static inline function charCodeAt( s : String, index : Int ) : Int {
+	var cur_idx = 0;
+	var pos = 0;
+	for (i in 0...index){
+	    pos += charWidth(s.charCodeAt(pos));
+	}
+	var ret = 0;
+	var code = s.charCodeAt(pos);
+	var bytes = charWidth(code);
+	if (bytes == 1){
+	    return code;
+	} else if (bytes == 2){
+	    return ((code & 0x1F) << 6) | (s.charCodeAt(pos+1) & 0x3F);
+	} else if (bytes == 3){
+	    return ((code & 0x0F) << 12) | (((s.charCodeAt(pos+1) & 0x3F) << 6) | (s.charCodeAt(pos+2) & 0x3F));
+	} else {
+	    return null;
 	}
+    }
 
-	/**
-		This is similar to `String.substr` but the `pos` and `len` parts are considering UTF8 characters.
-	**/
-	public static inline function sub( s : String, pos : Int, len : Int ) : String {
-		var startpos = 0;
-		var ret = new StringBuf();
-		for (i in 0...pos){
-			startpos += charWidth(s.charCodeAt(startpos));
-		}
-		var endpos = startpos;
-		for (i in 0...len){
-			endpos += charWidth(s.charCodeAt(endpos));
+    /**
+      Tells if the String is correctly encoded as UTF8.
+     **/
+    public static function validate( s : String ) : Bool {
+	if (s == null) return false;
+	var cur = 0;
+	while (cur < s.length){
+	    var code = s.charCodeAt(cur);
+	    var width = charWidth(code);
+	    var expectedLen = 0;
+
+		 if ((code & 0x10000000) == 0x00000000) expectedLen = 1;
+	    else if ((code & 0x11100000) == 0x11000000) expectedLen = 2;
+	    else if ((code & 0x11110000) == 0x11100000) expectedLen = 3;
+	    else if ((code & 0x11111000) == 0x11110000) expectedLen = 4;
+	    else if ((code & 0x11111100) == 0x11111000) expectedLen = 5;
+	    else if ((code & 0x11111110) == 0x11111100) expectedLen = 6;
+	    else return false;
+
+	    if (cur + expectedLen > s.length) return false;
+
+	    for (i in (cur + 1)...expectedLen) {
+		if ((s.charCodeAt(i) & 0x11000000) != 0x10000000) {
+		    return false;
 		}
-		return s.substring(startpos, endpos);
+	    }
+
+	    cur += width;
 	}
+	return true;
+    }
 
-	private static function charWidth(c:Int) : Int {
-		return   if (c >  0   && c <= 127) 1;
-			else if (c >= 194 && c <= 223) 2;
-			else if (c >= 224 && c <= 239) 3;
-			else if (c >= 240 && c <= 244) 4;
-			else null;
+    /**
+      Returns the number of UTF8 chars of the String.
+     **/
+    public static inline function length( s : String ) : Int {
+	var pos = 0;
+	var len = 0;
+	while (pos < s.length){
+	    pos += charWidth(s.charCodeAt(pos));
+	    len++;
 	}
+	return len;
+    }
 
-	private static function char( unicode : Int ) : String {
-		if (unicode <= 0x7F) {
-			return String.fromCharCode(unicode);
-		} else if (unicode <= 0x7FF) {
-			var b0 = 0xC0 + Math.floor(unicode / 0x40);
-			var b1 = 0x80 + (unicode % 0x40);
-			return NativeStringTools.char(b0, b1);
-		} else if (unicode <= 0xFFFF) {
-			var b0 = 0xE0 +  Math.floor(unicode / 0x1000);
-			var b1 = 0x80 + (Math.floor(unicode / 0x40) % 0x40);
-			var b2 = 0x80 + (unicode % 0x40);
-			return NativeStringTools.char(b0, b1, b2);
-		} else if (unicode <= 0x10FFFF) {
-			var code = unicode;
-			var b3   = 0x80 + (code % 0x40);
-			code     = Math.floor(code / 0x40);
-			var b2   = 0x80 + (code % 0x40);
-			code     = Math.floor(code / 0x40);
-			var b1   = 0x80 + (code % 0x40);
-			code     = Math.floor(code / 0x40);
-			var b0   = 0xF0 + code;
-
-			return NativeStringTools.char(b0, b1, b2, b3);
-		} else {
-			throw 'Unicode greater than U+10FFFF';
-		}
+    /**
+      Compare two UTF8 strings, character by character.
+     **/
+    public static function compare( a : String, b : String ) : Int {
+	return a > b ? 1 : (a == b ? 0 : -1);
+    }
+
+    /**
+      Similar to `String.substr`, but `pos` and `len` are counted in UTF8 characters.
+     **/
+    public static inline function sub( s : String, pos : Int, len : Int ) : String {
+	var startpos = 0;
+	var ret = new StringBuf();
+	for (i in 0...pos){
+	    startpos += charWidth(s.charCodeAt(startpos));
 	}
+	var endpos = startpos;
+	for (i in 0...len){
+	    endpos += charWidth(s.charCodeAt(endpos));
+	}
+	return s.substring(startpos, endpos);
+    }
+
+    /**
+      Determines the expected width of a UTF8 character from its first byte value `c`.
+     **/
+    static function charWidth(c:Int) : Int {
+	return   if (c >  0   && c <= 127) 1;
+	    else if (c >= 194 && c <= 223) 2;
+	    else if (c >= 224 && c <= 239) 3;
+	    else if (c >= 240 && c <= 244) 4;
+	    else null;
+    }
+
+    /**
+      Returns the UTF8 string representation of the given Unicode code point.
+     **/
+    public static function char( unicode : Int ) : String {
+	if (unicode <= 0x7F) {
+	    return String.fromCharCode(unicode);
+	} else if (unicode <= 0x7FF) {
+	    var b0 = 0xC0 + Math.floor(unicode / 0x40);
+	    var b1 = 0x80 + (unicode % 0x40);
+	    return NativeStringTools.char(b0, b1);
+	} else if (unicode <= 0xFFFF) {
+	    var b0 = 0xE0 +  Math.floor(unicode / 0x1000);
+	    var b1 = 0x80 + (Math.floor(unicode / 0x40) % 0x40);
+	    var b2 = 0x80 + (unicode % 0x40);
+	    return NativeStringTools.char(b0, b1, b2);
+	} else if (unicode <= 0x10FFFF) {
+	    var code = unicode;
+	    var b3   = 0x80 + (code % 0x40);
+	    code     = Math.floor(code / 0x40);
+	    var b2   = 0x80 + (code % 0x40);
+	    code     = Math.floor(code / 0x40);
+	    var b1   = 0x80 + (code % 0x40);
+	    code     = Math.floor(code / 0x40);
+	    var b0   = 0xF0 + code;
+
+	    return NativeStringTools.char(b0, b1, b2, b3);
+	} else {
+	    throw 'Unicode greater than U+10FFFF';
+	}
+    }
 }
+