Parcourir la source

Merge branch 'development' of ssh://github.com/HaxeFoundation/haxe; branch 'lua-utf8' of ssh://github.com/mandel59/haxe into mandel59-lua-utf8

Justin Donaldson il y a 9 ans
Parent
commit
0d81357710
2 fichiers modifiés avec 54 ajouts et 34 suppressions
  1. 43 34
      std/lua/_std/haxe/Utf8.hx
  2. 11 0
      tests/unit/src/unitstd/haxe/Utf8.unit.hx

+ 43 - 34
std/lua/_std/haxe/Utf8.hx

@@ -39,18 +39,30 @@ class Utf8 {
 		return __b;
     }
 
+    static inline function decodeChar( s : String, pos : Int, code : Int, width : Int ) { // Assembles one code point from the `width`-unit sequence starting at `pos` in `s`; `code` is the value the callers already read with s.charCodeAt(pos).
+        return
+            if (width == 1)
+                code; // single-unit sequence: the value is returned unchanged
+            else if (width == 2)
+                ((code & 0x3F) << 6) | // lead payload placed above the 6 bits of the trailing unit
+                (s.charCodeAt(pos+1) & 0x7F); // NOTE(review): conventional UTF-8 masks are 0x1F (lead) and 0x3F (continuation); the wider masks here give the same value only for well-formed input - confirm callers never pass malformed data
+            else if (width == 3)
+                ((code & 0x1F) << 12) | // NOTE(review): conventional three-byte lead mask is 0x0F; equal result only when bit 4 of the lead is clear
+                ((s.charCodeAt(pos+1) & 0x7F) << 6) |
+                (s.charCodeAt(pos+2) & 0x7F);
+            else // every other width is decoded as a four-unit sequence; no width validation happens here
+                ((code & 0x0F) << 18) | // NOTE(review): conventional four-byte lead mask is 0x07; equal result only when bit 3 of the lead is clear
+                ((s.charCodeAt(pos+1) & 0x7F) << 12) |
+                ((s.charCodeAt(pos+2) & 0x7F) << 6) |
+                (s.charCodeAt(pos+3) & 0x7F);
+    }
+
     public static function iter( s : String, chars : Int -> Void ) { // Walks `s` front to back and calls `chars` once per decoded sequence.
 		var cur = 0;
 		while (cur < s.length){
 			var code = s.charCodeAt(cur); // first unit of the sequence at `cur`
 			var width = charWidth(code); // sequence length as reported by charWidth (defined outside this hunk)
-			var l = (code << 6)  | s.charCodeAt(cur+1);
-			trace(l + " is the value for l");
-			switch(width){
-			case 1 : chars(code);
-			case 2 : chars((code << 6)  | s.charCodeAt(cur+1));
-			case 3 : chars((code << 12) | (s.charCodeAt(cur+1) << 6) | s.charCodeAt(cur+2));
-			}
+			chars( decodeChar( s, cur, code, width ) ); // decoding is delegated to decodeChar, replacing the removed unmasked shifts and the debug trace
 			cur += width; // step over the whole sequence
 		}
     }
@@ -135,42 +147,39 @@ class Utf8 {
 		var ret = 0;
 		var code = s.charCodeAt(pos);
 		var bytes = charWidth(code);
-		if (bytes == 1){
-			return code;
-		} else if (bytes == 2){
-			return ((code & 0x1F) << 6) | (s.charCodeAt(pos+1) & 0x3F);
-		} else if (bytes == 3){
-			return ((code & 0x0F) << 12) | (((s.charCodeAt(pos+1) & 0x3F) << 6) | (s.charCodeAt(pos+2) & 0x3F));
-		} else {
-			return null;
-		}
+		return decodeChar( s, pos, code, bytes );
     }
 
     public static function validate( s : String ) : Bool { // Returns true when every sequence in `s` passes the lead/continuation, overlong, surrogate and upper-range checks below; null yields false.
 		if (s == null) return false;
 		var cur = 0;
 		while (cur < s.length){
-			var code = s.charCodeAt(cur);
-			var width = charWidth(code);
-			var expectedLen = 0;
-
-			 if ((code & 0x10000000) == 0x00000000) expectedLen = 1;
-			else if ((code & 0x11100000) == 0x11000000) expectedLen = 2;
-			else if ((code & 0x11110000) == 0x11100000) expectedLen = 3;
-			else if ((code & 0x11111000) == 0x11110000) expectedLen = 4;
-			else if ((code & 0x11111100) == 0x11111000) expectedLen = 5;
-			else if ((code & 0x11111110) == 0x11111100) expectedLen = 6;
-			else return false;
-
-			if (cur + expectedLen > s.length) return false;
-
-			for (i in (cur + 1)...expectedLen) {
-			if ((s.charCodeAt(i) & 0x11000000) != 0x10000000) {
+			var c1 = s.charCodeAt(cur++); // lead unit; `cur` now points at the next unread unit
+			if (c1 < 0x80) continue; // 0x00..0x7F: complete one-unit sequence
+			if (c1 < 0xC0) return false; // 0x80..0xBF cannot start a sequence (continuation pattern 10xxxxxx)
+			if (s.length <= cur) return false; // input ends before the second unit
+			var c2 = s.charCodeAt(cur++);
+			if (c1 < 0xE0) { // 0xC0..0xDF: two-unit sequence
+				if ((c1 & 0x1E != 0) && (c2 & 0xC0 == 0x80)) continue; // lead must carry payload above bit 0 (rejects 0xC0/0xC1 overlong forms) and c2 must match 10xxxxxx
 				return false;
 			}
+			if (s.length <= cur) return false; // input ends before the third unit
+			var c3 = s.charCodeAt(cur++);
+			if (c1 < 0xF0) { // 0xE0..0xEF: three-unit sequence; accepted when not overlong (lead payload or c2 bit 0x20 set), c2/c3 match 10xxxxxx, and not 0xED followed by 0xA0..0xBF (surrogate range)
+				if (((c1 & 0x0F != 0) || (c2 & 0x20 != 0)) && (c2 & 0xC0 == 0x80) && (c3 & 0xC0 == 0x80)
+						&& !(c1 == 0xED && 0xA0 <= c2 && c2 <= 0xBF))
+					continue;
+				return false;
 			}
-
-			cur += width;
+			if (s.length <= cur) return false; // input ends before the fourth unit
+			var c4 = s.charCodeAt(cur++);
+			if (c1 < 0xF8) { // 0xF0..0xF7: four-unit sequence; accepted when not overlong (lead payload or c2 bits 0x30 set), c2..c4 match 10xxxxxx, and not above 0xF4 0x8F .. (the test file labels 0xF4 0x90 0x80 0x80 as U+110000)
+				if (((c1 & 0x07 != 0) || (c2 & 0x30 != 0)) && (c2 & 0xC0 == 0x80) && (c3 & 0xC0 == 0x80) && (c4 & 0xC0 == 0x80)
+						&& !((c1 == 0xF4 && c2 > 0x8F) || c1 > 0xF4))
+					continue;
+				return false;
+			}
+			return false; // lead values 0xF8 and above are never accepted
 		}
 		return true;
     }

+ 11 - 0
tests/unit/src/unitstd/haxe/Utf8.unit.hx

@@ -44,6 +44,17 @@ haxe.Utf8.compare(haxe.Utf8.sub(str, 1, 0), "") == 0;
 // unspecify outside of range Utf8.sub
 // haxe.Utf8.compare(haxe.Utf8.sub(str, 9, 0), "") == 0;
 
+#if (neko || php || cpp || lua || macro)
+haxe.Utf8.validate("\xf0\xa9\xb8\xbd\xe3\x81\x82\xc3\xab\x61") == true; // one four-, three-, two- and one-byte sequence in a row
+haxe.Utf8.validate("\xed\x9f\xbf") == true; // U+D7FF, last code point below the surrogate range
+haxe.Utf8.validate("\xee\x80\x80") == true; // U+E000, first code point above the surrogate range
+haxe.Utf8.validate("\xf4\x8f\xbf\xbf") == true; // U+10FFFF, highest accepted code point
+haxe.Utf8.validate("\xf0\xa9\xb8\xbd\xe3\x81\xc3\xab\x61") == false; // three-byte sequence cut short: \xe3\x81 is followed by a lead byte
+haxe.Utf8.validate("\xc0\xaf") == false; // overlong two-byte encoding of U+002F
+haxe.Utf8.validate("\xed\xa0\x80") == false; // surrogate byte sequence (U+D800)
+haxe.Utf8.validate("\xed\xbf\xbf") == false; // surrogate byte sequence (U+DFFF)
+haxe.Utf8.validate("\xf4\x90\x80\x80") == false; // U+110000
+#end
 
 #if php
 }