2
0
Эх сурвалжийг харах

UnicodeString (#8298)

* removed haxe.Utf8; added neko.Utf8

* UnicodeString

* fix UnicodeString for utf16

* fix

* UnicodeString.validate()

* add encoding argument to UnicodeString.validate

* disable test for non-unicode cases

* fix test for non-unicode targets

* fix test

* fix test for non-unicode cases

* PR fixes

* still, 0xEDBFBF

* cleanup eval; remove `haxe.Utf8` from cppia

* more cleanup
Aleksandr Kuzmenko 6 жил өмнө
parent
commit
b49724d594

+ 0 - 1
src/macro/eval/evalHash.ml

@@ -49,7 +49,6 @@ let key_haxe_ds_ObjectMap = hash "haxe.ds.ObjectMap"
 let key_haxe_macro_Position = hash "haxe.macro.Position"
 let key_haxe_macro_LazyType = hash "haxe.macro.LazyType"
 let key_haxe_macro_TypeDecl = hash "haxe.macro.TypeDecl"
-let key_haxe_Utf8 = hash "haxe.Utf8"
 let key_haxe_macro_Ref = hash "haxe.macro.Ref"
 let key_haxe_io_Error = hash "haxe.io.Error"
 let key_haxe_io_Bytes = hash "haxe.io.Bytes"

+ 0 - 6
src/macro/eval/evalStdLib.ml

@@ -3114,11 +3114,6 @@ let init_constructors builtins =
 			| _ -> assert false
 		);
 	add key_StringBuf (fun _ -> encode_instance key_StringBuf ~kind:(IBuffer (VStringBuffer.create())));
-	add key_haxe_Utf8
-		(fun vl -> match vl with
-			| [size] -> encode_instance key_haxe_Utf8 ~kind:(IUtf8 (UTF8.Buf.create (default_int size 0)))
-			| _ -> assert false
-		);
 	add key_haxe_ds_StringMap (fun _ -> encode_string_map_direct (StringHashtbl.create ()));
 	add key_haxe_ds_IntMap (fun _ -> encode_int_map_direct (IntHashtbl.create ()));
 	add key_haxe_ds_ObjectMap (fun _ -> encode_object_map_direct (Obj.magic (ValueHashtbl.create 0)));
@@ -3209,7 +3204,6 @@ let init_empty_constructors builtins =
 	Hashtbl.add h key_Date (fun () -> encode_instance key_Date ~kind:(IDate 0.));
 	Hashtbl.add h key_EReg (fun () -> encode_instance key_EReg ~kind:(IRegex {r = Pcre.regexp ""; r_rex_string = create_ascii "~//"; r_global = false; r_string = ""; r_groups = [||]}));
 	Hashtbl.add h key_String (fun () -> encode_string "");
-	Hashtbl.add h key_haxe_Utf8 (fun () -> encode_instance key_haxe_Utf8 ~kind:(IUtf8 (UTF8.Buf.create 0)));
 	Hashtbl.add h key_haxe_ds_StringMap (fun () -> encode_instance key_haxe_ds_StringMap ~kind:(IStringMap (StringHashtbl.create ())));
 	Hashtbl.add h key_haxe_ds_IntMap (fun () -> encode_instance key_haxe_ds_IntMap ~kind:(IIntMap (IntHashtbl.create ())));
 	Hashtbl.add h key_haxe_ds_ObjectMap (fun () -> encode_instance key_haxe_ds_ObjectMap ~kind:(IObjectMap (Obj.magic (ValueHashtbl.create 0))));

+ 10 - 0
std/StringTools.hx

@@ -553,4 +553,14 @@ class StringTools {
 	private static var _urlDecode = neko.Lib.load("std","url_decode",1);
 	#end
 
+	#if utf16
+	static inline var MIN_SURROGATE_CODE_POINT = 65536; // 0x10000: smallest value utf16CodePointAt produces when it combines a surrogate pair
+	static inline function utf16CodePointAt(s:String, index:Int):Int { // returns the code point that starts at UTF-16 code unit `index` of `s`
+		var c = StringTools.fastCodeAt(s, index);
+		if (c >= 0xD800 && c <= 0xDBFF) { // high (leading) surrogate: combine it with the following code unit
+			c = ((c -0xD7C0) << 10) | (StringTools.fastCodeAt(s, index + 1) & 0x3FF); // (c - 0xD7C0) << 10 equals ((c - 0xD800) << 10) + 0x10000
+		}
+		return c; // NOTE(review): the unit at index + 1 is not checked to exist or to be a low surrogate - confirm callers only pass well-formed strings
+	}
+	#end
 }

+ 215 - 0
std/UnicodeString.hx

@@ -0,0 +1,215 @@
+/*
+ * Copyright (C)2005-2019 Haxe Foundation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+import haxe.io.Bytes;
+import haxe.io.Encoding;
+import haxe.iterators.StringIteratorUnicode;
+import haxe.iterators.StringKeyValueIteratorUnicode;
+
+/**
+	This abstract provides consistent cross-target unicode support.
+
+	@see https://haxe.org/manual/std-UnicodeString.html
+**/
+@:forward
+@:access(StringTools)
+abstract UnicodeString(String) from String to String {
+
+	/**
+		Tells if `b` is a correctly encoded byte sequence in the given `encoding`.
+
+		Only `UTF8` is supported. The bytes must form well-formed UTF-8:
+		stray continuation bytes, overlong forms, encoded surrogates
+		(U+D800..U+DFFF), code points above U+10FFFF and truncated sequences
+		all make the result `false`. An empty `b` is considered valid.
+
+		Throws a `String` if `encoding` is `RawNative`.
+	**/
+	static public function validate(b:Bytes, encoding:Encoding) : Bool {
+		switch(encoding) {
+			case RawNative: throw "UnicodeString.validate: RawNative encoding is not supported";
+			case UTF8:
+				var data = b.getData();
+				var pos = 0;
+				var max = b.length;
+				while( pos < max) {
+					var c:Int = Bytes.fastGet(data, pos++);
+					if(c < 0x80) { // single-byte sequence (ASCII): nothing more to check
+					} else if(c < 0xC2) { // 0x80..0xBF: stray continuation byte; 0xC0/0xC1: lead of an overlong 2-byte form
+						return false;
+					} else if(c < 0xE0) { // 0xC2..0xDF: lead of a 2-byte sequence
+						if(pos + 1 > max) { // truncated: the continuation byte is missing
+							return false;
+						}
+						var c2:Int = Bytes.fastGet(data, pos++);
+						if(c2 < 0x80 || c2 > 0xBF) {
+							return false;
+						}
+					} else if(c < 0xF0) { // 0xE0..0xEF: lead of a 3-byte sequence
+						if(pos + 2 > max) { // truncated: fewer than two bytes left
+							return false;
+						}
+						var c2:Int = Bytes.fastGet(data, pos++);
+						if(c == 0xE0) { // after 0xE0 a second byte below 0xA0 would be an overlong form
+							if(c2 < 0xA0 || c2 > 0xBF) return false;
+						} else {
+							if(c2 < 0x80 || c2 > 0xBF) return false;
+						}
+						var c3:Int = Bytes.fastGet(data, pos++);
+						if(c3 < 0x80 || c3 > 0xBF) {
+							return false;
+						}
+						c = (c << 16) | (c2 << 8) | c3; // pack the three bytes so the range below is a single numeric comparison
+						if(0xEDA080 <= c && c <= 0xEDBFBF) { // encoded surrogates U+D800..U+DFFF are not allowed in UTF-8
+							return false;
+						}
+					} else if(c > 0xF4) { // 0xF5..0xFF never start a valid sequence
+						return false;
+					} else { // 0xF0..0xF4: lead of a 4-byte sequence
+						if(pos + 3 > max) { // truncated: fewer than three bytes left
+							return false;
+						}
+						var c2:Int = Bytes.fastGet(data, pos++);
+						if(c == 0xF0) { // after 0xF0 a second byte below 0x90 would be an overlong form
+							if(c2 < 0x90 || c2 > 0xBF) return false;
+						} else if(c == 0xF4) { // after 0xF4 a second byte above 0x8F would exceed U+10FFFF
+							if(c2 < 0x80 || c2 > 0x8F) return false;
+						} else {
+							if(c2 < 0x80 || c2 > 0xBF) return false;
+						}
+						var c3:Int = Bytes.fastGet(data, pos++);
+						if(c3 < 0x80 || c3 > 0xBF) {
+							return false;
+						}
+						var c4:Int = Bytes.fastGet(data, pos++);
+						if(c4 < 0x80 || c4 > 0xBF) {
+							return false;
+						}
+					}
+				}
+				return true;
+		}
+	}
+
+#if (target.unicode)
+
+	/**
+		The number of characters in `this` String.
+	**/
+	public var length(get,never):Int;
+
+	/**
+		Creates an instance of UnicodeString.
+	**/
+	public inline function new(string:String):Void {
+		this = string;
+	}
+
+	/**
+		Returns the character at position `index` of `this` String.
+
+		On `utf16` targets `index` counts Unicode code points, so a character
+		stored as a surrogate pair occupies a single position. If `index` is
+		negative, or no character starts at that position (i.e. `index` is
+		not less than `length`), the empty String `""` is returned.
+
+		On other targets the call is forwarded to `charAt` of the underlying
+		String.
+	**/
+	#if !utf16 inline #end
+	public function charAt(index:Int):String {
+		#if utf16
+			if(index < 0) return '';
+			var unicodeOffset = 0; // position counted in code points
+			var nativeOffset = 0; // position counted in UTF-16 code units
+			while(nativeOffset < this.length) { // `this` is the underlying String here, so this is the code unit count
+				var c = StringTools.utf16CodePointAt(this, nativeOffset++);
+				if(unicodeOffset == index) {
+					return String.fromCharCode(c);
+				}
+				if(c >= StringTools.MIN_SURROGATE_CODE_POINT) { // the code point took two code units: skip the second one
+					nativeOffset++;
+				}
+				unicodeOffset++;
+			}
+			return ''; // ran past the end: `index` is out of range
+		#else
+			return this.charAt(index);
+		#end
+	}
+
+	/**
+		Returns the character code at position `index` of `this` String.
+
+		On `utf16` targets `index` counts Unicode code points and the result
+		is the full code point, even when it is stored as a surrogate pair.
+		If `index` is negative, or no character starts at that position
+		(i.e. `index` is not less than `length`), `null` is returned.
+
+		On other targets the call is forwarded to `charCodeAt` of the
+		underlying String.
+	**/
+	#if !utf16 inline #end
+	public function charCodeAt(index:Int):Null<Int> {
+		#if utf16
+			if(index < 0) return null;
+			var unicodeOffset = 0; // position counted in code points
+			var nativeOffset = 0; // position counted in UTF-16 code units
+			while(nativeOffset < this.length) { // `this` is the underlying String here, so this is the code unit count
+				var c = StringTools.utf16CodePointAt(this, nativeOffset++);
+				if(unicodeOffset == index) {
+					return c;
+				}
+				if(c >= StringTools.MIN_SURROGATE_CODE_POINT) { // the code point took two code units: skip the second one
+					nativeOffset++;
+				}
+				unicodeOffset++;
+			}
+			return null; // ran past the end: `index` is out of range
+		#else
+			return this.charCodeAt(index);
+		#end
+	}
+
+	/**
+		Returns an iterator of the unicode code points.
+	**/
+	public inline function iterator():StringIteratorUnicode {
+		return new StringIteratorUnicode(this);
+	}
+
+	/**
+		Returns an iterator of the code point indices and unicode code points.
+	**/
+	public inline function keyValueIterator():StringKeyValueIteratorUnicode {
+		return new StringKeyValueIteratorUnicode(this);
+	}
+
+	#if !utf16 inline #end
+	function get_length():Int { // getter behind the `length` property
+		#if utf16
+			var l = 0;
+			for(c in new StringIteratorUnicode(this)) { // the iterator yields one value per code point, so `l` counts code points rather than code units
+				l++;
+			}
+			return l;
+		#else
+			return this.length; // length of the underlying String, returned unchanged
+		#end
+	}
+
+#end
+
+	@:op(A < B) static function lt(a:UnicodeString, b:UnicodeString):Bool; // body-less @:op declarations: the operator of the underlying String type is used
+	@:op(A <= B) static function lte(a:UnicodeString, b:UnicodeString):Bool;
+	@:op(A > B) static function gt(a:UnicodeString, b:UnicodeString):Bool;
+	@:op(A >= B) static function gte(a:UnicodeString, b:UnicodeString):Bool;
+	@:op(A == B) static function eq(a:UnicodeString, b:UnicodeString):Bool;
+	@:op(A != B) static function neq(a:UnicodeString, b:UnicodeString):Bool;
+	@:op(A + B) static function add(a:UnicodeString, b:UnicodeString):UnicodeString;
+	@:op(A += B) static function assignAdd(a:UnicodeString, b:UnicodeString):UnicodeString;
+
+	@:op(A + B) @:commutative static function add(a:UnicodeString, b:String):UnicodeString; // overloads taking a plain String; @:commutative lets the String be on either side
+	@:op(A += B) @:commutative static function assignAdd(a:UnicodeString, b:String):UnicodeString;
+}

+ 0 - 87
std/cpp/_std/haxe/Utf8.hx

@@ -1,87 +0,0 @@
-/*
- * Copyright (C)2005-2019 Haxe Foundation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
-package haxe;
-
-using cpp.NativeString;
-
-@:coreApi
-class Utf8
-{
-   var __s:Array<Int>;
-
-	public function new( ?size : Null<Int> ) : Void {
-      __s = new Array<Int>();
-      if (size!=null && size>0)
-         cpp.NativeArray.reserve(__s,size);
-	}
-
-	public function addChar( c : Int ) : Void {
-      __s.push(c);
-	}
-
-	public function toString() : String {
-		return untyped __global__.__hxcpp_char_array_to_utf8_string(__s);
-	}
-
-   // Incoming string is array of bytes containing possibly invalid utf8 chars
-   // Result is the same string with the bytes expanded into utf8 sequences
-	public static function encode( s : String ) : String {
-		return untyped __global__.__hxcpp_char_bytes_to_utf8_string(s);
-	}
-
-   // Incoming string is array of bytes representing valid utf8 chars
-   // Result is a string containing the compressed bytes
-	public static function decode( s : String ) : String {
-		return untyped __global__.__hxcpp_utf8_string_to_char_bytes(s);
-	}
-
-	public #if !cppia inline #end static function iter( s : String, chars : Int -> Void ) : Void {
-      var src = s.c_str();
-      var end = src.add( s.length );
-
-      while(src.lt(end))
-         chars(src.ptr.utf8DecodeAdvance());
-	}
-
-	public static function charCodeAt( s : String, index : Int ) : Int {
-      return s.utf8CharCodeAt(index);
-	}
-
-	public static function validate( s : String ) : Bool {
-      return s.utf8IsValid();
-	}
-
-	public static function length( s : String ) : Int {
-      return s.utf8Length();
-	}
-
-	public static function compare( a : String, b : String ) : Int {
-      return a.compare(b);
-	}
-
-	public static function sub( s : String, pos : Int, len : Int ) : String {
-      return s.utf8Sub(pos,len);
-	}
-
-}
-
-

+ 0 - 2
std/cpp/cppia/HostClasses.hx

@@ -92,7 +92,6 @@ class HostClasses
    "haxe.Unserializer",
    "haxe.Resource",
    "haxe.Template",
-   "haxe.Utf8",
    "haxe.Log",
    "haxe.zip.Compress",
    "haxe.zip.Uncompress",
@@ -124,7 +123,6 @@ class HostClasses
 
    "haxe.CallStack",
    "haxe.Resource",
-   "haxe.Utf8",
    "haxe.Int64",
    "haxe.Int32",
    "haxe.Serializer",

+ 0 - 37
std/eval/_std/haxe/Utf8.hx

@@ -1,37 +0,0 @@
-/*
- * Copyright (C)2005-2019 Haxe Foundation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
-package haxe;
-
-@:coreApi
-extern class Utf8 {
-	public function new( ?size : Int ):Void;
-	public function addChar( c : Int ) : Void;
-	public function toString() : String;
-	public static function iter( s : String, chars : Int -> Void ):Void;
-	public static function encode( s : String ) : String;
-	public static function decode( s : String ) : String;
-	public static function charCodeAt( s : String, index : Int ) : Int;
-	public static function validate( s : String ) : Bool;
-	public static function length( s : String ) : Int;
-	public static function compare( a : String, b : String ) : Int;
-	public static function sub( s : String, pos : Int, len : Int ) : String;
-}

+ 0 - 112
std/haxe/Utf8.hx

@@ -1,112 +0,0 @@
-/*
- * Copyright (C)2005-2019 Haxe Foundation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
-package haxe;
-
-/**
-	Since not all platforms guarantee that `String` always uses UTF-8 encoding, you
-	can use this cross-platform API to perform operations on such strings.
-**/
-class Utf8 {
-
-	var __b : String;
-
-	/**
-		Allocate a new Utf8 buffer using an optional bytes size.
-	**/
-	public function new( ?size : Int ) {
-		__b = "";
-	}
-
-	/**
-		Add the given UTF8 character code to the buffer.
-	**/
-	public inline function addChar( c : Int ) : Void {
-		__b += String.fromCharCode(c);
-	}
-
-	/**
-		Returns the buffer converted to a String.
-	**/
-	public inline function toString() : String {
-		return __b;
-	}
-
-	/**
-		Call the `chars` function for each UTF8 char of the string.
-	**/
-	public static function iter( s : String, chars : Int -> Void ) {
-		for( i in 0...s.length )
-			chars(s.charCodeAt(i));
-	}
-
-	/**
-		Encode the input ISO string into the corresponding UTF8 one.
-	**/
-	public static function encode( s : String ) : String {
-		throw "Not implemented";
-	}
-
-	/**
-		Decode an UTF8 string back to an ISO string.
-		Throw an exception if a given UTF8 character is not supported by the decoder.
-	**/
-	public static function decode( s : String ) : String {
-		throw "Not implemented";
-	}
-
-	/**
-		Similar to `String.charCodeAt` but uses the UTF8 character position.
-	**/
-	public static inline function charCodeAt( s : String, index : Int ) : Int {
-		return s.charCodeAt(index);
-	}
-
-	/**
-		Tells if the String is correctly encoded as UTF8.
-	**/
-	public static inline function validate( s : String ) : Bool {
-		return true;
-	}
-
-	/**
-		Returns the number of UTF8 chars of the String.
-	**/
-	#if js extern #end
-	public static inline function length( s : String ) : Int {
-		return s.length;
-	}
-
-	/**
-		Compare two UTF8 strings, character by character.
-	**/
-	public static function compare( a : String, b : String ) : Int {
-		return a > b ? 1 : (a == b ? 0 : -1);
-	}
-
-	/**
-		This is similar to `String.substr` but the `pos` and `len` parts are considering UTF8 characters.
-	**/
-	public static inline function sub( s : String, pos : Int, len : Int ) : String {
-		return s.substr(pos,len);
-	}
-
-}

+ 5 - 5
std/haxe/format/JsonPrinter.hx

@@ -185,8 +185,8 @@ class JsonPrinter {
 	}
 
 	function quote( s : String ) {
-		#if (neko || php || cpp)
-		if( s.length != haxe.Utf8.length(s) ) {
+		#if neko
+		if( s.length != neko.Utf8.length(s) ) {
 			quoteUtf8(s);
 			return;
 		}
@@ -236,10 +236,10 @@ class JsonPrinter {
 		addChar('"'.code);
 	}
 
-	#if (neko || php || cpp)
+	#if neko
 	function quoteUtf8( s : String ) {
-		var u = new haxe.Utf8();
-		haxe.Utf8.iter(s,function(c) {
+		var u = new neko.Utf8();
+		neko.Utf8.iter(s,function(c) {
 			switch( c ) {
 			case '\\'.code, '"'.code: u.addChar('\\'.code); u.addChar(c);
 			case '\n'.code: u.addChar('\\'.code); u.addChar('n'.code);

+ 7 - 4
std/haxe/iterators/StringIteratorUnicode.hx

@@ -50,14 +50,17 @@ class StringIteratorUnicode {
 	/**
 		See `Iterator.next`
 	**/
+	@:access(StringTools)
 	public inline function next() {
-		var c = StringTools.fastCodeAt(s, offset++);
 		#if utf16
-		if (c >= 0xD800 && c <= 0xDBFF) {
-			c = ((c -0xD7C0) << 10) | (StringTools.fastCodeAt(s, offset++) & 0x3FF);
+		var c = StringTools.utf16CodePointAt(s, offset++);
+		if(c >= StringTools.MIN_SURROGATE_CODE_POINT) {
+			offset++;
 		}
-		#end
 		return c;
+		#else
+		return StringTools.fastCodeAt(s, offset++);
+		#end
 	}
 
 	/**

+ 7 - 4
std/haxe/iterators/StringKeyValueIteratorUnicode.hx

@@ -53,14 +53,17 @@ class StringKeyValueIteratorUnicode {
 	/**
 		See `Iterator.next`
 	**/
+	@:access(StringTools)
 	public inline function next() {
-		var c = StringTools.fastCodeAt(s, byteOffset++);
 		#if utf16
-		if (c >= 0xD800 && c <= 0xDBFF) {
-			c = ((c -0xD7C0) << 10) | (StringTools.fastCodeAt(s, byteOffset++) & 0x3FF);
+		var c = StringTools.utf16CodePointAt(s, byteOffset++);
+		if(c >= StringTools.MIN_SURROGATE_CODE_POINT) {
+			byteOffset++;
 		}
-		#end
 		return { key: charOffset++, value: c };
+		#else
+		return { key: charOffset++, value: StringTools.fastCodeAt(s, byteOffset++) };
+		#end
 	}
 
 	/**

+ 1 - 1
std/neko/_std/haxe/Utf8.hx → std/neko/Utf8.hx

@@ -19,7 +19,7 @@
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  * DEALINGS IN THE SOFTWARE.
  */
-package haxe;
+package neko;
 
 @:coreApi
 class Utf8 {

+ 0 - 88
std/php/_std/haxe/Utf8.hx

@@ -1,88 +0,0 @@
-/*
- * Copyright (C)2005-2019 Haxe Foundation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
-package haxe;
-
-import php.Global;
-
-@:coreApi
-class Utf8 {
-
-	var __b : String;
-
-	public function new( ?size : Int ) : Void {
-		__b = '';
-	}
-
-	public function addChar( c : Int ) : Void {
-		__b += uchr(c);
-	}
-
-	public function toString() : String {
-		return __b;
-	}
-
-	public static function encode( s : String ) : String {
-		return Global.utf8_encode(s);
-	}
-
-	public static function decode( s : String ) : String {
-		return Global.utf8_decode(s);
-	}
-
-	public static function iter(s : String, chars : Int -> Void ) : Void {
-		var len = length(s);
-		for(i in 0...len) {
-			chars(charCodeAt(s, i));
-		}
-	}
-
-	public static function charCodeAt( s : String, index : Int ) : Int {
-		return uord(sub(s, index, 1));
-	}
-
-	static function uchr(i : Int) : String {
-		return Global.mb_convert_encoding(Global.pack('N', i), 'UTF-8', 'UCS-4BE');
-	}
-
-	static function uord(s : String) : Int {
-		var c = Global.unpack('N', Global.mb_convert_encoding(s, 'UCS-4BE', 'UTF-8'));
-		return c[1];
-	}
-
-	public static function validate( s : String ) : Bool {
-		return Global.mb_check_encoding(s, enc);
-	}
-
-	public static function length( s : String ) : Int {
-		return Global.mb_strlen(s, enc);
-	}
-
-	public static function compare( a : String, b : String ) : Int {
-		return Global.strcmp(a, b);
-	}
-
-	public static function sub( s : String, pos : Int, len : Int ) : String {
-		return Global.mb_substr(s, pos, len, enc);
-	}
-
-	private static inline var enc = "UTF-8";
-}

+ 0 - 9
tests/unit/src/unit/TestJson.hx

@@ -48,11 +48,6 @@ class TestJson extends Test {
 
 	// TODO: test pretty-printing (also with objects with skipped function fields!)
 	function testHaxeJson() {
-		#if php
-		// php's haxe.Utf8 uses mbstring
-		if (php.Global.extension_loaded("mbstring")) {
-		#end
-
 		var str = haxe.format.JsonPrinter.print( { x : -4500, y : 1.456, a : ["hello", "wor'\"\n\t\rd"], b : function() {} } );
 		str = str.substr(1, str.length - 2); // remove {}
 		var parts = str.split(",");
@@ -96,10 +91,6 @@ class TestJson extends Test {
 		eq(haxe.format.JsonPrinter.print(Math.NaN), "null");
 		eq(haxe.format.JsonPrinter.print(function() {}), "\"<fun>\"");
 		eq(haxe.format.JsonPrinter.print({a: function() {}, b: 1}), "{\"b\":1}");
-
-		#if php
-		}
-		#end
 	}
 
 	function test3690() {

+ 1 - 1
tests/unit/src/unit/UnitBuilder.hx

@@ -127,7 +127,7 @@ class UnitBuilder {
 	static public function read(path:String) {
 		var p = Context.makePosition( { min:0, max:0, file:path } );
 		var file = sys.io.File.getContent(path);
-		var code = Context.parseInlineString("{" + file + "}", p);
+		var code = Context.parseInlineString("{" + file + "\n}", p);
 		function mkBlock(e:Expr) {
 			return switch(e.expr) {
 				case EBlock(b): b;

+ 77 - 0
tests/unit/src/unitstd/UnicodeString.unit.hx

@@ -0,0 +1,77 @@
+#if (target.unicode)
+var s = new UnicodeString("𠜎zя");
+var codes = [132878, 122, 1103];
+
+// length
+s.length == codes.length;
+
+// // toUpperCase, toLowerCase
+// var turkishLower = "ğüşıiöç";
+// var turkishUpper = "ĞÜŞIİÖÇ";
+// turkishUpper == turkishLower.toUpperCase();
+// turkishLower == turkishUpper.toLowerCase();
+
+// charAt
+s.charAt(0) == "𠜎";
+s.charAt(1) == "z";
+s.charAt(2) == "я";
+s.charAt(3) == "";
+s.charAt( -1) == "";
+("":UnicodeString).charAt(0) == "";
+("":UnicodeString).charAt(1) == "";
+("":UnicodeString).charAt( -1) == "";
+
+// charCodeAt
+s.charCodeAt(0) == codes[0];
+s.charCodeAt(1) == codes[1];
+s.charCodeAt(2) == codes[2];
+s.charCodeAt(3) == null;
+s.charCodeAt(-1) == null;
+
+// @:op(UnicodeString)
+var s2 = new UnicodeString("𠜎z");
+s != s2;
+!(s == s2);
+s > s2;
+s >= s2;
+s2 < s;
+s2 <= s;
+(s + s2).length == s.length + s2.length;
+var s3 = s;
+(s3 += s2).length == s.length + s2.length;
+
+// @:op(String)
+var s2 = "abя";
+s != s2;
+!(s == s2);
+s > s2;
+s >= s2;
+s2 < s;
+s2 <= s;
+(s + s2).length == s.length + (s2:UnicodeString).length;
+var s3 = s;
+(s3 += s2).length == s.length + (s2:UnicodeString).length;
+
+// iterator
+aeq(codes, [for(c in s) c]);
+
+// keyValueIterator
+var keys = [for(i in 0...codes.length) i];
+var actualKeyCodes = [for(i => c in s) [i, c]];
+aeq(keys, actualKeyCodes.map(a -> a[0]));
+aeq(codes, actualKeyCodes.map(a -> a[1]));
+
+// validate
+UnicodeString.validate(haxe.io.Bytes.ofHex("f0a9b8bde38182c3ab61"), UTF8) == true; // 4-, 3-, 2- and 1-byte sequences in a row
+UnicodeString.validate(haxe.io.Bytes.ofHex("ed9fbf"), UTF8) == true; // U+D7FF, last code point before the surrogate range
+UnicodeString.validate(haxe.io.Bytes.ofHex("ee8080"), UTF8) == true; // U+E000, first code point after the surrogate range
+UnicodeString.validate(haxe.io.Bytes.ofHex("f48fbfbf"), UTF8) == true; // U+10FFFF, highest valid code point
+UnicodeString.validate(haxe.io.Bytes.ofHex("f0a9b8bde381c3ab61"), UTF8) == false; // 3-byte sequence e3 81 is missing its last byte
+UnicodeString.validate(haxe.io.Bytes.ofHex("c0af"), UTF8) == false; // overlong sequence
+UnicodeString.validate(haxe.io.Bytes.ofHex("eda080"), UTF8) == false; // surrogate byte sequence (U+D800)
+UnicodeString.validate(haxe.io.Bytes.ofHex("edbfbf"), UTF8) == false; // surrogate byte sequence (U+DFFF)
+UnicodeString.validate(haxe.io.Bytes.ofHex("f4908080"), UTF8) == false; // U+110000
+
+#else
+1 == 1;
+#end

+ 0 - 56
tests/unit/src/unitstd/haxe/Utf8.unit.hx

@@ -1,56 +0,0 @@
-#if false
-// disabled tests with outside BMP chars (will be reenabled when we support them)
-var str = "あ𠀀い";
-haxe.Utf8.length(str) == 3;
-haxe.Utf8.charCodeAt(str, 0) == 0x3042;
-haxe.Utf8.charCodeAt(str, 1) == 0x20000;
-haxe.Utf8.charCodeAt(str, 2) == 0x3044;
-var buf = new haxe.Utf8();
-buf.addChar(0x3042);
-buf.addChar(0x20000);
-buf.addChar(0x3044);
-buf.toString() == str;
-haxe.Utf8.compare(haxe.Utf8.sub(str, 0, 3), str) == 0;
-haxe.Utf8.compare(haxe.Utf8.sub(str, 0, 2), "あ𠀀") == 0;
-haxe.Utf8.compare(haxe.Utf8.sub(str, 1, 2), "𠀀い") == 0;
-haxe.Utf8.compare(haxe.Utf8.sub(str, 0, 0), "") == 0;
-haxe.Utf8.compare(haxe.Utf8.sub(str, 1, 0), "") == 0;
-haxe.Utf8.compare(haxe.Utf8.sub(str, 9, 0), "") == 0;
-#end
-
-
-// same tests with BMP chars (actually UCS2 compliance only)
-var str = "あéい";
-haxe.Utf8.length(str) == 3;
-haxe.Utf8.charCodeAt(str, 0) == 0x3042;
-haxe.Utf8.charCodeAt(str, 1) == 0xE9;
-haxe.Utf8.charCodeAt(str, 2) == 0x3044;
-var big = new haxe.Utf8(10);
-big.toString().length == 0;
-var buf = new haxe.Utf8();
-buf.addChar(0x3042);
-buf.addChar(0xE9);
-buf.addChar(0x3044);
-buf.toString() == str;
-haxe.Utf8.compare(haxe.Utf8.sub(str, 0, 3), str) == 0;
-haxe.Utf8.compare(haxe.Utf8.sub(str, 0, 2), "あé") == 0;
-haxe.Utf8.compare(haxe.Utf8.sub(str, 1, 2), "éい") == 0;
-haxe.Utf8.compare(haxe.Utf8.sub(str, 0, 0), "") == 0;
-haxe.Utf8.compare(haxe.Utf8.sub(str, 1, 0), "") == 0;
-
-// unspecify outside of range Utf8.sub
-// haxe.Utf8.compare(haxe.Utf8.sub(str, 9, 0), "") == 0;
-
-// #if (neko || php || cpp || lua || macro)
-// TODO neko, cpp, macro
-#if php
-haxe.Utf8.validate(haxe.io.Bytes.ofHex("f0a9b8bde38182c3ab61").toString()) == true;
-haxe.Utf8.validate(haxe.io.Bytes.ofHex("ed9fbf").toString()) == true;
-haxe.Utf8.validate(haxe.io.Bytes.ofHex("ee8080").toString()) == true;
-haxe.Utf8.validate(haxe.io.Bytes.ofHex("f48fbfbf").toString()) == true;
-haxe.Utf8.validate(haxe.io.Bytes.ofHex("f0a9b8bde381c3ab61").toString()) == false;
-haxe.Utf8.validate(haxe.io.Bytes.ofHex("c0af").toString()) == false; // redundant sequence
-haxe.Utf8.validate(haxe.io.Bytes.ofHex("eda080").toString()) == false; // surrogate byte sequence
-haxe.Utf8.validate(haxe.io.Bytes.ofHex("edbfbf").toString()) == false; // surrogate byte sequence
-haxe.Utf8.validate(haxe.io.Bytes.ofHex("f4908080").toString()) == false; // U+110000
-#end