Przeglądaj źródła

Unicode tests (#7009)

* started unicode unit tests

* more unicode tests and api changes

* turn off neko (will not be made unicode compatible)

* flash unicode support

* added polyfill for missing IE String.fromCodePoint

* make ucs2 with utf16 encoding the default for tests compliance

* [typer] allow the String module to have multiple module types

I think taking the closure of e.g. String's `fromCharCode` on JS causes a `_String_Impl` class to be created.

* [eval] disable unicode tests

* [java/cs] Update Bytes code to support the RawNative encoding

Also fix `String.fromCharCode` when the code point is a surrogate pair

* [unicode] move some code from HL to common

* [hxcpp] Some work to prepare for utf16 strings

* [hxcpp] Remove compiler warnings.

* [hxcpp] Allow hxcpp strings to be non-utf8

* [hxcpp] Use the same hash for utf8 and wide-char representations.  Remove assumptions about hxcpp string format from crypto.Sha

* [hxcpp] Do not assume utf8 on hxcpp.  Add optimization for Std.string in the case of passed String

* Use native utf8 encoding for hxcpp. Disable unicode tests for cpp without 'hxcpp_smart_strings' define

* Add hxcpp_smart_strings define to compile

* [cpp] remove unused variables

* Add some unicode indexOf tests

* [php] load the file with polyfills

* [php] Converted String to multibyte

* [php] fixed StringTools.fastCodeAt() for utf8

* [php] bytes io seems fixed

* [php] fixed xml.Parser for unicode

* [php] fixed haxe.JsonParser for unicode

* [php] php strings are binary-safe

* [php] fixed sha224 & sha256 tests

* [hxcpp] Export special cpp types as fsUnknown to cppia.  Change definition of cpp.Star to allow null setting.  Add cpp.Native class for some easier access to cpp.Star pointers.  Bump hxcpp_api_version to 400.

* [lua] refactor lib installation methods

* [lua] lib version adjustment

* update luarocks lib name

* [lua] Add Utf8 extern, use it for base String class

* [lua] use native string tools inside Bytes

* [lua] use fast byte decoding for utf8

* [lua] remove slice allocation inside of Bytes

* [lua] fix utf8 in xml

* [lua] fix stack overflow for byte decoding

* [lua] adjust offsets for byte encoding

* [lua] get rid of defunct Utf8 implementation

* [lua] remove hardcoded stack size limit

* [lua] set utf8 handling for related std string methods and tests

* [lua] Std.string checks for a userdata metatable and will use that if present

* [php] fixed JsonParser and xml Parser

* [lua] remove special utf8 handling logic from json/xml parser

* [lua] skip sha tests for now

* add more tests because I hate myself

* fix

* [eval] it passes!

* [eval] cleanup

* [eval] try to get substr/substring right and add some tests

* [eval] fix and test Reflect.compare

* [as3] make fromCodePoint public

* [eval] make (last)indexOf ECMA-compliant with regards to ""

see #5271

* [lua] use safe table method for decoding bytes to strings

* [lua] reformat NativeStringTools and remove non-existent charCodeAt function

* [lua] use NativeStringTools for byte management in Process

* [python] get tests to pass
Nicolas Cannasse 7 lat temu
rodzic
commit
03659011fc
73 zmienionych plików z 1584 dodań i 788 usunięć
  1. 66 0
      src/context/common.ml
  2. 1 1
      src/generators/gencs.ml
  3. 2 2
      src/generators/genjava.ml
  4. 2 1
      src/generators/genlua.ml
  5. 5 3
      src/generators/genphp7.ml
  6. 1 1
      src/generators/hl2c.ml
  7. 0 21
      src/generators/hlcode.ml
  8. 7 7
      src/generators/hlinterp.ml
  9. 4 7
      src/macro/eval/evalArray.ml
  10. 62 0
      src/macro/eval/evalBytes.ml
  11. 2 7
      src/macro/eval/evalContext.ml
  12. 5 5
      src/macro/eval/evalDebugCLI.ml
  13. 5 5
      src/macro/eval/evalDebugSocket.ml
  14. 4 4
      src/macro/eval/evalDecode.ml
  15. 5 4
      src/macro/eval/evalEmitter.ml
  16. 8 4
      src/macro/eval/evalEncode.ml
  17. 2 2
      src/macro/eval/evalExceptions.ml
  18. 2 2
      src/macro/eval/evalField.ml
  19. 1 1
      src/macro/eval/evalJit.ml
  20. 6 6
      src/macro/eval/evalMain.ml
  21. 16 7
      src/macro/eval/evalMisc.ml
  22. 53 37
      src/macro/eval/evalPrinting.ml
  23. 266 186
      src/macro/eval/evalStdLib.ml
  24. 235 0
      src/macro/eval/evalString.ml
  25. 38 5
      src/macro/eval/evalValue.ml
  26. 4 3
      src/typing/typer.ml
  27. 1 1
      std/StringTools.hx
  28. 3 2
      std/cs/internal/StringExt.hx
  29. 3 2
      std/eval/_std/haxe/io/Bytes.hx
  30. 1 1
      std/eval/_std/haxe/io/BytesBuffer.hx
  31. 9 0
      std/flash/Boot.hx
  32. 43 0
      std/flash/_std/String.hx
  33. 2 2
      std/haxe/format/JsonParser.hx
  34. 56 17
      std/haxe/io/Bytes.hx
  35. 3 3
      std/haxe/io/BytesBuffer.hx
  36. 2 2
      std/haxe/io/BytesInput.hx
  37. 5 2
      std/haxe/io/BytesOutput.hx
  38. 33 0
      std/haxe/io/Encoding.hx
  39. 2 2
      std/haxe/io/Input.hx
  40. 2 2
      std/haxe/io/Output.hx
  41. 1 1
      std/haxe/xml/Parser.hx
  42. 12 6
      std/hl/_std/haxe/io/Bytes.hx
  43. 2 2
      std/hl/_std/haxe/io/BytesBuffer.hx
  44. 1 0
      std/java/_std/String.hx
  45. 2 5
      std/java/internal/StringExt.hx
  46. 8 1
      std/js/_std/String.hx
  47. 37 21
      std/js/_std/haxe/io/Bytes.hx
  48. 11 10
      std/js/_std/haxe/io/BytesBuffer.hx
  49. 11 1
      std/lua/Boot.hx
  50. 36 41
      std/lua/NativeStringTools.hx
  51. 14 17
      std/lua/_std/String.hx
  52. 0 250
      std/lua/_std/haxe/Utf8.hx
  53. 6 5
      std/lua/_std/sys/io/Process.hx
  54. 117 0
      std/lua/lib/luautf8/Utf8.hx
  55. 35 26
      std/php/Boot.hx
  56. 42 0
      std/php/Global.hx
  57. 43 0
      std/php/_polyfills.php
  58. 6 10
      std/php/_std/String.hx
  59. 1 1
      std/php/_std/StringTools.hx
  60. 4 3
      std/php/_std/haxe/io/Bytes.hx
  61. 5 5
      std/php/_std/haxe/io/BytesBuffer.hx
  62. 2 1
      std/python/_std/sys/io/FileInput.hx
  63. 2 1
      std/python/_std/sys/io/FileOutput.hx
  64. 2 1
      std/python/io/IInput.hx
  65. 2 1
      std/python/io/IOutput.hx
  66. 1 1
      tests/optimization/src/issues/Issue6015.hx
  67. 16 18
      tests/runci/targets/Lua.hx
  68. 1 0
      tests/unit/compile-cpp.hxml
  69. 1 0
      tests/unit/compile-cppia-host.hxml
  70. 1 1
      tests/unit/src/unit/TestResource.hx
  71. 193 0
      tests/unit/src/unitstd/Unicode.unit.hx
  72. 1 1
      tests/unit/src/unitstd/haxe/Utf8.unit.hx
  73. 1 1
      tests/unit/src/unitstd/haxe/crypto/Hmac.unit.hx

+ 66 - 0
src/context/common.ml

@@ -604,6 +604,72 @@ let url_encode_s s =
 	url_encode s (Buffer.add_char b);
 	Buffer.contents b
 
+(* UTF8 *)
+
+let to_utf8 str p = (* returns (UTF-8 text, its length in UTF-16 code units); [p] is the position handed to [abort] *)
+	let u8 = try
+		UTF8.validate str;
+		str;
+	with
+		UTF8.Malformed_code ->
+			(* ISO to utf8: input that is not valid UTF-8 is re-encoded one byte = one code point *)
+			let b = UTF8.Buf.create 0 in
+			String.iter (fun c -> UTF8.Buf.add_char b (UChar.of_char c)) str;
+			UTF8.Buf.contents b
+	in
+	let ccount = ref 0 in
+	UTF8.iter (fun c ->
+		let c = UChar.code c in
+		if (c >= 0xD800 && c <= 0xDFFF) || c >= 0x110000 then abort "Invalid unicode char" p;
+		incr ccount;
+		if c >= 0x10000 then incr ccount; (* U+10000 itself already needs a surrogate pair, i.e. two UTF-16 units *)
+	) u8;
+	u8, !ccount
+
+let utf16_add buf c = (* appends code point [c] to [buf] as UTF-16LE; raises Failure for surrogates, negative and too-large values *)
+	let add c =
+		Buffer.add_char buf (char_of_int (c land 0xFF)); (* low byte first: little-endian *)
+		Buffer.add_char buf (char_of_int (c lsr 8));
+	in
+	if c >= 0 && c < 0x10000 then begin
+		if c >= 0xD800 && c <= 0xDFFF then failwith ("Invalid unicode char " ^ string_of_int c);
+		add c;
+	end else if c >= 0x10000 && c < 0x110000 then begin (* lower bound keeps negative values out of the surrogate-pair branch *)
+		let c = c - 0x10000 in
+		add ((c asr 10) + 0xD800); (* high surrogate: top 10 bits *)
+		add ((c land 1023) + 0xDC00); (* low surrogate: bottom 10 bits *)
+	end else
+		failwith ("Invalid unicode char " ^ string_of_int c)
+
+let utf8_to_utf16 str zt = (* re-encodes UTF-8 [str] through utf16_add; when [zt] is true a zero code unit is appended *)
+	let b = Buffer.create (String.length str * 2) in
+	(try UTF8.iter (fun c -> utf16_add b (UChar.code c)) str with Invalid_argument _ | UChar.Out_of_range -> ()); (* if malformed: conversion stops there and what was already written to [b] is kept; a Failure from utf16_add is not caught here *)
+	if zt then utf16_add b 0;
+	Buffer.contents b
+
+let utf16_to_utf8 str = (* decodes UTF-16LE [str], surrogate pairs included, and returns the text as UTF-8 *)
+	let b = Buffer.create (String.length str) in
+	let add c = Buffer.add_char b (char_of_int (c land 0xFF)) in
+	let get i = if i < String.length str then int_of_char (String.unsafe_get str i) else 0 in (* bounds-checked: bytes past the end read as 0 *)
+	let word i = (get i) lor ((get (i + 1)) lsl 8) in (* 16-bit little-endian code unit at byte offset i *)
+	let cont c shift = add (0x80 lor ((c lsr shift) land 0x3F)) in (* UTF-8 continuation byte carrying 6 bits of c *)
+	let encode c =
+		if c < 0x80 then add c
+		else if c < 0x800 then (add (0xC0 lor (c lsr 6)); cont c 0)
+		else if c < 0x10000 then (add (0xE0 lor (c lsr 12)); cont c 6; cont c 0) (* unpaired surrogates also land here and are encoded as-is *)
+		else (add (0xF0 lor (c lsr 18)); cont c 12; cont c 6; cont c 0)
+	in
+	let rec loop i =
+		if i < String.length str then begin
+			let hi = word i and lo = word (i + 2) in
+			let pair = (hi land 0xFC00 = 0xD800) && (lo land 0xFC00 = 0xDC00) in (* high surrogate followed by low surrogate *)
+			encode (if pair then 0x10000 + ((hi land 0x3FF) lsl 10) + (lo land 0x3FF) else hi);
+			loop (if pair then i + 4 else i + 2)
+		end
+	in
+	loop 0;
+	Buffer.contents b
+
 let add_diagnostics_message com s p sev =
 	let di = com.shared.shared_display_information in
 	di.diagnostics_messages <- (s,p,sev) :: di.diagnostics_messages

+ 1 - 1
src/generators/gencs.ml

@@ -382,7 +382,7 @@ struct
 					{ e with eexpr = TField(run ef, FDynamic "ToUpperInvariant") }
 
 				| TCall( { eexpr = TField(_, FStatic({ cl_path = [], "String" }, { cf_name = "fromCharCode" })) }, [cc] ) ->
-					{ e with eexpr = TNew(get_cl_from_t basic.tstring, [], [mk_cast tchar (run cc); make_int gen.gcon.basic 1 cc.epos]) }
+					{ e with eexpr = TCall(mk_static_field_access_infer string_ext "fromCharCode" e.epos [], [run cc]) }
 				| TCall( { eexpr = TField(ef, FInstance({ cl_path = [], "String" }, _, { cf_name = ("charAt" as field) })) }, args )
 				| TCall( { eexpr = TField(ef, FInstance({ cl_path = [], "String" }, _, { cf_name = ("charCodeAt" as field) })) }, args )
 				| TCall( { eexpr = TField(ef, FInstance({ cl_path = [], "String" }, _, { cf_name = ("indexOf" as field) })) }, args )

+ 2 - 2
src/generators/genjava.ml

@@ -1548,9 +1548,9 @@ let generate con =
 					) 0 el);
 					write w "}"
 				| TCall( ( { eexpr = TField(_, FStatic({ cl_path = ([], "String") }, { cf_name = "fromCharCode" })) } ), [cc] ) ->
-						write w "Character.toString((char) ";
+						write w "new java.lang.String( java.lang.Character.toChars((int) ";
 						expr_s w cc;
-						write w ")"
+						write w ") )"
 				| TCall ({ eexpr = TIdent "__is__" }, [ expr; { eexpr = TTypeExpr(md) } ] ) ->
 					write w "( ";
 					expr_s w expr;

+ 2 - 1
src/generators/genlua.ml

@@ -647,8 +647,9 @@ and gen_expr ?(local=true) ctx e = begin
         gen_value ctx x;
         print ctx "[1]"
     | TField (e, ef) when is_string_expr e && field_name ef = "length"->
-        spr ctx "#";
+        spr ctx "__lua_lib_luautf8_Utf8.len(";
         gen_value ctx e;
+        spr ctx ")";
     | TField (e, ef) when is_possible_string_field e (field_name ef)  ->
         add_feature ctx "use._hx_wrap_if_string_field";
         add_feature ctx "use.string";

+ 5 - 3
src/generators/genphp7.ml

@@ -2201,9 +2201,9 @@ class code_writer (ctx:Common.context) hx_type_path php_name =
 		method write_expr_field expr access =
 			match access with
 				| FInstance ({ cl_path = [], "String"}, _, { cf_name = "length"; cf_kind = Var _ }) ->
-					self#write "strlen(";
+					self#write "mb_strlen(";
 					self#write_expr expr;
-					self#write ")"
+					self#write ", 'UTF-8')"
 				| FInstance (_, _, field) -> self#write_expr_for_field_access expr "->" (field_name field)
 				| FStatic (_, ({ cf_kind = Var _ } as field)) ->
 					(match (reveal_expr expr).eexpr with
@@ -2254,7 +2254,9 @@ class code_writer (ctx:Common.context) hx_type_path php_name =
 					self#write "()"
 				end
 		(**
-			Writes field access on Dynamic expression to output buffer
+			Writes field access on Dynamic expression to output buffer.
+			Returns `true` if the requested field most likely belongs to String (in which case field resolution is handled at runtime).
+			Otherwise returns `false`
 		*)
 		method write_expr_field_if_string expr field_name =
 			(* Special case for String fields *)

+ 1 - 1
src/generators/hl2c.ml

@@ -1215,7 +1215,7 @@ let write_c com file (code:code) =
 				sexpr "}";
 			end
 		end else if String.length str >= string_data_limit then
-			let s = utf8_to_utf16 str in
+			let s = Common.utf8_to_utf16 str true in
 			sline "// %s..." (String.escaped (String.sub str 0 (string_data_limit-4)));
 			output ctx (Printf.sprintf "vbyte string$%d[] = {" i);
 			output_bytes (output ctx) s;

+ 0 - 21
src/generators/hlcode.ml

@@ -335,27 +335,6 @@ let hl_hash b =
 	in
 	loop 0
 
-let utf16_add buf c =
-	let add c =
-		Buffer.add_char buf (char_of_int (c land 0xFF));
-		Buffer.add_char buf (char_of_int (c lsr 8));
-	in
-	if c >= 0 && c < 0x10000 then begin
-		if c >= 0xD800 && c <= 0xDFFF then failwith ("Invalid unicode char " ^ string_of_int c);
-		add c;
-	end else if c < 0x110000 then begin
-		let c = c - 0x10000 in
-		add ((c asr 10) + 0xD800);
-		add ((c land 1023) + 0xDC00);
-	end else
-		failwith ("Invalid unicode char " ^ string_of_int c)
-
-let utf8_to_utf16 str =
-	let b = Buffer.create (String.length str * 2) in
-	(try UTF8.iter (fun c -> utf16_add b (UChar.code c)) str with Invalid_argument _ | UChar.Out_of_range -> ()); (* if malformed *)
-	utf16_add b 0;
-	Buffer.contents b
-
 let rec get_index name p =
 	try
 		PMap.find name p.pindex

+ 7 - 7
src/generators/hlinterp.ml

@@ -286,7 +286,7 @@ let fstr = function
 	| FFun f -> "function@" ^ string_of_int f.findex
 	| FNativeFun (s,_,_) -> "native[" ^ s ^ "]"
 
-let caml_to_hl str = utf8_to_utf16 str
+let caml_to_hl str = Common.utf8_to_utf16 str true
 
 let hash ctx str =
 	let h = hl_hash str in
@@ -311,7 +311,7 @@ let utf16_iter f s =
 	loop 0
 
 let utf16_char buf c =
-	utf16_add buf (int_of_char c)
+	Common.utf16_add buf (int_of_char c)
 
 let hl_to_caml str =
 	let utf16_eof s =
@@ -1763,9 +1763,9 @@ let load_native ctx lib name t =
 						if c >= int_of_char 'a' && c <= int_of_char 'z' then c + int_of_char 'A' - int_of_char 'a'
 						else c
 					in
-					utf16_add buf c
+					Common.utf16_add buf c
 				) (String.sub s (int pos) ((int len) lsl 1));
-				utf16_add buf 0;
+				Common.utf16_add buf 0;
 				VBytes (Buffer.contents buf)
 			| _ -> assert false)
 		| "ucs2_lower" ->
@@ -1777,9 +1777,9 @@ let load_native ctx lib name t =
 						if c >= int_of_char 'A' && c <= int_of_char 'Z' then c + int_of_char 'a' - int_of_char 'A'
 						else c
 					in
-					utf16_add buf c
+					Common.utf16_add buf c
 				) (String.sub s (int pos) ((int len) lsl 1));
-				utf16_add buf 0;
+				Common.utf16_add buf 0;
 				VBytes (Buffer.contents buf)
 			| _ -> assert false)
 		| "url_encode" ->
@@ -1788,7 +1788,7 @@ let load_native ctx lib name t =
 				let s = hl_to_caml s in
 				let buf = Buffer.create 0 in
 				Common.url_encode s (utf16_char buf);
-				utf16_add buf 0;
+				Common.utf16_add buf 0;
 				let str = Buffer.contents buf in
 				set_ref r (to_int (String.length str lsr 1 - 1));
 				VBytes str

+ 4 - 7
src/macro/eval/evalArray.ml

@@ -19,6 +19,7 @@
 
 open Globals
 open EvalValue
+open EvalString
 
 let create values = {
 	avalues = values;
@@ -26,13 +27,9 @@ let create values = {
 }
 
 let array_join a f sep =
-	let buf = Rope.Buffer.create 0 in
-	let last = Array.length a - 1 in
-	Array.iteri (fun i v ->
-		Rope.Buffer.add_rope buf (f v);
-		if i <> last then Rope.Buffer.add_rope buf sep;
-	) a;
-	Rope.Buffer.contents buf
+	let l = Array.map f a in
+	let l = Array.to_list l in
+	join sep l
 
 let to_list a = Array.to_list (Array.sub a.avalues 0 a.alength)
 

+ 62 - 0
src/macro/eval/evalBytes.ml

@@ -0,0 +1,62 @@
+(*
+	The Haxe Compiler
+	Copyright (C) 2005-2018  Haxe Foundation
+
+	This program is free software; you can redistribute it and/or
+	modify it under the terms of the GNU General Public License
+	as published by the Free Software Foundation; either version 2
+	of the License, or (at your option) any later version.
+
+	This program is distributed in the hope that it will be useful,
+	but WITHOUT ANY WARRANTY; without even the implied warranty of
+	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+	GNU General Public License for more details.
+
+	You should have received a copy of the GNU General Public License
+	along with this program; if not, write to the Free Software
+	Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *)
+
+let read_byte this i = int_of_char (Bytes.get this i) (* byte at offset [i] of [this] as an int in 0..255; Bytes.get raises Invalid_argument when [i] is out of bounds *)
+
+let read_ui16 this i = (* unsigned 16-bit little-endian read at byte offset [i] of [this] *)
+	let ch1 = read_byte this i in
+	let ch2 = read_byte this (i + 1) in
+	ch1 lor (ch2 lsl 8) (* ch1 is the low byte, ch2 the high byte *)
+
+let read_i32 this i = (* 32-bit little-endian read at byte offset [i] of [this], returned as an Int32 *)
+	let ch1 = read_byte this i in
+	let ch2 = read_byte this (i + 1) in
+	let ch3 = read_byte this (i + 2) in
+	let base = Int32.of_int (ch1 lor (ch2 lsl 8) lor (ch3 lsl 16)) in (* low 24 bits *)
+	let big = Int32.shift_left (Int32.of_int (read_byte this (i + 3))) 24 in (* fourth byte, shifted as an Int32 into bits 24..31 *)
+	Int32.logor base big
+
+let read_i64 this i = (* 64-bit little-endian read at byte offset [i] of [this], returned as an Int64 *)
+	let ch1 = read_byte this i in
+	let ch2 = read_byte this (i + 1) in
+	let ch3 = read_byte this (i + 2) in
+	let ch4 = read_byte this (i + 3) in
+	let base = Int64.of_int (ch1 lor (ch2 lsl 8) lor (ch3 lsl 16)) in (* bytes 0..2 *)
+	let small = Int64.logor base (Int64.shift_left (Int64.of_int ch4) 24) in (* low 32 bits, assembled as an Int64 so byte 3 is not sign-extended *)
+	let big = Int64.of_int32 (read_i32 this (i + 4)) in (* high 32 bits, moved into place by the shift below *)
+	Int64.logor (Int64.shift_left big 32) small
+
+let write_byte this i v = (* stores the low 8 bits of [v] at offset [i] of [this] *)
+	Bytes.set this i (Char.unsafe_chr (v land 0xFF)) (* mask first: Char.unsafe_chr does no range check and callers pass unmasked, shifted values *)
+
+let write_ui16 this i v = (* writes [v] as two bytes at offset [i] of [this], low byte first *)
+	write_byte this i v; (* [v] is passed unmasked; presumably write_byte keeps only the low 8 bits - confirm *)
+	write_byte this (i + 1) (v lsr 8)
+
+let write_i32 this i v = (* writes the Int32 [v] as four bytes at offset [i] of [this], least significant byte first *)
+	let base = Int32.to_int v in
+	let big = Int32.to_int (Int32.shift_right_logical v 24) in (* top byte, taken with a logical shift so the result is 0..255 *)
+	write_byte this i base; (* [base] and its shifts are passed unmasked; presumably write_byte keeps only the low 8 bits - confirm *)
+	write_byte this (i + 1) (base lsr 8);
+	write_byte this (i + 2) (base lsr 16);
+	write_byte this (i + 3) big
+
+let write_i64 this i v = (* writes the Int64 [v] as eight bytes at offset [i] of [this]: low 32-bit half first, then the high half *)
+	write_i32 this i (Int64.to_int32 v);
+	write_i32 this (i + 4) (Int64.to_int32 (Int64.shift_right_logical v 32))

+ 2 - 7
src/macro/eval/evalContext.ml

@@ -21,6 +21,7 @@ open Globals
 open Type
 open EvalValue
 open EvalHash
+open EvalString
 
 type var_info = string
 
@@ -166,12 +167,6 @@ let rec kind_name ctx kind =
 	in
 	loop kind ctx.environment_offset
 
-let vstring s =
-	VString (s,lazy (Rope.to_string s))
-
-let vstring_direct (r,s) =
-	VString(r,s)
-
 let call_function f vl = f vl
 
 let object_fields o =
@@ -210,7 +205,7 @@ let throw v p =
 
 let exc v = throw v null_pos
 
-let exc_string str = exc (vstring (Rope.of_string str))
+let exc_string str = exc (vstring (EvalString.create_ascii str))
 
 let error_message = exc_string
 

+ 5 - 5
src/macro/eval/evalDebugCLI.ml

@@ -43,13 +43,13 @@ let value_string value =
 		| VFalse -> "Bool","false"
 		| VInt32 i -> "Int",Int32.to_string i
 		| VFloat f -> "Float",string_of_float f
-		| VEnumValue ev -> rev_hash_s ev.epath,Rope.to_string (s_enum_value 0 ev)
+		| VEnumValue ev -> rev_hash_s ev.epath,EvalString.get (s_enum_value 0 ev)
 		| VObject o -> "Anonymous",fields_string (depth + 1) (object_fields o)
-		| VString(_,s) -> "String","\"" ^ (Ast.s_escape (Lazy.force s)) ^ "\""
-		| VArray va -> "Array",Rope.to_string (s_array (depth + 1) va)
-		| VVector vv -> "Vector",Rope.to_string (s_vector (depth + 1) vv)
+		| VString s -> "String","\"" ^ (Ast.s_escape (Lazy.force s.sstring)) ^ "\""
+		| VArray va -> "Array",EvalString.get (s_array (depth + 1) va)
+		| VVector vv -> "Vector",EvalString.get (s_vector (depth + 1) vv)
 		| VInstance vi -> rev_hash_s vi.iproto.ppath,instance_fields (depth + 1) vi
-		| VPrototype proto -> "Anonymous",Rope.to_string (s_proto_kind proto)
+		| VPrototype proto -> "Anonymous",EvalString.get (s_proto_kind proto)
 		| VFunction _ | VFieldClosure _ -> "Function","fun"
 		| VLazy f -> value_string depth (!f())
 	in

+ 5 - 5
src/macro/eval/evalDebugSocket.ml

@@ -32,10 +32,10 @@ let var_to_json name value access =
 				| vl -> name ^ "(...)"
 			end
 		| VObject o -> "{...}"
-		| VString(_,s) -> string_repr s
+		| VString s -> string_repr s.sstring
 		| VArray _ | VVector _ -> "[...]"
 		| VInstance vi -> (rev_hash_s vi.iproto.ppath) ^ " {...}"
-		| VPrototype proto -> Rope.to_string (s_proto_kind proto)
+		| VPrototype proto -> EvalString.get (s_proto_kind proto)
 		| VFunction _ | VFieldClosure _ -> "<fun>"
 		| VLazy f -> level2_value_repr (!f())
 	in
@@ -65,13 +65,13 @@ let var_to_json name value access =
 			in
 			jv type_s value_s is_structured
 		| VObject o -> jv "Anonymous" (fields_string (object_fields o)) true (* TODO: false for empty structures *)
-		| VString(_,s) -> jv "String" (string_repr s) false
+		| VString s -> jv "String" (string_repr s.sstring) false
 		| VArray va -> jv "Array" (array_elems (EvalArray.to_list va)) true (* TODO: false for empty arrays *)
 		| VVector vv -> jv "Vector" (array_elems (Array.to_list vv)) true
 		| VInstance vi ->
 			let class_name = rev_hash_s vi.iproto.ppath in
 			jv class_name (class_name ^ " " ^ (fields_string (instance_fields vi))) true
-		| VPrototype proto -> jv "Anonymous" (Rope.to_string (s_proto_kind proto)) false (* TODO: show statics *)
+		| VPrototype proto -> jv "Anonymous" (EvalString.get (s_proto_kind proto)) false (* TODO: show statics *)
 		| VFunction _ | VFieldClosure _ -> jv "Function" "<fun>" false
 		| VLazy f -> value_string (!f())
 	in
@@ -176,7 +176,7 @@ let output_inner_vars v access =
 				let a = access ^ "." ^ n in
 				n, v, a
 			) fields
-		| VString(_,s) -> []
+		| VString _ -> []
 		| VArray va ->
 			let l = EvalArray.to_list va in
 			List.mapi (fun i v ->

+ 4 - 4
src/macro/eval/evalDecode.ml

@@ -50,15 +50,15 @@ let decode_varray v = match v with
 	| _ -> unexpected_value v "array"
 
 let decode_string v = match v with
-	| VString(r,s) -> Lazy.force s
+	| VString s -> EvalString.get s
 	| _ -> unexpected_value v "string"
 
 let decode_rope v = match v with
-	| VString(s,_) -> s
+	| VString s -> s.srope
 	| _ -> unexpected_value v "string"
 
-let decode_rope_string v = match v with
-	| VString(r,s) -> r,s
+let decode_vstring v = match v with
+	| VString s -> s
 	| _ -> unexpected_value v "string"
 
 let decode_bytes v = match v with

+ 5 - 4
src/macro/eval/evalEmitter.ml

@@ -321,7 +321,7 @@ let emit_proto_field_read proto i env =
 
 let emit_instance_field_read exec i env = match exec env with
 	| VInstance vi -> vi.ifields.(i)
-	| VString(_,s) -> vint (String.length (Lazy.force s))
+	| VString s -> vint (s.slength)
 	| v -> unexpected_value v "instance"
 
 let emit_field_closure exec name env =
@@ -363,10 +363,11 @@ let emit_enum_parameter_read exec i env = match exec env with
 	| v1 -> unexpected_value v1 "enum value"
 
 let emit_string_cca exec1 exec2 p env =
-	let s = decode_string (exec1 env) in
+	let s = decode_vstring (exec1 env) in
 	let index = decode_int_p (exec2 env) p in
-	if index >= String.length s then vnull
-	else vint (int_of_char s.[index])
+	if index < 0 || index >= s.slength then vnull
+	else if s.sascii then vint (int_of_char (String.get (Lazy.force s.sstring) index))
+	else vint (EvalString.read_char s (index lsl 1))
 
 (* Write *)
 

+ 8 - 4
src/macro/eval/evalEncode.ml

@@ -22,6 +22,7 @@ open EvalValue
 open EvalExceptions
 open EvalContext
 open EvalHash
+open EvalString
 
 (* Functions *)
 
@@ -191,10 +192,13 @@ let encode_array l =
 	encode_array_instance (EvalArray.create (Array.of_list l))
 
 let encode_string s =
-	VString(Rope.of_string s,lazy s)
+	vstring (create_ascii s)
 
-let encode_rope s =
-	vstring s
+let encode_rope r =
+	vstring (create_ascii_of_rope r)
+
+let encode_rope_ucs2 r length =
+	vstring (create_ucs2_of_rope r length)
 
 let encode_bytes s =
 	encode_instance key_haxe_io_Bytes ~kind:(IBytes s)
@@ -210,7 +214,7 @@ let encode_object_map_direct h =
 
 let encode_string_map convert m =
 	let h = StringHashtbl.create 0 in
-	PMap.iter (fun key value -> StringHashtbl.add h (Rope.of_string key,lazy key) (convert value)) m;
+	PMap.iter (fun key value -> StringHashtbl.add h (create_ascii key) (convert value)) m;
 	encode_string_map_direct h
 
 let fake_proto path =

+ 2 - 2
src/macro/eval/evalExceptions.ml

@@ -127,8 +127,8 @@ let catch_exceptions ctx ?(final=(fun() -> ())) f p =
 			get_ctx_ref := prev;
 			final();
 			match v1,v2 with
-				| VString(_,s),VInstance {ikind = IPos p} ->
-					raise (Error.Error (Error.Custom (Lazy.force s),p))
+				| VString s,VInstance {ikind = IPos p} ->
+					raise (Error.Error (Error.Custom (EvalString.get s),p))
 				| _ ->
 					Error.error "Something went wrong" null_pos
 		end else begin

+ 2 - 2
src/macro/eval/evalField.ml

@@ -55,8 +55,8 @@ let field_raise v f =
 	| VVector vv ->
 		if f = key_length then vint (Array.length vv)
 		else proto_field_direct (get_ctx()).vector_prototype f
-	| VString (_,s) ->
-		if f = key_length then vint (String.length (Lazy.force s))
+	| VString s ->
+		if f = key_length then vint (s.slength)
 		else proto_field_direct (get_ctx()).string_prototype f
 	| VInstance vi -> (try instance_field vi f with Not_found -> proto_field_raise vi.iproto f)
 	| _ -> raise Not_found

+ 1 - 1
src/macro/eval/evalJit.ml

@@ -36,7 +36,7 @@ let rope_path t = match follow t with
 let eone = mk (TConst(TInt (Int32.one))) t_dynamic null_pos
 
 let eval_const = function
-	| TString s -> vstring (Rope.of_string s)
+	| TString s -> EvalString.bytes_to_utf8 (Bytes.unsafe_of_string s)
 	| TInt i32 -> vint32 i32
 	| TFloat f -> vfloat (float_of_string f)
 	| TBool b -> vbool b

+ 6 - 6
src/macro/eval/evalMain.ml

@@ -226,13 +226,13 @@ let value_signature v =
 		| VInstance {ikind = IDate f} ->
 			cache v (fun () ->
 				addc 'v';
-				add (Rope.to_string (s_date f))
+				add (EvalString.get (s_date f))
 			)
 		| VInstance {ikind = IStringMap map} ->
 			cache v (fun() ->
 				addc 'b';
-				StringHashtbl.iter (fun (_,s) value ->
-					adds (Lazy.force s);
+				StringHashtbl.iter (fun s value ->
+					adds (Lazy.force s.sstring);
 					loop value
 				) map;
 				addc 'h'
@@ -278,8 +278,8 @@ let value_signature v =
 				loop_fields fields;
 				addc 'g';
 			)
-		| VString(_,s) ->
-			adds (Lazy.force s)
+		| VString s ->
+			adds (Lazy.force s.sstring)
 		| VArray {avalues = a} | VVector a ->
 			cache v (fun () ->
 				addc 'a';
@@ -399,7 +399,7 @@ let rec value_to_expr v p =
 	| VFalse -> (EConst (Ident "false"),p)
 	| VInt32 i -> (EConst (Int (Int32.to_string i)),p)
 	| VFloat f -> haxe_float f p
-	| VString(r,s) -> (EConst (String (Lazy.force s)),p)
+	| VString s -> (EConst (String (Lazy.force s.sstring)),p)
 	| VArray va -> (EArrayDecl (List.map (fun v -> value_to_expr v p) (EvalArray.to_list va)),p)
 	| VObject o -> (EObjectDecl (List.map (fun (k,v) ->
 			let n = rev_hash_s k in

+ 16 - 7
src/macro/eval/evalMisc.ml

@@ -27,6 +27,7 @@ open EvalDecode
 open EvalExceptions
 open EvalPrinting
 open EvalHash
+open EvalString
 
 let throw_string s p =
 	throw (encode_string s) p
@@ -96,8 +97,16 @@ let rec compare a b =
 	| VTrue,VTrue | VFalse,VFalse -> CEq
 	| VFalse,VTrue -> CInf
 	| VTrue,VFalse -> CSup
-	| VString(_,s1),VString(_,s2) ->
-		let r = String.compare (Lazy.force s1) (Lazy.force s2) in
+	| VString s1,VString s2 ->
+		let s1' = Lazy.force s1.sstring in
+		let s2' = Lazy.force s2.sstring in
+		let s1,s2 = match s1.sascii,s2.sascii with
+		| true,true
+		| false,false -> s1',s2'
+		| true,false -> extend_ascii s1',s2'
+		| false,true -> s1',extend_ascii s2'
+		in
+		let r = String.compare s1 s2 in
 		if r = 0 then CEq else if r < 0 then CInf else CSup
 	| VFunction(a,_), VFunction(b,_) -> if a == b then CEq else CUndef
 	| VArray va1,VArray va2 -> if va1 == va2 then CEq else CUndef
@@ -139,7 +148,7 @@ and equals_structurally a b =
 	| VFloat a,VFloat b -> a = b
 	| VFloat a,VInt32 b -> a = (Int32.to_float b)
 	| VInt32 a,VFloat b -> (Int32.to_float a) = b
-	| VString(_,s1),VString(_,s2) -> Lazy.force s1 = Lazy.force s2
+	| VString s1,VString s2 -> Lazy.force s1.sstring = Lazy.force s2.sstring (* STODO *)
 	| VArray a,VArray b -> a == b || arrays_equal equals_structurally a.avalues b.avalues
 	| VVector a,VVector b -> a == b || arrays_equal equals_structurally a b
 	| VObject a,VObject b -> a == b || arrays_equal equals_structurally a.ofields b.ofields && IntMap.equal equals_structurally a.oextra b.oextra
@@ -157,10 +166,10 @@ let op_add v1 v2 = match v1,v2 with
 	| VInt32 i1,VInt32 i2 -> vint32 (Int32.add i1 i2)
 	| VFloat f1,VFloat f2 -> vfloat (f1 +. f2)
 	| VInt32 i,VFloat f | VFloat f,VInt32 i -> vfloat ((Int32.to_float i) +. f)
-	| VString(s1,_),VString(s2,_) -> encode_rope (Rope.concat2 s1 s2)
-	| VString(s1,_),v2 -> encode_rope (Rope.concat2 s1 (s_value 0 v2))
-	| v1,VString(s2,_) -> encode_rope (Rope.concat2 (s_value 0 v1) s2)
-	| v1,v2 -> encode_rope (Rope.concat2 (s_value 0 v1) (s_value 0 v2))
+	| VString s1,VString s2 -> vstring (concat s1 s2)
+	| VString s1,v2 -> vstring (concat s1 (s_value 0 v2))
+	| v1,VString s2 -> vstring (concat (s_value 0 v1) s2)
+	| v1,v2 -> vstring (concat (s_value 0 v1) (s_value 0 v2))
 
 let op_mult p v1 v2 = match v1,v2 with
 	| VInt32 i1,VInt32 i2 -> vint32 (Int32.mul i1 i2)

+ 53 - 37
src/macro/eval/evalPrinting.ml

@@ -23,42 +23,56 @@ open EvalValue
 open EvalContext
 open EvalField
 open EvalHash
-
 open Rope
-
-let rnull = of_string "null"
-let rcomma = of_char ','
-let rtrue = of_string "true"
-let rfalse = of_string "false"
-let rfun = of_string "#fun"
-let rclosure = of_string "#closure"
+open EvalString
+
+let rempty = create_ascii ""
+let rbropen = create_ascii "{"
+let rbrclose = create_ascii "}"
+let rbkopen = create_ascii "["
+let rbkclose = create_ascii "]"
+let rpopen = create_ascii "("
+let rpclose = create_ascii ")"
+let rcolon = create_ascii ":"
+let rgt = create_ascii ">"
+let rstop = create_ascii "<...>"
+let rnull = create_ascii "null"
+let rcomma = create_ascii ","
+let rtrue = create_ascii "true"
+let rfalse = create_ascii "false"
+let rfun = create_ascii "#fun"
+let rclosure = create_ascii "#closure"
 
 let s_date d =
 	let open Unix in
 	let t = localtime d in
-	of_string (Printf.sprintf "%.4d-%.2d-%.2d %.2d:%.2d:%.2d" (t.tm_year + 1900) (t.tm_mon + 1) t.tm_mday t.tm_hour t.tm_min t.tm_sec)
+	create_ascii (Printf.sprintf "%.4d-%.2d-%.2d %.2d:%.2d:%.2d" (t.tm_year + 1900) (t.tm_mon + 1) t.tm_mday t.tm_hour t.tm_min t.tm_sec)
+
+let s_hash key = create_ascii_of_rope (EvalHash.rev_hash key)
 
 let rec s_object depth o =
 	let fields = object_fields o in
-	let fields = List.map (fun (key,value) -> (concat empty [EvalHash.rev_hash key; of_string ": "; s_value depth value])) fields in
-	concat empty [
-		of_char '{';
-		concat rcomma fields;
-		of_char '}'
-	]
+	let s,_ = List.fold_left (fun (s,sep) (key,value) ->
+		let s = concat s sep in
+		let s = concat s (s_hash key) in
+		let s = concat s rcolon in
+		let s = concat s (s_value depth value) in
+		(s,rcomma)
+	) (rempty,rbropen) fields in
+	concat s rbrclose
 
 and s_array depth va =
-	concat empty [
-		of_char '[';
+	join rempty [
+		rbkopen;
 		EvalArray.join va (s_value depth) rcomma;
-		of_char ']';
+		rbkclose;
 	]
 
 and s_vector depth vv =
-	concat empty [
-		of_char '[';
+	join rempty [
+		rbkopen;
 		EvalArray.join (EvalArray.create vv) (s_value depth) rcomma;
-		of_char ']';
+		rbkclose;
 	]
 
 and s_enum_ctor_name ve =
@@ -72,18 +86,18 @@ and s_enum_ctor_name ve =
 and s_enum_value depth ve =
 	let name = s_enum_ctor_name ve in
 	match ve.eargs with
-	| [||] -> of_string name
+	| [||] -> create_ascii name
 	| vl ->
-		concat empty [
-			of_string name;
-			of_char '(';
-			concat rcomma (Array.to_list (Array.map (s_value (depth + 1)) vl));
-			of_char ')'
+		join rempty [
+			create_ascii name;
+			rpopen;
+			join rcomma (Array.to_list (Array.map (s_value (depth + 1)) vl));
+			rpclose;
 		]
 
 and s_proto_kind proto = match proto.pkind with
-	| PClass _ -> concat empty [of_string "Class<"; rev_hash proto.ppath; of_char '>']
-	| PEnum _ -> concat empty [of_string "Enum<"; rev_hash proto.ppath; of_char '>']
+	| PClass _ -> join rempty [create_ascii "Class<"; s_hash proto.ppath; rgt]
+	| PEnum _ -> join rempty [create_ascii "Enum<"; s_hash proto.ppath; rgt]
 	| PInstance | PObject -> assert false
 
 and s_value depth v =
@@ -91,25 +105,25 @@ and s_value depth v =
 		let vf = field_raise v EvalHash.key_toString in
 		s_value (depth + 1) (call_value_on v vf [])
 	in
-	if depth > 5 then of_string "<...>"
+	if depth > 5 then rstop
 	else match v with
 	| VNull -> rnull
-	| VInt32 i32 -> of_string (Int32.to_string i32)
+	| VInt32 i32 -> create_ascii(Int32.to_string i32)
 	| VTrue -> rtrue
 	| VFalse -> rfalse
 	| VFloat f ->
 		let s = Numeric.float_repres f in
 		let len = String.length s in
-		of_string (if String.unsafe_get s (len - 1) = '.' then String.sub s 0 (len - 1) else s)
-	| VFunction (f,_) -> concat2 rfun (Rope.of_string (""))
+		create_ascii (if String.unsafe_get s (len - 1) = '.' then String.sub s 0 (len - 1) else s)
+	| VFunction (f,_) -> rfun
 	| VFieldClosure _ -> rclosure
 	| VEnumValue ve -> s_enum_value depth ve
-	| VString(s,_) -> s
+	| VString s -> s
 	| VArray va -> s_array (depth + 1) va
 	| VVector vv -> s_vector (depth + 1) vv
 	| VInstance {ikind=IDate d} -> s_date d
-	| VInstance {ikind=IPos p} -> of_string ("#pos(" ^ Lexer.get_error_pos (Printf.sprintf "%s:%d:") p ^ ")")
-	| VInstance i -> (try call_to_string () with Not_found -> rev_hash i.iproto.ppath)
+	| VInstance {ikind=IPos p} -> create_ascii ("#pos(" ^ Lexer.get_error_pos (Printf.sprintf "%s:%d:") p ^ ")") (* STODO: not ascii? *)
+	| VInstance i -> (try call_to_string () with Not_found -> s_hash i.iproto.ppath)
 	| VObject o -> (try call_to_string () with Not_found -> s_object (depth + 1) o)
 	| VLazy f -> s_value depth (!f())
 	| VPrototype proto ->
@@ -126,4 +140,6 @@ and call_value_on vthis v vl =
 	| VFieldClosure(v1,f) -> call_function f (v1 :: vl)
 	| _ -> exc_string ("Cannot call " ^ (value_string v))
 
-and value_string v = Rope.to_string (s_value 0 v)
+and value_string v =
+	let s = s_value 0 v in
+	EvalString.get s

+ 266 - 186
src/macro/eval/evalStdLib.ml

@@ -27,6 +27,7 @@ open EvalPrinting
 open EvalMisc
 open EvalField
 open EvalHash
+open EvalString
 
 let macro_lib = Hashtbl.create 0
 
@@ -73,8 +74,8 @@ module StdEvalVector = struct
 
 	let join = vifun1 (fun vthis sep ->
 		let this = this vthis in
-		let sep = decode_rope sep in
-		encode_rope (EvalArray.array_join this (s_value 0) sep)
+		let sep = decode_vstring sep in
+		vstring ((EvalArray.array_join this (s_value 0) sep))
 	)
 
 	let map = vifun1 (fun vthis f ->
@@ -141,9 +142,9 @@ module StdArray = struct
 	)
 
 	let join = vifun1 (fun vthis sep ->
-		let sep = decode_rope sep in
+		let sep = decode_vstring sep in
 		let s = EvalArray.join (this vthis) (s_value 0) sep in
-		encode_rope s
+		vstring s
 	)
 
 	let lastIndexOf = vifun2 (fun vthis x fromIndex ->
@@ -230,7 +231,7 @@ module StdArray = struct
 	)
 
 	let toString = vifun0 (fun vthis ->
-		encode_rope (s_array 0 (this vthis))
+		vstring (s_array 0 (this vthis))
 	)
 
 	let unshift = vifun1 (fun vthis v ->
@@ -252,54 +253,12 @@ let outside_bounds () =
 	exc (proto_field_direct haxe_io_Error key_OutsideBounds)
 
 module StdBytes = struct
+	open EvalBytes
+
 	let this vthis = match vthis with
 		| VInstance {ikind = IBytes o} -> o
 		| v -> unexpected_value v "bytes"
 
-	let read_byte this i = int_of_char (Bytes.get this i)
-
-	let read_ui16 this i =
-		let ch1 = read_byte this i in
-		let ch2 = read_byte this (i + 1) in
-		ch1 lor (ch2 lsl 8)
-
-	let read_i32 this i =
-		let ch1 = read_byte this i in
-		let ch2 = read_byte this (i + 1) in
-		let ch3 = read_byte this (i + 2) in
-		let base = Int32.of_int (ch1 lor (ch2 lsl 8) lor (ch3 lsl 16)) in
-		let big = Int32.shift_left (Int32.of_int (read_byte this (i + 3))) 24 in
-		Int32.logor base big
-
-	let read_i64 this i =
-		let ch1 = read_byte this i in
-		let ch2 = read_byte this (i + 1) in
-		let ch3 = read_byte this (i + 2) in
-		let ch4 = read_byte this (i + 3) in
-		let base = Int64.of_int (ch1 lor (ch2 lsl 8) lor (ch3 lsl 16)) in
-		let small = Int64.logor base (Int64.shift_left (Int64.of_int ch4) 24) in
-		let big = Int64.of_int32 (read_i32 this (i + 4)) in
-		Int64.logor (Int64.shift_left big 32) small
-
-	let write_byte this i v =
-		Bytes.set this i (Char.unsafe_chr v)
-
-	let write_ui16 this i v =
-		write_byte this i v;
-		write_byte this (i + 1) (v lsr 8)
-
-	let write_i32 this i v =
-		let base = Int32.to_int v in
-		let big = Int32.to_int (Int32.shift_right_logical v 24) in
-		write_byte this i base;
-		write_byte this (i + 1) (base lsr 8);
-		write_byte this (i + 2) (base lsr 16);
-		write_byte this (i + 3) big
-
-	let write_i64 this i v =
-		write_i32 this i (Int64.to_int32 v);
-		write_i32 this (i + 4) (Int64.to_int32 (Int64.shift_right_logical v 32))
-
 	let alloc = vfun1 (fun length ->
 		let length = decode_int length in
 		encode_bytes (Bytes.make length (Char.chr 0))
@@ -367,11 +326,12 @@ module StdBytes = struct
 			outside_bounds()
 	)
 
-	let getString = vifun2 (fun vthis pos len ->
+	let getString = vifun3 (fun vthis pos len encoding ->
 		let this = this vthis in
 		let pos = decode_int pos in
 		let len = decode_int len in
-		encode_string (Bytes.unsafe_to_string ((try Bytes.sub this pos len with _ -> outside_bounds())));
+		let s = try Bytes.sub this pos len with _ -> outside_bounds() in
+		bytes_to_utf8 s
 	)
 
 	let getUInt16 = vifun1 (fun vthis pos ->
@@ -380,8 +340,14 @@ module StdBytes = struct
 
 	let ofData = vfun1 (fun v -> v)
 
-	let ofString = vfun1 (fun v ->
-		encode_bytes (Bytes.of_string (decode_string v))
+	let ofString = vfun2 (fun v encoding ->
+		let s = decode_vstring v in
+		if s.sascii then
+			encode_bytes (Bytes.of_string (Lazy.force s.sstring))
+		else begin
+			let s = utf16_to_utf8 (Lazy.force s.sstring) in
+			encode_bytes (Bytes.of_string s)
+		end
 	)
 
 	let set = vifun2 (fun vthis pos v ->
@@ -452,7 +418,7 @@ module StdBytes = struct
 	)
 
 	let toString = vifun0 (fun vthis ->
-		encode_string (Bytes.to_string (this vthis))
+		bytes_to_utf8 (this vthis)
 	)
 end
 
@@ -491,7 +457,7 @@ module StdBytesBuffer = struct
 		vnull
 	)
 
-	let addString = vifun1 (fun vthis src ->
+	let addString = vifun2 (fun vthis src encoding ->
 		let this = this vthis in
 		let src = decode_string src in
 		Buffer.add_string this src;
@@ -746,7 +712,7 @@ module StdDate = struct
 	let getSeconds = vifun0 (fun vthis -> vint (localtime (this vthis)).tm_sec)
 	let getTime = vifun0 (fun vthis -> vfloat ((this vthis) *. 1000.))
 	let now = vfun0 (fun () -> encode_date (time()))
-	let toString = vifun0 (fun vthis -> encode_rope (s_date (this vthis)))
+	let toString = vifun0 (fun vthis -> vstring (s_date (this vthis)))
 end
 
 module StdEReg = struct
@@ -1118,9 +1084,9 @@ module StdFPHelper = struct
 		let low = decode_i32 low in
 		let high = decode_i32 high in
 		let b = Bytes.make 8 '0' in
-		StdBytes.write_i32 b 0 low;
-		StdBytes.write_i32 b 4 high;
-		let i64 = StdBytes.read_i64 b 0 in
+		EvalBytes.write_i32 b 0 low;
+		EvalBytes.write_i32 b 4 high;
+		let i64 = EvalBytes.read_i64 b 0 in
 		vfloat (Int64.float_of_bits i64)
 	)
 end
@@ -1415,14 +1381,10 @@ module StdMap (Hashtbl : Hashtbl.S) = struct
 		);
 		"toString",vifun0 (fun vthis ->
 			let open Rope in
-			let s = concat empty [
-				of_char '{';
-				concat rcomma
-					(Hashtbl.fold (fun key vvalue acc -> (concat empty [str key; of_string " => "; s_value 0 vvalue]) :: acc) (this vthis) [])
-				;
-				of_char '}'
-			] in
-			encode_rope s
+			let l = Hashtbl.fold (fun key vvalue acc -> (join rempty [str key; create_ascii " => "; s_value 0 vvalue]) :: acc) (this vthis) [] in
+			let s = join rcomma l in
+			let s = join rempty [rbropen;s;rbrclose] in
+			vstring s
 		);
 	]
 end
@@ -1689,7 +1651,7 @@ module StdResource = struct
 	)
 
 	let getString = vfun1 (fun name ->
-		try encode_string (Hashtbl.find ((get_ctx()).curapi.MacroApi.get_com()).resources (decode_string name)) with Not_found -> vnull
+		try bytes_to_utf8 (Bytes.unsafe_of_string (Hashtbl.find ((get_ctx()).curapi.MacroApi.get_com()).resources (decode_string name))) with Not_found -> vnull
 	)
 
 	let getBytes = vfun1 (fun name ->
@@ -1874,7 +1836,7 @@ module StdStd = struct
 	)
 
 	let string = vfun1 (fun v ->
-		encode_rope (s_value 0 v)
+		vstring (s_value 0 v)
 	)
 
 	let int = vfun1 (fun v ->
@@ -1897,67 +1859,114 @@ end
 
 module StdString = struct
 	let this vthis = match vthis with
-		| VString(r,_) -> r
-		| v -> unexpected_value v "string"
-
-	let this_pair vthis = match vthis with
-		| VString(r,s) -> r,Lazy.force s
-		| v -> unexpected_value v "string"
-
-	let this_string vthis = match vthis with
-		| VString(_,s) -> Lazy.force s
+		| VString s -> s
 		| v -> unexpected_value v "string"
 
 	let charAt = vifun1 (fun vthis index ->
-		let this = this_string vthis in
+		let this = this vthis in
 		let i = decode_int index in
-		if i < 0 || i >= String.length this then encode_rope Rope.empty
-		else encode_rope (Rope.of_char (String.get this i))
+		(* Returns the character at [index] as a one-character string, or the
+		   empty string when [index] is out of range. *)
+		if i < 0 || i >= this.slength then encode_rope Rope.empty
+		else begin
+			let s = Lazy.force this.sstring in
+			if this.sascii then encode_rope (Rope.of_char (String.get s i))
+			else begin
+				(* Classify on the whole 16-bit unit, not on its low byte: U+0100
+				   has a zero low byte but is not an ASCII character. *)
+				let code = read_char this (i lsl 1) in
+				let s = if code < 0x80 then create_ascii (String.make 1 (char_of_int code))
+				else create_ucs2 (String.sub s (i lsl 1) 2) 1 in
+				vstring s
+			end
+		end
 	)
 
 	let charCodeAt = vifun1 (fun vthis index ->
-		let this = this_string vthis in
+		let this = this vthis in
 		let i = decode_int index in
-		if i < 0 || i >= String.length this then vnull
-		else vint (int_of_char (String.get this i))
+		if i < 0 || i >= this.slength then vnull
+		else if this.sascii then vint (int_of_char (String.get (Lazy.force this.sstring) i))
+		else vint (read_char this (i lsl 1))
 	)
 
 	let fromCharCode = vfun1 (fun i ->
 		let i = decode_int i in
-		if i < 0 || i > 0xFF then vnull
-		else encode_rope (Rope.of_char (char_of_int i))
+		try
+			vstring (from_char_code i)
+		with
+		| Not_found ->
+			vnull
+		| InvalidUnicodeChar ->
+			exc_string ("Invalid unicode char " ^ (string_of_int i))
 	)
 
 	let indexOf = vifun2 (fun vthis str startIndex ->
+		let str = this str in
 		let this = this vthis in
-		let str = decode_string str in
 		let i = default_int startIndex 0 in
 		try
-			vint (Rope.search_forward_string str this i)
+			(* Result is a character index, or -1 when the needle is not found. *)
+			if Rope.length str.srope = 0 then
+				vint (max 0 (min i this.slength))
+			else if this.sascii then
+				vint (Rope.search_forward_string (Lazy.force str.sstring) this.srope i)
+			else begin
+				(* The needle is literal text: regexp_string keeps '.', '*', '['
+				   and friends from being read as pattern syntax. *)
+				let pat = Str.regexp_string (maybe_extend_ascii str) in
+				let s = Lazy.force this.sstring in
+				(* Characters start on even byte offsets only; a hit on an odd
+				   offset straddles two characters, so keep searching after it. *)
+				let rec find pos =
+					let p = Str.search_forward pat s pos in
+					if p land 1 = 0 then p lsr 1 else find (p + 1)
+				in
+				vint (find (i lsl 1))
+			end
 		with Not_found ->
 			vint (-1)
 	)
 
 	let lastIndexOf = vifun2 (fun vthis str startIndex ->
-		let this = this_string vthis in
-		let str = decode_string str in
-		let i = default_int startIndex (String.length this - 1) in
+		let str = this str in
+		let this = this vthis in
 		try
-			if i >= String.length this || i < 0 then raise Not_found;
-			vint (Str.search_backward (Str.regexp_string str) this i)
+			(* Result is a character index, or -1 when the needle is not found. *)
+			if Rope.length str.srope = 0 then begin
+				let i = default_int startIndex this.slength in
+				vint (max 0 (min i this.slength))
+			end else begin
+				let i = default_int startIndex (this.slength - 1) in
+				if i >= this.slength || i < 0 then raise Not_found;
+				let s = Lazy.force this.sstring in
+				if this.sascii then
+					vint (Str.search_backward (Str.regexp_string (Lazy.force str.sstring)) s i)
+				else begin
+					(* Literal needle, as in the ASCII branch above. *)
+					let pat = Str.regexp_string (maybe_extend_ascii str) in
+					(* Skip hits on odd byte offsets: they straddle two characters. *)
+					let rec find pos =
+						let p = Str.search_backward pat s pos in
+						if p land 1 = 0 then p lsr 1 else find (p - 1)
+					in
+					vint (find (i lsl 1))
+				end
+			end
 		with Not_found ->
 			vint (-1)
 	)
 
 	let split = vifun1 (fun vthis delimiter ->
-		let this,s = this_pair vthis in
-		let delimiter = decode_string delimiter in
+		let this = this vthis in
+		let ascii = this.sascii in
+		let this,s = this.srope,Lazy.force this.sstring in
+		let delimiter = Lazy.force (decode_vstring delimiter).sstring in
 		let l_delimiter = String.length delimiter in
 		let l_this = Rope.length this in
-		if l_delimiter = 0 then
-			encode_array (List.map (fun chr -> encode_string (String.make 1 chr)) (ExtString.String.explode s))
-		else if l_delimiter > l_this then
-			encode_array [encode_rope this]
+		let encode_range pos length =
+			let s = Rope.sub this pos length in
+			if ascii then encode_rope s
+			else encode_rope_ucs2 s (length lsr 1)
+		in
+		if l_delimiter = 0 then begin
+			if ascii then
+				encode_array (List.map (fun chr -> encode_string (String.make 1 chr)) (ExtString.String.explode s))
+			else begin
+				let acc = DynArray.create () in
+				let bs = Bytes.unsafe_of_string s in
+				for i = 0 to (l_this - 1) lsr 1 do
+					let b = Bytes.create 2 in
+					Bytes.unsafe_set b 0 (Bytes.unsafe_get bs (i lsl 1));
+					Bytes.unsafe_set b 1 (Bytes.unsafe_get bs ((i lsl 1 + 1)));
+					DynArray.add acc (vstring (create_ucs2 (Bytes.unsafe_to_string b) 1));
+				done;
+				encode_array (DynArray.to_list acc)
+			end
+		end else if l_delimiter > l_this then
+			encode_array [encode_range 0 (Rope.length this)]
 		else begin
 			let chr = delimiter.[0] in
 			let acc = DynArray.create () in
@@ -1973,20 +1982,20 @@ module StdString = struct
 					if not (loop2 1) then
 						loop k (index + 1)
 					else begin
-						DynArray.add acc (encode_rope (Rope.sub this k (index - k)));
+						DynArray.add acc (encode_range k (index - k));
 						loop (index + l_delimiter) (index + l_delimiter)
 					end
 				with Not_found ->
-					DynArray.add acc (encode_rope (Rope.sub this k (l_this - k)))
+					DynArray.add acc (encode_range k (l_this - k))
 			in
 			let rec loop1 i =
 				try
 					if i = l_this then raise Not_found;
 					let index = String.index_from s i chr in
-					DynArray.add acc (encode_rope (Rope.sub this i (index - i)));
+					DynArray.add acc (encode_range i (index - i));
 					loop1 (index + l_delimiter)
 				with Not_found ->
-					DynArray.add acc (encode_rope (Rope.sub this i (l_this - i)))
+					DynArray.add acc (encode_range i (l_this - i))
 			in
 			if l_delimiter = 1 then loop1 0 else loop 0 0;
 			encode_array_instance (EvalArray.create (DynArray.to_array acc))
@@ -1996,52 +2005,75 @@ module StdString = struct
 	let substr = vifun2 (fun vthis pos len ->
 		let this = this vthis in
 		let pos = decode_int pos in
-		if pos >= Rope.length this then
+		let r = this.srope in
+		if pos >= this.slength then
 			encode_rope Rope.empty
 		else begin
 			let pos = if pos < 0 then begin
-				let pos = Rope.length this + pos in
+				let pos = this.slength + pos in
 				if pos < 0 then 0 else pos
 			end else pos in
-			let len = default_int len (Rope.length this - pos) in
-			let len = if len < 0 then Rope.length this + len - pos else len in
-			let s =
-				if len < 0 then Rope.empty
-				else if len + pos > Rope.length this then Rope.sub this pos (Rope.length this - pos)
-				else Rope.sub this pos len
-			in
-			encode_rope s
+			(* All arithmetic is done in characters; only the final rope slice is
+			   scaled to bytes for the two-byte representation. That way the
+			   length stored in the result always matches the slice taken. *)
+			let total = this.slength in
+			let len = default_int len (total - pos) in
+			(* A negative length counts back from the end of the string. *)
+			let len = if len < 0 then total + len - pos else len in
+			let len =
+				if len < 0 then 0
+				else if len + pos > total then total - pos
+				else len
+			in
+			if this.sascii then
+				encode_rope (Rope.sub r pos len)
+			else
+				vstring (create_ucs2_of_rope (Rope.sub r (pos lsl 1) (len lsl 1)) len)
 		end
 	)
 
 	let substring = vifun2 (fun vthis startIndex endIndex ->
 		let this = this vthis in
 		let first = decode_int startIndex in
-		let l = Rope.length this in
+		let l = this.slength in
 		let last = default_int endIndex l in
 		let first = if first < 0 then 0 else first in
 		let last = if last < 0 then 0 else last in
 		let first,last = if first > last then last,first else first,last in
 		let last = if last > l then l else last in
-		let s = if first > l then
-			Rope.empty
-		else
-			Rope.sub this first (last - first)
-		in
-		encode_rope s
+		if first > l then
+			encode_rope Rope.empty
+		else begin
+			if this.sascii then
+				encode_rope (Rope.sub this.srope first (last - first))
+			else begin
+				(* [count] stays in characters; the rope offsets are doubled
+				   because every character occupies two bytes. The stored length
+				   must be the character count, not the byte count. *)
+				let count = last - first in
+				let r = Rope.sub this.srope (first lsl 1) (count lsl 1) in
+				vstring (create_ucs2_of_rope r count)
+			end
+		end
 	)
 
-	let toLowerCase = vifun0 (fun vthis -> encode_rope (Rope.lowercase (this vthis)))
+	let toLowerCase = vifun0 (fun vthis -> encode_rope (Rope.lowercase (this vthis).srope))
 
 	let toString = vifun0 (fun vthis -> vthis)
 
-	let toUpperCase = vifun0 (fun vthis -> encode_rope (Rope.uppercase (this vthis)))
+	let toUpperCase = vifun0 (fun vthis -> encode_rope (Rope.uppercase (this vthis).srope))
 
-	let cca = vifun1 (fun vthis i ->
-		let this = this_string vthis in
-		let i = decode_int i in
-		if i < 0 || i >= String.length this then vnull
-		else vint (int_of_char (String.unsafe_get this i))
+	let cca = charCodeAt
+
+	let isAscii = vifun0 (fun vthis ->
+		vbool (this vthis).sascii
 	)
 end
 
@@ -2049,46 +2081,75 @@ module StdStringBuf = struct
 	module Buffer = Rope.Buffer
 
 	let this vthis = match vthis with
-		| VInstance {ikind = IBuffer buf} -> buf
+		| VInstance {ikind = IBuffer sb} -> sb
 		| v -> unexpected_value v "string"
 
 	let add = vifun1 (fun vthis x ->
 		let this = this vthis in
-		begin match x with
-			| VString(s,_) -> Buffer.add_rope this s
-			| _ -> Buffer.add_string this (value_string x)
-		end;
+		let s = match x with
+			| VString s -> s
+			| _ -> create_ascii (value_string x)
+		in
+		AwareBuffer.add_string this s;
 		vnull;
 	)
 
 	let addChar = vifun1 (fun vthis c ->
 		let this = this vthis in
-		let c = decode_int c in
-		let c = try char_of_int c with _ -> exc_string "char_of_int" in
-		Buffer.add_char this c;
+		let i = decode_int c in
+		let add i =
+			if this.bascii then AwareBuffer.promote_to_ucs this;
+			Buffer.add_char this.bbuffer (char_of_int (i land 0xFF));
+			Buffer.add_char this.bbuffer (char_of_int (i lsr 8));
+			this.blength <- this.blength + 1;
+		in
+		begin if i < 0 then
+			()
+		else if i < 128 then begin
+			if this.bascii then begin
+				Buffer.add_char this.bbuffer (char_of_int i);
+				this.blength <- this.blength + 1;
+			end else
+				add i
+		end else if i < 0x10000 then begin
+			if i >= 0xD800 && i <= 0xDFFF then exc_string ("Invalid unicode char " ^ (string_of_int i));
+			add i
+		end else if i < 0x110000 then begin
+			let i = i - 0x10000 in
+			add ((i lsr 10 + 0xD800));
+			add ((i land 1023) + 0xDC00);
+		end else
+			exc_string ("Invalid unicode char " ^ (string_of_int i))
+		end;
 		vnull
 	)
 
 	let addSub = vifun3 (fun vthis s pos len ->
 		let this = this vthis in
-		let s = decode_rope s in
+		let s = decode_vstring s in
 		let i = decode_int pos in
+		let i = if s.sascii then i else i lsl 1 in
 		let len = match len with
-			| VNull -> Rope.length s - i
-			| VInt32 i -> Int32.to_int i
+			| VNull -> Rope.length s.srope - i
+			| VInt32 i -> Int32.to_int i lsl (if s.sascii then 0 else 1)
 			| _ -> unexpected_value len "int"
 		in
-		Buffer.add_rope this (Rope.sub s i len);
+		(* From here on [i] and [len] are byte offsets into the rope. *)
+		let s' = Rope.sub s.srope i len in
+		(* The length recorded for a two-byte string is its character count, so
+		   halve the byte count; otherwise the buffer length is over-counted. *)
+		let s' = if s.sascii then create_ascii_of_rope s'
+		else create_ucs2_of_rope s' (len lsr 1) in
+		AwareBuffer.add_string this s';
 		vnull
 	)
 
 	let get_length = vifun0 (fun vthis ->
 		let this = this vthis in
-		vint (Buffer.length this)
+		vint this.blength
 	)
 
 	let toString = vifun0 (fun vthis ->
-		encode_rope (Buffer.contents (this vthis))
+		let this = this vthis in
+		let s = AwareBuffer.contents this in
+		vstring s
 	)
 end
 
@@ -2098,12 +2159,7 @@ module StdStringTools = struct
 		Common.url_encode s (Rope.Buffer.add_char b);
 		Rope.Buffer.contents b
 
-	let fastCodeAt = vfun2 (fun s index ->
-		let s = decode_string s in
-		let index = decode_int index in
-		if index >= String.length s then vnull
-		else vint (int_of_char s.[index])
-	)
+	let fastCodeAt = StdString.charCodeAt
 
 	let urlEncode = vfun1 (fun s ->
 		let s = decode_string s in
@@ -2112,7 +2168,10 @@ module StdStringTools = struct
 
 	let urlDecode = vfun1 (fun s ->
 		let s = decode_string s in
-		let b = Rope.Buffer.create 0 in
+		let b = AwareBuffer.create () in
+		let add s =
+			AwareBuffer.add_string b s
+		in
 		let len = String.length s in
 		let decode c =
 			match c with
@@ -2121,28 +2180,62 @@ module StdStringTools = struct
 			| 'A'..'F' -> Some (int_of_char c - int_of_char 'A' + 10)
 			| _ -> None
 		in
+		let decode_hex i =
+			let p1 = (try decode (String.get s i) with _ -> None) in
+			let p2 = (try decode (String.get s (i + 1)) with _ -> None) in
+			match p1, p2 with
+			| Some c1, Some c2 ->
+				Some (((c1 lsl 4) lor c2))
+			| _ ->
+				None
+		in
+		let expect_hex i =
+			match String.unsafe_get s i with
+			| '%' ->
+				begin match decode_hex (i + 1) with
+				| None -> exc_string "Malformed"
+				| Some c -> c
+				end
+			| _ -> exc_string "Malformed"
+		in
 		let rec loop i =
 			if i = len then () else
 			let c = String.unsafe_get s i in
 			match c with
 			| '%' ->
-				let p1 = (try decode (String.get s (i + 1)) with _ -> None) in
-				let p2 = (try decode (String.get s (i + 2)) with _ -> None) in
-				(match p1, p2 with
-				| Some c1, Some c2 ->
-					Rope.Buffer.add_char b (char_of_int ((c1 lsl 4) lor c2));
-					loop (i + 3)
-				| _ ->
-					loop (i + 1));
+				begin match decode_hex (i + 1) with
+				| Some c ->
+					if c < 0x80 then begin
+						add (create_ascii (String.make 1 (char_of_int c)));
+						loop (i + 3)
+					end else if c < 0xE0 then begin
+						let c2 = expect_hex (i + 3) in
+						add (from_char_code (((c land 0x3F) lsl 6) lor (c2 land 0x7F)));
+						loop (i + 6)
+					end else if c < 0xF0 then begin
+						let c2 = expect_hex (i + 3) in
+						let c3 = expect_hex (i + 6) in
+						add (from_char_code (((c land 0x1F) lsl 12) lor ((c2 land 0x7F) lsl 6) lor (c3 land 0x7F)));
+						loop (i + 9)
+					end else
+						let c2 = expect_hex (i + 3) in
+						let c3 = expect_hex (i + 6) in
+						let c4 = expect_hex (i + 9) in
+						let k = ((c land 0x0F) lsl 18) lor ((c2 land 0x7F) lsl 12) lor ((c3 land 0x7F) lsl 6) lor (c4 land 0x7F) in
+						add (from_char_code k);
+						loop (i + 12)
+				| None ->
+					loop (i + 1)
+				end;
 			| '+' ->
-				Rope.Buffer.add_char b ' ';
+				add (create_ascii (String.make 1 ' '));
 				loop (i + 1)
 			| c ->
-				Rope.Buffer.add_char b c;
+				add (create_ascii (String.make 1 c));
 				loop (i + 1)
 		in
 		loop 0;
-		encode_rope (Rope.Buffer.contents b)
+		vstring (AwareBuffer.contents b)
 	)
 end
 
@@ -2166,7 +2259,7 @@ module StdSys = struct
 		let h = StringHashtbl.create 0 in
 		Array.iter(fun s ->
 			let k, v = ExtString.String.split s "=" in
-			StringHashtbl.replace h (Rope.of_string k,lazy k) (encode_string v)
+			StringHashtbl.replace h (create_ascii k) (encode_string v)
 		) env;
 		encode_string_map_direct h
 	)
@@ -2543,17 +2636,12 @@ module StdUtf8 = struct
 		vnull
 	)
 
-	let charCodeAt = vfun2 (fun s index ->
-		let s = decode_string s in
-		let i = decode_int index in
-		let c = try UTF8.get s i with exc -> exc_string (Printexc.to_string exc) in
-		vint (UChar.int_of_uchar c)
-	)
+	let charCodeAt = StdString.charCodeAt
 
 	let compare = vfun2 (fun a b ->
 		let a = decode_string a in
 		let b = decode_string b in
-		vint (UTF8.compare a b)
+		vint (Pervasives.compare a b)
 	)
 
 	let decode = vfun1 (fun s ->
@@ -2564,7 +2652,8 @@ module StdUtf8 = struct
 			Bytes.unsafe_set buf !i (UChar.char_of uc);
 			incr i
 		) s;
-		encode_string (Bytes.unsafe_to_string buf)
+		let s = Bytes.unsafe_to_string buf in
+		encode_string s
 	)
 
 	let encode = vfun1 (fun s ->
@@ -2579,25 +2668,15 @@ module StdUtf8 = struct
 	)
 
 	let length = vfun1 (fun s ->
-		let s = decode_string s in
-		vint (UTF8.length s)
+		let s = decode_vstring s in
+		vint (s.slength)
 	)
 
-	let sub = vfun3 (fun s pos len ->
-		let s = decode_string s in
-		let pos = decode_int pos in
-		let len = decode_int len in
-		let buf = UTF8.Buf.create 0 in
-		let i = ref (-1) in
-		UTF8.iter (fun c ->
-			incr i;
-			if !i >= pos && !i < pos + len then UTF8.Buf.add_char buf c;
-		) s;
-		encode_string (UTF8.Buf.contents buf)
-	)
+	let sub = StdString.substr
 
 	let toString = vifun0 (fun vthis ->
-		encode_string (UTF8.Buf.contents (this vthis))
+		let this = this vthis in
+		bytes_to_utf8 (Bytes.unsafe_of_string (UTF8.Buf.contents this))
 	)
 
 	let validate = vfun1 (fun s ->
@@ -2621,12 +2700,12 @@ let init_maps builtins =
 		| VInstance {ikind = IIntMap h} -> h
 		| v -> unexpected_value v "int map"
 	in
-	init_fields builtins (["haxe";"ds"],"IntMap") [] (StdIntMap.map_fields vint decode_int (fun i -> Rope.of_string (string_of_int i)) encode_int_map_direct this);
+	init_fields builtins (["haxe";"ds"],"IntMap") [] (StdIntMap.map_fields vint decode_int (fun i -> create_ascii (string_of_int i)) encode_int_map_direct this);
 	let this vthis = match vthis with
 		| VInstance {ikind = IStringMap h} -> h
 		| v -> unexpected_value v "string map"
 	in
-	init_fields builtins (["haxe";"ds"],"StringMap") [] (StdStringMap.map_fields vstring_direct decode_rope_string (fun (r,_) -> r) encode_string_map_direct this);
+	init_fields builtins (["haxe";"ds"],"StringMap") [] (StdStringMap.map_fields vstring decode_vstring (fun s -> s) encode_string_map_direct this);
 	let this vthis = match vthis with
 		| VInstance {ikind = IObjectMap h} -> Obj.magic h
 		| v -> unexpected_value v "object map"
@@ -2664,7 +2743,7 @@ let init_constructors builtins =
 			| [s] -> s
 			| _ -> assert false
 		);
-	add key_StringBuf (fun _ -> encode_instance key_StringBuf ~kind:(IBuffer (Rope.Buffer.create 0)));
+	add key_StringBuf (fun _ -> encode_instance key_StringBuf ~kind:(IBuffer (AwareBuffer.create())));
 	add key_haxe_Utf8
 		(fun vl -> match vl with
 			| [size] -> encode_instance key_haxe_Utf8 ~kind:(IUtf8 (UTF8.Buf.create (default_int size 0)))
@@ -3033,6 +3112,7 @@ let init_standard_library builtins =
 		"toString",StdString.toString;
 		"toUpperCase",StdString.toUpperCase;
 		"cca",StdString.cca;
+		"isAscii",StdString.isAscii;
 	];
 	init_fields builtins ([],"StringBuf") [] [
 		"add",StdStringBuf.add;

+ 235 - 0
src/macro/eval/evalString.ml

@@ -0,0 +1,235 @@
+(*
+	The Haxe Compiler
+	Copyright (C) 2005-2018  Haxe Foundation
+
+	This program is free software; you can redistribute it and/or
+	modify it under the terms of the GNU General Public License
+	as published by the Free Software Foundation; either version 2
+	of the License, or (at your option) any later version.
+
+	This program is distributed in the hope that it will be useful,
+	but WITHOUT ANY WARRANTY; without even the implied warranty of
+	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+	GNU General Public License for more details.
+
+	You should have received a copy of the GNU General Public License
+	along with this program; if not, write to the Free Software
+	Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *)
+
+open Globals
+open EvalValue
+open EvalBytes
+
+(* Wraps [s] as a one-byte-per-character string; the length is its byte length. *)
+let create_ascii s = {
+	srope = Rope.of_string s;
+	sstring = lazy s;
+	slength = String.length s;
+	sascii = true;
+}
+
+(* Same as [create_ascii] but starting from a rope; the flat string is only
+   built when [sstring] is forced. *)
+let create_ascii_of_rope r = {
+	srope = r;
+	sstring = lazy (Rope.to_string r);
+	slength = Rope.length r;
+	sascii = true;
+}
+
+(* Wraps [s] as a non-ASCII string. [length] is stored as given and is not
+   derived from [s]; callers in this file pass the number of 16-bit units
+   (e.g. 1 for a two-byte string), not the byte length. *)
+let create_ucs2 s length = {
+	srope = Rope.of_string s;
+	sstring = lazy s;
+	slength = length;
+	sascii = false;
+}
+
+(* Rope variant of [create_ucs2]; [length] follows the same convention. *)
+let create_ucs2_of_rope r length = {
+	srope = r;
+	sstring = lazy (Rope.to_string r);
+	slength = length;
+	sascii = false;
+}
+
+(* Boxes a string record as a [VString] value. *)
+let vstring s = VString s
+
+(* A string buffer that tracks whether its contents are still in the
+   one-byte representation ([bascii]) and how many characters were added
+   ([blength]). It starts out one-byte and switches to two bytes per character
+   the first time a non-ASCII string is appended. *)
+module AwareBuffer = struct
+	type t = vstring_buffer
+
+	let create () = {
+		bbuffer = Rope.Buffer.create 0;
+		blength = 0;
+		bascii = true;
+	}
+
+	(* Re-encodes everything buffered so far to two bytes per character and
+	   marks the buffer as non-ASCII. Does not check [bascii] itself, so callers
+	   must only invoke it while the buffer is still one-byte. *)
+	let promote_to_ucs this =
+		let current = Rope.to_string (Rope.Buffer.contents this.bbuffer) in
+		let current = extend_ascii current in
+		Rope.Buffer.clear this.bbuffer;
+		this.bascii <- false;
+		Rope.Buffer.add_string this.bbuffer current
+
+	(* Appends [s], converting whichever side is one-byte so that the buffer
+	   holds a single representation, then adds [s.slength] to the length. *)
+	let add_string this s =
+		begin match s.sascii,this.bascii with
+		| true,true
+		| false,false ->
+			Rope.Buffer.add_rope this.bbuffer s.srope
+		| true,false ->
+			(* One-byte string into a two-byte buffer: widen the string. *)
+			Rope.Buffer.add_string this.bbuffer (extend_ascii (Lazy.force s.sstring))
+		| false,true ->
+			(* Two-byte string into a one-byte buffer: widen the buffer first. *)
+			promote_to_ucs this;
+			Rope.Buffer.add_rope this.bbuffer s.srope
+		end;
+		this.blength <- this.blength + s.slength
+
+	(* Snapshot of the buffer as a string record in its current representation. *)
+	let contents this =
+		if this.bascii then
+			create_ascii_of_rope (Rope.Buffer.contents this.bbuffer)
+		else
+			create_ucs2_of_rope (Rope.Buffer.contents this.bbuffer) this.blength
+end
+
+(* Partially applied: [read_char s pos] reads a 16-bit value at byte offset
+   [pos] of the flattened string (callers pass [index lsl 1]).
+   NOTE(review): forces [sstring] on every call. *)
+let read_char s =
+	read_ui16 (Bytes.unsafe_of_string (Lazy.force s.sstring))
+
+(* Decodes the UTF-8 string [s] into 16-bit units written low byte first.
+   Returns a triple: the encoded string, whether every input byte was below
+   0x80, and the number of 16-bit units written. Code points above 0xFFFF
+   become a surrogate pair (two units). Note that the returned string is
+   two-byte encoded even when the ASCII flag is true.
+   The input is not validated: lead and continuation bytes are only masked. *)
+let utf8_to_utf16 s =
+	let only_ascii = ref true in
+	let buf = Buffer.create 0 in
+	let l = ref 0 in
+	let add i =
+		incr l;
+		Buffer.add_char buf (Char.unsafe_chr i);
+		Buffer.add_char buf (Char.unsafe_chr (i lsr 8));
+	in
+	let length = String.length s in
+	let i = ref 0 in
+	let get () =
+		(* A sequence truncated at the end of [s] must not read past the
+		   string: missing continuation bytes are taken to be 0. *)
+		let i' = if !i < length then int_of_char (String.unsafe_get s !i) else 0 in
+		incr i;
+		i'
+	in
+	while !i < length do
+		let c = get() in
+		if c < 0x80 then
+			add c
+		else if c < 0xE0 then begin
+			only_ascii := false;
+			add (((c land 0x3F) lsl 6) lor ((get ()) land 0x7F))
+		end else if c < 0xF0 then begin
+			only_ascii := false;
+			let c2 = get () in
+			add (((c land 0x1F) lsl 12) lor ((c2 land 0x7F) lsl 6) lor ((get ()) land 0x7F));
+		end else begin
+			only_ascii := false;
+			let c2 = get () in
+			let c3 = get () in
+			let c = (((c land 0x0F) lsl 18) lor ((c2 land 0x7F) lsl 12) lor ((c3 land 0x7F) lsl 6) lor ((get ()) land 0x7F)) in
+			(* 0xD7C0 = 0xD800 - (0x10000 lsr 10): high surrogate of [c]. *)
+			add ((c lsr 10) + 0xD7C0);
+			add ((c land 0x3FF) lor 0xDC00);
+		end
+	done;
+	Buffer.contents buf,!only_ascii,!l
+
+(* Encodes [s], read as a sequence of 16-bit units stored low byte first, as
+   UTF-8. A unit in 0xD800..0xDBFF is combined with the unit that follows it
+   into one code point; the following unit is only masked, not checked.
+   NOTE(review): an odd-length [s] or a trailing high surrogate makes
+   [read_byte] index past the end — confirm how EvalBytes.read_byte reacts. *)
+let utf16_to_utf8 s =
+	let buf = Buffer.create 0 in
+	let i = ref 0 in
+	let add i =
+		Buffer.add_char buf (Char.unsafe_chr i)
+	in
+	let b = Bytes.unsafe_of_string s in
+	(* Reads the next 16-bit unit and advances the cursor by two bytes. *)
+	let get () =
+		let ch1 = read_byte b !i in
+		let ch2 = read_byte b (!i + 1) in
+		let c = ch1 lor (ch2 lsl 8) in
+		i := !i + 2;
+		c
+	in
+	let length = String.length s in
+	while !i < length do
+		let c = get() in
+		(* Surrogate pair: 0xD7C0 = 0xD800 - 0x40 restores the 0x10000 offset. *)
+		let c = if 0xD800 <= c && c <= 0xDBFF then
+			(((c - 0xD7C0) lsl 10) lor ((get()) land 0X3FF))
+		else
+			c
+		in
+		(* Emit one to four bytes depending on the size of the code point. *)
+		if c <= 0x7F then
+			add c
+		else if c <= 0x7FF then begin
+			add (0xC0 lor (c lsr 6));
+			add (0x80 lor (c land 63));
+		end else if c <= 0xFFFF then begin
+			add (0xE0 lor (c lsr 12));
+			add (0x80 lor ((c lsr 6) land 63));
+			add (0x80 lor (c land 63));
+		end else begin
+			add (0xF0 lor (c lsr 18));
+			add (0x80 lor ((c lsr 12) land 63));
+			add (0x80 lor ((c lsr 6) land 63));
+			add (0x80 lor (c land 63));
+		end
+	done;
+	Buffer.contents buf
+
+(* Returns the flat bytes of [s] in the two-byte representation: widened with
+   [extend_ascii] when [s] is one-byte, unchanged otherwise. *)
+let maybe_extend_ascii s =
+	let s' = Lazy.force s.sstring in
+	if s.sascii then begin
+		extend_ascii s'
+	end else
+		s'
+
+(* Concatenates two strings. Operands in the same representation are joined
+   as ropes; with mixed representations both are flattened, the one-byte
+   operand is widened and the result is a two-byte string. *)
+let concat s1 s2 =
+	match s1.sascii,s2.sascii with
+	| true,true ->
+		create_ascii_of_rope (Rope.concat2 s1.srope s2.srope)
+	| false,false ->
+		create_ucs2_of_rope (Rope.concat2 s1.srope s2.srope) (s1.slength + s2.slength)
+	| true,false ->
+		create_ucs2 ((extend_ascii (Lazy.force s1.sstring)) ^ (Lazy.force s2.sstring)) (s1.slength + s2.slength)
+	| false,true ->
+		create_ucs2 ((Lazy.force s1.sstring) ^ (extend_ascii (Lazy.force s2.sstring))) (s1.slength + s2.slength)
+
+(* Joins the strings of [sl], placing [sep] between consecutive elements.
+   An empty list gives an empty one-byte string. *)
+let join sep sl =
+	let out = AwareBuffer.create () in
+	List.iteri (fun idx item ->
+		(* The separator goes in front of every element except the first. *)
+		if idx > 0 then AwareBuffer.add_string out sep;
+		AwareBuffer.add_string out item
+	) sl;
+	AwareBuffer.contents out
+
+(* Despite the name, this goes from bytes to a string value: [s] is decoded as
+   UTF-8 and returned as a [VString]. If every byte is below 0x80 the bytes
+   are wrapped as-is in the one-byte representation; otherwise the decoded
+   two-byte form is used. The decoding pass runs in both cases. *)
+let bytes_to_utf8 s =
+	let s',is_ascii,length = utf8_to_utf16 (Bytes.unsafe_to_string s) in
+	if is_ascii then
+		vstring (create_ascii (Bytes.unsafe_to_string s))
+	else
+		vstring (create_ucs2 s' length)
+
+exception InvalidUnicodeChar
+
+(* Builds a one-character string from the code point [i].
+   - below 128: a one-byte string;
+   - up to 0xFFFF: one 16-bit unit (length 1);
+   - up to 0x10FFFF: a surrogate pair (length 2).
+   Raises [Not_found] for a negative [i], and [InvalidUnicodeChar] when [i]
+   lies in the surrogate range 0xD800..0xDFFF or is 0x110000 or above. *)
+let from_char_code i =
+	if i < 0 then
+		raise Not_found
+	else if i < 128 then
+		create_ascii (String.make 1 (char_of_int i))
+	else if i < 0x10000 then begin
+		if i >= 0xD800 && i <= 0xDFFF then raise InvalidUnicodeChar;
+		let b = Bytes.create 2 in
+		write_ui16 b 0 i;
+		create_ucs2 (Bytes.unsafe_to_string b) 1
+	end else if i < 0x110000 then begin
+		(* Split the 20 bits above 0x10000 into high and low surrogate. *)
+		let i = i - 0x10000 in
+		let b = Bytes.create 4 in
+		write_ui16 b 0 ((i lsr 10 + 0xD800));
+		write_ui16 b 2 ((i land 1023) + 0xDC00);
+		create_ucs2 (Bytes.unsafe_to_string b) 2
+	end else
+		raise InvalidUnicodeChar
+
+(* Returns the contents of [s] as a plain OCaml string: the bytes as-is for a
+   one-byte string, otherwise the UTF-8 encoding of the two-byte data. *)
+let get s =
+	let s' = Lazy.force s.sstring in
+	if s.sascii then s'
+	else utf16_to_utf8 s'

+ 38 - 5
src/macro/eval/evalValue.ml

@@ -26,12 +26,45 @@ type cmp =
 	| CInf
 	| CUndef
 
-type vstring = Rope.t * string Lazy.t
+type vstring = {
+	(* The rope representation of the string. This is what we mainly use. *)
+	srope   : Rope.t;
+	(* The bytes representation of the string. This is only evaluated if we
+	   need it for something like random access. *)
+	sstring : string Lazy.t;
+	(* The length of the string. *)
+	slength : int;
+	(* If true, the string is one-byte-per-character ASCII. Otherwise, it is
+	   encoded as UCS2. *)
+	sascii  : bool;
+}
+
+type vstring_buffer = { (* payload of IBuffer (StringBuf) instances *)
+	        bbuffer : Rope.Buffer.t; (* underlying rope buffer *)
+	mutable blength : int; (* presumably the accumulated string length, like slength -- TODO confirm *)
+	mutable bascii  : bool; (* presumably true while the contents are ASCII-only, like sascii -- TODO confirm *)
+}
+
+let extend_ascii s = (* widens a one-byte-per-character string to two bytes per character *)
+	let length = String.length s in
+	let b = Bytes.make (length lsl 1) '\000' in (* twice the size, zero-filled *)
+	for i = 0 to length - 1 do
+		Bytes.unsafe_set b (i lsl 1) (String.unsafe_get s i) (* character goes to the even offset, the odd offset stays zero *)
+	done;
+	Bytes.unsafe_to_string b
+
+let vstring_equal s1 s2 = (* content equality that tolerates differing ASCII/UCS2 representations *)
+	if s1.sascii = s2.sascii then (* same representation: physical rope identity or equal bytes *)
+		s1.srope == s2.srope || Lazy.force s1.sstring = Lazy.force s2.sstring
+	else if not s2.sascii then (* s1 is ASCII, s2 is UCS2: widen s1 so both sides are two bytes per character *)
+		extend_ascii (Lazy.force s1.sstring) = Lazy.force s2.sstring
+	else (* s1 is UCS2, s2 is ASCII: widen s2 so both sides are two bytes per character *)
+		Lazy.force s1.sstring = extend_ascii (Lazy.force s2.sstring)
 
 module StringHashtbl = Hashtbl.Make(struct
 	type t = vstring
-	let equal (r1,s1) (r2,s2) = r1 == r2 || Lazy.force s1 = Lazy.force s2
-	let hash (_,s) = Hashtbl.hash (Lazy.force s)
+	let equal = vstring_equal
+	let hash s = Hashtbl.hash (Lazy.force s.sstring)
 end)
 
 module IntHashtbl = Hashtbl.Make(struct type t = int let equal = (=) let hash = Hashtbl.hash end)
@@ -114,7 +147,7 @@ and vinstance_kind =
 	| IIntMap of value IntHashtbl.t
 	| IObjectMap of (value,value) Hashtbl.t
 	| IOutput of Buffer.t (* BytesBuffer *)
-	| IBuffer of Rope.Buffer.t (* StringBuf *)
+	| IBuffer of vstring_buffer(* StringBuf *)
 	| IPos of pos
 	| IUtf8 of UTF8.Buf.buf
 	| IProcess of Process.process
@@ -165,7 +198,7 @@ let rec equals a b = match a,b with
 	| VEnumValue a,VEnumValue b -> a == b || a.eindex = b.eindex && Array.length a.eargs = 0 && Array.length b.eargs = 0 && a.epath = b.epath
 	| VObject vo1,VObject vo2 -> vo1 == vo2
 	| VInstance vi1,VInstance vi2 -> vi1 == vi2
-	| VString(r1,s1),VString(r2,s2) -> r1 == r2 || Lazy.force s1 = Lazy.force s2
+	| VString s1,VString s2 -> vstring_equal s1 s2
 	| VArray va1,VArray va2 -> va1 == va2
 	| VVector vv1,VVector vv2 -> vv1 == vv2
 	| VFunction(vf1,_),VFunction(vf2,_) -> vf1 == vf2

+ 4 - 3
src/typing/typer.ml

@@ -2563,9 +2563,10 @@ let rec create com =
 			()
 	) ctx.g.std.m_types;
 	let m = TypeloadModule.load_module ctx ([],"String") null_pos in
-	(match m.m_types with
-	| [TClassDecl c] -> ctx.t.tstring <- TInst (c,[])
-	| _ -> assert false);
+	List.iter (fun mt -> match mt with
+		| TClassDecl c -> ctx.t.tstring <- TInst (c,[])
+		| _ -> ()
+	) m.m_types;
 	let m = TypeloadModule.load_module ctx ([],"Array") null_pos in
 	(try
 		List.iter (fun t -> (

+ 1 - 1
std/StringTools.hx

@@ -457,7 +457,7 @@ class StringTools {
 		#elseif hl
 		return @:privateAccess s.bytes.getUI16(index << 1);
 		#elseif lua
-		return lua.NativeStringTools.byte(s,index+1);
+		return lua.lib.luautf8.Utf8.byte(s,index+1);
 		#else
 		return untyped s.cca(index);
 		#end

+ 3 - 2
std/cs/internal/StringExt.hx

@@ -180,9 +180,10 @@ private typedef NativeString = cs.system.String;
 		return me;
 	}
 
-	public static function fromCharCode(code:Int):NativeString
+	public static function fromCharCode(code:Int):String
 	{
-		return new NativeString( cast(code,cs.StdTypes.Char16), 1 );
+		return cs.system.Char.ConvertFromUtf32(code);
+		// return new NativeString( cast(code,cs.StdTypes.Char16), 1 );
 	}
 }
 

+ 3 - 2
std/eval/_std/haxe/io/Bytes.hx

@@ -21,6 +21,7 @@
  */
 package haxe.io;
 
+//@:coreApi
 extern class Bytes {
 	function new(length:Int,b:BytesData):Void;
 	public var length(default,null):Int;
@@ -40,13 +41,13 @@ extern class Bytes {
 	public function getInt64( pos : Int ) : haxe.Int64;
 	public function setInt32( pos : Int, v : Int ) : Void;
 	public function setInt64( pos : Int, v : haxe.Int64 ) : Void;
-	public function getString( pos : Int, len : Int ) : String;
+	public function getString( pos : Int, len : Int, ?encoding : Encoding ) : String;
 	public function toString() : String;
 	public function toHex() : String;
 	public function getData() : BytesData;
 	public static function alloc( length : Int ) : Bytes;
 	@:pure
-	public static function ofString( s : String ) : Bytes;
+	public static function ofString( s : String, ?encoding : Encoding ) : Bytes;
 	public static function ofData( b : BytesData ) : Bytes;
 	public static function fastGet( b : BytesData, pos : Int ) : Int;
 	static function __init__():Void {

+ 1 - 1
std/eval/_std/haxe/io/BytesBuffer.hx

@@ -28,7 +28,7 @@ extern class BytesBuffer {
 	function get_length():Int;
 	public function addByte(byte:Int):Void;
 	public function add(src:Bytes):Void;
-	public function addString(v:String):Void;
+	public function addString(v:String,?encoding:Encoding):Void;
 	public function addInt32(v:Int):Void;
 	public function addInt64(v:haxe.Int64):Void;
 	public function addFloat(v:Float):Void;

+ 9 - 0
std/flash/Boot.hx

@@ -212,6 +212,15 @@ class Boot extends flash.display.MovieClip {
 		return new String(v);
 	}
 
+	static public function fromCodePoint( code : Int ) { // builds a string from `code` written as a UTF-16 surrogate pair; always writes two units, so it assumes code >= 0x10000
+		var o = new flash.utils.ByteArray();
+		o.endian = LITTLE_ENDIAN;
+		o.writeShort((code>>10) + 0xD7C0); // high surrogate: 0xD7C0 == 0xD800 - (0x10000 >> 10)
+		o.writeShort((code&0x3FF) + 0xDC00); // low surrogate from the low 10 bits
+		o.position = 0; // rewind so the bytes just written can be read back
+		return o.readMultiByte(4,"unicode"); // decode the 4 bytes into a String
+	}
+
 	static function __unprotect__( s : String ) {
 		return s;
 	}

+ 43 - 0
std/flash/_std/String.hx

@@ -0,0 +1,43 @@
+/*
+ * Copyright (C)2005-2018 Haxe Foundation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+@:coreApi
+extern class String { // Flash-specific String extern; only fromCharCode carries an inline body
+
+	var length(default,null) : Int;
+	function new(string:String) : Void;
+	function toUpperCase() : String;
+	function toLowerCase() : String;
+	function charAt(index : Int) : String;
+	function charCodeAt( index : Int) : Null<Int>;
+	function indexOf( str : String, ?startIndex : Int ) : Int;
+	function lastIndexOf( str : String, ?startIndex : Int ) : Int;
+	function split( delimiter : String ) : Array<String>;
+	function substr( pos : Int, ?len : Int ) : String;
+	function substring( startIndex : Int, ?endIndex : Int ) : String;
+	function toString() : String;
+
+	@:pure static inline function fromCharCode( code : Int ) : String untyped { // codes below 0x10000 use the native fromCharCode, larger ones go through flash.Boot.fromCodePoint
+		return code < 0x10000 ? String["fromCharCode"](code) : flash.Boot.fromCodePoint(code);
+	}
+	
+}

+ 2 - 2
std/haxe/format/JsonParser.hx

@@ -172,7 +172,7 @@ class JsonParser {
 				case 'u'.code:
 					var uc = Std.parseInt("0x" + str.substr(pos, 4));
 					pos += 4;
-					#if (neko || php || (cpp&&!hxcpp_smart_strings) || lua || eval)
+					#if (neko || (cpp&&!hxcpp_smart_strings))
 					if( uc <= 0x7F )
 						buf.addChar(uc);
 					else if( uc <= 0x7FF ) {
@@ -196,7 +196,7 @@ class JsonParser {
 				}
 				start = pos;
 			}
-			#if (neko || php || (cpp&&!hxcpp_smart_strings) )
+			#if (neko || (cpp&&!hxcpp_smart_strings) )
 			// ensure utf8 chars are not cut
 			else if( c >= 0x80 ) {
 				pos++;

+ 56 - 17
std/haxe/io/Bytes.hx

@@ -346,7 +346,8 @@ class Bytes {
 		setInt32(pos + 4, v.high);
 	}
 
-	public function getString( pos : Int, len : Int ) : String {
+	public function getString( pos : Int, len : Int, ?encoding : Encoding ) : String { // decodes `len` bytes starting at `pos` using `encoding`
+		if( encoding == null ) encoding = UTF8; // default to UTF-8; this was `==`, a comparison with no effect
 		#if !neko
 		if( pos < 0 || len < 0 || pos + len > length ) throw Error.OutsideBounds;
 		#end
@@ -354,23 +355,43 @@ class Bytes {
 		return try new String(untyped __dollar__ssub(b,pos,len)) catch( e : Dynamic ) throw Error.OutsideBounds;
 		#elseif flash
 		b.position = pos;
-		return b.readUTFBytes(len);
+		return encoding == RawNative ? b.readMultiByte(len, "unicode") : b.readUTFBytes(len);
 		#elseif cpp
 		var result:String="";
 		untyped __global__.__hxcpp_string_of_bytes(b,result,pos,len);
 		return result;
 		#elseif cs
-		return cs.system.text.Encoding.UTF8.GetString(b, pos, len);
+		switch (encoding) {
+			case UTF8 | null:
+				return cs.system.text.Encoding.UTF8.GetString(b, pos, len);
+			case RawNative:
+				return cs.system.text.Encoding.Unicode.GetString(b, pos, len);
+		}
 		#elseif java
-		try
-			return new String(b, pos, len, "UTF-8")
-		catch (e:Dynamic) throw e;
+		try {
+			switch (encoding) {
+				case UTF8 | null:
+					return new String(b, pos, len, "UTF-8");
+				case RawNative:
+					return new String(b, pos, len, "UTF-16LE");
+			}
+		} catch (e:Dynamic) {
+			throw e;
+		}
 		#elseif python
 		return python.Syntax.code("self.b[{0}:{0}+{1}].decode('UTF-8','replace')", pos, len);
 		#elseif lua
-		var begin = cast(Math.min(pos,b.length),Int);
-		var end = cast(Math.min(pos+len,b.length),Int);
-		return [for (i in begin...end) String.fromCharCode(b[i])].join("");
+
+		if (b.length - pos <= lua.Boot.MAXSTACKSIZE){
+			var end : Int = cast Math.min(b.length, pos+len) - 1;
+			return lua.NativeStringTools.char(lua.TableTools.unpack(untyped b, pos, end));
+		} else {
+			var tbl : lua.Table<Int,String> = lua.Table.create();
+			for (idx in pos...pos+len){
+				lua.Table.insert(tbl, lua.NativeStringTools.char(b[idx]));
+			}
+			return lua.Table.concat(tbl, '');
+		}
 		#else
 		var s = "";
 		var b = b;
@@ -407,6 +428,9 @@ class Bytes {
 		return getString(pos, len);
 	}
 
+	/**
+		Returns string representation of the bytes as UTF8
+	**/
 	public function toString() : String {
 		#if neko
 		return new String(untyped __dollar__ssub(b,0,length));
@@ -469,35 +493,50 @@ class Bytes {
 		#end
 	}
 
+	/**
+		Returns bytes representation of the given String, using specific encoding (UTF-8 by default)
+	**/
 	@:pure
-	public static function ofString( s : String ) : Bytes {
+	public static function ofString( s : String, ?encoding : Encoding ) : Bytes {
 		#if neko
 		return new Bytes(s.length,untyped __dollar__ssub(s.__s,0,s.length));
 		#elseif flash
 		var b = new flash.utils.ByteArray();
-		b.writeUTFBytes(s);
+		if( encoding == RawNative ) b.writeMultiByte(s,"unicode") else b.writeUTFBytes(s);
 		return new Bytes(b.length,b);
 		#elseif cpp
 		var a = new BytesData();
 		untyped __global__.__hxcpp_bytes_of_string(a,s);
 		return new Bytes(a.length, a);
 		#elseif cs
-		var b = cs.system.text.Encoding.UTF8.GetBytes(s);
+		var b = switch (encoding) {
+			case UTF8 | null:
+				cs.system.text.Encoding.UTF8.GetBytes(s);
+			case RawNative:
+				cs.system.text.Encoding.Unicode.GetBytes(s);
+		};
 		return new Bytes(b.Length, b);
 		#elseif java
-		try
-		{
-			var b:BytesData = untyped s.getBytes("UTF-8");
+		try {
+			var b:BytesData = switch (encoding) {
+				case UTF8 | null:
+					@:privateAccess s.getBytes("UTF-8");
+				case RawNative:
+					@:privateAccess s.getBytes("UTF-16LE");
+			};
 			return new Bytes(b.length, b);
+		} catch (e:Dynamic) {
+			throw e;
 		}
-		catch (e:Dynamic) throw e;
 
 		#elseif python
 			var b:BytesData = new python.Bytearray(s, "UTF-8");
 			return new Bytes(b.length, b);
 
 		#elseif lua
-			var bytes = [for (c in 0...s.length) StringTools.fastCodeAt(s,c)];
+			var bytes = [for (i in 0...lua.NativeStringTools.len(s)) {
+					lua.NativeStringTools.byte(s,i+1);
+			}];
 			return new Bytes(bytes.length, bytes);
 		#else
 		var a = new Array();

+ 3 - 3
std/haxe/io/BytesBuffer.hx

@@ -107,13 +107,13 @@ class BytesBuffer {
 		#end
 	}
 
-	public inline function addString( v : String ) {
+	public inline function addString( v : String, ?encoding : Encoding ) {
 		#if neko
 		untyped StringBuf.__add(b, v.__s);
 		#elseif flash
-		b.writeUTFBytes(v);
+		if( encoding == RawNative ) b.writeMultiByte(v, "unicode") else b.writeUTFBytes(v);
 		#else
-		add(Bytes.ofString(v));
+		add(Bytes.ofString(v,encoding));
 		#end
 	}
 

+ 2 - 2
std/haxe/io/BytesInput.hx

@@ -191,8 +191,8 @@ class BytesInput extends Input {
 	}
 
 	@:dox(hide)
-	override function readString( len : Int ) {
-		return try b.readUTFBytes(len) catch( e : Dynamic ) throw new Eof();
+	override function readString( len : Int, ?encoding : Encoding ) {
+		return try encoding == RawNative ? b.readMultiByte(len,"unicode") : b.readUTFBytes(len) catch( e : Dynamic ) throw new Eof();
 	}
 
 	#end

+ 5 - 2
std/haxe/io/BytesOutput.hx

@@ -116,8 +116,11 @@ class BytesOutput extends Output {
 	}
 
 	@:dox(hide)
-	override function writeString( s : String ) {
-		b.writeUTFBytes(s);
+	override function writeString( s : String, ?encoding : Encoding ) {
+		if( encoding == RawNative )
+			b.writeMultiByte(s, "unicode");
+		else
+			b.writeUTFBytes(s);
 	}
 
 	#end

+ 33 - 0
std/haxe/io/Encoding.hx

@@ -0,0 +1,33 @@
+/*
+ * Copyright (C)2005-2018 Haxe Foundation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+package haxe.io;
+
+/**
+	String binary encoding supported by Haxe I/O
+**/
+enum Encoding {
+	UTF8;
+	/**
+		Output the string the way the platform represents it in memory. This is the most efficient but is platform-specific
+	**/
+	RawNative;
+}

+ 2 - 2
std/haxe/io/Input.hx

@@ -300,13 +300,13 @@ class Input {
 	/**
 		Read and `len` bytes as a string.
 	**/
-	public function readString( len : Int ) : String {
+	public function readString( len : Int, ?encoding : Encoding ) : String {
 		var b = Bytes.alloc(len);
 		readFullBytes(b,0,len);
 		#if neko
 		return neko.Lib.stringReference(b);
 		#else
-		return b.toString();
+		return b.getString(0, len, encoding);
 		#end
 	}
 

+ 2 - 2
std/haxe/io/Output.hx

@@ -275,11 +275,11 @@ class Output {
 	/**
 		Write `s` string.
 	**/
-	public function writeString( s : String ) {
+	public function writeString( s : String, ?encoding : Encoding ) {
 		#if neko
 		var b = untyped new Bytes(s.length,s.__s);
 		#else
-		var b = Bytes.ofString(s);
+		var b = Bytes.ofString(s, encoding);
 		#end
 		writeFullBytes(b,0,b.length);
 	}

+ 1 - 1
std/haxe/xml/Parser.hx

@@ -378,7 +378,7 @@ class Parser
 							var c = s.fastCodeAt(1) == 'x'.code
 								? Std.parseInt("0" +s.substr(1, s.length - 1))
 								: Std.parseInt(s.substr(1, s.length - 1));
-							#if (neko || (cpp && !hxcpp_smart_strings) || php || lua || eval)
+							#if (neko || (cpp && !hxcpp_smart_strings))
 							if( c >= 128 ) {
 								// UTF8-encode it
 								if( c <= 0x7FF ) {

+ 12 - 6
std/hl/_std/haxe/io/Bytes.hx

@@ -119,13 +119,13 @@ class Bytes {
 		setInt32(pos, v.low);
 	}
 
-	public function getString( pos : Int, len : Int ) : String {
+	public function getString( pos : Int, len : Int, ?encoding : Encoding ) : String {
 		if( outRange(pos,len) ) throw Error.OutsideBounds;
 
 		var b = new hl.Bytes(len + 1);
 		b.blit(0, this.b, pos, len);
 		b[len] = 0;
-		return @:privateAccess String.fromUTF8(b);
+		return @:privateAccess (encoding == RawNative ? String.fromUCS2(b) : String.fromUTF8(b));
 	}
 
 	@:deprecated("readString is deprecated, use getString instead")
@@ -162,10 +162,16 @@ class Bytes {
 		return new Bytes(b,length);
 	}
 
-	public static function ofString( s : String ) : Bytes @:privateAccess {
-		var size = 0;
-		var b = s.bytes.utf16ToUtf8(0, size);
-		return new Bytes(b,size);
+	public static function ofString( s : String, ?encoding : Encoding ) : Bytes @:privateAccess {
+		if( encoding == null ) encoding = UTF8;
+		return switch( encoding ) {
+		case RawNative:
+			return new Bytes(s.bytes.sub(0,s.length << 1), s.length << 1);
+		case UTF8:
+			var size = 0;
+			var b = s.bytes.utf16ToUtf8(0, size);
+			return new Bytes(b,size);
+		}
 	}
 
 	public static function ofData( b : BytesData ) : Bytes {

+ 2 - 2
std/hl/_std/haxe/io/BytesBuffer.hx

@@ -64,9 +64,9 @@ class BytesBuffer {
 		__add(@:privateAccess src.b, 0, src.length);
 	}
 
-	public inline function addString( v : String ) : Void {
+	public inline function addString( v : String, ?encoding : Encoding ) : Void {
 		var len = 0;
-		@:privateAccess __add(v.bytes.utf16ToUtf8(0, len), 0, len);
+		@:privateAccess (encoding == RawNative ? __add(v.bytes,0,v.length<<1) : __add(v.bytes.utf16ToUtf8(0, len), 0, len));
 	}
 
 	public inline function addInt32( v : Int ) : Void {

+ 1 - 0
std/java/_std/String.hx

@@ -26,6 +26,7 @@
 
 	@:overload(function(b:haxe.io.BytesData, offset:Int, length:Int, charsetName:String):Void { })
 	@:overload(function(b:haxe.io.BytesData, offset:Int, length:Int):Void { })
+	@:overload(function(b:java.NativeArray<java.StdTypes.Char16>):Void { })
 	function new(string:String) : Void;
 
 	function toUpperCase() : String;

+ 2 - 5
std/java/internal/StringExt.hx

@@ -195,12 +195,9 @@ private typedef NativeString = String;
 		return me;
 	}
 
-	@:functionCode('
-		return java.lang.Character.toString( (char) code );
-	')
-	public static function fromCharCode(code:Int):NativeString
+	public static function fromCharCode(code:Int):String
 	{
-		return null;
+		return new String(java.lang.Character.toChars(code));
 	}
 }
 

+ 8 - 1
std/js/_std/String.hx

@@ -40,5 +40,12 @@
 		return @:privateAccess HxOverrides.substr(this, pos, len);
 	}
 
-	@:pure static function fromCharCode( code : Int ) : String;
+	@:pure static inline function fromCharCode( code : Int ) : String { // inlined at call sites as a direct String.fromCodePoint call
+		return js.Syntax.code("String.fromCodePoint({0})",code); 
+	}
+	
+	static function __init__() : Void { // installs a String.fromCodePoint fallback when the runtime has none
+		js.Syntax.code("if( String.fromCodePoint == null ) String.fromCodePoint = function(c) { return c < 0x10000 ? String.fromCharCode(c) : String.fromCharCode((c>>10)+0xD7C0)+String.fromCharCode((c&0x3FF)+0xDC00); }"); // fallback emits a surrogate pair for codes >= 0x10000
+	}
+	
 }

+ 37 - 21
std/js/_std/haxe/io/Bytes.hx

@@ -132,31 +132,38 @@ class Bytes {
 		setInt32(pos + 4, v.high);
 	}
 
-	public function getString( pos : Int, len : Int ) : String {
+	public function getString( pos : Int, len : Int, ?encoding : Encoding ) : String {
 		if( pos < 0 || len < 0 || pos + len > length ) throw Error.OutsideBounds;
+		if( encoding == null ) encoding = UTF8;
 		var s = "";
 		var b = b;
-		var fcc = String.fromCharCode;
 		var i = pos;
 		var max = pos+len;
-		// utf8-decode and utf16-encode
-		while( i < max ) {
-			var c = b[i++];
-			if( c < 0x80 ) {
-				if( c == 0 ) break;
-				s += fcc(c);
-			} else if( c < 0xE0 )
-				s += fcc( ((c & 0x3F) << 6) | (b[i++] & 0x7F) );
-			else if( c < 0xF0 ) {
-				var c2 = b[i++];
-				s += fcc( ((c & 0x1F) << 12) | ((c2 & 0x7F) << 6) | (b[i++] & 0x7F) );
-			} else {
-				var c2 = b[i++];
-				var c3 = b[i++];
-				var u = ((c & 0x0F) << 18) | ((c2 & 0x7F) << 12) | ((c3 & 0x7F) << 6) | (b[i++] & 0x7F);
-				// surrogate pair
-				s += fcc( (u >> 10) + 0xD7C0 );
-				s += fcc( (u & 0x3FF) | 0xDC00 );
+		switch( encoding ) {
+		case UTF8:
+			var debug = pos > 0;
+			// utf8-decode and utf16-encode
+			while( i < max ) {
+				var c = b[i++];
+				if( c < 0x80 ) {
+					if( c == 0 ) break;
+					s += String.fromCharCode(c);
+				} else if( c < 0xE0 )
+					s += String.fromCharCode( ((c & 0x3F) << 6) | (b[i++] & 0x7F) );
+				else if( c < 0xF0 ) {
+					var c2 = b[i++];
+					s += String.fromCharCode( ((c & 0x1F) << 12) | ((c2 & 0x7F) << 6) | (b[i++] & 0x7F) );
+				} else {
+					var c2 = b[i++];
+					var c3 = b[i++];
+					var u = ((c & 0x0F) << 18) | ((c2 & 0x7F) << 12) | ((c3 & 0x7F) << 6) | (b[i++] & 0x7F);
+					s += String.fromCharCode(u);
+				}
+			}
+		case RawNative:
+			while( i < max ) {
+				var c = b[i++] | (b[i++] << 8);
+				s += String.fromCharCode(c);
 			}
 		}
 		return s;
@@ -194,7 +201,16 @@ class Bytes {
 		return new Bytes(new BytesData(length));
 	}
 
-	public static function ofString( s : String ) : Bytes {
+	public static function ofString( s : String, ?encoding : Encoding ) : Bytes {
+		if( encoding == RawNative ) {
+			var buf = new js.html.Uint8Array(s.length << 1);
+			for( i in 0...s.length ) {
+				var c : Int = StringTools.fastCodeAt(s,i);
+				buf[i << 1] = c & 0xFF;
+				buf[(i << 1)|1] = c >> 8;
+			}
+			return new Bytes(buf.buffer);
+		}
 		var a = new Array();
 		// utf16-decode and utf8-encode
 		var i = 0;

+ 11 - 10
std/js/_std/haxe/io/BytesBuffer.hx

@@ -21,6 +21,7 @@
  */
 package haxe.io;
 
+@:coreApi
 class BytesBuffer {
 
 	var buffer : js.html.ArrayBuffer;
@@ -40,12 +41,12 @@ class BytesBuffer {
 		return pos;
 	}
 
-	public function addByte( byte : Int ) {
+	public function addByte( byte : Int ) : Void {
 		if( pos == size ) grow(1);
 		view.setUint8(pos++, byte);
 	}
 
-	public function add( src : Bytes ) {
+	public function add( src : Bytes ) : Void {
 		if( pos + src.length > size ) grow(src.length);
 		if( size == 0 ) return;
 		var sub = new js.html.Uint8Array(@:privateAccess src.b.buffer, @:privateAccess src.b.byteOffset, src.length);
@@ -53,36 +54,36 @@ class BytesBuffer {
 		pos += src.length;
 	}
 
-	public function addString( v : String ) {
-		add(Bytes.ofString(v));
+	public function addString( v : String, ?encoding : Encoding ) : Void {
+		add(Bytes.ofString(v,encoding));
 	}
 
-	public function addInt32( v : Int ) {
+	public function addInt32( v : Int ) : Void {
 		if( pos + 4 > size ) grow(4);
 		view.setInt32(pos, v, true);
 		pos += 4;
 	}
 
-	public function addInt64( v : haxe.Int64 ) {
+	public function addInt64( v : haxe.Int64 ) : Void {
 		if( pos + 8 > size ) grow(8);
 		view.setInt32(pos, v.low, true);
 		view.setInt32(pos + 4, v.high, true);
 		pos += 8;
 	}
 
-	public function addFloat( v : Float ) {
+	public function addFloat( v : Float ) : Void {
 		if( pos + 4 > size ) grow(4);
 		view.setFloat32(pos, v, true);
 		pos += 4;
 	}
 
-	public function addDouble( v : Float ) {
+	public function addDouble( v : Float ) : Void {
 		if( pos + 8 > size ) grow(8);
 		view.setFloat64(pos, v, true);
 		pos += 8;
 	}
 
-	public function addBytes( src : Bytes, pos : Int, len : Int ) {
+	public function addBytes( src : Bytes, pos : Int, len : Int ) : Void {
 		if( pos < 0 || len < 0 || pos + len > src.length ) throw Error.OutsideBounds;
 		if( this.pos + len > size ) grow(len);
 		if( size == 0 ) return;
@@ -91,7 +92,7 @@ class BytesBuffer {
 		this.pos += len;
 	}
 
-	function grow( delta : Int ) {
+	function grow( delta : Int ) : Void {
 		var req = pos + delta;
 		var nsize = size == 0 ? 16 : size;
 		while( nsize < req )

+ 11 - 1
std/lua/Boot.hx

@@ -32,6 +32,9 @@ class Boot {
 	static var _;
 	static var _fid = 0;
 
+	// A max stack size to respect for unpack operations
+	public static var MAXSTACKSIZE (default, null) = 1000;
+
 	public static var platformBigEndian = NativeStringTools.byte(NativeStringTools.dump(function(){}),7) > 0;
 
 	static var hiddenFields : Table<String,Bool> = untyped __lua__("{__id__=true, hx__closures=true, super=true, prototype=true, __fields__=true, __ifields__=true, __class__=true, __properties__=true}");
@@ -188,7 +191,14 @@ class Boot {
 			}
 			case "boolean" : untyped tostring(o);
 			case "string"  : o;
-			case "userdata": "<userdata>";
+			case "userdata": {
+				var mt = lua.Lua.getmetatable(o);
+				if (mt != null && mt.__tostring != null){
+					lua.Lua.tostring(o);
+				} else {
+					"<userdata>";
+				}
+			}
 			case "function": "<function>";
 			case "thread"  : "<thread>";
 			case "table": {

+ 36 - 41
std/lua/NativeStringTools.hx

@@ -1,6 +1,6 @@
 package lua;
 /**
-	These are all externs for the base Lua "string" class, which functions 
+	These are all externs for the base Lua "string" class, which functions
 	as an additional set of string tools.
 
 	Note that all relevant indexes are "1" based.
@@ -8,14 +8,14 @@ package lua;
 @:native("_G.string")
 extern class NativeStringTools {
 	/**
-		Receives a string and returns its length. The empty string `""` has 
+		Receives a string and returns its length. The empty string `""` has
 		length `0`. Embedded zeros are counted, so `"a\000bc\000"` has length `5`.
 	**/
 	public static function len(str : String): Int;
 
 	/**
-		Receives zero or more integers. Returns a string with length equal to the 
-		number of arguments, in which each character has the internal numerical 
+		Receives zero or more integers. Returns a string with length equal to the
+		number of arguments, in which each character has the internal numerical
 		code equal to its corresponding argument.
 		Note that numerical codes are not necessarily portable across platforms.
 	**/
@@ -24,31 +24,26 @@ extern class NativeStringTools {
 
 	// TODO: make a note about handling matched groups with multireturn
 	/**
-		Returns the substring of `str` that starts at `start` and continues until `end`; 
-		`start` and `end` can be negative. If `end` is absent, then it is assumed to be 
-		equal to `-1` (which is the same as the string length). 
-		In particular, the call `sub(str,1,end)` returns a prefix of `str` 
-		with length `end`, and `sub(str, -end)` returns a suffix of `str` with 
+		Returns the substring of `str` that starts at `start` and continues until `end`;
+		`start` and `end` can be negative. If `end` is absent, then it is assumed to be
+		equal to `-1` (which is the same as the string length).
+		In particular, the call `sub(str,1,end)` returns a prefix of `str`
+		with length `end`, and `sub(str, -end)` returns a suffix of `str` with
 		length `start`.
 	**/
 	public static function sub(str : String, start : Int, ?end : Int): StringSub;
 
 	/**
-		Returns the character code at position `index` of `str`.
-	**/
-	public static function charCodeAt(str : String, index : Int): Int;
-
-	/**
-		Looks for the first match of pattern in the string `str`. 
-		If it finds a match, then `find` returns the indices of `str` where this 
+		Looks for the first match of pattern in the string `str`.
+		If it finds a match, then `find` returns the indices of `str` where this
 		occurrence starts and ends.
-		
-		@param target If the target has captures, then in a successful match the 
+
+		@param target If the target has captures, then in a successful match the
 		       captured values are also returned, after the two indices.
 		@param start specifies where to start the search; its default value is `1`
-		       and can be negative. 
-		@param plain turns off the pattern matching facilities, so the function does 
-		       a plain "find substring" operation, with no characters in pattern 
+		       and can be negative.
+		@param plain turns off the pattern matching facilities, so the function does
+		       a plain "find substring" operation, with no characters in pattern
 		       being considered "magic". Note that if plain is given, then `start` must be given as well.
 	**/
 	public static function find(str : String, target : String, ?start : Int, ?plain : Bool): StringFind;
@@ -60,63 +55,63 @@ extern class NativeStringTools {
 	public static function byte(str : String, ?index : Int) : Int;
 
 	/**
-		Returns a formatted version of its variable number of arguments following 
-		the description given in its first argument (which must be a string). 
-		The format string follows the same rules as the printf family of standard C 
-		functions. The only differences are that the options/modifiers 
-		`*`, `l`, `L`, `n`, `p`, and `h` are not supported and that there is an 
+		Returns a formatted version of its variable number of arguments following
+		the description given in its first argument (which must be a string).
+		The format string follows the same rules as the printf family of standard C
+		functions. The only differences are that the options/modifiers
+		`*`, `l`, `L`, `n`, `p`, and `h` are not supported and that there is an
 		extra option, `q`. The `q` option formats a string in a form suitable to be
-		safely read back by the Lua interpreter: the string is written between 
-		double quotes, and all double quotes, newlines, embedded zeros, 
+		safely read back by the Lua interpreter: the string is written between
+		double quotes, and all double quotes, newlines, embedded zeros,
 		and backslashes in the string are correctly escaped when written.
 		For instance, the call
    `string.format('%q', 'a string with "quotes" and \n new line')`
 		will produce the string:
 		`"a string with \"quotes\" and \
       new line"`
-		
-		The options `c`, `d` `E`, `e`, `f`, `g`, `G`, `i`, `o`, `u, `X-, and `x` all 
+
+		The options `c`, `d` `E`, `e`, `f`, `g`, `G`, `i`, `o`, `u, `X-, and `x` all
 		expect a number as argument, whereas `q` and `s` expect a string.
-		
-		This function does not accept string values containing embedded zeros, 
+
+		This function does not accept string values containing embedded zeros,
 		except as arguments to the `q` option.
 	**/
 	public static function format(str : String, ?e1 : Dynamic, ?e2 : Dynamic, ?e3 : Dynamic, ?e4 : Dynamic): String;
 
 	/**
-		
+
 	**/
 	@:overload(   function     (str : String, pattern : String, replace : String->Void,   ?n : Int): String {})
 	@:overload(   function     (str : String, pattern : String, replace : String->String, ?n : Int): String {})
 	public static function gsub(str : String, pattern : String, replace : String,		  ?n : Int): String;
 
 	/**
-		Returns an iterator function that, each time it is called, returns the next 
-		captures from pattern over string `str`. If `pattern` specifies no captures, 
+		Returns an iterator function that, each time it is called, returns the next
+		captures from pattern over string `str`. If `pattern` specifies no captures,
 		then the whole match is produced in each call.
 	**/
 	@:overload(   function     (str : String, pattern : String, match : Void->String,   ?n : Int): String->Void {})
 	public static function gmatch(str : String, pattern : String): Void->String;
 
 	/**
-		Looks for the first match of pattern in the string s. If it finds one, 
+		Looks for the first match of pattern in the string s. If it finds one,
 		then match returns the captures from the pattern; otherwise it returns `null`.
 		If pattern specifies no captures, then the whole match is returned.
-		The optional argument `n` specifies where to start the search; 
+		The optional argument `n` specifies where to start the search;
 		its default value is `1` and can be negative.
 	**/
 	public static function match(str : String, pattern : String, ?n : Int): String;
 
 	/**
-		Receives a string and returns a copy of this string with all lowercase 
-		letters changed to uppercase. All other characters are left unchanged. 
+		Receives a string and returns a copy of this string with all lowercase
+		letters changed to uppercase. All other characters are left unchanged.
 		The definition of what a lowercase letter is depends on the current locale.
 	**/
 	public static function upper(str:String) : String;
 
 	/**
-		Receives a string and returns a copy of this string with all uppercase 
-		letters changed to lowercase. All other characters are left unchanged. 
+		Receives a string and returns a copy of this string with all uppercase
+		letters changed to lowercase. All other characters are left unchanged.
 		The definition of what an uppercase letter is depends on the current locale.
 	**/
 	public static function lower(str:String) : String;

+ 14 - 17
std/lua/_std/String.hx

@@ -23,7 +23,7 @@
 import lua.Lua;
 import lua.Table;
 import lua.Boot;
-import lua.NativeStringTools;
+import lua.lib.luautf8.Utf8;
 
 @:coreApi
 @:extern
@@ -35,7 +35,7 @@ class String {
 
 	@:keep
 	static function __index(s:Dynamic, k:Dynamic) : Dynamic {
-		if (k == "length") return NativeStringTools.len(s);
+		if (k == "length") return Utf8.len(s);
 		else if (Reflect.hasField(untyped String.prototype, k)) return untyped String.prototype[k];
 		else if (__oldindex != null) {
 			if (Lua.type(__oldindex) == "function"){
@@ -48,12 +48,12 @@ class String {
 		else return null;
 	}
 
-	public inline function toUpperCase() : String return NativeStringTools.upper(this);
-	public inline function toLowerCase() : String return NativeStringTools.lower(this);
+	public inline function toUpperCase() : String return Utf8.upper(this);
+	public inline function toLowerCase() : String return Utf8.lower(this);
 	public inline function indexOf( str : String, ?startIndex : Int ) : Int {
 		if (startIndex == null) startIndex = 1;
 		else startIndex += 1;
-		var r = NativeStringTools.find(this, str, startIndex, true).begin;
+		var r = Utf8.find(this, str, startIndex, true).begin;
 		if (r != null && r > 0) return r-1;
 		else return -1;
 	}
@@ -77,7 +77,7 @@ class String {
 		while (idx != null){
 			var newidx = 0;
 			if (delimiter.length > 0){
-				newidx = NativeStringTools.find(this, delimiter, idx, true).begin;
+				newidx = Utf8.find(this, delimiter, idx, true).begin;
 			} else if (idx >= this.length){
 				newidx = null;
 			} else {
@@ -85,11 +85,11 @@ class String {
 			}
 
 			if (newidx != null){
-				var match = NativeStringTools.sub(this, idx, newidx-1).match;
+				var match = Utf8.sub(this, idx, newidx-1).match;
 				ret.push(match);
 				idx = newidx + delimiter.length;
 			} else {
-				ret.push(NativeStringTools.sub(this,idx,this.length).match);
+				ret.push(Utf8.sub(this,idx,this.length).match);
 				idx = null;
 			}
 		}
@@ -105,20 +105,17 @@ class String {
 		if (startIndex < 0) startIndex = 0;
 		if (endIndex < startIndex) {
 			// swap the index positions
-			return NativeStringTools.sub(this, endIndex+1, startIndex).match;
+			return Utf8.sub(this, endIndex+1, startIndex).match;
 		} else {
-			return NativeStringTools.sub(this, startIndex+1, endIndex).match;
+			return Utf8.sub(this, startIndex+1, endIndex).match;
 		}
 	}
 
-	function get_length() : Int {
-		return NativeStringTools.len(this);
-	}
 	public inline function charAt( index : Int) : String {
-		return NativeStringTools.sub(this,index+1, index+1).match;
+		return Utf8.sub(this,index+1, index+1).match;
 	}
 	public inline function charCodeAt( index : Int) : Null<Int> {
-		return NativeStringTools.byte(this,index+1);
+		return Utf8.byte(this,index+1);
 	}
 
 	public inline function substr( pos : Int, ?len : Int ) : String {
@@ -126,11 +123,11 @@ class String {
 		else if (len < 0) len = length + len;
 		if (pos < 0) pos = length + pos;
 		if (pos < 0) pos = 0;
-		return NativeStringTools.sub(this, pos + 1, pos+len).match;
+		return Utf8.sub(this, pos + 1, pos+len).match;
 	}
 
 	public inline static function fromCharCode( code : Int ) : String {
-		return NativeStringTools.char(code);
+		return Utf8.char(code);
 	}
 
 }

+ 0 - 250
std/lua/_std/haxe/Utf8.hx

@@ -1,250 +0,0 @@
-/*
- * Copyright (C)2005-2018 Haxe Foundation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
-package haxe;
-
-import lua.NativeStringTools;
-
-class Utf8 {
-
-    var __b : String;
-
-    public function new( ?size : Int ) {
-		__b = "";
-    }
-
-    public inline function addChar( c : Int ) : Void {
-		__b += char(c);
-    }
-
-    public inline function toString() : String {
-		return __b;
-    }
-
-    static inline function decodeChar( s : String, pos : Int, code : Int, width : Int ) {
-        return
-            if (width == 1)
-                code;
-            else if (width == 2)
-                ((code & 0x3F) << 6) |
-                (s.charCodeAt(pos+1) & 0x7F);
-            else if (width == 3)
-                ((code & 0x1F) << 12) |
-                ((s.charCodeAt(pos+1) & 0x7F) << 6) |
-                (s.charCodeAt(pos+2) & 0x7F);
-            else
-                ((code & 0x0F) << 18) |
-                ((s.charCodeAt(pos+1) & 0x7F) << 12) |
-                ((s.charCodeAt(pos+2) & 0x7F) << 6) |
-                (s.charCodeAt(pos+3) & 0x7F);
-    }
-
-    public static function iter( s : String, chars : Int -> Void ) {
-		var cur = 0;
-		while (cur < s.length){
-			var code = s.charCodeAt(cur);
-			var width = charWidth(code);
-			chars( decodeChar( s, cur, code, width ) );
-			cur += width;
-		}
-    }
-
-    public static function encode( s : String ) : String {
-		// ported from : http://phpjs.org/functions/utf8_encode/
-		if (s == null ) {
-			return '';
-		}
-		var string = (s + ''); // .replace(/\r\n/g, "\n").replace(/\r/g, "\n");
-		var utftext = '';
-		var start = 0;
-		var end = 0;
-		var n = 0;
-		while (n < s.length) {
-			var c1 = string.charCodeAt(n);
-			var enc = null;
-
-			if (c1 < 128) {
-			end++;
-			} else if (c1 > 127 && c1 < 2048) {
-			enc = String.fromCharCode( (c1 >> 6) | 192)
-				+ String.fromCharCode( (c1 & 63) | 128);
-			} else if ((c1 & 0xF800) != 0xD800) {
-			enc = String.fromCharCode( (c1 >> 12) | 224)
-				+ String.fromCharCode( ((c1 >> 6) & 63) | 128)
-				+ String.fromCharCode( (c1 & 63) | 128);
-			} else { // surrogate pairs
-			if ((c1 & 0xFC00) != 0xD800) {
-				throw 'Unmatched trail surrogate at ' + n;
-			}
-			var c2 = string.charCodeAt(++n);
-			if ((c2 & 0xFC00) != 0xDC00) {
-				throw 'Unmatched lead surrogate at ' + (n - 1);
-			}
-			c1 = ((c1 & 0x3FF) << 10) + (c2 & 0x3FF) + 0x10000;
-			enc = String.fromCharCode( (c1 >> 18) | 240)
-				+ String.fromCharCode( ((c1 >> 12) & 63) | 128)
-				+ String.fromCharCode(((c1 >> 6) & 63) | 128)
-				+ String.fromCharCode((c1 & 63) | 128);
-			}
-			if (enc != null) {
-			if (end > start) {
-				utftext += string.substring(start, end);
-			}
-			utftext += enc;
-			start = end = n + 1;
-			}
-			n++;
-		}
-
-		if (end > start) {
-			utftext += string.substring(start, s.length);
-		}
-
-		return utftext;
-
-    }
-
-    public static function decode( s : String ) : String {
-		var ret = new StringBuf();
-		iter(s, function(c){
-			if( c == 8364 ) // euro symbol
-			c = 164;
-			else if( c > 255 ){
-			// throw new RangeError('Utf8 decode invalid character ($c)');
-			throw 'Utf8::decode invalid character ($c)';
-			}
-
-			if (c != 0xFEFF) // BOM
-			ret.add(String.fromCharCode(c));
-		});
-		return ret.toString();
-    }
-
-    public static inline function charCodeAt( s : String, index : Int ) : Int {
-		var cur_idx = 0;
-		var pos = 0;
-		for (i in 0...index){
-			pos += charWidth(s.charCodeAt(pos));
-		}
-		var ret = 0;
-		var code = s.charCodeAt(pos);
-		var bytes = charWidth(code);
-		return decodeChar( s, pos, code, bytes );
-    }
-
-    public static function validate( s : String ) : Bool {
-		if (s == null) return false;
-		var cur = 0;
-		while (cur < s.length){
-			var c1 = s.charCodeAt(cur++);
-			if (c1 < 0x80) continue;
-			if (c1 < 0xC0) return false;
-			if (s.length <= cur) return false;
-			var c2 = s.charCodeAt(cur++);
-			if (c1 < 0xE0) {
-				if ((c1 & 0x1E != 0) && (c2 & 0xC0 == 0x80)) continue;
-				return false;
-			}
-			if (s.length <= cur) return false;
-			var c3 = s.charCodeAt(cur++);
-			if (c1 < 0xF0) {
-				if (((c1 & 0x0F != 0) || (c2 & 0x20 != 0)) && (c2 & 0xC0 == 0x80) && (c3 & 0xC0 == 0x80)
-						&& !(c1 == 0xED && 0xA0 <= c2 && c2 <= 0xBF))
-					continue;
-				return false;
-			}
-			if (s.length <= cur) return false;
-			var c4 = s.charCodeAt(cur++);
-			if (c1 < 0xF8) {
-				if (((c1 & 0x07 != 0) || (c2 & 0x30 != 0)) && (c2 & 0xC0 == 0x80) && (c3 & 0xC0 == 0x80) && (c4 & 0xC0 == 0x80)
-						&& !((c1 == 0xF4 && c2 > 0x8F) || c1 > 0xF4))
-					continue;
-				return false;
-			}
-			return false;
-		}
-		return true;
-    }
-
-    public static inline function length( s : String ) : Int {
-		var pos = 0;
-		var len = 0;
-		while (pos < s.length){
-			pos += charWidth(s.charCodeAt(pos));
-			len++;
-		}
-		return len;
-    }
-
-    public static function compare( a : String, b : String ) : Int {
-		return a > b ? 1 : (a == b ? 0 : -1);
-    }
-
-    public static inline function sub( s : String, pos : Int, len : Int ) : String {
-		var startpos = 0;
-		var ret = new StringBuf();
-		for (i in 0...pos){
-			startpos += charWidth(s.charCodeAt(startpos));
-		}
-		var endpos = startpos;
-		for (i in 0...len){
-			endpos += charWidth(s.charCodeAt(endpos));
-		}
-		return s.substring(startpos, endpos);
-    }
-
-    static function charWidth(c:Int) : Int {
-		return   if (c >  0   && c <= 127) 1;
-			else if (c >= 194 && c <= 223) 2;
-			else if (c >= 224 && c <= 239) 3;
-			else if (c >= 240 && c <= 244) 4;
-			else null;
-    }
-
-    public static function char( unicode : Int ) : String {
-		if (unicode <= 0x7F) {
-			return String.fromCharCode(unicode);
-		} else if (unicode <= 0x7FF) {
-			var b0 = 0xC0 + Math.floor(unicode / 0x40);
-			var b1 = 0x80 + (unicode % 0x40);
-			return NativeStringTools.char(b0, b1);
-		} else if (unicode <= 0xFFFF) {
-			var b0 = 0xE0 +  Math.floor(unicode / 0x1000);
-			var b1 = 0x80 + (Math.floor(unicode / 0x40) % 0x40);
-			var b2 = 0x80 + (unicode % 0x40);
-			return NativeStringTools.char(b0, b1, b2);
-		} else if (unicode <= 0x10FFFF) {
-			var code = unicode;
-			var b3   = 0x80 + (code % 0x40);
-			code     = Math.floor(code / 0x40);
-			var b2   = 0x80 + (code % 0x40);
-			code     = Math.floor(code / 0x40);
-			var b1   = 0x80 + (code % 0x40);
-			code     = Math.floor(code / 0x40);
-			var b0   = 0xF0 + code;
-
-			return NativeStringTools.char(b0, b1, b2, b3);
-		} else {
-			throw 'Unicode greater than U+10FFFF';
-		}
-    }
-}
-

+ 6 - 5
std/lua/_std/sys/io/Process.hx

@@ -27,6 +27,7 @@ import lua.lib.luv.Signal;
 import lua.lib.luv.Loop;
 import lua.Boot;
 import lua.Table;
+import lua.NativeStringTools;
 
 import haxe.io.Bytes;
 import haxe.io.Error;
@@ -74,9 +75,9 @@ class Process {
 
 
 	public function new( cmd : String, ?args : Array<String>, ?detached : Bool){
-	
+
 		if( detached ) throw "Detached process is not supported on this platform";
-	
+
 		var _stdout = new Pipe(false);
 		var _stderr = new Pipe(false);
 		var _stdin  = new Pipe(false);
@@ -133,7 +134,7 @@ private class ProcessInput extends haxe.io.Input {
 
 	override public function readByte() {
 		var err_str = null;
-		if (buf == null || idx >= buf.length){
+		if (buf == null || idx >= NativeStringTools.len(buf)){
 			buf = null;
 			idx = 0;
 			var pending = true;
@@ -147,7 +148,7 @@ private class ProcessInput extends haxe.io.Input {
 		}
 		if (buf == null) throw new haxe.io.Eof();
 		if (err_str != null) throw err_str;
-		var code : Int =  cast buf.charCodeAt(idx++);
+		var code = NativeStringTools.byte(buf, ++idx);
 		return code;
 	}
 
@@ -182,7 +183,7 @@ private class ProcessOutput extends haxe.io.Output {
 	}
 
 	override public function writeByte(c : Int ) : Void {
-		b.write(String.fromCharCode(c));
+		b.write(NativeStringTools.char(c));
 	}
 
 	override public function close(){

+ 117 - 0
std/lua/lib/luautf8/Utf8.hx

@@ -0,0 +1,117 @@
+package lua.lib.luautf8;
+/**
+	These are all externs for the lua-utf8 library, which functions
+	as an additional set of string tools.
+
+	Note that all relevant indexes are "1" based.
+**/
+@:luaRequire('lua-utf8')
+extern class Utf8 {
+	/**
+		Receives a string and returns its length. The empty string `""` has
+		length `0`. Embedded zeros are counted, so `"a\000bc\000"` has length `5`.
+	**/
+	public static function len(str : String): Int;
+
+	/**
+		Receives zero or more integers. Returns a string with length equal to the
+		number of arguments, in which each character has the internal numerical
+		code equal to its corresponding argument.
+		Note that numerical codes are not necessarily portable across platforms.
+	**/
+	public static function char(codes: haxe.extern.Rest<Int>): String;
+
+
+	/**
+		Returns the substring of `str` that starts at `start` and continues until `end`;
+		`start` and `end` can be negative. If `end` is absent, then it is assumed to be
+		equal to `-1` (which is the same as the string length).
+		In particular, the call `sub(str,1,end)` returns a prefix of `str`
+		with length `end`, and `sub(str, -end)` returns a suffix of `str` with
+		length `end`. The substring itself is the `match` field of the returned `StringSub`.
+	**/
+	public static function sub(str : String, start : Int, ?end : Int): StringSub;
+
+	/**
+		Returns the character code at position `index` of `str`.
+	**/
+	public static function charCodeAt(str : String, index : Int): Int;
+
+	/**
+		Looks for the first match of pattern in the string `str`.
+		If it finds a match, then `find` returns the indices of `str` where this
+		occurrence starts and ends.
+
+		@param target If the target has captures, then in a successful match the
+		       captured values are also returned, after the two indices.
+		@param start specifies where to start the search; its default value is `1`
+		       and can be negative.
+		@param plain turns off the pattern matching facilities, so the function does
+		       a plain "find substring" operation, with no characters in pattern
+		       being considered "magic". Note that if plain is given, then `start` must be given as well.
+	**/
+	public static function find(str : String, target : String, ?start : Int, ?plain : Bool): StringFind;
+
+	/**
+		Returns the internal numerical codes of the characters `str[index]`.
+		Note that numerical codes are not necessarily portable across platforms.
+	**/
+	public static function byte(str : String, ?index : Int) : Int;
+
+	/**
+		Returns a copy of `str` with matches of `pattern` replaced as given by `replace` (a string or a callback). NOTE(review): assumed to follow Lua's `string.gsub`, where `n` caps the number of replacements - confirm against lua-utf8.
+	**/
+	@:overload(   function     (str : String, pattern : String, replace : String->Void,   ?n : Int): String {})
+	@:overload(   function     (str : String, pattern : String, replace : String->String, ?n : Int): String {})
+	public static function gsub(str : String, pattern : String, replace : String,		  ?n : Int): String;
+
+	/**
+		Returns an iterator function that, each time it is called, returns the next
+		captures from pattern over string `str`. If `pattern` specifies no captures,
+		then the whole match is produced in each call.
+	**/
+	@:overload(   function     (str : String, pattern : String, match : Void->String,   ?n : Int): String->Void {})
+	public static function gmatch(str : String, pattern : String): Void->String;
+
+	/**
+		Looks for the first match of `pattern` in the string `str`. If it finds one,
+		then `match` returns the captures from the pattern; otherwise it returns `null`.
+		If `pattern` specifies no captures, then the whole match is returned.
+		The optional argument `n` specifies where to start the search;
+		its default value is `1` and can be negative.
+	**/
+	public static function match(str : String, pattern : String, ?n : Int): String;
+
+	/**
+		Receives a string and returns a copy of this string with all lowercase
+		letters changed to uppercase. All other characters are left unchanged.
+		The definition of what a lowercase letter is depends on the current locale.
+	**/
+	public static function upper(str:String) : String;
+
+	/**
+		Receives a string and returns a copy of this string with all uppercase
+		letters changed to lowercase. All other characters are left unchanged.
+		The definition of what an uppercase letter is depends on the current locale.
+	**/
+	public static function lower(str:String) : String;
+
+	/** Returns an iterator function over `str`; each call produces a `StringCodePoint` (a `position` and a `codepoint`). **/
+	public static function codes(str : String) : Void->StringCodePoint;
+
+}
+
+@:multiReturn extern class StringFind {
+	var begin : Int;
+	var end : Int;
+}
+
+@:multiReturn extern class StringSub {
+	var match : String;
+	var count : Int;
+}
+
+@:multiReturn extern class StringCodePoint {
+	var position : Int;
+	var codepoint : Int;
+}

+ 35 - 26
std/php/Boot.hx

@@ -525,6 +525,15 @@ class Boot {
 	public static function dynamicString( str:String ) : HxDynamicStr {
 		return @:privateAccess new HxDynamicStr(str);
 	}
+
+	static public function utf8CharAt(str:String, index:Int):Null<String> {
+		if (index < 0 || index >= str.length) {
+			return null;
+		}
+		//preg_split() is faster than mb_substr()
+		var chars = Global.preg_split('//u', str, -1, Const.PREG_SPLIT_NO_EMPTY);
+		return chars == false ? null : (chars:NativeArray)[index];
+	}
 }
 
 
@@ -626,27 +635,21 @@ private class HxEnum {
 private class HxString {
 
 	public static function toUpperCase( str:String ) : String {
-		return Global.strtoupper(str);
+		return Global.mb_strtoupper(str, 'UTF-8');
 	}
 
 	public static function toLowerCase( str:String ) : String {
-		return Global.strtolower(str);
+		return Global.mb_strtolower(str, 'UTF-8');
 	}
 
 	public static function charAt( str:String, index:Int) : String {
-		if (index < 0 || index >= str.length) {
-			return '';
-		} else {
-			return (str:NativeString)[index];
-		}
+		return Syntax.coalesce(Boot.utf8CharAt(str, index), '');
 	}
 
 	public static function charCodeAt( str:String, index:Int) : Null<Int> {
-		if (index < 0 || index >= str.length) {
-			return null;
-		} else {
-			return Global.ord((str:NativeString)[index]);
-		}
+		var char = Boot.utf8CharAt(str, index);
+		if(char == null) return null;
+		return Global.mb_ord(char, 'UTF-8');
 	}
 
 	public static function indexOf( str:String, search:String, startIndex:Int = null ) : Int {
@@ -655,12 +658,17 @@ private class HxString {
 		} else if (startIndex < 0) {
 			startIndex += str.length;
 		}
-		var index = Global.strpos(str, search, startIndex);
+		var index = Global.mb_strpos(str, search, startIndex, 'UTF-8');
 		return (index == false ? -1 : index);
 	}
 
 	public static function lastIndexOf( str:String, search:String, startIndex:Int = null ) : Int {
-		var index = Global.strrpos(str, search, (startIndex == null ? 0 : startIndex - str.length));
+		if(startIndex == null) {
+			startIndex = 0;
+		} else {
+			startIndex = startIndex - str.length;
+		}
+		var index = Global.mb_strrpos(str, search, startIndex, 'UTF-8');
 		if (index == false) {
 			return -1;
 		} else {
@@ -670,9 +678,14 @@ private class HxString {
 
 	public static function split( str:String, delimiter:String ) : Array<String> {
 		if (delimiter == '') {
-			return @:privateAccess Array.wrap(Global.str_split(str));
+			var arr:NativeArray = Global.preg_split('//u', str, -1, Const.PREG_SPLIT_NO_EMPTY); //empty delimiter: one element per UTF-8 character
+			return @:privateAccess Array.wrap(arr);
 		} else {
-			return @:privateAccess Array.wrap(Global.explode(delimiter, str));
+			var prev = Global.mb_regex_encoding(); //don't mess with user-defined encoding: remember it here...
+			Global.mb_regex_encoding('UTF-8');
+			var parts = Global.mb_split(Global.preg_quote(delimiter), str);
+			Global.mb_regex_encoding(prev); //...and restore it before returning (a statement placed after `return` would never run)
+			return @:privateAccess Array.wrap(parts);
 		}
 	}
 
@@ -682,12 +695,7 @@ private class HxString {
 		} else if (pos >= str.length) {
 			return '';
 		}
-		if (len == null) {
-			return Global.substr(str, pos);
-		} else {
-			var result = Global.substr(str, pos, len);
-			return (result == false ? '' : result);
-		}
+		return Global.mb_substr(str, pos, len, 'UTF-8');
 	}
 
 	public static function substring( str:String, startIndex:Int, ?endIndex:Int ) : String {
@@ -696,14 +704,15 @@ private class HxString {
 		} else if (endIndex < 0) {
 			endIndex = 0;
 		}
-		if (startIndex < 0) startIndex = 0;
+		if (startIndex < 0) {
+			startIndex = 0;
+		}
 		if (startIndex > endIndex) {
 			var tmp = endIndex;
 			endIndex = startIndex;
 			startIndex = tmp;
 		}
-		var result = Global.substr(str, startIndex, endIndex - startIndex);
-		return (result == false ? '' : result);
+		return Global.mb_substr(str, startIndex, endIndex - startIndex, 'UTF-8');
 	}
 
 	public static function toString( str:String ) : String {
@@ -711,7 +720,7 @@ private class HxString {
 	}
 
 	public static function fromCharCode( code:Int ) : String {
-		return Global.chr(code);
+		return Global.mb_chr(code, 'UTF-8');
 	}
 }
 

+ 42 - 0
std/php/Global.hx

@@ -957,6 +957,11 @@ extern class Global {
 	**/
 	static function mb_check_encoding( str:String = null, ?encoding:String ) : Bool;
 
+	/**
+		@see http://php.net/manual/en/function.mb-split.php
+	**/
+	static function mb_split( pattern:String, str:String, ?limit:Int ) : NativeIndexedArray<String>;
+
 	/**
 		@see http://php.net/manual/en/function.mb-strlen.php
 	**/
@@ -967,6 +972,43 @@ extern class Global {
 	**/
 	static function mb_substr( str:String, start:Int, length:Int = null, ?encoding:String ) : String;
 
+	/**
+		@see http://php.net/manual/en/function.mb-chr.php
+		(Polyfilled for php 7.0)
+	**/
+	static function mb_chr( cp:Int, ?encoding:String ) : String;
+
+	/**
+		@see http://php.net/manual/en/function.mb-ord.php
+		(Polyfilled for php 7.0)
+	**/
+	static function mb_ord( str:String, ?encoding:String ) : Int;
+
+	/**
+		@see http://php.net/manual/en/function.mb-regex-encoding.php
+	**/
+	static function mb_regex_encoding( ?encoding:String ) : EitherType<Bool,String>;
+
+	/**
+		@see http://php.net/manual/en/function.mb-strtoupper.php
+	**/
+	static function mb_strtoupper( str:String, ?encoding:String ) : String;
+
+	/**
+		@see http://php.net/manual/en/function.mb-strpos.php
+	**/
+	static function mb_strpos( haystack:String, needle:String, ?offset:Int, ?encoding:String ) : EitherType<Int,Bool>;
+
+	/**
+		@see http://php.net/manual/en/function.mb-strrpos.php
+	**/
+	static function mb_strrpos( haystack:String, needle:String, ?offset:Int, ?encoding:String ) : EitherType<Int,Bool>;
+
+	/**
+		@see http://php.net/manual/en/function.mb-strtolower.php
+	**/
+	static function mb_strtolower( str:String, ?encoding:String ) : String;
+
 	/**
 		@see http://php.net/manual/en/function.proc-open.php
 	**/

+ 43 - 0
std/php/_polyfills.php

@@ -7,4 +7,47 @@
  */
 namespace { //Namespace declaration is required because this file is included under non-root namespace.
 
+	/**
+	 * @see http://php.net/manual/en/function.mb-chr.php
+	 */
+	if(!function_exists('mb_chr')) {
+		function mb_chr($code, $encoding = null) {
+			if($encoding !== 'UTF-8') {
+				throw new Exception("$encoding is not supported in mb_chr() polyfill.");
+			}
+			if (0x80 > $code %= 0x200000) {
+				$s = chr($code);
+			} elseif (0x800 > $code) {
+				$s = chr(0xC0 | $code >> 6) . chr(0x80 | $code & 0x3F);
+			} elseif (0x10000 > $code) {
+				$s = chr(0xE0 | $code >> 12) . chr(0x80 | $code >> 6 & 0x3F) . chr(0x80 | $code & 0x3F);
+			} else {
+				$s = chr(0xF0 | $code >> 18) . chr(0x80 | $code >> 12 & 0x3F) . chr(0x80 | $code >> 6 & 0x3F) . chr(0x80 | $code & 0x3F);
+			}
+			return $s;
+		}
+	}
+
+	/**
+	 * @see http://php.net/manual/en/function.mb-ord.php
+	 */
+	if(!function_exists('mb_ord')) {
+		function mb_ord($s, $encoding = null) {
+			if($encoding !== 'UTF-8') {
+				throw new Exception("$encoding is not supported in mb_ord() polyfill.");
+			}
+			$code = ($s = unpack('C*', substr($s, 0, 4))) ? $s[1] : 0;
+			if (0xF0 <= $code) {
+				return (($code - 0xF0) << 18) + (($s[2] - 0x80) << 12) + (($s[3] - 0x80) << 6) + $s[4] - 0x80;
+			}
+			if (0xE0 <= $code) {
+				return (($code - 0xE0) << 12) + (($s[2] - 0x80) << 6) + $s[3] - 0x80;
+			}
+			if (0xC0 <= $code) {
+				return (($code - 0xC0) << 6) + $s[2] - 0x80;
+			}
+			return $code;
+		}
+	}
+
 }

+ 6 - 10
std/php/_std/String.hx

@@ -29,28 +29,24 @@ import php.*;
 	@:pure function new(string:String) : Void;
 
 	@:pure @:runtime inline function toUpperCase() : String {
-		return Global.strtoupper(this);
+		return Global.mb_strtoupper(this, 'UTF-8');
 	}
 
 	@:pure @:runtime inline function toLowerCase() : String {
-		return Global.strtolower(this);
+		return Global.mb_strtolower(this, 'UTF-8');
 	}
 
 	@:pure @:runtime inline function charAt(index : Int) : String {
-		return (index < 0 || index >= this.length ? '' : (this:NativeString)[index]);
+		return Syntax.coalesce(Boot.utf8CharAt(this, index), '');
 	}
 
-	@:pure @:runtime inline function charCodeAt( index : Int) : Null<Int> {
-		return (index < 0 || index >= this.length ? null : Global.ord((this:NativeString)[index]));
-	}
+	@:pure function charCodeAt( index : Int) : Null<Int>;
 
 	@:pure function indexOf( str : String, ?startIndex : Int ) : Int;
 
 	@:pure function lastIndexOf( str : String, ?startIndex : Int ) : Int;
 
-	@:pure @:runtime inline function split( delimiter : String ) : Array<String> {
-		return @:privateAccess Array.wrap(delimiter == '' ? Global.str_split(this) : Global.explode(delimiter, this));
-	}
+	@:pure function split( delimiter : String ) : Array<String>;
 
 	@:pure function substr( pos : Int, ?len : Int ) : String;
 
@@ -61,6 +57,6 @@ import php.*;
 	}
 
 	@:pure @:runtime static inline function fromCharCode( code : Int ) : String {
-		return Global.chr(code);
+		return Global.mb_chr(code, 'UTF-8');
 	}
 }

+ 1 - 1
std/php/_std/StringTools.hx

@@ -95,7 +95,7 @@ import php.*;
 	}
 
 	public static inline function fastCodeAt( s : String, index : Int ) : Int {
-		return (s.length == index ? 0 : Global.ord((s:NativeString)[index]));
+		return (s.length == index ? 0 : Global.mb_ord(Boot.utf8CharAt(s, index), 'UTF-8'));
 	}
 
 	public static inline function isEof( c : Int ) : Bool {

+ 4 - 3
std/php/_std/haxe/io/Bytes.hx

@@ -147,10 +147,11 @@ class Bytes {
 		setInt32(pos + 4, v.high);
 	}
 
-	public inline function getString( pos : Int, len : Int ) : String {
+	public inline function getString( pos : Int, len : Int, ?encoding : Encoding ) : String {
 		if( pos < 0 || len < 0 || pos + len > length ) {
 			throw Error.OutsideBounds;
 		} else {
+			//no need to handle encoding, because PHP strings are binary safe.
 			return b.getString(pos, len);
 		}
 	}
@@ -177,8 +178,8 @@ class Bytes {
 		return new Bytes(length, BytesData.alloc(length));
 	}
 
-	public static inline function ofString( s : String ) : Bytes {
-		return new Bytes(s.length, s);
+	public static inline function ofString( s : String, ?encoding : Encoding ) : Bytes {
+		return new Bytes(php.Global.strlen(s), s);
 	}
 
 	public static inline function ofData( b : BytesData ) : Bytes {

+ 5 - 5
std/php/_std/haxe/io/BytesBuffer.hx

@@ -24,7 +24,7 @@ package haxe.io;
 import php.*;
 
 class BytesBuffer {
-	var b : String;
+	var b : NativeString;
 
 	/** The length of the buffer in bytes. **/
 	public var length(get,never) : Int;
@@ -41,7 +41,7 @@ class BytesBuffer {
 		b = Syntax.concat(b, src.getData().toNativeString());
 	}
 
-	public inline function addString( v : String ) {
+	public inline function addString( v : String, ?encoding : Encoding ) {
 		b = Syntax.concat(b, v);
 	}
 
@@ -77,13 +77,13 @@ class BytesBuffer {
 		Returns either a copy or a reference of the current bytes.
 		Once called, the buffer can no longer be used.
 	**/
-	public function getBytes() : Bytes untyped {
-		var bytes = new Bytes(b.length, b);
+	public function getBytes() : Bytes {
+		var bytes = @:privateAccess new Bytes(length, b);
 		b = null;
 		return bytes;
 	}
 
 	inline function get_length() : Int {
-		return b.length;
+		return Global.strlen(b);
 	}
 }

+ 2 - 1
std/python/_std/sys/io/FileInput.hx

@@ -22,6 +22,7 @@
 package sys.io;
 
 import haxe.io.Bytes;
+import haxe.io.Encoding;
 import haxe.io.Input;
 import python.io.IFileInput;
 
@@ -115,7 +116,7 @@ class FileInput extends Input
 		return impl.readInt32();
 	}
 
-	override public function readString( len : Int ) : String {
+	override public function readString( len : Int, ?encoding : Encoding ) : String {
 		return impl.readString(len);
 	}
 

+ 2 - 1
std/python/_std/sys/io/FileOutput.hx

@@ -22,6 +22,7 @@
 package sys.io;
 
 import haxe.io.Bytes;
+import haxe.io.Encoding;
 import haxe.io.Input;
 import haxe.io.Output;
 import python.io.IFileOutput;
@@ -110,7 +111,7 @@ class FileOutput extends Output {
 		impl.writeInput(i,bufsize);
 	}
 
-	override public function writeString( s : String ):Void {
+	override public function writeString( s : String, ?encoding : Encoding ):Void {
 		impl.writeString(s);
 	}
 }

+ 2 - 1
std/python/io/IInput.hx

@@ -22,6 +22,7 @@
 package python.io;
 
 import haxe.io.Bytes;
+import haxe.io.Encoding;
 
 interface IInput
 {
@@ -59,5 +60,5 @@ interface IInput
 
 	public function readInt32():Int;
 
-	public function readString( len : Int ) : String;
+	public function readString( len : Int, ?encoding : Encoding ) : String;
 }

+ 2 - 1
std/python/io/IOutput.hx

@@ -22,6 +22,7 @@
 package python.io;
 
 import haxe.io.Bytes;
+import haxe.io.Encoding;
 import haxe.io.Input;
 
 interface IOutput {
@@ -60,5 +61,5 @@ interface IOutput {
 
 	public function writeInput( i : Input, ?bufsize : Int ):Void;
 
-	public function writeString( s : String ):Void;
+	public function writeString( s : String, ?encoding : Encoding ):Void;
 }

+ 1 - 1
tests/optimization/src/issues/Issue6015.hx

@@ -4,7 +4,7 @@ class Issue6015 {
 	@:js('
 		var a = null;
 		var tmp = a.a();
-		String.fromCharCode.apply(null,tmp);
+		_$String_String_$Impl_$.fromCharCode.apply(null,tmp);
 	')
 	static public function main() {
 		var a:Dynamic = null;

+ 16 - 18
tests/runci/targets/Lua.hx

@@ -23,26 +23,24 @@ class Lua {
 		}
 	}
 
-	static public function installLuaVersionDependencies(lv:String){
-		if (lv == "-l5.1"){
-			if (!commandSucceed("luarocks", ["show", "luabit"])) {
-				runCommand("luarocks", ["install", "luabitop", "1.0.2-3", "--server=https://luarocks.org/dev"]);
-			}
-		}
-		if (!commandSucceed("luarocks", ["show", "lrexlib-pcre"])) {
-			runCommand("luarocks", ["install", "lrexlib-pcre", "2.8.0-1", "--server=https://luarocks.org/dev"]);
-		}
-		if (!commandSucceed("luarocks", ["show", "luv"])) {
-			runCommand("luarocks", ["install", "luv", "1.9.1-0", "--server=https://luarocks.org/dev"]);
-		}
-		if (!commandSucceed("luarocks", ["show", "luasocket"])) {
-			runCommand("luarocks", ["install", "luasocket", "3.0rc1-2", "--server=https://luarocks.org/dev"]);
-		}
-		if (!commandSucceed("luarocks", ["show", "environ"])) {
-			runCommand("luarocks", ["install", "environ", "0.1.0-1", "--server=https://luarocks.org/dev"]);
+	static function installLib(lib : String, version : String, server = "https://luarocks.org/dev"){
+		var server_arg = '--server=$server';
+		if (!commandSucceed("luarocks", ["show", lib])) {
+			runCommand("luarocks", ["install",lib, version, server_arg]);
 		}
 	}
 
+	static public function installLuaVersionDependencies(lv:String){
+		if (lv == "-l5.1") installLib("luabitop", "1.0.2-3");
+
+		installLib("lrexlib-pcre" , "2.8.0-1");
+		installLib("luv"          , "1.9.1-0");
+		installLib("luasocket"    , "3.0rc1-2");
+		installLib("environ"      , "0.1.0-1");
+		installLib("luautf8"      , "0.1.1-1");
+
+	}
+
 	static public function run(args:Array<String>) {
 		getLuaDependencies();
 		var envpath = Sys.getEnv("HOME") + '/lua_env';
@@ -70,4 +68,4 @@ class Lua {
 			runCommand("haxe", ["compile.hxml"]);
 		}
 	}
-}
+}

+ 1 - 0
tests/unit/compile-cpp.hxml

@@ -1,4 +1,5 @@
 compile-each.hxml
 --main unit.TestMain
+-D hxcpp_smart_strings
 -cpp bin/cpp
 -D HXCPP_NO_DEBUG_LINK

+ 1 - 0
tests/unit/compile-cppia-host.hxml

@@ -1,6 +1,7 @@
 --main cpp.cppia.Host
 -D source-header=''
 -D scriptable
+-D hxcpp_smart_strings
 -D dll_export=bin/cppia.classes
 --debug
 --dce no

+ 1 - 1
tests/unit/src/unit/TestResource.hx

@@ -16,7 +16,7 @@ class TestResource extends Test {
 			eq( names[1], "re/s?!%[]))(\"'1.txt" );
 		}
 		eq( haxe.Resource.getString("re/s?!%[]))(\"'1.txt"), STR );
-		#if (neko || php)
+		#if (neko || php ||  eval)
 		// allow binary strings
 		eq( haxe.Resource.getBytes("re/s?!%[]))(\"'1.bin").sub(0,9).toString(), "MZ\x90\x00\x03\x00\x00\x00\x04" );
 		#else

+ 193 - 0
tests/unit/src/unitstd/Unicode.unit.hx

@@ -0,0 +1,193 @@
+#if !(neko || (cpp && !cppia && !hxcpp_smart_strings)) // these platforms will not be made unicode-compatible
+
+
+var s = String.fromCharCode(0xE9); // U+00E9, must round-trip as one character
+s == "é";
+s.length == 1;
+s.charCodeAt(0) == 0xE9;
+
+var s = String.fromCharCode("あ".code); // same round-trip for a code point taken from a literal
+s == "あ";
+s.length == 1;
+s.charCodeAt(0) == "あ".code;
+
+var s = "aa😂éé";
+s.indexOf(String.fromCharCode(0x80))<0; // U+0080 is not a character of s; presumably guards against byte-level matching - confirm
+s.indexOf("é")==s.length-2; // positions are given relative to s.length because the length of "😂" differs per target (see the #if blocks below)
+s.indexOf("aa")==0;
+s.indexOf("a")==0;
+s.lastIndexOf("a")==1;
+s.indexOf("😂")>0;
+s.lastIndexOf("😂")>0;
+s.lastIndexOf("é")==s.length-1;
+var s = "abc";
+s.indexOf("éé")<0; // non-ASCII needle searched in an ASCII-only string
+s.lastIndexOf("éé")<0;
+
+var s = String.fromCharCode(0x1f602); // code point above 0xFFFF
+s == "😂";
+
+
+#if (php || lua || python)
+// code-point based strings: a character above 0xFFFF has length 1
+s.length == 1;
+s.charCodeAt(0) == "😂".code;
+#else
+// UTF-16 surrogate pairs encoding
+s.length == 2;
+s.charCodeAt(0) == 55357; // 0xD83D, high surrogate
+s.charCodeAt(1) == 56834; // 0xDE02, low surrogate
+#end
+
+var s = "é" + "あ"; // concatenation of two non-ASCII literals
+s == "éあ";
+s.length == 2;
+s.charCodeAt(1) == "あ".code;
+
+var s = "é" + "😂" + "あ";
+s == "é😂あ";
+var a = s.split('😂'); // split on a delimiter above 0xFFFF
+a.length == 2;
+a[0] == "é";
+a[1] == "あ";
+a.join('😂') == s; // join must restore the original string
+
+var a = s.split(''); // empty delimiter: one array entry per string unit
+#if ( php || lua || python )
+// code-point based strings: "😂" stays one entry
+a.length == 3;
+a[0] == "é";
+a[1] == "😂";
+a[2] == "あ";
+#else
+a.length == 4; // "😂" occupies two entries here; a[1] and a[2] are not compared
+a[0] == "é";
+a[3] == "あ";
+#end
+
+var buf = new StringBuf(); // mixes addChar (code points) and add (strings)
+buf.addChar(0xE9);
+buf.addChar("あ".code);
+buf.add("é");
+buf.add("あ");
+var str = buf.toString();
+str.length == 4;
+str == "éあéあ";
+str.charCodeAt(3) == "あ".code;
+
+var str = StringTools.urlEncode("éあ😂");
+str == "%C3%A9%E3%81%82%F0%9F%98%82"; // same byte values as the Bytes.ofString hex dump below
+str = StringTools.urlDecode(str);
+str == "éあ😂";
+
+var str = haxe.Serializer.run("éあ");
+str == "y15:%C3%A9%E3%81%82"; // 15 is the character count of the percent-encoded payload
+str = haxe.Unserializer.run(str);
+str == "éあ";
+
+var str = haxe.Serializer.run("😂");
+str == "y12:%F0%9F%98%82"; // 12 is the character count of the percent-encoded payload
+str = haxe.Unserializer.run(str);
+str == "😂";
+
+var str = haxe.io.Bytes.ofString("éあ😂"); // note: `str` holds the result of Bytes.ofString here, not a String
+str.toHex() == "c3a9e38182f09f9882"; // 2 + 3 + 4 bytes with the default encoding
+
+["é", "e"].join("é") == "éée";
+["é", "e"].join("e") == "éee";
+
+var bytes = haxe.io.Bytes.ofString("éあ😂",RawNative); // RawNative: expected bytes depend on the target, see below
+
+#if (cpp || php || lua || eval || python )
+bytes.toHex() == "c3a9e38182f09f9882"; // UTF-8 native
+#else
+bytes.toHex() == "e90042303dd802de"; // UTF-16 native
+#end
+
+bytes.getString(0,bytes.length,RawNative) == "éあ😂"; // RawNative round-trip must hold on every target
+
+haxe.crypto.Md5.encode("éあ😂") == "d30b209e81e40d03dd474b26b77a8a18"; // Md5 and Sha1 are checked against one value on all targets
+haxe.crypto.Sha1.encode("éあ😂") == "ec79856a75c98572210430aeb7fe6300b6c4e20c";
+#if php //utf-8
+haxe.crypto.Sha224.encode("éあ😂") == "d7967c5f27bd6868e276647583c55ab09d5f45b40610a3d9c6d91b90";
+haxe.crypto.Sha256.encode("éあ😂") == "d0230b8d8ac2d6d0dbcee11ad0e0eaa68a6565347261871dc241571cab591676";
+#elseif (lua || python)
+null; // skip these until str2blk is updated
+#else //utf-16
+haxe.crypto.Sha224.encode("éあ😂") == "5132a98e08a503350384c765388a1a3b8b0b532f038eca94c881537e";
+haxe.crypto.Sha256.encode("éあ😂") == "e662834bdc1a099b9f7b8d97975a1b1d9b6730c991268bba0e7fe7427e68be74";
+#end
+haxe.crypto.BaseCode.encode("éあ😂","0123456789abcdef") == "c3a9e38182f09f9882"; // hex alphabet, so the result equals the UTF-8 hex dump
+
+var buf = new haxe.io.BytesBuffer();
+buf.addString("éあ😂"); // default encoding: the reads below treat this copy as the first 9 bytes
+buf.addString("éあ😂",RawNative); // second copy, its size depends on the target
+var bytes = buf.getBytes();
+bytes.getString(0,9) == "éあ😂";
+bytes.getString(2,3) == "あ"; // arguments are byte offset and byte length
+bytes.getString(5,4) == "😂";
+bytes.getString(2,7) == "あ😂";
+bytes.getString(9,bytes.length - 9,RawNative) == "éあ😂"; // everything after the first copy
+
+var o = new haxe.io.BytesOutput();
+o.writeString("éあ😂");
+o.writeString("éあ😂",RawNative);
+var bytes2 = o.getBytes();
+bytes2.toHex() == bytes.toHex(); // BytesOutput must produce the same bytes as BytesBuffer
+
+var input = new haxe.io.BytesInput(bytes2);
+input.readString(2) == "é"; // readString takes a byte count
+input.readString(7) == "あ😂";
+input.readString(bytes.length - 9,RawNative) == "éあ😂"; // uses bytes.length, not bytes2.length; the hex comparison above asserts they match
+
+// Mixed encoding tests... mostly relevant for Eval which has both ASCII and UCS2 at run-time
+
+var s = "ée";
+var s1 = s.charAt(1);
+s1 == "e";
+#if eval
+(untyped s1.isAscii()) == true; // isAscii is reached through untyped; it is not declared anywhere in this file
+(untyped s.charAt(0).isAscii()) == false;
+#end
+
+var s1 = s.substr(1, 1);
+var s2 = s.substr(1);
+var s3 = s.substr(-1); // negative position
+var s4 = s.substr(-1, 1);
+s1 == "e";
+s2 == "e";
+s3 == "e";
+#if !python
+s4 == "e"; // NOTE(review): python is excluded from this check; the reason is not visible here
+#end
+#if eval
+// We currently don't asciify anything we extract from UCS2 strings... not sure if this would
+// be worth it or not.
+(untyped s1.isAscii()) == false;
+(untyped s2.isAscii()) == false;
+(untyped s3.isAscii()) == false;
+(untyped s4.isAscii()) == false;
+#end
+
+var s1 = s.substring(1, 2);
+var s2 = s.substring(1);
+var s3 = s.substring(2, 1); // start greater than end, still expected to yield "e"
+var s4 = s.substring(1, 20); // end past the string length
+s1 == "e";
+s2 == "e";
+s3 == "e";
+s4 == "e";
+#if eval
+(untyped s1.isAscii()) == false;
+(untyped s2.isAscii()) == false;
+(untyped s3.isAscii()) == false;
+(untyped s4.isAscii()) == false;
+#end
+
+Reflect.compare("ed", "éee".substr(1)) < 0; // literal compared with a substr result
+Reflect.compare("éed".substr(1), "éee".substr(1)) < 0; // both operands are substr results
+Reflect.compare("éed".substr(1), "ee") < 0;
+Reflect.compare("ee", "éed".substr(1)) > 0;
+Reflect.compare("éee".substr(1), "éed".substr(1)) > 0;
+Reflect.compare("éee".substr(1), "ed") > 0;
+#end

+ 1 - 1
tests/unit/src/unitstd/haxe/Utf8.unit.hx

@@ -48,7 +48,7 @@ haxe.Utf8.compare(haxe.Utf8.sub(str, 1, 0), "") == 0;
 
 // #if (neko || php || cpp || lua || macro)
 // TODO neko, cpp, macro
-#if (php || lua)
+#if php
 haxe.Utf8.validate("\xf0\xa9\xb8\xbd\xe3\x81\x82\xc3\xab\x61") == true;
 haxe.Utf8.validate("\xed\x9f\xbf") == true;
 haxe.Utf8.validate("\xee\x80\x80") == true;

+ 1 - 1
tests/unit/src/unitstd/haxe/crypto/Hmac.unit.hx

@@ -8,4 +8,4 @@ hmacSha256.make(haxe.io.Bytes.ofString(""), haxe.io.Bytes.ofString("")).toHex()
 
 hmacMd5.make(haxe.io.Bytes.ofString("key"), haxe.io.Bytes.ofString("The quick brown fox jumps over the lazy dog")).toHex() == "80070713463e7749b90c2dc24911e275";
 hmacSha1.make(haxe.io.Bytes.ofString("key"), haxe.io.Bytes.ofString("The quick brown fox jumps over the lazy dog")).toHex() == "de7c9b85b8b78aa6bc8a7a36f70a90701c9db4d9";
-hmacSha256.make(haxe.io.Bytes.ofString("key"), haxe.io.Bytes.ofString("The quick brown fox jumps over the lazy dog")).toHex() == "f7bc83f430538424b13298e6aa6fb143ef4d59a14946175997479dbc2d1a3cd8";
+hmacSha256.make(haxe.io.Bytes.ofString("key"), haxe.io.Bytes.ofString("The quick brown fox jumps over the lazy dog")).toHex() == "f7bc83f430538424b13298e6aa6fb143ef4d59a14946175997479dbc2d1a3cd8";