Browse Source

started switch from UTF8 to UCS2 for String

Nicolas Cannasse 9 years ago
parent
commit
b4be5b972b
10 changed files with 263 additions and 298 deletions
  1. 1 0
      common.ml
  2. 112 70
      genhl.ml
  3. 62 71
      std/hl/_std/EReg.hx
  4. 3 3
      std/hl/_std/Std.hx
  5. 44 94
      std/hl/_std/String.hx
  6. 21 48
      std/hl/_std/StringBuf.hx
  7. 1 1
      std/hl/_std/Type.hx
  8. 5 7
      std/hl/_std/haxe/ds/StringMap.hx
  9. 9 4
      std/hl/types/Bytes.hx
  10. 5 0
      std/hl/types/Ref.hx

+ 1 - 0
common.ml

@@ -829,6 +829,7 @@ let platforms = [
 	Cs;
 	Java;
 	Python;
+	Hl;
 ]
 
 let platform_name = function

+ 112 - 70
genhl.ml

@@ -393,6 +393,7 @@ let to_utf8 str =
 		str;
 	with
 		UTF8.Malformed_code ->
+			(* ISO to utf8 *)
 			let b = UTF8.Buf.create 0 in
 			String.iter (fun c -> UTF8.Buf.add_char b (UChar.of_char c)) str;
 			UTF8.Buf.contents b
@@ -949,7 +950,7 @@ and cast_to ctx (r:reg) (t:ttype) p =
 		let bytes = alloc_tmp ctx HBytes in
 		op ctx (ORef (lref,len));
 		op ctx (OCall2 (bytes,alloc_std ctx "itos" [HI32;HRef HI32] HBytes,cast_to ctx r HI32 p,lref));
-		op ctx (OCall3 (out,alloc_fun_path ctx ([],"String") "__alloc__",bytes,len,len));
+		op ctx (OCall2 (out,alloc_fun_path ctx ([],"String") "__alloc__",bytes,len));
 		out
 	| (HF32 | HF64), HObj { pname = "String" } ->
 		let out = alloc_tmp ctx t in
@@ -958,7 +959,7 @@ and cast_to ctx (r:reg) (t:ttype) p =
 		let bytes = alloc_tmp ctx HBytes in
 		op ctx (ORef (lref,len));
 		op ctx (OCall2 (bytes,alloc_std ctx "ftos" [HF64;HRef HI32] HBytes,cast_to ctx r HF64 p,lref));
-		op ctx (OCall3 (out,alloc_fun_path ctx ([],"String") "__alloc__",bytes,len,len));
+		op ctx (OCall2 (out,alloc_fun_path ctx ([],"String") "__alloc__",bytes,len));
 		out
 	| (HObj _ | HDynObj | HDyn) , HVirtual _ ->
 		let out = alloc_tmp ctx t in
@@ -1196,13 +1197,13 @@ and eval_expr ctx e =
 			op ctx (OBool (r,b));
 			r
 		| TString s ->
-			let s = to_utf8 s in
+			let str = to_utf8 s in
 			let r = alloc_tmp ctx HBytes in
-			op ctx (OString (r,alloc_string ctx s));
-			let size = reg_int ctx (String.length s) in
-			let len = reg_int ctx (UTF8.length s) in
 			let s = alloc_tmp ctx (to_type ctx e.etype) in
-			op ctx (OCall3 (s,alloc_fun_path ctx ([],"String") "__alloc__",r,size,len));
+			op ctx (ONew s);
+			op ctx (OString (r,alloc_string ctx str));
+			op ctx (OSetField (s,0,r));
+			op ctx (OSetField (s,1,reg_int ctx (UTF8.length str)));
 			s
 		| TThis ->
 			0 (* first reg *)
@@ -2968,13 +2969,17 @@ let rec is_compatible v t =
 exception Runtime_error of string
 exception InterpThrow of value
 
+type cast = (* how a closure argument or result is adapted when a closure is cast to another function type *)
+	| CNo (* value is used unchanged *)
+	| CDyn of ttype (* value is wrapped with make_dyn using the carried type *)
+	| CUnDyn of ttype (* value is converted with dyn_cast from HDyn to the carried type *)
+
 let interp code =
 
 	let globals = Array.map default code.globals in
 	let functions = Array.create (Array.length code.functions + Array.length code.natives) (FNativeFun ("",(fun _ -> assert false),HDyn)) in
 	let cached_protos = Hashtbl.create 0 in
 	let func f = Array.unsafe_get functions f in
-	let streof s = try String.sub s 0 (String.index s '\000') with Not_found -> s in
 
 	let stack = ref [] in
 	let exc_stack = ref [] in
@@ -2991,9 +2996,57 @@ let interp code =
 			proto
 	in
 
+	let caml_to_hl str = (* encode a UTF-8 OCaml string as 16-bit units, low byte first, followed by a zero unit; raises Failure on an invalid code point *)
+		let b = Buffer.create (String.length str * 2) in
+		let add c = (* append one 16-bit unit: low byte, then high byte *)
+			Buffer.add_char b (char_of_int (c land 0xFF));
+			Buffer.add_char b (char_of_int (c lsr 8));
+		in
+		UTF8.iter (fun c ->
+			let c = UChar.code c in
+			if c >= 0 && c < 0x10000 then begin
+				if c >= 0xD800 && c <= 0xDFFF then failwith ("Invalid unicode char " ^ string_of_int c); (* the surrogate range is rejected as a standalone character *)
+				add c;
+			end else if c < 0x110000 then begin
+				let c = c - 0x10000 in (* code points above 0xFFFF are written as two units: 0xD800-based, then 0xDC00-based *)
+				add ((c asr 10) + 0xD800);
+				add ((c land 1023) + 0xDC00);
+			end else
+				failwith ("Invalid unicode char " ^ string_of_int c);
+		) str;
+		add 0; (* terminating zero unit (two zero bytes) *)
+		Buffer.contents b
+	in
+
+	let hl_to_caml str = (* decode zero-terminated 16-bit units (low byte first) into a UTF-8 OCaml string; raises Failure on an unpaired surrogate *)
+		let b = UTF8.Buf.create (String.length str / 2) in
+		let add c =
+			UTF8.Buf.add_char b (UChar.chr c);
+		in
+		let get v = int_of_char str.[v] in
+		let rec loop p =
+			let c = (get p) lor ((get (p+1)) lsl 8) in
+			if c = 0 then () else if c >= 0xD800 && c <= 0xDFFF then begin
+				let c2 = (get (p+2)) lor ((get(p+3)) lsl 8) in (* following unit; for a zero-terminated input this is at worst the terminator *)
+				if c > 0xDBFF || c2 < 0xDC00 || c2 > 0xDFFF then failwith ("Invalid unicode char " ^ string_of_int c); (* lead must be 0xD800-0xDBFF and trail 0xDC00-0xDFFF, otherwise the pair is not decodable *)
+				add ((((c - 0xD800) lsl 10) lor (c2 - 0xDC00)) + 0x10000);
+				loop (p + 4);
+			end else begin
+				add c;
+				loop (p + 2);
+			end;
+		in
+		loop 0;
+		UTF8.Buf.contents b
+	in
+
+	let hl_to_caml_sub str pos len = (* decode the len bytes of str starting at byte offset pos; a zero unit is appended so that hl_to_caml stops at the end of the slice *)
+		hl_to_caml (String.sub str pos len ^ "\x00\x00")
+	in
+
 	let error msg = raise (Runtime_error msg) in (* raise Runtime_error carrying msg *)
 	let throw v = exc_stack := []; raise (InterpThrow v) in (* reset exc_stack, then raise v as InterpThrow *)
-	let throw_msg msg = throw (VDyn (VBytes (msg ^ "\x00"),HBytes)) in
+	let throw_msg msg = throw (VDyn (VBytes (caml_to_hl msg),HBytes)) in (* throw an OCaml string, converted by caml_to_hl, as a dynamic HBytes value *)
 
 	let hash_cache = Hashtbl.create 0 in
 
@@ -3076,7 +3129,7 @@ let interp code =
 			(match get_method o.oproto.pclass "__string" with
 			| None -> "#" ^ o.oproto.pclass.pname
 			| Some f -> vstr (fcall (func f) [v]) HBytes)
-		| VBytes b -> streof b
+		| VBytes b -> hl_to_caml b
 		| VClosure (f,_) -> fstr f
 		| VArray (a,t) -> "[" ^ String.concat ", " (Array.to_list (Array.map (fun v -> vstr v t) a)) ^ "]"
 		| VUndef -> "undef"
@@ -3110,8 +3163,6 @@ let interp code =
 				raise (InterpThrow v)
 			| Failure msg ->
 				throw_msg msg
-			| e ->
-				error (Printexc.to_string e)
 
 	and dyn_set_field obj field v vt =
 		let v, vt = (match vt with
@@ -3213,17 +3264,22 @@ let interp code =
 			| VNull -> VNull
 			| VClosure (fn,farg) ->
 				let conv = List.map2 (fun t1 t2 ->
-					if safe_cast t2 t1 || (t2 = HDyn && is_dynamic t1) then None
-					else if t2 = HDyn then Some t1
+					if safe_cast t2 t1 || (t2 = HDyn && is_dynamic t1) then CNo
+					else if t2 = HDyn then CDyn t1
+					else if t1 = HDyn then CUnDyn t2
 					else invalid()
 				) args1 args2 in
-				let rconv = if safe_cast t1 t2 then None else if t2 = HDyn then Some t1 else invalid() in
+				let rconv = if safe_cast t1 t2 then CNo else if t2 = HDyn then CDyn t1 else if t1 = HDyn then CUnDyn t2 else invalid() in
+				let convert v c =
+					match c with
+					| CNo -> v
+					| CDyn t -> make_dyn v t
+					| CUnDyn t -> dyn_cast v HDyn t
+				in
 				VClosure (FNativeFun ("~convert",(fun args ->
-					let args = List.map2 (fun v conv -> match conv with None -> v | Some t -> make_dyn v t) args conv in
+					let args = List.map2 convert args conv in
 					let ret = fcall fn (match farg with None -> args | Some a -> a :: args) in
-					match rconv with
-					| None -> ret
-					| Some t -> make_dyn ret t
+					convert ret rconv
 				),rt),None)
 			| _ ->
 				assert false)
@@ -3234,7 +3290,7 @@ let interp code =
 				match get_type v with
 				| None -> assert false
 				| Some t -> dyn_cast (match v with VDyn (v,_) -> v | _ -> v) t rt)
-		| HNull _, _ ->
+		| HNull t, _ ->
 			(match v with
 			| VNull -> default()
 			| VDyn (v,t) -> dyn_cast v t rt
@@ -3442,7 +3498,7 @@ let interp code =
 			| OMov (a,b) -> set a (get b)
 			| OInt (r,i) -> set r (VInt code.ints.(i))
 			| OFloat (r,i) -> set r (VFloat (Array.unsafe_get code.floats i))
-			| OString (r,s) -> set r (VBytes (code.strings.(s) ^ "\x00"))
+			| OString (r,s) -> set r (VBytes (caml_to_hl code.strings.(s)))
 			| OBool (r,b) -> set r (VBool b)
 			| ONull r -> set r VNull
 			| OAdd (r,a,b) -> set r (numop Int32.add ( +. ) a b)
@@ -3631,7 +3687,7 @@ let interp code =
 				| VArray (a,_) -> set r (VInt (Int32.of_int (Array.length a)));
 				| _ -> assert false)
 			| OError s ->
-				throw (VDyn (VBytes (code.strings.(s) ^ "\x00"),HBytes))
+				throw_msg code.strings.(s)
 			| OType (r,t) ->
 				set r (VType t)
 			| OGetType (r,v) ->
@@ -3782,46 +3838,22 @@ let interp code =
 				| [VInt v; VRef (regs,i,_)] ->
 					let str = Int32.to_string v in
 					regs.(i) <- to_int (String.length str);
-					VBytes (str ^ "\x00")
+					VBytes (caml_to_hl str)
 				| _ -> assert false);
 			| "ftos" ->
 				(function
 				| [VFloat _ as v; VRef (regs,i,_)] ->
 					let str = vstr v HF64 in
 					regs.(i) <- to_int (String.length str);
-					VBytes (str ^ "\x00")
+					VBytes (caml_to_hl str)
 				| _ -> assert false);
 			| "value_to_string" ->
 				(function
 				| [v; VRef (regs,i,_)] ->
-					let str = vstr v HDyn in
-					regs.(i) <- to_int (String.length str);
-					VBytes (str ^ "\x00")
+					let str = caml_to_hl (vstr v HDyn) in
+					regs.(i) <- to_int (String.length str - 2);
+					VBytes str
 				| _ -> assert false);
-			| "utf8length" ->
-				(function
-				| [VBytes b; VInt start; VInt len] ->
-					to_int (UTF8.length (String.sub b (int start) (int len)))
-				| _ -> assert false)
-			| "utf8pos" ->
-				(function
-				| [VBytes b; VInt start; VInt len] ->
-					let s = int start in
-					let b = streof b in
-					to_int (UTF8.nth (String.sub b s (String.length b - s)) (int len))
-				| _ -> assert false)
-			| "byteslength" ->
-				(function
-				| [VBytes b; VInt start] ->
-					to_int (try String.index_from b (int start) '\000' with _ -> assert false)
-				| _ -> assert false)
-			| "utf8char" ->
-				(function
-				| [VBytes b; VInt start; VInt index] ->
-					let start = int start in
-					let b = String.sub b start (String.length b - start) in
-					to_int (try UChar.code (UTF8.get b (int index)) with _ -> 0)
-				| _ -> assert false)
 			| "math_isnan" -> (function [VFloat f] -> VBool (classify_float f = FP_nan) | _ -> assert false)
 			| "math_finite" -> (function [VFloat f] -> VBool (match classify_float f with FP_infinite | FP_nan -> false | _ -> true) | _ -> assert false)
 			| "math_round" -> (function [VFloat f] -> VInt (Int32.of_float (floor (f +. 0.5))) | _ -> assert false)
@@ -3844,20 +3876,20 @@ let interp code =
 			| "math_pow" -> (function [VFloat a; VFloat b] -> VFloat (a ** b) | _ -> assert false)
 			| "parse_int" ->
 				(function
-				| [VBytes str; VInt len] ->
+				| [VBytes str; VInt pos; VInt len] ->
 					(try
-						let i = (match Interp.parse_int (String.sub str 0 (int len)) with
+						let i = (match Interp.parse_int (hl_to_caml_sub str (int pos) (int len)) with
 							| Interp.VInt v -> Int32.of_int v
 							| Interp.VInt32 v -> v
 							| _ -> assert false
 						) in
-						VInt i
+						VDyn (VInt i,HI32)
 					with _ ->
 						VNull)
-				| _ -> assert false)
+				| l -> assert false)
 			| "parse_float" ->
 				(function
-				| [VBytes str; VInt len] -> (try VFloat (Interp.parse_float (String.sub str 0 (int len))) with _ -> VFloat nan)
+				| [VBytes str; VInt pos; VInt len] -> (try VFloat (Interp.parse_float (hl_to_caml_sub str (int pos) (int len))) with _ -> VFloat nan)
 				| _ -> assert false)
 			| "dyn_compare" ->
 				(function
@@ -3878,13 +3910,13 @@ let interp code =
 			| "hbset" ->
 				(function
 				| [VAbstract (AHashBytes h);VBytes b;v] ->
-					Hashtbl.replace h (streof b) v;
+					Hashtbl.replace h (hl_to_caml b) v;
 					VUndef
 				| _ -> assert false)
 			| "hbget" ->
 				(function
 				| [VAbstract (AHashBytes h);VBytes b] ->
-					(try Hashtbl.find h (streof b) with Not_found -> VNull)
+					(try Hashtbl.find h (hl_to_caml b) with Not_found -> VNull)
 				| _ -> assert false)
 			| "hbvalues" ->
 				(function
@@ -3895,18 +3927,18 @@ let interp code =
 			| "hbkeys" ->
 				(function
 				| [VAbstract (AHashBytes h)] ->
-					let keys = Hashtbl.fold (fun s _ acc -> VBytes (s ^ "\000") :: acc) h [] in
+					let keys = Hashtbl.fold (fun s _ acc -> VBytes (caml_to_hl s) :: acc) h [] in
 					VArray (Array.of_list keys, HBytes)
 				| _ -> assert false)
 			| "hbexists" ->
 				(function
-				| [VAbstract (AHashBytes h);VBytes b] -> VBool (Hashtbl.mem h (streof b))
+				| [VAbstract (AHashBytes h);VBytes b] -> VBool (Hashtbl.mem h (hl_to_caml b))
 				| _ -> assert false)
 			| "hbremove" ->
 				(function
 				| [VAbstract (AHashBytes h);VBytes b] ->
-					let m = Hashtbl.mem h (streof b) in
-					if m then Hashtbl.remove h (streof b);
+					let m = Hashtbl.mem h (hl_to_caml b) in
+					if m then Hashtbl.remove h (hl_to_caml b);
 					VBool m
 				| _ -> assert false)
 			| "hialloc" ->
@@ -3949,7 +3981,7 @@ let interp code =
 				| _ -> assert false)
 			| "sys_print" ->
 				(function
-				| [VBytes str] -> print_string (streof str); VUndef
+				| [VBytes str] -> print_string (hl_to_caml str); VUndef
 				| _ -> assert false)
 			| "sys_exit" ->
 				(function
@@ -3976,8 +4008,8 @@ let interp code =
 							let sup = (match o.psuper with None -> [||] | Some o -> fields o) in
 							Array.concat [
 								sup;
-								Array.map (fun (s,_,_) -> VBytes (s ^ "\000")) o.pfields;
-								Array.map (fun f -> VBytes (f.fname ^ "\000")) o.pproto
+								Array.map (fun (s,_,_) -> VBytes (caml_to_hl s)) o.pfields;
+								Array.map (fun f -> VBytes (caml_to_hl f.fname)) o.pproto
 							]
 						in
 						VArray (fields o,HBytes)
@@ -4007,6 +4039,16 @@ let interp code =
 					in
 					VBool (loop o)
 				| _ -> assert false)
+			| "ucs2length" ->
+				(function
+				| [VBytes s; VInt pos] ->
+					let delta = int pos in
+					let rec loop p =
+						let c = int_of_char s.[p+delta] lor ((int_of_char s.[p+delta+1]) lsl 8) in
+						if c = 0 then p lsr 1 else loop (p + 2)
+					in
+					to_int (loop 0)
+				| _ -> assert false)
 			| "call_method" ->
 				(function
 				| [f;VArray (args,HDyn)] -> dyn_call f (List.map (fun v -> v,HDyn) (Array.to_list args)) HDyn
@@ -4042,7 +4084,7 @@ let interp code =
 						| 'm' -> () (* always ON ? *)
 						| 'i' -> case_sensitive := false
 						| c -> failwith ("Unsupported regexp option '" ^ String.make 1 c ^ "'")
-					) (ExtString.String.explode (streof opt));
+					) (ExtString.String.explode (hl_to_caml opt));
 					let buf = Buffer.create 0 in
 					let rec loop prev esc = function
 						| [] -> ()
@@ -4075,7 +4117,7 @@ let interp code =
 								Buffer.add_char buf c;
 								loop c false l
 					in
-					loop '\000' false (ExtString.String.explode (streof str));
+					loop '\000' false (ExtString.String.explode (hl_to_caml str));
 					let str = Buffer.contents buf in
 					let r = {
 						r = if !case_sensitive then Str.regexp str else Str.regexp_case_fold str;
@@ -4088,7 +4130,7 @@ let interp code =
 			| "regexp_match" ->
 				(function
 				| [VAbstract (AReg r);VBytes str;VInt pos;VInt len] ->
-					let str = streof str and pos = int pos and len = int len in
+					let str = hl_to_caml str and pos = int pos and len = int len in
 					let nstr, npos, delta = (if len = String.length str - pos then str, pos, 0 else String.sub str pos len, 0, pos) in
 					(try
 						ignore(Str.search_forward r.r nstr npos);
@@ -4124,7 +4166,7 @@ let interp code =
 					| None -> VNull
 					| Some (pos,pend) ->
 						regs.(rlen) <- to_int (pend - pos);
-						VBytes (String.sub r.r_string pos (pend - pos)))
+						VBytes (caml_to_hl (String.sub r.r_string pos (pend - pos))))
 				| _ -> assert false)
 			| _ ->
 				unresolved())
@@ -4139,7 +4181,7 @@ let interp code =
 		String.concat "\n" (List.map (fun (f,pos) ->
 			let pos = !pos - 1 in
 			let file, line = (try let fid, line = f.debug.(pos) in code.debugfiles.(fid), line with _ -> "???", 0) in
-			Printf.sprintf "Called from fun(%d)@%d (%s line %d)" f.findex pos file line
+			Printf.sprintf "%s:%d: Called from fun(%d)@%d" file line f.findex pos
 		) st)
 	in
 	match functions.(code.entrypoint) with

+ 62 - 71
std/hl/_std/EReg.hx

@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2005-2012 Haxe Foundation
+ * Copyright (C)2005-2016 Haxe Foundation
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -38,7 +38,7 @@ private typedef ERegValue = hl.types.NativeAbstract<"ereg">;
 	}
 
 	public function match( s : String ) : Bool {
-		var p = regexp_match(r,s.bytes,0,s.size);
+		var p = regexp_match(r,s.bytes,0,s.length);
 		if( p )
 			last = s;
 		else
@@ -48,38 +48,30 @@ private typedef ERegValue = hl.types.NativeAbstract<"ereg">;
 
 	public function matched( n : Int ) : String { // returns what regexp_matched yields for group n as a String, or null when it yields null
 		var size = 0; // presumably filled in by regexp_matched with the match length — confirm how the Int is passed by reference
-		var m = regexp_matched(r,n,new hl.types.Ref(size));
-		return m == null ? null : String.__alloc__(m,size,m.utf8Length(0,size));
+		var m = regexp_matched(r,n,size);
+		return (m == null) ? null : String.__alloc__(m, size);
 	}
 
 	public function matchedLeft() : String { // returns the part of the last matched string that precedes the match position
-		var size = 0;
-		var pos = regexp_matched_pos(r, 0, new hl.types.Ref(size));
-		if( pos < 0 ) return null;
-		return last.subBytes(0,pos);
+		var len = 0;
+		var p = regexp_matched_pos(r,0,len);
+		return last.substr(0,p); // NOTE(review): the removed code returned null for a negative position; confirm that case cannot reach substr here
 	}
 
 	public function matchedRight() : String { // returns the part of the last matched string that follows the match (position + length)
-		var size = 0;
-		var pos = regexp_matched_pos(r, 0, new hl.types.Ref(size));
-		if( pos < 0 ) return null;
-		return last.subBytes(pos + size, last.size - (pos + size));
+		var len = 0;
+		var p = regexp_matched_pos(r,0,len);
+		return last.substr(p + len); // NOTE(review): the removed code returned null for a negative position; confirm that case cannot reach substr here
 	}
 
 	public function matchedPos() : { pos : Int, len : Int } { // returns the position and length reported by regexp_matched_pos for group 0
 		var len = 0;
-		var pos = regexp_matched_pos(r, 0, new hl.types.Ref(len));
-		if( pos < 0 ) return null;
-		return { pos : last.bytes.utf8Length(0,pos), len : last.bytes.utf8Length(pos,len) };
+		var p = regexp_matched_pos(r, 0, len);
+		return { pos : p, len : len }; // NOTE(review): the removed code returned null for a negative position; this version returns it as-is — confirm
 	}
 
 	public function matchSub( s : String, pos : Int, len : Int = -1):Bool {
-		if( pos < 0 ) pos = 0;
-		if( pos > s.length ) pos = s.length;
-		if( len < 0 || pos + len > s.length ) len = s.length - pos;
-		var bpos = pos == 0 ? 0 : s.bytes.utf8Pos(0, pos);
-		var blen = pos + len == s.length ? s.size - bpos : s.bytes.utf8Pos(bpos, len);
-		var p = regexp_match(r, s.bytes, bpos, blen);
+		var p = regexp_match(r, s.bytes, pos, len < 0 ? s.length - pos : len);
 		if( p )
 			last = s;
 		else
@@ -89,67 +81,65 @@ private typedef ERegValue = hl.types.NativeAbstract<"ereg">;
 
 	public function split( s : String ) : Array<String> { // splits s around matches of this regexp; only the first match is used unless global is set
 		var pos = 0; // start of the part of s not yet consumed
+		var len = s.length; // length of the part of s not yet consumed
 		var a = new Array();
 		var first = true;
-		var sbytes = s.bytes;
-		var ssize = s.size;
 		do {
-			if( !regexp_match(r,sbytes,pos,ssize) )
+			if( !regexp_match(r,s.bytes,pos,len) )
 				break;
-			var msize = 0;
-			var mpos = regexp_matched_pos(r, 0, new hl.types.Ref(msize));
-			if( msize == 0 && !first ) {
-				if( mpos == s.size )
+			var plen = 0;
+			var p = regexp_matched_pos(r,0,plen);
+			if( plen == 0 && !first ) { // empty match after the first iteration: move one position forward so the loop makes progress
+				if( p == s.length )
 					break;
-				mpos++;
+				p++;
 			}
-			a.push(s.subBytes(pos,mpos - pos));
-			var tot = mpos + msize - pos;
+			a.push(s.substr(pos,p - pos));
+			var tot = p + plen - pos; // amount consumed this iteration: the pushed piece plus the match
 			pos += tot;
-			ssize -= tot;
+			len -= tot;
 			first = false;
 		} while( global );
-		a.push(s.subBytes(pos,ssize));
+		a.push(s.substr(pos,len)); // remainder after the last match
 		return a;
 	}
 
-	public function replace( s : String, by : String ) : String @:privateAccess {
+	public function replace( s : String, by : String ) : String {
 		var b = new StringBuf();
 		var pos = 0;
-		var sbytes = s.bytes;
-		var size = s.size;
+		var len = s.length;
 		var a = by.split("$");
 		var first = true;
 		do {
-			if( !regexp_match(r,sbytes,pos,size) )
+			if( !regexp_match(r,s.bytes,pos,len) )
 				break;
-			var msize = 0;
-			var mpos = regexp_matched_pos(r,0,new hl.types.Ref(msize));
-			if( msize == 0 && !first ) {
-				if( mpos == s.size )
+			var plen = 0;
+			var p = regexp_matched_pos(r,0, plen);
+			if( plen == 0 && !first ) {
+				if( p == s.length )
 					break;
-				mpos++;
+				p++;
 			}
-			b.__add(sbytes,pos,mpos-pos);
+			b.addSub(s,pos,p-pos);
 			if( a.length > 0 )
 				b.add(a[0]);
 			var i = 1;
 			while( i < a.length ) {
 				var k = a[i];
-				var c = StringTools.fastCodeAt(k, 0);
+				var c = k.charCodeAt(0);
 				// 1...9
 				if( c >= 49 && c <= 57 ) {
-					var psize = 0;
-					var p = try regexp_matched_pos(r,c-48, new hl.types.Ref(psize)) catch( e : String ) -1;
+					var plen = 0;
+					var p = try regexp_matched_pos(r,Std.int(c)-48,plen) catch( e : String ) -1;
 					if( p < 0 ){
-						b.addChar("$".code);
+						b.add("$");
 						b.add(k);
 					} else {
-						if( p > 0 ) b.__add(sbytes, p, psize);
+						if( p >= 0 ) b.addSub(s,p,plen);
 						b.addSub(k,1,k.length - 1);
 					}
-				} else if( c == 0 ) {
-					b.addChar("$".code);
+				} else if( c == null ) {
+					b.add("$");
 					i++;
 					var k2 = a[i];
 					if( k2 != null && k2.length > 0 )
@@ -158,43 +148,42 @@ private typedef ERegValue = hl.types.NativeAbstract<"ereg">;
 					b.add("$"+k);
 				i++;
 			}
-			var tot = mpos + msize - pos;
+			var tot = p + plen - pos;
 			pos += tot;
-			size -= tot;
+			len -= tot;
 			first = false;
 		} while( global );
-		b.__add(sbytes,pos,size);
+		b.addSub(s,pos,len);
 		return b.toString();
 	}
 
-	public function map( s : String, f : EReg -> String ) : String @:privateAccess {
-		var boffset = 0;
-		var ssize = s.size;
+	public function map( s : String, f : EReg -> String ) : String { // rebuilds s, replacing each match with f(this); only the first match is replaced unless global is set
+		var offset = 0; // position in s where the next search starts
 		var buf = new StringBuf();
 		do {
-			if( boffset >= ssize )
+			if (offset >= s.length)
 				break;
-			else if (!matchSub(s, s.bytes.utf8Length(0,boffset))) {
-				buf.__add(s.bytes, boffset, ssize - boffset);
+			else if (!matchSub(s, offset)) {
+				buf.add(s.substr(offset)); // no further match: copy the rest of s
 				break;
 			}
-			var msize = 0;
-			var mpos = regexp_matched_pos(r,0, new hl.types.Ref(msize));
-			buf.__add(s.bytes, boffset, mpos - boffset);
+			var plen = 0;
+			var p = regexp_matched_pos(r,0,plen);
+			buf.add(s.substr(offset, p - offset)); // text between the previous offset and this match
 			buf.add(f(this));
-			if( msize == 0 ) {
-				if( mpos == ssize ) break;
-				var k = s.bytes.utf8Pos(mpos, 1);
-				buf.__add(s.bytes, mpos, k);
-				boffset = mpos + k;
-			} else
-				boffset = mpos + msize;
+			if (plen == 0) { // empty match: copy one character and step past it so the loop advances
+				buf.add(s.substr(p, 1));
+				offset = p + 1;
+			}
+			else
+				offset = p + plen;
 		} while (global);
-		if (!global && boffset > 0 && boffset < ssize )
-			buf.__add(s.bytes, boffset, ssize - boffset);
+		if (!global && offset > 0 && offset < s.length)
+			buf.add(s.substr(offset)); // non-global case: append what follows the single replaced match
 		return buf.toString();
 	}
 
+
 	@:hlNative("regexp", "regexp_new_options") static function regexp_new_options( bytes : hl.types.Bytes, options : hl.types.Bytes ) : ERegValue {
 		return null;
 	}
@@ -210,4 +199,6 @@ private typedef ERegValue = hl.types.NativeAbstract<"ereg">;
 	@:hlNative("regexp", "regexp_matched_pos") static function regexp_matched_pos( r : ERegValue, n : Int, size : hl.types.Ref<Int> ) : Int {
 		return 0;
 	}
+
+
 }

+ 3 - 3
std/hl/_std/Std.hx

@@ -45,15 +45,15 @@ class Std {
 	public static function string( s : Dynamic ) : String { // converts any value to a String via hl.types.Bytes.ofValue
 		var len = 0;
 		var bytes = hl.types.Bytes.ofValue(s,new hl.types.Ref(len));
-		return @:privateAccess String.__alloc__(bytes,len,bytes.utf8Length(0,len));
+		return @:privateAccess String.__alloc__(bytes,len>>1); // len is halved to get the String length — presumably ofValue reports a byte count at two bytes per character; confirm
 	}
 	}
 
 	public static function parseInt( x : String ) : Null<Int> {
-		return @:privateAccess x.bytes.parseInt(0, x.size);
+		return @:privateAccess x.bytes.parseInt(0, x.length<<1); // parses the whole string: start 0, size x.length * 2
 	}
 	}
 
 	public static function parseFloat( x : String ) : Float {
-		return @:privateAccess x.bytes.parseFloat(0, x.size);
+		return @:privateAccess x.bytes.parseFloat(0, x.length<<1); // parses the whole string: start 0, size x.length * 2
 	}
 	}
 
 	@:keep static function __add__( a : Dynamic, b : Dynamic ) : Dynamic {

+ 44 - 94
std/hl/_std/String.hx

@@ -3,13 +3,11 @@
 class String {
 
 	var bytes : hl.types.Bytes;
-	var size : Int;
 	public var length(default,null) : Int;
 
 	public function new(string:String) : Void { // initialises this String from another one
 		bytes = string.bytes; // the source's bytes value is assigned directly
 		length = string.length;
-		size = string.size;
 	}
 
 	public function toUpperCase() : String {
@@ -29,9 +27,9 @@ class String {
 
 	public function charCodeAt( index : Int) : Null<Int> { // returns the 16-bit unit at index, or null when index is out of range
 		var idx : UInt = index; // unsigned copy used for the bounds check; presumably a negative index fails the same comparison — confirm
-		if( idx >= length )
+		if( idx >= (length:UInt) )
 			return null;
-		return @:privateAccess bytes.utf8Char(0,index);
+		return bytes.getUI16(index << 1); // byte offset is index * 2
 	}
 
 	public function indexOf( str : String, ?startIndex : Int ) : Int {
@@ -39,22 +37,23 @@ class String {
 		if( startIndex != null && startIndex > 0 ) {
 			if( startIndex >= length )
 				return -1;
-			startByte = bytes.utf8Length(0, startIndex);
+			startByte = startIndex << 1;
 		}
-		return bytes.find(startByte,size - startByte,str.bytes,0,str.size);
+		return bytes.find(startByte,(length << 1) - startByte,str.bytes,0,str.length << 1);
 	}
 
 	public function lastIndexOf( str : String, ?startIndex : Int ) : Int {
+		var size = length << 1;
 		var lastByte = size;
 		if( startIndex != null && startIndex < length ) {
 			if( startIndex <= 0 )
 				return -1;
-			lastByte = bytes.utf8Length(0, startIndex);
+			lastByte = startIndex << 1;
 		}
 		var last = -1;
 		var pos = 0;
 		while( true ) {
-			var p = bytes.find(pos, size - pos, str.bytes, 0, str.size);
+			var p = bytes.find(pos, size - pos, str.bytes, 0, str.length << 1);
 			if( p < 0 || p >= lastByte ) break;
 			last = p;
 			pos = p + 1;
@@ -63,40 +62,31 @@ class String {
 	}
 
 	public function split( delimiter : String ) : Array<String> { // splits this string around occurrences of delimiter
-		var pos = 0;
 		var out = [];
-		if( size == 0 ) {
+		if( length == 0 ) { // empty string: the result is a single empty string
 			out.push("");
 			return out;
 		}
-		var dsize = delimiter.size;
-		if( dsize == 0 ) {
-			while( pos < size ) {
-				var p = bytes.utf8Pos(pos, 1);
-				out.push(subBytes(pos, p));
-				pos += p;
-			}
+		if( delimiter.length == 0 ) { // empty delimiter: one element per position
+			for( i in 0...length )
+				out.push(substr(i,1));
 			return out;
 		}
+		var pos = 0; // start of the current piece, counted in String positions
+		var dlen = delimiter.length;
 		while( true ) {
-			var p = bytes.find(pos, size - pos, delimiter.bytes, 0, dsize);
+			var p = bytes.find(pos << 1, (length - pos) << 1, delimiter.bytes, 0, dlen << 1); // positions and lengths are doubled before being handed to find
 			if( p < 0 ) {
-				out.push(subBytes(pos, size-pos));
+				out.push(substr(pos, length-pos));
 				break;
 			}
-			out.push(subBytes(pos, p - pos));
-			pos = p + dsize;
+			p >>= 1; // halve the value returned by find to get a String position — NOTE(review): assumes find never reports an odd offset; confirm
+			out.push(substr(pos, p - pos));
+			pos = p + dlen;
 		}
 		return out;
 	}
 
-	function subBytes( pos : Int, size : Int ) : String {
-		var b = new hl.types.Bytes(size + 1);
-		b.blit(0, bytes, pos, size);
-		b[size] = 0;
-		return __alloc__(b, size, b.utf8Length(0, size));
-	}
-
 	public function substr( pos : Int, ?len : Int ) : String @:privateAccess {
 		var sl = length;
 		var len : Int = if( len == null ) sl else len;
@@ -118,14 +108,10 @@ class String {
 
 		if( pos < 0 || len <= 0 ) return "";
 
-		var bytes = bytes;
-		var start = pos == 0 ? 0 : bytes.utf8Pos(0, pos);
-		var size = pos + len == sl ? size - start : bytes.utf8Pos(start, len);
-
-		var b = new hl.types.Bytes(size + 1);
-		b.blit(0, bytes, start, size);
-		b[size] = 0;
-		return __alloc__(b, size, len);
+		var b = new hl.types.Bytes((len + 1) << 1);
+		b.blit(0, bytes, pos<<1, len << 1);
+		b.setUI16(len<<1,0);
+		return __alloc__(b, len);
 	}
 
 	public function substring( startIndex : Int, ?endIndex : Int ) : String {
@@ -138,56 +124,21 @@ class String {
 	}
 
 	public static function fromCharCode( code : Int ) : String { // builds a String holding the given code point; throws for negative, surrogate-range and >= 0x110000 codes
-		if( code < 0 ) throw "Invalid char code " + code;
-		if( code < 0x80 ) {
-			var b = new hl.types.Bytes(2);
-			b[0] = code;
-			b[1] = 0;
-			return __alloc__(b, 1, 1);
-		}
-		if( code < 0x800 ) {
-			var b = new hl.types.Bytes(3);
-			b[0] = 0xC0 | (code >> 6);
-			b[1] = 0x80 | (code & 63);
-			b[2] = 0;
-			return __alloc__(b, 2, 1);
-		}
-		if( code < 0x10000 ) {
+		if( code >= 0 && code < 0x10000 ) {
+			if( code >= 0xD800 && code <= 0xDFFF ) throw "Invalid unicode char " + code;
 			var b = new hl.types.Bytes(4);
-			b[0] = 0xE0 | (code >> 12);
-			b[1] = 0x80 | ((code >> 6) & 63);
-			b[2] = 0x80 | (code & 63);
-			b[3] = 0;
-			return __alloc__(b, 3, 1);
-		}
-		if( code < 0x200000 ) {
-			var b = new hl.types.Bytes(5);
-			b[0] = 0xF0 | (code >> 18);
-			b[1] = 0x80 | ((code >> 12) & 63);
-			b[2] = 0x80 | ((code >> 6) & 63);
-			b[3] = 0x80 | (code & 63);
-			b[4] = 0;
-			return __alloc__(b, 4, 1);
-		}
-		if( code < 0x4000000 ) {
+			b.setUI16(0, code);
+			b.setUI16(2, 0);
+			return __alloc__(b, 1);
+		} else if( code >= 0x10000 && code < 0x110000 ) { // the lower bound keeps negative codes out of this branch so they reach the throw below
 			var b = new hl.types.Bytes(6);
-			b[0] = 0xF8 | (code >> 24);
-			b[1] = 0x80 | ((code >> 18) & 63);
-			b[2] = 0x80 | ((code >> 12) & 63);
-			b[3] = 0x80 | ((code >> 6) & 63);
-			b[4] = 0x80 | (code & 63);
-			b[5] = 0;
-			return __alloc__(b, 5, 1);
-		}
-		var b = new hl.types.Bytes(7);
-		b[0] = 0xFC | (code >> 30);
-		b[1] = 0x80 | ((code >> 24) & 63);
-		b[2] = 0x80 | ((code >> 18) & 63);
-		b[3] = 0x80 | ((code >> 12) & 63);
-		b[4] = 0x80 | ((code >> 6) & 63);
-		b[5] = 0x80 | (code & 63);
-		b[6] = 0;
-		return __alloc__(b, 6, 1);
+			code -= 0x10000;
+			b.setUI16(0, (code >> 10) + 0xD800);
+			b.setUI16(2, (code & 1023) + 0xDC00);
+			b.setUI16(4, 0);
+			return __alloc__(b, 2); // UTF16 encoding but UCS2 API (same as JS)
+		} else
+			throw "Invalid unicode char " + code;
 	}
 
 	@:keep function __string() : hl.types.Bytes {
@@ -195,26 +146,25 @@ class String {
 	}
 
 	@:keep function __compare( s : String ) : Int { // orders this string against s: negative, zero or positive
-		var v = bytes.compare(0, s.bytes, 0, size < s.size ? size : s.size);
-		return v == 0 ? size - s.size : v;
+		var v = bytes.compare(0, s.bytes, 0, (length < s.length ? length : s.length) << 1); // compares the common prefix, sized as length * 2 — NOTE(review): if compare works byte by byte, the low byte of each 16-bit unit is compared first, which may not match unit order; confirm
+		return v == 0 ? length - s.length : v; // equal prefixes: the shorter string orders first
 	}
 
-	@:keep static inline function __alloc__( b : hl.types.Bytes, blen : Int, clen : Int ) : String {
+	@:keep static inline function __alloc__( b : hl.types.Bytes, length : Int ) : String { // creates a String whose bytes field is b and whose length field is length
 		var s : String = untyped $new(String);
 		s.bytes = b;
-		s.length = clen;
-		s.size = blen;
+		s.length = length;
 		return s;
 	}
 
 	@:keep static function __add__( a : String, b : String ) : String { // concatenates a and b into a newly allocated String; null operands are replaced by "null"
 		if( a == null ) a = "null";
 		if( b == null ) b = "null";
-		var asize = a.size, bsize = b.size, tot = asize + bsize;
-		var bytes = new hl.types.Bytes(tot+1);
-		bytes.blit(0,a.bytes,0,asize);
+		var asize = a.length << 1, bsize = b.length << 1, tot = asize + bsize; // sizes are lengths * 2
+		var bytes = new hl.types.Bytes(tot+2); // two extra bytes for the trailing 16-bit zero
+		bytes.blit(0, a.bytes, 0, asize);
 		bytes.blit(asize,b.bytes,0,bsize);
-		bytes[tot] = 0;
-		return __alloc__(bytes, tot, a.length + b.length);
+		bytes.setUI16(tot, 0);
+		return __alloc__(bytes, tot>>1);
 	}
 }

+ 21 - 48
std/hl/_std/StringBuf.hx

@@ -24,17 +24,16 @@
 	var b : hl.types.Bytes;
 	var size : Int;
 	var pos : Int;
-	var slen : Int;
 	public var length(get,never) : Int;
 
 	public function new() : Void { // creates an empty buffer
-		pos = slen = 0;
-		size = 12; // ensure 6 bytes expand for addChar()
+		pos = 0; // write offset into b
+		size = 8; // ensure 4 bytes expand for addChar()
 		b = new hl.types.Bytes(size);
 	}
 		b = new hl.types.Bytes(size);
 	}
 
 	inline function get_length() : Int {
-		return slen;
+		return pos >> 1;
 	}
 
 	inline function __expand( need : Int ) : Void {
@@ -46,15 +45,10 @@
 		size = nsize;
 	}
 
-	function __addBytes( bytes : hl.types.Bytes, spos : Int, ssize : Int, slen : Int ) : Void {
+	inline function __add( bytes : hl.types.Bytes, spos : Int, ssize : Int ) : Void { // appends ssize bytes of bytes, starting at spos, to the buffer
 		if( pos + ssize > size ) __expand(pos + ssize); // grow first when the data would not fit
 		b.blit(pos, bytes, spos, ssize);
 		pos += ssize;
-		this.slen += slen;
-	}
-
-	inline function __add( bytes : hl.types.Bytes, spos : Int, ssize : Int ) : Void {
-		__addBytes(bytes, spos, ssize, bytes.utf8Length(spos, ssize));
 	}
 
 	public function add<T>( x : T ) : Void {
@@ -65,57 +59,36 @@
 
 	/**
 		Appends a slice of `s`. A negative `pos` is treated as 0, and nothing is appended
 		when `pos` is at or beyond `s.length`. Without `len` the slice runs to the end of `s`;
 		with `len` it is clipped to the end of `s`, and a resulting size <= 0 appends nothing.
 		The index and size are converted to byte values with `<< 1` before copying.
 	**/
 	public function addSub( s : String, pos : Int, ?len : Int ) : Void @:privateAccess {
 		if( pos < 0 ) pos = 0;
-		if( pos > s.length ) pos = s.length;
+		if( pos >= s.length ) return;
 		var slen : Int;
 		if( len == null ) slen = s.length - pos else {
 			slen = len;
 			if( pos + slen > s.length ) slen = s.length - pos;
 			if( slen <= 0 ) return;
 		}
-		var bpos = pos == 0 ? 0 : s.bytes.utf8Pos(0,pos);
-		var blen = (pos + len == s.length ? s.size - bpos : s.bytes.utf8Pos(bpos, len));
-		__addBytes(s.bytes, bpos, blen, len - pos);
+		__add(s.bytes, pos << 1, slen << 1);
 	}
 
 	public function addChar( c : Int ) : Void {
-		if( c < 0 )
-			throw "Invalid char code";
-		if( pos + 6 > size ) __expand(0);
-		if( c < 0x80 )
-			b[pos++] = c;
-		else if( c < 0x800 ) {
-			b[pos++] = 0xC0 | (c >> 6);
-			b[pos++] = 0x80 | (c & 63);
-		} else if( c < 0x10000 ) {
-			b[pos++] = 0xE0 | (c >> 12);
-			b[pos++] = 0x80 | ((c >> 6) & 63);
-			b[pos++] = 0x80 | (c & 63);
-		} else if( c < 0x200000 ) {
-			b[pos++] = 0xF0 | (c >> 18);
-			b[pos++] = 0x80 | ((c >> 12) & 63);
-			b[pos++] = 0x80 | ((c >> 6) & 63);
-			b[pos++] = 0x80 | (c & 63);
-		} else if( c < 0x4000000 ) {
-			b[pos++] = 0xF8 | (c >> 24);
-			b[pos++] = 0x80 | ((c >> 18) & 63);
-			b[pos++] = 0x80 | ((c >> 12) & 63);
-			b[pos++] = 0x80 | ((c >> 6) & 63);
-			b[pos++] = 0x80 | (c & 63);
-		} else {
-			b[pos++] = 0xFC | (c >> 30);
-			b[pos++] = 0x80 | ((c >> 24) & 63);
-			b[pos++] = 0x80 | ((c >> 18) & 63);
-			b[pos++] = 0x80 | ((c >> 12) & 63);
-			b[pos++] = 0x80 | ((c >> 6) & 63);
-			b[pos++] = 0x80 | (c & 63);
-		}
-		slen++;
+		if( c >= 0 && c < 0x10000 ) {
+			if( c >= 0xD800 && c <= 0xDFFF ) throw "Invalid unicode char " + c;
+			if( pos == size ) __expand(0);
+			b.setUI16(pos, c);
+			pos += 2;
+		} else if( c < 0x110000 ) {
+			if( pos + 4 > size ) __expand(0);
+			c -= 0x10000;
+			b.setUI16(pos, (c >> 10) + 0xD800);
+			b.setUI16(pos + 2, (c & 1023) + 0xDC00);
+			pos += 4;
+		} else
+			throw "Invalid unicode char " + c;
 	}
 
 	public function toString() : String {
 		if( pos == size ) __expand(0);
-		b[pos] = 0;
-		return @:privateAccess String.__alloc__(b, pos, slen);
+		b.setUI16(pos,0);
+		return @:privateAccess String.__alloc__(b, pos);
 	}
 
 }

+ 1 - 1
std/hl/_std/Type.hx

@@ -71,7 +71,7 @@ class Type {
 	/**
 		Returns the instance field names of class `c` as Strings.
 		Each entry returned by `c.type.getInstanceFields()` is wrapped with `String.__alloc__`,
 		using the entry's `ucs2Length(0)` as the String length.
 	**/
 	public static function getInstanceFields( c : Class<Dynamic> ) : Array<String> @:privateAccess {
 		var c : hl.types.Class = cast c;
 		var fields = c.type.getInstanceFields();
-		return [for( f in fields ) { var len = f.bytesLength(0); String.__alloc__(f,len,len); }];
+		return [for( f in fields ) String.__alloc__(f,f.ucs2Length(0))];
 	}
 
 	public static function getClassFields( c : Class<Dynamic> ) : Array<String> {

+ 5 - 7
std/hl/_std/haxe/ds/StringMap.hx

@@ -26,21 +26,20 @@ private class StringMapKeysIterator {
 	var arr : hl.types.NativeArray<hl.types.Bytes>;
 	var pos : Int;
 	var length : Int;
-	
+
 	/**
 		Reads the key array of `h` through `keysArray()` and starts iterating at index 0;
 		`length` caches the size of that array.
 	**/
 	public inline function new(h:hl.types.NativeBytesMap) {
 		this.arr = h.keysArray();
 		pos = 0;
 		length = arr.length;
 	}
-	
+
 	/**
 		True while the current index is below the cached key count.
 	**/
 	public inline function hasNext() {
 		return pos < length;
 	}
-	
+
 	/**
 		Returns the key at the current index as a String and advances the index.
 		The key bytes are wrapped with `String.__alloc__`, using `ucs2Length(0)` as the length.
 	**/
 	public inline function next() @:privateAccess {
 		var b = arr[pos++];
-		var size = b.bytesLength(0);
-		return String.__alloc__(b,size+1,b.utf8Length(0,size));
+		return String.__alloc__(b,b.ucs2Length(0));
 	}
 
 }
@@ -87,8 +86,7 @@ class StringMap<T> implements haxe.Constraints.IMap<String,T> {
 			if( i > 0 )
 				s.add(", ");
 			var k = keys[i];
-			var len = @:privateAccess k.bytesLength(0); 
-			s.add(@:privateAccess String.__alloc__(k,len+1,k.utf8Length(0,len)));
+			@:privateAccess s.__add(k,0,(@:privateAccess k.ucs2Length(0)) << 1);
 			s.add(" => ");
 			s.add(values[i]);
 		}

+ 9 - 4
std/hl/types/Bytes.hx

@@ -87,7 +87,12 @@ package hl.types;
 		Count the number of UTF8 chars into the given Bytes data.
 	**/
 	@:hlNative("std","utf8length")
-	public function utf8Length( pos : Int, size : Int ) : Int {
+	public function _utf8Length( pos : Int, size : Int ) : Int {
+		return 0; // NOTE(review): placeholder body; presumably superseded by the "std"/"utf8length" native binding - confirm
+	}
+
 	/**
 		Bound to the native "std"/"ucs2length" function.
 		NOTE(review): callers use the result directly as a String length for data starting at
 		byte offset `bytePos`; presumably it counts 16-bit units up to a zero terminator - confirm
 		against the native implementation.
 	**/
+	@:hlNative("std", "ucs2length")
+	function ucs2Length( bytePos : Int ) : Int {
 		return 0;
 	}
 
@@ -95,7 +100,7 @@ package hl.types;
 		Count the number of bytes until we reach \0
 	**/
 	@:hlNative("std","byteslength")
-	function bytesLength( pos : Int ) : Int {
+	function _bytesLength( pos : Int ) : Int {
 		return 0; // NOTE(review): placeholder body; presumably superseded by the "std"/"byteslength" native binding - confirm
 	}
 	}
 
@@ -108,7 +113,7 @@ package hl.types;
 		Decode the utf8 char at the given position
 	**/
 	@:hlNative("std","utf8char")
-	public function utf8Char( pos : Int, charPos : Int ) : Int {
+	public function _utf8Char( pos : Int, charPos : Int ) : Int {
 		return 0; // NOTE(review): placeholder body; presumably superseded by the "std"/"utf8char" native binding - confirm
 	}
 	}
 
@@ -116,7 +121,7 @@ package hl.types;
 		Gives the byte position for the utf8 char starting at pos.
 	**/
 	@:hlNative("std","utf8pos")
-	public function utf8Pos( pos : Int, charPos : Int ) : Int {
+	public function _utf8Pos( pos : Int, charPos : Int ) : Int {
 		return 0; // NOTE(review): placeholder body; presumably superseded by the "std"/"utf8pos" native binding - confirm
 	}
 	}
 

+ 5 - 0
std/hl/types/Ref.hx

@@ -1,6 +1,11 @@
 package hl.types;
 
 @:coreType abstract Ref<T> {
+
 	/**
 		Implicit conversion (`@:from`) that builds a Ref<T> from a value by delegating
 		to the constructor.
 	**/
+	@:extern @:from public static inline function make<T>( v : T ) {
+		return new Ref<T>(v);
+	}
+
 	/**
 		Initialises the abstract's underlying value with the untyped `$ref(v)` builtin.
 	**/
 	@:extern public inline function new( v : T ) {
 		this = untyped $ref(v);
 	}