Selaa lähdekoodia

Change eval to use UTF-8 instad of UCS-2 (#7470)

Simon Krajewski 7 vuotta sitten
vanhempi
commit
f71773b909

+ 1 - 0
src/context/common.ml

@@ -332,6 +332,7 @@ let get_config com =
 			default_config with
 			pf_static = false;
 			pf_pad_nulls = true;
+			pf_uses_utf16 = false;
 		}
 
 let memory_marker = [|Unix.time()|]

+ 1 - 2
src/macro/eval/evalArray.ml

@@ -19,7 +19,6 @@
 
 open Globals
 open EvalValue
-open EvalString
 
 let create values = {
 	avalues = values;
@@ -29,7 +28,7 @@ let create values = {
 let array_join a f sep =
 	let l = Array.map f a in
 	let l = Array.to_list l in
-	join sep l
+	EvalString.join sep l
 
 let to_list a = Array.to_list (Array.sub a.avalues 0 a.alength)
 

+ 4 - 4
src/macro/eval/evalDebugSocket.ml

@@ -18,7 +18,7 @@ let var_to_json name value access =
 	let jv t v structured =
 		JObject ["name",JString name;"type",JString t;"value",JString v;"structured",JBool structured;"access",JString access]
 	in
-	let string_repr s = "\"" ^ (Ast.s_escape (EvalString.get s)) ^ "\"" in
+	let string_repr s = "\"" ^ (Ast.s_escape s.sstring) ^ "\"" in
 	let rec level2_value_repr = function
 		| VNull -> "null"
 		| VTrue -> "true"
@@ -35,7 +35,7 @@ let var_to_json name value access =
 		| VString s -> string_repr s
 		| VArray _ | VVector _ -> "[...]"
 		| VInstance vi -> (rev_hash vi.iproto.ppath) ^ " {...}"
-		| VPrototype proto -> EvalString.get (s_proto_kind proto)
+		| VPrototype proto -> (s_proto_kind proto).sstring
 		| VFunction _ | VFieldClosure _ -> "<fun>"
 		| VLazy f -> level2_value_repr (!f())
 	in
@@ -71,7 +71,7 @@ let var_to_json name value access =
 		| VInstance vi ->
 			let class_name = rev_hash vi.iproto.ppath in
 			jv class_name (class_name ^ " " ^ (fields_string (instance_fields vi))) true
-		| VPrototype proto -> jv "Anonymous" (EvalString.get (s_proto_kind proto)) false (* TODO: show statics *)
+		| VPrototype proto -> jv "Anonymous" (s_proto_kind proto).sstring false (* TODO: show statics *)
 		| VFunction _ | VFieldClosure _ -> jv "Function" "<fun>" false
 		| VLazy f -> value_string (!f())
 	in
@@ -195,7 +195,7 @@ let output_inner_vars v access =
 			) l
 		| VInstance {ikind = IStringMap h} ->
 			StringHashtbl.fold (fun k v acc ->
-				let s = EvalString.get k in
+				let s = k.sstring in
 				let n = Printf.sprintf "[%s]" s in
 				let a = access ^ n in
 				(s,v,a) :: acc

+ 1 - 1
src/macro/eval/evalDecode.ml

@@ -50,7 +50,7 @@ let decode_varray v = match v with
 	| _ -> unexpected_value v "array"
 
 let decode_string v = match v with
-	| VString s -> EvalString.get s
+	| VString s -> s.sstring
 	| _ -> unexpected_value v "string"
 
 let decode_vstring v = match v with

+ 1 - 2
src/macro/eval/evalEmitter.ml

@@ -365,8 +365,7 @@ let emit_string_cca exec1 exec2 p env =
 	let s = decode_vstring (exec1 env) in
 	let index = decode_int_p (exec2 env) p in
 	if index < 0 || index >= s.slength then vnull
-	else if s.sascii then vint (int_of_char (String.get s.sstring index))
-	else vint (EvalString.read_char s (index lsl 1))
+	else vint (EvalString.char_at s index)
 
 (* Write *)
 

+ 1 - 1
src/macro/eval/evalExceptions.ml

@@ -129,7 +129,7 @@ let catch_exceptions ctx ?(final=(fun() -> ())) f p =
 			final();
 			match v1,v2 with
 				| VString s,VInstance {ikind = IPos p} ->
-					raise (Error.Error (Error.Custom (EvalString.get s),p))
+					raise (Error.Error (Error.Custom s.sstring,p))
 				| _ ->
 					Error.error "Something went wrong" null_pos
 		end else begin

+ 1 - 1
src/macro/eval/evalJit.ml

@@ -36,7 +36,7 @@ let rope_path t = match follow t with
 let eone = mk (TConst(TInt (Int32.one))) t_dynamic null_pos
 
 let eval_const = function
-	| TString s -> EvalString.bytes_to_utf8 (Bytes.unsafe_of_string s)
+	| TString s -> EvalString.create_unknown s
 	| TInt i32 -> vint32 i32
 	| TFloat f -> vfloat (float_of_string f)
 	| TBool b -> vbool b

+ 2 - 2
src/macro/eval/evalMain.ml

@@ -232,7 +232,7 @@ let value_signature v =
 		| VInstance {ikind = IDate f} ->
 			cache v (fun () ->
 				addc 'v';
-				add (EvalString.get (s_date f))
+				add ((s_date f).sstring)
 			)
 		| VInstance {ikind = IStringMap map} ->
 			cache v (fun() ->
@@ -405,7 +405,7 @@ let rec value_to_expr v p =
 	| VFalse -> (EConst (Ident "false"),p)
 	| VInt32 i -> (EConst (Int (Int32.to_string i)),p)
 	| VFloat f -> haxe_float f p
-	| VString s -> (EConst (String (EvalString.get s)),p)
+	| VString s -> (EConst (String s.sstring),p)
 	| VArray va -> (EArrayDecl (List.map (fun v -> value_to_expr v p) (EvalArray.to_list va)),p)
 	| VObject o -> (EObjectDecl (List.map (fun (k,v) ->
 			let n = rev_hash k in

+ 1 - 7
src/macro/eval/evalMisc.ml

@@ -107,13 +107,7 @@ let rec compare a b =
 	| VString s1,VString s2 ->
 		let s1' = s1.sstring in
 		let s2' = s2.sstring in
-		let s1,s2 = match s1.sascii,s2.sascii with
-		| true,true
-		| false,false -> s1',s2'
-		| true,false -> extend_ascii s1',s2'
-		| false,true -> s1',extend_ascii s2'
-		in
-		let r = String.compare s1 s2 in
+		let r = String.compare s1' s2' in
 		if r = 0 then CEq else if r < 0 then CInf else CSup
 	| VFunction(a,_), VFunction(b,_) -> if a == b then CEq else CUndef
 	| VArray va1,VArray va2 -> if va1 == va2 then CEq else CUndef

+ 1 - 2
src/macro/eval/evalPrinting.ml

@@ -140,5 +140,4 @@ and call_value_on vthis v vl =
 	| _ -> exc_string ("Cannot call " ^ (value_string v))
 
 and value_string v =
-	let s = s_value 0 v in
-	EvalString.get s
+	(s_value 0 v).sstring

+ 103 - 200
src/macro/eval/evalStdLib.ml

@@ -335,10 +335,7 @@ module StdBytes = struct
 		let pos = decode_int pos in
 		let len = decode_int len in
 		let s = try Bytes.sub this pos len with _ -> outside_bounds() in
-		if encode_native encoding then
-			vstring (create_ucs2 (Bytes.unsafe_to_string s) len)
-		else
-			bytes_to_utf8 s
+		vstring (create_with_length (Bytes.unsafe_to_string s) len)
 	)
 
 	let getUInt16 = vifun1 (fun vthis pos ->
@@ -349,14 +346,7 @@ module StdBytes = struct
 
 	let ofString = vfun2 (fun v encoding ->
 		let s = decode_vstring v in
-		if encode_native encoding then begin
-			let s = maybe_extend_ascii s in
-			encode_bytes (Bytes.of_string s)
-		end else begin
-			let s' = s.sstring in
-			let s = if s.sascii then s' else utf16_to_utf8 s' in
-			encode_bytes (Bytes.of_string s)
-		end
+		encode_bytes (Bytes.of_string s.sstring)
 	)
 
 	let ofHex = vfun1 (fun v ->
@@ -442,7 +432,7 @@ module StdBytes = struct
 	)
 
 	let toString = vifun0 (fun vthis ->
-		bytes_to_utf8 (this vthis)
+		(create_unknown (Bytes.to_string (this vthis)))
 	)
 end
 
@@ -484,12 +474,7 @@ module StdBytesBuffer = struct
 	let addString = vifun2 (fun vthis src encoding ->
 		let this = this vthis in
 		let src = decode_vstring src in
-		let s = if src.sascii || StdBytes.encode_native encoding then
-			src.sstring
-		else
-			utf16_to_utf8 src.sstring
-		in
-		Buffer.add_string this s;
+		Buffer.add_string this src.sstring;
 		vnull
 	)
 
@@ -862,24 +847,11 @@ module StdEReg = struct
 
 	let matchedPos = vifun0 (fun vthis ->
 		let this = this vthis in
-		let rec search_head s i =
-			if i >= String.length s then i else
-			let n = Char.code (String.unsafe_get s i) in
-			if n < 0x80 || n >= 0xc2 then i else
-			search_head s (i + 1)
-		in
-		let next' s i =
-			let n = Char.code s.[i] in
-			if n < 0x80 then i + 1 else
-			if n < 0xc0 then search_head s (i + 1) else
-			if n <= 0xdf then i + 2
-			else i + 3
-		in
 		let rec byte_offset_to_char_offset_lol s i k o =
 			if i = 0 then
 				k
 			else begin
-				let n = next' s o in
+				let n = UTF8.next s o in
 				let d = n - o in
 				byte_offset_to_char_offset_lol s (i - d) (k + 1) n
 			end
@@ -986,7 +958,7 @@ module StdFile = struct
 
 	let getContent = vfun1 (fun path ->
 		let path = decode_string path in
-		try bytes_to_utf8 (Bytes.unsafe_of_string ((Std.input_file ~bin:true path))) with Sys_error _ -> exc_string ("Could not read file " ^ path)
+		try ((create_unknown (Std.input_file ~bin:true path))) with Sys_error _ -> exc_string ("Could not read file " ^ path)
 	)
 
 	let read = vfun2 (fun path binary ->
@@ -1607,7 +1579,7 @@ module StdReflect = struct
 	)
 
 	let deleteField = vfun2 (fun o name ->
-		let name = hash (get (decode_vstring name)) in
+		let name = hash (decode_vstring name).sstring in
 		match vresolve o with
 		| VObject o ->
 			let found = ref false in
@@ -1628,7 +1600,7 @@ module StdReflect = struct
 	)
 
 	let field' = vfun2 (fun o name ->
-		if o = vnull then vnull else dynamic_field o (hash (get (decode_vstring name)))
+		if o = vnull then vnull else dynamic_field o (hash (decode_vstring name).sstring)
 	)
 
 	let fields = vfun1 (fun o ->
@@ -1646,14 +1618,14 @@ module StdReflect = struct
 
 	let getProperty = vfun2 (fun o name ->
 		let name = decode_vstring name in
-		let name_get = hash (get (concat r_get_ name)) in
+		let name_get = hash (concat r_get_ name).sstring in
 		let vget = field o name_get in
 		if vget <> VNull then call_value_on o vget []
-		else dynamic_field o (hash (get name))
+		else dynamic_field o (hash name.sstring)
 	)
 
 	let hasField = vfun2 (fun o field ->
-		let name = hash (get (decode_vstring field)) in
+		let name = hash (decode_vstring field).sstring in
 		let b = match vresolve o with
 			| VObject o -> IntMap.mem name o.oproto.pinstance_names
 			| VInstance vi -> IntMap.mem name vi.iproto.pinstance_names || IntMap.mem name vi.iproto.pnames
@@ -1684,16 +1656,16 @@ module StdReflect = struct
 	)
 
 	let setField = vfun3 (fun o name v ->
-		set_field o (hash (get (decode_vstring name))) v; vnull
+		set_field o (hash (decode_vstring name).sstring) v; vnull
 	)
 
 	let setProperty = vfun3 (fun o name v ->
 		let name = decode_vstring name in
-		let name_set = hash (get (concat r_set_ name)) in
+		let name_set = hash (concat r_set_ name).sstring in
 		let vset = field o name_set in
 		if vset <> VNull then call_value_on o vset [v]
 		else begin
-			set_field o (hash (get name)) v;
+			set_field o (hash name.sstring) v;
 			vnull
 		end
 	)
@@ -1707,7 +1679,7 @@ module StdResource = struct
 	)
 
 	let getString = vfun1 (fun name ->
-		try bytes_to_utf8 (Bytes.unsafe_of_string (Hashtbl.find ((get_ctx()).curapi.MacroApi.get_com()).resources (decode_string name))) with Not_found -> vnull
+		try ((create_unknown (Hashtbl.find ((get_ctx()).curapi.MacroApi.get_com()).resources (decode_string name)))) with Not_found -> vnull
 	)
 
 	let getBytes = vfun1 (fun name ->
@@ -1891,8 +1863,9 @@ module StdStd = struct
 		| _ -> vfalse
 	)
 
-	let string = vfun1 (fun v ->
-		vstring (s_value 0 v)
+	let string = vfun1 (fun v -> match v with
+		| VString _ -> v
+		| _ -> vstring (s_value 0 v)
 	)
 
 	let int = vfun1 (fun v ->
@@ -1922,24 +1895,14 @@ module StdString = struct
 		let this = this vthis in
 		let i = decode_int index in
 		if i < 0 || i >= this.slength then encode_string ""
-		else begin
-			let s = this.sstring in
-			if this.sascii then encode_string (String.make 1 (String.get s i))
-			else begin
-				let b = Bytes.create 2 in
-				EvalBytes.write_ui16 b 0 (read_char this (i lsl 1));
-				let s = create_ucs2 (Bytes.unsafe_to_string b) 1 in
-				vstring s
-			end
-		end
+		else vstring (from_char_code (char_at this i))
 	)
 
 	let charCodeAt = vifun1 (fun vthis index ->
 		let this = this vthis in
 		let i = decode_int index in
 		if i < 0 || i >= this.slength then vnull
-		else if this.sascii then vint (int_of_char (String.get this.sstring i))
-		else vint (read_char this (i lsl 1))
+		else vint (char_at this i)
 	)
 
 	let fromCharCode = vfun1 (fun i ->
@@ -1949,8 +1912,6 @@ module StdString = struct
 		with
 		| Not_found ->
 			vnull
-		| InvalidUnicodeChar ->
-			exc_string ("Invalid unicode char " ^ (string_of_int i))
 	)
 
 	let indexOf = vifun2 (fun vthis str startIndex ->
@@ -1960,10 +1921,10 @@ module StdString = struct
 		try
 			if str.slength = 0 then
 				vint (max 0 (min i this.slength))
-			else if this.sascii then
-				vint ((fst (find_substring this str false i)))
 			else begin
-				vint ((fst (find_substring this str false (i lsl 1))) lsr 1)
+				let b = get_offset this i in
+				let offset,_,_ = find_substring this str false i b in
+				vint offset
 			end
 		with Not_found ->
 			vint (-1)
@@ -1979,147 +1940,112 @@ module StdString = struct
 			end else begin
 				let i = default_int startIndex (this.slength - 1) in
 				let i = if i < 0 then raise Not_found else if i >= this.slength then this.slength - 1 else i in
-				let s = this.sstring in
-				if this.sascii then
-					vint (Str.search_backward (Str.regexp_string str.sstring) s i)
-				else
-					vint ((fst (find_substring this str true (i lsl 1))) lsr 1)
+				let b = get_offset this i in
+				let offset,_,_ = find_substring this str true i b in
+				vint offset
 			end
 		with Not_found ->
 			vint (-1)
 	)
 
 	let split = vifun1 (fun vthis delimiter ->
-		let this' = this vthis in
-		let ascii = this'.sascii in
-		let s = this'.sstring in
-		let delimiter' = (decode_vstring delimiter) in
-		let delimiter = delimiter'.sstring in
-		let l_delimiter = String.length delimiter in
-		let l_this = String.length s in
-		let encode_range pos length =
+		let this = this vthis in
+		let s = this.sstring in
+		let delimiter = decode_vstring delimiter in
+		let bl_delimiter = String.length delimiter.sstring in
+		let bl_this = String.length s in
+		let encode_range pos length clength =
 			let s = String.sub s pos length in
-			if ascii then encode_string s
-			else vstring (create_ucs2 s (length lsr 1))
+			vstring (create_with_length s clength)
 		in
-		if l_delimiter = 0 then begin
-			if ascii then
-				encode_array (List.map (fun chr -> encode_string (String.make 1 chr)) (ExtString.String.explode s))
-			else begin
-				let acc = DynArray.create () in
-				let bs = Bytes.unsafe_of_string s in
-				for i = 0 to (l_this - 1) lsr 1 do
-					let b = Bytes.create 2 in
-					Bytes.unsafe_set b 0 (Bytes.unsafe_get bs (i lsl 1));
-					Bytes.unsafe_set b 1 (Bytes.unsafe_get bs ((i lsl 1 + 1)));
-					DynArray.add acc (vstring (create_ucs2 (Bytes.unsafe_to_string b) 1));
-				done;
-				encode_array (DynArray.to_list acc)
-			end
-		end else if l_delimiter > l_this then
-			encode_array [encode_range 0 (String.length s)]
+		if bl_delimiter = 0 then begin
+			let acc = DynArray.create () in
+			UTF8.iter (fun uc ->
+				DynArray.add acc (vstring (create_ascii (UTF8.init 1 (fun _ -> uc))));
+			) s;
+			encode_array (DynArray.to_list acc)
+		end else if bl_delimiter > bl_this then
+			encode_array [encode_range 0 bl_this this.slength]
 		else begin
 			let acc = DynArray.create () in
-			let f = find_substring this' delimiter' false in
-			let rec loop i =
+			let f = find_substring this delimiter false in
+			let rec loop c_index b_index =
 				try
-					let offset,next = f i in
-					DynArray.add acc (encode_range i (offset - i));
-					loop next;
+					let c_offset,b_offset,next = f c_index b_index in
+					DynArray.add acc (encode_range b_index (b_offset - b_index) (c_offset - c_index));
+					loop (c_offset + delimiter.slength) next;
 				with Not_found ->
-					DynArray.add acc (encode_range i (l_this - i))
+					DynArray.add acc (encode_range b_index (bl_this - b_index) (this.slength - c_index))
 			in
-			loop 0;
+			loop 0 0;
 			encode_array_instance (EvalArray.create (DynArray.to_array acc))
 		end
 	)
 
 	let substr = vifun2 (fun vthis pos len ->
 		let this = this vthis in
-		let s = this.sstring in
-		let l_this = String.length s in
-		let pos = decode_int pos in
-		if pos >= this.slength then
+		let cl_this = this.slength in
+		let c_pos = decode_int pos in
+		if c_pos >= cl_this then
 			encode_string ""
 		else begin
-			let pos = if pos < 0 then begin
-				let pos = this.slength + pos in
-				if pos < 0 then 0 else pos
-			end else pos in
-			if this.sascii then begin
-				let len = default_int len (l_this - pos) in
-				let len = if len < 0 then l_this + len - pos else len in
-				let s =
-					if len < 0 then ""
-					else if len + pos > l_this then String.sub s pos (l_this - pos)
-					else String.sub s pos len
-				in
-				encode_string s
-			end else begin
-				let pos = pos lsl 1 in
-				let len = match len with
-					| VNull -> (l_this - pos)
-					| VInt32 i -> Int32.to_int i lsl 1
+			let c_pos = if c_pos < 0 then begin
+				let c_pos = this.slength + c_pos in
+				if c_pos < 0 then 0 else c_pos
+			end else c_pos in
+			begin
+				let c_len = match len with
+					| VNull -> (cl_this - c_pos)
+					| VInt32 i -> Int32.to_int i
 					| _ -> unexpected_value len "int"
 				in
-				let len = if len < 0 then l_this + len - pos else len in
-				let s =
-					if len < 0 then ""
-					else if len + pos > l_this then String.sub s pos (l_this - pos)
-					else String.sub s pos len
+				let c_len =
+					if c_len < 0 then cl_this + c_len - c_pos
+					else if c_len > cl_this - c_pos then cl_this - c_pos
+					else c_len
 				in
-				vstring (create_ucs2 s (len lsr 1))
+				vstring (substr this c_pos c_len);
 			end
 		end
 	)
 
 	let substring = vifun2 (fun vthis startIndex endIndex ->
 		let this = this vthis in
-		let first = decode_int startIndex in
-		let l = this.slength in
-		let last = default_int endIndex l in
-		let first = if first < 0 then 0 else first in
-		let last = if last < 0 then 0 else last in
-		let first,last = if first > last then last,first else first,last in
-		let last = if last > l then l else last in
-		if first > l then
+		let c_first = decode_int startIndex in
+		let cl_this = this.slength in
+		let c_last = default_int endIndex cl_this in
+		let c_first = if c_first < 0 then 0 else c_first in
+		let c_last = if c_last < 0 then 0 else c_last in
+		let c_first,c_last = if c_first > c_last then c_last,c_first else c_first,c_last in
+		let c_last = if c_last > cl_this then cl_this else c_last in
+		if c_first > cl_this || c_first = c_last then
 			encode_string ""
 		else begin
-			if this.sascii then
-				encode_string (String.sub this.sstring first (last - first))
-			else begin
-				let first = first lsl 1 in
-				let last = last lsl 1 in
-				let length = last - first in
-				let r = String.sub this.sstring first length in
-				vstring (create_ucs2 r length)
+			begin
+				let b_offset1 = get_offset this c_first in
+				let c_len = c_last - c_first in
+				let b_len =
+					if c_last = cl_this then String.length this.sstring - b_offset1
+					else (UTF8.move this.sstring b_offset1 c_len) - b_offset1
+				in
+				vstring (create_with_length (String.sub this.sstring b_offset1 b_len) c_len)
 			end
 		end
 	)
 
 	let toLowerCase = vifun0 (fun vthis ->
 		let this = this vthis in
-		if this.sascii then
-			encode_string (String.lowercase this.sstring)
-		else
-			vstring (case_map this false)
+		vstring (case_map this false)
 	)
 
 	let toString = vifun0 (fun vthis -> vthis)
 
 	let toUpperCase = vifun0 (fun vthis ->
 		let this = this vthis in
-		if this.sascii then
-			encode_string (String.uppercase this.sstring)
-		else
-			vstring (case_map this true)
+		vstring (case_map this true)
 	)
 
 	let cca = charCodeAt
-
-	let isAscii = vifun0 (fun vthis ->
-		vbool (this vthis).sascii
-	)
 end
 
 module StdStringBuf = struct
@@ -2133,54 +2059,32 @@ module StdStringBuf = struct
 			| VString s -> s
 			| _ -> create_ascii (value_string x)
 		in
-		AwareBuffer.add_string this s;
+		VStringBuffer.add_string this s;
 		vnull;
 	)
 
 	let addChar = vifun1 (fun vthis c ->
 		let this = this vthis in
 		let i = decode_int c in
-		let add i =
-			if this.bascii then AwareBuffer.promote_to_ucs this;
-			Buffer.add_char this.bbuffer (char_of_int (i land 0xFF));
-			Buffer.add_char this.bbuffer (char_of_int (i lsr 8));
-			this.blength <- this.blength + 1;
-		in
-		begin if i < 0 then
-			()
-		else if i < 128 then begin
-			if this.bascii then begin
-				Buffer.add_char this.bbuffer (char_of_int i);
-				this.blength <- this.blength + 1;
-			end else
-				add i
-		end else if i < 0x10000 then begin
-			if i >= 0xD800 && i <= 0xDFFF then exc_string ("Invalid unicode char " ^ (string_of_int i));
-			add i
-		end else if i < 0x110000 then begin
-			let i = i - 0x10000 in
-			add ((i lsr 10 + 0xD800));
-			add ((i land 1023) + 0xDC00);
-		end else
-			exc_string ("Invalid unicode char " ^ (string_of_int i))
-		end;
+		Buffer.add_string this.bbuffer (string_of_char_code i);
+		this.blength <- this.blength + 1;
 		vnull
 	)
 
 	let addSub = vifun3 (fun vthis s pos len ->
 		let this = this vthis in
 		let s = decode_vstring s in
-		let i = decode_int pos in
-		let i = if s.sascii then i else i lsl 1 in
-		let len = match len with
-			| VNull -> String.length s.sstring - i
-			| VInt32 i -> Int32.to_int i lsl (if s.sascii then 0 else 1)
+		let c_pos = decode_int pos in
+		let c_len = match len with
+			| VNull -> String.length s.sstring - c_pos
+			| VInt32 i -> Int32.to_int i
 			| _ -> unexpected_value len "int"
 		in
-		let s' = String.sub s.sstring i len in
-		let s' = if s.sascii then create_ascii s'
-		else create_ucs2 s' (len lsr 1) in
-		AwareBuffer.add_string this s';
+		if c_len > 0 then begin
+			let b_offset1 = get_offset s c_pos in
+			let b_offset2 = UTF8.move s.sstring b_offset1 c_len in
+			VStringBuffer.add_substring this s b_offset1 (b_offset2 - b_offset1) c_len;
+		end;
 		vnull
 	)
 
@@ -2191,7 +2095,7 @@ module StdStringBuf = struct
 
 	let toString = vifun0 (fun vthis ->
 		let this = this vthis in
-		let s = AwareBuffer.contents this in
+		let s = VStringBuffer.contents this in
 		vstring s
 	)
 end
@@ -2211,9 +2115,9 @@ module StdStringTools = struct
 
 	let urlDecode = vfun1 (fun s ->
 		let s = decode_string s in
-		let b = AwareBuffer.create () in
+		let b = VStringBuffer.create () in
 		let add s =
-			AwareBuffer.add_string b s
+			VStringBuffer.add_string b s
 		in
 		let len = String.length s in
 		let decode c =
@@ -2278,7 +2182,7 @@ module StdStringTools = struct
 				loop (i + 1)
 		in
 		loop 0;
-		vstring (AwareBuffer.contents b)
+		vstring (VStringBuffer.contents b)
 	)
 end
 
@@ -2457,7 +2361,7 @@ module StdType = struct
 	)
 
 	let createEnum = vfun3 (fun e constr params ->
-		let constr = hash (get (decode_vstring constr)) in
+		let constr = hash (decode_vstring constr).sstring in
 		create_enum e constr params
 	)
 
@@ -2599,12 +2503,12 @@ module StdType = struct
 	)
 
 	let resolveClass = vfun1 (fun v ->
-		let name = get (decode_vstring v) in
+		let name = (decode_vstring v).sstring in
 		try (get_static_prototype_raise (get_ctx()) (hash name)).pvalue with Not_found -> vnull
 	)
 
 	let resolveEnum = vfun1 (fun v ->
-		let name = get (decode_vstring v) in
+		let name = (decode_vstring v).sstring in
 		try
 			let proto = get_static_prototype_raise (get_ctx()) (hash name) in
 			begin match proto.pkind with
@@ -2725,7 +2629,7 @@ module StdUtf8 = struct
 
 	let toString = vifun0 (fun vthis ->
 		let this = this vthis in
-		bytes_to_utf8 (Bytes.unsafe_of_string (UTF8.Buf.contents this))
+		vstring (create_ascii ((UTF8.Buf.contents this)))
 	)
 
 	let validate = vfun1 (fun s ->
@@ -2792,7 +2696,7 @@ let init_constructors builtins =
 			| [s] -> s
 			| _ -> assert false
 		);
-	add key_StringBuf (fun _ -> encode_instance key_StringBuf ~kind:(IBuffer (AwareBuffer.create())));
+	add key_StringBuf (fun _ -> encode_instance key_StringBuf ~kind:(IBuffer (VStringBuffer.create())));
 	add key_haxe_Utf8
 		(fun vl -> match vl with
 			| [size] -> encode_instance key_haxe_Utf8 ~kind:(IUtf8 (UTF8.Buf.create (default_int size 0)))
@@ -3162,7 +3066,6 @@ let init_standard_library builtins =
 		"toString",StdString.toString;
 		"toUpperCase",StdString.toUpperCase;
 		"cca",StdString.cca;
-		"isAscii",StdString.isAscii;
 	];
 	init_fields builtins ([],"StringBuf") [] [
 		"add",StdStringBuf.add;

+ 118 - 227
src/macro/eval/evalString.ml

@@ -17,261 +17,152 @@
 	Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  *)
 
-open Globals
 open EvalValue
-open EvalBytes
-
-let create_ascii s = {
-	sstring = s;
-	slength = String.length s;
-	sascii = true;
-}
-
-let create_ucs2 s length = {
-	sstring = s;
-	slength = length;
-	sascii = false;
-}
 
 let vstring s = VString s
 
-module AwareBuffer = struct
-	type t = vstring_buffer
-
-	let create () = {
-		bbuffer = Buffer.create 0;
-		blength = 0;
-		bascii = true;
+let create_ascii s =
+	{
+		sstring = s;
+		slength = String.length s;
+		soffset = (0,0);
 	}
 
-	let promote_to_ucs this =
-		let current = Buffer.contents this.bbuffer in
-		let current = extend_ascii current in
-		Buffer.clear this.bbuffer;
-		this.bascii <- false;
-		Buffer.add_string this.bbuffer current
-
-	let add_string this s =
-		begin match s.sascii,this.bascii with
-		| true,true
-		| false,false ->
-			Buffer.add_string this.bbuffer s.sstring
-		| true,false ->
-			Buffer.add_string this.bbuffer (extend_ascii s.sstring)
-		| false,true ->
-			promote_to_ucs this;
-			Buffer.add_string this.bbuffer s.sstring
-		end;
-		this.blength <- this.blength + s.slength
-
-	let contents this =
-		if this.bascii then
-			create_ascii (Buffer.contents this.bbuffer)
-		else
-			create_ucs2 (Buffer.contents this.bbuffer) this.blength
-end
+let create_with_length s length = {
+	sstring = s;
+	slength = length;
+	soffset = (0,0);
+}
 
-let read_char s =
-	read_ui16 (Bytes.unsafe_of_string s.sstring)
+let create_unknown s =
+	vstring (create_with_length s (try UTF8.length s with _ -> String.length s))
 
-let utf8_to_utf16 s =
-	let only_ascii = ref true in
-	let buf = Buffer.create 0 in
-	let l = ref 0 in
-	let add i =
-		incr l;
-		Buffer.add_char buf (Char.unsafe_chr i);
-		Buffer.add_char buf (Char.unsafe_chr (i lsr 8));
-	in
-	let length = String.length s in
-	let i = ref 0 in
-	let get () =
-		let i' = int_of_char (String.unsafe_get s !i) in
-		incr i;
-		i'
-	in
-	while !i < length do
-		let c = get() in
-		if c < 0x80 then
-			add c
-		else if c < 0xE0 then begin
-			only_ascii := false;
-			add (((c land 0x3F) lsl 6) lor ((get ()) land 0x7F))
-		end else if c < 0xF0 then begin
-			only_ascii := false;
-			let c2 = get () in
-			add (((c land 0x1F) lsl 12) lor ((c2 land 0x7F) lsl 6) lor ((get ()) land 0x7F));
-		end else begin
-			only_ascii := false;
-			let c2 = get () in
-			let c3 = get () in
-			let c = (((c land 0x0F) lsl 18) lor ((c2 land 0x7F) lsl 12) lor ((c3 land 0x7F) lsl 6) lor ((get ()) land 0x7F)) in
-			add ((c lsr 10) + 0xD7C0);
-			add ((c land 0x3FF) lor 0xDC00);
-		end
-	done;
-	Buffer.contents buf,!only_ascii,!l
+let concat s1 s2 =
+	create_with_length (s1.sstring ^ s2.sstring) (s1.slength + s2.slength)
 
-let utf16_to_utf8 s =
+let join sep sl =
+	let l_sep = sep.slength in
 	let buf = Buffer.create 0 in
-	let i = ref 0 in
-	let add i =
-		Buffer.add_char buf (Char.unsafe_chr i)
+	let _,length = List.fold_left (fun (first,length) s ->
+		let length = if first then 0 else length + l_sep in
+		let length = length + s.slength in
+		if not first then Buffer.add_string buf sep.sstring;
+		Buffer.add_string buf s.sstring;
+		(false,length)
+	) (true,0) sl in
+	create_with_length (Buffer.contents buf) length
+
+let get_offset s c_index =
+	let rec get_b_offset c_len b_offset =
+		if c_len = 0 then b_offset else
+		get_b_offset (c_len - 1) (UTF8.next s.sstring b_offset)
 	in
-	let b = Bytes.unsafe_of_string s in
-	let read_byte b i = try read_byte b i with _ -> 0 in
-	let get () =
-		let ch1 = read_byte b !i in
-		let ch2 = read_byte b (!i + 1) in
-		let c = ch1 lor (ch2 lsl 8) in
-		i := !i + 2;
-		c
+	let rec rget_b_offset c_len b_offset =
+		if c_len = 0 then b_offset else
+		rget_b_offset (c_len + 1) (UTF8.prev s.sstring b_offset)
 	in
-	let length = String.length s in
-	while !i < length do
-		let c = get() in
-		let c = if 0xD800 <= c && c <= 0xDBFF then
-			(((c - 0xD7C0) lsl 10) lor ((get()) land 0X3FF))
-		else
-			c
-		in
-		if c <= 0x7F then
-			add c
-		else if c <= 0x7FF then begin
-			add (0xC0 lor (c lsr 6));
-			add (0x80 lor (c land 63));
-		end else if c <= 0xFFFF then begin
-			add (0xE0 lor (c lsr 12));
-			add (0x80 lor ((c lsr 6) land 63));
-			add (0x80 lor (c land 63));
-		end else begin
-			add (0xF0 lor (c lsr 18));
-			add (0x80 lor ((c lsr 12) land 63));
-			add (0x80 lor ((c lsr 6) land 63));
-			add (0x80 lor (c land 63));
-		end
-	done;
-	Buffer.contents buf
-
-let maybe_extend_ascii s =
-	let s' = s.sstring in
-	if s.sascii then begin
-		extend_ascii s'
-	end else
-		s'
-
-let concat s1 s2 =
-	match s1.sascii,s2.sascii with
-	| true,true ->
-		create_ascii (s1.sstring ^ s2.sstring)
-	| false,false ->
-		create_ucs2 (s1.sstring ^ s2.sstring) (s1.slength + s2.slength)
-	| true,false ->
-		create_ucs2 ((extend_ascii s1.sstring) ^ s2.sstring) (s1.slength + s2.slength)
-	| false,true ->
-		create_ucs2 (s1.sstring ^ (extend_ascii s2.sstring)) (s1.slength + s2.slength)
-
-let join sep sl =
-	let buf = AwareBuffer.create () in
-	let rec loop sl = match sl with
-		| [s] ->
-			AwareBuffer.add_string buf s;
-		| s :: sl ->
-			AwareBuffer.add_string buf s;
-			AwareBuffer.add_string buf sep;
-			loop sl;
-		| [] ->
-			()
+	let (c_index',b_offset') = s.soffset in
+	let b_offset = match c_index - c_index' with
+		| 0 -> b_offset'
+		| 1 -> UTF8.next s.sstring b_offset'
+		| -1 -> UTF8.prev s.sstring b_offset'
+		| diff ->
+			if diff > 0 then
+				get_b_offset diff b_offset'
+			else if c_index + diff < 0 then
+				(* big step backwards, better to start over *)
+				get_b_offset c_index 0
+			else
+				rget_b_offset diff b_offset'
 	in
-	loop sl;
-	AwareBuffer.contents buf
-
-let bytes_to_utf8 s =
-	let s',is_ascii,length = utf8_to_utf16 (Bytes.unsafe_to_string s) in
-	if is_ascii then
-		vstring (create_ascii (Bytes.unsafe_to_string s))
-	else
-		vstring (create_ucs2 s' length)
+	s.soffset <- (c_index,b_offset);
+	b_offset
 
-let create_unknown s =
-	bytes_to_utf8 (Bytes.unsafe_of_string s)
+let char_at s c_index =
+	let b_offset = get_offset s c_index in
+	let char = UTF8.look s.sstring b_offset in
+	UChar.int_of_uchar char
 
-exception InvalidUnicodeChar
-
-let case_map this upper =
-	let dest = Bytes.of_string this.sstring in
-	let a,m = if upper then EvalBytes.Unicase._UPPER,1022 else EvalBytes.Unicase._LOWER,1021 in
-	let f i c =
-		let up = c lsr 6 in
-		if up < m then begin
-			let c' = a.(up).(c land ((1 lsl 6) - 1)) in
-			if c' <> 0 then EvalBytes.write_ui16 dest i c'
-		end
-	in
-	let l = Bytes.length dest in
-	let rec loop i =
-		if i = l then
-			()
-		else begin
-			let c = EvalBytes.read_ui16 dest i in
-			f i c;
-			loop (i + 2)
-		end
-	in
-	loop 0;
-	(create_ucs2 (Bytes.unsafe_to_string dest) this.slength)
+let string_of_char_code i =
+	UTF8.init 1 (fun _ ->  UChar.uchar_of_int i)
 
 let from_char_code i =
-	if i < 0 then
-		raise Not_found
-	else if i < 128 then
-		create_ascii (String.make 1 (char_of_int i))
-	else if i < 0x10000 then begin
-		if i >= 0xD800 && i <= 0xDFFF then raise InvalidUnicodeChar;
-		let b = Bytes.create 2 in
-		write_ui16 b 0 i;
-		create_ucs2 (Bytes.unsafe_to_string b) 1
-	end else if i < 0x110000 then begin
-		let i = i - 0x10000 in
-		let b = Bytes.create 4 in
-		write_ui16 b 0 ((i lsr 10 + 0xD800));
-		write_ui16 b 2 ((i land 1023) + 0xDC00);
-		create_ucs2 (Bytes.unsafe_to_string b) 2
-	end else
-		raise InvalidUnicodeChar
+	create_with_length (string_of_char_code i) 1
 
 let find_substring this sub reverse =
+	let cl_this = this.slength in
+	let cl_sub = sub.slength in
+	let bl_this = String.length this.sstring in
+	let bl_sub = String.length sub.sstring in
 	let s_this = this.sstring in
-	let l_this = String.length s_this in
-	let s_sub = if not this.sascii then maybe_extend_ascii sub else sub.sstring in
-	let l_sub = String.length s_sub in
-	let rec scan i k =
-		if k = l_sub then true
-		else if String.unsafe_get s_this (i + k) = String.unsafe_get s_sub k then scan i (k + 1)
+	let s_sub = sub.sstring in
+	let rec scan b_index b_len =
+		if b_len = bl_sub then true
+		else if String.unsafe_get s_this (b_index + b_len) = String.unsafe_get s_sub b_len then scan b_index (b_len + 1)
 		else false
 	in
-	let inc = if this.sascii then 1 else 2 in
-	if reverse then begin
-		let rec loop i =
-			if i < 0 then raise Not_found;
-			if scan i 0 then
-				i,i + l_sub
+	if not reverse then begin
+		let rec loop c_index b_index =
+			if c_index > cl_this - cl_sub || b_index >= bl_this then raise Not_found;
+			if scan b_index 0 then
+				c_index,b_index,b_index + bl_sub
 			else
-				loop (i - inc)
+				loop (c_index + 1) (UTF8.next s_this b_index)
 		in
 		loop
 	end else begin
-		let rec loop i =
-			if i > l_this - l_sub then raise Not_found;
-			if scan i 0 then
-				i,i + l_sub
+		let rec loop c_index b_index =
+			if b_index < 0 then raise Not_found;
+			if scan b_index 0 then
+				c_index,b_index,b_index + bl_sub
 			else
-				loop (i + inc)
+				loop (c_index - 1) (UTF8.prev s_this b_index)
 		in
 		loop
 	end
 
-let get s =
-	let s' = s.sstring in
-	if s.sascii then s'
-	else utf16_to_utf8 s'
+let case_map this upper =
+	let buf = UTF8.Buf.create 0 in
+	let a,m = if upper then EvalBytes.Unicase._UPPER,1022 else EvalBytes.Unicase._LOWER,1021 in
+	UTF8.iter (fun uc ->
+		let c = UChar.int_of_uchar uc in
+		let up = c lsr 6 in
+		let uc = if up < m then begin
+			let c = a.(up).(c land ((1 lsl 6) - 1)) in
+			if c <> 0 then UChar.uchar_of_int c
+			else uc
+		end else
+			uc
+		in
+		UTF8.Buf.add_char buf uc
+	) this.sstring;
+	create_with_length (UTF8.Buf.contents buf) this.slength
+
+let substr this c_index c_length =
+	if c_length < 0 then
+		create_with_length "" 0
+	else begin
+		let b_offset1 = get_offset this c_index in
+		let b_offset2 = UTF8.move this.sstring b_offset1 c_length in
+		create_with_length (String.sub this.sstring b_offset1 (b_offset2 - b_offset1)) c_length
+	end
+
+module VStringBuffer = struct
+	type t = vstring_buffer
+
+	let create () = {
+		bbuffer = Buffer.create 0;
+		blength = 0;
+	}
+
+	let add_string this s =
+		Buffer.add_string this.bbuffer s.sstring;
+		this.blength <- this.blength + s.slength
+
+	let add_substring this s b_pos b_len c_len =
+		Buffer.add_substring this.bbuffer s.sstring b_pos b_len;
+		this.blength <- this.blength + c_len
+
+	let contents this =
+		create_with_length (Buffer.contents this.bbuffer) this.blength
+end

+ 5 - 21
src/macro/eval/evalValue.ml

@@ -29,42 +29,26 @@ type cmp =
 type vstring = {
 	(* The bytes representation of the string. This is only evaluated if we
 	   need it for something like random access. *)
-	sstring : string;
+	sstring : UTF8.t;
 	(* The length of the string. *)
 	slength : int;
-	(* If true, the string is one-byte-per-character ASCII. Otherwise, it is
-	   encoded as UCS2. *)
-	sascii  : bool;
+	(* The current (character * byte) offsets. *)
+	mutable soffset : (int * int);
 }
 
 type vstring_buffer = {
 	        bbuffer : Buffer.t;
 	mutable blength : int;
-	mutable bascii  : bool;
 }
 
-let extend_ascii s =
-	let length = String.length s in
-	let b = Bytes.make (length lsl 1) '\000' in
-	for i = 0 to length - 1 do
-		Bytes.unsafe_set b (i lsl 1) (String.unsafe_get s i)
-	done;
-	Bytes.unsafe_to_string b
-
 let vstring_equal s1 s2 =
-	if s1.sascii = s2.sascii then
-		s1.sstring = s2.sstring
-	else if not s2.sascii then
-		extend_ascii s1.sstring = s2.sstring
-	else
-		s1.sstring = extend_ascii s2.sstring
+	s1.sstring = s2.sstring
 
 module StringHashtbl = Hashtbl.Make(struct
 	type t = vstring
 	let equal = vstring_equal
 	let hash s =
-		let s = if s.sascii then extend_ascii s.sstring
-		else s.sstring in
+		let s = s.sstring in
 		Hashtbl.hash s
 end)
 

+ 1 - 27
tests/unit/src/unitstd/Unicode.unit.hx

@@ -19,12 +19,7 @@ s.indexOf("a")==0;
 s.lastIndexOf("a")==1;
 s.indexOf("😂")>0;
 s.lastIndexOf("😂")>0;
-s.lastIndexOf("é") == s.length-1;
-s.lastIndexOf("a", s.length) == 1;
-s.lastIndexOf("a", s.length + 9000) == 1;
-s.lastIndexOf("é", s.length) == s.length-1;
-s.lastIndexOf("é", s.length + 9000) == s.length-1;
-
+s.lastIndexOf("é")==s.length-1;
 var s = "abc";
 s.indexOf("éé")<0;
 s.lastIndexOf("éé")<0;
@@ -150,9 +145,6 @@ input.readString(bytes.length - 9,RawNative) == "éあ😂";
 var s = "ée";
 var s1 = s.charAt(1);
 s1 == "e";
-#if eval
-(untyped s.charAt(0).isAscii()) == false;
-#end
 
 var s1 = s.substr(1, 1);
 var s2 = s.substr(1);
@@ -162,14 +154,6 @@ s1 == "e";
 s2 == "e";
 s3 == "e";
 s4 == "e";
-#if eval
-// We currently don't asciify anything we extract from UCS2 strings... not sure if this would
-// be worth it or not.
-(untyped s1.isAscii()) == false;
-(untyped s2.isAscii()) == false;
-(untyped s3.isAscii()) == false;
-(untyped s4.isAscii()) == false;
-#end
 
 var s1 = s.substring(1, 2);
 var s2 = s.substring(1);
@@ -179,12 +163,6 @@ s1 == "e";
 s2 == "e";
 s3 == "e";
 s4 == "e";
-#if eval
-(untyped s1.isAscii()) == false;
-(untyped s2.isAscii()) == false;
-(untyped s3.isAscii()) == false;
-(untyped s4.isAscii()) == false;
-#end
 
 Reflect.compare("ed", "éee".substr(1)) < 0;
 Reflect.compare("éed".substr(1), "éee".substr(1)) < 0;
@@ -303,8 +281,4 @@ test("ab", "abc", "bc");
 test("ab", "😂bc", "bc");
 #end
 
-for (str in ["ä", "あ", "😂", "aä", "aあ", "a😂"]) {
-	eq(str.charAt(0).charCodeAt(0), str.charCodeAt(0));
-}
-
 #end