Bläddra i källkod

String interpolation vs utf8

Rudy Ges 9 månader sedan
förälder
incheckning
cc33d514d1

+ 142 - 57
src/context/formatString.ml

@@ -1,84 +1,168 @@
+open Extlib_leftovers
 open Globals
 open Ast
 
 let format_string defines s p process_expr =
+	let len = String.length s in
+	let get_next i =
+		if i >= len then raise End_of_file else
+		(UTF8.look s i, UTF8.next s i)
+	in
+
+	let read_char = ref 0 in
+	let char_len = ref 0 in
+
+	let get_next_char i =
+		let (chr, next) = try get_next i
+			with Invalid_argument _ ->
+				raise End_of_file
+		in
+
+		try
+			let c = UCharExt.char_of chr in
+			incr read_char;
+			c, (fun buf ->
+				incr char_len;
+				UTF8.Buf.add_char buf chr
+			), next
+		with UCharExt.Out_of_range ->
+			let get i =
+				let ch = String.unsafe_get s i in
+				(ch, int_of_char ch)
+			in
+			let (ch, c) = get !read_char in
+
+			let buf = Buffer.create 0 in
+			Common.utf16_add buf c;
+			let len = Buffer.length buf in
+
+			read_char := !read_char + len;
+
+			ch, (fun buf ->
+				(* UTF16 handling *)
+				if c >= 0x80 && c < 0x800 then begin
+					let b = Buffer.create 0 in
+					let add c = Buffer.add_char b (char_of_int (c land 0xFF)) in
+					let c' = c lor (snd (get (i + 1)) lsl 8) in
+					add c';
+					add (c' lsr 8);
+
+					let s' = Buffer.contents b in
+
+					(* ok but why? *)
+					if c' lsr 8 < 0x80 then char_len := !char_len + 2
+					else if c' < 0xDFFF then incr char_len;
+
+					UTF8.Buf.add_string buf s'
+				end else
+					die "" __LOC__;
+			), i+len
+	in
+
+	let buf = UTF8.Buf.create len in
 	let e = ref None in
 	let pmin = ref p.pmin in
 	let min = ref (p.pmin + 1) in
-	let add_expr (enext,p) len =
-		min := !min + len;
+
+	let add_expr (enext,p) =
+		min := !min + !char_len;
+		char_len := 0;
 		let enext = process_expr enext p in
 		match !e with
 		| None -> e := Some enext
 		| Some prev ->
 			e := Some (EBinop (OpAdd,prev,enext),punion (pos prev) p)
 	in
-	let add enext len =
-		let p = { p with pmin = !min; pmax = !min + len } in
-		add_expr (enext,p) len
+
+	let add enext =
+		let p = { p with pmin = !min; pmax = !min + !char_len } in
+		add_expr (enext,p)
 	in
-	let add_sub start pos =
-		let len = pos - start in
-		if len > 0 || !e = None then add (EConst (String (String.sub s start len,SDoubleQuotes))) len
+
+	let add_sub () =
+		let s = UTF8.Buf.contents buf in
+		UTF8.Buf.clear buf;
+		if !char_len > 0 || !e = None then add (EConst (String (s,SDoubleQuotes)))
 	in
-	let len = String.length s in
-	let rec parse start pos =
-		if pos = len then add_sub start pos else
-		let c = String.unsafe_get s pos in
-		let pos = pos + 1 in
-		if c = '\'' then begin
-			incr pmin;
-			incr min;
-		end;
-		if c <> '$' || pos = len then parse start pos else
-		match String.unsafe_get s pos with
-		| '$' ->
-			(* double $ *)
-			add_sub start pos;
-			parse (pos + 1) (pos + 1)
-		| '{' ->
-			parse_group start pos '{' '}' "brace"
-		| 'a'..'z' | 'A'..'Z' | '_' ->
-			add_sub start (pos - 1);
-			incr min;
-			let rec loop i =
-				if i = len then i else
-				let c = String.unsafe_get s i in
+
+	let rec parse pos' =
+		try begin
+			let (c, store', pos) = get_next_char pos' in
+
+			if c = '\'' then begin
+				incr pmin;
+				incr min;
+			end;
+
+			if c <> '$' || pos >= len then begin
+				store' buf;
+				parse pos
+			end else
+				let (c, store, pos) = get_next_char pos in
 				match c with
-				| 'a'..'z' | 'A'..'Z' | '0'..'9' | '_' -> loop (i+1)
-				| _ -> i
-			in
-			let iend = loop (pos + 1) in
-			let len = iend - pos in
-			add (EConst (Ident (String.sub s pos len))) len;
-			parse (pos + len) (pos + len)
-		| _ ->
-			(* keep as-it *)
-			parse start pos
-	and parse_group start pos gopen gclose gname =
-		add_sub start (pos - 1);
+				| '$' ->
+					(* double $ *)
+					store buf;
+					add_sub ();
+					parse pos
+				| '{' ->
+					add_sub ();
+					parse_group pos' pos '{' '}' "brace"
+				| 'a'..'z' | 'A'..'Z' | '_' ->
+					add_sub ();
+					incr min;
+					let buf = UTF8.Buf.create len in
+					store buf;
+					let rec loop i =
+						if i = len then i else
+						let (c,store,next) = get_next_char i in
+
+						match c with
+						| 'a'..'z' | 'A'..'Z' | '0'..'9' | '_' ->
+							store buf;
+							loop next
+						| _ -> i
+					in
+					let iend = loop pos in
+					let id = UTF8.Buf.contents buf in
+					add (EConst (Ident id));
+					parse iend
+				| _ ->
+					(* keep as-is *)
+					store' buf;
+					store buf;
+					parse pos
+		end with End_of_file -> add_sub ()
+
+	and parse_group prev pos gopen gclose gname =
+		let buf = UTF8.Buf.create len in
 		let rec loop groups i =
 			if i = len then
 				match groups with
 				| [] -> die "" __LOC__
 				| g :: _ -> Error.raise_typing_error ("Unclosed " ^ gname) { p with pmin = !pmin + g + 1; pmax = !pmin + g + 2 }
 			else
-				let c = String.unsafe_get s i in
-				if c = gopen then
-					loop (i :: groups) (i + 1)
-				else if c = gclose then begin
+				let (c, store, pos) = get_next_char i in
+				if c = gopen then begin
+					store buf;
+					loop (i :: groups) pos
+				end else if c = gclose then begin
 					let groups = List.tl groups in
-					if groups = [] then i else loop groups (i + 1)
-				end else
-					loop groups (i + 1)
+					if groups = [] then pos else begin
+						store buf;
+						loop groups pos
+					end
+				end else begin
+					store buf;
+					loop groups pos
+				end
 		in
-		let send = loop [pos] (pos + 1) in
-		let slen = send - pos - 1 in
-		let scode = String.sub s (pos + 1) slen in
+		let send = loop [prev] pos in
+		let scode = UTF8.Buf.contents buf in
 		min := !min + 2;
 		begin
 			let e =
-				let ep = { p with pmin = !pmin + pos + 2; pmax = !pmin + send + 1 } in
+				let ep = { p with pmin = !pmin + pos + 2; pmax = !pmin + send } in
 				let error msg pos =
 					if Lexer.string_is_whitespace scode then Error.raise_typing_error "Expression cannot be empty" ep
 					else Error.raise_typing_error msg pos
@@ -87,12 +171,13 @@ let format_string defines s p process_expr =
 					| ParseSuccess(data,_,_) -> data
 					| ParseError(_,(msg,p),_) -> error (Parser.error_msg msg) p
 			in
-			add_expr e slen
+			add_expr e
 		end;
 		min := !min + 1;
-		parse (send + 1) (send + 1)
+		parse send
 	in
-	parse 0 0;
+
+	parse 0;
 	match !e with
 	| None -> die "" __LOC__
 	| Some e -> e

+ 14 - 0
tests/display/src/cases/VsHaxeIssue648.hx

@@ -0,0 +1,14 @@
+package cases;
+
+class VsHaxeIssue648 extends DisplayTestCase {
+	/**
+		trace('Jeremy in $ci{-1-}ty');
+		trace('Jérémy in $ci{-2-}ty');
+	**/
+	@:funcCode function test() {
+		var diag = diagnostics().filter(d -> d.kind == DiagnosticKind.DKUnresolvedIdentifier);
+		eq(2, diag.length);
+		eq(diag[0].range.start.character, diag[1].range.start.character);
+		eq(diag[0].range.end.character, diag[1].range.end.character);
+	}
+}

+ 13 - 0
tests/misc/projects/VshaxeIssue648/Main.hx

@@ -0,0 +1,13 @@
+function main() {
+	#if nofail
+	var test = "test";
+	#end
+	trace('Jeremy $test');
+	trace('Jérémy $test');
+	trace('名 字 $test');
+	trace('zя���� $test abcdefghijk');
+	trace('���� $test abcdefghijk');
+	trace('zя $test abcdefghijk');
+	trace('😀 😀 $test abcdefghijk');
+	trace('😀 😀 zя���� $test abcdefghijk');
+}

+ 4 - 0
tests/misc/projects/VshaxeIssue648/compile-fail.hxml

@@ -0,0 +1,4 @@
+--main Main
+--interp
+-D message.reporting=pretty
+-D message.no-color

+ 64 - 0
tests/misc/projects/VshaxeIssue648/compile-fail.hxml.stderr

@@ -0,0 +1,64 @@
+[ERROR] Main.hx:5: characters 17-21
+
+  5 |  trace('Jeremy $test');
+    |                 ^^^^
+    | Unknown identifier : test
+
+        | For function argument 'v'
+
+[ERROR] Main.hx:6: characters 17-21
+
+  6 |  trace('Jérémy $test');
+    |                 ^^^^
+    | Unknown identifier : test
+
+        | For function argument 'v'
+
+[ERROR] Main.hx:7: characters 16-20
+
+  7 |  trace('名 字 $test');
+    |                ^^^^
+    | Unknown identifier : test
+
+        | For function argument 'v'
+
+[ERROR] Main.hx:8: characters 17-21
+
+  8 |  trace('zя���� $test abcdefghijk');
+    |                 ^^^^
+    | Unknown identifier : test
+
+        | For function argument 'v'
+
+[ERROR] Main.hx:9: characters 15-19
+
+  9 |  trace('���� $test abcdefghijk');
+    |               ^^^^
+    | Unknown identifier : test
+
+        | For function argument 'v'
+
+[ERROR] Main.hx:10: characters 13-17
+
+ 10 |  trace('zя $test abcdefghijk');
+    |             ^^^^
+    | Unknown identifier : test
+
+        | For function argument 'v'
+
+[ERROR] Main.hx:11: characters 16-20
+
+ 11 |  trace('😀 😀 $test abcdefghijk');
+    |                ^^^^
+    | Unknown identifier : test
+
+        | For function argument 'v'
+
+[ERROR] Main.hx:12: characters 23-27
+
+ 12 |  trace('😀 😀 zя���� $test abcdefghijk');
+    |                       ^^^^
+    | Unknown identifier : test
+
+        | For function argument 'v'
+

+ 5 - 0
tests/misc/projects/VshaxeIssue648/compile.hxml

@@ -0,0 +1,5 @@
+--main Main
+--interp
+-D message.reporting=pretty
+-D message.no-color
+-D nofail

+ 8 - 0
tests/misc/projects/VshaxeIssue648/compile.hxml.stdout

@@ -0,0 +1,8 @@
+Main.hx:5: Jeremy test
+Main.hx:6: Jérémy test
+Main.hx:7: 名 字 test
+Main.hx:8: zя���� test abcdefghijk
+Main.hx:9: ���� test abcdefghijk
+Main.hx:10: zя test abcdefghijk
+Main.hx:11: 😀 😀 test abcdefghijk
+Main.hx:12: 😀 😀 zя���� test abcdefghijk