1 year ago · e9579fe2df
--- a/rtl/inc/ustrings.inc
+++ b/rtl/inc/ustrings.inc
@@ -1805,239 +1805,128 @@ begin
 
															   runerror(217);
														
 
															 end;
														
 
															 {$else EXCLUDE_COMPLEX_PROCS}
														
 
															-  const
														
 
															-    UNICODE_INVALID=63;
														
 
															   var
														
 
															-    InputUTF8: SizeUInt;
														
 
															-    IBYTE: BYTE;
														
 
															-    OutputUnicode: SizeUInt;
														
 
															-    PRECHAR: SizeUInt;
														
 
															-    TempBYTE: BYTE;
														
 
															-    CharLen: SizeUint;
														
 
															-    LookAhead: SizeUInt;
														
 
															-    UC: SizeUInt;
														
 
															+    SourcePos,DestPos: SizeUint;
														
 
															+    UC: int32;
														
 
															   begin
														
 
															-    if not assigned(Source) then
														
 
															-      begin
														
 
															-        result:=0;
														
 
															-        exit;
														
 
															-      end;
														
 
															-    result:=SizeUInt(-1);
														
 
															-    InputUTF8:=0;
														
 
															-    OutputUnicode:=0;
														
 
															-    PreChar:=0;
														
 
															-    if Assigned(Dest) Then
														
 
															+    if not Assigned(Source) then
														
 
															+      exit(0);
														
 
															+    SourcePos:=0;
														
 
															+    DestPos:=0;
														
 
															+
														
 
															+    if Assigned(Dest) then
														
 
															       begin
														
 
															-        while (OutputUnicode<MaxDestChars) and (InputUTF8<SourceBytes) do
														
 
															-          begin
														
 
															-            IBYTE:=byte(Source[InputUTF8]);
														
 
															-            if (IBYTE and $80) = 0 then
														
 
															-              begin
														
 
															-              // One character US-ASCII, convert it to unicode
														
 
															-              // Commented code to convert LF to CRLF has been removed
														
 
															-              Dest[OutputUnicode]:=WideChar(IBYTE);
														
 
															-              inc(OutputUnicode);
														
 
															-              PreChar:=IBYTE;
														
 
															-              inc(InputUTF8);
														
 
															-              end
														
 
															-            else
														
 
															-              begin
														
 
															-                TempByte:=IBYTE;
														
 
															-                CharLen:=0;
														
 
															-                while (TempBYTE and $80)<>0 do
														
 
															-                  begin
														
 
															-                    TempBYTE:=(TempBYTE shl 1) and $FE;
														
 
															-                    inc(CharLen);
														
 
															-                  end;
														
 
															-                //Test for the "CharLen" conforms UTF-8 string
														
 
															-                //This means the 10xxxxxx pattern.
														
 
															-                if SizeUInt(InputUTF8+CharLen-1)>SourceBytes then
														
 
															+        if SourcePos<SourceBytes then { “repeat until false” + “if C then continue else break” is used instead of “while C” + “continue” for better codegen. }
														
 
															+          repeat
														
 
															+            { See generic.inc:Utf8CodePointLen for explanations. Not continuing = invalid or incomplete character. }
														
 
															+            if DestPos>=MaxDestChars then { Speculate 1 unicodechar. }
														
 
															+              break;
														
 
															+            inc(DestPos);
														
 
															+            UC:=ord(Source[SourcePos]);
														
 
															+            case uint32(UC) of
														
 
															+              0..$7F:
														
 
															+                begin
														
 
															+                  Dest[DestPos-1]:=unicodechar(UC);
														
 
															+                  inc(SourcePos);
														
 
															+                  if SourcePos<SourceBytes then continue else break;
														
 
															+                end;
														
 
															+              $C2..$DF:
														
 
															+                if (SourcePos+1<SourceBytes) and (ord(Source[SourcePos+1]) and $C0=$80) then
														
 
															                   begin
														
 
															-                    //Insuficient chars in string to decode
														
 
															-                    //UTF-8 array. Fallback to single AnsiChar.
														
 
															-                    CharLen:= 1;
														
 
															+                    Dest[DestPos-1]:=unicodechar(UC and $1F shl 6 or ord(Source[SourcePos+1]) and $3F);
														
 
															+                    inc(SourcePos,2);
														
 
															+                    if SourcePos<SourceBytes then continue else break;
														
 
															                   end;
														
 
															-                for LookAhead := 1 to CharLen-1 do
														
 
															+              $E0..$EF:
														
 
															+                if (SourcePos+2<SourceBytes) and (ord(Source[SourcePos+1]) and $C0=$80) and (ord(Source[SourcePos+2]) and $C0=$80) then
														
 
															                   begin
														
 
															-                    if ((byte(Source[InputUTF8+LookAhead]) and $80)<>$80) or
														
 
															-                       ((byte(Source[InputUTF8+LookAhead]) and $40)<>$00) then
														
 
															+                    UC:=UC and $F shl 12 or ord(Source[SourcePos+1]) and $3F shl 6 or ord(Source[SourcePos+2]) and $3F;
														
 
															+                    if (UC>=$800) and (UC<=$FFFD) and not ((UC>=$D800) and (UC<=$DFFF)) then
														
 
															                       begin
														
 
															-                        //Invalid UTF-8 sequence, fallback.
														
 
															-                        CharLen:= LookAhead;
														
 
															-                        break;
														
 
															+                        Dest[DestPos-1]:=unicodechar(UC);
														
 
															+                        inc(SourcePos,3);
														
 
															+                        if SourcePos<SourceBytes then continue else break;
														
 
															                       end;
														
 
															                   end;
														
 
															-                UC:=$FFFF;
														
 
															-                case CharLen of
														
 
															-                  1:  begin
														
 
															-                        //Not valid UTF-8 sequence
														
 
															-                        UC:=UNICODE_INVALID;
														
 
															-                      end;
														
 
															-                  2:  begin
														
 
															-                        //Two bytes UTF, convert it
														
 
															-                        UC:=(byte(Source[InputUTF8]) and $1F) shl 6;
														
 
															-                        UC:=UC or (byte(Source[InputUTF8+1]) and $3F);
														
 
															-                        if UC <= $7F then
														
 
															-                          begin
														
 
															-                            //Invalid UTF sequence.
														
 
															-                            UC:=UNICODE_INVALID;
														
 
															-                          end;
														
 
															-                      end;
														
 
															-                  3:  begin
														
 
															-                        //Three bytes, convert it to unicode
														
 
															-                        UC:= (byte(Source[InputUTF8]) and $0F) shl 12;
														
 
															-                        UC:= UC or ((byte(Source[InputUTF8+1]) and $3F) shl 6);
														
 
															-                        UC:= UC or ((byte(Source[InputUTF8+2]) and $3F));
														
 
															-                        if (UC <= $7FF) or (UC >= $FFFE) or ((UC >= $D800) and (UC <= $DFFF)) then
														
 
															-                          begin
														
 
															-                            //Invalid UTF-8 sequence
														
 
															-                            UC:= UNICODE_INVALID;
														
 
															-                          End;
														
 
															-                      end;
														
 
															-                  4:  begin
														
 
															-                        //Four bytes, convert it to two unicode characters
														
 
															-                        UC:= (byte(Source[InputUTF8]) and $07) shl 18;
														
 
															-                        UC:= UC or ((byte(Source[InputUTF8+1]) and $3F) shl 12);
														
 
															-                        UC:= UC or ((byte(Source[InputUTF8+2]) and $3F) shl 6);
														
 
															-                        UC:= UC or ((byte(Source[InputUTF8+3]) and $3F));
														
 
															-                        if (UC < $10000) or (UC > $10FFFF) then
														
 
															-                          begin
														
 
															-                            UC:= UNICODE_INVALID;
														
 
															-                          end
														
 
															-                        else
														
 
															-                          begin
														
 
															-                            { only store pair if room }
														
 
															-                            dec(UC,$10000);
														
 
															-                            if (OutputUnicode<MaxDestChars-1) then
														
 
															-                              begin
														
 
															-                                Dest[OutputUnicode]:=WideChar(UC shr 10 + $D800);
														
 
															-                                inc(OutputUnicode);
														
 
															-                                UC:=(UC and $3ff) + $DC00;
														
 
															-                              end
														
 
															-                            else
														
 
															-                              begin
														
 
															-                                InputUTF8:= InputUTF8 + CharLen;
														
 
															-                                { don't store anything }
														
 
															-                                CharLen:=0;
														
 
															-                              end;
														
 
															-                          end;
														
 
															-                      end;
														
 
															-                  5,6,7:  begin
														
 
															-                            //Invalid UTF8 to unicode conversion,
														
 
															-                            //mask it as invalid UNICODE too.
														
 
															-                            UC:=UNICODE_INVALID;
														
 
															-                          end;
														
 
															-                end;
														
 
															-                if CharLen > 0 then
														
 
															+              $F0..$F4:
														
 
															+                if (SourcePos+3<SourceBytes) and (ord(Source[SourcePos+1]) and $C0=$80) and (ord(Source[SourcePos+2]) and $C0=$80) and (ord(Source[SourcePos+3]) and $C0=$80) then
														
 
															                   begin
														
 
															-                    if (UC=UNICODE_INVALID) and Not IgnoreInvalid then
														
 
															-                      HandleError(231); // Will be converted to EConversionError in sysutils
														
 
															-                    PreChar:=UC;
														
 
															-                    Dest[OutputUnicode]:=WideChar(UC);
														
 
															-                    inc(OutputUnicode);
														
 
															+                    UC:=UC and $7 shl 18 or ord(Source[SourcePos+1]) and $3F shl 12 or ord(Source[SourcePos+2]) and $3F shl 6 or ord(Source[SourcePos+3]) and $3F-$10000;
														
 
															+                    if Cardinal(UC)<=$10FFFF-$10000 then
														
 
															+                      begin
														
 
															+                        dec(DestPos);
														
 
															+                        if DestPos+1>=MaxDestChars then { 2 unicodechars. }
														
 
															+                          break;
														
 
															+                        Dest[DestPos]:=unicodechar($D800+UC shr 10);
														
 
															+                        Dest[DestPos+1]:=unicodechar($DC00+UC and $3ff);
														
 
															+                        inc(SourcePos,4);
														
 
															+                        inc(DestPos,2);
														
 
															+                        if SourcePos<SourceBytes then continue else break;
														
 
															+                      end;
														
 
															                   end;
														
 
															-                InputUTF8:= InputUTF8 + CharLen;
														
 
															-              end;
														
 
															-          end;
														
 
															-        Result:=OutputUnicode+1;
														
 
															+            end;
														
 
															+            { Invalid or incomplete character. }
														
 
															+            if not IgnoreInvalid then
														
 
															+              HandleError(231); // Will be converted to EConversionError in sysutils
														
 
															+            inc(SourcePos); { Skip first byte. }
														
 
															+            if ord(Source[SourcePos-1]) and $C0<>$80 then { If first byte is not a continuation byte... }
														
 
															+              while (SourcePos<SourceBytes) and (ord(Source[SourcePos]) and $C0=$80) do { ..Then skip continuation bytes. }
														
 
															+                inc(SourcePos);
														
 
															+            Dest[DestPos-1]:='?';
														
 
															+            if SourcePos>=SourceBytes then break; { Do not add a condition to the loop, or “continue”s will jump to it instead of the beginning! }
														
 
															+          until false;
														
 
															+        if DestPos<MaxDestChars then { Null-terminate... if there is space. Count in result in either case. }
														
 
															+          Dest[DestPos]:=#0;
														
 
															       end
														
 
															     else
														
 
															-      begin
														
 
															-        while (InputUTF8<SourceBytes) do
														
 
															-          begin
														
 
															-            IBYTE:=byte(Source[InputUTF8]);
														
 
															-            if (IBYTE and $80) = 0 then
														
 
															+      { Same as above but without writing Dest. }
														
 
															+      if SourcePos<SourceBytes then
														
 
															+        repeat
														
 
															+          UC:=ord(Source[SourcePos]);
														
 
															+          inc(DestPos); { Speculate 1 unicodechar. }
														
 
															+          case uint32(UC) of
														
 
															+            0..$7F:
														
 
															               begin
														
 
															-              // One character US-ASCII, convert it to unicode
														
 
															-              // Commented code to convert LF to CRLF has been removed
														
 
															-              inc(OutputUnicode);
														
 
															-              PreChar:=IBYTE;
														
 
															-              inc(InputUTF8);
														
 
															-              end
														
 
															-            else
														
 
															-              begin
														
 
															-                TempByte:=IBYTE;
														
 
															-                CharLen:=0;
														
 
															-                while (TempBYTE and $80)<>0 do
														
 
															-                  begin
														
 
															-                    TempBYTE:=(TempBYTE shl 1) and $FE;
														
 
															-                    inc(CharLen);
														
 
															-                  end;
														
 
															-                //Test for the "CharLen" conforms UTF-8 string
														
 
															-                //This means the 10xxxxxx pattern.
														
 
															-                if SizeUInt(InputUTF8+CharLen-1)>SourceBytes then
														
 
															-                  begin
														
 
															-                    //Insuficient chars in string to decode
														
 
															-                    //UTF-8 array. Fallback to single AnsiChar.
														
 
															-                    CharLen:= 1;
														
 
															-                  end;
														
 
															-                for LookAhead := 1 to CharLen-1 do
														
 
															-                  begin
														
 
															-                    if ((byte(Source[InputUTF8+LookAhead]) and $80)<>$80) or
														
 
															-                       ((byte(Source[InputUTF8+LookAhead]) and $40)<>$00) then
														
 
															-                      begin
														
 
															-                        //Invalid UTF-8 sequence, fallback.
														
 
															-                        CharLen:= LookAhead;
														
 
															-                        break;
														
 
															-                      end;
														
 
															-                  end;
														
 
															-                UC:=$FFFF;
														
 
															-                case CharLen of
														
 
															-                  1:  begin
														
 
															-                        //Not valid UTF-8 sequence
														
 
															-                        UC:=UNICODE_INVALID;
														
 
															-                      end;
														
 
															-                  2:  begin
														
 
															-                        //Two bytes UTF, convert it
														
 
															-                        UC:=(byte(Source[InputUTF8]) and $1F) shl 6;
														
 
															-                        UC:=UC or (byte(Source[InputUTF8+1]) and $3F);
														
 
															-                        if UC <= $7F then
														
 
															-                          begin
														
 
															-                            //Invalid UTF sequence.
														
 
															-                            UC:=UNICODE_INVALID;
														
 
															-                          end;
														
 
															-                      end;
														
 
															-                  3:  begin
														
 
															-                        //Three bytes, convert it to unicode
														
 
															-                        UC:= (byte(Source[InputUTF8]) and $0F) shl 12;
														
 
															-                        UC:= UC or ((byte(Source[InputUTF8+1]) and $3F) shl 6);
														
 
															-                        UC:= UC or ((byte(Source[InputUTF8+2]) and $3F));
														
 
															-                        If (UC <= $7FF) or (UC >= $FFFE) or ((UC >= $D800) and (UC <= $DFFF)) then
														
 
															-                          begin
														
 
															-                            //Invalid UTF-8 sequence
														
 
															-                            UC:= UNICODE_INVALID;
														
 
															-                          end;
														
 
															-                      end;
														
 
															-                  4:  begin
														
 
															-                        //Four bytes, convert it to two unicode characters
														
 
															-                        UC:= (byte(Source[InputUTF8]) and $07) shl 18;
														
 
															-                        UC:= UC or ((byte(Source[InputUTF8+1]) and $3F) shl 12);
														
 
															-                        UC:= UC or ((byte(Source[InputUTF8+2]) and $3F) shl 6);
														
 
															-                        UC:= UC or ((byte(Source[InputUTF8+3]) and $3F));
														
 
															-                        if (UC < $10000) or (UC > $10FFFF) then
														
 
															-                          UC:= UNICODE_INVALID
														
 
															-                        else
														
 
															-                          { extra character character }
														
 
															-                          inc(OutputUnicode);
														
 
															-                      end;
														
 
															-                  5,6,7:  begin
														
 
															-                            //Invalid UTF8 to unicode conversion,
														
 
															-                            //mask it as invalid UNICODE too.
														
 
															-                            UC:=UNICODE_INVALID;
														
 
															-                          end;
														
 
															-                end;
														
 
															-                if CharLen > 0 then
														
 
															-                  begin
														
 
															-                    if (UC=UNICODE_INVALID) and Not IgnoreInvalid then
														
 
															-                      HandleError(231); // Will be converted to EConversionError in sysutils
														
 
															-                    PreChar:=UC;
														
 
															-                    inc(OutputUnicode);
														
 
															-                  end;
														
 
															-                InputUTF8:= InputUTF8 + CharLen;
														
 
															+                inc(SourcePos);
														
 
															+                if SourcePos<SourceBytes then continue else break;
														
 
															               end;
														
 
															+            $C2..$DF:
														
 
															+              if (SourcePos+1<SourceBytes) and (ord(Source[SourcePos+1]) and $C0=$80) then
														
 
															+                begin
														
 
															+                  inc(SourcePos,2);
														
 
															+                  if SourcePos<SourceBytes then continue else break;
														
 
															+                end;
														
 
															+            $E0..$EF:
														
 
															+              if (SourcePos+2<SourceBytes) and (ord(Source[SourcePos+1]) and $C0=$80) and (ord(Source[SourcePos+2]) and $C0=$80) then
														
 
															+                begin
														
 
															+                  UC:=UC and $F shl 12 or ord(Source[SourcePos+1]) and $3F shl 6 or ord(Source[SourcePos+2]) and $3F;
														
 
															+                  if (UC>=$800) and (UC<=$FFFD) and not ((UC>=$D800) and (UC<=$DFFF)) then
														
 
															+                    begin
														
 
															+                      inc(SourcePos,3);
														
 
															+                      if SourcePos<SourceBytes then continue else break;
														
 
															+                    end;
														
 
															+                end;
														
 
															+            $F0..$F4:
														
 
															+              if (SourcePos+3<SourceBytes) and (ord(Source[SourcePos+1]) and $C0=$80) and (ord(Source[SourcePos+2]) and $C0=$80) and (ord(Source[SourcePos+3]) and $C0=$80) then
														
 
															+                begin
														
 
															+                  UC:=UC and $7 shl 18 or ord(Source[SourcePos+1]) and $3F shl 12 or ord(Source[SourcePos+2]) and $3F shl 6 or ord(Source[SourcePos+3]) and $3F-$10000;
														
 
															+                  if Cardinal(UC)<=$10FFFF-$10000 then
														
 
															+                    begin
														
 
															+                      inc(SourcePos,4);
														
 
															+                      inc(DestPos); { To 2 unicodechars in total. }
														
 
															+                      if SourcePos<SourceBytes then continue else break;
														
 
															+                    end;
														
 
															+                end;
														
 
															           end;
														
 
															-        Result:=OutputUnicode+1;
														
 
															-      end;
														
 
															+          if not IgnoreInvalid then
														
 
															+            HandleError(231);
														
 
															+          inc(SourcePos);
														
 
															+          if ord(Source[SourcePos-1]) and $C0<>$80 then
														
 
															+            while (SourcePos<SourceBytes) and (ord(Source[SourcePos]) and $C0=$80) do
														
 
															+              inc(SourcePos);
														
 
															+          if SourcePos>=SourceBytes then break;
														
 
															+        until false;
														
 
															+    Result:=DestPos+1 {null terminator, in both branches};
														
 
															   end;
														
 
															 {$endif EXCLUDE_COMPLEX_PROCS}