Browse Source

Better Utf8ToUnicode.

Rika Ichinose 1 year ago
parent
commit
e9579fe2df
1 changed file with 106 additions and 217 deletions
  1. 106 217
      rtl/inc/ustrings.inc

+ 106 - 217
rtl/inc/ustrings.inc

@@ -1805,239 +1805,128 @@ begin
   runerror(217);
 end;
 {$else EXCLUDE_COMPLEX_PROCS}
-  const
-    UNICODE_INVALID=63;
   var
-    InputUTF8: SizeUInt;
-    IBYTE: BYTE;
-    OutputUnicode: SizeUInt;
-    PRECHAR: SizeUInt;
-    TempBYTE: BYTE;
-    CharLen: SizeUint;
-    LookAhead: SizeUInt;
-    UC: SizeUInt;
+    SourcePos,DestPos: SizeUint;
+    UC: int32;
   begin
-    if not assigned(Source) then
-      begin
-        result:=0;
-        exit;
-      end;
-    result:=SizeUInt(-1);
-    InputUTF8:=0;
-    OutputUnicode:=0;
-    PreChar:=0;
-    if Assigned(Dest) Then
+    if not Assigned(Source) then
+      exit(0);
+    SourcePos:=0;
+    DestPos:=0;
+
+    if Assigned(Dest) then
       begin
-        while (OutputUnicode<MaxDestChars) and (InputUTF8<SourceBytes) do
-          begin
-            IBYTE:=byte(Source[InputUTF8]);
-            if (IBYTE and $80) = 0 then
-              begin
-              // One character US-ASCII, convert it to unicode
-              // Commented code to convert LF to CRLF has been removed
-              Dest[OutputUnicode]:=WideChar(IBYTE);
-              inc(OutputUnicode);
-              PreChar:=IBYTE;
-              inc(InputUTF8);
-              end
-            else
-              begin
-                TempByte:=IBYTE;
-                CharLen:=0;
-                while (TempBYTE and $80)<>0 do
-                  begin
-                    TempBYTE:=(TempBYTE shl 1) and $FE;
-                    inc(CharLen);
-                  end;
-                //Test for the "CharLen" conforms UTF-8 string
-                //This means the 10xxxxxx pattern.
-                if SizeUInt(InputUTF8+CharLen-1)>SourceBytes then
+        if SourcePos<SourceBytes then { “repeat until false” + “if C then continue else break” is used instead of “while C” + “continue” for better codegen. }
+          repeat
+            { See generic.inc:Utf8CodePointLen for explanations. Not continuing = invalid or incomplete character. }
+            if DestPos>=MaxDestChars then { Speculate 1 unicodechar. }
+              break;
+            inc(DestPos);
+            UC:=ord(Source[SourcePos]);
+            case uint32(UC) of
+              0..$7F:
+                begin
+                  Dest[DestPos-1]:=unicodechar(UC);
+                  inc(SourcePos);
+                  if SourcePos<SourceBytes then continue else break;
+                end;
+              $C2..$DF:
+                if (SourcePos+1<SourceBytes) and (ord(Source[SourcePos+1]) and $C0=$80) then
                   begin
-                    //Insuficient chars in string to decode
-                    //UTF-8 array. Fallback to single AnsiChar.
-                    CharLen:= 1;
+                    Dest[DestPos-1]:=unicodechar(UC and $1F shl 6 or ord(Source[SourcePos+1]) and $3F);
+                    inc(SourcePos,2);
+                    if SourcePos<SourceBytes then continue else break;
                   end;
-                for LookAhead := 1 to CharLen-1 do
+              $E0..$EF:
+                if (SourcePos+2<SourceBytes) and (ord(Source[SourcePos+1]) and $C0=$80) and (ord(Source[SourcePos+2]) and $C0=$80) then
                   begin
-                    if ((byte(Source[InputUTF8+LookAhead]) and $80)<>$80) or
-                       ((byte(Source[InputUTF8+LookAhead]) and $40)<>$00) then
+                    UC:=UC and $F shl 12 or ord(Source[SourcePos+1]) and $3F shl 6 or ord(Source[SourcePos+2]) and $3F;
+                    if (UC>=$800) and (UC<=$FFFD) and not ((UC>=$D800) and (UC<=$DFFF)) then
                       begin
-                        //Invalid UTF-8 sequence, fallback.
-                        CharLen:= LookAhead;
-                        break;
+                        Dest[DestPos-1]:=unicodechar(UC);
+                        inc(SourcePos,3);
+                        if SourcePos<SourceBytes then continue else break;
                       end;
                   end;
-                UC:=$FFFF;
-                case CharLen of
-                  1:  begin
-                        //Not valid UTF-8 sequence
-                        UC:=UNICODE_INVALID;
-                      end;
-                  2:  begin
-                        //Two bytes UTF, convert it
-                        UC:=(byte(Source[InputUTF8]) and $1F) shl 6;
-                        UC:=UC or (byte(Source[InputUTF8+1]) and $3F);
-                        if UC <= $7F then
-                          begin
-                            //Invalid UTF sequence.
-                            UC:=UNICODE_INVALID;
-                          end;
-                      end;
-                  3:  begin
-                        //Three bytes, convert it to unicode
-                        UC:= (byte(Source[InputUTF8]) and $0F) shl 12;
-                        UC:= UC or ((byte(Source[InputUTF8+1]) and $3F) shl 6);
-                        UC:= UC or ((byte(Source[InputUTF8+2]) and $3F));
-                        if (UC <= $7FF) or (UC >= $FFFE) or ((UC >= $D800) and (UC <= $DFFF)) then
-                          begin
-                            //Invalid UTF-8 sequence
-                            UC:= UNICODE_INVALID;
-                          End;
-                      end;
-                  4:  begin
-                        //Four bytes, convert it to two unicode characters
-                        UC:= (byte(Source[InputUTF8]) and $07) shl 18;
-                        UC:= UC or ((byte(Source[InputUTF8+1]) and $3F) shl 12);
-                        UC:= UC or ((byte(Source[InputUTF8+2]) and $3F) shl 6);
-                        UC:= UC or ((byte(Source[InputUTF8+3]) and $3F));
-                        if (UC < $10000) or (UC > $10FFFF) then
-                          begin
-                            UC:= UNICODE_INVALID;
-                          end
-                        else
-                          begin
-                            { only store pair if room }
-                            dec(UC,$10000);
-                            if (OutputUnicode<MaxDestChars-1) then
-                              begin
-                                Dest[OutputUnicode]:=WideChar(UC shr 10 + $D800);
-                                inc(OutputUnicode);
-                                UC:=(UC and $3ff) + $DC00;
-                              end
-                            else
-                              begin
-                                InputUTF8:= InputUTF8 + CharLen;
-                                { don't store anything }
-                                CharLen:=0;
-                              end;
-                          end;
-                      end;
-                  5,6,7:  begin
-                            //Invalid UTF8 to unicode conversion,
-                            //mask it as invalid UNICODE too.
-                            UC:=UNICODE_INVALID;
-                          end;
-                end;
-                if CharLen > 0 then
+              $F0..$F4:
+                if (SourcePos+3<SourceBytes) and (ord(Source[SourcePos+1]) and $C0=$80) and (ord(Source[SourcePos+2]) and $C0=$80) and (ord(Source[SourcePos+3]) and $C0=$80) then
                   begin
-                    if (UC=UNICODE_INVALID) and Not IgnoreInvalid then
-                      HandleError(231); // Will be converted to EConversionError in sysutils
-                    PreChar:=UC;
-                    Dest[OutputUnicode]:=WideChar(UC);
-                    inc(OutputUnicode);
+                    UC:=UC and $7 shl 18 or ord(Source[SourcePos+1]) and $3F shl 12 or ord(Source[SourcePos+2]) and $3F shl 6 or ord(Source[SourcePos+3]) and $3F-$10000;
+                    if Cardinal(UC)<=$10FFFF-$10000 then
+                      begin
+                        dec(DestPos);
+                        if DestPos+1>=MaxDestChars then { 2 unicodechars. }
+                          break;
+                        Dest[DestPos]:=unicodechar($D800+UC shr 10);
+                        Dest[DestPos+1]:=unicodechar($DC00+UC and $3ff);
+                        inc(SourcePos,4);
+                        inc(DestPos,2);
+                        if SourcePos<SourceBytes then continue else break;
+                      end;
                   end;
-                InputUTF8:= InputUTF8 + CharLen;
-              end;
-          end;
-        Result:=OutputUnicode+1;
+            end;
+            { Invalid or incomplete character. }
+            if not IgnoreInvalid then
+              HandleError(231); // Will be converted to EConversionError in sysutils
+            inc(SourcePos); { Skip first byte. }
+            if ord(Source[SourcePos-1]) and $C0<>$80 then { If first byte is not a continuation byte... }
+              while (SourcePos<SourceBytes) and (ord(Source[SourcePos]) and $C0=$80) do { ..Then skip continuation bytes. }
+                inc(SourcePos);
+            Dest[DestPos-1]:='?';
+            if SourcePos>=SourceBytes then break; { Do not add a condition to the loop, or “continue”s will jump to it instead of the beginning! }
+          until false;
+        if DestPos<MaxDestChars then { Null-terminate... if there is space. Count in result in either case. }
+          Dest[DestPos]:=#0;
       end
     else
-      begin
-        while (InputUTF8<SourceBytes) do
-          begin
-            IBYTE:=byte(Source[InputUTF8]);
-            if (IBYTE and $80) = 0 then
+      { Same as above but without writing Dest. }
+      if SourcePos<SourceBytes then
+        repeat
+          UC:=ord(Source[SourcePos]);
+          inc(DestPos); { Speculate 1 unicodechar. }
+          case uint32(UC) of
+            0..$7F:
               begin
-              // One character US-ASCII, convert it to unicode
-              // Commented code to convert LF to CRLF has been removed
-              inc(OutputUnicode);
-              PreChar:=IBYTE;
-              inc(InputUTF8);
-              end
-            else
-              begin
-                TempByte:=IBYTE;
-                CharLen:=0;
-                while (TempBYTE and $80)<>0 do
-                  begin
-                    TempBYTE:=(TempBYTE shl 1) and $FE;
-                    inc(CharLen);
-                  end;
-                //Test for the "CharLen" conforms UTF-8 string
-                //This means the 10xxxxxx pattern.
-                if SizeUInt(InputUTF8+CharLen-1)>SourceBytes then
-                  begin
-                    //Insuficient chars in string to decode
-                    //UTF-8 array. Fallback to single AnsiChar.
-                    CharLen:= 1;
-                  end;
-                for LookAhead := 1 to CharLen-1 do
-                  begin
-                    if ((byte(Source[InputUTF8+LookAhead]) and $80)<>$80) or
-                       ((byte(Source[InputUTF8+LookAhead]) and $40)<>$00) then
-                      begin
-                        //Invalid UTF-8 sequence, fallback.
-                        CharLen:= LookAhead;
-                        break;
-                      end;
-                  end;
-                UC:=$FFFF;
-                case CharLen of
-                  1:  begin
-                        //Not valid UTF-8 sequence
-                        UC:=UNICODE_INVALID;
-                      end;
-                  2:  begin
-                        //Two bytes UTF, convert it
-                        UC:=(byte(Source[InputUTF8]) and $1F) shl 6;
-                        UC:=UC or (byte(Source[InputUTF8+1]) and $3F);
-                        if UC <= $7F then
-                          begin
-                            //Invalid UTF sequence.
-                            UC:=UNICODE_INVALID;
-                          end;
-                      end;
-                  3:  begin
-                        //Three bytes, convert it to unicode
-                        UC:= (byte(Source[InputUTF8]) and $0F) shl 12;
-                        UC:= UC or ((byte(Source[InputUTF8+1]) and $3F) shl 6);
-                        UC:= UC or ((byte(Source[InputUTF8+2]) and $3F));
-                        If (UC <= $7FF) or (UC >= $FFFE) or ((UC >= $D800) and (UC <= $DFFF)) then
-                          begin
-                            //Invalid UTF-8 sequence
-                            UC:= UNICODE_INVALID;
-                          end;
-                      end;
-                  4:  begin
-                        //Four bytes, convert it to two unicode characters
-                        UC:= (byte(Source[InputUTF8]) and $07) shl 18;
-                        UC:= UC or ((byte(Source[InputUTF8+1]) and $3F) shl 12);
-                        UC:= UC or ((byte(Source[InputUTF8+2]) and $3F) shl 6);
-                        UC:= UC or ((byte(Source[InputUTF8+3]) and $3F));
-                        if (UC < $10000) or (UC > $10FFFF) then
-                          UC:= UNICODE_INVALID
-                        else
-                          { extra character character }
-                          inc(OutputUnicode);
-                      end;
-                  5,6,7:  begin
-                            //Invalid UTF8 to unicode conversion,
-                            //mask it as invalid UNICODE too.
-                            UC:=UNICODE_INVALID;
-                          end;
-                end;
-                if CharLen > 0 then
-                  begin
-                    if (UC=UNICODE_INVALID) and Not IgnoreInvalid then
-                      HandleError(231); // Will be converted to EConversionError in sysutils
-                    PreChar:=UC;
-                    inc(OutputUnicode);
-                  end;
-                InputUTF8:= InputUTF8 + CharLen;
+                inc(SourcePos);
+                if SourcePos<SourceBytes then continue else break;
               end;
+            $C2..$DF:
+              if (SourcePos+1<SourceBytes) and (ord(Source[SourcePos+1]) and $C0=$80) then
+                begin
+                  inc(SourcePos,2);
+                  if SourcePos<SourceBytes then continue else break;
+                end;
+            $E0..$EF:
+              if (SourcePos+2<SourceBytes) and (ord(Source[SourcePos+1]) and $C0=$80) and (ord(Source[SourcePos+2]) and $C0=$80) then
+                begin
+                  UC:=UC and $F shl 12 or ord(Source[SourcePos+1]) and $3F shl 6 or ord(Source[SourcePos+2]) and $3F;
+                  if (UC>=$800) and (UC<=$FFFD) and not ((UC>=$D800) and (UC<=$DFFF)) then
+                    begin
+                      inc(SourcePos,3);
+                      if SourcePos<SourceBytes then continue else break;
+                    end;
+                end;
+            $F0..$F4:
+              if (SourcePos+3<SourceBytes) and (ord(Source[SourcePos+1]) and $C0=$80) and (ord(Source[SourcePos+2]) and $C0=$80) and (ord(Source[SourcePos+3]) and $C0=$80) then
+                begin
+                  UC:=UC and $7 shl 18 or ord(Source[SourcePos+1]) and $3F shl 12 or ord(Source[SourcePos+2]) and $3F shl 6 or ord(Source[SourcePos+3]) and $3F-$10000;
+                  if Cardinal(UC)<=$10FFFF-$10000 then
+                    begin
+                      inc(SourcePos,4);
+                      inc(DestPos); { To 2 unicodechars in total. }
+                      if SourcePos<SourceBytes then continue else break;
+                    end;
+                end;
           end;
-        Result:=OutputUnicode+1;
-      end;
+          if not IgnoreInvalid then
+            HandleError(231);
+          inc(SourcePos);
+          if ord(Source[SourcePos-1]) and $C0<>$80 then
+            while (SourcePos<SourceBytes) and (ord(Source[SourcePos]) and $C0=$80) do
+              inc(SourcePos);
+          if SourcePos>=SourceBytes then break;
+        until false;
+    Result:=DestPos+1 {null terminator, in both branches};
   end;
 {$endif EXCLUDE_COMPLEX_PROCS}