Browse Source

* Fix bug ID #38008: allow the UTF-8 to Unicode conversion to either raise an error on invalid input or ignore it

git-svn-id: trunk@47391 -
michael 4 years ago
parent
commit
257ef24a1e
2 changed files with 26 additions and 66 deletions
  1. 2 1
      rtl/inc/ustringh.inc
  2. 24 65
      rtl/inc/ustrings.inc

+ 2 - 1
rtl/inc/ustringh.inc

@@ -134,7 +134,8 @@ var
 function UnicodeToUtf8(Dest: PChar; Source: PUnicodeChar; MaxBytes: SizeInt): SizeInt;{$ifdef SYSTEMINLINE}inline;{$endif}
 function UnicodeToUtf8(Dest: PChar; MaxDestBytes: SizeUInt; Source: PUnicodeChar; SourceChars: SizeUInt): SizeUInt;
 function Utf8ToUnicode(Dest: PUnicodeChar; Source: PChar; MaxChars: SizeInt): SizeInt;{$ifdef SYSTEMINLINE}inline;{$endif}
-function Utf8ToUnicode(Dest: PUnicodeChar; MaxDestChars: SizeUInt; Source: PChar; SourceBytes: SizeUInt): SizeUInt;
+function Utf8ToUnicode(Dest: PUnicodeChar; MaxDestChars: SizeUInt; Source: PChar; SourceBytes: SizeUInt): SizeUInt;{$ifdef SYSTEMINLINE}inline;{$endif}
+function Utf8ToUnicode(Dest: PUnicodeChar; MaxDestChars: SizeUInt; Source: PChar; SourceBytes: SizeUInt; IgnoreInvalid : Boolean): SizeUInt;
 function UTF8Encode(const s : RawByteString) : RawByteString; inline;
 function UTF8Encode(const s : UnicodeString) : RawByteString;
 function UTF8Decode(const s : RawByteString): UnicodeString;

+ 24 - 65
rtl/inc/ustrings.inc

@@ -1792,13 +1792,20 @@ end;
 function Utf8ToUnicode(Dest: PUnicodeChar; Source: PChar; MaxChars: SizeInt): SizeInt;{$ifdef SYSTEMINLINE}inline;{$endif}
   begin
     if assigned(Source) then
-      Result:=Utf8ToUnicode(Dest,MaxChars,Source,length(Source))
+      Result:=Utf8ToUnicode(Dest,MaxChars,Source,length(Source),True)
     else
       Result:=0;
   end;
 
 
-function UTF8ToUnicode(Dest: PUnicodeChar; MaxDestChars: SizeUInt; Source: PChar; SourceBytes: SizeUInt): SizeUInt;
+function UTF8ToUnicode(Dest: PUnicodeChar; MaxDestChars: SizeUInt; Source: PChar; SourceBytes: SizeUInt): SizeUInt;{$ifdef SYSTEMINLINE}inline;{$endif}
+
+begin
+  Result:=Utf8ToUnicode(Dest,MaxDestChars,Source,SourceBytes,True);
+end;
+
+function Utf8ToUnicode(Dest: PUnicodeChar; MaxDestChars: SizeUInt; Source: PChar; SourceBytes: SizeUInt; IgnoreInvalid : Boolean): SizeUInt;
+
 {$ifdef EXCLUDE_COMPLEX_PROCS}
 begin
   runerror(217);
@@ -1832,44 +1839,12 @@ end;
             IBYTE:=byte(Source[InputUTF8]);
             if (IBYTE and $80) = 0 then
               begin
-                //One character US-ASCII, convert it to unicode
-(*
-                if IBYTE = 10 then
-                  begin
-                    If (PreChar<>13) and FALSE then
-                      begin
-                        //Expand to crlf, conform UTF-8.
-                        //This procedure will break the memory alocation by
-                        //FPC for the widestring, so never use it. Condition never true due the "and FALSE".
-                        if OutputUnicode+1<MaxDestChars then
-                          begin
-                            Dest[OutputUnicode]:=WideChar(13);
-                            inc(OutputUnicode);
-                            Dest[OutputUnicode]:=WideChar(10);
-                            inc(OutputUnicode);
-                            PreChar:=10;
-                          end
-                        else
-                          begin
-                            Dest[OutputUnicode]:=WideChar(13);
-                            inc(OutputUnicode);
-                          end;
-                      end
-                    else
-                      begin
-                        Dest[OutputUnicode]:=WideChar(IBYTE);
-                        inc(OutputUnicode);
-                        PreChar:=IBYTE;
-                      end;
-                  end
-                else
-*)
-                  begin
-                    Dest[OutputUnicode]:=WideChar(IBYTE);
-                    inc(OutputUnicode);
-                    PreChar:=IBYTE;
-                  end;
-                inc(InputUTF8);
+              // One character US-ASCII, convert it to unicode
+              // Commented code to convert LF to CRLF has been removed
+              Dest[OutputUnicode]:=WideChar(IBYTE);
+              inc(OutputUnicode);
+              PreChar:=IBYTE;
+              inc(InputUTF8);
               end
             else
               begin
@@ -1961,6 +1936,8 @@ end;
                 end;
                 if CharLen > 0 then
                   begin
+                    if (UC=UNICODE_INVALID) and Not IgnoreInvalid then
+                      HandleError(231); // Will be converted to EConversionError in sysutils
                     PreChar:=UC;
                     Dest[OutputUnicode]:=WideChar(UC);
                     inc(OutputUnicode);
@@ -1977,31 +1954,11 @@ end;
             IBYTE:=byte(Source[InputUTF8]);
             if (IBYTE and $80) = 0 then
               begin
-                //One character US-ASCII, convert it to unicode
-(*
-                if IBYTE = 10 then
-                  begin
-                    if (PreChar<>13) and FALSE then
-                      begin
-                        //Expand to crlf, conform UTF-8.
-                        //This procedure will break the memory alocation by
-                        //FPC for the widestring, so never use it. Condition never true due the "and FALSE".
-                        inc(OutputUnicode,2);
-                        PreChar:=10;
-                      end
-                    else
-                      begin
-                        inc(OutputUnicode);
-                        PreChar:=IBYTE;
-                      end;
-                  end
-                else
-*)
-                  begin
-                    inc(OutputUnicode);
-                    PreChar:=IBYTE;
-                  end;
-                inc(InputUTF8);
+              // One character US-ASCII, convert it to unicode
+              // Commented code to convert LF to CRLF has been removed
+              inc(OutputUnicode);
+              PreChar:=IBYTE;
+              inc(InputUTF8);
               end
             else
               begin
@@ -2077,6 +2034,8 @@ end;
                 end;
                 if CharLen > 0 then
                   begin
+                    if (UC=UNICODE_INVALID) and Not IgnoreInvalid then
+                      HandleError(231); // Will be converted to EConversionError in sysutils
                     PreChar:=UC;
                     inc(OutputUnicode);
                   end;