소스 검색

+ support for arbitrary encodings in readstr/writestr
o set the code page of the temporary "text" file to utf-8 for writestr with
unicodestring/widestring as destination, so that no data loss can occur
(+ properly deal with cases whereby part of an utf-8 character is
written to the textbuf in this case)
o explicitly pass the code page of the destination ansistring for writestr
with ansistring as destination and set it for the temporary "text" file
o set the code page of the text file for readstr

git-svn-id: trunk@26317 -

Jonas Maebe 11 년 전
부모
커밋
d2b8275b99
5개의 변경된 파일162개의 추가작업 그리고 30개의 파일을 삭제
  1. 1 0
      .gitattributes
  2. 5 0
      compiler/ninl.pas
  3. 1 1
      rtl/inc/compproc.inc
  4. 122 29
      rtl/inc/text.inc
  5. 33 0
      tests/test/twrstr9.pp

+ 1 - 0
.gitattributes

@@ -12041,6 +12041,7 @@ tests/test/twrstr5.pp svneol=native#text/plain
 tests/test/twrstr6.pp svneol=native#text/plain
 tests/test/twrstr7.pp svneol=native#text/plain
 tests/test/twrstr8.pp svneol=native#text/plain
+tests/test/twrstr9.pp svneol=native#text/plain
 tests/test/uabstrcl.pp svneol=native#text/plain
 tests/test/uchlp12.pp svneol=native#text/pascal
 tests/test/uchlp18.pp svneol=native#text/pascal

+ 5 - 0
compiler/ninl.pas

@@ -1261,6 +1261,11 @@ implementation
             { parameter chain                                         }
             left:=filepara.right;
             filepara.right:=ccallparanode.create(ctemprefnode.create(filetemp),nil);
+            { in case of a writestr() to an ansistring, also pass the string's
+              code page }
+            if not do_read and
+               is_ansistring(filepara.left.resultdef) then
+              filepara:=ccallparanode.create(genintconstnode(tstringdef(filepara.left.resultdef).encoding),filepara);
             { pass the temp text file and the source/destination string to the
               setup routine, which will store the string's address in the
               textrec }

+ 1 - 1
rtl/inc/compproc.inc

@@ -426,7 +426,7 @@ Procedure fpc_Write_Text_WideChar(Len : Longint;var t : Text;c : WideChar); comp
   can appear inside the other arguments of writerstr }
 procedure fpc_SetupWriteStr_Shortstr(var ReadWriteStrText: text; var s: shortstring); compilerproc;
 {$ifdef FPC_HAS_FEATURE_ANSISTRINGS}
-procedure fpc_SetupWriteStr_Ansistr(var ReadWriteStrText: text; var s: ansistring); compilerproc;
+procedure fpc_SetupWriteStr_Ansistr(var ReadWriteStrText: text; var s: ansistring; cp: TSystemCodePage); compilerproc;
 {$endif FPC_HAS_FEATURE_ANSISTRINGS}
 {$ifdef FPC_HAS_FEATURE_WIDESTRINGS}
 procedure fpc_SetupWriteStr_Unicodestr(var ReadWriteStrText: text; var s: unicodestring); compilerproc;

+ 122 - 29
rtl/inc/text.inc

@@ -2209,25 +2209,104 @@ end;
 
 
 {$ifdef FPC_HAS_FEATURE_WIDESTRINGS}
-procedure WriteStrUnicode(var t: textrec);
+function UTF8CodePointLength(firstbyte: byte): SizeInt;
+var
+  firstzerobit: SizeInt;
+begin
+  result:=1;
+  { bsr searches for the leftmost 1 bit. We are interested in the
+    leftmost 0 bit, so first invert the value
+  }
+  firstzerobit:=BsrByte(not(firstbyte));
+  { if there is no zero bit or the first zero bit is the rightmost bit
+    (bit 0), this is an invalid UTF-8 byte ($ff cannot appear in an
+    UTF-8-encoded string, and in the worst case bit 1 has to be zero)
+  }
+  if (firstzerobit=0) or (firstzerobit=255)  then
+    exit;
+  { the number of bytes belonging to this code point is
+    7-(pos first 0-bit).
+  }
+  result:=7-firstzerobit;
+end;
+
+
+function EndOfLastCompleteUTF8CodePoint(var t: textrec): SizeInt;
+var
+  i, lenfound, codepointlen: SizeInt;
+  b: byte;
+begin
+  lenfound:=0;
+  for i:=t.bufpos-1 downto 0 do
+    begin
+      b:=byte(t.bufptr^[i]);
+      if b<=127 then
+        begin
+          if lenfound = 0 then
+            { valid simple code point }
+            result:=i+1
+          else
+            { valid simple code point followed by a bunch of invalid data ->
+              handle everything since it can't become valid by adding more
+              bytes }
+            result:=t.bufpos;
+          exit;
+        end;
+      { start of a complex character }
+      if (b and %11000000)<>0 then
+        begin
+          codepointlen:=UTF8CodePointLength(b);
+          { we did not yet get all bytes of the last code point -> handle
+            everything until the start of this character }
+          if codepointlen>lenfound+1 then
+            result:=i
+          { the last code point is invalid -> handle everything since it can't
+            become valid by adding more bytes; in case it's complete, we also
+            handle everything, of course}
+          else
+            result:=t.bufpos;
+          exit;
+        end;
+      inc(lenfound);
+    end;
+  { all invalid data, or the buffer is too small to be able to deal with the
+    complete utf8char -> nothing else to do but to handle the entire buffer }
+  result:=t.bufpos;
+end;
+
+
+procedure WriteStrUnicodeIntern(var t: textrec; flush: boolean);
 var
-  temp: ansistring;
+  temp: unicodestring;
   str: punicodestring;
+  validend: SizeInt;
 begin
   if (t.bufpos=0) then
     exit;
   str:=punicodestring(@t.userdata[TempWriteStrDestIndex]);
-  setlength(temp,t.bufpos);
-  move(t.bufptr^,temp[1],t.bufpos);
+  if not flush then
+    validend:=EndOfLastCompleteUTF8CodePoint(t)
+  else
+    validend:=t.bufpos;
+  widestringmanager.Ansi2UnicodeMoveProc(@t.bufptr^[0],CP_UTF8,temp,validend);
   str^:=str^+temp;
-  t.bufpos:=0;
+  dec(t.bufpos,validend);
+  { move remainder to the start }
+  if t.bufpos<>0 then
+    move(t.bufptr^[validend],t.bufptr^[0],t.bufpos);
+end;
+
+
+procedure WriteStrUnicode(var t: textrec);
+begin
+  WriteStrUnicodeIntern(t,false);
 end;
 
 
 procedure WriteStrUnicodeFlush(var t: textrec);
 begin
   { see comment in WriteStrShortFlush }
-  WriteStrUnicode(t);
+  WriteStrUnicodeIntern(t,true);
   punicodestring(ppointer(@t.userdata[StrPtrIndex])^)^:=
     punicodestring(@t.userdata[TempWriteStrDestIndex])^;
   { free memory/finalize temp }
@@ -2236,25 +2315,38 @@ end;
 {$endif FPC_HAS_FEATURE_WIDESTRINGS}
 
 {$ifndef FPC_WIDESTRING_EQUAL_UNICODESTRING}
-procedure WriteStrWide(var t: textrec);
+procedure WriteStrWideIntern(var t: textrec; flush: boolean);
 var
-  temp: ansistring;
+  temp: unicodestring;
   str: pwidestring;
+  validend: SizeInt;
 begin
   if (t.bufpos=0) then
     exit;
   str:=pwidestring(@t.userdata[TempWriteStrDestIndex]);
-  setlength(temp,t.bufpos);
-  move(t.bufptr^,temp[1],t.bufpos);
+  if not flush then
+    validend:=EndOfLastCompleteUTF8CodePoint(t)
+  else
+    validend:=t.bufpos;
+  widestringmanager.Ansi2UnicodeMoveProc(@t.bufptr^[0],CP_UTF8,temp,validend);
   str^:=str^+temp;
-  t.bufpos:=0;
+  dec(t.bufpos,validend);
+  { move remainder to the start }
+  if t.bufpos<>0 then
+    move(t.bufptr^[validend],t.bufptr^[0],t.bufpos);
+end;
+
+
+procedure WriteStrWide(var t: textrec);
+begin
+  WriteStrUnicodeIntern(t,false);
 end;
 
 
 procedure WriteStrWideFlush(var t: textrec);
 begin
   { see comment in WriteStrShortFlush }
-  WriteStrWide(t);
+  WriteStrWideIntern(t,true);
   pwidestring(ppointer(@t.userdata[StrPtrIndex])^)^:=
     pwidestring(@t.userdata[TempWriteStrDestIndex])^;
   { free memory/finalize temp }
@@ -2262,7 +2354,7 @@ begin
 end;
 {$endif FPC_WIDESTRING_EQUAL_UNICODESTRING}
 
-procedure SetupWriteStrCommon(out t: textrec);
+procedure SetupWriteStrCommon(out t: textrec; cp: TSystemCodePage);
 begin
   // initialise
   Assign(text(t),'');
@@ -2270,14 +2362,14 @@ begin
   t.OpenFunc:=nil;
   t.CloseFunc:=nil;
   {$ifdef FPC_HAS_CPSTRING}
-  t.CodePage:=DefaultSystemCodePage;
+  t.CodePage:=TranslatePlaceholderCP(cp);
   {$endif}
 end;
 
 
 procedure fpc_SetupWriteStr_Shortstr(var ReadWriteStrText: text; var s: shortstring); compilerproc;
 begin
-  setupwritestrcommon(TextRec(ReadWriteStrText));
+  SetupWriteStrCommon(TextRec(ReadWriteStrText),DefaultSystemCodePage);
   PPointer(@TextRec(ReadWriteStrText).userdata[StrPtrIndex])^:=@s;
 
   { temporary destination (see comments for TempWriteStrDestIndex) }
@@ -2291,9 +2383,12 @@ end;
 
 
 {$ifdef FPC_HAS_FEATURE_ANSISTRINGS}
-procedure fpc_SetupWriteStr_Ansistr(var ReadWriteStrText: text; var s: ansistring); compilerproc;
+procedure fpc_SetupWriteStr_Ansistr(var ReadWriteStrText: text; var s: ansistring; cp: TSystemCodePage); compilerproc;
 begin
-  setupwritestrcommon(TextRec(ReadWriteStrText));
+  { destination rawbytestring -> use CP_ACP }
+  if cp=CP_NONE then
+    cp:=CP_ACP;
+  SetupWriteStrCommon(TextRec(ReadWriteStrText),cp);
   PPointer(@TextRec(ReadWriteStrText).userdata[StrPtrIndex])^:=@s;
 
   { temp destination ansistring, nil = empty string }
@@ -2308,7 +2403,7 @@ end;
 {$ifdef FPC_HAS_FEATURE_WIDESTRINGS}
 procedure fpc_SetupWriteStr_Unicodestr(var ReadWriteStrText: text; var s: unicodestring); compilerproc;
 begin
-  setupwritestrcommon(TextRec(ReadWriteStrText));
+  SetupWriteStrCommon(TextRec(ReadWriteStrText),CP_UTF8);
   PPointer(@TextRec(ReadWriteStrText).userdata[StrPtrIndex])^:=@s;
 
   { temp destination unicodestring, nil = empty string }
@@ -2323,7 +2418,7 @@ end;
 {$ifndef FPC_WIDESTRING_EQUAL_UNICODESTRING}
 procedure fpc_SetupWriteStr_Widestr(var ReadWriteStrText: text; var s: widestring); compilerproc;
 begin
-  setupwritestrcommon(TextRec(ReadWriteStrText));
+  SetupWriteStrCommon(TextRec(ReadWriteStrText),CP_UTF8);
   PPointer(@TextRec(ReadWriteStrText).userdata[StrPtrIndex])^:=@s;
 
   { temp destination widestring }
@@ -2370,7 +2465,7 @@ begin
 end;
 
 
-procedure SetupReadStrCommon(out t: textrec);
+procedure SetupReadStrCommon(out t: textrec; cp: TSystemCodePage);
 begin
   // initialise
   Assign(text(t),'');
@@ -2378,7 +2473,7 @@ begin
   t.OpenFunc:=nil;
   t.CloseFunc:=nil;
   {$ifdef FPC_HAS_CPSTRING}
-  t.CodePage:=DefaultSystemCodePage;
+  t.CodePage:=TranslatePlaceholderCP(cp);
   {$endif}
   PSizeInt(@t.userdata[BytesReadIndex])^:=0;
 end;
@@ -2387,7 +2482,7 @@ end;
 {$ifdef FPC_HAS_FEATURE_ANSISTRINGS}
 procedure fpc_SetupReadStr_Ansistr(var ReadWriteStrText: text; const s: ansistring); [public, alias: 'FPC_SETUPREADSTR_ANSISTR']; compilerproc;
 begin
-  setupreadstrcommon(TextRec(ReadWriteStrText));
+  SetupReadStrCommon(TextRec(ReadWriteStrText),StringCodePage(s));
   { we need a reference, because 's' may be a temporary expression }
   PAnsiString(@TextRec(ReadWriteStrText).userdata[StrPtrIndex])^:=s;
   TextRec(ReadWriteStrText).InOutFunc:=@ReadStrAnsi;
@@ -2395,7 +2490,7 @@ begin
   TextRec(ReadWriteStrText).FlushFunc:=@ReadAnsiStrFinal;
 end;
 
-procedure fpc_SetupReadStr_Ansistr_Intern(var ReadWriteStrText: text; const s: ansistring); [external name 'FPC_SETUPREADSTR_ANSISTR'];
+procedure fpc_SetupReadStr_Ansistr_Intern(var ReadWriteStrText: text; const s: rawbytestring); [external name 'FPC_SETUPREADSTR_ANSISTR'];
 {$endif FPC_HAS_FEATURE_ANSISTRINGS}
 
 
@@ -2436,9 +2531,8 @@ end;
 {$ifdef FPC_HAS_FEATURE_WIDESTRINGS}
 procedure fpc_SetupReadStr_Unicodestr(var ReadWriteStrText: text; const s: unicodestring); compilerproc;
 begin
-  { we use an ansistring to avoid code duplication, and let the    }
-  { assignment convert the widestring to an equivalent ansistring  }
-  fpc_SetupReadStr_Ansistr_Intern(ReadWriteStrText,s);
+  { we use an utf8string to avoid code duplication }
+  fpc_SetupReadStr_Ansistr_Intern(ReadWriteStrText,utf8string(s));
 end;
 {$endif FPC_HAS_FEATURE_WIDESTRINGS}
 
@@ -2446,9 +2540,8 @@ end;
 {$ifndef FPC_WIDESTRING_EQUAL_UNICODESTRING}
 procedure fpc_SetupReadStr_Widestr(var ReadWriteStrText: text; const s: widestring); compilerproc;
 begin
-  { we use an ansistring to avoid code duplication, and let the    }
-  { assignment convert the widestring to an equivalent ansistring  }
-  fpc_SetupReadStr_Ansistr_Intern(ReadWriteStrText,s);
+  { we use an utf8string to avoid code duplication }
+  fpc_SetupReadStr_Ansistr_Intern(ReadWriteStrText,utf8string(s));
 end;
 {$endif FPC_WIDESTRING_EQUAL_UNICODESTRING}
 

+ 33 - 0
tests/test/twrstr9.pp

@@ -0,0 +1,33 @@
+{$codepage utf8}
+
+{$ifdef unix}
+uses
+  cwstring;
+{$endif}
+
+type
+  ts866 = type ansistring(866);
+var
+  u: unicodestring;
+  s: utf8string;
+  rs: ts866;
+  p: pointer;
+  i: longint;
+begin
+  DefaultSystemCodePage:=CP_ASCII;
+  s:='§èà£ù';
+  rs:=ts866('Популярные фото');
+  writestr(u,s,1,s,rs);
+  if u <>'§èà£ù1§èà£ùПопулярные фото' then
+    halt(1);
+  getmem(p,length(s)-1);
+  s:='';
+  for i:=1 to (256 div 3) do
+    s:=s+utf8string('㒨');
+  s:=s+utf8string('㒨');
+  { check that splitting the last 㒨 into two parts during writestr doesn't cause a
+    conversion error }
+  writestr(u,s);
+  if utf8string(u)<>s then
+    halt(2);
+end.