Parcourir la source

+ support UTF-8 in ascii2unicode(). This fixes the UTF-16 output of
resourcestring data in .rsj files in case the source file is interpreted as
UTF-8. Previously, the individual UTF-8 bytes were each stored in a
separate widechar in the JSON file (mantis #28717)

* because rstconv didn't use the cwstring unit on Unix, until now it just
concatenated the bytes stored in the widechars of the JSON file on those
platforms, i.e., the strings put in the resource file were byte for byte
equal to what was in the source file. On Windows, these bytes were instead
interpreted as individual widechars, converted to the
DefaultSystemCodePage and then written. This means that on Windows, for
anything but ISO-8859-1 (where every widechar from #0000 to #0255 maps to
#0 to #255), the output got corrupted.

In order to keep compatibility with the old behaviour whereby rstconv wrote
the resource strings using the same encoding as in the source file (except
if the data got completely corrupted, in which case compatibility is
useless), we now store all resourcestrings twice in the .rsj file: once as
the exact byte sequence from the source file, and once (properly) encoded
in UTF-16.

By default, rstconv will use the byte string and write it unchanged to the
resource file. Additionally, there is a new -p option that accepts a code
page name (see rstconv -h for the list of supported names), which can be
used to make rstconv use the UTF-16 version and convert that to the desired
code page (as long as the system on which rstconv runs supports that
code page).

And this also finally resolves mantis #6477.

git-svn-id: trunk@31881 -

Jonas Maebe il y a 9 ans
Parent
commit
05bf826342
3 fichiers modifiés avec 102 ajouts et 24 suppressions
  1. 12 1
      compiler/cresstr.pas
  2. 20 8
      compiler/widestr.pas
  3. 70 15
      utils/rstconv.pp

+ 12 - 1
compiler/cresstr.pas

@@ -237,11 +237,22 @@ uses
             message1(general_e_errorwritingresourcefile,ResFileName);
             message1(general_e_errorwritingresourcefile,ResFileName);
             exit;
             exit;
           end;
           end;
+        { write the data in two formats:
+           a) backward compatible: the plain bytes from the source file
+           b) portable: converted to utf-16
+        }
         writeln(f,'{"version":1,"strings":[');
         writeln(f,'{"version":1,"strings":[');
         R:=TResourceStringItem(List.First);
         R:=TResourceStringItem(List.First);
         while assigned(R) do
         while assigned(R) do
           begin
           begin
-            write(f, '{"hash":',R.Hash,',"name":"',R.Name,'","value":"');
+            write(f, '{"hash":',R.Hash,',"name":"',R.Name,'","sourcebytes":[');
+            for i:=0 to R.Len-1 do
+              begin
+                write(f,ord(R.Value[i]));
+                if i<>R.Len-1 then
+                  write(f,',');
+              end;
+            write(f,'],"value":"');
             initwidestring(W);
             initwidestring(W);
             ascii2unicode(R.Value,R.Len,current_settings.sourcecodepage,W);
             ascii2unicode(R.Value,R.Len,current_settings.sourcecodepage,W);
             for I := 0 to W^.len - 1 do
             for I := 0 to W^.len - 1 do

+ 20 - 8
compiler/widestr.pas

@@ -201,6 +201,7 @@ unit widestr;
          Result := getascii(c,getmap(current_settings.sourcecodepage))[1];
          Result := getascii(c,getmap(current_settings.sourcecodepage))[1];
       end;
       end;
 
 
+
     procedure ascii2unicode(p : pchar;l : SizeInt;cp : tstringencoding;r : pcompilerwidestring;codepagetranslation : boolean = true);
     procedure ascii2unicode(p : pchar;l : SizeInt;cp : tstringencoding;r : pcompilerwidestring;codepagetranslation : boolean = true);
       var
       var
          source : pchar;
          source : pchar;
@@ -212,15 +213,25 @@ unit widestr;
          setlengthwidestring(r,l);
          setlengthwidestring(r,l);
          source:=p;
          source:=p;
          dest:=tcompilerwidecharptr(r^.data);
          dest:=tcompilerwidecharptr(r^.data);
-         if (cp<>CP_UTF8) and
-            codepagetranslation then
+         if codepagetranslation then
            begin
            begin
-             for i:=1 to l do
-                begin
-                  dest^:=getunicode(source^,m);
-                  inc(dest);
-                  inc(source);
-                end;
+             if cp<>CP_UTF8 then
+               begin
+                 for i:=1 to l do
+                    begin
+                      dest^:=getunicode(source^,m);
+                      inc(dest);
+                      inc(source);
+                    end;
+               end
+             else
+               begin
+                 r^.len:=Utf8ToUnicode(punicodechar(r^.data),r^.maxlen,p,l);
+                 { -1, because utf8tounicode includes room for a terminating 0 in
+                   its result count }
+                 if r^.len>0 then
+                   dec(r^.len);
+               end;
            end
            end
          else
          else
            begin
            begin
@@ -233,6 +244,7 @@ unit widestr;
            end;
            end;
       end;
       end;
 
 
+
     procedure unicode2ascii(r : pcompilerwidestring;p:pchar;cp : tstringencoding);
     procedure unicode2ascii(r : pcompilerwidestring;p:pchar;cp : tstringencoding);
       var
       var
         m : punicodemap;
         m : punicodemap;

+ 70 - 15
utils/rstconv.pp

@@ -18,7 +18,11 @@
 
 
 program rstconv;
 program rstconv;
 
 
-uses sysutils, classes, jsonparser, fpjson;
+uses
+{$ifdef unix}
+  cwstring,
+{$endif}
+  sysutils, classes, jsonparser, fpjson, charset, cpall;
 
 
 resourcestring
 resourcestring
   help =
   help =
@@ -40,7 +44,10 @@ resourcestring
     'Resource compiler script only options are:'+LineEnding+
     'Resource compiler script only options are:'+LineEnding+
     '  -s             Use STRINGTABLE instead of MESSAGETABLE'+LineEnding+
     '  -s             Use STRINGTABLE instead of MESSAGETABLE'+LineEnding+
     '  -c identifier  Use identifier as ID base (ID+n) (OPTIONAL)'+LineEnding+
     '  -c identifier  Use identifier as ID base (ID+n) (OPTIONAL)'+LineEnding+
-    '  -n number      Specifies the first ID number (OPTIONAL)'+LineEnding;
+    '  -n number      Specifies the first ID number (OPTIONAL)'+LineEnding+
+    '.rsj-input format-only options are:'+LineEnding+
+    '  -p codepage    Convert the string data to the specified code page before'+LineEnding+
+    '                 writing it to the output file. Possible values:';
 
 
 
 
   InvalidOption = 'Invalid option - ';
   InvalidOption = 'Invalid option - ';
@@ -50,7 +57,9 @@ resourcestring
   InvalidOutputFormat = 'Invalid output format -';
   InvalidOutputFormat = 'Invalid output format -';
   MessageNumberTooBig = 'Message number too big';
   MessageNumberTooBig = 'Message number too big';
   InvalidRange = 'Invalid range of the first message number';
   InvalidRange = 'Invalid range of the first message number';
-
+  MissingOption = 'Missing option after parameter ';
+  UnsupportedOutputCodePage = 'Unsupported output code page specified: ';
+  RstNoOutputCodePage = 'It is not possible to specify an output code page when using a .rst file';
 
 
 type
 type
 
 
@@ -62,8 +71,9 @@ type
 var
 var
   InFilename, OutFilename: String;
   InFilename, OutFilename: String;
   ConstItems: TCollection;
   ConstItems: TCollection;
-  CharSet: String;
+  HeaderCharSet: String;
   Identifier: String;
   Identifier: String;
+  OutputCodePage: Longint;
   FirstMessage: Word;
   FirstMessage: Word;
   MessageTable: Boolean;
   MessageTable: Boolean;
 
 
@@ -121,12 +131,15 @@ procedure ReadRSJFile;
 var
 var
   Stream: TFileStream;
   Stream: TFileStream;
   Parser: TJSONParser;
   Parser: TJSONParser;
-  JsonItems: TJSONArray;
+  JsonItems,
+  RawStringData: TJSONArray;
   JsonData, JsonItem: TJSONObject;
   JsonData, JsonItem: TJSONObject;
   S: String;
   S: String;
   item: TConstItem;
   item: TConstItem;
-  DotPos, I: Integer;
+  DotPos, I, J: Integer;
 begin
 begin
+  if OutputCodePage<>-1 then
+    DefaultSystemCodePage:=OutputCodePage;
   Stream := TFileStream.Create(InFilename, fmOpenRead or fmShareDenyNone);
   Stream := TFileStream.Create(InFilename, fmOpenRead or fmShareDenyNone);
   Parser := TJSONParser.Create(Stream);
   Parser := TJSONParser.Create(Stream);
   try
   try
@@ -141,7 +154,17 @@ begin
         DotPos := Pos('.', s);
         DotPos := Pos('.', s);
         item.ModuleName := Copy(s, 1, DotPos - 1);
         item.ModuleName := Copy(s, 1, DotPos - 1);
         item.ConstName := Copy(s, DotPos + 1, Length(S) - DotPos);
         item.ConstName := Copy(s, DotPos + 1, Length(S) - DotPos);
-        item.Value := JsonItem.Get('value');
+        if OutputCodePage=-1 then
+          begin
+            RawStringData:=JsonItem.Get('sourcebytes',TJSONArray(nil));
+            SetLength(item.Value, RawStringData.Count);
+            for J := 1 to Length(item.Value) do
+              item.Value[J]:=char(RawStringData.Integers[J-1]);
+          end
+        else
+          { automatically converts from UTF-16 to the correct code page due
+            to the change of DefaultSystemCodePage to OutputCodePage above }
+          item.Value := JsonItem.Get('value');
       end;
       end;
     finally
     finally
       JsonData.Free;
       JsonData.Free;
@@ -164,12 +187,12 @@ begin
   Assign(f, OutFilename);
   Assign(f, OutFilename);
   Rewrite(f);
   Rewrite(f);
   
   
-  if CharSet<>'' then begin
+  if HeaderCharSet<>'' then begin
     // Write file header  with
     // Write file header  with
     WriteLn(f, 'msgid ""');
     WriteLn(f, 'msgid ""');
     WriteLn(f, 'msgstr ""');
     WriteLn(f, 'msgstr ""');
     WriteLn(f, '"MIME-Version: 1.0\n"');
     WriteLn(f, '"MIME-Version: 1.0\n"');
-    WriteLn(f, '"Content-Type: text/plain; charset=', CharSet, '\n"');
+    WriteLn(f, '"Content-Type: text/plain; charset=', HeaderCharSet, '\n"');
     WriteLn(f, '"Content-Transfer-Encoding: 8bit\n"');
     WriteLn(f, '"Content-Transfer-Encoding: 8bit\n"');
     WriteLn(f);
     WriteLn(f);
   end;
   end;
@@ -345,15 +368,21 @@ begin
 
 
   if (ParamStr(1) = '-h') or (ParamStr(1) = '--help') then begin
   if (ParamStr(1) = '-h') or (ParamStr(1) = '--help') then begin
     WriteLn(help);
     WriteLn(help);
+    for i:=low(word) to high(word) do
+      if mappingavailable(i) then
+        writeln('                   ',getmap(i)^.cpname);
+    { UTF-8 is not supported via the CharSet unit }
+    writeln('                   UTF-8');
     exit;
     exit;
   end;
   end;
 
 
   ConversionProc := @ConvertToGettextPO;
   ConversionProc := @ConvertToGettextPO;
   OutputFormat:='';
   OutputFormat:='';
-  CharSet:='';
+  HeaderCharSet:='';
   Identifier:='';
   Identifier:='';
   FirstMessage:=0;
   FirstMessage:=0;
   MessageTable:=True;
   MessageTable:=True;
+  OutputCodePage:=-1;
 
 
   i := 1;
   i := 1;
   while i <= ParamCount do begin
   while i <= ParamCount do begin
@@ -391,11 +420,11 @@ begin
       Inc(i, 2);
       Inc(i, 2);
     end else if ParamStr(i) = '-c' then begin
     end else if ParamStr(i) = '-c' then begin
       if (OutputFormat='') or (OutputFormat='po') then begin
       if (OutputFormat='') or (OutputFormat='po') then begin
-        if CharSet <> '' then begin
+        if HeaderCharSet <> '' then begin
           WriteLn(StdErr, OptionAlreadySpecified, '-c');
           WriteLn(StdErr, OptionAlreadySpecified, '-c');
           Halt(1);
           Halt(1);
         end;
         end;
-        CharSet:=ParamStr(i+1);
+        HeaderCharSet:=ParamStr(i+1);
       end else
       end else
       begin
       begin
         if Identifier <> '' then begin
         if Identifier <> '' then begin
@@ -428,13 +457,32 @@ begin
         end;
         end;
       end;
       end;
       Inc(i, 2);
       Inc(i, 2);
-    end else begin
+    end else if ParamStr(i) = '-p' then
+      begin
+        if paramcount=i then
+          begin
+            WriteLn(StdErr, MissingOption,'-p');
+            Halt(1)
+          end;
+        if UpperCase(paramstr(i+1))<>'UTF-8' then
+          if not mappingavailable(ParamStr(i+1)) then
+            begin
+              WriteLn(StdErr, UnsupportedOutputCodePage, ParamStr(i+1));
+              Halt(1);
+            end
+          else
+            OutputCodePage:=getmap(ParamStr(i+1))^.cp
+        else
+          OutputCodePage:=CP_UTF8;
+        Inc(i, 2);
+      end
+    else begin
       WriteLn(StdErr, InvalidOption, ParamStr(i));
       WriteLn(StdErr, InvalidOption, ParamStr(i));
       Halt(1);
       Halt(1);
     end;
     end;
   end;
   end;
 
 
-  If ((OutputFormat<>'') and (OutputFormat<>'po')) and (CharSet<>'')  then begin
+  If ((OutputFormat<>'') and (OutputFormat<>'po')) and (HeaderCharSet<>'')  then begin
     WriteLn(StdErr, InvalidOption, '');
     WriteLn(StdErr, InvalidOption, '');
     Halt(1);
     Halt(1);
   end;
   end;
@@ -459,7 +507,14 @@ begin
   if ExtractFileExt(InFilename) = '.rsj' then
   if ExtractFileExt(InFilename) = '.rsj' then
     ReadRSJFile
     ReadRSJFile
   else
   else
-    ReadRSTFile;
+    begin
+      if OutputCodePage<>-1 then
+        begin
+          WriteLn(StdErr, RstNoOutputCodePage);
+          Halt(1);
+        end;
+      ReadRSTFile;
+    end;
 
 
   ConversionProc;
   ConversionProc;
 end.
 end.