Browse Source

* more unicodestring stuff fixed, test results on win32 are already good

git-svn-id: branches/unicodestring@11667 -
florian 17 years ago
parent
commit
cc4f9e5643

+ 3 - 0
.gitattributes

@@ -106,6 +106,7 @@ compiler/avr/rgcpu.pas svneol=native#text/plain
 compiler/browcol.pas svneol=native#text/plain
 compiler/browcol.pas svneol=native#text/plain
 compiler/bsdcompile -text
 compiler/bsdcompile -text
 compiler/catch.pas svneol=native#text/plain
 compiler/catch.pas svneol=native#text/plain
+compiler/ccharset.pas svneol=native#text/plain
 compiler/cclasses.pas svneol=native#text/plain
 compiler/cclasses.pas svneol=native#text/plain
 compiler/cfidwarf.pas svneol=native#text/plain
 compiler/cfidwarf.pas svneol=native#text/plain
 compiler/cfileutl.pas svneol=native#text/plain
 compiler/cfileutl.pas svneol=native#text/plain
@@ -7838,6 +7839,8 @@ tests/test/tunistr1.pp svneol=native#text/plain
 tests/test/tunistr2.pp svneol=native#text/plain
 tests/test/tunistr2.pp svneol=native#text/plain
 tests/test/tunistr4.pp svneol=native#text/plain
 tests/test/tunistr4.pp svneol=native#text/plain
 tests/test/tunistr5.pp svneol=native#text/plain
 tests/test/tunistr5.pp svneol=native#text/plain
+tests/test/tunistr6.pp svneol=native#text/plain
+tests/test/tunistr7.pp svneol=native#text/plain
 tests/test/tunit1.pp svneol=native#text/plain
 tests/test/tunit1.pp svneol=native#text/plain
 tests/test/tunit2.pp svneol=native#text/plain
 tests/test/tunit2.pp svneol=native#text/plain
 tests/test/tunit3.pp svneol=native#text/plain
 tests/test/tunit3.pp svneol=native#text/plain

+ 254 - 0
compiler/ccharset.pas

@@ -0,0 +1,254 @@
+{
+    This file is part of the Free Pascal run time library.
+    Copyright (c) 2000 by Florian Klaempfl
+    member of the Free Pascal development team.
+
+    This unit implements several classes for charset conversions
+
+    See the file COPYING.FPC, included in this distribution,
+    for details about the copyright.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+
+ **********************************************************************}
+
+{ this unit is included temporarily for 2.2 bootstrapping and can be
+  removed after the next release after 2.2.2 }
+{$mode objfpc}
+unit ccharset;
+
+  interface
+
+    type
+       tunicodechar = word;
+       tunicodestring = ^tunicodechar;
+
+       tcsconvert = class
+         // !!!!!!1constructor create;
+       end;
+
+       tunicodecharmappingflag = (umf_noinfo,umf_leadbyte,umf_undefined,
+         umf_unused);
+
+       punicodecharmapping = ^tunicodecharmapping;
+       tunicodecharmapping = record
+          unicode : tunicodechar;
+          flag : tunicodecharmappingflag;
+          reserved : byte;
+       end;
+
+       punicodemap = ^tunicodemap;
+       tunicodemap = record
+          cpname : string[20];
+          map : punicodecharmapping;
+          lastchar : longint;
+          next : punicodemap;
+          internalmap : boolean;
+       end;
+
+       tcp2unicode = class(tcsconvert)
+       end;
+
+    function loadunicodemapping(const cpname,f : string) : punicodemap;
+    procedure registermapping(p : punicodemap);
+    function getmap(const s : string) : punicodemap;
+    function mappingavailable(const s : string) : boolean;
+    function getunicode(c : char;p : punicodemap) : tunicodechar;
+    function getascii(c : tunicodechar;p : punicodemap) : string;
+
+  implementation
+
+    var
+       mappings : punicodemap;
+
+    function loadunicodemapping(const cpname,f : string) : punicodemap;
+
+      var
+         data : punicodecharmapping;
+         datasize : longint;
+         t : text;
+         s,hs : string;
+         scanpos,charpos,unicodevalue : longint;
+         code : word;
+         flag : tunicodecharmappingflag;
+         p : punicodemap;
+         lastchar : longint;
+
+      begin
+         lastchar:=-1;
+         loadunicodemapping:=nil;
+         datasize:=256;
+         getmem(data,sizeof(tunicodecharmapping)*datasize);
+         assign(t,f);
+         {$I-}
+         reset(t);
+         {$I+}
+         if ioresult<>0 then
+           begin
+              freemem(data,sizeof(tunicodecharmapping)*datasize);
+              exit;
+           end;
+         while not(eof(t)) do
+           begin
+              readln(t,s);
+              if (s[1]='0') and (s[2]='x') then
+                begin
+                   flag:=umf_unused;
+                   scanpos:=3;
+                   hs:='$';
+                   while s[scanpos] in ['0'..'9','A'..'F','a'..'f'] do
+                     begin
+                        hs:=hs+s[scanpos];
+                        inc(scanpos);
+                     end;
+                   val(hs,charpos,code);
+                   if code<>0 then
+                     begin
+                        freemem(data,sizeof(tunicodecharmapping)*datasize);
+                        close(t);
+                        exit;
+                     end;
+                   while not(s[scanpos] in ['0','#']) do
+                     inc(scanpos);
+                   if s[scanpos]='#' then
+                     begin
+                        { special char }
+                        unicodevalue:=$ffff;
+                        hs:=copy(s,scanpos,length(s)-scanpos+1);
+                        if hs='#DBCS LEAD BYTE' then
+                          flag:=umf_leadbyte;
+                     end
+                   else
+                     begin
+                        { C hex prefix }
+                        inc(scanpos,2);
+                        hs:='$';
+                        while s[scanpos] in ['0'..'9','A'..'F','a'..'f'] do
+                          begin
+                             hs:=hs+s[scanpos];
+                             inc(scanpos);
+                          end;
+                        val(hs,unicodevalue,code);
+                        if code<>0 then
+                          begin
+                             freemem(data,sizeof(tunicodecharmapping)*datasize);
+                             close(t);
+                             exit;
+                          end;
+                        if charpos>datasize then
+                          begin
+                             { allocate 1024 bytes more because         }
+                             { if we need more than 256 entries it's    }
+                             { probably a mbcs with a lot of            }
+                             { entries                                  }
+                             datasize:=charpos+1024;
+                             reallocmem(data,sizeof(tunicodecharmapping)*datasize);
+                          end;
+                        flag:=umf_noinfo;
+                     end;
+                   data[charpos].flag:=flag;
+                   data[charpos].unicode:=unicodevalue;
+                   if charpos>lastchar then
+                     lastchar:=charpos;
+                end;
+           end;
+         close(t);
+         new(p);
+         p^.lastchar:=lastchar;
+         p^.cpname:=cpname;
+         p^.internalmap:=false;
+         p^.next:=nil;
+         p^.map:=data;
+         loadunicodemapping:=p;
+      end;
+
+    procedure registermapping(p : punicodemap);
+
+      begin
+         p^.next:=mappings;
+         mappings:=p;
+      end;
+
+    function getmap(const s : string) : punicodemap;
+
+      var
+         hp : punicodemap;
+
+      const
+         mapcache : string = '';
+         mapcachep : punicodemap = nil;
+
+      begin
+         if (mapcache=s) and assigned(mapcachep) and (mapcachep^.cpname=s) then
+           begin
+              getmap:=mapcachep;
+              exit;
+           end;
+         hp:=mappings;
+         while assigned(hp) do
+           begin
+              if hp^.cpname=s then
+                begin
+                   getmap:=hp;
+                   mapcache:=s;
+                   mapcachep:=hp;
+                   exit;
+                end;
+              hp:=hp^.next;
+           end;
+         getmap:=nil;
+      end;
+
+    function mappingavailable(const s : string) : boolean;
+
+      begin
+         mappingavailable:=getmap(s)<>nil;
+      end;
+
+    function getunicode(c : char;p : punicodemap) : tunicodechar;
+
+      begin
+         if ord(c)<=p^.lastchar then
+           getunicode:=p^.map[ord(c)].unicode
+         else
+           getunicode:=0;
+      end;
+
+    function getascii(c : tunicodechar;p : punicodemap) : string;
+
+      var
+         i : longint;
+
+      begin
+         { at least map to space }
+         getascii:=#32;
+         for i:=0 to p^.lastchar do
+           if p^.map[i].unicode=c then
+             begin
+                if i<256 then
+                  getascii:=chr(i)
+                else
+                  getascii:=chr(i div 256)+chr(i mod 256);
+                exit;
+             end;
+      end;
+
+  var
+     hp : punicodemap;
+
+initialization
+  mappings:=nil;
+finalization
+  while assigned(mappings) do
+    begin
+       hp:=mappings^.next;
+       if not(mappings^.internalmap) then
+         begin
+            freemem(mappings^.map);
+            dispose(mappings);
+         end;
+       mappings:=hp;
+    end;
+end.

+ 1 - 1
compiler/cp1251.pas

@@ -6,7 +6,7 @@ unit cp1251;
   implementation
   implementation
 
 
   uses
   uses
-     charset;
+     {$ifdef VER2_2}ccharset{$else VER2_2}charset{$endif VER2_2};
 
 
   const
   const
      map : array[0..255] of tunicodecharmapping = (
      map : array[0..255] of tunicodecharmapping = (

+ 1 - 1
compiler/cp437.pas

@@ -6,7 +6,7 @@ unit cp437;
   implementation
   implementation
 
 
   uses
   uses
-     charset;
+     {$ifdef VER2_2}ccharset{$else VER2_2}charset{$endif VER2_2};
 
 
   const
   const
      map : array[0..255] of tunicodecharmapping = (
      map : array[0..255] of tunicodecharmapping = (

+ 1 - 1
compiler/cp850.pas

@@ -6,7 +6,7 @@ unit cp850;
   implementation
   implementation
 
 
   uses
   uses
-     charset;
+     {$ifdef VER2_2}ccharset{$else VER2_2}charset{$endif VER2_2};
 
 
   const
   const
      map : array[0..255] of tunicodecharmapping = (
      map : array[0..255] of tunicodecharmapping = (

+ 1 - 1
compiler/cp866.pas

@@ -6,7 +6,7 @@ unit cp866;
   implementation
   implementation
 
 
   uses
   uses
-     charset;
+     {$ifdef VER2_2}ccharset{$else VER2_2}charset{$endif VER2_2};
 
 
   const
   const
      map : array[0..255] of tunicodecharmapping = (
      map : array[0..255] of tunicodecharmapping = (

+ 1 - 1
compiler/cp8859_1.pas

@@ -6,7 +6,7 @@ unit cp8859_1;
   implementation
   implementation
 
 
   uses
   uses
-     charset;
+     {$ifdef VER2_2}ccharset{$else VER2_2}charset{$endif VER2_2};
 
 
   const
   const
      map : array[0..255] of tunicodecharmapping = (
      map : array[0..255] of tunicodecharmapping = (

+ 1 - 1
compiler/cp8859_5.pas

@@ -6,7 +6,7 @@ unit cp8859_5;
   implementation
   implementation
 
 
   uses
   uses
-     charset;
+     {$ifdef VER2_2}ccharset{$else VER2_2}charset{$endif VER2_2};
 
 
   const
   const
      map : array[0..255] of tunicodecharmapping = (
      map : array[0..255] of tunicodecharmapping = (

+ 5 - 5
compiler/ncgcon.pas

@@ -270,7 +270,7 @@ implementation
          pooltype: TConstPoolType;
          pooltype: TConstPoolType;
          pool: THashSet;
          pool: THashSet;
          entry: PHashSetItem;
          entry: PHashSetItem;
-         
+
       const
       const
         PoolMap: array[tconststringtype] of TConstPoolType = (
         PoolMap: array[tconststringtype] of TConstPoolType = (
           sp_conststr,
           sp_conststr,
@@ -295,7 +295,7 @@ implementation
               if current_asmdata.ConstPools[pooltype] = nil then
               if current_asmdata.ConstPools[pooltype] = nil then
                 current_asmdata.ConstPools[pooltype] := THashSet.Create(64, True, False);
                 current_asmdata.ConstPools[pooltype] := THashSet.Create(64, True, False);
               pool := current_asmdata.ConstPools[pooltype];
               pool := current_asmdata.ConstPools[pooltype];
-           
+
               if cst_type in [cst_widestring, cst_unicodestring] then
               if cst_type in [cst_widestring, cst_unicodestring] then
                 entry := pool.FindOrAdd(pcompilerwidestring(value_str)^.data, len*cwidechartype.size)
                 entry := pool.FindOrAdd(pcompilerwidestring(value_str)^.data, len*cwidechartype.size)
               else
               else
@@ -402,7 +402,7 @@ implementation
                    end;
                    end;
                 end;
                 end;
            end;
            end;
-         if cst_type in [cst_ansistring, cst_widestring] then
+         if cst_type in [cst_ansistring, cst_widestring, cst_unicodestring] then
            begin
            begin
              location_reset(location, LOC_REGISTER, OS_ADDR);
              location_reset(location, LOC_REGISTER, OS_ADDR);
              reference_reset_symbol(href, lab_str, 0);
              reference_reset_symbol(href, lab_str, 0);
@@ -412,8 +412,8 @@ implementation
          else
          else
            begin
            begin
              location_reset(location, LOC_CREFERENCE, def_cgsize(resultdef));
              location_reset(location, LOC_CREFERENCE, def_cgsize(resultdef));
-         location.reference.symbol:=lab_str;
-      end;
+             location.reference.symbol:=lab_str;
+           end;
       end;
       end;
 
 
 
 

+ 1 - 1
compiler/options.pas

@@ -71,7 +71,7 @@ implementation
 
 
 uses
 uses
   widestr,
   widestr,
-  charset,
+  {$ifdef VER2_2}ccharset{$else VER2_2}charset{$endif VER2_2},
   SysUtils,
   SysUtils,
   version,
   version,
   cutils,cmsgs,
   cutils,cmsgs,

+ 1 - 2
compiler/widestr.pas

@@ -28,8 +28,7 @@ unit widestr;
   interface
   interface
 
 
     uses
     uses
-       charset,globtype
-       ;
+       {$ifdef VER2_2}ccharset{$else VER2_2}charset{$endif VER2_2},globtype;
 
 
 
 
     type
     type

+ 56 - 5
rtl/win32/system.pp

@@ -899,10 +899,6 @@ end;
 
 
 {$endif Set_i386_Exception_handler}
 {$endif Set_i386_Exception_handler}
 
 
-{****************************************************************************
-                      OS dependend widestrings
-****************************************************************************}
-
 const
 const
   { MultiByteToWideChar  }
   { MultiByteToWideChar  }
      MB_PRECOMPOSED = 1;
      MB_PRECOMPOSED = 1;
@@ -918,6 +914,9 @@ function CharUpperBuff(lpsz:LPWSTR; cchLength:DWORD):DWORD;
 function CharLowerBuff(lpsz:LPWSTR; cchLength:DWORD):DWORD;
 function CharLowerBuff(lpsz:LPWSTR; cchLength:DWORD):DWORD;
     stdcall; external 'user32' name 'CharLowerBuffW';
     stdcall; external 'user32' name 'CharLowerBuffW';
 
 
+{******************************************************************************
+                              Widestring
+ ******************************************************************************}
 
 
 procedure Win32Wide2AnsiMove(source:pwidechar;var dest:ansistring;len:SizeInt);
 procedure Win32Wide2AnsiMove(source:pwidechar;var dest:ansistring;len:SizeInt);
   var
   var
@@ -947,13 +946,57 @@ procedure Win32Ansi2WideMove(source:pchar;var dest:widestring;len:SizeInt);
 function Win32WideUpper(const s : WideString) : WideString;
 function Win32WideUpper(const s : WideString) : WideString;
   begin
   begin
     result:=s;
     result:=s;
-    UniqueString(result);
     if length(result)>0 then
     if length(result)>0 then
       CharUpperBuff(LPWSTR(result),length(result));
       CharUpperBuff(LPWSTR(result),length(result));
   end;
   end;
 
 
 
 
 function Win32WideLower(const s : WideString) : WideString;
 function Win32WideLower(const s : WideString) : WideString;
+  begin
+    result:=s;
+    if length(result)>0 then
+      CharLowerBuff(LPWSTR(result),length(result));
+  end;
+
+{******************************************************************************
+                              Unicode
+ ******************************************************************************}
+
+procedure Win32Unicode2AnsiMove(source:punicodechar;var dest:ansistring;len:SizeInt);
+  var
+    destlen: SizeInt;
+  begin
+    // retrieve length including trailing #0
+    // not anymore, because this must also be usable for single characters
+    destlen:=WideCharToMultiByte(CP_ACP, WC_NO_BEST_FIT_CHARS, source, len, nil, 0, nil, nil);
+    // this will null-terminate
+    setlength(dest, destlen);
+    WideCharToMultiByte(CP_ACP, WC_NO_BEST_FIT_CHARS, source, len, @dest[1], destlen, nil, nil);
+  end;
+
+procedure Win32Ansi2UnicodeMove(source:pchar;var dest:UnicodeString;len:SizeInt);
+  var
+    destlen: SizeInt;
+  begin
+    // retrieve length including trailing #0
+    // not anymore, because this must also be usable for single characters
+    destlen:=MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, source, len, nil, 0);
+    // this will null-terminate
+    setlength(dest, destlen);
+    MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, source, len, @dest[1], destlen);
+  end;
+
+
+function Win32UnicodeUpper(const s : UnicodeString) : UnicodeString;
+  begin
+    result:=s;
+    UniqueString(result);
+    if length(result)>0 then
+      CharUpperBuff(LPWSTR(result),length(result));
+  end;
+
+
+function Win32UnicodeLower(const s : UnicodeString) : UnicodeString;
   begin
   begin
     result:=s;
     result:=s;
     UniqueString(result);
     UniqueString(result);
@@ -966,10 +1009,17 @@ function Win32WideLower(const s : WideString) : WideString;
   are only relevant for the sysutils units }
   are only relevant for the sysutils units }
 procedure InitWin32Widestrings;
 procedure InitWin32Widestrings;
   begin
   begin
+    { Widestring }
     widestringmanager.Wide2AnsiMoveProc:=@Win32Wide2AnsiMove;
     widestringmanager.Wide2AnsiMoveProc:=@Win32Wide2AnsiMove;
     widestringmanager.Ansi2WideMoveProc:=@Win32Ansi2WideMove;
     widestringmanager.Ansi2WideMoveProc:=@Win32Ansi2WideMove;
     widestringmanager.UpperWideStringProc:=@Win32WideUpper;
     widestringmanager.UpperWideStringProc:=@Win32WideUpper;
     widestringmanager.LowerWideStringProc:=@Win32WideLower;
     widestringmanager.LowerWideStringProc:=@Win32WideLower;
+
+    { Unicode }
+    widestringmanager.Unicode2AnsiMoveProc:=@Win32Unicode2AnsiMove;
+    widestringmanager.Ansi2UnicodeMoveProc:=@Win32Ansi2UnicodeMove;
+    widestringmanager.UpperUnicodeStringProc:=@Win32UnicodeUpper;
+    widestringmanager.LowerUnicodeStringProc:=@Win32UnicodeLower;
   end;
   end;
 
 
 
 
@@ -1198,3 +1248,4 @@ begin
   InitWin32Widestrings;
   InitWin32Widestrings;
   DispCallByIDProc:=@DoDispCallByIDError;
   DispCallByIDProc:=@DoDispCallByIDError;
 end.
 end.
+

+ 8 - 8
tests/test/tunistr5.pp

@@ -20,18 +20,18 @@ begin
      (ws[7]<>#$d87e) or
      (ws[7]<>#$d87e) or
      (ws[8]<>#$dc04) then
      (ws[8]<>#$dc04) then
     halt(1);
     halt(1);
-  us:=WideStringToUCS4String(ws);
+  us:=UnicodeStringToUCS4String(ws);
   if (length(us)<>8) or
   if (length(us)<>8) or
-     (us[0]<>UCS4Char(widechar('é'))) or
-     (us[1]<>UCS4Char(widechar('ł'))) or
-     (us[2]<>UCS4Char(widechar('Ł'))) or
-     (us[3]<>UCS4Char(widechar('ć'))) or
-     (us[4]<>UCS4Char(widechar('ç'))) or
-     (us[5]<>UCS4Char(widechar('Ź'))) or
+     (us[0]<>UCS4Char(unicodechar('é'))) or
+     (us[1]<>UCS4Char(unicodechar('ł'))) or
+     (us[2]<>UCS4Char(unicodechar('Ł'))) or
+     (us[3]<>UCS4Char(unicodechar('ć'))) or
+     (us[4]<>UCS4Char(unicodechar('ç'))) or
+     (us[5]<>UCS4Char(unicodechar('Ź'))) or
      (us[6]<>UCS4Char($2F804)) or
      (us[6]<>UCS4Char($2F804)) or
      (us[7]<>UCS4Char(0)) then
      (us[7]<>UCS4Char(0)) then
     halt(2);
     halt(2);
-  ws:=UCS4StringToWideString(us);
+  ws:=UCS4StringToUnicodeString(us);
   if (length(ws)<>8) or
   if (length(ws)<>8) or
      (ws[1]<>'é') or
      (ws[1]<>'é') or
      (ws[2]<>'ł') or
      (ws[2]<>'ł') or

+ 397 - 0
tests/test/tunistr6.pp

@@ -0,0 +1,397 @@
+{%skiptarget=wince}
+{$codepage utf-8}
+uses
+  {$ifdef unix}
+  cwstring,
+  {$endif}
+  sysutils;
+
+procedure doerror(i : integer);
+  begin
+    writeln('Error: ',i);
+    halt(i);
+  end;
+
+
+{ normal upper case testing }
+procedure testupper;
+var
+  s: ansistring;
+  w1,w2,w3,w4: unicodestring;
+  i: longint;
+begin
+  w1:='aé'#0'èàł'#$d87e#$dc04;
+  w2:='AÉ'#0'ÈÀŁ'#$d87e#$dc04;
+{$ifdef print}
+// the utf-8 output can confuse the testsuite parser
+  writeln('original: ',w1);
+  writeln('original upper: ',w2);
+{$endif print}
+  s:=w1;
+{$ifdef print}
+  writeln('ansi: ',s);
+{$endif print}
+  w3:=s;
+  w4:=AnsiUpperCase(s);
+  { filter out unsupported characters }
+  for i:=1 to length(w3) do
+    if w3[i]='?' then
+      begin
+        w2[i]:='?';
+        w1[i]:='?';
+      end;
+  w1:=UnicodeUpperCase(w1);
+{$ifdef print}
+  writeln('unicodeupper: ',w1);
+  writeln('original upper: ',w2);
+  writeln('ansiupper: ',w4);
+{$endif print}
+  if (w1 <> w2) then
+    doerror(1);
+  if (w4 <> w2) then
+    doerror(2);
+
+  w1:='aéèàł'#$d87e#$dc04;
+  w2:='AÉÈÀŁ'#$d87e#$dc04;
+  s:=w1;
+  w3:=s;
+  w4:=AnsiStrUpper(pchar(s));
+  { filter out unsupported characters }
+  for i:=1 to length(w3) do
+    if w3[i]='?' then
+      begin
+        w2[i]:='?';
+        w1[i]:='?';
+      end;
+  w1:=UnicodeUpperCase(w1);
+{$ifdef print}
+  writeln('unicodeupper: ',w1);
+  writeln('ansistrupper: ',w4);
+{$endif print}
+  if (w1 <> w2) then
+    doerror(21);
+  if (w4 <> w2) then
+    doerror(22);
+
+end;
+
+
+{ normal lower case testing }
+procedure testlower;
+var
+  s: ansistring;
+  w1,w2,w3,w4: unicodestring;
+  i: longint;
+begin
+  w1:='AÉ'#0'ÈÀŁ'#$d87e#$dc04;
+  w2:='aé'#0'èàł'#$d87e#$dc04;
+{$ifdef print}
+// the utf-8 output can confuse the testsuite parser
+  writeln('original: ',w1);
+  writeln('original lower: ',w2);
+{$endif print}
+  s:=w1;
+  w3:=s;
+  w4:=AnsiLowerCase(s);
+  { filter out unsupported characters }
+  for i:=1 to length(w3) do
+    if w3[i]='?' then
+      begin
+        w2[i]:='?';
+        w1[i]:='?';
+      end;
+  w1:=UnicodeLowerCase(w1);
+{$ifdef print}
+  writeln('unicodelower: ',w1);
+  writeln('ansilower: ',w4);
+{$endif print}
+  if (w1 <> w2) then
+    doerror(3);
+  if (w4 <> w2) then
+    doerror(4);
+
+
+  w1:='AÉÈÀŁ'#$d87e#$dc04;
+  w2:='aéèàł'#$d87e#$dc04;
+  s:=w1;
+  w3:=s;
+  w4:=AnsiStrLower(pchar(s));
+  { filter out unsupported characters }
+  for i:=1 to length(w3) do
+    if w3[i]='?' then
+      begin
+        w2[i]:='?';
+        w1[i]:='?';
+      end;
+  w1:=UnicodeLowerCase(w1);
+{$ifdef print}
+  writeln('unicodelower: ',w1);
+  writeln('ansistrlower: ',w4);
+{$endif print}
+  if (w1 <> w2) then
+    doerror(3);
+  if (w4 <> w2) then
+    doerror(4);
+end;
+
+
+
+{ upper case testing with a missing utf-16 pair at the end }
+procedure testupperinvalid;
+var
+  s: ansistring;
+  w1,w2,w3,w4: unicodestring;
+  i: longint;
+begin
+  { missing utf-16 pair at end }
+  w1:='aé'#0'èàł'#$d87e;
+  w2:='AÉ'#0'ÈÀŁ'#$d87e;
+{$ifdef print}
+// the utf-8 output can confuse the testsuite parser
+  writeln('original: ',w1);
+  writeln('original upper: ',w2);
+{$endif print}
+  s:=w1;
+  w3:=s;
+  w4:=AnsiUpperCase(s);
+  { filter out unsupported characters }
+  for i:=1 to length(w3) do
+    if w3[i]='?' then
+      begin
+        w2[i]:='?';
+        w1[i]:='?';
+      end;
+  w1:=UnicodeUpperCase(w1);
+{$ifdef print}
+  writeln('unicodeupper: ',w1);
+  writeln('ansiupper: ',w4);
+{$endif print}
+  if (w1 <> w2) then
+    doerror(5);
+  if (w4 <> w2) then
+    doerror(6);
+end;
+
+
+{ lower case testing with a missing utf-16 pair at the end }
+procedure testlowerinvalid;
+var
+  s: ansistring;
+  w1,w2,w3,w4: unicodestring;
+  i: longint;
+begin
+  { missing utf-16 pair at end}
+  w1:='AÉ'#0'ÈÀŁ'#$d87e;
+  w2:='aé'#0'èàł'#$d87e;
+{$ifdef print}
+// the utf-8 output can confuse the testsuite parser
+  writeln('original: ',w1);
+  writeln('original lower: ',w2);
+{$endif print}
+  s:=w1;
+  w3:=s;
+  w4:=AnsiLowerCase(s);
+  { filter out unsupported characters }
+  for i:=1 to length(w3) do
+    if w3[i]='?' then
+      begin
+        w2[i]:='?';
+        w1[i]:='?';
+      end;
+  w1:=UnicodeLowerCase(w1);
+{$ifdef print}
+  writeln('unicodelower: ',w1);
+  writeln('ansilower: ',w4);
+{$endif print}
+  if (w1 <> w2) then
+    doerror(7);
+  if (w4 <> w2) then
+    doerror(8);
+end;
+
+
+
+{ upper case testing with a missing utf-16 pair at the end, followed by a normal char }
+procedure testupperinvalid1;
+var
+  s: ansistring;
+  w1,w2,w3,w4: unicodestring;
+  i: longint;
+begin
+  { missing utf-16 pair at end with char after it}
+  w1:='aé'#0'èàł'#$d87e'j';
+  w2:='AÉ'#0'ÈÀŁ'#$d87e'J';
+{$ifdef print}
+// the utf-8 output can confuse the testsuite parser
+  writeln('original: ',w1);
+  writeln('original upper: ',w2);
+{$endif print}
+  s:=w1;
+  w3:=s;
+  w4:=AnsiUpperCase(s);
+  { filter out unsupported characters }
+  for i:=1 to length(w3) do
+    if w3[i]='?' then
+      begin
+        w2[i]:='?';
+        w1[i]:='?';
+      end;
+  w1:=UnicodeUpperCase(w1);
+{$ifdef print}
+  writeln('unicodeupper: ',w1);
+  writeln('ansiupper: ',w4);
+{$endif print}
+  if (w1 <> w2) then
+    doerror(9);
+  if (w4 <> w2) then
+    doerror(10);
+end;
+
+
+{ lower case testing with a missing utf-16 pair at the end, followed by a normal char }
+procedure testlowerinvalid1;
+var
+  s: ansistring;
+  w1,w2,w3,w4: unicodestring;
+  i: longint;
+begin
+  { missing utf-16 pair at end with char after it}
+  w1:='AÉ'#0'ÈÀŁ'#$d87e'J';
+  w2:='aé'#0'èàł'#$d87e'j';
+{$ifdef print}
+// the utf-8 output can confuse the testsuite parser
+  writeln('original: ',w1);
+  writeln('original lower: ',w2);
+{$endif print}
+  s:=w1;
+  w3:=s;
+  w4:=AnsiLowerCase(s);
+  { filter out unsupported characters }
+  for i:=1 to length(w3) do
+    if w3[i]='?' then
+      begin
+        w2[i]:='?';
+        w1[i]:='?';
+      end;
+  w1:=UnicodeLowerCase(w1);
+{$ifdef print}
+  writeln('unicodelower: ',w1);
+  writeln('ansilower: ',w4);
+{$endif print}
+  if (w1 <> w2) then
+    doerror(11);
+  if (w4 <> w2) then
+    doerror(12);
+end;
+
+
+{ upper case testing with corrupting the utf-8 string after conversion }
+procedure testupperinvalid2;
+var
+  s: ansistring;
+  w1,w2,w3,w4: unicodestring;
+  i: longint;
+begin
+  w1:='aé'#0'èàł'#$d87e#$dc04'ö';
+  w2:='AÉ'#0'ÈÀŁ'#$d87e#$dc04'Ö';
+{$ifdef print}
+// the utf-8 output can confuse the testsuite parser
+  writeln('original: ',w1);
+  writeln('original upper: ',w2);
+{$endif print}
+  s:=w1;
+  { truncate the last utf-8 character }
+  setlength(s,length(s)-1);
+  w3:=s;
+  { adjust checking values for new length due to corruption }
+  if length(w3)<>length(w2) then
+    begin
+      setlength(w2,length(w3));
+      setlength(w1,length(w3));
+    end;
+  w4:=AnsiUpperCase(s);
+  { filter out unsupported characters }
+  for i:=1 to length(w3) do
+    if w3[i]='?' then
+      begin
+        w2[i]:='?';
+        w1[i]:='?';
+      end;
+  w1:=UnicodeUpperCase(w1);
+{$ifdef print}
+  writeln('unicodeupper: ',w1);
+  writeln('ansiupper: ',w4);
+{$endif print}
+  if (w1 <> w2) then
+    doerror(13);
+  if (w4 <> w2) then
+    doerror(14);
+end;
+
+
+{ lower case testing with corrupting the utf-8 string after conversion }
+procedure testlowerinvalid2;
+var
+  s: ansistring;
+  w1,w2,w3,w4: unicodestring;
+  i: longint;
+begin
+  w1:='AÉ'#0'ÈÀŁ'#$d87e#$dc04'Ö';
+  w2:='aé'#0'èàł'#$d87e#$dc04'ö';
+{$ifdef print}
+// the utf-8 output can confuse the testsuite parser
+  writeln('original: ',w1);
+  writeln('original lower: ',w2);
+{$endif print}
+  s:=w1;
+  { truncate the last utf-8 character }
+  setlength(s,length(s)-1);
+  w3:=s;
+  { adjust checking values for new length due to corruption }
+  if length(w3)<>length(w2) then
+    begin
+      setlength(w2,length(w3));
+      setlength(w1,length(w3));
+    end;
+  w4:=AnsiLowerCase(s);
+  { filter out unsupported characters }
+  for i:=1 to length(w3) do
+    if w3[i]='?' then
+      begin
+        w2[i]:='?';
+        w1[i]:='?';
+      end;
+  w1:=UnicodeLowerCase(w1);
+{$ifdef print}
+  writeln('unicodelower: ',w1);
+  writeln('ansilower: ',w4);
+{$endif print}
+  if (w1 <> w2) then
+    doerror(15);
+  if (w4 <> w2) then
+    doerror(16);
+end;
+
+
+
+begin
+  testupper;
+  writeln;
+  testlower;
+  writeln;
+  writeln;
+  testupperinvalid;
+  writeln;
+  testlowerinvalid;
+  writeln;
+  writeln;
+  testupperinvalid1;
+  writeln;
+  testlowerinvalid1;
+  writeln;
+  writeln;
+  testupperinvalid2;
+  writeln;
+  testlowerinvalid2;
+  writeln('ok');
+end.

+ 47 - 0
tests/test/tunistr7.pp

@@ -0,0 +1,47 @@
+{$codepage utf-8}
+
+uses
+{$ifdef unix}
+  cwstring,
+{$endif unix}
+  sysutils;
+
+procedure testwcmp;
+var
+  w1,w2: unicodestring;
+  s: ansistring;
+begin
+  w1:='aécde';
+  { filter unsupported characters }
+  s:=w1;
+  w1:=s;
+  w2:=w1;
+
+  if (w1<>w2) then
+    halt(1);
+  w1[2]:='f';
+  if (w1=w2) or
+     WideSameStr(w1,w2) or
+     (WideCompareText(w1,w2)=0) or
+     (WideCompareStr(w1,w2)<0) or
+     (WideCompareStr(w2,w1)>0) then
+    halt(2);
+  w1[2]:=#0;
+  w2[2]:=#0;
+  if (w1<>w2) or
+     not WideSameStr(w1,w2) or
+     (WideCompareStr(w1,w2)<>0) or
+     (WideCompareText(w1,w2)<>0) then
+    halt(3);
+  w1[3]:='m';
+  if WideSameStr(w1,w2) or
+     (WideCompareText(w1,w2)=0) or
+     (WideCompareStr(w1,w2)<0) or
+     (WideCompareStr(w2,w1)>0) then
+    halt(4);
+end;
+
+
+begin
+  testwcmp;
+end.