widestrutils.pp 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217
  1. {
  2. Delphi/Kylix compatibility unit: String handling routines.
  3. This file is part of the Free Pascal run time library.
  4. Copyright (c) 1999-2005 by the Free Pascal development team
  5. See the file COPYING.FPC, included in this distribution,
  6. for details about the copyright.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  10. **********************************************************************}
  11. unit widestrutils;
  12. {$mode objfpc}
  13. {$H+}
  14. {$inline on}
  15. interface
  16. uses
  17. SysUtils, Classes;
  18. function WideStringReplace(const S, OldPattern, NewPattern: WideString; Flags: TReplaceFlags): WideString;
  19. function WideReplaceStr(const AText, AFromText, AToText: WideString): WideString; inline;
  20. function WideReplaceText(const AText, AFromText, AToText: WideString): WideString; inline;
  21. function UnicodeStringReplace(const S, OldPattern, NewPattern: UnicodeString; Flags: TReplaceFlags): UnicodeString;
  22. function UnicodeReplaceStr(const AText, AFromText, AToText: UnicodeString): UnicodeString; inline;
  23. function UnicodeReplaceText(const AText, AFromText, AToText: UnicodeString): UnicodeString; inline;
  24. type
  25. TEncodeType = (etUSASCII, etUTF8, etANSI);
  26. const
  27. sUTF8BOMString: array[1..3] of AnsiChar = (#$EF, #$BB, #$BF);
  28. function HasUTF8BOM(S: TStream): boolean; overload;
  29. function HasUTF8BOM(const S: RawByteString): boolean; overload;
  30. function HasExtendCharacter(const S: RawByteString): boolean;
  31. function DetectUTF8Encoding(const S: RawByteString): TEncodeType;
  32. function IsUTF8String(const S: RawByteString): boolean;
  33. type
  34. TBufferUTF8State = (u8sUnknown, u8sYes, u8sNo);
  35. //PartialAllowed must be set to true if the buffer is smaller than the file.
  36. function IsBufferUTF8(buf: PAnsiChar; bufSize: SizeInt; PartialAllowed: boolean): TBufferUTF8State;
  37. implementation
  38. {
  39. The IsBufferUtf8 function code was created by Christian Ghisler (ghisler.com)
  40. Christian gave code to open-source at Total Commander public forum
  41. }
  42. const bytesFromUTF8:array[AnsiChar] of byte = (
  43. 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 32
  44. 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 64
  45. 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 96
  46. 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //128
  47. 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //160
  48. 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //192
  49. 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, //224
  50. 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5); //256
  51. function IsFirstUTF8Char(thechar:AnsiChar):boolean; inline;
  52. {The remaining bytes in a multi-byte sequence have 10 as their two most significant bits.}
  53. begin
  54. result:=(byte(thechar) and (128+64))<>128;
  55. end;
  56. function IsSecondaryUTF8Char(thechar:AnsiChar):boolean; inline;
  57. {The remaining bytes in a multi-byte sequence have 10 as their two most significant bits.}
  58. begin
  59. result:=(byte(thechar) and (128+64))=128;
  60. end;
  61. function IsBufferUTF8(buf: PAnsiChar; bufSize: SizeInt; PartialAllowed: boolean): TBufferUTF8State;
  62. {Buffer contains only valid UTF-8 characters, no secondary alone,
  63. no primary without the correct nr of secondary}
  64. var
  65. p: PAnsiChar;
  66. i: SizeInt;
  67. utf8bytes: integer;
  68. hadutf8bytes: boolean;
  69. begin
  70. p:=buf;
  71. hadutf8bytes:=false;
  72. result:=u8sUnknown;
  73. utf8bytes:=0;
  74. for i:= 1 to bufSize do
  75. begin
  76. if utf8bytes>0 then
  77. begin {Expecting secondary AnsiChar}
  78. hadutf8bytes:=true;
  79. if not IsSecondaryUTF8Char(p^) then exit(u8sNo); {Fail!}
  80. dec(utf8bytes);
  81. end
  82. else
  83. if IsFirstUTF8Char(p^) then
  84. utf8bytes:=bytesFromUTF8[p^]
  85. else
  86. if IsSecondaryUTF8Char(p^) then
  87. exit(u8sNo); {Fail!}
  88. inc(p);
  89. end;
  90. if hadutf8bytes and (PartialAllowed or (utf8bytes=0)) then
  91. result:=u8sYes;
  92. end;
  93. function WideReplaceStr(const AText, AFromText, AToText: WideString): WideString; inline;
  94. begin
  95. Result := WideStringReplace(AText, AFromText, AToText, [rfReplaceAll]);
  96. end;
  97. function WideReplaceText(const AText, AFromText, AToText: WideString): WideString; inline;
  98. begin
  99. Result := WideStringReplace(AText, AFromText, AToText, [rfReplaceAll, rfIgnoreCase]);
  100. end;
  101. function UnicodeReplaceStr(const AText, AFromText, AToText: UnicodeString): UnicodeString; inline;
  102. begin
  103. Result := UnicodeStringReplace(AText, AFromText, AToText, [rfReplaceAll]);
  104. end;
  105. function UnicodeReplaceText(const AText, AFromText, AToText: UnicodeString): UnicodeString; inline;
  106. begin
  107. Result := UnicodeStringReplace(AText, AFromText, AToText, [rfReplaceAll, rfIgnoreCase]);
  108. end;
  109. Function WideStringReplace(const S, OldPattern, NewPattern: WideString; Flags: TReplaceFlags): WideString;
  110. begin
  111. Result:= sysutils.WideStringReplace(S,OldPattern,NewPattern,Flags);
  112. end;
  113. Function UnicodeStringReplace(const S, OldPattern, NewPattern: UnicodeString; Flags: TReplaceFlags): UnicodeString;
  114. begin
  115. Result:= sysutils.UnicodeStringReplace(S,OldPattern,NewPattern,Flags);
  116. end;
  117. function HasUTF8BOM(S: TStream): boolean;
  118. var
  119. OldPos: Int64;
  120. Buf: array[1..3] of AnsiChar;
  121. begin
  122. Result := false;
  123. if S.Size<3 then exit;
  124. FillChar(Buf, SizeOf(Buf), 0);
  125. try
  126. OldPos := S.Position;
  127. S.Position := 0;
  128. if S.Read(Buf, 3)<>3 then exit;
  129. Result :=
  130. (Buf[1]=sUTF8BOMString[1]) and
  131. (Buf[2]=sUTF8BOMString[2]) and
  132. (Buf[3]=sUTF8BOMString[3]);
  133. finally
  134. S.Position := OldPos;
  135. end;
  136. end;
  137. function HasUTF8BOM(const S: RawByteString): boolean;
  138. begin
  139. Result := (Length(S)>=3) and
  140. (S[1]=sUTF8BOMString[1]) and
  141. (S[2]=sUTF8BOMString[2]) and
  142. (S[3]=sUTF8BOMString[3]);
  143. end;
  144. function HasExtendCharacter(const S: RawByteString): boolean;
  145. var
  146. i: integer;
  147. begin
  148. for i := 1 to Length(S) do
  149. if Ord(S[i])>=$80 then
  150. begin
  151. Result := true;
  152. exit;
  153. end;
  154. Result := false;
  155. end;
  156. function DetectUTF8Encoding(const S: RawByteString): TEncodeType;
  157. var
  158. FirstExtChar, i: integer;
  159. begin
  160. FirstExtChar := 0;
  161. for i := 1 to Length(S) do
  162. if Ord(S[i])>=$80 then
  163. begin
  164. FirstExtChar := i;
  165. Break;
  166. end;
  167. if FirstExtChar=0 then
  168. Result := etUSASCII
  169. else
  170. if IsBufferUtf8(@S[FirstExtChar], Length(S)-FirstExtChar+1, false)=u8sYes then
  171. Result := etUTF8
  172. else
  173. Result := etANSI;
  174. end;
  175. function IsUTF8String(const S: RawByteString): boolean;
  176. begin
  177. Result := DetectUTF8Encoding(S) = etUTF8;
  178. end;
  179. end.