character.pas 31 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867
  1. { Unicode "Character" properties handler.
  2. Copyright (c) 2012 by Inoussa OUEDRAOGO
  3. The source code is distributed under the Library GNU
  4. General Public License with the following modification:
  5. - object files and libraries linked into an application may be
  6. distributed without source code.
  7. If you didn't receive a copy of the file COPYING, contact:
  8. Free Software Foundation
  9. 675 Mass Ave
  10. Cambridge, MA 02139
  11. USA
  12. This program is distributed in the hope that it will be useful,
  13. but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. }
  15. {$IFNDEF FPC_DOTTEDUNITS}
  16. unit Character;
  17. {$ENDIF FPC_DOTTEDUNITS}
  18. interface
  19. {$mode objfpc}
  20. {$H+}
  21. {$PACKENUM 1}
  22. {$SCOPEDENUMS ON}
  23. {$IFDEF FPC_DOTTEDUNITS}
  24. uses
  25. System.CodePages.unicodedata;
  26. {$ELSE FPC_DOTTEDUNITS}
  27. uses
  28. unicodedata;
  29. {$ENDIF FPC_DOTTEDUNITS}
  30. type
  31. // Unicode General Category
  32. TUnicodeCategory = (
  33. ucUppercaseLetter, // Lu = Letter, uppercase
  34. ucLowercaseLetter, // Ll = Letter, lowercase
  35. ucTitlecaseLetter, // Lt = Letter, titlecase
  36. ucModifierLetter, // Lm = Letter, modifier
  37. ucOtherLetter, // Lo = Letter, other
  38. ucNonSpacingMark, // Mn = Mark, nonspacing
  39. ucCombiningMark, // Mc = Mark, spacing combining
  40. ucEnclosingMark, // Me = Mark, enclosing
  41. ucDecimalNumber, // Nd = Number, decimal digit
  42. ucLetterNumber, // Nl = Number, letter
  43. ucOtherNumber, // No = Number, other
  44. ucConnectPunctuation, // Pc = Punctuation, connector
  45. ucDashPunctuation, // Pd = Punctuation, dash
  46. ucOpenPunctuation, // Ps = Punctuation, open
  47. ucClosePunctuation, // Pe = Punctuation, close
  48. ucInitialPunctuation, // Pi = Punctuation, initial quote (may behave like Ps or Pe depending on usage)
  49. ucFinalPunctuation, // Pf = Punctuation, final quote (may behave like Ps or Pe depending on usage)
  50. ucOtherPunctuation, // Po = Punctuation, other
  51. ucMathSymbol, // Sm = Symbol, math
  52. ucCurrencySymbol, // Sc = Symbol, currency
  53. ucModifierSymbol, // Sk = Symbol, modifier
  54. ucOtherSymbol, // So = Symbol, other
  55. ucSpaceSeparator, // Zs = Separator, space
  56. ucLineSeparator, // Zl = Separator, line
  57. ucParagraphSeparator, // Zp = Separator, paragraph
  58. ucControl, // Cc = Other, control
  59. ucFormat, // Cf = Other, format
  60. ucSurrogate, // Cs = Other, surrogate
  61. ucPrivateUse, // Co = Other, private use
  62. ucUnassigned // Cn = Other, not assigned (including noncharacters)
  63. );
  64. TUnicodeCategorySet = set of TUnicodeCategory;
  65. TCharacterOption = (coIgnoreInvalidSequence);
  66. TCharacterOptions = set of TCharacterOption;
  67. { TCharacter }
  68. TCharacter = class sealed
  69. private
  70. class function TestCategory(const AString : UnicodeString; AIndex : Integer; ACategory : TUnicodeCategory) : Boolean; overload; static;
  71. class function TestCategory(const AString : UnicodeString; AIndex : Integer; ACategory : TUnicodeCategorySet) : Boolean; overload; static;
  72. public
  73. constructor Create;
  74. class function ConvertFromUtf32(AChar : UCS4Char) : UnicodeString; static;
  75. class function ConvertToUtf32(const AString : UnicodeString; AIndex : Integer) : UCS4Char; overload; static;
  76. class function ConvertToUtf32(const AString : UnicodeString; AIndex : Integer; out ACharLength : Integer) : UCS4Char; overload; static;
  77. class function ConvertToUtf32(const AHighSurrogate, ALowSurrogate : UnicodeChar) : UCS4Char; overload; static;
  78. class function GetNumericValue(AChar : UnicodeChar) : Double; static; overload;
  79. class function GetNumericValue(const AString : UnicodeString; AIndex : Integer) : Double; overload; static;
  80. class function GetUnicodeCategory(AChar : UnicodeChar) : TUnicodeCategory; overload; static; inline;
  81. class function GetUnicodeCategory(const AString : UnicodeString; AIndex : Integer) : TUnicodeCategory; overload; static;
  82. class function IsControl(AChar : UnicodeChar) : Boolean; overload; static; inline;
  83. class function IsControl(const AString : UnicodeString; AIndex : Integer) : Boolean; overload; static; inline;
  84. class function IsDigit(AChar : UnicodeChar) : Boolean; overload; static; inline;
  85. class function IsDigit(const AString : UnicodeString; AIndex : Integer) : Boolean; overload; static; inline;
  86. class function IsSurrogate(AChar : UnicodeChar) : Boolean; overload; static; inline;
  87. class function IsSurrogate(const AString : UnicodeString; AIndex : Integer) : Boolean; overload; static;
  88. class function IsHighSurrogate(AChar : UnicodeChar) : Boolean; overload; static; inline;
  89. class function IsHighSurrogate(const AString : UnicodeString; AIndex : Integer) : Boolean; overload; static;
  90. class function IsLowSurrogate(AChar : UnicodeChar) : Boolean; overload; static; inline;
  91. class function IsLowSurrogate(const AString : UnicodeString; AIndex : Integer) : Boolean; overload; static;
  92. class function IsSurrogatePair(const AHighSurrogate, ALowSurrogate : UnicodeChar) : Boolean; overload; static; inline;
  93. class function IsSurrogatePair(const AString : UnicodeString; AIndex : Integer) : Boolean; overload; static;
  94. class function IsLetter(AChar : UnicodeChar) : Boolean; overload; static; inline;
  95. class function IsLetter(const AString : UnicodeString; AIndex : Integer) : Boolean; overload; static; inline;
  96. class function IsLetterOrDigit(AChar : UnicodeChar) : Boolean; overload; static; inline;
  97. class function IsLetterOrDigit(const AString : UnicodeString; AIndex : Integer) : Boolean; overload; static; inline;
  98. class function IsLower(AChar : UnicodeChar) : Boolean; overload; static; inline;
  99. class function IsLower(const AString : UnicodeString; AIndex : Integer) : Boolean; overload; static; inline;
  100. class function IsNumber(AChar : UnicodeChar) : Boolean; overload; static; inline;
  101. class function IsNumber(const AString : UnicodeString; AIndex : Integer) : Boolean; overload; static;
  102. class function IsPunctuation(AChar : UnicodeChar) : Boolean; overload; static; inline;
  103. class function IsPunctuation(const AString : UnicodeString; AIndex : Integer) : Boolean; overload; static; inline;
  104. class function IsSeparator(AChar : UnicodeChar) : Boolean; overload; static; inline;
  105. class function IsSeparator(const AString : UnicodeString; AIndex : Integer) : Boolean; overload; static; inline;
  106. class function IsSymbol(AChar : UnicodeChar) : Boolean; overload; static; inline;
  107. class function IsSymbol(const AString : UnicodeString; AIndex : Integer) : Boolean; overload; static; inline;
  108. class function IsUpper(AChar : UnicodeChar) : Boolean; overload; static; inline;
  109. class function IsUpper(const AString : UnicodeString; AIndex : Integer) : Boolean; overload; static; inline;
  110. class function IsWhiteSpace(AChar : UnicodeChar) : Boolean; overload; static; inline;
  111. class function IsWhiteSpace(const AString : UnicodeString; AIndex : Integer) : Boolean; overload; static;
  112. class function ToLower(AChar : UnicodeChar) : UnicodeChar; overload; static;
  113. class function ToLower(const AString : UnicodeString) : UnicodeString; inline;overload; static;
  114. class function ToLower(const AString : UnicodeString; const AOptions : TCharacterOptions) : UnicodeString; overload; static;
  115. class function ToUpper(AChar : UnicodeChar) : UnicodeChar; overload; static;
  116. class function ToUpper(const AString : UnicodeString) : UnicodeString; inline; overload; static;
  117. class function ToUpper(const AString : UnicodeString; const AOptions : TCharacterOptions) : UnicodeString; overload; static;
  118. end;
  119. // flat functions
  120. function ConvertFromUtf32(AChar : UCS4Char) : UnicodeString;
  121. function ConvertToUtf32(const AString : UnicodeString; AIndex : Integer) : UCS4Char; overload;
  122. function ConvertToUtf32(const AString : UnicodeString; AIndex : Integer; out ACharLength : Integer) : UCS4Char; overload;
  123. function ConvertToUtf32(const AHighSurrogate, ALowSurrogate : UnicodeChar) : UCS4Char; overload;
  124. function GetNumericValue(AChar : UnicodeChar) : Double; overload;
  125. function GetNumericValue(const AString : UnicodeString; AIndex : Integer) : Double; overload;
  126. function GetUnicodeCategory(AChar : UnicodeChar) : TUnicodeCategory; overload;
  127. function GetUnicodeCategory(const AString : UnicodeString; AIndex : Integer) : TUnicodeCategory; overload;
  128. function IsControl(AChar : UnicodeChar) : Boolean; overload;
  129. function IsControl(const AString : UnicodeString; AIndex : Integer) : Boolean; overload;
  130. function IsDigit(AChar : UnicodeChar) : Boolean; overload;
  131. function IsDigit(const AString : UnicodeString; AIndex : Integer) : Boolean; overload;
  132. function IsSurrogate(AChar : UnicodeChar) : Boolean; overload;
  133. function IsSurrogate(const AString : UnicodeString; AIndex : Integer) : Boolean; overload;
  134. function IsHighSurrogate(AChar : UnicodeChar) : Boolean; overload;
  135. function IsHighSurrogate(const AString : UnicodeString; AIndex : Integer) : Boolean; overload;
  136. function IsLowSurrogate(AChar : UnicodeChar) : Boolean; overload;
  137. function IsLowSurrogate(const AString : UnicodeString; AIndex : Integer) : Boolean; overload;
  138. function IsSurrogatePair(const AHighSurrogate, ALowSurrogate : UnicodeChar) : Boolean; overload;
  139. function IsSurrogatePair(const AString : UnicodeString; AIndex : Integer) : Boolean; overload;
  140. function IsLetter(AChar : UnicodeChar) : Boolean; overload;
  141. function IsLetter(const AString : UnicodeString; AIndex : Integer) : Boolean; overload;
  142. function IsLetterOrDigit(AChar : UnicodeChar) : Boolean; overload;
  143. function IsLetterOrDigit(const AString : UnicodeString; AIndex : Integer) : Boolean; overload;
  144. function IsLower(AChar : UnicodeChar) : Boolean; overload;
  145. function IsLower(const AString : UnicodeString; AIndex : Integer) : Boolean; overload;
  146. function IsNumber(AChar : UnicodeChar) : Boolean; overload;
  147. function IsNumber(const AString : UnicodeString; AIndex : Integer) : Boolean; overload;
  148. function IsPunctuation(AChar : UnicodeChar) : Boolean; overload;
  149. function IsPunctuation(const AString : UnicodeString; AIndex : Integer) : Boolean; overload;
  150. function IsSeparator(AChar : UnicodeChar) : Boolean; overload;
  151. function IsSeparator(const AString : UnicodeString; AIndex : Integer) : Boolean; overload;
  152. function IsSymbol(AChar : UnicodeChar) : Boolean; overload;
  153. function IsSymbol(const AString : UnicodeString; AIndex : Integer) : Boolean; overload;
  154. function IsUpper(AChar : UnicodeChar) : Boolean; overload;
  155. function IsUpper(const AString : UnicodeString; AIndex : Integer) : Boolean; overload;
  156. function IsWhiteSpace(AChar : UnicodeChar) : Boolean; overload;
  157. function IsWhiteSpace(const AString : UnicodeString; AIndex : Integer) : Boolean; overload;
  158. function ToLower(AChar : UnicodeChar) : UnicodeChar; overload;
  159. function ToLower(const AString : UnicodeString) : UnicodeString; overload;
  160. function ToUpper(AChar : UnicodeChar) : UnicodeChar; overload;
  161. function ToUpper(const AString : UnicodeString) : UnicodeString; overload;
  162. implementation
  163. {$IFDEF FPC_DOTTEDUNITS}
  164. uses
  165. System.SysUtils,
  166. System.RtlConsts;
  167. {$ELSE FPC_DOTTEDUNITS}
  168. uses
  169. SysUtils,
  170. RtlConsts;
  171. {$ENDIF FPC_DOTTEDUNITS}
  172. const
  173. LETTER_CATEGORIES = [
  174. TUnicodeCategory.ucUppercaseLetter, TUnicodeCategory.ucLowercaseLetter,
  175. TUnicodeCategory.ucTitlecaseLetter, TUnicodeCategory.ucModifierLetter,
  176. TUnicodeCategory.ucOtherLetter
  177. ];
  178. LETTER_OR_DIGIT_CATEGORIES =
  179. LETTER_CATEGORIES +
  180. [TUnicodeCategory.ucDecimalNumber,TUnicodeCategory.ucLetterNumber];
  181. NUMBER_CATEGORIES =
  182. [ TUnicodeCategory.ucDecimalNumber, TUnicodeCategory.ucLetterNumber,
  183. TUnicodeCategory.ucOtherNumber
  184. ];
  185. PUNCTUATION_CATEGORIES = [
  186. TUnicodeCategory.ucConnectPunctuation, TUnicodeCategory.ucDashPunctuation,
  187. TUnicodeCategory.ucOpenPunctuation, TUnicodeCategory.ucClosePunctuation,
  188. TUnicodeCategory.ucInitialPunctuation, TUnicodeCategory.ucFinalPunctuation,
  189. TUnicodeCategory.ucOtherPunctuation
  190. ];
  191. SEPARATOR_CATEGORIES =
  192. [ TUnicodeCategory.ucSpaceSeparator, TUnicodeCategory.ucLineSeparator,
  193. TUnicodeCategory.ucParagraphSeparator
  194. ];
  195. SYMBOL_CATEGORIES =
  196. [ TUnicodeCategory.ucMathSymbol, TUnicodeCategory.ucCurrencySymbol,
  197. TUnicodeCategory.ucModifierSymbol, TUnicodeCategory.ucOtherSymbol
  198. ];
  199. function ConvertFromUtf32(AChar: UCS4Char): UnicodeString;
  200. begin
  201. Result := TCharacter.ConvertFromUtf32(AChar);
  202. end;
  203. function ConvertToUtf32(const AString: UnicodeString; AIndex: Integer): UCS4Char;
  204. begin
  205. Result := TCharacter.ConvertToUtf32(AString, AIndex);
  206. end;
  207. function ConvertToUtf32(const AString: UnicodeString; AIndex: Integer; out ACharLength: Integer): UCS4Char;
  208. begin
  209. Result := TCharacter.ConvertToUtf32(AString, AIndex, ACharLength);
  210. end;
  211. function ConvertToUtf32(const AHighSurrogate, ALowSurrogate: UnicodeChar): UCS4Char;
  212. begin
  213. Result := TCharacter.ConvertToUtf32(AHighSurrogate, ALowSurrogate);
  214. end;
  215. function GetNumericValue(AChar: UnicodeChar): Double;
  216. begin
  217. Result := TCharacter.GetNumericValue(AChar);
  218. end;
  219. function GetNumericValue(const AString: UnicodeString; AIndex: Integer): Double;
  220. begin
  221. Result := TCharacter.GetNumericValue(AString, AIndex);
  222. end;
  223. function GetUnicodeCategory(AChar: UnicodeChar): TUnicodeCategory;
  224. begin
  225. Result := TCharacter.GetUnicodeCategory(AChar);
  226. end;
  227. function GetUnicodeCategory(const AString: UnicodeString; AIndex: Integer): TUnicodeCategory;
  228. begin
  229. Result := TCharacter.GetUnicodeCategory(AString, AIndex);
  230. end;
  231. function IsControl(AChar: UnicodeChar): Boolean;
  232. begin
  233. Result := TCharacter.IsControl(AChar);
  234. end;
  235. function IsControl(const AString: UnicodeString; AIndex: Integer): Boolean;
  236. begin
  237. Result := TCharacter.IsControl(AString, AIndex);
  238. end;
  239. function IsDigit(AChar: UnicodeChar): Boolean;
  240. begin
  241. Result := TCharacter.IsDigit(AChar);
  242. end;
  243. function IsDigit(const AString: UnicodeString; AIndex: Integer): Boolean;
  244. begin
  245. Result := TCharacter.IsDigit(AString, AIndex);
  246. end;
  247. function IsSurrogate(AChar: UnicodeChar): Boolean;
  248. begin
  249. Result := TCharacter.IsSurrogate(AChar);
  250. end;
  251. function IsSurrogate(const AString: UnicodeString; AIndex: Integer): Boolean;
  252. begin
  253. Result := TCharacter.IsSurrogate(AString, AIndex);
  254. end;
  255. function IsHighSurrogate(AChar: UnicodeChar): Boolean;
  256. begin
  257. Result := TCharacter.IsHighSurrogate(AChar);
  258. end;
  259. function IsHighSurrogate(const AString: UnicodeString; AIndex: Integer): Boolean;
  260. begin
  261. Result := TCharacter.IsHighSurrogate(AString, AIndex);
  262. end;
  263. function IsLowSurrogate(AChar: UnicodeChar): Boolean;
  264. begin
  265. Result := TCharacter.IsLowSurrogate(AChar);
  266. end;
  267. function IsLowSurrogate(const AString: UnicodeString; AIndex: Integer): Boolean;
  268. begin
  269. Result := TCharacter.IsLowSurrogate(AString, AIndex);
  270. end;
  271. function IsSurrogatePair(const AHighSurrogate, ALowSurrogate: UnicodeChar): Boolean;
  272. begin
  273. Result := TCharacter.IsSurrogatePair(AHighSurrogate, ALowSurrogate);
  274. end;
  275. function IsSurrogatePair(const AString: UnicodeString; AIndex: Integer): Boolean;
  276. begin
  277. Result := TCharacter.IsSurrogatePair(AString, AIndex);
  278. end;
  279. function IsLetter(AChar: UnicodeChar): Boolean;
  280. begin
  281. Result := TCharacter.IsLetter(AChar);
  282. end;
  283. function IsLetter(const AString: UnicodeString; AIndex: Integer): Boolean;
  284. begin
  285. Result := TCharacter.IsLetter(AString, AIndex);
  286. end;
  287. function IsLetterOrDigit(AChar: UnicodeChar): Boolean;
  288. begin
  289. Result := TCharacter.IsLetterOrDigit(AChar);
  290. end;
  291. function IsLetterOrDigit(const AString: UnicodeString; AIndex: Integer): Boolean;
  292. begin
  293. Result := TCharacter.IsLetterOrDigit(AString, AIndex);
  294. end;
  295. function IsLower(AChar: UnicodeChar): Boolean;
  296. begin
  297. Result := TCharacter.IsLower(AChar);
  298. end;
  299. function IsLower(const AString: UnicodeString; AIndex: Integer): Boolean;
  300. begin
  301. Result := TCharacter.IsLower(AString, AIndex);
  302. end;
  303. function IsNumber(AChar: UnicodeChar): Boolean;
  304. begin
  305. Result := TCharacter.IsNumber(AChar);
  306. end;
  307. function IsNumber(const AString: UnicodeString; AIndex: Integer): Boolean;
  308. begin
  309. Result := TCharacter.IsNumber(AString, AIndex);
  310. end;
  311. function IsPunctuation(AChar: UnicodeChar): Boolean;
  312. begin
  313. Result := TCharacter.IsPunctuation(AChar);
  314. end;
  315. function IsPunctuation(const AString: UnicodeString; AIndex: Integer): Boolean;
  316. begin
  317. Result := TCharacter.IsPunctuation(AString, AIndex);
  318. end;
  319. function IsSeparator(AChar: UnicodeChar): Boolean;
  320. begin
  321. Result := TCharacter.IsSeparator(AChar);
  322. end;
  323. function IsSeparator(const AString: UnicodeString; AIndex: Integer): Boolean;
  324. begin
  325. Result := TCharacter.IsSeparator(AString, AIndex);
  326. end;
  327. function IsSymbol(AChar: UnicodeChar): Boolean;
  328. begin
  329. Result := TCharacter.IsSymbol(AChar);
  330. end;
  331. function IsSymbol(const AString: UnicodeString; AIndex: Integer): Boolean;
  332. begin
  333. Result := TCharacter.IsSymbol(AString, AIndex);
  334. end;
  335. function IsUpper(AChar: UnicodeChar): Boolean;
  336. begin
  337. Result := TCharacter.IsUpper(AChar);
  338. end;
  339. function IsUpper(const AString: UnicodeString; AIndex: Integer): Boolean;
  340. begin
  341. Result := TCharacter.IsUpper(AString, AIndex);
  342. end;
  343. function IsWhiteSpace(AChar: UnicodeChar): Boolean;
  344. begin
  345. Result := TCharacter.IsWhiteSpace(AChar);
  346. end;
  347. function IsWhiteSpace(const AString: UnicodeString; AIndex: Integer): Boolean;
  348. begin
  349. Result := TCharacter.IsWhiteSpace(AString, AIndex);
  350. end;
  351. function ToLower(AChar: UnicodeChar): UnicodeChar;
  352. begin
  353. Result := TCharacter.ToLower(AChar);
  354. end;
  355. function ToLower(const AString: UnicodeString): UnicodeString;
  356. begin
  357. Result := TCharacter.ToLower(AString);
  358. end;
  359. function ToUpper(AChar: UnicodeChar): UnicodeChar;
  360. begin
  361. Result := TCharacter.ToUpper(AChar);
  362. end;
  363. function ToUpper(const AString: UnicodeString): UnicodeString;
  364. begin
  365. Result := TCharacter.ToUpper(AString);
  366. end;
  367. { TCharacter }
  368. class function TCharacter.TestCategory(
  369. const AString : UnicodeString;
  370. AIndex : Integer;
  371. ACategory : TUnicodeCategory
  372. ) : Boolean;
  373. var
  374. pu : PUC_Prop;
  375. begin
  376. if (AIndex < 1) or (AIndex > Length(AString)) then
  377. raise EArgumentOutOfRangeException.CreateFmt(SStringIndexOutOfRange, [AIndex, Length(AString)]);
  378. pu := GetProps(Word(AString[AIndex]));
  379. if (TUnicodeCategory(pu^.Category) = TUnicodeCategory.ucSurrogate) then begin
  380. if not IsSurrogatePair(AString,AIndex) then
  381. raise EArgumentException.Create(SInvalidUnicodeCodePointSequence);
  382. pu := GetProps(AString[AIndex],AString[AIndex+1]);
  383. end;
  384. Result := (TUnicodeCategory(pu^.Category) = ACategory);
  385. end;
  386. class function TCharacter.TestCategory(
  387. const AString : UnicodeString;
  388. AIndex : Integer;
  389. ACategory : TUnicodeCategorySet
  390. ) : Boolean;
  391. var
  392. pu : PUC_Prop;
  393. begin
  394. if (AIndex < 1) or (AIndex > Length(AString)) then
  395. raise EArgumentOutOfRangeException.CreateFmt(SStringIndexOutOfRange, [AIndex, Length(AString)]);
  396. pu := GetProps(Word(AString[AIndex]));
  397. if (TUnicodeCategory(pu^.Category) = TUnicodeCategory.ucSurrogate) then begin
  398. if not IsSurrogatePair(AString,AIndex) then
  399. raise EArgumentException.Create(SInvalidUnicodeCodePointSequence);
  400. pu := GetProps(AString[AIndex],AString[AIndex+1]);
  401. end;
  402. Result := (TUnicodeCategory(pu^.Category) in ACategory);
  403. end;
  404. constructor TCharacter.Create;
  405. begin
  406. raise ENoConstructException.CreateFmt(SClassCantBeConstructed, [ClassName]);
  407. end;
  408. class function TCharacter.ConvertFromUtf32(AChar : UCS4Char) : UnicodeString;
  409. begin
  410. if AChar < UCS4_HALF_BASE then
  411. begin
  412. if IsSurrogate(UnicodeChar(AChar)) then
  413. raise EArgumentOutOfRangeException.CreateFmt(SInvalidUTF32Char, [AChar]);
  414. Result := UnicodeChar(AChar);
  415. end
  416. else
  417. begin
  418. if AChar > MAX_LEGAL_UTF32 then
  419. raise EArgumentOutOfRangeException.CreateFmt(SInvalidUTF32Char, [AChar]);
  420. SetLength(Result, 2);
  421. AChar := AChar - UCS4_HALF_BASE;
  422. Result[1] := UnicodeChar((AChar shr 10) + HIGH_SURROGATE_BEGIN);
  423. Result[2] := UnicodeChar((AChar and UCS4_HALF_MASK) + LOW_SURROGATE_BEGIN);
  424. end;
  425. end;
  426. class function TCharacter.ConvertToUtf32(const AString : UnicodeString; AIndex : Integer) : UCS4Char; overload;
  427. begin
  428. if (AIndex < 1) or (AIndex > Length(AString)) then
  429. raise EArgumentOutOfRangeException.CreateFmt(SStringIndexOutOfRange, [AIndex, Length(AString)]);
  430. Result := Word(AString[AIndex]);
  431. if IsHighSurrogate(UnicodeChar(Result)) then
  432. begin
  433. if Length(AString) < Succ(AIndex) then
  434. raise EArgumentException.CreateFmt(SInvalidHighSurrogate, [AIndex]);
  435. Result := ConvertToUtf32(UnicodeChar(Result), AString[Succ(AIndex)]);
  436. end;
  437. end;
  438. class function TCharacter.ConvertToUtf32(const AString : UnicodeString; AIndex : Integer; out ACharLength : Integer) : UCS4Char; overload;
  439. begin
  440. if (AIndex < 1) or (AIndex > Length(AString)) then
  441. raise EArgumentOutOfRangeException.CreateFmt(SStringIndexOutOfRange, [AIndex, Length(AString)]);
  442. Result := Word(AString[AIndex]);
  443. if IsHighSurrogate(UnicodeChar(Result)) then
  444. begin
  445. if Length(AString) < Succ(AIndex) then
  446. raise EArgumentException.CreateFmt(SInvalidHighSurrogate, [AIndex]);
  447. Result := ConvertToUtf32(UnicodeChar(Result), AString[Succ(AIndex)]);
  448. ACharLength := 2;
  449. end
  450. else
  451. ACharLength := 1;
  452. end;
  453. class function TCharacter.ConvertToUtf32(const AHighSurrogate, ALowSurrogate : UnicodeChar) : UCS4Char; overload;
  454. begin
  455. if not IsHighSurrogate(AHighSurrogate) then
  456. raise EArgumentOutOfRangeException.CreateFmt(SHighSurrogateOutOfRange, [Word(AHighSurrogate)]);
  457. if not IsLowSurrogate(ALowSurrogate) then
  458. raise EArgumentOutOfRangeException.CreateFmt(SLowSurrogateOutOfRange, [Word(ALowSurrogate)]);
  459. Result := ToUCS4(AHighSurrogate, ALowSurrogate);
  460. end;
  461. class function TCharacter.GetNumericValue(AChar : UnicodeChar) : Double;
  462. begin
  463. Result := GetProps(Word(AChar))^.NumericValue;
  464. end;
  465. class function TCharacter.GetNumericValue(
  466. const AString : UnicodeString;
  467. AIndex : Integer
  468. ) : Double;
  469. var
  470. pu : PUC_Prop;
  471. begin
  472. if (AIndex < 1) or (AIndex > Length(AString)) then
  473. raise EArgumentOutOfRangeException.CreateFmt(SStringIndexOutOfRange, [AIndex, Length(AString)]);
  474. pu := GetProps(Word(AString[AIndex]));
  475. if (TUnicodeCategory(pu^.Category) = TUnicodeCategory.ucSurrogate) then begin
  476. if not IsSurrogatePair(AString,AIndex) then
  477. raise EArgumentException.Create(SInvalidUnicodeCodePointSequence);
  478. pu := GetProps(AString[AIndex],AString[AIndex+1]);
  479. end;
  480. Result := pu^.NumericValue;
  481. end;
  482. class function TCharacter.GetUnicodeCategory(AChar : UnicodeChar) : TUnicodeCategory;
  483. begin
  484. Result := TUnicodeCategory(GetProps(Word(AChar))^.Category);
  485. end;
  486. class function TCharacter.GetUnicodeCategory(
  487. const AString : UnicodeString;
  488. AIndex : Integer
  489. ) : TUnicodeCategory;
  490. var
  491. pu : PUC_Prop;
  492. begin
  493. if (AIndex < 1) or (AIndex > Length(AString)) then
  494. raise EArgumentOutOfRangeException.CreateFmt(SStringIndexOutOfRange, [AIndex, Length(AString)]);
  495. pu := GetProps(Word(AString[AIndex]));
  496. if (TUnicodeCategory(pu^.Category) = TUnicodeCategory.ucSurrogate) then begin
  497. if not IsSurrogatePair(AString,AIndex) then
  498. raise EArgumentException.Create(SInvalidUnicodeCodePointSequence);
  499. pu := GetProps(AString[AIndex],AString[AIndex+1]);
  500. end;
  501. Result := TUnicodeCategory(pu^.Category);
  502. end;
  503. class function TCharacter.IsControl(AChar : UnicodeChar) : Boolean;
  504. begin
  505. Result := (TUnicodeCategory(GetProps(Word(AChar))^.Category) = TUnicodeCategory.ucControl);
  506. end;
  507. class function TCharacter.IsControl(
  508. const AString : UnicodeString;
  509. AIndex : Integer
  510. ) : Boolean;
  511. begin
  512. Result := TestCategory(AString,AIndex,TUnicodeCategory.ucControl);
  513. end;
  514. class function TCharacter.IsDigit(AChar : UnicodeChar) : Boolean;
  515. begin
  516. Result := (TUnicodeCategory(GetProps(Word(AChar))^.Category) = TUnicodeCategory.ucDecimalNumber);
  517. end;
  518. class function TCharacter.IsDigit(
  519. const AString : UnicodeString;
  520. AIndex : Integer
  521. ) : Boolean;
  522. begin
  523. Result := TestCategory(AString,AIndex,TUnicodeCategory.ucDecimalNumber);
  524. end;
  525. class function TCharacter.IsSurrogate(AChar : UnicodeChar) : Boolean;
  526. begin
  527. Result := (TUnicodeCategory(GetProps(Word(AChar))^.Category) = TUnicodeCategory.ucSurrogate);
  528. end;
  529. class function TCharacter.IsSurrogate(
  530. const AString : UnicodeString;
  531. AIndex : Integer
  532. ) : Boolean;
  533. begin
  534. if (AIndex < 1) or (AIndex > Length(AString)) then
  535. raise EArgumentOutOfRangeException.CreateFmt(SStringIndexOutOfRange, [AIndex, Length(AString)]);
  536. Result := IsSurrogate(AString[AIndex]);
  537. end;
  538. class function TCharacter.IsHighSurrogate(AChar : UnicodeChar) : Boolean;
  539. begin
  540. Result := (TUnicodeCategory(GetProps(Word(AChar))^.Category) = TUnicodeCategory.ucSurrogate) and
  541. (Word(AChar) >= HIGH_SURROGATE_BEGIN) and
  542. (Word(AChar) <= HIGH_SURROGATE_END);
  543. end;
  544. class function TCharacter.IsHighSurrogate(
  545. const AString : UnicodeString;
  546. AIndex : Integer
  547. ) : Boolean;
  548. begin
  549. if (AIndex < 1) or (AIndex > Length(AString)) then
  550. raise EArgumentOutOfRangeException.CreateFmt(SStringIndexOutOfRange, [AIndex, Length(AString)]);
  551. Result := IsHighSurrogate(AString[AIndex]);
  552. end;
  553. class function TCharacter.IsLowSurrogate(AChar : UnicodeChar) : Boolean;
  554. begin
  555. Result := (TUnicodeCategory(GetProps(Word(AChar))^.Category) = TUnicodeCategory.ucSurrogate) and
  556. (Word(AChar) >= LOW_SURROGATE_BEGIN) and
  557. (Word(AChar) <= LOW_SURROGATE_END);
  558. end;
  559. class function TCharacter.IsLowSurrogate(
  560. const AString : UnicodeString;
  561. AIndex : Integer
  562. ) : Boolean;
  563. begin
  564. if (AIndex < 1) or (AIndex > Length(AString)) then
  565. raise EArgumentOutOfRangeException.CreateFmt(SStringIndexOutOfRange, [AIndex, Length(AString)]);
  566. Result := IsLowSurrogate(AString[AIndex]);
  567. end;
  568. class function TCharacter.IsSurrogatePair(
  569. const AHighSurrogate,
  570. ALowSurrogate : UnicodeChar
  571. ) : Boolean;
  572. begin
  573. Result := UnicodeIsSurrogatePair(AHighSurrogate,ALowSurrogate);
  574. end;
  575. class function TCharacter.IsSurrogatePair(
  576. const AString : UnicodeString;
  577. AIndex : Integer
  578. ) : Boolean;
  579. begin
  580. if (AIndex < 1) or (AIndex > Length(AString)) then
  581. raise EArgumentOutOfRangeException.CreateFmt(SStringIndexOutOfRange, [AIndex, Length(AString)]);
  582. if not IsHighSurrogate(AString[AIndex]) then begin
  583. Result := False;
  584. exit;
  585. end;
  586. if ((AIndex+1) > Length(AString)) then
  587. raise EArgumentOutOfRangeException.CreateFmt(SStringIndexOutOfRange, [AIndex+1, Length(AString)]);
  588. Result := IsSurrogatePair(AString[AIndex],AString[AIndex+1]);
  589. end;
  590. class function TCharacter.IsLetter(AChar : UnicodeChar) : Boolean;
  591. begin
  592. Result := (TUnicodeCategory(GetProps(Word(AChar))^.Category) in LETTER_CATEGORIES);
  593. end;
  594. class function TCharacter.IsLetter(
  595. const AString : UnicodeString;
  596. AIndex : Integer
  597. ) : Boolean;
  598. begin
  599. Result := TestCategory(AString,AIndex,LETTER_CATEGORIES);
  600. end;
  601. class function TCharacter.IsLetterOrDigit(AChar : UnicodeChar) : Boolean;
  602. begin
  603. Result := (TUnicodeCategory(GetProps(Word(AChar))^.Category) in LETTER_OR_DIGIT_CATEGORIES);
  604. end;
  605. class function TCharacter.IsLetterOrDigit(
  606. const AString : UnicodeString;
  607. AIndex : Integer
  608. ) : Boolean;
  609. begin
  610. Result := TestCategory(AString,AIndex,LETTER_OR_DIGIT_CATEGORIES);
  611. end;
  612. class function TCharacter.IsLower(AChar : UnicodeChar) : Boolean;
  613. begin
  614. Result := (TUnicodeCategory(GetProps(Word(AChar))^.Category) = TUnicodeCategory.ucLowercaseLetter);
  615. end;
  616. class function TCharacter.IsLower(
  617. const AString : UnicodeString;
  618. AIndex : Integer
  619. ) : Boolean;
  620. begin
  621. Result := TestCategory(AString,AIndex,TUnicodeCategory.ucLowercaseLetter);
  622. end;
  623. class function TCharacter.IsNumber(AChar : UnicodeChar) : Boolean;
  624. begin
  625. Result := (TUnicodeCategory(GetProps(Word(AChar))^.Category) in NUMBER_CATEGORIES);
  626. end;
  627. class function TCharacter.IsNumber(
  628. const AString : UnicodeString;
  629. AIndex : Integer
  630. ) : Boolean;
  631. begin
  632. Result := TestCategory(AString,AIndex,NUMBER_CATEGORIES);
  633. end;
  634. class function TCharacter.IsPunctuation(AChar : UnicodeChar) : Boolean;
  635. begin
  636. Result := (TUnicodeCategory(GetProps(Word(AChar))^.Category) in PUNCTUATION_CATEGORIES);
  637. end;
  638. class function TCharacter.IsPunctuation(
  639. const AString : UnicodeString;
  640. AIndex : Integer
  641. ) : Boolean;
  642. begin
  643. Result := TestCategory(AString,AIndex,PUNCTUATION_CATEGORIES);
  644. end;
  645. class function TCharacter.IsSeparator(AChar: UnicodeChar): Boolean;
  646. begin
  647. Result := (TUnicodeCategory(GetProps(Word(AChar))^.Category) in SEPARATOR_CATEGORIES);
  648. end;
  649. class function TCharacter.IsSeparator(
  650. const AString : UnicodeString;
  651. AIndex : Integer
  652. ) : Boolean;
  653. begin
  654. Result := TestCategory(AString,AIndex,SEPARATOR_CATEGORIES);
  655. end;
  656. class function TCharacter.IsSymbol(AChar: UnicodeChar): Boolean;
  657. begin
  658. Result := (TUnicodeCategory(GetProps(Word(AChar))^.Category) in SYMBOL_CATEGORIES);
  659. end;
  660. class function TCharacter.IsSymbol(
  661. const AString : UnicodeString;
  662. AIndex : Integer
  663. ) : Boolean;
  664. begin
  665. Result := TestCategory(AString,AIndex,SYMBOL_CATEGORIES);
  666. end;
  667. class function TCharacter.IsUpper(AChar : UnicodeChar) : Boolean;
  668. begin
  669. Result := (TUnicodeCategory(GetProps(Word(AChar))^.Category) = TUnicodeCategory.ucUppercaseLetter);
  670. end;
  671. class function TCharacter.IsUpper(
  672. const AString : UnicodeString;
  673. AIndex : Integer
  674. ) : Boolean;
  675. begin
  676. Result := TestCategory(AString,AIndex,TUnicodeCategory.ucUppercaseLetter);
  677. end;
  678. class function TCharacter.IsWhiteSpace(AChar : UnicodeChar) : Boolean;
  679. begin
  680. Result := GetProps(Word(AChar))^.WhiteSpace;
  681. end;
  682. class function TCharacter.IsWhiteSpace(
  683. const AString : UnicodeString;
  684. AIndex : Integer
  685. ) : Boolean;
  686. var
  687. pu : PUC_Prop;
  688. begin
  689. if (AIndex < 1) or (AIndex > Length(AString)) then
  690. raise EArgumentOutOfRangeException.CreateFmt(SStringIndexOutOfRange, [AIndex, Length(AString)]);
  691. pu := GetProps(Word(AString[AIndex]));
  692. if (TUnicodeCategory(pu^.Category) = TUnicodeCategory.ucSurrogate) then begin
  693. if not IsSurrogatePair(AString,AIndex) then
  694. raise EArgumentException.Create(SInvalidUnicodeCodePointSequence);
  695. pu := GetProps(AString[AIndex],AString[AIndex+1]);
  696. end;
  697. Result := pu^.WhiteSpace;
  698. end;
  699. class function TCharacter.ToLower(AChar : UnicodeChar) : UnicodeChar;
  700. begin
  701. Result := UnicodeChar(Word(GetProps(Word(AChar))^.SimpleLowerCase));
  702. if (Result = UnicodeChar(0)) then
  703. Result := AChar;
  704. end;
  705. class function TCharacter.ToLower(const AString : UnicodeString) : UnicodeString;
  706. begin
  707. Result := ToLower(AString,[]);
  708. end;
  709. class function TCharacter.ToLower(const AString : UnicodeString; const AOptions : TCharacterOptions) : UnicodeString;
  710. begin
  711. if (UnicodeToLower(
  712. AString,(TCharacterOption.coIgnoreInvalidSequence in AOptions),Result
  713. ) <> 0
  714. )
  715. then
  716. raise EArgumentException.Create(SInvalidUnicodeCodePointSequence);
  717. end;
  718. class function TCharacter.ToUpper(AChar : UnicodeChar) : UnicodeChar;
  719. begin
  720. Result := UnicodeChar(Word(GetProps(Word(AChar))^.SimpleUpperCase));
  721. if (Result = UnicodeChar(0)) then
  722. Result := AChar;
  723. end;
  724. class function TCharacter.ToUpper(const AString : UnicodeString) : UnicodeString;
  725. begin
  726. Result := ToUpper(AString,[]);
  727. end;
  728. class function TCharacter.ToUpper(const AString : UnicodeString; const AOptions : TCharacterOptions) : UnicodeString;
  729. begin
  730. if (UnicodeToUpper(
  731. AString,(TCharacterOption.coIgnoreInvalidSequence in AOptions),Result
  732. ) <> 0
  733. )
  734. then
  735. raise EArgumentException.Create(SInvalidUnicodeCodePointSequence);
  736. end;
  737. end.