character.pas 31 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854
  1. { Unicode "Character" properties handler.
  2. Copyright (c) 2012 by Inoussa OUEDRAOGO
  3. The source code is distributed under the Library GNU
  4. General Public License with the following modification:
  5. - object files and libraries linked into an application may be
  6. distributed without source code.
  7. If you didn't receive a copy of the file COPYING, contact:
  8. Free Software Foundation
  9. 675 Mass Ave
  10. Cambridge, MA 02139
  11. USA
  12. This program is distributed in the hope that it will be useful,
  13. but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. }
  15. unit Character;
  16. interface
  17. {$mode objfpc}
  18. {$H+}
  19. {$PACKENUM 1}
  20. {$SCOPEDENUMS ON}
  21. uses
  22. unicodedata;
  23. type
  24. // Unicode General Category
  25. TUnicodeCategory = (
  26. ucUppercaseLetter, // Lu = Letter, uppercase
  27. ucLowercaseLetter, // Ll = Letter, lowercase
  28. ucTitlecaseLetter, // Lt = Letter, titlecase
  29. ucModifierLetter, // Lm = Letter, modifier
  30. ucOtherLetter, // Lo = Letter, other
  31. ucNonSpacingMark, // Mn = Mark, nonspacing
  32. ucCombiningMark, // Mc = Mark, spacing combining
  33. ucEnclosingMark, // Me = Mark, enclosing
  34. ucDecimalNumber, // Nd = Number, decimal digit
  35. ucLetterNumber, // Nl = Number, letter
  36. ucOtherNumber, // No = Number, other
  37. ucConnectPunctuation, // Pc = Punctuation, connector
  38. ucDashPunctuation, // Pd = Punctuation, dash
  39. ucOpenPunctuation, // Ps = Punctuation, open
  40. ucClosePunctuation, // Pe = Punctuation, close
  41. ucInitialPunctuation, // Pi = Punctuation, initial quote (may behave like Ps or Pe depending on usage)
  42. ucFinalPunctuation, // Pf = Punctuation, final quote (may behave like Ps or Pe depending on usage)
  43. ucOtherPunctuation, // Po = Punctuation, other
  44. ucMathSymbol, // Sm = Symbol, math
  45. ucCurrencySymbol, // Sc = Symbol, currency
  46. ucModifierSymbol, // Sk = Symbol, modifier
  47. ucOtherSymbol, // So = Symbol, other
  48. ucSpaceSeparator, // Zs = Separator, space
  49. ucLineSeparator, // Zl = Separator, line
  50. ucParagraphSeparator, // Zp = Separator, paragraph
  51. ucControl, // Cc = Other, control
  52. ucFormat, // Cf = Other, format
  53. ucSurrogate, // Cs = Other, surrogate
  54. ucPrivateUse, // Co = Other, private use
  55. ucUnassigned // Cn = Other, not assigned (including noncharacters)
  56. );
  57. TUnicodeCategorySet = set of TUnicodeCategory;
  58. TCharacterOption = (coIgnoreInvalidSequence);
  59. TCharacterOptions = set of TCharacterOption;
  60. { TCharacter }
  61. TCharacter = class sealed
  62. private
  63. class function TestCategory(const AString : UnicodeString; AIndex : Integer; ACategory : TUnicodeCategory) : Boolean; overload; static;
  64. class function TestCategory(const AString : UnicodeString; AIndex : Integer; ACategory : TUnicodeCategorySet) : Boolean; overload; static;
  65. public
  66. constructor Create;
  67. class function ConvertFromUtf32(AChar : UCS4Char) : UnicodeString; static;
  68. class function ConvertToUtf32(const AString : UnicodeString; AIndex : Integer) : UCS4Char; overload; static;
  69. class function ConvertToUtf32(const AString : UnicodeString; AIndex : Integer; out ACharLength : Integer) : UCS4Char; overload; static;
  70. class function ConvertToUtf32(const AHighSurrogate, ALowSurrogate : UnicodeChar) : UCS4Char; overload; static;
  71. class function GetNumericValue(AChar : UnicodeChar) : Double; static; overload;
  72. class function GetNumericValue(const AString : UnicodeString; AIndex : Integer) : Double; overload; static;
  73. class function GetUnicodeCategory(AChar : UnicodeChar) : TUnicodeCategory; overload; static; inline;
  74. class function GetUnicodeCategory(const AString : UnicodeString; AIndex : Integer) : TUnicodeCategory; overload; static;
  75. class function IsControl(AChar : UnicodeChar) : Boolean; overload; static; inline;
  76. class function IsControl(const AString : UnicodeString; AIndex : Integer) : Boolean; overload; static; inline;
  77. class function IsDigit(AChar : UnicodeChar) : Boolean; overload; static; inline;
  78. class function IsDigit(const AString : UnicodeString; AIndex : Integer) : Boolean; overload; static; inline;
  79. class function IsSurrogate(AChar : UnicodeChar) : Boolean; overload; static; inline;
  80. class function IsSurrogate(const AString : UnicodeString; AIndex : Integer) : Boolean; overload; static;
  81. class function IsHighSurrogate(AChar : UnicodeChar) : Boolean; overload; static; inline;
  82. class function IsHighSurrogate(const AString : UnicodeString; AIndex : Integer) : Boolean; overload; static;
  83. class function IsLowSurrogate(AChar : UnicodeChar) : Boolean; overload; static; inline;
  84. class function IsLowSurrogate(const AString : UnicodeString; AIndex : Integer) : Boolean; overload; static;
  85. class function IsSurrogatePair(const AHighSurrogate, ALowSurrogate : UnicodeChar) : Boolean; overload; static; inline;
  86. class function IsSurrogatePair(const AString : UnicodeString; AIndex : Integer) : Boolean; overload; static;
  87. class function IsLetter(AChar : UnicodeChar) : Boolean; overload; static; inline;
  88. class function IsLetter(const AString : UnicodeString; AIndex : Integer) : Boolean; overload; static; inline;
  89. class function IsLetterOrDigit(AChar : UnicodeChar) : Boolean; overload; static; inline;
  90. class function IsLetterOrDigit(const AString : UnicodeString; AIndex : Integer) : Boolean; overload; static; inline;
  91. class function IsLower(AChar : UnicodeChar) : Boolean; overload; static; inline;
  92. class function IsLower(const AString : UnicodeString; AIndex : Integer) : Boolean; overload; static; inline;
  93. class function IsNumber(AChar : UnicodeChar) : Boolean; overload; static; inline;
  94. class function IsNumber(const AString : UnicodeString; AIndex : Integer) : Boolean; overload; static;
  95. class function IsPunctuation(AChar : UnicodeChar) : Boolean; overload; static; inline;
  96. class function IsPunctuation(const AString : UnicodeString; AIndex : Integer) : Boolean; overload; static; inline;
  97. class function IsSeparator(AChar : UnicodeChar) : Boolean; overload; static; inline;
  98. class function IsSeparator(const AString : UnicodeString; AIndex : Integer) : Boolean; overload; static; inline;
  99. class function IsSymbol(AChar : UnicodeChar) : Boolean; overload; static; inline;
  100. class function IsSymbol(const AString : UnicodeString; AIndex : Integer) : Boolean; overload; static; inline;
  101. class function IsUpper(AChar : UnicodeChar) : Boolean; overload; static; inline;
  102. class function IsUpper(const AString : UnicodeString; AIndex : Integer) : Boolean; overload; static; inline;
  103. class function IsWhiteSpace(AChar : UnicodeChar) : Boolean; overload; static; inline;
  104. class function IsWhiteSpace(const AString : UnicodeString; AIndex : Integer) : Boolean; overload; static;
  105. class function ToLower(AChar : UnicodeChar) : UnicodeChar; overload; static;
  106. class function ToLower(const AString : UnicodeString) : UnicodeString; inline;overload; static;
  107. class function ToLower(const AString : UnicodeString; const AOptions : TCharacterOptions) : UnicodeString; overload; static;
  108. class function ToUpper(AChar : UnicodeChar) : UnicodeChar; overload; static;
  109. class function ToUpper(const AString : UnicodeString) : UnicodeString; inline; overload; static;
  110. class function ToUpper(const AString : UnicodeString; const AOptions : TCharacterOptions) : UnicodeString; overload; static;
  111. end;
  112. // flat functions
  113. function ConvertFromUtf32(AChar : UCS4Char) : UnicodeString;
  114. function ConvertToUtf32(const AString : UnicodeString; AIndex : Integer) : UCS4Char; overload;
  115. function ConvertToUtf32(const AString : UnicodeString; AIndex : Integer; out ACharLength : Integer) : UCS4Char; overload;
  116. function ConvertToUtf32(const AHighSurrogate, ALowSurrogate : UnicodeChar) : UCS4Char; overload;
  117. function GetNumericValue(AChar : UnicodeChar) : Double; overload;
  118. function GetNumericValue(const AString : UnicodeString; AIndex : Integer) : Double; overload;
  119. function GetUnicodeCategory(AChar : UnicodeChar) : TUnicodeCategory; overload;
  120. function GetUnicodeCategory(const AString : UnicodeString; AIndex : Integer) : TUnicodeCategory; overload;
  121. function IsControl(AChar : UnicodeChar) : Boolean; overload;
  122. function IsControl(const AString : UnicodeString; AIndex : Integer) : Boolean; overload;
  123. function IsDigit(AChar : UnicodeChar) : Boolean; overload;
  124. function IsDigit(const AString : UnicodeString; AIndex : Integer) : Boolean; overload;
  125. function IsSurrogate(AChar : UnicodeChar) : Boolean; overload;
  126. function IsSurrogate(const AString : UnicodeString; AIndex : Integer) : Boolean; overload;
  127. function IsHighSurrogate(AChar : UnicodeChar) : Boolean; overload;
  128. function IsHighSurrogate(const AString : UnicodeString; AIndex : Integer) : Boolean; overload;
  129. function IsLowSurrogate(AChar : UnicodeChar) : Boolean; overload;
  130. function IsLowSurrogate(const AString : UnicodeString; AIndex : Integer) : Boolean; overload;
  131. function IsSurrogatePair(const AHighSurrogate, ALowSurrogate : UnicodeChar) : Boolean; overload;
  132. function IsSurrogatePair(const AString : UnicodeString; AIndex : Integer) : Boolean; overload;
  133. function IsLetter(AChar : UnicodeChar) : Boolean; overload;
  134. function IsLetter(const AString : UnicodeString; AIndex : Integer) : Boolean; overload;
  135. function IsLetterOrDigit(AChar : UnicodeChar) : Boolean; overload;
  136. function IsLetterOrDigit(const AString : UnicodeString; AIndex : Integer) : Boolean; overload;
  137. function IsLower(AChar : UnicodeChar) : Boolean; overload;
  138. function IsLower(const AString : UnicodeString; AIndex : Integer) : Boolean; overload;
  139. function IsNumber(AChar : UnicodeChar) : Boolean; overload;
  140. function IsNumber(const AString : UnicodeString; AIndex : Integer) : Boolean; overload;
  141. function IsPunctuation(AChar : UnicodeChar) : Boolean; overload;
  142. function IsPunctuation(const AString : UnicodeString; AIndex : Integer) : Boolean; overload;
  143. function IsSeparator(AChar : UnicodeChar) : Boolean; overload;
  144. function IsSeparator(const AString : UnicodeString; AIndex : Integer) : Boolean; overload;
  145. function IsSymbol(AChar : UnicodeChar) : Boolean; overload;
  146. function IsSymbol(const AString : UnicodeString; AIndex : Integer) : Boolean; overload;
  147. function IsUpper(AChar : UnicodeChar) : Boolean; overload;
  148. function IsUpper(const AString : UnicodeString; AIndex : Integer) : Boolean; overload;
  149. function IsWhiteSpace(AChar : UnicodeChar) : Boolean; overload;
  150. function IsWhiteSpace(const AString : UnicodeString; AIndex : Integer) : Boolean; overload;
  151. function ToLower(AChar : UnicodeChar) : UnicodeChar; overload;
  152. function ToLower(const AString : UnicodeString) : UnicodeString; overload;
  153. function ToUpper(AChar : UnicodeChar) : UnicodeChar; overload;
  154. function ToUpper(const AString : UnicodeString) : UnicodeString; overload;
  155. implementation
  156. uses
  157. SysUtils,
  158. RtlConsts;
  159. const
  160. LETTER_CATEGORIES = [
  161. TUnicodeCategory.ucUppercaseLetter, TUnicodeCategory.ucLowercaseLetter,
  162. TUnicodeCategory.ucTitlecaseLetter, TUnicodeCategory.ucModifierLetter,
  163. TUnicodeCategory.ucOtherLetter
  164. ];
  165. LETTER_OR_DIGIT_CATEGORIES =
  166. LETTER_CATEGORIES +
  167. [TUnicodeCategory.ucDecimalNumber,TUnicodeCategory.ucLetterNumber];
  168. NUMBER_CATEGORIES =
  169. [ TUnicodeCategory.ucDecimalNumber, TUnicodeCategory.ucLetterNumber,
  170. TUnicodeCategory.ucOtherNumber
  171. ];
  172. PUNCTUATION_CATEGORIES = [
  173. TUnicodeCategory.ucConnectPunctuation, TUnicodeCategory.ucDashPunctuation,
  174. TUnicodeCategory.ucOpenPunctuation, TUnicodeCategory.ucClosePunctuation,
  175. TUnicodeCategory.ucInitialPunctuation, TUnicodeCategory.ucFinalPunctuation,
  176. TUnicodeCategory.ucOtherPunctuation
  177. ];
  178. SEPARATOR_CATEGORIES =
  179. [ TUnicodeCategory.ucSpaceSeparator, TUnicodeCategory.ucLineSeparator,
  180. TUnicodeCategory.ucParagraphSeparator
  181. ];
  182. SYMBOL_CATEGORIES =
  183. [ TUnicodeCategory.ucMathSymbol, TUnicodeCategory.ucCurrencySymbol,
  184. TUnicodeCategory.ucModifierSymbol, TUnicodeCategory.ucOtherSymbol
  185. ];
  186. function ConvertFromUtf32(AChar: UCS4Char): UnicodeString;
  187. begin
  188. Result := TCharacter.ConvertFromUtf32(AChar);
  189. end;
  190. function ConvertToUtf32(const AString: UnicodeString; AIndex: Integer): UCS4Char;
  191. begin
  192. Result := TCharacter.ConvertToUtf32(AString, AIndex);
  193. end;
  194. function ConvertToUtf32(const AString: UnicodeString; AIndex: Integer; out ACharLength: Integer): UCS4Char;
  195. begin
  196. Result := TCharacter.ConvertToUtf32(AString, AIndex, ACharLength);
  197. end;
  198. function ConvertToUtf32(const AHighSurrogate, ALowSurrogate: UnicodeChar): UCS4Char;
  199. begin
  200. Result := TCharacter.ConvertToUtf32(AHighSurrogate, ALowSurrogate);
  201. end;
  202. function GetNumericValue(AChar: UnicodeChar): Double;
  203. begin
  204. Result := TCharacter.GetNumericValue(AChar);
  205. end;
  206. function GetNumericValue(const AString: UnicodeString; AIndex: Integer): Double;
  207. begin
  208. Result := TCharacter.GetNumericValue(AString, AIndex);
  209. end;
  210. function GetUnicodeCategory(AChar: UnicodeChar): TUnicodeCategory;
  211. begin
  212. Result := TCharacter.GetUnicodeCategory(AChar);
  213. end;
  214. function GetUnicodeCategory(const AString: UnicodeString; AIndex: Integer): TUnicodeCategory;
  215. begin
  216. Result := TCharacter.GetUnicodeCategory(AString, AIndex);
  217. end;
  218. function IsControl(AChar: UnicodeChar): Boolean;
  219. begin
  220. Result := TCharacter.IsControl(AChar);
  221. end;
  222. function IsControl(const AString: UnicodeString; AIndex: Integer): Boolean;
  223. begin
  224. Result := TCharacter.IsControl(AString, AIndex);
  225. end;
  226. function IsDigit(AChar: UnicodeChar): Boolean;
  227. begin
  228. Result := TCharacter.IsDigit(AChar);
  229. end;
  230. function IsDigit(const AString: UnicodeString; AIndex: Integer): Boolean;
  231. begin
  232. Result := TCharacter.IsDigit(AString, AIndex);
  233. end;
  234. function IsSurrogate(AChar: UnicodeChar): Boolean;
  235. begin
  236. Result := TCharacter.IsSurrogate(AChar);
  237. end;
  238. function IsSurrogate(const AString: UnicodeString; AIndex: Integer): Boolean;
  239. begin
  240. Result := TCharacter.IsSurrogate(AString, AIndex);
  241. end;
  242. function IsHighSurrogate(AChar: UnicodeChar): Boolean;
  243. begin
  244. Result := TCharacter.IsHighSurrogate(AChar);
  245. end;
  246. function IsHighSurrogate(const AString: UnicodeString; AIndex: Integer): Boolean;
  247. begin
  248. Result := TCharacter.IsHighSurrogate(AString, AIndex);
  249. end;
  250. function IsLowSurrogate(AChar: UnicodeChar): Boolean;
  251. begin
  252. Result := TCharacter.IsLowSurrogate(AChar);
  253. end;
  254. function IsLowSurrogate(const AString: UnicodeString; AIndex: Integer): Boolean;
  255. begin
  256. Result := TCharacter.IsLowSurrogate(AString, AIndex);
  257. end;
  258. function IsSurrogatePair(const AHighSurrogate, ALowSurrogate: UnicodeChar): Boolean;
  259. begin
  260. Result := TCharacter.IsSurrogatePair(AHighSurrogate, ALowSurrogate);
  261. end;
  262. function IsSurrogatePair(const AString: UnicodeString; AIndex: Integer): Boolean;
  263. begin
  264. Result := TCharacter.IsSurrogatePair(AString, AIndex);
  265. end;
  266. function IsLetter(AChar: UnicodeChar): Boolean;
  267. begin
  268. Result := TCharacter.IsLetter(AChar);
  269. end;
  270. function IsLetter(const AString: UnicodeString; AIndex: Integer): Boolean;
  271. begin
  272. Result := TCharacter.IsLetter(AString, AIndex);
  273. end;
  274. function IsLetterOrDigit(AChar: UnicodeChar): Boolean;
  275. begin
  276. Result := TCharacter.IsLetterOrDigit(AChar);
  277. end;
  278. function IsLetterOrDigit(const AString: UnicodeString; AIndex: Integer): Boolean;
  279. begin
  280. Result := TCharacter.IsLetterOrDigit(AString, AIndex);
  281. end;
  282. function IsLower(AChar: UnicodeChar): Boolean;
  283. begin
  284. Result := TCharacter.IsLower(AChar);
  285. end;
  286. function IsLower(const AString: UnicodeString; AIndex: Integer): Boolean;
  287. begin
  288. Result := TCharacter.IsLower(AString, AIndex);
  289. end;
  290. function IsNumber(AChar: UnicodeChar): Boolean;
  291. begin
  292. Result := TCharacter.IsNumber(AChar);
  293. end;
  294. function IsNumber(const AString: UnicodeString; AIndex: Integer): Boolean;
  295. begin
  296. Result := TCharacter.IsNumber(AString, AIndex);
  297. end;
  298. function IsPunctuation(AChar: UnicodeChar): Boolean;
  299. begin
  300. Result := TCharacter.IsPunctuation(AChar);
  301. end;
  302. function IsPunctuation(const AString: UnicodeString; AIndex: Integer): Boolean;
  303. begin
  304. Result := TCharacter.IsPunctuation(AString, AIndex);
  305. end;
  306. function IsSeparator(AChar: UnicodeChar): Boolean;
  307. begin
  308. Result := TCharacter.IsSeparator(AChar);
  309. end;
  310. function IsSeparator(const AString: UnicodeString; AIndex: Integer): Boolean;
  311. begin
  312. Result := TCharacter.IsSeparator(AString, AIndex);
  313. end;
  314. function IsSymbol(AChar: UnicodeChar): Boolean;
  315. begin
  316. Result := TCharacter.IsSymbol(AChar);
  317. end;
  318. function IsSymbol(const AString: UnicodeString; AIndex: Integer): Boolean;
  319. begin
  320. Result := TCharacter.IsSymbol(AString, AIndex);
  321. end;
  322. function IsUpper(AChar: UnicodeChar): Boolean;
  323. begin
  324. Result := TCharacter.IsUpper(AChar);
  325. end;
  326. function IsUpper(const AString: UnicodeString; AIndex: Integer): Boolean;
  327. begin
  328. Result := TCharacter.IsUpper(AString, AIndex);
  329. end;
  330. function IsWhiteSpace(AChar: UnicodeChar): Boolean;
  331. begin
  332. Result := TCharacter.IsWhiteSpace(AChar);
  333. end;
  334. function IsWhiteSpace(const AString: UnicodeString; AIndex: Integer): Boolean;
  335. begin
  336. Result := TCharacter.IsWhiteSpace(AString, AIndex);
  337. end;
  338. function ToLower(AChar: UnicodeChar): UnicodeChar;
  339. begin
  340. Result := TCharacter.ToLower(AChar);
  341. end;
  342. function ToLower(const AString: UnicodeString): UnicodeString;
  343. begin
  344. Result := TCharacter.ToLower(AString);
  345. end;
  346. function ToUpper(AChar: UnicodeChar): UnicodeChar;
  347. begin
  348. Result := TCharacter.ToUpper(AChar);
  349. end;
  350. function ToUpper(const AString: UnicodeString): UnicodeString;
  351. begin
  352. Result := TCharacter.ToUpper(AString);
  353. end;
  354. { TCharacter }
  355. class function TCharacter.TestCategory(
  356. const AString : UnicodeString;
  357. AIndex : Integer;
  358. ACategory : TUnicodeCategory
  359. ) : Boolean;
  360. var
  361. pu : PUC_Prop;
  362. begin
  363. if (AIndex < 1) or (AIndex > Length(AString)) then
  364. raise EArgumentOutOfRangeException.CreateFmt(SStringIndexOutOfRange, [AIndex, Length(AString)]);
  365. pu := GetProps(Word(AString[AIndex]));
  366. if (TUnicodeCategory(pu^.Category) = TUnicodeCategory.ucSurrogate) then begin
  367. if not IsSurrogatePair(AString,AIndex) then
  368. raise EArgumentException.Create(SInvalidUnicodeCodePointSequence);
  369. pu := GetProps(AString[AIndex],AString[AIndex+1]);
  370. end;
  371. Result := (TUnicodeCategory(pu^.Category) = ACategory);
  372. end;
  373. class function TCharacter.TestCategory(
  374. const AString : UnicodeString;
  375. AIndex : Integer;
  376. ACategory : TUnicodeCategorySet
  377. ) : Boolean;
  378. var
  379. pu : PUC_Prop;
  380. begin
  381. if (AIndex < 1) or (AIndex > Length(AString)) then
  382. raise EArgumentOutOfRangeException.CreateFmt(SStringIndexOutOfRange, [AIndex, Length(AString)]);
  383. pu := GetProps(Word(AString[AIndex]));
  384. if (TUnicodeCategory(pu^.Category) = TUnicodeCategory.ucSurrogate) then begin
  385. if not IsSurrogatePair(AString,AIndex) then
  386. raise EArgumentException.Create(SInvalidUnicodeCodePointSequence);
  387. pu := GetProps(AString[AIndex],AString[AIndex+1]);
  388. end;
  389. Result := (TUnicodeCategory(pu^.Category) in ACategory);
  390. end;
  391. constructor TCharacter.Create;
  392. begin
  393. raise ENoConstructException.CreateFmt(SClassCantBeConstructed, [ClassName]);
  394. end;
  395. class function TCharacter.ConvertFromUtf32(AChar : UCS4Char) : UnicodeString;
  396. begin
  397. if AChar < UCS4_HALF_BASE then
  398. begin
  399. if IsSurrogate(UnicodeChar(AChar)) then
  400. raise EArgumentOutOfRangeException.CreateFmt(SInvalidUTF32Char, [AChar]);
  401. Result := UnicodeChar(AChar);
  402. end
  403. else
  404. begin
  405. if AChar > MAX_LEGAL_UTF32 then
  406. raise EArgumentOutOfRangeException.CreateFmt(SInvalidUTF32Char, [AChar]);
  407. SetLength(Result, 2);
  408. AChar := AChar - UCS4_HALF_BASE;
  409. Result[1] := UnicodeChar((AChar shr 10) + HIGH_SURROGATE_BEGIN);
  410. Result[2] := UnicodeChar((AChar and UCS4_HALF_MASK) + LOW_SURROGATE_BEGIN);
  411. end;
  412. end;
  413. class function TCharacter.ConvertToUtf32(const AString : UnicodeString; AIndex : Integer) : UCS4Char; overload;
  414. begin
  415. if (AIndex < 1) or (AIndex > Length(AString)) then
  416. raise EArgumentOutOfRangeException.CreateFmt(SStringIndexOutOfRange, [AIndex, Length(AString)]);
  417. Result := Word(AString[AIndex]);
  418. if IsHighSurrogate(UnicodeChar(Result)) then
  419. begin
  420. if Length(AString) < Succ(AIndex) then
  421. raise EArgumentException.CreateFmt(SInvalidHighSurrogate, [AIndex]);
  422. Result := ConvertToUtf32(UnicodeChar(Result), AString[Succ(AIndex)]);
  423. end;
  424. end;
  425. class function TCharacter.ConvertToUtf32(const AString : UnicodeString; AIndex : Integer; out ACharLength : Integer) : UCS4Char; overload;
  426. begin
  427. if (AIndex < 1) or (AIndex > Length(AString)) then
  428. raise EArgumentOutOfRangeException.CreateFmt(SStringIndexOutOfRange, [AIndex, Length(AString)]);
  429. Result := Word(AString[AIndex]);
  430. if IsHighSurrogate(UnicodeChar(Result)) then
  431. begin
  432. if Length(AString) < Succ(AIndex) then
  433. raise EArgumentException.CreateFmt(SInvalidHighSurrogate, [AIndex]);
  434. Result := ConvertToUtf32(UnicodeChar(Result), AString[Succ(AIndex)]);
  435. ACharLength := 2;
  436. end
  437. else
  438. ACharLength := 1;
  439. end;
  440. class function TCharacter.ConvertToUtf32(const AHighSurrogate, ALowSurrogate : UnicodeChar) : UCS4Char; overload;
  441. begin
  442. if not IsHighSurrogate(AHighSurrogate) then
  443. raise EArgumentOutOfRangeException.CreateFmt(SHighSurrogateOutOfRange, [Word(AHighSurrogate)]);
  444. if not IsLowSurrogate(ALowSurrogate) then
  445. raise EArgumentOutOfRangeException.CreateFmt(SLowSurrogateOutOfRange, [Word(ALowSurrogate)]);
  446. Result := ToUCS4(AHighSurrogate, ALowSurrogate);
  447. end;
  448. class function TCharacter.GetNumericValue(AChar : UnicodeChar) : Double;
  449. begin
  450. Result := GetProps(Word(AChar))^.NumericValue;
  451. end;
  452. class function TCharacter.GetNumericValue(
  453. const AString : UnicodeString;
  454. AIndex : Integer
  455. ) : Double;
  456. var
  457. pu : PUC_Prop;
  458. begin
  459. if (AIndex < 1) or (AIndex > Length(AString)) then
  460. raise EArgumentOutOfRangeException.CreateFmt(SStringIndexOutOfRange, [AIndex, Length(AString)]);
  461. pu := GetProps(Word(AString[AIndex]));
  462. if (TUnicodeCategory(pu^.Category) = TUnicodeCategory.ucSurrogate) then begin
  463. if not IsSurrogatePair(AString,AIndex) then
  464. raise EArgumentException.Create(SInvalidUnicodeCodePointSequence);
  465. pu := GetProps(AString[AIndex],AString[AIndex+1]);
  466. end;
  467. Result := pu^.NumericValue;
  468. end;
  469. class function TCharacter.GetUnicodeCategory(AChar : UnicodeChar) : TUnicodeCategory;
  470. begin
  471. Result := TUnicodeCategory(GetProps(Word(AChar))^.Category);
  472. end;
  473. class function TCharacter.GetUnicodeCategory(
  474. const AString : UnicodeString;
  475. AIndex : Integer
  476. ) : TUnicodeCategory;
  477. var
  478. pu : PUC_Prop;
  479. begin
  480. if (AIndex < 1) or (AIndex > Length(AString)) then
  481. raise EArgumentOutOfRangeException.CreateFmt(SStringIndexOutOfRange, [AIndex, Length(AString)]);
  482. pu := GetProps(Word(AString[AIndex]));
  483. if (TUnicodeCategory(pu^.Category) = TUnicodeCategory.ucSurrogate) then begin
  484. if not IsSurrogatePair(AString,AIndex) then
  485. raise EArgumentException.Create(SInvalidUnicodeCodePointSequence);
  486. pu := GetProps(AString[AIndex],AString[AIndex+1]);
  487. end;
  488. Result := TUnicodeCategory(pu^.Category);
  489. end;
  490. class function TCharacter.IsControl(AChar : UnicodeChar) : Boolean;
  491. begin
  492. Result := (TUnicodeCategory(GetProps(Word(AChar))^.Category) = TUnicodeCategory.ucControl);
  493. end;
  494. class function TCharacter.IsControl(
  495. const AString : UnicodeString;
  496. AIndex : Integer
  497. ) : Boolean;
  498. begin
  499. Result := TestCategory(AString,AIndex,TUnicodeCategory.ucControl);
  500. end;
  501. class function TCharacter.IsDigit(AChar : UnicodeChar) : Boolean;
  502. begin
  503. Result := (TUnicodeCategory(GetProps(Word(AChar))^.Category) = TUnicodeCategory.ucDecimalNumber);
  504. end;
  505. class function TCharacter.IsDigit(
  506. const AString : UnicodeString;
  507. AIndex : Integer
  508. ) : Boolean;
  509. begin
  510. Result := TestCategory(AString,AIndex,TUnicodeCategory.ucDecimalNumber);
  511. end;
  512. class function TCharacter.IsSurrogate(AChar : UnicodeChar) : Boolean;
  513. begin
  514. Result := (TUnicodeCategory(GetProps(Word(AChar))^.Category) = TUnicodeCategory.ucSurrogate);
  515. end;
  516. class function TCharacter.IsSurrogate(
  517. const AString : UnicodeString;
  518. AIndex : Integer
  519. ) : Boolean;
  520. begin
  521. if (AIndex < 1) or (AIndex > Length(AString)) then
  522. raise EArgumentOutOfRangeException.CreateFmt(SStringIndexOutOfRange, [AIndex, Length(AString)]);
  523. Result := IsSurrogate(AString[AIndex]);
  524. end;
  525. class function TCharacter.IsHighSurrogate(AChar : UnicodeChar) : Boolean;
  526. begin
  527. Result := (TUnicodeCategory(GetProps(Word(AChar))^.Category) = TUnicodeCategory.ucSurrogate) and
  528. (Word(AChar) >= HIGH_SURROGATE_BEGIN) and
  529. (Word(AChar) <= HIGH_SURROGATE_END);
  530. end;
  531. class function TCharacter.IsHighSurrogate(
  532. const AString : UnicodeString;
  533. AIndex : Integer
  534. ) : Boolean;
  535. begin
  536. if (AIndex < 1) or (AIndex > Length(AString)) then
  537. raise EArgumentOutOfRangeException.CreateFmt(SStringIndexOutOfRange, [AIndex, Length(AString)]);
  538. Result := IsHighSurrogate(AString[AIndex]);
  539. end;
  540. class function TCharacter.IsLowSurrogate(AChar : UnicodeChar) : Boolean;
  541. begin
  542. Result := (TUnicodeCategory(GetProps(Word(AChar))^.Category) = TUnicodeCategory.ucSurrogate) and
  543. (Word(AChar) >= LOW_SURROGATE_BEGIN) and
  544. (Word(AChar) <= LOW_SURROGATE_END);
  545. end;
  546. class function TCharacter.IsLowSurrogate(
  547. const AString : UnicodeString;
  548. AIndex : Integer
  549. ) : Boolean;
  550. begin
  551. if (AIndex < 1) or (AIndex > Length(AString)) then
  552. raise EArgumentOutOfRangeException.CreateFmt(SStringIndexOutOfRange, [AIndex, Length(AString)]);
  553. Result := IsLowSurrogate(AString[AIndex]);
  554. end;
  555. class function TCharacter.IsSurrogatePair(
  556. const AHighSurrogate,
  557. ALowSurrogate : UnicodeChar
  558. ) : Boolean;
  559. begin
  560. Result := UnicodeIsSurrogatePair(AHighSurrogate,ALowSurrogate);
  561. end;
  562. class function TCharacter.IsSurrogatePair(
  563. const AString : UnicodeString;
  564. AIndex : Integer
  565. ) : Boolean;
  566. begin
  567. if (AIndex < 1) or (AIndex > Length(AString)) then
  568. raise EArgumentOutOfRangeException.CreateFmt(SStringIndexOutOfRange, [AIndex, Length(AString)]);
  569. if not IsHighSurrogate(AString[AIndex]) then begin
  570. Result := False;
  571. exit;
  572. end;
  573. if ((AIndex+1) > Length(AString)) then
  574. raise EArgumentOutOfRangeException.CreateFmt(SStringIndexOutOfRange, [AIndex+1, Length(AString)]);
  575. Result := IsSurrogatePair(AString[AIndex],AString[AIndex+1]);
  576. end;
  577. class function TCharacter.IsLetter(AChar : UnicodeChar) : Boolean;
  578. begin
  579. Result := (TUnicodeCategory(GetProps(Word(AChar))^.Category) in LETTER_CATEGORIES);
  580. end;
  581. class function TCharacter.IsLetter(
  582. const AString : UnicodeString;
  583. AIndex : Integer
  584. ) : Boolean;
  585. begin
  586. Result := TestCategory(AString,AIndex,LETTER_CATEGORIES);
  587. end;
  588. class function TCharacter.IsLetterOrDigit(AChar : UnicodeChar) : Boolean;
  589. begin
  590. Result := (TUnicodeCategory(GetProps(Word(AChar))^.Category) in LETTER_OR_DIGIT_CATEGORIES);
  591. end;
  592. class function TCharacter.IsLetterOrDigit(
  593. const AString : UnicodeString;
  594. AIndex : Integer
  595. ) : Boolean;
  596. begin
  597. Result := TestCategory(AString,AIndex,LETTER_OR_DIGIT_CATEGORIES);
  598. end;
  599. class function TCharacter.IsLower(AChar : UnicodeChar) : Boolean;
  600. begin
  601. Result := (TUnicodeCategory(GetProps(Word(AChar))^.Category) = TUnicodeCategory.ucLowercaseLetter);
  602. end;
  603. class function TCharacter.IsLower(
  604. const AString : UnicodeString;
  605. AIndex : Integer
  606. ) : Boolean;
  607. begin
  608. Result := TestCategory(AString,AIndex,TUnicodeCategory.ucLowercaseLetter);
  609. end;
  610. class function TCharacter.IsNumber(AChar : UnicodeChar) : Boolean;
  611. begin
  612. Result := (TUnicodeCategory(GetProps(Word(AChar))^.Category) in NUMBER_CATEGORIES);
  613. end;
  614. class function TCharacter.IsNumber(
  615. const AString : UnicodeString;
  616. AIndex : Integer
  617. ) : Boolean;
  618. begin
  619. Result := TestCategory(AString,AIndex,NUMBER_CATEGORIES);
  620. end;
  621. class function TCharacter.IsPunctuation(AChar : UnicodeChar) : Boolean;
  622. begin
  623. Result := (TUnicodeCategory(GetProps(Word(AChar))^.Category) in PUNCTUATION_CATEGORIES);
  624. end;
  625. class function TCharacter.IsPunctuation(
  626. const AString : UnicodeString;
  627. AIndex : Integer
  628. ) : Boolean;
  629. begin
  630. Result := TestCategory(AString,AIndex,PUNCTUATION_CATEGORIES);
  631. end;
  632. class function TCharacter.IsSeparator(AChar: UnicodeChar): Boolean;
  633. begin
  634. Result := (TUnicodeCategory(GetProps(Word(AChar))^.Category) in SEPARATOR_CATEGORIES);
  635. end;
  636. class function TCharacter.IsSeparator(
  637. const AString : UnicodeString;
  638. AIndex : Integer
  639. ) : Boolean;
  640. begin
  641. Result := TestCategory(AString,AIndex,SEPARATOR_CATEGORIES);
  642. end;
  643. class function TCharacter.IsSymbol(AChar: UnicodeChar): Boolean;
  644. begin
  645. Result := (TUnicodeCategory(GetProps(Word(AChar))^.Category) in SYMBOL_CATEGORIES);
  646. end;
  647. class function TCharacter.IsSymbol(
  648. const AString : UnicodeString;
  649. AIndex : Integer
  650. ) : Boolean;
  651. begin
  652. Result := TestCategory(AString,AIndex,SYMBOL_CATEGORIES);
  653. end;
  654. class function TCharacter.IsUpper(AChar : UnicodeChar) : Boolean;
  655. begin
  656. Result := (TUnicodeCategory(GetProps(Word(AChar))^.Category) = TUnicodeCategory.ucUppercaseLetter);
  657. end;
  658. class function TCharacter.IsUpper(
  659. const AString : UnicodeString;
  660. AIndex : Integer
  661. ) : Boolean;
  662. begin
  663. Result := TestCategory(AString,AIndex,TUnicodeCategory.ucUppercaseLetter);
  664. end;
  665. class function TCharacter.IsWhiteSpace(AChar : UnicodeChar) : Boolean;
  666. begin
  667. Result := GetProps(Word(AChar))^.WhiteSpace;
  668. end;
  669. class function TCharacter.IsWhiteSpace(
  670. const AString : UnicodeString;
  671. AIndex : Integer
  672. ) : Boolean;
  673. var
  674. pu : PUC_Prop;
  675. begin
  676. if (AIndex < 1) or (AIndex > Length(AString)) then
  677. raise EArgumentOutOfRangeException.CreateFmt(SStringIndexOutOfRange, [AIndex, Length(AString)]);
  678. pu := GetProps(Word(AString[AIndex]));
  679. if (TUnicodeCategory(pu^.Category) = TUnicodeCategory.ucSurrogate) then begin
  680. if not IsSurrogatePair(AString,AIndex) then
  681. raise EArgumentException.Create(SInvalidUnicodeCodePointSequence);
  682. pu := GetProps(AString[AIndex],AString[AIndex+1]);
  683. end;
  684. Result := pu^.WhiteSpace;
  685. end;
  686. class function TCharacter.ToLower(AChar : UnicodeChar) : UnicodeChar;
  687. begin
  688. Result := UnicodeChar(Word(GetProps(Word(AChar))^.SimpleLowerCase));
  689. if (Result = UnicodeChar(0)) then
  690. Result := AChar;
  691. end;
  692. class function TCharacter.ToLower(const AString : UnicodeString) : UnicodeString;
  693. begin
  694. Result := ToLower(AString,[]);
  695. end;
  696. class function TCharacter.ToLower(const AString : UnicodeString; const AOptions : TCharacterOptions) : UnicodeString;
  697. begin
  698. if (UnicodeToLower(
  699. AString,(TCharacterOption.coIgnoreInvalidSequence in AOptions),Result
  700. ) <> 0
  701. )
  702. then
  703. raise EArgumentException.Create(SInvalidUnicodeCodePointSequence);
  704. end;
  705. class function TCharacter.ToUpper(AChar : UnicodeChar) : UnicodeChar;
  706. begin
  707. Result := UnicodeChar(Word(GetProps(Word(AChar))^.SimpleUpperCase));
  708. if (Result = UnicodeChar(0)) then
  709. Result := AChar;
  710. end;
  711. class function TCharacter.ToUpper(const AString : UnicodeString) : UnicodeString;
  712. begin
  713. Result := ToUpper(AString,[]);
  714. end;
  715. class function TCharacter.ToUpper(const AString : UnicodeString; const AOptions : TCharacterOptions) : UnicodeString;
  716. begin
  717. if (UnicodeToUpper(
  718. AString,(TCharacterOption.coIgnoreInvalidSequence in AOptions),Result
  719. ) <> 0
  720. )
  721. then
  722. raise EArgumentException.Create(SInvalidUnicodeCodePointSequence);
  723. end;
  724. end.