unicodeset.pas 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497
  1. { UnicodeSet implementation.
  2. Copyright (c) 2013-2015 by Inoussa OUEDRAOGO
  3. The source code is distributed under the Library GNU
  4. General Public License with the following modification:
  5. - object files and libraries linked into an application may be
  6. distributed without source code.
  7. If you didn't receive a copy of the file COPYING, contact:
  8. Free Software Foundation
  9. 675 Mass Ave
  10. Cambridge, MA 02139
  11. USA
  12. This program is distributed in the hope that it will be useful,
  13. but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  15. }
  16. unit unicodeset;
  17. {$mode delphi}{$H+}
  18. {$scopedenums on}
  19. interface
  20. uses
  21. SysUtils,
  22. grbtree, helper;
  23. type
  24. EUnicodeSetException = class(Exception)
  25. end;
  26. TUnicodeSet = class;
  27. { TPatternParser }
  28. TPatternParser = class
  29. private
  30. FBufferStr : UnicodeString;
  31. FBuffer : PUnicodeChar;
  32. FBufferLength : Integer;
  33. FSet : TUnicodeSet;
  34. FPosition : Integer;
  35. FSpecialChar: Boolean;
  36. private
  37. procedure Error(const AMsg : string; const AArgs : array of const);overload;inline;
  38. procedure Error(const AMsg : string);overload;inline;
  39. procedure SetBuffer(const APattern : PUnicodeChar; const ALength : Integer);
  40. procedure CheckEOF();inline;overload;
  41. procedure CheckEOF(ALength : Integer);overload;inline;
  42. procedure UnexpectedEOF();inline;
  43. function IsThis(AItem : UnicodeString; const APosition : Integer) : Boolean;overload;
  44. function IsThis(AItem : UnicodeString) : Boolean;overload;inline;
  45. procedure Expect(AItem : UnicodeString; const APosition : Integer);overload;inline;
  46. procedure Expect(AItem : UnicodeString);overload;inline;
  47. procedure SkipSpaces();inline;
  48. function NextChar() : TUnicodeCodePoint;
  49. procedure ParseItem();
  50. procedure DoParse();
  51. property SpecialChar : Boolean read FSpecialChar;
  52. public
  53. procedure Parse(const APattern : PUnicodeChar; const ALength : Integer);overload;
  54. procedure Parse(const APattern : UnicodeString);overload;inline;
  55. property CurrentSet : TUnicodeSet read FSet write FSet;
  56. end;
  57. TUnicodeCodePointArrayComparator = class
  58. public
  59. // Return
  60. // * if A>B then 1
  61. // * if A=B then 0
  62. // * if A<B then -1
  63. class function Compare(const A, B : TUnicodeCodePointArray) : Integer;static;inline;
  64. end;
  65. { TUnicodeSet }
  66. TUnicodeSet = class
  67. private type
  68. TItem = TUnicodeCodePointArray;
  69. TTree = TRBTree<TItem,TUnicodeCodePointArrayComparator>;
  70. public type
  71. TIterator = TTree.TIterator;
  72. private
  73. FTree : TTree;
  74. FParser : TPatternParser;
  75. private
  76. procedure CreateParser();inline;
  77. function InternalContains(const AString : UnicodeString) : Boolean;overload;
  78. public
  79. constructor Create();
  80. destructor Destroy;override;
  81. procedure Add(AChar : TUnicodeCodePoint);inline;overload;
  82. procedure Add(AString : TUnicodeCodePointArray);inline;overload;
  83. procedure AddRange(const AStart, AEnd : TUnicodeCodePoint);inline;
  84. procedure AddPattern(const APattern : UnicodeString);inline;overload;
  85. procedure AddPattern(const APattern : RawByteString);inline;overload;
  86. function CreateIterator() : TIterator;
  87. function Contains(const AString : array of TUnicodeCodePoint) : Boolean;overload;
  88. function Contains(const AChar : TUnicodeCodePoint) : Boolean;inline;overload;
  89. function Contains(const AChar : UnicodeChar) : Boolean;inline;overload;
  90. function Contains(const AChar : AnsiChar) : Boolean;inline;overload;
  91. function Contains(const AString : UnicodeString) : Boolean;overload;
  92. function Contains(const AString : RawByteString) : Boolean;overload;
  93. end;
  94. resourcestring
  95. SInvalidLength = 'Invalid length value : "%d".';
  96. SInvalidPosition = 'Invalid position : "%d".';
  97. SInvalidRangeLimits = 'Invalid range limits : ["%x" , "%x"].';
  98. SExpectedBut = 'Expects "%s" but got "%s..." .';
  99. SUnexpectedEOF = 'Unexpected end of file.';
  100. implementation
  101. uses
  102. unicodedata;
  103. function ToArray(const AItem : TUnicodeCodePoint) : TUnicodeCodePointArray;inline;
  104. begin
  105. SetLength(Result,1);
  106. Result[Low(Result)] := AItem;
  107. end;
  108. function CompareItem(const Item1, Item2 : TUnicodeCodePointArray): Integer;
  109. var
  110. a, b : ^TUnicodeCodePoint;
  111. i, ha, hb : Integer;
  112. begin
  113. if (Pointer(Item1) = Pointer(Item2)) then
  114. exit(0);
  115. if (Item1 = nil) then
  116. exit(-1);
  117. if (Item2 = nil) then
  118. exit(1);
  119. a := @Item1[0];
  120. b := @Item2[0];
  121. Result := 1;
  122. ha := Length(Item1) - 1;
  123. hb := Length(Item2) - 1;
  124. for i := 0 to ha do begin
  125. if (i > hb) then
  126. exit;
  127. if (a^ < b^) then
  128. exit(-1);
  129. if (a^ > b^) then
  130. exit(1);
  131. Inc(a);
  132. Inc(b);
  133. end;
  134. if (ha = hb) then
  135. exit(0);
  136. exit(-1);
  137. end;
  138. { TUnicodeCodePointArrayComparator }
  139. class function TUnicodeCodePointArrayComparator.Compare(const A, B : TUnicodeCodePointArray): Integer;
  140. begin
  141. Result := CompareItem(A,B);
  142. end;
  143. { TPatternParser }
  144. procedure TPatternParser.Error(const AMsg: string; const AArgs: array of const);
  145. begin
  146. raise EUnicodeSetException.CreateFmt(AMsg,AArgs);
  147. end;
  148. procedure TPatternParser.Error(const AMsg: string);
  149. begin
  150. raise EUnicodeSetException.Create(AMsg);
  151. end;
  152. procedure TPatternParser.SetBuffer(
  153. const APattern : PUnicodeChar;
  154. const ALength : Integer
  155. );
  156. begin
  157. FPosition := 0;
  158. if (ALength <= 1) then begin
  159. FBufferStr := '';
  160. FBuffer := nil;
  161. FBufferLength := 0;
  162. exit;
  163. end;
  164. FBufferLength := ALength;
  165. SetLength(FBufferStr,FBufferLength);
  166. FBuffer := @FBufferStr[1];
  167. Move(APattern^,FBuffer^,(FBufferLength*SizeOf(FBuffer^)));
  168. end;
  169. procedure TPatternParser.CheckEOF();
  170. begin
  171. CheckEOF(0);
  172. end;
  173. procedure TPatternParser.CheckEOF(ALength : Integer);
  174. begin
  175. if (ALength < 0) then
  176. Error(SInvalidLength,[ALength]);
  177. if ((FPosition+ALength) >= FBufferLength) then
  178. UnexpectedEOF();
  179. end;
  180. procedure TPatternParser.UnexpectedEOF();
  181. begin
  182. Error(SUnexpectedEOF);
  183. end;
  184. function TPatternParser.IsThis(AItem: UnicodeString; const APosition: Integer): Boolean;
  185. var
  186. i, k, c : Integer;
  187. begin
  188. if (APosition < 0) then
  189. Error(SInvalidPosition,[APosition]);
  190. Result := False;
  191. c := Length(AItem);
  192. if (c = 0) then
  193. exit;
  194. i := APosition;
  195. k := i + c;
  196. if (k >= FBufferLength) then
  197. exit;
  198. if CompareMem(@AItem[1], @FBuffer[APosition],c) then
  199. Result := True;
  200. end;
  201. function TPatternParser.IsThis(AItem : UnicodeString) : Boolean;
  202. begin
  203. Result := IsThis(AItem,FPosition);
  204. end;
  205. procedure TPatternParser.Expect(AItem: UnicodeString; const APosition: Integer);
  206. begin
  207. if not IsThis(AItem,APosition) then
  208. Error(SExpectedBut,[AItem,Copy(FBuffer,APosition,Length(AItem))]);
  209. end;
  210. procedure TPatternParser.Expect(AItem: UnicodeString);
  211. begin
  212. Expect(AItem,FPosition);
  213. end;
  214. procedure TPatternParser.SkipSpaces();
  215. begin
  216. while (FPosition < FBufferLength) do begin
  217. if (FBuffer[FPosition] <> ' ') then
  218. Break;
  219. Inc(FPosition);
  220. end;
  221. end;
  222. function TPatternParser.NextChar(): TUnicodeCodePoint;
  223. var
  224. i : Integer;
  225. c : UnicodeChar;
  226. cp : TUnicodeCodePoint;
  227. s : UnicodeString;
  228. begin
  229. SkipSpaces();
  230. CheckEOF();
  231. c := FBuffer[FPosition];
  232. cp := Ord(c);
  233. Inc(FPosition);
  234. if (c = '\') and (FPosition < FBufferLength) then begin
  235. if IsThis('\') then begin
  236. Inc(FPosition);
  237. CheckEOF();
  238. cp := Ord(FBuffer[FPosition]);
  239. Inc(FPosition);
  240. end else if IsThis('u') then begin
  241. Inc(FPosition);
  242. CheckEOF(4);
  243. s := Copy(FBufferStr,(FPosition+1),4);
  244. Inc(FPosition,4);
  245. if not TryStrToInt(string('$'+s),i) then
  246. Error(SExpectedBut,['\uXXXX',s]);
  247. cp := i;
  248. end;
  249. end;
  250. if (cp <= MAX_WORD) and UnicodeIsLowSurrogate(UnicodeChar(Word(cp))) then begin
  251. SkipSpaces();
  252. CheckEOF();
  253. c := UnicodeChar(Word(cp));
  254. if UnicodeIsSurrogatePair(c,FBuffer[FPosition]) then begin
  255. cp := ToUCS4(c,FBuffer[FPosition]);
  256. Inc(FPosition);
  257. end;
  258. end;
  259. FSpecialChar := (cp = Ord('{')) or (cp = Ord('}'));
  260. Result := cp;
  261. end;
  262. function CompareTo(const A : TUnicodeCodePoint; const B : UnicodeChar) : Boolean;inline;
  263. begin
  264. Result := (A = Ord(B));
  265. end;
  266. procedure TPatternParser.ParseItem();
  267. var
  268. cp, lastCp : TUnicodeCodePoint;
  269. charCount, k : Integer;
  270. cpa : TUnicodeCodePointArray;
  271. begin
  272. SkipSpaces();
  273. Expect('[');
  274. charCount := 0;
  275. Inc(FPosition);
  276. cp:=0;
  277. while (FPosition < FBufferLength) do begin
  278. lastCp := cp;
  279. cp := NextChar();
  280. if CompareTo(cp,']') then
  281. Break;
  282. if SpecialChar and (cp = Ord('{')) then begin
  283. SetLength(cpa,12);
  284. k := 0;
  285. while True do begin
  286. cp := NextChar();
  287. if SpecialChar and (cp = Ord('}')) then
  288. break;
  289. if (k >= Length(cpa)) then
  290. SetLength(cpa,(2*k));
  291. cpa[k] := cp;
  292. k := k+1;
  293. end;
  294. if (k > 0) then begin
  295. SetLength(cpa,k);
  296. FSet.Add(cpa);
  297. end;
  298. end else begin
  299. if CompareTo(cp,'-') then begin
  300. if (charCount = 0) then
  301. Error(SExpectedBut,['<char>','-']);
  302. cp := NextChar();
  303. FSet.AddRange(lastCp,cp);
  304. end else begin
  305. FSet.Add(cp);
  306. end;
  307. end;
  308. Inc(charCount);
  309. end;
  310. end;
  311. procedure TPatternParser.DoParse();
  312. begin
  313. SkipSpaces();
  314. while (FPosition < FBufferLength) do begin
  315. ParseItem();
  316. SkipSpaces();
  317. end;
  318. end;
  319. procedure TPatternParser.Parse(const APattern: PUnicodeChar; const ALength: Integer);
  320. begin
  321. if (ALength < 2) then
  322. exit;
  323. SetBuffer(APattern,ALength);
  324. DoParse();
  325. end;
  326. procedure TPatternParser.Parse(const APattern : UnicodeString);
  327. begin
  328. Parse(@APattern[1],Length(APattern));
  329. end;
  330. { TUnicodeSet }
  331. procedure TUnicodeSet.CreateParser();
  332. begin
  333. if (FParser = nil) then begin
  334. FParser := TPatternParser.Create();
  335. FParser.CurrentSet := Self;
  336. end;
  337. end;
  338. function TUnicodeSet.InternalContains(const AString: UnicodeString): Boolean;
  339. var
  340. u4 : UCS4String;
  341. c, i : Integer;
  342. cpa : TUnicodeCodePointArray;
  343. begin
  344. u4 := UnicodeStringToUCS4String(AString);
  345. c := Length(u4)-1;
  346. if (c = 1) then
  347. exit(Contains(u4[0]));
  348. SetLength(cpa,c);
  349. for i := 0 to c-1 do
  350. cpa[i] := u4[i];
  351. Result := Contains(cpa);
  352. end;
  353. constructor TUnicodeSet.Create;
  354. begin
  355. FTree := TTree.Create();
  356. end;
  357. destructor TUnicodeSet.Destroy;
  358. begin
  359. FParser.Free();
  360. FTree.Free();
  361. inherited Destroy;
  362. end;
  363. procedure TUnicodeSet.Add(AChar: TUnicodeCodePoint);
  364. begin
  365. FTree.Insert(ToArray(AChar));
  366. end;
  367. procedure TUnicodeSet.Add(AString: TUnicodeCodePointArray);
  368. begin
  369. if (AString <> nil) then
  370. FTree.Insert(AString);
  371. end;
  372. procedure TUnicodeSet.AddRange(const AStart, AEnd : TUnicodeCodePoint);
  373. var
  374. i : Integer;
  375. begin
  376. if (AStart > AEnd) then
  377. raise EUnicodeSetException.CreateFmt(SInvalidRangeLimits,[AStart,AEnd]);
  378. for i := AStart to AEnd do
  379. Add(i);
  380. end;
  381. procedure TUnicodeSet.AddPattern(const APattern : UnicodeString);
  382. begin
  383. CreateParser();
  384. FParser.Parse(APattern);
  385. end;
  386. procedure TUnicodeSet.AddPattern(const APattern: RawByteString);
  387. var
  388. us : UnicodeString;
  389. begin
  390. us := UnicodeString(APattern);
  391. AddPattern(us);
  392. end;
  393. function TUnicodeSet.CreateIterator() : TIterator;
  394. begin
  395. Result := FTree.CreateForwardIterator();
  396. end;
  397. function TUnicodeSet.Contains(const AString : array of TUnicodeCodePoint) : Boolean;
  398. var
  399. c : Integer;
  400. x : TUnicodeCodePointArray;
  401. begin
  402. Result := False;
  403. c := Length(AString);
  404. if (c = 0) then
  405. exit;
  406. SetLength(x,c);
  407. Move(AString[Low(AString)],x[Low(x)],(c*SizeOf(x[0])));
  408. if (FTree.FindNode(x) <> nil) then
  409. Result := True;
  410. end;
  411. function TUnicodeSet.Contains(const AChar : TUnicodeCodePoint) : Boolean;
  412. begin
  413. Result := Contains([AChar]);
  414. end;
  415. function TUnicodeSet.Contains(const AChar : UnicodeChar) : Boolean;
  416. begin
  417. Result := Contains(TUnicodeCodePoint(Ord(AChar)));
  418. end;
  419. function TUnicodeSet.Contains(const AChar : AnsiChar) : Boolean;
  420. begin
  421. Result := Contains(TUnicodeCodePoint(Ord(AChar)));
  422. end;
  423. function TUnicodeSet.Contains(const AString: UnicodeString): Boolean;
  424. begin
  425. if (AString = '') then
  426. exit(Contains([]));
  427. if (Length(AString) = 1) then
  428. exit(Contains(AString[1]));
  429. Result := InternalContains(AString);
  430. end;
  431. function TUnicodeSet.Contains(const AString: RawByteString): Boolean;
  432. var
  433. us : UnicodeString;
  434. begin
  435. us := UnicodeString(AString);
  436. Result := Contains(us);
  437. end;
  438. end.