unicodeset.pas 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426
  1. { UnicodeSet implementation.
  2. Copyright (c) 2013 by Inoussa OUEDRAOGO
  3. The source code is distributed under the Library GNU
  4. General Public License with the following modification:
  5. - object files and libraries linked into an application may be
  6. distributed without source code.
  7. If you didn't receive a copy of the file COPYING, contact:
  8. Free Software Foundation
  9. 675 Mass Ave
  10. Cambridge, MA 02139
  11. USA
  12. This program is distributed in the hope that it will be useful,
  13. but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  15. }
  16. unit unicodeset;
  17. {$mode delphi}{$H+}
  18. {$scopedenums on}
  19. interface
  20. uses
  21. SysUtils,
  22. grbtree, helper;
  23. type
  24. EUnicodeSetException = class(Exception)
  25. end;
  26. TUnicodeSet = class;
  27. TPatternParser = class
  28. private
  29. FBufferStr : UnicodeString;
  30. FBuffer : PUnicodeChar;
  31. FBufferLength : Integer;
  32. FSet : TUnicodeSet;
  33. FPosition : Integer;
  34. private
  35. procedure Error(const AMsg : string; const AArgs : array of const);overload;inline;
  36. procedure Error(const AMsg : string);overload;inline;
  37. procedure SetBuffer(const APattern : PUnicodeChar; const ALength : Integer);
  38. procedure CheckEOF();inline;overload;
  39. procedure CheckEOF(ALength : Integer);overload;inline;
  40. procedure UnexpectedEOF();inline;
  41. function IsThis(AItem : UnicodeString; const APosition : Integer) : Boolean;overload;
  42. function IsThis(AItem : UnicodeString) : Boolean;overload;inline;
  43. procedure Expect(AItem : UnicodeString; const APosition : Integer);overload;inline;
  44. procedure Expect(AItem : UnicodeString);overload;inline;
  45. procedure SkipSpaces();inline;
  46. function NextChar() : TUnicodeCodePoint;
  47. procedure ParseItem();
  48. procedure DoParse();
  49. public
  50. procedure Parse(const APattern : PUnicodeChar; const ALength : Integer);overload;
  51. procedure Parse(const APattern : UnicodeString);overload;inline;
  52. property CurrentSet : TUnicodeSet read FSet write FSet;
  53. end;
  54. TUnicodeCodePointArrayComparator = class
  55. public
  56. // Return
  57. // * if A>B then 1
  58. // * if A=B then 0
  59. // * if A<B then -1
  60. class function Compare(const A, B : TUnicodeCodePointArray) : Integer;static;inline;
  61. end;
  62. TUnicodeSet = class
  63. private type
  64. TItem = TUnicodeCodePointArray;
  65. TTree = TRBTree<TItem,TUnicodeCodePointArrayComparator>;
  66. public type
  67. TIterator = TTree.TIterator;
  68. private
  69. FTree : TTree;
  70. FParser : TPatternParser;
  71. private
  72. procedure CreateParser();inline;
  73. public
  74. constructor Create();
  75. destructor Destroy;override;
  76. procedure Add(AChar : TUnicodeCodePoint);inline;overload;
  77. procedure Add(AString : TUnicodeCodePointArray);inline;overload;
  78. procedure AddRange(const AStart, AEnd : TUnicodeCodePoint);inline;
  79. procedure AddPattern(const APattern : UnicodeString);inline;
  80. function CreateIterator() : TIterator;
  81. function Contains(const AString : array of TUnicodeCodePoint) : Boolean;overload;
  82. function Contains(const AChar : TUnicodeCodePoint) : Boolean;inline;overload;
  83. function Contains(const AChar : UnicodeChar) : Boolean;inline;overload;
  84. function Contains(const AChar : AnsiChar) : Boolean;inline;overload;
  85. end;
  86. resourcestring
  87. SInvalidLength = 'Invalid length value : "%d".';
  88. SInvalidPosition = 'Invalid position : "%d".';
  89. SInvalidRangeLimits = 'Invalid range limits : ["%x" , "%x"].';
  90. SExpectedBut = 'Expects "%s" but got "%s..." .';
  91. SUnexpectedEOF = 'Unexpected end of file.';
  92. implementation
  93. uses
  94. unicodedata;
  95. function ToArray(const AItem : TUnicodeCodePoint) : TUnicodeCodePointArray;inline;
  96. begin
  97. SetLength(Result,1);
  98. Result[Low(Result)] := AItem;
  99. end;
  100. function CompareItem(const Item1, Item2 : TUnicodeCodePointArray): Integer;
  101. var
  102. a, b : ^TUnicodeCodePoint;
  103. i, ha, hb : Integer;
  104. begin
  105. if (Pointer(Item1) = Pointer(Item2)) then
  106. exit(0);
  107. if (Item1 = nil) then
  108. exit(-1);
  109. if (Item2 = nil) then
  110. exit(1);
  111. a := @Item1[0];
  112. b := @Item2[0];
  113. Result := 1;
  114. ha := Length(Item1) - 1;
  115. hb := Length(Item2) - 1;
  116. for i := 0 to ha do begin
  117. if (i > hb) then
  118. exit;
  119. if (a^ < b^) then
  120. exit(-1);
  121. if (a^ > b^) then
  122. exit(1);
  123. Inc(a);
  124. Inc(b);
  125. end;
  126. if (ha = hb) then
  127. exit(0);
  128. exit(-1);
  129. end;
  130. { TUnicodeCodePointArrayComparator }
  131. class function TUnicodeCodePointArrayComparator.Compare(const A, B : TUnicodeCodePointArray): Integer;
  132. begin
  133. Result := CompareItem(A,B);
  134. end;
  135. { TPatternParser }
  136. procedure TPatternParser.Error(const AMsg: string; const AArgs: array of const);
  137. begin
  138. raise EUnicodeSetException.CreateFmt(AMsg,AArgs);
  139. end;
  140. procedure TPatternParser.Error(const AMsg: string);
  141. begin
  142. raise EUnicodeSetException.Create(AMsg);
  143. end;
  144. procedure TPatternParser.SetBuffer(
  145. const APattern : PUnicodeChar;
  146. const ALength : Integer
  147. );
  148. begin
  149. FPosition := 0;
  150. if (ALength <= 1) then begin
  151. FBufferStr := '';
  152. FBuffer := nil;
  153. FBufferLength := 0;
  154. exit;
  155. end;
  156. FBufferLength := ALength;
  157. SetLength(FBufferStr,FBufferLength);
  158. FBuffer := @FBufferStr[1];
  159. Move(APattern^,FBuffer^,(FBufferLength*SizeOf(FBuffer^)));
  160. end;
  161. procedure TPatternParser.CheckEOF();
  162. begin
  163. CheckEOF(0);
  164. end;
  165. procedure TPatternParser.CheckEOF(ALength : Integer);
  166. begin
  167. if (ALength < 0) then
  168. Error(SInvalidLength,[ALength]);
  169. if ((FPosition+ALength) >= FBufferLength) then
  170. UnexpectedEOF();
  171. end;
  172. procedure TPatternParser.UnexpectedEOF();
  173. begin
  174. Error(SUnexpectedEOF);
  175. end;
  176. function TPatternParser.IsThis(AItem: UnicodeString; const APosition: Integer): Boolean;
  177. var
  178. i, k, c : Integer;
  179. begin
  180. if (APosition < 0) then
  181. Error(SInvalidPosition,[APosition]);
  182. Result := False;
  183. c := Length(AItem);
  184. if (c = 0) then
  185. exit;
  186. i := APosition;
  187. k := i + c;
  188. if (k >= FBufferLength) then
  189. exit;
  190. if CompareMem(@AItem[1], @FBuffer[APosition],c) then
  191. Result := True;
  192. end;
  193. function TPatternParser.IsThis(AItem : UnicodeString) : Boolean;
  194. begin
  195. Result := IsThis(AItem,FPosition);
  196. end;
  197. procedure TPatternParser.Expect(AItem: UnicodeString; const APosition: Integer);
  198. begin
  199. if not IsThis(AItem,APosition) then
  200. Error(SExpectedBut,[AItem,Copy(FBuffer,APosition,Length(AItem))]);
  201. end;
  202. procedure TPatternParser.Expect(AItem: UnicodeString);
  203. begin
  204. Expect(AItem,FPosition);
  205. end;
  206. procedure TPatternParser.SkipSpaces();
  207. begin
  208. while (FPosition < FBufferLength) do begin
  209. if (FBuffer[FPosition] <> ' ') then
  210. Break;
  211. Inc(FPosition);
  212. end;
  213. end;
  214. function TPatternParser.NextChar(): TUnicodeCodePoint;
  215. var
  216. i : Integer;
  217. c : UnicodeChar;
  218. cp : TUnicodeCodePoint;
  219. s : UnicodeString;
  220. begin
  221. SkipSpaces();
  222. CheckEOF();
  223. c := FBuffer[FPosition];
  224. cp := Ord(c);
  225. Inc(FPosition);
  226. if (c = '\') and (FPosition < FBufferLength) then begin
  227. if IsThis('\') then begin
  228. Inc(FPosition);
  229. CheckEOF();
  230. cp := Ord(FBuffer[FPosition]);
  231. Inc(FPosition);
  232. end else if IsThis('u') then begin
  233. Inc(FPosition);
  234. CheckEOF(4);
  235. s := Copy(FBufferStr,(FPosition+1),4);
  236. Inc(FPosition,4);
  237. if not TryStrToInt('$'+s,i) then
  238. Error(SExpectedBut,['\uXXXX',s]);
  239. cp := i;
  240. end;
  241. end;
  242. if (cp <= MAX_WORD) and UnicodeIsLowSurrogate(UnicodeChar(Word(cp))) then begin
  243. SkipSpaces();
  244. CheckEOF();
  245. c := UnicodeChar(Word(cp));
  246. if UnicodeIsSurrogatePair(c,FBuffer[FPosition]) then begin
  247. cp := ToUCS4(c,FBuffer[FPosition]);
  248. Inc(FPosition);
  249. end;
  250. end;
  251. Result := cp;
  252. end;
  253. function CompareTo(const A : TUnicodeCodePoint; const B : UnicodeChar) : Boolean;inline;
  254. begin
  255. Result := (A = Ord(B));
  256. end;
  257. procedure TPatternParser.ParseItem();
  258. var
  259. cp, lastCp : TUnicodeCodePoint;
  260. charCount : Integer;
  261. begin
  262. SkipSpaces();
  263. Expect('[');
  264. charCount := 0;
  265. Inc(FPosition);
  266. cp:=0;
  267. while (FPosition < FBufferLength) do begin
  268. lastCp := cp;
  269. cp := NextChar();
  270. if CompareTo(cp,']') then
  271. Break;
  272. if CompareTo(cp,'-') then begin
  273. if (charCount = 0) then
  274. Error(SExpectedBut,['<char>','-']);
  275. cp := NextChar();
  276. FSet.AddRange(lastCp,cp);
  277. end else begin
  278. FSet.Add(cp);
  279. end;
  280. Inc(charCount);
  281. end;
  282. end;
  283. procedure TPatternParser.DoParse();
  284. begin
  285. SkipSpaces();
  286. while (FPosition < FBufferLength) do begin
  287. ParseItem();
  288. SkipSpaces();
  289. end;
  290. end;
  291. procedure TPatternParser.Parse(const APattern: PUnicodeChar; const ALength: Integer);
  292. begin
  293. if (ALength < 2) then
  294. exit;
  295. SetBuffer(APattern,ALength);
  296. DoParse();
  297. end;
  298. procedure TPatternParser.Parse(const APattern : UnicodeString);
  299. begin
  300. Parse(@APattern[1],Length(APattern));
  301. end;
  302. { TUnicodeSet }
  303. procedure TUnicodeSet.CreateParser();
  304. begin
  305. if (FParser = nil) then begin
  306. FParser := TPatternParser.Create();
  307. FParser.CurrentSet := Self;
  308. end;
  309. end;
  310. constructor TUnicodeSet.Create;
  311. begin
  312. FTree := TTree.Create();
  313. end;
  314. destructor TUnicodeSet.Destroy;
  315. begin
  316. FParser.Free();
  317. FTree.Free();
  318. inherited Destroy;
  319. end;
  320. procedure TUnicodeSet.Add(AChar: TUnicodeCodePoint);
  321. begin
  322. FTree.Insert(ToArray(AChar));
  323. end;
  324. procedure TUnicodeSet.Add(AString: TUnicodeCodePointArray);
  325. begin
  326. if (AString <> nil) then
  327. FTree.Insert(AString);
  328. end;
  329. procedure TUnicodeSet.AddRange(const AStart, AEnd : TUnicodeCodePoint);
  330. var
  331. i : Integer;
  332. begin
  333. if (AStart > AEnd) then
  334. raise EUnicodeSetException.CreateFmt(SInvalidRangeLimits,[AStart,AEnd]);
  335. for i := AStart to AEnd do
  336. Add(i);
  337. end;
  338. procedure TUnicodeSet.AddPattern(const APattern : UnicodeString);
  339. begin
  340. CreateParser();
  341. FParser.Parse(APattern);
  342. end;
  343. function TUnicodeSet.CreateIterator() : TIterator;
  344. begin
  345. Result := FTree.CreateForwardIterator();
  346. end;
  347. function TUnicodeSet.Contains(const AString : array of TUnicodeCodePoint) : Boolean;
  348. var
  349. c : Integer;
  350. x : TUnicodeCodePointArray;
  351. begin
  352. Result := False;
  353. c := Length(AString);
  354. if (c = 0) then
  355. exit;
  356. SetLength(x,c);
  357. Move(AString[Low(AString)],x[Low(x)],(c*SizeOf(x[0])));
  358. if (FTree.FindNode(x) <> nil) then
  359. Result := True;
  360. end;
  361. function TUnicodeSet.Contains(const AChar : TUnicodeCodePoint) : Boolean;
  362. begin
  363. Result := Contains([AChar]);
  364. end;
  365. function TUnicodeSet.Contains(const AChar : UnicodeChar) : Boolean;
  366. begin
  367. Result := Contains(TUnicodeCodePoint(Ord(AChar)));
  368. end;
  369. function TUnicodeSet.Contains(const AChar : AnsiChar) : Boolean;
  370. begin
  371. Result := Contains(TUnicodeCodePoint(Ord(AChar)));
  372. end;
  373. end.