regexpr.pp 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610
  1. {
  2. This unit implements basic regular expression support
  3. This file is part of the Free Pascal run time library.
  4. Copyright (c) 2000-2005 by Florian Klaempfl
  5. See the file COPYING.FPC, included in this distribution,
  6. for details about the copyright.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  10. **********************************************************************}
  11. { $define DEBUG}
  12. {
  13. TODO:
  14. - correct backtracking, for example in (...)*
  15. - | support
  16. - getting substrings and using substrings with \1 etc.
  17. - test ^ and $
  18. - newline handling in DOS?
  19. - locals dependend upper/lowercase routines
  20. - extend the interface
  21. }
  22. {$mode objfpc}
  23. unit regexpr;
  24. interface
  25. { the following declarions are only in the interface because }
  26. { some procedures return pregexprentry but programs which }
  27. { use this unit shouldn't access this data structures }
  28. type
  29. tcharset = set of char;
  30. tregexprentrytype = (ret_charset,ret_or,ret_startpattern,
  31. ret_endpattern,ret_illegalend,ret_backtrace,ret_startline,
  32. ret_endline);
  33. pregexprentry = ^tregexprentry;
  34. tregexprentry = record
  35. next,nextdestroy : pregexprentry;
  36. case typ : tregexprentrytype of
  37. ret_charset : (chars : tcharset;
  38. elsepath : pregexprentry);
  39. ret_or : (alternative : pregexprentry);
  40. end;
  41. tregexprflag = (ref_singleline,ref_multiline,ref_caseinsensitive);
  42. tregexprflags = set of tregexprflag;
  43. TRegExprEngine = record
  44. Data : pregexprentry;
  45. DestroyList : pregexprentry;
  46. Flags : TRegExprFlags;
  47. end;
  48. const
  49. cs_allchars : tcharset = [#0..#255];
  50. cs_wordchars : tcharset = ['A'..'Z','a'..'z','_','0'..'9'];
  51. cs_newline : tcharset = [#10];
  52. cs_digits : tcharset = ['0'..'9'];
  53. cs_whitespace : tcharset = [' ',#9];
  54. var
  55. { these are initilized in the init section of the unit }
  56. cs_nonwordchars : tcharset;
  57. cs_nondigits : tcharset;
  58. cs_nonwhitespace : tcharset;
  59. { the following procedures can be used by units basing }
  60. { on the regexpr unit }
  61. function GenerateRegExprEngine(regexpr : pchar;flags : tregexprflags) : TRegExprEngine;
  62. procedure DestroyRegExprEngine(var regexpr : TRegExprEngine);
  63. function RegExprPos(regexprengine : TRegExprEngine;p : pchar;var index,len : longint) : boolean;
  64. implementation
  65. {$ifdef DEBUG}
  66. procedure writecharset(c : tcharset);
  67. var
  68. b : byte;
  69. begin
  70. for b:=0 to 255 do
  71. if chr(b) in c then
  72. write(chr(b));
  73. writeln;
  74. end;
  75. {$endif DEBUG}
  76. function GenerateRegExprEngine(regexpr : pchar;flags : tregexprflags) : TRegExprEngine;
  77. var
  78. first : pregexprentry;
  79. procedure doregister(p : pregexprentry);
  80. begin
  81. p^.nextdestroy:=first;
  82. if not(assigned(first)) then
  83. first:=p;
  84. end;
  85. var
  86. currentpos : pchar;
  87. error : boolean;
  88. function readchars : tcharset;
  89. var
  90. c1 : char;
  91. begin
  92. readchars:=[];
  93. case currentpos^ of
  94. #0:
  95. exit;
  96. '.':
  97. begin
  98. inc(currentpos);
  99. readchars:=cs_allchars-cs_newline;
  100. end;
  101. '\':
  102. begin
  103. inc(currentpos);
  104. case currentpos^ of
  105. #0:
  106. begin
  107. error:=true;
  108. exit;
  109. end;
  110. 't':
  111. begin
  112. inc(currentpos);
  113. readchars:=[#9];
  114. end;
  115. 'n':
  116. begin
  117. inc(currentpos);
  118. readchars:=[#10];
  119. end;
  120. 'r':
  121. begin
  122. inc(currentpos);
  123. readchars:=[#13];
  124. end;
  125. 'd':
  126. begin
  127. inc(currentpos);
  128. readchars:=cs_digits;
  129. end;
  130. 'D':
  131. begin
  132. inc(currentpos);
  133. readchars:=cs_nondigits;
  134. end;
  135. 's':
  136. begin
  137. inc(currentpos);
  138. readchars:=cs_whitespace;
  139. end;
  140. 'S':
  141. begin
  142. inc(currentpos);
  143. readchars:=cs_nonwhitespace;
  144. end;
  145. 'w':
  146. begin
  147. inc(currentpos);
  148. readchars:=cs_wordchars;
  149. end;
  150. 'W':
  151. begin
  152. inc(currentpos);
  153. readchars:=cs_nonwordchars;
  154. end;
  155. else
  156. begin //Some basic escaping...
  157. readchars := [currentpos^];
  158. inc (currentpos);
  159. {error:=true;
  160. exit;}
  161. end;
  162. end;
  163. end;
  164. else
  165. begin
  166. if ref_caseinsensitive in flags then
  167. c1:=upcase(currentpos^)
  168. else
  169. c1:=currentpos^;
  170. inc(currentpos);
  171. if currentpos^='-' then
  172. begin
  173. inc(currentpos);
  174. if currentpos^=#0 then
  175. begin
  176. error:=true;
  177. exit;
  178. end;
  179. if ref_caseinsensitive in flags then
  180. readchars:=[c1..upcase(currentpos^)]
  181. else
  182. readchars:=[c1..currentpos^];
  183. inc(currentpos);
  184. end
  185. else
  186. readchars:=[c1];
  187. end;
  188. end;
  189. end;
  190. function readcharset : tcharset;
  191. begin
  192. readcharset:=[];
  193. case currentpos^ of
  194. #0:
  195. exit;
  196. '[':
  197. begin
  198. inc(currentpos);
  199. while currentpos^<>']' do
  200. begin
  201. if currentpos^='^' then
  202. begin
  203. inc(currentpos);
  204. readcharset:=readcharset+(cs_allchars-readchars);
  205. end
  206. else
  207. readcharset:=readcharset+readchars;
  208. if error or (currentpos^=#0) then
  209. begin
  210. error:=true;
  211. exit;
  212. end;
  213. end;
  214. inc(currentpos);
  215. end;
  216. '^':
  217. begin
  218. inc(currentpos);
  219. readcharset:=cs_allchars-readchars;
  220. end;
  221. else
  222. readcharset:=readchars;
  223. end;
  224. end;
  225. function parseregexpr(next,elsepath : pregexprentry) : pregexprentry;
  226. var
  227. hp,hp2,ep : pregexprentry;
  228. cs : tcharset;
  229. chaining : ^pregexprentry;
  230. begin
  231. chaining:=nil;
  232. parseregexpr:=nil;
  233. if error then
  234. exit;
  235. { this dummy allows us to redirect the elsepath later }
  236. new(ep);
  237. doregister(ep);
  238. ep^.typ:=ret_charset;
  239. ep^.chars:=[];
  240. ep^.elsepath:=elsepath;
  241. elsepath:=ep;
  242. while true do
  243. begin
  244. if error then
  245. exit;
  246. case currentpos^ of
  247. '(':
  248. begin
  249. inc(currentpos);
  250. new(hp2);
  251. doregister(hp2);
  252. hp2^.typ:=ret_charset;
  253. hp2^.chars:=[];
  254. hp2^.elsepath:=next;
  255. hp:=parseregexpr(hp2,ep);
  256. if assigned(chaining) then
  257. chaining^:=hp
  258. else
  259. parseregexpr:=hp;
  260. chaining:=@hp2^.elsepath;
  261. if currentpos^<>')' then
  262. begin
  263. error:=true;
  264. exit;
  265. end;
  266. inc(currentpos);
  267. end;
  268. {
  269. '|':
  270. begin
  271. {$ifdef DEBUG}
  272. writeln('Creating backtrace entry');
  273. {$endif DEBUG}
  274. while currentpos^='|' do
  275. begin
  276. inc(currentpos);
  277. if currentpos^=#0 then
  278. begin
  279. error:=true;
  280. exit;
  281. end;
  282. doregister(hp2);
  283. hp2^.typ:=ret_charset;
  284. hp2^.chars:=[];
  285. hp2^.elsepath:=next;
  286. new(hp);
  287. doregister(hp);
  288. hp^.typ:=ret_backtrace;
  289. hp^.elsepath:=parseregexpr();
  290. hp^.next:=next;
  291. if assigned(chaining) then
  292. chaining^:=hp
  293. else
  294. parseregexpr:=hp;
  295. chaining:=@hp^.elsepath;
  296. end;
  297. exit;
  298. end;
  299. }
  300. ')':
  301. exit;
  302. '^':
  303. begin
  304. inc(currentpos);
  305. new(hp);
  306. doregister(hp);
  307. hp^.typ:=ret_startline;
  308. hp^.elsepath:=ep;
  309. // hp^.next:=parseregexpr(ep);
  310. end;
  311. '$':
  312. begin
  313. inc(currentpos);
  314. new(hp);
  315. doregister(hp);
  316. hp^.typ:=ret_endline;
  317. hp^.elsepath:=ep;
  318. // hp^.next:=parseregexpr(ep);
  319. end;
  320. #0:
  321. exit;
  322. else
  323. begin
  324. cs:=readcharset;
  325. if error then
  326. exit;
  327. case currentpos^ of
  328. '*':
  329. begin
  330. inc(currentpos);
  331. new(hp);
  332. doregister(hp);
  333. hp^.typ:=ret_charset;
  334. hp^.chars:=cs;
  335. hp^.elsepath:=next;
  336. hp^.next:=hp;
  337. if assigned(chaining) then
  338. chaining^:=hp
  339. else
  340. parseregexpr:=hp;
  341. chaining:=@hp^.elsepath;
  342. end;
  343. '+':
  344. begin
  345. inc(currentpos);
  346. new(hp);
  347. new(hp2);
  348. doregister(hp);
  349. doregister(hp2);
  350. hp^.typ:=ret_charset;
  351. hp2^.typ:=ret_charset;
  352. hp^.chars:=cs;
  353. hp2^.chars:=cs;
  354. hp^.elsepath:=elsepath;
  355. hp^.next:=hp2;
  356. hp2^.elsepath:=next;
  357. hp2^.next:=hp2;
  358. if assigned(chaining) then
  359. chaining^:=hp
  360. else
  361. parseregexpr:=hp;
  362. chaining:=@hp2^.elsepath;
  363. end;
  364. '?':
  365. begin
  366. inc(currentpos);
  367. new(hp);
  368. { this is a dummy }
  369. new(hp2);
  370. doregister(hp);
  371. doregister(hp2);
  372. hp^.typ:=ret_charset;
  373. hp^.chars:=cs;
  374. hp^.next:=hp2;
  375. hp^.elsepath:=hp2;
  376. hp2^.typ:=ret_charset;
  377. hp2^.chars:=[];
  378. hp2^.elsepath:=next;
  379. if assigned(chaining) then
  380. chaining^:=hp
  381. else
  382. parseregexpr:=hp;
  383. chaining:=@hp2^.elsepath;
  384. end;
  385. else
  386. begin
  387. new(hp);
  388. doregister(hp);
  389. hp^.typ:=ret_charset;
  390. hp^.chars:=cs;
  391. hp^.elsepath:=elsepath;
  392. hp^.next:=next;
  393. if assigned(chaining) then
  394. chaining^:=hp
  395. else
  396. parseregexpr:=hp;
  397. chaining:=@hp^.next;
  398. end;
  399. end;
  400. end;
  401. end;
  402. end;
  403. end;
  404. var
  405. endp : pregexprentry;
  406. begin
  407. GenerateRegExprEngine.Data:=nil;
  408. GenerateRegExprEngine.DestroyList:=nil;
  409. if regexpr=nil then
  410. exit;
  411. first:=nil;
  412. if (ref_singleline in flags) and (ref_multiline in flags) then
  413. exit;
  414. currentpos:=regexpr;
  415. error:=false;
  416. new(endp);
  417. doregister(endp);
  418. endp^.typ:=ret_illegalend;
  419. GenerateRegExprEngine.flags:=flags;
  420. GenerateRegExprEngine.Data:=parseregexpr(nil,endp);
  421. GenerateRegExprEngine.DestroyList:=first;
  422. if error or (currentpos^<>#0) then
  423. DestroyRegExprEngine(Result);
  424. end;
  425. procedure DestroyRegExprEngine(var regexpr : TRegExprEngine);
  426. var
  427. hp : pregexprentry;
  428. begin
  429. hp:=regexpr.DestroyList;
  430. while assigned(hp) do
  431. begin
  432. regexpr.DestroyList:=hp^.nextdestroy;
  433. dispose(hp);
  434. hp:=regexpr.DestroyList;
  435. end;
  436. regexpr.Data:=nil;
  437. regexpr.DestroyList:=nil;
  438. end;
  439. function RegExprPos(regexprengine : TRegExprEngine;p : pchar;var index,len : longint) : boolean;
  440. var
  441. lastpos : pchar;
  442. function dosearch(regexpr : pregexprentry;pos : pchar) : boolean;
  443. begin
  444. dosearch:=false;
  445. while true do
  446. begin
  447. {$IFDEF Debug}
  448. writeln(byte(regexpr^.typ));
  449. {$ENDIF Debug}
  450. case regexpr^.typ of
  451. ret_endline:
  452. begin
  453. if ref_multiline in regexprengine.flags then
  454. begin
  455. if ((pos+1)^ in [#10,#0]) then
  456. regexpr:=regexpr^.next
  457. else
  458. regexpr:=regexpr^.elsepath;
  459. end
  460. else
  461. begin
  462. if (pos+1)^=#0 then
  463. regexpr:=regexpr^.next
  464. else
  465. regexpr:=regexpr^.elsepath;
  466. end;
  467. end;
  468. ret_startline:
  469. begin
  470. if ref_multiline in regexprengine.flags then
  471. begin
  472. if (pos=p) or ((pos-1)^=#10) then
  473. regexpr:=regexpr^.next
  474. else
  475. regexpr:=regexpr^.elsepath;
  476. end
  477. else
  478. begin
  479. if pos=p then
  480. regexpr:=regexpr^.next
  481. else
  482. regexpr:=regexpr^.elsepath;
  483. end;
  484. end;
  485. ret_charset:
  486. begin
  487. if (pos^ in regexpr^.chars) or
  488. ((ref_caseinsensitive in regexprengine.flags) and
  489. (upcase(pos^) in regexpr^.chars)) then
  490. begin
  491. {$ifdef DEBUG}
  492. writeln('Found matching: ',pos^);
  493. {$endif DEBUG}
  494. regexpr:=regexpr^.next;
  495. inc(pos);
  496. end
  497. else
  498. begin
  499. {$ifdef DEBUG}
  500. writeln('Found unmatching: ',pos^);
  501. {$endif DEBUG}
  502. regexpr:=regexpr^.elsepath;
  503. end;
  504. end;
  505. ret_backtrace:
  506. begin
  507. {$ifdef DEBUG}
  508. writeln('Starting backtrace');
  509. {$endif DEBUG}
  510. if dosearch(regexpr^.next,pos) then
  511. begin
  512. dosearch:=true;
  513. exit;
  514. end
  515. else if dosearch(regexpr^.elsepath,pos) then
  516. begin
  517. dosearch:=true;
  518. exit;
  519. end
  520. else
  521. exit;
  522. end;
  523. end;
  524. lastpos:=pos;
  525. if regexpr=nil then
  526. begin
  527. dosearch:=true;
  528. exit;
  529. end;
  530. if regexpr^.typ=ret_illegalend then
  531. exit;
  532. if pos^=#0 then
  533. exit;
  534. end;
  535. end;
  536. begin
  537. RegExprPos:=false;
  538. index:=0;
  539. len:=0;
  540. if regexprengine.Data=nil then
  541. exit;
  542. while p^<>#0 do
  543. begin
  544. if dosearch(regexprengine.Data,p) then
  545. begin
  546. len:=lastpos-p;
  547. RegExprPos:=true;
  548. exit;
  549. end
  550. else
  551. begin
  552. inc(p);
  553. inc(index);
  554. end;
  555. end;
  556. index:=-1;
  557. end;
  558. begin
  559. cs_nonwordchars:=cs_allchars-cs_wordchars;
  560. cs_nondigits:=cs_allchars-cs_digits;
  561. cs_nonwhitespace:=cs_allchars-cs_whitespace;
  562. end.