regexpr.pp 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606
  1. {
  2. $Id$
  3. This file is part of the Free Pascal run time library.
  4. Copyright (c) 2000 by Florian Klaempfl
  5. This unit implements basic regular expression support
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
  12. { $define DEBUG}
  13. {
  14. TODO:
  15. - correct backtracking, for example in (...)*
  16. - | support
  17. - getting substrings and using substrings with \1 etc.
  18. - test ^ and $
  19. - newline handling in DOS?
  20. - locals dependend upper/lowercase routines
  21. - extend the interface
  22. }
  23. unit regexpr;
  24. interface
  25. { the following declarions are only in the interface because }
  26. { some procedures return pregexprentry but programs which }
  27. { use this unit shouldn't access this data structures }
  28. type
  29. tcharset = set of char;
  30. tregexprentrytype = (ret_charset,ret_or,ret_startpattern,
  31. ret_endpattern,ret_illegalend,ret_backtrace,ret_startline,
  32. ret_endline);
  33. pregexprentry = ^tregexprentry;
  34. tregexprentry = record
  35. next,nextdestroy : pregexprentry;
  36. case typ : tregexprentrytype of
  37. ret_charset : (chars : tcharset;
  38. elsepath : pregexprentry);
  39. ret_or : (alternative : pregexprentry);
  40. end;
  41. tregexprflag = (ref_singleline,ref_multiline,ref_caseinsensitive);
  42. tregexprflags = set of tregexprflag;
  43. TRegExprEngine = record
  44. Data : pregexprentry;
  45. DestroyList : pregexprentry;
  46. Flags : TRegExprFlags;
  47. end;
  48. const
  49. cs_allchars : tcharset = [#0..#255];
  50. cs_wordchars : tcharset = ['A'..'Z','a'..'z','_','0'..'9'];
  51. cs_newline : tcharset = [#10];
  52. cs_digits : tcharset = ['0'..'9'];
  53. cs_whitespace : tcharset = [' ',#9];
  54. var
  55. { these are initilized in the init section of the unit }
  56. cs_nonwordchars : tcharset;
  57. cs_nondigits : tcharset;
  58. cs_nonwhitespace : tcharset;
  59. { the following procedures can be used by units basing }
  60. { on the regexpr unit }
  61. function GenerateRegExprEngine(regexpr : pchar;flags : tregexprflags) : TRegExprEngine;
  62. procedure DestroyRegExprEngine(var regexpr : TRegExprEngine);
  63. function RegExprPos(regexprengine : TRegExprEngine;p : pchar;var index,len : longint) : boolean;
  64. implementation
  65. uses
  66. strings;
  67. {$ifdef DEBUG}
  68. procedure writecharset(c : tcharset);
  69. var
  70. b : byte;
  71. begin
  72. for b:=0 to 255 do
  73. if chr(b) in c then
  74. write(chr(b));
  75. writeln;
  76. end;
  77. {$endif DEBUG}
  78. function GenerateRegExprEngine(regexpr : pchar;flags : tregexprflags) : TRegExprEngine;
  79. var
  80. first : pregexprentry;
  81. procedure doregister(p : pregexprentry);
  82. begin
  83. p^.nextdestroy:=first;
  84. if not(assigned(first)) then
  85. first:=p;
  86. end;
  87. var
  88. currentpos : pchar;
  89. error : boolean;
  90. function readchars : tcharset;
  91. var
  92. c1 : char;
  93. begin
  94. readchars:=[];
  95. case currentpos^ of
  96. #0:
  97. exit;
  98. '.':
  99. begin
  100. inc(currentpos);
  101. readchars:=cs_allchars-cs_newline;
  102. end;
  103. '\':
  104. begin
  105. inc(currentpos);
  106. case currentpos^ of
  107. #0:
  108. begin
  109. error:=true;
  110. exit;
  111. end;
  112. 't':
  113. begin
  114. inc(currentpos);
  115. readchars:=[#9];
  116. end;
  117. 'n':
  118. begin
  119. inc(currentpos);
  120. readchars:=[#10];
  121. end;
  122. 'r':
  123. begin
  124. inc(currentpos);
  125. readchars:=[#13];
  126. end;
  127. 'd':
  128. begin
  129. inc(currentpos);
  130. readchars:=cs_digits;
  131. end;
  132. 'D':
  133. begin
  134. inc(currentpos);
  135. readchars:=cs_nondigits;
  136. end;
  137. 's':
  138. begin
  139. inc(currentpos);
  140. readchars:=cs_whitespace;
  141. end;
  142. 'S':
  143. begin
  144. inc(currentpos);
  145. readchars:=cs_nonwhitespace;
  146. end;
  147. 'w':
  148. begin
  149. inc(currentpos);
  150. readchars:=cs_wordchars;
  151. end;
  152. 'W':
  153. begin
  154. inc(currentpos);
  155. readchars:=cs_nonwordchars;
  156. end;
  157. else
  158. begin
  159. error:=true;
  160. exit;
  161. end;
  162. end;
  163. end;
  164. else
  165. begin
  166. if ref_caseinsensitive in flags then
  167. c1:=upcase(currentpos^)
  168. else
  169. c1:=currentpos^;
  170. inc(currentpos);
  171. if currentpos^='-' then
  172. begin
  173. inc(currentpos);
  174. if currentpos^=#0 then
  175. begin
  176. error:=true;
  177. exit;
  178. end;
  179. if ref_caseinsensitive in flags then
  180. readchars:=[c1..upcase(currentpos^)]
  181. else
  182. readchars:=[c1..currentpos^];
  183. inc(currentpos);
  184. end
  185. else
  186. readchars:=[c1];
  187. end;
  188. end;
  189. end;
  190. function readcharset : tcharset;
  191. var
  192. c1,c2 : char;
  193. begin
  194. readcharset:=[];
  195. case currentpos^ of
  196. #0:
  197. exit;
  198. '[':
  199. begin
  200. inc(currentpos);
  201. while currentpos^<>']' do
  202. begin
  203. if currentpos^='^' then
  204. begin
  205. inc(currentpos);
  206. readcharset:=readcharset+(cs_allchars-readchars);
  207. end
  208. else
  209. readcharset:=readcharset+readchars;
  210. if error or (currentpos^=#0) then
  211. begin
  212. error:=true;
  213. exit;
  214. end;
  215. end;
  216. inc(currentpos);
  217. end;
  218. '^':
  219. begin
  220. inc(currentpos);
  221. readcharset:=cs_allchars-readchars;
  222. end;
  223. else
  224. readcharset:=readchars;
  225. end;
  226. end;
  227. function parseregexpr(next,elsepath : pregexprentry) : pregexprentry;
  228. var
  229. hp,hp2,ep : pregexprentry;
  230. cs : tcharset;
  231. chaining : ^pregexprentry;
  232. begin
  233. chaining:=nil;
  234. parseregexpr:=nil;
  235. if error then
  236. exit;
  237. { this dummy allows us to redirect the elsepath later }
  238. new(ep);
  239. doregister(ep);
  240. ep^.typ:=ret_charset;
  241. ep^.chars:=[];
  242. ep^.elsepath:=elsepath;
  243. elsepath:=ep;
  244. while true do
  245. begin
  246. if error then
  247. exit;
  248. case currentpos^ of
  249. '(':
  250. begin
  251. inc(currentpos);
  252. new(hp2);
  253. doregister(hp2);
  254. hp2^.typ:=ret_charset;
  255. hp2^.chars:=[];
  256. hp2^.elsepath:=next;
  257. hp:=parseregexpr(hp2,ep);
  258. if assigned(chaining) then
  259. chaining^:=hp
  260. else
  261. parseregexpr:=hp;
  262. chaining:=@hp2^.elsepath;
  263. if currentpos^<>')' then
  264. begin
  265. error:=true;
  266. exit;
  267. end;
  268. inc(currentpos);
  269. end;
  270. '|':
  271. begin
  272. {$ifdef DEBUG}
  273. writeln('Creating backtrace entry');
  274. {$endif DEBUG}
  275. inc(currentpos);
  276. if currentpos^=#0 then
  277. begin
  278. error:=true;
  279. exit;
  280. end;
  281. new(hp);
  282. doregister(hp);
  283. hp^.typ:=ret_backtrace;
  284. // hp^.elsepath:=parseregexpr(elsepath);
  285. hp^.next:=parseregexpr;
  286. parseregexpr:=hp;
  287. exit;
  288. end;
  289. ')':
  290. exit;
  291. '^':
  292. begin
  293. inc(currentpos);
  294. new(hp);
  295. doregister(hp);
  296. hp^.typ:=ret_startline;
  297. hp^.elsepath:=ep;
  298. // hp^.next:=parseregexpr(ep);
  299. end;
  300. '$':
  301. begin
  302. inc(currentpos);
  303. new(hp);
  304. doregister(hp);
  305. hp^.typ:=ret_endline;
  306. hp^.elsepath:=ep;
  307. // hp^.next:=parseregexpr(ep);
  308. end;
  309. #0:
  310. exit;
  311. else
  312. begin
  313. cs:=readcharset;
  314. if error then
  315. exit;
  316. case currentpos^ of
  317. '*':
  318. begin
  319. inc(currentpos);
  320. new(hp);
  321. doregister(hp);
  322. hp^.typ:=ret_charset;
  323. hp^.chars:=cs;
  324. hp^.elsepath:=next;
  325. hp^.next:=hp;
  326. if assigned(chaining) then
  327. chaining^:=hp
  328. else
  329. parseregexpr:=hp;
  330. chaining:=@hp^.elsepath;
  331. end;
  332. '+':
  333. begin
  334. inc(currentpos);
  335. new(hp);
  336. new(hp2);
  337. doregister(hp);
  338. doregister(hp2);
  339. hp^.typ:=ret_charset;
  340. hp2^.typ:=ret_charset;
  341. hp^.chars:=cs;
  342. hp2^.chars:=cs;
  343. hp^.elsepath:=elsepath;
  344. hp^.next:=hp2;
  345. hp2^.elsepath:=next;
  346. hp2^.next:=hp2;
  347. if assigned(chaining) then
  348. chaining^:=hp
  349. else
  350. parseregexpr:=hp;
  351. chaining:=@hp2^.elsepath;
  352. end;
  353. '?':
  354. begin
  355. inc(currentpos);
  356. new(hp);
  357. { this is a dummy }
  358. new(hp2);
  359. doregister(hp);
  360. doregister(hp2);
  361. hp^.typ:=ret_charset;
  362. hp^.chars:=cs;
  363. hp^.next:=hp2;
  364. hp^.elsepath:=hp2;
  365. hp2^.typ:=ret_charset;
  366. hp2^.chars:=[];
  367. hp2^.elsepath:=next;
  368. if assigned(chaining) then
  369. chaining^:=hp
  370. else
  371. parseregexpr:=hp;
  372. chaining:=@hp2^.elsepath;
  373. end;
  374. else
  375. begin
  376. new(hp);
  377. doregister(hp);
  378. hp^.typ:=ret_charset;
  379. hp^.chars:=cs;
  380. hp^.elsepath:=elsepath;
  381. hp^.next:=next;
  382. if assigned(chaining) then
  383. chaining^:=hp
  384. else
  385. parseregexpr:=hp;
  386. chaining:=@hp^.next;
  387. end;
  388. end;
  389. end;
  390. end;
  391. end;
  392. end;
  393. var
  394. endp : pregexprentry;
  395. begin
  396. GenerateRegExprEngine.Data:=nil;
  397. GenerateRegExprEngine.DestroyList:=nil;
  398. if regexpr=nil then
  399. exit;
  400. first:=nil;
  401. if (ref_singleline in flags) and (ref_multiline in flags) then
  402. exit;
  403. currentpos:=regexpr;
  404. new(endp);
  405. doregister(endp);
  406. endp^.typ:=ret_illegalend;
  407. GenerateRegExprEngine.flags:=flags;
  408. GenerateRegExprEngine.Data:=parseregexpr(nil,endp);
  409. GenerateRegExprEngine.DestroyList:=first;
  410. if error or (currentpos^<>#0) then
  411. DestroyRegExprEngine(GenerateRegExprEngine);
  412. end;
  413. procedure DestroyRegExprEngine(var regexpr : TRegExprEngine);
  414. var
  415. hp : pregexprentry;
  416. begin
  417. hp:=regexpr.DestroyList;
  418. while assigned(hp) do
  419. begin
  420. regexpr.DestroyList:=hp^.nextdestroy;
  421. dispose(hp);
  422. hp:=regexpr.DestroyList;
  423. end;
  424. regexpr.Data:=nil;
  425. regexpr.DestroyList:=nil;
  426. end;
  427. function RegExprPos(regexprengine : TRegExprEngine;p : pchar;var index,len : longint) : boolean;
  428. var
  429. lastpos,startpos : pchar;
  430. function dosearch(regexpr : pregexprentry;pos : pchar) : boolean;
  431. begin
  432. dosearch:=false;
  433. while true do
  434. begin
  435. writeln(byte(regexpr^.typ));
  436. case regexpr^.typ of
  437. ret_endline:
  438. begin
  439. if ref_multiline in regexprengine.flags then
  440. begin
  441. if ((pos+1)^ in [#10,#0]) then
  442. regexpr:=regexpr^.next
  443. else
  444. regexpr:=regexpr^.elsepath;
  445. end
  446. else
  447. begin
  448. if (pos+1)^=#0 then
  449. regexpr:=regexpr^.next
  450. else
  451. regexpr:=regexpr^.elsepath;
  452. end;
  453. end;
  454. ret_startline:
  455. begin
  456. if ref_multiline in regexprengine.flags then
  457. begin
  458. if (pos=p) or ((pos-1)^=#10) then
  459. regexpr:=regexpr^.next
  460. else
  461. regexpr:=regexpr^.elsepath;
  462. end
  463. else
  464. begin
  465. if pos=p then
  466. regexpr:=regexpr^.next
  467. else
  468. regexpr:=regexpr^.elsepath;
  469. end;
  470. end;
  471. ret_charset:
  472. begin
  473. if (pos^ in regexpr^.chars) or
  474. ((ref_caseinsensitive in regexprengine.flags) and
  475. (upcase(pos^) in regexpr^.chars)) then
  476. begin
  477. {$ifdef DEBUG}
  478. writeln('Found matching: ',pos^);
  479. {$endif DEBUG}
  480. regexpr:=regexpr^.next;
  481. inc(pos);
  482. end
  483. else
  484. begin
  485. {$ifdef DEBUG}
  486. writeln('Found unmatching: ',pos^);
  487. {$endif DEBUG}
  488. regexpr:=regexpr^.elsepath;
  489. end;
  490. end;
  491. ret_backtrace:
  492. begin
  493. {$ifdef DEBUG}
  494. writeln('Starting backtrace');
  495. {$endif DEBUG}
  496. if dosearch(regexpr^.next,pos) then
  497. begin
  498. dosearch:=true;
  499. exit;
  500. end
  501. else if dosearch(regexpr^.elsepath,pos) then
  502. begin
  503. dosearch:=true;
  504. exit;
  505. end
  506. else
  507. exit;
  508. end;
  509. end;
  510. lastpos:=pos;
  511. if regexpr=nil then
  512. begin
  513. dosearch:=true;
  514. exit;
  515. end;
  516. if regexpr^.typ=ret_illegalend then
  517. exit;
  518. if pos^=#0 then
  519. exit;
  520. end;
  521. end;
  522. begin
  523. RegExprPos:=false;
  524. index:=0;
  525. len:=0;
  526. if regexprengine.Data=nil then
  527. exit;
  528. while p^<>#0 do
  529. begin
  530. if dosearch(regexprengine.Data,p) then
  531. begin
  532. len:=lastpos-p;
  533. RegExprPos:=true;
  534. exit;
  535. end
  536. else
  537. begin
  538. inc(p);
  539. inc(index);
  540. end;
  541. end;
  542. index:=-1;
  543. end;
  544. begin
  545. cs_nonwordchars:=cs_allchars-cs_wordchars;
  546. cs_nondigits:=cs_allchars-cs_digits;
  547. cs_nonwhitespace:=cs_allchars-cs_whitespace;
  548. end.
  549. {
  550. $Log$
  551. Revision 1.3 2000-03-19 16:20:44 florian
  552. * some improvements
  553. Revision 1.2 2000/03/14 22:57:51 florian
  554. + added flags
  555. + support of case insensitive search
  556. Revision 1.1 2000/03/14 22:09:03 florian
  557. * Initial revision
  558. }