regexpr.pp 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609
  1. {
  2. $Id$
  3. This file is part of the Free Pascal run time library.
  4. Copyright (c) 2000 by Florian Klaempfl
  5. This unit implements basic regular expression support
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
  12. { $define DEBUG}
  13. {
  14. TODO:
  15. - correct backtracking, for example in (...)*
  16. - | support
  17. - getting substrings and using substrings with \1 etc.
  18. - test ^ and $
  19. - newline handling in DOS?
  20. - locals dependend upper/lowercase routines
  21. - extend the interface
  22. }
  23. {$mode objfpc}
  24. unit regexpr;
  25. interface
  26. { the following declarions are only in the interface because }
  27. { some procedures return pregexprentry but programs which }
  28. { use this unit shouldn't access this data structures }
  29. type
  30. tcharset = set of char;
  31. tregexprentrytype = (ret_charset,ret_or,ret_startpattern,
  32. ret_endpattern,ret_illegalend,ret_backtrace,ret_startline,
  33. ret_endline);
  34. pregexprentry = ^tregexprentry;
  35. tregexprentry = record
  36. next,nextdestroy : pregexprentry;
  37. case typ : tregexprentrytype of
  38. ret_charset : (chars : tcharset;
  39. elsepath : pregexprentry);
  40. ret_or : (alternative : pregexprentry);
  41. end;
  42. tregexprflag = (ref_singleline,ref_multiline,ref_caseinsensitive);
  43. tregexprflags = set of tregexprflag;
  44. TRegExprEngine = record
  45. Data : pregexprentry;
  46. DestroyList : pregexprentry;
  47. Flags : TRegExprFlags;
  48. end;
  49. const
  50. cs_allchars : tcharset = [#0..#255];
  51. cs_wordchars : tcharset = ['A'..'Z','a'..'z','_','0'..'9'];
  52. cs_newline : tcharset = [#10];
  53. cs_digits : tcharset = ['0'..'9'];
  54. cs_whitespace : tcharset = [' ',#9];
  55. var
  56. { these are initilized in the init section of the unit }
  57. cs_nonwordchars : tcharset;
  58. cs_nondigits : tcharset;
  59. cs_nonwhitespace : tcharset;
  60. { the following procedures can be used by units basing }
  61. { on the regexpr unit }
  62. function GenerateRegExprEngine(regexpr : pchar;flags : tregexprflags) : TRegExprEngine;
  63. procedure DestroyRegExprEngine(var regexpr : TRegExprEngine);
  64. function RegExprPos(regexprengine : TRegExprEngine;p : pchar;var index,len : longint) : boolean;
  65. implementation
  66. {$ifdef DEBUG}
  67. procedure writecharset(c : tcharset);
  68. var
  69. b : byte;
  70. begin
  71. for b:=0 to 255 do
  72. if chr(b) in c then
  73. write(chr(b));
  74. writeln;
  75. end;
  76. {$endif DEBUG}
  77. function GenerateRegExprEngine(regexpr : pchar;flags : tregexprflags) : TRegExprEngine;
  78. var
  79. first : pregexprentry;
  80. procedure doregister(p : pregexprentry);
  81. begin
  82. p^.nextdestroy:=first;
  83. if not(assigned(first)) then
  84. first:=p;
  85. end;
  86. var
  87. currentpos : pchar;
  88. error : boolean;
  89. function readchars : tcharset;
  90. var
  91. c1 : char;
  92. begin
  93. readchars:=[];
  94. case currentpos^ of
  95. #0:
  96. exit;
  97. '.':
  98. begin
  99. inc(currentpos);
  100. readchars:=cs_allchars-cs_newline;
  101. end;
  102. '\':
  103. begin
  104. inc(currentpos);
  105. case currentpos^ of
  106. #0:
  107. begin
  108. error:=true;
  109. exit;
  110. end;
  111. 't':
  112. begin
  113. inc(currentpos);
  114. readchars:=[#9];
  115. end;
  116. 'n':
  117. begin
  118. inc(currentpos);
  119. readchars:=[#10];
  120. end;
  121. 'r':
  122. begin
  123. inc(currentpos);
  124. readchars:=[#13];
  125. end;
  126. 'd':
  127. begin
  128. inc(currentpos);
  129. readchars:=cs_digits;
  130. end;
  131. 'D':
  132. begin
  133. inc(currentpos);
  134. readchars:=cs_nondigits;
  135. end;
  136. 's':
  137. begin
  138. inc(currentpos);
  139. readchars:=cs_whitespace;
  140. end;
  141. 'S':
  142. begin
  143. inc(currentpos);
  144. readchars:=cs_nonwhitespace;
  145. end;
  146. 'w':
  147. begin
  148. inc(currentpos);
  149. readchars:=cs_wordchars;
  150. end;
  151. 'W':
  152. begin
  153. inc(currentpos);
  154. readchars:=cs_nonwordchars;
  155. end;
  156. else
  157. begin
  158. error:=true;
  159. exit;
  160. end;
  161. end;
  162. end;
  163. else
  164. begin
  165. if ref_caseinsensitive in flags then
  166. c1:=upcase(currentpos^)
  167. else
  168. c1:=currentpos^;
  169. inc(currentpos);
  170. if currentpos^='-' then
  171. begin
  172. inc(currentpos);
  173. if currentpos^=#0 then
  174. begin
  175. error:=true;
  176. exit;
  177. end;
  178. if ref_caseinsensitive in flags then
  179. readchars:=[c1..upcase(currentpos^)]
  180. else
  181. readchars:=[c1..currentpos^];
  182. inc(currentpos);
  183. end
  184. else
  185. readchars:=[c1];
  186. end;
  187. end;
  188. end;
  189. function readcharset : tcharset;
  190. begin
  191. readcharset:=[];
  192. case currentpos^ of
  193. #0:
  194. exit;
  195. '[':
  196. begin
  197. inc(currentpos);
  198. while currentpos^<>']' do
  199. begin
  200. if currentpos^='^' then
  201. begin
  202. inc(currentpos);
  203. readcharset:=readcharset+(cs_allchars-readchars);
  204. end
  205. else
  206. readcharset:=readcharset+readchars;
  207. if error or (currentpos^=#0) then
  208. begin
  209. error:=true;
  210. exit;
  211. end;
  212. end;
  213. inc(currentpos);
  214. end;
  215. '^':
  216. begin
  217. inc(currentpos);
  218. readcharset:=cs_allchars-readchars;
  219. end;
  220. else
  221. readcharset:=readchars;
  222. end;
  223. end;
  224. function parseregexpr(next,elsepath : pregexprentry) : pregexprentry;
  225. var
  226. hp,hp2,ep : pregexprentry;
  227. cs : tcharset;
  228. chaining : ^pregexprentry;
  229. begin
  230. chaining:=nil;
  231. parseregexpr:=nil;
  232. if error then
  233. exit;
  234. { this dummy allows us to redirect the elsepath later }
  235. new(ep);
  236. doregister(ep);
  237. ep^.typ:=ret_charset;
  238. ep^.chars:=[];
  239. ep^.elsepath:=elsepath;
  240. elsepath:=ep;
  241. while true do
  242. begin
  243. if error then
  244. exit;
  245. case currentpos^ of
  246. '(':
  247. begin
  248. inc(currentpos);
  249. new(hp2);
  250. doregister(hp2);
  251. hp2^.typ:=ret_charset;
  252. hp2^.chars:=[];
  253. hp2^.elsepath:=next;
  254. hp:=parseregexpr(hp2,ep);
  255. if assigned(chaining) then
  256. chaining^:=hp
  257. else
  258. parseregexpr:=hp;
  259. chaining:=@hp2^.elsepath;
  260. if currentpos^<>')' then
  261. begin
  262. error:=true;
  263. exit;
  264. end;
  265. inc(currentpos);
  266. end;
  267. '|':
  268. begin
  269. {$ifdef DEBUG}
  270. writeln('Creating backtrace entry');
  271. {$endif DEBUG}
  272. inc(currentpos);
  273. if currentpos^=#0 then
  274. begin
  275. error:=true;
  276. exit;
  277. end;
  278. new(hp);
  279. doregister(hp);
  280. hp^.typ:=ret_backtrace;
  281. // hp^.elsepath:=parseregexpr(elsepath);
  282. hp^.next:=@parseregexpr;
  283. parseregexpr:=hp;
  284. exit;
  285. end;
  286. ')':
  287. exit;
  288. '^':
  289. begin
  290. inc(currentpos);
  291. new(hp);
  292. doregister(hp);
  293. hp^.typ:=ret_startline;
  294. hp^.elsepath:=ep;
  295. // hp^.next:=parseregexpr(ep);
  296. end;
  297. '$':
  298. begin
  299. inc(currentpos);
  300. new(hp);
  301. doregister(hp);
  302. hp^.typ:=ret_endline;
  303. hp^.elsepath:=ep;
  304. // hp^.next:=parseregexpr(ep);
  305. end;
  306. #0:
  307. exit;
  308. else
  309. begin
  310. cs:=readcharset;
  311. if error then
  312. exit;
  313. case currentpos^ of
  314. '*':
  315. begin
  316. inc(currentpos);
  317. new(hp);
  318. doregister(hp);
  319. hp^.typ:=ret_charset;
  320. hp^.chars:=cs;
  321. hp^.elsepath:=next;
  322. hp^.next:=hp;
  323. if assigned(chaining) then
  324. chaining^:=hp
  325. else
  326. parseregexpr:=hp;
  327. chaining:=@hp^.elsepath;
  328. end;
  329. '+':
  330. begin
  331. inc(currentpos);
  332. new(hp);
  333. new(hp2);
  334. doregister(hp);
  335. doregister(hp2);
  336. hp^.typ:=ret_charset;
  337. hp2^.typ:=ret_charset;
  338. hp^.chars:=cs;
  339. hp2^.chars:=cs;
  340. hp^.elsepath:=elsepath;
  341. hp^.next:=hp2;
  342. hp2^.elsepath:=next;
  343. hp2^.next:=hp2;
  344. if assigned(chaining) then
  345. chaining^:=hp
  346. else
  347. parseregexpr:=hp;
  348. chaining:=@hp2^.elsepath;
  349. end;
  350. '?':
  351. begin
  352. inc(currentpos);
  353. new(hp);
  354. { this is a dummy }
  355. new(hp2);
  356. doregister(hp);
  357. doregister(hp2);
  358. hp^.typ:=ret_charset;
  359. hp^.chars:=cs;
  360. hp^.next:=hp2;
  361. hp^.elsepath:=hp2;
  362. hp2^.typ:=ret_charset;
  363. hp2^.chars:=[];
  364. hp2^.elsepath:=next;
  365. if assigned(chaining) then
  366. chaining^:=hp
  367. else
  368. parseregexpr:=hp;
  369. chaining:=@hp2^.elsepath;
  370. end;
  371. else
  372. begin
  373. new(hp);
  374. doregister(hp);
  375. hp^.typ:=ret_charset;
  376. hp^.chars:=cs;
  377. hp^.elsepath:=elsepath;
  378. hp^.next:=next;
  379. if assigned(chaining) then
  380. chaining^:=hp
  381. else
  382. parseregexpr:=hp;
  383. chaining:=@hp^.next;
  384. end;
  385. end;
  386. end;
  387. end;
  388. end;
  389. end;
  390. var
  391. endp : pregexprentry;
  392. begin
  393. GenerateRegExprEngine.Data:=nil;
  394. GenerateRegExprEngine.DestroyList:=nil;
  395. if regexpr=nil then
  396. exit;
  397. first:=nil;
  398. if (ref_singleline in flags) and (ref_multiline in flags) then
  399. exit;
  400. currentpos:=regexpr;
  401. error:=false;
  402. new(endp);
  403. doregister(endp);
  404. endp^.typ:=ret_illegalend;
  405. GenerateRegExprEngine.flags:=flags;
  406. GenerateRegExprEngine.Data:=parseregexpr(nil,endp);
  407. GenerateRegExprEngine.DestroyList:=first;
  408. if error or (currentpos^<>#0) then
  409. DestroyRegExprEngine(Result);
  410. end;
  411. procedure DestroyRegExprEngine(var regexpr : TRegExprEngine);
  412. var
  413. hp : pregexprentry;
  414. begin
  415. hp:=regexpr.DestroyList;
  416. while assigned(hp) do
  417. begin
  418. regexpr.DestroyList:=hp^.nextdestroy;
  419. dispose(hp);
  420. hp:=regexpr.DestroyList;
  421. end;
  422. regexpr.Data:=nil;
  423. regexpr.DestroyList:=nil;
  424. end;
  425. function RegExprPos(regexprengine : TRegExprEngine;p : pchar;var index,len : longint) : boolean;
  426. var
  427. lastpos : pchar;
  428. function dosearch(regexpr : pregexprentry;pos : pchar) : boolean;
  429. begin
  430. dosearch:=false;
  431. while true do
  432. begin
  433. {$IFDEF Debug}
  434. writeln(byte(regexpr^.typ));
  435. {$ENDIF Debug}
  436. case regexpr^.typ of
  437. ret_endline:
  438. begin
  439. if ref_multiline in regexprengine.flags then
  440. begin
  441. if ((pos+1)^ in [#10,#0]) then
  442. regexpr:=regexpr^.next
  443. else
  444. regexpr:=regexpr^.elsepath;
  445. end
  446. else
  447. begin
  448. if (pos+1)^=#0 then
  449. regexpr:=regexpr^.next
  450. else
  451. regexpr:=regexpr^.elsepath;
  452. end;
  453. end;
  454. ret_startline:
  455. begin
  456. if ref_multiline in regexprengine.flags then
  457. begin
  458. if (pos=p) or ((pos-1)^=#10) then
  459. regexpr:=regexpr^.next
  460. else
  461. regexpr:=regexpr^.elsepath;
  462. end
  463. else
  464. begin
  465. if pos=p then
  466. regexpr:=regexpr^.next
  467. else
  468. regexpr:=regexpr^.elsepath;
  469. end;
  470. end;
  471. ret_charset:
  472. begin
  473. if (pos^ in regexpr^.chars) or
  474. ((ref_caseinsensitive in regexprengine.flags) and
  475. (upcase(pos^) in regexpr^.chars)) then
  476. begin
  477. {$ifdef DEBUG}
  478. writeln('Found matching: ',pos^);
  479. {$endif DEBUG}
  480. regexpr:=regexpr^.next;
  481. inc(pos);
  482. end
  483. else
  484. begin
  485. {$ifdef DEBUG}
  486. writeln('Found unmatching: ',pos^);
  487. {$endif DEBUG}
  488. regexpr:=regexpr^.elsepath;
  489. end;
  490. end;
  491. ret_backtrace:
  492. begin
  493. {$ifdef DEBUG}
  494. writeln('Starting backtrace');
  495. {$endif DEBUG}
  496. if dosearch(regexpr^.next,pos) then
  497. begin
  498. dosearch:=true;
  499. exit;
  500. end
  501. else if dosearch(regexpr^.elsepath,pos) then
  502. begin
  503. dosearch:=true;
  504. exit;
  505. end
  506. else
  507. exit;
  508. end;
  509. end;
  510. lastpos:=pos;
  511. if regexpr=nil then
  512. begin
  513. dosearch:=true;
  514. exit;
  515. end;
  516. if regexpr^.typ=ret_illegalend then
  517. exit;
  518. if pos^=#0 then
  519. exit;
  520. end;
  521. end;
  522. begin
  523. RegExprPos:=false;
  524. index:=0;
  525. len:=0;
  526. if regexprengine.Data=nil then
  527. exit;
  528. while p^<>#0 do
  529. begin
  530. if dosearch(regexprengine.Data,p) then
  531. begin
  532. len:=lastpos-p;
  533. RegExprPos:=true;
  534. exit;
  535. end
  536. else
  537. begin
  538. inc(p);
  539. inc(index);
  540. end;
  541. end;
  542. index:=-1;
  543. end;
  544. begin
  545. cs_nonwordchars:=cs_allchars-cs_wordchars;
  546. cs_nondigits:=cs_allchars-cs_digits;
  547. cs_nonwhitespace:=cs_allchars-cs_whitespace;
  548. end.
  549. {
  550. $Log$
  551. Revision 1.3 2004-11-14 15:55:25 marco
  552. * initialise error to false. Why was this not found yet? :-)
  553. Revision 1.2 2002/09/07 15:42:53 peter
  554. * old logs removed and tabs fixed
  555. Revision 1.1 2002/01/29 17:54:56 peter
  556. * splitted to base and extra
  557. Revision 1.4 2002/01/22 13:13:14 pierre
  558. + add mode objfpc to avoid problems if compiling from IDE dir
  559. }