regexpr.pp 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593
  1. {
  2. This file is part of the Free Pascal run time library.
  3. Copyright (c) 2000 by Florian Klaempfl
  4. This unit implements basic regular expression support
  5. See the file COPYING.FPC, included in this distribution,
  6. for details about the copyright.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  10. **********************************************************************}
  11. { $define DEBUG}
  12. {
  13. TODO:
  14. - correct backtracking, for example in (...)*
  15. - | support
  16. - getting substrings and using substrings with \1 etc.
  17. - test ^ and $
  18. - newline handling in DOS?
  19. - locals dependend upper/lowercase routines
  20. - extend the interface
  21. }
  22. {$mode objfpc}
  23. unit regexpr;
  24. interface
  25. { the following declarions are only in the interface because }
  26. { some procedures return pregexprentry but programs which }
  27. { use this unit shouldn't access this data structures }
  28. type
  29. tcharset = set of char;
  30. tregexprentrytype = (ret_charset,ret_or,ret_startpattern,
  31. ret_endpattern,ret_illegalend,ret_backtrace,ret_startline,
  32. ret_endline);
  33. pregexprentry = ^tregexprentry;
  34. tregexprentry = record
  35. next,nextdestroy : pregexprentry;
  36. case typ : tregexprentrytype of
  37. ret_charset : (chars : tcharset;
  38. elsepath : pregexprentry);
  39. ret_or : (alternative : pregexprentry);
  40. end;
  41. tregexprflag = (ref_singleline,ref_multiline,ref_caseinsensitive);
  42. tregexprflags = set of tregexprflag;
  43. TRegExprEngine = record
  44. Data : pregexprentry;
  45. DestroyList : pregexprentry;
  46. Flags : TRegExprFlags;
  47. end;
  48. const
  49. cs_allchars : tcharset = [#0..#255];
  50. cs_wordchars : tcharset = ['A'..'Z','a'..'z','_','0'..'9'];
  51. cs_newline : tcharset = [#10];
  52. cs_digits : tcharset = ['0'..'9'];
  53. cs_whitespace : tcharset = [' ',#9];
  54. var
  55. { these are initilized in the init section of the unit }
  56. cs_nonwordchars : tcharset;
  57. cs_nondigits : tcharset;
  58. cs_nonwhitespace : tcharset;
  59. { the following procedures can be used by units basing }
  60. { on the regexpr unit }
  61. function GenerateRegExprEngine(regexpr : pchar;flags : tregexprflags) : TRegExprEngine;
  62. procedure DestroyRegExprEngine(var regexpr : TRegExprEngine);
  63. function RegExprPos(regexprengine : TRegExprEngine;p : pchar;var index,len : longint) : boolean;
  64. implementation
  65. {$ifdef DEBUG}
  66. procedure writecharset(c : tcharset);
  67. var
  68. b : byte;
  69. begin
  70. for b:=0 to 255 do
  71. if chr(b) in c then
  72. write(chr(b));
  73. writeln;
  74. end;
  75. {$endif DEBUG}
  76. function GenerateRegExprEngine(regexpr : pchar;flags : tregexprflags) : TRegExprEngine;
  77. var
  78. first : pregexprentry;
  79. procedure doregister(p : pregexprentry);
  80. begin
  81. p^.nextdestroy:=first;
  82. if not(assigned(first)) then
  83. first:=p;
  84. end;
  85. var
  86. currentpos : pchar;
  87. error : boolean;
  88. function readchars : tcharset;
  89. var
  90. c1 : char;
  91. begin
  92. readchars:=[];
  93. case currentpos^ of
  94. #0:
  95. exit;
  96. '.':
  97. begin
  98. inc(currentpos);
  99. readchars:=cs_allchars-cs_newline;
  100. end;
  101. '\':
  102. begin
  103. inc(currentpos);
  104. case currentpos^ of
  105. #0:
  106. begin
  107. error:=true;
  108. exit;
  109. end;
  110. 't':
  111. begin
  112. inc(currentpos);
  113. readchars:=[#9];
  114. end;
  115. 'n':
  116. begin
  117. inc(currentpos);
  118. readchars:=[#10];
  119. end;
  120. 'r':
  121. begin
  122. inc(currentpos);
  123. readchars:=[#13];
  124. end;
  125. 'd':
  126. begin
  127. inc(currentpos);
  128. readchars:=cs_digits;
  129. end;
  130. 'D':
  131. begin
  132. inc(currentpos);
  133. readchars:=cs_nondigits;
  134. end;
  135. 's':
  136. begin
  137. inc(currentpos);
  138. readchars:=cs_whitespace;
  139. end;
  140. 'S':
  141. begin
  142. inc(currentpos);
  143. readchars:=cs_nonwhitespace;
  144. end;
  145. 'w':
  146. begin
  147. inc(currentpos);
  148. readchars:=cs_wordchars;
  149. end;
  150. 'W':
  151. begin
  152. inc(currentpos);
  153. readchars:=cs_nonwordchars;
  154. end;
  155. else
  156. begin
  157. error:=true;
  158. exit;
  159. end;
  160. end;
  161. end;
  162. else
  163. begin
  164. if ref_caseinsensitive in flags then
  165. c1:=upcase(currentpos^)
  166. else
  167. c1:=currentpos^;
  168. inc(currentpos);
  169. if currentpos^='-' then
  170. begin
  171. inc(currentpos);
  172. if currentpos^=#0 then
  173. begin
  174. error:=true;
  175. exit;
  176. end;
  177. if ref_caseinsensitive in flags then
  178. readchars:=[c1..upcase(currentpos^)]
  179. else
  180. readchars:=[c1..currentpos^];
  181. inc(currentpos);
  182. end
  183. else
  184. readchars:=[c1];
  185. end;
  186. end;
  187. end;
  188. function readcharset : tcharset;
  189. begin
  190. readcharset:=[];
  191. case currentpos^ of
  192. #0:
  193. exit;
  194. '[':
  195. begin
  196. inc(currentpos);
  197. while currentpos^<>']' do
  198. begin
  199. if currentpos^='^' then
  200. begin
  201. inc(currentpos);
  202. readcharset:=readcharset+(cs_allchars-readchars);
  203. end
  204. else
  205. readcharset:=readcharset+readchars;
  206. if error or (currentpos^=#0) then
  207. begin
  208. error:=true;
  209. exit;
  210. end;
  211. end;
  212. inc(currentpos);
  213. end;
  214. '^':
  215. begin
  216. inc(currentpos);
  217. readcharset:=cs_allchars-readchars;
  218. end;
  219. else
  220. readcharset:=readchars;
  221. end;
  222. end;
  223. function parseregexpr(next,elsepath : pregexprentry) : pregexprentry;
  224. var
  225. hp,hp2,ep : pregexprentry;
  226. cs : tcharset;
  227. chaining : ^pregexprentry;
  228. begin
  229. chaining:=nil;
  230. parseregexpr:=nil;
  231. if error then
  232. exit;
  233. { this dummy allows us to redirect the elsepath later }
  234. new(ep);
  235. doregister(ep);
  236. ep^.typ:=ret_charset;
  237. ep^.chars:=[];
  238. ep^.elsepath:=elsepath;
  239. elsepath:=ep;
  240. while true do
  241. begin
  242. if error then
  243. exit;
  244. case currentpos^ of
  245. '(':
  246. begin
  247. inc(currentpos);
  248. new(hp2);
  249. doregister(hp2);
  250. hp2^.typ:=ret_charset;
  251. hp2^.chars:=[];
  252. hp2^.elsepath:=next;
  253. hp:=parseregexpr(hp2,ep);
  254. if assigned(chaining) then
  255. chaining^:=hp
  256. else
  257. parseregexpr:=hp;
  258. chaining:=@hp2^.elsepath;
  259. if currentpos^<>')' then
  260. begin
  261. error:=true;
  262. exit;
  263. end;
  264. inc(currentpos);
  265. end;
  266. '|':
  267. begin
  268. {$ifdef DEBUG}
  269. writeln('Creating backtrace entry');
  270. {$endif DEBUG}
  271. inc(currentpos);
  272. if currentpos^=#0 then
  273. begin
  274. error:=true;
  275. exit;
  276. end;
  277. new(hp);
  278. doregister(hp);
  279. hp^.typ:=ret_backtrace;
  280. // hp^.elsepath:=parseregexpr(elsepath);
  281. hp^.next:=@parseregexpr;
  282. parseregexpr:=hp;
  283. exit;
  284. end;
  285. ')':
  286. exit;
  287. '^':
  288. begin
  289. inc(currentpos);
  290. new(hp);
  291. doregister(hp);
  292. hp^.typ:=ret_startline;
  293. hp^.elsepath:=ep;
  294. // hp^.next:=parseregexpr(ep);
  295. end;
  296. '$':
  297. begin
  298. inc(currentpos);
  299. new(hp);
  300. doregister(hp);
  301. hp^.typ:=ret_endline;
  302. hp^.elsepath:=ep;
  303. // hp^.next:=parseregexpr(ep);
  304. end;
  305. #0:
  306. exit;
  307. else
  308. begin
  309. cs:=readcharset;
  310. if error then
  311. exit;
  312. case currentpos^ of
  313. '*':
  314. begin
  315. inc(currentpos);
  316. new(hp);
  317. doregister(hp);
  318. hp^.typ:=ret_charset;
  319. hp^.chars:=cs;
  320. hp^.elsepath:=next;
  321. hp^.next:=hp;
  322. if assigned(chaining) then
  323. chaining^:=hp
  324. else
  325. parseregexpr:=hp;
  326. chaining:=@hp^.elsepath;
  327. end;
  328. '+':
  329. begin
  330. inc(currentpos);
  331. new(hp);
  332. new(hp2);
  333. doregister(hp);
  334. doregister(hp2);
  335. hp^.typ:=ret_charset;
  336. hp2^.typ:=ret_charset;
  337. hp^.chars:=cs;
  338. hp2^.chars:=cs;
  339. hp^.elsepath:=elsepath;
  340. hp^.next:=hp2;
  341. hp2^.elsepath:=next;
  342. hp2^.next:=hp2;
  343. if assigned(chaining) then
  344. chaining^:=hp
  345. else
  346. parseregexpr:=hp;
  347. chaining:=@hp2^.elsepath;
  348. end;
  349. '?':
  350. begin
  351. inc(currentpos);
  352. new(hp);
  353. { this is a dummy }
  354. new(hp2);
  355. doregister(hp);
  356. doregister(hp2);
  357. hp^.typ:=ret_charset;
  358. hp^.chars:=cs;
  359. hp^.next:=hp2;
  360. hp^.elsepath:=hp2;
  361. hp2^.typ:=ret_charset;
  362. hp2^.chars:=[];
  363. hp2^.elsepath:=next;
  364. if assigned(chaining) then
  365. chaining^:=hp
  366. else
  367. parseregexpr:=hp;
  368. chaining:=@hp2^.elsepath;
  369. end;
  370. else
  371. begin
  372. new(hp);
  373. doregister(hp);
  374. hp^.typ:=ret_charset;
  375. hp^.chars:=cs;
  376. hp^.elsepath:=elsepath;
  377. hp^.next:=next;
  378. if assigned(chaining) then
  379. chaining^:=hp
  380. else
  381. parseregexpr:=hp;
  382. chaining:=@hp^.next;
  383. end;
  384. end;
  385. end;
  386. end;
  387. end;
  388. end;
  389. var
  390. endp : pregexprentry;
  391. begin
  392. GenerateRegExprEngine.Data:=nil;
  393. GenerateRegExprEngine.DestroyList:=nil;
  394. if regexpr=nil then
  395. exit;
  396. first:=nil;
  397. if (ref_singleline in flags) and (ref_multiline in flags) then
  398. exit;
  399. currentpos:=regexpr;
  400. error:=false;
  401. new(endp);
  402. doregister(endp);
  403. endp^.typ:=ret_illegalend;
  404. GenerateRegExprEngine.flags:=flags;
  405. GenerateRegExprEngine.Data:=parseregexpr(nil,endp);
  406. GenerateRegExprEngine.DestroyList:=first;
  407. if error or (currentpos^<>#0) then
  408. DestroyRegExprEngine(Result);
  409. end;
  410. procedure DestroyRegExprEngine(var regexpr : TRegExprEngine);
  411. var
  412. hp : pregexprentry;
  413. begin
  414. hp:=regexpr.DestroyList;
  415. while assigned(hp) do
  416. begin
  417. regexpr.DestroyList:=hp^.nextdestroy;
  418. dispose(hp);
  419. hp:=regexpr.DestroyList;
  420. end;
  421. regexpr.Data:=nil;
  422. regexpr.DestroyList:=nil;
  423. end;
  424. function RegExprPos(regexprengine : TRegExprEngine;p : pchar;var index,len : longint) : boolean;
  425. var
  426. lastpos : pchar;
  427. function dosearch(regexpr : pregexprentry;pos : pchar) : boolean;
  428. begin
  429. dosearch:=false;
  430. while true do
  431. begin
  432. {$IFDEF Debug}
  433. writeln(byte(regexpr^.typ));
  434. {$ENDIF Debug}
  435. case regexpr^.typ of
  436. ret_endline:
  437. begin
  438. if ref_multiline in regexprengine.flags then
  439. begin
  440. if ((pos+1)^ in [#10,#0]) then
  441. regexpr:=regexpr^.next
  442. else
  443. regexpr:=regexpr^.elsepath;
  444. end
  445. else
  446. begin
  447. if (pos+1)^=#0 then
  448. regexpr:=regexpr^.next
  449. else
  450. regexpr:=regexpr^.elsepath;
  451. end;
  452. end;
  453. ret_startline:
  454. begin
  455. if ref_multiline in regexprengine.flags then
  456. begin
  457. if (pos=p) or ((pos-1)^=#10) then
  458. regexpr:=regexpr^.next
  459. else
  460. regexpr:=regexpr^.elsepath;
  461. end
  462. else
  463. begin
  464. if pos=p then
  465. regexpr:=regexpr^.next
  466. else
  467. regexpr:=regexpr^.elsepath;
  468. end;
  469. end;
  470. ret_charset:
  471. begin
  472. if (pos^ in regexpr^.chars) or
  473. ((ref_caseinsensitive in regexprengine.flags) and
  474. (upcase(pos^) in regexpr^.chars)) then
  475. begin
  476. {$ifdef DEBUG}
  477. writeln('Found matching: ',pos^);
  478. {$endif DEBUG}
  479. regexpr:=regexpr^.next;
  480. inc(pos);
  481. end
  482. else
  483. begin
  484. {$ifdef DEBUG}
  485. writeln('Found unmatching: ',pos^);
  486. {$endif DEBUG}
  487. regexpr:=regexpr^.elsepath;
  488. end;
  489. end;
  490. ret_backtrace:
  491. begin
  492. {$ifdef DEBUG}
  493. writeln('Starting backtrace');
  494. {$endif DEBUG}
  495. if dosearch(regexpr^.next,pos) then
  496. begin
  497. dosearch:=true;
  498. exit;
  499. end
  500. else if dosearch(regexpr^.elsepath,pos) then
  501. begin
  502. dosearch:=true;
  503. exit;
  504. end
  505. else
  506. exit;
  507. end;
  508. end;
  509. lastpos:=pos;
  510. if regexpr=nil then
  511. begin
  512. dosearch:=true;
  513. exit;
  514. end;
  515. if regexpr^.typ=ret_illegalend then
  516. exit;
  517. if pos^=#0 then
  518. exit;
  519. end;
  520. end;
  521. begin
  522. RegExprPos:=false;
  523. index:=0;
  524. len:=0;
  525. if regexprengine.Data=nil then
  526. exit;
  527. while p^<>#0 do
  528. begin
  529. if dosearch(regexprengine.Data,p) then
  530. begin
  531. len:=lastpos-p;
  532. RegExprPos:=true;
  533. exit;
  534. end
  535. else
  536. begin
  537. inc(p);
  538. inc(index);
  539. end;
  540. end;
  541. index:=-1;
  542. end;
  543. begin
  544. cs_nonwordchars:=cs_allchars-cs_wordchars;
  545. cs_nondigits:=cs_allchars-cs_digits;
  546. cs_nonwhitespace:=cs_allchars-cs_whitespace;
  547. end.