regexpr.pp 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595
  1. {
  2. This file is part of the Free Pascal run time library.
  3. Copyright (c) 2000 by Florian Klaempfl
  4. This unit implements basic regular expression support
  5. See the file COPYING.FPC, included in this distribution,
  6. for details about the copyright.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  10. **********************************************************************}
  11. { $define DEBUG}
  12. {
  13. TODO:
  14. - correct backtracking, for example in (...)*
  15. - | support
  16. - getting substrings and using substrings with \1 etc.
  17. - test ^ and $
  18. - newline handling in DOS?
  19. - locals dependend upper/lowercase routines
  20. - extend the interface
  21. }
  22. {$mode objfpc}
  23. unit regexpr;
  24. interface
  25. { the following declarions are only in the interface because }
  26. { some procedures return pregexprentry but programs which }
  27. { use this unit shouldn't access this data structures }
  28. type
  29. tcharset = set of char;
  30. tregexprentrytype = (ret_charset,ret_or,ret_startpattern,
  31. ret_endpattern,ret_illegalend,ret_backtrace,ret_startline,
  32. ret_endline);
  33. pregexprentry = ^tregexprentry;
  34. tregexprentry = record
  35. next,nextdestroy : pregexprentry;
  36. case typ : tregexprentrytype of
  37. ret_charset : (chars : tcharset;
  38. elsepath : pregexprentry);
  39. ret_or : (alternative : pregexprentry);
  40. end;
  41. tregexprflag = (ref_singleline,ref_multiline,ref_caseinsensitive);
  42. tregexprflags = set of tregexprflag;
  43. TRegExprEngine = record
  44. Data : pregexprentry;
  45. DestroyList : pregexprentry;
  46. Flags : TRegExprFlags;
  47. end;
  48. const
  49. cs_allchars : tcharset = [#0..#255];
  50. cs_wordchars : tcharset = ['A'..'Z','a'..'z','_','0'..'9'];
  51. cs_newline : tcharset = [#10];
  52. cs_digits : tcharset = ['0'..'9'];
  53. cs_whitespace : tcharset = [' ',#9];
  54. var
  55. { these are initilized in the init section of the unit }
  56. cs_nonwordchars : tcharset;
  57. cs_nondigits : tcharset;
  58. cs_nonwhitespace : tcharset;
  59. { the following procedures can be used by units basing }
  60. { on the regexpr unit }
  61. function GenerateRegExprEngine(regexpr : pchar;flags : tregexprflags) : TRegExprEngine;
  62. procedure DestroyRegExprEngine(var regexpr : TRegExprEngine);
  63. function RegExprPos(regexprengine : TRegExprEngine;p : pchar;var index,len : longint) : boolean;
  64. implementation
  65. {$ifdef DEBUG}
  66. procedure writecharset(c : tcharset);
  67. var
  68. b : byte;
  69. begin
  70. for b:=0 to 255 do
  71. if chr(b) in c then
  72. write(chr(b));
  73. writeln;
  74. end;
  75. {$endif DEBUG}
  76. function GenerateRegExprEngine(regexpr : pchar;flags : tregexprflags) : TRegExprEngine;
  77. var
  78. first : pregexprentry;
  79. procedure doregister(p : pregexprentry);
  80. begin
  81. p^.nextdestroy:=first;
  82. if not(assigned(first)) then
  83. first:=p;
  84. end;
  85. var
  86. currentpos : pchar;
  87. error : boolean;
  88. function readchars : tcharset;
  89. var
  90. c1 : char;
  91. begin
  92. readchars:=[];
  93. case currentpos^ of
  94. #0:
  95. exit;
  96. '.':
  97. begin
  98. inc(currentpos);
  99. readchars:=cs_allchars-cs_newline;
  100. end;
  101. '\':
  102. begin
  103. inc(currentpos);
  104. case currentpos^ of
  105. #0:
  106. begin
  107. error:=true;
  108. exit;
  109. end;
  110. 't':
  111. begin
  112. inc(currentpos);
  113. readchars:=[#9];
  114. end;
  115. 'n':
  116. begin
  117. inc(currentpos);
  118. readchars:=[#10];
  119. end;
  120. 'r':
  121. begin
  122. inc(currentpos);
  123. readchars:=[#13];
  124. end;
  125. 'd':
  126. begin
  127. inc(currentpos);
  128. readchars:=cs_digits;
  129. end;
  130. 'D':
  131. begin
  132. inc(currentpos);
  133. readchars:=cs_nondigits;
  134. end;
  135. 's':
  136. begin
  137. inc(currentpos);
  138. readchars:=cs_whitespace;
  139. end;
  140. 'S':
  141. begin
  142. inc(currentpos);
  143. readchars:=cs_nonwhitespace;
  144. end;
  145. 'w':
  146. begin
  147. inc(currentpos);
  148. readchars:=cs_wordchars;
  149. end;
  150. 'W':
  151. begin
  152. inc(currentpos);
  153. readchars:=cs_nonwordchars;
  154. end;
  155. else
  156. begin //Some basic escaping...
  157. readchars := [currentpos^];
  158. inc (currentpos);
  159. {error:=true;
  160. exit;}
  161. end;
  162. end;
  163. end;
  164. else
  165. begin
  166. if ref_caseinsensitive in flags then
  167. c1:=upcase(currentpos^)
  168. else
  169. c1:=currentpos^;
  170. inc(currentpos);
  171. if currentpos^='-' then
  172. begin
  173. inc(currentpos);
  174. if currentpos^=#0 then
  175. begin
  176. error:=true;
  177. exit;
  178. end;
  179. if ref_caseinsensitive in flags then
  180. readchars:=[c1..upcase(currentpos^)]
  181. else
  182. readchars:=[c1..currentpos^];
  183. inc(currentpos);
  184. end
  185. else
  186. readchars:=[c1];
  187. end;
  188. end;
  189. end;
  190. function readcharset : tcharset;
  191. begin
  192. readcharset:=[];
  193. case currentpos^ of
  194. #0:
  195. exit;
  196. '[':
  197. begin
  198. inc(currentpos);
  199. while currentpos^<>']' do
  200. begin
  201. if currentpos^='^' then
  202. begin
  203. inc(currentpos);
  204. readcharset:=readcharset+(cs_allchars-readchars);
  205. end
  206. else
  207. readcharset:=readcharset+readchars;
  208. if error or (currentpos^=#0) then
  209. begin
  210. error:=true;
  211. exit;
  212. end;
  213. end;
  214. inc(currentpos);
  215. end;
  216. '^':
  217. begin
  218. inc(currentpos);
  219. readcharset:=cs_allchars-readchars;
  220. end;
  221. else
  222. readcharset:=readchars;
  223. end;
  224. end;
  225. function parseregexpr(next,elsepath : pregexprentry) : pregexprentry;
  226. var
  227. hp,hp2,ep : pregexprentry;
  228. cs : tcharset;
  229. chaining : ^pregexprentry;
  230. begin
  231. chaining:=nil;
  232. parseregexpr:=nil;
  233. if error then
  234. exit;
  235. { this dummy allows us to redirect the elsepath later }
  236. new(ep);
  237. doregister(ep);
  238. ep^.typ:=ret_charset;
  239. ep^.chars:=[];
  240. ep^.elsepath:=elsepath;
  241. elsepath:=ep;
  242. while true do
  243. begin
  244. if error then
  245. exit;
  246. case currentpos^ of
  247. '(':
  248. begin
  249. inc(currentpos);
  250. new(hp2);
  251. doregister(hp2);
  252. hp2^.typ:=ret_charset;
  253. hp2^.chars:=[];
  254. hp2^.elsepath:=next;
  255. hp:=parseregexpr(hp2,ep);
  256. if assigned(chaining) then
  257. chaining^:=hp
  258. else
  259. parseregexpr:=hp;
  260. chaining:=@hp2^.elsepath;
  261. if currentpos^<>')' then
  262. begin
  263. error:=true;
  264. exit;
  265. end;
  266. inc(currentpos);
  267. end;
  268. '|':
  269. begin
  270. {$ifdef DEBUG}
  271. writeln('Creating backtrace entry');
  272. {$endif DEBUG}
  273. inc(currentpos);
  274. if currentpos^=#0 then
  275. begin
  276. error:=true;
  277. exit;
  278. end;
  279. new(hp);
  280. doregister(hp);
  281. hp^.typ:=ret_backtrace;
  282. // hp^.elsepath:=parseregexpr(elsepath);
  283. hp^.next:=@parseregexpr;
  284. parseregexpr:=hp;
  285. exit;
  286. end;
  287. ')':
  288. exit;
  289. '^':
  290. begin
  291. inc(currentpos);
  292. new(hp);
  293. doregister(hp);
  294. hp^.typ:=ret_startline;
  295. hp^.elsepath:=ep;
  296. // hp^.next:=parseregexpr(ep);
  297. end;
  298. '$':
  299. begin
  300. inc(currentpos);
  301. new(hp);
  302. doregister(hp);
  303. hp^.typ:=ret_endline;
  304. hp^.elsepath:=ep;
  305. // hp^.next:=parseregexpr(ep);
  306. end;
  307. #0:
  308. exit;
  309. else
  310. begin
  311. cs:=readcharset;
  312. if error then
  313. exit;
  314. case currentpos^ of
  315. '*':
  316. begin
  317. inc(currentpos);
  318. new(hp);
  319. doregister(hp);
  320. hp^.typ:=ret_charset;
  321. hp^.chars:=cs;
  322. hp^.elsepath:=next;
  323. hp^.next:=hp;
  324. if assigned(chaining) then
  325. chaining^:=hp
  326. else
  327. parseregexpr:=hp;
  328. chaining:=@hp^.elsepath;
  329. end;
  330. '+':
  331. begin
  332. inc(currentpos);
  333. new(hp);
  334. new(hp2);
  335. doregister(hp);
  336. doregister(hp2);
  337. hp^.typ:=ret_charset;
  338. hp2^.typ:=ret_charset;
  339. hp^.chars:=cs;
  340. hp2^.chars:=cs;
  341. hp^.elsepath:=elsepath;
  342. hp^.next:=hp2;
  343. hp2^.elsepath:=next;
  344. hp2^.next:=hp2;
  345. if assigned(chaining) then
  346. chaining^:=hp
  347. else
  348. parseregexpr:=hp;
  349. chaining:=@hp2^.elsepath;
  350. end;
  351. '?':
  352. begin
  353. inc(currentpos);
  354. new(hp);
  355. { this is a dummy }
  356. new(hp2);
  357. doregister(hp);
  358. doregister(hp2);
  359. hp^.typ:=ret_charset;
  360. hp^.chars:=cs;
  361. hp^.next:=hp2;
  362. hp^.elsepath:=hp2;
  363. hp2^.typ:=ret_charset;
  364. hp2^.chars:=[];
  365. hp2^.elsepath:=next;
  366. if assigned(chaining) then
  367. chaining^:=hp
  368. else
  369. parseregexpr:=hp;
  370. chaining:=@hp2^.elsepath;
  371. end;
  372. else
  373. begin
  374. new(hp);
  375. doregister(hp);
  376. hp^.typ:=ret_charset;
  377. hp^.chars:=cs;
  378. hp^.elsepath:=elsepath;
  379. hp^.next:=next;
  380. if assigned(chaining) then
  381. chaining^:=hp
  382. else
  383. parseregexpr:=hp;
  384. chaining:=@hp^.next;
  385. end;
  386. end;
  387. end;
  388. end;
  389. end;
  390. end;
  391. var
  392. endp : pregexprentry;
  393. begin
  394. GenerateRegExprEngine.Data:=nil;
  395. GenerateRegExprEngine.DestroyList:=nil;
  396. if regexpr=nil then
  397. exit;
  398. first:=nil;
  399. if (ref_singleline in flags) and (ref_multiline in flags) then
  400. exit;
  401. currentpos:=regexpr;
  402. error:=false;
  403. new(endp);
  404. doregister(endp);
  405. endp^.typ:=ret_illegalend;
  406. GenerateRegExprEngine.flags:=flags;
  407. GenerateRegExprEngine.Data:=parseregexpr(nil,endp);
  408. GenerateRegExprEngine.DestroyList:=first;
  409. if error or (currentpos^<>#0) then
  410. DestroyRegExprEngine(Result);
  411. end;
  412. procedure DestroyRegExprEngine(var regexpr : TRegExprEngine);
  413. var
  414. hp : pregexprentry;
  415. begin
  416. hp:=regexpr.DestroyList;
  417. while assigned(hp) do
  418. begin
  419. regexpr.DestroyList:=hp^.nextdestroy;
  420. dispose(hp);
  421. hp:=regexpr.DestroyList;
  422. end;
  423. regexpr.Data:=nil;
  424. regexpr.DestroyList:=nil;
  425. end;
  426. function RegExprPos(regexprengine : TRegExprEngine;p : pchar;var index,len : longint) : boolean;
  427. var
  428. lastpos : pchar;
  429. function dosearch(regexpr : pregexprentry;pos : pchar) : boolean;
  430. begin
  431. dosearch:=false;
  432. while true do
  433. begin
  434. {$IFDEF Debug}
  435. writeln(byte(regexpr^.typ));
  436. {$ENDIF Debug}
  437. case regexpr^.typ of
  438. ret_endline:
  439. begin
  440. if ref_multiline in regexprengine.flags then
  441. begin
  442. if ((pos+1)^ in [#10,#0]) then
  443. regexpr:=regexpr^.next
  444. else
  445. regexpr:=regexpr^.elsepath;
  446. end
  447. else
  448. begin
  449. if (pos+1)^=#0 then
  450. regexpr:=regexpr^.next
  451. else
  452. regexpr:=regexpr^.elsepath;
  453. end;
  454. end;
  455. ret_startline:
  456. begin
  457. if ref_multiline in regexprengine.flags then
  458. begin
  459. if (pos=p) or ((pos-1)^=#10) then
  460. regexpr:=regexpr^.next
  461. else
  462. regexpr:=regexpr^.elsepath;
  463. end
  464. else
  465. begin
  466. if pos=p then
  467. regexpr:=regexpr^.next
  468. else
  469. regexpr:=regexpr^.elsepath;
  470. end;
  471. end;
  472. ret_charset:
  473. begin
  474. if (pos^ in regexpr^.chars) or
  475. ((ref_caseinsensitive in regexprengine.flags) and
  476. (upcase(pos^) in regexpr^.chars)) then
  477. begin
  478. {$ifdef DEBUG}
  479. writeln('Found matching: ',pos^);
  480. {$endif DEBUG}
  481. regexpr:=regexpr^.next;
  482. inc(pos);
  483. end
  484. else
  485. begin
  486. {$ifdef DEBUG}
  487. writeln('Found unmatching: ',pos^);
  488. {$endif DEBUG}
  489. regexpr:=regexpr^.elsepath;
  490. end;
  491. end;
  492. ret_backtrace:
  493. begin
  494. {$ifdef DEBUG}
  495. writeln('Starting backtrace');
  496. {$endif DEBUG}
  497. if dosearch(regexpr^.next,pos) then
  498. begin
  499. dosearch:=true;
  500. exit;
  501. end
  502. else if dosearch(regexpr^.elsepath,pos) then
  503. begin
  504. dosearch:=true;
  505. exit;
  506. end
  507. else
  508. exit;
  509. end;
  510. end;
  511. lastpos:=pos;
  512. if regexpr=nil then
  513. begin
  514. dosearch:=true;
  515. exit;
  516. end;
  517. if regexpr^.typ=ret_illegalend then
  518. exit;
  519. if pos^=#0 then
  520. exit;
  521. end;
  522. end;
  523. begin
  524. RegExprPos:=false;
  525. index:=0;
  526. len:=0;
  527. if regexprengine.Data=nil then
  528. exit;
  529. while p^<>#0 do
  530. begin
  531. if dosearch(regexprengine.Data,p) then
  532. begin
  533. len:=lastpos-p;
  534. RegExprPos:=true;
  535. exit;
  536. end
  537. else
  538. begin
  539. inc(p);
  540. inc(index);
  541. end;
  542. end;
  543. index:=-1;
  544. end;
  545. begin
  546. cs_nonwordchars:=cs_allchars-cs_wordchars;
  547. cs_nondigits:=cs_allchars-cs_digits;
  548. cs_nonwhitespace:=cs_allchars-cs_whitespace;
  549. end.