regexpr.pp 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469
  1. {
  2. $Id$
  3. This file is part of the Free Pascal run time library.
  4. Copyright (c) 1999-2000 by the Free Pascal development team
  5. This unit implements basic regular expression support
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
  12. { $define DEBUG}
  13. unit regexpr;
  14. interface
  15. { the following declarions are only in the interface because }
  16. { some procedures return pregexprentry but programs which }
  17. { use this unit shouldn't access this data structures }
  18. type
  19. tcharset = set of char;
  20. tregexprentrytype = (ret_charset,ret_or,ret_startpattern,
  21. ret_illegalend,ret_backtrace);
  22. pregexprentry = ^tregexprentry;
  23. tregexprentry = record
  24. next,nextdestroy : pregexprentry;
  25. case typ : tregexprentrytype of
  26. ret_charset : (chars : tcharset;
  27. elsepath : pregexprentry);
  28. ret_or : (alternative : pregexprentry);
  29. end;
  30. TRegExprEngine = record
  31. Data : pregexprentry;
  32. DestroyList : pregexprentry;
  33. end;
  34. const
  35. cs_allchars : tcharset = [#0..#255];
  36. cs_wordchars : tcharset = ['A'..'Z','a'..'z','_','0'..'9'];
  37. cs_newline : tcharset = [#10];
  38. cs_digits : tcharset = ['0'..'9'];
  39. cs_whitespace : tcharset = [' ',#9];
  40. var
  41. { these are initilized in the init section of the unit }
  42. cs_nonwordchars : tcharset;
  43. cs_nondigits : tcharset;
  44. cs_nonwhitespace : tcharset;
  45. { the following procedures can be used by units basing }
  46. { on the regexpr unit }
  47. function GenerateRegExprEngine(regexpr : pchar) : TRegExprEngine;
  48. procedure DestroyRegExprEngine(var regexpr : TRegExprEngine);
  49. function RegExprPos(regexprengine : TRegExprEngine;p : pchar;var index,len : longint) : boolean;
  50. implementation
  51. uses
  52. strings;
  53. {$ifdef DEBUG}
  54. procedure writecharset(c : tcharset);
  55. var
  56. b : byte;
  57. begin
  58. for b:=0 to 255 do
  59. if chr(b) in c then
  60. write(chr(b));
  61. writeln;
  62. end;
  63. {$endif DEBUG}
  64. function GenerateRegExprEngine(regexpr : pchar) : TRegExprEngine;
  65. var
  66. first : pregexprentry;
  67. procedure doregister(p : pregexprentry);
  68. begin
  69. p^.nextdestroy:=first;
  70. if not(assigned(first)) then
  71. first:=p;
  72. end;
  73. var
  74. currentpos : pchar;
  75. error : boolean;
  76. function readchars : tcharset;
  77. var
  78. c1 : char;
  79. begin
  80. readchars:=[];
  81. case currentpos^ of
  82. #0:
  83. exit;
  84. '.':
  85. begin
  86. inc(currentpos);
  87. readchars:=cs_allchars-cs_newline;
  88. end;
  89. '\':
  90. begin
  91. inc(currentpos);
  92. case currentpos^ of
  93. #0:
  94. begin
  95. error:=true;
  96. exit;
  97. end;
  98. 't':
  99. begin
  100. inc(currentpos);
  101. readchars:=[#9];
  102. end;
  103. 'n':
  104. begin
  105. inc(currentpos);
  106. readchars:=[#10];
  107. end;
  108. 'r':
  109. begin
  110. inc(currentpos);
  111. readchars:=[#13];
  112. end;
  113. 'd':
  114. begin
  115. inc(currentpos);
  116. readchars:=cs_digits;
  117. end;
  118. 'D':
  119. begin
  120. inc(currentpos);
  121. readchars:=cs_nondigits;
  122. end;
  123. 's':
  124. begin
  125. inc(currentpos);
  126. readchars:=cs_whitespace;
  127. end;
  128. 'S':
  129. begin
  130. inc(currentpos);
  131. readchars:=cs_nonwhitespace;
  132. end;
  133. 'w':
  134. begin
  135. inc(currentpos);
  136. readchars:=cs_wordchars;
  137. end;
  138. 'W':
  139. begin
  140. inc(currentpos);
  141. readchars:=cs_nonwordchars;
  142. end;
  143. else
  144. begin
  145. error:=true;
  146. exit;
  147. end;
  148. end;
  149. end;
  150. else
  151. begin
  152. c1:=currentpos^;
  153. inc(currentpos);
  154. if currentpos^='-' then
  155. begin
  156. inc(currentpos);
  157. if currentpos^=#0 then
  158. begin
  159. error:=true;
  160. exit;
  161. end;
  162. readchars:=[c1..currentpos^];
  163. inc(currentpos);
  164. end
  165. else
  166. readchars:=[c1];
  167. end;
  168. end;
  169. end;
  170. function readcharset : tcharset;
  171. var
  172. c1,c2 : char;
  173. begin
  174. readcharset:=[];
  175. case currentpos^ of
  176. #0:
  177. exit;
  178. '[':
  179. begin
  180. inc(currentpos);
  181. while currentpos^<>']' do
  182. begin
  183. if currentpos^='^' then
  184. begin
  185. inc(currentpos);
  186. readcharset:=readcharset+(cs_allchars-readchars);
  187. end
  188. else
  189. readcharset:=readcharset+readchars;
  190. if error or (currentpos^=#0) then
  191. begin
  192. error:=true;
  193. exit;
  194. end;
  195. end;
  196. inc(currentpos);
  197. end;
  198. '^':
  199. begin
  200. inc(currentpos);
  201. readcharset:=cs_allchars-readchars;
  202. end;
  203. else
  204. readcharset:=readchars;
  205. end;
  206. end;
  207. function parseregexpr(elsepath : pregexprentry) : pregexprentry;
  208. var
  209. hp,hp2,ep : pregexprentry;
  210. cs : tcharset;
  211. begin
  212. parseregexpr:=nil;
  213. if error then
  214. exit;
  215. { this dummy allows us to redirect the elsepath later }
  216. new(ep);
  217. doregister(ep);
  218. ep^.typ:=ret_charset;
  219. ep^.chars:=[];
  220. ep^.elsepath:=elsepath;
  221. while true do
  222. begin
  223. if error then
  224. exit;
  225. case currentpos^ of
  226. '(':
  227. begin
  228. inc(currentpos);
  229. parseregexpr:=parseregexpr(ep);
  230. if currentpos^<>')' then
  231. begin
  232. error:=true;
  233. exit;
  234. end;
  235. inc(currentpos);
  236. end;
  237. '|':
  238. begin
  239. inc(currentpos);
  240. if currentpos^=#0 then
  241. begin
  242. error:=true;
  243. exit;
  244. end;
  245. ep^.typ:=ret_backtrace;
  246. ep^.elsepath:=parseregexpr(elsepath);
  247. ep^.next:=parseregexpr;
  248. parseregexpr:=ep;
  249. end;
  250. ')':
  251. exit;
  252. #0:
  253. exit;
  254. else
  255. begin
  256. cs:=readcharset;
  257. if error then
  258. exit;
  259. case currentpos^ of
  260. '*':
  261. begin
  262. inc(currentpos);
  263. new(hp);
  264. doregister(hp);
  265. hp^.typ:=ret_charset;
  266. hp^.chars:=cs;
  267. hp^.elsepath:=parseregexpr(ep);
  268. hp^.next:=hp;
  269. end;
  270. '+':
  271. begin
  272. inc(currentpos);
  273. new(hp);
  274. new(hp2);
  275. doregister(hp);
  276. doregister(hp2);
  277. hp^.typ:=ret_charset;
  278. hp2^.typ:=ret_charset;
  279. hp^.chars:=cs;
  280. hp^.elsepath:=ep;
  281. hp^.next:=hp2;
  282. hp2^.chars:=cs;
  283. hp2^.elsepath:=parseregexpr(ep);
  284. hp2^.next:=hp2;
  285. end;
  286. '?':
  287. begin
  288. inc(currentpos);
  289. new(hp);
  290. doregister(hp);
  291. hp^.typ:=ret_charset;
  292. hp^.chars:=cs;
  293. hp^.elsepath:=parseregexpr(ep);
  294. hp^.next:=hp^.elsepath;
  295. end;
  296. else
  297. begin
  298. new(hp);
  299. doregister(hp);
  300. hp^.typ:=ret_charset;
  301. hp^.chars:=cs;
  302. hp^.elsepath:=ep;
  303. hp^.next:=parseregexpr(ep);
  304. end;
  305. end;
  306. parseregexpr:=hp;
  307. end;
  308. end;
  309. end;
  310. end;
  311. var
  312. endp : pregexprentry;
  313. begin
  314. GenerateRegExprEngine.Data:=nil;
  315. GenerateRegExprEngine.DestroyList:=nil;
  316. if regexpr=nil then
  317. exit;
  318. first:=nil;
  319. currentpos:=regexpr;
  320. new(endp);
  321. doregister(endp);
  322. endp^.typ:=ret_illegalend;
  323. GenerateRegExprEngine.Data:=parseregexpr(endp);
  324. GenerateRegExprEngine.DestroyList:=first;
  325. if error then
  326. DestroyRegExprEngine(GenerateRegExprEngine);
  327. end;
  328. procedure DestroyRegExprEngine(var regexpr : TRegExprEngine);
  329. var
  330. hp : pregexprentry;
  331. begin
  332. hp:=regexpr.DestroyList;
  333. while assigned(hp) do
  334. begin
  335. regexpr.DestroyList:=hp^.nextdestroy;
  336. dispose(hp);
  337. hp:=regexpr.DestroyList;
  338. end;
  339. regexpr.Data:=nil;
  340. regexpr.DestroyList:=nil;
  341. end;
  342. function RegExprPos(regexprengine : TRegExprEngine;p : pchar;var index,len : longint) : boolean;
  343. var
  344. lastpos,startpos : pchar;
  345. function dosearch(regexpr : pregexprentry;pos : pchar) : boolean;
  346. begin
  347. dosearch:=false;
  348. while true do
  349. begin
  350. if regexpr^.typ=ret_backtrace then
  351. begin
  352. if dosearch(regexpr^.next,pos) then
  353. begin
  354. dosearch:=true;
  355. exit;
  356. end
  357. else if dosearch(regexpr^.elsepath,pos) then
  358. begin
  359. dosearch:=true;
  360. exit;
  361. end
  362. else
  363. exit;
  364. end;
  365. if pos^ in regexpr^.chars then
  366. begin
  367. {$ifdef DEBUG}
  368. writeln('Found matching: ',pos^);
  369. {$endif DEBUG}
  370. regexpr:=regexpr^.next;
  371. inc(pos);
  372. end
  373. else
  374. begin
  375. {$ifdef DEBUG}
  376. writeln('Found unmatching: ',pos^);
  377. {$endif DEBUG}
  378. regexpr:=regexpr^.elsepath;
  379. end;
  380. lastpos:=pos;
  381. if regexpr=nil then
  382. begin
  383. dosearch:=true;
  384. exit;
  385. end;
  386. if regexpr^.typ=ret_illegalend then
  387. exit;
  388. if pos^=#0 then
  389. exit;
  390. end;
  391. end;
  392. begin
  393. RegExprPos:=false;
  394. index:=0;
  395. len:=0;
  396. if regexprengine.Data=nil then
  397. exit;
  398. while p^<>#0 do
  399. begin
  400. if dosearch(regexprengine.Data,p) then
  401. begin
  402. len:=lastpos-p;
  403. RegExprPos:=true;
  404. exit;
  405. end
  406. else
  407. begin
  408. inc(p);
  409. inc(index);
  410. end;
  411. end;
  412. index:=-1;
  413. end;
  414. begin
  415. cs_nonwordchars:=cs_allchars-cs_wordchars;
  416. cs_nondigits:=cs_allchars-cs_digits;
  417. cs_nonwhitespace:=cs_allchars-cs_whitespace;
  418. end.
  419. {
  420. $Log$
  421. Revision 1.1 2000-03-14 22:09:03 florian
  422. * Initial revision
  423. }