regexpr.pp 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486
  1. {
  2. $Id$
  3. This file is part of the Free Pascal run time library.
  4. Copyright (c) 1999-2000 by the Free Pascal development team
  5. This unit implements basic regular expression support
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
  12. { $define DEBUG}
  13. unit regexpr;
  14. interface
  15. { the following declarions are only in the interface because }
  16. { some procedures return pregexprentry but programs which }
  17. { use this unit shouldn't access this data structures }
  18. type
  19. tcharset = set of char;
  20. tregexprentrytype = (ret_charset,ret_or,ret_startpattern,
  21. ret_illegalend,ret_backtrace);
  22. pregexprentry = ^tregexprentry;
  23. tregexprentry = record
  24. next,nextdestroy : pregexprentry;
  25. case typ : tregexprentrytype of
  26. ret_charset : (chars : tcharset;
  27. elsepath : pregexprentry);
  28. ret_or : (alternative : pregexprentry);
  29. end;
  30. tregexprflag = (ref_singleline,ref_multiline,ref_caseinsensitive);
  31. tregexprflags = set of tregexprflag;
  32. TRegExprEngine = record
  33. Data : pregexprentry;
  34. DestroyList : pregexprentry;
  35. Flags : TRegExprFlags;
  36. end;
  37. const
  38. cs_allchars : tcharset = [#0..#255];
  39. cs_wordchars : tcharset = ['A'..'Z','a'..'z','_','0'..'9'];
  40. cs_newline : tcharset = [#10];
  41. cs_digits : tcharset = ['0'..'9'];
  42. cs_whitespace : tcharset = [' ',#9];
  43. var
  44. { these are initilized in the init section of the unit }
  45. cs_nonwordchars : tcharset;
  46. cs_nondigits : tcharset;
  47. cs_nonwhitespace : tcharset;
  48. { the following procedures can be used by units basing }
  49. { on the regexpr unit }
  50. function GenerateRegExprEngine(regexpr : pchar;flags : tregexprflags) : TRegExprEngine;
  51. procedure DestroyRegExprEngine(var regexpr : TRegExprEngine);
  52. function RegExprPos(regexprengine : TRegExprEngine;p : pchar;var index,len : longint) : boolean;
  53. implementation
  54. uses
  55. strings;
  56. {$ifdef DEBUG}
  57. procedure writecharset(c : tcharset);
  58. var
  59. b : byte;
  60. begin
  61. for b:=0 to 255 do
  62. if chr(b) in c then
  63. write(chr(b));
  64. writeln;
  65. end;
  66. {$endif DEBUG}
  67. function GenerateRegExprEngine(regexpr : pchar;flags : tregexprflags) : TRegExprEngine;
  68. var
  69. first : pregexprentry;
  70. procedure doregister(p : pregexprentry);
  71. begin
  72. p^.nextdestroy:=first;
  73. if not(assigned(first)) then
  74. first:=p;
  75. end;
  76. var
  77. currentpos : pchar;
  78. error : boolean;
  79. function readchars : tcharset;
  80. var
  81. c1 : char;
  82. begin
  83. readchars:=[];
  84. case currentpos^ of
  85. #0:
  86. exit;
  87. '.':
  88. begin
  89. inc(currentpos);
  90. readchars:=cs_allchars-cs_newline;
  91. end;
  92. '\':
  93. begin
  94. inc(currentpos);
  95. case currentpos^ of
  96. #0:
  97. begin
  98. error:=true;
  99. exit;
  100. end;
  101. 't':
  102. begin
  103. inc(currentpos);
  104. readchars:=[#9];
  105. end;
  106. 'n':
  107. begin
  108. inc(currentpos);
  109. readchars:=[#10];
  110. end;
  111. 'r':
  112. begin
  113. inc(currentpos);
  114. readchars:=[#13];
  115. end;
  116. 'd':
  117. begin
  118. inc(currentpos);
  119. readchars:=cs_digits;
  120. end;
  121. 'D':
  122. begin
  123. inc(currentpos);
  124. readchars:=cs_nondigits;
  125. end;
  126. 's':
  127. begin
  128. inc(currentpos);
  129. readchars:=cs_whitespace;
  130. end;
  131. 'S':
  132. begin
  133. inc(currentpos);
  134. readchars:=cs_nonwhitespace;
  135. end;
  136. 'w':
  137. begin
  138. inc(currentpos);
  139. readchars:=cs_wordchars;
  140. end;
  141. 'W':
  142. begin
  143. inc(currentpos);
  144. readchars:=cs_nonwordchars;
  145. end;
  146. else
  147. begin
  148. error:=true;
  149. exit;
  150. end;
  151. end;
  152. end;
  153. else
  154. begin
  155. if ref_caseinsensitive in flags then
  156. c1:=upcase(currentpos^)
  157. else
  158. c1:=currentpos^;
  159. inc(currentpos);
  160. if currentpos^='-' then
  161. begin
  162. inc(currentpos);
  163. if currentpos^=#0 then
  164. begin
  165. error:=true;
  166. exit;
  167. end;
  168. if ref_caseinsensitive in flags then
  169. readchars:=[c1..upcase(currentpos^)]
  170. else
  171. readchars:=[c1..currentpos^];
  172. inc(currentpos);
  173. end
  174. else
  175. readchars:=[c1];
  176. end;
  177. end;
  178. end;
  179. function readcharset : tcharset;
  180. var
  181. c1,c2 : char;
  182. begin
  183. readcharset:=[];
  184. case currentpos^ of
  185. #0:
  186. exit;
  187. '[':
  188. begin
  189. inc(currentpos);
  190. while currentpos^<>']' do
  191. begin
  192. if currentpos^='^' then
  193. begin
  194. inc(currentpos);
  195. readcharset:=readcharset+(cs_allchars-readchars);
  196. end
  197. else
  198. readcharset:=readcharset+readchars;
  199. if error or (currentpos^=#0) then
  200. begin
  201. error:=true;
  202. exit;
  203. end;
  204. end;
  205. inc(currentpos);
  206. end;
  207. '^':
  208. begin
  209. inc(currentpos);
  210. readcharset:=cs_allchars-readchars;
  211. end;
  212. else
  213. readcharset:=readchars;
  214. end;
  215. end;
  216. function parseregexpr(elsepath : pregexprentry) : pregexprentry;
  217. var
  218. hp,hp2,ep : pregexprentry;
  219. cs : tcharset;
  220. begin
  221. parseregexpr:=nil;
  222. if error then
  223. exit;
  224. { this dummy allows us to redirect the elsepath later }
  225. new(ep);
  226. doregister(ep);
  227. ep^.typ:=ret_charset;
  228. ep^.chars:=[];
  229. ep^.elsepath:=elsepath;
  230. while true do
  231. begin
  232. if error then
  233. exit;
  234. case currentpos^ of
  235. '(':
  236. begin
  237. inc(currentpos);
  238. parseregexpr:=parseregexpr(ep);
  239. if currentpos^<>')' then
  240. begin
  241. error:=true;
  242. exit;
  243. end;
  244. inc(currentpos);
  245. end;
  246. '|':
  247. begin
  248. inc(currentpos);
  249. if currentpos^=#0 then
  250. begin
  251. error:=true;
  252. exit;
  253. end;
  254. ep^.typ:=ret_backtrace;
  255. ep^.elsepath:=parseregexpr(elsepath);
  256. ep^.next:=parseregexpr;
  257. parseregexpr:=ep;
  258. end;
  259. ')':
  260. exit;
  261. #0:
  262. exit;
  263. else
  264. begin
  265. cs:=readcharset;
  266. if error then
  267. exit;
  268. case currentpos^ of
  269. '*':
  270. begin
  271. inc(currentpos);
  272. new(hp);
  273. doregister(hp);
  274. hp^.typ:=ret_charset;
  275. hp^.chars:=cs;
  276. hp^.elsepath:=parseregexpr(ep);
  277. hp^.next:=hp;
  278. end;
  279. '+':
  280. begin
  281. inc(currentpos);
  282. new(hp);
  283. new(hp2);
  284. doregister(hp);
  285. doregister(hp2);
  286. hp^.typ:=ret_charset;
  287. hp2^.typ:=ret_charset;
  288. hp^.chars:=cs;
  289. hp^.elsepath:=ep;
  290. hp^.next:=hp2;
  291. hp2^.chars:=cs;
  292. hp2^.elsepath:=parseregexpr(ep);
  293. hp2^.next:=hp2;
  294. end;
  295. '?':
  296. begin
  297. inc(currentpos);
  298. new(hp);
  299. doregister(hp);
  300. hp^.typ:=ret_charset;
  301. hp^.chars:=cs;
  302. hp^.elsepath:=parseregexpr(ep);
  303. hp^.next:=hp^.elsepath;
  304. end;
  305. else
  306. begin
  307. new(hp);
  308. doregister(hp);
  309. hp^.typ:=ret_charset;
  310. hp^.chars:=cs;
  311. hp^.elsepath:=ep;
  312. hp^.next:=parseregexpr(ep);
  313. end;
  314. end;
  315. parseregexpr:=hp;
  316. end;
  317. end;
  318. end;
  319. end;
  320. var
  321. endp : pregexprentry;
  322. begin
  323. GenerateRegExprEngine.Data:=nil;
  324. GenerateRegExprEngine.DestroyList:=nil;
  325. if regexpr=nil then
  326. exit;
  327. first:=nil;
  328. currentpos:=regexpr;
  329. new(endp);
  330. doregister(endp);
  331. endp^.typ:=ret_illegalend;
  332. GenerateRegExprEngine.flags:=flags;
  333. GenerateRegExprEngine.Data:=parseregexpr(endp);
  334. GenerateRegExprEngine.DestroyList:=first;
  335. if error then
  336. DestroyRegExprEngine(GenerateRegExprEngine);
  337. end;
  338. procedure DestroyRegExprEngine(var regexpr : TRegExprEngine);
  339. var
  340. hp : pregexprentry;
  341. begin
  342. hp:=regexpr.DestroyList;
  343. while assigned(hp) do
  344. begin
  345. regexpr.DestroyList:=hp^.nextdestroy;
  346. dispose(hp);
  347. hp:=regexpr.DestroyList;
  348. end;
  349. regexpr.Data:=nil;
  350. regexpr.DestroyList:=nil;
  351. end;
  352. function RegExprPos(regexprengine : TRegExprEngine;p : pchar;var index,len : longint) : boolean;
  353. var
  354. lastpos,startpos : pchar;
  355. function dosearch(regexpr : pregexprentry;pos : pchar) : boolean;
  356. begin
  357. dosearch:=false;
  358. while true do
  359. begin
  360. if regexpr^.typ=ret_backtrace then
  361. begin
  362. if dosearch(regexpr^.next,pos) then
  363. begin
  364. dosearch:=true;
  365. exit;
  366. end
  367. else if dosearch(regexpr^.elsepath,pos) then
  368. begin
  369. dosearch:=true;
  370. exit;
  371. end
  372. else
  373. exit;
  374. end;
  375. if (pos^ in regexpr^.chars) or
  376. ((ref_caseinsensitive in regexprengine.flags) and
  377. (upcase(pos^) in regexpr^.chars)) then
  378. begin
  379. {$ifdef DEBUG}
  380. writeln('Found matching: ',pos^);
  381. {$endif DEBUG}
  382. regexpr:=regexpr^.next;
  383. inc(pos);
  384. end
  385. else
  386. begin
  387. {$ifdef DEBUG}
  388. writeln('Found unmatching: ',pos^);
  389. {$endif DEBUG}
  390. regexpr:=regexpr^.elsepath;
  391. end;
  392. lastpos:=pos;
  393. if regexpr=nil then
  394. begin
  395. dosearch:=true;
  396. exit;
  397. end;
  398. if regexpr^.typ=ret_illegalend then
  399. exit;
  400. if pos^=#0 then
  401. exit;
  402. end;
  403. end;
  404. begin
  405. RegExprPos:=false;
  406. index:=0;
  407. len:=0;
  408. if regexprengine.Data=nil then
  409. exit;
  410. while p^<>#0 do
  411. begin
  412. if dosearch(regexprengine.Data,p) then
  413. begin
  414. len:=lastpos-p;
  415. RegExprPos:=true;
  416. exit;
  417. end
  418. else
  419. begin
  420. inc(p);
  421. inc(index);
  422. end;
  423. end;
  424. index:=-1;
  425. end;
  426. begin
  427. cs_nonwordchars:=cs_allchars-cs_wordchars;
  428. cs_nondigits:=cs_allchars-cs_digits;
  429. cs_nonwhitespace:=cs_allchars-cs_whitespace;
  430. end.
  431. {
  432. $Log$
  433. Revision 1.2 2000-03-14 22:57:51 florian
  434. + added flags
  435. + support of case insensitive search
  436. Revision 1.1 2000/03/14 22:09:03 florian
  437. * Initial revision
  438. }