2
0

pcre.ml 37 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034
  1. (*
  2. PCRE-OCAML - Perl Compatibility Regular Expressions for OCaml
  3. Copyright (C) 1999- Markus Mottl
  4. email: [email protected]
  5. WWW: http://www.ocaml.info
  6. This library is free software; you can redistribute it and/or
  7. modify it under the terms of the GNU Lesser General Public
  8. License as published by the Free Software Foundation; either
  9. version 2.1 of the License, or (at your option) any later version.
  10. This library is distributed in the hope that it will be useful,
  11. but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  13. Lesser General Public License for more details.
  14. You should have received a copy of the GNU Lesser General Public
  15. License along with this library; if not, write to the Free Software
  16. Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  17. *)
  18. (* Public exceptions and their registration with the C runtime *)
  19. let string_copy str = str
  20. let buffer_add_subbytes = Buffer.add_subbytes
  21. type error =
  22. | Partial
  23. | BadPartial
  24. | BadPattern of string * int
  25. | BadUTF8
  26. | BadUTF8Offset
  27. | MatchLimit
  28. | RecursionLimit
  29. | InternalError of string
  30. exception Error of error
  31. exception Backtrack
  32. exception Regexp_or of string * error
  33. (* Puts exceptions into global C-variables for fast retrieval *)
  34. external pcre_ocaml_init : unit -> unit = "pcre_ocaml_init"
  35. (* Registers exceptions with the C runtime and caches polymorphic variants *)
  36. let () =
  37. Callback.register_exception "Pcre.Error" (Error (InternalError ""));
  38. Callback.register_exception "Pcre.Backtrack" Backtrack;
  39. pcre_ocaml_init ()
  40. (* Compilation and runtime flags and their conversion functions *)
  41. type icflag = int
  42. type irflag = int
  43. (* Compilation flags *)
  44. type cflag =
  45. [
  46. | `CASELESS
  47. | `MULTILINE
  48. | `DOTALL
  49. | `EXTENDED
  50. | `ANCHORED
  51. | `DOLLAR_ENDONLY
  52. | `EXTRA
  53. | `UNGREEDY
  54. | `UTF8
  55. | `NO_UTF8_CHECK
  56. | `NO_AUTO_CAPTURE
  57. | `AUTO_CALLOUT
  58. | `FIRSTLINE
  59. | `UCP
  60. ]
  61. let int_of_cflag = function
  62. | `CASELESS -> 0x0001
  63. | `MULTILINE -> 0x0002
  64. | `DOTALL -> 0x0004
  65. | `EXTENDED -> 0x0008
  66. | `ANCHORED -> 0x0010
  67. | `DOLLAR_ENDONLY -> 0x0020
  68. | `EXTRA -> 0x0040
  69. | `UNGREEDY -> 0x0200
  70. | `UTF8 -> 0x0800
  71. | `NO_AUTO_CAPTURE -> 0x1000
  72. | `NO_UTF8_CHECK -> 0x2000
  73. | `AUTO_CALLOUT -> 0x4000
  74. | `FIRSTLINE -> 0x40000
  75. | `UCP -> 0x20000000
  76. let coll_icflag icflag flag = int_of_cflag flag lor icflag
  77. let cflags flags = List.fold_left coll_icflag 0 flags
  78. let cflag_of_int = function
  79. | 0x0001 -> `CASELESS
  80. | 0x0002 -> `MULTILINE
  81. | 0x0004 -> `DOTALL
  82. | 0x0008 -> `EXTENDED
  83. | 0x0010 -> `ANCHORED
  84. | 0x0020 -> `DOLLAR_ENDONLY
  85. | 0x0040 -> `EXTRA
  86. | 0x0200 -> `UNGREEDY
  87. | 0x0800 -> `UTF8
  88. | 0x1000 -> `NO_AUTO_CAPTURE
  89. | 0x2000 -> `NO_UTF8_CHECK
  90. | 0x4000 -> `AUTO_CALLOUT
  91. | 0x40000 -> `FIRSTLINE
  92. | 0x20000000 -> `UCP
  93. | _ -> failwith "Pcre.cflag_list: unknown compilation flag"
  94. let all_cflags =
  95. [
  96. 0x0001; 0x0002; 0x0004; 0x0008; 0x0010; 0x0020;
  97. 0x0040; 0x0200; 0x0800; 0x1000; 0x2000; 0x4000; 0x40000;
  98. 0x20000000
  99. ]
  100. let cflag_list icflags =
  101. let coll flag_list flag =
  102. if icflags land flag <> 0 then cflag_of_int flag :: flag_list
  103. else flag_list in
  104. List.fold_left coll [] all_cflags
  105. (* Runtime flags *)
  106. type rflag =
  107. [
  108. | `ANCHORED
  109. | `NOTBOL
  110. | `NOTEOL
  111. | `NOTEMPTY
  112. | `PARTIAL
  113. ]
  114. let int_of_rflag = function
  115. | `ANCHORED -> 0x0010
  116. | `NOTBOL -> 0x0080
  117. | `NOTEOL -> 0x0100
  118. | `NOTEMPTY -> 0x0400
  119. | `PARTIAL -> 0x8000
  120. let coll_irflag irflag flag = int_of_rflag flag lor irflag
  121. let rflags flags = List.fold_left coll_irflag 0 flags
  122. let rflag_of_int = function
  123. | 0x0010 -> `ANCHORED
  124. | 0x0080 -> `NOTBOL
  125. | 0x0100 -> `NOTEOL
  126. | 0x0400 -> `NOTEMPTY
  127. | 0x8000 -> `PARTIAL
  128. | _ -> failwith "Pcre.rflag_list: unknown runtime flag"
  129. let all_rflags = [0x0010; 0x0080; 0x0100; 0x0400; 0x8000]
  130. let rflag_list irflags =
  131. let coll flag_list flag =
  132. if irflags land flag <> 0 then rflag_of_int flag :: flag_list
  133. else flag_list in
  134. List.fold_left coll [] all_rflags
  135. (* Information on the PCRE-configuration (build-time options) *)
  136. external pcre_version : unit -> string = "pcre_version_stub"
  137. external pcre_config_utf8 : unit -> bool = "pcre_config_utf8_stub" [@@noalloc]
  138. external pcre_config_newline :
  139. unit -> char = "pcre_config_newline_stub" [@@noalloc]
  140. external pcre_config_link_size :
  141. unit -> int = "pcre_config_link_size_stub" [@@noalloc]
  142. external pcre_config_match_limit :
  143. unit -> int = "pcre_config_match_limit_stub" [@@noalloc]
  144. external pcre_config_match_limit_recursion :
  145. unit -> int = "pcre_config_match_limit_recursion_stub" [@@noalloc]
  146. external pcre_config_stackrecurse :
  147. unit -> bool = "pcre_config_stackrecurse_stub" [@@noalloc]
  148. let version = pcre_version ()
  149. let config_utf8 = pcre_config_utf8 ()
  150. let config_newline = pcre_config_newline ()
  151. let config_link_size = pcre_config_link_size ()
  152. let config_match_limit = pcre_config_match_limit ()
  153. let config_match_limit_recursion = pcre_config_match_limit_recursion ()
  154. let config_stackrecurse = pcre_config_stackrecurse ()
  155. (* Information on patterns *)
  156. type firstbyte_info =
  157. [ `Char of char
  158. | `Start_only
  159. | `ANCHORED ]
  160. type study_stat =
  161. [ `Not_studied
  162. | `Studied
  163. | `Optimal ]
  164. type regexp
  165. external options : regexp -> icflag = "pcre_options_stub"
  166. external size : regexp -> int = "pcre_size_stub"
  167. external studysize : regexp -> int = "pcre_studysize_stub"
  168. external capturecount : regexp -> int = "pcre_capturecount_stub"
  169. external backrefmax : regexp -> int = "pcre_backrefmax_stub"
  170. external namecount : regexp -> int = "pcre_namecount_stub"
  171. external names : regexp -> string array = "pcre_names_stub"
  172. external nameentrysize : regexp -> int = "pcre_nameentrysize_stub"
  173. external firstbyte : regexp -> firstbyte_info = "pcre_firstbyte_stub"
  174. external firsttable : regexp -> string option = "pcre_firsttable_stub"
  175. external lastliteral : regexp -> char option = "pcre_lastliteral_stub"
  176. external study_stat : regexp -> study_stat = "pcre_study_stat_stub" [@@noalloc]
  177. (* Compilation of patterns *)
  178. type chtables
  179. external maketables : unit -> chtables = "pcre_maketables_stub"
  180. (* Internal use only! *)
  181. external pcre_study : regexp -> unit = "pcre_study_stub"
  182. external compile :
  183. icflag -> chtables option -> string -> regexp = "pcre_compile_stub"
  184. external get_match_limit : regexp -> int option = "pcre_get_match_limit_stub"
  185. (* Internal use only! *)
  186. external set_imp_match_limit :
  187. regexp -> int -> regexp = "pcre_set_imp_match_limit_stub" [@@noalloc]
  188. external get_match_limit_recursion :
  189. regexp -> int option = "pcre_get_match_limit_recursion_stub"
  190. (* Internal use only! *)
  191. external set_imp_match_limit_recursion :
  192. regexp -> int -> regexp = "pcre_set_imp_match_limit_recursion_stub" [@@noalloc]
  193. let regexp
  194. ?(study = true) ?limit ?limit_recursion
  195. ?(iflags = 0) ?flags ?chtables pat =
  196. let rex =
  197. match flags with
  198. | Some flag_list -> compile (cflags flag_list) chtables pat
  199. | _ -> compile iflags chtables pat
  200. in
  201. if study then pcre_study rex;
  202. let rex =
  203. match limit with
  204. | None -> rex
  205. | Some lim -> set_imp_match_limit rex lim
  206. in
  207. match limit_recursion with
  208. | None -> rex
  209. | Some lim -> set_imp_match_limit_recursion rex lim
  210. let regexp_or
  211. ?study ?limit ?limit_recursion ?(iflags = 0) ?flags ?chtables pats =
  212. let check pat =
  213. try ignore (regexp ~study:false ~iflags ?flags ?chtables pat)
  214. with Error error -> raise (Regexp_or (pat, error))
  215. in
  216. List.iter check pats;
  217. let big_pat =
  218. let cnv pat = "(?:" ^ pat ^ ")" in
  219. String.concat "|" (List.rev (List.rev_map cnv pats))
  220. in
  221. regexp ?study ?limit ?limit_recursion ~iflags ?flags ?chtables big_pat
  222. let bytes_unsafe_blit_string str str_ofs bts bts_ofs len =
  223. let str_bts = Bytes.unsafe_of_string str in
  224. Bytes.unsafe_blit str_bts str_ofs bts bts_ofs len
  225. let string_unsafe_sub str ofs len =
  226. let res = Bytes.create len in
  227. bytes_unsafe_blit_string str ofs res 0 len;
  228. Bytes.unsafe_to_string res
  229. let quote s =
  230. let len = String.length s in
  231. let buf = Bytes.create (len lsl 1) in
  232. let pos = ref 0 in
  233. for i = 0 to len - 1 do
  234. match String.unsafe_get s i with
  235. | '\\' | '^' | '$' | '.' | '[' | '|'
  236. | '(' | ')' | '?' | '*' | '+' | '{' as c ->
  237. Bytes.unsafe_set buf !pos '\\';
  238. incr pos;
  239. Bytes.unsafe_set buf !pos c;
  240. incr pos
  241. | c -> Bytes.unsafe_set buf !pos c; incr pos
  242. done;
  243. string_unsafe_sub (Bytes.unsafe_to_string buf) 0 !pos
  244. (* Matching of patterns and subpattern extraction *)
  245. (* Default regular expression when none is provided by the user *)
  246. let def_rex = regexp "\\s+"
  247. type substrings = string * int array
  248. type callout_data =
  249. {
  250. callout_number : int;
  251. substrings : substrings;
  252. start_match : int;
  253. current_position : int;
  254. capture_top : int;
  255. capture_last : int;
  256. pattern_position : int;
  257. next_item_length : int;
  258. }
  259. type callout = callout_data -> unit
  260. let get_subject (subj, _) = subj
  261. let num_of_subs (_, ovector) = Array.length ovector / 3
  262. let get_offset_start ovector str_num =
  263. if str_num < 0 || str_num >= Array.length ovector / 3 then
  264. invalid_arg "Pcre.get_offset_start: illegal offset";
  265. let offset = str_num lsl 1 in
  266. offset, Array.unsafe_get ovector offset
  267. let get_substring_aux (subj, ovector) offset start =
  268. if start < 0 then raise Not_found
  269. else
  270. string_unsafe_sub subj start (Array.unsafe_get ovector (offset + 1) - start)
  271. let get_substring (_, ovector as substrings) str_num =
  272. let offset, start = get_offset_start ovector str_num in
  273. get_substring_aux substrings offset start
  274. let get_substring_ofs (_subj, ovector) str_num =
  275. let offset, start = get_offset_start ovector str_num in
  276. if start < 0 then raise Not_found
  277. else start, Array.unsafe_get ovector (offset + 1)
  278. let unsafe_get_substring (_, ovector as substrings) str_num =
  279. let offset = str_num lsl 1 in
  280. try get_substring_aux substrings offset (Array.unsafe_get ovector offset)
  281. with Not_found -> ""
  282. let get_substrings ?(full_match = true) (_, ovector as substrings) =
  283. if full_match then
  284. Array.init (Array.length ovector / 3) (unsafe_get_substring substrings)
  285. else
  286. let len = (Array.length ovector / 3) - 1 in
  287. Array.init len (fun n -> unsafe_get_substring substrings (n + 1))
  288. let unsafe_get_opt_substring (_, ovector as substrings) str_num =
  289. let offset = str_num lsl 1 in
  290. try
  291. let start = Array.unsafe_get ovector offset in
  292. let str = get_substring_aux substrings offset start in
  293. Some str
  294. with Not_found -> None
  295. let get_opt_substrings ?(full_match = true) (_, ovector as substrings) =
  296. if full_match then
  297. Array.init (Array.length ovector / 3) (unsafe_get_opt_substring substrings)
  298. else
  299. let len = (Array.length ovector / 3) - 1 in
  300. Array.init len (fun n -> unsafe_get_opt_substring substrings (n + 1))
  301. external get_stringnumber :
  302. regexp -> string -> int = "pcre_get_stringnumber_stub"
  303. let get_named_substring rex name substrings =
  304. get_substring substrings (get_stringnumber rex name)
  305. let get_named_substring_ofs rex name substrings =
  306. get_substring_ofs substrings (get_stringnumber rex name)
  307. external unsafe_pcre_exec :
  308. irflag ->
  309. regexp ->
  310. pos : int ->
  311. subj_start : int ->
  312. subj : string ->
  313. int array ->
  314. callout option ->
  315. unit = "pcre_exec_stub_bc" "pcre_exec_stub"
  316. let make_ovector rex =
  317. let subgroups1 = capturecount rex + 1 in
  318. let subgroups2 = subgroups1 lsl 1 in
  319. subgroups2, Array.make (subgroups1 + subgroups2) 0
  320. let pcre_exec ?(iflags = 0) ?flags ?(rex = def_rex) ?pat ?(pos = 0)
  321. ?callout subj =
  322. let rex = match pat with Some str -> regexp str | _ -> rex in
  323. let iflags = match flags with Some flags -> rflags flags | _ -> iflags in
  324. let _, ovector = make_ovector rex in
  325. unsafe_pcre_exec iflags rex ~pos ~subj_start:0 ~subj ovector callout;
  326. ovector
  327. let exec ?iflags ?flags ?rex ?pat ?pos ?callout subj =
  328. subj, pcre_exec ?iflags ?flags ?rex ?pat ?pos ?callout subj
  329. let next_match ?iflags ?flags ?rex ?pat ?(pos = 0) ?callout (subj, ovector) =
  330. let pos = Array.unsafe_get ovector 1 + pos in
  331. let subj_len = String.length subj in
  332. if pos < 0 || pos > subj_len then
  333. invalid_arg "Pcre.next_match: illegal offset";
  334. subj, pcre_exec ?iflags ?flags ?rex ?pat ~pos ?callout subj
  335. let rec copy_lst ar n = function
  336. | [] -> ar
  337. | h :: t -> Array.unsafe_set ar n h; copy_lst ar (n - 1) t
  338. let exec_all ?(iflags = 0) ?flags ?(rex = def_rex) ?pat ?pos ?callout subj =
  339. let rex = match pat with Some str -> regexp str | _ -> rex in
  340. let iflags = match flags with Some flags -> rflags flags | _ -> iflags in
  341. let (_, ovector as sstrs) = exec ~iflags ~rex ?pos ?callout subj in
  342. let null_flags = iflags lor 0x0400 in
  343. let subj_len = String.length subj in
  344. let rec loop pos (subj, ovector as sstrs) n lst =
  345. let maybe_ovector =
  346. try
  347. let first = Array.unsafe_get ovector 0 in
  348. if first = pos && Array.unsafe_get ovector 1 = pos then
  349. if pos = subj_len then None
  350. else Some (pcre_exec ~iflags:null_flags ~rex ~pos ?callout subj)
  351. else Some (pcre_exec ~iflags ~rex ~pos ?callout subj)
  352. with Not_found -> None in
  353. match maybe_ovector with
  354. | Some ovector ->
  355. let new_pos = Array.unsafe_get ovector 1 in
  356. loop new_pos (subj, ovector) (n + 1) (sstrs :: lst)
  357. | None -> copy_lst (Array.make (n + 1) sstrs) (n - 1) lst in
  358. loop (Array.unsafe_get ovector 1) sstrs 0 []
  359. let extract ?iflags ?flags ?rex ?pat ?pos ?full_match ?callout subj =
  360. get_substrings ?full_match (exec ?iflags ?flags ?rex ?pat ?pos ?callout subj)
  361. let extract_opt ?iflags ?flags ?rex ?pat ?pos ?full_match ?callout subj =
  362. get_opt_substrings
  363. ?full_match (exec ?iflags ?flags ?rex ?pat ?pos ?callout subj)
  364. let extract_all ?iflags ?flags ?rex ?pat ?pos ?full_match ?callout subj =
  365. let many_sstrs = exec_all ?iflags ?flags ?rex ?pat ?pos ?callout subj in
  366. Array.map (get_substrings ?full_match) many_sstrs
  367. let extract_all_opt ?iflags ?flags ?rex ?pat ?pos ?full_match ?callout subj =
  368. let many_sstrs = exec_all ?iflags ?flags ?rex ?pat ?pos ?callout subj in
  369. Array.map (get_opt_substrings ?full_match) many_sstrs
  370. let pmatch ?iflags ?flags ?rex ?pat ?pos ?callout subj =
  371. try ignore (pcre_exec ?iflags ?flags ?rex ?pat ?pos ?callout subj); true
  372. with Not_found -> false
  373. (* String substitution *)
  374. (* Elements of a substitution pattern *)
  375. type subst =
  376. | SubstString of int * int (* Denotes a substring in the substitution *)
  377. | Backref of int (* nth backreference ($0 is program name!) *)
  378. | Match (* The whole matched string *)
  379. | PreMatch (* The string before the match *)
  380. | PostMatch (* The string after the match *)
  381. | LastParenMatch (* The last matched group *)
  382. (* Information on substitution patterns *)
  383. type substitution = string (* The substitution string *)
  384. * int (* Highest group number of backreferences *)
  385. * bool (* Makes use of "LastParenMatch" *)
  386. * subst list (* The list of substitution elements *)
  387. (* Only used internally in "subst" *)
  388. exception FoundAt of int
  389. let zero = Char.code '0'
  390. let subst str =
  391. let max_br = ref 0 in
  392. let with_lp = ref false in
  393. let lix = String.length str - 1 in
  394. let rec loop acc n =
  395. if lix < n then acc
  396. else
  397. try
  398. for i = n to lix do
  399. if String.unsafe_get str i = '$' then raise (FoundAt i)
  400. done;
  401. SubstString (n, lix - n + 1) :: acc
  402. with FoundAt i ->
  403. if i = lix then SubstString (n, lix - n + 1) :: acc
  404. else
  405. let i1 = i + 1 in
  406. let acc = if n = i then acc else SubstString (n, i - n) :: acc in
  407. match String.unsafe_get str i1 with
  408. | '0'..'9' as c ->
  409. let subpat_nr = ref (Char.code c - zero) in
  410. (try
  411. for j = i1 + 1 to lix do
  412. let c = String.unsafe_get str j in
  413. if c >= '0' && c <= '9' then
  414. subpat_nr := 10 * !subpat_nr + Char.code c - zero
  415. else raise (FoundAt j)
  416. done;
  417. max_br := max !subpat_nr !max_br;
  418. Backref !subpat_nr :: acc
  419. with FoundAt j ->
  420. max_br := max !subpat_nr !max_br;
  421. loop (Backref !subpat_nr :: acc) j)
  422. | '!' -> loop acc (i1 + 1)
  423. | '$' -> loop (SubstString (i1, 1) :: acc) (i1 + 1)
  424. | '&' -> loop (Match :: acc) (i1 + 1)
  425. | '`' -> loop (PreMatch :: acc) (i1 + 1)
  426. | '\'' -> loop (PostMatch :: acc) (i1 + 1)
  427. | '+' ->
  428. with_lp := true;
  429. loop (LastParenMatch :: acc) (i1 + 1)
  430. | _ -> loop acc i1 in
  431. let subst_lst = loop [] 0 in
  432. str, !max_br, !with_lp, subst_lst
  433. let def_subst = subst ""
  434. (* Calculates a list of tuples (str, offset, len) which contain
  435. substrings to be copied on substitutions. Internal use only! *)
  436. let calc_trans_lst subgroups2 ovector subj templ subst_lst =
  437. let prefix_len = Array.unsafe_get ovector 0 in
  438. let last = Array.unsafe_get ovector 1 in
  439. let coll (res_len, trans_lst as accu) =
  440. let return_lst (_str, _ix, len as el) =
  441. if len = 0 then accu else res_len + len, el :: trans_lst in
  442. function
  443. | SubstString (ix, len) -> return_lst (templ, ix, len)
  444. | Backref 0 ->
  445. let prog_name = Sys.argv.(0) in
  446. return_lst (prog_name, 0, String.length prog_name)
  447. | Backref n ->
  448. let offset = n lsl 1 in
  449. let start = Array.unsafe_get ovector offset in
  450. let len = Array.unsafe_get ovector (offset + 1) - start in
  451. return_lst (subj, start, len)
  452. | Match -> return_lst (subj, prefix_len, last - prefix_len)
  453. | PreMatch -> return_lst (subj, 0, prefix_len)
  454. | PostMatch -> return_lst (subj, last, String.length subj - last)
  455. | LastParenMatch ->
  456. let subgroups2_2 = subgroups2 - 2 in
  457. let pos = ref subgroups2_2 in
  458. let ix = ref (Array.unsafe_get ovector subgroups2_2) in
  459. while !ix < 0 do
  460. let pos_2 = !pos - 2 in
  461. pos := pos_2;
  462. ix := Array.unsafe_get ovector pos_2
  463. done;
  464. return_lst (subj, !ix, Array.unsafe_get ovector (!pos + 1) - !ix) in
  465. List.fold_left coll (0, []) subst_lst
  466. let replace ?(iflags = 0) ?flags ?(rex = def_rex) ?pat
  467. ?(pos = 0) ?(itempl = def_subst) ?templ ?callout subj =
  468. let rex = match pat with Some str -> regexp str | _ -> rex in
  469. let iflags = match flags with Some flags -> rflags flags | _ -> iflags in
  470. let templ, max_br, with_lp, subst_lst =
  471. match templ with
  472. | Some str -> subst str
  473. | _ -> itempl in
  474. let subj_len = String.length subj in
  475. if pos < 0 || pos > subj_len then invalid_arg "Pcre.replace: illegal offset";
  476. let subgroups2, ovector = make_ovector rex in
  477. let nsubs = (subgroups2 lsr 1) - 1 in
  478. if max_br > nsubs then
  479. failwith "Pcre.replace: backreference denotes nonexistent subpattern";
  480. if with_lp && nsubs = 0 then failwith "Pcre.replace: no backreferences";
  481. let rec loop full_len trans_lsts cur_pos =
  482. if
  483. cur_pos > subj_len ||
  484. try
  485. unsafe_pcre_exec
  486. iflags rex ~pos:cur_pos ~subj_start:0 ~subj
  487. ovector callout;
  488. false
  489. with Not_found -> true
  490. then
  491. let postfix_len = max (subj_len - cur_pos) 0 in
  492. let left = pos + full_len in
  493. let res = Bytes.create (left + postfix_len) in
  494. bytes_unsafe_blit_string subj 0 res 0 pos;
  495. bytes_unsafe_blit_string subj cur_pos res left postfix_len;
  496. let inner_coll ofs (templ, ix, len) =
  497. bytes_unsafe_blit_string templ ix res ofs len; ofs + len in
  498. let coll ofs (res_len, trans_lst) =
  499. let new_ofs = ofs - res_len in
  500. let _ = List.fold_left inner_coll new_ofs trans_lst in
  501. new_ofs in
  502. let _ = List.fold_left coll left trans_lsts in
  503. Bytes.unsafe_to_string res
  504. else
  505. let first = Array.unsafe_get ovector 0 in
  506. let len = first - cur_pos in
  507. let res_len, _ as trans_lst_el =
  508. calc_trans_lst subgroups2 ovector subj templ subst_lst in
  509. let trans_lsts =
  510. if len > 0 then
  511. trans_lst_el :: (len, [(subj, cur_pos, len)]) :: trans_lsts
  512. else trans_lst_el :: trans_lsts in
  513. let full_len = full_len + len + res_len in
  514. let next = first + 1 in
  515. let last = Array.unsafe_get ovector 1 in
  516. if last < next then
  517. if first < subj_len then
  518. let new_trans_lsts = (1, [(subj, cur_pos + len, 1)]) :: trans_lsts in
  519. loop (full_len + 1) new_trans_lsts next
  520. else loop full_len trans_lsts next
  521. else loop full_len trans_lsts last in
  522. loop 0 [] pos
  523. let qreplace ?(iflags = 0) ?flags ?(rex = def_rex) ?pat
  524. ?(pos = 0) ?(templ = "") ?callout subj =
  525. let rex = match pat with Some str -> regexp str | _ -> rex in
  526. let iflags = match flags with Some flags -> rflags flags | _ -> iflags in
  527. let subj_len = String.length subj in
  528. if pos < 0 || pos > subj_len then invalid_arg "Pcre.qreplace: illegal offset";
  529. let templ_len = String.length templ in
  530. let _, ovector = make_ovector rex in
  531. let rec loop full_len subst_lst cur_pos =
  532. if
  533. cur_pos > subj_len ||
  534. try
  535. unsafe_pcre_exec
  536. iflags rex ~pos:cur_pos ~subj_start:0 ~subj ovector callout;
  537. false
  538. with Not_found -> true
  539. then
  540. let postfix_len = max (subj_len - cur_pos) 0 in
  541. let left = pos + full_len in
  542. let res = Bytes.create (left + postfix_len) in
  543. bytes_unsafe_blit_string subj 0 res 0 pos;
  544. bytes_unsafe_blit_string subj cur_pos res left postfix_len;
  545. let coll ofs = function
  546. | Some (substr, ix, len) ->
  547. let new_ofs = ofs - len in
  548. bytes_unsafe_blit_string substr ix res new_ofs len;
  549. new_ofs
  550. | None ->
  551. let new_ofs = ofs - templ_len in
  552. bytes_unsafe_blit_string templ 0 res new_ofs templ_len;
  553. new_ofs in
  554. let _ = List.fold_left coll left subst_lst in
  555. Bytes.unsafe_to_string res
  556. else
  557. let first = Array.unsafe_get ovector 0 in
  558. let len = first - cur_pos in
  559. let subst_lst =
  560. if len > 0 then None :: Some (subj, cur_pos, len) :: subst_lst
  561. else None :: subst_lst in
  562. let last = Array.unsafe_get ovector 1 in
  563. let full_len = full_len + len + templ_len in
  564. let next = first + 1 in
  565. if last < next then
  566. if first < subj_len then
  567. loop (full_len + 1) (Some (subj, cur_pos + len, 1) :: subst_lst) next
  568. else loop full_len subst_lst next
  569. else loop full_len subst_lst last in
  570. loop 0 [] pos
  571. let substitute_substrings ?(iflags = 0) ?flags ?(rex = def_rex) ?pat
  572. ?(pos = 0) ?callout ~subst subj =
  573. let rex = match pat with Some str -> regexp str | _ -> rex in
  574. let iflags = match flags with Some flags -> rflags flags | _ -> iflags in
  575. let subj_len = String.length subj in
  576. if pos < 0 || pos > subj_len then invalid_arg "Pcre.substitute: illegal offset";
  577. let _, ovector = make_ovector rex in
  578. let rec loop full_len subst_lst cur_pos =
  579. if
  580. cur_pos > subj_len ||
  581. try
  582. unsafe_pcre_exec
  583. iflags rex ~pos:cur_pos ~subj_start:0 ~subj ovector callout;
  584. false
  585. with Not_found -> true
  586. then
  587. let postfix_len = max (subj_len - cur_pos) 0 in
  588. let left = pos + full_len in
  589. let res = Bytes.create (left + postfix_len) in
  590. bytes_unsafe_blit_string subj 0 res 0 pos;
  591. bytes_unsafe_blit_string subj cur_pos res left postfix_len;
  592. let coll ofs (templ, ix, len) =
  593. let new_ofs = ofs - len in
  594. bytes_unsafe_blit_string templ ix res new_ofs len;
  595. new_ofs in
  596. let _ = List.fold_left coll left subst_lst in
  597. Bytes.unsafe_to_string res
  598. else
  599. let first = Array.unsafe_get ovector 0 in
  600. let len = first - cur_pos in
  601. let templ = subst (subj, ovector) in
  602. let templ_len = String.length templ in
  603. let subst_lst =
  604. if len > 0 then
  605. (templ, 0, templ_len) :: (subj, cur_pos, len) :: subst_lst
  606. else (templ, 0, templ_len) :: subst_lst in
  607. let last = Array.unsafe_get ovector 1 in
  608. let full_len = full_len + len + templ_len in
  609. let next = first + 1 in
  610. if last < next then
  611. if first < subj_len then
  612. loop (full_len + 1) ((subj, cur_pos + len, 1) :: subst_lst) next
  613. else loop full_len subst_lst next
  614. else loop full_len subst_lst last in
  615. loop 0 [] pos
  616. let substitute ?iflags ?flags ?rex ?pat ?pos ?callout ~subst:str_subst subj =
  617. let subst (subj, ovector) =
  618. let first = Array.unsafe_get ovector 0 in
  619. let last = Array.unsafe_get ovector 1 in
  620. str_subst (string_unsafe_sub subj first (last - first)) in
  621. substitute_substrings ?iflags ?flags ?rex ?pat ?pos ?callout ~subst subj
  622. let replace_first ?(iflags = 0) ?flags ?(rex = def_rex) ?pat ?(pos = 0)
  623. ?(itempl = def_subst) ?templ ?callout subj =
  624. let rex = match pat with Some str -> regexp str | _ -> rex in
  625. let iflags = match flags with Some flags -> rflags flags | _ -> iflags in
  626. let templ, max_br, with_lp, subst_lst =
  627. match templ with
  628. | Some str -> subst str
  629. | _ -> itempl in
  630. let subgroups2, ovector = make_ovector rex in
  631. let nsubs = (subgroups2 lsr 1) - 1 in
  632. if max_br > nsubs then
  633. failwith "Pcre.replace_first: backreference denotes nonexistent subpattern";
  634. if with_lp && nsubs = 0 then failwith "Pcre.replace_first: no backreferences";
  635. try
  636. unsafe_pcre_exec iflags rex ~pos ~subj_start:0 ~subj ovector callout;
  637. let res_len, trans_lst =
  638. calc_trans_lst subgroups2 ovector subj templ subst_lst in
  639. let first = Array.unsafe_get ovector 0 in
  640. let last = Array.unsafe_get ovector 1 in
  641. let rest = String.length subj - last in
  642. let res = Bytes.create (first + res_len + rest) in
  643. bytes_unsafe_blit_string subj 0 res 0 first;
  644. let coll ofs (templ, ix, len) =
  645. bytes_unsafe_blit_string templ ix res ofs len; ofs + len in
  646. let ofs = List.fold_left coll first trans_lst in
  647. bytes_unsafe_blit_string subj last res ofs rest;
  648. Bytes.unsafe_to_string res
  649. with Not_found -> string_copy subj
  650. let qreplace_first ?(iflags = 0) ?flags ?(rex = def_rex) ?pat
  651. ?(pos = 0) ?(templ = "") ?callout subj =
  652. let rex = match pat with Some str -> regexp str | _ -> rex in
  653. let iflags = match flags with Some flags -> rflags flags | _ -> iflags in
  654. let _, ovector = make_ovector rex in
  655. try
  656. unsafe_pcre_exec iflags rex ~pos ~subj_start:0 ~subj ovector callout;
  657. let first = Array.unsafe_get ovector 0 in
  658. let last = Array.unsafe_get ovector 1 in
  659. let len = String.length templ in
  660. let rest = String.length subj - last in
  661. let postfix_start = first + len in
  662. let res = Bytes.create (postfix_start + rest) in
  663. bytes_unsafe_blit_string subj 0 res 0 first;
  664. bytes_unsafe_blit_string templ 0 res first len;
  665. bytes_unsafe_blit_string subj last res postfix_start rest;
  666. Bytes.unsafe_to_string res
  667. with Not_found -> string_copy subj
  668. let substitute_substrings_first ?(iflags = 0) ?flags ?(rex = def_rex) ?pat
  669. ?(pos = 0) ?callout ~subst subj =
  670. let rex = match pat with Some str -> regexp str | _ -> rex in
  671. let iflags = match flags with Some flags -> rflags flags | _ -> iflags in
  672. let _, ovector = make_ovector rex in
  673. try
  674. unsafe_pcre_exec iflags rex ~pos ~subj_start:0 ~subj ovector callout;
  675. let subj_len = String.length subj in
  676. let prefix_len = Array.unsafe_get ovector 0 in
  677. let last = Array.unsafe_get ovector 1 in
  678. let templ = subst (subj, ovector) in
  679. let postfix_len = subj_len - last in
  680. let templ_len = String.length templ in
  681. let postfix_start = prefix_len + templ_len in
  682. let res = Bytes.create (postfix_start + postfix_len) in
  683. bytes_unsafe_blit_string subj 0 res 0 prefix_len;
  684. bytes_unsafe_blit_string templ 0 res prefix_len templ_len;
  685. bytes_unsafe_blit_string subj last res postfix_start postfix_len;
  686. Bytes.unsafe_to_string res
  687. with Not_found -> string_copy subj
  688. let substitute_first ?iflags ?flags ?rex ?pat ?pos
  689. ?callout ~subst:str_subst subj =
  690. let subst (subj, ovector) =
  691. let first = Array.unsafe_get ovector 0 in
  692. let last = Array.unsafe_get ovector 1 in
  693. str_subst (string_unsafe_sub subj first (last - first)) in
  694. substitute_substrings_first
  695. ?iflags ?flags ?rex ?pat ?pos ?callout ~subst subj
  696. (* Splitting *)
  697. let internal_psplit flags rex max pos callout subj =
  698. let subj_len = String.length subj in
  699. if subj_len = 0 then []
  700. else if max = 1 then [string_copy subj]
  701. else
  702. let subgroups2, ovector = make_ovector rex in
  703. (* Adds contents of subgroups to the string accumulator *)
  704. let handle_subgroups strs =
  705. let strs = ref strs in
  706. let i = ref 2 in
  707. while !i < subgroups2 do
  708. let first = Array.unsafe_get ovector !i in
  709. incr i;
  710. let last = Array.unsafe_get ovector !i in
  711. let str =
  712. if first < 0 then ""
  713. else string_unsafe_sub subj first (last - first) in
  714. strs := str :: !strs; incr i
  715. done;
  716. !strs in
  717. (* Performs the recursive split *)
  718. let rec loop strs cnt pos prematch =
  719. let len = subj_len - pos in
  720. if len < 0 then strs
  721. else
  722. (* Checks termination due to max restriction *)
  723. if cnt = 0 then
  724. if prematch &&
  725. try
  726. unsafe_pcre_exec
  727. flags rex ~pos ~subj_start:pos ~subj ovector callout;
  728. true
  729. with Not_found -> false
  730. then
  731. let last = Array.unsafe_get ovector 1 in
  732. let strs = handle_subgroups strs in
  733. string_unsafe_sub subj last (subj_len - last) :: strs
  734. else string_unsafe_sub subj pos len :: strs
  735. (* Calculates next accumulator state for splitting *)
  736. else
  737. if
  738. try
  739. unsafe_pcre_exec
  740. flags rex ~pos ~subj_start:pos ~subj ovector callout;
  741. false
  742. with Not_found -> true
  743. then string_unsafe_sub subj pos len :: strs
  744. else
  745. let first = Array.unsafe_get ovector 0 in
  746. let last = Array.unsafe_get ovector 1 in
  747. if first = pos then
  748. if last = pos then
  749. let strs = if prematch then handle_subgroups strs else strs in
  750. if len = 0 then "" :: strs
  751. else if
  752. try
  753. unsafe_pcre_exec
  754. (flags lor 0x0410) rex ~pos ~subj_start:pos ~subj
  755. ovector callout;
  756. true
  757. with Not_found -> false
  758. then
  759. let new_strs = handle_subgroups ("" :: strs) in
  760. loop new_strs (cnt - 1) (Array.unsafe_get ovector 1) false
  761. else
  762. let new_strs = string_unsafe_sub subj pos 1 :: strs in
  763. loop new_strs (cnt - 1) (pos + 1) true
  764. else
  765. if prematch then loop (handle_subgroups strs) cnt last false
  766. else loop (handle_subgroups ("" :: strs)) (cnt - 1) last false
  767. else
  768. let new_strs = string_unsafe_sub subj pos (first - pos) :: strs in
  769. loop (handle_subgroups new_strs) (cnt - 1) last false in
  770. loop [] (max - 1) pos false
  771. let rec strip_all_empty = function "" :: t -> strip_all_empty t | l -> l
  772. external isspace : char -> bool = "pcre_isspace_stub" [@@noalloc]
  773. let rec find_no_space ix len str =
  774. if ix = len || not (isspace (String.unsafe_get str ix)) then ix
  775. else find_no_space (ix + 1) len str
  776. let split ?(iflags = 0) ?flags ?rex ?pat ?(pos = 0) ?(max = 0) ?callout subj =
  777. let iflags = match flags with Some flags -> rflags flags | _ -> iflags in
  778. let res =
  779. match pat, rex with
  780. | Some str, _ -> internal_psplit iflags (regexp str) max pos callout subj
  781. | _, Some rex -> internal_psplit iflags rex max pos callout subj
  782. | _ ->
  783. (* special case for Perl-splitting semantics *)
  784. let len = String.length subj in
  785. if pos > len || pos < 0 then failwith "Pcre.split: illegal offset";
  786. let new_pos = find_no_space pos len subj in
  787. internal_psplit iflags def_rex max new_pos callout subj in
  788. List.rev (if max = 0 then strip_all_empty res else res)
  789. let asplit ?iflags ?flags ?rex ?pat ?pos ?max ?callout subj =
  790. Array.of_list (split ?iflags ?flags ?rex ?pat ?pos ?max ?callout subj)
  791. (* Full splitting *)
  792. type split_result = Text of string
  793. | Delim of string
  794. | Group of int * string
  795. | NoGroup
  796. let rec strip_all_empty_full = function
  797. | Delim _ :: rest -> strip_all_empty_full rest
  798. | l -> l
  799. let full_split ?(iflags = 0) ?flags ?(rex = def_rex) ?pat
  800. ?(pos = 0) ?(max = 0) ?callout subj =
  801. let rex = match pat with Some str -> regexp str | _ -> rex in
  802. let iflags = match flags with Some flags -> rflags flags | _ -> iflags in
  803. let subj_len = String.length subj in
  804. if subj_len = 0 then []
  805. else if max = 1 then [Text (string_copy subj)]
  806. else
  807. let subgroups2, ovector = make_ovector rex in
  808. (* Adds contents of subgroups to the string accumulator *)
  809. let handle_subgroups strs =
  810. let strs = ref strs in
  811. let i = ref 2 in
  812. while !i < subgroups2 do
  813. let group_nr = !i lsr 1 in
  814. let first = Array.unsafe_get ovector !i in
  815. incr i;
  816. let last = Array.unsafe_get ovector !i in
  817. let str =
  818. if first < 0 then NoGroup
  819. else
  820. let group_str = string_unsafe_sub subj first (last - first) in
  821. Group (group_nr, group_str) in
  822. strs := str :: !strs; incr i
  823. done;
  824. !strs in
  825. (* Performs the recursive split *)
  826. let rec loop strs cnt pos prematch =
  827. let len = subj_len - pos in
  828. if len < 0 then strs
  829. else
  830. (* Checks termination due to max restriction *)
  831. if cnt = 0 then
  832. if prematch &&
  833. try
  834. unsafe_pcre_exec
  835. iflags rex ~pos ~subj_start:pos ~subj ovector callout;
  836. true
  837. with Not_found -> false
  838. then
  839. let first = Array.unsafe_get ovector 0 in
  840. let last = Array.unsafe_get ovector 1 in
  841. let delim = Delim (string_unsafe_sub subj first (last - first)) in
  842. Text (string_unsafe_sub subj last (subj_len - last))
  843. :: handle_subgroups (delim :: strs)
  844. else
  845. if len = 0 then strs
  846. else Text (string_unsafe_sub subj pos len) :: strs
  847. (* Calculates next accumulator state for splitting *)
  848. else
  849. if
  850. try
  851. unsafe_pcre_exec
  852. iflags rex ~pos ~subj_start:pos ~subj ovector callout;
  853. false
  854. with Not_found -> true
  855. then
  856. if len = 0 then strs
  857. else Text (string_unsafe_sub subj pos len) :: strs
  858. else
  859. let first = Array.unsafe_get ovector 0 in
  860. let last = Array.unsafe_get ovector 1 in
  861. if first = pos then
  862. if last = pos then
  863. if len = 0 then handle_subgroups (Delim "" :: strs)
  864. else
  865. let empty_groups = handle_subgroups [] in
  866. if
  867. try
  868. unsafe_pcre_exec
  869. (iflags lor 0x0410) rex ~pos ~subj_start:pos ~subj
  870. ovector callout;
  871. true
  872. with Not_found -> false
  873. then
  874. let first = Array.unsafe_get ovector 0 in
  875. let last = Array.unsafe_get ovector 1 in
  876. let delim =
  877. Delim (string_unsafe_sub subj first (last - first)) in
  878. let new_strs =
  879. handle_subgroups (
  880. delim :: (if prematch then strs
  881. else empty_groups @ (Delim "" :: strs))) in
  882. loop new_strs (cnt - 1) last false
  883. else
  884. let new_strs =
  885. Text (string_unsafe_sub subj pos 1)
  886. :: empty_groups @ Delim "" :: strs in
  887. loop new_strs (cnt - 1) (pos + 1) true
  888. else
  889. let delim =
  890. Delim (string_unsafe_sub subj first (last - first)) in
  891. loop (handle_subgroups (delim :: strs)) cnt last false
  892. else
  893. let delim = Delim (string_unsafe_sub subj first (last - first)) in
  894. let pre_strs =
  895. Text (string_unsafe_sub subj pos (first - pos)) :: strs in
  896. loop
  897. (handle_subgroups (delim :: pre_strs)) (cnt - 1) last false in
  898. let res = loop [] (max - 1) pos true in
  899. List.rev (if max = 0 then strip_all_empty_full res else res)
  900. (* Additional convenience functions useful in combination with this library *)
  901. let foreach_line ?(ic = stdin) f =
  902. try while true do f (input_line ic) done with End_of_file -> ()
  903. let foreach_file filenames f =
  904. let do_with_file filename =
  905. let file = open_in filename in
  906. try f filename file; close_in file
  907. with exn -> close_in file; raise exn in
  908. List.iter do_with_file filenames