pcre2.ml 41 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149
  1. (*
  2. PCRE2-OCAML - Perl Compatibility Regular Expressions for OCaml
  3. Copyright (C) 1999- Markus Mottl
  4. email: [email protected]
  5. WWW: http://www.ocaml.info
  6. This library is free software; you can redistribute it and/or
  7. modify it under the terms of the GNU Lesser General Public
  8. License as published by the Free Software Foundation; either
  9. version 2.1 of the License, or (at your option) any later version.
  10. This library is distributed in the hope that it will be useful,
  11. but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  13. Lesser General Public License for more details.
  14. You should have received a copy of the GNU Lesser General Public
  15. License along with this library; if not, write to the Free Software
  16. Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  17. *)
  18. (* Public exceptions and their registration with the C runtime *)
  19. type error =
  20. | Partial
  21. | BadPattern of string * int
  22. | BadUTF
  23. | BadUTFOffset
  24. | MatchLimit
  25. | DepthLimit
  26. | WorkspaceSize
  27. | InternalError of string
  28. exception Error of error
  29. exception Backtrack
  30. exception Regexp_or of string * error
  31. (* Puts exceptions into global C-variables for fast retrieval *)
  32. external pcre2_ocaml_init : unit -> unit = "pcre2_ocaml_init"
  33. (* Registers exceptions with the C runtime and caches polymorphic variants *)
  34. let () =
  35. Callback.register_exception "Pcre2.Error" (Error (InternalError ""));
  36. Callback.register_exception "Pcre2.Backtrack" Backtrack;
  37. pcre2_ocaml_init ()
  38. (* Compilation and runtime flags and their conversion functions *)
  39. type icflag = int64
  40. type irflag = int64
  41. (* Compilation flags *)
  42. type cflag =
  43. [
  44. | `ALLOW_EMPTY_CLASS
  45. | `ALT_BSUX
  46. | `ALT_CIRCUMFLEX
  47. | `ALT_VERBNAMES
  48. | `ANCHORED
  49. | `AUTO_CALLOUT
  50. | `CASELESS
  51. | `DOLLAR_ENDONLY
  52. | `DOTALL
  53. | `DUPNAMES
  54. | `ENDANCHORED
  55. | `EXTENDED
  56. | `EXTENDED_MORE
  57. | `FIRSTLINE
  58. | `LITERAL
  59. | `MATCH_INVALID_UTF
  60. | `MATCH_UNSET_BACKREF
  61. | `MULTILINE
  62. | `NEVER_BACKSLASH_C
  63. | `NEVER_UCP
  64. | `NEVER_UTF
  65. | `NO_AUTO_CAPTURE
  66. | `NO_AUTO_POSSESS
  67. | `NO_DOTSTAR_ANCHOR
  68. | `NO_START_OPTIMIZE
  69. | `NO_UTF_CHECK
  70. | `UCP
  71. | `UNGREEDY
  72. | `USE_OFFSET_LIMIT
  73. | `UTF
  74. ]
  75. let int_of_cflag = function
  76. | `ALLOW_EMPTY_CLASS -> 0x00000001L
  77. | `ALT_BSUX -> 0x00000002L
  78. | `AUTO_CALLOUT -> 0x00000004L
  79. | `CASELESS -> 0x00000008L
  80. | `DOLLAR_ENDONLY -> 0x00000010L
  81. | `DOTALL -> 0x00000020L
  82. | `DUPNAMES -> 0x00000040L
  83. | `EXTENDED -> 0x00000080L
  84. | `FIRSTLINE -> 0x00000100L
  85. | `MATCH_UNSET_BACKREF -> 0x00000200L
  86. | `MULTILINE -> 0x00000400L
  87. | `NEVER_UCP -> 0x00000800L
  88. | `NEVER_UTF -> 0x00001000L
  89. | `NO_AUTO_CAPTURE -> 0x00002000L
  90. | `NO_AUTO_POSSESS -> 0x00004000L
  91. | `NO_DOTSTAR_ANCHOR -> 0x00008000L
  92. | `NO_START_OPTIMIZE -> 0x00010000L
  93. | `UCP -> 0x00020000L
  94. | `UNGREEDY -> 0x00040000L
  95. | `UTF -> 0x00080000L
  96. | `NEVER_BACKSLASH_C -> 0x00100000L
  97. | `ALT_CIRCUMFLEX -> 0x00200000L
  98. | `ALT_VERBNAMES -> 0x00400000L
  99. | `USE_OFFSET_LIMIT -> 0x00800000L
  100. | `EXTENDED_MORE -> 0x01000000L
  101. | `LITERAL -> 0x02000000L
  102. | `MATCH_INVALID_UTF -> 0x04000000L
  103. | `ENDANCHORED -> 0x20000000L
  104. | `NO_UTF_CHECK -> 0x40000000L
  105. | `ANCHORED -> 0x80000000L
  106. let coll_icflag icflag flag = Int64.logor (int_of_cflag flag) icflag
  107. let cflags flags = List.fold_left coll_icflag 0L flags
  108. let cflag_of_int = function
  109. | 0x00000001L -> `ALLOW_EMPTY_CLASS
  110. | 0x00000002L -> `ALT_BSUX
  111. | 0x00000004L -> `AUTO_CALLOUT
  112. | 0x00000008L -> `CASELESS
  113. | 0x00000010L -> `DOLLAR_ENDONLY
  114. | 0x00000020L -> `DOTALL
  115. | 0x00000040L -> `DUPNAMES
  116. | 0x00000080L -> `EXTENDED
  117. | 0x00000100L -> `FIRSTLINE
  118. | 0x00000200L -> `MATCH_UNSET_BACKREF
  119. | 0x00000400L -> `MULTILINE
  120. | 0x00000800L -> `NEVER_UCP
  121. | 0x00001000L -> `NEVER_UTF
  122. | 0x00002000L -> `NO_AUTO_CAPTURE
  123. | 0x00004000L -> `NO_AUTO_POSSESS
  124. | 0x00008000L -> `NO_DOTSTAR_ANCHOR
  125. | 0x00010000L -> `NO_START_OPTIMIZE
  126. | 0x00020000L -> `UCP
  127. | 0x00040000L -> `UNGREEDY
  128. | 0x00080000L -> `UTF
  129. | 0x00100000L -> `NEVER_BACKSLASH_C
  130. | 0x00200000L -> `ALT_CIRCUMFLEX
  131. | 0x00400000L -> `ALT_VERBNAMES
  132. | 0x00800000L -> `USE_OFFSET_LIMIT
  133. | 0x01000000L -> `EXTENDED_MORE
  134. | 0x02000000L -> `LITERAL
  135. | 0x04000000L -> `MATCH_INVALID_UTF
  136. | 0x20000000L -> `ENDANCHORED
  137. | 0x40000000L -> `NO_UTF_CHECK
  138. | 0x80000000L -> `ANCHORED
  139. | _ -> failwith "Pcre2.cflag_list: unknown compilation flag"
  140. let all_cflags =
  141. [
  142. 0x00000001L; 0x00000002L; 0x00000004L; 0x00000008L;
  143. 0x00000010L; 0x00000020L; 0x00000040L; 0x00000080L;
  144. 0x00000100L; 0x00000200L; 0x00000400L; 0x00000800L;
  145. 0x00001000L; 0x00002000L; 0x00004000L; 0x00008000L;
  146. 0x00010000L; 0x00020000L; 0x00040000L; 0x00080000L;
  147. 0x00100000L; 0x00200000L; 0x00400000L; 0x00800000L;
  148. 0x01000000L; 0x02000000L; 0x04000000L;
  149. 0x20000000L; 0x40000000L; 0x80000000L;
  150. ]
  151. let cflag_list icflags =
  152. let coll flag_list flag =
  153. if Int64.equal (Int64.logand icflags flag) 0L then flag_list
  154. else cflag_of_int flag :: flag_list in
  155. List.fold_left coll [] all_cflags
  156. (* Runtime flags *)
  157. type rflag =
  158. [
  159. | `ANCHORED
  160. | `COPY_MATCHED_SUBJECT
  161. | `DFA_RESTART
  162. | `DFA_SHORTEST
  163. | `ENDANCHORED
  164. | `NOTBOL
  165. | `NOTEOL
  166. | `NOTEMPTY
  167. | `NOTEMPTY_ATSTART
  168. | `NO_JIT
  169. | `NO_UTF_CHECK
  170. | `PARTIAL_HARD
  171. | `PARTIAL_SOFT
  172. ]
  173. let int_of_rflag = function
  174. | `NOTBOL -> 0x00000001L
  175. | `NOTEOL -> 0x00000002L
  176. | `NOTEMPTY -> 0x00000004L
  177. | `NOTEMPTY_ATSTART -> 0x00000008L
  178. | `PARTIAL_SOFT -> 0x00000010L
  179. | `PARTIAL_HARD -> 0x00000020L
  180. | `DFA_RESTART -> 0x00000040L
  181. | `DFA_SHORTEST -> 0x00000080L
  182. | `NO_JIT -> 0x00002000L
  183. | `COPY_MATCHED_SUBJECT -> 0x00004000L
  184. | `ENDANCHORED -> 0x20000000L
  185. | `NO_UTF_CHECK -> 0x40000000L
  186. | `ANCHORED -> 0x80000000L
  187. let coll_irflag irflag flag = Int64.logor (int_of_rflag flag) irflag
  188. let rflags flags = List.fold_left coll_irflag 0L flags
  189. let rflag_of_int = function
  190. | 0x00000001L -> `NOTBOL
  191. | 0x00000002L -> `NOTEOL
  192. | 0x00000004L -> `NOTEMPTY
  193. | 0x00000008L -> `NOTEMPTY_ATSTART
  194. | 0x00000010L -> `PARTIAL_SOFT
  195. | 0x00000020L -> `PARTIAL_HARD
  196. | 0x00000040L -> `DFA_RESTART
  197. | 0x00000080L -> `DFA_SHORTEST
  198. | 0x00002000L -> `NO_JIT
  199. | 0x00004000L -> `COPY_MATCHED_SUBJECT
  200. | 0x20000000L -> `ENDANCHORED
  201. | 0x40000000L -> `NO_UTF_CHECK
  202. | 0x80000000L -> `ANCHORED
  203. | _ -> failwith "Pcre2.rflag_list: unknown runtime flag"
  204. let all_rflags =
  205. [
  206. 0x00000001L; 0x00000002L; 0x00000004L; 0x00000008L;
  207. 0x00000010L; 0x00000020L; 0x00000040L; 0x00000080L;
  208. 0x00002000L; 0x00004000L;
  209. 0x20000000L; 0x40000000L; 0x80000000L;
  210. ]
  211. let rflag_list irflags =
  212. let coll flag_list flag =
  213. if Int64.equal (Int64.logand irflags flag) 0L then flag_list
  214. else rflag_of_int flag :: flag_list in
  215. List.fold_left coll [] all_rflags
  216. (* Information on the PCRE2-configuration (build-time options) *)
  217. external pcre2_version : unit -> string = "pcre2_version_stub"
  218. external pcre2_config_unicode : unit -> bool
  219. = "pcre2_config_unicode_stub" [@@noalloc]
  220. external pcre2_config_newline : unit -> char
  221. = "pcre2_config_newline_stub" [@@noalloc]
  222. external pcre2_config_link_size : unit -> (int [@untagged])
  223. = "pcre2_config_link_size_stub_bc" "pcre2_config_link_size_stub" [@@noalloc]
  224. external pcre2_config_match_limit : unit -> (int [@untagged])
  225. = "pcre2_config_match_limit_stub_bc" "pcre2_config_match_limit_stub"
  226. [@@noalloc]
  227. external pcre2_config_depth_limit : unit -> (int [@untagged])
  228. = "pcre2_config_depth_limit_stub_bc" "pcre2_config_depth_limit_stub"
  229. [@@noalloc]
  230. external pcre2_config_stackrecurse :
  231. unit -> bool = "pcre2_config_stackrecurse_stub" [@@noalloc]
  232. let version = pcre2_version ()
  233. let config_unicode = pcre2_config_unicode ()
  234. let config_newline = pcre2_config_newline ()
  235. let config_link_size = pcre2_config_link_size ()
  236. let config_match_limit = pcre2_config_match_limit ()
  237. let config_depth_limit = pcre2_config_depth_limit ()
  238. let config_stackrecurse = pcre2_config_stackrecurse ()
  239. (* Information on patterns *)
  240. type firstcodeunit_info =
  241. [ `Char of char
  242. | `Start_only
  243. | `ANCHORED ]
  244. type regexp
  245. external options : regexp -> (icflag [@unboxed])
  246. = "pcre2_argoptions_stub_bc" "pcre2_argoptions_stub"
  247. external size : regexp -> (int [@untagged])
  248. = "pcre2_size_stub_bc" "pcre2_size_stub"
  249. external capturecount : regexp -> (int [@untagged])
  250. = "pcre2_capturecount_stub_bc" "pcre2_capturecount_stub"
  251. external backrefmax : regexp -> (int [@untagged])
  252. = "pcre2_backrefmax_stub_bc" "pcre2_backrefmax_stub"
  253. external namecount : regexp -> (int [@untagged])
  254. = "pcre2_namecount_stub_bc" "pcre2_namecount_stub"
  255. external nameentrysize : regexp -> (int [@untagged])
  256. = "pcre2_nameentrysize_stub_bc" "pcre2_nameentrysize_stub"
  257. external names : regexp -> string array = "pcre2_names_stub"
  258. external firstcodeunit : regexp -> firstcodeunit_info = "pcre2_firstcodeunit_stub"
  259. external lastcodeunit : regexp -> char option = "pcre2_lastcodeunit_stub"
  260. (* Compilation of patterns *)
  261. type chtables
  262. external maketables : unit -> chtables = "pcre2_maketables_stub"
  263. external compile : (icflag [@unboxed]) -> chtables option -> string -> regexp
  264. = "pcre2_compile_stub_bc" "pcre2_compile_stub"
  265. (* external get_match_limit : regexp -> int option = "pcre2_get_match_limit_stub" *)
  266. (* Internal use only! *)
  267. external set_imp_match_limit : regexp -> (int [@untagged]) -> regexp
  268. = "pcre2_set_imp_match_limit_stub_bc" "pcre2_set_imp_match_limit_stub"
  269. [@@noalloc]
  270. (* external get_depth_limit :
  271. regexp -> int option = "pcre2_get_depth_limit_stub" *)
  272. (* Internal use only! *)
  273. external set_imp_depth_limit : regexp -> (int [@untagged]) -> regexp
  274. = "pcre2_set_imp_depth_limit_stub_bc" "pcre2_set_imp_depth_limit_stub"
  275. [@@noalloc]
  276. (* TODO implement jit using new pcre2_jit_compile api *)
  277. let regexp
  278. (* ?(jit_compile = false) *)
  279. ?limit ?depth_limit
  280. ?(iflags = 0L) ?flags ?chtables pat =
  281. let rex =
  282. match flags with
  283. | Some flag_list -> compile (cflags flag_list) chtables pat
  284. | _ -> compile iflags chtables pat
  285. in
  286. let rex =
  287. match limit with
  288. | None -> rex
  289. | Some lim -> set_imp_match_limit rex lim
  290. in
  291. match depth_limit with
  292. | None -> rex
  293. | Some lim -> set_imp_depth_limit rex lim
  294. let regexp_or
  295. (* ?jit_compile *) ?limit ?depth_limit ?(iflags = 0L) ?flags ?chtables pats =
  296. let check pat =
  297. try ignore (regexp ~iflags ?flags ?chtables pat)
  298. with Error error -> raise (Regexp_or (pat, error))
  299. in
  300. List.iter check pats;
  301. let big_pat =
  302. let cnv pat = "(?:" ^ pat ^ ")" in
  303. String.concat "|" (List.rev (List.rev_map cnv pats))
  304. in
  305. regexp (* ?jit_compile *) ?limit ?depth_limit ~iflags ?flags ?chtables big_pat
  306. let bytes_unsafe_blit_string str str_ofs bts bts_ofs len =
  307. let str_bts = Bytes.unsafe_of_string str in
  308. Bytes.unsafe_blit str_bts str_ofs bts bts_ofs len
  309. let string_unsafe_sub str ofs len =
  310. let res = Bytes.create len in
  311. bytes_unsafe_blit_string str ofs res 0 len;
  312. Bytes.unsafe_to_string res
  313. let quote s =
  314. let len = String.length s in
  315. let buf = Bytes.create (len lsl 1) in
  316. let pos = ref 0 in
  317. for i = 0 to len - 1 do
  318. match String.unsafe_get s i with
  319. | '\\' | '^' | '$' | '.' | '[' | '|'
  320. | '(' | ')' | '?' | '*' | '+' | '{' as c ->
  321. Bytes.unsafe_set buf !pos '\\';
  322. incr pos;
  323. Bytes.unsafe_set buf !pos c;
  324. incr pos
  325. | c -> Bytes.unsafe_set buf !pos c; incr pos
  326. done;
  327. string_unsafe_sub (Bytes.unsafe_to_string buf) 0 !pos
  328. (* Matching of patterns and subpattern extraction *)
  329. (* Default regular expression when none is provided by the user *)
  330. let def_rex = regexp (* ~jit_compile:true *) "\\s+"
  331. type substrings = string * int array
  332. type callout_data =
  333. {
  334. callout_number : int;
  335. substrings : substrings;
  336. start_match : int;
  337. current_position : int;
  338. capture_top : int;
  339. capture_last : int;
  340. pattern_position : int;
  341. next_item_length : int;
  342. }
  343. type callout = callout_data -> unit
  344. let get_subject (subj, _) = subj
  345. let num_of_subs (_, ovector) = Array.length ovector / 3
  346. let get_offset_start ovector str_num =
  347. if str_num < 0 || str_num >= Array.length ovector / 3 then
  348. invalid_arg "Pcre2.get_offset_start: illegal offset";
  349. let offset = str_num lsl 1 in
  350. offset, Array.unsafe_get ovector offset
  351. let get_substring_aux (subj, ovector) offset start =
  352. if start < 0 then raise Not_found
  353. else
  354. string_unsafe_sub subj start (Array.unsafe_get ovector (offset + 1) - start)
  355. let get_substring (_, ovector as substrings) str_num =
  356. let offset, start = get_offset_start ovector str_num in
  357. get_substring_aux substrings offset start
  358. let get_substring_ofs (_subj, ovector) str_num =
  359. let offset, start = get_offset_start ovector str_num in
  360. if start < 0 then raise Not_found
  361. else start, Array.unsafe_get ovector (offset + 1)
  362. let unsafe_get_substring (_, ovector as substrings) str_num =
  363. let offset = str_num lsl 1 in
  364. try get_substring_aux substrings offset (Array.unsafe_get ovector offset)
  365. with Not_found -> ""
  366. let get_substrings ?(full_match = true) (_, ovector as substrings) =
  367. if full_match then
  368. Array.init (Array.length ovector / 3) (unsafe_get_substring substrings)
  369. else
  370. let len = (Array.length ovector / 3) - 1 in
  371. Array.init len (fun n -> unsafe_get_substring substrings (n + 1))
  372. let unsafe_get_opt_substring (_, ovector as substrings) str_num =
  373. let offset = str_num lsl 1 in
  374. try
  375. let start = Array.unsafe_get ovector offset in
  376. let str = get_substring_aux substrings offset start in
  377. Some str
  378. with Not_found -> None
  379. let get_opt_substrings ?(full_match = true) (_, ovector as substrings) =
  380. if full_match then
  381. Array.init (Array.length ovector / 3) (unsafe_get_opt_substring substrings)
  382. else
  383. let len = (Array.length ovector / 3) - 1 in
  384. Array.init len (fun n -> unsafe_get_opt_substring substrings (n + 1))
  385. external get_stringnumber : regexp -> string -> (int [@untagged])
  386. =
  387. "pcre2_substring_number_from_name_stub_bc"
  388. "pcre2_substring_number_from_name_stub"
  389. let get_named_substring rex name substrings =
  390. get_substring substrings (get_stringnumber rex name)
  391. let get_named_substring_ofs rex name substrings =
  392. get_substring_ofs substrings (get_stringnumber rex name)
  393. external unsafe_pcre2_match :
  394. (irflag [@unboxed]) ->
  395. regexp ->
  396. pos : (int [@untagged]) ->
  397. subj_start : (int [@untagged]) ->
  398. subj : string ->
  399. int array ->
  400. callout option ->
  401. unit = "pcre2_match_stub_bc" "pcre2_match_stub"
  402. let make_ovector rex =
  403. let subgroups1 = capturecount rex + 1 in
  404. let subgroups2 = subgroups1 lsl 1 in
  405. subgroups2, Array.make (subgroups1 + subgroups2) 0
  406. external unsafe_pcre2_dfa_match :
  407. (irflag [@unboxed]) ->
  408. regexp ->
  409. pos : (int [@untagged]) ->
  410. subj_start : (int [@untagged]) ->
  411. subj : string ->
  412. int array ->
  413. callout option ->
  414. workspace : int array ->
  415. unit = "pcre2_dfa_match_stub_bc" "pcre2_match_stub0"
  416. let pcre2_dfa_match ?(iflags = 0L) ?flags ?(rex = def_rex) ?pat ?(pos = 0)
  417. ?callout ?(workspace = Array.make 20 0) subj =
  418. let rex = match pat with Some str -> regexp str | _ -> rex in
  419. let iflags = match flags with Some flags -> rflags flags | _ -> iflags in
  420. let _, ovector = make_ovector rex in
  421. unsafe_pcre2_dfa_match
  422. iflags rex ~pos ~subj_start:0 ~subj ovector callout ~workspace;
  423. ovector
  424. let pcre2_match ?(iflags = 0L) ?flags ?(rex = def_rex) ?pat ?(pos = 0)
  425. ?callout subj =
  426. let rex = match pat with Some str -> regexp str | _ -> rex in
  427. let iflags = match flags with Some flags -> rflags flags | _ -> iflags in
  428. let _, ovector = make_ovector rex in
  429. unsafe_pcre2_match iflags rex ~pos ~subj_start:0 ~subj ovector callout;
  430. ovector
  431. let exec ?iflags ?flags ?rex ?pat ?pos ?callout subj =
  432. subj, pcre2_match ?iflags ?flags ?rex ?pat ?pos ?callout subj
  433. let next_match ?iflags ?flags ?rex ?pat ?(pos = 0) ?callout (subj, ovector) =
  434. let pos = Array.unsafe_get ovector 1 + pos in
  435. let subj_len = String.length subj in
  436. if pos < 0 || pos > subj_len then
  437. invalid_arg "Pcre2.next_match: illegal offset";
  438. subj, pcre2_match ?iflags ?flags ?rex ?pat ~pos ?callout subj
  439. let rec copy_lst ar n = function
  440. | [] -> ar
  441. | h :: t -> Array.unsafe_set ar n h; copy_lst ar (n - 1) t
  442. let exec_all ?(iflags = 0L) ?flags ?(rex = def_rex) ?pat ?pos ?callout subj =
  443. let rex = match pat with Some str -> regexp str | _ -> rex in
  444. let iflags = match flags with Some flags -> rflags flags | _ -> iflags in
  445. let (_, ovector as sstrs) = exec ~iflags ~rex ?pos ?callout subj in
  446. let null_flags = Int64.logor iflags 0x00000004L in (* `NOTEMPTY *)
  447. let subj_len = String.length subj in
  448. let rec loop pos (subj, ovector as sstrs) n lst =
  449. let maybe_ovector =
  450. try
  451. let first = Array.unsafe_get ovector 0 in
  452. if first = pos && Array.unsafe_get ovector 1 = pos then
  453. if pos = subj_len then None
  454. else Some (pcre2_match ~iflags:null_flags ~rex ~pos ?callout subj)
  455. else Some (pcre2_match ~iflags ~rex ~pos ?callout subj)
  456. with Not_found -> None in
  457. match maybe_ovector with
  458. | Some ovector ->
  459. let new_pos = Array.unsafe_get ovector 1 in
  460. loop new_pos (subj, ovector) (n + 1) (sstrs :: lst)
  461. | None -> copy_lst (Array.make (n + 1) sstrs) (n - 1) lst in
  462. loop (Array.unsafe_get ovector 1) sstrs 0 []
  463. let extract ?iflags ?flags ?rex ?pat ?pos ?full_match ?callout subj =
  464. get_substrings ?full_match (exec ?iflags ?flags ?rex ?pat ?pos ?callout subj)
  465. let extract_opt ?iflags ?flags ?rex ?pat ?pos ?full_match ?callout subj =
  466. get_opt_substrings
  467. ?full_match (exec ?iflags ?flags ?rex ?pat ?pos ?callout subj)
  468. let extract_all ?iflags ?flags ?rex ?pat ?pos ?full_match ?callout subj =
  469. let many_sstrs = exec_all ?iflags ?flags ?rex ?pat ?pos ?callout subj in
  470. Array.map (get_substrings ?full_match) many_sstrs
  471. let extract_all_opt ?iflags ?flags ?rex ?pat ?pos ?full_match ?callout subj =
  472. let many_sstrs = exec_all ?iflags ?flags ?rex ?pat ?pos ?callout subj in
  473. Array.map (get_opt_substrings ?full_match) many_sstrs
  474. let pmatch ?iflags ?flags ?rex ?pat ?pos ?callout subj =
  475. try ignore (pcre2_match ?iflags ?flags ?rex ?pat ?pos ?callout subj); true
  476. with Not_found -> false
  477. (* String substitution *)
  478. (* Elements of a substitution pattern *)
  479. type subst =
  480. | SubstString of int * int (* Denotes a substring in the substitution *)
  481. | Backref of int (* nth backreference ($0 is program name!) *)
  482. | Match (* The whole matched string *)
  483. | PreMatch (* The string before the match *)
  484. | PostMatch (* The string after the match *)
  485. | LastParenMatch (* The last matched group *)
  486. (* Information on substitution patterns *)
  487. type substitution = string (* The substitution string *)
  488. * int (* Highest group number of backreferences *)
  489. * bool (* Makes use of "LastParenMatch" *)
  490. * subst list (* The list of substitution elements *)
  491. (* Only used internally in "subst" *)
  492. exception FoundAt of int
  493. let zero = Char.code '0'
  494. let subst str =
  495. let max_br = ref 0 in
  496. let with_lp = ref false in
  497. let lix = String.length str - 1 in
  498. let rec loop acc n =
  499. if lix < n then acc
  500. else
  501. try
  502. for i = n to lix do
  503. if String.unsafe_get str i = '$' then raise (FoundAt i)
  504. done;
  505. SubstString (n, lix - n + 1) :: acc
  506. with FoundAt i ->
  507. if i = lix then SubstString (n, lix - n + 1) :: acc
  508. else
  509. let i1 = i + 1 in
  510. let acc = if n = i then acc else SubstString (n, i - n) :: acc in
  511. match String.unsafe_get str i1 with
  512. | '0'..'9' as c ->
  513. let subpat_nr = ref (Char.code c - zero) in
  514. (try
  515. for j = i1 + 1 to lix do
  516. let c = String.unsafe_get str j in
  517. if c >= '0' && c <= '9' then
  518. subpat_nr := 10 * !subpat_nr + Char.code c - zero
  519. else raise (FoundAt j)
  520. done;
  521. max_br := max !subpat_nr !max_br;
  522. Backref !subpat_nr :: acc
  523. with FoundAt j ->
  524. max_br := max !subpat_nr !max_br;
  525. loop (Backref !subpat_nr :: acc) j)
  526. | '!' -> loop acc (i1 + 1)
  527. | '$' -> loop (SubstString (i1, 1) :: acc) (i1 + 1)
  528. | '&' -> loop (Match :: acc) (i1 + 1)
  529. | '`' -> loop (PreMatch :: acc) (i1 + 1)
  530. | '\'' -> loop (PostMatch :: acc) (i1 + 1)
  531. | '+' ->
  532. with_lp := true;
  533. loop (LastParenMatch :: acc) (i1 + 1)
  534. | _ -> loop acc i1 in
  535. let subst_lst = loop [] 0 in
  536. str, !max_br, !with_lp, subst_lst
  537. let def_subst = subst ""
  538. (* Calculates a list of tuples (str, offset, len) which contain
  539. substrings to be copied on substitutions. Internal use only! *)
  540. let calc_trans_lst subgroups2 ovector subj templ subst_lst =
  541. let prefix_len = Array.unsafe_get ovector 0 in
  542. let last = Array.unsafe_get ovector 1 in
  543. let coll (res_len, trans_lst as accu) =
  544. let return_lst (_str, _ix, len as el) =
  545. if len = 0 then accu else res_len + len, el :: trans_lst in
  546. function
  547. | SubstString (ix, len) -> return_lst (templ, ix, len)
  548. | Backref 0 ->
  549. let prog_name = Sys.argv.(0) in
  550. return_lst (prog_name, 0, String.length prog_name)
  551. | Backref n ->
  552. let offset = n lsl 1 in
  553. let start = Array.unsafe_get ovector offset in
  554. let len = Array.unsafe_get ovector (offset + 1) - start in
  555. return_lst (subj, start, len)
  556. | Match -> return_lst (subj, prefix_len, last - prefix_len)
  557. | PreMatch -> return_lst (subj, 0, prefix_len)
  558. | PostMatch -> return_lst (subj, last, String.length subj - last)
  559. | LastParenMatch ->
  560. let subgroups2_2 = subgroups2 - 2 in
  561. let pos = ref subgroups2_2 in
  562. let ix = ref (Array.unsafe_get ovector subgroups2_2) in
  563. while !ix < 0 do
  564. let pos_2 = !pos - 2 in
  565. pos := pos_2;
  566. ix := Array.unsafe_get ovector pos_2
  567. done;
  568. return_lst (subj, !ix, Array.unsafe_get ovector (!pos + 1) - !ix) in
  569. List.fold_left coll (0, []) subst_lst
  570. let replace ?(iflags = 0L) ?flags ?(rex = def_rex) ?pat
  571. ?(pos = 0) ?(itempl = def_subst) ?templ ?callout subj =
  572. let rex = match pat with Some str -> regexp str | _ -> rex in
  573. let iflags = match flags with Some flags -> rflags flags | _ -> iflags in
  574. let templ, max_br, with_lp, subst_lst =
  575. match templ with
  576. | Some str -> subst str
  577. | _ -> itempl in
  578. let subj_len = String.length subj in
  579. if pos < 0 || pos > subj_len then invalid_arg "Pcre2.replace: illegal offset";
  580. let subgroups2, ovector = make_ovector rex in
  581. let nsubs = (subgroups2 lsr 1) - 1 in
  582. if max_br > nsubs then
  583. failwith "Pcre2.replace: backreference denotes nonexistent subpattern";
  584. if with_lp && nsubs = 0 then failwith "Pcre2.replace: no backreferences";
  585. let rec loop full_len trans_lsts cur_pos =
  586. if
  587. cur_pos > subj_len ||
  588. try
  589. unsafe_pcre2_match
  590. iflags rex ~pos:cur_pos ~subj_start:0 ~subj
  591. ovector callout;
  592. false
  593. with Not_found -> true
  594. then
  595. let postfix_len = max (subj_len - cur_pos) 0 in
  596. let left = pos + full_len in
  597. let res = Bytes.create (left + postfix_len) in
  598. bytes_unsafe_blit_string subj 0 res 0 pos;
  599. bytes_unsafe_blit_string subj cur_pos res left postfix_len;
  600. let inner_coll ofs (templ, ix, len) =
  601. bytes_unsafe_blit_string templ ix res ofs len; ofs + len in
  602. let coll ofs (res_len, trans_lst) =
  603. let new_ofs = ofs - res_len in
  604. let _ = List.fold_left inner_coll new_ofs trans_lst in
  605. new_ofs in
  606. let _ = List.fold_left coll left trans_lsts in
  607. Bytes.unsafe_to_string res
  608. else
  609. let first = Array.unsafe_get ovector 0 in
  610. let len = first - cur_pos in
  611. let res_len, _ as trans_lst_el =
  612. calc_trans_lst subgroups2 ovector subj templ subst_lst in
  613. let trans_lsts =
  614. if len > 0 then
  615. trans_lst_el :: (len, [(subj, cur_pos, len)]) :: trans_lsts
  616. else trans_lst_el :: trans_lsts in
  617. let full_len = full_len + len + res_len in
  618. let next = first + 1 in
  619. let last = Array.unsafe_get ovector 1 in
  620. if last < next then
  621. if first < subj_len then
  622. let new_trans_lsts = (1, [(subj, cur_pos + len, 1)]) :: trans_lsts in
  623. loop (full_len + 1) new_trans_lsts next
  624. else loop full_len trans_lsts next
  625. else loop full_len trans_lsts last in
  626. loop 0 [] pos
  627. let qreplace ?(iflags = 0L) ?flags ?(rex = def_rex) ?pat
  628. ?(pos = 0) ?(templ = "") ?callout subj =
  629. let rex = match pat with Some str -> regexp str | _ -> rex in
  630. let iflags = match flags with Some flags -> rflags flags | _ -> iflags in
  631. let subj_len = String.length subj in
  632. if pos < 0 || pos > subj_len then invalid_arg "Pcre2.qreplace: illegal offset";
  633. let templ_len = String.length templ in
  634. let _, ovector = make_ovector rex in
  635. let rec loop full_len subst_lst cur_pos =
  636. if
  637. cur_pos > subj_len ||
  638. try
  639. unsafe_pcre2_match
  640. iflags rex ~pos:cur_pos ~subj_start:0 ~subj ovector callout;
  641. false
  642. with Not_found -> true
  643. then
  644. let postfix_len = max (subj_len - cur_pos) 0 in
  645. let left = pos + full_len in
  646. let res = Bytes.create (left + postfix_len) in
  647. bytes_unsafe_blit_string subj 0 res 0 pos;
  648. bytes_unsafe_blit_string subj cur_pos res left postfix_len;
  649. let coll ofs = function
  650. | Some (substr, ix, len) ->
  651. let new_ofs = ofs - len in
  652. bytes_unsafe_blit_string substr ix res new_ofs len;
  653. new_ofs
  654. | None ->
  655. let new_ofs = ofs - templ_len in
  656. bytes_unsafe_blit_string templ 0 res new_ofs templ_len;
  657. new_ofs in
  658. let _ = List.fold_left coll left subst_lst in
  659. Bytes.unsafe_to_string res
  660. else
  661. let first = Array.unsafe_get ovector 0 in
  662. let len = first - cur_pos in
  663. let subst_lst =
  664. if len > 0 then None :: Some (subj, cur_pos, len) :: subst_lst
  665. else None :: subst_lst in
  666. let last = Array.unsafe_get ovector 1 in
  667. let full_len = full_len + len + templ_len in
  668. let next = first + 1 in
  669. if last < next then
  670. if first < subj_len then
  671. loop (full_len + 1) (Some (subj, cur_pos + len, 1) :: subst_lst) next
  672. else loop full_len subst_lst next
  673. else loop full_len subst_lst last in
  674. loop 0 [] pos
  675. let substitute_substrings ?(iflags = 0L) ?flags ?(rex = def_rex) ?pat
  676. ?(pos = 0) ?callout ~subst subj =
  677. let rex = match pat with Some str -> regexp str | _ -> rex in
  678. let iflags = match flags with Some flags -> rflags flags | _ -> iflags in
  679. let subj_len = String.length subj in
  680. if pos < 0 || pos > subj_len then invalid_arg "Pcre2.substitute: illegal offset";
  681. let _, ovector = make_ovector rex in
  682. let rec loop full_len subst_lst cur_pos =
  683. if
  684. cur_pos > subj_len ||
  685. try
  686. unsafe_pcre2_match
  687. iflags rex ~pos:cur_pos ~subj_start:0 ~subj ovector callout;
  688. false
  689. with Not_found -> true
  690. then
  691. let postfix_len = max (subj_len - cur_pos) 0 in
  692. let left = pos + full_len in
  693. let res = Bytes.create (left + postfix_len) in
  694. bytes_unsafe_blit_string subj 0 res 0 pos;
  695. bytes_unsafe_blit_string subj cur_pos res left postfix_len;
  696. let coll ofs (templ, ix, len) =
  697. let new_ofs = ofs - len in
  698. bytes_unsafe_blit_string templ ix res new_ofs len;
  699. new_ofs in
  700. let _ = List.fold_left coll left subst_lst in
  701. Bytes.unsafe_to_string res
  702. else
  703. let first = Array.unsafe_get ovector 0 in
  704. let len = first - cur_pos in
  705. let templ = subst (subj, ovector) in
  706. let templ_len = String.length templ in
  707. let subst_lst =
  708. if len > 0 then
  709. (templ, 0, templ_len) :: (subj, cur_pos, len) :: subst_lst
  710. else (templ, 0, templ_len) :: subst_lst in
  711. let last = Array.unsafe_get ovector 1 in
  712. let full_len = full_len + len + templ_len in
  713. let next = first + 1 in
  714. if last < next then
  715. if first < subj_len then
  716. loop (full_len + 1) ((subj, cur_pos + len, 1) :: subst_lst) next
  717. else loop full_len subst_lst next
  718. else loop full_len subst_lst last in
  719. loop 0 [] pos
  720. let substitute ?iflags ?flags ?rex ?pat ?pos ?callout ~subst:str_subst subj =
  721. let subst (subj, ovector) =
  722. let first = Array.unsafe_get ovector 0 in
  723. let last = Array.unsafe_get ovector 1 in
  724. str_subst (string_unsafe_sub subj first (last - first)) in
  725. substitute_substrings ?iflags ?flags ?rex ?pat ?pos ?callout ~subst subj
  726. let replace_first ?(iflags = 0L) ?flags ?(rex = def_rex) ?pat ?(pos = 0)
  727. ?(itempl = def_subst) ?templ ?callout subj =
  728. let rex = match pat with Some str -> regexp str | _ -> rex in
  729. let iflags = match flags with Some flags -> rflags flags | _ -> iflags in
  730. let templ, max_br, with_lp, subst_lst =
  731. match templ with
  732. | Some str -> subst str
  733. | _ -> itempl in
  734. let subgroups2, ovector = make_ovector rex in
  735. let nsubs = (subgroups2 lsr 1) - 1 in
  736. if max_br > nsubs then
  737. failwith "Pcre2.replace_first: backreference denotes nonexistent subpattern";
  738. if with_lp && nsubs = 0 then failwith "Pcre2.replace_first: no backreferences";
  739. try
  740. unsafe_pcre2_match iflags rex ~pos ~subj_start:0 ~subj ovector callout;
  741. let res_len, trans_lst =
  742. calc_trans_lst subgroups2 ovector subj templ subst_lst in
  743. let first = Array.unsafe_get ovector 0 in
  744. let last = Array.unsafe_get ovector 1 in
  745. let rest = String.length subj - last in
  746. let res = Bytes.create (first + res_len + rest) in
  747. bytes_unsafe_blit_string subj 0 res 0 first;
  748. let coll ofs (templ, ix, len) =
  749. bytes_unsafe_blit_string templ ix res ofs len; ofs + len in
  750. let ofs = List.fold_left coll first trans_lst in
  751. bytes_unsafe_blit_string subj last res ofs rest;
  752. Bytes.unsafe_to_string res
  753. with Not_found -> subj
  754. let qreplace_first ?(iflags = 0L) ?flags ?(rex = def_rex) ?pat
  755. ?(pos = 0) ?(templ = "") ?callout subj =
  756. let rex = match pat with Some str -> regexp str | _ -> rex in
  757. let iflags = match flags with Some flags -> rflags flags | _ -> iflags in
  758. let _, ovector = make_ovector rex in
  759. try
  760. unsafe_pcre2_match iflags rex ~pos ~subj_start:0 ~subj ovector callout;
  761. let first = Array.unsafe_get ovector 0 in
  762. let last = Array.unsafe_get ovector 1 in
  763. let len = String.length templ in
  764. let rest = String.length subj - last in
  765. let postfix_start = first + len in
  766. let res = Bytes.create (postfix_start + rest) in
  767. bytes_unsafe_blit_string subj 0 res 0 first;
  768. bytes_unsafe_blit_string templ 0 res first len;
  769. bytes_unsafe_blit_string subj last res postfix_start rest;
  770. Bytes.unsafe_to_string res
  771. with Not_found -> subj
  772. let substitute_substrings_first ?(iflags = 0L) ?flags ?(rex = def_rex) ?pat
  773. ?(pos = 0) ?callout ~subst subj =
  774. let rex = match pat with Some str -> regexp str | _ -> rex in
  775. let iflags = match flags with Some flags -> rflags flags | _ -> iflags in
  776. let _, ovector = make_ovector rex in
  777. try
  778. unsafe_pcre2_match iflags rex ~pos ~subj_start:0 ~subj ovector callout;
  779. let subj_len = String.length subj in
  780. let prefix_len = Array.unsafe_get ovector 0 in
  781. let last = Array.unsafe_get ovector 1 in
  782. let templ = subst (subj, ovector) in
  783. let postfix_len = subj_len - last in
  784. let templ_len = String.length templ in
  785. let postfix_start = prefix_len + templ_len in
  786. let res = Bytes.create (postfix_start + postfix_len) in
  787. bytes_unsafe_blit_string subj 0 res 0 prefix_len;
  788. bytes_unsafe_blit_string templ 0 res prefix_len templ_len;
  789. bytes_unsafe_blit_string subj last res postfix_start postfix_len;
  790. Bytes.unsafe_to_string res
  791. with Not_found -> subj
  792. let substitute_first ?iflags ?flags ?rex ?pat ?pos
  793. ?callout ~subst:str_subst subj =
  794. let subst (subj, ovector) =
  795. let first = Array.unsafe_get ovector 0 in
  796. let last = Array.unsafe_get ovector 1 in
  797. str_subst (string_unsafe_sub subj first (last - first)) in
  798. substitute_substrings_first
  799. ?iflags ?flags ?rex ?pat ?pos ?callout ~subst subj
  800. (* Splitting *)
  801. let internal_psplit flags rex max pos callout subj =
  802. let subj_len = String.length subj in
  803. if subj_len = 0 then []
  804. else if max = 1 then [subj]
  805. else
  806. let subgroups2, ovector = make_ovector rex in
  807. (* Adds contents of subgroups to the string accumulator *)
  808. let handle_subgroups strs =
  809. let strs = ref strs in
  810. let i = ref 2 in
  811. while !i < subgroups2 do
  812. let first = Array.unsafe_get ovector !i in
  813. incr i;
  814. let last = Array.unsafe_get ovector !i in
  815. let str =
  816. if first < 0 then ""
  817. else string_unsafe_sub subj first (last - first) in
  818. strs := str :: !strs; incr i
  819. done;
  820. !strs in
  821. (* Performs the recursive split *)
  822. let rec loop strs cnt pos prematch =
  823. let len = subj_len - pos in
  824. if len < 0 then strs
  825. else
  826. (* Checks termination due to max restriction *)
  827. if cnt = 0 then
  828. if prematch &&
  829. try
  830. unsafe_pcre2_match
  831. flags rex ~pos ~subj_start:pos ~subj ovector callout;
  832. true
  833. with Not_found -> false
  834. then
  835. let last = Array.unsafe_get ovector 1 in
  836. let strs = handle_subgroups strs in
  837. string_unsafe_sub subj last (subj_len - last) :: strs
  838. else string_unsafe_sub subj pos len :: strs
  839. (* Calculates next accumulator state for splitting *)
  840. else
  841. if
  842. try
  843. unsafe_pcre2_match
  844. flags rex ~pos ~subj_start:pos ~subj ovector callout;
  845. false
  846. with Not_found -> true
  847. then string_unsafe_sub subj pos len :: strs
  848. else
  849. let first = Array.unsafe_get ovector 0 in
  850. let last = Array.unsafe_get ovector 1 in
  851. if first = pos then
  852. if last = pos then
  853. let strs = if prematch then handle_subgroups strs else strs in
  854. if len = 0 then "" :: strs
  855. else if
  856. try
  857. unsafe_pcre2_match
  858. (* `ANCHORED | `NOTEMPTY *)
  859. (Int64.logor flags 0x80000004L) rex ~pos ~subj_start:pos ~subj
  860. ovector callout;
  861. true
  862. with Not_found -> false
  863. then
  864. let new_strs = handle_subgroups ("" :: strs) in
  865. loop new_strs (cnt - 1) (Array.unsafe_get ovector 1) false
  866. else
  867. let new_strs = string_unsafe_sub subj pos 1 :: strs in
  868. loop new_strs (cnt - 1) (pos + 1) true
  869. else
  870. if prematch then loop (handle_subgroups strs) cnt last false
  871. else loop (handle_subgroups ("" :: strs)) (cnt - 1) last false
  872. else
  873. let new_strs = string_unsafe_sub subj pos (first - pos) :: strs in
  874. loop (handle_subgroups new_strs) (cnt - 1) last false in
  875. loop [] (max - 1) pos false
  876. let rec strip_all_empty = function "" :: t -> strip_all_empty t | l -> l
  877. external isspace : char -> bool = "pcre2_isspace_stub" [@@noalloc]
  878. let rec find_no_space ix len str =
  879. if ix = len || not (isspace (String.unsafe_get str ix)) then ix
  880. else find_no_space (ix + 1) len str
  881. let split ?(iflags = 0L) ?flags ?rex ?pat ?(pos = 0) ?(max = 0) ?callout subj =
  882. let iflags = match flags with Some flags -> rflags flags | _ -> iflags in
  883. let res =
  884. match pat, rex with
  885. | Some str, _ -> internal_psplit iflags (regexp str) max pos callout subj
  886. | _, Some rex -> internal_psplit iflags rex max pos callout subj
  887. | _ ->
  888. (* special case for Perl-splitting semantics *)
  889. let len = String.length subj in
  890. if pos > len || pos < 0 then failwith "Pcre2.split: illegal offset";
  891. let new_pos = find_no_space pos len subj in
  892. internal_psplit iflags def_rex max new_pos callout subj in
  893. List.rev (if max = 0 then strip_all_empty res else res)
  894. let asplit ?iflags ?flags ?rex ?pat ?pos ?max ?callout subj =
  895. Array.of_list (split ?iflags ?flags ?rex ?pat ?pos ?max ?callout subj)
  896. (* Full splitting *)
  897. type split_result = Text of string
  898. | Delim of string
  899. | Group of int * string
  900. | NoGroup
  901. let rec strip_all_empty_full = function
  902. | Delim _ :: rest -> strip_all_empty_full rest
  903. | l -> l
  904. let full_split ?(iflags = 0L) ?flags ?(rex = def_rex) ?pat
  905. ?(pos = 0) ?(max = 0) ?callout subj =
  906. let rex = match pat with Some str -> regexp str | _ -> rex in
  907. let iflags = match flags with Some flags -> rflags flags | _ -> iflags in
  908. let subj_len = String.length subj in
  909. if subj_len = 0 then []
  910. else if max = 1 then [Text (subj)]
  911. else
  912. let subgroups2, ovector = make_ovector rex in
  913. (* Adds contents of subgroups to the string accumulator *)
  914. let handle_subgroups strs =
  915. let strs = ref strs in
  916. let i = ref 2 in
  917. while !i < subgroups2 do
  918. let group_nr = !i lsr 1 in
  919. let first = Array.unsafe_get ovector !i in
  920. incr i;
  921. let last = Array.unsafe_get ovector !i in
  922. let str =
  923. if first < 0 then NoGroup
  924. else
  925. let group_str = string_unsafe_sub subj first (last - first) in
  926. Group (group_nr, group_str) in
  927. strs := str :: !strs; incr i
  928. done;
  929. !strs in
  930. (* Performs the recursive split *)
  931. let rec loop strs cnt pos prematch =
  932. let len = subj_len - pos in
  933. if len < 0 then strs
  934. else
  935. (* Checks termination due to max restriction *)
  936. if cnt = 0 then
  937. if prematch &&
  938. try
  939. unsafe_pcre2_match
  940. iflags rex ~pos ~subj_start:pos ~subj ovector callout;
  941. true
  942. with Not_found -> false
  943. then
  944. let first = Array.unsafe_get ovector 0 in
  945. let last = Array.unsafe_get ovector 1 in
  946. let delim = Delim (string_unsafe_sub subj first (last - first)) in
  947. Text (string_unsafe_sub subj last (subj_len - last))
  948. :: handle_subgroups (delim :: strs)
  949. else
  950. if len = 0 then strs
  951. else Text (string_unsafe_sub subj pos len) :: strs
  952. (* Calculates next accumulator state for splitting *)
  953. else
  954. if
  955. try
  956. unsafe_pcre2_match
  957. iflags rex ~pos ~subj_start:pos ~subj ovector callout;
  958. false
  959. with Not_found -> true
  960. then
  961. if len = 0 then strs
  962. else Text (string_unsafe_sub subj pos len) :: strs
  963. else
  964. let first = Array.unsafe_get ovector 0 in
  965. let last = Array.unsafe_get ovector 1 in
  966. if first = pos then
  967. if last = pos then
  968. if len = 0 then handle_subgroups (Delim "" :: strs)
  969. else
  970. let empty_groups = handle_subgroups [] in
  971. if
  972. try
  973. unsafe_pcre2_match
  974. (* `ANCHORED | `NOTEMPTY *)
  975. (Int64.logor iflags 0x80000004L) rex ~pos ~subj_start:pos ~subj
  976. ovector callout;
  977. true
  978. with Not_found -> false
  979. then
  980. let first = Array.unsafe_get ovector 0 in
  981. let last = Array.unsafe_get ovector 1 in
  982. let delim =
  983. Delim (string_unsafe_sub subj first (last - first)) in
  984. let new_strs =
  985. handle_subgroups (
  986. delim :: (if prematch then strs
  987. else empty_groups @ (Delim "" :: strs))) in
  988. loop new_strs (cnt - 1) last false
  989. else
  990. let new_strs =
  991. Text (string_unsafe_sub subj pos 1)
  992. :: empty_groups @ Delim "" :: strs in
  993. loop new_strs (cnt - 1) (pos + 1) true
  994. else
  995. let delim =
  996. Delim (string_unsafe_sub subj first (last - first)) in
  997. loop (handle_subgroups (delim :: strs)) cnt last false
  998. else
  999. let delim = Delim (string_unsafe_sub subj first (last - first)) in
  1000. let pre_strs =
  1001. Text (string_unsafe_sub subj pos (first - pos)) :: strs in
  1002. loop
  1003. (handle_subgroups (delim :: pre_strs)) (cnt - 1) last false in
  1004. let res = loop [] (max - 1) pos true in
  1005. List.rev (if max = 0 then strip_all_empty_full res else res)
  1006. (* Additional convenience functions useful in combination with this library *)
  1007. let foreach_line ?(ic = stdin) f =
  1008. try while true do f (input_line ic) done with End_of_file -> ()
  1009. let foreach_file filenames f =
  1010. let do_with_file filename =
  1011. let file = open_in filename in
  1012. try f filename file; close_in file
  1013. with exn -> close_in file; raise exn in
  1014. List.iter do_with_file filenames