pcrecpp.cc 30 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871
  1. // Copyright (c) 2005, Google Inc.
  2. // All rights reserved.
  3. //
  4. // Redistribution and use in source and binary forms, with or without
  5. // modification, are permitted provided that the following conditions are
  6. // met:
  7. //
  8. // * Redistributions of source code must retain the above copyright
  9. // notice, this list of conditions and the following disclaimer.
  10. // * Redistributions in binary form must reproduce the above
  11. // copyright notice, this list of conditions and the following disclaimer
  12. // in the documentation and/or other materials provided with the
  13. // distribution.
  14. // * Neither the name of Google Inc. nor the names of its
  15. // contributors may be used to endorse or promote products derived from
  16. // this software without specific prior written permission.
  17. //
  18. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  19. // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  20. // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  21. // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  22. // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  23. // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  24. // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  25. // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  26. // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  27. // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  28. // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29. //
  30. // Author: Sanjay Ghemawat
  31. #ifdef HAVE_CONFIG_H
  32. #include "config.h"
  33. #endif
  34. #include <stdlib.h>
  35. #include <stdio.h>
  36. #include <ctype.h>
  37. #include <limits.h> /* for SHRT_MIN, USHRT_MAX, etc */
  38. #include <assert.h>
  39. #include <errno.h>
  40. #include <string>
  41. #include <algorithm>
  42. #include "pcrecpp_internal.h"
  43. #include "pcre.h"
  44. #include "pcrecpp.h"
  45. #include "pcre_stringpiece.h"
  46. namespace pcrecpp {
  47. // Maximum number of args we can set
  48. static const int kMaxArgs = 16;
  49. static const int kVecSize = (1 + kMaxArgs) * 3; // results + PCRE workspace
  50. // Special object that stands-in for no argument
  51. Arg RE::no_arg((void*)NULL);
  52. // If a regular expression has no error, its error_ field points here
  53. static const string empty_string;
  54. // If the user doesn't ask for any options, we just use this one
  55. static RE_Options default_options;
  56. void RE::Init(const string& pat, const RE_Options* options) {
  57. pattern_ = pat;
  58. if (options == NULL) {
  59. options_ = default_options;
  60. } else {
  61. options_ = *options;
  62. }
  63. error_ = &empty_string;
  64. re_full_ = NULL;
  65. re_partial_ = NULL;
  66. re_partial_ = Compile(UNANCHORED);
  67. if (re_partial_ != NULL) {
  68. re_full_ = Compile(ANCHOR_BOTH);
  69. }
  70. }
  71. void RE::Cleanup() {
  72. if (re_full_ != NULL) (*pcre_free)(re_full_);
  73. if (re_partial_ != NULL) (*pcre_free)(re_partial_);
  74. if (error_ != &empty_string) delete error_;
  75. }
  76. RE::~RE() {
  77. Cleanup();
  78. }
  79. pcre* RE::Compile(Anchor anchor) {
  80. // First, convert RE_Options into pcre options
  81. int pcre_options = 0;
  82. pcre_options = options_.all_options();
  83. // Special treatment for anchoring. This is needed because at
  84. // runtime pcre only provides an option for anchoring at the
  85. // beginning of a string (unless you use offset).
  86. //
  87. // There are three types of anchoring we want:
  88. // UNANCHORED Compile the original pattern, and use
  89. // a pcre unanchored match.
  90. // ANCHOR_START Compile the original pattern, and use
  91. // a pcre anchored match.
  92. // ANCHOR_BOTH Tack a "\z" to the end of the original pattern
  93. // and use a pcre anchored match.
  94. const char* compile_error;
  95. int eoffset;
  96. pcre* re;
  97. if (anchor != ANCHOR_BOTH) {
  98. re = pcre_compile(pattern_.c_str(), pcre_options,
  99. &compile_error, &eoffset, NULL);
  100. } else {
  101. // Tack a '\z' at the end of RE. Parenthesize it first so that
  102. // the '\z' applies to all top-level alternatives in the regexp.
  103. string wrapped = "(?:"; // A non-counting grouping operator
  104. wrapped += pattern_;
  105. wrapped += ")\\z";
  106. re = pcre_compile(wrapped.c_str(), pcre_options,
  107. &compile_error, &eoffset, NULL);
  108. }
  109. if (re == NULL) {
  110. if (error_ == &empty_string) error_ = new string(compile_error);
  111. }
  112. return re;
  113. }
  114. /***** Matching interfaces *****/
  115. bool RE::FullMatch(const StringPiece& text,
  116. const Arg& ptr1,
  117. const Arg& ptr2,
  118. const Arg& ptr3,
  119. const Arg& ptr4,
  120. const Arg& ptr5,
  121. const Arg& ptr6,
  122. const Arg& ptr7,
  123. const Arg& ptr8,
  124. const Arg& ptr9,
  125. const Arg& ptr10,
  126. const Arg& ptr11,
  127. const Arg& ptr12,
  128. const Arg& ptr13,
  129. const Arg& ptr14,
  130. const Arg& ptr15,
  131. const Arg& ptr16) const {
  132. const Arg* args[kMaxArgs];
  133. int n = 0;
  134. if (&ptr1 == &no_arg) goto done; args[n++] = &ptr1;
  135. if (&ptr2 == &no_arg) goto done; args[n++] = &ptr2;
  136. if (&ptr3 == &no_arg) goto done; args[n++] = &ptr3;
  137. if (&ptr4 == &no_arg) goto done; args[n++] = &ptr4;
  138. if (&ptr5 == &no_arg) goto done; args[n++] = &ptr5;
  139. if (&ptr6 == &no_arg) goto done; args[n++] = &ptr6;
  140. if (&ptr7 == &no_arg) goto done; args[n++] = &ptr7;
  141. if (&ptr8 == &no_arg) goto done; args[n++] = &ptr8;
  142. if (&ptr9 == &no_arg) goto done; args[n++] = &ptr9;
  143. if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10;
  144. if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11;
  145. if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12;
  146. if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13;
  147. if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14;
  148. if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15;
  149. if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16;
  150. done:
  151. int consumed;
  152. int vec[kVecSize];
  153. return DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize);
  154. }
  155. bool RE::PartialMatch(const StringPiece& text,
  156. const Arg& ptr1,
  157. const Arg& ptr2,
  158. const Arg& ptr3,
  159. const Arg& ptr4,
  160. const Arg& ptr5,
  161. const Arg& ptr6,
  162. const Arg& ptr7,
  163. const Arg& ptr8,
  164. const Arg& ptr9,
  165. const Arg& ptr10,
  166. const Arg& ptr11,
  167. const Arg& ptr12,
  168. const Arg& ptr13,
  169. const Arg& ptr14,
  170. const Arg& ptr15,
  171. const Arg& ptr16) const {
  172. const Arg* args[kMaxArgs];
  173. int n = 0;
  174. if (&ptr1 == &no_arg) goto done; args[n++] = &ptr1;
  175. if (&ptr2 == &no_arg) goto done; args[n++] = &ptr2;
  176. if (&ptr3 == &no_arg) goto done; args[n++] = &ptr3;
  177. if (&ptr4 == &no_arg) goto done; args[n++] = &ptr4;
  178. if (&ptr5 == &no_arg) goto done; args[n++] = &ptr5;
  179. if (&ptr6 == &no_arg) goto done; args[n++] = &ptr6;
  180. if (&ptr7 == &no_arg) goto done; args[n++] = &ptr7;
  181. if (&ptr8 == &no_arg) goto done; args[n++] = &ptr8;
  182. if (&ptr9 == &no_arg) goto done; args[n++] = &ptr9;
  183. if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10;
  184. if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11;
  185. if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12;
  186. if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13;
  187. if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14;
  188. if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15;
  189. if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16;
  190. done:
  191. int consumed;
  192. int vec[kVecSize];
  193. return DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize);
  194. }
  195. bool RE::Consume(StringPiece* input,
  196. const Arg& ptr1,
  197. const Arg& ptr2,
  198. const Arg& ptr3,
  199. const Arg& ptr4,
  200. const Arg& ptr5,
  201. const Arg& ptr6,
  202. const Arg& ptr7,
  203. const Arg& ptr8,
  204. const Arg& ptr9,
  205. const Arg& ptr10,
  206. const Arg& ptr11,
  207. const Arg& ptr12,
  208. const Arg& ptr13,
  209. const Arg& ptr14,
  210. const Arg& ptr15,
  211. const Arg& ptr16) const {
  212. const Arg* args[kMaxArgs];
  213. int n = 0;
  214. if (&ptr1 == &no_arg) goto done; args[n++] = &ptr1;
  215. if (&ptr2 == &no_arg) goto done; args[n++] = &ptr2;
  216. if (&ptr3 == &no_arg) goto done; args[n++] = &ptr3;
  217. if (&ptr4 == &no_arg) goto done; args[n++] = &ptr4;
  218. if (&ptr5 == &no_arg) goto done; args[n++] = &ptr5;
  219. if (&ptr6 == &no_arg) goto done; args[n++] = &ptr6;
  220. if (&ptr7 == &no_arg) goto done; args[n++] = &ptr7;
  221. if (&ptr8 == &no_arg) goto done; args[n++] = &ptr8;
  222. if (&ptr9 == &no_arg) goto done; args[n++] = &ptr9;
  223. if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10;
  224. if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11;
  225. if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12;
  226. if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13;
  227. if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14;
  228. if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15;
  229. if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16;
  230. done:
  231. int consumed;
  232. int vec[kVecSize];
  233. if (DoMatchImpl(*input, ANCHOR_START, &consumed,
  234. args, n, vec, kVecSize)) {
  235. input->remove_prefix(consumed);
  236. return true;
  237. } else {
  238. return false;
  239. }
  240. }
  241. bool RE::FindAndConsume(StringPiece* input,
  242. const Arg& ptr1,
  243. const Arg& ptr2,
  244. const Arg& ptr3,
  245. const Arg& ptr4,
  246. const Arg& ptr5,
  247. const Arg& ptr6,
  248. const Arg& ptr7,
  249. const Arg& ptr8,
  250. const Arg& ptr9,
  251. const Arg& ptr10,
  252. const Arg& ptr11,
  253. const Arg& ptr12,
  254. const Arg& ptr13,
  255. const Arg& ptr14,
  256. const Arg& ptr15,
  257. const Arg& ptr16) const {
  258. const Arg* args[kMaxArgs];
  259. int n = 0;
  260. if (&ptr1 == &no_arg) goto done; args[n++] = &ptr1;
  261. if (&ptr2 == &no_arg) goto done; args[n++] = &ptr2;
  262. if (&ptr3 == &no_arg) goto done; args[n++] = &ptr3;
  263. if (&ptr4 == &no_arg) goto done; args[n++] = &ptr4;
  264. if (&ptr5 == &no_arg) goto done; args[n++] = &ptr5;
  265. if (&ptr6 == &no_arg) goto done; args[n++] = &ptr6;
  266. if (&ptr7 == &no_arg) goto done; args[n++] = &ptr7;
  267. if (&ptr8 == &no_arg) goto done; args[n++] = &ptr8;
  268. if (&ptr9 == &no_arg) goto done; args[n++] = &ptr9;
  269. if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10;
  270. if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11;
  271. if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12;
  272. if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13;
  273. if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14;
  274. if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15;
  275. if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16;
  276. done:
  277. int consumed;
  278. int vec[kVecSize];
  279. if (DoMatchImpl(*input, UNANCHORED, &consumed,
  280. args, n, vec, kVecSize)) {
  281. input->remove_prefix(consumed);
  282. return true;
  283. } else {
  284. return false;
  285. }
  286. }
  287. bool RE::Replace(const StringPiece& rewrite,
  288. string *str) const {
  289. int vec[kVecSize];
  290. int matches = TryMatch(*str, 0, UNANCHORED, vec, kVecSize);
  291. if (matches == 0)
  292. return false;
  293. string s;
  294. if (!Rewrite(&s, rewrite, *str, vec, matches))
  295. return false;
  296. assert(vec[0] >= 0);
  297. assert(vec[1] >= 0);
  298. str->replace(vec[0], vec[1] - vec[0], s);
  299. return true;
  300. }
  301. // Returns PCRE_NEWLINE_CRLF, PCRE_NEWLINE_CR, or PCRE_NEWLINE_LF.
  302. // Note that PCRE_NEWLINE_CRLF is defined to be P_N_CR | P_N_LF.
  303. // Modified by PH to add PCRE_NEWLINE_ANY and PCRE_NEWLINE_ANYCRLF.
  304. static int NewlineMode(int pcre_options) {
  305. // TODO: if we can make it threadsafe, cache this var
  306. int newline_mode = 0;
  307. /* if (newline_mode) return newline_mode; */ // do this once it's cached
  308. if (pcre_options & (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
  309. PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF)) {
  310. newline_mode = (pcre_options &
  311. (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
  312. PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF));
  313. } else {
  314. int newline;
  315. pcre_config(PCRE_CONFIG_NEWLINE, &newline);
  316. if (newline == 10)
  317. newline_mode = PCRE_NEWLINE_LF;
  318. else if (newline == 13)
  319. newline_mode = PCRE_NEWLINE_CR;
  320. else if (newline == 3338)
  321. newline_mode = PCRE_NEWLINE_CRLF;
  322. else if (newline == -1)
  323. newline_mode = PCRE_NEWLINE_ANY;
  324. else if (newline == -2)
  325. newline_mode = PCRE_NEWLINE_ANYCRLF;
  326. else
  327. assert("" == "Unexpected return value from pcre_config(NEWLINE)");
  328. }
  329. return newline_mode;
  330. }
  331. int RE::GlobalReplace(const StringPiece& rewrite,
  332. string *str) const {
  333. int count = 0;
  334. int vec[kVecSize];
  335. string out;
  336. int start = 0;
  337. int lastend = -1;
  338. while (start <= static_cast<int>(str->length())) {
  339. int matches = TryMatch(*str, start, UNANCHORED, vec, kVecSize);
  340. if (matches <= 0)
  341. break;
  342. int matchstart = vec[0], matchend = vec[1];
  343. assert(matchstart >= start);
  344. assert(matchend >= matchstart);
  345. if (matchstart == matchend && matchstart == lastend) {
  346. // advance one character if we matched an empty string at the same
  347. // place as the last match occurred
  348. matchend = start + 1;
  349. // If the current char is CR and we're in CRLF mode, skip LF too.
  350. // Note it's better to call pcre_fullinfo() than to examine
  351. // all_options(), since options_ could have changed bewteen
  352. // compile-time and now, but this is simpler and safe enough.
  353. // Modified by PH to add ANY and ANYCRLF.
  354. if (start+1 < static_cast<int>(str->length()) &&
  355. (*str)[start] == '\r' && (*str)[start+1] == '\n' &&
  356. (NewlineMode(options_.all_options()) == PCRE_NEWLINE_CRLF ||
  357. NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANY ||
  358. NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANYCRLF)
  359. ) {
  360. matchend++;
  361. }
  362. // We also need to advance more than one char if we're in utf8 mode.
  363. #ifdef SUPPORT_UTF8
  364. if (options_.utf8()) {
  365. while (matchend < static_cast<int>(str->length()) &&
  366. ((*str)[matchend] & 0xc0) == 0x80)
  367. matchend++;
  368. }
  369. #endif
  370. if (matchend <= static_cast<int>(str->length()))
  371. out.append(*str, start, matchend - start);
  372. start = matchend;
  373. } else {
  374. out.append(*str, start, matchstart - start);
  375. Rewrite(&out, rewrite, *str, vec, matches);
  376. start = matchend;
  377. lastend = matchend;
  378. count++;
  379. }
  380. }
  381. if (count == 0)
  382. return 0;
  383. if (start < static_cast<int>(str->length()))
  384. out.append(*str, start, str->length() - start);
  385. swap(out, *str);
  386. return count;
  387. }
  388. bool RE::Extract(const StringPiece& rewrite,
  389. const StringPiece& text,
  390. string *out) const {
  391. int vec[kVecSize];
  392. int matches = TryMatch(text, 0, UNANCHORED, vec, kVecSize);
  393. if (matches == 0)
  394. return false;
  395. out->erase();
  396. return Rewrite(out, rewrite, text, vec, matches);
  397. }
  398. /*static*/ string RE::QuoteMeta(const StringPiece& unquoted) {
  399. string result;
  400. // Escape any ascii character not in [A-Za-z_0-9].
  401. //
  402. // Note that it's legal to escape a character even if it has no
  403. // special meaning in a regular expression -- so this function does
  404. // that. (This also makes it identical to the perl function of the
  405. // same name; see `perldoc -f quotemeta`.)
  406. for (int ii = 0; ii < unquoted.size(); ++ii) {
  407. // Note that using 'isalnum' here raises the benchmark time from
  408. // 32ns to 58ns:
  409. if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
  410. (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
  411. (unquoted[ii] < '0' || unquoted[ii] > '9') &&
  412. unquoted[ii] != '_' &&
  413. // If this is the part of a UTF8 or Latin1 character, we need
  414. // to copy this byte without escaping. Experimentally this is
  415. // what works correctly with the regexp library.
  416. !(unquoted[ii] & 128)) {
  417. result += '\\';
  418. }
  419. result += unquoted[ii];
  420. }
  421. return result;
  422. }
  423. /***** Actual matching and rewriting code *****/
  424. int RE::TryMatch(const StringPiece& text,
  425. int startpos,
  426. Anchor anchor,
  427. int *vec,
  428. int vecsize) const {
  429. pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_;
  430. if (re == NULL) {
  431. //fprintf(stderr, "Matching against invalid re: %s\n", error_->c_str());
  432. return 0;
  433. }
  434. pcre_extra extra = { 0, 0, 0, 0, 0, 0 };
  435. if (options_.match_limit() > 0) {
  436. extra.flags |= PCRE_EXTRA_MATCH_LIMIT;
  437. extra.match_limit = options_.match_limit();
  438. }
  439. if (options_.match_limit_recursion() > 0) {
  440. extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;
  441. extra.match_limit_recursion = options_.match_limit_recursion();
  442. }
  443. int rc = pcre_exec(re, // The regular expression object
  444. &extra,
  445. (text.data() == NULL) ? "" : text.data(),
  446. text.size(),
  447. startpos,
  448. (anchor == UNANCHORED) ? 0 : PCRE_ANCHORED,
  449. vec,
  450. vecsize);
  451. // Handle errors
  452. if (rc == PCRE_ERROR_NOMATCH) {
  453. return 0;
  454. } else if (rc < 0) {
  455. //fprintf(stderr, "Unexpected return code: %d when matching '%s'\n",
  456. // re, pattern_.c_str());
  457. return 0;
  458. } else if (rc == 0) {
  459. // pcre_exec() returns 0 as a special case when the number of
  460. // capturing subpatterns exceeds the size of the vector.
  461. // When this happens, there is a match and the output vector
  462. // is filled, but we miss out on the positions of the extra subpatterns.
  463. rc = vecsize / 2;
  464. }
  465. return rc;
  466. }
  467. bool RE::DoMatchImpl(const StringPiece& text,
  468. Anchor anchor,
  469. int* consumed,
  470. const Arg* const* args,
  471. int n,
  472. int* vec,
  473. int vecsize) const {
  474. assert((1 + n) * 3 <= vecsize); // results + PCRE workspace
  475. int matches = TryMatch(text, 0, anchor, vec, vecsize);
  476. assert(matches >= 0); // TryMatch never returns negatives
  477. if (matches == 0)
  478. return false;
  479. *consumed = vec[1];
  480. if (n == 0 || args == NULL) {
  481. // We are not interested in results
  482. return true;
  483. }
  484. if (NumberOfCapturingGroups() < n) {
  485. // RE has fewer capturing groups than number of arg pointers passed in
  486. return false;
  487. }
  488. // If we got here, we must have matched the whole pattern.
  489. // We do not need (can not do) any more checks on the value of 'matches' here
  490. // -- see the comment for TryMatch.
  491. for (int i = 0; i < n; i++) {
  492. const int start = vec[2*(i+1)];
  493. const int limit = vec[2*(i+1)+1];
  494. if (!args[i]->Parse(text.data() + start, limit-start)) {
  495. // TODO: Should we indicate what the error was?
  496. return false;
  497. }
  498. }
  499. return true;
  500. }
  501. bool RE::DoMatch(const StringPiece& text,
  502. Anchor anchor,
  503. int* consumed,
  504. const Arg* const args[],
  505. int n) const {
  506. assert(n >= 0);
  507. size_t const vecsize = (1 + n) * 3; // results + PCRE workspace
  508. // (as for kVecSize)
  509. int space[21]; // use stack allocation for small vecsize (common case)
  510. int* vec = vecsize <= 21 ? space : new int[vecsize];
  511. bool retval = DoMatchImpl(text, anchor, consumed, args, n, vec, vecsize);
  512. if (vec != space) delete [] vec;
  513. return retval;
  514. }
  515. bool RE::Rewrite(string *out, const StringPiece &rewrite,
  516. const StringPiece &text, int *vec, int veclen) const {
  517. for (const char *s = rewrite.data(), *end = s + rewrite.size();
  518. s < end; s++) {
  519. int c = *s;
  520. if (c == '\\') {
  521. c = *++s;
  522. if (isdigit(c)) {
  523. int n = (c - '0');
  524. if (n >= veclen) {
  525. //fprintf(stderr, requested group %d in regexp %.*s\n",
  526. // n, rewrite.size(), rewrite.data());
  527. return false;
  528. }
  529. int start = vec[2 * n];
  530. if (start >= 0)
  531. out->append(text.data() + start, vec[2 * n + 1] - start);
  532. } else if (c == '\\') {
  533. out->push_back('\\');
  534. } else {
  535. //fprintf(stderr, "invalid rewrite pattern: %.*s\n",
  536. // rewrite.size(), rewrite.data());
  537. return false;
  538. }
  539. } else {
  540. out->push_back(c);
  541. }
  542. }
  543. return true;
  544. }
  545. // Return the number of capturing subpatterns, or -1 if the
  546. // regexp wasn't valid on construction.
  547. int RE::NumberOfCapturingGroups() const {
  548. if (re_partial_ == NULL) return -1;
  549. int result;
  550. int pcre_retval = pcre_fullinfo(re_partial_, // The regular expression object
  551. NULL, // We did not study the pattern
  552. PCRE_INFO_CAPTURECOUNT,
  553. &result);
  554. assert(pcre_retval == 0);
  555. return result;
  556. }
  557. /***** Parsers for various types *****/
  558. bool Arg::parse_null(const char* str, int n, void* dest) {
  559. // We fail if somebody asked us to store into a non-NULL void* pointer
  560. return (dest == NULL);
  561. }
  562. bool Arg::parse_string(const char* str, int n, void* dest) {
  563. if (dest == NULL) return true;
  564. reinterpret_cast<string*>(dest)->assign(str, n);
  565. return true;
  566. }
  567. bool Arg::parse_stringpiece(const char* str, int n, void* dest) {
  568. if (dest == NULL) return true;
  569. reinterpret_cast<StringPiece*>(dest)->set(str, n);
  570. return true;
  571. }
  572. bool Arg::parse_char(const char* str, int n, void* dest) {
  573. if (n != 1) return false;
  574. if (dest == NULL) return true;
  575. *(reinterpret_cast<char*>(dest)) = str[0];
  576. return true;
  577. }
  578. bool Arg::parse_uchar(const char* str, int n, void* dest) {
  579. if (n != 1) return false;
  580. if (dest == NULL) return true;
  581. *(reinterpret_cast<unsigned char*>(dest)) = str[0];
  582. return true;
  583. }
  // Largest number spec that we are willing to parse
  static const int kMaxNumberLength = 32;

  // Prepares the "n"-byte capture at "str" for a strto*() call, which needs
  // the number to end where the capture ends.
  //
  // REQUIRES "buf" must have length at least kMaxNumberLength+1
  // REQUIRES "n > 0"
  // REQUIRES str[n] must be readable (the byte right after the capture is
  //          inspected)
  //
  // Returns one of:
  //   a. "str" if no termination is needed, i.e. the byte after the capture
  //      cannot be read by strto*() as a continuation of the number
  //   b. "buf" if the capture was copied there and null-terminated
  //   c. "" if the input was invalid and has no hope of being parsed
  static const char* TerminateNumber(char* buf, const char* str, int n) {
    // We are less forgiving than the strtoxxx() routines and do not allow
    // leading spaces.  The cast keeps isspace() defined for bytes >= 0x80.
    if ((n > 0) && isspace(static_cast<unsigned char>(*str))) {
      return "";
    }

    // See if the character right after the input text may potentially look
    // like a digit (in any radix up to 16).
    const char next = str[n];
    const bool next_is_hex_digit = (next >= '0' && next <= '9') ||
                                   (next >= 'a' && next <= 'f') ||
                                   (next >= 'A' && next <= 'F');

    // A capture that is just "0" (optionally signed) followed by 'x' or 'X'
    // would be read by strto*() with radix 0 or 16 as the start of a "0x"
    // prefix, running past the capture.  Terminate it as well.
    const bool capture_is_zero =
        (n == 1 && str[0] == '0') ||
        (n == 2 && (str[0] == '+' || str[0] == '-') && str[1] == '0');
    const bool next_starts_hex_prefix =
        capture_is_zero && (next == 'x' || next == 'X');

    if (!next_is_hex_digit && !next_starts_hex_prefix) {
      // We can parse right out of the supplied string, so return it.
      return str;
    }

    if (n > kMaxNumberLength) return "";  // Input too big to be a valid number
    memcpy(buf, str, n);
    buf[n] = '\0';
    return buf;
  }
  613. bool Arg::parse_long_radix(const char* str,
  614. int n,
  615. void* dest,
  616. int radix) {
  617. if (n == 0) return false;
  618. char buf[kMaxNumberLength+1];
  619. str = TerminateNumber(buf, str, n);
  620. char* end;
  621. errno = 0;
  622. long r = strtol(str, &end, radix);
  623. if (end != str + n) return false; // Leftover junk
  624. if (errno) return false;
  625. if (dest == NULL) return true;
  626. *(reinterpret_cast<long*>(dest)) = r;
  627. return true;
  628. }
  629. bool Arg::parse_ulong_radix(const char* str,
  630. int n,
  631. void* dest,
  632. int radix) {
  633. if (n == 0) return false;
  634. char buf[kMaxNumberLength+1];
  635. str = TerminateNumber(buf, str, n);
  636. if (str[0] == '-') return false; // strtoul() on a negative number?!
  637. char* end;
  638. errno = 0;
  639. unsigned long r = strtoul(str, &end, radix);
  640. if (end != str + n) return false; // Leftover junk
  641. if (errno) return false;
  642. if (dest == NULL) return true;
  643. *(reinterpret_cast<unsigned long*>(dest)) = r;
  644. return true;
  645. }
  646. bool Arg::parse_short_radix(const char* str,
  647. int n,
  648. void* dest,
  649. int radix) {
  650. long r;
  651. if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
  652. if (r < SHRT_MIN || r > SHRT_MAX) return false; // Out of range
  653. if (dest == NULL) return true;
  654. *(reinterpret_cast<short*>(dest)) = static_cast<short>(r);
  655. return true;
  656. }
  657. bool Arg::parse_ushort_radix(const char* str,
  658. int n,
  659. void* dest,
  660. int radix) {
  661. unsigned long r;
  662. if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
  663. if (r > USHRT_MAX) return false; // Out of range
  664. if (dest == NULL) return true;
  665. *(reinterpret_cast<unsigned short*>(dest)) = static_cast<unsigned short>(r);
  666. return true;
  667. }
  668. bool Arg::parse_int_radix(const char* str,
  669. int n,
  670. void* dest,
  671. int radix) {
  672. long r;
  673. if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
  674. if (r < INT_MIN || r > INT_MAX) return false; // Out of range
  675. if (dest == NULL) return true;
  676. *(reinterpret_cast<int*>(dest)) = r;
  677. return true;
  678. }
  679. bool Arg::parse_uint_radix(const char* str,
  680. int n,
  681. void* dest,
  682. int radix) {
  683. unsigned long r;
  684. if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
  685. if (r > UINT_MAX) return false; // Out of range
  686. if (dest == NULL) return true;
  687. *(reinterpret_cast<unsigned int*>(dest)) = r;
  688. return true;
  689. }
  690. bool Arg::parse_longlong_radix(const char* str,
  691. int n,
  692. void* dest,
  693. int radix) {
  694. #ifndef HAVE_LONG_LONG
  695. return false;
  696. #else
  697. if (n == 0) return false;
  698. char buf[kMaxNumberLength+1];
  699. str = TerminateNumber(buf, str, n);
  700. char* end;
  701. errno = 0;
  702. #if defined HAVE_STRTOQ
  703. long long r = strtoq(str, &end, radix);
  704. #elif defined HAVE_STRTOLL
  705. long long r = strtoll(str, &end, radix);
  706. #elif defined HAVE__STRTOI64
  707. long long r = _strtoi64(str, &end, radix);
  708. #else
  709. #error parse_longlong_radix: cannot convert input to a long-long
  710. #endif
  711. if (end != str + n) return false; // Leftover junk
  712. if (errno) return false;
  713. if (dest == NULL) return true;
  714. *(reinterpret_cast<long long*>(dest)) = r;
  715. return true;
  716. #endif /* HAVE_LONG_LONG */
  717. }
  718. bool Arg::parse_ulonglong_radix(const char* str,
  719. int n,
  720. void* dest,
  721. int radix) {
  722. #ifndef HAVE_UNSIGNED_LONG_LONG
  723. return false;
  724. #else
  725. if (n == 0) return false;
  726. char buf[kMaxNumberLength+1];
  727. str = TerminateNumber(buf, str, n);
  728. if (str[0] == '-') return false; // strtoull() on a negative number?!
  729. char* end;
  730. errno = 0;
  731. #if defined HAVE_STRTOQ
  732. unsigned long long r = strtouq(str, &end, radix);
  733. #elif defined HAVE_STRTOLL
  734. unsigned long long r = strtoull(str, &end, radix);
  735. #elif defined HAVE__STRTOI64
  736. unsigned long long r = _strtoui64(str, &end, radix);
  737. #else
  738. #error parse_ulonglong_radix: cannot convert input to a long-long
  739. #endif
  740. if (end != str + n) return false; // Leftover junk
  741. if (errno) return false;
  742. if (dest == NULL) return true;
  743. *(reinterpret_cast<unsigned long long*>(dest)) = r;
  744. return true;
  745. #endif /* HAVE_UNSIGNED_LONG_LONG */
  746. }
  747. bool Arg::parse_double(const char* str, int n, void* dest) {
  748. if (n == 0) return false;
  749. static const int kMaxLength = 200;
  750. char buf[kMaxLength];
  751. if (n >= kMaxLength) return false;
  752. memcpy(buf, str, n);
  753. buf[n] = '\0';
  754. errno = 0;
  755. char* end;
  756. double r = strtod(buf, &end);
  757. if (end != buf + n) return false; // Leftover junk
  758. if (errno) return false;
  759. if (dest == NULL) return true;
  760. *(reinterpret_cast<double*>(dest)) = r;
  761. return true;
  762. }
  763. bool Arg::parse_float(const char* str, int n, void* dest) {
  764. double r;
  765. if (!parse_double(str, n, &r)) return false;
  766. if (dest == NULL) return true;
  767. *(reinterpret_cast<float*>(dest)) = static_cast<float>(r);
  768. return true;
  769. }
  770. #define DEFINE_INTEGER_PARSERS(name) \
  771. bool Arg::parse_##name(const char* str, int n, void* dest) { \
  772. return parse_##name##_radix(str, n, dest, 10); \
  773. } \
  774. bool Arg::parse_##name##_hex(const char* str, int n, void* dest) { \
  775. return parse_##name##_radix(str, n, dest, 16); \
  776. } \
  777. bool Arg::parse_##name##_octal(const char* str, int n, void* dest) { \
  778. return parse_##name##_radix(str, n, dest, 8); \
  779. } \
  780. bool Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \
  781. return parse_##name##_radix(str, n, dest, 0); \
  782. }
  783. DEFINE_INTEGER_PARSERS(short) /* */
  784. DEFINE_INTEGER_PARSERS(ushort) /* */
  785. DEFINE_INTEGER_PARSERS(int) /* Don't use semicolons after these */
  786. DEFINE_INTEGER_PARSERS(uint) /* statements because they can cause */
  787. DEFINE_INTEGER_PARSERS(long) /* compiler warnings if the checking */
  788. DEFINE_INTEGER_PARSERS(ulong) /* level is turned up high enough. */
  789. DEFINE_INTEGER_PARSERS(longlong) /* */
  790. DEFINE_INTEGER_PARSERS(ulonglong) /* */
  791. #undef DEFINE_INTEGER_PARSERS
  792. } // namespace pcrecpp