RegularExpression.cpp 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286
  1. //
  2. // RegularExpression.cpp
  3. //
  4. // $Id: //poco/1.4/Foundation/src/RegularExpression.cpp#1 $
  5. //
  6. // Library: Foundation
  7. // Package: RegExp
  8. // Module: RegularExpression
  9. //
  10. // Copyright (c) 2004-2006, Applied Informatics Software Engineering GmbH.
  11. // and Contributors.
  12. //
  13. // SPDX-License-Identifier: BSL-1.0
  14. //
  15. #include "Poco/RegularExpression.h"
  16. #include "Poco/Exception.h"
  17. #include <sstream>
  18. #if defined(POCO_UNBUNDLED)
  19. #include <pcre.h>
  20. #else
  21. #include "pcre_config.h"
  22. #include "pcre.h"
  23. #endif
  24. namespace Poco {
  // Number of int slots in the offset vector (ovec) that the matching
  // routines in this file declare on the stack and hand to pcre_exec().
  25. const int RegularExpression::OVEC_SIZE = 64;
  26. RegularExpression::RegularExpression(const std::string& pattern, int options, bool study): _pcre(0), _extra(0)
  27. {
  28. const char* error;
  29. int offs;
  30. _pcre = pcre_compile(pattern.c_str(), options, &error, &offs, 0);
  31. if (!_pcre)
  32. {
  33. std::ostringstream msg;
  34. msg << error << " (at offset " << offs << ")";
  35. throw RegularExpressionException(msg.str());
  36. }
  37. if (study)
  38. _extra = pcre_study(_pcre, 0, &error);
  39. }
  40. RegularExpression::~RegularExpression()
  41. {
  42. if (_pcre) pcre_free(_pcre);
  43. if (_extra) pcre_free(_extra);
  44. }
  45. int RegularExpression::match(const std::string& subject, std::string::size_type offset, Match& mtch, int options) const
  46. {
  47. poco_assert (offset <= subject.length());
  48. int ovec[OVEC_SIZE];
  49. int rc = pcre_exec(_pcre, _extra, subject.c_str(), int(subject.size()), int(offset), options & 0xFFFF, ovec, OVEC_SIZE);
  50. if (rc == PCRE_ERROR_NOMATCH)
  51. {
  52. mtch.offset = std::string::npos;
  53. mtch.length = 0;
  54. return 0;
  55. }
  56. else if (rc == PCRE_ERROR_BADOPTION)
  57. {
  58. throw RegularExpressionException("bad option");
  59. }
  60. else if (rc == 0)
  61. {
  62. throw RegularExpressionException("too many captured substrings");
  63. }
  64. else if (rc < 0)
  65. {
  66. std::ostringstream msg;
  67. msg << "PCRE error " << rc;
  68. throw RegularExpressionException(msg.str());
  69. }
  70. mtch.offset = ovec[0] < 0 ? std::string::npos : ovec[0];
  71. mtch.length = ovec[1] - mtch.offset;
  72. return rc;
  73. }
  74. int RegularExpression::match(const std::string& subject, std::string::size_type offset, MatchVec& matches, int options) const
  75. {
  76. poco_assert (offset <= subject.length());
  77. matches.clear();
  78. int ovec[OVEC_SIZE];
  79. int rc = pcre_exec(_pcre, _extra, subject.c_str(), int(subject.size()), int(offset), options & 0xFFFF, ovec, OVEC_SIZE);
  80. if (rc == PCRE_ERROR_NOMATCH)
  81. {
  82. return 0;
  83. }
  84. else if (rc == PCRE_ERROR_BADOPTION)
  85. {
  86. throw RegularExpressionException("bad option");
  87. }
  88. else if (rc == 0)
  89. {
  90. throw RegularExpressionException("too many captured substrings");
  91. }
  92. else if (rc < 0)
  93. {
  94. std::ostringstream msg;
  95. msg << "PCRE error " << rc;
  96. throw RegularExpressionException(msg.str());
  97. }
  98. matches.reserve(rc);
  99. for (int i = 0; i < rc; ++i)
  100. {
  101. Match m;
  102. m.offset = ovec[i*2] < 0 ? std::string::npos : ovec[i*2] ;
  103. m.length = ovec[i*2 + 1] - m.offset;
  104. matches.push_back(m);
  105. }
  106. return rc;
  107. }
  108. bool RegularExpression::match(const std::string& subject, std::string::size_type offset) const
  109. {
  110. Match mtch;
  111. match(subject, offset, mtch, RE_ANCHORED | RE_NOTEMPTY);
  112. return mtch.offset == offset && mtch.length == subject.length() - offset;
  113. }
  114. bool RegularExpression::match(const std::string& subject, std::string::size_type offset, int options) const
  115. {
  116. Match mtch;
  117. match(subject, offset, mtch, options);
  118. return mtch.offset == offset && mtch.length == subject.length() - offset;
  119. }
  120. int RegularExpression::extract(const std::string& subject, std::string& str, int options) const
  121. {
  122. Match mtch;
  123. int rc = match(subject, 0, mtch, options);
  124. if (mtch.offset != std::string::npos)
  125. str.assign(subject, mtch.offset, mtch.length);
  126. else
  127. str.clear();
  128. return rc;
  129. }
  130. int RegularExpression::extract(const std::string& subject, std::string::size_type offset, std::string& str, int options) const
  131. {
  132. Match mtch;
  133. int rc = match(subject, offset, mtch, options);
  134. if (mtch.offset != std::string::npos)
  135. str.assign(subject, mtch.offset, mtch.length);
  136. else
  137. str.clear();
  138. return rc;
  139. }
  140. int RegularExpression::split(const std::string& subject, std::string::size_type offset, std::vector<std::string>& strings, int options) const
  141. {
  142. MatchVec matches;
  143. strings.clear();
  144. int rc = match(subject, offset, matches, options);
  145. strings.reserve(matches.size());
  146. for (MatchVec::const_iterator it = matches.begin(); it != matches.end(); ++it)
  147. {
  148. if (it->offset != std::string::npos)
  149. strings.push_back(subject.substr(it->offset, it->length));
  150. else
  151. strings.push_back(std::string());
  152. }
  153. return rc;
  154. }
  155. int RegularExpression::subst(std::string& subject, std::string::size_type offset, const std::string& replacement, int options) const
  156. {
  157. if (options & RE_GLOBAL)
  158. {
  159. int rc = 0;
  160. std::string::size_type pos = substOne(subject, offset, replacement, options);
  161. while (pos != std::string::npos)
  162. {
  163. ++rc;
  164. pos = substOne(subject, pos, replacement, options);
  165. }
  166. return rc;
  167. }
  168. else
  169. {
  170. return substOne(subject, offset, replacement, options) != std::string::npos ? 1 : 0;
  171. }
  172. }
  173. std::string::size_type RegularExpression::substOne(std::string& subject, std::string::size_type offset, const std::string& replacement, int options) const
  174. {
  175. if (offset >= subject.length()) return std::string::npos;
  176. int ovec[OVEC_SIZE];
  177. int rc = pcre_exec(_pcre, _extra, subject.c_str(), int(subject.size()), int(offset), options & 0xFFFF, ovec, OVEC_SIZE);
  178. if (rc == PCRE_ERROR_NOMATCH)
  179. {
  180. return std::string::npos;
  181. }
  182. else if (rc == PCRE_ERROR_BADOPTION)
  183. {
  184. throw RegularExpressionException("bad option");
  185. }
  186. else if (rc == 0)
  187. {
  188. throw RegularExpressionException("too many captured substrings");
  189. }
  190. else if (rc < 0)
  191. {
  192. std::ostringstream msg;
  193. msg << "PCRE error " << rc;
  194. throw RegularExpressionException(msg.str());
  195. }
  196. std::string result;
  197. std::string::size_type len = subject.length();
  198. std::string::size_type pos = 0;
  199. std::string::size_type rp = std::string::npos;
  200. while (pos < len)
  201. {
  202. if (ovec[0] == pos)
  203. {
  204. std::string::const_iterator it = replacement.begin();
  205. std::string::const_iterator end = replacement.end();
  206. while (it != end)
  207. {
  208. if (*it == '$' && !(options & RE_NO_VARS))
  209. {
  210. ++it;
  211. if (it != end)
  212. {
  213. char d = *it;
  214. if (d >= '0' && d <= '9')
  215. {
  216. int c = d - '0';
  217. if (c < rc)
  218. {
  219. int o = ovec[c*2];
  220. int l = ovec[c*2 + 1] - o;
  221. result.append(subject, o, l);
  222. }
  223. }
  224. else
  225. {
  226. result += '$';
  227. result += d;
  228. }
  229. ++it;
  230. }
  231. else result += '$';
  232. }
  233. else result += *it++;
  234. }
  235. pos = ovec[1];
  236. rp = result.length();
  237. }
  238. else result += subject[pos++];
  239. }
  240. subject = result;
  241. return rp;
  242. }
  243. bool RegularExpression::match(const std::string& subject, const std::string& pattern, int options)
  244. {
  245. int ctorOptions = options & (RE_CASELESS | RE_MULTILINE | RE_DOTALL | RE_EXTENDED | RE_ANCHORED | RE_DOLLAR_ENDONLY | RE_EXTRA | RE_UNGREEDY | RE_UTF8 | RE_NO_AUTO_CAPTURE);
  246. int mtchOptions = options & (RE_ANCHORED | RE_NOTBOL | RE_NOTEOL | RE_NOTEMPTY | RE_NO_AUTO_CAPTURE | RE_NO_UTF8_CHECK);
  247. RegularExpression re(pattern, ctorOptions, false);
  248. return re.match(subject, 0, mtchOptions);
  249. }
  250. } // namespace Poco