//
// RegularExpression.h
//
// $Id: //poco/1.4/Foundation/include/Poco/RegularExpression.h#2 $
//
// Library: Foundation
// Package: RegExp
// Module:  RegularExpression
//
// Definition of the RegularExpression class.
//
// A wrapper class for Philip Hazel's PCRE - Perl Compatible Regular Expressions
// library (http://www.pcre.org).
//
// Copyright (c) 2004-2006, Applied Informatics Software Engineering GmbH.
// and Contributors.
//
// SPDX-License-Identifier: BSL-1.0
//
  20. #ifndef Foundation_RegularExpression_INCLUDED
  21. #define Foundation_RegularExpression_INCLUDED
  22. #include "Poco/Foundation.h"
  23. #include <vector>
  //
  // These declarations are copied from pcre.h
  // to avoid pulling in the entire header file.
  //
  // Both types are left incomplete on purpose: this header only ever
  // refers to them through pointers (pcre*, pcre_extra*).
  //
  extern "C"
  {
      struct real_pcre8_or_16; /* declaration; the definition is private */
      typedef struct real_pcre8_or_16 pcre;
      struct pcre_extra;
  }
  34. namespace Poco {
  35. class Foundation_API RegularExpression
  36. /// A class for working with regular expressions.
  37. /// Implemented using PCRE, the Perl Compatible
  38. /// Regular Expressions library by Philip Hazel
  39. /// (see http://www.pcre.org).
  40. {
  41. public:
  42. enum Options // These must match the corresponsing options in pcre.h!
  43. /// Some of the following options can only be passed to the constructor;
  44. /// some can be passed only to matching functions, and some can be used
  45. /// everywhere.
  46. ///
  47. /// * Options marked [ctor] can be passed to the constructor.
  48. /// * Options marked [match] can be passed to match, extract, split and subst.
  49. /// * Options marked [subst] can be passed to subst.
  50. ///
  51. /// See the PCRE documentation for more information.
  52. {
  53. RE_CASELESS = 0x00000001, /// case insensitive matching (/i) [ctor]
  54. RE_MULTILINE = 0x00000002, /// enable multi-line mode; affects ^ and $ (/m) [ctor]
  55. RE_DOTALL = 0x00000004, /// dot matches all characters, including newline (/s) [ctor]
  56. RE_EXTENDED = 0x00000008, /// totally ignore whitespace (/x) [ctor]
  57. RE_ANCHORED = 0x00000010, /// treat pattern as if it starts with a ^ [ctor, match]
  58. RE_DOLLAR_ENDONLY = 0x00000020, /// dollar matches end-of-string only, not last newline in string [ctor]
  59. RE_EXTRA = 0x00000040, /// enable optional PCRE functionality [ctor]
  60. RE_NOTBOL = 0x00000080, /// circumflex does not match beginning of string [match]
  61. RE_NOTEOL = 0x00000100, /// $ does not match end of string [match]
  62. RE_UNGREEDY = 0x00000200, /// make quantifiers ungreedy [ctor]
  63. RE_NOTEMPTY = 0x00000400, /// empty string never matches [match]
  64. RE_UTF8 = 0x00000800, /// assume pattern and subject is UTF-8 encoded [ctor]
  65. RE_NO_AUTO_CAPTURE = 0x00001000, /// disable numbered capturing parentheses [ctor, match]
  66. RE_NO_UTF8_CHECK = 0x00002000, /// do not check validity of UTF-8 code sequences [match]
  67. RE_FIRSTLINE = 0x00040000, /// an unanchored pattern is required to match
  68. /// before or at the first newline in the subject string,
  69. /// though the matched text may continue over the newline [ctor]
  70. RE_DUPNAMES = 0x00080000, /// names used to identify capturing subpatterns need not be unique [ctor]
  71. RE_NEWLINE_CR = 0x00100000, /// assume newline is CR ('\r'), the default [ctor]
  72. RE_NEWLINE_LF = 0x00200000, /// assume newline is LF ('\n') [ctor]
  73. RE_NEWLINE_CRLF = 0x00300000, /// assume newline is CRLF ("\r\n") [ctor]
  74. RE_NEWLINE_ANY = 0x00400000, /// assume newline is any valid Unicode newline character [ctor]
  75. RE_NEWLINE_ANYCRLF = 0x00500000, /// assume newline is any of CR, LF, CRLF [ctor]
  76. RE_GLOBAL = 0x10000000, /// replace all occurences (/g) [subst]
  77. RE_NO_VARS = 0x20000000 /// treat dollar in replacement string as ordinary character [subst]
  78. };
  79. struct Match
  80. {
  81. std::string::size_type offset; /// zero based offset (std::string::npos if subexpr does not match)
  82. std::string::size_type length; /// length of substring
  83. };
  84. typedef std::vector<Match> MatchVec;
  85. RegularExpression(const std::string& pattern, int options = 0, bool study = true);
  86. /// Creates a regular expression and parses the given pattern.
  87. /// If study is true, the pattern is analyzed and optimized. This
  88. /// is mainly useful if the pattern is used more than once.
  89. /// For a description of the options, please see the PCRE documentation.
  90. /// Throws a RegularExpressionException if the patter cannot be compiled.
  91. ~RegularExpression();
  92. /// Destroys the regular expression.
  93. int match(const std::string& subject, Match& mtch, int options = 0) const;
  94. /// Matches the given subject string against the pattern. Returns the position
  95. /// of the first captured substring in mtch.
  96. /// If no part of the subject matches the pattern, mtch.offset is std::string::npos and
  97. /// mtch.length is 0.
  98. /// Throws a RegularExpressionException in case of an error.
  99. /// Returns the number of matches.
  100. int match(const std::string& subject, std::string::size_type offset, Match& mtch, int options = 0) const;
  101. /// Matches the given subject string, starting at offset, against the pattern.
  102. /// Returns the position of the captured substring in mtch.
  103. /// If no part of the subject matches the pattern, mtch.offset is std::string::npos and
  104. /// mtch.length is 0.
  105. /// Throws a RegularExpressionException in case of an error.
  106. /// Returns the number of matches.
  107. int match(const std::string& subject, std::string::size_type offset, MatchVec& matches, int options = 0) const;
  108. /// Matches the given subject string against the pattern.
  109. /// The first entry in matches contains the position of the captured substring.
  110. /// The following entries identify matching subpatterns. See the PCRE documentation
  111. /// for a more detailed explanation.
  112. /// If no part of the subject matches the pattern, matches is empty.
  113. /// Throws a RegularExpressionException in case of an error.
  114. /// Returns the number of matches.
  115. bool match(const std::string& subject, std::string::size_type offset = 0) const;
  116. /// Returns true if and only if the subject matches the regular expression.
  117. ///
  118. /// Internally, this method sets the RE_ANCHORED and RE_NOTEMPTY options for
  119. /// matching, which means that the empty string will never match and
  120. /// the pattern is treated as if it starts with a ^.
  121. bool match(const std::string& subject, std::string::size_type offset, int options) const;
  122. /// Returns true if and only if the subject matches the regular expression.
  123. bool operator == (const std::string& subject) const;
  124. /// Returns true if and only if the subject matches the regular expression.
  125. ///
  126. /// Internally, this method sets the RE_ANCHORED and RE_NOTEMPTY options for
  127. /// matching, which means that the empty string will never match and
  128. /// the pattern is treated as if it starts with a ^.
  129. bool operator != (const std::string& subject) const;
  130. /// Returns true if and only if the subject does not match the regular expression.
  131. ///
  132. /// Internally, this method sets the RE_ANCHORED and RE_NOTEMPTY options for
  133. /// matching, which means that the empty string will never match and
  134. /// the pattern is treated as if it starts with a ^.
  135. int extract(const std::string& subject, std::string& str, int options = 0) const;
  136. /// Matches the given subject string against the pattern.
  137. /// Returns the captured string.
  138. /// Throws a RegularExpressionException in case of an error.
  139. /// Returns the number of matches.
  140. int extract(const std::string& subject, std::string::size_type offset, std::string& str, int options = 0) const;
  141. /// Matches the given subject string, starting at offset, against the pattern.
  142. /// Returns the captured string.
  143. /// Throws a RegularExpressionException in case of an error.
  144. /// Returns the number of matches.
  145. int split(const std::string& subject, std::vector<std::string>& strings, int options = 0) const;
  146. /// Matches the given subject string against the pattern.
  147. /// The first entry in captured is the captured substring.
  148. /// The following entries contain substrings matching subpatterns. See the PCRE documentation
  149. /// for a more detailed explanation.
  150. /// If no part of the subject matches the pattern, captured is empty.
  151. /// Throws a RegularExpressionException in case of an error.
  152. /// Returns the number of matches.
  153. int split(const std::string& subject, std::string::size_type offset, std::vector<std::string>& strings, int options = 0) const;
  154. /// Matches the given subject string against the pattern.
  155. /// The first entry in captured is the captured substring.
  156. /// The following entries contain substrings matching subpatterns. See the PCRE documentation
  157. /// for a more detailed explanation.
  158. /// If no part of the subject matches the pattern, captured is empty.
  159. /// Throws a RegularExpressionException in case of an error.
  160. /// Returns the number of matches.
  161. int subst(std::string& subject, const std::string& replacement, int options = 0) const;
  162. /// Substitute in subject all matches of the pattern with replacement.
  163. /// If RE_GLOBAL is specified as option, all matches are replaced. Otherwise,
  164. /// only the first match is replaced.
  165. /// Occurences of $<n> (for example, $1, $2, ...) in replacement are replaced
  166. /// with the corresponding captured string. $0 is the original subject string.
  167. /// Returns the number of replaced occurences.
  168. int subst(std::string& subject, std::string::size_type offset, const std::string& replacement, int options = 0) const;
  169. /// Substitute in subject all matches of the pattern with replacement,
  170. /// starting at offset.
  171. /// If RE_GLOBAL is specified as option, all matches are replaced. Otherwise,
  172. /// only the first match is replaced.
  173. /// Unless RE_NO_VARS is specified, occurences of $<n> (for example, $0, $1, $2, ... $9)
  174. /// in replacement are replaced with the corresponding captured string.
  175. /// $0 is the captured substring. $1 ... $n are the substrings maching the subpatterns.
  176. /// Returns the number of replaced occurences.
  177. static bool match(const std::string& subject, const std::string& pattern, int options = 0);
  178. /// Matches the given subject string against the regular expression given in pattern,
  179. /// using the given options.
  180. protected:
  181. std::string::size_type substOne(std::string& subject, std::string::size_type offset, const std::string& replacement, int options) const;
  182. private:
  183. pcre* _pcre;
  184. pcre_extra* _extra;
  185. static const int OVEC_SIZE;
  186. RegularExpression();
  187. RegularExpression(const RegularExpression&);
  188. RegularExpression& operator = (const RegularExpression&);
  189. };
  190. //
  191. // inlines
  192. //
  193. inline int RegularExpression::match(const std::string& subject, Match& mtch, int options) const
  194. {
  195. return match(subject, 0, mtch, options);
  196. }
  197. inline int RegularExpression::split(const std::string& subject, std::vector<std::string>& strings, int options) const
  198. {
  199. return split(subject, 0, strings, options);
  200. }
  201. inline int RegularExpression::subst(std::string& subject, const std::string& replacement, int options) const
  202. {
  203. return subst(subject, 0, replacement, options);
  204. }
  205. inline bool RegularExpression::operator == (const std::string& subject) const
  206. {
  207. return match(subject);
  208. }
  209. inline bool RegularExpression::operator != (const std::string& subject) const
  210. {
  211. return !match(subject);
  212. }
  213. } // namespace Poco
  214. #endif // Foundation_RegularExpression_INCLUDED