_re2.cc 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352
  1. // Copyright 2019 The RE2 Authors. All Rights Reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. #include <stddef.h>
  5. #include <sys/types.h>
  6. #include <memory>
  7. #include <stdexcept>
  8. #include <string>
  9. #include <tuple>
  10. #include <utility>
  11. #include <vector>
  12. #include "absl/strings/string_view.h"
  13. #include "pybind11/buffer_info.h"
  14. #include "pybind11/gil.h"
  15. #include "pybind11/pybind11.h"
  16. #include "pybind11/pytypes.h"
  17. #include "pybind11/stl.h" // IWYU pragma: keep
  18. #include "re2/filtered_re2.h"
  19. #include "re2/re2.h"
  20. #include "re2/set.h"
  21. #ifdef _WIN32
  22. #include <basetsd.h>
  23. #define ssize_t SSIZE_T
  24. #endif
  25. namespace re2_python {
  26. // This is conventional.
  27. namespace py = pybind11;
  28. // In terms of the pybind11 API, a py::buffer is merely a py::object that
  29. // supports the buffer interface/protocol and you must explicitly request
  30. // a py::buffer_info in order to access the actual bytes. Under the hood,
  31. // the py::buffer_info manages a reference count to the py::buffer, so it
  32. // must be constructed and subsequently destructed while holding the GIL.
  33. static inline absl::string_view FromBytes(const py::buffer_info& bytes) {
  34. char* data = reinterpret_cast<char*>(bytes.ptr);
  35. ssize_t size = bytes.size;
  36. return absl::string_view(data, size);
  37. }
  // Returns the number of bytes to step over for the sequence whose first
  // byte is at `ptr`, decided solely by that byte's high nibble:
  // 0x0-0xB -> 1, 0xC-0xD -> 2, 0xE -> 3, 0xF -> 4.
  static inline int OneCharLen(const char* ptr) {
    static constexpr int kLenByHighNibble[16] = {1, 1, 1, 1, 1, 1, 1, 1,
                                                 1, 1, 1, 1, 2, 2, 3, 4};
    const unsigned lead = static_cast<unsigned char>(*ptr);
    return kLenByHighNibble[lead >> 4];
  }
  41. // Helper function for when Python encodes str to bytes and then needs to
  42. // convert str offsets to bytes offsets. Assumes that text is valid UTF-8.
  43. ssize_t CharLenToBytes(py::buffer buffer, ssize_t pos, ssize_t len) {
  44. auto bytes = buffer.request();
  45. auto text = FromBytes(bytes);
  46. auto ptr = text.data() + pos;
  47. auto end = text.data() + text.size();
  48. while (ptr < end && len > 0) {
  49. ptr += OneCharLen(ptr);
  50. --len;
  51. }
  52. return ptr - (text.data() + pos);
  53. }
  54. // Helper function for when Python decodes bytes to str and then needs to
  55. // convert bytes offsets to str offsets. Assumes that text is valid UTF-8.
  56. ssize_t BytesToCharLen(py::buffer buffer, ssize_t pos, ssize_t endpos) {
  57. auto bytes = buffer.request();
  58. auto text = FromBytes(bytes);
  59. auto ptr = text.data() + pos;
  60. auto end = text.data() + endpos;
  61. ssize_t len = 0;
  62. while (ptr < end) {
  63. ptr += OneCharLen(ptr);
  64. ++len;
  65. }
  66. return len;
  67. }
  68. std::unique_ptr<RE2> RE2InitShim(py::buffer buffer,
  69. const RE2::Options& options) {
  70. auto bytes = buffer.request();
  71. auto pattern = FromBytes(bytes);
  72. return std::make_unique<RE2>(pattern, options);
  73. }
  74. py::bytes RE2ErrorShim(const RE2& self) {
  75. // Return std::string as bytes. That is, without decoding to str.
  76. return self.error();
  77. }
  78. std::vector<std::pair<py::bytes, int>> RE2NamedCapturingGroupsShim(
  79. const RE2& self) {
  80. const int num_groups = self.NumberOfCapturingGroups();
  81. std::vector<std::pair<py::bytes, int>> groups;
  82. groups.reserve(num_groups);
  83. for (const auto& it : self.NamedCapturingGroups()) {
  84. groups.emplace_back(it.first, it.second);
  85. }
  86. return groups;
  87. }
  88. std::vector<int> RE2ProgramFanoutShim(const RE2& self) {
  89. std::vector<int> histogram;
  90. self.ProgramFanout(&histogram);
  91. return histogram;
  92. }
  93. std::vector<int> RE2ReverseProgramFanoutShim(const RE2& self) {
  94. std::vector<int> histogram;
  95. self.ReverseProgramFanout(&histogram);
  96. return histogram;
  97. }
  98. std::tuple<bool, py::bytes, py::bytes> RE2PossibleMatchRangeShim(
  99. const RE2& self, int maxlen) {
  100. std::string min, max;
  101. // Return std::string as bytes. That is, without decoding to str.
  102. return {self.PossibleMatchRange(&min, &max, maxlen), min, max};
  103. }
  104. std::vector<std::pair<ssize_t, ssize_t>> RE2MatchShim(const RE2& self,
  105. RE2::Anchor anchor,
  106. py::buffer buffer,
  107. ssize_t pos,
  108. ssize_t endpos) {
  109. auto bytes = buffer.request();
  110. auto text = FromBytes(bytes);
  111. const int num_groups = self.NumberOfCapturingGroups() + 1; // need $0
  112. std::vector<absl::string_view> groups;
  113. groups.resize(num_groups);
  114. py::gil_scoped_release release_gil;
  115. if (!self.Match(text, pos, endpos, anchor, groups.data(), groups.size())) {
  116. // Ensure that groups are null before converting to spans!
  117. for (auto& it : groups) {
  118. it = absl::string_view();
  119. }
  120. }
  121. std::vector<std::pair<ssize_t, ssize_t>> spans;
  122. spans.reserve(num_groups);
  123. for (const auto& it : groups) {
  124. if (it.data() == NULL) {
  125. spans.emplace_back(-1, -1);
  126. } else {
  127. spans.emplace_back(it.data() - text.data(),
  128. it.data() - text.data() + it.size());
  129. }
  130. }
  131. return spans;
  132. }
  133. py::bytes RE2QuoteMetaShim(py::buffer buffer) {
  134. auto bytes = buffer.request();
  135. auto pattern = FromBytes(bytes);
  136. // Return std::string as bytes. That is, without decoding to str.
  137. return RE2::QuoteMeta(pattern);
  138. }
  139. class Set {
  140. public:
  141. Set(RE2::Anchor anchor, const RE2::Options& options)
  142. : set_(options, anchor) {}
  143. ~Set() = default;
  144. // Not copyable or movable.
  145. Set(const Set&) = delete;
  146. Set& operator=(const Set&) = delete;
  147. int Add(py::buffer buffer) {
  148. auto bytes = buffer.request();
  149. auto pattern = FromBytes(bytes);
  150. int index = set_.Add(pattern, /*error=*/NULL); // -1 on error
  151. return index;
  152. }
  153. bool Compile() {
  154. // Compiling can fail.
  155. return set_.Compile();
  156. }
  157. std::vector<int> Match(py::buffer buffer) const {
  158. auto bytes = buffer.request();
  159. auto text = FromBytes(bytes);
  160. std::vector<int> matches;
  161. py::gil_scoped_release release_gil;
  162. set_.Match(text, &matches);
  163. return matches;
  164. }
  165. private:
  166. RE2::Set set_;
  167. };
  168. class Filter {
  169. public:
  170. Filter() = default;
  171. ~Filter() = default;
  172. // Not copyable or movable.
  173. Filter(const Filter&) = delete;
  174. Filter& operator=(const Filter&) = delete;
  175. int Add(py::buffer buffer, const RE2::Options& options) {
  176. auto bytes = buffer.request();
  177. auto pattern = FromBytes(bytes);
  178. int index = -1; // not clobbered on error
  179. filter_.Add(pattern, options, &index);
  180. return index;
  181. }
  182. bool Compile() {
  183. std::vector<std::string> atoms;
  184. filter_.Compile(&atoms);
  185. RE2::Options options;
  186. options.set_literal(true);
  187. options.set_case_sensitive(false);
  188. set_ = std::make_unique<RE2::Set>(options, RE2::UNANCHORED);
  189. for (int i = 0; i < static_cast<int>(atoms.size()); ++i) {
  190. if (set_->Add(atoms[i], /*error=*/NULL) != i) {
  191. // Should never happen: the atom is a literal!
  192. py::pybind11_fail("set_->Add() failed");
  193. }
  194. }
  195. // Compiling can fail.
  196. return set_->Compile();
  197. }
  198. std::vector<int> Match(py::buffer buffer, bool potential) const {
  199. if (set_ == nullptr) {
  200. py::pybind11_fail("Match() called before compiling");
  201. }
  202. auto bytes = buffer.request();
  203. auto text = FromBytes(bytes);
  204. std::vector<int> atoms;
  205. py::gil_scoped_release release_gil;
  206. set_->Match(text, &atoms);
  207. std::vector<int> matches;
  208. if (potential) {
  209. filter_.AllPotentials(atoms, &matches);
  210. } else {
  211. filter_.AllMatches(text, atoms, &matches);
  212. }
  213. return matches;
  214. }
  215. const RE2& GetRE2(int index) const {
  216. return filter_.GetRE2(index);
  217. }
  218. private:
  219. re2::FilteredRE2 filter_;
  220. std::unique_ptr<RE2::Set> set_;
  221. };
  222. PYBIND11_MODULE(_re2, module) {
  223. // Translate exceptions thrown by py::pybind11_fail() into Python.
  224. py::register_local_exception<std::runtime_error>(module, "Error");
  225. module.def("CharLenToBytes", &CharLenToBytes);
  226. module.def("BytesToCharLen", &BytesToCharLen);
  227. // CLASSES
  228. // class RE2
  229. // enum Anchor
  230. // class Options
  231. // enum Encoding
  232. // class Set
  233. // class Filter
  234. py::class_<RE2> re2(module, "RE2");
  235. py::enum_<RE2::Anchor> anchor(re2, "Anchor");
  236. py::class_<RE2::Options> options(re2, "Options");
  237. py::enum_<RE2::Options::Encoding> encoding(options, "Encoding");
  238. py::class_<Set> set(module, "Set");
  239. py::class_<Filter> filter(module, "Filter");
  240. anchor.value("UNANCHORED", RE2::Anchor::UNANCHORED);
  241. anchor.value("ANCHOR_START", RE2::Anchor::ANCHOR_START);
  242. anchor.value("ANCHOR_BOTH", RE2::Anchor::ANCHOR_BOTH);
  243. encoding.value("UTF8", RE2::Options::Encoding::EncodingUTF8);
  244. encoding.value("LATIN1", RE2::Options::Encoding::EncodingLatin1);
  245. options.def(py::init<>())
  246. .def_property("max_mem", //
  247. &RE2::Options::max_mem, //
  248. &RE2::Options::set_max_mem) //
  249. .def_property("encoding", //
  250. &RE2::Options::encoding, //
  251. &RE2::Options::set_encoding) //
  252. .def_property("posix_syntax", //
  253. &RE2::Options::posix_syntax, //
  254. &RE2::Options::set_posix_syntax) //
  255. .def_property("longest_match", //
  256. &RE2::Options::longest_match, //
  257. &RE2::Options::set_longest_match) //
  258. .def_property("log_errors", //
  259. &RE2::Options::log_errors, //
  260. &RE2::Options::set_log_errors) //
  261. .def_property("literal", //
  262. &RE2::Options::literal, //
  263. &RE2::Options::set_literal) //
  264. .def_property("never_nl", //
  265. &RE2::Options::never_nl, //
  266. &RE2::Options::set_never_nl) //
  267. .def_property("dot_nl", //
  268. &RE2::Options::dot_nl, //
  269. &RE2::Options::set_dot_nl) //
  270. .def_property("never_capture", //
  271. &RE2::Options::never_capture, //
  272. &RE2::Options::set_never_capture) //
  273. .def_property("case_sensitive", //
  274. &RE2::Options::case_sensitive, //
  275. &RE2::Options::set_case_sensitive) //
  276. .def_property("perl_classes", //
  277. &RE2::Options::perl_classes, //
  278. &RE2::Options::set_perl_classes) //
  279. .def_property("word_boundary", //
  280. &RE2::Options::word_boundary, //
  281. &RE2::Options::set_word_boundary) //
  282. .def_property("one_line", //
  283. &RE2::Options::one_line, //
  284. &RE2::Options::set_one_line); //
  285. re2.def(py::init(&RE2InitShim))
  286. .def("ok", &RE2::ok)
  287. .def("error", &RE2ErrorShim)
  288. .def("options", &RE2::options)
  289. .def("NumberOfCapturingGroups", &RE2::NumberOfCapturingGroups)
  290. .def("NamedCapturingGroups", &RE2NamedCapturingGroupsShim)
  291. .def("ProgramSize", &RE2::ProgramSize)
  292. .def("ReverseProgramSize", &RE2::ReverseProgramSize)
  293. .def("ProgramFanout", &RE2ProgramFanoutShim)
  294. .def("ReverseProgramFanout", &RE2ReverseProgramFanoutShim)
  295. .def("PossibleMatchRange", &RE2PossibleMatchRangeShim)
  296. .def("Match", &RE2MatchShim)
  297. .def_static("QuoteMeta", &RE2QuoteMetaShim);
  298. set.def(py::init<RE2::Anchor, const RE2::Options&>())
  299. .def("Add", &Set::Add)
  300. .def("Compile", &Set::Compile)
  301. .def("Match", &Set::Match);
  302. filter.def(py::init<>())
  303. .def("Add", &Filter::Add)
  304. .def("Compile", &Filter::Compile)
  305. .def("Match", &Filter::Match)
  306. .def("GetRE2", &Filter::GetRE2,
  307. py::return_value_policy::reference_internal);
  308. }
  309. } // namespace re2_python