123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193 |
- //===-- Regex.cpp - Regular Expression matcher implementation -------------===//
- //
- // The LLVM Compiler Infrastructure
- //
- // This file is distributed under the University of Illinois Open Source
- // License. See LICENSE.TXT for details.
- //
- //===----------------------------------------------------------------------===//
- //
- // This file implements a POSIX regular expression matcher.
- //
- //===----------------------------------------------------------------------===//
- #include "llvm/Support/Regex.h"
- #include "regex_impl.h"
- #include "llvm/ADT/SmallVector.h"
- #include "llvm/ADT/StringRef.h"
- #include "llvm/ADT/Twine.h"
- #include <string>
- using namespace llvm;
- Regex::Regex(StringRef regex, unsigned Flags) {
- unsigned flags = 0;
- preg = new llvm_regex();
- preg->re_endp = regex.end();
- if (Flags & IgnoreCase)
- flags |= REG_ICASE;
- if (Flags & Newline)
- flags |= REG_NEWLINE;
- if (!(Flags & BasicRegex))
- flags |= REG_EXTENDED;
- error = llvm_regcomp(preg, regex.data(), flags|REG_PEND);
- }
- Regex::~Regex() {
- if (preg) {
- llvm_regfree(preg);
- delete preg;
- }
- }
- bool Regex::isValid(std::string &Error) {
- if (!error)
- return true;
-
- size_t len = llvm_regerror(error, preg, nullptr, 0);
-
- Error.resize(len - 1);
- llvm_regerror(error, preg, &Error[0], len);
- return false;
- }
- /// getNumMatches - In a valid regex, return the number of parenthesized
- /// matches it contains.
- unsigned Regex::getNumMatches() const {
- return preg->re_nsub;
- }
- bool Regex::match(StringRef String, SmallVectorImpl<StringRef> *Matches){
- unsigned nmatch = Matches ? preg->re_nsub+1 : 0;
- // pmatch needs to have at least one element.
- SmallVector<llvm_regmatch_t, 8> pm;
- pm.resize(nmatch > 0 ? nmatch : 1);
- pm[0].rm_so = 0;
- pm[0].rm_eo = String.size();
- int rc = llvm_regexec(preg, String.data(), nmatch, pm.data(), REG_STARTEND);
- if (rc == REG_NOMATCH)
- return false;
- if (rc != 0) {
- // regexec can fail due to invalid pattern or running out of memory.
- error = rc;
- return false;
- }
- // There was a match.
- if (Matches) { // match position requested
- Matches->clear();
-
- for (unsigned i = 0; i != nmatch; ++i) {
- if (pm[i].rm_so == -1) {
- // this group didn't match
- Matches->push_back(StringRef());
- continue;
- }
- assert(pm[i].rm_eo >= pm[i].rm_so);
- Matches->push_back(StringRef(String.data()+pm[i].rm_so,
- pm[i].rm_eo-pm[i].rm_so));
- }
- }
- return true;
- }
- std::string Regex::sub(StringRef Repl, StringRef String,
- std::string *Error) {
- SmallVector<StringRef, 8> Matches;
- // Reset error, if given.
- if (Error && !Error->empty()) *Error = "";
- // Return the input if there was no match.
- if (!match(String, &Matches))
- return String;
- // Otherwise splice in the replacement string, starting with the prefix before
- // the match.
- std::string Res(String.begin(), Matches[0].begin());
- // Then the replacement string, honoring possible substitutions.
- while (!Repl.empty()) {
- // Skip to the next escape.
- std::pair<StringRef, StringRef> Split = Repl.split('\\');
- // Add the skipped substring.
- Res += Split.first;
- // Check for terminimation and trailing backslash.
- if (Split.second.empty()) {
- if (Repl.size() != Split.first.size() &&
- Error && Error->empty())
- *Error = "replacement string contained trailing backslash";
- break;
- }
- // Otherwise update the replacement string and interpret escapes.
- Repl = Split.second;
- // FIXME: We should have a StringExtras function for mapping C99 escapes.
- switch (Repl[0]) {
- // Treat all unrecognized characters as self-quoting.
- default:
- Res += Repl[0];
- Repl = Repl.substr(1);
- break;
- // Single character escapes.
- case 't':
- Res += '\t';
- Repl = Repl.substr(1);
- break;
- case 'n':
- Res += '\n';
- Repl = Repl.substr(1);
- break;
- // Decimal escapes are backreferences.
- case '0': case '1': case '2': case '3': case '4':
- case '5': case '6': case '7': case '8': case '9': {
- // Extract the backreference number.
- StringRef Ref = Repl.slice(0, Repl.find_first_not_of("0123456789"));
- Repl = Repl.substr(Ref.size());
- unsigned RefValue;
- if (!Ref.getAsInteger(10, RefValue) &&
- RefValue < Matches.size())
- Res += Matches[RefValue];
- else if (Error && Error->empty())
- *Error = ("invalid backreference string '" + Twine(Ref) + "'").str();
- break;
- }
- }
- }
- // And finally the suffix.
- Res += StringRef(Matches[0].end(), String.end() - Matches[0].end());
- return Res;
- }
- // These are the special characters matched in functions like "p_ere_exp".
- static const char RegexMetachars[] = "()^$|*+?.[]\\{}";
- bool Regex::isLiteralERE(StringRef Str) {
- // Check for regex metacharacters. This list was derived from our regex
- // implementation in regcomp.c and double checked against the POSIX extended
- // regular expression specification.
- return Str.find_first_of(RegexMetachars) == StringRef::npos;
- }
- std::string Regex::escape(StringRef String) {
- std::string RegexStr;
- for (unsigned i = 0, e = String.size(); i != e; ++i) {
- if (strchr(RegexMetachars, String[i]))
- RegexStr += '\\';
- RegexStr += String[i];
- }
- return RegexStr;
- }
|