Jelajahi Sumber

Merge pull request #73973 from dalexeev/fix-regex-sub

RegEx: Fix handling of unset/unknown capture groups
Rémi Verschelde 9 bulan lalu
induk
melakukan
867806954f
3 mengubah file dengan 48 tambahan dan 18 penghapusan
  1. 37 18
      modules/regex/regex.cpp
  2. 2 0
      modules/regex/regex.h
  3. 9 0
      modules/regex/tests/test_regex.h

+ 37 - 18
modules/regex/regex.cpp

@@ -289,25 +289,17 @@ TypedArray<RegExMatch> RegEx::search_all(const String &p_subject, int p_offset,
 	return result;
 	return result;
 }
 }
 
 
-String RegEx::sub(const String &p_subject, const String &p_replacement, bool p_all, int p_offset, int p_end) const {
-	ERR_FAIL_COND_V(!is_valid(), String());
-	ERR_FAIL_COND_V_MSG(p_offset < 0, String(), "RegEx sub offset must be >= 0");
-
-	// safety_zone is the number of chars we allocate in addition to the number of chars expected in order to
-	// guard against the PCRE API writing one additional \0 at the end. PCRE's API docs are unclear on whether
-	// PCRE understands outlength in pcre2_substitute() as counting an implicit additional terminating char or
-	// not. always allocating one char more than telling PCRE has us on the safe side.
+int RegEx::_sub(const String &p_subject, const String &p_replacement, int p_offset, int p_end, uint32_t p_flags, String &r_output) const {
+	// `safety_zone` is the number of chars we allocate in addition to the number of chars expected in order to
+	// guard against the PCRE API writing one additional `\0` at the end. PCRE's API docs are unclear on whether
+	// PCRE understands outlength in `pcre2_substitute()` as counting an implicit additional terminating char or
+	// not. Always allocating one char more than telling PCRE has us on the safe side.
 	const int safety_zone = 1;
 	const int safety_zone = 1;
 
 
-	PCRE2_SIZE olength = p_subject.length() + 1; // space for output string and one terminating \0 character
+	PCRE2_SIZE olength = p_subject.length() + 1; // Space for output string and one terminating `\0` character.
 	Vector<char32_t> output;
 	Vector<char32_t> output;
 	output.resize(olength + safety_zone);
 	output.resize(olength + safety_zone);
 
 
-	uint32_t flags = PCRE2_SUBSTITUTE_OVERFLOW_LENGTH;
-	if (p_all) {
-		flags |= PCRE2_SUBSTITUTE_GLOBAL;
-	}
-
 	PCRE2_SIZE length = p_subject.length();
 	PCRE2_SIZE length = p_subject.length();
 	if (p_end >= 0 && (uint32_t)p_end < length) {
 	if (p_end >= 0 && (uint32_t)p_end < length) {
 		length = p_end;
 		length = p_end;
@@ -322,22 +314,49 @@ String RegEx::sub(const String &p_subject, const String &p_replacement, bool p_a
 
 
 	pcre2_match_data_32 *match = pcre2_match_data_create_from_pattern_32(c, gctx);
 	pcre2_match_data_32 *match = pcre2_match_data_create_from_pattern_32(c, gctx);
 
 
-	int res = pcre2_substitute_32(c, s, length, p_offset, flags, match, mctx, r, p_replacement.length(), o, &olength);
+	int res = pcre2_substitute_32(c, s, length, p_offset, p_flags, match, mctx, r, p_replacement.length(), o, &olength);
 
 
 	if (res == PCRE2_ERROR_NOMEMORY) {
 	if (res == PCRE2_ERROR_NOMEMORY) {
 		output.resize(olength + safety_zone);
 		output.resize(olength + safety_zone);
 		o = (PCRE2_UCHAR32 *)output.ptrw();
 		o = (PCRE2_UCHAR32 *)output.ptrw();
-		res = pcre2_substitute_32(c, s, length, p_offset, flags, match, mctx, r, p_replacement.length(), o, &olength);
+		res = pcre2_substitute_32(c, s, length, p_offset, p_flags, match, mctx, r, p_replacement.length(), o, &olength);
 	}
 	}
 
 
 	pcre2_match_data_free_32(match);
 	pcre2_match_data_free_32(match);
 	pcre2_match_context_free_32(mctx);
 	pcre2_match_context_free_32(mctx);
 
 
+	if (res >= 0) {
+		r_output = String(output.ptr(), olength) + p_subject.substr(length);
+	}
+
+	return res;
+}
+
+String RegEx::sub(const String &p_subject, const String &p_replacement, bool p_all, int p_offset, int p_end) const {
+	ERR_FAIL_COND_V(!is_valid(), String());
+	ERR_FAIL_COND_V_MSG(p_offset < 0, String(), "RegEx sub offset must be >= 0");
+
+	uint32_t flags = PCRE2_SUBSTITUTE_OVERFLOW_LENGTH | PCRE2_SUBSTITUTE_UNSET_EMPTY;
+	if (p_all) {
+		flags |= PCRE2_SUBSTITUTE_GLOBAL;
+	}
+
+	String output;
+	const int res = _sub(p_subject, p_replacement, p_offset, p_end, flags, output);
+
 	if (res < 0) {
 	if (res < 0) {
-		return String();
+		PCRE2_UCHAR32 buf[256];
+		pcre2_get_error_message_32(res, buf, 256);
+		String message = "PCRE2 Error: " + String((const char32_t *)buf);
+		ERR_PRINT(message.utf8());
+
+		if (res == PCRE2_ERROR_NOSUBSTRING) {
+			flags |= PCRE2_SUBSTITUTE_UNKNOWN_UNSET;
+			_sub(p_subject, p_replacement, p_offset, p_end, flags, output);
+		}
 	}
 	}
 
 
-	return String(output.ptr(), olength) + p_subject.substr(length);
+	return output;
 }
 }
 
 
 bool RegEx::is_valid() const {
 bool RegEx::is_valid() const {

+ 2 - 0
modules/regex/regex.h

@@ -78,6 +78,8 @@ class RegEx : public RefCounted {
 
 
 	void _pattern_info(uint32_t what, void *where) const;
 	void _pattern_info(uint32_t what, void *where) const;
 
 
+	int _sub(const String &p_subject, const String &p_replacement, int p_offset, int p_end, uint32_t p_flags, String &r_output) const;
+
 protected:
 protected:
 	static void _bind_methods();
 	static void _bind_methods();
 
 

+ 9 - 0
modules/regex/tests/test_regex.h

@@ -145,6 +145,15 @@ TEST_CASE("[RegEx] Substitution") {
 	CHECK(re5.sub(s5, "cc", true, 0, 2) == "ccccaa");
 	CHECK(re5.sub(s5, "cc", true, 0, 2) == "ccccaa");
 	CHECK(re5.sub(s5, "cc", true, 1, 3) == "acccca");
 	CHECK(re5.sub(s5, "cc", true, 1, 3) == "acccca");
 	CHECK(re5.sub(s5, "", true, 0, 2) == "aa");
 	CHECK(re5.sub(s5, "", true, 0, 2) == "aa");
+
+	const String s6 = "property get_property set_property";
+
+	RegEx re6("(get_|set_)?property");
+	REQUIRE(re6.is_valid());
+	CHECK(re6.sub(s6, "$1new_property", true) == "new_property get_new_property set_new_property");
+	ERR_PRINT_OFF;
+	CHECK(re6.sub(s6, "$5new_property", true) == "new_property new_property new_property");
+	ERR_PRINT_ON;
 }
 }
 
 
 TEST_CASE("[RegEx] Substitution with empty input and/or replacement") {
 TEST_CASE("[RegEx] Substitution with empty input and/or replacement") {