|
@@ -39,22 +39,23 @@ please consult the man page, in case the conversion went wrong.
|
|
<li><a name="TOC24" href="#SEC24">INFORMATION ABOUT A PATTERN'S CALLOUTS</a>
|
|
<li><a name="TOC24" href="#SEC24">INFORMATION ABOUT A PATTERN'S CALLOUTS</a>
|
|
<li><a name="TOC25" href="#SEC25">SERIALIZATION AND PRECOMPILING</a>
|
|
<li><a name="TOC25" href="#SEC25">SERIALIZATION AND PRECOMPILING</a>
|
|
<li><a name="TOC26" href="#SEC26">THE MATCH DATA BLOCK</a>
|
|
<li><a name="TOC26" href="#SEC26">THE MATCH DATA BLOCK</a>
|
|
-<li><a name="TOC27" href="#SEC27">MATCHING A PATTERN: THE TRADITIONAL FUNCTION</a>
|
|
|
|
-<li><a name="TOC28" href="#SEC28">NEWLINE HANDLING WHEN MATCHING</a>
|
|
|
|
-<li><a name="TOC29" href="#SEC29">HOW PCRE2_MATCH() RETURNS A STRING AND CAPTURED SUBSTRINGS</a>
|
|
|
|
-<li><a name="TOC30" href="#SEC30">OTHER INFORMATION ABOUT A MATCH</a>
|
|
|
|
-<li><a name="TOC31" href="#SEC31">ERROR RETURNS FROM <b>pcre2_match()</b></a>
|
|
|
|
-<li><a name="TOC32" href="#SEC32">OBTAINING A TEXTUAL ERROR MESSAGE</a>
|
|
|
|
-<li><a name="TOC33" href="#SEC33">EXTRACTING CAPTURED SUBSTRINGS BY NUMBER</a>
|
|
|
|
-<li><a name="TOC34" href="#SEC34">EXTRACTING A LIST OF ALL CAPTURED SUBSTRINGS</a>
|
|
|
|
-<li><a name="TOC35" href="#SEC35">EXTRACTING CAPTURED SUBSTRINGS BY NAME</a>
|
|
|
|
-<li><a name="TOC36" href="#SEC36">CREATING A NEW STRING WITH SUBSTITUTIONS</a>
|
|
|
|
-<li><a name="TOC37" href="#SEC37">DUPLICATE CAPTURE GROUP NAMES</a>
|
|
|
|
-<li><a name="TOC38" href="#SEC38">FINDING ALL POSSIBLE MATCHES AT ONE POSITION</a>
|
|
|
|
-<li><a name="TOC39" href="#SEC39">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a>
|
|
|
|
-<li><a name="TOC40" href="#SEC40">SEE ALSO</a>
|
|
|
|
-<li><a name="TOC41" href="#SEC41">AUTHOR</a>
|
|
|
|
-<li><a name="TOC42" href="#SEC42">REVISION</a>
|
|
|
|
|
|
+<li><a name="TOC27" href="#SEC27">MEMORY USE FOR MATCH DATA BLOCKS</a>
|
|
|
|
+<li><a name="TOC28" href="#SEC28">MATCHING A PATTERN: THE TRADITIONAL FUNCTION</a>
|
|
|
|
+<li><a name="TOC29" href="#SEC29">NEWLINE HANDLING WHEN MATCHING</a>
|
|
|
|
+<li><a name="TOC30" href="#SEC30">HOW PCRE2_MATCH() RETURNS A STRING AND CAPTURED SUBSTRINGS</a>
|
|
|
|
+<li><a name="TOC31" href="#SEC31">OTHER INFORMATION ABOUT A MATCH</a>
|
|
|
|
+<li><a name="TOC32" href="#SEC32">ERROR RETURNS FROM <b>pcre2_match()</b></a>
|
|
|
|
+<li><a name="TOC33" href="#SEC33">OBTAINING A TEXTUAL ERROR MESSAGE</a>
|
|
|
|
+<li><a name="TOC34" href="#SEC34">EXTRACTING CAPTURED SUBSTRINGS BY NUMBER</a>
|
|
|
|
+<li><a name="TOC35" href="#SEC35">EXTRACTING A LIST OF ALL CAPTURED SUBSTRINGS</a>
|
|
|
|
+<li><a name="TOC36" href="#SEC36">EXTRACTING CAPTURED SUBSTRINGS BY NAME</a>
|
|
|
|
+<li><a name="TOC37" href="#SEC37">CREATING A NEW STRING WITH SUBSTITUTIONS</a>
|
|
|
|
+<li><a name="TOC38" href="#SEC38">DUPLICATE CAPTURE GROUP NAMES</a>
|
|
|
|
+<li><a name="TOC39" href="#SEC39">FINDING ALL POSSIBLE MATCHES AT ONE POSITION</a>
|
|
|
|
+<li><a name="TOC40" href="#SEC40">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a>
|
|
|
|
+<li><a name="TOC41" href="#SEC41">SEE ALSO</a>
|
|
|
|
+<li><a name="TOC42" href="#SEC42">AUTHOR</a>
|
|
|
|
+<li><a name="TOC43" href="#SEC43">REVISION</a>
|
|
</ul>
|
|
</ul>
|
|
<P>
|
|
<P>
|
|
<b>#include <pcre2.h></b>
|
|
<b>#include <pcre2.h></b>
|
|
@@ -103,6 +104,13 @@ document for an overview of all the PCRE2 documentation.
|
|
<b>PCRE2_SPTR pcre2_get_mark(pcre2_match_data *<i>match_data</i>);</b>
|
|
<b>PCRE2_SPTR pcre2_get_mark(pcre2_match_data *<i>match_data</i>);</b>
|
|
<br>
|
|
<br>
|
|
<br>
|
|
<br>
|
|
|
|
+<b>PCRE2_SIZE pcre2_get_match_data_size(pcre2_match_data *<i>match_data</i>);</b>
|
|
|
|
+<br>
|
|
|
|
+<br>
|
|
|
|
+<b>PCRE2_SIZE pcre2_get_match_data_heapframes_size(</b>
|
|
|
|
+<b> pcre2_match_data *<i>match_data</i>);</b>
|
|
|
|
+<br>
|
|
|
|
+<br>
|
|
<b>uint32_t pcre2_get_ovector_count(pcre2_match_data *<i>match_data</i>);</b>
|
|
<b>uint32_t pcre2_get_ovector_count(pcre2_match_data *<i>match_data</i>);</b>
|
|
<br>
|
|
<br>
|
|
<br>
|
|
<br>
|
|
@@ -153,6 +161,10 @@ document for an overview of all the PCRE2 documentation.
|
|
<b> PCRE2_SIZE <i>value</i>);</b>
|
|
<b> PCRE2_SIZE <i>value</i>);</b>
|
|
<br>
|
|
<br>
|
|
<br>
|
|
<br>
|
|
|
|
+<b>int pcre2_set_max_varlookbehind(pcre2_compile_context *<i>ccontext</i>,</b>
|
|
|
|
+<b> uint32_t <i>value</i>);</b>
|
|
|
|
+<br>
|
|
|
|
+<br>
|
|
<b>int pcre2_set_newline(pcre2_compile_context *<i>ccontext</i>,</b>
|
|
<b>int pcre2_set_newline(pcre2_compile_context *<i>ccontext</i>,</b>
|
|
<b> uint32_t <i>value</i>);</b>
|
|
<b> uint32_t <i>value</i>);</b>
|
|
<br>
|
|
<br>
|
|
@@ -241,7 +253,7 @@ document for an overview of all the PCRE2 documentation.
|
|
<b> PCRE2_SPTR <i>name</i>);</b>
|
|
<b> PCRE2_SPTR <i>name</i>);</b>
|
|
<br>
|
|
<br>
|
|
<br>
|
|
<br>
|
|
-<b>void pcre2_substring_list_free(PCRE2_SPTR *<i>list</i>);</b>
|
|
|
|
|
|
+<b>void pcre2_substring_list_free(PCRE2_UCHAR **<i>list</i>);</b>
|
|
<br>
|
|
<br>
|
|
<br>
|
|
<br>
|
|
<b>int pcre2_substring_list_get(pcre2_match_data *<i>match_data</i>,</b>
|
|
<b>int pcre2_substring_list_get(pcre2_match_data *<i>match_data</i>,</b>
|
|
@@ -270,8 +282,8 @@ document for an overview of all the PCRE2 documentation.
|
|
<b>void pcre2_jit_free_unused_memory(pcre2_general_context *<i>gcontext</i>);</b>
|
|
<b>void pcre2_jit_free_unused_memory(pcre2_general_context *<i>gcontext</i>);</b>
|
|
<br>
|
|
<br>
|
|
<br>
|
|
<br>
|
|
-<b>pcre2_jit_stack *pcre2_jit_stack_create(PCRE2_SIZE <i>startsize</i>,</b>
|
|
|
|
-<b> PCRE2_SIZE <i>maxsize</i>, pcre2_general_context *<i>gcontext</i>);</b>
|
|
|
|
|
|
+<b>pcre2_jit_stack *pcre2_jit_stack_create(size_t <i>startsize</i>,</b>
|
|
|
|
+<b> size_t <i>maxsize</i>, pcre2_general_context *<i>gcontext</i>);</b>
|
|
<br>
|
|
<br>
|
|
<br>
|
|
<br>
|
|
<b>void pcre2_jit_stack_assign(pcre2_match_context *<i>mcontext</i>,</b>
|
|
<b>void pcre2_jit_stack_assign(pcre2_match_context *<i>mcontext</i>,</b>
|
|
@@ -335,7 +347,7 @@ document for an overview of all the PCRE2 documentation.
|
|
<br>
|
|
<br>
|
|
<b>int pcre2_set_recursion_memory_management(</b>
|
|
<b>int pcre2_set_recursion_memory_management(</b>
|
|
<b> pcre2_match_context *<i>mcontext</i>,</b>
|
|
<b> pcre2_match_context *<i>mcontext</i>,</b>
|
|
-<b> void *(*<i>private_malloc</i>)(PCRE2_SIZE, void *),</b>
|
|
|
|
|
|
+<b> void *(*<i>private_malloc</i>)(size_t, void *),</b>
|
|
<b> void (*<i>private_free</i>)(void *, void *), void *<i>memory_data</i>);</b>
|
|
<b> void (*<i>private_free</i>)(void *, void *), void *<i>memory_data</i>);</b>
|
|
<br>
|
|
<br>
|
|
<br>
|
|
<br>
|
|
@@ -388,11 +400,8 @@ This contains the function prototypes and other definitions for all three
|
|
libraries. One, two, or all three can be installed simultaneously. On Unix-like
|
|
libraries. One, two, or all three can be installed simultaneously. On Unix-like
|
|
systems the libraries are called <b>libpcre2-8</b>, <b>libpcre2-16</b>, and
|
|
systems the libraries are called <b>libpcre2-8</b>, <b>libpcre2-16</b>, and
|
|
<b>libpcre2-32</b>, and they can also co-exist with the original PCRE libraries.
|
|
<b>libpcre2-32</b>, and they can also co-exist with the original PCRE libraries.
|
|
-</P>
|
|
|
|
-<P>
|
|
|
|
-Character strings are passed to and from a PCRE2 library as a sequence of
|
|
|
|
-unsigned integers in code units of the appropriate width. Every PCRE2 function
|
|
|
|
-comes in three different forms, one for each library, for example:
|
|
|
|
|
|
+Every PCRE2 function comes in three different forms, one for each library, for
|
|
|
|
+example:
|
|
<pre>
|
|
<pre>
|
|
<b>pcre2_compile_8()</b>
|
|
<b>pcre2_compile_8()</b>
|
|
<b>pcre2_compile_16()</b>
|
|
<b>pcre2_compile_16()</b>
|
|
@@ -403,10 +412,16 @@ There are also three different sets of data types:
|
|
<b>PCRE2_UCHAR8, PCRE2_UCHAR16, PCRE2_UCHAR32</b>
|
|
<b>PCRE2_UCHAR8, PCRE2_UCHAR16, PCRE2_UCHAR32</b>
|
|
<b>PCRE2_SPTR8, PCRE2_SPTR16, PCRE2_SPTR32</b>
|
|
<b>PCRE2_SPTR8, PCRE2_SPTR16, PCRE2_SPTR32</b>
|
|
</pre>
|
|
</pre>
|
|
-The UCHAR types define unsigned code units of the appropriate widths. For
|
|
|
|
-example, PCRE2_UCHAR16 is usually defined as `uint16_t'. The SPTR types are
|
|
|
|
-constant pointers to the equivalent UCHAR types, that is, they are pointers to
|
|
|
|
-vectors of unsigned code units.
|
|
|
|
|
|
+The UCHAR types define unsigned code units of the appropriate widths.
|
|
|
|
+For example, PCRE2_UCHAR16 is usually defined as `uint16_t'.
|
|
|
|
+The SPTR types are pointers to constants of the equivalent UCHAR types,
|
|
|
|
+that is, they are pointers to vectors of unsigned code units.
|
|
|
|
+</P>
|
|
|
|
+<P>
|
|
|
|
+Character strings are passed to a PCRE2 library as sequences of unsigned
|
|
|
|
+integers in code units of the appropriate width. The length of a string may
|
|
|
|
+be given as a number of code units, or the string may be specified as
|
|
|
|
+zero-terminated.
|
|
</P>
|
|
</P>
|
|
<P>
|
|
<P>
|
|
Many applications use only one code unit width. For their convenience, macros
|
|
Many applications use only one code unit width. For their convenience, macros
|
|
@@ -446,7 +461,7 @@ names, without the _8, _16, or _32 suffix.
|
|
PCRE2 has its own native API, which is described in this document. There are
|
|
PCRE2 has its own native API, which is described in this document. There are
|
|
also some wrapper functions for the 8-bit library that correspond to the
|
|
also some wrapper functions for the 8-bit library that correspond to the
|
|
POSIX regular expression API, but they do not give access to all the
|
|
POSIX regular expression API, but they do not give access to all the
|
|
-functionality of PCRE2. They are described in the
|
|
|
|
|
|
+functionality of PCRE2 and they are not thread-safe. They are described in the
|
|
<a href="pcre2posix.html"><b>pcre2posix</b></a>
|
|
<a href="pcre2posix.html"><b>pcre2posix</b></a>
|
|
documentation. Both these APIs define a set of C function calls.
|
|
documentation. Both these APIs define a set of C function calls.
|
|
</P>
|
|
</P>
|
|
@@ -559,7 +574,8 @@ unsigned integer type, currently always defined as <i>size_t</i>. The largest
|
|
value that can be stored in such a type (that is ~(PCRE2_SIZE)0) is reserved
|
|
value that can be stored in such a type (that is ~(PCRE2_SIZE)0) is reserved
|
|
as a special indicator for zero-terminated strings and unset offsets.
|
|
as a special indicator for zero-terminated strings and unset offsets.
|
|
Therefore, the longest string that can be handled is one less than this
|
|
Therefore, the longest string that can be handled is one less than this
|
|
-maximum.
|
|
|
|
|
|
+maximum. Note that string lengths are always given in code units. Only in the
|
|
|
|
+8-bit library is such a length the same as the number of bytes in the string.
|
|
<a name="newlines"></a></P>
|
|
<a name="newlines"></a></P>
|
|
<br><a name="SEC16" href="#TOC1">NEWLINES</a><br>
|
|
<br><a name="SEC16" href="#TOC1">NEWLINES</a><br>
|
|
<P>
|
|
<P>
|
|
@@ -858,6 +874,16 @@ external sources can limit their size. The default is the largest number that a
|
|
PCRE2_SIZE variable can hold, which is effectively unlimited.
|
|
PCRE2_SIZE variable can hold, which is effectively unlimited.
|
|
<br>
|
|
<br>
|
|
<br>
|
|
<br>
|
|
|
|
+<b>int pcre2_set_max_varlookbehind(pcre2_compile_context *<i>ccontext</i>,</b>
|
|
|
|
+<b> uint32_t <i>value</i>);</b>
|
|
|
|
+<br>
|
|
|
|
+<br>
|
|
|
|
+This sets a maximum length for the number of characters matched by a
|
|
|
|
+variable-length lookbehind assertion. The default is set when PCRE2 is built,
|
|
|
|
+with the ultimate default being 255, the same as Perl. Lookbehind assertions
|
|
|
|
+without a bounding length are not supported.
|
|
|
|
+<br>
|
|
|
|
+<br>
|
|
<b>int pcre2_set_newline(pcre2_compile_context *<i>ccontext</i>,</b>
|
|
<b>int pcre2_set_newline(pcre2_compile_context *<i>ccontext</i>,</b>
|
|
<b> uint32_t <i>value</i>);</b>
|
|
<b> uint32_t <i>value</i>);</b>
|
|
<br>
|
|
<br>
|
|
@@ -1017,7 +1043,7 @@ has its own memory control arrangements (see the
|
|
documentation for more details). If the limit is reached, the negative error
|
|
documentation for more details). If the limit is reached, the negative error
|
|
code PCRE2_ERROR_HEAPLIMIT is returned. The default limit can be set when PCRE2
|
|
code PCRE2_ERROR_HEAPLIMIT is returned. The default limit can be set when PCRE2
|
|
is built; if it is not, the default is set very large and is essentially
|
|
is built; if it is not, the default is set very large and is essentially
|
|
-"unlimited".
|
|
|
|
|
|
+unlimited.
|
|
</P>
|
|
</P>
|
|
<P>
|
|
<P>
|
|
A value for the heap limit may also be supplied by an item at the start of a
|
|
A value for the heap limit may also be supplied by an item at the start of a
|
|
@@ -1030,19 +1056,17 @@ less than the limit set by the caller of <b>pcre2_match()</b> or, if no such
|
|
limit is set, less than the default.
|
|
limit is set, less than the default.
|
|
</P>
|
|
</P>
|
|
<P>
|
|
<P>
|
|
-The <b>pcre2_match()</b> function starts out using a 20KiB vector on the system
|
|
|
|
-stack for recording backtracking points. The more nested backtracking points
|
|
|
|
-there are (that is, the deeper the search tree), the more memory is needed.
|
|
|
|
-Heap memory is used only if the initial vector is too small. If the heap limit
|
|
|
|
-is set to a value less than 21 (in particular, zero) no heap memory will be
|
|
|
|
-used. In this case, only patterns that do not have a lot of nested backtracking
|
|
|
|
-can be successfully processed.
|
|
|
|
|
|
+The <b>pcre2_match()</b> function always needs some heap memory, so setting a
|
|
|
|
+value of zero guarantees a "heap limit exceeded" error. Details of how
|
|
|
|
+<b>pcre2_match()</b> uses the heap are given in the
|
|
|
|
+<a href="pcre2perform.html"><b>pcre2perform</b></a>
|
|
|
|
+documentation.
|
|
</P>
|
|
</P>
|
|
<P>
|
|
<P>
|
|
-Similarly, for <b>pcre2_dfa_match()</b>, a vector on the system stack is used
|
|
|
|
-when processing pattern recursions, lookarounds, or atomic groups, and only if
|
|
|
|
-this is not big enough is heap memory used. In this case, too, setting a value
|
|
|
|
-of zero disables the use of the heap.
|
|
|
|
|
|
+For <b>pcre2_dfa_match()</b>, a vector on the system stack is used when
|
|
|
|
+processing pattern recursions, lookarounds, or atomic groups, and only if this
|
|
|
|
+is not big enough is heap memory used. In this case, setting a value of zero
|
|
|
|
+disables the use of the heap.
|
|
<br>
|
|
<br>
|
|
<br>
|
|
<br>
|
|
<b>int pcre2_set_match_limit(pcre2_match_context *<i>mcontext</i>,</b>
|
|
<b>int pcre2_set_match_limit(pcre2_match_context *<i>mcontext</i>,</b>
|
|
@@ -1072,10 +1096,9 @@ is also used in this case (but in a different way) to limit how long the
|
|
matching can continue.
|
|
matching can continue.
|
|
</P>
|
|
</P>
|
|
<P>
|
|
<P>
|
|
-The default value for the limit can be set when PCRE2 is built; the default
|
|
|
|
-default is 10 million, which handles all but the most extreme cases. A value
|
|
|
|
-for the match limit may also be supplied by an item at the start of a pattern
|
|
|
|
-of the form
|
|
|
|
|
|
+The default value for the limit can be set when PCRE2 is built; the default is
|
|
|
|
+10 million, which handles all but the most extreme cases. A value for the match
|
|
|
|
+limit may also be supplied by an item at the start of a pattern of the form
|
|
<pre>
|
|
<pre>
|
|
(*LIMIT_MATCH=ddd)
|
|
(*LIMIT_MATCH=ddd)
|
|
</pre>
|
|
</pre>
|
|
@@ -1089,10 +1112,10 @@ less than the limit set by the caller of <b>pcre2_match()</b> or
|
|
<br>
|
|
<br>
|
|
<br>
|
|
<br>
|
|
This parameter limits the depth of nested backtracking in <b>pcre2_match()</b>.
|
|
This parameter limits the depth of nested backtracking in <b>pcre2_match()</b>.
|
|
-Each time a nested backtracking point is passed, a new memory "frame" is used
|
|
|
|
|
|
+Each time a nested backtracking point is passed, a new memory frame is used
|
|
to remember the state of matching at that point. Thus, this parameter
|
|
to remember the state of matching at that point. Thus, this parameter
|
|
indirectly limits the amount of memory that is used in a match. However,
|
|
indirectly limits the amount of memory that is used in a match. However,
|
|
-because the size of each memory "frame" depends on the number of capturing
|
|
|
|
|
|
+because the size of each memory frame depends on the number of capturing
|
|
parentheses, the actual memory limit varies from pattern to pattern. This limit
|
|
parentheses, the actual memory limit varies from pattern to pattern. This limit
|
|
was more useful in versions before 10.30, where function recursion was used for
|
|
was more useful in versions before 10.30, where function recursion was used for
|
|
backtracking.
|
|
backtracking.
|
|
@@ -1187,7 +1210,11 @@ for the amount of heap memory used by <b>pcre2_match()</b> or
|
|
PCRE2_CONFIG_JIT
|
|
PCRE2_CONFIG_JIT
|
|
</pre>
|
|
</pre>
|
|
The output is a uint32_t integer that is set to one if support for just-in-time
|
|
The output is a uint32_t integer that is set to one if support for just-in-time
|
|
-compiling is available; otherwise it is set to zero.
|
|
|
|
|
|
+compiling is included in the library; otherwise it is set to zero. Note that
|
|
|
|
+having the support in the library does not guarantee that JIT will be used for
|
|
|
|
+any given match. See the
|
|
|
|
+<a href="pcre2jit.html"><b>pcre2jit</b></a>
|
|
|
|
+documentation for more details.
|
|
<pre>
|
|
<pre>
|
|
PCRE2_CONFIG_JITTARGET
|
|
PCRE2_CONFIG_JITTARGET
|
|
</pre>
|
|
</pre>
|
|
@@ -1304,10 +1331,12 @@ zero.
|
|
</P>
|
|
</P>
|
|
<P>
|
|
<P>
|
|
The <b>pcre2_compile()</b> function compiles a pattern into an internal form.
|
|
The <b>pcre2_compile()</b> function compiles a pattern into an internal form.
|
|
-The pattern is defined by a pointer to a string of code units and a length (in
|
|
|
|
-code units). If the pattern is zero-terminated, the length can be specified as
|
|
|
|
-PCRE2_ZERO_TERMINATED. The function returns a pointer to a block of memory that
|
|
|
|
-contains the compiled pattern and related data, or NULL if an error occurred.
|
|
|
|
|
|
+The pattern is defined by a pointer to a string of code units and a length in
|
|
|
|
+code units. If the pattern is zero-terminated, the length can be specified as
|
|
|
|
+PCRE2_ZERO_TERMINATED. A NULL pattern pointer with a length of zero is treated
|
|
|
|
+as an empty string (NULL with a non-zero length causes an error return). The
|
|
|
|
+function returns a pointer to a block of memory that contains the compiled
|
|
|
|
+pattern and related data, or NULL if an error occurred.
|
|
</P>
|
|
</P>
|
|
<P>
|
|
<P>
|
|
If the compile context argument <i>ccontext</i> is NULL, memory for the compiled
|
|
If the compile context argument <i>ccontext</i> is NULL, memory for the compiled
|
|
@@ -1383,8 +1412,7 @@ If <i>errorcode</i> or <i>erroroffset</i> is NULL, <b>pcre2_compile()</b> return
|
|
NULL immediately. Otherwise, the variables to which these point are set to an
|
|
NULL immediately. Otherwise, the variables to which these point are set to an
|
|
error code and an offset (number of code units) within the pattern,
|
|
error code and an offset (number of code units) within the pattern,
|
|
respectively, when <b>pcre2_compile()</b> returns NULL because a compilation
|
|
respectively, when <b>pcre2_compile()</b> returns NULL because a compilation
|
|
-error has occurred. The values are not defined when compilation is successful
|
|
|
|
-and <b>pcre2_compile()</b> returns a non-NULL value.
|
|
|
|
|
|
+error has occurred.
|
|
</P>
|
|
</P>
|
|
<P>
|
|
<P>
|
|
There are nearly 100 positive error codes that <b>pcre2_compile()</b> may return
|
|
There are nearly 100 positive error codes that <b>pcre2_compile()</b> may return
|
|
@@ -1399,15 +1427,18 @@ because the textual error messages that are obtained by calling the
|
|
message"
|
|
message"
|
|
<a href="#geterrormessage">below)</a>
|
|
<a href="#geterrormessage">below)</a>
|
|
should be self-explanatory. Macro names starting with PCRE2_ERROR_ are defined
|
|
should be self-explanatory. Macro names starting with PCRE2_ERROR_ are defined
|
|
-for both positive and negative error codes in <b>pcre2.h</b>.
|
|
|
|
|
|
+for both positive and negative error codes in <b>pcre2.h</b>. When compilation
|
|
|
|
+is successful <i>errorcode</i> is set to a value that returns the message "no
|
|
|
|
+error" if passed to <b>pcre2_get_error_message()</b>.
|
|
</P>
|
|
</P>
|
|
<P>
|
|
<P>
|
|
The value returned in <i>erroroffset</i> is an indication of where in the
|
|
The value returned in <i>erroroffset</i> is an indication of where in the
|
|
-pattern the error occurred. It is not necessarily the furthest point in the
|
|
|
|
-pattern that was read. For example, after the error "lookbehind assertion is
|
|
|
|
-not fixed length", the error offset points to the start of the failing
|
|
|
|
-assertion. For an invalid UTF-8 or UTF-16 string, the offset is that of the
|
|
|
|
-first code unit of the failing character.
|
|
|
|
|
|
+pattern an error occurred. When there is no error, zero is returned. A non-zero
|
|
|
|
+value is not necessarily the furthest point in the pattern that was read. For
|
|
|
|
+example, after the error "lookbehind assertion is not fixed length", the error
|
|
|
|
+offset points to the start of the failing assertion. For an invalid UTF-8 or
|
|
|
|
+UTF-16 string, the offset is that of the first code unit of the failing
|
|
|
|
+character.
|
|
</P>
|
|
</P>
|
|
<P>
|
|
<P>
|
|
Some errors are not detected until the whole pattern has been scanned; in these
|
|
Some errors are not detected until the whole pattern has been scanned; in these
|
|
@@ -1524,11 +1555,14 @@ PCRE2_UCP is set, Unicode properties are used for all characters with more than
|
|
one other case, and for all characters whose code points are greater than
|
|
one other case, and for all characters whose code points are greater than
|
|
U+007F. Note that there are two ASCII characters, K and S, that, in addition to
|
|
U+007F. Note that there are two ASCII characters, K and S, that, in addition to
|
|
their lower case ASCII equivalents, are case-equivalent with U+212A (Kelvin
|
|
their lower case ASCII equivalents, are case-equivalent with U+212A (Kelvin
|
|
-sign) and U+017F (long S) respectively. For lower valued characters with only
|
|
|
|
-one other case, a lookup table is used for speed. When neither PCRE2_UTF nor
|
|
|
|
-PCRE2_UCP is set, a lookup table is used for all code points less than 256, and
|
|
|
|
-higher code points (available only in 16-bit or 32-bit mode) are treated as not
|
|
|
|
-having another case.
|
|
|
|
|
|
+sign) and U+017F (long S) respectively. If you do not want this case
|
|
|
|
+equivalence, you can suppress it by setting PCRE2_EXTRA_CASELESS_RESTRICT.
|
|
|
|
+</P>
|
|
|
|
+<P>
|
|
|
|
+For lower valued characters with only one other case, a lookup table is used
|
|
|
|
+for speed. When neither PCRE2_UTF nor PCRE2_UCP is set, a lookup table is used
|
|
|
|
+for all code points less than 256, and higher code points (available only in
|
|
|
|
+16-bit or 32-bit mode) are treated as not having another case.
|
|
<pre>
|
|
<pre>
|
|
PCRE2_DOLLAR_ENDONLY
|
|
PCRE2_DOLLAR_ENDONLY
|
|
</pre>
|
|
</pre>
|
|
@@ -1586,13 +1620,13 @@ the end of the subject.
|
|
PCRE2_EXTENDED
|
|
PCRE2_EXTENDED
|
|
</pre>
|
|
</pre>
|
|
If this bit is set, most white space characters in the pattern are totally
|
|
If this bit is set, most white space characters in the pattern are totally
|
|
-ignored except when escaped or inside a character class. However, white space
|
|
|
|
-is not allowed within sequences such as (?> that introduce various
|
|
|
|
-parenthesized groups, nor within numerical quantifiers such as {1,3}. Ignorable
|
|
|
|
-white space is permitted between an item and a following quantifier and between
|
|
|
|
-a quantifier and a following + that indicates possessiveness. PCRE2_EXTENDED is
|
|
|
|
-equivalent to Perl's /x option, and it can be changed within a pattern by a
|
|
|
|
-(?x) option setting.
|
|
|
|
|
|
+ignored except when escaped, inside a character class, or inside a \Q...\E
|
|
|
|
+sequence. However, white space is not allowed within sequences such as (?> that
|
|
|
|
+introduce various parenthesized groups, nor within numerical quantifiers such
|
|
|
|
+as {1,3}. Ignorable white space is permitted between an item and a following
|
|
|
|
+quantifier and between a quantifier and a following + that indicates
|
|
|
|
+possessiveness. PCRE2_EXTENDED is equivalent to Perl's /x option, and it can be
|
|
|
|
+changed within a pattern by a (?x) option setting.
|
|
</P>
|
|
</P>
|
|
<P>
|
|
<P>
|
|
When PCRE2 is compiled without Unicode support, PCRE2_EXTENDED recognizes as
|
|
When PCRE2 is compiled without Unicode support, PCRE2_EXTENDED recognizes as
|
|
@@ -1651,7 +1685,7 @@ PCRE2_FIRSTLINE if <i>startoffset</i> is greater than 3. See also
|
|
PCRE2_USE_OFFSET_LIMIT, which provides a more general limiting facility. If
|
|
PCRE2_USE_OFFSET_LIMIT, which provides a more general limiting facility. If
|
|
PCRE2_FIRSTLINE is set with an offset limit, a match must occur in the first
|
|
PCRE2_FIRSTLINE is set with an offset limit, a match must occur in the first
|
|
line and also within the offset limit. In other words, whichever limit comes
|
|
line and also within the offset limit. In other words, whichever limit comes
|
|
-first is used.
|
|
|
|
|
|
+first is used. This option has no effect for anchored patterns.
|
|
<pre>
|
|
<pre>
|
|
PCRE2_LITERAL
|
|
PCRE2_LITERAL
|
|
</pre>
|
|
</pre>
|
|
@@ -1670,7 +1704,11 @@ PCRE2_EXTRA_MATCH_WORD are also supported. Any other options cause an error.
|
|
</pre>
|
|
</pre>
|
|
This option forces PCRE2_UTF (see below) and also enables support for matching
|
|
This option forces PCRE2_UTF (see below) and also enables support for matching
|
|
by <b>pcre2_match()</b> in subject strings that contain invalid UTF sequences.
|
|
by <b>pcre2_match()</b> in subject strings that contain invalid UTF sequences.
|
|
-This facility is not supported for DFA matching. For details, see the
|
|
|
|
|
|
+Note, however, that the 16-bit and 32-bit PCRE2 libraries process strings as
|
|
|
|
+sequences of uint16_t or uint32_t code points. They cannot find valid UTF
|
|
|
|
+sequences within an arbitrary string of bytes unless such sequences are
|
|
|
|
+suitably aligned. This facility is not supported for DFA matching. For details,
|
|
|
|
+see the
|
|
<a href="pcre2unicode.html"><b>pcre2unicode</b></a>
|
|
<a href="pcre2unicode.html"><b>pcre2unicode</b></a>
|
|
documentation.
|
|
documentation.
|
|
<pre>
|
|
<pre>
|
|
@@ -1845,7 +1883,7 @@ undefined. It may cause your program to crash or loop.
|
|
</P>
|
|
</P>
|
|
<P>
|
|
<P>
|
|
Note that this option can also be passed to <b>pcre2_match()</b> and
|
|
Note that this option can also be passed to <b>pcre2_match()</b> and
|
|
-<b>pcre_dfa_match()</b>, to suppress UTF validity checking of the subject
|
|
|
|
|
|
+<b>pcre2_dfa_match()</b>, to suppress UTF validity checking of the subject
|
|
string.
|
|
string.
|
|
</P>
|
|
</P>
|
|
<P>
|
|
<P>
|
|
@@ -1864,20 +1902,22 @@ are not representable in UTF-16.
|
|
This option has two effects. Firstly, it changes the way PCRE2 processes \B,
|
|
This option has two effects. Firstly, it changes the way PCRE2 processes \B,
|
|
\b, \D, \d, \S, \s, \W, \w, and some of the POSIX character classes. By
|
|
\b, \D, \d, \S, \s, \W, \w, and some of the POSIX character classes. By
|
|
default, only ASCII characters are recognized, but if PCRE2_UCP is set, Unicode
|
|
default, only ASCII characters are recognized, but if PCRE2_UCP is set, Unicode
|
|
-properties are used instead to classify characters. More details are given in
|
|
|
|
-the section on
|
|
|
|
|
|
+properties are used to classify characters. There are some PCRE2_EXTRA
|
|
|
|
+options (see below) that add finer control to this behaviour. More details are
|
|
|
|
+given in the section on
|
|
<a href="pcre2pattern.html#genericchartypes">generic character types</a>
|
|
<a href="pcre2pattern.html#genericchartypes">generic character types</a>
|
|
in the
|
|
in the
|
|
<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
|
|
<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
|
|
-page. If you set PCRE2_UCP, matching one of the items it affects takes much
|
|
|
|
-longer.
|
|
|
|
|
|
+page.
|
|
</P>
|
|
</P>
|
|
<P>
|
|
<P>
|
|
The second effect of PCRE2_UCP is to force the use of Unicode properties for
|
|
The second effect of PCRE2_UCP is to force the use of Unicode properties for
|
|
-upper/lower casing operations on characters with code points greater than 127,
|
|
|
|
-even when PCRE2_UTF is not set. This makes it possible, for example, to process
|
|
|
|
-strings in the 16-bit UCS-2 code. This option is available only if PCRE2 has
|
|
|
|
-been compiled with Unicode support (which is the default).
|
|
|
|
|
|
+upper/lower casing operations, even when PCRE2_UTF is not set. This makes it
|
|
|
|
+possible to process strings in the 16-bit UCS-2 code. This option is available
|
|
|
|
+only if PCRE2 has been compiled with Unicode support (which is the default).
|
|
|
|
+The PCRE2_EXTRA_CASELESS_RESTRICT option (see below) restricts caseless
|
|
|
|
+matching such that ASCII characters match only ASCII characters and non-ASCII
|
|
|
|
+characters match only non-ASCII characters.
|
|
<pre>
|
|
<pre>
|
|
PCRE2_UNGREEDY
|
|
PCRE2_UNGREEDY
|
|
</pre>
|
|
</pre>
|
|
@@ -1905,8 +1945,7 @@ Unicode support (which is the default). If Unicode support is not available,
|
|
the use of this option provokes an error. Details of how PCRE2_UTF changes the
|
|
the use of this option provokes an error. Details of how PCRE2_UTF changes the
|
|
behaviour of PCRE2 are given in the
|
|
behaviour of PCRE2 are given in the
|
|
<a href="pcre2unicode.html"><b>pcre2unicode</b></a>
|
|
<a href="pcre2unicode.html"><b>pcre2unicode</b></a>
|
|
-page. In particular, note that it changes the way PCRE2_CASELESS handles
|
|
|
|
-characters with code points greater than 127.
|
|
|
|
|
|
+page. In particular, note that it changes the way PCRE2_CASELESS works.
|
|
<a name="extracompileoptions"></a></P>
|
|
<a name="extracompileoptions"></a></P>
|
|
<br><b>
|
|
<br><b>
|
|
Extra compile options
|
|
Extra compile options
|
|
@@ -1953,6 +1992,37 @@ the way that ECMAscript (aka JavaScript) does. Additional functionality was
|
|
defined by ECMAscript 6; setting PCRE2_EXTRA_ALT_BSUX has the effect of
|
|
defined by ECMAscript 6; setting PCRE2_EXTRA_ALT_BSUX has the effect of
|
|
PCRE2_ALT_BSUX, but in addition it recognizes \u{hhh..} as a hexadecimal
|
|
PCRE2_ALT_BSUX, but in addition it recognizes \u{hhh..} as a hexadecimal
|
|
character code, where hhh.. is any number of hexadecimal digits.
|
|
character code, where hhh.. is any number of hexadecimal digits.
|
|
|
|
+<pre>
|
|
|
|
+ PCRE2_EXTRA_ASCII_BSD
|
|
|
|
+</pre>
|
|
|
|
+This option forces \d to match only ASCII digits, even when PCRE2_UCP is set.
|
|
|
|
+It can be changed within a pattern by means of the (?aD) option setting.
|
|
|
|
+<pre>
|
|
|
|
+ PCRE2_EXTRA_ASCII_BSS
|
|
|
|
+</pre>
|
|
|
|
+This option forces \s to match only ASCII space characters, even when
|
|
|
|
+PCRE2_UCP is set. It can be changed within a pattern by means of the (?aS)
|
|
|
|
+option setting.
|
|
|
|
+<pre>
|
|
|
|
+ PCRE2_EXTRA_ASCII_BSW
|
|
|
|
+</pre>
|
|
|
|
+This option forces \w to match only ASCII word characters, even when PCRE2_UCP
|
|
|
|
+is set. It can be changed within a pattern by means of the (?aW) option
|
|
|
|
+setting.
|
|
|
|
+<pre>
|
|
|
|
+ PCRE2_EXTRA_ASCII_DIGIT
|
|
|
|
+</pre>
|
|
|
|
+This option forces the POSIX character classes [:digit:] and [:xdigit:] to
|
|
|
|
+match only ASCII digits, even when PCRE2_UCP is set. It can be changed within
|
|
|
|
+a pattern by means of the (?aT) option setting.
|
|
|
|
+<pre>
|
|
|
|
+ PCRE2_EXTRA_ASCII_POSIX
|
|
|
|
+</pre>
|
|
|
|
+This option forces all the POSIX character classes, including [:digit:] and
|
|
|
|
+[:xdigit:], to match only ASCII characters, even when PCRE2_UCP is set. It can
|
|
|
|
+be changed within a pattern by means of the (?aP) option setting, but note that
|
|
|
|
+this also sets PCRE2_EXTRA_ASCII_DIGIT in order to ensure that (?-aP) unsets
|
|
|
|
+all ASCII restrictions for POSIX classes.
|
|
<pre>
|
|
<pre>
|
|
PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL
|
|
PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL
|
|
</pre>
|
|
</pre>
|
|
@@ -1974,6 +2044,17 @@ that a sequence such as [\N{] is interpreted as a malformed attempt at
|
|
[\N{...}] and so is treated as [N{] whereas [\N] gives an error because an
|
|
[\N{...}] and so is treated as [N{] whereas [\N] gives an error because an
|
|
unqualified \N is a valid escape sequence but is not supported in a character
|
|
unqualified \N is a valid escape sequence but is not supported in a character
|
|
class. To reiterate: this is a dangerous option. Use with great care.
|
|
class. To reiterate: this is a dangerous option. Use with great care.
|
|
|
|
+<pre>
|
|
|
|
+ PCRE2_EXTRA_CASELESS_RESTRICT
|
|
|
|
+</pre>
|
|
|
|
+When either PCRE2_UCP or PCRE2_UTF is set, caseless matching follows Unicode
|
|
|
|
+rules, which allow for more than two cases per character. There are two
|
|
|
|
+case-equivalent character sets that contain both ASCII and non-ASCII
|
|
|
|
+characters. The ASCII letter S is case-equivalent to U+017f (long S) and the
|
|
|
|
+ASCII letter K is case-equivalent to U+212a (Kelvin sign). This option disables
|
|
|
|
+recognition of case-equivalences that cross the ASCII/non-ASCII boundary. In a
|
|
|
|
+caseless match, the two characters must be both ASCII or both non-ASCII. The option
|
|
|
|
+can be changed within a pattern by the (?r) option setting.
|
|
<pre>
|
|
<pre>
|
|
PCRE2_EXTRA_ESCAPED_CR_IS_LF
|
|
PCRE2_EXTRA_ESCAPED_CR_IS_LF
|
|
</pre>
|
|
</pre>
|
|
@@ -2015,8 +2096,8 @@ also set.
|
|
<b>void pcre2_jit_free_unused_memory(pcre2_general_context *<i>gcontext</i>);</b>
|
|
<b>void pcre2_jit_free_unused_memory(pcre2_general_context *<i>gcontext</i>);</b>
|
|
<br>
|
|
<br>
|
|
<br>
|
|
<br>
|
|
-<b>pcre2_jit_stack *pcre2_jit_stack_create(PCRE2_SIZE <i>startsize</i>,</b>
|
|
|
|
-<b> PCRE2_SIZE <i>maxsize</i>, pcre2_general_context *<i>gcontext</i>);</b>
|
|
|
|
|
|
+<b>pcre2_jit_stack *pcre2_jit_stack_create(size_t <i>startsize</i>,</b>
|
|
|
|
+<b> size_t <i>maxsize</i>, pcre2_general_context *<i>gcontext</i>);</b>
|
|
<br>
|
|
<br>
|
|
<br>
|
|
<br>
|
|
<b>void pcre2_jit_stack_assign(pcre2_match_context *<i>mcontext</i>,</b>
|
|
<b>void pcre2_jit_stack_assign(pcre2_match_context *<i>mcontext</i>,</b>
|
|
@@ -2055,13 +2136,14 @@ point. However, this applies only to characters whose code points are less than
|
|
\d.
|
|
\d.
|
|
</P>
|
|
</P>
|
|
<P>
|
|
<P>
|
|
-When PCRE2 is built with Unicode support (the default), the Unicode properties
|
|
|
|
-of all characters can be tested with \p and \P, or, alternatively, the
|
|
|
|
|
|
+When PCRE2 is built with Unicode support (the default), certain Unicode
|
|
|
|
+character properties can be tested with \p and \P, or, alternatively, the
|
|
PCRE2_UCP option can be set when a pattern is compiled; this causes \w and
|
|
PCRE2_UCP option can be set when a pattern is compiled; this causes \w and
|
|
friends to use Unicode property support instead of the built-in tables.
|
|
friends to use Unicode property support instead of the built-in tables.
|
|
PCRE2_UCP also causes upper/lower casing operations on characters with code
|
|
PCRE2_UCP also causes upper/lower casing operations on characters with code
|
|
points greater than 127 to use Unicode properties. These effects apply even
|
|
points greater than 127 to use Unicode properties. These effects apply even
|
|
-when PCRE2_UTF is not set.
|
|
|
|
|
|
+when PCRE2_UTF is not set. There are, however, some PCRE2_EXTRA options (see
|
|
|
|
+above) that can be used to modify or suppress them.
|
|
</P>
|
|
</P>
|
|
<P>
|
|
<P>
|
|
The use of locales with Unicode is discouraged. If you are handling characters
|
|
The use of locales with Unicode is discouraged. If you are handling characters
|
|
@@ -2316,7 +2398,7 @@ return zero. The third argument should point to a <b>size_t</b> variable.
|
|
PCRE2_INFO_LASTCODETYPE
|
|
PCRE2_INFO_LASTCODETYPE
|
|
</pre>
|
|
</pre>
|
|
Returns 1 if there is a rightmost literal code unit that must exist in any
|
|
Returns 1 if there is a rightmost literal code unit that must exist in any
|
|
-matched string, other than at its start. The third argument should point to a
|
|
|
|
|
|
+matched string, other than at its start. The third argument should point to a
|
|
<b>uint32_t</b> variable. If there is no such value, 0 is returned. When 1 is
|
|
<b>uint32_t</b> variable. If there is no such value, 0 is returned. When 1 is
|
|
returned, the code unit value itself can be retrieved using
|
|
returned, the code unit value itself can be retrieved using
|
|
PCRE2_INFO_LASTCODEUNIT. For anchored patterns, a last literal value is
|
|
PCRE2_INFO_LASTCODEUNIT. For anchored patterns, a last literal value is
|
|
@@ -2543,7 +2625,9 @@ large enough to hold as many as are expected.
|
|
A minimum of at least 1 pair is imposed by <b>pcre2_match_data_create()</b>, so
|
|
A minimum of at least 1 pair is imposed by <b>pcre2_match_data_create()</b>, so
|
|
it is always possible to return the overall matched string in the case of
|
|
it is always possible to return the overall matched string in the case of
|
|
<b>pcre2_match()</b> or the longest match in the case of
|
|
<b>pcre2_match()</b> or the longest match in the case of
|
|
-<b>pcre2_dfa_match()</b>.
|
|
|
|
|
|
+<b>pcre2_dfa_match()</b>. The maximum number of pairs is 65535; if the first
|
|
|
|
+argument of <b>pcre2_match_data_create()</b> is greater than this, 65535 is
|
|
|
|
+used.
|
|
</P>
|
|
</P>
|
|
<P>
|
|
<P>
|
|
The second argument of <b>pcre2_match_data_create()</b> is a pointer to a
|
|
The second argument of <b>pcre2_match_data_create()</b> is a pointer to a
|
|
@@ -2591,7 +2675,44 @@ When a match data block itself is no longer needed, it should be freed by
|
|
calling <b>pcre2_match_data_free()</b>. If this function is called with a NULL
|
|
calling <b>pcre2_match_data_free()</b>. If this function is called with a NULL
|
|
argument, it returns immediately, without doing anything.
|
|
argument, it returns immediately, without doing anything.
|
|
</P>
|
|
</P>
|
|
-<br><a name="SEC27" href="#TOC1">MATCHING A PATTERN: THE TRADITIONAL FUNCTION</a><br>
|
|
|
|
|
|
+<br><a name="SEC27" href="#TOC1">MEMORY USE FOR MATCH DATA BLOCKS</a><br>
|
|
|
|
+<P>
|
|
|
|
+<b>PCRE2_SIZE pcre2_get_match_data_size(pcre2_match_data *<i>match_data</i>);</b>
|
|
|
|
+<br>
|
|
|
|
+<br>
|
|
|
|
+<b>PCRE2_SIZE pcre2_get_match_data_heapframes_size(</b>
|
|
|
|
+<b> pcre2_match_data *<i>match_data</i>);</b>
|
|
|
|
+</P>
|
|
|
|
+<P>
|
|
|
|
+The size of a match data block depends on the size of the ovector that it
|
|
|
|
+contains. The function <b>pcre2_get_match_data_size()</b> returns the size, in
|
|
|
|
+bytes, of the block that is its argument.
|
|
|
|
+</P>
|
|
|
|
+<P>
|
|
|
|
+When <b>pcre2_match()</b> runs interpretively (that is, without using JIT), it
|
|
|
|
+makes use of a vector of data frames for remembering backtracking positions.
|
|
|
|
+The size of each individual frame depends on the number of capturing
|
|
|
|
+parentheses in the pattern and can be obtained by calling
|
|
|
|
+<b>pcre2_pattern_info()</b> with the PCRE2_INFO_FRAMESIZE option (see the
|
|
|
|
+section entitled "Information about a compiled pattern"
|
|
|
|
+<a href="#infoaboutpattern">above).</a>
|
|
|
|
+</P>
|
|
|
|
+<P>
|
|
|
|
+Heap memory is used for the frames vector; if the initial memory block turns
|
|
|
|
+out to be too small during matching, it is automatically expanded. When
|
|
|
|
+<b>pcre2_match()</b> returns, the memory is not freed, but remains attached to
|
|
|
|
+the match data block, for use by any subsequent matches that use the same
|
|
|
|
+block. It is automatically freed when the match data block itself is freed.
|
|
|
|
+</P>
|
|
|
|
+<P>
|
|
|
|
+You can find the current size of the frames vector that a match data block owns
|
|
|
|
+by calling <b>pcre2_get_match_data_heapframes_size()</b>. For a newly created
|
|
|
|
+match data block the size will be zero. Some types of match may require a lot
|
|
|
|
+of frames and thus a large vector; applications that run in environments where
|
|
|
|
+memory is constrained can check this and free the match data block if the heap
|
|
|
|
+frames vector has become too big.
|
|
|
|
+</P>
|
|
|
|
+<br><a name="SEC28" href="#TOC1">MATCHING A PATTERN: THE TRADITIONAL FUNCTION</a><br>
|
|
<P>
|
|
<P>
|
|
<b>int pcre2_match(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b>
|
|
<b>int pcre2_match(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b>
|
|
<b> PCRE2_SIZE <i>length</i>, PCRE2_SIZE <i>startoffset</i>,</b>
|
|
<b> PCRE2_SIZE <i>length</i>, PCRE2_SIZE <i>startoffset</i>,</b>
|
|
@@ -2640,7 +2761,9 @@ The subject string is passed to <b>pcre2_match()</b> as a pointer in
|
|
<i>startoffset</i>. The length and offset are in code units, not characters.
|
|
<i>startoffset</i>. The length and offset are in code units, not characters.
|
|
That is, they are in bytes for the 8-bit library, 16-bit code units for the
|
|
That is, they are in bytes for the 8-bit library, 16-bit code units for the
|
|
16-bit library, and 32-bit code units for the 32-bit library, whether or not
|
|
16-bit library, and 32-bit code units for the 32-bit library, whether or not
|
|
-UTF processing is enabled.
|
|
|
|
|
|
+UTF processing is enabled. As a special case, if <i>subject</i> is NULL and
|
|
|
|
+<i>length</i> is zero, the subject is assumed to be an empty string. If
|
|
|
|
+<i>length</i> is non-zero, an error occurs if <i>subject</i> is NULL.
|
|
</P>
|
|
</P>
|
|
<P>
|
|
<P>
|
|
If <i>startoffset</i> is greater than the length of the subject,
|
|
If <i>startoffset</i> is greater than the length of the subject,
|
|
@@ -2697,14 +2820,16 @@ Option bits for <b>pcre2_match()</b>
|
|
<P>
|
|
<P>
|
|
The unused bits of the <i>options</i> argument for <b>pcre2_match()</b> must be
|
|
The unused bits of the <i>options</i> argument for <b>pcre2_match()</b> must be
|
|
zero. The only bits that may be set are PCRE2_ANCHORED,
|
|
zero. The only bits that may be set are PCRE2_ANCHORED,
|
|
-PCRE2_COPY_MATCHED_SUBJECT, PCRE2_ENDANCHORED, PCRE2_NOTBOL, PCRE2_NOTEOL,
|
|
|
|
-PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_JIT, PCRE2_NO_UTF_CHECK,
|
|
|
|
-PCRE2_PARTIAL_HARD, and PCRE2_PARTIAL_SOFT. Their action is described below.
|
|
|
|
|
|
+PCRE2_COPY_MATCHED_SUBJECT, PCRE2_DISABLE_RECURSELOOP_CHECK, PCRE2_ENDANCHORED,
|
|
|
|
+PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART,
|
|
|
|
+PCRE2_NO_JIT, PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, and PCRE2_PARTIAL_SOFT.
|
|
|
|
+Their action is described below.
|
|
</P>
|
|
</P>
|
|
<P>
|
|
<P>
|
|
Setting PCRE2_ANCHORED or PCRE2_ENDANCHORED at match time is not supported by
|
|
Setting PCRE2_ANCHORED or PCRE2_ENDANCHORED at match time is not supported by
|
|
the just-in-time (JIT) compiler. If it is set, JIT matching is disabled and the
|
|
the just-in-time (JIT) compiler. If it is set, JIT matching is disabled and the
|
|
-interpretive code in <b>pcre2_match()</b> is run. Apart from PCRE2_NO_JIT
|
|
|
|
|
|
+interpretive code in <b>pcre2_match()</b> is run.
|
|
|
|
+PCRE2_DISABLE_RECURSELOOP_CHECK is ignored by JIT, but apart from PCRE2_NO_JIT
|
|
(obviously), the remaining options are supported for JIT matching.
|
|
(obviously), the remaining options are supported for JIT matching.
|
|
<pre>
|
|
<pre>
|
|
PCRE2_ANCHORED
|
|
PCRE2_ANCHORED
|
|
@@ -2730,6 +2855,25 @@ the match block itself is used. The copy is automatically freed when
|
|
<b>pcre2_match_data_free()</b> is called to free the match data block. It is also
|
|
<b>pcre2_match_data_free()</b> is called to free the match data block. It is also
|
|
automatically freed if the match data block is re-used for another match
|
|
automatically freed if the match data block is re-used for another match
|
|
operation.
|
|
operation.
|
|
|
|
+<pre>
|
|
|
|
+ PCRE2_DISABLE_RECURSELOOP_CHECK
|
|
|
|
+</pre>
|
|
|
|
+This option is relevant only to <b>pcre2_match()</b> for interpretive matching.
|
|
|
|
+It is ignored when JIT is used, and is forbidden for <b>pcre2_dfa_match()</b>.
|
|
|
|
+</P>
|
|
|
|
+<P>
|
|
|
|
+The use of recursion in patterns can lead to infinite loops. In the
|
|
|
|
+interpretive matcher these would be eventually caught by the match or heap
|
|
|
|
+limits, but this could take a long time and/or use a lot of memory if the
|
|
|
|
+limits are large. There is therefore a check at the start of each recursion.
|
|
|
|
+If the same group is still active from a previous call, and the current subject
|
|
|
|
+pointer is the same as it was at the start of that group, and the furthest
|
|
|
|
+inspected character of the subject has not changed, an error is generated.
|
|
|
|
+</P>
|
|
|
|
+<P>
|
|
|
|
+There are rare cases of matches that would complete, but nevertheless trigger
|
|
|
|
+this error. This option disables the check. It is provided mainly for testing
|
|
|
|
+when comparing JIT and interpretive behaviour.
|
|
<pre>
|
|
<pre>
|
|
PCRE2_ENDANCHORED
|
|
PCRE2_ENDANCHORED
|
|
</pre>
|
|
</pre>
|
|
@@ -2858,7 +3002,7 @@ examples, in the
|
|
<a href="pcre2partial.html"><b>pcre2partial</b></a>
|
|
<a href="pcre2partial.html"><b>pcre2partial</b></a>
|
|
documentation.
|
|
documentation.
|
|
</P>
|
|
</P>
|
|
-<br><a name="SEC28" href="#TOC1">NEWLINE HANDLING WHEN MATCHING</a><br>
|
|
|
|
|
|
+<br><a name="SEC29" href="#TOC1">NEWLINE HANDLING WHEN MATCHING</a><br>
|
|
<P>
|
|
<P>
|
|
When PCRE2 is built, a default newline convention is set; this is usually the
|
|
When PCRE2 is built, a default newline convention is set; this is usually the
|
|
standard convention for the operating system. The default can be overridden in
|
|
standard convention for the operating system. The default can be overridden in
|
|
@@ -2898,7 +3042,7 @@ does \s, even though it includes CR and LF in the characters that it matches.
|
|
Notwithstanding the above, anomalous effects may still occur when CRLF is a
|
|
Notwithstanding the above, anomalous effects may still occur when CRLF is a
|
|
valid newline sequence and explicit \r or \n escapes appear in the pattern.
|
|
valid newline sequence and explicit \r or \n escapes appear in the pattern.
|
|
<a name="matchedstrings"></a></P>
|
|
<a name="matchedstrings"></a></P>
|
|
-<br><a name="SEC29" href="#TOC1">HOW PCRE2_MATCH() RETURNS A STRING AND CAPTURED SUBSTRINGS</a><br>
|
|
|
|
|
|
+<br><a name="SEC30" href="#TOC1">HOW PCRE2_MATCH() RETURNS A STRING AND CAPTURED SUBSTRINGS</a><br>
|
|
<P>
|
|
<P>
|
|
<b>uint32_t pcre2_get_ovector_count(pcre2_match_data *<i>match_data</i>);</b>
|
|
<b>uint32_t pcre2_get_ovector_count(pcre2_match_data *<i>match_data</i>);</b>
|
|
<br>
|
|
<br>
|
|
@@ -2985,8 +3129,8 @@ Offset values that correspond to unused groups at the end of the expression are
|
|
also set to PCRE2_UNSET. For example, if the string "abc" is matched against
|
|
also set to PCRE2_UNSET. For example, if the string "abc" is matched against
|
|
the pattern (abc)(x(yz)?)? groups 2 and 3 are not matched. The return from the
|
|
the pattern (abc)(x(yz)?)? groups 2 and 3 are not matched. The return from the
|
|
function is 2, because the highest used capture group number is 1. The offsets
|
|
function is 2, because the highest used capture group number is 1. The offsets
|
|
-for for the second and third capture groupss (assuming the vector is large
|
|
|
|
-enough, of course) are set to PCRE2_UNSET.
|
|
|
|
|
|
+for the second and third capture groups (assuming the vector is large enough,
|
|
|
|
+of course) are set to PCRE2_UNSET.
|
|
</P>
|
|
</P>
|
|
<P>
|
|
<P>
|
|
Elements in the ovector that do not correspond to capturing parentheses in the
|
|
Elements in the ovector that do not correspond to capturing parentheses in the
|
|
@@ -2995,7 +3139,7 @@ parentheses, no more than <i>ovector[0]</i> to <i>ovector[2n+1]</i> are set by
|
|
<b>pcre2_match()</b>. The other elements retain whatever values they previously
|
|
<b>pcre2_match()</b>. The other elements retain whatever values they previously
|
|
had. After a failed match attempt, the contents of the ovector are unchanged.
|
|
had. After a failed match attempt, the contents of the ovector are unchanged.
|
|
<a name="matchotherdata"></a></P>
|
|
<a name="matchotherdata"></a></P>
|
|
-<br><a name="SEC30" href="#TOC1">OTHER INFORMATION ABOUT A MATCH</a><br>
|
|
|
|
|
|
+<br><a name="SEC31" href="#TOC1">OTHER INFORMATION ABOUT A MATCH</a><br>
|
|
<P>
|
|
<P>
|
|
<b>PCRE2_SPTR pcre2_get_mark(pcre2_match_data *<i>match_data</i>);</b>
|
|
<b>PCRE2_SPTR pcre2_get_mark(pcre2_match_data *<i>match_data</i>);</b>
|
|
<br>
|
|
<br>
|
|
@@ -3058,7 +3202,7 @@ the code unit offset of the invalid UTF character. Details are given in the
|
|
<a href="pcre2unicode.html"><b>pcre2unicode</b></a>
|
|
<a href="pcre2unicode.html"><b>pcre2unicode</b></a>
|
|
page.
|
|
page.
|
|
<a name="errorlist"></a></P>
|
|
<a name="errorlist"></a></P>
|
|
-<br><a name="SEC31" href="#TOC1">ERROR RETURNS FROM <b>pcre2_match()</b></a><br>
|
|
|
|
|
|
+<br><a name="SEC32" href="#TOC1">ERROR RETURNS FROM <b>pcre2_match()</b></a><br>
|
|
<P>
|
|
<P>
|
|
If <b>pcre2_match()</b> fails, it returns a negative number. This can be
|
|
If <b>pcre2_match()</b> fails, it returns a negative number. This can be
|
|
converted to a text string by calling the <b>pcre2_get_error_message()</b>
|
|
converted to a text string by calling the <b>pcre2_get_error_message()</b>
|
|
@@ -3144,11 +3288,11 @@ The backtracking match limit was reached.
|
|
<pre>
|
|
<pre>
|
|
PCRE2_ERROR_NOMEMORY
|
|
PCRE2_ERROR_NOMEMORY
|
|
</pre>
|
|
</pre>
|
|
-If a pattern contains many nested backtracking points, heap memory is used to
|
|
|
|
-remember them. This error is given when the memory allocation function (default
|
|
|
|
-or custom) fails. Note that a different error, PCRE2_ERROR_HEAPLIMIT, is given
|
|
|
|
-if the amount of memory needed exceeds the heap limit. PCRE2_ERROR_NOMEMORY is
|
|
|
|
-also returned if PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
|
|
|
|
|
|
+Heap memory is used to remember backtracking points. This error is given when
|
|
|
|
+the memory allocation function (default or custom) fails. Note that a different
|
|
|
|
+error, PCRE2_ERROR_HEAPLIMIT, is given if the amount of memory needed exceeds
|
|
|
|
+the heap limit. PCRE2_ERROR_NOMEMORY is also returned if
|
|
|
|
+PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
|
|
<pre>
|
|
<pre>
|
|
PCRE2_ERROR_NULL
|
|
PCRE2_ERROR_NULL
|
|
</pre>
|
|
</pre>
|
|
@@ -3165,7 +3309,7 @@ detected and faulted at compile time, but more complicated cases, in particular
|
|
mutual recursions between two different groups, cannot be detected until
|
|
mutual recursions between two different groups, cannot be detected until
|
|
matching is attempted.
|
|
matching is attempted.
|
|
<a name="geterrormessage"></a></P>
|
|
<a name="geterrormessage"></a></P>
|
|
-<br><a name="SEC32" href="#TOC1">OBTAINING A TEXTUAL ERROR MESSAGE</a><br>
|
|
|
|
|
|
+<br><a name="SEC33" href="#TOC1">OBTAINING A TEXTUAL ERROR MESSAGE</a><br>
|
|
<P>
|
|
<P>
|
|
<b>int pcre2_get_error_message(int <i>errorcode</i>, PCRE2_UCHAR *<i>buffer</i>,</b>
|
|
<b>int pcre2_get_error_message(int <i>errorcode</i>, PCRE2_UCHAR *<i>buffer</i>,</b>
|
|
<b> PCRE2_SIZE <i>bufflen</i>);</b>
|
|
<b> PCRE2_SIZE <i>bufflen</i>);</b>
|
|
@@ -3186,7 +3330,7 @@ returned. If the buffer is too small, the message is truncated (but still with
|
|
a trailing zero), and the negative error code PCRE2_ERROR_NOMEMORY is returned.
|
|
a trailing zero), and the negative error code PCRE2_ERROR_NOMEMORY is returned.
|
|
None of the messages are very long; a buffer size of 120 code units is ample.
|
|
None of the messages are very long; a buffer size of 120 code units is ample.
|
|
<a name="extractbynumber"></a></P>
|
|
<a name="extractbynumber"></a></P>
|
|
-<br><a name="SEC33" href="#TOC1">EXTRACTING CAPTURED SUBSTRINGS BY NUMBER</a><br>
|
|
|
|
|
|
+<br><a name="SEC34" href="#TOC1">EXTRACTING CAPTURED SUBSTRINGS BY NUMBER</a><br>
|
|
<P>
|
|
<P>
|
|
<b>int pcre2_substring_length_bynumber(pcre2_match_data *<i>match_data</i>,</b>
|
|
<b>int pcre2_substring_length_bynumber(pcre2_match_data *<i>match_data</i>,</b>
|
|
<b> uint32_t <i>number</i>, PCRE2_SIZE *<i>length</i>);</b>
|
|
<b> uint32_t <i>number</i>, PCRE2_SIZE *<i>length</i>);</b>
|
|
@@ -3283,13 +3427,13 @@ The substring did not participate in the match. For example, if the pattern is
|
|
(abc)|(def) and the subject is "def", and the ovector contains at least two
|
|
(abc)|(def) and the subject is "def", and the ovector contains at least two
|
|
capturing slots, substring number 1 is unset.
|
|
capturing slots, substring number 1 is unset.
|
|
</P>
|
|
</P>
|
|
-<br><a name="SEC34" href="#TOC1">EXTRACTING A LIST OF ALL CAPTURED SUBSTRINGS</a><br>
|
|
|
|
|
|
+<br><a name="SEC35" href="#TOC1">EXTRACTING A LIST OF ALL CAPTURED SUBSTRINGS</a><br>
|
|
<P>
|
|
<P>
|
|
<b>int pcre2_substring_list_get(pcre2_match_data *<i>match_data</i>,</b>
|
|
<b>int pcre2_substring_list_get(pcre2_match_data *<i>match_data</i>,</b>
|
|
<b> PCRE2_UCHAR ***<i>listptr</i>, PCRE2_SIZE **<i>lengthsptr</i>);</b>
|
|
<b> PCRE2_UCHAR ***<i>listptr</i>, PCRE2_SIZE **<i>lengthsptr</i>);</b>
|
|
<br>
|
|
<br>
|
|
<br>
|
|
<br>
|
|
-<b>void pcre2_substring_list_free(PCRE2_SPTR *<i>list</i>);</b>
|
|
|
|
|
|
+<b>void pcre2_substring_list_free(PCRE2_UCHAR **<i>list</i>);</b>
|
|
</P>
|
|
</P>
|
|
<P>
|
|
<P>
|
|
The <b>pcre2_substring_list_get()</b> function extracts all available substrings
|
|
The <b>pcre2_substring_list_get()</b> function extracts all available substrings
|
|
@@ -3322,7 +3466,7 @@ distinguished from a genuine zero-length substring by inspecting the
|
|
appropriate offset in the ovector, which contains PCRE2_UNSET for unset
|
|
appropriate offset in the ovector, which contains PCRE2_UNSET for unset
|
|
substrings, or by calling <b>pcre2_substring_length_bynumber()</b>.
|
|
substrings, or by calling <b>pcre2_substring_length_bynumber()</b>.
|
|
<a name="extractbyname"></a></P>
|
|
<a name="extractbyname"></a></P>
|
|
-<br><a name="SEC35" href="#TOC1">EXTRACTING CAPTURED SUBSTRINGS BY NAME</a><br>
|
|
|
|
|
|
+<br><a name="SEC36" href="#TOC1">EXTRACTING CAPTURED SUBSTRINGS BY NAME</a><br>
|
|
<P>
|
|
<P>
|
|
<b>int pcre2_substring_number_from_name(const pcre2_code *<i>code</i>,</b>
|
|
<b>int pcre2_substring_number_from_name(const pcre2_code *<i>code</i>,</b>
|
|
<b> PCRE2_SPTR <i>name</i>);</b>
|
|
<b> PCRE2_SPTR <i>name</i>);</b>
|
|
@@ -3382,7 +3526,7 @@ names are not included in the compiled code. The matching process uses only
|
|
numbers. For this reason, the use of different names for groups with the
|
|
numbers. For this reason, the use of different names for groups with the
|
|
same number causes an error at compile time.
|
|
same number causes an error at compile time.
|
|
<a name="substitutions"></a></P>
|
|
<a name="substitutions"></a></P>
|
|
-<br><a name="SEC36" href="#TOC1">CREATING A NEW STRING WITH SUBSTITUTIONS</a><br>
|
|
|
|
|
|
+<br><a name="SEC37" href="#TOC1">CREATING A NEW STRING WITH SUBSTITUTIONS</a><br>
|
|
<P>
|
|
<P>
|
|
<b>int pcre2_substitute(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b>
|
|
<b>int pcre2_substitute(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b>
|
|
<b> PCRE2_SIZE <i>length</i>, PCRE2_SIZE <i>startoffset</i>,</b>
|
|
<b> PCRE2_SIZE <i>length</i>, PCRE2_SIZE <i>startoffset</i>,</b>
|
|
@@ -3394,12 +3538,17 @@ same number causes an error at compile time.
|
|
<P>
|
|
<P>
|
|
This function optionally calls <b>pcre2_match()</b> and then makes a copy of the
|
|
This function optionally calls <b>pcre2_match()</b> and then makes a copy of the
|
|
subject string in <i>outputbuffer</i>, replacing parts that were matched with
|
|
subject string in <i>outputbuffer</i>, replacing parts that were matched with
|
|
-the <i>replacement</i> string, whose length is supplied in <b>rlength</b>. This
|
|
|
|
-can be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. There is an
|
|
|
|
-option (see PCRE2_SUBSTITUTE_REPLACEMENT_ONLY below) to return just the
|
|
|
|
-replacement string(s). The default action is to perform just one replacement if
|
|
|
|
-the pattern matches, but there is an option that requests multiple replacements
|
|
|
|
-(see PCRE2_SUBSTITUTE_GLOBAL below).
|
|
|
|
|
|
+the <i>replacement</i> string, whose length is supplied in <i>rlength</i>, which
|
|
|
|
+can be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. As a
|
|
|
|
+special case, if <i>replacement</i> is NULL and <i>rlength</i> is zero, the
|
|
|
|
+replacement is assumed to be an empty string. If <i>rlength</i> is non-zero, an
|
|
|
|
+error occurs if <i>replacement</i> is NULL.
|
|
|
|
+</P>
|
|
|
|
+<P>
|
|
|
|
+There is an option (see PCRE2_SUBSTITUTE_REPLACEMENT_ONLY below) to return just
|
|
|
|
+the replacement string(s). The default action is to perform just one
|
|
|
|
+replacement if the pattern matches, but there is an option that requests
|
|
|
|
+multiple replacements (see PCRE2_SUBSTITUTE_GLOBAL below).
|
|
</P>
|
|
</P>
|
|
<P>
|
|
<P>
|
|
If successful, <b>pcre2_substitute()</b> returns the number of substitutions
|
|
If successful, <b>pcre2_substitute()</b> returns the number of substitutions
|
|
@@ -3433,12 +3582,12 @@ block may or may not have been changed.
|
|
As well as the usual options for <b>pcre2_match()</b>, a number of additional
|
|
As well as the usual options for <b>pcre2_match()</b>, a number of additional
|
|
options can be set in the <i>options</i> argument of <b>pcre2_substitute()</b>.
|
|
options can be set in the <i>options</i> argument of <b>pcre2_substitute()</b>.
|
|
One such option is PCRE2_SUBSTITUTE_MATCHED. When this is set, an external
|
|
One such option is PCRE2_SUBSTITUTE_MATCHED. When this is set, an external
|
|
-<i>match_data</i> block must be provided, and it must have been used for an
|
|
|
|
-external call to <b>pcre2_match()</b>. The data in the <i>match_data</i> block
|
|
|
|
-(return code, offset vector) is used for the first substitution instead of
|
|
|
|
-calling <b>pcre2_match()</b> from within <b>pcre2_substitute()</b>. This allows
|
|
|
|
-an application to check for a match before choosing to substitute, without
|
|
|
|
-having to repeat the match.
|
|
|
|
|
|
+<i>match_data</i> block must be provided, and it must have already been used for
|
|
|
|
+an external call to <b>pcre2_match()</b> with the same pattern and subject
|
|
|
|
+arguments. The data in the <i>match_data</i> block (return code, offset vector)
|
|
|
|
+is then used for the first substitution instead of calling <b>pcre2_match()</b>
|
|
|
|
+from within <b>pcre2_substitute()</b>. This allows an application to check for a
|
|
|
|
+match before choosing to substitute, without having to repeat the match.
|
|
</P>
|
|
</P>
|
|
<P>
|
|
<P>
|
|
The contents of the externally supplied match data block are not changed when
|
|
The contents of the externally supplied match data block are not changed when
|
|
@@ -3501,7 +3650,8 @@ replacement string causes an immediate return with the relevant UTF error code.
|
|
If PCRE2_SUBSTITUTE_LITERAL is set, the replacement string is not interpreted
|
|
If PCRE2_SUBSTITUTE_LITERAL is set, the replacement string is not interpreted
|
|
in any way. By default, however, a dollar character is an escape character that
|
|
in any way. By default, however, a dollar character is an escape character that
|
|
can specify the insertion of characters from capture groups and names from
|
|
can specify the insertion of characters from capture groups and names from
|
|
-(*MARK) or other control verbs in the pattern. The following forms are always
|
|
|
|
|
|
+(*MARK) or other control verbs in the pattern. Dollar is the only escape
|
|
|
|
+character (backslash is treated as literal). The following forms are always
|
|
recognized:
|
|
recognized:
|
|
<pre>
|
|
<pre>
|
|
$$ insert a dollar character
|
|
$$ insert a dollar character
|
|
@@ -3583,7 +3733,7 @@ and force lower case. The escape sequences change the current state: \U and
|
|
terminating a \Q quoted sequence) reverts to no case forcing. The sequences
|
|
terminating a \Q quoted sequence) reverts to no case forcing. The sequences
|
|
\u and \l force the next character (if it is a letter) to upper or lower
|
|
\u and \l force the next character (if it is a letter) to upper or lower
|
|
case, respectively, and then the state automatically reverts to no case
|
|
case, respectively, and then the state automatically reverts to no case
|
|
-forcing. Case forcing applies to all inserted characters, including those from
|
|
|
|
|
|
+forcing. Case forcing applies to all inserted characters, including those from
|
|
capture groups and letters within \Q...\E quoted sequences. If either
|
|
capture groups and letters within \Q...\E quoted sequences. If either
|
|
PCRE2_UTF or PCRE2_UCP was set when the pattern was compiled, Unicode
|
|
PCRE2_UTF or PCRE2_UCP was set when the pattern was compiled, Unicode
|
|
properties are used for case forcing characters whose code points are greater
|
|
properties are used for case forcing characters whose code points are greater
|
|
@@ -3655,7 +3805,9 @@ default.
|
|
</P>
|
|
</P>
|
|
<P>
|
|
<P>
|
|
PCRE2_ERROR_NULL is returned if PCRE2_SUBSTITUTE_MATCHED is set but the
|
|
PCRE2_ERROR_NULL is returned if PCRE2_SUBSTITUTE_MATCHED is set but the
|
|
-<i>match_data</i> argument is NULL.
|
|
|
|
|
|
+<i>match_data</i> argument is NULL or if the <i>subject</i> or <i>replacement</i>
|
|
|
|
+arguments are NULL. For backward compatibility reasons an exception is made for
|
|
|
|
+the <i>replacement</i> argument if the <i>rlength</i> argument is also 0.
|
|
</P>
|
|
</P>
|
|
<P>
|
|
<P>
|
|
PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in the
|
|
PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in the
|
|
@@ -3731,11 +3883,11 @@ PCRE2_SUBSTITUTE_GLOBAL is set, processing continues with a search for the next
|
|
match. If the value is not zero, the current replacement is not accepted. If
|
|
match. If the value is not zero, the current replacement is not accepted. If
|
|
the value is greater than zero, processing continues when
|
|
the value is greater than zero, processing continues when
|
|
PCRE2_SUBSTITUTE_GLOBAL is set. Otherwise (the value is less than zero or
|
|
PCRE2_SUBSTITUTE_GLOBAL is set. Otherwise (the value is less than zero or
|
|
-PCRE2_SUBSTITUTE_GLOBAL is not set), the the rest of the input is copied to the
|
|
|
|
|
|
+PCRE2_SUBSTITUTE_GLOBAL is not set), the rest of the input is copied to the
|
|
output and the call to <b>pcre2_substitute()</b> exits, returning the number of
|
|
output and the call to <b>pcre2_substitute()</b> exits, returning the number of
|
|
matches so far.
|
|
matches so far.
|
|
</P>
|
|
</P>
|
|
-<br><a name="SEC37" href="#TOC1">DUPLICATE CAPTURE GROUP NAMES</a><br>
|
|
|
|
|
|
+<br><a name="SEC38" href="#TOC1">DUPLICATE CAPTURE GROUP NAMES</a><br>
|
|
<P>
|
|
<P>
|
|
<b>int pcre2_substring_nametable_scan(const pcre2_code *<i>code</i>,</b>
|
|
<b>int pcre2_substring_nametable_scan(const pcre2_code *<i>code</i>,</b>
|
|
<b> PCRE2_SPTR <i>name</i>, PCRE2_SPTR *<i>first</i>, PCRE2_SPTR *<i>last</i>);</b>
|
|
<b> PCRE2_SPTR <i>name</i>, PCRE2_SPTR *<i>first</i>, PCRE2_SPTR *<i>last</i>);</b>
|
|
@@ -3781,7 +3933,7 @@ in the section entitled <i>Information about a pattern</i>. Given all the
|
|
relevant entries for the name, you can extract each of their numbers, and hence
|
|
relevant entries for the name, you can extract each of their numbers, and hence
|
|
the captured data.
|
|
the captured data.
|
|
</P>
|
|
</P>
|
|
-<br><a name="SEC38" href="#TOC1">FINDING ALL POSSIBLE MATCHES AT ONE POSITION</a><br>
|
|
|
|
|
|
+<br><a name="SEC39" href="#TOC1">FINDING ALL POSSIBLE MATCHES AT ONE POSITION</a><br>
|
|
<P>
|
|
<P>
|
|
The traditional matching function uses a similar algorithm to Perl, which stops
|
|
The traditional matching function uses a similar algorithm to Perl, which stops
|
|
when it finds the first match at a given point in the subject. If you want to
|
|
when it finds the first match at a given point in the subject. If you want to
|
|
@@ -3799,7 +3951,7 @@ substring. Then return 1, which forces <b>pcre2_match()</b> to backtrack and try
|
|
other alternatives. Ultimately, when it runs out of matches,
|
|
other alternatives. Ultimately, when it runs out of matches,
|
|
<b>pcre2_match()</b> will yield PCRE2_ERROR_NOMATCH.
|
|
<b>pcre2_match()</b> will yield PCRE2_ERROR_NOMATCH.
|
|
<a name="dfamatch"></a></P>
|
|
<a name="dfamatch"></a></P>
|
|
-<br><a name="SEC39" href="#TOC1">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a><br>
|
|
|
|
|
|
+<br><a name="SEC40" href="#TOC1">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a><br>
|
|
<P>
|
|
<P>
|
|
<b>int pcre2_dfa_match(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b>
|
|
<b>int pcre2_dfa_match(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b>
|
|
<b> PCRE2_SIZE <i>length</i>, PCRE2_SIZE <i>startoffset</i>,</b>
|
|
<b> PCRE2_SIZE <i>length</i>, PCRE2_SIZE <i>startoffset</i>,</b>
|
|
@@ -3810,12 +3962,13 @@ other alternatives. Ultimately, when it runs out of matches,
|
|
<P>
|
|
<P>
|
|
The function <b>pcre2_dfa_match()</b> is called to match a subject string
|
|
The function <b>pcre2_dfa_match()</b> is called to match a subject string
|
|
against a compiled pattern, using a matching algorithm that scans the subject
|
|
against a compiled pattern, using a matching algorithm that scans the subject
|
|
-string just once (not counting lookaround assertions), and does not backtrack.
|
|
|
|
-This has different characteristics to the normal algorithm, and is not
|
|
|
|
-compatible with Perl. Some of the features of PCRE2 patterns are not supported.
|
|
|
|
-Nevertheless, there are times when this kind of matching can be useful. For a
|
|
|
|
-discussion of the two matching algorithms, and a list of features that
|
|
|
|
-<b>pcre2_dfa_match()</b> does not support, see the
|
|
|
|
|
|
+string just once (not counting lookaround assertions), and does not backtrack
|
|
|
|
+(except when processing lookaround assertions). This has different
|
|
|
|
+characteristics to the normal algorithm, and is not compatible with Perl. Some
|
|
|
|
+of the features of PCRE2 patterns are not supported. Nevertheless, there are
|
|
|
|
+times when this kind of matching can be useful. For a discussion of the two
|
|
|
|
+matching algorithms, and a list of features that <b>pcre2_dfa_match()</b> does
|
|
|
|
+not support, see the
|
|
<a href="pcre2matching.html"><b>pcre2matching</b></a>
|
|
<a href="pcre2matching.html"><b>pcre2matching</b></a>
|
|
documentation.
|
|
documentation.
|
|
</P>
|
|
</P>
|
|
@@ -3850,7 +4003,7 @@ Here is an example of a simple call to <b>pcre2_dfa_match()</b>:
|
|
</PRE>
|
|
</PRE>
|
|
</P>
|
|
</P>
|
|
<br><b>
|
|
<br><b>
|
|
-Option bits for <b>pcre_dfa_match()</b>
|
|
|
|
|
|
+Option bits for <b>pcre2_dfa_match()</b>
|
|
</b><br>
|
|
</b><br>
|
|
<P>
|
|
<P>
|
|
The unused bits of the <i>options</i> argument for <b>pcre2_dfa_match()</b> must
|
|
The unused bits of the <i>options</i> argument for <b>pcre2_dfa_match()</b> must
|
|
@@ -3991,13 +4144,13 @@ some plausibility checks are made on the contents of the workspace, which
|
|
should contain data about the previous partial match. If any of these checks
|
|
should contain data about the previous partial match. If any of these checks
|
|
fail, this error is given.
|
|
fail, this error is given.
|
|
</P>
|
|
</P>
|
|
-<br><a name="SEC40" href="#TOC1">SEE ALSO</a><br>
|
|
|
|
|
|
+<br><a name="SEC41" href="#TOC1">SEE ALSO</a><br>
|
|
<P>
|
|
<P>
|
|
<b>pcre2build</b>(3), <b>pcre2callout</b>(3), <b>pcre2demo</b>(3),
|
|
<b>pcre2build</b>(3), <b>pcre2callout</b>(3), <b>pcre2demo</b>(3),
|
|
<b>pcre2matching</b>(3), <b>pcre2partial</b>(3), <b>pcre2posix</b>(3),
|
|
<b>pcre2matching</b>(3), <b>pcre2partial</b>(3), <b>pcre2posix</b>(3),
|
|
<b>pcre2sample</b>(3), <b>pcre2unicode</b>(3).
|
|
<b>pcre2sample</b>(3), <b>pcre2unicode</b>(3).
|
|
</P>
|
|
</P>
|
|
-<br><a name="SEC41" href="#TOC1">AUTHOR</a><br>
|
|
|
|
|
|
+<br><a name="SEC42" href="#TOC1">AUTHOR</a><br>
|
|
<P>
|
|
<P>
|
|
Philip Hazel
|
|
Philip Hazel
|
|
<br>
|
|
<br>
|
|
@@ -4006,11 +4159,11 @@ Retired from University Computing Service
|
|
Cambridge, England.
|
|
Cambridge, England.
|
|
<br>
|
|
<br>
|
|
</P>
|
|
</P>
|
|
-<br><a name="SEC42" href="#TOC1">REVISION</a><br>
|
|
|
|
|
|
+<br><a name="SEC43" href="#TOC1">REVISION</a><br>
|
|
<P>
|
|
<P>
|
|
-Last updated: 30 August 2021
|
|
|
|
|
|
+Last updated: 27 January 2024
|
|
<br>
|
|
<br>
|
|
-Copyright © 1997-2021 University of Cambridge.
|
|
|
|
|
|
+Copyright © 1997-2024 University of Cambridge.
|
|
<br>
|
|
<br>
|
|
<p>
|
|
<p>
|
|
Return to the <a href="index.html">PCRE2 index page</a>.
|
|
Return to the <a href="index.html">PCRE2 index page</a>.
|