| 13 |
man page, in case the conversion went wrong. |
man page, in case the conversion went wrong. |
| 14 |
<br> |
<br> |
| 15 |
<ul> |
<ul> |
| 16 |
<li><a name="TOC1" href="#SEC1">PCRE NATIVE API</a> |
<li><a name="TOC1" href="#SEC1">PCRE NATIVE API BASIC FUNCTIONS</a> |
| 17 |
<li><a name="TOC2" href="#SEC2">PCRE API OVERVIEW</a> |
<li><a name="TOC2" href="#SEC2">PCRE NATIVE API AUXILIARY FUNCTIONS</a> |
| 18 |
<li><a name="TOC3" href="#SEC3">NEWLINES</a> |
<li><a name="TOC3" href="#SEC3">PCRE NATIVE API INDIRECTED FUNCTIONS</a> |
| 19 |
<li><a name="TOC4" href="#SEC4">MULTITHREADING</a> |
<li><a name="TOC4" href="#SEC4">PCRE API OVERVIEW</a> |
| 20 |
<li><a name="TOC5" href="#SEC5">SAVING PRECOMPILED PATTERNS FOR LATER USE</a> |
<li><a name="TOC5" href="#SEC5">NEWLINES</a> |
| 21 |
<li><a name="TOC6" href="#SEC6">CHECKING BUILD-TIME OPTIONS</a> |
<li><a name="TOC6" href="#SEC6">MULTITHREADING</a> |
| 22 |
<li><a name="TOC7" href="#SEC7">COMPILING A PATTERN</a> |
<li><a name="TOC7" href="#SEC7">SAVING PRECOMPILED PATTERNS FOR LATER USE</a> |
| 23 |
<li><a name="TOC8" href="#SEC8">COMPILATION ERROR CODES</a> |
<li><a name="TOC8" href="#SEC8">CHECKING BUILD-TIME OPTIONS</a> |
| 24 |
<li><a name="TOC9" href="#SEC9">STUDYING A PATTERN</a> |
<li><a name="TOC9" href="#SEC9">COMPILING A PATTERN</a> |
| 25 |
<li><a name="TOC10" href="#SEC10">LOCALE SUPPORT</a> |
<li><a name="TOC10" href="#SEC10">COMPILATION ERROR CODES</a> |
| 26 |
<li><a name="TOC11" href="#SEC11">INFORMATION ABOUT A PATTERN</a> |
<li><a name="TOC11" href="#SEC11">STUDYING A PATTERN</a> |
| 27 |
<li><a name="TOC12" href="#SEC12">OBSOLETE INFO FUNCTION</a> |
<li><a name="TOC12" href="#SEC12">LOCALE SUPPORT</a> |
| 28 |
<li><a name="TOC13" href="#SEC13">REFERENCE COUNTS</a> |
<li><a name="TOC13" href="#SEC13">INFORMATION ABOUT A PATTERN</a> |
| 29 |
<li><a name="TOC14" href="#SEC14">MATCHING A PATTERN: THE TRADITIONAL FUNCTION</a> |
<li><a name="TOC14" href="#SEC14">OBSOLETE INFO FUNCTION</a> |
| 30 |
<li><a name="TOC15" href="#SEC15">EXTRACTING CAPTURED SUBSTRINGS BY NUMBER</a> |
<li><a name="TOC15" href="#SEC15">REFERENCE COUNTS</a> |
| 31 |
<li><a name="TOC16" href="#SEC16">EXTRACTING CAPTURED SUBSTRINGS BY NAME</a> |
<li><a name="TOC16" href="#SEC16">MATCHING A PATTERN: THE TRADITIONAL FUNCTION</a> |
| 32 |
<li><a name="TOC17" href="#SEC17">DUPLICATE SUBPATTERN NAMES</a> |
<li><a name="TOC17" href="#SEC17">EXTRACTING CAPTURED SUBSTRINGS BY NUMBER</a> |
| 33 |
<li><a name="TOC18" href="#SEC18">FINDING ALL POSSIBLE MATCHES</a> |
<li><a name="TOC18" href="#SEC18">EXTRACTING CAPTURED SUBSTRINGS BY NAME</a> |
| 34 |
<li><a name="TOC19" href="#SEC19">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a> |
<li><a name="TOC19" href="#SEC19">DUPLICATE SUBPATTERN NAMES</a> |
| 35 |
<li><a name="TOC20" href="#SEC20">SEE ALSO</a> |
<li><a name="TOC20" href="#SEC20">FINDING ALL POSSIBLE MATCHES</a> |
| 36 |
<li><a name="TOC21" href="#SEC21">AUTHOR</a> |
<li><a name="TOC21" href="#SEC21">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a> |
| 37 |
<li><a name="TOC22" href="#SEC22">REVISION</a> |
<li><a name="TOC22" href="#SEC22">SEE ALSO</a> |
| 38 |
|
<li><a name="TOC23" href="#SEC23">AUTHOR</a> |
| 39 |
|
<li><a name="TOC24" href="#SEC24">REVISION</a> |
| 40 |
</ul> |
</ul> |
| 41 |
<br><a name="SEC1" href="#TOC1">PCRE NATIVE API</a><br> |
<br><a name="SEC1" href="#TOC1">PCRE NATIVE API BASIC FUNCTIONS</a><br> |
| 42 |
<P> |
<P> |
| 43 |
<b>#include &lt;pcre.h&gt;</b> |
<b>#include &lt;pcre.h&gt;</b> |
| 44 |
</P> |
</P> |
| 58 |
<b>const char **<i>errptr</i>);</b> |
<b>const char **<i>errptr</i>);</b> |
| 59 |
</P> |
</P> |
| 60 |
<P> |
<P> |
| 61 |
|
<b>void pcre_free_study(pcre_extra *<i>extra</i>);</b> |
| 62 |
|
</P> |
| 63 |
|
<P> |
| 64 |
<b>int pcre_exec(const pcre *<i>code</i>, const pcre_extra *<i>extra</i>,</b> |
<b>int pcre_exec(const pcre *<i>code</i>, const pcre_extra *<i>extra</i>,</b> |
| 65 |
<b>const char *<i>subject</i>, int <i>length</i>, int <i>startoffset</i>,</b> |
<b>const char *<i>subject</i>, int <i>length</i>, int <i>startoffset</i>,</b> |
| 66 |
<b>int <i>options</i>, int *<i>ovector</i>, int <i>ovecsize</i>);</b> |
<b>int <i>options</i>, int *<i>ovector</i>, int <i>ovecsize</i>);</b> |
| 67 |
</P> |
</P> |
| 68 |
|
<br><a name="SEC2" href="#TOC1">PCRE NATIVE API AUXILIARY FUNCTIONS</a><br> |
| 69 |
|
<P> |
| 70 |
|
<b>pcre_jit_stack *pcre_jit_stack_alloc(int <i>startsize</i>, int <i>maxsize</i>);</b> |
| 71 |
|
</P> |
| 72 |
|
<P> |
| 73 |
|
<b>void pcre_jit_stack_free(pcre_jit_stack *<i>stack</i>);</b> |
| 74 |
|
</P> |
| 75 |
|
<P> |
| 76 |
|
<b>void pcre_assign_jit_stack(pcre_extra *<i>extra</i>,</b> |
| 77 |
|
<b>pcre_jit_callback <i>callback</i>, void *<i>data</i>);</b> |
| 78 |
|
</P> |
| 79 |
<P> |
<P> |
| 80 |
<b>int pcre_dfa_exec(const pcre *<i>code</i>, const pcre_extra *<i>extra</i>,</b> |
<b>int pcre_dfa_exec(const pcre *<i>code</i>, const pcre_extra *<i>extra</i>,</b> |
| 81 |
<b>const char *<i>subject</i>, int <i>length</i>, int <i>startoffset</i>,</b> |
<b>const char *<i>subject</i>, int <i>length</i>, int <i>startoffset</i>,</b> |
| 142 |
<P> |
<P> |
| 143 |
<b>char *pcre_version(void);</b> |
<b>char *pcre_version(void);</b> |
| 144 |
</P> |
</P> |
| 145 |
|
<br><a name="SEC3" href="#TOC1">PCRE NATIVE API INDIRECTED FUNCTIONS</a><br> |
| 146 |
<P> |
<P> |
| 147 |
<b>void *(*pcre_malloc)(size_t);</b> |
<b>void *(*pcre_malloc)(size_t);</b> |
| 148 |
</P> |
</P> |
| 158 |
<P> |
<P> |
| 159 |
<b>int (*pcre_callout)(pcre_callout_block *);</b> |
<b>int (*pcre_callout)(pcre_callout_block *);</b> |
| 160 |
</P> |
</P> |
| 161 |
<br><a name="SEC2" href="#TOC1">PCRE API OVERVIEW</a><br> |
<br><a name="SEC4" href="#TOC1">PCRE API OVERVIEW</a><br> |
| 162 |
<P> |
<P> |
| 163 |
PCRE has its own native API, which is described in this document. There are |
PCRE has its own native API, which is described in this document. There are |
| 164 |
also some wrapper functions that correspond to the POSIX regular expression |
also some wrapper functions that correspond to the POSIX regular expression |
| 165 |
API. These are described in the |
API, but they do not give access to all the functionality. They are described |
| 166 |
|
in the |
| 167 |
<a href="pcreposix.html"><b>pcreposix</b></a> |
<a href="pcreposix.html"><b>pcreposix</b></a> |
| 168 |
documentation. Both of these APIs define a set of C function calls. A C++ |
documentation. Both of these APIs define a set of C function calls. A C++ |
| 169 |
wrapper is distributed with PCRE. It is documented in the |
wrapper is also distributed with PCRE. It is documented in the |
| 170 |
<a href="pcrecpp.html"><b>pcrecpp</b></a> |
<a href="pcrecpp.html"><b>pcrecpp</b></a> |
| 171 |
page. |
page. |
| 172 |
</P> |
</P> |
| 197 |
documentation describes how to compile and run it. |
documentation describes how to compile and run it. |
| 198 |
</P> |
</P> |
| 199 |
<P> |
<P> |
| 200 |
|
Just-in-time compiler support is an optional feature of PCRE that can be built |
| 201 |
|
in appropriate hardware environments. It greatly speeds up the matching |
| 202 |
|
performance of many patterns. Simple programs can easily request that it be |
| 203 |
|
used if available, by setting an option that is ignored when it is not |
| 204 |
|
relevant. More complicated programs might need to make use of the functions |
| 205 |
|
<b>pcre_jit_stack_alloc()</b>, <b>pcre_jit_stack_free()</b>, and |
| 206 |
|
<b>pcre_assign_jit_stack()</b> in order to control the JIT code's memory usage. |
| 207 |
|
These functions are discussed in the |
| 208 |
|
<a href="pcrejit.html"><b>pcrejit</b></a> |
| 209 |
|
documentation. |
| 210 |
|
</P> |
| 211 |
|
<P> |
| 212 |
A second matching function, <b>pcre_dfa_exec()</b>, which is not |
A second matching function, <b>pcre_dfa_exec()</b>, which is not |
| 213 |
Perl-compatible, is also provided. This uses a different algorithm for the |
Perl-compatible, is also provided. This uses a different algorithm for the |
| 214 |
matching. The alternative algorithm finds all possible matches (at a given |
matching. The alternative algorithm finds all possible matches (at a given |
| 284 |
<a href="pcrecallout.html"><b>pcrecallout</b></a> |
<a href="pcrecallout.html"><b>pcrecallout</b></a> |
| 285 |
documentation. |
documentation. |
| 286 |
<a name="newlines"></a></P> |
<a name="newlines"></a></P> |
| 287 |
<br><a name="SEC3" href="#TOC1">NEWLINES</a><br> |
<br><a name="SEC5" href="#TOC1">NEWLINES</a><br> |
| 288 |
<P> |
<P> |
| 289 |
PCRE supports five different conventions for indicating line breaks in |
PCRE supports five different conventions for indicating line breaks in |
| 290 |
strings: a single CR (carriage return) character, a single LF (linefeed) |
strings: a single CR (carriage return) character, a single LF (linefeed) |
| 323 |
the \n or \r escape sequences, nor does it affect what \R matches, which is |
the \n or \r escape sequences, nor does it affect what \R matches, which is |
| 324 |
controlled in a similar way, but by separate options. |
controlled in a similar way, but by separate options. |
| 325 |
</P> |
</P> |
| 326 |
<br><a name="SEC4" href="#TOC1">MULTITHREADING</a><br> |
<br><a name="SEC6" href="#TOC1">MULTITHREADING</a><br> |
| 327 |
<P> |
<P> |
| 328 |
The PCRE functions can be used in multi-threading applications, with the |
The PCRE functions can be used in multi-threading applications, with the |
| 329 |
proviso that the memory management functions pointed to by <b>pcre_malloc</b>, |
proviso that the memory management functions pointed to by <b>pcre_malloc</b>, |
| 334 |
The compiled form of a regular expression is not altered during matching, so |
The compiled form of a regular expression is not altered during matching, so |
| 335 |
the same compiled pattern can safely be used by several threads at once. |
the same compiled pattern can safely be used by several threads at once. |
| 336 |
</P> |
</P> |
| 337 |
<br><a name="SEC5" href="#TOC1">SAVING PRECOMPILED PATTERNS FOR LATER USE</a><br> |
<P> |
| 338 |
|
If the just-in-time optimization feature is being used, it needs separate |
| 339 |
|
memory stack areas for each thread. See the |
| 340 |
|
<a href="pcrejit.html"><b>pcrejit</b></a> |
| 341 |
|
documentation for more details. |
| 342 |
|
</P> |
| 343 |
|
<br><a name="SEC7" href="#TOC1">SAVING PRECOMPILED PATTERNS FOR LATER USE</a><br> |
| 344 |
<P> |
<P> |
| 345 |
The compiled form of a regular expression can be saved and re-used at a later |
The compiled form of a regular expression can be saved and re-used at a later |
| 346 |
time, possibly by a different program, and even on a host other than the one on |
time, possibly by a different program, and even on a host other than the one on |
| 350 |
for use with a different version is not guaranteed to work and may cause |
for use with a different version is not guaranteed to work and may cause |
| 351 |
crashes. |
crashes. |
| 352 |
</P> |
</P> |
| 353 |
<br><a name="SEC6" href="#TOC1">CHECKING BUILD-TIME OPTIONS</a><br> |
<br><a name="SEC8" href="#TOC1">CHECKING BUILD-TIME OPTIONS</a><br> |
| 354 |
<P> |
<P> |
| 355 |
<b>int pcre_config(int <i>what</i>, void *<i>where</i>);</b> |
<b>int pcre_config(int <i>what</i>, void *<i>where</i>);</b> |
| 356 |
</P> |
</P> |
| 375 |
The output is an integer that is set to one if support for Unicode character |
The output is an integer that is set to one if support for Unicode character |
| 376 |
properties is available; otherwise it is set to zero. |
properties is available; otherwise it is set to zero. |
| 377 |
<pre> |
<pre> |
| 378 |
|
PCRE_CONFIG_JIT |
| 379 |
|
</pre> |
| 380 |
|
The output is an integer that is set to one if support for just-in-time |
| 381 |
|
compiling is available; otherwise it is set to zero. |
| 382 |
|
<pre> |
| 383 |
PCRE_CONFIG_NEWLINE |
PCRE_CONFIG_NEWLINE |
| 384 |
</pre> |
</pre> |
| 385 |
The output is an integer whose value specifies the default character sequence |
The output is an integer whose value specifies the default character sequence |
| 434 |
<b>pcre_stack_free</b> are called to manage memory blocks on the heap, thus |
<b>pcre_stack_free</b> are called to manage memory blocks on the heap, thus |
| 435 |
avoiding the use of the stack. |
avoiding the use of the stack. |
| 436 |
</P> |
</P> |
| 437 |
<br><a name="SEC7" href="#TOC1">COMPILING A PATTERN</a><br> |
<br><a name="SEC9" href="#TOC1">COMPILING A PATTERN</a><br> |
| 438 |
<P> |
<P> |
| 439 |
<b>pcre *pcre_compile(const char *<i>pattern</i>, int <i>options</i>,</b> |
<b>pcre *pcre_compile(const char *<i>pattern</i>, int <i>options</i>,</b> |
| 440 |
<b>const char **<i>errptr</i>, int *<i>erroffset</i>,</b> |
<b>const char **<i>errptr</i>, int *<i>erroffset</i>,</b> |
| 476 |
documentation). For those options that can be different in different parts of |
documentation). For those options that can be different in different parts of |
| 477 |
the pattern, the contents of the <i>options</i> argument specifies their |
the pattern, the contents of the <i>options</i> argument specifies their |
| 478 |
settings at the start of compilation and execution. The PCRE_ANCHORED, |
settings at the start of compilation and execution. The PCRE_ANCHORED, |
| 479 |
PCRE_BSR_<i>xxx</i>, and PCRE_NEWLINE_<i>xxx</i> options can be set at the time |
PCRE_BSR_<i>xxx</i>, PCRE_NEWLINE_<i>xxx</i>, PCRE_NO_UTF8_CHECK, and |
| 480 |
of matching as well as at compile time. |
PCRE_NO_START_OPTIMIZE options can be set at the time of matching as well as at |
| 481 |
|
compile time. |
| 482 |
</P> |
</P> |
| 483 |
<P> |
<P> |
| 484 |
If <i>errptr</i> is NULL, <b>pcre_compile()</b> returns NULL immediately. |
If <i>errptr</i> is NULL, <b>pcre_compile()</b> returns NULL immediately. |
| 485 |
Otherwise, if compilation of a pattern fails, <b>pcre_compile()</b> returns |
Otherwise, if compilation of a pattern fails, <b>pcre_compile()</b> returns |
| 486 |
NULL, and sets the variable pointed to by <i>errptr</i> to point to a textual |
NULL, and sets the variable pointed to by <i>errptr</i> to point to a textual |
| 487 |
error message. This is a static string that is part of the library. You must |
error message. This is a static string that is part of the library. You must |
| 488 |
not try to free it. The byte offset from the start of the pattern to the |
not try to free it. Normally, the offset from the start of the pattern to the |
| 489 |
character that was being processed when the error was discovered is placed in |
byte that was being processed when the error was discovered is placed in the |
| 490 |
the variable pointed to by <i>erroffset</i>, which must not be NULL. If it is, |
variable pointed to by <i>erroffset</i>, which must not be NULL (if it is, an |
| 491 |
an immediate error is given. Some errors are not detected until checks are |
immediate error is given). However, for an invalid UTF-8 string, the offset is |
| 492 |
carried out when the whole pattern has been scanned; in this case the offset is |
that of the first byte of the failing character. Also, some errors are not |
| 493 |
set to the end of the pattern. |
detected until checks are carried out when the whole pattern has been scanned; |
| 494 |
|
in these cases the offset passed back is the length of the pattern. |
| 495 |
|
</P> |
| 496 |
|
<P> |
| 497 |
|
Note that the offset is in bytes, not characters, even in UTF-8 mode. It may |
| 498 |
|
sometimes point into the middle of a UTF-8 character. |
| 499 |
</P> |
</P> |
| 500 |
<P> |
<P> |
| 501 |
If <b>pcre_compile2()</b> is used instead of <b>pcre_compile()</b>, and the |
If <b>pcre_compile2()</b> is used instead of <b>pcre_compile()</b>, and the |
| 575 |
<pre> |
<pre> |
| 576 |
PCRE_DOTALL |
PCRE_DOTALL |
| 577 |
</pre> |
</pre> |
| 578 |
If this bit is set, a dot metacharater in the pattern matches all characters, |
If this bit is set, a dot metacharacter in the pattern matches a character of |
| 579 |
including those that indicate newline. Without it, a dot does not match when |
any value, including one that indicates a newline. However, it only ever |
| 580 |
the current position is at a newline. This option is equivalent to Perl's /s |
matches one character, even if newlines are coded as CRLF. Without this option, |
| 581 |
option, and it can be changed within a pattern by a (?s) option setting. A |
a dot does not match when the current position is at a newline. This option is |
| 582 |
negative class such as [^a] always matches newline characters, independent of |
equivalent to Perl's /s option, and it can be changed within a pattern by a |
| 583 |
the setting of this option. |
(?s) option setting. A negative class such as [^a] always matches newline |
| 584 |
|
characters, independent of the setting of this option. |
| 585 |
<pre> |
<pre> |
| 586 |
PCRE_DUPNAMES |
PCRE_DUPNAMES |
| 587 |
</pre> |
</pre> |
| 602 |
pattern by a (?x) option setting. |
pattern by a (?x) option setting. |
| 603 |
</P> |
</P> |
| 604 |
<P> |
<P> |
| 605 |
|
Which characters are interpreted as newlines is controlled by the options |
| 606 |
|
passed to <b>pcre_compile()</b> or by a special sequence at the start of the |
| 607 |
|
pattern, as described in the section entitled |
| 608 |
|
<a href="pcrepattern.html#newlines">"Newline conventions"</a> |
| 609 |
|
in the <b>pcrepattern</b> documentation. Note that the end of this type of |
| 610 |
|
comment is a literal newline sequence in the pattern; escape sequences that |
| 611 |
|
happen to represent a newline do not count. |
| 612 |
|
</P> |
| 613 |
|
<P> |
| 614 |
This option makes it possible to include comments inside complicated patterns. |
This option makes it possible to include comments inside complicated patterns. |
| 615 |
Note, however, that this applies only to data characters. Whitespace characters |
Note, however, that this applies only to data characters. Whitespace characters |
| 616 |
may never appear within special character sequences in a pattern, for example |
may never appear within special character sequences in a pattern, for example |
| 617 |
within the sequence (?( which introduces a conditional subpattern. |
within the sequence (?( that introduces a conditional subpattern. |
| 618 |
<pre> |
<pre> |
| 619 |
PCRE_EXTRA |
PCRE_EXTRA |
| 620 |
</pre> |
</pre> |
| 649 |
string (by default this causes the current matching alternative to fail). A |
string (by default this causes the current matching alternative to fail). A |
| 650 |
pattern such as (\1)(a) succeeds when this option is set (assuming it can find |
pattern such as (\1)(a) succeeds when this option is set (assuming it can find |
| 651 |
an "a" in the subject), whereas it fails by default, for Perl compatibility. |
an "a" in the subject), whereas it fails by default, for Perl compatibility. |
| 652 |
|
</P> |
| 653 |
|
<P> |
| 654 |
|
(3) \U matches an upper case "U" character; by default \U causes a compile |
| 655 |
|
time error (Perl uses \U to upper case subsequent characters). |
| 656 |
|
</P> |
| 657 |
|
<P> |
| 658 |
|
(4) \u matches a lower case "u" character unless it is followed by four |
| 659 |
|
hexadecimal digits, in which case the hexadecimal number defines the code point |
| 660 |
|
to match. By default, \u causes a compile time error (Perl uses it to upper |
| 661 |
|
case the following character). |
| 662 |
|
</P> |
| 663 |
|
<P> |
| 664 |
|
(5) \x matches a lower case "x" character unless it is followed by two |
| 665 |
|
hexadecimal digits, in which case the hexadecimal number defines the code point |
| 666 |
|
to match. By default, as in Perl, a hexadecimal number is always expected after |
| 667 |
|
\x, but it may have zero, one, or two digits (so, for example, \xz matches a |
| 668 |
|
binary zero character followed by z). |
| 669 |
<pre> |
<pre> |
| 670 |
PCRE_MULTILINE |
PCRE_MULTILINE |
| 671 |
</pre> |
</pre> |
| 711 |
other combinations may yield unused numbers and cause an error. |
other combinations may yield unused numbers and cause an error. |
| 712 |
</P> |
</P> |
| 713 |
<P> |
<P> |
| 714 |
The only time that a line break is specially recognized when compiling a |
The only time that a line break in a pattern is specially recognized when |
| 715 |
pattern is if PCRE_EXTENDED is set, and an unescaped # outside a character |
compiling is when PCRE_EXTENDED is set. CR and LF are whitespace characters, |
| 716 |
class is encountered. This indicates a comment that lasts until after the next |
and so are ignored in this mode. Also, an unescaped # outside a character class |
| 717 |
line break sequence. In other circumstances, line break sequences are treated |
indicates a comment that lasts until after the next line break sequence. In |
| 718 |
as literal data, except that in PCRE_EXTENDED mode, both CR and LF are treated |
other circumstances, line break sequences in patterns are treated as literal |
| 719 |
as whitespace characters and are therefore ignored. |
data. |
| 720 |
</P> |
</P> |
| 721 |
<P> |
<P> |
| 722 |
The newline option that is set at compile time becomes the default that is used |
The newline option that is set at compile time becomes the default that is used |
| 730 |
they acquire numbers in the usual way). There is no equivalent of this option |
they acquire numbers in the usual way). There is no equivalent of this option |
| 731 |
in Perl. |
in Perl. |
| 732 |
<pre> |
<pre> |
| 733 |
|
PCRE_NO_START_OPTIMIZE |
| 734 |
|
</pre> |
| 735 |
|
This is an option that acts at matching time; that is, it is really an option |
| 736 |
|
for <b>pcre_exec()</b> or <b>pcre_dfa_exec()</b>. If it is set at compile time, |
| 737 |
|
it is remembered with the compiled pattern and assumed at matching time. For |
| 738 |
|
details see the discussion of PCRE_NO_START_OPTIMIZE |
| 739 |
|
<a href="#execoptions">below.</a> |
| 740 |
|
<pre> |
| 741 |
PCRE_UCP |
PCRE_UCP |
| 742 |
</pre> |
</pre> |
| 743 |
This option changes the way PCRE processes \b, \d, \s, \w, and some of the |
This option changes the way PCRE processes \B, \b, \D, \d, \S, \s, \W, |
| 744 |
POSIX character classes. By default, only ASCII characters are recognized, but |
\w, and some of the POSIX character classes. By default, only ASCII characters |
| 745 |
if PCRE_UCP is set, Unicode properties are used instead to classify characters. |
are recognized, but if PCRE_UCP is set, Unicode properties are used instead to |
| 746 |
More details are given in the section on |
classify characters. More details are given in the section on |
| 747 |
<a href="pcre.html#genericchartypes">generic character types</a> |
<a href="pcre.html#genericchartypes">generic character types</a> |
| 748 |
in the |
in the |
| 749 |
<a href="pcrepattern.html"><b>pcrepattern</b></a> |
<a href="pcrepattern.html"><b>pcrepattern</b></a> |
| 764 |
available only when PCRE is built to include UTF-8 support. If not, the use |
available only when PCRE is built to include UTF-8 support. If not, the use |
| 765 |
of this option provokes an error. Details of how this option changes the |
of this option provokes an error. Details of how this option changes the |
| 766 |
behaviour of PCRE are given in the |
behaviour of PCRE are given in the |
| 767 |
<a href="pcre.html#utf8support">section on UTF-8 support</a> |
<a href="pcreunicode.html"><b>pcreunicode</b></a> |
|
in the main |
|
|
<a href="pcre.html"><b>pcre</b></a> |
|
| 768 |
page. |
page. |
| 769 |
<pre> |
<pre> |
| 770 |
PCRE_NO_UTF8_CHECK |
PCRE_NO_UTF8_CHECK |
| 782 |
can also be passed to <b>pcre_exec()</b> and <b>pcre_dfa_exec()</b>, to suppress |
can also be passed to <b>pcre_exec()</b> and <b>pcre_dfa_exec()</b>, to suppress |
| 783 |
the UTF-8 validity checking of subject strings. |
the UTF-8 validity checking of subject strings. |
| 784 |
</P> |
</P> |
| 785 |
<br><a name="SEC8" href="#TOC1">COMPILATION ERROR CODES</a><br> |
<br><a name="SEC10" href="#TOC1">COMPILATION ERROR CODES</a><br> |
| 786 |
<P> |
<P> |
| 787 |
The following table lists the error codes that may be returned by |
The following table lists the error codes that may be returned by |
| 788 |
<b>pcre_compile2()</b>, along with the error messages that may be returned by |
<b>pcre_compile2()</b>, along with the error messages that may be returned by |
| 826 |
34 character value in \x{...} sequence is too large |
34 character value in \x{...} sequence is too large |
| 827 |
35 invalid condition (?(0) |
35 invalid condition (?(0) |
| 828 |
36 \C not allowed in lookbehind assertion |
36 \C not allowed in lookbehind assertion |
| 829 |
37 PCRE does not support \L, \l, \N, \U, or \u |
37 PCRE does not support \L, \l, \N{name}, \U, or \u |
| 830 |
38 number after (?C is > 255 |
38 number after (?C is > 255 |
| 831 |
39 closing ) for (?C expected |
39 closing ) for (?C expected |
| 832 |
40 recursive call could loop indefinitely |
40 recursive call could loop indefinitely |
| 842 |
50 [this code is not in use] |
50 [this code is not in use] |
| 843 |
51 octal value is greater than \377 (not in UTF-8 mode) |
51 octal value is greater than \377 (not in UTF-8 mode) |
| 844 |
52 internal error: overran compiling workspace |
52 internal error: overran compiling workspace |
| 845 |
53 internal error: previously-checked referenced subpattern not found |
53 internal error: previously-checked referenced subpattern |
| 846 |
|
not found |
| 847 |
54 DEFINE group contains more than one branch |
54 DEFINE group contains more than one branch |
| 848 |
55 repeating a DEFINE group is not allowed |
55 repeating a DEFINE group is not allowed |
| 849 |
56 inconsistent NEWLINE options |
56 inconsistent NEWLINE options |
| 856 |
62 subpattern name expected |
62 subpattern name expected |
| 857 |
63 digit expected after (?+ |
63 digit expected after (?+ |
| 858 |
64 ] is an invalid data character in JavaScript compatibility mode |
64 ] is an invalid data character in JavaScript compatibility mode |
| 859 |
65 different names for subpatterns of the same number are not allowed |
65 different names for subpatterns of the same number are |
| 860 |
|
not allowed |
| 861 |
66 (*MARK) must have an argument |
66 (*MARK) must have an argument |
| 862 |
67 this version of PCRE is not compiled with PCRE_UCP support |
67 this version of PCRE is not compiled with PCRE_UCP support |
| 863 |
|
68 \c must be followed by an ASCII character |
| 864 |
|
69 \k is not followed by a braced, angle-bracketed, or quoted name |
| 865 |
</pre> |
</pre> |
| 866 |
The numbers 32 and 10000 in errors 48 and 49 are defaults; different values may |
The numbers 32 and 10000 in errors 48 and 49 are defaults; different values may |
| 867 |
be used if the limits were changed when PCRE was built. |
be used if the limits were changed when PCRE was built. |
| 868 |
</P> |
<a name="studyingapattern"></a></P> |
| 869 |
<br><a name="SEC9" href="#TOC1">STUDYING A PATTERN</a><br> |
<br><a name="SEC11" href="#TOC1">STUDYING A PATTERN</a><br> |
| 870 |
<P> |
<P> |
| 871 |
<b>pcre_extra *pcre_study(const pcre *<i>code</i>, int <i>options</i>,</b> |
<b>pcre_extra *pcre_study(const pcre *<i>code</i>, int <i>options</i>,</b> |
| 872 |
<b>const char **<i>errptr</i>);</b> |
<b>const char **<i>errptr</i>);</b> |
| 895 |
<b>pcre_dfa_exec()</b>, it must set up its own <b>pcre_extra</b> block. |
<b>pcre_dfa_exec()</b>, it must set up its own <b>pcre_extra</b> block. |
| 896 |
</P> |
</P> |
| 897 |
<P> |
<P> |
| 898 |
The second argument of <b>pcre_study()</b> contains option bits. At present, no |
The second argument of <b>pcre_study()</b> contains option bits. There is only |
| 899 |
options are defined, and this argument should always be zero. |
one option: PCRE_STUDY_JIT_COMPILE. If this is set, and the just-in-time |
| 900 |
|
compiler is available, the pattern is further compiled into machine code that |
| 901 |
|
executes much faster than the <b>pcre_exec()</b> matching function. If |
| 902 |
|
the just-in-time compiler is not available, this option is ignored. All other |
| 903 |
|
bits in the <i>options</i> argument must be zero. |
| 904 |
|
</P> |
| 905 |
|
<P> |
| 906 |
|
JIT compilation is a heavyweight optimization. It can take some time for |
| 907 |
|
patterns to be analyzed, and for one-off matches and simple patterns the |
| 908 |
|
benefit of faster execution might be offset by a much slower study time. |
| 909 |
|
Not all patterns can be optimized by the JIT compiler. For those that cannot be |
| 910 |
|
handled, matching automatically falls back to the <b>pcre_exec()</b> |
| 911 |
|
interpreter. For more details, see the |
| 912 |
|
<a href="pcrejit.html"><b>pcrejit</b></a> |
| 913 |
|
documentation. |
| 914 |
</P> |
</P> |
| 915 |
<P> |
<P> |
| 916 |
The third argument for <b>pcre_study()</b> is a pointer for an error message. If |
The third argument for <b>pcre_study()</b> is a pointer for an error message. If |
| 921 |
sure that it has run successfully. |
sure that it has run successfully. |
| 922 |
</P> |
</P> |
| 923 |
<P> |
<P> |
| 924 |
This is a typical call to <b>pcre_study</b>(): |
When you are finished with a pattern, you can free the memory used for the |
| 925 |
|
study data by calling <b>pcre_free_study()</b>. This function was added to the |
| 926 |
|
API for release 8.20. For earlier versions, the memory could be freed with |
| 927 |
|
<b>pcre_free()</b>, just like the pattern itself. This will still work in cases |
| 928 |
|
where PCRE_STUDY_JIT_COMPILE is not used, but it is advisable to change to the |
| 929 |
|
new function when convenient. |
| 930 |
|
</P> |
| 931 |
|
<P> |
| 932 |
|
This is a typical way in which <b>pcre_study</b>() is used (except that in a |
| 933 |
|
real application there should be tests for errors): |
| 934 |
<pre> |
<pre> |
| 935 |
pcre_extra *pe; |
int rc; |
| 936 |
pe = pcre_study( |
pcre *re; |
| 937 |
|
pcre_extra *sd; |
| 938 |
|
re = pcre_compile("pattern", 0, &error, &erroroffset, NULL); |
| 939 |
|
sd = pcre_study( |
| 940 |
re, /* result of pcre_compile() */ |
re, /* result of pcre_compile() */ |
| 941 |
0, /* no options exist */ |
0, /* no options */ |
| 942 |
&error); /* set to NULL or points to a message */ |
&error); /* set to NULL or points to a message */ |
| 943 |
|
rc = pcre_exec( /* see below for details of pcre_exec() options */ |
| 944 |
|
re, sd, "subject", 7, 0, 0, ovector, 30); |
| 945 |
|
... |
| 946 |
|
pcre_free_study(sd); |
| 947 |
|
pcre_free(re); |
| 948 |
</pre> |
</pre> |
| 949 |
Studying a pattern does two things: first, a lower bound for the length of |
Studying a pattern does two things: first, a lower bound for the length of |
| 950 |
subject string that is needed to match the pattern is computed. This does not |
subject string that is needed to match the pattern is computed. This does not |
| 959 |
single fixed starting character. A bitmap of possible starting bytes is |
single fixed starting character. A bitmap of possible starting bytes is |
| 960 |
created. This speeds up finding a position in the subject at which to start |
created. This speeds up finding a position in the subject at which to start |
| 961 |
matching. |
matching. |
| 962 |
|
</P> |
| 963 |
|
<P> |
| 964 |
|
These two optimizations apply to both <b>pcre_exec()</b> and |
| 965 |
|
<b>pcre_dfa_exec()</b>. However, they are not used by <b>pcre_exec()</b> if |
| 966 |
|
<b>pcre_study()</b> is called with the PCRE_STUDY_JIT_COMPILE option, and |
| 967 |
|
just-in-time compiling is successful. The optimizations can be disabled by |
| 968 |
|
setting the PCRE_NO_START_OPTIMIZE option when calling <b>pcre_exec()</b> or |
| 969 |
|
<b>pcre_dfa_exec()</b>. You might want to do this if your pattern contains |
| 970 |
|
callouts or (*MARK) (which cannot be handled by the JIT compiler), and you want |
| 971 |
|
to make use of these facilities in cases where matching fails. See the |
| 972 |
|
discussion of PCRE_NO_START_OPTIMIZE |
| 973 |
|
<a href="#execoptions">below.</a> |
| 974 |
<a name="localesupport"></a></P> |
<a name="localesupport"></a></P> |
| 975 |
<br><a name="SEC10" href="#TOC1">LOCALE SUPPORT</a><br> |
<br><a name="SEC12" href="#TOC1">LOCALE SUPPORT</a><br> |
| 976 |
<P> |
<P> |
| 977 |
PCRE handles caseless matching, and determines whether characters are letters, |
PCRE handles caseless matching, and determines whether characters are letters, |
| 978 |
digits, or whatever, by reference to a set of tables, indexed by character |
digits, or whatever, by reference to a set of tables, indexed by character |
| 1032 |
this facility could be used to match a pattern in a different locale from the |
this facility could be used to match a pattern in a different locale from the |
| 1033 |
one in which it was compiled. Passing table pointers at run time is discussed |
one in which it was compiled. Passing table pointers at run time is discussed |
| 1034 |
below in the section on matching a pattern. |
below in the section on matching a pattern. |
| 1035 |
</P> |
<a name="infoaboutpattern"></a></P> |
| 1036 |
<br><a name="SEC11" href="#TOC1">INFORMATION ABOUT A PATTERN</a><br> |
<br><a name="SEC13" href="#TOC1">INFORMATION ABOUT A PATTERN</a><br> |
| 1037 |
<P> |
<P> |
| 1038 |
<b>int pcre_fullinfo(const pcre *<i>code</i>, const pcre_extra *<i>extra</i>,</b> |
<b>int pcre_fullinfo(const pcre *<i>code</i>, const pcre_extra *<i>extra</i>,</b> |
| 1039 |
<b>int <i>what</i>, void *<i>where</i>);</b> |
<b>int <i>what</i>, void *<i>where</i>);</b> |
| 1064 |
size_t length; |
size_t length; |
| 1065 |
rc = pcre_fullinfo( |
rc = pcre_fullinfo( |
| 1066 |
re, /* result of pcre_compile() */ |
re, /* result of pcre_compile() */ |
| 1067 |
pe, /* result of pcre_study(), or NULL */ |
sd, /* result of pcre_study(), or NULL */ |
| 1068 |
PCRE_INFO_SIZE, /* what is required */ |
PCRE_INFO_SIZE, /* what is required */ |
| 1069 |
&length); /* where to put the data */ |
&length); /* where to put the data */ |
| 1070 |
</pre> |
</pre> |
| 1133 |
0. The fourth argument should point to an <b>int</b> variable. (?J) and |
0. The fourth argument should point to an <b>int</b> variable. (?J) and |
| 1134 |
(?-J) set and unset the local PCRE_DUPNAMES option, respectively. |
(?-J) set and unset the local PCRE_DUPNAMES option, respectively. |
| 1135 |
<pre> |
<pre> |
| 1136 |
|
PCRE_INFO_JIT |
| 1137 |
|
</pre> |
| 1138 |
|
Return 1 if the pattern was studied with the PCRE_STUDY_JIT_COMPILE option, and |
| 1139 |
|
just-in-time compiling was successful. The fourth argument should point to an |
| 1140 |
|
<b>int</b> variable. A return value of 0 means that JIT support is not available |
| 1141 |
|
in this version of PCRE, or that the pattern was not studied with the |
| 1142 |
|
PCRE_STUDY_JIT_COMPILE option, or that the JIT compiler could not handle this |
| 1143 |
|
particular pattern. See the |
| 1144 |
|
<a href="pcrejit.html"><b>pcrejit</b></a> |
| 1145 |
|
documentation for details of what can and cannot be handled. |
| 1146 |
|
<pre> |
| 1147 |
|
PCRE_INFO_JITSIZE |
| 1148 |
|
</pre> |
| 1149 |
|
If the pattern was successfully studied with the PCRE_STUDY_JIT_COMPILE option, |
| 1150 |
|
return the size of the JIT compiled code, otherwise return zero. The fourth |
| 1151 |
|
argument should point to a <b>size_t</b> variable. |
| 1152 |
|
<pre> |
| 1153 |
PCRE_INFO_LASTLITERAL |
PCRE_INFO_LASTLITERAL |
| 1154 |
</pre> |
</pre> |
| 1155 |
Return the value of the rightmost literal byte that must exist in any matched |
Return the value of the rightmost literal byte that must exist in any matched |
| 1258 |
<pre> |
<pre> |
| 1259 |
PCRE_INFO_SIZE |
PCRE_INFO_SIZE |
| 1260 |
</pre> |
</pre> |
| 1261 |
Return the size of the compiled pattern, that is, the value that was passed as |
Return the size of the compiled pattern. The fourth argument should point to a |
| 1262 |
the argument to <b>pcre_malloc()</b> when PCRE was getting memory in which to |
<b>size_t</b> variable. This value does not include the size of the <b>pcre</b> |
| 1263 |
place the compiled data. The fourth argument should point to a <b>size_t</b> |
structure that is returned by <b>pcre_compile()</b>. The value that is passed as |
| 1264 |
variable. |
the argument to <b>pcre_malloc()</b> when <b>pcre_compile()</b> is getting memory |
| 1265 |
|
in which to place the compiled data is the value returned by this option plus |
| 1266 |
|
the size of the <b>pcre</b> structure. Studying a compiled pattern, with or |
| 1267 |
|
without JIT, does not alter the value returned by this option. |
| 1268 |
<pre> |
<pre> |
| 1269 |
PCRE_INFO_STUDYSIZE |
PCRE_INFO_STUDYSIZE |
| 1270 |
</pre> |
</pre> |
| 1271 |
Return the size of the data block pointed to by the <i>study_data</i> field in |
Return the size of the data block pointed to by the <i>study_data</i> field in a |
| 1272 |
a <b>pcre_extra</b> block. That is, it is the value that was passed to |
<b>pcre_extra</b> block. If <b>pcre_extra</b> is NULL, or there is no study data, |
| 1273 |
<b>pcre_malloc()</b> when PCRE was getting memory into which to place the data |
zero is returned. The fourth argument should point to a <b>size_t</b> variable. |
| 1274 |
created by <b>pcre_study()</b>. If <b>pcre_extra</b> is NULL, or there is no |
The <i>study_data</i> field is set by <b>pcre_study()</b> to record information |
| 1275 |
study data, zero is returned. The fourth argument should point to a |
that will speed up matching (see the section entitled |
| 1276 |
<b>size_t</b> variable. |
<a href="#studyingapattern">"Studying a pattern"</a> |
| 1277 |
|
above). The format of the <i>study_data</i> block is private, but its length |
| 1278 |
|
is made available via this option so that it can be saved and restored (see the |
| 1279 |
|
<a href="pcreprecompile.html"><b>pcreprecompile</b></a> |
| 1280 |
|
documentation for details). |
| 1281 |
</P> |
</P> |
| 1282 |
<br><a name="SEC12" href="#TOC1">OBSOLETE INFO FUNCTION</a><br> |
<br><a name="SEC14" href="#TOC1">OBSOLETE INFO FUNCTION</a><br> |
| 1283 |
<P> |
<P> |
| 1284 |
<b>int pcre_info(const pcre *<i>code</i>, int *<i>optptr</i>, int</b> |
<b>int pcre_info(const pcre *<i>code</i>, int *<i>optptr</i>, int</b> |
| 1285 |
<b>*<i>firstcharptr</i>);</b> |
<b>*<i>firstcharptr</i>);</b> |
| 1303 |
it is used to pass back information about the first character of any matched |
it is used to pass back information about the first character of any matched |
| 1304 |
string (see PCRE_INFO_FIRSTBYTE above). |
string (see PCRE_INFO_FIRSTBYTE above). |
| 1305 |
</P> |
</P> |
| 1306 |
<br><a name="SEC13" href="#TOC1">REFERENCE COUNTS</a><br> |
<br><a name="SEC15" href="#TOC1">REFERENCE COUNTS</a><br> |
| 1307 |
<P> |
<P> |
| 1308 |
<b>int pcre_refcount(pcre *<i>code</i>, int <i>adjust</i>);</b> |
<b>int pcre_refcount(pcre *<i>code</i>, int <i>adjust</i>);</b> |
| 1309 |
</P> |
</P> |
| 1327 |
pattern is compiled on one host and then transferred to a host whose byte-order |
pattern is compiled on one host and then transferred to a host whose byte-order |
| 1328 |
is different. (This seems a highly unlikely scenario.) |
is different. (This seems a highly unlikely scenario.) |
| 1329 |
</P> |
</P> |
| 1330 |
<br><a name="SEC14" href="#TOC1">MATCHING A PATTERN: THE TRADITIONAL FUNCTION</a><br> |
<br><a name="SEC16" href="#TOC1">MATCHING A PATTERN: THE TRADITIONAL FUNCTION</a><br> |
| 1331 |
<P> |
<P> |
| 1332 |
<b>int pcre_exec(const pcre *<i>code</i>, const pcre_extra *<i>extra</i>,</b> |
<b>int pcre_exec(const pcre *<i>code</i>, const pcre_extra *<i>extra</i>,</b> |
| 1333 |
<b>const char *<i>subject</i>, int <i>length</i>, int <i>startoffset</i>,</b> |
<b>const char *<i>subject</i>, int <i>length</i>, int <i>startoffset</i>,</b> |
| 1337 |
The function <b>pcre_exec()</b> is called to match a subject string against a |
The function <b>pcre_exec()</b> is called to match a subject string against a |
| 1338 |
compiled pattern, which is passed in the <i>code</i> argument. If the |
compiled pattern, which is passed in the <i>code</i> argument. If the |
| 1339 |
pattern was studied, the result of the study should be passed in the |
pattern was studied, the result of the study should be passed in the |
| 1340 |
<i>extra</i> argument. This function is the main matching facility of the |
<i>extra</i> argument. You can call <b>pcre_exec()</b> with the same <i>code</i> |
| 1341 |
library, and it operates in a Perl-like manner. For specialist use there is |
and <i>extra</i> arguments as many times as you like, in order to match |
| 1342 |
also an alternative matching function, which is described |
different subject strings with the same pattern. |
| 1343 |
|
</P> |
| 1344 |
|
<P> |
| 1345 |
|
This function is the main matching facility of the library, and it operates in |
| 1346 |
|
a Perl-like manner. For specialist use there is also an alternative matching |
| 1347 |
|
function, which is described |
| 1348 |
<a href="#dfamatch">below</a> |
<a href="#dfamatch">below</a> |
| 1349 |
in the section about the <b>pcre_dfa_exec()</b> function. |
in the section about the <b>pcre_dfa_exec()</b> function. |
| 1350 |
</P> |
</P> |
| 1385 |
<pre> |
<pre> |
| 1386 |
unsigned long int <i>flags</i>; |
unsigned long int <i>flags</i>; |
| 1387 |
void *<i>study_data</i>; |
void *<i>study_data</i>; |
| 1388 |
|
void *<i>executable_jit</i>; |
| 1389 |
unsigned long int <i>match_limit</i>; |
unsigned long int <i>match_limit</i>; |
| 1390 |
unsigned long int <i>match_limit_recursion</i>; |
unsigned long int <i>match_limit_recursion</i>; |
| 1391 |
void *<i>callout_data</i>; |
void *<i>callout_data</i>; |
| 1396 |
are set. The flag bits are: |
are set. The flag bits are: |
| 1397 |
<pre> |
<pre> |
| 1398 |
PCRE_EXTRA_STUDY_DATA |
PCRE_EXTRA_STUDY_DATA |
| 1399 |
|
PCRE_EXTRA_EXECUTABLE_JIT |
| 1400 |
PCRE_EXTRA_MATCH_LIMIT |
PCRE_EXTRA_MATCH_LIMIT |
| 1401 |
PCRE_EXTRA_MATCH_LIMIT_RECURSION |
PCRE_EXTRA_MATCH_LIMIT_RECURSION |
| 1402 |
PCRE_EXTRA_CALLOUT_DATA |
PCRE_EXTRA_CALLOUT_DATA |
| 1403 |
PCRE_EXTRA_TABLES |
PCRE_EXTRA_TABLES |
| 1404 |
PCRE_EXTRA_MARK |
PCRE_EXTRA_MARK |
| 1405 |
</pre> |
</pre> |
| 1406 |
Other flag bits should be set to zero. The <i>study_data</i> field is set in the |
Other flag bits should be set to zero. The <i>study_data</i> field and sometimes |
| 1407 |
<b>pcre_extra</b> block that is returned by <b>pcre_study()</b>, together with |
the <i>executable_jit</i> field are set in the <b>pcre_extra</b> block that is |
| 1408 |
the appropriate flag bit. You should not set this yourself, but you may add to |
returned by <b>pcre_study()</b>, together with the appropriate flag bits. You |
| 1409 |
the block by setting the other fields and their corresponding flag bits. |
should not set these yourself, but you may add to the block by setting the |
| 1410 |
|
other fields and their corresponding flag bits. |
| 1411 |
</P> |
</P> |
| 1412 |
<P> |
<P> |
| 1413 |
The <i>match_limit</i> field provides a means of preventing PCRE from using up a |
The <i>match_limit</i> field provides a means of preventing PCRE from using up a |
| 1416 |
classic example is a pattern that uses nested unlimited repeats. |
classic example is a pattern that uses nested unlimited repeats. |
| 1417 |
</P> |
</P> |
| 1418 |
<P> |
<P> |
| 1419 |
Internally, PCRE uses a function called <b>match()</b> which it calls repeatedly |
Internally, <b>pcre_exec()</b> uses a function called <b>match()</b>, which it |
| 1420 |
(sometimes recursively). The limit set by <i>match_limit</i> is imposed on the |
calls repeatedly (sometimes recursively). The limit set by <i>match_limit</i> is |
| 1421 |
number of times this function is called during a match, which has the effect of |
imposed on the number of times this function is called during a match, which |
| 1422 |
limiting the amount of backtracking that can take place. For patterns that are |
has the effect of limiting the amount of backtracking that can take place. For |
| 1423 |
not anchored, the count restarts from zero for each position in the subject |
patterns that are not anchored, the count restarts from zero for each position |
| 1424 |
string. |
in the subject string. |
| 1425 |
|
</P> |
| 1426 |
|
<P> |
| 1427 |
|
When <b>pcre_exec()</b> is called with a pattern that was successfully studied |
| 1428 |
|
with the PCRE_STUDY_JIT_COMPILE option, the way that the matching is executed |
| 1429 |
|
is entirely different. However, there is still the possibility of runaway |
| 1430 |
|
matching that goes on for a very long time, and so the <i>match_limit</i> value |
| 1431 |
|
is also used in this case (but in a different way) to limit how long the |
| 1432 |
|
matching can continue. |
| 1433 |
</P> |
</P> |
| 1434 |
<P> |
<P> |
| 1435 |
The default value for the limit can be set when PCRE is built; the default |
The default value for the limit can be set when PCRE is built; the default |
| 1447 |
This limit is of use only if it is set smaller than <i>match_limit</i>. |
This limit is of use only if it is set smaller than <i>match_limit</i>. |
| 1448 |
</P> |
</P> |
| 1449 |
<P> |
<P> |
| 1450 |
Limiting the recursion depth limits the amount of stack that can be used, or, |
Limiting the recursion depth limits the amount of machine stack that can be |
| 1451 |
when PCRE has been compiled to use memory on the heap instead of the stack, the |
used, or, when PCRE has been compiled to use memory on the heap instead of the |
| 1452 |
amount of heap memory that can be used. |
stack, the amount of heap memory that can be used. This limit is not relevant, |
| 1453 |
|
and is ignored, if the pattern was successfully studied with |
| 1454 |
|
PCRE_STUDY_JIT_COMPILE. |
| 1455 |
</P> |
</P> |
| 1456 |
<P> |
<P> |
| 1457 |
The default value for <i>match_limit_recursion</i> can be set when PCRE is |
The default value for <i>match_limit_recursion</i> can be set when PCRE is |
| 1504 |
PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, PCRE_NOTEMPTY_ATSTART, |
PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, PCRE_NOTEMPTY_ATSTART, |
| 1505 |
PCRE_NO_START_OPTIMIZE, PCRE_NO_UTF8_CHECK, PCRE_PARTIAL_SOFT, and |
PCRE_NO_START_OPTIMIZE, PCRE_NO_UTF8_CHECK, PCRE_PARTIAL_SOFT, and |
| 1506 |
PCRE_PARTIAL_HARD. |
PCRE_PARTIAL_HARD. |
| 1507 |
|
</P> |
| 1508 |
|
<P> |
| 1509 |
|
If the pattern was successfully studied with the PCRE_STUDY_JIT_COMPILE option, |
| 1510 |
|
the only supported options for JIT execution are PCRE_NO_UTF8_CHECK, |
| 1511 |
|
PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, and PCRE_NOTEMPTY_ATSTART. Note in |
| 1512 |
|
particular that partial matching is not supported. If an unsupported option is |
| 1513 |
|
used, JIT execution is disabled and the normal interpretive code in |
| 1514 |
|
<b>pcre_exec()</b> is run. |
| 1515 |
<pre> |
<pre> |
| 1516 |
PCRE_ANCHORED |
PCRE_ANCHORED |
| 1517 |
</pre> |
</pre> |
| 1610 |
ordinary match again. There is some code that demonstrates how to do this in |
ordinary match again. There is some code that demonstrates how to do this in |
| 1611 |
the |
the |
| 1612 |
<a href="pcredemo.html"><b>pcredemo</b></a> |
<a href="pcredemo.html"><b>pcredemo</b></a> |
| 1613 |
sample program. |
sample program. In the most general case, you have to check to see if the |
| 1614 |
|
newline convention recognizes CRLF as a newline, and if so, and the current |
| 1615 |
|
character is CR followed by LF, advance the starting offset by two characters |
| 1616 |
|
instead of one. |
| 1617 |
<pre> |
<pre> |
| 1618 |
PCRE_NO_START_OPTIMIZE |
PCRE_NO_START_OPTIMIZE |
| 1619 |
</pre> |
</pre> |
| 1623 |
for that character, and fails immediately if it cannot find it, without |
for that character, and fails immediately if it cannot find it, without |
| 1624 |
actually running the main matching function. This means that a special item |
actually running the main matching function. This means that a special item |
| 1625 |
such as (*COMMIT) at the start of a pattern is not considered until after a |
such as (*COMMIT) at the start of a pattern is not considered until after a |
| 1626 |
suitable starting point for the match has been found. When callouts are in use, |
suitable starting point for the match has been found. When callouts or (*MARK) |
| 1627 |
these "start-up" optimizations can cause them to be skipped if the pattern is |
items are in use, these "start-up" optimizations can cause them to be skipped |
| 1628 |
never actually used. The PCRE_NO_START_OPTIMIZE option disables the start-up |
if the pattern is never actually used. The start-up optimizations are in effect |
| 1629 |
optimizations, causing performance to suffer, but ensuring that the callouts do |
a pre-scan of the subject that takes place before the pattern is run. |
| 1630 |
occur, and that items such as (*COMMIT) are considered at every possible |
</P> |
| 1631 |
starting position in the subject string. |
<P> |
| 1632 |
|
The PCRE_NO_START_OPTIMIZE option disables the start-up optimizations, possibly |
| 1633 |
|
causing performance to suffer, but ensuring that in cases where the result is |
| 1634 |
|
"no match", the callouts do occur, and that items such as (*COMMIT) and (*MARK) |
| 1635 |
|
are considered at every possible starting position in the subject string. If |
| 1636 |
|
PCRE_NO_START_OPTIMIZE is set at compile time, it cannot be unset at matching |
| 1637 |
|
time. |
| 1638 |
|
</P> |
| 1639 |
|
<P> |
| 1640 |
|
Setting PCRE_NO_START_OPTIMIZE can change the outcome of a matching operation. |
| 1641 |
|
Consider the pattern |
| 1642 |
|
<pre> |
| 1643 |
|
(*COMMIT)ABC |
| 1644 |
|
</pre> |
| 1645 |
|
When this is compiled, PCRE records the fact that a match must start with the |
| 1646 |
|
character "A". Suppose the subject string is "DEFABC". The start-up |
| 1647 |
|
optimization scans along the subject, finds "A" and runs the first match |
| 1648 |
|
attempt from there. The (*COMMIT) item means that the pattern must match the |
| 1649 |
|
current starting position, which in this case, it does. However, if the same |
| 1650 |
|
match is run with PCRE_NO_START_OPTIMIZE set, the initial scan along the |
| 1651 |
|
subject string does not happen. The first match attempt is run starting from |
| 1652 |
|
"D" and when this fails, (*COMMIT) prevents any further matches being tried, so |
| 1653 |
|
the overall result is "no match". If the pattern is studied, more start-up |
| 1654 |
|
optimizations may be used. For example, a minimum length for the subject may be |
| 1655 |
|
recorded. Consider the pattern |
| 1656 |
|
<pre> |
| 1657 |
|
(*MARK:A)(X|Y) |
| 1658 |
|
</pre> |
| 1659 |
|
The minimum length for a match is one character. If the subject is "ABC", there |
| 1660 |
|
will be attempts to match "ABC", "BC", "C", and then finally an empty string. |
| 1661 |
|
If the pattern is studied, the final attempt does not take place, because PCRE |
| 1662 |
|
knows that the subject is too short, and so the (*MARK) is never encountered. |
| 1663 |
|
In this case, studying the pattern does not affect the overall match result, |
| 1664 |
|
which is still "no match", but it does affect the auxiliary information that is |
| 1665 |
|
returned. |
| 1666 |
<pre> |
<pre> |
| 1667 |
PCRE_NO_UTF8_CHECK |
PCRE_NO_UTF8_CHECK |
| 1668 |
</pre> |
</pre> |
| 1675 |
in the main |
in the main |
| 1676 |
<a href="pcre.html"><b>pcre</b></a> |
<a href="pcre.html"><b>pcre</b></a> |
| 1677 |
page. If an invalid UTF-8 sequence of bytes is found, <b>pcre_exec()</b> returns |
page. If an invalid UTF-8 sequence of bytes is found, <b>pcre_exec()</b> returns |
| 1678 |
the error PCRE_ERROR_BADUTF8. If <i>startoffset</i> contains an invalid value, |
the error PCRE_ERROR_BADUTF8 or, if PCRE_PARTIAL_HARD is set and the problem is |
| 1679 |
PCRE_ERROR_BADUTF8_OFFSET is returned. |
a truncated UTF-8 character at the end of the subject, PCRE_ERROR_SHORTUTF8. In |
| 1680 |
|
both cases, information about the precise nature of the error may also be |
| 1681 |
|
returned (see the descriptions of these errors in the section entitled <i>Error</i> |
| 1682 |
|
<i>return values from</i> <b>pcre_exec()</b> |
| 1683 |
|
<a href="#errorlist">below).</a> |
| 1684 |
|
If <i>startoffset</i> contains a value that does not point to the start of a |
| 1685 |
|
UTF-8 character (or to the end of the subject), PCRE_ERROR_BADUTF8_OFFSET is |
| 1686 |
|
returned. |
| 1687 |
</P> |
</P> |
| 1688 |
<P> |
<P> |
| 1689 |
If you already know that your subject is valid, and you want to skip these |
If you already know that your subject is valid, and you want to skip these |
| 1691 |
calling <b>pcre_exec()</b>. You might want to do this for the second and |
calling <b>pcre_exec()</b>. You might want to do this for the second and |
| 1692 |
subsequent calls to <b>pcre_exec()</b> if you are making repeated calls to find |
subsequent calls to <b>pcre_exec()</b> if you are making repeated calls to find |
| 1693 |
all the matches in a single subject string. However, you should be sure that |
all the matches in a single subject string. However, you should be sure that |
| 1694 |
the value of <i>startoffset</i> points to the start of a UTF-8 character. When |
the value of <i>startoffset</i> points to the start of a UTF-8 character (or the |
| 1695 |
PCRE_NO_UTF8_CHECK is set, the effect of passing an invalid UTF-8 string as a |
end of the subject). When PCRE_NO_UTF8_CHECK is set, the effect of passing an |
| 1696 |
subject, or a value of <i>startoffset</i> that does not point to the start of a |
invalid UTF-8 string as a subject or an invalid value of <i>startoffset</i> is |
| 1697 |
UTF-8 character, is undefined. Your program may crash. |
undefined. Your program may crash. |
| 1698 |
<pre> |
<pre> |
| 1699 |
PCRE_PARTIAL_HARD |
PCRE_PARTIAL_HARD |
| 1700 |
PCRE_PARTIAL_SOFT |
PCRE_PARTIAL_SOFT |
| 1703 |
compatibility, PCRE_PARTIAL is a synonym for PCRE_PARTIAL_SOFT. A partial match |
compatibility, PCRE_PARTIAL is a synonym for PCRE_PARTIAL_SOFT. A partial match |
| 1704 |
occurs if the end of the subject string is reached successfully, but there are |
occurs if the end of the subject string is reached successfully, but there are |
| 1705 |
not enough subject characters to complete the match. If this happens when |
not enough subject characters to complete the match. If this happens when |
| 1706 |
PCRE_PARTIAL_HARD is set, <b>pcre_exec()</b> immediately returns |
PCRE_PARTIAL_SOFT (but not PCRE_PARTIAL_HARD) is set, matching continues by |
| 1707 |
PCRE_ERROR_PARTIAL. Otherwise, if PCRE_PARTIAL_SOFT is set, matching continues |
testing any remaining alternatives. Only if no complete match can be found is |
| 1708 |
by testing any other alternatives. Only if they all fail is PCRE_ERROR_PARTIAL |
PCRE_ERROR_PARTIAL returned instead of PCRE_ERROR_NOMATCH. In other words, |
| 1709 |
returned (instead of PCRE_ERROR_NOMATCH). The portion of the string that |
PCRE_PARTIAL_SOFT says that the caller is prepared to handle a partial match, |
| 1710 |
was inspected when the partial match was found is set as the first matching |
but only if no complete match can be found. |
| 1711 |
string. There is a more detailed discussion in the |
</P> |
| 1712 |
|
<P> |
| 1713 |
|
If PCRE_PARTIAL_HARD is set, it overrides PCRE_PARTIAL_SOFT. In this case, if a |
| 1714 |
|
partial match is found, <b>pcre_exec()</b> immediately returns |
| 1715 |
|
PCRE_ERROR_PARTIAL, without considering any other alternatives. In other words, |
| 1716 |
|
when PCRE_PARTIAL_HARD is set, a partial match is considered to be more |
| 1717 |
|
important than an alternative complete match. |
| 1718 |
|
</P> |
| 1719 |
|
<P> |
| 1720 |
|
In both cases, the portion of the string that was inspected when the partial |
| 1721 |
|
match was found is set as the first matching string. There is a more detailed |
| 1722 |
|
discussion of partial and multi-segment matching, with examples, in the |
| 1723 |
<a href="pcrepartial.html"><b>pcrepartial</b></a> |
<a href="pcrepartial.html"><b>pcrepartial</b></a> |
| 1724 |
documentation. |
documentation. |
| 1725 |
</P> |
</P> |
| 1729 |
<P> |
<P> |
| 1730 |
The subject string is passed to <b>pcre_exec()</b> as a pointer in |
The subject string is passed to <b>pcre_exec()</b> as a pointer in |
| 1731 |
<i>subject</i>, a length (in bytes) in <i>length</i>, and a starting byte offset |
<i>subject</i>, a length (in bytes) in <i>length</i>, and a starting byte offset |
| 1732 |
in <i>startoffset</i>. In UTF-8 mode, the byte offset must point to the start of |
in <i>startoffset</i>. If this is negative or greater than the length of the |
| 1733 |
a UTF-8 character. Unlike the pattern string, the subject may contain binary |
subject, <b>pcre_exec()</b> returns PCRE_ERROR_BADOFFSET. When the starting |
| 1734 |
zero bytes. When the starting offset is zero, the search for a match starts at |
offset is zero, the search for a match starts at the beginning of the subject, |
| 1735 |
the beginning of the subject, and this is by far the most common case. |
and this is by far the most common case. In UTF-8 mode, the byte offset must |
| 1736 |
|
point to the start of a UTF-8 character (or the end of the subject). Unlike the |
| 1737 |
|
pattern string, the subject may contain binary zero bytes. |
| 1738 |
</P> |
</P> |
| 1739 |
<P> |
<P> |
| 1740 |
A non-zero starting offset is useful when searching for another match in the |
A non-zero starting offset is useful when searching for another match in the |
| 1756 |
behind the starting point to discover that it is preceded by a letter. |
behind the starting point to discover that it is preceded by a letter. |
| 1757 |
</P> |
</P> |
| 1758 |
<P> |
<P> |
| 1759 |
|
Finding all the matches in a subject is tricky when the pattern can match an |
| 1760 |
|
empty string. It is possible to emulate Perl's /g behaviour by first trying the |
| 1761 |
|
match again at the same offset, with the PCRE_NOTEMPTY_ATSTART and |
| 1762 |
|
PCRE_ANCHORED options, and then if that fails, advancing the starting offset |
| 1763 |
|
and trying an ordinary match again. There is some code that demonstrates how to |
| 1764 |
|
do this in the |
| 1765 |
|
<a href="pcredemo.html"><b>pcredemo</b></a> |
| 1766 |
|
sample program. In the most general case, you have to check to see if the |
| 1767 |
|
newline convention recognizes CRLF as a newline, and if so, and the current |
| 1768 |
|
character is CR followed by LF, advance the starting offset by two characters |
| 1769 |
|
instead of one. |
| 1770 |
|
</P> |
| 1771 |
|
<P> |
| 1772 |
If a non-zero starting offset is passed when the pattern is anchored, one |
If a non-zero starting offset is passed when the pattern is anchored, one |
| 1773 |
attempt to match at the given offset is made. This can only succeed if the |
attempt to match at the given offset is made. This can only succeed if the |
| 1774 |
pattern does not require the match to be at the start of the subject. |
pattern does not require the match to be at the start of the subject. |
| 1823 |
<P> |
<P> |
| 1824 |
If the vector is too small to hold all the captured substring offsets, it is |
If the vector is too small to hold all the captured substring offsets, it is |
| 1825 |
used as far as possible (up to two-thirds of its length), and the function |
used as far as possible (up to two-thirds of its length), and the function |
| 1826 |
returns a value of zero. If the substring offsets are not of interest, |
returns a value of zero. If neither the actual string matched nor any captured |
| 1827 |
<b>pcre_exec()</b> may be called with <i>ovector</i> passed as NULL and |
substrings are of interest, <b>pcre_exec()</b> may be called with <i>ovector</i> |
| 1828 |
<i>ovecsize</i> as zero. However, if the pattern contains back references and |
passed as NULL and <i>ovecsize</i> as zero. However, if the pattern contains |
| 1829 |
the <i>ovector</i> is not big enough to remember the related substrings, PCRE |
back references and the <i>ovector</i> is not big enough to remember the related |
| 1830 |
has to get additional memory for use during matching. Thus it is usually |
substrings, PCRE has to get additional memory for use during matching. Thus it |
| 1831 |
advisable to supply an <i>ovector</i>. |
is usually advisable to supply an <i>ovector</i> of reasonable size. |
| 1832 |
|
</P> |
| 1833 |
|
<P> |
| 1834 |
|
There are some cases where zero is returned (indicating vector overflow) when |
| 1835 |
|
in fact the vector is exactly the right size for the final match. For example, |
| 1836 |
|
consider the pattern |
| 1837 |
|
<pre> |
| 1838 |
|
(a)(?:(b)c|bd) |
| 1839 |
|
</pre> |
| 1840 |
|
If a vector of 6 elements (allowing for only 1 captured substring) is given |
| 1841 |
|
with subject string "abd", <b>pcre_exec()</b> will try to set the second |
| 1842 |
|
captured string, thereby recording a vector overflow, before failing to match |
| 1843 |
|
"c" and backing up to try the second alternative. The zero return, however, |
| 1844 |
|
does correctly indicate that the maximum number of slots (namely 2) have been |
| 1845 |
|
filled. In similar cases where there is temporary overflow, but the final |
| 1846 |
|
number of used slots is actually less than the maximum, a non-zero value is |
| 1847 |
|
returned. |
| 1848 |
</P> |
</P> |
| 1849 |
<P> |
<P> |
| 1850 |
The <b>pcre_fullinfo()</b> function can be used to find out how many capturing |
The <b>pcre_fullinfo()</b> function can be used to find out how many capturing |
| 1865 |
expression are also set to -1. For example, if the string "abc" is matched |
expression are also set to -1. For example, if the string "abc" is matched |
| 1866 |
against the pattern (abc)(x(yz)?)? subpatterns 2 and 3 are not matched. The |
against the pattern (abc)(x(yz)?)? subpatterns 2 and 3 are not matched. The |
| 1867 |
return from the function is 2, because the highest used capturing subpattern |
return from the function is 2, because the highest used capturing subpattern |
| 1868 |
number is 1. However, you can refer to the offsets for the second and third |
number is 1, and the offsets for the second and third capturing subpatterns |
| 1869 |
capturing subpatterns if you wish (assuming the vector is large enough, of |
(assuming the vector is large enough, of course) are set to -1. |
| 1870 |
course). |
</P> |
| 1871 |
|
<P> |
| 1872 |
|
<b>Note</b>: Elements in the first two-thirds of <i>ovector</i> that do not |
| 1873 |
|
correspond to capturing parentheses in the pattern are never changed. That is, |
| 1874 |
|
if a pattern contains <i>n</i> capturing parentheses, no more than |
| 1875 |
|
<i>ovector[0]</i> to <i>ovector[2n+1]</i> are set by <b>pcre_exec()</b>. The other |
| 1876 |
|
elements (in the first two-thirds) retain whatever values they previously had. |
| 1877 |
</P> |
</P> |
| 1878 |
<P> |
<P> |
| 1879 |
Some convenience functions are provided for extracting the captured substrings |
Some convenience functions are provided for extracting the captured substrings |
| 1947 |
<pre> |
<pre> |
| 1948 |
PCRE_ERROR_BADUTF8 (-10) |
PCRE_ERROR_BADUTF8 (-10) |
| 1949 |
</pre> |
</pre> |
| 1950 |
A string that contains an invalid UTF-8 byte sequence was passed as a subject. |
A string that contains an invalid UTF-8 byte sequence was passed as a subject, |
| 1951 |
|
and the PCRE_NO_UTF8_CHECK option was not set. If the size of the output vector |
| 1952 |
|
(<i>ovecsize</i>) is at least 2, the byte offset to the start of the invalid |
| 1953 |
|
UTF-8 character is placed in the first element, and a reason code is placed in |
| 1954 |
|
the second element. The reason codes are listed in the |
| 1955 |
|
<a href="#badutf8reasons">following section.</a> |
| 1956 |
|
For backward compatibility, if PCRE_PARTIAL_HARD is set and the problem is a |
| 1957 |
|
truncated UTF-8 character at the end of the subject (reason codes 1 to 5), |
| 1958 |
|
PCRE_ERROR_SHORTUTF8 is returned instead of PCRE_ERROR_BADUTF8. |
| 1959 |
<pre> |
<pre> |
| 1960 |
PCRE_ERROR_BADUTF8_OFFSET (-11) |
PCRE_ERROR_BADUTF8_OFFSET (-11) |
| 1961 |
</pre> |
</pre> |
| 1962 |
The UTF-8 byte sequence that was passed as a subject was valid, but the value |
The UTF-8 byte sequence that was passed as a subject was checked and found to |
| 1963 |
of <i>startoffset</i> did not point to the beginning of a UTF-8 character. |
be valid (the PCRE_NO_UTF8_CHECK option was not set), but the value of |
| 1964 |
|
<i>startoffset</i> did not point to the beginning of a UTF-8 character or the |
| 1965 |
|
end of the subject. |
| 1966 |
<pre> |
<pre> |
| 1967 |
PCRE_ERROR_PARTIAL (-12) |
PCRE_ERROR_PARTIAL (-12) |
| 1968 |
</pre> |
</pre> |
| 1995 |
PCRE_ERROR_BADNEWLINE (-23) |
PCRE_ERROR_BADNEWLINE (-23) |
| 1996 |
</pre> |
</pre> |
| 1997 |
An invalid combination of PCRE_NEWLINE_<i>xxx</i> options was given. |
An invalid combination of PCRE_NEWLINE_<i>xxx</i> options was given. |
| 1998 |
|
<pre> |
| 1999 |
|
PCRE_ERROR_BADOFFSET (-24) |
| 2000 |
|
</pre> |
| 2001 |
|
The value of <i>startoffset</i> was negative or greater than the length of the |
| 2002 |
|
subject, that is, the value in <i>length</i>. |
| 2003 |
|
<pre> |
| 2004 |
|
PCRE_ERROR_SHORTUTF8 (-25) |
| 2005 |
|
</pre> |
| 2006 |
|
This error is returned instead of PCRE_ERROR_BADUTF8 when the subject string |
| 2007 |
|
ends with a truncated UTF-8 character and the PCRE_PARTIAL_HARD option is set. |
| 2008 |
|
Information about the failure is returned as for PCRE_ERROR_BADUTF8. It is in |
| 2009 |
|
fact sufficient to detect this case, but this special error code for |
| 2010 |
|
PCRE_PARTIAL_HARD precedes the implementation of returned information; it is |
| 2011 |
|
retained for backwards compatibility. |
| 2012 |
|
<pre> |
| 2013 |
|
PCRE_ERROR_RECURSELOOP (-26) |
| 2014 |
|
</pre> |
| 2015 |
|
This error is returned when <b>pcre_exec()</b> detects a recursion loop within |
| 2016 |
|
the pattern. Specifically, it means that either the whole pattern or a |
| 2017 |
|
subpattern has been called recursively for the second time at the same position |
| 2018 |
|
in the subject string. Some simple patterns that might do this are detected and |
| 2019 |
|
faulted at compile time, but more complicated cases, in particular mutual |
| 2020 |
|
recursions between two different subpatterns, cannot be detected until run |
| 2021 |
|
time. |
| 2022 |
|
<pre> |
| 2023 |
|
PCRE_ERROR_JIT_STACKLIMIT (-27) |
| 2024 |
|
</pre> |
| 2025 |
|
This error is returned when a pattern that was successfully studied using the |
| 2026 |
|
PCRE_STUDY_JIT_COMPILE option is being matched, but the memory available for |
| 2027 |
|
the just-in-time processing stack is not large enough. See the |
| 2028 |
|
<a href="pcrejit.html"><b>pcrejit</b></a> |
| 2029 |
|
documentation for more details. |
| 2030 |
</P> |
</P> |
| 2031 |
<P> |
<P> |
| 2032 |
Error numbers -16 to -20 and -22 are not used by <b>pcre_exec()</b>. |
Error numbers -16 to -20 and -22 are not used by <b>pcre_exec()</b>. |
| 2033 |
|
<a name="badutf8reasons"></a></P> |
| 2034 |
|
<br><b> |
| 2035 |
|
Reason codes for invalid UTF-8 strings |
| 2036 |
|
</b><br> |
| 2037 |
|
<P> |
| 2038 |
|
When <b>pcre_exec()</b> returns either PCRE_ERROR_BADUTF8 or |
| 2039 |
|
PCRE_ERROR_SHORTUTF8, and the size of the output vector (<i>ovecsize</i>) is at |
| 2040 |
|
least 2, the offset of the start of the invalid UTF-8 character is placed in |
| 2041 |
|
the first output vector element (<i>ovector[0]</i>) and a reason code is placed |
| 2042 |
|
in the second element (<i>ovector[1]</i>). The reason codes are given names in |
| 2043 |
|
the <b>pcre.h</b> header file: |
| 2044 |
|
<pre> |
| 2045 |
|
PCRE_UTF8_ERR1 |
| 2046 |
|
PCRE_UTF8_ERR2 |
| 2047 |
|
PCRE_UTF8_ERR3 |
| 2048 |
|
PCRE_UTF8_ERR4 |
| 2049 |
|
PCRE_UTF8_ERR5 |
| 2050 |
|
</pre> |
| 2051 |
|
The string ends with a truncated UTF-8 character; the code specifies how many |
| 2052 |
|
bytes are missing (1 to 5). Although RFC 3629 restricts UTF-8 characters to be |
| 2053 |
|
no longer than 4 bytes, the encoding scheme (originally defined by RFC 2279) |
| 2054 |
|
allows for up to 6 bytes, and this is checked first; hence the possibility of |
| 2055 |
|
4 or 5 missing bytes. |
| 2056 |
|
<pre> |
| 2057 |
|
PCRE_UTF8_ERR6 |
| 2058 |
|
PCRE_UTF8_ERR7 |
| 2059 |
|
PCRE_UTF8_ERR8 |
| 2060 |
|
PCRE_UTF8_ERR9 |
| 2061 |
|
PCRE_UTF8_ERR10 |
| 2062 |
|
</pre> |
| 2063 |
|
The two most significant bits of the 2nd, 3rd, 4th, 5th, or 6th byte of the |
| 2064 |
|
character do not have the binary value 0b10 (that is, either the most |
| 2065 |
|
significant bit is 0, or the next bit is 1). |
| 2066 |
|
<pre> |
| 2067 |
|
PCRE_UTF8_ERR11 |
| 2068 |
|
PCRE_UTF8_ERR12 |
| 2069 |
|
</pre> |
| 2070 |
|
A character that is valid by the RFC 2279 rules is either 5 or 6 bytes long; |
| 2071 |
|
these code points are excluded by RFC 3629. |
| 2072 |
|
<pre> |
| 2073 |
|
PCRE_UTF8_ERR13 |
| 2074 |
|
</pre> |
| 2075 |
|
A 4-byte character has a value greater than 0x10ffff; these code points are |
| 2076 |
|
excluded by RFC 3629. |
| 2077 |
|
<pre> |
| 2078 |
|
PCRE_UTF8_ERR14 |
| 2079 |
|
</pre> |
| 2080 |
|
A 3-byte character has a value in the range 0xd800 to 0xdfff; this range of |
| 2081 |
|
code points is reserved by RFC 3629 for use with UTF-16, and so is excluded |
| 2082 |
|
from UTF-8. |
| 2083 |
|
<pre> |
| 2084 |
|
PCRE_UTF8_ERR15 |
| 2085 |
|
PCRE_UTF8_ERR16 |
| 2086 |
|
PCRE_UTF8_ERR17 |
| 2087 |
|
PCRE_UTF8_ERR18 |
| 2088 |
|
PCRE_UTF8_ERR19 |
| 2089 |
|
</pre> |
| 2090 |
|
A 2-, 3-, 4-, 5-, or 6-byte character is "overlong", that is, it codes for a |
| 2091 |
|
value that can be represented by fewer bytes, which is invalid. For example, |
| 2092 |
|
the two bytes 0xc0, 0xae give the value 0x2e, whose correct coding uses just |
| 2093 |
|
one byte. |
| 2094 |
|
<pre> |
| 2095 |
|
PCRE_UTF8_ERR20 |
| 2096 |
|
</pre> |
| 2097 |
|
The two most significant bits of the first byte of a character have the binary |
| 2098 |
|
value 0b10 (that is, the most significant bit is 1 and the second is 0). Such a |
| 2099 |
|
byte can only validly occur as the second or subsequent byte of a multi-byte |
| 2100 |
|
character. |
| 2101 |
|
<pre> |
| 2102 |
|
PCRE_UTF8_ERR21 |
| 2103 |
|
</pre> |
| 2104 |
|
The first byte of a character has the value 0xfe or 0xff. These values can |
| 2105 |
|
never occur in a valid UTF-8 string. |
| 2106 |
</P> |
</P> |
| 2107 |
<br><a name="SEC15" href="#TOC1">EXTRACTING CAPTURED SUBSTRINGS BY NUMBER</a><br> |
<br><a name="SEC17" href="#TOC1">EXTRACTING CAPTURED SUBSTRINGS BY NUMBER</a><br> |
| 2108 |
<P> |
<P> |
| 2109 |
<b>int pcre_copy_substring(const char *<i>subject</i>, int *<i>ovector</i>,</b> |
<b>int pcre_copy_substring(const char *<i>subject</i>, int *<i>ovector</i>,</b> |
| 2110 |
<b>int <i>stringcount</i>, int <i>stringnumber</i>, char *<i>buffer</i>,</b> |
<b>int <i>stringcount</i>, int <i>stringnumber</i>, char *<i>buffer</i>,</b> |
| 2199 |
<b>pcre_free</b> directly; it is for these cases that the functions are |
<b>pcre_free</b> directly; it is for these cases that the functions are |
| 2200 |
provided. |
provided. |
| 2201 |
</P> |
</P> |
| 2202 |
<br><a name="SEC16" href="#TOC1">EXTRACTING CAPTURED SUBSTRINGS BY NAME</a><br> |
<br><a name="SEC18" href="#TOC1">EXTRACTING CAPTURED SUBSTRINGS BY NAME</a><br> |
| 2203 |
<P> |
<P> |
| 2204 |
<b>int pcre_get_stringnumber(const pcre *<i>code</i>,</b> |
<b>int pcre_get_stringnumber(const pcre *<i>code</i>,</b> |
| 2205 |
<b>const char *<i>name</i>);</b> |
<b>const char *<i>name</i>);</b> |
| 2263 |
numbers. For this reason, the use of different names for subpatterns of the |
numbers. For this reason, the use of different names for subpatterns of the |
| 2264 |
same number causes an error at compile time. |
same number causes an error at compile time. |
| 2265 |
</P> |
</P> |
| 2266 |
<br><a name="SEC17" href="#TOC1">DUPLICATE SUBPATTERN NAMES</a><br> |
<br><a name="SEC19" href="#TOC1">DUPLICATE SUBPATTERN NAMES</a><br> |
| 2267 |
<P> |
<P> |
| 2268 |
<b>int pcre_get_stringtable_entries(const pcre *<i>code</i>,</b> |
<b>int pcre_get_stringtable_entries(const pcre *<i>code</i>,</b> |
| 2269 |
<b>const char *<i>name</i>, char **<i>first</i>, char **<i>last</i>);</b> |
<b>const char *<i>name</i>, char **<i>first</i>, char **<i>last</i>);</b> |
| 2296 |
has run, they point to the first and last entries in the name-to-number table |
has run, they point to the first and last entries in the name-to-number table |
| 2297 |
for the given name. The function itself returns the length of each entry, or |
for the given name. The function itself returns the length of each entry, or |
| 2298 |
PCRE_ERROR_NOSUBSTRING (-7) if there are none. The format of the table is |
PCRE_ERROR_NOSUBSTRING (-7) if there are none. The format of the table is |
| 2299 |
described above in the section entitled <i>Information about a pattern</i>. |
described in the section entitled <i>Information about a pattern</i> |
| 2300 |
|
<a href="#infoaboutpattern">above.</a> |
| 2301 |
Given all the relevant entries for the name, you can extract each of their |
Given all the relevant entries for the name, you can extract each of their |
| 2302 |
numbers, and hence the captured data, if any. |
numbers, and hence the captured data, if any. |
| 2303 |
</P> |
</P> |
| 2304 |
<br><a name="SEC18" href="#TOC1">FINDING ALL POSSIBLE MATCHES</a><br> |
<br><a name="SEC20" href="#TOC1">FINDING ALL POSSIBLE MATCHES</a><br> |
| 2305 |
<P> |
<P> |
| 2306 |
The traditional matching function uses a similar algorithm to Perl, which stops |
The traditional matching function uses a similar algorithm to Perl, which stops |
| 2307 |
when it finds the first match, starting at a given point in the subject. If you |
when it finds the first match, starting at a given point in the subject. If you |
| 2320 |
other alternatives. Ultimately, when it runs out of matches, <b>pcre_exec()</b> |
other alternatives. Ultimately, when it runs out of matches, <b>pcre_exec()</b> |
| 2321 |
will yield PCRE_ERROR_NOMATCH. |
will yield PCRE_ERROR_NOMATCH. |
| 2322 |
<a name="dfamatch"></a></P> |
<a name="dfamatch"></a></P> |
| 2323 |
<br><a name="SEC19" href="#TOC1">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a><br> |
<br><a name="SEC21" href="#TOC1">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a><br> |
| 2324 |
<P> |
<P> |
| 2325 |
<b>int pcre_dfa_exec(const pcre *<i>code</i>, const pcre_extra *<i>extra</i>,</b> |
<b>int pcre_dfa_exec(const pcre *<i>code</i>, const pcre_extra *<i>extra</i>,</b> |
| 2326 |
<b>const char *<i>subject</i>, int <i>length</i>, int <i>startoffset</i>,</b> |
<b>const char *<i>subject</i>, int <i>length</i>, int <i>startoffset</i>,</b> |
| 2395 |
there have been no complete matches, but there is still at least one matching |
there have been no complete matches, but there is still at least one matching |
| 2396 |
possibility. The portion of the string that was inspected when the longest |
possibility. The portion of the string that was inspected when the longest |
| 2397 |
partial match was found is set as the first matching string in both cases. |
partial match was found is set as the first matching string in both cases. |
| 2398 |
|
There is a more detailed discussion of partial and multi-segment matching, with |
| 2399 |
|
examples, in the |
| 2400 |
|
<a href="pcrepartial.html"><b>pcrepartial</b></a> |
| 2401 |
|
documentation. |
| 2402 |
<pre> |
<pre> |
| 2403 |
PCRE_DFA_SHORTEST |
PCRE_DFA_SHORTEST |
| 2404 |
</pre> |
</pre> |
| 2451 |
The strings are returned in reverse order of length; that is, the longest |
The strings are returned in reverse order of length; that is, the longest |
| 2452 |
matching string is given first. If there were too many matches to fit into |
matching string is given first. If there were too many matches to fit into |
| 2453 |
<i>ovector</i>, the yield of the function is zero, and the vector is filled with |
<i>ovector</i>, the yield of the function is zero, and the vector is filled with |
| 2454 |
the longest matches. |
the longest matches. Unlike <b>pcre_exec()</b>, <b>pcre_dfa_exec()</b> can use |
| 2455 |
|
the entire <i>ovector</i> for returning matched strings. |
| 2456 |
</P> |
</P> |
| 2457 |
<br><b> |
<br><b> |
| 2458 |
Error returns from <b>pcre_dfa_exec()</b> |
Error returns from <b>pcre_dfa_exec()</b> |
| 2479 |
PCRE_ERROR_DFA_UMLIMIT (-18) |
PCRE_ERROR_DFA_UMLIMIT (-18) |
| 2480 |
</pre> |
</pre> |
| 2481 |
This return is given if <b>pcre_dfa_exec()</b> is called with an <i>extra</i> |
This return is given if <b>pcre_dfa_exec()</b> is called with an <i>extra</i> |
| 2482 |
block that contains a setting of the <i>match_limit</i> field. This is not |
block that contains a setting of the <i>match_limit</i> or |
| 2483 |
supported (it is meaningless). |
<i>match_limit_recursion</i> fields. This is not supported (these fields are |
| 2484 |
|
meaningless for DFA matching). |
| 2485 |
<pre> |
<pre> |
| 2486 |
PCRE_ERROR_DFA_WSSIZE (-19) |
PCRE_ERROR_DFA_WSSIZE (-19) |
| 2487 |
</pre> |
</pre> |
| 2495 |
error is given if the output vector is not large enough. This should be |
error is given if the output vector is not large enough. This should be |
| 2496 |
extremely rare, as a vector of size 1000 is used. |
extremely rare, as a vector of size 1000 is used. |
| 2497 |
</P> |
</P> |
| 2498 |
<br><a name="SEC20" href="#TOC1">SEE ALSO</a><br> |
<br><a name="SEC22" href="#TOC1">SEE ALSO</a><br> |
| 2499 |
<P> |
<P> |
| 2500 |
<b>pcrebuild</b>(3), <b>pcrecallout</b>(3), <b>pcrecpp</b>(3), |
<b>pcrebuild</b>(3), <b>pcrecallout</b>(3), <b>pcrecpp</b>(3), |
| 2501 |
<b>pcrematching</b>(3), <b>pcrepartial</b>(3), <b>pcreposix</b>(3), |
<b>pcrematching</b>(3), <b>pcrepartial</b>(3), <b>pcreposix</b>(3), |
| 2502 |
<b>pcreprecompile</b>(3), <b>pcresample</b>(3), <b>pcrestack</b>(3). |
<b>pcreprecompile</b>(3), <b>pcresample</b>(3), <b>pcrestack</b>(3). |
| 2503 |
</P> |
</P> |
| 2504 |
<br><a name="SEC21" href="#TOC1">AUTHOR</a><br> |
<br><a name="SEC23" href="#TOC1">AUTHOR</a><br> |
| 2505 |
<P> |
<P> |
| 2506 |
Philip Hazel |
Philip Hazel |
| 2507 |
<br> |
<br> |
| 2510 |
Cambridge CB2 3QH, England. |
Cambridge CB2 3QH, England. |
| 2511 |
<br> |
<br> |
| 2512 |
</P> |
</P> |
| 2513 |
<br><a name="SEC22" href="#TOC1">REVISION</a><br> |
<br><a name="SEC24" href="#TOC1">REVISION</a><br> |
| 2514 |
<P> |
<P> |
| 2515 |
Last updated: 15 June 2010 |
Last updated: 02 December 2011 |
| 2516 |
<br> |
<br> |
| 2517 |
Copyright © 1997-2010 University of Cambridge. |
Copyright © 1997-2011 University of Cambridge. |
| 2518 |
<br> |
<br> |
| 2519 |
<p> |
<p> |
| 2520 |
Return to the <a href="index.html">PCRE index page</a>. |
Return to the <a href="index.html">PCRE index page</a>. |