| 44 |
.B int *\fIovector\fR, int \fIstringcount\fR, "const char ***\fIlistptr\fR);" |
.B int *\fIovector\fR, int \fIstringcount\fR, "const char ***\fIlistptr\fR);" |
| 45 |
.PP |
.PP |
| 46 |
.br |
.br |
| 47 |
|
.B void pcre_free_substring(const char *\fIstringptr\fR); |
| 48 |
|
.PP |
| 49 |
|
.br |
| 50 |
|
.B void pcre_free_substring_list(const char **\fIstringptr\fR); |
| 51 |
|
.PP |
| 52 |
|
.br |
| 53 |
.B const unsigned char *pcre_maketables(void); |
.B const unsigned char *pcre_maketables(void); |
| 54 |
.PP |
.PP |
| 55 |
.br |
.br |
| 76 |
The PCRE library is a set of functions that implement regular expression |
The PCRE library is a set of functions that implement regular expression |
| 77 |
pattern matching using the same syntax and semantics as Perl 5, with just a few |
pattern matching using the same syntax and semantics as Perl 5, with just a few |
| 78 |
differences (see below). The current implementation corresponds to Perl 5.005, |
differences (see below). The current implementation corresponds to Perl 5.005, |
| 79 |
with some additional features from the Perl development release. |
with some additional features from later versions. This includes some |
| 80 |
|
experimental, incomplete support for UTF-8 encoded strings. Details of exactly |
| 81 |
|
what is and what is not supported are given below. |
| 82 |
|
|
| 83 |
PCRE has its own native API, which is described in this document. There is also |
PCRE has its own native API, which is described in this document. There is also |
| 84 |
a set of wrapper functions that correspond to the POSIX regular expression API. |
a set of wrapper functions that correspond to the POSIX regular expression API. |
| 92 |
use these to include support for different releases. |
use these to include support for different releases. |
| 93 |
|
|
| 94 |
The functions \fBpcre_compile()\fR, \fBpcre_study()\fR, and \fBpcre_exec()\fR |
The functions \fBpcre_compile()\fR, \fBpcre_study()\fR, and \fBpcre_exec()\fR |
| 95 |
are used for compiling and matching regular expressions, while |
are used for compiling and matching regular expressions. A sample program that |
| 96 |
\fBpcre_copy_substring()\fR, \fBpcre_get_substring()\fR, and |
demonstrates the simplest way of using them is given in the file |
| 97 |
|
\fIpcredemo.c\fR. The last section of this man page describes how to run it. |
| 98 |
|
|
| 99 |
|
The functions \fBpcre_copy_substring()\fR, \fBpcre_get_substring()\fR, and |
| 100 |
\fBpcre_get_substring_list()\fR are convenience functions for extracting |
\fBpcre_get_substring_list()\fR are convenience functions for extracting |
| 101 |
captured substrings from a matched subject string. The function |
captured substrings from a matched subject string; \fBpcre_free_substring()\fR |
| 102 |
\fBpcre_maketables()\fR is used (optionally) to build a set of character tables |
and \fBpcre_free_substring_list()\fR are also provided, to free the memory used |
| 103 |
in the current locale for passing to \fBpcre_compile()\fR. |
for extracted strings. |
| 104 |
|
|
| 105 |
|
The function \fBpcre_maketables()\fR is used (optionally) to build a set of |
| 106 |
|
character tables in the current locale for passing to \fBpcre_compile()\fR. |
| 107 |
|
|
| 108 |
The function \fBpcre_fullinfo()\fR is used to find out information about a |
The function \fBpcre_fullinfo()\fR is used to find out information about a |
| 109 |
compiled pattern; \fBpcre_info()\fR is an obsolete version which returns only |
compiled pattern; \fBpcre_info()\fR is an obsolete version which returns only |
| 131 |
The function \fBpcre_compile()\fR is called to compile a pattern into an |
The function \fBpcre_compile()\fR is called to compile a pattern into an |
| 132 |
internal form. The pattern is a C string terminated by a binary zero, and |
internal form. The pattern is a C string terminated by a binary zero, and |
| 133 |
is passed in the argument \fIpattern\fR. A pointer to a single block of memory |
is passed in the argument \fIpattern\fR. A pointer to a single block of memory |
| 134 |
that is obtained via \fBpcre_malloc\fR is returned. This contains the |
that is obtained via \fBpcre_malloc\fR is returned. This contains the compiled |
| 135 |
compiled code and related data. The \fBpcre\fR type is defined for this for |
code and related data. The \fBpcre\fR type is defined for the returned block; |
| 136 |
convenience, but in fact \fBpcre\fR is just a typedef for \fBvoid\fR, since the |
this is a typedef for a structure whose contents are not externally defined. It |
| 137 |
contents of the block are not externally defined. It is up to the caller to |
is up to the caller to free the memory when it is no longer required. |
| 138 |
free the memory when it is no longer required. |
|
| 139 |
.PP |
Although the compiled code of a PCRE regex is relocatable, that is, it does not |
| 140 |
|
depend on memory location, the complete \fBpcre\fR data block is not |
| 141 |
|
fully relocatable, because it contains a copy of the \fItableptr\fR argument, |
| 142 |
|
which is an address (see below). |
| 143 |
|
|
| 144 |
The size of a compiled pattern is roughly proportional to the length of the |
The size of a compiled pattern is roughly proportional to the length of the |
| 145 |
pattern string, except that each character class (other than those containing |
pattern string, except that each character class (other than those containing |
| 146 |
just a single character, negated or not) requires 33 bytes, and repeat |
just a single character, negated or not) requires 33 bytes, and repeat |
| 147 |
quantifiers with a minimum greater than one or a bounded maximum cause the |
quantifiers with a minimum greater than one or a bounded maximum cause the |
| 148 |
relevant portions of the compiled pattern to be replicated. |
relevant portions of the compiled pattern to be replicated. |
| 149 |
.PP |
|
| 150 |
The \fIoptions\fR argument contains independent bits that affect the |
The \fIoptions\fR argument contains independent bits that affect the |
| 151 |
compilation. It should be zero if no options are required. Some of the options, |
compilation. It should be zero if no options are required. Some of the options, |
| 152 |
in particular, those that are compatible with Perl, can also be set and unset |
in particular, those that are compatible with Perl, can also be set and unset |
| 155 |
their initial settings at the start of compilation and execution. The |
their initial settings at the start of compilation and execution. The |
| 156 |
PCRE_ANCHORED option can be set at the time of matching as well as at compile |
PCRE_ANCHORED option can be set at the time of matching as well as at compile |
| 157 |
time. |
time. |
| 158 |
.PP |
|
| 159 |
If \fIerrptr\fR is NULL, \fBpcre_compile()\fR returns NULL immediately. |
If \fIerrptr\fR is NULL, \fBpcre_compile()\fR returns NULL immediately. |
| 160 |
Otherwise, if compilation of a pattern fails, \fBpcre_compile()\fR returns |
Otherwise, if compilation of a pattern fails, \fBpcre_compile()\fR returns |
| 161 |
NULL, and sets the variable pointed to by \fIerrptr\fR to point to a textual |
NULL, and sets the variable pointed to by \fIerrptr\fR to point to a textual |
| 162 |
error message. The offset from the start of the pattern to the character where |
error message. The offset from the start of the pattern to the character where |
| 163 |
the error was discovered is placed in the variable pointed to by |
the error was discovered is placed in the variable pointed to by |
| 164 |
\fIerroffset\fR, which must not be NULL. If it is, an immediate error is given. |
\fIerroffset\fR, which must not be NULL. If it is, an immediate error is given. |
| 165 |
.PP |
|
| 166 |
If the final argument, \fItableptr\fR, is NULL, PCRE uses a default set of |
If the final argument, \fItableptr\fR, is NULL, PCRE uses a default set of |
| 167 |
character tables which are built when it is compiled, using the default C |
character tables which are built when it is compiled, using the default C |
| 168 |
locale. Otherwise, \fItableptr\fR must be the result of a call to |
locale. Otherwise, \fItableptr\fR must be the result of a call to |
| 169 |
\fBpcre_maketables()\fR. See the section on locale support below. |
\fBpcre_maketables()\fR. See the section on locale support below. |
| 170 |
.PP |
|
| 171 |
|
This code fragment shows a typical straightforward call to \fBpcre_compile()\fR: |
| 172 |
|
|
| 173 |
|
pcre *re; |
| 174 |
|
const char *error; |
| 175 |
|
int erroffset; |
| 176 |
|
re = pcre_compile( |
| 177 |
|
"^A.*Z", /* the pattern */ |
| 178 |
|
0, /* default options */ |
| 179 |
|
&error, /* for error message */ |
| 180 |
|
&erroffset, /* for error offset */ |
| 181 |
|
NULL); /* use default character tables */ |
| 182 |
|
|
| 183 |
The following option bits are defined in the header file: |
The following option bits are defined in the header file: |
| 184 |
|
|
| 185 |
PCRE_ANCHORED |
PCRE_ANCHORED |
| 253 |
greedy by default, but become greedy if followed by "?". It is not compatible |
greedy by default, but become greedy if followed by "?". It is not compatible |
| 254 |
with Perl. It can also be set by a (?U) option setting within the pattern. |
with Perl. It can also be set by a (?U) option setting within the pattern. |
| 255 |
|
|
| 256 |
|
PCRE_UTF8 |
| 257 |
|
|
| 258 |
|
This option causes PCRE to regard both the pattern and the subject as strings |
| 259 |
|
of UTF-8 characters instead of just byte strings. However, it is available only |
| 260 |
|
if PCRE has been built to include UTF-8 support. If not, the use of this option |
| 261 |
|
provokes an error. Support for UTF-8 is new, experimental, and incomplete. |
| 262 |
|
Details of exactly what it entails are given below. |
| 263 |
|
|
| 264 |
|
|
| 265 |
.SH STUDYING A PATTERN |
.SH STUDYING A PATTERN |
| 266 |
When a pattern is going to be used several times, it is worth spending more |
When a pattern is going to be used several times, it is worth spending more |
| 267 |
time analyzing it in order to speed up the time taken for matching. The |
time analyzing it in order to speed up the time taken for matching. The |
| 268 |
function \fBpcre_study()\fR takes a pointer to a compiled pattern as its first |
function \fBpcre_study()\fR takes a pointer to a compiled pattern as its first |
| 269 |
argument, and returns a pointer to a \fBpcre_extra\fR block (another \fBvoid\fR |
argument, and returns a pointer to a \fBpcre_extra\fR block (another typedef |
| 270 |
typedef) containing additional information about the pattern; this can be |
for a structure with hidden contents) containing additional information about |
| 271 |
passed to \fBpcre_exec()\fR. If no additional information is available, NULL |
the pattern; this can be passed to \fBpcre_exec()\fR. If no additional |
| 272 |
is returned. |
information is available, NULL is returned. |
| 273 |
|
|
| 274 |
The second argument contains option bits. At present, no options are defined |
The second argument contains option bits. At present, no options are defined |
| 275 |
for \fBpcre_study()\fR, and this argument should always be zero. |
for \fBpcre_study()\fR, and this argument should always be zero. |
| 278 |
studying succeeds (even if no data is returned), the variable it points to is |
studying succeeds (even if no data is returned), the variable it points to is |
| 279 |
set to NULL. Otherwise it points to a textual error message. |
set to NULL. Otherwise it points to a textual error message. |
| 280 |
|
|
| 281 |
|
This is a typical call to \fBpcre_study()\fR: |
| 282 |
|
|
| 283 |
|
pcre_extra *pe; |
| 284 |
|
pe = pcre_study( |
| 285 |
|
re, /* result of pcre_compile() */ |
| 286 |
|
0, /* no options exist */ |
| 287 |
|
&error); /* set to NULL or points to a message */ |
| 288 |
|
|
| 289 |
At present, studying a pattern is useful only for non-anchored patterns that do |
At present, studying a pattern is useful only for non-anchored patterns that do |
| 290 |
not have a single fixed starting character. A bitmap of possible starting |
not have a single fixed starting character. A bitmap of possible starting |
| 291 |
characters is created. |
characters is created. |
| 335 |
PCRE_ERROR_BADMAGIC the "magic number" was not found |
PCRE_ERROR_BADMAGIC the "magic number" was not found |
| 336 |
PCRE_ERROR_BADOPTION the value of \fIwhat\fR was invalid |
PCRE_ERROR_BADOPTION the value of \fIwhat\fR was invalid |
| 337 |
|
|
| 338 |
|
Here is a typical call of \fBpcre_fullinfo()\fR, to obtain the length of the |
| 339 |
|
compiled pattern: |
| 340 |
|
|
| 341 |
|
int rc; |
| 342 |
|
unsigned long int length; |
| 343 |
|
rc = pcre_fullinfo( |
| 344 |
|
re, /* result of pcre_compile() */ |
| 345 |
|
pe, /* result of pcre_study(), or NULL */ |
| 346 |
|
PCRE_INFO_SIZE, /* what is required */ |
| 347 |
|
&length); /* where to put the data */ |
| 348 |
|
|
| 349 |
The possible values for the third argument are defined in \fBpcre.h\fR, and are |
The possible values for the third argument are defined in \fBpcre.h\fR, and are |
| 350 |
as follows: |
as follows: |
| 351 |
|
|
| 352 |
PCRE_INFO_OPTIONS |
PCRE_INFO_OPTIONS |
| 353 |
|
|
| 354 |
Return a copy of the options with which the pattern was compiled. The fourth |
Return a copy of the options with which the pattern was compiled. The fourth |
| 355 |
argument should point to au \fBunsigned long int\fR variable. These option bits |
argument should point to an \fBunsigned long int\fR variable. These option bits |
| 356 |
are those specified in the call to \fBpcre_compile()\fR, modified by any |
are those specified in the call to \fBpcre_compile()\fR, modified by any |
| 357 |
top-level option settings within the pattern itself, and with the PCRE_ANCHORED |
top-level option settings within the pattern itself, and with the PCRE_ANCHORED |
| 358 |
bit forcibly set if the form of the pattern implies that it can match only at |
bit forcibly set if the form of the pattern implies that it can match only at |
| 380 |
|
|
| 381 |
Return information about the first character of any matched string, for a |
Return information about the first character of any matched string, for a |
| 382 |
non-anchored pattern. If there is a fixed first character, e.g. from a pattern |
non-anchored pattern. If there is a fixed first character, e.g. from a pattern |
| 383 |
such as (cat|cow|coyote), then it is returned in the integer pointed to by |
such as (cat|cow|coyote), it is returned in the integer pointed to by |
| 384 |
\fIwhere\fR. Otherwise, if either |
\fIwhere\fR. Otherwise, if either |
| 385 |
|
|
| 386 |
(a) the pattern was compiled with the PCRE_MULTILINE option, and every branch |
(a) the pattern was compiled with the PCRE_MULTILINE option, and every branch |
| 389 |
(b) every branch of the pattern starts with ".*" and PCRE_DOTALL is not set |
(b) every branch of the pattern starts with ".*" and PCRE_DOTALL is not set |
| 390 |
(if it were set, the pattern would be anchored), |
(if it were set, the pattern would be anchored), |
| 391 |
|
|
| 392 |
then -1 is returned, indicating that the pattern matches only at the |
-1 is returned, indicating that the pattern matches only at the start of a |
| 393 |
start of a subject string or after any "\\n" within the string. Otherwise -2 is |
subject string or after any "\\n" within the string. Otherwise -2 is returned. |
| 394 |
returned. For anchored patterns, -2 is returned. |
For anchored patterns, -2 is returned. |
| 395 |
|
|
| 396 |
PCRE_INFO_FIRSTTABLE |
PCRE_INFO_FIRSTTABLE |
| 397 |
|
|
| 433 |
pattern has been studied, the result of the study should be passed in the |
pattern has been studied, the result of the study should be passed in the |
| 434 |
\fIextra\fR argument. Otherwise this must be NULL. |
\fIextra\fR argument. Otherwise this must be NULL. |
| 435 |
|
|
| 436 |
|
Here is an example of a simple call to \fBpcre_exec()\fR: |
| 437 |
|
|
| 438 |
|
int rc; |
| 439 |
|
int ovector[30]; |
| 440 |
|
rc = pcre_exec( |
| 441 |
|
re, /* result of pcre_compile() */ |
| 442 |
|
NULL, /* we didn't study the pattern */ |
| 443 |
|
"some string", /* the subject string */ |
| 444 |
|
11, /* the length of the subject string */ |
| 445 |
|
0, /* start at offset 0 in the subject */ |
| 446 |
|
0, /* default options */ |
| 447 |
|
ovector, /* vector for substring information */ |
| 448 |
|
30); /* number of elements in the vector */ |
| 449 |
|
|
| 450 |
The PCRE_ANCHORED option can be passed in the \fIoptions\fR argument, whose |
The PCRE_ANCHORED option can be passed in the \fIoptions\fR argument, whose |
| 451 |
unused bits must be zero. However, if a pattern was compiled with |
unused bits must be zero. However, if a pattern was compiled with |
| 452 |
PCRE_ANCHORED, or turned out to be anchored by virtue of its contents, it |
PCRE_ANCHORED, or turned out to be anchored by virtue of its contents, it |
| 488 |
|
|
| 489 |
The subject string is passed as a pointer in \fIsubject\fR, a length in |
The subject string is passed as a pointer in \fIsubject\fR, a length in |
| 490 |
\fIlength\fR, and a starting offset in \fIstartoffset\fR. Unlike the pattern |
\fIlength\fR, and a starting offset in \fIstartoffset\fR. Unlike the pattern |
| 491 |
string, it may contain binary zero characters. When the starting offset is |
string, the subject may contain binary zero characters. When the starting |
| 492 |
zero, the search for a match starts at the beginning of the subject, and this |
offset is zero, the search for a match starts at the beginning of the subject, |
| 493 |
is by far the most common case. |
and this is by far the most common case. |
| 494 |
|
|
| 495 |
A non-zero starting offset is useful when searching for another match in the |
A non-zero starting offset is useful when searching for another match in the |
| 496 |
same subject by calling \fBpcre_exec()\fR again after a previous success. |
same subject by calling \fBpcre_exec()\fR again after a previous success. |
| 621 |
were captured by the match, including the substring that matched the entire |
were captured by the match, including the substring that matched the entire |
| 622 |
regular expression. This is the value returned by \fBpcre_exec\fR if it |
regular expression. This is the value returned by \fBpcre_exec()\fR if it |
| 623 |
is greater than zero. If \fBpcre_exec()\fR returned zero, indicating that it |
is greater than zero. If \fBpcre_exec()\fR returned zero, indicating that it |
| 624 |
ran out of space in \fIovector\fR, then the value passed as |
ran out of space in \fIovector\fR, the value passed as \fIstringcount\fR should |
| 625 |
\fIstringcount\fR should be the size of the vector divided by three. |
be the size of the vector divided by three. |
| 626 |
|
|
| 627 |
The functions \fBpcre_copy_substring()\fR and \fBpcre_get_substring()\fR |
The functions \fBpcre_copy_substring()\fR and \fBpcre_get_substring()\fR |
| 628 |
extract a single substring, whose number is given as \fIstringnumber\fR. A |
extract a single substring, whose number is given as \fIstringnumber\fR. A |
| 629 |
value of zero extracts the substring that matched the entire pattern, while |
value of zero extracts the substring that matched the entire pattern, while |
| 630 |
higher values extract the captured substrings. For \fBpcre_copy_substring()\fR, |
higher values extract the captured substrings. For \fBpcre_copy_substring()\fR, |
| 631 |
the string is placed in \fIbuffer\fR, whose length is given by |
the string is placed in \fIbuffer\fR, whose length is given by |
| 632 |
\fIbuffersize\fR, while for \fBpcre_get_substring()\fR a new block of store is |
\fIbuffersize\fR, while for \fBpcre_get_substring()\fR a new block of memory is |
| 633 |
obtained via \fBpcre_malloc\fR, and its address is returned via |
obtained via \fBpcre_malloc\fR, and its address is returned via |
| 634 |
\fIstringptr\fR. The yield of the function is the length of the string, not |
\fIstringptr\fR. The yield of the function is the length of the string, not |
| 635 |
including the terminating zero, or one of |
including the terminating zero, or one of |
| 661 |
inspecting the appropriate offset in \fIovector\fR, which is negative for unset |
inspecting the appropriate offset in \fIovector\fR, which is negative for unset |
| 662 |
substrings. |
substrings. |
| 663 |
|
|
| 664 |
|
The two convenience functions \fBpcre_free_substring()\fR and |
| 665 |
|
\fBpcre_free_substring_list()\fR can be used to free the memory returned by |
| 666 |
|
a previous call of \fBpcre_get_substring()\fR or |
| 667 |
|
\fBpcre_get_substring_list()\fR, respectively. They do nothing more than call |
| 668 |
|
the function pointed to by \fBpcre_free\fR, which of course could be called |
| 669 |
|
directly from a C program. However, PCRE is used in some situations where it is |
| 670 |
|
linked via a special interface to another programming language which cannot use |
| 671 |
|
\fBpcre_free\fR directly; it is for these cases that the functions are |
| 672 |
|
provided. |
| 673 |
|
|
| 674 |
|
|
| 675 |
.SH LIMITATIONS |
.SH LIMITATIONS |
| 677 |
practice be relevant. |
practice be relevant. |
| 678 |
The maximum length of a compiled pattern is 65539 (sic) bytes. |
The maximum length of a compiled pattern is 65539 (sic) bytes. |
| 679 |
All values in repeating quantifiers must be less than 65536. |
All values in repeating quantifiers must be less than 65536. |
| 680 |
The maximum number of capturing subpatterns is 99. |
The maximum number of capturing subpatterns is 65535. |
| 681 |
The maximum number of all parenthesized subpatterns, including capturing |
There is no limit to the number of non-capturing subpatterns, but the maximum |
| 682 |
|
depth of nesting of all kinds of parenthesized subpattern, including capturing |
| 683 |
subpatterns, assertions, and other types of subpattern, is 200. |
subpatterns, assertions, and other types of subpattern, is 200. |
| 684 |
|
|
| 685 |
The maximum length of a subject string is the largest positive number that an |
The maximum length of a subject string is the largest positive number that an |
| 731 |
with the settings of captured strings when part of a pattern is repeated. For |
with the settings of captured strings when part of a pattern is repeated. For |
| 732 |
example, matching "aba" against the pattern /^(a(b)?)+$/ sets $2 to the value |
example, matching "aba" against the pattern /^(a(b)?)+$/ sets $2 to the value |
| 733 |
"b", but matching "aabbaa" against /^(aa(bb)?)+$/ leaves $2 unset. However, if |
"b", but matching "aabbaa" against /^(aa(bb)?)+$/ leaves $2 unset. However, if |
| 734 |
the pattern is changed to /^(aa(b(b))?)+$/ then $2 (and $3) get set. |
the pattern is changed to /^(aa(b(b))?)+$/ then $2 (and $3) are set. |
| 735 |
|
|
| 736 |
In Perl 5.004 $2 is set in both cases, and that is also true of PCRE. If in the |
In Perl 5.004 $2 is set in both cases, and that is also true of PCRE. If in the |
| 737 |
future Perl changes to a consistent state that is different, PCRE may change to |
future Perl changes to a consistent state that is different, PCRE may change to |
| 772 |
described below. Regular expressions are also described in the Perl |
described below. Regular expressions are also described in the Perl |
| 773 |
documentation and in a number of other books, some of which have copious |
documentation and in a number of other books, some of which have copious |
| 774 |
examples. Jeffrey Friedl's "Mastering Regular Expressions", published by |
examples. Jeffrey Friedl's "Mastering Regular Expressions", published by |
| 775 |
O'Reilly (ISBN 1-56592-257), covers them in great detail. The description |
O'Reilly (ISBN 1-56592-257-3), covers them in great detail. |
| 776 |
here is intended as reference documentation. |
|
| 777 |
|
The description here is intended as reference documentation. The basic |
| 778 |
|
operation of PCRE is on strings of bytes. However, there are the beginnings of |
| 779 |
|
some support for UTF-8 character strings. To use this support you must |
| 780 |
|
configure PCRE to include it, and then call \fBpcre_compile()\fR with the |
| 781 |
|
PCRE_UTF8 option. How this affects the pattern matching is described in the |
| 782 |
|
final section of this document. |
| 783 |
|
|
| 784 |
A regular expression is a pattern that is matched against a subject string from |
A regular expression is a pattern that is matched against a subject string from |
| 785 |
left to right. Most characters stand for themselves in a pattern, and match the |
left to right. Most characters stand for themselves in a pattern, and match the |
| 1001 |
|
|
| 1002 |
Note that the sequences \\A, \\Z, and \\z can be used to match the start and |
Note that the sequences \\A, \\Z, and \\z can be used to match the start and |
| 1003 |
end of the subject in both modes, and if all branches of a pattern start with |
end of the subject in both modes, and if all branches of a pattern start with |
| 1004 |
\\A is it always anchored, whether PCRE_MULTILINE is set or not. |
\\A it is always anchored, whether PCRE_MULTILINE is set or not. |
| 1005 |
|
|
| 1006 |
|
|
| 1007 |
.SH FULL STOP (PERIOD, DOT) |
.SH FULL STOP (PERIOD, DOT) |
| 1008 |
Outside a character class, a dot in the pattern matches any one character in |
Outside a character class, a dot in the pattern matches any one character in |
| 1009 |
the subject, including a non-printing character, but not (by default) newline. |
the subject, including a non-printing character, but not (by default) newline. |
| 1010 |
If the PCRE_DOTALL option is set, then dots match newlines as well. The |
If the PCRE_DOTALL option is set, dots match newlines as well. The handling of |
| 1011 |
handling of dot is entirely independent of the handling of circumflex and |
dot is entirely independent of the handling of circumflex and dollar, the only |
| 1012 |
dollar, the only relationship being that they both involve newline characters. |
relationship being that they both involve newline characters. Dot has no |
| 1013 |
Dot has no special meaning in a character class. |
special meaning in a character class. |
| 1014 |
|
|
| 1015 |
|
|
| 1016 |
.SH SQUARE BRACKETS |
.SH SQUARE BRACKETS |
| 1105 |
|
|
| 1106 |
[12[:^digit:]] |
[12[:^digit:]] |
| 1107 |
|
|
| 1108 |
matches "1", "2", or any non-digit. PCRE (and Perl) also recogize the POSIX |
matches "1", "2", or any non-digit. PCRE (and Perl) also recognize the POSIX |
| 1109 |
syntax [.ch.] and [=ch=] where "ch" is a "collating element", but these are not |
syntax [.ch.] and [=ch=] where "ch" is a "collating element", but these are not |
| 1110 |
supported, and an error is given if they are encountered. |
supported, and an error is given if they are encountered. |
| 1111 |
|
|
| 1203 |
the ((red|white) (king|queen)) |
the ((red|white) (king|queen)) |
| 1204 |
|
|
| 1205 |
the captured substrings are "red king", "red", and "king", and are numbered 1, |
the captured substrings are "red king", "red", and "king", and are numbered 1, |
| 1206 |
2, and 3. |
2, and 3, respectively. |
| 1207 |
|
|
| 1208 |
The fact that plain parentheses fulfil two functions is not always helpful. |
The fact that plain parentheses fulfil two functions is not always helpful. |
| 1209 |
There are often times when a grouping subpattern is required without a |
There are often times when a grouping subpattern is required without a |
| 1297 |
|
|
| 1298 |
/* first command */ not comment /* second comment */ |
/* first comment */ not comment /* second comment */ |
| 1299 |
|
|
| 1300 |
fails, because it matches the entire string due to the greediness of the .* |
fails, because it matches the entire string owing to the greediness of the .* |
| 1301 |
item. |
item. |
| 1302 |
|
|
| 1303 |
However, if a quantifier is followed by a question mark, then it ceases to be |
However, if a quantifier is followed by a question mark, it ceases to be |
| 1304 |
greedy, and instead matches the minimum number of times possible, so the |
greedy, and instead matches the minimum number of times possible, so the |
| 1305 |
pattern |
pattern |
| 1306 |
|
|
| 1316 |
which matches one digit by preference, but can match two if that is the only |
which matches one digit by preference, but can match two if that is the only |
| 1317 |
way the rest of the pattern matches. |
way the rest of the pattern matches. |
| 1318 |
|
|
| 1319 |
If the PCRE_UNGREEDY option is set (an option which is not available in Perl) |
If the PCRE_UNGREEDY option is set (an option which is not available in Perl), |
| 1320 |
then the quantifiers are not greedy by default, but individual ones can be made |
the quantifiers are not greedy by default, but individual ones can be made |
| 1321 |
greedy by following them with a question mark. In other words, it inverts the |
greedy by following them with a question mark. In other words, it inverts the |
| 1322 |
default behaviour. |
default behaviour. |
| 1323 |
|
|
| 1326 |
compiled pattern, in proportion to the size of the minimum or maximum. |
compiled pattern, in proportion to the size of the minimum or maximum. |
| 1327 |
|
|
| 1328 |
If a pattern starts with .* or .{0,} and the PCRE_DOTALL option (equivalent |
If a pattern starts with .* or .{0,} and the PCRE_DOTALL option (equivalent |
| 1329 |
to Perl's /s) is set, thus allowing the . to match newlines, then the pattern |
to Perl's /s) is set, thus allowing the . to match newlines, the pattern is |
| 1330 |
is implicitly anchored, because whatever follows will be tried against every |
implicitly anchored, because whatever follows will be tried against every |
| 1331 |
character position in the subject string, so there is no point in retrying the |
character position in the subject string, so there is no point in retrying the |
| 1332 |
overall match at any position after the first. PCRE treats such a pattern as |
overall match at any position after the first. PCRE treats such a pattern as |
| 1333 |
though it were preceded by \\A. In cases where it is known that the subject |
though it were preceded by \\A. In cases where it is known that the subject |
| 1371 |
|
|
| 1372 |
matches "sense and sensibility" and "response and responsibility", but not |
matches "sense and sensibility" and "response and responsibility", but not |
| 1373 |
"sense and responsibility". If caseful matching is in force at the time of the |
"sense and responsibility". If caseful matching is in force at the time of the |
| 1374 |
back reference, then the case of letters is relevant. For example, |
back reference, the case of letters is relevant. For example, |
| 1375 |
|
|
| 1376 |
((?i)rah)\\s+\\1 |
((?i)rah)\\s+\\1 |
| 1377 |
|
|
| 1379 |
capturing subpattern is matched caselessly. |
capturing subpattern is matched caselessly. |
| 1380 |
|
|
| 1381 |
There may be more than one back reference to the same subpattern. If a |
There may be more than one back reference to the same subpattern. If a |
| 1382 |
subpattern has not actually been used in a particular match, then any back |
subpattern has not actually been used in a particular match, any back |
| 1383 |
references to it always fail. For example, the pattern |
references to it always fail. For example, the pattern |
| 1384 |
|
|
| 1385 |
(a|(bc))\\2 |
(a|(bc))\\2 |
| 1387 |
always fails if it starts to match "a" rather than "bc". Because there may be |
always fails if it starts to match "a" rather than "bc". Because there may be |
| 1388 |
up to 99 back references, all digits following the backslash are taken |
up to 99 back references, all digits following the backslash are taken |
| 1389 |
as part of a potential back reference number. If the pattern continues with a |
as part of a potential back reference number. If the pattern continues with a |
| 1390 |
digit character, then some delimiter must be used to terminate the back |
digit character, some delimiter must be used to terminate the back reference. |
| 1391 |
reference. If the PCRE_EXTENDED option is set, this can be whitespace. |
If the PCRE_EXTENDED option is set, this can be whitespace. Otherwise an empty |
| 1392 |
Otherwise an empty comment can be used. |
comment can be used. |
| 1393 |
|
|
| 1394 |
A back reference that occurs inside the parentheses to which it refers fails |
A back reference that occurs inside the parentheses to which it refers fails |
| 1395 |
when the subpattern is first used, so, for example, (a\\1) never matches. |
when the subpattern is first used, so, for example, (a\\1) never matches. |
| 1398 |
|
|
| 1399 |
(a|b\\1)+ |
(a|b\\1)+ |
| 1400 |
|
|
| 1401 |
matches any number of "a"s and also "aba", "ababaa" etc. At each iteration of |
matches any number of "a"s and also "aba", "ababbaa" etc. At each iteration of |
| 1402 |
the subpattern, the back reference matches the character string corresponding |
the subpattern, the back reference matches the character string corresponding |
| 1403 |
to the previous iteration. In order for this to work, the pattern must be such |
to the previous iteration. In order for this to work, the pattern must be such |
| 1404 |
that the first iteration does not need to match the back reference. This can be |
that the first iteration does not need to match the back reference. This can be |
| 1477 |
matches "foo" preceded by three digits that are not "999". Notice that each of |
matches "foo" preceded by three digits that are not "999". Notice that each of |
| 1478 |
the assertions is applied independently at the same point in the subject |
the assertions is applied independently at the same point in the subject |
| 1479 |
string. First there is a check that the previous three characters are all |
string. First there is a check that the previous three characters are all |
| 1480 |
digits, then there is a check that the same three characters are not "999". |
digits, and then there is a check that the same three characters are not "999". |
| 1481 |
This pattern does \fInot\fR match "foo" preceded by six characters, the first |
This pattern does \fInot\fR match "foo" preceded by six characters, the first |
| 1482 |
three of which are digits and the last three of which are not "999". For example, it |
three of which are digits and the last three of which are not "999". For example, it |
| 1483 |
doesn't match "123abcfoo". A pattern to do that is |
doesn't match "123abcfoo". A pattern to do that is |
| 1562 |
|
|
| 1563 |
^.*abcd$ |
^.*abcd$ |
| 1564 |
|
|
| 1565 |
then the initial .* matches the entire string at first, but when this fails |
the initial .* matches the entire string at first, but when this fails (because |
| 1566 |
(because there is no following "a"), it backtracks to match all but the last |
there is no following "a"), it backtracks to match all but the last character, |
| 1567 |
character, then all but the last two characters, and so on. Once again the |
then all but the last two characters, and so on. Once again the search for "a" |
| 1568 |
search for "a" covers the entire string, from right to left, so we are no |
covers the entire string, from right to left, so we are no better off. However, |
| 1569 |
better off. However, if the pattern is written as |
if the pattern is written as |
| 1570 |
|
|
| 1571 |
^(?>.*)(?<=abcd) |
^(?>.*)(?<=abcd) |
| 1572 |
|
|
| 1573 |
then there can be no backtracking for the .* item; it can match only the entire |
there can be no backtracking for the .* item; it can match only the entire |
| 1574 |
string. The subsequent lookbehind assertion does a single test on the last four |
string. The subsequent lookbehind assertion does a single test on the last four |
| 1575 |
characters. If it fails, the match fails immediately. For long strings, this |
characters. If it fails, the match fails immediately. For long strings, this |
| 1576 |
approach makes a significant difference to the processing time. |
approach makes a significant difference to the processing time. |
| 1615 |
subpattern, a compile-time error occurs. |
subpattern, a compile-time error occurs. |
| 1616 |
|
|
| 1617 |
There are two kinds of condition. If the text between the parentheses consists |
There are two kinds of condition. If the text between the parentheses consists |
| 1618 |
of a sequence of digits, then the condition is satisfied if the capturing |
of a sequence of digits, the condition is satisfied if the capturing subpattern |
| 1619 |
subpattern of that number has previously matched. Consider the following |
of that number has previously matched. The number must be greater than zero. |
| 1620 |
pattern, which contains non-significant white space to make it more readable |
Consider the following pattern, which contains non-significant white space to |
| 1621 |
(assume the PCRE_EXTENDED option) and to divide it into three parts for ease |
make it more readable (assume the PCRE_EXTENDED option) and to divide it into |
| 1622 |
of discussion: |
three parts for ease of discussion: |
| 1623 |
|
|
| 1624 |
( \\( )? [^()]+ (?(1) \\) ) |
( \\( )? [^()]+ (?(1) \\) ) |
| 1625 |
|
|
| 1709 |
\\( ( ( (?>[^()]+) | (?R) )* ) \\) |
\\( ( ( (?>[^()]+) | (?R) )* ) \\) |
| 1710 |
^ ^ |
^ ^ |
| 1711 |
^ ^ |
^ ^ |
| 1712 |
then the string they capture is "ab(cd)ef", the contents of the top level |
the string they capture is "ab(cd)ef", the contents of the top level |
| 1713 |
parentheses. If there are more than 15 capturing parentheses in a pattern, PCRE |
parentheses. If there are more than 15 capturing parentheses in a pattern, PCRE |
| 1714 |
has to obtain extra memory to store data during a recursion, which it does by |
has to obtain extra memory to store data during a recursion, which it does by |
| 1715 |
using \fBpcre_malloc\fR, freeing it via \fBpcre_free\fR afterwards. If no |
using \fBpcre_malloc\fR, freeing it via \fBpcre_free\fR afterwards. If no |
| 1773 |
applied to a whole line of "a" characters, whereas the latter takes an |
applied to a whole line of "a" characters, whereas the latter takes an |
| 1774 |
appreciable time with strings longer than about 20 characters. |
appreciable time with strings longer than about 20 characters. |
| 1775 |
|
|
| 1776 |
|
|
| 1777 |
|
.SH UTF-8 SUPPORT |
| 1778 |
|
Starting at release 3.3, PCRE has some support for character strings encoded |
| 1779 |
|
in the UTF-8 format. This is incomplete, and is regarded as experimental. In |
| 1780 |
|
order to use it, you must configure PCRE to include UTF-8 support in the code, |
| 1781 |
|
and, in addition, you must call \fBpcre_compile()\fR with the PCRE_UTF8 option |
| 1782 |
|
flag. When you do this, both the pattern and any subject strings that are |
| 1783 |
|
matched against it are treated as UTF-8 strings instead of just strings of |
| 1784 |
|
bytes, but only in the cases that are mentioned below. |
| 1785 |
|
|
| 1786 |
|
If you compile PCRE with UTF-8 support, but do not use it at run time, the |
| 1787 |
|
library will be a bit bigger, but the additional run time overhead is limited |
| 1788 |
|
to testing the PCRE_UTF8 flag in several places, so should not be very large. |
| 1789 |
|
|
| 1790 |
|
PCRE assumes that the strings it is given contain valid UTF-8 codes. It does |
| 1791 |
|
not diagnose invalid UTF-8 strings. If you pass invalid UTF-8 strings to PCRE, |
| 1792 |
|
the results are undefined. |
| 1793 |
|
|
| 1794 |
|
Running with PCRE_UTF8 set causes these changes in the way PCRE works: |
| 1795 |
|
|
| 1796 |
|
1. In a pattern, the escape sequence \\x{...}, where the contents of the braces |
| 1797 |
|
are a string of hexadecimal digits, is interpreted as a UTF-8 character whose |
| 1798 |
|
code number is the given hexadecimal number, for example: \\x{1234}. This |
| 1799 |
|
inserts from one to six literal bytes into the pattern, using the UTF-8 |
| 1800 |
|
encoding. If a non-hexadecimal digit appears between the braces, the item is |
| 1801 |
|
not recognized. |
| 1802 |
|
|
| 1803 |
|
2. The original hexadecimal escape sequence, \\xhh, generates a two-byte UTF-8 |
| 1804 |
|
character if its value is greater than 127. |
| 1805 |
|
|
| 1806 |
|
3. Repeat quantifiers are NOT correctly handled if they follow a multibyte |
| 1807 |
|
character. For example, \\x{100}* and \\xc3+ do not work. If you want to |
| 1808 |
|
repeat such characters, you must enclose them in non-capturing parentheses, |
| 1809 |
|
for example (?:\\x{100}), at present. |
| 1810 |
|
|
| 1811 |
|
4. The dot metacharacter matches one UTF-8 character instead of a single byte. |
| 1812 |
|
|
| 1813 |
|
5. Unlike literal UTF-8 characters, the dot metacharacter followed by a |
| 1814 |
|
repeat quantifier does operate correctly on UTF-8 characters instead of |
| 1815 |
|
single bytes. |
| 1816 |
|
|
| 1817 |
|
6. Although the \\x{...} escape is permitted in a character class, characters |
| 1818 |
|
whose values are greater than 255 cannot be included in a class. |
| 1819 |
|
|
| 1820 |
|
7. A class is matched against a UTF-8 character instead of just a single byte, |
| 1821 |
|
but it can match only characters whose values are less than 256. Characters |
| 1822 |
|
with greater values always fail to match a class. |
| 1823 |
|
|
| 1824 |
|
8. Repeated classes work correctly on multiple characters. |
| 1825 |
|
|
| 1826 |
|
9. Classes containing just a single character whose value is greater than 127 |
| 1827 |
|
(but less than 256), for example, [\\x80] or [^\\x{93}], do not work because |
| 1828 |
|
these are optimized into single byte matches. In the first case, of course, |
| 1829 |
|
the class brackets are just redundant. |
| 1830 |
|
|
| 1831 |
|
10. Lookbehind assertions move backwards in the subject by a fixed number of |
| 1832 |
|
characters instead of a fixed number of bytes. Simple cases have been tested |
| 1833 |
|
to work correctly, but there may be hidden gotchas herein. |
| 1834 |
|
|
| 1835 |
|
11. The character types such as \\d and \\w do not work correctly with UTF-8 |
| 1836 |
|
characters. They continue to test a single byte. |
| 1837 |
|
|
| 1838 |
|
12. Anything not explicitly mentioned here continues to work in bytes rather |
| 1839 |
|
than in characters. |
| 1840 |
|
|
| 1841 |
|
The following UTF-8 features of Perl 5.6 are not implemented: |
| 1842 |
|
|
| 1843 |
|
1. The escape sequence \\C to match a single byte. |
| 1844 |
|
|
| 1845 |
|
2. The use of Unicode tables and properties and escapes \\p, \\P, and \\X. |
| 1846 |
|
|
| 1847 |
|
|
| 1848 |
|
.SH SAMPLE PROGRAM |
| 1849 |
|
The code below is a simple, complete demonstration program, to get you started |
| 1850 |
|
with using PCRE. This code is also supplied in the file \fIpcredemo.c\fR in the |
| 1851 |
|
PCRE distribution. |
| 1852 |
|
|
| 1853 |
|
The program compiles the regular expression that is its first argument, and |
| 1854 |
|
matches it against the subject string in its second argument. No options are |
| 1855 |
|
set, and default character tables are used. If matching succeeds, the program |
| 1856 |
|
outputs the portion of the subject that matched, together with the contents of |
| 1857 |
|
any captured substrings. |
| 1858 |
|
|
| 1859 |
|
On a Unix system that has PCRE installed in \fI/usr/local\fR, you can compile |
| 1860 |
|
the demonstration program using a command like this: |
| 1861 |
|
|
| 1862 |
|
gcc -o pcredemo pcredemo.c -I/usr/local/include -L/usr/local/lib -lpcre |
| 1863 |
|
|
| 1864 |
|
Then you can run simple tests like this: |
| 1865 |
|
|
| 1866 |
|
./pcredemo 'cat|dog' 'the cat sat on the mat' |
| 1867 |
|
|
| 1868 |
|
Note that there is a much more comprehensive test program, called |
| 1869 |
|
\fBpcretest\fR, which supports many more facilities for testing regular |
| 1870 |
|
expressions. The \fBpcredemo\fR program is provided as a simple coding example. |
| 1871 |
|
|
| 1872 |
|
On some operating systems (e.g. Solaris) you may get an error like this when |
| 1873 |
|
you try to run \fBpcredemo\fR: |
| 1874 |
|
|
| 1875 |
|
ld.so.1: a.out: fatal: libpcre.so.0: open failed: No such file or directory |
| 1876 |
|
|
| 1877 |
|
This is caused by the way shared library support works on those systems. You |
| 1878 |
|
need to add |
| 1879 |
|
|
| 1880 |
|
-R/usr/local/lib |
| 1881 |
|
|
| 1882 |
|
to the compile command to get round this problem. Here's the code: |
| 1883 |
|
|
| 1884 |
|
#include <stdio.h> |
| 1885 |
|
#include <string.h> |
| 1886 |
|
#include <pcre.h> |
| 1887 |
|
|
| 1888 |
|
#define OVECCOUNT 30 /* should be a multiple of 3 */ |
| 1889 |
|
|
| 1890 |
|
int main(int argc, char **argv) |
| 1891 |
|
{ |
| 1892 |
|
pcre *re; |
| 1893 |
|
const char *error; |
| 1894 |
|
int erroffset; |
| 1895 |
|
int ovector[OVECCOUNT]; |
| 1896 |
|
int rc, i; |
| 1897 |
|
|
| 1898 |
|
if (argc != 3) |
| 1899 |
|
{ |
| 1900 |
|
printf("Two arguments required: a regex and a " |
| 1901 |
|
"subject string\\n"); |
| 1902 |
|
return 1; |
| 1903 |
|
} |
| 1904 |
|
|
| 1905 |
|
/* Compile the regular expression in the first argument */ |
| 1906 |
|
|
| 1907 |
|
re = pcre_compile( |
| 1908 |
|
argv[1], /* the pattern */ |
| 1909 |
|
0, /* default options */ |
| 1910 |
|
&error, /* for error message */ |
| 1911 |
|
&erroffset, /* for error offset */ |
| 1912 |
|
NULL); /* use default character tables */ |
| 1913 |
|
|
| 1914 |
|
/* Compilation failed: print the error message and exit */ |
| 1915 |
|
|
| 1916 |
|
if (re == NULL) |
| 1917 |
|
{ |
| 1918 |
|
printf("PCRE compilation failed at offset %d: %s\\n", |
| 1919 |
|
erroffset, error); |
| 1920 |
|
return 1; |
| 1921 |
|
} |
| 1922 |
|
|
| 1923 |
|
/* Compilation succeeded: match the subject in the second |
| 1924 |
|
argument */ |
| 1925 |
|
|
| 1926 |
|
rc = pcre_exec( |
| 1927 |
|
re, /* the compiled pattern */ |
| 1928 |
|
NULL, /* we didn't study the pattern */ |
| 1929 |
|
argv[2], /* the subject string */ |
| 1930 |
|
(int)strlen(argv[2]), /* the length of the subject */ |
| 1931 |
|
0, /* start at offset 0 in the subject */ |
| 1932 |
|
0, /* default options */ |
| 1933 |
|
ovector, /* vector for substring information */ |
| 1934 |
|
OVECCOUNT); /* number of elements in the vector */ |
| 1935 |
|
|
| 1936 |
|
/* Matching failed: handle error cases */ |
| 1937 |
|
|
| 1938 |
|
if (rc < 0) |
| 1939 |
|
{ |
| 1940 |
|
switch(rc) |
| 1941 |
|
{ |
| 1942 |
|
case PCRE_ERROR_NOMATCH: printf("No match\\n"); break; |
| 1943 |
|
/* |
| 1944 |
|
Handle other special cases if you like |
| 1945 |
|
*/ |
| 1946 |
|
default: printf("Matching error %d\\n", rc); break; |
| 1947 |
|
} |
| 1948 |
|
return 1; |
| 1949 |
|
} |
| 1950 |
|
|
| 1951 |
|
/* Match succeeded */ |
| 1952 |
|
|
| 1953 |
|
printf("Match succeeded\\n"); |
| 1954 |
|
|
| 1955 |
|
/* The output vector wasn't big enough */ |
| 1956 |
|
|
| 1957 |
|
if (rc == 0) |
| 1958 |
|
{ |
| 1959 |
|
rc = OVECCOUNT/3; |
| 1960 |
|
printf("ovector only has room for %d captured " |
| 1961 |
|
"substrings\\n", rc - 1); |
| 1962 |
|
} |
| 1963 |
|
|
| 1964 |
|
/* Show substrings stored in the output vector */ |
| 1965 |
|
|
| 1966 |
|
for (i = 0; i < rc; i++) |
| 1967 |
|
{ |
| 1968 |
|
char *substring_start = argv[2] + ovector[2*i]; |
| 1969 |
|
int substring_length = ovector[2*i+1] - ovector[2*i]; |
| 1970 |
|
printf("%2d: %.*s\\n", i, substring_length, |
| 1971 |
|
substring_start); |
| 1972 |
|
} |
| 1973 |
|
|
| 1974 |
|
return 0; |
| 1975 |
|
} |
| 1976 |
|
|
| 1977 |
|
|
| 1978 |
.SH AUTHOR |
.SH AUTHOR |
| 1979 |
Philip Hazel <ph10@cam.ac.uk> |
Philip Hazel <ph10@cam.ac.uk> |
| 1980 |
.br |
.br |
| 1986 |
.br |
.br |
| 1987 |
Phone: +44 1223 334714 |
Phone: +44 1223 334714 |
| 1988 |
|
|
| 1989 |
Last updated: 27 January 2000 |
Last updated: 15 August 2001 |
| 1990 |
.br |
.br |
| 1991 |
Copyright (c) 1997-2000 University of Cambridge. |
Copyright (c) 1997-2001 University of Cambridge. |