/[pcre]/code/trunk/doc/html/pcreapi.html
ViewVC logotype

Diff of /code/trunk/doc/html/pcreapi.html

Parent Directory | Revision Log | View Patch

revision 868 by ph10, Wed Dec 28 17:16:11 2011 UTC revision 869 by ph10, Sat Jan 14 11:16:23 2012 UTC
# Line 14  man page, in case the conversion went wr Line 14  man page, in case the conversion went wr
14  <br>  <br>
15  <ul>  <ul>
16  <li><a name="TOC1" href="#SEC1">PCRE NATIVE API BASIC FUNCTIONS</a>  <li><a name="TOC1" href="#SEC1">PCRE NATIVE API BASIC FUNCTIONS</a>
17  <li><a name="TOC2" href="#SEC2">PCRE NATIVE API AUXILIARY FUNCTIONS</a>  <li><a name="TOC2" href="#SEC2">PCRE NATIVE API STRING EXTRACTION FUNCTIONS</a>
18  <li><a name="TOC3" href="#SEC3">PCRE NATIVE API INDIRECTED FUNCTIONS</a>  <li><a name="TOC3" href="#SEC3">PCRE NATIVE API AUXILIARY FUNCTIONS</a>
19  <li><a name="TOC4" href="#SEC4">PCRE API OVERVIEW</a>  <li><a name="TOC4" href="#SEC4">PCRE NATIVE API INDIRECTED FUNCTIONS</a>
20  <li><a name="TOC5" href="#SEC5">NEWLINES</a>  <li><a name="TOC5" href="#SEC5">PCRE 8-BIT AND 16-BIT LIBRARIES</a>
21  <li><a name="TOC6" href="#SEC6">MULTITHREADING</a>  <li><a name="TOC6" href="#SEC6">PCRE API OVERVIEW</a>
22  <li><a name="TOC7" href="#SEC7">SAVING PRECOMPILED PATTERNS FOR LATER USE</a>  <li><a name="TOC7" href="#SEC7">NEWLINES</a>
23  <li><a name="TOC8" href="#SEC8">CHECKING BUILD-TIME OPTIONS</a>  <li><a name="TOC8" href="#SEC8">MULTITHREADING</a>
24  <li><a name="TOC9" href="#SEC9">COMPILING A PATTERN</a>  <li><a name="TOC9" href="#SEC9">SAVING PRECOMPILED PATTERNS FOR LATER USE</a>
25  <li><a name="TOC10" href="#SEC10">COMPILATION ERROR CODES</a>  <li><a name="TOC10" href="#SEC10">CHECKING BUILD-TIME OPTIONS</a>
26  <li><a name="TOC11" href="#SEC11">STUDYING A PATTERN</a>  <li><a name="TOC11" href="#SEC11">COMPILING A PATTERN</a>
27  <li><a name="TOC12" href="#SEC12">LOCALE SUPPORT</a>  <li><a name="TOC12" href="#SEC12">COMPILATION ERROR CODES</a>
28  <li><a name="TOC13" href="#SEC13">INFORMATION ABOUT A PATTERN</a>  <li><a name="TOC13" href="#SEC13">STUDYING A PATTERN</a>
29  <li><a name="TOC14" href="#SEC14">OBSOLETE INFO FUNCTION</a>  <li><a name="TOC14" href="#SEC14">LOCALE SUPPORT</a>
30  <li><a name="TOC15" href="#SEC15">REFERENCE COUNTS</a>  <li><a name="TOC15" href="#SEC15">INFORMATION ABOUT A PATTERN</a>
31  <li><a name="TOC16" href="#SEC16">MATCHING A PATTERN: THE TRADITIONAL FUNCTION</a>  <li><a name="TOC16" href="#SEC16">REFERENCE COUNTS</a>
32  <li><a name="TOC17" href="#SEC17">EXTRACTING CAPTURED SUBSTRINGS BY NUMBER</a>  <li><a name="TOC17" href="#SEC17">MATCHING A PATTERN: THE TRADITIONAL FUNCTION</a>
33  <li><a name="TOC18" href="#SEC18">EXTRACTING CAPTURED SUBSTRINGS BY NAME</a>  <li><a name="TOC18" href="#SEC18">EXTRACTING CAPTURED SUBSTRINGS BY NUMBER</a>
34  <li><a name="TOC19" href="#SEC19">DUPLICATE SUBPATTERN NAMES</a>  <li><a name="TOC19" href="#SEC19">EXTRACTING CAPTURED SUBSTRINGS BY NAME</a>
35  <li><a name="TOC20" href="#SEC20">FINDING ALL POSSIBLE MATCHES</a>  <li><a name="TOC20" href="#SEC20">DUPLICATE SUBPATTERN NAMES</a>
36  <li><a name="TOC21" href="#SEC21">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a>  <li><a name="TOC21" href="#SEC21">FINDING ALL POSSIBLE MATCHES</a>
37  <li><a name="TOC22" href="#SEC22">SEE ALSO</a>  <li><a name="TOC22" href="#SEC22">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a>
38  <li><a name="TOC23" href="#SEC23">AUTHOR</a>  <li><a name="TOC23" href="#SEC23">SEE ALSO</a>
39  <li><a name="TOC24" href="#SEC24">REVISION</a>  <li><a name="TOC24" href="#SEC24">AUTHOR</a>
40    <li><a name="TOC25" href="#SEC25">REVISION</a>
41  </ul>  </ul>
 <br><a name="SEC1" href="#TOC1">PCRE NATIVE API BASIC FUNCTIONS</a><br>  
42  <P>  <P>
43  <b>#include &#60;pcre.h&#62;</b>  <b>#include &#60;pcre.h&#62;</b>
44  </P>  </P>
45    <br><a name="SEC1" href="#TOC1">PCRE NATIVE API BASIC FUNCTIONS</a><br>
46  <P>  <P>
47  <b>pcre *pcre_compile(const char *<i>pattern</i>, int <i>options</i>,</b>  <b>pcre *pcre_compile(const char *<i>pattern</i>, int <i>options</i>,</b>
48  <b>const char **<i>errptr</i>, int *<i>erroffset</i>,</b>  <b>const char **<i>errptr</i>, int *<i>erroffset</i>,</b>
# Line 65  man page, in case the conversion went wr Line 66  man page, in case the conversion went wr
66  <b>const char *<i>subject</i>, int <i>length</i>, int <i>startoffset</i>,</b>  <b>const char *<i>subject</i>, int <i>length</i>, int <i>startoffset</i>,</b>
67  <b>int <i>options</i>, int *<i>ovector</i>, int <i>ovecsize</i>);</b>  <b>int <i>options</i>, int *<i>ovector</i>, int <i>ovecsize</i>);</b>
68  </P>  </P>
 <br><a name="SEC2" href="#TOC1">PCRE NATIVE API AUXILIARY FUNCTIONS</a><br>  
 <P>  
 <b>pcre_jit_stack *pcre_jit_stack_alloc(int <i>startsize</i>, int <i>maxsize</i>);</b>  
 </P>  
 <P>  
 <b>void pcre_jit_stack_free(pcre_jit_stack *<i>stack</i>);</b>  
 </P>  
 <P>  
 <b>void pcre_assign_jit_stack(pcre_extra *<i>extra</i>,</b>  
 <b>pcre_jit_callback <i>callback</i>, void *<i>data</i>);</b>  
 </P>  
69  <P>  <P>
70  <b>int pcre_dfa_exec(const pcre *<i>code</i>, const pcre_extra *<i>extra</i>,</b>  <b>int pcre_dfa_exec(const pcre *<i>code</i>, const pcre_extra *<i>extra</i>,</b>
71  <b>const char *<i>subject</i>, int <i>length</i>, int <i>startoffset</i>,</b>  <b>const char *<i>subject</i>, int <i>length</i>, int <i>startoffset</i>,</b>
72  <b>int <i>options</i>, int *<i>ovector</i>, int <i>ovecsize</i>,</b>  <b>int <i>options</i>, int *<i>ovector</i>, int <i>ovecsize</i>,</b>
73  <b>int *<i>workspace</i>, int <i>wscount</i>);</b>  <b>int *<i>workspace</i>, int <i>wscount</i>);</b>
74  </P>  </P>
75    <br><a name="SEC2" href="#TOC1">PCRE NATIVE API STRING EXTRACTION FUNCTIONS</a><br>
76  <P>  <P>
77  <b>int pcre_copy_named_substring(const pcre *<i>code</i>,</b>  <b>int pcre_copy_named_substring(const pcre *<i>code</i>,</b>
78  <b>const char *<i>subject</i>, int *<i>ovector</i>,</b>  <b>const char *<i>subject</i>, int *<i>ovector</i>,</b>
# Line 122  man page, in case the conversion went wr Line 113  man page, in case the conversion went wr
113  <P>  <P>
114  <b>void pcre_free_substring_list(const char **<i>stringptr</i>);</b>  <b>void pcre_free_substring_list(const char **<i>stringptr</i>);</b>
115  </P>  </P>
116    <br><a name="SEC3" href="#TOC1">PCRE NATIVE API AUXILIARY FUNCTIONS</a><br>
117    <P>
118    <b>pcre_jit_stack *pcre_jit_stack_alloc(int <i>startsize</i>, int <i>maxsize</i>);</b>
119    </P>
120    <P>
121    <b>void pcre_jit_stack_free(pcre_jit_stack *<i>stack</i>);</b>
122    </P>
123    <P>
124    <b>void pcre_assign_jit_stack(pcre_extra *<i>extra</i>,</b>
125    <b>pcre_jit_callback <i>callback</i>, void *<i>data</i>);</b>
126    </P>
127  <P>  <P>
128  <b>const unsigned char *pcre_maketables(void);</b>  <b>const unsigned char *pcre_maketables(void);</b>
129  </P>  </P>
# Line 130  man page, in case the conversion went wr Line 132  man page, in case the conversion went wr
132  <b>int <i>what</i>, void *<i>where</i>);</b>  <b>int <i>what</i>, void *<i>where</i>);</b>
133  </P>  </P>
134  <P>  <P>
 <b>int pcre_info(const pcre *<i>code</i>, int *<i>optptr</i>, int</b>  
 <b>*<i>firstcharptr</i>);</b>  
 </P>  
 <P>  
135  <b>int pcre_refcount(pcre *<i>code</i>, int <i>adjust</i>);</b>  <b>int pcre_refcount(pcre *<i>code</i>, int <i>adjust</i>);</b>
136  </P>  </P>
137  <P>  <P>
138  <b>int pcre_config(int <i>what</i>, void *<i>where</i>);</b>  <b>int pcre_config(int <i>what</i>, void *<i>where</i>);</b>
139  </P>  </P>
140  <P>  <P>
141  <b>char *pcre_version(void);</b>  <b>const char *pcre_version(void);</b>
142  </P>  </P>
143  <br><a name="SEC3" href="#TOC1">PCRE NATIVE API INDIRECTED FUNCTIONS</a><br>  <P>
144    <b>int pcre_pattern_to_host_byte_order(pcre *<i>code</i>,</b>
145    <b>pcre_extra *<i>extra</i>, const unsigned char *<i>tables</i>);</b>
146    </P>
147    <br><a name="SEC4" href="#TOC1">PCRE NATIVE API INDIRECTED FUNCTIONS</a><br>
148  <P>  <P>
149  <b>void *(*pcre_malloc)(size_t);</b>  <b>void *(*pcre_malloc)(size_t);</b>
150  </P>  </P>
# Line 158  man page, in case the conversion went wr Line 160  man page, in case the conversion went wr
160  <P>  <P>
161  <b>int (*pcre_callout)(pcre_callout_block *);</b>  <b>int (*pcre_callout)(pcre_callout_block *);</b>
162  </P>  </P>
163  <br><a name="SEC4" href="#TOC1">PCRE API OVERVIEW</a><br>  <br><a name="SEC5" href="#TOC1">PCRE 8-BIT AND 16-BIT LIBRARIES</a><br>
164    <P>
165    From release 8.30, PCRE can be compiled as a library for handling 16-bit
166    character strings as well as, or instead of, the original library that handles
167    8-bit character strings. To avoid too much complication, this document
168    describes the 8-bit versions of the functions, with only occasional references
169    to the 16-bit library.
170    </P>
171    <P>
172    The 16-bit functions operate in the same way as their 8-bit counterparts; they
173    just use different data types for their arguments and results, and their names
174    start with <b>pcre16_</b> instead of <b>pcre_</b>. For every option that has UTF8
175    in its name (for example, PCRE_UTF8), there is a corresponding 16-bit name with
176    UTF8 replaced by UTF16. This facility is in fact just cosmetic; the 16-bit
177    option names define the same bit values.
178    </P>
179    <P>
180    References to bytes and UTF-8 in this document should be read as references to
181    16-bit data quantities and UTF-16 when using the 16-bit library, unless
182    specified otherwise. More details of the specific differences for the 16-bit
183    library are given in the
184    <a href="pcre16.html"><b>pcre16</b></a>
185    page.
186    </P>
187    <br><a name="SEC6" href="#TOC1">PCRE API OVERVIEW</a><br>
188  <P>  <P>
189  PCRE has its own native API, which is described in this document. There are  PCRE has its own native API, which is described in this document. There are
190  also some wrapper functions that correspond to the POSIX regular expression  also some wrapper functions (for the 8-bit library only) that correspond to the
191  API, but they do not give access to all the functionality. They are described  POSIX regular expression API, but they do not give access to all the
192  in the  functionality. They are described in the
193  <a href="pcreposix.html"><b>pcreposix</b></a>  <a href="pcreposix.html"><b>pcreposix</b></a>
194  documentation. Both of these APIs define a set of C function calls. A C++  documentation. Both of these APIs define a set of C function calls. A C++
195  wrapper is also distributed with PCRE. It is documented in the  wrapper (again for the 8-bit library only) is also distributed with PCRE. It is
196    documented in the
197  <a href="pcrecpp.html"><b>pcrecpp</b></a>  <a href="pcrecpp.html"><b>pcrecpp</b></a>
198  page.  page.
199  </P>  </P>
200  <P>  <P>
201  The native API C function prototypes are defined in the header file  The native API C function prototypes are defined in the header file
202  <b>pcre.h</b>, and on Unix systems the library itself is called <b>libpcre</b>.  <b>pcre.h</b>, and on Unix-like systems the (8-bit) library itself is called
203  It can normally be accessed by adding <b>-lpcre</b> to the command for linking  <b>libpcre</b>. It can normally be accessed by adding <b>-lpcre</b> to the
204  an application that uses PCRE. The header file defines the macros PCRE_MAJOR  command for linking an application that uses PCRE. The header file defines the
205  and PCRE_MINOR to contain the major and minor release numbers for the library.  macros PCRE_MAJOR and PCRE_MINOR to contain the major and minor release numbers
206  Applications can use these to include support for different releases of PCRE.  for the library. Applications can use these to include support for different
207    releases of PCRE.
208  </P>  </P>
209  <P>  <P>
210  In a Windows environment, if you want to statically link an application program  In a Windows environment, if you want to statically link an application program
# Line 244  internal tables that are generated when Line 272  internal tables that are generated when
272  </P>  </P>
273  <P>  <P>
274  The function <b>pcre_fullinfo()</b> is used to find out information about a  The function <b>pcre_fullinfo()</b> is used to find out information about a
275  compiled pattern; <b>pcre_info()</b> is an obsolete version that returns only  compiled pattern. The function <b>pcre_version()</b> returns a pointer to a
276  some of the available information, but is retained for backwards compatibility.  string containing the version of PCRE and its date of release.
 The function <b>pcre_version()</b> returns a pointer to a string containing the  
 version of PCRE and its date of release.  
277  </P>  </P>
278  <P>  <P>
279  The function <b>pcre_refcount()</b> maintains a reference count in a data block  The function <b>pcre_refcount()</b> maintains a reference count in a data block
# Line 284  points during a matching operation. Deta Line 310  points during a matching operation. Deta
310  <a href="pcrecallout.html"><b>pcrecallout</b></a>  <a href="pcrecallout.html"><b>pcrecallout</b></a>
311  documentation.  documentation.
312  <a name="newlines"></a></P>  <a name="newlines"></a></P>
313  <br><a name="SEC5" href="#TOC1">NEWLINES</a><br>  <br><a name="SEC7" href="#TOC1">NEWLINES</a><br>
314  <P>  <P>
315  PCRE supports five different conventions for indicating line breaks in  PCRE supports five different conventions for indicating line breaks in
316  strings: a single CR (carriage return) character, a single LF (linefeed)  strings: a single CR (carriage return) character, a single LF (linefeed)
# Line 323  The choice of newline convention does no Line 349  The choice of newline convention does no
349  the \n or \r escape sequences, nor does it affect what \R matches, which is  the \n or \r escape sequences, nor does it affect what \R matches, which is
350  controlled in a similar way, but by separate options.  controlled in a similar way, but by separate options.
351  </P>  </P>
352  <br><a name="SEC6" href="#TOC1">MULTITHREADING</a><br>  <br><a name="SEC8" href="#TOC1">MULTITHREADING</a><br>
353  <P>  <P>
354  The PCRE functions can be used in multi-threaded applications, with the  The PCRE functions can be used in multi-threaded applications, with the
355  proviso that the memory management functions pointed to by <b>pcre_malloc</b>,  proviso that the memory management functions pointed to by <b>pcre_malloc</b>,
# Line 340  memory stack areas for each thread. See Line 366  memory stack areas for each thread. See
366  <a href="pcrejit.html"><b>pcrejit</b></a>  <a href="pcrejit.html"><b>pcrejit</b></a>
367  documentation for more details.  documentation for more details.
368  </P>  </P>
369  <br><a name="SEC7" href="#TOC1">SAVING PRECOMPILED PATTERNS FOR LATER USE</a><br>  <br><a name="SEC9" href="#TOC1">SAVING PRECOMPILED PATTERNS FOR LATER USE</a><br>
370  <P>  <P>
371  The compiled form of a regular expression can be saved and re-used at a later  The compiled form of a regular expression can be saved and re-used at a later
372  time, possibly by a different program, and even on a host other than the one on  time, possibly by a different program, and even on a host other than the one on
373  which it was compiled. Details are given in the  which it was compiled. Details are given in the
374  <a href="pcreprecompile.html"><b>pcreprecompile</b></a>  <a href="pcreprecompile.html"><b>pcreprecompile</b></a>
375  documentation. However, compiling a regular expression with one version of PCRE  documentation, which includes a description of the
376  for use with a different version is not guaranteed to work and may cause  <b>pcre_pattern_to_host_byte_order()</b> function. However, compiling a regular
377  crashes.  expression with one version of PCRE for use with a different version is not
378    guaranteed to work and may cause crashes.
379  </P>  </P>
380  <br><a name="SEC8" href="#TOC1">CHECKING BUILD-TIME OPTIONS</a><br>  <br><a name="SEC10" href="#TOC1">CHECKING BUILD-TIME OPTIONS</a><br>
381  <P>  <P>
382  <b>int pcre_config(int <i>what</i>, void *<i>where</i>);</b>  <b>int pcre_config(int <i>what</i>, void *<i>where</i>);</b>
383  </P>  </P>
# Line 363  documentation has more details about the Line 390  documentation has more details about the
390  <P>  <P>
391  The first argument for <b>pcre_config()</b> is an integer, specifying which  The first argument for <b>pcre_config()</b> is an integer, specifying which
392  information is required; the second argument is a pointer to a variable into  information is required; the second argument is a pointer to a variable into
393  which the information is placed. The following information is available:  which the information is placed. The returned value is zero on success, or the
394    negative error code PCRE_ERROR_BADOPTION if the value in the first argument is
395    not recognized. The following information is available:
396  <pre>  <pre>
397    PCRE_CONFIG_UTF8    PCRE_CONFIG_UTF8
398  </pre>  </pre>
399  The output is an integer that is set to one if UTF-8 support is available;  The output is an integer that is set to one if UTF-8 support is available;
400  otherwise it is set to zero.  otherwise it is set to zero. If this option is given to the 16-bit version of
401    this function, <b>pcre16_config()</b>, the result is PCRE_ERROR_BADOPTION.
402    <pre>
403      PCRE_CONFIG_UTF16
404    </pre>
405    The output is an integer that is set to one if UTF-16 support is available;
406    otherwise it is set to zero. This value should normally be given to the 16-bit
407    version of this function, <b>pcre16_config()</b>. If it is given to the 8-bit
408    version of this function, the result is PCRE_ERROR_BADOPTION.
409  <pre>  <pre>
410    PCRE_CONFIG_UNICODE_PROPERTIES    PCRE_CONFIG_UNICODE_PROPERTIES
411  </pre>  </pre>
# Line 399  or CRLF. The default can be overridden w Line 436  or CRLF. The default can be overridden w
436    PCRE_CONFIG_LINK_SIZE    PCRE_CONFIG_LINK_SIZE
437  </pre>  </pre>
438  The output is an integer that contains the number of bytes used for internal  The output is an integer that contains the number of bytes used for internal
439  linkage in compiled regular expressions. The value is 2, 3, or 4. Larger values  linkage in compiled regular expressions. For the 8-bit library, the value can
440  allow larger regular expressions to be compiled, at the expense of slower  be 2, 3, or 4. For the 16-bit library, the value is either 2 or 4 and is still
441  matching. The default value of 2 is sufficient for all but the most massive  a number of bytes. The default value of 2 is sufficient for all but the most
442  patterns, since it allows the compiled pattern to be up to 64K in size.  massive patterns, since it allows the compiled pattern to be up to 64K in size.
443    Larger values allow larger regular expressions to be compiled, at the expense
444    of slower matching.
445  <pre>  <pre>
446    PCRE_CONFIG_POSIX_MALLOC_THRESHOLD    PCRE_CONFIG_POSIX_MALLOC_THRESHOLD
447  </pre>  </pre>
# Line 434  of recursive function calls. In this cas Line 473  of recursive function calls. In this cas
473  <b>pcre_stack_free</b> are called to manage memory blocks on the heap, thus  <b>pcre_stack_free</b> are called to manage memory blocks on the heap, thus
474  avoiding the use of the stack.  avoiding the use of the stack.
475  </P>  </P>
476  <br><a name="SEC9" href="#TOC1">COMPILING A PATTERN</a><br>  <br><a name="SEC11" href="#TOC1">COMPILING A PATTERN</a><br>
477  <P>  <P>
478  <b>pcre *pcre_compile(const char *<i>pattern</i>, int <i>options</i>,</b>  <b>pcre *pcre_compile(const char *<i>pattern</i>, int <i>options</i>,</b>
479  <b>const char **<i>errptr</i>, int *<i>erroffset</i>,</b>  <b>const char **<i>errptr</i>, int *<i>erroffset</i>,</b>
# Line 489  not try to free it. Normally, the offset Line 528  not try to free it. Normally, the offset
528  byte that was being processed when the error was discovered is placed in the  byte that was being processed when the error was discovered is placed in the
529  variable pointed to by <i>erroffset</i>, which must not be NULL (if it is, an  variable pointed to by <i>erroffset</i>, which must not be NULL (if it is, an
530  immediate error is given). However, for an invalid UTF-8 string, the offset is  immediate error is given). However, for an invalid UTF-8 string, the offset is
531  that of the first byte of the failing character. Also, some errors are not  that of the first byte of the failing character.
 detected until checks are carried out when the whole pattern has been scanned;  
 in these cases the offset passed back is the length of the pattern.  
532  </P>  </P>
533  <P>  <P>
534  Note that the offset is in bytes, not characters, even in UTF-8 mode. It may  Some errors are not detected until the whole pattern has been scanned; in these
535  sometimes point into the middle of a UTF-8 character.  cases, the offset passed back is the length of the pattern. Note that the
536    offset is in bytes, not characters, even in UTF-8 mode. It may sometimes point
537    into the middle of a UTF-8 character.
538  </P>  </P>
539  <P>  <P>
540  If <b>pcre_compile2()</b> is used instead of <b>pcre_compile()</b>, and the  If <b>pcre_compile2()</b> is used instead of <b>pcre_compile()</b>, and the
# Line 699  preceding sequences should be recognized Line 738  preceding sequences should be recognized
738  that any Unicode newline sequence should be recognized. The Unicode newline  that any Unicode newline sequence should be recognized. The Unicode newline
739  sequences are the three just mentioned, plus the single characters VT (vertical  sequences are the three just mentioned, plus the single characters VT (vertical
740  tab, U+000B), FF (formfeed, U+000C), NEL (next line, U+0085), LS (line  tab, U+000B), FF (formfeed, U+000C), NEL (next line, U+0085), LS (line
741  separator, U+2028), and PS (paragraph separator, U+2029). The last two are  separator, U+2028), and PS (paragraph separator, U+2029). For the 8-bit
742  recognized only in UTF-8 mode.  library, the last two are recognized only in UTF-8 mode.
743  </P>  </P>
744  <P>  <P>
745  The newline setting in the options word uses three bits that are treated  The newline setting in the options word uses three bits that are treated
# Line 760  with Perl. It can also be set by a (?U) Line 799  with Perl. It can also be set by a (?U)
799    PCRE_UTF8    PCRE_UTF8
800  </pre>  </pre>
801  This option causes PCRE to regard both the pattern and the subject as strings  This option causes PCRE to regard both the pattern and the subject as strings
802  of UTF-8 characters instead of single-byte character strings. However, it is  of UTF-8 characters instead of single-byte strings. However, it is available
803  available only when PCRE is built to include UTF-8 support. If not, the use  only when PCRE is built to include UTF support. If not, the use of this option
804  of this option provokes an error. Details of how this option changes the  provokes an error. Details of how this option changes the behaviour of PCRE are
805  behaviour of PCRE are given in the  given in the
806  <a href="pcreunicode.html"><b>pcreunicode</b></a>  <a href="pcreunicode.html"><b>pcreunicode</b></a>
807  page.  page.
808  <pre>  <pre>
809    PCRE_NO_UTF8_CHECK    PCRE_NO_UTF8_CHECK
810  </pre>  </pre>
811  When PCRE_UTF8 is set, the validity of the pattern as a UTF-8 string is  When PCRE_UTF8 is set, the validity of the pattern as a UTF-8
812  automatically checked. There is a discussion about the  string is automatically checked. There is a discussion about the
813  <a href="pcre.html#utf8strings">validity of UTF-8 strings</a>  <a href="pcreunicode.html#utf8strings">validity of UTF-8 strings</a>
814  in the main  in the
815  <a href="pcre.html"><b>pcre</b></a>  <a href="pcreunicode.html"><b>pcreunicode</b></a>
816  page. If an invalid UTF-8 sequence of bytes is found, <b>pcre_compile()</b>  page. If an invalid UTF-8 sequence is found, <b>pcre_compile()</b> returns an
817  returns an error. If you already know that your pattern is valid, and you want  error. If you already know that your pattern is valid, and you want to skip
818  to skip this check for performance reasons, you can set the PCRE_NO_UTF8_CHECK  this check for performance reasons, you can set the PCRE_NO_UTF8_CHECK option.
819  option. When it is set, the effect of passing an invalid UTF-8 string as a  When it is set, the effect of passing an invalid UTF-8 string as a pattern is
820  pattern is undefined. It may cause your program to crash. Note that this option  undefined. It may cause your program to crash. Note that this option can also
821  can also be passed to <b>pcre_exec()</b> and <b>pcre_dfa_exec()</b>, to suppress  be passed to <b>pcre_exec()</b> and <b>pcre_dfa_exec()</b>, to suppress the
822  the UTF-8 validity checking of subject strings.  validity checking of subject strings.
823  </P>  </P>
824  <br><a name="SEC10" href="#TOC1">COMPILATION ERROR CODES</a><br>  <br><a name="SEC12" href="#TOC1">COMPILATION ERROR CODES</a><br>
825  <P>  <P>
826  The following table lists the error codes that may be returned by  The following table lists the error codes that may be returned by
827  <b>pcre_compile2()</b>, along with the error messages that may be returned by  <b>pcre_compile2()</b>, along with the error messages that may be returned by
828  both compiling functions. As PCRE has developed, some error codes have fallen  both compiling functions. Note that error messages are always 8-bit ASCII
829  out of use. To avoid confusion, they have not been re-used.  strings, even in 16-bit mode. As PCRE has developed, some error codes have
830    fallen out of use. To avoid confusion, they have not been re-used.
831  <pre>  <pre>
832     0  no error     0  no error
833     1  \ at end of pattern     1  \ at end of pattern
# Line 821  out of use. To avoid confusion, they hav Line 861  out of use. To avoid confusion, they hav
861    29  (?R or (?[+-]digits must be followed by )    29  (?R or (?[+-]digits must be followed by )
862    30  unknown POSIX class name    30  unknown POSIX class name
863    31  POSIX collating elements are not supported    31  POSIX collating elements are not supported
864    32  this version of PCRE is not compiled with PCRE_UTF8 support    32  this version of PCRE is compiled without UTF support
865    33  [this code is not in use]    33  [this code is not in use]
866    34  character value in \x{...} sequence is too large    34  character value in \x{...} sequence is too large
867    35  invalid condition (?(0)    35  invalid condition (?(0)
# Line 833  out of use. To avoid confusion, they hav Line 873  out of use. To avoid confusion, they hav
873    41  unrecognized character after (?P    41  unrecognized character after (?P
874    42  syntax error in subpattern name (missing terminator)    42  syntax error in subpattern name (missing terminator)
875    43  two named subpatterns have the same name    43  two named subpatterns have the same name
876    44  invalid UTF-8 string    44  invalid UTF-8 string (specifically UTF-8)
877    45  support for \P, \p, and \X has not been compiled    45  support for \P, \p, and \X has not been compiled
878    46  malformed \P or \p sequence    46  malformed \P or \p sequence
879    47  unknown property name after \P or \p    47  unknown property name after \P or \p
880    48  subpattern name is too long (maximum 32 characters)    48  subpattern name is too long (maximum 32 characters)
881    49  too many named subpatterns (maximum 10000)    49  too many named subpatterns (maximum 10000)
882    50  [this code is not in use]    50  [this code is not in use]
883    51  octal value is greater than \377 (not in UTF-8 mode)    51  octal value is greater than \377 in 8-bit non-UTF-8 mode
884    52  internal error: overran compiling workspace    52  internal error: overran compiling workspace
885    53  internal error: previously-checked referenced subpattern    53  internal error: previously-checked referenced subpattern
886          not found          not found
# Line 859  out of use. To avoid confusion, they hav Line 899  out of use. To avoid confusion, they hav
899    65  different names for subpatterns of the same number are    65  different names for subpatterns of the same number are
900          not allowed          not allowed
901    66  (*MARK) must have an argument    66  (*MARK) must have an argument
902    67  this version of PCRE is not compiled with PCRE_UCP support    67  this version of PCRE is not compiled with Unicode property
903            support
904    68  \c must be followed by an ASCII character    68  \c must be followed by an ASCII character
905    69  \k is not followed by a braced, angle-bracketed, or quoted name    69  \k is not followed by a braced, angle-bracketed, or quoted name
906      70  internal error: unknown opcode in find_fixedlength()
907      71  \N is not supported in a class
908      72  too many forward references
909      73  disallowed Unicode code point (&#62;= 0xd800 &amp;&amp; &#60;= 0xdfff)
910      74  invalid UTF-16 string (specifically UTF-16)
911  </pre>  </pre>
912  The numbers 32 and 10000 in errors 48 and 49 are defaults; different values may  The numbers 32 and 10000 in errors 48 and 49 are defaults; different values may
913  be used if the limits were changed when PCRE was built.  be used if the limits were changed when PCRE was built.
914  <a name="studyingapattern"></a></P>  <a name="studyingapattern"></a></P>
915  <br><a name="SEC11" href="#TOC1">STUDYING A PATTERN</a><br>  <br><a name="SEC13" href="#TOC1">STUDYING A PATTERN</a><br>
916  <P>  <P>
917  <b>pcre_extra *pcre_study(const pcre *<i>code</i>, int <i>options</i>,</b>  <b>pcre_extra *pcre_study(const pcre *<i>code</i>, int <i>options</i>,</b>
918  <b>const char **<i>errptr</i>);</b>  <b>const char **<i>errptr</i>);</b>
# Line 958  in a calling program via the pcre_ful Line 1004  in a calling program via the pcre_ful
1004  Studying a pattern is also useful for non-anchored patterns that do not have a  Studying a pattern is also useful for non-anchored patterns that do not have a
1005  single fixed starting character. A bitmap of possible starting bytes is  single fixed starting character. A bitmap of possible starting bytes is
1006  created. This speeds up finding a position in the subject at which to start  created. This speeds up finding a position in the subject at which to start
1007  matching.  matching. (In 16-bit mode, the bitmap is used for 16-bit values less than 256.)
1008  </P>  </P>
1009  <P>  <P>
1010  These two optimizations apply to both <b>pcre_exec()</b> and  These two optimizations apply to both <b>pcre_exec()</b> and
# Line 972  to make use of these facilities in cases Line 1018  to make use of these facilities in cases
1018  discussion of PCRE_NO_START_OPTIMIZE  discussion of PCRE_NO_START_OPTIMIZE
1019  <a href="#execoptions">below.</a>  <a href="#execoptions">below.</a>
1020  <a name="localesupport"></a></P>  <a name="localesupport"></a></P>
1021  <br><a name="SEC12" href="#TOC1">LOCALE SUPPORT</a><br>  <br><a name="SEC14" href="#TOC1">LOCALE SUPPORT</a><br>
1022  <P>  <P>
1023  PCRE handles caseless matching, and determines whether characters are letters,  PCRE handles caseless matching, and determines whether characters are letters,
1024  digits, or whatever, by reference to a set of tables, indexed by character  digits, or whatever, by reference to a set of tables, indexed by character
1025  value. When running in UTF-8 mode, this applies only to characters with codes  value. When running in UTF-8 mode, this applies only to characters
1026  less than 128. By default, higher-valued codes never match escapes such as \w  with codes less than 128. By default, higher-valued codes never match escapes
1027  or \d, but they can be tested with \p if PCRE is built with Unicode character  such as \w or \d, but they can be tested with \p if PCRE is built with
1028  property support. Alternatively, the PCRE_UCP option can be set at compile  Unicode character property support. Alternatively, the PCRE_UCP option can be
1029  time; this causes \w and friends to use Unicode property support instead of  set at compile time; this causes \w and friends to use Unicode property
1030  built-in tables. The use of locales with Unicode is discouraged. If you are  support instead of built-in tables. The use of locales with Unicode is
1031  handling characters with codes greater than 128, you should either use UTF-8  discouraged. If you are handling characters with codes greater than 128, you
1032  and Unicode, or use locales, but not try to mix the two.  should either use UTF-8 and Unicode, or use locales, but not try to mix the
1033    two.
1034  </P>  </P>
1035  <P>  <P>
1036  PCRE contains an internal set of tables that are used when the final argument  PCRE contains an internal set of tables that are used when the final argument
# Line 1033  this facility could be used to match a p Line 1080  this facility could be used to match a p
1080  one in which it was compiled. Passing table pointers at run time is discussed  one in which it was compiled. Passing table pointers at run time is discussed
1081  below in the section on matching a pattern.  below in the section on matching a pattern.
1082  <a name="infoaboutpattern"></a></P>  <a name="infoaboutpattern"></a></P>
1083  <br><a name="SEC13" href="#TOC1">INFORMATION ABOUT A PATTERN</a><br>  <br><a name="SEC15" href="#TOC1">INFORMATION ABOUT A PATTERN</a><br>
1084  <P>  <P>
1085  <b>int pcre_fullinfo(const pcre *<i>code</i>, const pcre_extra *<i>extra</i>,</b>  <b>int pcre_fullinfo(const pcre *<i>code</i>, const pcre_extra *<i>extra</i>,</b>
1086  <b>int <i>what</i>, void *<i>where</i>);</b>  <b>int <i>what</i>, void *<i>where</i>);</b>
1087  </P>  </P>
1088  <P>  <P>
1089  The <b>pcre_fullinfo()</b> function returns information about a compiled  The <b>pcre_fullinfo()</b> function returns information about a compiled
1090  pattern. It replaces the obsolete <b>pcre_info()</b> function, which is  pattern. It replaces the <b>pcre_info()</b> function, which was removed from the
1091  nevertheless retained for backwards compatibility (and is documented below).  library at version 8.30, after more than 10 years of obsolescence.
1092  </P>  </P>
1093  <P>  <P>
1094  The first argument for <b>pcre_fullinfo()</b> is a pointer to the compiled  The first argument for <b>pcre_fullinfo()</b> is a pointer to the compiled
# Line 1051  information is required, and the fourth Line 1098  information is required, and the fourth
1098  to receive the data. The yield of the function is zero for success, or one of  to receive the data. The yield of the function is zero for success, or one of
1099  the following negative numbers:  the following negative numbers:
1100  <pre>  <pre>
1101    PCRE_ERROR_NULL       the argument <i>code</i> was NULL    PCRE_ERROR_NULL           the argument <i>code</i> was NULL
1102                          the argument <i>where</i> was NULL                              the argument <i>where</i> was NULL
1103    PCRE_ERROR_BADMAGIC   the "magic number" was not found    PCRE_ERROR_BADMAGIC       the "magic number" was not found
1104    PCRE_ERROR_BADOPTION  the value of <i>what</i> was invalid    PCRE_ERROR_BADENDIANNESS  the pattern was compiled with different
1105                                endianness
1106      PCRE_ERROR_BADOPTION      the value of <i>what</i> was invalid
1107  </pre>  </pre>
1108  The "magic number" is placed at the start of each compiled pattern as a simple  The "magic number" is placed at the start of each compiled pattern as a simple
1109  check against passing an arbitrary memory pointer. Here is a typical call of  check against passing an arbitrary memory pointer. The endianness error can
1110  <b>pcre_fullinfo()</b>, to obtain the length of the compiled pattern:  occur if a compiled pattern is saved and reloaded on a different host. Here is
1111    a typical call of <b>pcre_fullinfo()</b>, to obtain the length of the compiled
1112    pattern:
1113  <pre>  <pre>
1114    int rc;    int rc;
1115    size_t length;    size_t length;
# Line 1092  a NULL table pointer. Line 1143  a NULL table pointer.
1143  <pre>  <pre>
1144    PCRE_INFO_FIRSTBYTE    PCRE_INFO_FIRSTBYTE
1145  </pre>  </pre>
1146  Return information about the first byte of any matched string, for a  Return information about the first data unit of any matched string, for a
1147  non-anchored pattern. The fourth argument should point to an <b>int</b>  non-anchored pattern. (The name of this option refers to the 8-bit library,
1148  variable. (This option used to be called PCRE_INFO_FIRSTCHAR; the old name is  where data units are bytes.) The fourth argument should point to an <b>int</b>
1149  still recognized for backwards compatibility.)  variable.
1150  </P>  </P>
1151  <P>  <P>
1152  If there is a fixed first byte, for example, from a pattern such as  If there is a fixed first value, for example, the letter "c" from a pattern
1153  (cat|cow|coyote), its value is returned. Otherwise, if either  such as (cat|cow|coyote), its value is returned. In the 8-bit library, the
1154    value is always less than 256; in the 16-bit library the value can be up to
1155    0xffff.
1156    </P>
1157    <P>
1158    If there is no fixed first value, and if either
1159  <br>  <br>
1160  <br>  <br>
1161  (a) the pattern was compiled with the PCRE_MULTILINE option, and every branch  (a) the pattern was compiled with the PCRE_MULTILINE option, and every branch
# Line 1117  returned. For anchored patterns, -2 is r Line 1173  returned. For anchored patterns, -2 is r
1173    PCRE_INFO_FIRSTTABLE    PCRE_INFO_FIRSTTABLE
1174  </pre>  </pre>
1175  If the pattern was studied, and this resulted in the construction of a 256-bit  If the pattern was studied, and this resulted in the construction of a 256-bit
1176  table indicating a fixed set of bytes for the first byte in any matching  table indicating a fixed set of values for the first data unit in any matching
1177  string, a pointer to the table is returned. Otherwise NULL is returned. The  string, a pointer to the table is returned. Otherwise NULL is returned. The
1178  fourth argument should point to an <b>unsigned char *</b> variable.  fourth argument should point to an <b>unsigned char *</b> variable.
1179  <pre>  <pre>
# Line 1152  argument should point to a size_t Line 1208  argument should point to a size_t
1208  <pre>  <pre>
1209    PCRE_INFO_LASTLITERAL    PCRE_INFO_LASTLITERAL
1210  </pre>  </pre>
1211  Return the value of the rightmost literal byte that must exist in any matched  Return the value of the rightmost literal data unit that must exist in any
1212  string, other than at its start, if such a byte has been recorded. The fourth  matched string, other than at its start, if such a value has been recorded. The
1213  argument should point to an <b>int</b> variable. If there is no such byte, -1 is  fourth argument should point to an <b>int</b> variable. If there is no such
1214  returned. For anchored patterns, a last literal byte is recorded only if it  value, -1 is returned. For anchored patterns, a last literal value is recorded
1215  follows something of variable length. For example, for the pattern  only if it follows something of variable length. For example, for the pattern
1216  /^a\d+z\d+/ the returned value is "z", but for /^a\dz\d/ the returned value  /^a\d+z\d+/ the returned value is "z", but for /^a\dz\d/ the returned value
1217  is -1.  is -1.
1218  <pre>  <pre>
# Line 1164  is -1. Line 1220  is -1.
1220  </pre>  </pre>
1221  If the pattern was studied and a minimum length for matching subject strings  If the pattern was studied and a minimum length for matching subject strings
1222  was computed, its value is returned. Otherwise the returned value is -1. The  was computed, its value is returned. Otherwise the returned value is -1. The
1223  value is a number of characters, not bytes (this may be relevant in UTF-8  value is a number of characters, which in UTF-8 mode may be different from the
1224  mode). The fourth argument should point to an <b>int</b> variable. A  number of bytes. The fourth argument should point to an <b>int</b> variable. A
1225  non-negative value is a lower bound to the length of any matching string. There  non-negative value is a lower bound to the length of any matching string. There
1226  may not be any strings of that length that do actually match, but every string  may not be any strings of that length that do actually match, but every string
1227  that does match is at least that long.  that does match is at least that long.
# Line 1189  The map consists of a number of fixed-si Line 1245  The map consists of a number of fixed-si
1245  the number of entries, and PCRE_INFO_NAMEENTRYSIZE gives the size of each  the number of entries, and PCRE_INFO_NAMEENTRYSIZE gives the size of each
1246  entry; both of these return an <b>int</b> value. The entry size depends on the  entry; both of these return an <b>int</b> value. The entry size depends on the
1247  length of the longest name. PCRE_INFO_NAMETABLE returns a pointer to the first  length of the longest name. PCRE_INFO_NAMETABLE returns a pointer to the first
1248  entry of the table (a pointer to <b>char</b>). The first two bytes of each entry  entry of the table. This is a pointer to <b>char</b> in the 8-bit library, where
1249  are the number of the capturing parenthesis, most significant byte first. The  the first two bytes of each entry are the number of the capturing parenthesis,
1250  rest of the entry is the corresponding name, zero terminated.  most significant byte first. In the 16-bit library, the pointer points to
1251    16-bit data units, the first of which contains the parenthesis number. The rest
1252    of the entry is the corresponding name, zero terminated.
1253  </P>  </P>
1254  <P>  <P>
1255  The names are in alphabetical order. Duplicate names may appear if (?| is used  The names are in alphabetical order. Duplicate names may appear if (?| is used
# Line 1207  necessarily the case because later subpa Line 1265  necessarily the case because later subpa
1265  </P>  </P>
1266  <P>  <P>
1267  As a simple example of the name/number table, consider the following pattern  As a simple example of the name/number table, consider the following pattern
1268  (assume PCRE_EXTENDED is set, so white space - including newlines - is  after compilation by the 8-bit library (assume PCRE_EXTENDED is set, so white
1269  ignored):  space - including newlines - is ignored):
1270  <pre>  <pre>
1271    (?&#60;date&#62; (?&#60;year&#62;(\d\d)?\d\d) - (?&#60;month&#62;\d\d) - (?&#60;day&#62;\d\d) )    (?&#60;date&#62; (?&#60;year&#62;(\d\d)?\d\d) - (?&#60;month&#62;\d\d) - (?&#60;day&#62;\d\d) )
1272  </pre>  </pre>
# Line 1258  For such patterns, the PCRE_ANCHORED bit Line 1316  For such patterns, the PCRE_ANCHORED bit
1316  <pre>  <pre>
1317    PCRE_INFO_SIZE    PCRE_INFO_SIZE
1318  </pre>  </pre>
1319  Return the size of the compiled pattern. The fourth argument should point to a  Return the size of the compiled pattern in bytes (for both libraries). The
1320  <b>size_t</b> variable. This value does not include the size of the <b>pcre</b>  fourth argument should point to a <b>size_t</b> variable. This value does not
1321  structure that is returned by <b>pcre_compile()</b>. The value that is passed as  include the size of the <b>pcre</b> structure that is returned by
1322  the argument to <b>pcre_malloc()</b> when <b>pcre_compile()</b> is getting memory  <b>pcre_compile()</b>. The value that is passed as the argument to
1323  in which to place the compiled data is the value returned by this option plus  <b>pcre_malloc()</b> when <b>pcre_compile()</b> is getting memory in which to
1324  the size of the <b>pcre</b> structure. Studying a compiled pattern, with or  place the compiled data is the value returned by this option plus the size of
1325  without JIT, does not alter the value returned by this option.  the <b>pcre</b> structure. Studying a compiled pattern, with or without JIT,
1326    does not alter the value returned by this option.
1327  <pre>  <pre>
1328    PCRE_INFO_STUDYSIZE    PCRE_INFO_STUDYSIZE
1329  </pre>  </pre>
1330  Return the size of the data block pointed to by the <i>study_data</i> field in a  Return the size in bytes of the data block pointed to by the <i>study_data</i>
1331  <b>pcre_extra</b> block. If <b>pcre_extra</b> is NULL, or there is no study data,  field in a <b>pcre_extra</b> block. If <b>pcre_extra</b> is NULL, or there is no
1332  zero is returned. The fourth argument should point to a <b>size_t</b> variable.  study data, zero is returned. The fourth argument should point to a
1333  The <i>study_data</i> field is set by <b>pcre_study()</b> to record information  <b>size_t</b> variable. The <i>study_data</i> field is set by <b>pcre_study()</b>
1334  that will speed up matching (see the section entitled  to record information that will speed up matching (see the section entitled
1335  <a href="#studyingapattern">"Studying a pattern"</a>  <a href="#studyingapattern">"Studying a pattern"</a>
1336  above). The format of the <i>study_data</i> block is private, but its length  above). The format of the <i>study_data</i> block is private, but its length
1337  is made available via this option so that it can be saved and restored (see the  is made available via this option so that it can be saved and restored (see the
1338  <a href="pcreprecompile.html"><b>pcreprecompile</b></a>  <a href="pcreprecompile.html"><b>pcreprecompile</b></a>
1339  documentation for details).  documentation for details).
1340  </P>  </P>
1341  <br><a name="SEC14" href="#TOC1">OBSOLETE INFO FUNCTION</a><br>  <br><a name="SEC16" href="#TOC1">REFERENCE COUNTS</a><br>
 <P>  
 <b>int pcre_info(const pcre *<i>code</i>, int *<i>optptr</i>, int</b>  
 <b>*<i>firstcharptr</i>);</b>  
 </P>  
 <P>  
 The <b>pcre_info()</b> function is now obsolete because its interface is too  
 restrictive to return all the available data about a compiled pattern. New  
 programs should use <b>pcre_fullinfo()</b> instead. The yield of  
 <b>pcre_info()</b> is the number of capturing subpatterns, or one of the  
 following negative numbers:  
 <pre>  
   PCRE_ERROR_NULL       the argument <i>code</i> was NULL  
   PCRE_ERROR_BADMAGIC   the "magic number" was not found  
 </pre>  
 If the <i>optptr</i> argument is not NULL, a copy of the options with which the  
 pattern was compiled is placed in the integer it points to (see  
 PCRE_INFO_OPTIONS above).  
 </P>  
 <P>  
 If the pattern is not anchored and the <i>firstcharptr</i> argument is not NULL,  
 it is used to pass back information about the first character of any matched  
 string (see PCRE_INFO_FIRSTBYTE above).  
 </P>  
 <br><a name="SEC15" href="#TOC1">REFERENCE COUNTS</a><br>  
1342  <P>  <P>
1343  <b>int pcre_refcount(pcre *<i>code</i>, int <i>adjust</i>);</b>  <b>int pcre_refcount(pcre *<i>code</i>, int <i>adjust</i>);</b>
1344  </P>  </P>
# Line 1327  Except when it is zero, the reference co Line 1362  Except when it is zero, the reference co
1362  pattern is compiled on one host and then transferred to a host whose byte-order  pattern is compiled on one host and then transferred to a host whose byte-order
1363  is different. (This seems a highly unlikely scenario.)  is different. (This seems a highly unlikely scenario.)
1364  </P>  </P>
1365  <br><a name="SEC16" href="#TOC1">MATCHING A PATTERN: THE TRADITIONAL FUNCTION</a><br>  <br><a name="SEC17" href="#TOC1">MATCHING A PATTERN: THE TRADITIONAL FUNCTION</a><br>
1366  <P>  <P>
1367  <b>int pcre_exec(const pcre *<i>code</i>, const pcre_extra *<i>extra</i>,</b>  <b>int pcre_exec(const pcre *<i>code</i>, const pcre_extra *<i>extra</i>,</b>
1368  <b>const char *<i>subject</i>, int <i>length</i>, int <i>startoffset</i>,</b>  <b>const char *<i>subject</i>, int <i>length</i>, int <i>startoffset</i>,</b>
# Line 1392  fields (not necessarily in this order): Line 1427  fields (not necessarily in this order):
1427    const unsigned char *<i>tables</i>;    const unsigned char *<i>tables</i>;
1428    unsigned char **<i>mark</i>;    unsigned char **<i>mark</i>;
1429  </pre>  </pre>
1430    In the 16-bit version of this structure, the <i>mark</i> field has type
1431    "PCRE_UCHAR16 **".
1432    </P>
1433    <P>
1434  The <i>flags</i> field is a bitmap that specifies which of the other fields  The <i>flags</i> field is a bitmap that specifies which of the other fields
1435  are set. The flag bits are:  are set. The flag bits are:
1436  <pre>  <pre>
# Line 1482  documentation for a discussion of saving Line 1521  documentation for a discussion of saving
1521  </P>  </P>
1522  <P>  <P>
1523  If PCRE_EXTRA_MARK is set in the <i>flags</i> field, the <i>mark</i> field must  If PCRE_EXTRA_MARK is set in the <i>flags</i> field, the <i>mark</i> field must
1524  be set to point to a <b>char *</b> variable. If the pattern contains any  be set to point to a suitable variable. If the pattern contains any
1525  backtracking control verbs such as (*MARK:NAME), and the execution ends up with  backtracking control verbs such as (*MARK:NAME), and the execution ends up with
1526  a name to pass back, a pointer to the name string (zero terminated) is placed  a name to pass back, a pointer to the name string (zero terminated) is placed
1527  in the variable pointed to by the <i>mark</i> field. The names are within the  in the variable pointed to by the <i>mark</i> field. The names are within the
1528  compiled pattern; if you wish to retain such a name you must copy it before  compiled pattern; if you wish to retain such a name you must copy it before
1529  freeing the memory of a compiled pattern. If there is no name to pass back, the  freeing the memory of a compiled pattern. If there is no name to pass back, the
1530  variable pointed to by the <i>mark</i> field set to NULL. For details of the  variable pointed to by the <i>mark</i> field is set to NULL. For details of the
1531  backtracking control verbs, see the section entitled  backtracking control verbs, see the section entitled
1532  <a href="pcrepattern.html#backtrackcontrol">"Backtracking control"</a>  <a href="pcrepattern.html#backtrackcontrol">"Backtracking control"</a>
1533  in the  in the
# Line 1671  string is automatically checked when Line 1710  string is automatically checked when
1710  The value of <i>startoffset</i> is also checked to ensure that it points to the  The value of <i>startoffset</i> is also checked to ensure that it points to the
1711  start of a UTF-8 character. There is a discussion about the validity of UTF-8  start of a UTF-8 character. There is a discussion about the validity of UTF-8
1712  strings in the  strings in the
1713  <a href="pcre.html#utf8strings">section on UTF-8 support</a>  <a href="pcreunicode.html"><b>pcreunicode</b></a>
1714  in the main  page. If an invalid sequence of bytes is found, <b>pcre_exec()</b> returns the
1715  <a href="pcre.html"><b>pcre</b></a>  error PCRE_ERROR_BADUTF8 or, if PCRE_PARTIAL_HARD is set and the problem is a
1716  page. If an invalid UTF-8 sequence of bytes is found, <b>pcre_exec()</b> returns  truncated character at the end of the subject, PCRE_ERROR_SHORTUTF8. In both
1717  the error PCRE_ERROR_BADUTF8 or, if PCRE_PARTIAL_HARD is set and the problem is  cases, information about the precise nature of the error may also be returned
1718  a truncated UTF-8 character at the end of the subject, PCRE_ERROR_SHORTUTF8. In  (see the descriptions of these errors in the section entitled <i>Error return
1719  both cases, information about the precise nature of the error may also be  values from</i> <b>pcre_exec()</b>
 returned (see the descriptions of these errors in the section entitled <i>Error  
 return values from</i> <b>pcre_exec()</b>  
1720  <a href="#errorlist">below).</a>  <a href="#errorlist">below).</a>
1721  If <i>startoffset</i> contains a value that does not point to the start of a  If <i>startoffset</i> contains a value that does not point to the start of a
1722  UTF-8 character (or to the end of the subject), PCRE_ERROR_BADUTF8_OFFSET is  UTF-8 character (or to the end of the subject), PCRE_ERROR_BADUTF8_OFFSET is
# Line 1691  checks for performance reasons, you can Line 1728  checks for performance reasons, you can
1728  calling <b>pcre_exec()</b>. You might want to do this for the second and  calling <b>pcre_exec()</b>. You might want to do this for the second and
1729  subsequent calls to <b>pcre_exec()</b> if you are making repeated calls to find  subsequent calls to <b>pcre_exec()</b> if you are making repeated calls to find
1730  all the matches in a single subject string. However, you should be sure that  all the matches in a single subject string. However, you should be sure that
1731  the value of <i>startoffset</i> points to the start of a UTF-8 character (or the  the value of <i>startoffset</i> points to the start of a character (or the end
1732  end of the subject). When PCRE_NO_UTF8_CHECK is set, the effect of passing an  of the subject). When PCRE_NO_UTF8_CHECK is set, the effect of passing an
1733  invalid UTF-8 string as a subject or an invalid value of <i>startoffset</i> is  invalid string as a subject or an invalid value of <i>startoffset</i> is
1734  undefined. Your program may crash.  undefined. Your program may crash.
1735  <pre>  <pre>
1736    PCRE_PARTIAL_HARD    PCRE_PARTIAL_HARD
# Line 1728  The string to be matched by pcre_exec Line 1765  The string to be matched by pcre_exec
1765  </b><br>  </b><br>
1766  <P>  <P>
1767  The subject string is passed to <b>pcre_exec()</b> as a pointer in  The subject string is passed to <b>pcre_exec()</b> as a pointer in
1768  <i>subject</i>, a length (in bytes) in <i>length</i>, and a starting byte offset  <i>subject</i>, a length in bytes in <i>length</i>, and a starting byte offset
1769  in <i>startoffset</i>. If this is negative or greater than the length of the  in <i>startoffset</i>. If this is negative or greater than the length of the
1770  subject, <b>pcre_exec()</b> returns PCRE_ERROR_BADOFFSET. When the starting  subject, <b>pcre_exec()</b> returns PCRE_ERROR_BADOFFSET. When the starting
1771  offset is zero, the search for a match starts at the beginning of the subject,  offset is zero, the search for a match starts at the beginning of the subject,
# Line 2027  PCRE_STUDY_JIT_COMPILE option is being m Line 2064  PCRE_STUDY_JIT_COMPILE option is being m
2064  the just-in-time processing stack is not large enough. See the  the just-in-time processing stack is not large enough. See the
2065  <a href="pcrejit.html"><b>pcrejit</b></a>  <a href="pcrejit.html"><b>pcrejit</b></a>
2066  documentation for more details.  documentation for more details.
2067    <pre>
2068      PCRE_ERROR_BADMODE (-28)
2069    </pre>
2070    This error is given if a pattern that was compiled by the 8-bit library is
2071    passed to a 16-bit library function, or vice versa.
2072    <pre>
2073      PCRE_ERROR_BADENDIANNESS (-29)
2074    </pre>
2075    This error is given if a pattern that was compiled and saved is reloaded on a
2076    host with different endianness. The utility function
2077    <b>pcre_pattern_to_host_byte_order()</b> can be used to convert such a pattern
2078    so that it runs on the new host.
2079  </P>  </P>
2080  <P>  <P>
2081  Error numbers -16 to -20 and -22 are not used by <b>pcre_exec()</b>.  Error numbers -16 to -20 and -22 are not used by <b>pcre_exec()</b>.
# Line 2035  Error numbers -16 to -20 and -22 are not Line 2084  Error numbers -16 to -20 and -22 are not
2084  Reason codes for invalid UTF-8 strings  Reason codes for invalid UTF-8 strings
2085  </b><br>  </b><br>
2086  <P>  <P>
2087    This section applies only to the 8-bit library. The corresponding information
2088    for the 16-bit library is given in the
2089    <a href="pcre16.html"><b>pcre16</b></a>
2090    page.
2091    </P>
2092    <P>
2093  When <b>pcre_exec()</b> returns either PCRE_ERROR_BADUTF8 or  When <b>pcre_exec()</b> returns either PCRE_ERROR_BADUTF8 or
2094  PCRE_ERROR_SHORTUTF8, and the size of the output vector (<i>ovecsize</i>) is at  PCRE_ERROR_SHORTUTF8, and the size of the output vector (<i>ovecsize</i>) is at
2095  least 2, the offset of the start of the invalid UTF-8 character is placed in  least 2, the offset of the start of the invalid UTF-8 character is placed in
# Line 2104  character. Line 2159  character.
2159  The first byte of a character has the value 0xfe or 0xff. These values can  The first byte of a character has the value 0xfe or 0xff. These values can
2160  never occur in a valid UTF-8 string.  never occur in a valid UTF-8 string.
2161  </P>  </P>
2162  <br><a name="SEC17" href="#TOC1">EXTRACTING CAPTURED SUBSTRINGS BY NUMBER</a><br>  <br><a name="SEC18" href="#TOC1">EXTRACTING CAPTURED SUBSTRINGS BY NUMBER</a><br>
2163  <P>  <P>
2164  <b>int pcre_copy_substring(const char *<i>subject</i>, int *<i>ovector</i>,</b>  <b>int pcre_copy_substring(const char *<i>subject</i>, int *<i>ovector</i>,</b>
2165  <b>int <i>stringcount</i>, int <i>stringnumber</i>, char *<i>buffer</i>,</b>  <b>int <i>stringcount</i>, int <i>stringnumber</i>, char *<i>buffer</i>,</b>
# Line 2199  linked via a special interface to anothe Line 2254  linked via a special interface to anothe
2254  <b>pcre_free</b> directly; it is for these cases that the functions are  <b>pcre_free</b> directly; it is for these cases that the functions are
2255  provided.  provided.
2256  </P>  </P>
2257  <br><a name="SEC18" href="#TOC1">EXTRACTING CAPTURED SUBSTRINGS BY NAME</a><br>  <br><a name="SEC19" href="#TOC1">EXTRACTING CAPTURED SUBSTRINGS BY NAME</a><br>
2258  <P>  <P>
2259  <b>int pcre_get_stringnumber(const pcre *<i>code</i>,</b>  <b>int pcre_get_stringnumber(const pcre *<i>code</i>,</b>
2260  <b>const char *<i>name</i>);</b>  <b>const char *<i>name</i>);</b>
# Line 2263  names are not included in the compiled c Line 2318  names are not included in the compiled c
2318  numbers. For this reason, the use of different names for subpatterns of the  numbers. For this reason, the use of different names for subpatterns of the
2319  same number causes an error at compile time.  same number causes an error at compile time.
2320  </P>  </P>
2321  <br><a name="SEC19" href="#TOC1">DUPLICATE SUBPATTERN NAMES</a><br>  <br><a name="SEC20" href="#TOC1">DUPLICATE SUBPATTERN NAMES</a><br>
2322  <P>  <P>
2323  <b>int pcre_get_stringtable_entries(const pcre *<i>code</i>,</b>  <b>int pcre_get_stringtable_entries(const pcre *<i>code</i>,</b>
2324  <b>const char *<i>name</i>, char **<i>first</i>, char **<i>last</i>);</b>  <b>const char *<i>name</i>, char **<i>first</i>, char **<i>last</i>);</b>
# Line 2301  described above in the section entitled Line 2356  described above in the section entitled
2356  Given all the relevant entries for the name, you can extract each of their  Given all the relevant entries for the name, you can extract each of their
2357  numbers, and hence the captured data, if any.  numbers, and hence the captured data, if any.
2358  </P>  </P>
2359  <br><a name="SEC20" href="#TOC1">FINDING ALL POSSIBLE MATCHES</a><br>  <br><a name="SEC21" href="#TOC1">FINDING ALL POSSIBLE MATCHES</a><br>
2360  <P>  <P>
2361  The traditional matching function uses a similar algorithm to Perl, which stops  The traditional matching function uses a similar algorithm to Perl, which stops
2362  when it finds the first match, starting at a given point in the subject. If you  when it finds the first match, starting at a given point in the subject. If you
# Line 2320  substring. Then return 1, which forces < Line 2375  substring. Then return 1, which forces <
2375  other alternatives. Ultimately, when it runs out of matches, <b>pcre_exec()</b>  other alternatives. Ultimately, when it runs out of matches, <b>pcre_exec()</b>
2376  will yield PCRE_ERROR_NOMATCH.  will yield PCRE_ERROR_NOMATCH.
2377  <a name="dfamatch"></a></P>  <a name="dfamatch"></a></P>
2378  <br><a name="SEC21" href="#TOC1">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a><br>  <br><a name="SEC22" href="#TOC1">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a><br>
2379  <P>  <P>
2380  <b>int pcre_dfa_exec(const pcre *<i>code</i>, const pcre_extra *<i>extra</i>,</b>  <b>int pcre_dfa_exec(const pcre *<i>code</i>, const pcre_extra *<i>extra</i>,</b>
2381  <b>const char *<i>subject</i>, int <i>length</i>, int <i>startoffset</i>,</b>  <b>const char *<i>subject</i>, int <i>length</i>, int <i>startoffset</i>,</b>
# Line 2495  recursively, using private vectors for < Line 2550  recursively, using private vectors for <
2550  error is given if the output vector is not large enough. This should be  error is given if the output vector is not large enough. This should be
2551  extremely rare, as a vector of size 1000 is used.  extremely rare, as a vector of size 1000 is used.
2552  </P>  </P>
2553  <br><a name="SEC22" href="#TOC1">SEE ALSO</a><br>  <br><a name="SEC23" href="#TOC1">SEE ALSO</a><br>
2554  <P>  <P>
2555  <b>pcrebuild</b>(3), <b>pcrecallout</b>(3), <b>pcrecpp</b>(3),  <b>pcre16</b>(3), <b>pcrebuild</b>(3), <b>pcrecallout</b>(3), <b>pcrecpp</b>(3),
2556  <b>pcrematching</b>(3), <b>pcrepartial</b>(3), <b>pcreposix</b>(3),  <b>pcrematching</b>(3), <b>pcrepartial</b>(3), <b>pcreposix</b>(3),
2557  <b>pcreprecompile</b>(3), <b>pcresample</b>(3), <b>pcrestack</b>(3).  <b>pcreprecompile</b>(3), <b>pcresample</b>(3), <b>pcrestack</b>(3).
2558  </P>  </P>
2559  <br><a name="SEC23" href="#TOC1">AUTHOR</a><br>  <br><a name="SEC24" href="#TOC1">AUTHOR</a><br>
2560  <P>  <P>
2561  Philip Hazel  Philip Hazel
2562  <br>  <br>
# Line 2510  University Computing Service Line 2565  University Computing Service
2565  Cambridge CB2 3QH, England.  Cambridge CB2 3QH, England.
2566  <br>  <br>
2567  </P>  </P>
2568  <br><a name="SEC24" href="#TOC1">REVISION</a><br>  <br><a name="SEC25" href="#TOC1">REVISION</a><br>
2569  <P>  <P>
2570  Last updated: 02 December 2011  Last updated: 07 January 2012
2571  <br>  <br>
2572  Copyright &copy; 1997-2011 University of Cambridge.  Copyright &copy; 1997-2012 University of Cambridge.
2573  <br>  <br>
2574  <p>  <p>
2575  Return to the <a href="index.html">PCRE index page</a>.  Return to the <a href="index.html">PCRE index page</a>.

Legend:
Removed from v.868  
changed lines
  Added in v.869

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12