/[pcre]/code/trunk/doc/pcreapi.3
ViewVC logotype

Contents of /code/trunk/doc/pcreapi.3

Parent Directory | Revision Log


Revision 1339 - (hide annotations) (download)
Fri Jun 14 09:09:28 2013 UTC (5 days, 8 hours ago) by ph10
File size: 125756 byte(s)
Update man pages to avoid the use of .ti (which causes difficulties for some 
translation software).

1 ph10 1339 .TH PCREAPI 3 "12 June 2013" "PCRE 8.33"
2 nigel 63 .SH NAME
3     PCRE - Perl-compatible regular expressions
4 ph10 856 .sp
5     .B #include <pcre.h>
6     .
7     .
8 ph10 678 .SH "PCRE NATIVE API BASIC FUNCTIONS"
9 nigel 63 .rs
10     .sp
11 ph10 1339 .nf
12 nigel 75 .B pcre *pcre_compile(const char *\fIpattern\fP, int \fIoptions\fP,
13 ph10 1339 .B " const char **\fIerrptr\fP, int *\fIerroffset\fP,"
14     .B " const unsigned char *\fItableptr\fP);"
15     .sp
16 nigel 77 .B pcre *pcre_compile2(const char *\fIpattern\fP, int \fIoptions\fP,
17 ph10 1339 .B " int *\fIerrorcodeptr\fP,"
18     .B " const char **\fIerrptr\fP, int *\fIerroffset\fP,"
19     .B " const unsigned char *\fItableptr\fP);"
20     .sp
21 nigel 75 .B pcre_extra *pcre_study(const pcre *\fIcode\fP, int \fIoptions\fP,
22 ph10 1339 .B " const char **\fIerrptr\fP);"
23     .sp
24 ph10 678 .B void pcre_free_study(pcre_extra *\fIextra\fP);
25 ph10 1339 .sp
26 nigel 75 .B int pcre_exec(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
27 ph10 1339 .B " const char *\fIsubject\fP, int \fIlength\fP, int \fIstartoffset\fP,"
28     .B " int \fIoptions\fP, int *\fIovector\fP, int \fIovecsize\fP);"
29     .sp
30 nigel 77 .B int pcre_dfa_exec(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
31 ph10 1339 .B " const char *\fIsubject\fP, int \fIlength\fP, int \fIstartoffset\fP,"
32     .B " int \fIoptions\fP, int *\fIovector\fP, int \fIovecsize\fP,"
33     .B " int *\fIworkspace\fP, int \fIwscount\fP);"
34     .fi
35 ph10 856 .
36     .
37     .SH "PCRE NATIVE API STRING EXTRACTION FUNCTIONS"
38     .rs
39     .sp
40 ph10 1339 .nf
41 nigel 75 .B int pcre_copy_named_substring(const pcre *\fIcode\fP,
42 ph10 1339 .B " const char *\fIsubject\fP, int *\fIovector\fP,"
43     .B " int \fIstringcount\fP, const char *\fIstringname\fP,"
44     .B " char *\fIbuffer\fP, int \fIbuffersize\fP);"
45     .sp
46 nigel 75 .B int pcre_copy_substring(const char *\fIsubject\fP, int *\fIovector\fP,
47 ph10 1339 .B " int \fIstringcount\fP, int \fIstringnumber\fP, char *\fIbuffer\fP,"
48     .B " int \fIbuffersize\fP);"
49     .sp
50 nigel 75 .B int pcre_get_named_substring(const pcre *\fIcode\fP,
51 ph10 1339 .B " const char *\fIsubject\fP, int *\fIovector\fP,"
52     .B " int \fIstringcount\fP, const char *\fIstringname\fP,"
53     .B " const char **\fIstringptr\fP);"
54     .sp
55 nigel 75 .B int pcre_get_stringnumber(const pcre *\fIcode\fP,
56 ph10 1339 .B " const char *\fIname\fP);"
57     .sp
58 nigel 91 .B int pcre_get_stringtable_entries(const pcre *\fIcode\fP,
59 ph10 1339 .B " const char *\fIname\fP, char **\fIfirst\fP, char **\fIlast\fP);"
60     .sp
61 nigel 75 .B int pcre_get_substring(const char *\fIsubject\fP, int *\fIovector\fP,
62 ph10 1339 .B " int \fIstringcount\fP, int \fIstringnumber\fP,"
63     .B " const char **\fIstringptr\fP);"
64     .sp
65 nigel 75 .B int pcre_get_substring_list(const char *\fIsubject\fP,
66 ph10 1339 .B " int *\fIovector\fP, int \fIstringcount\fP, const char ***\fIlistptr\fP);"
67     .sp
68 nigel 75 .B void pcre_free_substring(const char *\fIstringptr\fP);
69 ph10 1339 .sp
70 nigel 75 .B void pcre_free_substring_list(const char **\fIstringptr\fP);
71 ph10 1339 .fi
72 ph10 856 .
73     .
74     .SH "PCRE NATIVE API AUXILIARY FUNCTIONS"
75     .rs
76     .sp
77 ph10 1339 .nf
78 ph10 1194 .B int pcre_jit_exec(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
79 ph10 1339 .B " const char *\fIsubject\fP, int \fIlength\fP, int \fIstartoffset\fP,"
80     .B " int \fIoptions\fP, int *\fIovector\fP, int \fIovecsize\fP,"
81     .B " pcre_jit_stack *\fIjstack\fP);"
82     .sp
83 ph10 856 .B pcre_jit_stack *pcre_jit_stack_alloc(int \fIstartsize\fP, int \fImaxsize\fP);
84 ph10 1339 .sp
85 ph10 856 .B void pcre_jit_stack_free(pcre_jit_stack *\fIstack\fP);
86 ph10 1339 .sp
87 ph10 856 .B void pcre_assign_jit_stack(pcre_extra *\fIextra\fP,
88 ph10 1339 .B " pcre_jit_callback \fIcallback\fP, void *\fIdata\fP);"
89     .sp
90 nigel 63 .B const unsigned char *pcre_maketables(void);
91 ph10 1339 .sp
92 nigel 75 .B int pcre_fullinfo(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
93 ph10 1339 .B " int \fIwhat\fP, void *\fIwhere\fP);"
94     .sp
95 nigel 77 .B int pcre_refcount(pcre *\fIcode\fP, int \fIadjust\fP);
96 ph10 1339 .sp
97 nigel 75 .B int pcre_config(int \fIwhat\fP, void *\fIwhere\fP);
98 ph10 1339 .sp
99 ph10 856 .B const char *pcre_version(void);
100 ph10 1339 .sp
101 ph10 856 .B int pcre_pattern_to_host_byte_order(pcre *\fIcode\fP,
102 ph10 1339 .B " pcre_extra *\fIextra\fP, const unsigned char *\fItables\fP);"
103     .fi
104 ph10 686 .
105     .
106     .SH "PCRE NATIVE API INDIRECTED FUNCTIONS"
107     .rs
108     .sp
109 ph10 1339 .nf
110 nigel 63 .B void *(*pcre_malloc)(size_t);
111 ph10 1339 .sp
112 nigel 63 .B void (*pcre_free)(void *);
113 ph10 1339 .sp
114 nigel 73 .B void *(*pcre_stack_malloc)(size_t);
115 ph10 1339 .sp
116 nigel 73 .B void (*pcre_stack_free)(void *);
117 ph10 1339 .sp
118 nigel 63 .B int (*pcre_callout)(pcre_callout_block *);
119 ph10 1339 .fi
120 nigel 75 .
121     .
122 ph10 1214 .SH "PCRE 8-BIT, 16-BIT, AND 32-BIT LIBRARIES"
123 ph10 856 .rs
124     .sp
125 ph10 1214 As well as support for 8-bit character strings, PCRE also supports 16-bit
126     strings (from release 8.30) and 32-bit strings (from release 8.32), by means of
127     two additional libraries. They can be built as well as, or instead of, the
128     8-bit library. To avoid too much complication, this document describes the
129     8-bit versions of the functions, with only occasional references to the 16-bit
130     and 32-bit libraries.
131 ph10 856 .P
132 ph10 1191 The 16-bit and 32-bit functions operate in the same way as their 8-bit
133     counterparts; they just use different data types for their arguments and
134     results, and their names start with \fBpcre16_\fP or \fBpcre32_\fP instead of
135     \fBpcre_\fP. For every option that has UTF8 in its name (for example,
136     PCRE_UTF8), there are corresponding 16-bit and 32-bit names with UTF8 replaced
137     by UTF16 or UTF32, respectively. This facility is in fact just cosmetic; the
138     16-bit and 32-bit option names define the same bit values.
139 ph10 856 .P
140     References to bytes and UTF-8 in this document should be read as references to
141 ph10 1328 16-bit data units and UTF-16 when using the 16-bit library, or 32-bit data
142     units and UTF-32 when using the 32-bit library, unless specified otherwise.
143     More details of the specific differences for the 16-bit and 32-bit libraries
144     are given in the
145 ph10 856 .\" HREF
146     \fBpcre16\fP
147     .\"
148 ph10 1191 and
149 chpe 1055 .\" HREF
150     \fBpcre32\fP
151     .\"
152 ph10 1191 pages.
153 ph10 856 .
154 chpe 1055 .
155 nigel 75 .SH "PCRE API OVERVIEW"
156 nigel 63 .rs
157     .sp
158 nigel 93 PCRE has its own native API, which is described in this document. There are
159 ph10 856 also some wrapper functions (for the 8-bit library only) that correspond to the
160     POSIX regular expression API, but they do not give access to all the
161     functionality. They are described in the
162 nigel 75 .\" HREF
163     \fBpcreposix\fP
164     .\"
165 nigel 77 documentation. Both of these APIs define a set of C function calls. A C++
166 ph10 856 wrapper (again for the 8-bit library only) is also distributed with PCRE. It is
167     documented in the
168 nigel 77 .\" HREF
169     \fBpcrecpp\fP
170     .\"
171     page.
172 nigel 75 .P
173 nigel 77 The native API C function prototypes are defined in the header file
174 ph10 856 \fBpcre.h\fP, and on Unix-like systems the (8-bit) library itself is called
175     \fBlibpcre\fP. It can normally be accessed by adding \fB-lpcre\fP to the
176     command for linking an application that uses PCRE. The header file defines the
177     macros PCRE_MAJOR and PCRE_MINOR to contain the major and minor release numbers
178     for the library. Applications can use these to include support for different
179     releases of PCRE.
180 nigel 75 .P
181 ph10 525 In a Windows environment, if you want to statically link an application program
182     against a non-dll \fBpcre.a\fP file, you must define PCRE_STATIC before
183 ph10 526 including \fBpcre.h\fP or \fBpcrecpp.h\fP, because otherwise the
184     \fBpcre_malloc()\fP and \fBpcre_free()\fP exported functions will be declared
185 ph10 525 \fB__declspec(dllimport)\fP, with unwanted results.
186     .P
187 nigel 77 The functions \fBpcre_compile()\fP, \fBpcre_compile2()\fP, \fBpcre_study()\fP,
188     and \fBpcre_exec()\fP are used for compiling and matching regular expressions
189     in a Perl-compatible manner. A sample program that demonstrates the simplest
190 ph10 429 way of using them is provided in the file called \fIpcredemo.c\fP in the PCRE
191     source distribution. A listing of this program is given in the
192 nigel 75 .\" HREF
193 ph10 429 \fBpcredemo\fP
194     .\"
195     documentation, and the
196     .\" HREF
197 nigel 75 \fBpcresample\fP
198     .\"
199 ph10 312 documentation describes how to compile and run it.
200 nigel 75 .P
201 ph10 678 Just-in-time compiler support is an optional feature of PCRE that can be built
202 ph10 686 in appropriate hardware environments. It greatly speeds up the matching
203     performance of many patterns. Simple programs can easily request that it be
204     used if available, by setting an option that is ignored when it is not
205     relevant. More complicated programs might need to make use of the functions
206     \fBpcre_jit_stack_alloc()\fP, \fBpcre_jit_stack_free()\fP, and
207     \fBpcre_assign_jit_stack()\fP in order to control the JIT code's memory usage.
208 ph10 1191 .P
209     From release 8.32 there is also a direct interface for JIT execution, which
210     gives improved performance. The JIT-specific functions are discussed in the
211 ph10 678 .\" HREF
212     \fBpcrejit\fP
213     .\"
214     documentation.
215     .P
216 nigel 77 A second matching function, \fBpcre_dfa_exec()\fP, which is not
217     Perl-compatible, is also provided. This uses a different algorithm for the
218 nigel 91 matching. The alternative algorithm finds all possible matches (at a given
219 ph10 435 point in the subject), and scans the subject just once (unless there are
220     lookbehind assertions). However, this algorithm does not return captured
221     substrings. A description of the two matching algorithms and their advantages
222     and disadvantages is given in the
223 nigel 77 .\" HREF
224     \fBpcrematching\fP
225     .\"
226     documentation.
227     .P
228 nigel 75 In addition to the main compiling and matching functions, there are convenience
229 nigel 77 functions for extracting captured substrings from a subject string that is
230     matched by \fBpcre_exec()\fP. They are:
231 nigel 75 .sp
232     \fBpcre_copy_substring()\fP
233     \fBpcre_copy_named_substring()\fP
234     \fBpcre_get_substring()\fP
235     \fBpcre_get_named_substring()\fP
236     \fBpcre_get_substring_list()\fP
237     \fBpcre_get_stringnumber()\fP
238 nigel 91 \fBpcre_get_stringtable_entries()\fP
239 nigel 75 .sp
240     \fBpcre_free_substring()\fP and \fBpcre_free_substring_list()\fP are also
241 nigel 63 provided, to free the memory used for extracted strings.
242 nigel 75 .P
243     The function \fBpcre_maketables()\fP is used to build a set of character tables
244 nigel 77 in the current locale for passing to \fBpcre_compile()\fP, \fBpcre_exec()\fP,
245     or \fBpcre_dfa_exec()\fP. This is an optional facility that is provided for
246     specialist use. Most commonly, no special tables are passed, in which case
247     internal tables that are generated when PCRE is built are used.
248 nigel 75 .P
249     The function \fBpcre_fullinfo()\fP is used to find out information about a
250 ph10 856 compiled pattern. The function \fBpcre_version()\fP returns a pointer to a
251     string containing the version of PCRE and its date of release.
252 nigel 75 .P
253 nigel 77 The function \fBpcre_refcount()\fP maintains a reference count in a data block
254     containing a compiled pattern. This is provided for the benefit of
255     object-oriented applications.
256     .P
257 nigel 75 The global variables \fBpcre_malloc\fP and \fBpcre_free\fP initially contain
258     the entry points of the standard \fBmalloc()\fP and \fBfree()\fP functions,
259 nigel 63 respectively. PCRE calls the memory management functions via these variables,
260     so a calling program can replace them if it wishes to intercept the calls. This
261     should be done before calling any PCRE functions.
262 nigel 75 .P
263     The global variables \fBpcre_stack_malloc\fP and \fBpcre_stack_free\fP are also
264 nigel 73 indirections to memory management functions. These special functions are used
265     only when PCRE is compiled to use the heap for remembering data, instead of
266 nigel 91 recursive function calls, when running the \fBpcre_exec()\fP function. See the
267     .\" HREF
268     \fBpcrebuild\fP
269     .\"
270     documentation for details of how to do this. It is a non-standard way of
271     building PCRE, for use in environments that have limited stacks. Because of the
272     greater use of memory management, it runs more slowly. Separate functions are
273     provided so that special-purpose external code can be used for this case. When
274     used, these functions are always called in a stack-like manner (last obtained,
275     first freed), and always for memory blocks of the same size. There is a
276     discussion about PCRE's stack usage in the
277     .\" HREF
278     \fBpcrestack\fP
279     .\"
280     documentation.
281 nigel 75 .P
282     The global variable \fBpcre_callout\fP initially contains NULL. It can be set
283 nigel 63 by the caller to a "callout" function, which PCRE will then call at specified
284 nigel 75 points during a matching operation. Details are given in the
285     .\" HREF
286     \fBpcrecallout\fP
287     .\"
288 nigel 63 documentation.
289 nigel 75 .
290     .
291 ph10 227 .\" HTML <a name="newlines"></a>
292 nigel 91 .SH NEWLINES
293 nigel 93 .rs
294 nigel 91 .sp
295 ph10 149 PCRE supports five different conventions for indicating line breaks in
296 nigel 93 strings: a single CR (carriage return) character, a single LF (linefeed)
297 ph10 149 character, the two-character sequence CRLF, any of the three preceding, or any
298     Unicode newline sequence. The Unicode newline sequences are the three just
299 ph10 968 mentioned, plus the single characters VT (vertical tab, U+000B), FF (form feed,
300 ph10 149 U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and PS
301     (paragraph separator, U+2029).
302 nigel 93 .P
303     Each of the first three conventions is used by at least one operating system as
304     its standard newline sequence. When PCRE is built, a default can be specified.
305     If none is specified, the default is LF, which is the Unix standard. When PCRE is run, the
306     default can be overridden, either when a pattern is compiled, or when it is
307     matched.
308     .P
309 ph10 227 At compile time, the newline convention can be specified by the \fIoptions\fP
310     argument of \fBpcre_compile()\fP, or it can be specified by special text at the
311     start of the pattern itself; this overrides any other settings. See the
312     .\" HREF
313     \fBpcrepattern\fP
314     .\"
315     page for details of the special character sequences.
316     .P
317 nigel 91 In the PCRE documentation the word "newline" is used to mean "the character or
318 nigel 93 pair of characters that indicate a line break". The choice of newline
319     convention affects the handling of the dot, circumflex, and dollar
320     metacharacters, the handling of #-comments in /x mode, and, when CRLF is a
321     recognized line ending sequence, the match position advancement for a
322 ph10 226 non-anchored pattern. There is more detail about this in the
323     .\" HTML <a href="#execoptions">
324     .\" </a>
325     section on \fBpcre_exec()\fP options
326     .\"
327 ph10 231 below.
328     .P
329     The choice of newline convention does not affect the interpretation of
330     the \en or \er escape sequences, nor does it affect what \eR matches, which is
331     controlled in a similar way, but by separate options.
332 nigel 91 .
333     .
334 nigel 63 .SH MULTITHREADING
335     .rs
336     .sp
337     The PCRE functions can be used in multithreaded applications, with the
338 nigel 75 proviso that the memory management functions pointed to by \fBpcre_malloc\fP,
339     \fBpcre_free\fP, \fBpcre_stack_malloc\fP, and \fBpcre_stack_free\fP, and the
340     callout function pointed to by \fBpcre_callout\fP, are shared by all threads.
341     .P
342 nigel 63 The compiled form of a regular expression is not altered during matching, so
343     the same compiled pattern can safely be used by several threads at once.
344 ph10 678 .P
345 ph10 691 If the just-in-time optimization feature is being used, it needs separate
346 ph10 678 memory stack areas for each thread. See the
347     .\" HREF
348     \fBpcrejit\fP
349     .\"
350     documentation for more details.
351 nigel 75 .
352     .
353     .SH "SAVING PRECOMPILED PATTERNS FOR LATER USE"
354 nigel 63 .rs
355     .sp
356 nigel 75 The compiled form of a regular expression can be saved and re-used at a later
357     time, possibly by a different program, and even on a host other than the one on
358     which it was compiled. Details are given in the
359     .\" HREF
360     \fBpcreprecompile\fP
361     .\"
362 ph10 856 documentation, which includes a description of the
363     \fBpcre_pattern_to_host_byte_order()\fP function. However, compiling a regular
364     expression with one version of PCRE for use with a different version is not
365     guaranteed to work and may cause crashes.
366 nigel 75 .
367     .
368     .SH "CHECKING BUILD-TIME OPTIONS"
369     .rs
370     .sp
371     .B int pcre_config(int \fIwhat\fP, void *\fIwhere\fP);
372 nigel 63 .PP
373 nigel 75 The function \fBpcre_config()\fP makes it possible for a PCRE client to
374 nigel 63 discover which optional features have been compiled into the PCRE library. The
375     .\" HREF
376 nigel 75 \fBpcrebuild\fP
377 nigel 63 .\"
378     documentation has more details about these optional features.
379 nigel 75 .P
380     The first argument for \fBpcre_config()\fP is an integer, specifying which
381 nigel 63 information is required; the second argument is a pointer to a variable into
382 ph10 856 which the information is placed. The returned value is zero on success, or the
383     negative error code PCRE_ERROR_BADOPTION if the value in the first argument is
384     not recognized. The following information is available:
385 nigel 75 .sp
386 nigel 63 PCRE_CONFIG_UTF8
387 nigel 75 .sp
388 nigel 63 The output is an integer that is set to one if UTF-8 support is available;
389 chpe 1055 otherwise it is set to zero. This value should normally be given to the 8-bit
390     version of this function, \fBpcre_config()\fP. If it is given to the 16-bit
391     or 32-bit version of this function, the result is PCRE_ERROR_BADOPTION.
392 nigel 75 .sp
393 ph10 856 PCRE_CONFIG_UTF16
394     .sp
395     The output is an integer that is set to one if UTF-16 support is available;
396     otherwise it is set to zero. This value should normally be given to the 16-bit
397     version of this function, \fBpcre16_config()\fP. If it is given to the 8-bit
398 chpe 1055 or 32-bit version of this function, the result is PCRE_ERROR_BADOPTION.
399 ph10 856 .sp
400 chpe 1055 PCRE_CONFIG_UTF32
401     .sp
402     The output is an integer that is set to one if UTF-32 support is available;
403     otherwise it is set to zero. This value should normally be given to the 32-bit
404     version of this function, \fBpcre32_config()\fP. If it is given to the 8-bit
405     or 16-bit version of this function, the result is PCRE_ERROR_BADOPTION.
406     .sp
407 nigel 75 PCRE_CONFIG_UNICODE_PROPERTIES
408     .sp
409     The output is an integer that is set to one if support for Unicode character
410     properties is available; otherwise it is set to zero.
411     .sp
412 ph10 678 PCRE_CONFIG_JIT
413     .sp
414 ph10 691 The output is an integer that is set to one if support for just-in-time
415     compiling is available; otherwise it is set to zero.
416 ph10 678 .sp
417 ph10 887 PCRE_CONFIG_JITTARGET
418     .sp
419     The output is a pointer to a zero-terminated "const char *" string. If JIT
420 ph10 903 support is available, the string contains the name of the architecture for
421     which the JIT compiler is configured, for example "x86 32bit (little endian +
422 ph10 887 unaligned)". If JIT support is not available, the result is NULL.
423     .sp
424 nigel 63 PCRE_CONFIG_NEWLINE
425 nigel 75 .sp
426 nigel 91 The output is an integer whose value specifies the default character sequence
427 ph10 1221 that is recognized as meaning "newline". The values that are supported in
428 ph10 1031 ASCII/Unicode environments are: 10 for LF, 13 for CR, 3338 for CRLF, -2 for
429     ANYCRLF, and -1 for ANY. In EBCDIC environments, CR, ANYCRLF, and ANY yield the
430     same values. However, the value for LF is normally 21, though some EBCDIC
431     environments use 37. The corresponding values for CRLF are 3349 and 3365. The
432     default should normally correspond to the standard sequence for your operating
433     system.
434 nigel 75 .sp
435 ph10 231 PCRE_CONFIG_BSR
436     .sp
437     The output is an integer whose value indicates what character sequences the \eR
438     escape sequence matches by default. A value of 0 means that \eR matches any
439     Unicode line ending sequence; a value of 1 means that \eR matches only CR, LF,
440     or CRLF. The default can be overridden when a pattern is compiled or matched.
441     .sp
442 nigel 63 PCRE_CONFIG_LINK_SIZE
443 nigel 75 .sp
444 nigel 63 The output is an integer that contains the number of bytes used for internal
445 ph10 856 linkage in compiled regular expressions. For the 8-bit library, the value can
446     be 2, 3, or 4. For the 16-bit library, the value is either 2 or 4 and is still
447 chpe 1055 a number of bytes. For the 32-bit library, the value is either 2 or 4 and is
448     still a number of bytes. The default value of 2 is sufficient for all but the
449     most massive patterns, since it allows the compiled pattern to be up to 64K in
450     size. Larger values allow larger regular expressions to be compiled, at the
451     expense of slower matching.
452 nigel 75 .sp
453 nigel 63 PCRE_CONFIG_POSIX_MALLOC_THRESHOLD
454 nigel 75 .sp
455 nigel 63 The output is an integer that contains the threshold above which the POSIX
456 nigel 75 interface uses \fBmalloc()\fP for output vectors. Further details are given in
457     the
458     .\" HREF
459     \fBpcreposix\fP
460     .\"
461     documentation.
462     .sp
463 nigel 63 PCRE_CONFIG_MATCH_LIMIT
464 nigel 75 .sp
465 ph10 376 The output is a long integer that gives the default limit for the number of
466 nigel 75 internal matching function calls in a \fBpcre_exec()\fP execution. Further
467     details are given with \fBpcre_exec()\fP below.
468     .sp
469 nigel 87 PCRE_CONFIG_MATCH_LIMIT_RECURSION
470     .sp
471 ph10 376 The output is a long integer that gives the default limit for the depth of
472 nigel 87 recursion when calling the internal matching function in a \fBpcre_exec()\fP
473     execution. Further details are given with \fBpcre_exec()\fP below.
474     .sp
475 nigel 73 PCRE_CONFIG_STACKRECURSE
476 nigel 75 .sp
477 nigel 77 The output is an integer that is set to one if internal recursion when running
478     \fBpcre_exec()\fP is implemented by recursive function calls that use the stack
479     to remember their state. This is the usual way that PCRE is compiled. The
480     output is zero if PCRE was compiled to use blocks of data on the heap instead
481     of recursive function calls. In this case, \fBpcre_stack_malloc\fP and
482     \fBpcre_stack_free\fP are called to manage memory blocks on the heap, thus
483     avoiding the use of the stack.
484 nigel 75 .
485     .
486     .SH "COMPILING A PATTERN"
487 nigel 63 .rs
488     .sp
489 ph10 1339 .nf
490 nigel 75 .B pcre *pcre_compile(const char *\fIpattern\fP, int \fIoptions\fP,
491 ph10 1339 .B " const char **\fIerrptr\fP, int *\fIerroffset\fP,"
492     .B " const unsigned char *\fItableptr\fP);"
493 nigel 77 .sp
494     .B pcre *pcre_compile2(const char *\fIpattern\fP, int \fIoptions\fP,
495 ph10 1339 .B " int *\fIerrorcodeptr\fP,"
496     .B " const char **\fIerrptr\fP, int *\fIerroffset\fP,"
497     .B " const unsigned char *\fItableptr\fP);"
498     .fi
499 nigel 75 .P
500 nigel 77 Either of the functions \fBpcre_compile()\fP or \fBpcre_compile2()\fP can be
501     called to compile a pattern into an internal form. The only difference between
502     the two interfaces is that \fBpcre_compile2()\fP has an additional argument,
503 ph10 461 \fIerrorcodeptr\fP, via which a numerical error code can be returned. To avoid
504     too much repetition, we refer just to \fBpcre_compile()\fP below, but the
505 ph10 456 information applies equally to \fBpcre_compile2()\fP.
506 nigel 75 .P
507 nigel 77 The pattern is a C string terminated by a binary zero, and is passed in the
508     \fIpattern\fP argument. A pointer to a single block of memory that is obtained
509     via \fBpcre_malloc\fP is returned. This contains the compiled code and related
510     data. The \fBpcre\fP type is defined for the returned block; this is a typedef
511     for a structure whose contents are not externally defined. It is up to the
512 nigel 91 caller to free the memory (via \fBpcre_free\fP) when it is no longer required.
513 nigel 77 .P
514 nigel 63 Although the compiled code of a PCRE regex is relocatable, that is, it does not
515 nigel 75 depend on memory location, the complete \fBpcre\fP data block is not
516     fully relocatable, because it may contain a copy of the \fItableptr\fP
517     argument, which is an address (see below).
518     .P
519 nigel 93 The \fIoptions\fP argument contains various bit settings that affect the
520 nigel 75 compilation. It should be zero if no options are required. The available
521 ph10 412 options are described below. Some of them (in particular, those that are
522 ph10 456 compatible with Perl, but some others as well) can also be set and unset from
523 ph10 412 within the pattern (see the detailed description in the
524 nigel 75 .\" HREF
525     \fBpcrepattern\fP
526     .\"
527 ph10 412 documentation). For those options that can be different in different parts of
528 ph10 456 the pattern, the contents of the \fIoptions\fP argument specify their
529 ph10 461 settings at the start of compilation and execution. The PCRE_ANCHORED,
530 ph10 576 PCRE_BSR_\fIxxx\fP, PCRE_NEWLINE_\fIxxx\fP, PCRE_NO_UTF8_CHECK, and
531 ph10 930 PCRE_NO_START_OPTIMIZE options can be set at the time of matching as well as at
532 ph10 576 compile time.
533 nigel 75 .P
534     If \fIerrptr\fP is NULL, \fBpcre_compile()\fP returns NULL immediately.
535     Otherwise, if compilation of a pattern fails, \fBpcre_compile()\fP returns
536     NULL, and sets the variable pointed to by \fIerrptr\fP to point to a textual
537 nigel 87 error message. This is a static string that is part of the library. You must
538 ph10 598 not try to free it. Normally, the offset from the start of the pattern to the
539 ph10 1328 data unit that was being processed when the error was discovered is placed in
540     the variable pointed to by \fIerroffset\fP, which must not be NULL (if it is,
541     an immediate error is given). However, for an invalid UTF-8 or UTF-16 string,
542     the offset is that of the first data unit of the failing character.
543 nigel 75 .P
544 ph10 856 Some errors are not detected until the whole pattern has been scanned; in these
545     cases, the offset passed back is the length of the pattern. Note that the
546 ph10 1328 offset is in data units, not characters, even in a UTF mode. It may sometimes
547     point into the middle of a UTF-8 or UTF-16 character.
548 ph10 569 .P
549 nigel 77 If \fBpcre_compile2()\fP is used instead of \fBpcre_compile()\fP, and the
550     \fIerrorcodeptr\fP argument is not NULL, a non-zero error code number is
551     returned via this argument in the event of an error. This is in addition to the
552     textual error message. Error codes and messages are listed below.
553     .P
554 nigel 75 If the final argument, \fItableptr\fP, is NULL, PCRE uses a default set of
555     character tables that are built when PCRE is compiled, using the default C
556     locale. Otherwise, \fItableptr\fP must be an address that is the result of a
557     call to \fBpcre_maketables()\fP. This value is stored with the compiled
558     pattern, and used again by \fBpcre_exec()\fP, unless another table pointer is
559     passed to it. For more discussion, see the section on locale support below.
560     .P
561     This code fragment shows a typical straightforward call to \fBpcre_compile()\fP:
562     .sp
563 nigel 63 pcre *re;
564     const char *error;
565     int erroffset;
566     re = pcre_compile(
567     "^A.*Z", /* the pattern */
568     0, /* default options */
569     &error, /* for error message */
570     &erroffset, /* for error offset */
571     NULL); /* use default character tables */
572 nigel 75 .sp
573     The following names for option bits are defined in the \fBpcre.h\fP header
574     file:
575     .sp
576 nigel 63 PCRE_ANCHORED
577 nigel 75 .sp
578 nigel 63 If this bit is set, the pattern is forced to be "anchored", that is, it is
579 nigel 75 constrained to match only at the first matching point in the string that is
580 nigel 63 being searched (the "subject string"). This effect can also be achieved by
581     appropriate constructs in the pattern itself, which is the only way to do it in
582     Perl.
583 nigel 75 .sp
584     PCRE_AUTO_CALLOUT
585     .sp
586     If this bit is set, \fBpcre_compile()\fP automatically inserts callout items,
587     all with number 255, before each pattern item. For discussion of the callout
588     facility, see the
589     .\" HREF
590     \fBpcrecallout\fP
591     .\"
592     documentation.
593     .sp
594 ph10 231 PCRE_BSR_ANYCRLF
595     PCRE_BSR_UNICODE
596     .sp
597     These options (which are mutually exclusive) control what the \eR escape
598     sequence matches. The choice is either to match only CR, LF, or CRLF, or to
599     match any Unicode newline sequence. The default is specified when PCRE is
600     built. It can be overridden from within the pattern, or by setting an option
601     when a compiled pattern is matched.
602     .sp
603 nigel 63 PCRE_CASELESS
604 nigel 75 .sp
605 nigel 63 If this bit is set, letters in the pattern match both upper and lower case
606     letters. It is equivalent to Perl's /i option, and it can be changed within a
607 nigel 77 pattern by a (?i) option setting. In UTF-8 mode, PCRE always understands the
608     concept of case for characters whose values are less than 128, so caseless
609     matching is always possible. For characters with higher values, the concept of
610     case is supported if PCRE is compiled with Unicode property support, but not
611     otherwise. If you want to use caseless matching for characters 128 and above,
612     you must ensure that PCRE is compiled with Unicode property support as well as
613     with UTF-8 support.
614 nigel 75 .sp
615 nigel 63 PCRE_DOLLAR_ENDONLY
616 nigel 75 .sp
617 nigel 63 If this bit is set, a dollar metacharacter in the pattern matches only at the
618     end of the subject string. Without this option, a dollar also matches
619 nigel 91 immediately before a newline at the end of the string (but not before any other
620     newlines). The PCRE_DOLLAR_ENDONLY option is ignored if PCRE_MULTILINE is set.
621     There is no equivalent to this option in Perl, and no way to set it within a
622     pattern.
623 nigel 75 .sp
624 nigel 63 PCRE_DOTALL
625 nigel 75 .sp
626 ph10 572 If this bit is set, a dot metacharacter in the pattern matches a character of
627     any value, including one that indicates a newline. However, it only ever
628     matches one character, even if newlines are coded as CRLF. Without this option,
629     a dot does not match when the current position is at a newline. This option is
630     equivalent to Perl's /s option, and it can be changed within a pattern by a
631     (?s) option setting. A negative class such as [^a] always matches newline
632     characters, independent of the setting of this option.
633 nigel 75 .sp
634 nigel 91 PCRE_DUPNAMES
635     .sp
636     If this bit is set, names used to identify capturing subpatterns need not be
637     unique. This can be helpful for certain types of pattern when it is known that
638     only one instance of the named subpattern can ever be matched. There are more
639     details of named subpatterns below; see also the
640     .\" HREF
641     \fBpcrepattern\fP
642     .\"
643     documentation.
644     .sp
645 nigel 63 PCRE_EXTENDED
646 nigel 75 .sp
647 ph10 968 If this bit is set, white space data characters in the pattern are totally
648     ignored except when escaped or inside a character class. White space does not
649 nigel 63 include the VT character (code 11). In addition, characters between an
650 nigel 91 unescaped # outside a character class and the next newline, inclusive, are also
651     ignored. This is equivalent to Perl's /x option, and it can be changed within a
652     pattern by a (?x) option setting.
653 nigel 75 .P
654 ph10 598 Which characters are interpreted as newlines is controlled by the options
655     passed to \fBpcre_compile()\fP or by a special sequence at the start of the
656     pattern, as described in the section entitled
657 ph10 572 .\" HTML <a href="pcrepattern.html#newlines">
658     .\" </a>
659     "Newline conventions"
660     .\"
661     in the \fBpcrepattern\fP documentation. Note that the end of this type of
662     comment is a literal newline sequence in the pattern; escape sequences that
663     happen to represent a newline do not count.
664     .P
665 nigel 63 This option makes it possible to include comments inside complicated patterns.
666 ph10 968 Note, however, that this applies only to data characters. White space characters
667 nigel 63 may never appear within special character sequences in a pattern, for example
668 ph10 572 within the sequence (?( that introduces a conditional subpattern.
669 nigel 75 .sp
670 nigel 63 PCRE_EXTRA
671 nigel 75 .sp
672 nigel 63 This option was invented in order to turn on additional functionality of PCRE
673     that is incompatible with Perl, but it is currently of very little use. When
674     set, any backslash in a pattern that is followed by a letter that has no
675     special meaning causes an error, thus reserving these combinations for future
676     expansion. By default, as in Perl, a backslash followed by a letter with no
677 nigel 91 special meaning is treated as a literal. (Perl can, however, be persuaded to
678 ph10 513 give an error for this, by running it with the -w option.) There are at present
679     no other features controlled by this option. It can also be set by a (?X)
680     option setting within a pattern.
681 nigel 75 .sp
682 nigel 77 PCRE_FIRSTLINE
683     .sp
684     If this option is set, an unanchored pattern is required to match before or at
685 nigel 91 the first newline in the subject string, though the matched text may continue
686     over the newline.
687 nigel 77 .sp
688 ph10 336 PCRE_JAVASCRIPT_COMPAT
689     .sp
690 ph10 345 If this option is set, PCRE's behaviour is changed in some ways so that it is
691 ph10 336 compatible with JavaScript rather than Perl. The changes are as follows:
692     .P
693     (1) A lone closing square bracket in a pattern causes a compile-time error,
694     because this is illegal in JavaScript (by default it is treated as a data
695     character). Thus, the pattern AB]CD becomes illegal when this option is set.
696     .P
697     (2) At run time, a back reference to an unset subpattern group matches an empty
698 ph10 345 string (by default this causes the current matching alternative to fail). A
699     pattern such as (\e1)(a) succeeds when this option is set (assuming it can find
700     an "a" in the subject), whereas it fails by default, for Perl compatibility.
701 ph10 745 .P
702 ph10 836 (3) \eU matches an upper case "U" character; by default \eU causes a compile
703 ph10 745 time error (Perl uses \eU to upper case subsequent characters).
704     .P
705 ph10 836 (4) \eu matches a lower case "u" character unless it is followed by four
706     hexadecimal digits, in which case the hexadecimal number defines the code point
707     to match. By default, \eu causes a compile time error (Perl uses it to upper
708 ph10 745 case the following character).
709     .P
710 ph10 836 (5) \ex matches a lower case "x" character unless it is followed by two
711     hexadecimal digits, in which case the hexadecimal number defines the code point
712     to match. By default, as in Perl, a hexadecimal number is always expected after
713     \ex, but it may have zero, one, or two digits (so, for example, \exz matches a
714 ph10 745 binary zero character followed by z).
715 ph10 336 .sp
716 nigel 63 PCRE_MULTILINE
717 nigel 75 .sp
718 ph10 1326 By default, for the purposes of matching "start of line" and "end of line",
719     PCRE treats the subject string as consisting of a single line of characters,
720     even if it actually contains newlines. The "start of line" metacharacter (^)
721     matches only at the start of the string, and the "end of line" metacharacter
722     ($) matches only at the end of the string, or before a terminating newline
723     (except when PCRE_DOLLAR_ENDONLY is set). Note, however, that unless
724     PCRE_DOTALL is set, the "any character" metacharacter (.) does not match at a
725     newline. This behaviour (for ^, $, and dot) is the same as Perl.
726 nigel 75 .P
727 nigel 63 When PCRE_MULTILINE is set, the "start of line" and "end of line" constructs
728 nigel 91 match immediately following or immediately before internal newlines in the
729     subject string, respectively, as well as at the very start and end. This is
730     equivalent to Perl's /m option, and it can be changed within a pattern by a
731     (?m) option setting. If there are no newlines in a subject string, or no
732 nigel 63 occurrences of ^ or $ in a pattern, setting PCRE_MULTILINE has no effect.
733 nigel 75 .sp
734 ph10 1309 PCRE_NEVER_UTF
735     .sp
736     This option locks out interpretation of the pattern as UTF-8 (or UTF-16 or
737     UTF-32 in the 16-bit and 32-bit libraries). In particular, it prevents the
738     creator of the pattern from switching to UTF interpretation by starting the
739     pattern with (*UTF). This may be useful in applications that process patterns
740 ph10 1314 from external sources. The combination of PCRE_UTF8 and PCRE_NEVER_UTF also
741 ph10 1309 causes an error.
742     .sp
743 nigel 91 PCRE_NEWLINE_CR
744     PCRE_NEWLINE_LF
745     PCRE_NEWLINE_CRLF
746 ph10 150 PCRE_NEWLINE_ANYCRLF
747 nigel 93 PCRE_NEWLINE_ANY
748 nigel 91 .sp
749     These options override the default newline definition that was chosen when PCRE
750     was built. Setting the first or the second specifies that a newline is
751 nigel 93 indicated by a single character (CR or LF, respectively). Setting
752     PCRE_NEWLINE_CRLF specifies that a newline is indicated by the two-character
753 ph10 149 CRLF sequence. Setting PCRE_NEWLINE_ANYCRLF specifies that any of the three
754     preceding sequences should be recognized. Setting PCRE_NEWLINE_ANY specifies
755 ph10 1221 that any Unicode newline sequence should be recognized.
756 nigel 91 .P
757 ph10 1031 In an ASCII/Unicode environment, the Unicode newline sequences are the three
758     just mentioned, plus the single characters VT (vertical tab, U+000B), FF (form
759     feed, U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and PS
760     (paragraph separator, U+2029). For the 8-bit library, the last two are
761     recognized only in UTF-8 mode.
762     .P
763     When PCRE is compiled to run in an EBCDIC (mainframe) environment, the code for
764     CR is 0x0d, the same as ASCII. However, the character code for LF is normally
765 ph10 1221 0x15, though in some EBCDIC environments 0x25 is used. Whichever of these is
766     not LF is made to correspond to Unicode's NEL character. EBCDIC codes are all
767 ph10 1031 less than 256. For more details, see the
768     .\" HREF
769     \fBpcrebuild\fP
770     .\"
771     documentation.
772     .P
773 nigel 93 The newline setting in the options word uses three bits that are treated
774 ph10 149 as a number, giving eight possibilities. Currently only six are used (default
775     plus the five values above). This means that if you set more than one newline
776 nigel 93 option, the combination may or may not be sensible. For example,
777     PCRE_NEWLINE_CR with PCRE_NEWLINE_LF is equivalent to PCRE_NEWLINE_CRLF, but
778 ph10 149 other combinations may yield unused numbers and cause an error.
779 nigel 93 .P
780 ph10 572 The only time that a line break in a pattern is specially recognized when
781 ph10 968 compiling is when PCRE_EXTENDED is set. CR and LF are white space characters,
782 ph10 572 and so are ignored in this mode. Also, an unescaped # outside a character class
783     indicates a comment that lasts until after the next line break sequence. In
784     other circumstances, line break sequences in patterns are treated as literal
785     data.
786 nigel 93 .P
787     The newline option that is set at compile time becomes the default that is used
788     for \fBpcre_exec()\fP and \fBpcre_dfa_exec()\fP, but it can be overridden.
789 nigel 91 .sp
790 nigel 63 PCRE_NO_AUTO_CAPTURE
791 nigel 75 .sp
792 nigel 63 If this option is set, it disables the use of numbered capturing parentheses in
793     the pattern. Any opening parenthesis that is not followed by ? behaves as if it
794     were followed by ?: but named parentheses can still be used for capturing (and
795     they acquire numbers in the usual way). There is no equivalent of this option
796     in Perl.
797 nigel 75 .sp
798 ph10 1301 PCRE_NO_START_OPTIMIZE
799 ph10 576 .sp
800     This is an option that acts at matching time; that is, it is really an option
801     for \fBpcre_exec()\fP or \fBpcre_dfa_exec()\fP. If it is set at compile time,
802 ph10 1301 it is remembered with the compiled pattern and assumed at matching time. This
803     is necessary if you want to use JIT execution, because the JIT compiler needs
804     to know whether or not this option is set. For details see the discussion of
805     PCRE_NO_START_OPTIMIZE
806 ph10 576 .\" HTML <a href="#execoptions">
807     .\" </a>
808     below.
809     .\"
810     .sp
811 ph10 518 PCRE_UCP
812     .sp
813 ph10 572 This option changes the way PCRE processes \eB, \eb, \eD, \ed, \eS, \es, \eW,
814     \ew, and some of the POSIX character classes. By default, only ASCII characters
815     are recognized, but if PCRE_UCP is set, Unicode properties are used instead to
816     classify characters. More details are given in the section on
817 ph10 518 .\" HTML <a href="pcrepattern.html#genericchartypes">
818     .\" </a>
819     generic character types
820     .\"
821     in the
822     .\" HREF
823     \fBpcrepattern\fP
824     .\"
825     page. If you set PCRE_UCP, matching one of the items it affects takes much
826     longer. The option is available only if PCRE has been compiled with Unicode
827     property support.
828     .sp
829 nigel 63 PCRE_UNGREEDY
830 nigel 75 .sp
831 nigel 63 This option inverts the "greediness" of the quantifiers so that they are not
832     greedy by default, but become greedy if followed by "?". It is not compatible
833     with Perl. It can also be set by a (?U) option setting within the pattern.
834 nigel 75 .sp
835 nigel 63 PCRE_UTF8
836 nigel 75 .sp
837 nigel 63 This option causes PCRE to regard both the pattern and the subject as strings
838 ph10 856 of UTF-8 characters instead of single-byte strings. However, it is available
839     only when PCRE is built to include UTF support. If not, the use of this option
840     provokes an error. Details of how this option changes the behaviour of PCRE are
841     given in the
842 nigel 63 .\" HREF
843 ph10 678 \fBpcreunicode\fP
844 nigel 63 .\"
845     page.
846 nigel 75 .sp
847 nigel 71 PCRE_NO_UTF8_CHECK
848 nigel 75 .sp
849 ph10 1191 When PCRE_UTF8 is set, the validity of the pattern as a UTF-8 string is
850     automatically checked. There is a discussion about the
851 ph10 856 .\" HTML <a href="pcreunicode.html#utf8strings">
852 ph10 211 .\" </a>
853 ph10 903 validity of UTF-8 strings
854 ph10 211 .\"
855 ph10 856 in the
856 ph10 211 .\" HREF
857 ph10 856 \fBpcreunicode\fP
858 ph10 211 .\"
859 ph10 856 page. If an invalid UTF-8 sequence is found, \fBpcre_compile()\fP returns an
860     error. If you already know that your pattern is valid, and you want to skip
861     this check for performance reasons, you can set the PCRE_NO_UTF8_CHECK option.
862     When it is set, the effect of passing an invalid UTF-8 string as a pattern is
863     undefined. It may cause your program to crash. Note that this option can also
864     be passed to \fBpcre_exec()\fP and \fBpcre_dfa_exec()\fP, to suppress the
865 ph10 1221 validity checking of subject strings only. If the same string is being matched
866     many times, the option can be safely set for the second and subsequent
867 ph10 1191 matchings to improve performance.
868 nigel 75 .
869     .
870 nigel 77 .SH "COMPILATION ERROR CODES"
871     .rs
872     .sp
873     The following table lists the error codes that may be returned by
874     \fBpcre_compile2()\fP, along with the error messages that may be returned by
875 ph10 903 both compiling functions. Note that error messages are always 8-bit ASCII
876 chpe 1055 strings, even in 16-bit or 32-bit mode. As PCRE has developed, some error codes
877     have fallen out of use. To avoid confusion, they have not been re-used.
878 nigel 77 .sp
879     0 no error
880     1 \e at end of pattern
881     2 \ec at end of pattern
882     3 unrecognized character follows \e
883     4 numbers out of order in {} quantifier
884     5 number too big in {} quantifier
885     6 missing terminating ] for character class
886     7 invalid escape sequence in character class
887     8 range out of order in character class
888     9 nothing to repeat
889 nigel 93 10 [this code is not in use]
890 nigel 77 11 internal error: unexpected repeat
891 ph10 290 12 unrecognized character after (? or (?-
892 nigel 77 13 POSIX named classes are supported only within a class
893     14 missing )
894     15 reference to non-existent subpattern
895     16 erroffset passed as NULL
896     17 unknown option bit(s) set
897     18 missing ) after comment
898 nigel 93 19 [this code is not in use]
899 ph10 290 20 regular expression is too large
900 nigel 77 21 failed to get memory
901     22 unmatched parentheses
902     23 internal error: code overflow
903     24 unrecognized character after (?<
904     25 lookbehind assertion is not fixed length
905 nigel 91 26 malformed number or name after (?(
906 nigel 77 27 conditional group contains more than two branches
907     28 assertion expected after (?(
908 ph10 181 29 (?R or (?[+-]digits must be followed by )
909 nigel 77 30 unknown POSIX class name
910     31 POSIX collating elements are not supported
911 ph10 856 32 this version of PCRE is compiled without UTF support
912 nigel 93 33 [this code is not in use]
913 nigel 77 34 character value in \ex{...} sequence is too large
914     35 invalid condition (?(0)
915     36 \eC not allowed in lookbehind assertion
916 ph10 656 37 PCRE does not support \eL, \el, \eN{name}, \eU, or \eu
917 nigel 77 38 number after (?C is > 255
918     39 closing ) for (?C expected
919     40 recursive call could loop indefinitely
920     41 unrecognized character after (?P
921 nigel 93 42 syntax error in subpattern name (missing terminator)
922 nigel 91 43 two named subpatterns have the same name
923 ph10 856 44 invalid UTF-8 string (specifically UTF-8)
924 nigel 77 45 support for \eP, \ep, and \eX has not been compiled
925     46 malformed \eP or \ep sequence
926     47 unknown property name after \eP or \ep
927 nigel 91 48 subpattern name is too long (maximum 32 characters)
928 ph10 290 49 too many named subpatterns (maximum 10000)
929 ph10 202 50 [this code is not in use]
930 ph10 856 51 octal value is greater than \e377 in 8-bit non-UTF-8 mode
931 nigel 93 52 internal error: overran compiling workspace
932 ph10 548 53 internal error: previously-checked referenced subpattern
933 ph10 546 not found
934 nigel 93 54 DEFINE group contains more than one branch
935     55 repeating a DEFINE group is not allowed
936 ph10 231 56 inconsistent NEWLINE options
937 ph10 345 57 \eg is not followed by a braced, angle-bracketed, or quoted
938     name/number or by a plain number
939 ph10 336 58 a numbered reference must not be zero
940 ph10 510 59 an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)
941 ph10 1313 60 (*VERB) not recognized or malformed
942 ph10 290 61 number is too big
943     62 subpattern name expected
944 ph10 292 63 digit expected after (?+
945 ph10 336 64 ] is an invalid data character in JavaScript compatibility mode
946 ph10 548 65 different names for subpatterns of the same number are
947 ph10 546 not allowed
948 ph10 510 66 (*MARK) must have an argument
949 ph10 903 67 this version of PCRE is not compiled with Unicode property
950 ph10 856 support
951 ph10 656 68 \ec must be followed by an ASCII character
952 ph10 659 69 \ek is not followed by a braced, angle-bracketed, or quoted name
953 ph10 856 70 internal error: unknown opcode in find_fixedlength()
954     71 \eN is not supported in a class
955     72 too many forward references
956 ph10 903 73 disallowed Unicode code point (>= 0xd800 && <= 0xdfff)
957 ph10 856 74 invalid UTF-16 string (specifically UTF-16)
958 ph10 975 75 name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)
959 ph10 978 76 character value in \eu.... sequence is too large
960 chpe 1055 77 invalid UTF-32 string (specifically UTF-32)
961 ph10 290 .sp
962 ph10 292 The numbers 32 and 10000 in errors 48 and 49 are defaults; different values may
963 ph10 290 be used if the limits were changed when PCRE was built.
964 nigel 77 .
965     .
966 ph10 656 .\" HTML <a name="studyingapattern"></a>
967 nigel 75 .SH "STUDYING A PATTERN"
968 nigel 63 .rs
969     .sp
970 ph10 1339 .nf
971     .B pcre_extra *pcre_study(const pcre *\fIcode\fP, int \fIoptions\fP,
972     .B " const char **\fIerrptr\fP);"
973     .fi
974 nigel 63 .PP
975 nigel 75 If a compiled pattern is going to be used several times, it is worth spending
976     more time analyzing it in order to speed up the time taken for matching. The
977     function \fBpcre_study()\fP takes a pointer to a compiled pattern as its first
978     argument. If studying the pattern produces additional information that will
979     help speed up matching, \fBpcre_study()\fP returns a pointer to a
980     \fBpcre_extra\fP block, in which the \fIstudy_data\fP field points to the
981     results of the study.
982     .P
983     The returned value from \fBpcre_study()\fP can be passed directly to
984 ph10 455 \fBpcre_exec()\fP or \fBpcre_dfa_exec()\fP. However, a \fBpcre_extra\fP block
985     also contains other fields that can be set by the caller before the block is
986     passed; these are described
987 nigel 75 .\" HTML <a href="#extradata">
988     .\" </a>
989     below
990     .\"
991     in the section on matching a pattern.
992     .P
993 ph10 455 If studying the pattern does not produce any useful information,
994 ph10 1022 \fBpcre_study()\fP returns NULL by default. In that circumstance, if the
995     calling program wants to pass any of the other fields to \fBpcre_exec()\fP or
996 ph10 1221 \fBpcre_dfa_exec()\fP, it must set up its own \fBpcre_extra\fP block. However,
997 ph10 1022 if \fBpcre_study()\fP is called with the PCRE_STUDY_EXTRA_NEEDED option, it
998     returns a \fBpcre_extra\fP block even if studying did not find any additional
999     information. It may still return NULL, however, if an error occurs in
1000     \fBpcre_study()\fP.
1001 nigel 75 .P
1002 ph10 921 The second argument of \fBpcre_study()\fP contains option bits. There are three
1003 ph10 1022 further options in addition to PCRE_STUDY_EXTRA_NEEDED:
1004 ph10 921 .sp
1005     PCRE_STUDY_JIT_COMPILE
1006     PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE
1007     PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE
1008 ph10 975 .sp
1009 ph10 921 If any of these are set, and the just-in-time compiler is available, the
1010     pattern is further compiled into machine code that executes much faster than
1011     the \fBpcre_exec()\fP interpretive matching function. If the just-in-time
1012 ph10 1022 compiler is not available, these options are ignored. All undefined bits in the
1013 ph10 921 \fIoptions\fP argument must be zero.
1014 nigel 75 .P
1015 ph10 691 JIT compilation is a heavyweight optimization. It can take some time for
1016 ph10 678 patterns to be analyzed, and for one-off matches and simple patterns the
1017     benefit of faster execution might be offset by a much slower study time.
1018 ph10 691 Not all patterns can be optimized by the JIT compiler. For those that cannot be
1019 ph10 678 handled, matching automatically falls back to the \fBpcre_exec()\fP
1020     interpreter. For more details, see the
1021     .\" HREF
1022     \fBpcrejit\fP
1023     .\"
1024     documentation.
1025     .P
1026 nigel 75 The third argument for \fBpcre_study()\fP is a pointer for an error message. If
1027 nigel 63 studying succeeds (even if no data is returned), the variable it points to is
1028 nigel 87 set to NULL. Otherwise it is set to point to a textual error message. This is a
1029     static string that is part of the library. You must not try to free it. You
1030     should test the error pointer for NULL after calling \fBpcre_study()\fP, to be
1031     sure that it has run successfully.
1032 nigel 75 .P
1033 ph10 678 When you are finished with a pattern, you can free the memory used for the
1034     study data by calling \fBpcre_free_study()\fP. This function was added to the
1035     API for release 8.20. For earlier versions, the memory could be freed with
1036     \fBpcre_free()\fP, just like the pattern itself. This will still work in cases
1037 ph10 921 where JIT optimization is not used, but it is advisable to change to the new
1038     function when convenient.
1039 ph10 678 .P
1040 ph10 691 This is a typical way in which \fBpcre_study()\fP is used (except that in a
1041 ph10 678 real application there should be tests for errors):
1042 nigel 75 .sp
1043 ph10 678 int rc;
1044     pcre *re;
1045     pcre_extra *sd;
1046     re = pcre_compile("pattern", 0, &error, &erroroffset, NULL);
1047     sd = pcre_study(
1048 nigel 63 re, /* result of pcre_compile() */
1049 ph10 678 0, /* no options */
1050 nigel 63 &error); /* set to NULL or points to a message */
1051 ph10 678 rc = pcre_exec( /* see below for details of pcre_exec() options */
1052 ph10 691 re, sd, "subject", 7, 0, 0, ovector, 30);
1053 ph10 678 ...
1054     pcre_free_study(sd);
1055 ph10 691 pcre_free(re);
1056 nigel 75 .sp
1057 ph10 455 Studying a pattern does two things: first, a lower bound for the length of
1058 ph10 461 subject string that is needed to match the pattern is computed. This does not
1059     mean that there are any strings of that length that match, but it does
1060 ph10 1022 guarantee that no shorter strings match. The value is used to avoid wasting
1061     time by trying to match strings that are shorter than the lower bound. You can
1062     find out the value in a calling program via the \fBpcre_fullinfo()\fP function.
1063 ph10 455 .P
1064     Studying a pattern is also useful for non-anchored patterns that do not have a
1065     single fixed starting character. A bitmap of possible starting bytes is
1066 ph10 461 created. This speeds up finding a position in the subject at which to start
1067 chpe 1055 matching. (In 16-bit mode, the bitmap is used for 16-bit values less than 256.
1068     In 32-bit mode, the bitmap is used for 32-bit values less than 256.)
1069 ph10 547 .P
1070 ph10 691 These two optimizations apply to both \fBpcre_exec()\fP and
1071 ph10 921 \fBpcre_dfa_exec()\fP, and the information is also used by the JIT compiler.
1072 ph10 1314 The optimizations can be disabled by setting the PCRE_NO_START_OPTIMIZE option.
1073 ph10 1301 You might want to do this if your pattern contains callouts or (*MARK) and you
1074     want to make use of these facilities in cases where matching fails.
1075     .P
1076     PCRE_NO_START_OPTIMIZE can be specified at either compile time or execution
1077 ph10 1314 time. However, if PCRE_NO_START_OPTIMIZE is passed to \fBpcre_exec()\fP (that
1078     is, after any JIT compilation has happened), JIT execution is disabled. For JIT
1079 ph10 1301 execution to work with PCRE_NO_START_OPTIMIZE, the option must be set at
1080     compile time.
1081     .P
1082     There is a longer discussion of PCRE_NO_START_OPTIMIZE
1083 ph10 547 .\" HTML <a href="#execoptions">
1084     .\" </a>
1085     below.
1086     .\"
1087 nigel 75 .
1088     .
1089 nigel 63 .\" HTML <a name="localesupport"></a>
1090 nigel 75 .SH "LOCALE SUPPORT"
1091 nigel 63 .rs
1092     .sp
1093 ph10 139 PCRE handles caseless matching, and determines whether characters are letters,
1094 nigel 75 digits, or whatever, by reference to a set of tables, indexed by character
1095 ph10 856 value. When running in UTF-8 mode, this applies only to characters
1096     with codes less than 128. By default, higher-valued codes never match escapes
1097     such as \ew or \ed, but they can be tested with \ep if PCRE is built with
1098     Unicode character property support. Alternatively, the PCRE_UCP option can be
1099     set at compile time; this causes \ew and friends to use Unicode property
1100     support instead of built-in tables. The use of locales with Unicode is
1101     discouraged. If you are handling characters with codes greater than 128, you
1102     should either use UTF-8 and Unicode, or use locales, but not try to mix the
1103     two.
1104 nigel 75 .P
1105 ph10 139 PCRE contains an internal set of tables that are used when the final argument
1106     of \fBpcre_compile()\fP is NULL. These are sufficient for many applications.
1107 ph10 142 Normally, the internal tables recognize only ASCII characters. However, when
1108 ph10 139 PCRE is built, it is possible to cause the internal tables to be rebuilt in the
1109     default "C" locale of the local system, which may cause them to be different.
1110 nigel 75 .P
1111 ph10 139 The internal tables can always be overridden by tables supplied by the
1112     application that calls PCRE. These may be created in a different locale from
1113     the default. As more and more applications change to using Unicode, the need
1114     for this locale support is expected to die away.
1115     .P
1116 nigel 75 External tables are built by calling the \fBpcre_maketables()\fP function,
1117     which has no arguments, in the relevant locale. The result can then be passed
1118     to \fBpcre_compile()\fP or \fBpcre_exec()\fP as often as necessary. For
1119     example, to build and use tables that are appropriate for the French locale
1120     (where accented characters with values greater than 128 are treated as letters),
1121     the following code could be used:
1122     .sp
1123     setlocale(LC_CTYPE, "fr_FR");
1124 nigel 63 tables = pcre_maketables();
1125     re = pcre_compile(..., tables);
1126 nigel 75 .sp
1127 ph10 142 The locale name "fr_FR" is used on Linux and other Unix-like systems; if you
1128 ph10 139 are using Windows, the name for the French locale is "french".
1129     .P
1130 nigel 75 When \fBpcre_maketables()\fP runs, the tables are built in memory that is
1131     obtained via \fBpcre_malloc\fP. It is the caller's responsibility to ensure
1132     that the memory containing the tables remains available for as long as it is
1133     needed.
1134     .P
1135     The pointer that is passed to \fBpcre_compile()\fP is saved with the compiled
1136     pattern, and the same tables are used via this pointer by \fBpcre_study()\fP
1137     and normally also by \fBpcre_exec()\fP. Thus, by default, for any single
1138     pattern, compilation, studying and matching all happen in the same locale, but
1139     different patterns can be compiled in different locales.
1140     .P
1141     It is possible to pass a table pointer or NULL (indicating the use of the
1142     internal tables) to \fBpcre_exec()\fP. Although not intended for this purpose,
1143     this facility could be used to match a pattern in a different locale from the
1144     one in which it was compiled. Passing table pointers at run time is discussed
1145     below in the section on matching a pattern.
1146     .
1147     .
1148 ph10 598 .\" HTML <a name="infoaboutpattern"></a>
1149 nigel 75 .SH "INFORMATION ABOUT A PATTERN"
1150 nigel 63 .rs
1151     .sp
1152 ph10 1339 .nf
1153 nigel 75 .B int pcre_fullinfo(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
1154 ph10 1339 .B " int \fIwhat\fP, void *\fIwhere\fP);"
1155     .fi
1156 nigel 63 .PP
1157 nigel 75 The \fBpcre_fullinfo()\fP function returns information about a compiled
1158 ph10 856 pattern. It replaces the \fBpcre_info()\fP function, which was removed from the
1159     library at version 8.30, after more than 10 years of obsolescence.
1160 nigel 75 .P
1161     The first argument for \fBpcre_fullinfo()\fP is a pointer to the compiled
1162     pattern. The second argument is the result of \fBpcre_study()\fP, or NULL if
1163 nigel 63 the pattern was not studied. The third argument specifies which piece of
1164     information is required, and the fourth argument is a pointer to a variable
1165     to receive the data. The yield of the function is zero for success, or one of
1166     the following negative numbers:
1167 nigel 75 .sp
1168 ph10 856 PCRE_ERROR_NULL the argument \fIcode\fP was NULL
1169     the argument \fIwhere\fP was NULL
1170     PCRE_ERROR_BADMAGIC the "magic number" was not found
1171 ph10 903 PCRE_ERROR_BADENDIANNESS the pattern was compiled with different
1172 ph10 856 endianness
1173     PCRE_ERROR_BADOPTION the value of \fIwhat\fP was invalid
1174 ph10 1314 PCRE_ERROR_UNSET the requested field is not set
1175 nigel 75 .sp
1176     The "magic number" is placed at the start of each compiled pattern as a simple
1177 ph10 903 check against passing an arbitrary memory pointer. The endianness error can
1178 ph10 856 occur if a compiled pattern is saved and reloaded on a different host. Here is
1179     a typical call of \fBpcre_fullinfo()\fP, to obtain the length of the compiled
1180     pattern:
1181 nigel 75 .sp
1182 nigel 63 int rc;
1183 nigel 91 size_t length;
1184 nigel 63 rc = pcre_fullinfo(
1185     re, /* result of pcre_compile() */
1186 ph10 678 sd, /* result of pcre_study(), or NULL */
1187 nigel 63 PCRE_INFO_SIZE, /* what is required */
1188     &length); /* where to put the data */
1189 nigel 75 .sp
1190     The possible values for the third argument are defined in \fBpcre.h\fP, and are
1191 nigel 63 as follows:
1192 nigel 75 .sp
1193 nigel 63 PCRE_INFO_BACKREFMAX
1194 nigel 75 .sp
1195 nigel 63 Return the number of the highest back reference in the pattern. The fourth
1196 nigel 75 argument should point to an \fBint\fP variable. Zero is returned if there are
1197 nigel 63 no back references.
1198 nigel 75 .sp
1199 nigel 63 PCRE_INFO_CAPTURECOUNT
1200 nigel 75 .sp
1201 nigel 63 Return the number of capturing subpatterns in the pattern. The fourth argument
1202 nigel 75 should point to an \fBint\fP variable.
1203     .sp
1204 nigel 77 PCRE_INFO_DEFAULT_TABLES
1205 nigel 75 .sp
1206     Return a pointer to the internal default character tables within PCRE. The
1207     fourth argument should point to an \fBunsigned char *\fP variable. This
1208     information call is provided for internal use by the \fBpcre_study()\fP
1209     function. External callers can cause PCRE to use its internal tables by passing
1210     a NULL table pointer.
1211     .sp
1212 nigel 63 PCRE_INFO_FIRSTBYTE
1213 nigel 75 .sp
1214 ph10 856 Return information about the first data unit of any matched string, for a
1215     non-anchored pattern. (The name of this option refers to the 8-bit library,
1216     where data units are bytes.) The fourth argument should point to an \fBint\fP
1217     variable.
1218 nigel 75 .P
1219 ph10 856 If there is a fixed first value, for example, the letter "c" from a pattern
1220 ph10 903 such as (cat|cow|coyote), its value is returned. In the 8-bit library, the
1221 chpe 1055 value is always less than 256. In the 16-bit library the value can be up to
1222     0xffff. In the 32-bit library the value can be up to 0x10ffff.
1223 ph10 856 .P
1224     If there is no fixed first value, and if either
1225 nigel 75 .sp
1226 nigel 63 (a) the pattern was compiled with the PCRE_MULTILINE option, and every branch
1227     starts with "^", or
1228 nigel 75 .sp
1229 nigel 63 (b) every branch of the pattern starts with ".*" and PCRE_DOTALL is not set
1230     (if it were set, the pattern would be anchored),
1231 nigel 75 .sp
1232 nigel 63 -1 is returned, indicating that the pattern matches only at the start of a
1233     subject string or after any newline within the string. Otherwise -2 is
1234     returned. For anchored patterns, -2 is returned.
1235 chpe 1080 .P
1236     Since for the 32-bit library using the non-UTF-32 mode, this function is unable
1237     to return the full 32-bit range of the character, this value is deprecated;
1238 ph10 1191 instead the PCRE_INFO_FIRSTCHARACTERFLAGS and PCRE_INFO_FIRSTCHARACTER values
1239     should be used.
1240 nigel 75 .sp
1241 nigel 63 PCRE_INFO_FIRSTTABLE
1242 nigel 75 .sp
1243 nigel 63 If the pattern was studied, and this resulted in the construction of a 256-bit
1244 ph10 856 table indicating a fixed set of values for the first data unit in any matching
1245 nigel 63 string, a pointer to the table is returned. Otherwise NULL is returned. The
1246 nigel 75 fourth argument should point to an \fBunsigned char *\fP variable.
1247     .sp
1248 ph10 226 PCRE_INFO_HASCRORLF
1249     .sp
1250 ph10 227 Return 1 if the pattern contains any explicit matches for CR or LF characters,
1251 ph10 243 otherwise 0. The fourth argument should point to an \fBint\fP variable. An
1252 ph10 231 explicit match is either a literal CR or LF character, or \er or \en.
1253 ph10 226 .sp
1254 ph10 169 PCRE_INFO_JCHANGED
1255     .sp
1256 ph10 278 Return 1 if the (?J) or (?-J) option setting is used in the pattern, otherwise
1257     0. The fourth argument should point to an \fBint\fP variable. (?J) and
1258     (?-J) set and unset the local PCRE_DUPNAMES option, respectively.
1259 ph10 169 .sp
1260 ph10 678 PCRE_INFO_JIT
1261     .sp
1262 ph10 921 Return 1 if the pattern was studied with one of the JIT options, and
1263 ph10 678 just-in-time compiling was successful. The fourth argument should point to an
1264     \fBint\fP variable. A return value of 0 means that JIT support is not available
1265 ph10 921 in this version of PCRE, or that the pattern was not studied with a JIT option,
1266     or that the JIT compiler could not handle this particular pattern. See the
1267 ph10 678 .\" HREF
1268     \fBpcrejit\fP
1269     .\"
1270     documentation for details of what can and cannot be handled.
1271     .sp
1272 ph10 836 PCRE_INFO_JITSIZE
1273     .sp
1274 ph10 921 If the pattern was successfully studied with a JIT option, return the size of
1275     the JIT compiled code, otherwise return zero. The fourth argument should point
1276     to a \fBsize_t\fP variable.
1277 ph10 836 .sp
1278 nigel 63 PCRE_INFO_LASTLITERAL
1279 nigel 75 .sp
1280 ph10 856 Return the value of the rightmost literal data unit that must exist in any
1281     matched string, other than at its start, if such a value has been recorded. The
1282     fourth argument should point to an \fBint\fP variable. If there is no such
1283     value, -1 is returned. For anchored patterns, a last literal value is recorded
1284     only if it follows something of variable length. For example, for the pattern
1285 nigel 75 /^a\ed+z\ed+/ the returned value is "z", but for /^a\edz\ed/ the returned value
1286 nigel 65 is -1.
1287 chpe 1080 .P
1288     Since for the 32-bit library using the non-UTF-32 mode, this function is unable
1289     to return the full 32-bit range of the character, this value is deprecated;
1290 chpe 1113 instead the PCRE_INFO_REQUIREDCHARFLAGS and PCRE_INFO_REQUIREDCHAR values should
1291 chpe 1080 be used.
1292 nigel 75 .sp
1293 ph10 1314 PCRE_INFO_MATCHLIMIT
1294     .sp
1295     If the pattern set a match limit by including an item of the form
1296     (*LIMIT_MATCH=nnnn) at the start, the value is returned. The fourth argument
1297     should point to an unsigned 32-bit integer. If no such value has been set, the
1298     call to \fBpcre_fullinfo()\fP returns the error PCRE_ERROR_UNSET.
1299     .sp
1300 ph10 932 PCRE_INFO_MAXLOOKBEHIND
1301     .sp
1302 ph10 1328 Return the number of characters (NB not data units) in the longest lookbehind
1303 ph10 1253 assertion in the pattern. This information is useful when doing multi-segment
1304     matching using the partial matching facilities. Note that the simple assertions
1305     \eb and \eB require a one-character lookbehind. \eA also registers a
1306     one-character lookbehind, though it does not actually inspect the previous
1307     character. This is to ensure that at least one character from the old segment
1308 ph10 1314 is retained when a new segment is processed. Otherwise, if there are no
1309     lookbehinds in the pattern, \eA might match incorrectly at the start of a new
1310 ph10 1253 segment.
1311 ph10 932 .sp
1312 ph10 455 PCRE_INFO_MINLENGTH
1313     .sp
1314     If the pattern was studied and a minimum length for matching subject strings
1315     was computed, its value is returned. Otherwise the returned value is -1. The
1316 ph10 1328 value is a number of characters, which in UTF mode may be different from the
1317     number of data units. The fourth argument should point to an \fBint\fP
1318     variable. A non-negative value is a lower bound to the length of any matching
1319     string. There may not be any strings of that length that do actually match, but
1320     every string that does match is at least that long.
1321 ph10 455 .sp
1322 nigel 63 PCRE_INFO_NAMECOUNT
1323     PCRE_INFO_NAMEENTRYSIZE
1324     PCRE_INFO_NAMETABLE
1325 nigel 75 .sp
1326 nigel 63 PCRE supports the use of named as well as numbered capturing parentheses. The
1327     names are just an additional way of identifying the parentheses, which still
1328 nigel 91 acquire numbers. Several convenience functions such as
1329     \fBpcre_get_named_substring()\fP are provided for extracting captured
1330     substrings by name. It is also possible to extract the data directly, by first
1331     converting the name to a number in order to access the correct pointers in the
1332     output vector (described with \fBpcre_exec()\fP below). To do the conversion,
1333     you need to use the name-to-number map, which is described by these three
1334     values.
1335 nigel 75 .P
1336 nigel 63 The map consists of a number of fixed-size entries. PCRE_INFO_NAMECOUNT gives
1337     the number of entries, and PCRE_INFO_NAMEENTRYSIZE gives the size of each
1338 nigel 75 entry; both of these return an \fBint\fP value. The entry size depends on the
1339 nigel 63 length of the longest name. PCRE_INFO_NAMETABLE returns a pointer to the first
1340 ph10 856 entry of the table. This is a pointer to \fBchar\fP in the 8-bit library, where
1341     the first two bytes of each entry are the number of the capturing parenthesis,
1342     most significant byte first. In the 16-bit library, the pointer points to
1343 ph10 1328 16-bit data units, the first of which contains the parenthesis number. In the
1344     32-bit library, the pointer points to 32-bit data units, the first of which
1345     contains the parenthesis number. The rest of the entry is the corresponding
1346     name, zero terminated.
1347 ph10 457 .P
1348     The names are in alphabetical order. Duplicate names may appear if (?| is used
1349     to create multiple groups with the same number, as described in the
1350     .\" HTML <a href="pcrepattern.html#dupsubpatternnumber">
1351     .\" </a>
1352     section on duplicate subpattern numbers
1353     .\"
1354     in the
1355     .\" HREF
1356     \fBpcrepattern\fP
1357     .\"
1358 ph10 461 page. Duplicate names for subpatterns with different numbers are permitted only
1359     if PCRE_DUPNAMES is set. In all cases of duplicate names, they appear in the
1360     table in the order in which they were found in the pattern. In the absence of
1361     (?| this is the order of increasing number; when (?| is used this is not
1362 ph10 457 necessarily the case because later subpatterns may have lower numbers.
1363     .P
1364     As a simple example of the name/number table, consider the following pattern
1365 ph10 856 after compilation by the 8-bit library (assume PCRE_EXTENDED is set, so white
1366     space - including newlines - is ignored):
1367 nigel 75 .sp
1368     .\" JOIN
1369 nigel 93 (?<date> (?<year>(\ed\ed)?\ed\ed) -
1370     (?<month>\ed\ed) - (?<day>\ed\ed) )
1371 nigel 75 .sp
1372 nigel 63 There are four named subpatterns, so the table has four entries, and each entry
1373     in the table is eight bytes long. The table is as follows, with non-printing
1374 nigel 75 bytes shown in hexadecimal, and undefined bytes shown as ??:
1375     .sp
1376 nigel 63 00 01 d a t e 00 ??
1377     00 05 d a y 00 ?? ??
1378     00 04 m o n t h 00
1379     00 02 y e a r 00 ??
1380 nigel 75 .sp
1381     When writing code to extract data from named subpatterns using the
1382 nigel 91 name-to-number map, remember that the length of the entries is likely to be
1383 nigel 75 different for each compiled pattern.
1384     .sp
1385 ph10 169 PCRE_INFO_OKPARTIAL
1386     .sp
1387 ph10 435 Return 1 if the pattern can be used for partial matching with
1388     \fBpcre_exec()\fP, otherwise 0. The fourth argument should point to an
1389     \fBint\fP variable. From release 8.00, this always returns 1, because the
1390     restrictions that previously applied to partial matching have been lifted. The
1391 ph10 169 .\" HREF
1392     \fBpcrepartial\fP
1393     .\"
1394 ph10 426 documentation gives details of partial matching.
1395 ph10 169 .sp
1396 nigel 63 PCRE_INFO_OPTIONS
1397 nigel 75 .sp
1398 nigel 63 Return a copy of the options with which the pattern was compiled. The fourth
1399 nigel 75 argument should point to an \fBunsigned long int\fP variable. These option bits
1400     are those specified in the call to \fBpcre_compile()\fP, modified by any
1401 ph10 196 top-level option settings at the start of the pattern itself. In other words,
1402     they are the options that will be in force when matching starts. For example,
1403     if the pattern /(?im)abc(?-i)d/ is compiled with the PCRE_EXTENDED option, the
1404     result is PCRE_CASELESS, PCRE_MULTILINE, and PCRE_EXTENDED.
1405 nigel 75 .P
1406 nigel 63 A pattern is automatically anchored by PCRE if all of its top-level
1407     alternatives begin with one of the following:
1408 nigel 75 .sp
1409 nigel 63 ^ unless PCRE_MULTILINE is set
1410 nigel 75 \eA always
1411     \eG always
1412     .\" JOIN
1413 nigel 63 .* if PCRE_DOTALL is set and there are no back
1414     references to the subpattern in which .* appears
1415 nigel 75 .sp
1416 nigel 63 For such patterns, the PCRE_ANCHORED bit is set in the options returned by
1417 nigel 75 \fBpcre_fullinfo()\fP.
1418     .sp
1419 ph10 1314 PCRE_INFO_RECURSIONLIMIT
1420     .sp
1421     If the pattern set a recursion limit by including an item of the form
1422     (*LIMIT_RECURSION=nnnn) at the start, the value is returned. The fourth
1423     argument should point to an unsigned 32-bit integer. If no such value has been
1424     set, the call to \fBpcre_fullinfo()\fP returns the error PCRE_ERROR_UNSET.
1425     .sp
1426 nigel 63 PCRE_INFO_SIZE
1427 nigel 75 .sp
1428 ph10 1328 Return the size of the compiled pattern in bytes (for all three libraries). The
1429 ph10 856 fourth argument should point to a \fBsize_t\fP variable. This value does not
1430     include the size of the \fBpcre\fP structure that is returned by
1431     \fBpcre_compile()\fP. The value that is passed as the argument to
1432     \fBpcre_malloc()\fP when \fBpcre_compile()\fP is getting memory in which to
1433     place the compiled data is the value returned by this option plus the size of
1434     the \fBpcre\fP structure. Studying a compiled pattern, with or without JIT,
1435     does not alter the value returned by this option.
1436 nigel 75 .sp
1437 nigel 63 PCRE_INFO_STUDYSIZE
1438 nigel 75 .sp
1439 ph10 1328 Return the size in bytes (for all three libraries) of the data block pointed to
1440     by the \fIstudy_data\fP field in a \fBpcre_extra\fP block. If \fBpcre_extra\fP
1441     is NULL, or there is no study data, zero is returned. The fourth argument
1442     should point to a \fBsize_t\fP variable. The \fIstudy_data\fP field is set by
1443     \fBpcre_study()\fP to record information that will speed up matching (see the
1444     section entitled
1445 ph10 656 .\" HTML <a href="#studyingapattern">
1446     .\" </a>
1447 ph10 659 "Studying a pattern"
1448 ph10 656 .\"
1449     above). The format of the \fIstudy_data\fP block is private, but its length
1450     is made available via this option so that it can be saved and restored (see the
1451     .\" HREF
1452     \fBpcreprecompile\fP
1453     .\"
1454     documentation for details).
1455 chpe 1080 .sp
1456 chpe 1113 PCRE_INFO_FIRSTCHARACTERFLAGS
1457 chpe 1080 .sp
1458     Return information about the first data unit of any matched string, for a
1459     non-anchored pattern. The fourth argument should point to an \fBint\fP
1460     variable.
1461     .P
1462     If there is a fixed first value, for example, the letter "c" from a pattern
1463     such as (cat|cow|coyote), 1 is returned, and the character value can be
1464 chpe 1113 retrieved using PCRE_INFO_FIRSTCHARACTER.
1465 chpe 1080 .P
1466     If there is no fixed first value, and if either
1467     .sp
1468     (a) the pattern was compiled with the PCRE_MULTILINE option, and every branch
1469     starts with "^", or
1470     .sp
1471     (b) every branch of the pattern starts with ".*" and PCRE_DOTALL is not set
1472     (if it were set, the pattern would be anchored),
1473     .sp
1474     2 is returned, indicating that the pattern matches only at the start of a
1475     subject string or after any newline within the string. Otherwise 0 is
1476     returned. For anchored patterns, 0 is returned.
1477     .sp
1478 chpe 1113 PCRE_INFO_FIRSTCHARACTER
1479 chpe 1080 .sp
1480 ph10 1191 Return the fixed first character value, if PCRE_INFO_FIRSTCHARACTERFLAGS
1481     returned 1; otherwise returns 0. The fourth argument should point to an
1482     \fBuint32_t\fP variable.
1483 chpe 1080 .P
1484     In the 8-bit library, the value is always less than 256. In the 16-bit library
1485     the value can be up to 0xffff. In the 32-bit library in UTF-32 mode the value
1486     can be up to 0x10ffff, and up to 0xffffffff when not using UTF-32 mode.
1487     .P
1488     If there is no fixed first value, and if either
1489     .sp
1490     (a) the pattern was compiled with the PCRE_MULTILINE option, and every branch
1491     starts with "^", or
1492     .sp
1493     (b) every branch of the pattern starts with ".*" and PCRE_DOTALL is not set
1494     (if it were set, the pattern would be anchored),
1495     .sp
1496     -1 is returned, indicating that the pattern matches only at the start of a
1497     subject string or after any newline within the string. Otherwise -2 is
1498     returned. For anchored patterns, -2 is returned.
1499     .sp
1500 chpe 1113 PCRE_INFO_REQUIREDCHARFLAGS
1501 chpe 1080 .sp
1502 ph10 1191 Returns 1 if there is a rightmost literal data unit that must exist in any
1503     matched string, other than at its start. The fourth argument should point to
1504     an \fBint\fP variable. If there is no such value, 0 is returned. If returning
1505     1, the character value itself can be retrieved using PCRE_INFO_REQUIREDCHAR.
1506 chpe 1080 .P
1507 ph10 1191 For anchored patterns, a last literal value is recorded only if it follows
1508     something of variable length. For example, for the pattern /^a\ed+z\ed+/ the
1509     returned value is 1 (with "z" returned from PCRE_INFO_REQUIREDCHAR), but for
1510     /^a\edz\ed/ the returned value is 0.
1511 chpe 1080 .sp
1512 chpe 1113 PCRE_INFO_REQUIREDCHAR
1513 chpe 1080 .sp
1514     Return the value of the rightmost literal data unit that must exist in any
1515     matched string, other than at its start, if such a value has been recorded. The
1516     fourth argument should point to an \fBuint32_t\fP variable. If there is no such
1517     value, 0 is returned.
1518 nigel 75 .
1519     .
1520 nigel 77 .SH "REFERENCE COUNTS"
1521 nigel 63 .rs
1522     .sp
1523 nigel 77 .B int pcre_refcount(pcre *\fIcode\fP, int \fIadjust\fP);
1524     .PP
1525     The \fBpcre_refcount()\fP function is used to maintain a reference count in the
1526     data block that contains a compiled pattern. It is provided for the benefit of
1527     applications that operate in an object-oriented manner, where different parts
1528     of the application may be using the same compiled pattern, but you want to free
1529     the block when they are all done.
1530     .P
1531     When a pattern is compiled, the reference count field is initialized to zero.
1532     It is changed only by calling this function, whose action is to add the
1533     \fIadjust\fP value (which may be positive or negative) to it. The yield of the
1534     function is the new value. However, the value of the count is constrained to
1535     lie between 0 and 65535, inclusive. If the new value is outside these limits,
1536     it is forced to the appropriate limit value.
1537     .P
1538     Except when it is zero, the reference count is not correctly preserved if a
1539     pattern is compiled on one host and then transferred to a host whose byte-order
1540     is different. (This seems a highly unlikely scenario.)
1541     .
1542     .
1543     .SH "MATCHING A PATTERN: THE TRADITIONAL FUNCTION"
1544     .rs
1545     .sp
1546 ph10 1339 .nf
1547 nigel 75 .B int pcre_exec(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
1548 ph10 1339 .B " const char *\fIsubject\fP, int \fIlength\fP, int \fIstartoffset\fP,"
1549     .B " int \fIoptions\fP, int *\fIovector\fP, int \fIovecsize\fP);"
1550     .fi
1551 nigel 75 .P
1552     The function \fBpcre_exec()\fP is called to match a subject string against a
1553     compiled pattern, which is passed in the \fIcode\fP argument. If the
1554 ph10 455 pattern was studied, the result of the study should be passed in the
1555 ph10 707 \fIextra\fP argument. You can call \fBpcre_exec()\fP with the same \fIcode\fP
1556     and \fIextra\fP arguments as many times as you like, in order to match
1557     different subject strings with the same pattern.
1558     .P
1559     This function is the main matching facility of the library, and it operates in
1560     a Perl-like manner. For specialist use there is also an alternative matching
1561     function, which is described
1562 nigel 77 .\" HTML <a href="#dfamatch">
1563     .\" </a>
1564     below
1565     .\"
1566     in the section about the \fBpcre_dfa_exec()\fP function.
1567 nigel 75 .P
1568     In most applications, the pattern will have been compiled (and optionally
1569     studied) in the same process that calls \fBpcre_exec()\fP. However, it is
1570     possible to save compiled patterns and study data, and then use them later
1571     in different processes, possibly even on different hosts. For a discussion
1572     about this, see the
1573     .\" HREF
1574     \fBpcreprecompile\fP
1575     .\"
1576     documentation.
1577     .P
1578     Here is an example of a simple call to \fBpcre_exec()\fP:
1579     .sp
1580 nigel 63 int rc;
1581     int ovector[30];
1582     rc = pcre_exec(
1583     re, /* result of pcre_compile() */
1584     NULL, /* we didn't study the pattern */
1585     "some string", /* the subject string */
1586     11, /* the length of the subject string */
1587     0, /* start at offset 0 in the subject */
1588     0, /* default options */
1589 nigel 75 ovector, /* vector of integers for substring information */
1590 nigel 77 30); /* number of elements (NOT size in bytes) */
1591 nigel 75 .
1592 ph10 656 .
1593 nigel 75 .\" HTML <a name="extradata"></a>
1594     .SS "Extra data for \fBpcre_exec()\fR"
1595     .rs
1596     .sp
1597     If the \fIextra\fP argument is not NULL, it must point to a \fBpcre_extra\fP
1598     data block. The \fBpcre_study()\fP function returns such a block (when it
1599 nigel 63 doesn't return NULL), but you can also create one for yourself, and pass
1600 nigel 87 additional information in it. The \fBpcre_extra\fP block contains the following
1601     fields (not necessarily in this order):
1602 nigel 75 .sp
1603     unsigned long int \fIflags\fP;
1604     void *\fIstudy_data\fP;
1605 ph10 691 void *\fIexecutable_jit\fP;
1606 nigel 75 unsigned long int \fImatch_limit\fP;
1607 nigel 87 unsigned long int \fImatch_limit_recursion\fP;
1608 nigel 75 void *\fIcallout_data\fP;
1609     const unsigned char *\fItables\fP;
1610 ph10 512 unsigned char **\fImark\fP;
1611 nigel 75 .sp
1612 ph10 903 In the 16-bit version of this structure, the \fImark\fP field has type
1613 ph10 859 "PCRE_UCHAR16 **".
1614 chpe 1055 .sp
1615     In the 32-bit version of this structure, the \fImark\fP field has type
1616     "PCRE_UCHAR32 **".
1617 ph10 856 .P
1618 ph10 926 The \fIflags\fP field is used to specify which of the other fields are set. The
1619     flag bits are:
1620 nigel 75 .sp
1621 ph10 922 PCRE_EXTRA_CALLOUT_DATA
1622 ph10 691 PCRE_EXTRA_EXECUTABLE_JIT
1623 ph10 922 PCRE_EXTRA_MARK
1624 nigel 63 PCRE_EXTRA_MATCH_LIMIT
1625 nigel 87 PCRE_EXTRA_MATCH_LIMIT_RECURSION
1626 ph10 922 PCRE_EXTRA_STUDY_DATA
1627 nigel 75 PCRE_EXTRA_TABLES
1628     .sp
1629 ph10 678 Other flag bits should be set to zero. The \fIstudy_data\fP field and sometimes
1630     the \fIexecutable_jit\fP field are set in the \fBpcre_extra\fP block that is
1631     returned by \fBpcre_study()\fP, together with the appropriate flag bits. You
1632 ph10 922 should not set these yourself, but you may add to the block by setting other
1633     fields and their corresponding flag bits.
1634 nigel 75 .P
1635     The \fImatch_limit\fP field provides a means of preventing PCRE from using up a
1636 nigel 63 vast amount of resources when running patterns that are not going to match,
1637     but which have a very large number of possibilities in their search trees. The
1638 ph10 456 classic example is a pattern that uses nested unlimited repeats.
1639 nigel 75 .P
1640 ph10 678 Internally, \fBpcre_exec()\fP uses a function called \fBmatch()\fP, which it
1641     calls repeatedly (sometimes recursively). The limit set by \fImatch_limit\fP is
1642     imposed on the number of times this function is called during a match, which
1643     has the effect of limiting the amount of backtracking that can take place. For
1644     patterns that are not anchored, the count restarts from zero for each position
1645 ph10 691 in the subject string.
1646 nigel 75 .P
1647 ph10 691 When \fBpcre_exec()\fP is called with a pattern that was successfully studied
1648 ph10 921 with a JIT option, the way that the matching is executed is entirely different.
1649     However, there is still the possibility of runaway matching that goes on for a
1650     very long time, and so the \fImatch_limit\fP value is also used in this case
1651     (but in a different way) to limit how long the matching can continue.
1652 ph10 678 .P
1653 nigel 87 The default value for the limit can be set when PCRE is built; the default
1654 nigel 63 default is 10 million, which handles all but the most extreme cases. You can
1655 nigel 87 override the default by supplying \fBpcre_exec()\fP with a \fBpcre_extra\fP
1656     block in which \fImatch_limit\fP is set, and PCRE_EXTRA_MATCH_LIMIT is set in
1657     the \fIflags\fP field. If the limit is exceeded, \fBpcre_exec()\fP returns
1658     PCRE_ERROR_MATCHLIMIT.
1659 nigel 75 .P
1660 ph10 1314 A value for the match limit may also be supplied by an item at the start of a
1661     pattern of the form
1662     .sp
1663     (*LIMIT_MATCH=d)
1664     .sp
1665     where d is a decimal number. However, such a setting is ignored unless d is
1666     less than the limit set by the caller of \fBpcre_exec()\fP or, if no such limit
1667     is set, less than the default.
1668     .P
1669 nigel 87 The \fImatch_limit_recursion\fP field is similar to \fImatch_limit\fP, but
1670     instead of limiting the total number of times that \fBmatch()\fP is called, it
1671     limits the depth of recursion. The recursion depth is a smaller number than the
1672     total number of calls, because not all calls to \fBmatch()\fP are recursive.
1673 ph10 691 This limit is of use only if it is set smaller than \fImatch_limit\fP.
1674 nigel 87 .P
1675 ph10 678 Limiting the recursion depth limits the amount of machine stack that can be
1676     used, or, when PCRE has been compiled to use memory on the heap instead of the
1677 ph10 686 stack, the amount of heap memory that can be used. This limit is not relevant,
1678 ph10 975 and is ignored, when matching is done using JIT compiled code.
1679 nigel 87 .P
1680     The default value for \fImatch_limit_recursion\fP can be set when PCRE is
1681     built; the default default is the same value as the default for
1682     \fImatch_limit\fP. You can override the default by supplying \fBpcre_exec()\fP
1683     with a \fBpcre_extra\fP block in which \fImatch_limit_recursion\fP is set, and
1684     PCRE_EXTRA_MATCH_LIMIT_RECURSION is set in the \fIflags\fP field. If the limit
1685     is exceeded, \fBpcre_exec()\fP returns PCRE_ERROR_RECURSIONLIMIT.
1686     .P
1687 ph10 1314 A value for the recursion limit may also be supplied by an item at the start of
1688     a pattern of the form
1689     .sp
1690     (*LIMIT_RECURSION=d)
1691     .sp
1692     where d is a decimal number. However, such a setting is ignored unless d is
1693     less than the limit set by the caller of \fBpcre_exec()\fP or, if no such limit
1694     is set, less than the default.
1695     .P
1696 ph10 440 The \fIcallout_data\fP field is used in conjunction with the "callout" feature,
1697     and is described in the
1698 nigel 75 .\" HREF
1699     \fBpcrecallout\fP
1700     .\"
1701     documentation.
1702     .P
1703     The \fItables\fP field is used to pass a character tables pointer to
1704     \fBpcre_exec()\fP; this overrides the value that is stored with the compiled
1705     pattern. A non-NULL value is stored with the compiled pattern only if custom
1706     tables were supplied to \fBpcre_compile()\fP via its \fItableptr\fP argument.
1707     If NULL is passed to \fBpcre_exec()\fP using this mechanism, it forces PCRE's
1708     internal tables to be used. This facility is helpful when re-using patterns
1709     that have been saved after compiling with an external set of tables, because
1710     the external tables might be at a different address when \fBpcre_exec()\fP is
1711     called. See the
1712     .\" HREF
1713     \fBpcreprecompile\fP
1714     .\"
1715     documentation for a discussion of saving compiled patterns for later use.
1716 ph10 510 .P
1717     If PCRE_EXTRA_MARK is set in the \fIflags\fP field, the \fImark\fP field must
1718 ph10 856 be set to point to a suitable variable. If the pattern contains any
1719 ph10 510 backtracking control verbs such as (*MARK:NAME), and the execution ends up with
1720     a name to pass back, a pointer to the name string (zero terminated) is placed
1721     in the variable pointed to by the \fImark\fP field. The names are within the
1722     compiled pattern; if you wish to retain such a name you must copy it before
1723     freeing the memory of a compiled pattern. If there is no name to pass back, the
1724 ph10 856 variable pointed to by the \fImark\fP field is set to NULL. For details of the
1725 ph10 510 backtracking control verbs, see the section entitled
1726     .\" HTML <a href="pcrepattern.html#backtrackcontrol">
1727     .\" </a>
1728     "Backtracking control"
1729     .\"
1730     in the
1731     .\" HREF
1732     \fBpcrepattern\fP
1733     .\"
1734     documentation.
1735 nigel 75 .
1736 ph10 510 .
1737 ph10 226 .\" HTML <a name="execoptions"></a>
1738 nigel 75 .SS "Option bits for \fBpcre_exec()\fP"
1739     .rs
1740     .sp
1741     The unused bits of the \fIoptions\fP argument for \fBpcre_exec()\fP must be
1742 nigel 91 zero. The only bits that may be set are PCRE_ANCHORED, PCRE_NEWLINE_\fIxxx\fP,
1743 ph10 442 PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, PCRE_NOTEMPTY_ATSTART,
1744 ph10 921 PCRE_NO_START_OPTIMIZE, PCRE_NO_UTF8_CHECK, PCRE_PARTIAL_HARD, and
1745     PCRE_PARTIAL_SOFT.
1746 ph10 686 .P
1747 ph10 921 If the pattern was successfully studied with one of the just-in-time (JIT)
1748     compile options, the only supported options for JIT execution are
1749     PCRE_NO_UTF8_CHECK, PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY,
1750     PCRE_NOTEMPTY_ATSTART, PCRE_PARTIAL_HARD, and PCRE_PARTIAL_SOFT. If an
1751     unsupported option is used, JIT execution is disabled and the normal
1752     interpretive code in \fBpcre_exec()\fP is run.
1753 nigel 75 .sp
1754     PCRE_ANCHORED
1755     .sp
1756     The PCRE_ANCHORED option limits \fBpcre_exec()\fP to matching at the first
1757     matching position. If a pattern was compiled with PCRE_ANCHORED, or turned out
1758     to be anchored by virtue of its contents, it cannot be made unanchored at
1759     matching time.
1760     .sp
1761 ph10 231 PCRE_BSR_ANYCRLF
1762     PCRE_BSR_UNICODE
1763     .sp
1764     These options (which are mutually exclusive) control what the \eR escape
1765     sequence matches. The choice is either to match only CR, LF, or CRLF, or to
1766     match any Unicode newline sequence. These options override the choice that was
1767     made or defaulted when the pattern was compiled.
1768     .sp
1769 nigel 91 PCRE_NEWLINE_CR
1770     PCRE_NEWLINE_LF
1771     PCRE_NEWLINE_CRLF
1772 ph10 150 PCRE_NEWLINE_ANYCRLF
1773 nigel 93 PCRE_NEWLINE_ANY
1774 nigel 91 .sp
1775     These options override the newline definition that was chosen or defaulted when
1776 nigel 93 the pattern was compiled. For details, see the description of
1777     \fBpcre_compile()\fP above. During matching, the newline choice affects the
1778     behaviour of the dot, circumflex, and dollar metacharacters. It may also alter
1779     the way the match position is advanced after a match failure for an unanchored
1780 ph10 227 pattern.
1781 ph10 225 .P
1782 ph10 226 When PCRE_NEWLINE_CRLF, PCRE_NEWLINE_ANYCRLF, or PCRE_NEWLINE_ANY is set, and a
1783     match attempt for an unanchored pattern fails when the current position is at a
1784 ph10 230 CRLF sequence, and the pattern contains no explicit matches for CR or LF
1785 ph10 226 characters, the match position is advanced by two characters instead of one, in
1786     other words, to after the CRLF.
1787     .P
1788 ph10 227 The above rule is a compromise that makes the most common cases work as
1789     expected. For example, if the pattern is .+A (and the PCRE_DOTALL option is not
1790     set), it does not match the string "\er\enA" because, after failing at the
1791     start, it skips both the CR and the LF before retrying. However, the pattern
1792     [\er\en]A does match that string, because it contains an explicit CR or LF
1793 ph10 226 reference, and so advances only by one character after the first failure.
1794     .P
1795 ph10 231 An explicit match for CR or LF is either a literal appearance of one of those
1796     characters, or one of the \er or \en escape sequences. Implicit matches such as
1797 ph10 230 [^X] do not count, nor does \es (which includes CR and LF in the characters
1798     that it matches).
1799     .P
1800 ph10 226 Notwithstanding the above, anomalous effects may still occur when CRLF is a
1801     valid newline sequence and explicit \er or \en escapes appear in the pattern.
1802 nigel 91 .sp
1803 nigel 63 PCRE_NOTBOL
1804 nigel 75 .sp
1805     This option specifies that the first character of the subject string is not the
1806     beginning of a line, so the circumflex metacharacter should not match before
1807     it. Setting this without PCRE_MULTILINE (at compile time) causes circumflex
1808     never to match. This option affects only the behaviour of the circumflex
1809     metacharacter. It does not affect \eA.
1810     .sp
1811 nigel 63 PCRE_NOTEOL
1812 nigel 75 .sp
1813     This option specifies that the end of the subject string is not the end of a
1814     line, so the dollar metacharacter should not match it nor (except in multiline
1815     mode) a newline immediately before it. Setting this without PCRE_MULTILINE (at
1816     compile time) causes dollar never to match. This option affects only the
1817     behaviour of the dollar metacharacter. It does not affect \eZ or \ez.
1818     .sp
1819 nigel 63 PCRE_NOTEMPTY
1820 nigel 75 .sp
1821 nigel 63 An empty string is not considered to be a valid match if this option is set. If
1822     there are alternatives in the pattern, they are tried. If all the alternatives
1823     match the empty string, the entire match fails. For example, if the pattern
1824 nigel 75 .sp
1825 nigel 63 a?b?
1826 nigel 75 .sp
1827 ph10 442 is applied to a string not beginning with "a" or "b", it matches an empty
1828 nigel 63 string at the start of the subject. With PCRE_NOTEMPTY set, this match is not
1829     valid, so PCRE searches further into the string for occurrences of "a" or "b".
1830 ph10 442 .sp
1831     PCRE_NOTEMPTY_ATSTART
1832     .sp
1833 ph10 461 This is like PCRE_NOTEMPTY, except that an empty string match that is not at
1834 ph10 442 the start of the subject is permitted. If the pattern is anchored, such a match
1835     can occur only if the pattern contains \eK.
1836 nigel 75 .P
1837 ph10 442 Perl has no direct equivalent of PCRE_NOTEMPTY or PCRE_NOTEMPTY_ATSTART, but it
1838     does make a special case of a pattern match of the empty string within its
1839     \fBsplit()\fP function, and when using the /g modifier. It is possible to
1840     emulate Perl's behaviour after matching a null string by first trying the match
1841     again at the same offset with PCRE_NOTEMPTY_ATSTART and PCRE_ANCHORED, and then
1842     if that fails, by advancing the starting offset (see below) and trying an
1843     ordinary match again. There is some code that demonstrates how to do this in
1844     the
1845 ph10 429 .\" HREF
1846     \fBpcredemo\fP
1847     .\"
1848 ph10 572 sample program. In the most general case, you have to check to see if the
1849     newline convention recognizes CRLF as a newline, and if so, and the current
1850 ph10 566 character is CR followed by LF, advance the starting offset by two characters
1851     instead of one.
1852 nigel 75 .sp
1853 ph10 389 PCRE_NO_START_OPTIMIZE
1854     .sp
1855 ph10 392 There are a number of optimizations that \fBpcre_exec()\fP uses at the start of
1856 ph10 542 a match, in order to speed up the process. For example, if it is known that an
1857     unanchored match must start with a specific character, it searches the subject
1858     for that character, and fails immediately if it cannot find it, without
1859 ph10 545 actually running the main matching function. This means that a special item
1860 ph10 542 such as (*COMMIT) at the start of a pattern is not considered until after a
1861 ph10 1301 suitable starting point for the match has been found. Also, when callouts or
1862     (*MARK) items are in use, these "start-up" optimizations can cause them to be
1863     skipped if the pattern is never actually used. The start-up optimizations are
1864     in effect a pre-scan of the subject that takes place before the pattern is run.
1865 ph10 546 .P
1866     The PCRE_NO_START_OPTIMIZE option disables the start-up optimizations, possibly
1867     causing performance to suffer, but ensuring that in cases where the result is
1868     "no match", the callouts do occur, and that items such as (*COMMIT) and (*MARK)
1869 ph10 579 are considered at every possible starting position in the subject string. If
1870     PCRE_NO_START_OPTIMIZE is set at compile time, it cannot be unset at matching
1871 ph10 1301 time. The use of PCRE_NO_START_OPTIMIZE at matching time (that is, passing it
1872     to \fBpcre_exec()\fP) disables JIT execution; in this situation, matching is
1873     always done interpretively.
1874 ph10 576 .P
1875 ph10 546 Setting PCRE_NO_START_OPTIMIZE can change the outcome of a matching operation.
1876     Consider the pattern
1877 ph10 389 .sp
1878 ph10 546 (*COMMIT)ABC
1879     .sp
1880     When this is compiled, PCRE records the fact that a match must start with the
1881 ph10 548 character "A". Suppose the subject string is "DEFABC". The start-up
1882     optimization scans along the subject, finds "A" and runs the first match
1883 ph10 546 attempt from there. The (*COMMIT) item means that the pattern must match the
1884 ph10 548 current starting position, which in this case, it does. However, if the same
1885     match is run with PCRE_NO_START_OPTIMIZE set, the initial scan along the
1886     subject string does not happen. The first match attempt is run starting from
1887 ph10 546 "D" and when this fails, (*COMMIT) prevents any further matches being tried, so
1888     the overall result is "no match". If the pattern is studied, more start-up
1889     optimizations may be used. For example, a minimum length for the subject may be
1890     recorded. Consider the pattern
1891     .sp
1892     (*MARK:A)(X|Y)
1893     .sp
1894 ph10 548 The minimum length for a match is one character. If the subject is "ABC", there
1895     will be attempts to match "ABC", "BC", "C", and then finally an empty string.
1896     If the pattern is studied, the final attempt does not take place, because PCRE
1897     knows that the subject is too short, and so the (*MARK) is never encountered.
1898     In this case, studying the pattern does not affect the overall match result,
1899     which is still "no match", but it does affect the auxiliary information that is
1900 ph10 546 returned.
1901     .sp
1902 nigel 75 PCRE_NO_UTF8_CHECK
1903     .sp
1904     When PCRE_UTF8 is set at compile time, the validity of the subject as a UTF-8
1905     string is automatically checked when \fBpcre_exec()\fP is subsequently called.
1906 ph10 959 The entire string is checked before any other processing takes place. The value
1907     of \fIstartoffset\fP is also checked to ensure that it points to the start of a
1908     UTF-8 character. There is a discussion about the
1909     .\" HTML <a href="pcreunicode.html#utf8strings">
1910     .\" </a>
1911     validity of UTF-8 strings
1912     .\"
1913     in the
1914 ph10 211 .\" HREF
1915 ph10 856 \fBpcreunicode\fP
1916 ph10 211 .\"
1917 ph10 856 page. If an invalid sequence of bytes is found, \fBpcre_exec()\fP returns the
1918     error PCRE_ERROR_BADUTF8 or, if PCRE_PARTIAL_HARD is set and the problem is a
1919     truncated character at the end of the subject, PCRE_ERROR_SHORTUTF8. In both
1920     cases, information about the precise nature of the error may also be returned
1921     (see the descriptions of these errors in the section entitled \fIError return
1922     values from\fP \fBpcre_exec()\fP
1923 ph10 598 .\" HTML <a href="#errorlist">
1924     .\" </a>
1925     below).
1926     .\"
1927     If \fIstartoffset\fP contains a value that does not point to the start of a
1928     UTF-8 character (or to the end of the subject), PCRE_ERROR_BADUTF8_OFFSET is
1929 ph10 569 returned.
1930 nigel 75 .P
1931     If you already know that your subject is valid, and you want to skip these
1932     checks for performance reasons, you can set the PCRE_NO_UTF8_CHECK option when
1933     calling \fBpcre_exec()\fP. You might want to do this for the second and
1934     subsequent calls to \fBpcre_exec()\fP if you are making repeated calls to find
1935     all the matches in a single subject string. However, you should be sure that
1936 ph10 856 the value of \fIstartoffset\fP points to the start of a character (or the end
1937     of the subject). When PCRE_NO_UTF8_CHECK is set, the effect of passing an
1938     invalid string as a subject or an invalid value of \fIstartoffset\fP is
1939 ph10 567 undefined. Your program may crash.
1940 nigel 75 .sp
1941 ph10 461 PCRE_PARTIAL_HARD
1942 ph10 428 PCRE_PARTIAL_SOFT
1943 nigel 75 .sp
1944 ph10 428 These options turn on the partial matching feature. For backwards
1945     compatibility, PCRE_PARTIAL is a synonym for PCRE_PARTIAL_SOFT. A partial match
1946     occurs if the end of the subject string is reached successfully, but there are
1947     not enough subject characters to complete the match. If this happens when
1948 ph10 553 PCRE_PARTIAL_SOFT (but not PCRE_PARTIAL_HARD) is set, matching continues by
1949     testing any remaining alternatives. Only if no complete match can be found is
1950     PCRE_ERROR_PARTIAL returned instead of PCRE_ERROR_NOMATCH. In other words,
1951     PCRE_PARTIAL_SOFT says that the caller is prepared to handle a partial match,
1952     but only if no complete match can be found.
1953     .P
1954     If PCRE_PARTIAL_HARD is set, it overrides PCRE_PARTIAL_SOFT. In this case, if a
1955     partial match is found, \fBpcre_exec()\fP immediately returns
1956     PCRE_ERROR_PARTIAL, without considering any other alternatives. In other words,
1957 ph10 572 when PCRE_PARTIAL_HARD is set, a partial match is considered to be more
1958 ph10 553 important than an alternative complete match.
1959     .P
1960     In both cases, the portion of the string that was inspected when the partial
1961     match was found is set as the first matching string. There is a more detailed
1962     discussion of partial and multi-segment matching, with examples, in the
1963 nigel 75 .\" HREF
1964     \fBpcrepartial\fP
1965     .\"
1966     documentation.
1967     .
1968 ph10 567 .
1969 nigel 75 .SS "The string to be matched by \fBpcre_exec()\fP"
1970     .rs
1971     .sp
1972     The subject string is passed to \fBpcre_exec()\fP as a pointer in
1973 ph10 1328 \fIsubject\fP, a length in \fIlength\fP, and a starting offset in
1974     \fIstartoffset\fP. The units for \fIlength\fP and \fIstartoffset\fP are bytes
1975     for the 8-bit library, 16-bit data items for the 16-bit library, and 32-bit
1976     data items for the 32-bit library.
1977 nigel 75 .P
1978 ph10 1328 If \fIstartoffset\fP is negative or greater than the length of the subject,
1979     \fBpcre_exec()\fP returns PCRE_ERROR_BADOFFSET. When the starting offset is
1980     zero, the search for a match starts at the beginning of the subject, and this
1981     is by far the most common case. In UTF-8 or UTF-16 mode, the offset must point
1982 ph10 1335 to the start of a character, or the end of the subject (in UTF-32 mode, one
1983 ph10 1328 data unit equals one character, so all offsets are valid). Unlike the pattern
1984     string, the subject may contain binary zeroes.
1985     .P
1986 nigel 63 A non-zero starting offset is useful when searching for another match in the
1987 nigel 75 same subject by calling \fBpcre_exec()\fP again after a previous success.
1988     Setting \fIstartoffset\fP differs from just passing over a shortened string and
1989 nigel 63 setting PCRE_NOTBOL in the case of a pattern that begins with any kind of
1990     lookbehind. For example, consider the pattern
1991 nigel 75 .sp
1992     \eBiss\eB
1993     .sp
1994     which finds occurrences of "iss" in the middle of words. (\eB matches only if
1995 nigel 63 the current position in the subject is not a word boundary.) When applied to
1996 nigel 75 the string "Mississippi" the first call to \fBpcre_exec()\fP finds the first
1997     occurrence. If \fBpcre_exec()\fP is called again with just the remainder of the
1998     subject, namely "issippi", it does not match, because \eB is always false at the
1999 nigel 63 start of the subject, which is deemed to be a word boundary. However, if
2000 nigel 75 \fBpcre_exec()\fP is passed the entire string again, but with \fIstartoffset\fP
2001 nigel 63 set to 4, it finds the second occurrence of "iss" because it is able to look
2002     behind the starting point to discover that it is preceded by a letter.
2003 nigel 75 .P
2004 ph10 567 Finding all the matches in a subject is tricky when the pattern can match an
2005     empty string. It is possible to emulate Perl's /g behaviour by first trying the
2006     match again at the same offset, with the PCRE_NOTEMPTY_ATSTART and
2007     PCRE_ANCHORED options, and then if that fails, advancing the starting offset
2008     and trying an ordinary match again. There is some code that demonstrates how to
2009     do this in the
2010     .\" HREF
2011     \fBpcredemo\fP
2012     .\"
2013 ph10 572 sample program. In the most general case, you have to check to see if the
2014     newline convention recognizes CRLF as a newline, and if so, and the current
2015 ph10 567 character is CR followed by LF, advance the starting offset by two characters
2016     instead of one.
2017     .P
2018 nigel 63 If a non-zero starting offset is passed when the pattern is anchored, one
2019 nigel 75 attempt to match at the given offset is made. This can only succeed if the
2020 nigel 63 pattern does not require the match to be at the start of the subject.
2021 nigel 75 .
2022 ph10 567 .
2023 nigel 75 .SS "How \fBpcre_exec()\fP returns captured substrings"
2024     .rs
2025     .sp
2026 nigel 63 In general, a pattern matches a certain portion of the subject, and in
2027     addition, further substrings from the subject may be picked out by parts of the
2028     pattern. Following the usage in Jeffrey Friedl's book, this is called
2029     "capturing" in what follows, and the phrase "capturing subpattern" is used for
2030     a fragment of a pattern that picks out a substring. PCRE supports several other
2031     kinds of parenthesized subpattern that do not cause substrings to be captured.
2032 nigel 75 .P
2033 ph10 368 Captured substrings are returned to the caller via a vector of integers whose
2034     address is passed in \fIovector\fP. The number of elements in the vector is
2035     passed in \fIovecsize\fP, which must be a non-negative number. \fBNote\fP: this
2036     argument is NOT the size of \fIovector\fP in bytes.
2037 nigel 75 .P
2038     The first two-thirds of the vector is used to pass back captured substrings,
2039     each substring using a pair of integers. The remaining third of the vector is
2040     used as workspace by \fBpcre_exec()\fP while matching capturing subpatterns,
2041 ph10 368 and is not available for passing back information. The number passed in
2042 nigel 75 \fIovecsize\fP should always be a multiple of three. If it is not, it is
2043     rounded down.
2044     .P
2045     When a match is successful, information about captured substrings is returned
2046     in pairs of integers, starting at the beginning of \fIovector\fP, and
2047 ph10 371 continuing up to two-thirds of its length at the most. The first element of
2048 ph10 1328 each pair is set to the offset of the first character in a substring, and the
2049     second is set to the offset of the first character after the end of a
2050     substring. These values are always data unit offsets, even in UTF mode. They
2051     are byte offsets in the 8-bit library, 16-bit data item offsets in the 16-bit
2052     library, and 32-bit data item offsets in the 32-bit library. \fBNote\fP: they
2053     are not character counts.
2054 nigel 75 .P
2055 ph10 368 The first pair of integers, \fIovector[0]\fP and \fIovector[1]\fP, identify the
2056     portion of the subject string matched by the entire pattern. The next pair is
2057     used for the first capturing subpattern, and so on. The value returned by
2058     \fBpcre_exec()\fP is one more than the highest numbered pair that has been set.
2059     For example, if two substrings have been captured, the returned value is 3. If
2060     there are no capturing subpatterns, the return value from a successful match is
2061     1, indicating that just the first pair of offsets has been set.
2062     .P
2063 nigel 63 If a capturing subpattern is matched repeatedly, it is the last portion of the
2064 nigel 75 string that it matched that is returned.
2065     .P
2066     If the vector is too small to hold all the captured substring offsets, it is
2067     used as far as possible (up to two-thirds of its length), and the function
2068 ph10 950 returns a value of zero. If neither the actual string matched nor any captured
2069 ph10 686 substrings are of interest, \fBpcre_exec()\fP may be called with \fIovector\fP
2070     passed as NULL and \fIovecsize\fP as zero. However, if the pattern contains
2071     back references and the \fIovector\fP is not big enough to remember the related
2072     substrings, PCRE has to get additional memory for use during matching. Thus it
2073     is usually advisable to supply an \fIovector\fP of reasonable size.
2074 nigel 75 .P
2075 ph10 686 There are some cases where zero is returned (indicating vector overflow) when
2076     in fact the vector is exactly the right size for the final match. For example,
2077     consider the pattern
2078     .sp
2079     (a)(?:(b)c|bd)
2080     .sp
2081     If a vector of 6 elements (allowing for only 1 captured substring) is given
2082     with subject string "abd", \fBpcre_exec()\fP will try to set the second
2083     captured string, thereby recording a vector overflow, before failing to match
2084 ph10 691 "c" and backing up to try the second alternative. The zero return, however,
2085 ph10 686 does correctly indicate that the maximum number of slots (namely 2) have been
2086     filled. In similar cases where there is temporary overflow, but the final
2087     number of used slots is actually less than the maximum, a non-zero value is
2088     returned.
2089     .P
2090 ph10 456 The \fBpcre_fullinfo()\fP function can be used to find out how many capturing
2091 nigel 63 subpatterns there are in a compiled pattern. The smallest size for
2092 nigel 75 \fIovector\fP that will allow for \fIn\fP captured substrings, in addition to
2093     the offsets of the substring matched by the whole pattern, is (\fIn\fP+1)*3.
2094 nigel 91 .P
2095     It is possible for capturing subpattern number \fIn+1\fP to match some part of
2096     the subject when subpattern \fIn\fP has not been used at all. For example, if
2097     the string "abc" is matched against the pattern (a|(z))(bc) the return from the
2098     function is 4, and subpatterns 1 and 3 are matched, but 2 is not. When this
2099     happens, both values in the offset pairs corresponding to unused subpatterns
2100     are set to -1.
2101     .P
2102     Offset values that correspond to unused subpatterns at the end of the
2103     expression are also set to -1. For example, if the string "abc" is matched
2104     against the pattern (abc)(x(yz)?)? subpatterns 2 and 3 are not matched. The
2105     return from the function is 2, because the highest used capturing subpattern
2106 ph10 568 number is 1, and the offsets for the second and third capturing subpatterns
2107     (assuming the vector is large enough, of course) are set to -1.
2108 nigel 91 .P
2109 ph10 686 \fBNote\fP: Elements in the first two-thirds of \fIovector\fP that do not
2110 ph10 683 correspond to capturing parentheses in the pattern are never changed. That is,
2111     if a pattern contains \fIn\fP capturing parentheses, no more than
2112     \fIovector[0]\fP to \fIovector[2n+1]\fP are set by \fBpcre_exec()\fP. The other
2113     elements (in the first two-thirds) retain whatever values they previously had.
2114 ph10 568 .P
2115 nigel 91 Some convenience functions are provided for extracting the captured substrings
2116     as separate strings. These are described below.
2117 nigel 75 .
2118 ph10 598 .
2119 nigel 77 .\" HTML <a name="errorlist"></a>
2120 nigel 91 .SS "Error return values from \fBpcre_exec()\fP"
2121 nigel 75 .rs
2122     .sp
2123     If \fBpcre_exec()\fP fails, it returns a negative number. The following are
2124 nigel 63 defined in the header file:
2125 nigel 75 .sp
2126 nigel 63 PCRE_ERROR_NOMATCH (-1)
2127 nigel 75 .sp
2128 nigel 63 The subject string did not match the pattern.
2129 nigel 75 .sp
2130 nigel 63 PCRE_ERROR_NULL (-2)
2131 nigel 75 .sp
2132     Either \fIcode\fP or \fIsubject\fP was passed as NULL, or \fIovector\fP was
2133     NULL and \fIovecsize\fP was not zero.
2134     .sp
2135 nigel 63 PCRE_ERROR_BADOPTION (-3)
2136 nigel 75 .sp
2137     An unrecognized bit was set in the \fIoptions\fP argument.
2138     .sp
2139 nigel 63 PCRE_ERROR_BADMAGIC (-4)
2140 nigel 75 .sp
2141 nigel 63 PCRE stores a 4-byte "magic number" at the start of the compiled code, to catch
2142 nigel 75 the case when it is passed a junk pointer and to detect when a pattern that was
2143     compiled in an environment of one endianness is run in an environment with the
2144     other endianness. This is the error that PCRE gives when the magic number is
2145     not present.
2146     .sp
2147 nigel 93 PCRE_ERROR_UNKNOWN_OPCODE (-5)
2148 nigel 75 .sp
2149 nigel 63 While running the pattern match, an unknown item was encountered in the
2150     compiled pattern. This error could be caused by a bug in PCRE or by overwriting
2151     of the compiled pattern.
2152 nigel 75 .sp
2153 nigel 63 PCRE_ERROR_NOMEMORY (-6)
2154 nigel 75 .sp
2155     If a pattern contains back references, but the \fIovector\fP that is passed to
2156     \fBpcre_exec()\fP is not big enough to remember the referenced substrings, PCRE
2157 nigel 63 gets a block of memory at the start of matching to use for this purpose. If the
2158 nigel 75 call via \fBpcre_malloc()\fP fails, this error is given. The memory is
2159     automatically freed at the end of matching.
2160 ph10 531 .P
2161 ph10 535 This error is also given if \fBpcre_stack_malloc()\fP fails in
2162 ph10 531 \fBpcre_exec()\fP. This can happen only when PCRE has been compiled with
2163     \fB--disable-stack-for-recursion\fP.
2164 nigel 75 .sp
2165 nigel 63 PCRE_ERROR_NOSUBSTRING (-7)
2166 nigel 75 .sp
2167     This error is used by the \fBpcre_copy_substring()\fP,
2168     \fBpcre_get_substring()\fP, and \fBpcre_get_substring_list()\fP functions (see
2169     below). It is never returned by \fBpcre_exec()\fP.
2170     .sp
2171 nigel 63 PCRE_ERROR_MATCHLIMIT (-8)
2172 nigel 75 .sp
2173 nigel 87 The backtracking limit, as specified by the \fImatch_limit\fP field in a
2174     \fBpcre_extra\fP structure (or defaulted) was reached. See the description
2175     above.
2176     .sp
2177 nigel 63 PCRE_ERROR_CALLOUT (-9)
2178 nigel 75 .sp
2179     This error is never generated by \fBpcre_exec()\fP itself. It is provided for
2180 nigel 63 use by callout functions that want to yield a distinctive error code. See the
2181 nigel 75 .\" HREF
2182     \fBpcrecallout\fP
2183     .\"
2184     documentation for details.
2185     .sp
2186 nigel 73 PCRE_ERROR_BADUTF8 (-10)
2187 nigel 75 .sp
2188 ph10 598 A string that contains an invalid UTF-8 byte sequence was passed as a subject,
2189     and the PCRE_NO_UTF8_CHECK option was not set. If the size of the output vector
2190     (\fIovecsize\fP) is at least 2, the byte offset to the start of the invalid
2191     UTF-8 character is placed in the first element, and a reason code is placed in
2192     the second element. The reason codes are listed in the
2193     .\" HTML <a href="#badutf8reasons">
2194     .\" </a>
2195     following section.
2196     .\"
2197     For backward compatibility, if PCRE_PARTIAL_HARD is set and the problem is a
2198     truncated UTF-8 character at the end of the subject (reason codes 1 to 5),
2199     PCRE_ERROR_SHORTUTF8 is returned instead of PCRE_ERROR_BADUTF8.
2200 nigel 75 .sp
2201 nigel 73 PCRE_ERROR_BADUTF8_OFFSET (-11)
2202 nigel 75 .sp
2203 ph10 654 The UTF-8 byte sequence that was passed as a subject was checked and found to
2204 ph10 598 be valid (the PCRE_NO_UTF8_CHECK option was not set), but the value of
2205     \fIstartoffset\fP did not point to the beginning of a UTF-8 character or the
2206 ph10 569 end of the subject.
2207 nigel 75 .sp
2208 nigel 77 PCRE_ERROR_PARTIAL (-12)
2209 nigel 75 .sp
2210     The subject string did not match, but it did match partially. See the
2211     .\" HREF
2212     \fBpcrepartial\fP
2213     .\"
2214     documentation for details of partial matching.
2215     .sp
2216 nigel 77 PCRE_ERROR_BADPARTIAL (-13)
2217 nigel 75 .sp
2218 ph10 426 This code is no longer in use. It was formerly returned when the PCRE_PARTIAL
2219     option was used with a compiled pattern containing items that were not
2220 ph10 461 supported for partial matching. From release 8.00 onwards, there are no
2221 ph10 426 restrictions on partial matching.
2222 nigel 75 .sp
2223 nigel 77 PCRE_ERROR_INTERNAL (-14)
2224 nigel 75 .sp
2225     An unexpected internal error has occurred. This error could be caused by a bug
2226     in PCRE or by overwriting of the compiled pattern.
2227     .sp
2228 nigel 77 PCRE_ERROR_BADCOUNT (-15)
2229 nigel 75 .sp
2230     This error is given if the value of the \fIovecsize\fP argument is negative.
2231 nigel 93 .sp
2232     PCRE_ERROR_RECURSIONLIMIT (-21)
2233     .sp
2234     The internal recursion limit, as specified by the \fImatch_limit_recursion\fP
2235     field in a \fBpcre_extra\fP structure (or defaulted) was reached. See the
2236     description above.
2237     .sp
2238     PCRE_ERROR_BADNEWLINE (-23)
2239     .sp
2240     An invalid combination of PCRE_NEWLINE_\fIxxx\fP options was given.
2241 ph10 567 .sp
2242     PCRE_ERROR_BADOFFSET (-24)
2243     .sp
2244 ph10 572 The value of \fIstartoffset\fP was negative or greater than the length of the
2245 ph10 567 subject, that is, the value in \fIlength\fP.
2246 ph10 569 .sp
2247     PCRE_ERROR_SHORTUTF8 (-25)
2248     .sp
2249 ph10 598 This error is returned instead of PCRE_ERROR_BADUTF8 when the subject string
2250     ends with a truncated UTF-8 character and the PCRE_PARTIAL_HARD option is set.
2251     Information about the failure is returned as for PCRE_ERROR_BADUTF8. It is in
2252     fact sufficient to detect this case, but this special error code for
2253     PCRE_PARTIAL_HARD precedes the implementation of returned information; it is
2254     retained for backwards compatibility.
2255 ph10 642 .sp
2256     PCRE_ERROR_RECURSELOOP (-26)
2257     .sp
2258 ph10 654 This error is returned when \fBpcre_exec()\fP detects a recursion loop within
2259     the pattern. Specifically, it means that either the whole pattern or a
2260     subpattern has been called recursively for the second time at the same position
2261 ph10 642 in the subject string. Some simple patterns that might do this are detected and
2262     faulted at compile time, but more complicated cases, in particular mutual
2263     recursions between two different subpatterns, cannot be detected until run
2264     time.
2265 ph10 678 .sp
2266     PCRE_ERROR_JIT_STACKLIMIT (-27)
2267     .sp
2268 ph10 922 This error is returned when a pattern that was successfully studied using a
2269     JIT compile option is being matched, but the memory available for the
2270     just-in-time processing stack is not large enough. See the
2271 ph10 678 .\" HREF
2272     \fBpcrejit\fP
2273     .\"
2274 ph10 691 documentation for more details.
2275 ph10 856 .sp
2276 ph10 960 PCRE_ERROR_BADMODE (-28)
2277 ph10 856 .sp
2278 ph10 903 This error is given if a pattern that was compiled by the 8-bit library is
2279 chpe 1055 passed to a 16-bit or 32-bit library function, or vice versa.
2280 ph10 856 .sp
2281 ph10 960 PCRE_ERROR_BADENDIANNESS (-29)
2282 ph10 903 .sp
2283     This error is given if a pattern that was compiled and saved is reloaded on a
2284     host with different endianness. The utility function
2285     \fBpcre_pattern_to_host_byte_order()\fP can be used to convert such a pattern
2286 ph10 856 so that it runs on the new host.
2287 ph10 1191 .sp
2288 ph10 1194 PCRE_ERROR_JIT_BADOPTION (-31)
2289     .sp
2290     This error is returned when a pattern that was successfully studied using a JIT
2291     compile option is being matched, but the matching mode (partial or complete
2292 ph10 1221 match) does not correspond to any JIT compilation mode. When the JIT fast path
2293 ph10 1194 function is used, this error may also be given for invalid options. See the
2294     .\" HREF
2295     \fBpcrejit\fP
2296     .\"
2297     documentation for more details.
2298     .sp
2299 ph10 1191 PCRE_ERROR_BADLENGTH (-32)
2300     .sp
2301 ph10 1221 This error is given if \fBpcre_exec()\fP is called with a negative value for
2302     the \fIlength\fP argument.
2303 nigel 93 .P
2304 ph10 1194 Error numbers -16 to -20, -22, and -30 are not used by \fBpcre_exec()\fP.
2305 nigel 75 .
2306     .
2307 ph10 598 .\" HTML <a name="badutf8reasons"></a>
2308     .SS "Reason codes for invalid UTF-8 strings"
2309     .rs
2310     .sp
2311 ph10 903 This section applies only to the 8-bit library. The corresponding information
2312 ph10 1214 for the 16-bit and 32-bit libraries is given in the
2313 ph10 856 .\" HREF
2314     \fBpcre16\fP
2315     .\"
2316 ph10 1214 and
2317 chpe 1055 .\" HREF
2318     \fBpcre32\fP
2319     .\"
2320 ph10 1221 pages.
2321 ph10 856 .P
2322 ph10 654 When \fBpcre_exec()\fP returns either PCRE_ERROR_BADUTF8 or
2323     PCRE_ERROR_SHORTUTF8, and the size of the output vector (\fIovecsize\fP) is at
2324     least 2, the offset of the start of the invalid UTF-8 character is placed in
2325     the first output vector element (\fIovector[0]\fP) and a reason code is placed
2326 ph10 598 in the second element (\fIovector[1]\fP). The reason codes are given names in
2327     the \fBpcre.h\fP header file:
2328     .sp
2329     PCRE_UTF8_ERR1
2330     PCRE_UTF8_ERR2
2331     PCRE_UTF8_ERR3
2332     PCRE_UTF8_ERR4
2333     PCRE_UTF8_ERR5
2334     .sp
2335 ph10 654 The string ends with a truncated UTF-8 character; the code specifies how many
2336 ph10 598 bytes are missing (1 to 5). Although RFC 3629 restricts UTF-8 characters to be
2337     no longer than 4 bytes, the encoding scheme (originally defined by RFC 2279)
2338 ph10 654 allows for up to 6 bytes, and this is checked first; hence the possibility of
2339 ph10 598 4 or 5 missing bytes.
2340     .sp
2341     PCRE_UTF8_ERR6
2342     PCRE_UTF8_ERR7
2343     PCRE_UTF8_ERR8
2344     PCRE_UTF8_ERR9
2345     PCRE_UTF8_ERR10
2346     .sp
2347 ph10 654 The two most significant bits of the 2nd, 3rd, 4th, 5th, or 6th byte of the
2348 ph10 598 character do not have the binary value 0b10 (that is, either the most
2349     significant bit is 0, or the next bit is 1).
2350 ph10 654 .sp
2351 ph10 598 PCRE_UTF8_ERR11
2352     PCRE_UTF8_ERR12
2353     .sp
2354 ph10 654 A character that is valid by the RFC 2279 rules is either 5 or 6 bytes long;
2355     these code points are excluded by RFC 3629.
2356     .sp
2357 ph10 598 PCRE_UTF8_ERR13
2358     .sp
2359 ph10 654 A 4-byte character has a value greater than 0x10ffff; these code points are
2360 ph10 598 excluded by RFC 3629.
2361 ph10 654 .sp
2362 ph10 598 PCRE_UTF8_ERR14
2363     .sp
2364     A 3-byte character has a value in the range 0xd800 to 0xdfff; this range of
2365 ph10 654 code points is reserved by RFC 3629 for use with UTF-16, and so is excluded
2366 ph10 598 from UTF-8.
2367 ph10 654 .sp
2368 ph10 598 PCRE_UTF8_ERR15
2369     PCRE_UTF8_ERR16
2370     PCRE_UTF8_ERR17
2371     PCRE_UTF8_ERR18
2372     PCRE_UTF8_ERR19
2373     .sp
2374 ph10 654 A 2-, 3-, 4-, 5-, or 6-byte character is "overlong", that is, it codes for a
2375     value that can be represented by fewer bytes, which is invalid. For example,
2376 ph10 598 the two bytes 0xc0, 0xae give the value 0x2e, whose correct coding uses just
2377     one byte.
2378     .sp
2379     PCRE_UTF8_ERR20
2380     .sp
2381 ph10 654 The two most significant bits of the first byte of a character have the binary
2382     value 0b10 (that is, the most significant bit is 1 and the second is 0). Such a
2383 ph10 598 byte can only validly occur as the second or subsequent byte of a multi-byte
2384     character.
2385     .sp
2386     PCRE_UTF8_ERR21
2387     .sp
2388     The first byte of a character has the value 0xfe or 0xff. These values can
2389     never occur in a valid UTF-8 string.
2390 chpe 1098 .sp
2391 chpe 1262 PCRE_UTF8_ERR22
2392 chpe 1098 .sp
2393 ph10 1261 This error code was formerly used when the presence of a so-called
2394 ph10 1314 "non-character" caused an error. Unicode corrigendum #9 makes it clear that
2395     such characters should not cause a string to be rejected, and so this code is
2396 ph10 1261 no longer in use and is never returned.
2397 ph10 598 .
2398     .
2399 nigel 75 .SH "EXTRACTING CAPTURED SUBSTRINGS BY NUMBER"
2400 nigel 63 .rs
2401     .sp
2402 ph10 1339 .nf
2403 nigel 75 .B int pcre_copy_substring(const char *\fIsubject\fP, int *\fIovector\fP,
2404 ph10 1339 .B " int \fIstringcount\fP, int \fIstringnumber\fP, char *\fIbuffer\fP,"
2405     .B " int \fIbuffersize\fP);"
2406     .sp
2407 nigel 75 .B int pcre_get_substring(const char *\fIsubject\fP, int *\fIovector\fP,
2408 ph10 1339 .B " int \fIstringcount\fP, int \fIstringnumber\fP,"
2409     .B " const char **\fIstringptr\fP);"
2410     .sp
2411 nigel 75 .B int pcre_get_substring_list(const char *\fIsubject\fP,
2412 ph10 1339 .B " int *\fIovector\fP, int \fIstringcount\fP, const char ***\fIlistptr\fP);"
2413     .fi
2414 nigel 63 .PP
2415     Captured substrings can be accessed directly by using the offsets returned by
2416 nigel 75 \fBpcre_exec()\fP in \fIovector\fP. For convenience, the functions
2417     \fBpcre_copy_substring()\fP, \fBpcre_get_substring()\fP, and
2418     \fBpcre_get_substring_list()\fP are provided for extracting captured substrings
2419 nigel 63 as new, separate, zero-terminated strings. These functions identify substrings
2420     by number. The next section describes functions for extracting named
2421 nigel 91 substrings.
2422 nigel 75 .P
2423 nigel 91 A substring that contains a binary zero is correctly extracted and has a
2424     further zero added on the end, but the result is not, of course, a C string.
2425     However, you can process such a string by referring to the length that is
2426     returned by \fBpcre_copy_substring()\fP and \fBpcre_get_substring()\fP.
2427     Unfortunately, the interface to \fBpcre_get_substring_list()\fP is not adequate
2428     for handling strings containing binary zeros, because the end of the final
2429     string is not independently indicated.
2430     .P
2431 nigel 63 The first three arguments are the same for all three of these functions:
2432 nigel 75 \fIsubject\fP is the subject string that has just been successfully matched,
2433     \fIovector\fP is a pointer to the vector of integer offsets that was passed to
2434     \fBpcre_exec()\fP, and \fIstringcount\fP is the number of substrings that were
2435 nigel 63 captured by the match, including the substring that matched the entire regular
2436 nigel 75 expression. This is the value returned by \fBpcre_exec()\fP if it is greater
2437     than zero. If \fBpcre_exec()\fP returned zero, indicating that it ran out of
2438     space in \fIovector\fP, the value passed as \fIstringcount\fP should be the
2439     number of elements in the vector divided by three.
2440     .P
2441     The functions \fBpcre_copy_substring()\fP and \fBpcre_get_substring()\fP
2442     extract a single substring, whose number is given as \fIstringnumber\fP. A
2443     value of zero extracts the substring that matched the entire pattern, whereas
2444     higher values extract the captured substrings. For \fBpcre_copy_substring()\fP,
2445     the string is placed in \fIbuffer\fP, whose length is given by
2446     \fIbuffersize\fP, while for \fBpcre_get_substring()\fP a new block of memory is
2447     obtained via \fBpcre_malloc\fP, and its address is returned via
2448     \fIstringptr\fP. The yield of the function is the length of the string, not
2449 nigel 93 including the terminating zero, or one of these error codes:
2450 nigel 75 .sp
2451 nigel 63 PCRE_ERROR_NOMEMORY (-6)
2452 nigel 75 .sp
2453     The buffer was too small for \fBpcre_copy_substring()\fP, or the attempt to get
2454     memory failed for \fBpcre_get_substring()\fP.
2455     .sp
2456 nigel 63 PCRE_ERROR_NOSUBSTRING (-7)
2457 nigel 75 .sp
2458     There is no substring whose number is \fIstringnumber\fP.
2459     .P
2460     The \fBpcre_get_substring_list()\fP function extracts all available substrings
2461 nigel 63 and builds a list of pointers to them. All this is done in a single block of
2462 nigel 75 memory that is obtained via \fBpcre_malloc\fP. The address of the memory block
2463     is returned via \fIlistptr\fP, which is also the start of the list of string
2464 nigel 63 pointers. The end of the list is marked by a NULL pointer. The yield of the
2465 nigel 93 function is zero if all went well, or the error code
2466 nigel 75 .sp
2467 nigel 63 PCRE_ERROR_NOMEMORY (-6)
2468 nigel 75 .sp
2469 nigel 63 if the attempt to get the memory block failed.
2470 nigel 75 .P
2471 nigel 63 When any of these functions encounter a substring that is unset, which can
2472 nigel 75 happen when capturing subpattern number \fIn+1\fP matches some part of the
2473     subject, but subpattern \fIn\fP has not been used at all, they return an empty
2474 nigel 63 string. This can be distinguished from a genuine zero-length substring by
2475 nigel 75 inspecting the appropriate offset in \fIovector\fP, which is negative for unset
2476 nigel 63 substrings.
2477 nigel 75 .P
2478     The two convenience functions \fBpcre_free_substring()\fP and
2479     \fBpcre_free_substring_list()\fP can be used to free the memory returned by
2480     a previous call of \fBpcre_get_substring()\fP or
2481     \fBpcre_get_substring_list()\fP, respectively. They do nothing more than call
2482     the function pointed to by \fBpcre_free\fP, which of course could be called
2483 nigel 63 directly from a C program. However, PCRE is used in some situations where it is
2484 nigel 91 linked via a special interface to another programming language that cannot use
2485 nigel 75 \fBpcre_free\fP directly; it is for these cases that the functions are
2486 nigel 63 provided.
2487 nigel 75 .
2488     .
2489     .SH "EXTRACTING CAPTURED SUBSTRINGS BY NAME"
2490 nigel 63 .rs
2491     .sp
2492 ph10 1339 .nf
2493 nigel 75 .B int pcre_get_stringnumber(const pcre *\fIcode\fP,
2494 ph10 1339 .B " const char *\fIname\fP);"
2495     .sp
2496 nigel 75 .B int pcre_copy_named_substring(const pcre *\fIcode\fP,
2497 ph10 1339 .B " const char *\fIsubject\fP, int *\fIovector\fP,"
2498     .B " int \fIstringcount\fP, const char *\fIstringname\fP,"
2499     .B " char *\fIbuffer\fP, int \fIbuffersize\fP);"
2500     .sp
2501 nigel 75 .B int pcre_get_named_substring(const pcre *\fIcode\fP,
2502 ph10 1339 .B " const char *\fIsubject\fP, int *\fIovector\fP,"
2503     .B " int \fIstringcount\fP, const char *\fIstringname\fP,"
2504     .B " const char **\fIstringptr\fP);"
2505     .fi
2506 nigel 63 .PP
2507 nigel 75 To extract a substring by name, you first have to find its associated number.
2508     For example, for this pattern
2509     .sp
2510 nigel 93 (a+)b(?<xxx>\ed+)...
2511 nigel 75 .sp
2512 nigel 91 the number of the subpattern called "xxx" is 2. If the name is known to be
2513     unique (PCRE_DUPNAMES was not set), you can find the number from the name by
2514     calling \fBpcre_get_stringnumber()\fP. The first argument is the compiled
2515     pattern, and the second is the name. The yield of the function is the
2516 nigel 75 subpattern number, or PCRE_ERROR_NOSUBSTRING (-7) if there is no subpattern of
2517     that name.
2518     .P
2519     Given the number, you can extract the substring directly, or use one of the
2520     functions described in the previous section. For convenience, there are also
2521     two functions that do the whole job.
2522     .P
2523 nigel 91 Most of the arguments of \fBpcre_copy_named_substring()\fP and
2524     \fBpcre_get_named_substring()\fP are the same as those for the similarly named
2525 nigel 75 functions that extract by number. As these are described in the previous
2526     section, they are not re-described here. There are just two differences:
2527     .P
2528 nigel 63 First, instead of a substring number, a substring name is given. Second, there
2529     is an extra argument, given at the start, which is a pointer to the compiled
2530     pattern. This is needed in order to gain access to the name-to-number
2531     translation table.
2532 nigel 75 .P
2533     These functions call \fBpcre_get_stringnumber()\fP, and if it succeeds, they
2534 ph10 127 then call \fBpcre_copy_substring()\fP or \fBpcre_get_substring()\fP, as
2535     appropriate. \fBNOTE:\fP If PCRE_DUPNAMES is set and there are duplicate names,
2536 ph10 128 the behaviour may not be what you want (see the next section).
2537 ph10 385 .P
2538 ph10 457 \fBWarning:\fP If the pattern uses the (?| feature to set up multiple
2539     subpatterns with the same number, as described in the
2540     .\" HTML <a href="pcrepattern.html#dupsubpatternnumber">
2541     .\" </a>
2542     section on duplicate subpattern numbers
2543     .\"
2544     in the
2545     .\" HREF
2546     \fBpcrepattern\fP
2547     .\"
2548     page, you cannot use names to distinguish the different subpatterns, because
2549     names are not included in the compiled code. The matching process uses only
2550     numbers. For this reason, the use of different names for subpatterns of the
2551     same number causes an error at compile time.
2552 nigel 77 .
2553 ph10 686 .
2554 nigel 91 .SH "DUPLICATE SUBPATTERN NAMES"
2555     .rs
2556     .sp
2557 ph10 1339 .nf
2558 nigel 91 .B int pcre_get_stringtable_entries(const pcre *\fIcode\fP,
2559 ph10 1339 .B " const char *\fIname\fP, char **\fIfirst\fP, char **\fIlast\fP);"
2560     .fi
2561 nigel 91 .PP
2562     When a pattern is compiled with the PCRE_DUPNAMES option, names for subpatterns
2563 ph10 457 are not required to be unique. (Duplicate names are always allowed for
2564     subpatterns with the same number, created by using the (?| feature. Indeed, if
2565     such subpatterns are named, they are required to use the same names.)
2566     .P
2567     Normally, patterns with duplicate names are such that in any one match, only
2568     one of the named subpatterns participates. An example is shown in the
2569 nigel 91 .\" HREF
2570     \fBpcrepattern\fP
2571     .\"
2572 ph10 208 documentation.
2573 ph10 203 .P
2574     When duplicates are present, \fBpcre_copy_named_substring()\fP and
2575     \fBpcre_get_named_substring()\fP return the first substring corresponding to
2576     the given name that is set. If none are set, PCRE_ERROR_NOSUBSTRING (-7) is
2577     returned; no data is returned. The \fBpcre_get_stringnumber()\fP function
2578     returns one of the numbers that are associated with the name, but it is not
2579     defined which it is.
2580     .P
2581 nigel 91 If you want to get full details of all captured substrings for a given name,
2582     you must use the \fBpcre_get_stringtable_entries()\fP function. The first
2583     argument is the compiled pattern, and the second is the name. The third and
2584     fourth are pointers to variables which are updated by the function. After it
2585     has run, they point to the first and last entries in the name-to-number table
2586     for the given name. The function itself returns the length of each entry, or
2587 nigel 93 PCRE_ERROR_NOSUBSTRING (-7) if there are none. The format of the table is
2588 ph10 598 described in the section entitled \fIInformation about a pattern\fP
2589     .\" HTML <a href="#infoaboutpattern">
2590     .\" </a>
2591     above.
2592     .\"
2593 nigel 93 Given all the relevant entries for the name, you can extract each of their
2594     numbers, and hence the captured data, if any.
2595 nigel 91 .
2596     .
2597 nigel 77 .SH "FINDING ALL POSSIBLE MATCHES"
2598     .rs
2599     .sp
2600     The traditional matching function uses a similar algorithm to Perl, which stops
2601     when it finds the first match, starting at a given point in the subject. If you
2602     want to find all possible matches, or the longest possible match, consider
2603     using the alternative matching function (see below) instead. If you cannot use
2604     the alternative function, but still need to find all possible matches, you
2605     can kludge it up by making use of the callout facility, which is described in
2606     the
2607     .\" HREF
2608     \fBpcrecallout\fP
2609     .\"
2610     documentation.
2611 nigel 75 .P
2612 nigel 77 What you have to do is to insert a callout right at the end of the pattern.
2613     When your callout function is called, extract and save the current matched
2614     substring. Then return 1, which forces \fBpcre_exec()\fP to backtrack and try
2615     other alternatives. Ultimately, when it runs out of matches, \fBpcre_exec()\fP
2616     will yield PCRE_ERROR_NOMATCH.
2617     .
2618     .
2619 ph10 901 .SH "OBTAINING AN ESTIMATE OF STACK USAGE"
2620     .rs
2621     .sp
2622     Matching certain patterns using \fBpcre_exec()\fP can use a lot of process
2623 ph10 903 stack, which in certain environments can be rather limited in size. Some users
2624     find it helpful to have an estimate of the amount of stack that is used by
2625 ph10 901 \fBpcre_exec()\fP, to help them set recursion limits, as described in the
2626     .\" HREF
2627     \fBpcrestack\fP
2628     .\"
2629 ph10 903 documentation. The estimate that is output by \fBpcretest\fP when called with
2630     the \fB-m\fP and \fB-C\fP options is obtained by calling \fBpcre_exec()\fP with
2631 ph10 901 the values NULL, NULL, NULL, -999, and -999 for its first five arguments.
2632     .P
2633     Normally, if its first argument is NULL, \fBpcre_exec()\fP immediately returns
2634     the negative error code PCRE_ERROR_NULL, but with this special combination of
2635     arguments, it returns instead a negative number whose absolute value is the
2636     approximate stack frame size in bytes. (A negative number is used so that it is
2637     clear that no match has happened.) The value is approximate because in some
2638 ph10 903 cases, recursive calls to \fBpcre_exec()\fP occur when there are one or two
2639 ph10 901 additional variables on the stack.
2640     .P
2641 ph10 903 If PCRE has been compiled to use the heap instead of the stack for recursion,
2642 ph10 901 the value returned is the size of each block that is obtained from the heap.
2643     .
2644     .
2645 nigel 77 .\" HTML <a name="dfamatch"></a>
2646     .SH "MATCHING A PATTERN: THE ALTERNATIVE FUNCTION"
2647     .rs
2648     .sp
2649 ph10 1339 .nf
2650 nigel 77 .B int pcre_dfa_exec(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
2651 ph10 1339 .B " const char *\fIsubject\fP, int \fIlength\fP, int \fIstartoffset\fP,"
2652     .B " int \fIoptions\fP, int *\fIovector\fP, int \fIovecsize\fP,"
2653     .B " int *\fIworkspace\fP, int \fIwscount\fP);"
2654     .fi
2655 nigel 77 .P
2656     The function \fBpcre_dfa_exec()\fP is called to match a subject string against
2657 nigel 93 a compiled pattern, using a matching algorithm that scans the subject string
2658     just once, and does not backtrack. This has different characteristics to the
2659     normal algorithm, and is not compatible with Perl. Some of the features of PCRE
2660     patterns are not supported. Nevertheless, there are times when this kind of
2661 ph10 461 matching can be useful. For a discussion of the two matching algorithms, and a
2662 ph10 435 list of features that \fBpcre_dfa_exec()\fP does not support, see the
2663 nigel 77 .\" HREF
2664     \fBpcrematching\fP
2665     .\"
2666     documentation.
2667     .P
2668     The arguments for the \fBpcre_dfa_exec()\fP function are the same as for
2669     \fBpcre_exec()\fP, plus two extras. The \fIovector\fP argument is used in a
2670     different way, and this is described below. The other common arguments are used
2671     in the same way as for \fBpcre_exec()\fP, so their description is not repeated
2672     here.
2673     .P
2674     The two additional arguments provide workspace for the function. The workspace
2675     vector should contain at least 20 elements. It is used for keeping track of
2676     multiple paths through the pattern tree. More workspace will be needed for
2677 nigel 91 patterns and subjects where there are a lot of potential matches.
2678 nigel 77 .P
2679 nigel 87 Here is an example of a simple call to \fBpcre_dfa_exec()\fP:
2680 nigel 77 .sp
2681     int rc;
2682     int ovector[10];
2683     int wspace[20];
2684 nigel 87 rc = pcre_dfa_exec(
2685 nigel 77 re, /* result of pcre_compile() */
2686     NULL, /* we didn't study the pattern */
2687     "some string", /* the subject string */
2688     11, /* the length of the subject string */
2689     0, /* start at offset 0 in the subject */
2690     0, /* default options */
2691     ovector, /* vector of integers for substring information */
2692     10, /* number of elements (NOT size in bytes) */
2693     wspace, /* working space vector */
2694     20); /* number of elements (NOT size in bytes) */
2695     .
2696     .SS "Option bits for \fBpcre_dfa_exec()\fP"
2697     .rs
2698     .sp
2699     The unused bits of the \fIoptions\fP argument for \fBpcre_dfa_exec()\fP must be
2700 nigel 91 zero. The only bits that may be set are PCRE_ANCHORED, PCRE_NEWLINE_\fIxxx\fP,
2701 ph10 442 PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, PCRE_NOTEMPTY_ATSTART,
2702 ph10 542 PCRE_NO_UTF8_CHECK, PCRE_BSR_ANYCRLF, PCRE_BSR_UNICODE, PCRE_NO_START_OPTIMIZE,
2703     PCRE_PARTIAL_HARD, PCRE_PARTIAL_SOFT, PCRE_DFA_SHORTEST, and PCRE_DFA_RESTART.
2704     All but the last four of these are exactly the same as for \fBpcre_exec()\fP,
2705     so their description is not repeated here.
2706 nigel 77 .sp
2707 ph10 428 PCRE_PARTIAL_HARD
2708 ph10 461 PCRE_PARTIAL_SOFT
2709 nigel 77 .sp
2710 ph10 428 These have the same general effect as they do for \fBpcre_exec()\fP, but the
2711     details are slightly different. When PCRE_PARTIAL_HARD is set for
2712     \fBpcre_dfa_exec()\fP, it returns PCRE_ERROR_PARTIAL if the end of the subject
2713     is reached and there is still at least one matching possibility that requires
2714     additional characters. This happens even if some complete matches have also
2715     been found. When PCRE_PARTIAL_SOFT is set, the return code PCRE_ERROR_NOMATCH
2716     is converted into PCRE_ERROR_PARTIAL if the end of the subject is reached,
2717     there have been no complete matches, but there is still at least one matching
2718 ph10 435 possibility. The portion of the string that was inspected when the longest
2719     partial match was found is set as the first matching string in both cases.
2720 ph10 553 There is a more detailed discussion of partial and multi-segment matching, with
2721     examples, in the
2722     .\" HREF
2723     \fBpcrepartial\fP
2724     .\"
2725     documentation.
2726 nigel 77 .sp
2727     PCRE_DFA_SHORTEST
2728     .sp
2729     Setting the PCRE_DFA_SHORTEST option causes the matching algorithm to stop as
2730 nigel 93 soon as it has found one match. Because of the way the alternative algorithm
2731     works, this is necessarily the shortest possible match at the first possible
2732     matching point in the subject string.
2733 nigel 77 .sp
2734     PCRE_DFA_RESTART
2735     .sp
2736 ph10 428 When \fBpcre_dfa_exec()\fP returns a partial match, it is possible to call it
2737     again, with additional subject characters, and have it continue with the same
2738     match. The PCRE_DFA_RESTART option requests this action; when it is set, the
2739     \fIworkspace\fP and \fIwscount\fP arguments must reference the same vector as
2740     before because data about the match so far is left in them after a partial
2741     match. There is more discussion of this facility in the
2742 nigel 77 .\" HREF
2743     \fBpcrepartial\fP
2744     .\"
2745     documentation.
2746     .
2747 ph10 598 .
2748 nigel 77 .SS "Successful returns from \fBpcre_dfa_exec()\fP"
2749     .rs
2750     .sp
2751     When \fBpcre_dfa_exec()\fP succeeds, it may have matched more than one
2752     substring in the subject. Note, however, that all the matches from one run of
2753     the function start at the same point in the subject. The shorter matches are
2754     all initial substrings of the longer matches. For example, if the pattern
2755     .sp
2756     <.*>
2757     .sp
2758     is matched against the string
2759     .sp
2760     This is <something> <something else> <something further> no more
2761     .sp
2762     the three matched strings are
2763     .sp
2764     <something>
2765     <something> <something else>
2766     <something> <something else> <something further>
2767     .sp
2768     On success, the yield of the function is a number greater than zero, which is
2769     the number of matched substrings. The substrings themselves are returned in
2770     \fIovector\fP. Each string uses two elements; the first is the offset to the
2771 nigel 93 start, and the second is the offset to the end. In fact, all the strings have
2772     the same start offset. (Space could have been saved by giving this only once,
2773     but it was decided to retain some compatibility with the way \fBpcre_exec()\fP
2774     returns data, even though the meaning of the strings is different.)
2775 nigel 77 .P
2776     The strings are returned in reverse order of length; that is, the longest
2777     matching string is given first. If there were too many matches to fit into
2778     \fIovector\fP, the yield of the function is zero, and the vector is filled with
2779 ph10 691 the longest matches. Unlike \fBpcre_exec()\fP, \fBpcre_dfa_exec()\fP can use
2780 ph10 683 the entire \fIovector\fP for returning matched strings.
2781 nigel 77 .
2782 ph10 598 .
2783 nigel 77 .SS "Error returns from \fBpcre_dfa_exec()\fP"
2784     .rs
2785     .sp
2786     The \fBpcre_dfa_exec()\fP function returns a negative number when it fails.
2787     Many of the errors are the same as for \fBpcre_exec()\fP, and these are
2788     described
2789     .\" HTML <a href="#errorlist">
2790     .\" </a>
2791     above.
2792     .\"
2793     There are in addition the following errors that are specific to
2794     \fBpcre_dfa_exec()\fP:
2795     .sp
2796     PCRE_ERROR_DFA_UITEM (-16)
2797     .sp
2798     This return is given if \fBpcre_dfa_exec()\fP encounters an item in the pattern
2799     that it does not support, for instance, the use of \eC or a back reference.
2800     .sp
2801     PCRE_ERROR_DFA_UCOND (-17)
2802     .sp
2803 nigel 93 This return is given if \fBpcre_dfa_exec()\fP encounters a condition item that
2804     uses a back reference for the condition, or a test for recursion in a specific
2805     group. These are not supported.
2806 nigel 77 .sp
2807     PCRE_ERROR_DFA_UMLIMIT (-18)
2808     .sp
2809     This return is given if \fBpcre_dfa_exec()\fP is called with an \fIextra\fP
2810 ph10 678 block that contains a setting of the \fImatch_limit\fP or
2811     \fImatch_limit_recursion\fP fields. This is not supported (these fields are
2812     meaningless for DFA matching).
2813 nigel 77 .sp
2814     PCRE_ERROR_DFA_WSSIZE (-19)
2815     .sp
2816     This return is given if \fBpcre_dfa_exec()\fP runs out of space in the
2817     \fIworkspace\fP vector.
2818     .sp
2819     PCRE_ERROR_DFA_RECURSE (-20)
2820     .sp
2821     When a recursive subpattern is processed, the matching function calls itself
2822     recursively, using private vectors for \fIovector\fP and \fIworkspace\fP. This
2823     error is given if the output vector is not large enough. This should be
2824     extremely rare, as a vector of size 1000 is used.
2825 ph10 960 .sp
2826     PCRE_ERROR_DFA_BADRESTART (-30)
2827     .sp
2828     When \fBpcre_dfa_exec()\fP is called with the \fBPCRE_DFA_RESTART\fP option,
2829 ph10 975 some plausibility checks are made on the contents of the workspace, which
2830     should contain data about the previous partial match. If any of these checks
2831     fail, this error is given.
2832 nigel 93 .
2833     .
2834     .SH "SEE ALSO"
2835     .rs
2836     .sp
2837 chpe 1055 \fBpcre16\fP(3), \fBpcre32\fP(3), \fBpcrebuild\fP(3), \fBpcrecallout\fP(3),
2838     \fBpcrecpp\fP(3), \fBpcrematching\fP(3), \fBpcrepartial\fP(3),
2839     \fBpcreposix\fP(3), \fBpcreprecompile\fP(3), \fBpcresample\fP(3),
2840     \fBpcrestack\fP(3).
2841 ph10 99 .
2842     .
2843     .SH AUTHOR
2844     .rs
2845     .sp
2846     .nf
2847     Philip Hazel
2848     University Computing Service
2849     Cambridge CB2 3QH, England.
2850     .fi
2851     .
2852     .
2853     .SH REVISION
2854     .rs
2855     .sp
2856     .nf
2857 ph10 1339 Last updated: 12 June 2013
2858 ph10 1253 Copyright (c) 1997-2013 University of Cambridge.
2859 ph10 99 .fi

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12