/[pcre]/code/trunk/doc/pcre.txt
ViewVC logotype

Contents of /code/trunk/doc/pcre.txt

Parent Directory Parent Directory | Revision Log Revision Log


Revision 53 - (hide annotations) (download)
Sat Feb 24 21:39:42 2007 UTC (6 years, 3 months ago) by nigel
File MIME type: text/plain
File size: 99591 byte(s)
Load pcre-3.5 into code/trunk.

1 nigel 41 NAME
2     pcre - Perl-compatible regular expressions.
3    
4    
5    
6     SYNOPSIS
7     #include <pcre.h>
8    
9     pcre *pcre_compile(const char *pattern, int options,
10     const char **errptr, int *erroffset,
11     const unsigned char *tableptr);
12    
13     pcre_extra *pcre_study(const pcre *code, int options,
14     const char **errptr);
15    
16     int pcre_exec(const pcre *code, const pcre_extra *extra,
17     const char *subject, int length, int startoffset,
18     int options, int *ovector, int ovecsize);
19    
20     int pcre_copy_substring(const char *subject, int *ovector,
21     int stringcount, int stringnumber, char *buffer,
22     int buffersize);
23    
24     int pcre_get_substring(const char *subject, int *ovector,
25     int stringcount, int stringnumber,
26     const char **stringptr);
27    
28     int pcre_get_substring_list(const char *subject,
29     int *ovector, int stringcount, const char ***listptr);
30    
31 nigel 49 void pcre_free_substring(const char *stringptr);
32    
33     void pcre_free_substring_list(const char **stringptr);
34    
35 nigel 41 const unsigned char *pcre_maketables(void);
36    
37 nigel 43 int pcre_fullinfo(const pcre *code, const pcre_extra *extra,
38     int what, void *where);
39    
40 nigel 41 int pcre_info(const pcre *code, int *optptr, int *firstcharptr);
41    
42     char *pcre_version(void);
43    
44     void *(*pcre_malloc)(size_t);
45    
46     void (*pcre_free)(void *);
47    
48    
49    
50    
51     DESCRIPTION
52     The PCRE library is a set of functions that implement regu-
53     lar expression pattern matching using the same syntax and
54     semantics as Perl 5, with just a few differences (see
55 nigel 49
56 nigel 41 below). The current implementation corresponds to Perl
57 nigel 49 5.005, with some additional features from later versions.
58     This includes some experimental, incomplete support for
59     UTF-8 encoded strings. Details of exactly what is and what
60     is not supported are given below.
61 nigel 41
62     PCRE has its own native API, which is described in this
63     document. There is also a set of wrapper functions that
64 nigel 43 correspond to the POSIX regular expression API. These are
65     described in the pcreposix documentation.
66    
67 nigel 41 The native API function prototypes are defined in the header
68     file pcre.h, and on Unix systems the library itself is
69     called libpcre.a, so can be accessed by adding -lpcre to the
70 nigel 43 command for linking an application which calls it. The
71     header file defines the macros PCRE_MAJOR and PCRE_MINOR to
72     contain the major and minor release numbers for the library.
73     Applications can use these to include support for different
74     releases.
75 nigel 41
76     The functions pcre_compile(), pcre_study(), and pcre_exec()
77 nigel 53 are used for compiling and matching regular expressions. A
78     sample program that demonstrates the simplest way of using
79     them is given in the file pcredemo.c. The last section of
80     this man page describes how to run it.
81 nigel 49
82     The functions pcre_copy_substring(), pcre_get_substring(),
83     and pcre_get_substring_list() are convenience functions for
84 nigel 41 extracting captured substrings from a matched subject
85 nigel 49 string; pcre_free_substring() and pcre_free_substring_list()
86     are also provided, to free the memory used for extracted
87     strings.
88 nigel 41
89 nigel 49 The function pcre_maketables() is used (optionally) to build
90     a set of character tables in the current locale for passing
91     to pcre_compile().
92    
93 nigel 43 The function pcre_fullinfo() is used to find out information
94     about a compiled pattern; pcre_info() is an obsolete version
95     which returns only some of the available information, but is
96     retained for backwards compatibility. The function
97     pcre_version() returns a pointer to a string containing the
98     version of PCRE and its date of release.
99 nigel 41
100     The global variables pcre_malloc and pcre_free initially
101     contain the entry points of the standard malloc() and free()
102     functions respectively. PCRE calls the memory management
103     functions via these variables, so a calling program can
104     replace them if it wishes to intercept the calls. This
105     should be done before calling any PCRE functions.
106    
107    
108    
109     MULTI-THREADING
110 nigel 53 The PCRE functions can be used in multi-threading applica-
111     tions, with the proviso that the memory management functions
112     pointed to by pcre_malloc and pcre_free are shared by all
113     threads.
114 nigel 41
115     The compiled form of a regular expression is not altered
116     during matching, so the same compiled pattern can safely be
117     used by several threads at once.
118    
119    
120    
121     COMPILING A PATTERN
122     The function pcre_compile() is called to compile a pattern
123     into an internal form. The pattern is a C string terminated
124     by a binary zero, and is passed in the argument pattern. A
125     pointer to a single block of memory that is obtained via
126     pcre_malloc is returned. This contains the compiled code and
127 nigel 53 related data. The pcre type is defined for the returned
128     block; this is a typedef for a structure whose contents are
129     not externally defined. It is up to the caller to free the
130     memory when it is no longer required.
131 nigel 41
132 nigel 53 Although the compiled code of a PCRE regex is relocatable,
133     that is, it does not depend on memory location, the complete
134     pcre data block is not fully relocatable, because it con-
135     tains a copy of the tableptr argument, which is an address
136     (see below).
137    
138 nigel 41 The size of a compiled pattern is roughly proportional to
139     the length of the pattern string, except that each character
140     class (other than those containing just a single character,
141     negated or not) requires 33 bytes, and repeat quantifiers
142     with a minimum greater than one or a bounded maximum cause
143     the relevant portions of the compiled pattern to be repli-
144     cated.
145    
146     The options argument contains independent bits that affect
147     the compilation. It should be zero if no options are
148     required. Some of the options, in particular, those that are
149     compatible with Perl, can also be set and unset from within
150     the pattern (see the detailed description of regular expres-
151     sions below). For these options, the contents of the options
152     argument specifies their initial settings at the start of
153     compilation and execution. The PCRE_ANCHORED option can be
154     set at the time of matching as well as at compile time.
155    
156     If errptr is NULL, pcre_compile() returns NULL immediately.
157     Otherwise, if compilation of a pattern fails, pcre_compile()
158     returns NULL, and sets the variable pointed to by errptr to
159     point to a textual error message. The offset from the start
160     of the pattern to the character where the error was
161     discovered is placed in the variable pointed to by
162     erroffset, which must not be NULL. If it is, an immediate
163     error is given.
164    
165     If the final argument, tableptr, is NULL, PCRE uses a
166     default set of character tables which are built when it is
167     compiled, using the default C locale. Otherwise, tableptr
168     must be the result of a call to pcre_maketables(). See the
169     section on locale support below.
170    
171 nigel 53 This code fragment shows a typical straightforward call to
172     pcre_compile():
173    
174     pcre *re;
175     const char *error;
176     int erroffset;
177     re = pcre_compile(
178     "^A.*Z", /* the pattern */
179     0, /* default options */
180     &error, /* for error message */
181     &erroffset, /* for error offset */
182     NULL); /* use default character tables */
183    
184 nigel 41 The following option bits are defined in the header file:
185    
186     PCRE_ANCHORED
187    
188     If this bit is set, the pattern is forced to be "anchored",
189     that is, it is constrained to match only at the start of the
190     string which is being searched (the "subject string"). This
191     effect can also be achieved by appropriate constructs in the
192     pattern itself, which is the only way to do it in Perl.
193    
194     PCRE_CASELESS
195    
196     If this bit is set, letters in the pattern match both upper
197     and lower case letters. It is equivalent to Perl's /i
198     option.
199    
200     PCRE_DOLLAR_ENDONLY
201    
202     If this bit is set, a dollar metacharacter in the pattern
203     matches only at the end of the subject string. Without this
204     option, a dollar also matches immediately before the final
205     character if it is a newline (but not before any other new-
206     lines). The PCRE_DOLLAR_ENDONLY option is ignored if
207     PCRE_MULTILINE is set. There is no equivalent to this option
208     in Perl.
209    
210     PCRE_DOTALL
211    
212     If this bit is set, a dot metacharacter in the pattern
213     matches all characters, including newlines. Without it, new-
214     lines are excluded. This option is equivalent to Perl's /s
215     option. A negative class such as [^a] always matches a new-
216     line character, independent of the setting of this option.
217    
218     PCRE_EXTENDED
219    
220     If this bit is set, whitespace data characters in the pat-
221     tern are totally ignored except when escaped or inside a
222     character class, and characters between an unescaped # out-
223     side a character class and the next newline character,
224     inclusive, are also ignored. This is equivalent to Perl's /x
225     option, and makes it possible to include comments inside
226     complicated patterns. Note, however, that this applies only
227     to data characters. Whitespace characters may never appear
228     within special character sequences in a pattern, for example
229     within the sequence (?( which introduces a conditional sub-
230     pattern.
231    
232     PCRE_EXTRA
233    
234 nigel 43 This option was invented in order to turn on additional
235     functionality of PCRE that is incompatible with Perl, but it
236     is currently of very little use. When set, any backslash in
237     a pattern that is followed by a letter that has no special
238     meaning causes an error, thus reserving these combinations
239     for future expansion. By default, as in Perl, a backslash
240     followed by a letter with no special meaning is treated as a
241     literal. There are at present no other features controlled
242     by this option. It can also be set by a (?X) option setting
243     within a pattern.
244 nigel 41
245     PCRE_MULTILINE
246    
247     By default, PCRE treats the subject string as consisting of
248     a single "line" of characters (even if it actually contains
249     several newlines). The "start of line" metacharacter (^)
250     matches only at the start of the string, while the "end of
251     line" metacharacter ($) matches only at the end of the
252     string, or before a terminating newline (unless
253     PCRE_DOLLAR_ENDONLY is set). This is the same as Perl.
254    
255     When PCRE_MULTILINE is set, the "start of line" and "end
256 nigel 43 of line" constructs match immediately following or immedi-
257     ately before any newline in the subject string, respec-
258     tively, as well as at the very start and end. This is
259 nigel 41 equivalent to Perl's /m option. If there are no "\n" charac-
260     ters in a subject string, or no occurrences of ^ or $ in a
261     pattern, setting PCRE_MULTILINE has no effect.
262    
263     PCRE_UNGREEDY
264    
265     This option inverts the "greediness" of the quantifiers so
266     that they are not greedy by default, but become greedy if
267     followed by "?". It is not compatible with Perl. It can also
268     be set by a (?U) option setting within the pattern.
269    
270 nigel 49 PCRE_UTF8
271 nigel 41
272 nigel 49 This option causes PCRE to regard both the pattern and the
273     subject as strings of UTF-8 characters instead of just byte
274     strings. However, it is available only if PCRE has been
275     built to include UTF-8 support. If not, the use of this
276     option provokes an error. Support for UTF-8 is new, experi-
277     mental, and incomplete. Details of exactly what it entails
278     are given below.
279 nigel 41
280 nigel 49
281    
282 nigel 41 STUDYING A PATTERN
283     When a pattern is going to be used several times, it is
284     worth spending more time analyzing it in order to speed up
285     the time taken for matching. The function pcre_study() takes
286     a pointer to a compiled pattern as its first argument, and
287 nigel 53 returns a pointer to a pcre_extra block (another typedef for
288     a structure with hidden contents) containing additional
289     information about the pattern; this can be passed to
290     pcre_exec(). If no additional information is available, NULL
291     is returned.
292 nigel 41
293     The second argument contains option bits. At present, no
294     options are defined for pcre_study(), and this argument
295     should always be zero.
296    
297     The third argument for pcre_study() is a pointer to an error
298     message. If studying succeeds (even if no data is returned),
299     the variable it points to is set to NULL. Otherwise it
300     points to a textual error message.
301    
302 nigel 53 This is a typical call to pcre_study():
303    
304     pcre_extra *pe;
305     pe = pcre_study(
306     re, /* result of pcre_compile() */
307     0, /* no options exist */
308     &error); /* set to NULL or points to a message */
309    
310 nigel 41 At present, studying a pattern is useful only for non-
311     anchored patterns that do not have a single fixed starting
312     character. A bitmap of possible starting characters is
313     created.
314    
315    
316    
317     LOCALE SUPPORT
318     PCRE handles caseless matching, and determines whether char-
319     acters are letters, digits, or whatever, by reference to a
320     set of tables. The library contains a default set of tables
321     which is created in the default C locale when PCRE is com-
322     piled. This is used when the final argument of
323     pcre_compile() is NULL, and is sufficient for many applica-
324     tions.
325    
326     An alternative set of tables can, however, be supplied. Such
327     tables are built by calling the pcre_maketables() function,
328     which has no arguments, in the relevant locale. The result
329     can then be passed to pcre_compile() as often as necessary.
330     For example, to build and use tables that are appropriate
331     for the French locale (where accented characters with codes
332     greater than 128 are treated as letters), the following code
333     could be used:
334    
335     setlocale(LC_CTYPE, "fr");
336     tables = pcre_maketables();
337     re = pcre_compile(..., tables);
338    
339     The tables are built in memory that is obtained via
340     pcre_malloc. The pointer that is passed to pcre_compile is
341     saved with the compiled pattern, and the same tables are
342     used via this pointer by pcre_study() and pcre_exec(). Thus
343     for any single pattern, compilation, studying and matching
344     all happen in the same locale, but different patterns can be
345     compiled in different locales. It is the caller's responsi-
346     bility to ensure that the memory containing the tables
347     remains available for as long as it is needed.
348    
349    
350    
351     INFORMATION ABOUT A PATTERN
352 nigel 43 The pcre_fullinfo() function returns information about a
353     compiled pattern. It replaces the obsolete pcre_info() func-
354     tion, which is nevertheless retained for backwards compatibil-
355     ity (and is documented below).
356 nigel 41
357 nigel 43 The first argument for pcre_fullinfo() is a pointer to the
358     compiled pattern. The second argument is the result of
359     pcre_study(), or NULL if the pattern was not studied. The
360     third argument specifies which piece of information is
361     required, while the fourth argument is a pointer to a vari-
362     able to receive the data. The yield of the function is zero
363     for success, or one of the following negative numbers:
364    
365 nigel 41 PCRE_ERROR_NULL the argument code was NULL
366 nigel 43 the argument where was NULL
367 nigel 41 PCRE_ERROR_BADMAGIC the "magic number" was not found
368 nigel 43 PCRE_ERROR_BADOPTION the value of what was invalid
369 nigel 41
370 nigel 53 Here is a typical call of pcre_fullinfo(), to obtain the
371     length of the compiled pattern:
372    
373     int rc;
374     size_t length;
375     rc = pcre_fullinfo(
376     re, /* result of pcre_compile() */
377     pe, /* result of pcre_study(), or NULL */
378     PCRE_INFO_SIZE, /* what is required */
379     &length); /* where to put the data */
380    
381 nigel 43 The possible values for the third argument are defined in
382     pcre.h, and are as follows:
383    
384     PCRE_INFO_OPTIONS
385    
386     Return a copy of the options with which the pattern was com-
387 nigel 53 piled. The fourth argument should point to an unsigned long
388 nigel 43 int variable. These option bits are those specified in the
389 nigel 41 call to pcre_compile(), modified by any top-level option
390     settings within the pattern itself, and with the
391 nigel 43 PCRE_ANCHORED bit forcibly set if the form of the pattern
392     implies that it can match only at the start of a subject
393     string.
394 nigel 41
395 nigel 43 PCRE_INFO_SIZE
396    
397     Return the size of the compiled pattern, that is, the value
398     that was passed as the argument to pcre_malloc() when PCRE
399     was getting memory in which to place the compiled data. The
400     fourth argument should point to a size_t variable.
401    
402     PCRE_INFO_CAPTURECOUNT
403    
404     Return the number of capturing subpatterns in the pattern.
405     The fourth argument should point to an int variable.
406    
407     PCRE_INFO_BACKREFMAX
408    
409 nigel 53 Return the number of the highest back reference in the pat-
410     tern. The fourth argument should point to an int variable.
411     Zero is returned if there are no back references.
412 nigel 43
413     PCRE_INFO_FIRSTCHAR
414    
415     Return information about the first character of any matched
416     string, for a non-anchored pattern. If there is a fixed
417     first character, e.g. from a pattern such as
418 nigel 47 (cat|cow|coyote), it is returned in the integer pointed to
419     by where. Otherwise, if either
420 nigel 41
421     (a) the pattern was compiled with the PCRE_MULTILINE option,
422     and every branch starts with "^", or
423    
424     (b) every branch of the pattern starts with ".*" and
425     PCRE_DOTALL is not set (if it were set, the pattern would be
426     anchored),
427 nigel 43
428 nigel 47 -1 is returned, indicating that the pattern matches only at
429     the start of a subject string or after any "\n" within the
430     string. Otherwise -2 is returned. For anchored patterns, -2
431     is returned.
432 nigel 41
433 nigel 43 PCRE_INFO_FIRSTTABLE
434 nigel 41
435 nigel 43 If the pattern was studied, and this resulted in the con-
436     struction of a 256-bit table indicating a fixed set of char-
437     acters for the first character in any matching string, a
438     pointer to the table is returned. Otherwise NULL is
439     returned. The fourth argument should point to an unsigned
440     char * variable.
441 nigel 41
442 nigel 43 PCRE_INFO_LASTLITERAL
443    
444     For a non-anchored pattern, return the value of the right-
445     most literal character which must exist in any matched
446     string, other than at its start. The fourth argument should
447     point to an int variable. If there is no such character, or
448     if the pattern is anchored, -1 is returned. For example, for
449     the pattern /a\d+z\d+/ the returned value is 'z'.
450    
451     The pcre_info() function is now obsolete because its inter-
452     face is too restrictive to return all the available data
453     about a compiled pattern. New programs should use
454     pcre_fullinfo() instead. The yield of pcre_info() is the
455     number of capturing subpatterns, or one of the following
456     negative numbers:
457    
458     PCRE_ERROR_NULL the argument code was NULL
459     PCRE_ERROR_BADMAGIC the "magic number" was not found
460    
461     If the optptr argument is not NULL, a copy of the options
462     with which the pattern was compiled is placed in the integer
463     it points to (see PCRE_INFO_OPTIONS above).
464    
465     If the pattern is not anchored and the firstcharptr argument
466     is not NULL, it is used to pass back information about the
467     first character of any matched string (see
468     PCRE_INFO_FIRSTCHAR above).
469    
470    
471    
472 nigel 41 MATCHING A PATTERN
473     The function pcre_exec() is called to match a subject string
474 nigel 53
475    
476    
477    
478    
479     SunOS 5.8 Last change: 9
480    
481    
482    
483 nigel 41 against a pre-compiled pattern, which is passed in the code
484     argument. If the pattern has been studied, the result of the
485     study should be passed in the extra argument. Otherwise this
486     must be NULL.
487    
488 nigel 53 Here is an example of a simple call to pcre_exec():
489    
490     int rc;
491     int ovector[30];
492     rc = pcre_exec(
493     re, /* result of pcre_compile() */
494     NULL, /* we didn't study the pattern */
495     "some string", /* the subject string */
496     11, /* the length of the subject string */
497     0, /* start at offset 0 in the subject */
498     0, /* default options */
499     ovector, /* vector for substring information */
500     30); /* number of elements in the vector */
501    
502 nigel 41 The PCRE_ANCHORED option can be passed in the options argu-
503     ment, whose unused bits must be zero. However, if a pattern
504     was compiled with PCRE_ANCHORED, or turned out to be
505     anchored by virtue of its contents, it cannot be made
506     unanchored at matching time.
507    
508     There are also three further options that can be set only at
509     matching time:
510    
511     PCRE_NOTBOL
512    
513     The first character of the string is not the beginning of a
514     line, so the circumflex metacharacter should not match
515     before it. Setting this without PCRE_MULTILINE (at compile
516     time) causes circumflex never to match.
517    
518     PCRE_NOTEOL
519    
520     The end of the string is not the end of a line, so the dol-
521     lar metacharacter should not match it nor (except in multi-
522     line mode) a newline immediately before it. Setting this
523     without PCRE_MULTILINE (at compile time) causes dollar never
524     to match.
525    
526     PCRE_NOTEMPTY
527    
528     An empty string is not considered to be a valid match if
529     this option is set. If there are alternatives in the pat-
530     tern, they are tried. If all the alternatives match the
531     empty string, the entire match fails. For example, if the
532     pattern
533    
534     a?b?
535    
536     is applied to a string not beginning with "a" or "b", it
537     matches the empty string at the start of the subject. With
538     PCRE_NOTEMPTY set, this match is not valid, so PCRE searches
539     further into the string for occurrences of "a" or "b".
540    
541     Perl has no direct equivalent of PCRE_NOTEMPTY, but it does
542     make a special case of a pattern match of the empty string
543     within its split() function, and when using the /g modifier.
544     It is possible to emulate Perl's behaviour after matching a
545     null string by first trying the match again at the same
546     offset with PCRE_NOTEMPTY set, and then if that fails by
547     advancing the starting offset (see below) and trying an
548     ordinary match again.
549    
550     The subject string is passed as a pointer in subject, a
551     length in length, and a starting offset in startoffset.
552 nigel 53 Unlike the pattern string, the subject may contain binary
553     zero characters. When the starting offset is zero, the
554     search for a match starts at the beginning of the subject,
555     and this is by far the most common case.
556 nigel 41
557     A non-zero starting offset is useful when searching for
558     another match in the same subject by calling pcre_exec()
559     again after a previous success. Setting startoffset differs
560     from just passing over a shortened string and setting
561     PCRE_NOTBOL in the case of a pattern that begins with any
562     kind of lookbehind. For example, consider the pattern
563    
564     \Biss\B
565    
566     which finds occurrences of "iss" in the middle of words. (\B
567     matches only if the current position in the subject is not a
568     word boundary.) When applied to the string "Mississippi" the
569     first call to pcre_exec() finds the first occurrence. If
570     pcre_exec() is called again with just the remainder of the
571     subject, namely "issippi", it does not match, because \B is
572     always false at the start of the subject, which is deemed to
573     be a word boundary. However, if pcre_exec() is passed the
574     entire string again, but with startoffset set to 4, it finds
575     the second occurrence of "iss" because it is able to look
576     behind the starting point to discover that it is preceded by
577     a letter.
578    
579     If a non-zero starting offset is passed when the pattern is
580     anchored, one attempt to match at the given offset is tried.
581     This can only succeed if the pattern does not require the
582     match to be at the start of the subject.
583    
584     In general, a pattern matches a certain portion of the sub-
585     ject, and in addition, further substrings from the subject
586     may be picked out by parts of the pattern. Following the
587     usage in Jeffrey Friedl's book, this is called "capturing"
588     in what follows, and the phrase "capturing subpattern" is
589     used for a fragment of a pattern that picks out a substring.
590     PCRE supports several other kinds of parenthesized subpat-
591     tern that do not cause substrings to be captured.
592    
593     Captured substrings are returned to the caller via a vector
594     of integer offsets whose address is passed in ovector. The
595     number of elements in the vector is passed in ovecsize. The
596     first two-thirds of the vector is used to pass back captured
597     substrings, each substring using a pair of integers. The
598     remaining third of the vector is used as workspace by
599     pcre_exec() while matching capturing subpatterns, and is not
600     available for passing back information. The length passed in
601     ovecsize should always be a multiple of three. If it is not,
602     it is rounded down.
603    
604     When a match has been successful, information about captured
605     substrings is returned in pairs of integers, starting at the
606     beginning of ovector, and continuing up to two-thirds of its
607     length at the most. The first element of a pair is set to
608     the offset of the first character in a substring, and the
609     second is set to the offset of the first character after the
610     end of a substring. The first pair, ovector[0] and ovec-
611     tor[1], identify the portion of the subject string matched
612     by the entire pattern. The next pair is used for the first
613     capturing subpattern, and so on. The value returned by
614     pcre_exec() is the number of pairs that have been set. If
615     there are no capturing subpatterns, the return value from a
616     successful match is 1, indicating that just the first pair
617     of offsets has been set.
618    
619     Some convenience functions are provided for extracting the
620     captured substrings as separate strings. These are described
621     in the following section.
622    
623     It is possible for a capturing subpattern number n+1 to
624     match some part of the subject when subpattern n has not
625     been used at all. For example, if the string "abc" is
626     matched against the pattern (a|(z))(bc) subpatterns 1 and 3
627     are matched, but 2 is not. When this happens, both offset
628     values corresponding to the unused subpattern are set to -1.
629    
630     If a capturing subpattern is matched repeatedly, it is the
631     last portion of the string that it matched that gets
632     returned.
633    
634     If the vector is too small to hold all the captured sub-
635     strings, it is used as far as possible (up to two-thirds of
636     its length), and the function returns a value of zero. In
637     particular, if the substring offsets are not of interest,
638     pcre_exec() may be called with ovector passed as NULL and
639     ovecsize as zero. However, if the pattern contains back
640     references and the ovector isn't big enough to remember the
641     related substrings, PCRE has to get additional memory for
642     use during matching. Thus it is usually advisable to supply
643     an ovector.
644    
645     Note that pcre_info() can be used to find out how many cap-
646     turing subpatterns there are in a compiled pattern. The
647     smallest size for ovector that will allow for n captured
648     substrings in addition to the offsets of the substring
649     matched by the whole pattern is (n+1)*3.
650    
651     If pcre_exec() fails, it returns a negative number. The fol-
652     lowing are defined in the header file:
653    
654     PCRE_ERROR_NOMATCH (-1)
655    
656     The subject string did not match the pattern.
657    
658     PCRE_ERROR_NULL (-2)
659    
660     Either code or subject was passed as NULL, or ovector was
661     NULL and ovecsize was not zero.
662    
663     PCRE_ERROR_BADOPTION (-3)
664    
665     An unrecognized bit was set in the options argument.
666    
667     PCRE_ERROR_BADMAGIC (-4)
668    
669     PCRE stores a 4-byte "magic number" at the start of the com-
670     piled code, to catch the case when it is passed a junk
671     pointer. This is the error it gives when the magic number
672     isn't present.
673    
674     PCRE_ERROR_UNKNOWN_NODE (-5)
675    
676     While running the pattern match, an unknown item was encoun-
677     tered in the compiled pattern. This error could be caused by
678     a bug in PCRE or by overwriting of the compiled pattern.
679    
680     PCRE_ERROR_NOMEMORY (-6)
681    
682     If a pattern contains back references, but the ovector that
683     is passed to pcre_exec() is not big enough to remember the
684     referenced substrings, PCRE gets a block of memory at the
685     start of matching to use for this purpose. If the call via
686     pcre_malloc() fails, this error is given. The memory is
687     freed at the end of matching.
688    
689    
690    
691 nigel 53
692 nigel 41 EXTRACTING CAPTURED SUBSTRINGS
693     Captured substrings can be accessed directly by using the
694     offsets returned by pcre_exec() in ovector. For convenience,
695     the functions pcre_copy_substring(), pcre_get_substring(),
696     and pcre_get_substring_list() are provided for extracting
697     captured substrings as new, separate, zero-terminated
698     strings. A substring that contains a binary zero is
699     correctly extracted and has a further zero added on the end,
700     but the result does not, of course, function as a C string.
701    
702     The first three arguments are the same for all three func-
703     tions: subject is the subject string which has just been
704     successfully matched, ovector is a pointer to the vector of
705     integer offsets that was passed to pcre_exec(), and
706     stringcount is the number of substrings that were captured
707     by the match, including the substring that matched the
708     entire regular expression. This is the value returned by
709     pcre_exec() if it is greater than zero. If pcre_exec()
710     returned zero, indicating that it ran out of space in ovec-
711 nigel 47 tor, the value passed as stringcount should be the size of
712     the vector divided by three.
713 nigel 41
714     The functions pcre_copy_substring() and pcre_get_substring()
715     extract a single substring, whose number is given as string-
716     number. A value of zero extracts the substring that matched
717     the entire pattern, while higher values extract the captured
718     substrings. For pcre_copy_substring(), the string is placed
719     in buffer, whose length is given by buffersize, while for
720 nigel 49 pcre_get_substring() a new block of memory is obtained via
721 nigel 41 pcre_malloc, and its address is returned via stringptr. The
722     yield of the function is the length of the string, not
723     including the terminating zero, or one of
724    
725     PCRE_ERROR_NOMEMORY (-6)
726    
727     The buffer was too small for pcre_copy_substring(), or the
728     attempt to get memory failed for pcre_get_substring().
729    
730     PCRE_ERROR_NOSUBSTRING (-7)
731    
732     There is no substring whose number is stringnumber.
733    
734     The pcre_get_substring_list() function extracts all avail-
735     able substrings and builds a list of pointers to them. All
736     this is done in a single block of memory which is obtained
737     via pcre_malloc. The address of the memory block is returned
738     via listptr, which is also the start of the list of string
739     pointers. The end of the list is marked by a NULL pointer.
740     The yield of the function is zero if all went well, or
741    
742     PCRE_ERROR_NOMEMORY (-6)
743    
744     if the attempt to get the memory block failed.
745    
746     When any of these functions encounter a substring that is
747     unset, which can happen when capturing subpattern number n+1
748     matches some part of the subject, but subpattern n has not
749     been used at all, they return an empty string. This can be
750     distinguished from a genuine zero-length substring by
751     inspecting the appropriate offset in ovector, which is nega-
752     tive for unset substrings.
753    
754 nigel 49 The two convenience functions pcre_free_substring() and
755     pcre_free_substring_list() can be used to free the memory
756     returned by a previous call of pcre_get_substring() or
757     pcre_get_substring_list(), respectively. They do nothing
758     more than call the function pointed to by pcre_free, which
759     of course could be called directly from a C program. How-
760     ever, PCRE is used in some situations where it is linked via
761     a special interface to another programming language which
762     cannot use pcre_free directly; it is for these cases that
763     the functions are provided.
764 nigel 41
765    
766    
767     LIMITATIONS
768     There are some size limitations in PCRE but it is hoped that
769     they will never in practice be relevant. The maximum length
770     of a compiled pattern is 65539 (sic) bytes. All values in
771 nigel 53 repeating quantifiers must be less than 65536. The max-
772     imum number of capturing subpatterns is 65535. There is no
773     limit to the number of non-capturing subpatterns, but the
774     maximum depth of nesting of all kinds of parenthesized sub-
775     pattern, including capturing subpatterns, assertions, and
776     other types of subpattern, is 200.
777 nigel 41
778     The maximum length of a subject string is the largest posi-
779     tive number that an integer variable can hold. However, PCRE
780     uses recursion to handle subpatterns and indefinite repeti-
781     tion. This means that the available stack space may limit
782     the size of a subject string that can be processed by cer-
783     tain patterns.
784    
785    
786    
787     DIFFERENCES FROM PERL
788     The differences described here are with respect to Perl
789     5.005.
790    
791     1. By default, a whitespace character is any character that
792     the C library function isspace() recognizes, though it is
793     possible to compile PCRE with alternative character type
794     tables. Normally isspace() matches space, formfeed, newline,
795     carriage return, horizontal tab, and vertical tab. Perl 5 no
796     longer includes vertical tab in its set of whitespace char-
797     acters. The \v escape that was in the Perl documentation for
798     a long time was never in fact recognized. However, the char-
799     acter itself was treated as whitespace at least up to 5.002.
800     In 5.004 and 5.005 it does not match \s.
801    
802     2. PCRE does not allow repeat quantifiers on lookahead
803     assertions. Perl permits them, but they do not mean what you
804     might think. For example, (?!a){3} does not assert that the
805     next three characters are not "a". It just asserts that the
806     next character is not "a" three times.
807    
808     3. Capturing subpatterns that occur inside negative looka-
809     head assertions are counted, but their entries in the
810     offsets vector are never set. Perl sets its numerical vari-
811     ables from any such patterns that are matched before the
812     assertion fails to match something (thereby succeeding), but
813     only if the negative lookahead assertion contains just one
814     branch.
815    
816     4. Though binary zero characters are supported in the sub-
817     ject string, they are not allowed in a pattern string
818     because it is passed as a normal C string, terminated by
819     zero. The escape sequence "\0" can be used in the pattern to
820     represent a binary zero.
821    
822     5. The following Perl escape sequences are not supported:
823     \l, \u, \L, \U, \E, \Q. In fact these are implemented by
824     Perl's general string-handling and are not part of its pat-
825     tern matching engine.
826    
827     6. The Perl \G assertion is not supported as it is not
828     relevant to single pattern matches.
829    
830 nigel 43 7. Fairly obviously, PCRE does not support the (?{code}) and
831     (?p{code}) constructions. However, there is some experimen-
832     tal support for recursive patterns using the non-Perl item
833     (?R).
834 nigel 49
835 nigel 41 8. There are at the time of writing some oddities in Perl
836     5.005_02 concerned with the settings of captured strings
837     when part of a pattern is repeated. For example, matching
838     "aba" against the pattern /^(a(b)?)+$/ sets $2 to the value
839     "b", but matching "aabbaa" against /^(aa(bb)?)+$/ leaves $2
840     unset. However, if the pattern is changed to
841 nigel 47 /^(aa(b(b))?)+$/ then $2 (and $3) are set.
842 nigel 41
843     In Perl 5.004 $2 is set in both cases, and that is also true
844     of PCRE. If in the future Perl changes to a consistent state
845     that is different, PCRE may change to follow.
846    
847     9. Another as yet unresolved discrepancy is that in Perl
848     5.005_02 the pattern /^(a)?(?(1)a|b)+$/ matches the string
849     "a", whereas in PCRE it does not. However, in both Perl and
850     PCRE /^(a)?a/ matched against "a" leaves $1 unset.
851    
852     10. PCRE provides some extensions to the Perl regular
853     expression facilities:
854    
855     (a) Although lookbehind assertions must match fixed length
856     strings, each alternative branch of a lookbehind assertion
857     can match a different length of string. Perl 5.005 requires
858     them all to have the same length.
859    
860     (b) If PCRE_DOLLAR_ENDONLY is set and PCRE_MULTILINE is not
861     set, the $ meta- character matches only at the very end of
862     the string.
863    
864     (c) If PCRE_EXTRA is set, a backslash followed by a letter
865     with no special meaning is faulted.
866    
867 nigel 43 (d) If PCRE_UNGREEDY is set, the greediness of the repeti-
868     tion quantifiers is inverted, that is, by default they are
869     not greedy, but if followed by a question mark they are.
870 nigel 41
871     (e) PCRE_ANCHORED can be used to force a pattern to be tried
872     only at the start of the subject.
873    
874     (f) The PCRE_NOTBOL, PCRE_NOTEOL, and PCRE_NOTEMPTY options
875     for pcre_exec() have no Perl equivalents.
876    
877 nigel 43 (g) The (?R) construct allows for recursive pattern matching
878     (Perl 5.6 can do this using the (?p{code}) construct, which
879     PCRE cannot of course support.)
880 nigel 41
881    
882 nigel 43
883 nigel 41 REGULAR EXPRESSION DETAILS
884     The syntax and semantics of the regular expressions sup-
885     ported by PCRE are described below. Regular expressions are
886     also described in the Perl documentation and in a number of
887     other books, some of which have copious examples. Jeffrey
888     Friedl's "Mastering Regular Expressions", published by
889 nigel 49 O'Reilly (ISBN 1-56592-257-3), covers them in great detail.
890    
891 nigel 41 The description here is intended as reference documentation.
892 nigel 49 The basic operation of PCRE is on strings of bytes. However,
893     there are the beginnings of some support for UTF-8 character
894     strings. To use this support you must configure PCRE to
895     include it, and then call pcre_compile() with the PCRE_UTF8
896     option. How this affects the pattern matching is described
897     in the final section of this document.
898 nigel 41
899     A regular expression is a pattern that is matched against a
900     subject string from left to right. Most characters stand for
901     themselves in a pattern, and match the corresponding charac-
902     ters in the subject. As a trivial example, the pattern
903    
904     The quick brown fox
905    
906     matches a portion of a subject string that is identical to
907     itself. The power of regular expressions comes from the
908     ability to include alternatives and repetitions in the pat-
909     tern. These are encoded in the pattern by the use of meta-
910     characters, which do not stand for themselves but instead
911     are interpreted in some special way.
912    
913     There are two different sets of meta-characters: those that
914     are recognized anywhere in the pattern except within square
915     brackets, and those that are recognized in square brackets.
916     Outside square brackets, the meta-characters are as follows:
917    
918     \ general escape character with several uses
919     ^ assert start of subject (or line, in multiline
920     mode)
921     $ assert end of subject (or line, in multiline mode)
922     . match any character except newline (by default)
923     [ start character class definition
924     | start of alternative branch
925     ( start subpattern
926     ) end subpattern
927     ? extends the meaning of (
928     also 0 or 1 quantifier
929     also quantifier minimizer
930     * 0 or more quantifier
931     + 1 or more quantifier
932     { start min/max quantifier
933    
934     Part of a pattern that is in square brackets is called a
935     "character class". In a character class the only meta-
936     characters are:
937    
938     \ general escape character
939     ^ negate the class, but only if the first character
940     - indicates character range
941     ] terminates the character class
942    
943     The following sections describe the use of each of the
944     meta-characters.
945    
946    
947    
948     BACKSLASH
949     The backslash character has several uses. Firstly, if it is
950     followed by a non-alphameric character, it takes away any
951     special meaning that character may have. This use of
952 nigel 53
953 nigel 41 backslash as an escape character applies both inside and
954     outside character classes.
955    
956     For example, if you want to match a "*" character, you write
957     "\*" in the pattern. This applies whether or not the follow-
958     ing character would otherwise be interpreted as a meta-
959     character, so it is always safe to precede a non-alphameric
960     with "\" to specify that it stands for itself. In particu-
961     lar, if you want to match a backslash, you write "\\".
962    
963     If a pattern is compiled with the PCRE_EXTENDED option, white-
964     space in the pattern (other than in a character class) and
965     characters between a "#" outside a character class and the
966     next newline character are ignored. An escaping backslash
967     can be used to include a whitespace or "#" character as part
968     of the pattern.
969    
970     A second use of backslash provides a way of encoding non-
971     printing characters in patterns in a visible manner. There
972     is no restriction on the appearance of non-printing charac-
973     ters, apart from the binary zero that terminates a pattern,
974     but when a pattern is being prepared by text editing, it is
975     usually easier to use one of the following escape sequences
976     than the binary character it represents:
977    
978     \a alarm, that is, the BEL character (hex 07)
979     \cx "control-x", where x is any character
980     \e escape (hex 1B)
981     \f formfeed (hex 0C)
982     \n newline (hex 0A)
983     \r carriage return (hex 0D)
984 nigel 43 \t tab (hex 09)
985 nigel 41 \xhh character with hex code hh
986     \ddd character with octal code ddd, or backreference
987    
988     The precise effect of "\cx" is as follows: if "x" is a lower
989     case letter, it is converted to upper case. Then bit 6 of
990     the character (hex 40) is inverted. Thus "\cz" becomes hex
991     1A, but "\c{" becomes hex 3B, while "\c;" becomes hex 7B.
992    
993     After "\x", up to two hexadecimal digits are read (letters
994     can be in upper or lower case).
995    
996     After "\0" up to two further octal digits are read. In both
997     cases, if there are fewer than two digits, just those that
998     are present are used. Thus the sequence "\0\x\07" specifies
999     two binary zeros followed by a BEL character. Make sure you
1000     supply two digits after the initial zero if the character
1001     that follows is itself an octal digit.
1002    
1003     The handling of a backslash followed by a digit other than 0
1004     is complicated. Outside a character class, PCRE reads it
1005     and any following digits as a decimal number. If the number
1006     is less than 10, or if there have been at least that many
1007     previous capturing left parentheses in the expression, the
1008     entire sequence is taken as a back reference. A description
1009     of how this works is given later, following the discussion
1010     of parenthesized subpatterns.
1011    
1012     Inside a character class, or if the decimal number is
1013     greater than 9 and there have not been that many capturing
1014     subpatterns, PCRE re-reads up to three octal digits follow-
1015     ing the backslash, and generates a single byte from the
1016     least significant 8 bits of the value. Any subsequent digits
1017     stand for themselves. For example:
1018    
1019     \040 is another way of writing a space
1020     \40 is the same, provided there are fewer than 40
1021     previous capturing subpatterns
1022     \7 is always a back reference
1023     \11 might be a back reference, or another way of
1024     writing a tab
1025     \011 is always a tab
1026     \0113 is a tab followed by the character "3"
1027     \113 is the character with octal code 113 (since there
1028     can be no more than 99 back references)
1029     \377 is a byte consisting entirely of 1 bits
1030     \81 is either a back reference, or a binary zero
1031     followed by the two characters "8" and "1"
1032    
1033     Note that octal values of 100 or greater must not be intro-
1034     duced by a leading zero, because no more than three octal
1035     digits are ever read.
1036 nigel 43
1037 nigel 41 All the sequences that define a single byte value can be
1038     used both inside and outside character classes. In addition,
1039     inside a character class, the sequence "\b" is interpreted
1040     as the backspace character (hex 08). Outside a character
1041     class it has a different meaning (see below).
1042    
1043     The third use of backslash is for specifying generic charac-
1044     ter types:
1045    
1046     \d any decimal digit
1047     \D any character that is not a decimal digit
1048     \s any whitespace character
1049     \S any character that is not a whitespace character
1050     \w any "word" character
1051     \W any "non-word" character
1052    
1053     Each pair of escape sequences partitions the complete set of
1054     characters into two disjoint sets. Any given character
1055     matches one, and only one, of each pair.
1056    
1057     A "word" character is any letter or digit or the underscore
1058     character, that is, any character which can be part of a
1059     Perl "word". The definition of letters and digits is con-
1060     trolled by PCRE's character tables, and may vary if locale-
1061     specific matching is taking place (see "Locale support"
1062     above). For example, in the "fr" (French) locale, some char-
1063     acter codes greater than 128 are used for accented letters,
1064     and these are matched by \w.
1065    
1066     These character type sequences can appear both inside and
1067     outside character classes. They each match one character of
1068     the appropriate type. If the current matching point is at
1069     the end of the subject string, all of them fail, since there
1070     is no character to match.
1071    
1072     The fourth use of backslash is for certain simple asser-
1073     tions. An assertion specifies a condition that has to be met
1074     at a particular point in a match, without consuming any
1075     characters from the subject string. The use of subpatterns
1076     for more complicated assertions is described below. The
1077     backslashed assertions are
1078    
1079     \b word boundary
1080     \B not a word boundary
1081     \A start of subject (independent of multiline mode)
1082     \Z end of subject or newline at end (independent of
1083     multiline mode)
1084     \z end of subject (independent of multiline mode)
1085    
1086     These assertions may not appear in character classes (but
1087     note that "\b" has a different meaning, namely the backspace
1088     character, inside a character class).
1089 nigel 43
1090 nigel 41 A word boundary is a position in the subject string where
1091     the current character and the previous character do not both
1092     match \w or \W (i.e. one matches \w and the other matches
1093     \W), or the start or end of the string if the first or last
1094     character matches \w, respectively.
1095    
1096     The \A, \Z, and \z assertions differ from the traditional
1097     circumflex and dollar (described below) in that they only
1098     ever match at the very start and end of the subject string,
1099     whatever options are set. They are not affected by the
1100     PCRE_NOTBOL or PCRE_NOTEOL options. If the startoffset argu-
1101     ment of pcre_exec() is non-zero, \A can never match. The
1102     difference between \Z and \z is that \Z matches before a
1103     newline that is the last character of the string as well as
1104     at the end of the string, whereas \z matches only at the
1105     end.
1106    
1107    
1108    
1109     CIRCUMFLEX AND DOLLAR
1110     Outside a character class, in the default matching mode, the
1111     circumflex character is an assertion which is true only if
1112     the current matching point is at the start of the subject
1113     string. If the startoffset argument of pcre_exec() is non-
1114     zero, circumflex can never match. Inside a character class,
1115     circumflex has an entirely different meaning (see below).
1116    
1117     Circumflex need not be the first character of the pattern if
1118     a number of alternatives are involved, but it should be the
1119     first thing in each alternative in which it appears if the
1120     pattern is ever to match that branch. If all possible alter-
1121     natives start with a circumflex, that is, if the pattern is
1122     constrained to match only at the start of the subject, it is
1123     said to be an "anchored" pattern. (There are also other con-
1124     structs that can cause a pattern to be anchored.)
1125    
1126     A dollar character is an assertion which is true only if the
1127     current matching point is at the end of the subject string,
1128     or immediately before a newline character that is the last
1129     character in the string (by default). Dollar need not be the
1130     last character of the pattern if a number of alternatives
1131     are involved, but it should be the last item in any branch
1132     in which it appears. Dollar has no special meaning in a
1133     character class.
1134    
1135     The meaning of dollar can be changed so that it matches only
1136     at the very end of the string, by setting the
1137     PCRE_DOLLAR_ENDONLY option at compile or matching time. This
1138     does not affect the \Z assertion.
1139    
1140     The meanings of the circumflex and dollar characters are
1141     changed if the PCRE_MULTILINE option is set. When this is
1142     the case, they match immediately after and immediately
1143     before an internal "\n" character, respectively, in addition
1144     to matching at the start and end of the subject string. For
1145     example, the pattern /^abc$/ matches the subject string
1146     "def\nabc" in multiline mode, but not otherwise. Conse-
1147     quently, patterns that are anchored in single line mode
1148     because all branches start with "^" are not anchored in mul-
1149     tiline mode, and a match for circumflex is possible when the
1150     startoffset argument of pcre_exec() is non-zero. The
1151     PCRE_DOLLAR_ENDONLY option is ignored if PCRE_MULTILINE is
1152     set.
1153    
1154     Note that the sequences \A, \Z, and \z can be used to match
1155     the start and end of the subject in both modes, and if all
1156 nigel 53 branches of a pattern start with \A it is always anchored,
1157 nigel 41 whether PCRE_MULTILINE is set or not.
1158    
1159    
1160    
1161     FULL STOP (PERIOD, DOT)
1162     Outside a character class, a dot in the pattern matches any
1163     one character in the subject, including a non-printing char-
1164     acter, but not (by default) newline. If the PCRE_DOTALL
1165 nigel 47 option is set, dots match newlines as well. The handling of
1166     dot is entirely independent of the handling of circumflex
1167     and dollar, the only relationship being that they both
1168     involve newline characters. Dot has no special meaning in a
1169     character class.
1170 nigel 41
1171    
1172    
1173     SQUARE BRACKETS
1174     An opening square bracket introduces a character class, ter-
1175     minated by a closing square bracket. A closing square
1176     bracket on its own is not special. If a closing square
1177     bracket is required as a member of the class, it should be
1178     the first data character in the class (after an initial cir-
1179     cumflex, if present) or escaped with a backslash.
1180    
1181     A character class matches a single character in the subject;
1182     the character must be in the set of characters defined by
1183     the class, unless the first character in the class is a cir-
1184     cumflex, in which case the subject character must not be in
1185     the set defined by the class. If a circumflex is actually
1186     required as a member of the class, ensure it is not the
1187     first character, or escape it with a backslash.
1188    
1189     For example, the character class [aeiou] matches any lower
1190     case vowel, while [^aeiou] matches any character that is not
1191     a lower case vowel. Note that a circumflex is just a con-
1192     venient notation for specifying the characters which are in
1193     the class by enumerating those that are not. It is not an
1194     assertion: it still consumes a character from the subject
1195     string, and fails if the current pointer is at the end of
1196     the string.
1197    
1198     When caseless matching is set, any letters in a class
1199     represent both their upper case and lower case versions, so
1200     for example, a caseless [aeiou] matches "A" as well as "a",
1201     and a caseless [^aeiou] does not match "A", whereas a case-
1202     ful version would.
1203    
1204     The newline character is never treated in any special way in
1205     character classes, whatever the setting of the PCRE_DOTALL
1206     or PCRE_MULTILINE options is. A class such as [^a] will
1207     always match a newline.
1208    
1209     The minus (hyphen) character can be used to specify a range
1210     of characters in a character class. For example, [d-m]
1211     matches any letter between d and m, inclusive. If a minus
1212     character is required in a class, it must be escaped with a
1213     backslash or appear in a position where it cannot be inter-
1214     preted as indicating a range, typically as the first or last
1215     character in the class.
1216    
1217     It is not possible to have the literal character "]" as the
1218     end character of a range. A pattern such as [W-]46] is
1219     interpreted as a class of two characters ("W" and "-") fol-
1220     lowed by a literal string "46]", so it would match "W46]" or
1221     "-46]". However, if the "]" is escaped with a backslash it
1222     is interpreted as the end of range, so [W-\]46] is inter-
1223     preted as a single class containing a range followed by two
1224     separate characters. The octal or hexadecimal representation
1225     of "]" can also be used to end a range.
1226    
1227     Ranges operate in ASCII collating sequence. They can also be
1228     used for characters specified numerically, for example
1229     [\000-\037]. If a range that includes letters is used when
1230     caseless matching is set, it matches the letters in either
1231     case. For example, [W-c] is equivalent to [][\^_`wxyzabc],
1232     matched caselessly, and if character tables for the "fr"
1233     locale are in use, [\xc8-\xcb] matches accented E characters
1234     in both cases.
1235    
1236     The character types \d, \D, \s, \S, \w, and \W may also
1237     appear in a character class, and add the characters that
1238     they match to the class. For example, [\dABCDEF] matches any
1239     hexadecimal digit. A circumflex can conveniently be used
1240     with the upper case character types to specify a more res-
1241     tricted set of characters than the matching lower case type.
1242     For example, the class [^\W_] matches any letter or digit,
1243     but not underscore.
1244    
1245     All non-alphameric characters other than \, -, ^ (at the
1246     start) and the terminating ] are non-special in character
1247     classes, but it does no harm if they are escaped.
1248    
1249    
1250    
1251 nigel 43 POSIX CHARACTER CLASSES
1252     Perl 5.6 (not yet released at the time of writing) is going
1253     to support the POSIX notation for character classes, which
1254     uses names enclosed by [: and :] within the enclosing
1255     square brackets. PCRE supports this notation. For example,
1256    
1257     [01[:alpha:]%]
1258    
1259     matches "0", "1", any alphabetic character, or "%". The sup-
1260     ported class names are
1261    
1262     alnum letters and digits
1263     alpha letters
1264     ascii character codes 0 - 127
1265     cntrl control characters
1266     digit decimal digits (same as \d)
1267     graph printing characters, excluding space
1268     lower lower case letters
1269     print printing characters, including space
1270     punct printing characters, excluding letters and digits
1271     space white space (same as \s)
1272     upper upper case letters
1273     word "word" characters (same as \w)
1274     xdigit hexadecimal digits
1275    
1276     The names "ascii" and "word" are Perl extensions. Another
1277     Perl extension is negation, which is indicated by a ^ char-
1278     acter after the colon. For example,
1279    
1280     [12[:^digit:]]
1281    
1282     matches "1", "2", or any non-digit. PCRE (and Perl) also
1283 nigel 53 recognize the POSIX syntax [.ch.] and [=ch=] where "ch" is a
1284 nigel 43 "collating element", but these are not supported, and an
1285     error is given if they are encountered.
1286    
1287    
1288    
1289 nigel 41 VERTICAL BAR
1290     Vertical bar characters are used to separate alternative
1291     patterns. For example, the pattern
1292    
1293     gilbert|sullivan
1294    
1295     matches either "gilbert" or "sullivan". Any number of alter-
1296     natives may appear, and an empty alternative is permitted
1297     (matching the empty string). The matching process tries
1298     each alternative in turn, from left to right, and the first
1299     one that succeeds is used. If the alternatives are within a
1300     subpattern (defined below), "succeeds" means matching the
1301     rest of the main pattern as well as the alternative in the
1302     subpattern.
1303    
1304    
1305    
1306     INTERNAL OPTION SETTING
1307     The settings of PCRE_CASELESS, PCRE_MULTILINE, PCRE_DOTALL,
1308     and PCRE_EXTENDED can be changed from within the pattern by
1309     a sequence of Perl option letters enclosed between "(?" and
1310     ")". The option letters are
1311    
1312     i for PCRE_CASELESS
1313     m for PCRE_MULTILINE
1314     s for PCRE_DOTALL
1315     x for PCRE_EXTENDED
1316    
1317     For example, (?im) sets caseless, multiline matching. It is
1318     also possible to unset these options by preceding the letter
1319     with a hyphen, and a combined setting and unsetting such as
1320     (?im-sx), which sets PCRE_CASELESS and PCRE_MULTILINE while
1321     unsetting PCRE_DOTALL and PCRE_EXTENDED, is also permitted.
1322     If a letter appears both before and after the hyphen, the
1323     option is unset.
1324    
1325     The scope of these option changes depends on where in the
1326     pattern the setting occurs. For settings that are outside
1327     any subpattern (defined below), the effect is the same as if
1328     the options were set or unset at the start of matching. The
1329     following patterns all behave in exactly the same way:
1330    
1331     (?i)abc
1332     a(?i)bc
1333     ab(?i)c
1334     abc(?i)
1335    
1336     which in turn is the same as compiling the pattern abc with
1337     PCRE_CASELESS set. In other words, such "top level" set-
1338     tings apply to the whole pattern (unless there are other
1339     changes inside subpatterns). If there is more than one set-
1340     ting of the same option at top level, the rightmost setting
1341     is used.
1342    
1343     If an option change occurs inside a subpattern, the effect
1344     is different. This is a change of behaviour in Perl 5.005.
1345     An option change inside a subpattern affects only that part
1346     of the subpattern that follows it, so
1347    
1348     (a(?i)b)c
1349    
1350     matches abc and aBc and no other strings (assuming
1351     PCRE_CASELESS is not used). By this means, options can be
1352     made to have different settings in different parts of the
1353     pattern. Any changes made in one alternative do carry on
1354     into subsequent branches within the same subpattern. For
1355     example,
1356    
1357     (a(?i)b|c)
1358    
1359     matches "ab", "aB", "c", and "C", even though when matching
1360     "C" the first branch is abandoned before the option setting.
1361     This is because the effects of option settings happen at
1362     compile time. There would be some very weird behaviour oth-
1363     erwise.
1364    
1365     The PCRE-specific options PCRE_UNGREEDY and PCRE_EXTRA can
1366     be changed in the same way as the Perl-compatible options by
1367     using the characters U and X respectively. The (?X) flag
1368     setting is special in that it must always occur earlier in
1369     the pattern than any of the additional features it turns on,
1370     even when it is at top level. It is best put at the start.
1371    
1372    
1373    
1374     SUBPATTERNS
1375     Subpatterns are delimited by parentheses (round brackets),
1376     which can be nested. Marking part of a pattern as a subpat-
1377     tern does two things:
1378    
1379     1. It localizes a set of alternatives. For example, the pat-
1380     tern
1381    
1382     cat(aract|erpillar|)
1383    
1384     matches one of the words "cat", "cataract", or "caterpil-
1385     lar". Without the parentheses, it would match "cataract",
1386     "erpillar" or the empty string.
1387    
1388     2. It sets up the subpattern as a capturing subpattern (as
1389     defined above). When the whole pattern matches, that por-
1390     tion of the subject string that matched the subpattern is
1391     passed back to the caller via the ovector argument of
1392     pcre_exec(). Opening parentheses are counted from left to
1393     right (starting from 1) to obtain the numbers of the captur-
1394     ing subpatterns.
1395    
1396     For example, if the string "the red king" is matched against
1397     the pattern
1398    
1399     the ((red|white) (king|queen))
1400    
1401     the captured substrings are "red king", "red", and "king",
1402 nigel 53 and are numbered 1, 2, and 3, respectively.
1403 nigel 41
1404     The fact that plain parentheses fulfil two functions is not
1405     always helpful. There are often times when a grouping sub-
1406     pattern is required without a capturing requirement. If an
1407     opening parenthesis is followed by "?:", the subpattern does
1408     not do any capturing, and is not counted when computing the
1409     number of any subsequent capturing subpatterns. For example,
1410     if the string "the white queen" is matched against the pat-
1411     tern
1412    
1413     the ((?:red|white) (king|queen))
1414    
1415     the captured substrings are "white queen" and "queen", and
1416     are numbered 1 and 2. The maximum number of captured sub-
1417     strings is 99, and the maximum number of all subpatterns,
1418     both capturing and non-capturing, is 200.
1419    
1420     As a convenient shorthand, if any option settings are
1421     required at the start of a non-capturing subpattern, the
1422     option letters may appear between the "?" and the ":". Thus
1423     the two patterns
1424    
1425     (?i:saturday|sunday)
1426     (?:(?i)saturday|sunday)
1427    
1428     match exactly the same set of strings. Because alternative
1429     branches are tried from left to right, and options are not
1430     reset until the end of the subpattern is reached, an option
1431     setting in one branch does affect subsequent branches, so
1432     the above patterns match "SUNDAY" as well as "Saturday".
1433    
1434    
1435    
1436     REPETITION
1437     Repetition is specified by quantifiers, which can follow any
1438     of the following items:
1439    
1440     a single character, possibly escaped
1441     the . metacharacter
1442     a character class
1443     a back reference (see next section)
1444     a parenthesized subpattern (unless it is an assertion -
1445     see below)
1446    
1447     The general repetition quantifier specifies a minimum and
1448     maximum number of permitted matches, by giving the two
1449     numbers in curly brackets (braces), separated by a comma.
1450     The numbers must be less than 65536, and the first must be
1451     less than or equal to the second. For example:
1452    
1453     z{2,4}
1454    
1455     matches "zz", "zzz", or "zzzz". A closing brace on its own
1456     is not a special character. If the second number is omitted,
1457     but the comma is present, there is no upper limit; if the
1458     second number and the comma are both omitted, the quantifier
1459     specifies an exact number of required matches. Thus
1460    
1461     [aeiou]{3,}
1462    
1463     matches at least 3 successive vowels, but may match many
1464     more, while
1465    
1466     \d{8}
1467    
1468     matches exactly 8 digits. An opening curly bracket that
1469     appears in a position where a quantifier is not allowed, or
1470     one that does not match the syntax of a quantifier, is taken
1471     as a literal character. For example, {,6} is not a quantif-
1472     ier, but a literal string of four characters.
1473     The quantifier {0} is permitted, causing the expression to
1474     behave as if the previous item and the quantifier were not
1475     present.
1476    
1477     For convenience (and historical compatibility) the three
1478     most common quantifiers have single-character abbreviations:
1479    
1480     * is equivalent to {0,}
1481     + is equivalent to {1,}
1482     ? is equivalent to {0,1}
1483    
1484     It is possible to construct infinite loops by following a
1485     subpattern that can match no characters with a quantifier
1486     that has no upper limit, for example:
1487    
1488     (a?)*
1489    
1490     Earlier versions of Perl and PCRE used to give an error at
1491     compile time for such patterns. However, because there are
1492     cases where this can be useful, such patterns are now
1493     accepted, but if any repetition of the subpattern does in
1494     fact match no characters, the loop is forcibly broken.
1495    
1496     By default, the quantifiers are "greedy", that is, they
1497     match as much as possible (up to the maximum number of per-
1498     mitted times), without causing the rest of the pattern to
1499     fail. The classic example of where this gives problems is in
1500     trying to match comments in C programs. These appear between
1501     the sequences /* and */ and within the sequence, individual
1502     * and / characters may appear. An attempt to match C com-
1503     ments by applying the pattern
1504    
1505     /\*.*\*/
1506    
1507     to the string
1508    
1509     /* first comment */ not comment /* second comment */
1510    
1511 nigel 51 fails, because it matches the entire string owing to the
1512 nigel 41 greediness of the .* item.
1513    
1514 nigel 47 However, if a quantifier is followed by a question mark, it
1515     ceases to be greedy, and instead matches the minimum number
1516     of times possible, so the pattern
1517 nigel 41
1518     /\*.*?\*/
1519    
1520     does the right thing with the C comments. The meaning of the
1521     various quantifiers is not otherwise changed, just the pre-
1522     ferred number of matches. Do not confuse this use of ques-
1523     tion mark with its use as a quantifier in its own right.
1524     Because it has two uses, it can sometimes appear doubled, as
1525     in
1526    
1527     \d??\d
1528    
1529     which matches one digit by preference, but can match two if
1530     that is the only way the rest of the pattern matches.
1531    
1532     If the PCRE_UNGREEDY option is set (an option which is not
1533 nigel 47 available in Perl), the quantifiers are not greedy by
1534 nigel 41 default, but individual ones can be made greedy by following
1535     them with a question mark. In other words, it inverts the
1536     default behaviour.
1537    
1538     When a parenthesized subpattern is quantified with a minimum
1539     repeat count that is greater than 1 or with a limited max-
1540     imum, more store is required for the compiled pattern, in
1541     proportion to the size of the minimum or maximum.
1542    
1543     If a pattern starts with .* or .{0,} and the PCRE_DOTALL
1544     option (equivalent to Perl's /s) is set, thus allowing the .
1545 nigel 47 to match newlines, the pattern is implicitly anchored,
1546 nigel 41 because whatever follows will be tried against every charac-
1547     ter position in the subject string, so there is no point in
1548     retrying the overall match at any position after the first.
1549     PCRE treats such a pattern as though it were preceded by \A.
1550     In cases where it is known that the subject string contains
1551     no newlines, it is worth setting PCRE_DOTALL when the pat-
1552     tern begins with .* in order to obtain this optimization, or
1553     alternatively using ^ to indicate anchoring explicitly.
1554    
1555     When a capturing subpattern is repeated, the value captured
1556     is the substring that matched the final iteration. For exam-
1557     ple, after
1558    
1559     (tweedle[dume]{3}\s*)+
1560    
1561     has matched "tweedledum tweedledee" the value of the cap-
1562     tured substring is "tweedledee". However, if there are
1563     nested capturing subpatterns, the corresponding captured
1564     values may have been set in previous iterations. For exam-
1565     ple, after
1566    
1567     /(a|(b))+/
1568    
1569     matches "aba" the value of the second captured substring is
1570     "b".
1571    
1572    
1573    
1574     BACK REFERENCES
1575     Outside a character class, a backslash followed by a digit
1576     greater than 0 (and possibly further digits) is a back
1577 nigel 53
1578    
1579    
1580    
1581     SunOS 5.8 Last change: 30
1582    
1583    
1584    
1585 nigel 41 reference to a capturing subpattern earlier (i.e. to its
1586     left) in the pattern, provided there have been that many
1587     previous capturing left parentheses.
1588    
1589     However, if the decimal number following the backslash is
1590     less than 10, it is always taken as a back reference, and
1591     causes an error only if there are not that many capturing
1592     left parentheses in the entire pattern. In other words, the
1593     parentheses that are referenced need not be to the left of
1594     the reference for numbers less than 10. See the section
1595     entitled "Backslash" above for further details of the han-
1596     dling of digits following a backslash.
1597    
1598     A back reference matches whatever actually matched the cap-
1599     turing subpattern in the current subject string, rather than
1600     anything matching the subpattern itself. So the pattern
1601    
1602     (sens|respons)e and \1ibility
1603    
1604     matches "sense and sensibility" and "response and responsi-
1605     bility", but not "sense and responsibility". If caseful
1606 nigel 47 matching is in force at the time of the back reference, the
1607     case of letters is relevant. For example,
1608 nigel 41
1609     ((?i)rah)\s+\1
1610    
1611     matches "rah rah" and "RAH RAH", but not "RAH rah", even
1612     though the original capturing subpattern is matched case-
1613     lessly.
1614    
1615     There may be more than one back reference to the same sub-
1616     pattern. If a subpattern has not actually been used in a
1617 nigel 47 particular match, any back references to it always fail. For
1618     example, the pattern
1619 nigel 41
1620     (a|(bc))\2
1621    
1622     always fails if it starts to match "a" rather than "bc".
1623     Because there may be up to 99 back references, all digits
1624     following the backslash are taken as part of a potential
1625     back reference number. If the pattern continues with a digit
1626 nigel 47 character, some delimiter must be used to terminate the back
1627     reference. If the PCRE_EXTENDED option is set, this can be
1628     whitespace. Otherwise an empty comment can be used.
1629 nigel 41
1630     A back reference that occurs inside the parentheses to which
1631     it refers fails when the subpattern is first used, so, for
1632     example, (a\1) never matches. However, such references can
1633 nigel 49 be useful inside repeated subpatterns. For example, the pat-
1634     tern
1635 nigel 41
1636     (a|b\1)+
1637    
1638 nigel 49 matches any number of "a"s and also "aba", "ababbaa" etc. At
1639 nigel 41 each iteration of the subpattern, the back reference matches
1640 nigel 53 the character string corresponding to the previous itera-
1641     tion. In order for this to work, the pattern must be such
1642     that the first iteration does not need to match the back
1643     reference. This can be done using alternation, as in the
1644     example above, or by a quantifier with a minimum of zero.
1645 nigel 41
1646    
1647    
1648     ASSERTIONS
1649     An assertion is a test on the characters following or
1650     preceding the current matching point that does not actually
1651     consume any characters. The simple assertions coded as \b,
1652     \B, \A, \Z, \z, ^ and $ are described above. More compli-
1653     cated assertions are coded as subpatterns. There are two
1654     kinds: those that look ahead of the current position in the
1655     subject string, and those that look behind it.
1656 nigel 43
1657 nigel 41 An assertion subpattern is matched in the normal way, except
1658     that it does not cause the current matching position to be
1659     changed. Lookahead assertions start with (?= for positive
1660     assertions and (?! for negative assertions. For example,
1661    
1662     \w+(?=;)
1663    
1664     matches a word followed by a semicolon, but does not include
1665     the semicolon in the match, and
1666    
1667     foo(?!bar)
1668    
1669     matches any occurrence of "foo" that is not followed by
1670     "bar". Note that the apparently similar pattern
1671    
1672     (?!foo)bar
1673    
1674     does not find an occurrence of "bar" that is preceded by
1675     something other than "foo"; it finds any occurrence of "bar"
1676     whatsoever, because the assertion (?!foo) is always true
1677     when the next three characters are "bar". A lookbehind
1678     assertion is needed to achieve this effect.
1679    
1680     Lookbehind assertions start with (?<= for positive asser-
1681     tions and (?<! for negative assertions. For example,
1682    
1683     (?<!foo)bar
1684    
1685     does find an occurrence of "bar" that is not preceded by
1686     "foo". The contents of a lookbehind assertion are restricted
1687     such that all the strings it matches must have a fixed
1688     length. However, if there are several alternatives, they do
1689     not all have to have the same fixed length. Thus
1690    
1691     (?<=bullock|donkey)
1692    
1693     is permitted, but
1694    
1695     (?<!dogs?|cats?)
1696    
1697     causes an error at compile time. Branches that match dif-
1698     ferent length strings are permitted only at the top level of
1699     a lookbehind assertion. This is an extension compared with
1700     Perl 5.005, which requires all branches to match the same
1701     length of string. An assertion such as
1702    
1703     (?<=ab(c|de))
1704    
1705     is not permitted, because its single top-level branch can
1706     match two different lengths, but it is acceptable if rewrit-
1707     ten to use two top-level branches:
1708    
1709     (?<=abc|abde)
1710    
1711     The implementation of lookbehind assertions is, for each
1712     alternative, to temporarily move the current position back
1713     by the fixed width and then try to match. If there are
1714     insufficient characters before the current position, the
1715     match is deemed to fail. Lookbehinds in conjunction with
1716     once-only subpatterns can be particularly useful for match-
1717     ing at the ends of strings; an example is given at the end
1718     of the section on once-only subpatterns.
1719    
1720     Several assertions (of any sort) may occur in succession.
1721     For example,
1722    
1723     (?<=\d{3})(?<!999)foo
1724    
1725     matches "foo" preceded by three digits that are not "999".
1726     Notice that each of the assertions is applied independently
1727     at the same point in the subject string. First there is a
1728 nigel 47 check that the previous three characters are all digits, and
1729 nigel 41 then there is a check that the same three characters are not
1730     "999". This pattern does not match "foo" preceded by six
1731     characters, the first three of which are digits and the last
1732     three of which are not "999". For example, it doesn't match
1733     "123abcfoo". A pattern to do that is
1734    
1735     (?<=\d{3}...)(?<!999)foo
1736    
1737     This time the first assertion looks at the preceding six
1738     characters, checking that the first three are digits, and
1739     then the second assertion checks that the preceding three
1740     characters are not "999".
1741    
1742     Assertions can be nested in any combination. For example,
1743    
1744     (?<=(?<!foo)bar)baz
1745    
1746     matches an occurrence of "baz" that is preceded by "bar"
1747     which in turn is not preceded by "foo", while
1748    
1749     (?<=\d{3}(?!999)...)foo
1750    
1751     is another pattern which matches "foo" preceded by three
1752     digits and any three characters that are not "999".
1753    
1754     Assertion subpatterns are not capturing subpatterns, and may
1755     not be repeated, because it makes no sense to assert the
1756     same thing several times. If any kind of assertion contains
1757     capturing subpatterns within it, these are counted for the
1758     purposes of numbering the capturing subpatterns in the whole
1759     pattern. However, substring capturing is carried out only
1760     for positive assertions, because it does not make sense for
1761     negative assertions.
1762    
1763     Assertions count towards the maximum of 200 parenthesized
1764     subpatterns.
1765    
1766    
1767    
1768     ONCE-ONLY SUBPATTERNS
1769     With both maximizing and minimizing repetition, failure of
1770     what follows normally causes the repeated item to be re-
1771     evaluated to see if a different number of repeats allows the
1772     rest of the pattern to match. Sometimes it is useful to
1773     prevent this, either to change the nature of the match, or
1774     to cause it to fail earlier than it otherwise might, when the
1775     author of the pattern knows there is no point in carrying
1776     on.
1777    
1778     Consider, for example, the pattern \d+foo when applied to
1779     the subject line
1780    
1781     123456bar
1782    
1783     After matching all 6 digits and then failing to match "foo",
1784     the normal action of the matcher is to try again with only 5
1785     digits matching the \d+ item, and then with 4, and so on,
1786     before ultimately failing. Once-only subpatterns provide the
1787     means for specifying that once a portion of the pattern has
1788     matched, it is not to be re-evaluated in this way, so the
1789     matcher would give up immediately on failing to match "foo"
1790     the first time. The notation is another kind of special
1791     parenthesis, starting with (?> as in this example:
1792    
1793     (?>\d+)foo
1794    
1795     This kind of parenthesis "locks up" the part of the pattern
1796     it contains once it has matched, and a failure further into
1797 nigel 53 the pattern is prevented from backtracking into it. Back-
1798     tracking past it to previous items, however, works as nor-
1799     mal.
1800 nigel 41
1801     An alternative description is that a subpattern of this type
1802     matches the string of characters that an identical stan-
1803     dalone pattern would match, if anchored at the current point
1804     in the subject string.
1805    
1806     Once-only subpatterns are not capturing subpatterns. Simple
1807     cases such as the above example can be thought of as a max-
1808     imizing repeat that must swallow everything it can. So,
1809     while both \d+ and \d+? are prepared to adjust the number of
1810     digits they match in order to make the rest of the pattern
1811     match, (?>\d+) can only match an entire sequence of digits.
1812    
1813     This construction can of course contain arbitrarily compli-
1814     cated subpatterns, and it can be nested.
1815    
1816     Once-only subpatterns can be used in conjunction with look-
1817     behind assertions to specify efficient matching at the end
1818     of the subject string. Consider a simple pattern such as
1819    
1820     abcd$
1821    
1822 nigel 43 when applied to a long string which does not match. Because
1823     matching proceeds from left to right, PCRE will look for
1824     each "a" in the subject and then see if what follows matches
1825     the rest of the pattern. If the pattern is specified as
1826 nigel 41
1827     ^.*abcd$
1828    
1829 nigel 47 the initial .* matches the entire string at first, but when
1830     this fails (because there is no following "a"), it back-
1831     tracks to match all but the last character, then all but the
1832     last two characters, and so on. Once again the search for
1833     "a" covers the entire string, from right to left, so we are
1834     no better off. However, if the pattern is written as
1835 nigel 41
1836     ^(?>.*)(?<=abcd)
1837    
1838 nigel 47 there can be no backtracking for the .* item; it can match
1839     only the entire string. The subsequent lookbehind assertion
1840     does a single test on the last four characters. If it fails,
1841     the match fails immediately. For long strings, this approach
1842     makes a significant difference to the processing time.
1843 nigel 41
1844 nigel 43 When a pattern contains an unlimited repeat inside a subpat-
1845     tern that can itself be repeated an unlimited number of
1846     times, the use of a once-only subpattern is the only way to
1847     avoid some failing matches taking a very long time indeed.
1848     The pattern
1849 nigel 41
1850 nigel 43 (\D+|<\d+>)*[!?]
1851 nigel 41
1852 nigel 43 matches an unlimited number of substrings that either con-
1853     sist of non-digits, or digits enclosed in <>, followed by
1854     either ! or ?. When it matches, it runs quickly. However, if
1855     it is applied to
1856    
1857     aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
1858    
1859     it takes a long time before reporting failure. This is
1860     because the string can be divided between the two repeats in
1861     a large number of ways, and all have to be tried. (The exam-
1862     ple used [!?] rather than a single character at the end,
1863     because both PCRE and Perl have an optimization that allows
1864     for fast failure when a single character is used. They
1865     remember the last single character that is required for a
1866     match, and fail early if it is not present in the string.)
1867     If the pattern is changed to
1868    
1869     ((?>\D+)|<\d+>)*[!?]
1870    
1871     sequences of non-digits cannot be broken, and failure hap-
1872     pens quickly.
1873    
1874    
1875    
1876 nigel 41 CONDITIONAL SUBPATTERNS
1877     It is possible to cause the matching process to obey a sub-
1878     pattern conditionally or to choose between two alternative
1879     subpatterns, depending on the result of an assertion, or
1880     whether a previous capturing subpattern matched or not. The
1881     two possible forms of conditional subpattern are
1882    
1883     (?(condition)yes-pattern)
1884     (?(condition)yes-pattern|no-pattern)
1885    
1886     If the condition is satisfied, the yes-pattern is used; oth-
1887     erwise the no-pattern (if present) is used. If there are
1888     more than two alternatives in the subpattern, a compile-time
1889     error occurs.
1890    
1891     There are two kinds of condition. If the text between the
1892 nigel 47 parentheses consists of a sequence of digits, the condition
1893     is satisfied if the capturing subpattern of that number has
1894 nigel 51 previously matched. The number must be greater than zero.
1895     Consider the following pattern, which contains non-
1896     significant white space to make it more readable (assume the
1897     PCRE_EXTENDED option) and to divide it into three parts for
1898     ease of discussion:
1899 nigel 41
1900     ( \( )? [^()]+ (?(1) \) )
1901    
1902     The first part matches an optional opening parenthesis, and
1903     if that character is present, sets it as the first captured
1904     substring. The second part matches one or more characters
1905     that are not parentheses. The third part is a conditional
1906     subpattern that tests whether the first set of parentheses
1907     matched or not. If they did, that is, if the subject started
1908     with an opening parenthesis, the condition is true, and so
1909     the yes-pattern is executed and a closing parenthesis is
1910     required. Otherwise, since no-pattern is not present, the
1911     subpattern matches nothing. In other words, this pattern
1912     matches a sequence of non-parentheses, optionally enclosed
1913     in parentheses.
1914    
1915     If the condition is not a sequence of digits, it must be an
1916     assertion. This may be a positive or negative lookahead or
1917     lookbehind assertion. Consider this pattern, again contain-
1918     ing non-significant white space, and with the two alterna-
1919     tives on the second line:
1920    
1921     (?(?=[^a-z]*[a-z])
1922     \d{2}-[a-z]{3}-\d{2} | \d{2}-\d{2}-\d{2} )
1923    
1924     The condition is a positive lookahead assertion that matches
1925     an optional sequence of non-letters followed by a letter. In
1926     other words, it tests for the presence of at least one
1927     letter in the subject. If a letter is found, the subject is
1928     matched against the first alternative; otherwise it is
1929     matched against the second. This pattern matches strings in
1930     one of the two forms dd-aaa-dd or dd-dd-dd, where aaa are
1931     letters and dd are digits.
1932    
1933    
1934    
1935     COMMENTS
1936     The sequence (?# marks the start of a comment which contin-
1937     ues up to the next closing parenthesis. Nested parentheses
1938     are not permitted. The characters that make up a comment
1939     play no part in the pattern matching at all.
1940    
1941     If the PCRE_EXTENDED option is set, an unescaped # character
1942     outside a character class introduces a comment that contin-
1943     ues up to the next newline character in the pattern.
1944    
1945    
1946    
1947 nigel 43 RECURSIVE PATTERNS
1948     Consider the problem of matching a string in parentheses,
1949     allowing for unlimited nested parentheses. Without the use
1950     of recursion, the best that can be done is to use a pattern
1951     that matches up to some fixed depth of nesting. It is not
1952     possible to handle an arbitrary nesting depth. Perl 5.6 has
1953     provided an experimental facility that allows regular
1954     expressions to recurse (amongst other things). It does this
1955     by interpolating Perl code in the expression at run time,
1956     and the code can refer to the expression itself. A Perl pat-
1957     tern to solve the parentheses problem can be created like
1958     this:
1959    
1960     $re = qr{\( (?: (?>[^()]+) | (?p{$re}) )* \)}x;
1961    
1962     The (?p{...}) item interpolates Perl code at run time, and
1963     in this case refers recursively to the pattern in which it
1964     appears. Obviously, PCRE cannot support the interpolation of
1965     Perl code. Instead, the special item (?R) is provided for
1966     the specific case of recursion. This PCRE pattern solves the
1967     parentheses problem (assume the PCRE_EXTENDED option is set
1968     so that white space is ignored):
1969    
1970     \( ( (?>[^()]+) | (?R) )* \)
1971    
1972     First it matches an opening parenthesis. Then it matches any
1973     number of substrings which can either be a sequence of non-
1974     parentheses, or a recursive match of the pattern itself
1975     (i.e. a correctly parenthesized substring). Finally there is
1976     a closing parenthesis.
1977    
1978     This particular example pattern contains nested unlimited
1979     repeats, and so the use of a once-only subpattern for match-
1980     ing strings of non-parentheses is important when applying
1981     the pattern to strings that do not match. For example, when
1982     it is applied to
1983    
1984     (aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa()
1985    
1986     it yields "no match" quickly. However, if a once-only sub-
1987     pattern is not used, the match runs for a very long time
1988     indeed because there are so many different ways the + and *
1989     repeats can carve up the subject, and all have to be tested
1990     before failure can be reported.
1991    
1992     The values set for any capturing subpatterns are those from
1993     the outermost level of the recursion at which the subpattern
1994     value is set. If the pattern above is matched against
1995    
1996     (ab(cd)ef)
1997    
1998     the value for the capturing parentheses is "ef", which is
1999     the last value taken on at the top level. If additional
2000     parentheses are added, giving
2001    
2002     \( ( ( (?>[^()]+) | (?R) )* ) \)
2003        ^                        ^
2004 nigel 47 the string they capture is
2005     "ab(cd)ef", the contents of the top level parentheses. If
2006 nigel 43 there are more than 15 capturing parentheses in a pattern,
2007     PCRE has to obtain extra memory to store data during a
2008     recursion, which it does by using pcre_malloc, freeing it
2009     via pcre_free afterwards. If no memory can be obtained, it
2010     saves data for the first 15 capturing parentheses only, as
2011     there is no way to give an out-of-memory error from within a
2012     recursion.
2013    
2014    
2015    
2016 nigel 41 PERFORMANCE
2017     Certain items that may appear in patterns are more efficient
2018     than others. It is more efficient to use a character class
2019     like [aeiou] than a set of alternatives such as (a|e|i|o|u).
2020     In general, the simplest construction that provides the
2021     required behaviour is usually the most efficient. Jeffrey
2022     Friedl's book contains a lot of discussion about optimizing
2023     regular expressions for efficient performance.
2024    
2025     When a pattern begins with .* and the PCRE_DOTALL option is
2026     set, the pattern is implicitly anchored by PCRE, since it
2027     can match only at the start of a subject string. However, if
2028     PCRE_DOTALL is not set, PCRE cannot make this optimization,
2029     because the . metacharacter does not then match a newline,
2030     and if the subject string contains newlines, the pattern may
2031     match from the character immediately following one of them
2032     instead of from the very start. For example, the pattern
2033    
2034     (.*) second
2035    
2036     matches the subject "first\nand second" (where \n stands for
2037     a newline character) with the first captured substring being
2038     "and". In order to do this, PCRE has to retry the match
2039     starting after every newline in the subject.
2040    
2041     If you are using such a pattern with subject strings that do
2042     not contain newlines, the best performance is obtained by
2043     setting PCRE_DOTALL, or starting the pattern with ^.* to
2044     indicate explicit anchoring. That saves PCRE from having to
2045     scan along the subject looking for a newline to restart at.
2046    
2047     Beware of patterns that contain nested indefinite repeats.
2048     These can take a long time to run when applied to a string
2049     that does not match. Consider the pattern fragment
2050    
2051     (a+)*
2052    
2053     This can match "aaaa" in 16 different ways, and this number
2054     increases very rapidly as the string gets longer. (The *
2055     repeat can match 0, 1, 2, 3, or 4 times, and for each of
2056     those cases other than 0, the + repeats can match different
2057     numbers of times.) When the remainder of the pattern is such
2058 nigel 51 that the entire match is going to fail, PCRE has in princi-
2059     ple to try every possible variation, and this can take an
2060     extremely long time.
2061 nigel 41
2062     An optimization catches some of the more simple cases such
2063     as
2064    
2065     (a+)*b
2066    
2067     where a literal character follows. Before embarking on the
2068     standard matching procedure, PCRE checks that there is a "b"
2069     later in the subject string, and if there is not, it fails
2070     the match immediately. However, when there is no following
2071     literal this optimization cannot be used. You can see the
2072     difference by comparing the behaviour of
2073    
2074     (a+)*\d
2075    
2076     with the pattern above. The pattern above, (a+)*b, gives a
2077     failure almost instantly when applied to a whole line of "a"
2078     characters, whereas (a+)*\d takes an appreciable time with
2079     strings longer than about 20 characters.
2080    
2081    
2082    
2083 nigel 49 UTF-8 SUPPORT
2084     Starting at release 3.3, PCRE has some support for character
2085     strings encoded in the UTF-8 format. This is incomplete, and
2086     is regarded as experimental. In order to use it, you must
2087     configure PCRE to include UTF-8 support in the code, and, in
2088     addition, you must call pcre_compile() with the PCRE_UTF8
2089     option flag. When you do this, both the pattern and any sub-
2090     ject strings that are matched against it are treated as
2091     UTF-8 strings instead of just strings of bytes, but only in
2092     the cases that are mentioned below.
2093    
2094     If you compile PCRE with UTF-8 support, but do not use it at
2095     run time, the library will be a bit bigger, but the addi-
2096     tional run time overhead is limited to testing the PCRE_UTF8
2097     flag in several places, so should not be very large.
2098    
2099     PCRE assumes that the strings it is given contain valid
2100     UTF-8 codes. It does not diagnose invalid UTF-8 strings. If
2101     you pass invalid UTF-8 strings to PCRE, the results are
2102     undefined.
2103    
2104     Running with PCRE_UTF8 set causes these changes in the way
2105     PCRE works:
2106    
2107 nigel 53 1. In a pattern, the escape sequence \x{...}, where the
2108     contents of the braces is a string of hexadecimal digits, is
2109 nigel 49 interpreted as a UTF-8 character whose code number is the
2110     given hexadecimal number, for example: \x{1234}. This
2111     inserts from one to six literal bytes into the pattern,
2112     using the UTF-8 encoding. If a non-hexadecimal digit appears
2113     between the braces, the item is not recognized.
2114    
2115     2. The original hexadecimal escape sequence, \xhh, generates
2116     a two-byte UTF-8 character if its value is greater than 127.
2117    
2118     3. Repeat quantifiers are NOT correctly handled if they fol-
2119     low a multibyte character. For example, \x{100}* and \xc3+
2120     do not work. If you want to repeat such characters, you must
2121     enclose them in non-capturing parentheses, for example
2122     (?:\x{100}), at present.
2123    
2124     4. The dot metacharacter matches one UTF-8 character instead
2125     of a single byte.
2126    
2127     5. Unlike literal UTF-8 characters, the dot metacharacter
2128     followed by a repeat quantifier does operate correctly on
2129     UTF-8 characters instead of single bytes.
2130    
2131     6. Although the \x{...} escape is permitted in a character
2132     class, characters whose values are greater than 255 cannot
2133     be included in a class.
2134    
2135     7. A class is matched against a UTF-8 character instead of
2136     just a single byte, but it can match only characters whose
2137     values are less than 256. Characters with greater values
2138     always fail to match a class.
2139    
2140     8. Repeated classes work correctly on multiple characters.
2141    
2142     9. Classes containing just a single character whose value is
2143     greater than 127 (but less than 256), for example, [\x80] or
2144     [^\x{93}], do not work because these are optimized into sin-
2145     gle byte matches. In the first case, of course, the class
2146     brackets are just redundant.
2147    
2148     10. Lookbehind assertions move backwards in the subject by a
2149     fixed number of characters instead of a fixed number of
2150     bytes. Simple cases have been tested to work correctly, but
2151     there may be hidden gotchas herein.
2152    
2153     11. The character types such as \d and \w do not work
2154     correctly with UTF-8 characters. They continue to test a
2155     single byte.
2156    
2157     12. Anything not explicitly mentioned here continues to work
2158     in bytes rather than in characters.
2159    
2160     The following UTF-8 features of Perl 5.6 are not imple-
2161     mented:
2162 nigel 53
2163 nigel 49 1. The escape sequence \C to match a single byte.
2164    
2165     2. The use of Unicode tables and properties and escapes \p,
2166     \P, and \X.
2167    
2168    
2169    
2170 nigel 53 SAMPLE PROGRAM
2171     The code below is a simple, complete demonstration program,
2172     to get you started with using PCRE. This code is also sup-
2173     plied in the file pcredemo.c in the PCRE distribution.
2174    
2175     The program compiles the regular expression that is its
2176     first argument, and matches it against the subject string in
2177     its second argument. No options are set, and default charac-
2178     ter tables are used. If matching succeeds, the program out-
2179     puts the portion of the subject that matched, together with
2180     the contents of any captured substrings.
2181    
2182     On a Unix system that has PCRE installed in /usr/local, you
2183     can compile the demonstration program using a command like
2184     this:
2185    
2186     gcc -o pcredemo pcredemo.c -I/usr/local/include \
2187     -L/usr/local/lib -lpcre
2188    
2189     Then you can run simple tests like this:
2190    
2191     ./pcredemo 'cat|dog' 'the cat sat on the mat'
2192    
2193     Note that there is a much more comprehensive test program,
2194     called pcretest, which supports many more facilities for
2195     testing regular expressions. The pcredemo program is pro-
2196     vided as a simple coding example.
2197    
2198     On some operating systems (e.g. Solaris) you may get an
2199     error like this when you try to run pcredemo:
2200    
2201     ld.so.1: a.out: fatal: libpcre.so.0: open failed: No such
2202     file or directory
2203    
2204     This is caused by the way shared library support works on
2205     those systems. You need to add
2206    
2207     -R/usr/local/lib
2208    
2209     to the compile command to get round this problem. Here's the
2210     code:
2211    
2212     #include <stdio.h>
2213     #include <string.h>
2214     #include <pcre.h>
2215    
2216     #define OVECCOUNT 30 /* should be a multiple of 3 */
2217    
2218     int main(int argc, char **argv)
2219     {
2220     pcre *re;
2221     const char *error;
2222     int erroffset;
2223     int ovector[OVECCOUNT];
2224     int rc, i;
2225    
2226     if (argc != 3)
2227     {
2228     printf("Two arguments required: a regex and a "
2229     "subject string\n");
2230     return 1;
2231     }
2232    
2233     /* Compile the regular expression in the first argument */
2234    
2235     re = pcre_compile(
2236     argv[1], /* the pattern */
2237     0, /* default options */
2238     &error, /* for error message */
2239     &erroffset, /* for error offset */
2240     NULL); /* use default character tables */
2241    
2242     /* Compilation failed: print the error message and exit */
2243    
2244     if (re == NULL)
2245     {
2246     printf("PCRE compilation failed at offset %d: %s\n",
2247     erroffset, error);
2248     return 1;
2249     }
2250    
2251     /* Compilation succeeded: match the subject in the second
2252     argument */
2253    
2254     rc = pcre_exec(
2255     re, /* the compiled pattern */
2256     NULL, /* we didn't study the pattern */
2257     argv[2], /* the subject string */
2258     (int)strlen(argv[2]), /* the length of the subject */
2259     0, /* start at offset 0 in the subject */
2260     0, /* default options */
2261     ovector, /* vector for substring information */
2262     OVECCOUNT); /* number of elements in the vector */
2263    
2264     /* Matching failed: handle error cases */
2265    
2266     if (rc < 0)
2267     {
2268     switch(rc)
2269     {
2270     case PCRE_ERROR_NOMATCH: printf("No match\n"); break;
2271     /*
2272     Handle other special cases if you like
2273     */
2274     default: printf("Matching error %d\n", rc); break;
2275     }
2276     return 1;
2277     }
2278    
2279     /* Match succeeded */
2280    
2281     printf("Match succeeded\n");
2282    
2283     /* The output vector wasn't big enough */
2284    
2285     if (rc == 0)
2286     {
2287     rc = OVECCOUNT/3;
2288     printf("ovector only has room for %d captured "
2289     "substrings\n", rc - 1);
2290     }
2291    
2292     /* Show substrings stored in the output vector */
2293    
2294     for (i = 0; i < rc; i++)
2295     {
2296     char *substring_start = argv[2] + ovector[2*i];
2297     int substring_length = ovector[2*i+1] - ovector[2*i];
2298     printf("%2d: %.*s\n", i, substring_length,
2299     substring_start);
2300     }
2301    
2302     return 0;
2303     }
2304    
2305    
2306    
2307 nigel 41 AUTHOR
2308     Philip Hazel <ph10@cam.ac.uk>
2309     University Computing Service,
2310     New Museums Site,
2311     Cambridge CB2 3QG, England.
2312     Phone: +44 1223 334714
2313    
2314 nigel 53 Last updated: 15 August 2001
2315     Copyright (c) 1997-2001 University of Cambridge.

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12