/[pcre]/code/tags/pcre-6.3/pcre_internal.h
ViewVC logotype

Contents of /code/tags/pcre-6.3/pcre_internal.h

Parent Directory Parent Directory | Revision Log Revision Log


Revision 84 - (hide annotations) (download)
Sat Feb 24 21:41:08 2007 UTC (7 years, 1 month ago) by nigel
File MIME type: text/plain
File size: 36859 byte(s)
Tag code/trunk as code/tags/pcre-6.3.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5    
6     /* PCRE is a library of functions to support regular expressions whose syntax
7     and semantics are as close as possible to those of the Perl 5 language.
8    
9     Written by Philip Hazel
10     Copyright (c) 1997-2005 University of Cambridge
11    
12     -----------------------------------------------------------------------------
13     Redistribution and use in source and binary forms, with or without
14     modification, are permitted provided that the following conditions are met:
15    
16     * Redistributions of source code must retain the above copyright notice,
17     this list of conditions and the following disclaimer.
18    
19     * Redistributions in binary form must reproduce the above copyright
20     notice, this list of conditions and the following disclaimer in the
21     documentation and/or other materials provided with the distribution.
22    
23     * Neither the name of the University of Cambridge nor the names of its
24     contributors may be used to endorse or promote products derived from
25     this software without specific prior written permission.
26    
27     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37     POSSIBILITY OF SUCH DAMAGE.
38     -----------------------------------------------------------------------------
39     */
40    
41     /* This header contains definitions that are shared between the different
42     modules, but which are not relevant to the exported API. This includes some
43     functions whose names all begin with "_pcre_". */
44    
45    
46     /* Define DEBUG to get debugging output on stdout. */
47    
48     /****
49     #define DEBUG
50     ****/
51    
52     /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
53     inline, and there are *still* stupid compilers about that don't like indented
54     pre-processor statements, or at least there were when I first wrote this. After
55     all, it had only been about 10 years then... */
56    
57     #ifdef DEBUG
58     #define DPRINTF(p) printf p
59     #else
60     #define DPRINTF(p) /*nothing*/
61     #endif
62    
63    
64     /* Get the definitions provided by running "configure" */
65    
66     #include "config.h"
67    
68     /* Standard C headers plus the external interface definition. The only time
69     setjmp and stdarg are used is when NO_RECURSE is set. */
70    
71     #include <ctype.h>
72     #include <limits.h>
73     #include <setjmp.h>
74     #include <stdarg.h>
75     #include <stddef.h>
76     #include <stdio.h>
77     #include <stdlib.h>
78     #include <string.h>
79    
80     #ifndef PCRE_SPY
81     #define PCRE_DEFINITION /* Win32 __declspec(export) trigger for .dll */
82     #endif
83    
84     /* We need to have types that specify unsigned 16-bit and 32-bit integers. We
85     cannot determine these outside the compilation (e.g. by running a program as
86     part of "configure") because PCRE is often cross-compiled for use on other
87     systems. Instead we make use of the maximum sizes that are available at
88     preprocessor time in standard C environments. */
89    
90     #if USHRT_MAX == 65535
91     typedef unsigned short pcre_uint16;
92     #elif UINT_MAX == 65535
93     typedef unsigned int pcre_uint16;
94     #else
95     #error Cannot determine a type for 16-bit unsigned integers
96     #endif
97    
98     #if UINT_MAX == 4294967295
99     typedef unsigned int pcre_uint32;
100     #elif ULONG_MAX == 4294967295
101     typedef unsigned long int pcre_uint32;
102     #else
103     #error Cannot determine a type for 32-bit unsigned integers
104     #endif
105    
106     /* All character handling must be done as unsigned characters. Otherwise there
107     are problems with top-bit-set characters and functions such as isspace().
108     However, we leave the interface to the outside world as char *, because that
109     should make things easier for callers. We define a short type for unsigned char
110     to save lots of typing. I tried "uchar", but it causes problems on Digital
111     Unix, where it is defined in sys/types, so use "uschar" instead. */
112    
113     typedef unsigned char uschar;
114    
115     /* Include the public PCRE header */
116    
117     #include "pcre.h"
118    
119     /* Include the (copy of) the public ucp header, changing the external name into
120     a private one. This does no harm, even if we aren't compiling UCP support. */
121    
122     #define ucp_findchar _pcre_ucp_findchar
123     #include "ucp.h"
124    
125     /* When compiling for use with the Virtual Pascal compiler, these functions
126     need to have their names changed. PCRE must be compiled with the -DVPCOMPAT
127     option on the command line. */
128    
129     #ifdef VPCOMPAT
130     #define strncmp(s1,s2,m) _strncmp(s1,s2,m)
131     #define memcpy(d,s,n) _memcpy(d,s,n)
132     #define memmove(d,s,n) _memmove(d,s,n)
133     #define memset(s,c,n) _memset(s,c,n)
134     #else /* VPCOMPAT */
135    
136     /* To cope with SunOS4 and other systems that lack memmove() but have bcopy(),
137     define a macro for memmove() if HAVE_MEMMOVE is false, provided that HAVE_BCOPY
138     is set. Otherwise, include an emulating function for those systems that have
139     neither (there are some non-Unix environments where this is the case). This assumes
140     that all calls to memmove are moving strings upwards in store, which is the
141     case in PCRE. */
142    
143     #if ! HAVE_MEMMOVE
144     #undef memmove /* some systems may have a macro */
145     #if HAVE_BCOPY
146     #define memmove(a, b, c) bcopy(b, a, c)
147     #else /* HAVE_BCOPY */
/* Emulation of memmove() for systems that have neither memmove() nor bcopy().
Copies n bytes from src to dest, starting with the highest-addressed byte and
working downwards, so overlapping regions are handled correctly only when the
destination lies above the source (the only case PCRE needs). Returns the
destination pointer, as memmove() does. The function is static because this
header is shared between several modules; an external definition in each of
them would clash at link time. */

static void *
pcre_memmove(unsigned char *dest, const unsigned char *src, size_t n)
{
size_t remaining;   /* size_t, so lengths above INT_MAX are handled */
dest += n;
src += n;
for (remaining = n; remaining > 0; --remaining) *(--dest) = *(--src);
return (void *)dest;   /* dest is back at the start of the destination */
}
156     #define memmove(a, b, c) pcre_memmove(a, b, c)
157     #endif /* not HAVE_BCOPY */
158     #endif /* not HAVE_MEMMOVE */
159     #endif /* not VPCOMPAT */
160    
161    
162     /* PCRE keeps offsets in its compiled code as 2-byte quantities (always stored
163     in big-endian order) by default. These are used, for example, to link from the
164     start of a subpattern to its alternatives and its end. The use of 2 bytes per
165     offset limits the size of the compiled regex to around 64K, which is big enough
166     for almost everybody. However, I received a request for an even bigger limit.
167     For this reason, and also to make the code easier to maintain, the storing and
168     loading of offsets from the byte string is now handled by the macros that are
169     defined here.
170    
171     The macros are controlled by the value of LINK_SIZE. This defaults to 2 in
172     the config.h file, but can be overridden by using -D on the command line. This
173     is automated on Unix systems via the "configure" command. */
174    
175     #if LINK_SIZE == 2
176    
177     #define PUT(a,n,d) \
178     (a[n] = (d) >> 8), \
179     (a[(n)+1] = (d) & 255)
180    
181     #define GET(a,n) \
182     (((a)[n] << 8) | (a)[(n)+1])
183    
184     #define MAX_PATTERN_SIZE (1 << 16)
185    
186    
187     #elif LINK_SIZE == 3
188    
189     #define PUT(a,n,d) \
190     (a[n] = (d) >> 16), \
191     (a[(n)+1] = (d) >> 8), \
192     (a[(n)+2] = (d) & 255)
193    
194     #define GET(a,n) \
195     (((a)[n] << 16) | ((a)[(n)+1] << 8) | (a)[(n)+2])
196    
197     #define MAX_PATTERN_SIZE (1 << 24)
198    
199    
200     #elif LINK_SIZE == 4
201    
202     #define PUT(a,n,d) \
203     (a[n] = (d) >> 24), \
204     (a[(n)+1] = (d) >> 16), \
205     (a[(n)+2] = (d) >> 8), \
206     (a[(n)+3] = (d) & 255)
207    
208     #define GET(a,n) \
209     (((a)[n] << 24) | ((a)[(n)+1] << 16) | ((a)[(n)+2] << 8) | (a)[(n)+3])
210    
211     #define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */
212    
213    
214     #else
215     #error LINK_SIZE must be either 2, 3, or 4
216     #endif
217    
218    
219     /* Convenience macro defined in terms of the others */
220    
221     #define PUTINC(a,n,d) PUT(a,n,d), a += LINK_SIZE
222    
223    
224     /* PCRE uses some other 2-byte quantities that do not change when the size of
225     offsets changes. These are used for repeat counts and for other things such as
226     capturing parenthesis numbers in back references. */
227    
/* PUT2(a,n,d) stores the low 16 bits of d at a[n] and a[n+1], most
significant byte first. It is a single parenthesized comma expression (not two
statements), so it behaves as one statement when used as the body of an
if/else or a loop, and it matches the form of PUT. */

#define PUT2(a,n,d) \
  (((a)[n] = (d) >> 8), \
  ((a)[(n)+1] = (d) & 255))

/* GET2(a,n) reads back the 2-byte big-endian value stored at a[n]. */

#define GET2(a,n) \
  (((a)[n] << 8) | (a)[(n)+1])

/* PUT2INC(a,n,d) does a PUT2 and then advances the pointer a by two bytes. */

#define PUT2INC(a,n,d) (PUT2(a,n,d), (a) += 2)
236    
237    
238     /* When UTF-8 encoding is being used, a character is no longer just a single
239     byte. The macros for character handling generate simple sequences when used in
240     byte-mode, and more complicated ones for UTF-8 characters. */
241    
242     #ifndef SUPPORT_UTF8
243     #define GETCHAR(c, eptr) c = *eptr;
244     #define GETCHARTEST(c, eptr) c = *eptr;
245     #define GETCHARINC(c, eptr) c = *eptr++;
246     #define GETCHARINCTEST(c, eptr) c = *eptr++;
247     #define GETCHARLEN(c, eptr, len) c = *eptr;
248     #define BACKCHAR(eptr)
249    
250     #else /* SUPPORT_UTF8 */
251    
252     /* Get the next UTF-8 character, not advancing the pointer. This is called when
253     we know we are in UTF-8 mode. */
254    
255     #define GETCHAR(c, eptr) \
256     c = *eptr; \
257     if ((c & 0xc0) == 0xc0) \
258     { \
259     int gcii; \
260     int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
261     int gcss = 6*gcaa; \
262     c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
263     for (gcii = 1; gcii <= gcaa; gcii++) \
264     { \
265     gcss -= 6; \
266     c |= (eptr[gcii] & 0x3f) << gcss; \
267     } \
268     }
269    
270     /* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the
271     pointer. */
272    
273     #define GETCHARTEST(c, eptr) \
274     c = *eptr; \
275     if (utf8 && (c & 0xc0) == 0xc0) \
276     { \
277     int gcii; \
278     int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
279     int gcss = 6*gcaa; \
280     c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
281     for (gcii = 1; gcii <= gcaa; gcii++) \
282     { \
283     gcss -= 6; \
284     c |= (eptr[gcii] & 0x3f) << gcss; \
285     } \
286     }
287    
288     /* Get the next UTF-8 character, advancing the pointer. This is called when we
289     know we are in UTF-8 mode. */
290    
291     #define GETCHARINC(c, eptr) \
292     c = *eptr++; \
293     if ((c & 0xc0) == 0xc0) \
294     { \
295     int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
296     int gcss = 6*gcaa; \
297     c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
298     while (gcaa-- > 0) \
299     { \
300     gcss -= 6; \
301     c |= (*eptr++ & 0x3f) << gcss; \
302     } \
303     }
304    
305     /* Get the next character, testing for UTF-8 mode, and advancing the pointer */
306    
307     #define GETCHARINCTEST(c, eptr) \
308     c = *eptr++; \
309     if (utf8 && (c & 0xc0) == 0xc0) \
310     { \
311     int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
312     int gcss = 6*gcaa; \
313     c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
314     while (gcaa-- > 0) \
315     { \
316     gcss -= 6; \
317     c |= (*eptr++ & 0x3f) << gcss; \
318     } \
319     }
320    
321     /* Get the next UTF-8 character, not advancing the pointer, incrementing length
322     if there are extra bytes. This is called when we know we are in UTF-8 mode. */
323    
324     #define GETCHARLEN(c, eptr, len) \
325     c = *eptr; \
326     if ((c & 0xc0) == 0xc0) \
327     { \
328     int gcii; \
329     int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
330     int gcss = 6*gcaa; \
331     c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
332     for (gcii = 1; gcii <= gcaa; gcii++) \
333     { \
334     gcss -= 6; \
335     c |= (eptr[gcii] & 0x3f) << gcss; \
336     } \
337     len += gcaa; \
338     }
339    
340     /* If the pointer is not at the start of a character, move it back until
341     it is. Called only in UTF-8 mode. */
342    
343     #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
344    
345     #endif
346    
347    
348     /* In case there is no definition of offsetof() provided - though any proper
349     Standard C system should have one. */
350    
351     #ifndef offsetof
352     #define offsetof(p_type,field) ((size_t)&(((p_type *)0)->field))
353     #endif
354    
355    
356     /* These are the public options that can change during matching. */
357    
358     #define PCRE_IMS (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL)
359    
360     /* Private options flags start at the most significant end of the four bytes,
361     but skip the top bit so we can use ints for convenience without getting tangled
362     with negative values. The public options defined in pcre.h start at the least
363     significant end. Make sure they don't overlap! */
364    
365     #define PCRE_FIRSTSET 0x40000000 /* first_byte is set */
366     #define PCRE_REQCHSET 0x20000000 /* req_byte is set */
367     #define PCRE_STARTLINE 0x10000000 /* start after \n for multiline */
368     #define PCRE_ICHANGED 0x08000000 /* i option changes within regex */
369     #define PCRE_NOPARTIAL 0x04000000 /* can't use partial with this regex */
370    
371     /* Options for the "extra" block produced by pcre_study(). */
372    
373     #define PCRE_STUDY_MAPPED 0x01 /* a map of starting chars exists */
374    
375     /* Masks for identifying the public options that are permitted at compile
376     time, run time, or study time, respectively. */
377    
378     #define PUBLIC_OPTIONS \
379     (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \
380     PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \
381     PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE)
382    
383     #define PUBLIC_EXEC_OPTIONS \
384     (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \
385     PCRE_PARTIAL)
386    
387     #define PUBLIC_DFA_EXEC_OPTIONS \
388     (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \
389     PCRE_PARTIAL|PCRE_DFA_SHORTEST|PCRE_DFA_RESTART)
390    
391     #define PUBLIC_STUDY_OPTIONS 0 /* None defined */
392    
393     /* Magic number to provide a small check against being handed junk. Also used
394     to detect whether a pattern was compiled on a host of different endianness. */
395    
396     #define MAGIC_NUMBER 0x50435245UL /* 'PCRE' */
397    
398     /* Negative values for the firstchar and reqchar variables */
399    
400     #define REQ_UNSET (-2)
401     #define REQ_NONE (-1)
402    
403     /* The maximum remaining length of subject we are prepared to search for a
404     req_byte match. */
405    
406     #define REQ_BYTE_MAX 1000
407    
408     /* Flags added to firstbyte or reqbyte; a "non-literal" item is either a
409     variable-length repeat, or anything other than literal characters. */
410    
411     #define REQ_CASELESS 0x0100 /* indicates caselessness */
412     #define REQ_VARY 0x0200 /* reqbyte followed non-literal item */
413    
414     /* Miscellaneous definitions */
415    
416     typedef int BOOL;
417    
418     #define FALSE 0
419     #define TRUE 1
420    
421     /* Escape items that are just an encoding of a particular data value. Note that
422     ESC_n is defined as yet another macro, which is set in config.h to either \n
423     (the default) or \r (which some people want). */
424    
425     #ifndef ESC_e
426     #define ESC_e 27
427     #endif
428    
429     #ifndef ESC_f
430     #define ESC_f '\f'
431     #endif
432    
433     #ifndef ESC_n
434     #define ESC_n NEWLINE
435     #endif
436    
437     #ifndef ESC_r
438     #define ESC_r '\r'
439     #endif
440    
441     /* We can't officially use ESC_t because it is a POSIX reserved identifier
442     (presumably because of all the others like size_t). */
443    
444     #ifndef ESC_tee
445     #define ESC_tee '\t'
446     #endif
447    
448     /* These are escaped items that aren't just an encoding of a particular data
449     value such as \n. They must have non-zero values, as check_escape() returns
450     their negation. Also, they must appear in the same order as in the opcode
451     definitions below, up to ESC_z. There's a dummy for OP_ANY because it
452     corresponds to "." rather than an escape sequence. The final one must be
453     ESC_REF as subsequent values are used for \1, \2, \3, etc. There are two
454     tests in the code for an escape greater than ESC_b and less than ESC_Z to
455     detect the types that may be repeated. These are the types that consume
456     characters. If any new escapes are put in between that don't consume a
457     character, that code will have to change. */
458    
459     enum { ESC_A = 1, ESC_G, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, ESC_W,
460     ESC_w, ESC_dum1, ESC_C, ESC_P, ESC_p, ESC_X, ESC_Z, ESC_z, ESC_E,
461     ESC_Q, ESC_REF };
462    
463     /* Flag bits and data types for the extended class (OP_XCLASS) for classes that
464     contain UTF-8 characters with values greater than 255. */
465    
466     #define XCL_NOT 0x01 /* Flag: this is a negative class */
467     #define XCL_MAP 0x02 /* Flag: a 32-byte map is present */
468    
469     #define XCL_END 0 /* Marks end of individual items */
470     #define XCL_SINGLE 1 /* Single item (one multibyte char) follows */
471     #define XCL_RANGE 2 /* A range (two multibyte chars) follows */
472     #define XCL_PROP 3 /* Unicode property (one property code) follows */
473     #define XCL_NOTPROP 4 /* Unicode inverted property (ditto) */
474    
475    
476     /* Opcode table: OP_BRA must be last, as all values >= it are used for brackets
477     that extract substrings. Starting from 1 (i.e. after OP_END), the values up to
478     OP_EOD must correspond in order to the list of escapes immediately above.
479     Note that whenever this list is updated, the two macro definitions that follow
480     must also be updated to match. */
481    
482     enum {
483     OP_END, /* 0 End of pattern */
484    
485     /* Values corresponding to backslashed metacharacters */
486    
487     OP_SOD, /* 1 Start of data: \A */
488     OP_SOM, /* 2 Start of match (subject + offset): \G */
489     OP_NOT_WORD_BOUNDARY, /* 3 \B */
490     OP_WORD_BOUNDARY, /* 4 \b */
491     OP_NOT_DIGIT, /* 5 \D */
492     OP_DIGIT, /* 6 \d */
493     OP_NOT_WHITESPACE, /* 7 \S */
494     OP_WHITESPACE, /* 8 \s */
495     OP_NOT_WORDCHAR, /* 9 \W */
496     OP_WORDCHAR, /* 10 \w */
497     OP_ANY, /* 11 Match any character */
498     OP_ANYBYTE, /* 12 Match any byte (\C); different to OP_ANY for UTF-8 */
499     OP_NOTPROP, /* 13 \P (not Unicode property) */
500     OP_PROP, /* 14 \p (Unicode property) */
501     OP_EXTUNI, /* 15 \X (extended Unicode sequence) */
502     OP_EODN, /* 16 End of data or \n at end of data: \Z. */
503     OP_EOD, /* 17 End of data: \z */
504    
505     OP_OPT, /* 18 Set runtime options */
506     OP_CIRC, /* 19 Start of line - varies with multiline switch */
507     OP_DOLL, /* 20 End of line - varies with multiline switch */
508     OP_CHAR, /* 21 Match one character, casefully */
509     OP_CHARNC, /* 22 Match one character, caselessly */
510     OP_NOT, /* 23 Match anything but the following char */
511    
512     OP_STAR, /* 24 The maximizing and minimizing versions of */
513     OP_MINSTAR, /* 25 all these opcodes must come in pairs, with */
514     OP_PLUS, /* 26 the minimizing one second. */
515     OP_MINPLUS, /* 27 This first set applies to single characters */
516     OP_QUERY, /* 28 */
517     OP_MINQUERY, /* 29 */
518     OP_UPTO, /* 30 From 0 to n matches */
519     OP_MINUPTO, /* 31 */
520     OP_EXACT, /* 32 Exactly n matches */
521    
522     OP_NOTSTAR, /* 33 The maximizing and minimizing versions of */
523     OP_NOTMINSTAR, /* 34 all these opcodes must come in pairs, with */
524     OP_NOTPLUS, /* 35 the minimizing one second. */
525     OP_NOTMINPLUS, /* 36 This set applies to "not" single characters */
526     OP_NOTQUERY, /* 37 */
527     OP_NOTMINQUERY, /* 38 */
528     OP_NOTUPTO, /* 39 From 0 to n matches */
529     OP_NOTMINUPTO, /* 40 */
530     OP_NOTEXACT, /* 41 Exactly n matches */
531    
532     OP_TYPESTAR, /* 42 The maximizing and minimizing versions of */
533     OP_TYPEMINSTAR, /* 43 all these opcodes must come in pairs, with */
534     OP_TYPEPLUS, /* 44 the minimizing one second. These codes must */
535     OP_TYPEMINPLUS, /* 45 be in exactly the same order as those above. */
536     OP_TYPEQUERY, /* 46 This set applies to character types such as \d */
537     OP_TYPEMINQUERY, /* 47 */
538     OP_TYPEUPTO, /* 48 From 0 to n matches */
539     OP_TYPEMINUPTO, /* 49 */
540     OP_TYPEEXACT, /* 50 Exactly n matches */
541    
542     OP_CRSTAR, /* 51 The maximizing and minimizing versions of */
543     OP_CRMINSTAR, /* 52 all these opcodes must come in pairs, with */
544     OP_CRPLUS, /* 53 the minimizing one second. These codes must */
545     OP_CRMINPLUS, /* 54 be in exactly the same order as those above. */
546     OP_CRQUERY, /* 55 These are for character classes and back refs */
547     OP_CRMINQUERY, /* 56 */
548     OP_CRRANGE, /* 57 These are different to the three sets above. */
549     OP_CRMINRANGE, /* 58 */
550    
551     OP_CLASS, /* 59 Match a character class, chars < 256 only */
552     OP_NCLASS, /* 60 Same, but the bitmap was created from a negative
553     class - the difference is relevant only when a UTF-8
554     character > 255 is encountered. */
555    
556     OP_XCLASS, /* 61 Extended class for handling UTF-8 chars within the
557     class. This does both positive and negative. */
558    
559     OP_REF, /* 62 Match a back reference */
560     OP_RECURSE, /* 63 Match a numbered subpattern (possibly recursive) */
561     OP_CALLOUT, /* 64 Call out to external function if provided */
562    
563     OP_ALT, /* 65 Start of alternation */
564     OP_KET, /* 66 End of group that doesn't have an unbounded repeat */
565     OP_KETRMAX, /* 67 These two must remain together and in this */
566     OP_KETRMIN, /* 68 order. They are for groups that repeat for ever. */
567    
568     /* The assertions must come before ONCE and COND */
569    
570     OP_ASSERT, /* 69 Positive lookahead */
571     OP_ASSERT_NOT, /* 70 Negative lookahead */
572     OP_ASSERTBACK, /* 71 Positive lookbehind */
573     OP_ASSERTBACK_NOT, /* 72 Negative lookbehind */
574     OP_REVERSE, /* 73 Move pointer back - used in lookbehind assertions */
575    
576     /* ONCE and COND must come after the assertions, with ONCE first, as there's
577     a test for >= ONCE for a subpattern that isn't an assertion. */
578    
579     OP_ONCE, /* 74 Once matched, don't back up into the subpattern */
580     OP_COND, /* 75 Conditional group */
581     OP_CREF, /* 76 Used to hold an extraction string number (cond ref) */
582    
583     OP_BRAZERO, /* 77 These two must remain together and in this */
584     OP_BRAMINZERO, /* 78 order. */
585    
586     OP_BRANUMBER, /* 79 Used for extracting brackets whose number is greater
587     than can fit into an opcode. */
588    
589     OP_BRA /* 80 This and greater values are used for brackets that
590     extract substrings up to EXTRACT_BASIC_MAX. After
591     that, use is made of OP_BRANUMBER. */
592     };
593    
594     /* WARNING WARNING WARNING: There is an implicit assumption in pcre.c and
595     study.c that all opcodes are less than 128 in value. This makes handling UTF-8
596     character sequences easier. */
597    
598     /* The highest extraction number before we have to start using additional
599     bytes. (Originally PCRE didn't have support for extraction counts higher than
600     this number.) The value is limited by the number of opcodes left after OP_BRA,
601     i.e. 255 - OP_BRA. We actually set it a bit lower to leave room for additional
602     opcodes. */
603    
604     #define EXTRACT_BASIC_MAX 100
605    
606    
607     /* This macro defines textual names for all the opcodes. These are used only
608     for debugging. The macro is referenced only in pcre_printint.c. */
609    
610     #define OP_NAME_LIST \
611     "End", "\\A", "\\G", "\\B", "\\b", "\\D", "\\d", \
612     "\\S", "\\s", "\\W", "\\w", "Any", "Anybyte", \
613     "notprop", "prop", "extuni", \
614     "\\Z", "\\z", \
615     "Opt", "^", "$", "char", "charnc", "not", \
616     "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
617     "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
618     "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
619     "*", "*?", "+", "+?", "?", "??", "{", "{", \
620     "class", "nclass", "xclass", "Ref", "Recurse", "Callout", \
621     "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", \
622     "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cond ref",\
623     "Brazero", "Braminzero", "Branumber", "Bra"
624    
625    
626     /* This macro defines the length of fixed length operations in the compiled
627     regex. The lengths are used when searching for specific things, and also in the
628     debugging printing of a compiled regex. We use a macro so that it can be
629     defined close to the definitions of the opcodes themselves.
630    
631     As things have been extended, some of these are no longer fixed lengths, but are
632     minima instead. For example, the length of a single-character repeat may vary
633     in UTF-8 mode. The code that uses this table must know about such things. */
634    
635     #define OP_LENGTHS \
636     1, /* End */ \
637     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w */ \
638     1, 1, /* Any, Anybyte */ \
639     2, 2, 1, /* NOTPROP, PROP, EXTUNI */ \
640     1, 1, 2, 1, 1, /* \Z, \z, Opt, ^, $ */ \
641     2, /* Char - the minimum length */ \
642     2, /* Charnc - the minimum length */ \
643     2, /* not */ \
644     /* Positive single-char repeats ** These are */ \
645     2, 2, 2, 2, 2, 2, /* *, *?, +, +?, ?, ?? ** minima in */ \
646     4, 4, 4, /* upto, minupto, exact ** UTF-8 mode */ \
647     /* Negative single-char repeats - only for chars < 256 */ \
648     2, 2, 2, 2, 2, 2, /* NOT *, *?, +, +?, ?, ?? */ \
649     4, 4, 4, /* NOT upto, minupto, exact */ \
650     /* Positive type repeats */ \
651     2, 2, 2, 2, 2, 2, /* Type *, *?, +, +?, ?, ?? */ \
652     4, 4, 4, /* Type upto, minupto, exact */ \
653     /* Character class & ref repeats */ \
654     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ \
655     5, 5, /* CRRANGE, CRMINRANGE */ \
656     33, /* CLASS */ \
657     33, /* NCLASS */ \
658     0, /* XCLASS - variable length */ \
659     3, /* REF */ \
660     1+LINK_SIZE, /* RECURSE */ \
661     2+2*LINK_SIZE, /* CALLOUT */ \
662     1+LINK_SIZE, /* Alt */ \
663     1+LINK_SIZE, /* Ket */ \
664     1+LINK_SIZE, /* KetRmax */ \
665     1+LINK_SIZE, /* KetRmin */ \
666     1+LINK_SIZE, /* Assert */ \
667     1+LINK_SIZE, /* Assert not */ \
668     1+LINK_SIZE, /* Assert behind */ \
669     1+LINK_SIZE, /* Assert behind not */ \
670     1+LINK_SIZE, /* Reverse */ \
671     1+LINK_SIZE, /* Once */ \
672     1+LINK_SIZE, /* COND */ \
673     3, /* CREF */ \
674     1, 1, /* BRAZERO, BRAMINZERO */ \
675     3, /* BRANUMBER */ \
676     1+LINK_SIZE /* BRA */ \
677    
678    
679     /* A magic value for OP_CREF to indicate the "in recursion" condition. */
680    
681     #define CREF_RECURSE 0xffff
682    
683     /* Error code numbers. They are given names so that they can more easily be
684     tracked. */
685    
686     enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9,
687     ERR10, ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19,
688     ERR20, ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29,
689     ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39,
690     ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47 };
691    
692     /* The real format of the start of the pcre block; the index of names and the
693     code vector run on as long as necessary after the end. We store an explicit
694     offset to the name table so that if a regex is compiled on one host, saved, and
695     then run on another where the size of pointers is different, all might still
696     be well. For the case of compiled-on-4 and run-on-8, we include an extra
697     pointer that is always NULL. For future-proofing, a few dummy fields were
698     originally included - even though you can never get this planning right - but
699     there is only one left now.
700    
701     NOTE NOTE NOTE:
702     Because people can now save and re-use compiled patterns, any additions to this
703     structure should be made at the end, and something earlier (e.g. a new
704     flag in the options or one of the dummy fields) should indicate that the new
705     fields are present. Currently PCRE always sets the dummy fields to zero.
706     NOTE NOTE NOTE:
707     */
708    
709     typedef struct real_pcre {
710     pcre_uint32 magic_number; /* Presumably MAGIC_NUMBER: junk / endianness check - TODO confirm */
711     pcre_uint32 size; /* Total that was malloced */
712     pcre_uint32 options; /* Option bits; presumably public options plus the private PCRE_xxx flags - TODO confirm */
713     pcre_uint32 dummy1; /* For future use, maybe */
714    
715     pcre_uint16 top_bracket; /* Presumably the highest capturing bracket number - TODO confirm */
716     pcre_uint16 top_backref; /* Presumably the highest back reference number - TODO confirm */
717     pcre_uint16 first_byte; /* See PCRE_FIRSTSET, which flags that this is set */
718     pcre_uint16 req_byte; /* See PCRE_REQCHSET, which flags that this is set */
719     pcre_uint16 name_table_offset; /* Offset to name table that follows */
720     pcre_uint16 name_entry_size; /* Size of any name items */
721     pcre_uint16 name_count; /* Number of name items */
722     pcre_uint16 ref_count; /* Reference count */
723    
724     const unsigned char *tables; /* Pointer to tables or NULL for std */
725     const unsigned char *nullpad; /* NULL padding */
726     } real_pcre;
727    
728     /* The format of the block used to store data from pcre_study(). The same
729     remark (see NOTE above) about extending this structure applies. */
730    
731     typedef struct pcre_study_data {
732     pcre_uint32 size; /* Total that was malloced */
733     pcre_uint32 options; /* Study flag bits; presumably PCRE_STUDY_MAPPED - TODO confirm */
734     uschar start_bits[32]; /* 32 bytes = 256 bits; presumably the starting-char map that PCRE_STUDY_MAPPED refers to - TODO confirm */
735     } pcre_study_data;
736    
737     /* Structure for passing what would otherwise be static information around
738     between the functions doing the compiling, so that they are thread-safe. */
739    
740     typedef struct compile_data {
741     const uschar *lcc; /* Points to lower casing table */
742     const uschar *fcc; /* Points to case-flipping table */
743     const uschar *cbits; /* Points to character type table */
744     const uschar *ctypes; /* Points to table of type maps */
745     const uschar *start_code; /* The start of the compiled code */
746     const uschar *start_pattern; /* The start of the pattern */
747     uschar *name_table; /* The name/number table */
748     int names_found; /* Number of name table entries so far */
749     int name_entry_size; /* Size of each name table entry */
750     int top_backref; /* Maximum back reference */
751     unsigned int backref_map; /* Bitmap of low-numbered back references */
752     int req_varyopt; /* "After variable item" flag for reqbyte */
753     BOOL nopartial; /* Set TRUE if partial matching won't work */
754     } compile_data;
755    
756     /* Structure for maintaining a chain of pointers to the currently incomplete
757     branches, for testing for left recursion. Entries are linked through "outer". */
758    
759     typedef struct branch_chain {
760     struct branch_chain *outer; /* Next entry in the chain; NOTE(review): presumably the enclosing branch, NULL at the outermost - confirm */
761     uschar *current; /* The incomplete branch this entry refers to; NOTE(review): presumably points into the compiled code - confirm */
762     } branch_chain;
763    
764     /* Structure for items in a linked list that represents an explicit recursive
765     call within the pattern. The list is linked through "prevrec". */
766    
767     typedef struct recursion_info {
768     struct recursion_info *prevrec; /* Previous recursion record (or NULL) */
769     int group_num; /* Number of group that was called */
770     const uschar *after_call; /* "Return value": points after the call in the expr */
771     const uschar *save_start; /* Old value of md->start_match */
772     int *offset_save; /* Pointer to start of saved offsets */
773     int saved_max; /* Number of saved offsets (entries at offset_save) */
774     } recursion_info;
775    
776     /* When compiling in a mode that doesn't use recursive calls to match(),
777     a structure is used to remember local variables on the heap. It is defined in
778     pcre.c, close to the match() function, so that it is easy to keep it in step
779     with any changes of local variable. However, the pointer to the current frame
780     must be saved in some "static" place over a longjmp(). We declare the
781     structure here so that we can put a pointer in the match_data structure.
782     NOTE: This isn't used for a "normal" compilation of pcre. */
783    
784     struct heapframe; /* Incomplete type: this header only needs pointers to it (see match_data.thisframe) */
785    
786     /* Structure for passing what would otherwise be static information around
787     between the functions doing traditional NFA matching, so that they are thread-safe. */
788    
789     typedef struct match_data {
790     unsigned long int match_call_count; /* Call counter; NOTE(review): presumably counts calls to match() - confirm */
791     unsigned long int match_limit;/* Limit value; NOTE(review): presumably the ceiling for match_call_count - confirm */
792     int *offset_vector; /* Offset vector */
793     int offset_end; /* One past the end */
794     int offset_max; /* The maximum usable for return data */
795     const uschar *lcc; /* Points to lower casing table */
796     const uschar *ctypes; /* Points to table of type maps */
797     BOOL offset_overflow; /* Set if too many extractions */
798     BOOL notbol; /* NOTBOL flag */
799     BOOL noteol; /* NOTEOL flag */
800     BOOL utf8; /* UTF8 flag */
801     BOOL endonly; /* Dollar not before final \n */
802     BOOL notempty; /* Empty string match not wanted */
803     BOOL partial; /* PARTIAL flag */
804     BOOL hitend; /* Hit the end of the subject at some point */
805     const uschar *start_code; /* For use when recursing */
806     const uschar *start_subject; /* Start of the subject string */
807     const uschar *end_subject; /* End of the subject string */
808     const uschar *start_match; /* Start of this match attempt */
809     const uschar *end_match_ptr; /* Subject position at end match */
810     int end_offset_top; /* Highwater mark at end of match */
811     int capture_last; /* Most recent capture number */
812     int start_offset; /* The start offset value */
813     recursion_info *recursive; /* Linked list of recursion data */
814     void *callout_data; /* To pass back to callouts */
815     struct heapframe *thisframe; /* Current heap frame; used only when compiling for no recursion */
816     } match_data;
817    
818     /* A similar structure to match_data is used for the same purpose (passing
819     shared state between functions) by the DFA matching functions. */
820    
821     typedef struct dfa_match_data {
822     const uschar *start_code; /* Start of the compiled pattern */
823     const uschar *start_subject; /* Start of the subject string */
824     const uschar *end_subject; /* End of subject string */
825     const uschar *tables; /* Character tables */
826     int moptions; /* Match options */
827     int poptions; /* Pattern options */
828     void *callout_data; /* To pass back to callouts */
829     } dfa_match_data;
830    
831     /* Bit definitions for entries in the pcre_ctypes table. Each value is a distinct single bit; 0x20 and 0x40 are not defined here. */
832    
833     #define ctype_space 0x01
834     #define ctype_letter 0x02
835     #define ctype_digit 0x04
836     #define ctype_xdigit 0x08
837     #define ctype_word 0x10 /* alphanumeric or '_' */
838     #define ctype_meta 0x80 /* regexp meta char or zero (end pattern) */
839    
840     /* Offsets for the bitmap tables in pcre_cbits. Each table contains a set
841     of bits for a class map. Some classes are built by combining these tables.
The offsets step by 32, so each table occupies 32 bytes (256 bits). */
842    
843     #define cbit_space 0 /* [:space:] or \s */
844     #define cbit_xdigit 32 /* [:xdigit:] */
845     #define cbit_digit 64 /* [:digit:] or \d */
846     #define cbit_upper 96 /* [:upper:] */
847     #define cbit_lower 128 /* [:lower:] */
848     #define cbit_word 160 /* [:word:] or \w */
849     #define cbit_graph 192 /* [:graph:] */
850     #define cbit_print 224 /* [:print:] */
851     #define cbit_punct 256 /* [:punct:] */
852     #define cbit_cntrl 288 /* [:cntrl:] */
853     #define cbit_length 320 /* Length of the cbits table: 10 tables of 32 bytes */
854    
855     /* Offsets of the various tables from the base tables pointer, and
856     total length. All values are in bytes. */
857    
858     #define lcc_offset 0 /* Lower casing table */
859     #define fcc_offset 256 /* Case-flipping table */
860     #define cbits_offset 512 /* Class bitmap tables (see the cbit_ offsets) */
861     #define ctypes_offset (cbits_offset + cbit_length) /* 512 + 320 = 832 */
862     #define tables_length (ctypes_offset + 256) /* 832 + 256 = 1088 */
863    
864     /* Layout of the UCP type table that translates property names into codes for
865     ucp_findchar(). Each entry pairs one name with one code. */
866    
867     typedef struct {
868     const char *name; /* Property name */
869     int value; /* Code that the name translates to */
870     } ucp_type_table;
871    
872    
873     /* Internal shared data tables. These are tables that are used by more than one
874     of the exported public functions. They have to be "external" in the C sense,
875     but are not part of the PCRE public API. The data for these tables is in the
876     pcre_tables.c module. */
877    
878     extern const int _pcre_utf8_table1[];
879     extern const int _pcre_utf8_table2[];
880     extern const int _pcre_utf8_table3[];
881     extern const uschar _pcre_utf8_table4[];
882    
883     extern const int _pcre_utf8_table1_size; /* NOTE(review): presumably the number of entries in _pcre_utf8_table1 - confirm */
884    
885     extern const ucp_type_table _pcre_utt[]; /* Property name table; entry layout is ucp_type_table */
886     extern const int _pcre_utt_size; /* NOTE(review): presumably the number of entries in _pcre_utt - confirm */
887    
888     extern const uschar _pcre_default_tables[]; /* NOTE(review): presumably laid out according to the offset macros in this header (tables_length bytes) - confirm */
889    
890     extern const uschar _pcre_OP_lengths[]; /* NOTE(review): presumably indexed by opcode - confirm */
891    
892    
893     /* Internal shared functions. These are functions that are used by more than
894     one of the exported public functions. They have to be "external" in the C
895     sense, but are not part of the PCRE public API. The prototypes omit parameter names; see each function's definition for the meaning of its arguments. */
896    
897     extern int _pcre_ord2utf8(int, uschar *);
898     extern void _pcre_printint(pcre *, FILE *);
899     extern real_pcre * _pcre_try_flipped(const real_pcre *, real_pcre *,
900     const pcre_study_data *, pcre_study_data *); /* Takes read-only and writable versions of both the pattern header and the study block */
901     extern int _pcre_ucp_findchar(const int, int *, int *); /* NOTE(review): the two int pointers are presumably outputs - confirm */
902     extern int _pcre_valid_utf8(const uschar *, int);
903     extern BOOL _pcre_xclass(int, const uschar *);
904    
905     /* End of pcre_internal.h */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12