/[pcre]/code/trunk/pcre_internal.h
ViewVC logotype

Contents of /code/trunk/pcre_internal.h

Parent Directory Parent Directory | Revision Log Revision Log


Revision 85 - (hide annotations) (download)
Sat Feb 24 21:41:13 2007 UTC (7 years, 7 months ago) by nigel
File MIME type: text/plain
File size: 36724 byte(s)
Load pcre-6.4 into code/trunk.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5    
6     /* PCRE is a library of functions to support regular expressions whose syntax
7     and semantics are as close as possible to those of the Perl 5 language.
8    
9     Written by Philip Hazel
10     Copyright (c) 1997-2005 University of Cambridge
11    
12     -----------------------------------------------------------------------------
13     Redistribution and use in source and binary forms, with or without
14     modification, are permitted provided that the following conditions are met:
15    
16     * Redistributions of source code must retain the above copyright notice,
17     this list of conditions and the following disclaimer.
18    
19     * Redistributions in binary form must reproduce the above copyright
20     notice, this list of conditions and the following disclaimer in the
21     documentation and/or other materials provided with the distribution.
22    
23     * Neither the name of the University of Cambridge nor the names of its
24     contributors may be used to endorse or promote products derived from
25     this software without specific prior written permission.
26    
27     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37     POSSIBILITY OF SUCH DAMAGE.
38     -----------------------------------------------------------------------------
39     */
40    
41     /* This header contains definitions that are shared between the different
42     modules, but which are not relevant to the exported API. This includes some
43     functions whose names all begin with "_pcre_". */
44    
45 nigel 85 #ifndef PCRE_INTERNAL_H
46     #define PCRE_INTERNAL_H
47 nigel 77
48     /* Define DEBUG to get debugging output on stdout. */
49    
50 nigel 85 #if 0
51 nigel 77 #define DEBUG
52 nigel 85 #endif
53 nigel 77
54     /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
55     inline, and there are *still* stupid compilers about that don't like indented
56     pre-processor statements, or at least there were when I first wrote this. After
57     all, it had only been about 10 years then... */
58    
59     #ifdef DEBUG
60     #define DPRINTF(p) printf p
61     #else
62     #define DPRINTF(p) /*nothing*/
63     #endif
64    
65    
66     /* Get the definitions provided by running "configure" */
67    
68     #include "config.h"
69    
70     /* Standard C headers plus the external interface definition. The only time
71     setjmp and stdarg are used is when NO_RECURSE is set. */
72    
73     #include <ctype.h>
74     #include <limits.h>
75     #include <setjmp.h>
76     #include <stdarg.h>
77     #include <stddef.h>
78     #include <stdio.h>
79     #include <stdlib.h>
80     #include <string.h>
81    
82     #ifndef PCRE_SPY
83     #define PCRE_DEFINITION /* Win32 __declspec(export) trigger for .dll */
84     #endif
85    
86     /* We need to have types that specify unsigned 16-bit and 32-bit integers. We
87     cannot determine these outside the compilation (e.g. by running a program as
88     part of "configure") because PCRE is often cross-compiled for use on other
89     systems. Instead we make use of the maximum sizes that are available at
90     preprocessor time in standard C environments. */
91    
92     #if USHRT_MAX == 65535
93     typedef unsigned short pcre_uint16;
94     #elif UINT_MAX == 65535
95     typedef unsigned int pcre_uint16;
96     #else
97     #error Cannot determine a type for 16-bit unsigned integers
98     #endif
99    
100     #if UINT_MAX == 4294967295
101     typedef unsigned int pcre_uint32;
102     #elif ULONG_MAX == 4294967295
103     typedef unsigned long int pcre_uint32;
104     #else
105     #error Cannot determine a type for 32-bit unsigned integers
106     #endif
107    
108     /* All character handling must be done as unsigned characters. Otherwise there
109     are problems with top-bit-set characters and functions such as isspace().
110     However, we leave the interface to the outside world as char *, because that
111     should make things easier for callers. We define a short type for unsigned char
112     to save lots of typing. I tried "uchar", but it causes problems on Digital
113     Unix, where it is defined in sys/types, so use "uschar" instead. */
114    
115     typedef unsigned char uschar;
116    
117 nigel 85 /* Include the public PCRE header and the definitions of UCP character property
118     values. */
119 nigel 77
120     #include "pcre.h"
121     #include "ucp.h"
122    
123     /* When compiling for use with the Virtual Pascal compiler, these functions
124     need to have their names changed. PCRE must be compiled with the -DVPCOMPAT
125     option on the command line. */
126    
127     #ifdef VPCOMPAT
128     #define strncmp(s1,s2,m) _strncmp(s1,s2,m)
129     #define memcpy(d,s,n) _memcpy(d,s,n)
130     #define memmove(d,s,n) _memmove(d,s,n)
131     #define memset(s,c,n) _memset(s,c,n)
132     #else /* VPCOMPAT */
133    
134     /* To cope with SunOS4 and other systems that lack memmove() but have bcopy(),
135     define a macro for memmove() if HAVE_MEMMOVE is false, provided that HAVE_BCOPY
136     is set. Otherwise, include an emulating function for those systems that have
137     neither (there are some non-Unix environments where this is the case). This assumes
138     that all calls to memmove are moving strings upwards in store, which is the
139     case in PCRE. */
140    
141     #if ! HAVE_MEMMOVE
142     #undef memmove /* some systems may have a macro */
143     #if HAVE_BCOPY
144     #define memmove(a, b, c) bcopy(b, a, c)
145     #else /* HAVE_BCOPY */
/* Fallback for systems that have neither memmove() nor bcopy().

Copies n bytes from src to dest, starting at the highest address and working
downwards, so the copy is safe for overlapping regions when dest lies above
src (the only direction PCRE needs). It is NOT safe for an overlapping move
to a lower address.

Arguments:
  dest    where to copy to
  src     where to copy from
  n       number of bytes to copy

Returns:  dest, as the standard memmove() does

The function is static because this header is included by several source
files; an external definition here would be defined once per translation unit
and clash at link time. */

static void *
pcre_memmove(unsigned char *dest, const unsigned char *src, size_t n)
{
size_t i;                 /* same type as n: avoids a signed/unsigned compare */
dest += n;
src += n;
for (i = 0; i < n; ++i) *(--dest) = *(--src);
return (void *)dest;      /* dest has been stepped back to its initial value */
}
154     #define memmove(a, b, c) pcre_memmove(a, b, c)
155     #endif /* not HAVE_BCOPY */
156     #endif /* not HAVE_MEMMOVE */
157     #endif /* not VPCOMPAT */
158    
159    
160     /* PCRE keeps offsets in its compiled code as 2-byte quantities (always stored
161     in big-endian order) by default. These are used, for example, to link from the
162     start of a subpattern to its alternatives and its end. The use of 2 bytes per
163     offset limits the size of the compiled regex to around 64K, which is big enough
164     for almost everybody. However, I received a request for an even bigger limit.
165     For this reason, and also to make the code easier to maintain, the storing and
166     loading of offsets from the byte string is now handled by the macros that are
167     defined here.
168    
169     The macros are controlled by the value of LINK_SIZE. This defaults to 2 in
170     the config.h file, but can be overridden by using -D on the command line. This
171     is automated on Unix systems via the "configure" command. */
172    
173     #if LINK_SIZE == 2
174    
175     #define PUT(a,n,d) \
176     (a[n] = (d) >> 8), \
177     (a[(n)+1] = (d) & 255)
178    
179     #define GET(a,n) \
180     (((a)[n] << 8) | (a)[(n)+1])
181    
182     #define MAX_PATTERN_SIZE (1 << 16)
183    
184    
185     #elif LINK_SIZE == 3
186    
187     #define PUT(a,n,d) \
188     (a[n] = (d) >> 16), \
189     (a[(n)+1] = (d) >> 8), \
190     (a[(n)+2] = (d) & 255)
191    
192     #define GET(a,n) \
193     (((a)[n] << 16) | ((a)[(n)+1] << 8) | (a)[(n)+2])
194    
195     #define MAX_PATTERN_SIZE (1 << 24)
196    
197    
198     #elif LINK_SIZE == 4
199    
200     #define PUT(a,n,d) \
201     (a[n] = (d) >> 24), \
202     (a[(n)+1] = (d) >> 16), \
203     (a[(n)+2] = (d) >> 8), \
204     (a[(n)+3] = (d) & 255)
205    
206     #define GET(a,n) \
207     (((a)[n] << 24) | ((a)[(n)+1] << 16) | ((a)[(n)+2] << 8) | (a)[(n)+3])
208    
209     #define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */
210    
211    
212     #else
213     #error LINK_SIZE must be either 2, 3, or 4
214     #endif
215    
216    
217     /* Convenience macro defined in terms of the others */
218    
219     #define PUTINC(a,n,d) PUT(a,n,d), a += LINK_SIZE
220    
221    
222     /* PCRE uses some other 2-byte quantities that do not change when the size of
223     offsets changes. These are used for repeat counts and for other things such as
224     capturing parenthesis numbers in back references. */
225    
/* PUT2 stores the low 16 bits of d at a[n] and a[n+1], most significant byte
first. It is written as a single parenthesized comma expression rather than
two statements, so that it behaves as one statement when it is the body of an
unbraced if/else/loop, and so that PUT2INC below really is one expression. */

#define PUT2(a,n,d) \
  ((a)[n] = (d) >> 8, \
  (a)[(n)+1] = (d) & 255)

/* GET2 reads back the 16-bit big-endian value stored at a[n] and a[n+1]. */

#define GET2(a,n) \
  (((a)[n] << 8) | (a)[(n)+1])

/* PUT2INC stores the value as PUT2 does and then advances the pointer a by
two bytes; a must therefore be a modifiable pointer lvalue. */

#define PUT2INC(a,n,d) (PUT2(a,n,d), (a) += 2)
234    
235    
236     /* When UTF-8 encoding is being used, a character is no longer just a single
237     byte. The macros for character handling generate simple sequences when used in
238     byte-mode, and more complicated ones for UTF-8 characters. */
239    
240     #ifndef SUPPORT_UTF8
241     #define GETCHAR(c, eptr) c = *eptr;
242     #define GETCHARTEST(c, eptr) c = *eptr;
243     #define GETCHARINC(c, eptr) c = *eptr++;
244     #define GETCHARINCTEST(c, eptr) c = *eptr++;
245     #define GETCHARLEN(c, eptr, len) c = *eptr;
246     #define BACKCHAR(eptr)
247    
248     #else /* SUPPORT_UTF8 */
249    
250     /* Get the next UTF-8 character, not advancing the pointer. This is called when
251     we know we are in UTF-8 mode. */
252    
253     #define GETCHAR(c, eptr) \
254     c = *eptr; \
255     if ((c & 0xc0) == 0xc0) \
256     { \
257     int gcii; \
258     int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
259     int gcss = 6*gcaa; \
260     c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
261     for (gcii = 1; gcii <= gcaa; gcii++) \
262     { \
263     gcss -= 6; \
264     c |= (eptr[gcii] & 0x3f) << gcss; \
265     } \
266     }
267    
268     /* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the
269     pointer. */
270    
271     #define GETCHARTEST(c, eptr) \
272     c = *eptr; \
273     if (utf8 && (c & 0xc0) == 0xc0) \
274     { \
275     int gcii; \
276     int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
277     int gcss = 6*gcaa; \
278     c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
279     for (gcii = 1; gcii <= gcaa; gcii++) \
280     { \
281     gcss -= 6; \
282     c |= (eptr[gcii] & 0x3f) << gcss; \
283     } \
284     }
285    
286     /* Get the next UTF-8 character, advancing the pointer. This is called when we
287     know we are in UTF-8 mode. */
288    
289     #define GETCHARINC(c, eptr) \
290     c = *eptr++; \
291     if ((c & 0xc0) == 0xc0) \
292     { \
293     int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
294     int gcss = 6*gcaa; \
295     c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
296     while (gcaa-- > 0) \
297     { \
298     gcss -= 6; \
299     c |= (*eptr++ & 0x3f) << gcss; \
300     } \
301     }
302    
303     /* Get the next character, testing for UTF-8 mode, and advancing the pointer */
304    
305     #define GETCHARINCTEST(c, eptr) \
306     c = *eptr++; \
307     if (utf8 && (c & 0xc0) == 0xc0) \
308     { \
309     int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
310     int gcss = 6*gcaa; \
311     c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
312     while (gcaa-- > 0) \
313     { \
314     gcss -= 6; \
315     c |= (*eptr++ & 0x3f) << gcss; \
316     } \
317     }
318    
319     /* Get the next UTF-8 character, not advancing the pointer, incrementing length
320     if there are extra bytes. This is called when we know we are in UTF-8 mode. */
321    
322     #define GETCHARLEN(c, eptr, len) \
323     c = *eptr; \
324     if ((c & 0xc0) == 0xc0) \
325     { \
326     int gcii; \
327     int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
328     int gcss = 6*gcaa; \
329     c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
330     for (gcii = 1; gcii <= gcaa; gcii++) \
331     { \
332     gcss -= 6; \
333     c |= (eptr[gcii] & 0x3f) << gcss; \
334     } \
335     len += gcaa; \
336     }
337    
338     /* If the pointer is not at the start of a character, move it back until
339     it is. Called only in UTF-8 mode. */
340    
341     #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
342    
343     #endif
344    
345    
346     /* In case there is no definition of offsetof() provided - though any proper
347     Standard C system should have one. */
348    
349     #ifndef offsetof
350     #define offsetof(p_type,field) ((size_t)&(((p_type *)0)->field))
351     #endif
352    
353    
354     /* These are the public options that can change during matching. */
355    
356     #define PCRE_IMS (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL)
357    
358     /* Private options flags start at the most significant end of the four bytes,
359     but skip the top bit so we can use ints for convenience without getting tangled
360     with negative values. The public options defined in pcre.h start at the least
361     significant end. Make sure they don't overlap! */
362    
363     #define PCRE_FIRSTSET 0x40000000 /* first_byte is set */
364     #define PCRE_REQCHSET 0x20000000 /* req_byte is set */
365     #define PCRE_STARTLINE 0x10000000 /* start after \n for multiline */
366     #define PCRE_ICHANGED 0x08000000 /* i option changes within regex */
367     #define PCRE_NOPARTIAL 0x04000000 /* can't use partial with this regex */
368    
369     /* Options for the "extra" block produced by pcre_study(). */
370    
371     #define PCRE_STUDY_MAPPED 0x01 /* a map of starting chars exists */
372    
373     /* Masks for identifying the public options that are permitted at compile
374     time, run time, or study time, respectively. */
375    
376     #define PUBLIC_OPTIONS \
377     (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \
378     PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \
379     PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE)
380    
381     #define PUBLIC_EXEC_OPTIONS \
382     (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \
383     PCRE_PARTIAL)
384    
385     #define PUBLIC_DFA_EXEC_OPTIONS \
386     (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \
387     PCRE_PARTIAL|PCRE_DFA_SHORTEST|PCRE_DFA_RESTART)
388    
389     #define PUBLIC_STUDY_OPTIONS 0 /* None defined */
390    
391     /* Magic number to provide a small check against being handed junk. Also used
392     to detect whether a pattern was compiled on a host of different endianness. */
393    
394     #define MAGIC_NUMBER 0x50435245UL /* 'PCRE' */
395    
396     /* Negative values for the firstchar and reqchar variables */
397    
398     #define REQ_UNSET (-2)
399     #define REQ_NONE (-1)
400    
401     /* The maximum remaining length of subject we are prepared to search for a
402     req_byte match. */
403    
404     #define REQ_BYTE_MAX 1000
405    
406     /* Flags added to firstbyte or reqbyte; a "non-literal" item is either a
407     variable-length repeat, or anything other than literal characters. */
408    
409     #define REQ_CASELESS 0x0100 /* indicates caselessness */
410     #define REQ_VARY 0x0200 /* reqbyte followed non-literal item */
411    
412     /* Miscellaneous definitions */
413    
414     typedef int BOOL;
415    
416     #define FALSE 0
417     #define TRUE 1
418    
419     /* Escape items that are just an encoding of a particular data value. Note that
420     ESC_n is defined as yet another macro, which is set in config.h to either \n
421     (the default) or \r (which some people want). */
422    
423     #ifndef ESC_e
424     #define ESC_e 27
425     #endif
426    
427     #ifndef ESC_f
428     #define ESC_f '\f'
429     #endif
430    
431     #ifndef ESC_n
432     #define ESC_n NEWLINE
433     #endif
434    
435     #ifndef ESC_r
436     #define ESC_r '\r'
437     #endif
438    
439     /* We can't officially use ESC_t because it is a POSIX reserved identifier
440     (presumably because of all the others like size_t). */
441    
442     #ifndef ESC_tee
443     #define ESC_tee '\t'
444     #endif
445    
446     /* These are escaped items that aren't just an encoding of a particular data
447     value such as \n. They must have non-zero values, as check_escape() returns
448     their negation. Also, they must appear in the same order as in the opcode
449     definitions below, up to ESC_z. There's a dummy for OP_ANY because it
450     corresponds to "." rather than an escape sequence. The final one must be
451     ESC_REF as subsequent values are used for \1, \2, \3, etc. There are two
452     tests in the code for an escape greater than ESC_b and less than ESC_Z to
453     detect the types that may be repeated. These are the types that consume
454     characters. If any new escapes are put in between that don't consume a
455     character, that code will have to change. */
456    
457     enum { ESC_A = 1, ESC_G, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, ESC_W,
458     ESC_w, ESC_dum1, ESC_C, ESC_P, ESC_p, ESC_X, ESC_Z, ESC_z, ESC_E,
459     ESC_Q, ESC_REF };
460    
461     /* Flag bits and data types for the extended class (OP_XCLASS) for classes that
462     contain UTF-8 characters with values greater than 255. */
463    
464     #define XCL_NOT 0x01 /* Flag: this is a negative class */
465     #define XCL_MAP 0x02 /* Flag: a 32-byte map is present */
466    
467     #define XCL_END 0 /* Marks end of individual items */
468     #define XCL_SINGLE 1 /* Single item (one multibyte char) follows */
469     #define XCL_RANGE 2 /* A range (two multibyte chars) follows */
470     #define XCL_PROP 3 /* Unicode property (one property code) follows */
471     #define XCL_NOTPROP 4 /* Unicode inverted property (ditto) */
472    
473    
474     /* Opcode table: OP_BRA must be last, as all values >= it are used for brackets
475     that extract substrings. Starting from 1 (i.e. after OP_END), the values up to
476     OP_EOD must correspond in order to the list of escapes immediately above.
477     Note that whenever this list is updated, the two macro definitions that follow
478     must also be updated to match. */
479    
480     enum {
481     OP_END, /* 0 End of pattern */
482    
483     /* Values corresponding to backslashed metacharacters */
484    
485     OP_SOD, /* 1 Start of data: \A */
486     OP_SOM, /* 2 Start of match (subject + offset): \G */
487     OP_NOT_WORD_BOUNDARY, /* 3 \B */
488     OP_WORD_BOUNDARY, /* 4 \b */
489     OP_NOT_DIGIT, /* 5 \D */
490     OP_DIGIT, /* 6 \d */
491     OP_NOT_WHITESPACE, /* 7 \S */
492     OP_WHITESPACE, /* 8 \s */
493     OP_NOT_WORDCHAR, /* 9 \W */
494     OP_WORDCHAR, /* 10 \w */
495     OP_ANY, /* 11 Match any character */
496     OP_ANYBYTE, /* 12 Match any byte (\C); different to OP_ANY for UTF-8 */
497     OP_NOTPROP, /* 13 \P (not Unicode property) */
498     OP_PROP, /* 14 \p (Unicode property) */
499     OP_EXTUNI, /* 15 \X (extended Unicode sequence) */
500     OP_EODN, /* 16 End of data or \n at end of data: \Z. */
501     OP_EOD, /* 17 End of data: \z */
502    
503     OP_OPT, /* 18 Set runtime options */
504     OP_CIRC, /* 19 Start of line - varies with multiline switch */
505     OP_DOLL, /* 20 End of line - varies with multiline switch */
506     OP_CHAR, /* 21 Match one character, casefully */
507     OP_CHARNC, /* 22 Match one character, caselessly */
508     OP_NOT, /* 23 Match anything but the following char */
509    
510     OP_STAR, /* 24 The maximizing and minimizing versions of */
511     OP_MINSTAR, /* 25 all these opcodes must come in pairs, with */
512     OP_PLUS, /* 26 the minimizing one second. */
513     OP_MINPLUS, /* 27 This first set applies to single characters */
514     OP_QUERY, /* 28 */
515     OP_MINQUERY, /* 29 */
516     OP_UPTO, /* 30 From 0 to n matches */
517     OP_MINUPTO, /* 31 */
518     OP_EXACT, /* 32 Exactly n matches */
519    
520     OP_NOTSTAR, /* 33 The maximizing and minimizing versions of */
521     OP_NOTMINSTAR, /* 34 all these opcodes must come in pairs, with */
522     OP_NOTPLUS, /* 35 the minimizing one second. */
523     OP_NOTMINPLUS, /* 36 This set applies to "not" single characters */
524     OP_NOTQUERY, /* 37 */
525     OP_NOTMINQUERY, /* 38 */
526     OP_NOTUPTO, /* 39 From 0 to n matches */
527     OP_NOTMINUPTO, /* 40 */
528     OP_NOTEXACT, /* 41 Exactly n matches */
529    
530     OP_TYPESTAR, /* 42 The maximizing and minimizing versions of */
531     OP_TYPEMINSTAR, /* 43 all these opcodes must come in pairs, with */
532     OP_TYPEPLUS, /* 44 the minimizing one second. These codes must */
533     OP_TYPEMINPLUS, /* 45 be in exactly the same order as those above. */
534     OP_TYPEQUERY, /* 46 This set applies to character types such as \d */
535     OP_TYPEMINQUERY, /* 47 */
536     OP_TYPEUPTO, /* 48 From 0 to n matches */
537     OP_TYPEMINUPTO, /* 49 */
538     OP_TYPEEXACT, /* 50 Exactly n matches */
539    
540     OP_CRSTAR, /* 51 The maximizing and minimizing versions of */
541     OP_CRMINSTAR, /* 52 all these opcodes must come in pairs, with */
542     OP_CRPLUS, /* 53 the minimizing one second. These codes must */
543     OP_CRMINPLUS, /* 54 be in exactly the same order as those above. */
544     OP_CRQUERY, /* 55 These are for character classes and back refs */
545     OP_CRMINQUERY, /* 56 */
546     OP_CRRANGE, /* 57 These are different to the three sets above. */
547     OP_CRMINRANGE, /* 58 */
548    
549     OP_CLASS, /* 59 Match a character class, chars < 256 only */
550     OP_NCLASS, /* 60 Same, but the bitmap was created from a negative
551     class - the difference is relevant only when a UTF-8
552     character > 255 is encountered. */
553    
554     OP_XCLASS, /* 61 Extended class for handling UTF-8 chars within the
555     class. This does both positive and negative. */
556    
557     OP_REF, /* 62 Match a back reference */
558     OP_RECURSE, /* 63 Match a numbered subpattern (possibly recursive) */
559     OP_CALLOUT, /* 64 Call out to external function if provided */
560    
561     OP_ALT, /* 65 Start of alternation */
562     OP_KET, /* 66 End of group that doesn't have an unbounded repeat */
563     OP_KETRMAX, /* 67 These two must remain together and in this */
564     OP_KETRMIN, /* 68 order. They are for groups that repeat for ever. */
565    
566     /* The assertions must come before ONCE and COND */
567    
568     OP_ASSERT, /* 69 Positive lookahead */
569     OP_ASSERT_NOT, /* 70 Negative lookahead */
570     OP_ASSERTBACK, /* 71 Positive lookbehind */
571     OP_ASSERTBACK_NOT, /* 72 Negative lookbehind */
572     OP_REVERSE, /* 73 Move pointer back - used in lookbehind assertions */
573    
574     /* ONCE and COND must come after the assertions, with ONCE first, as there's
575     a test for >= ONCE for a subpattern that isn't an assertion. */
576    
577     OP_ONCE, /* 74 Once matched, don't back up into the subpattern */
578     OP_COND, /* 75 Conditional group */
579     OP_CREF, /* 76 Used to hold an extraction string number (cond ref) */
580    
581     OP_BRAZERO, /* 77 These two must remain together and in this */
582     OP_BRAMINZERO, /* 78 order. */
583    
584     OP_BRANUMBER, /* 79 Used for extracting brackets whose number is greater
585     than can fit into an opcode. */
586    
587     OP_BRA /* 80 This and greater values are used for brackets that
588     extract substrings up to EXTRACT_BASIC_MAX. After
589     that, use is made of OP_BRANUMBER. */
590     };
591    
592     /* WARNING WARNING WARNING: There is an implicit assumption in pcre.c and
593     study.c that all opcodes are less than 128 in value. This makes handling UTF-8
594     character sequences easier. */
595    
596     /* The highest extraction number before we have to start using additional
597     bytes. (Originally PCRE didn't have support for extraction counts higher than
598     this number.) The value is limited by the number of opcodes left after OP_BRA,
599     i.e. 255 - OP_BRA. We actually set it a bit lower to leave room for additional
600     opcodes. */
601    
602     #define EXTRACT_BASIC_MAX 100
603    
604    
605     /* This macro defines textual names for all the opcodes. These are used only
606     for debugging. The macro is referenced only in pcre_printint.c. */
607    
608     #define OP_NAME_LIST \
609     "End", "\\A", "\\G", "\\B", "\\b", "\\D", "\\d", \
610     "\\S", "\\s", "\\W", "\\w", "Any", "Anybyte", \
611     "notprop", "prop", "extuni", \
612     "\\Z", "\\z", \
613     "Opt", "^", "$", "char", "charnc", "not", \
614     "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
615     "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
616     "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
617     "*", "*?", "+", "+?", "?", "??", "{", "{", \
618     "class", "nclass", "xclass", "Ref", "Recurse", "Callout", \
619     "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", \
620     "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cond ref",\
621     "Brazero", "Braminzero", "Branumber", "Bra"
622    
623    
624     /* This macro defines the length of fixed length operations in the compiled
625     regex. The lengths are used when searching for specific things, and also in the
626     debugging printing of a compiled regex. We use a macro so that it can be
627     defined close to the definitions of the opcodes themselves.
628    
629     As things have been extended, some of these are no longer fixed lengths, but are
630     minima instead. For example, the length of a single-character repeat may vary
631     in UTF-8 mode. The code that uses this table must know about such things. */
632    
633     #define OP_LENGTHS \
634     1, /* End */ \
635     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w */ \
636     1, 1, /* Any, Anybyte */ \
637     2, 2, 1, /* NOTPROP, PROP, EXTUNI */ \
638     1, 1, 2, 1, 1, /* \Z, \z, Opt, ^, $ */ \
639     2, /* Char - the minimum length */ \
640     2, /* Charnc - the minimum length */ \
641     2, /* not */ \
642     /* Positive single-char repeats ** These are */ \
643     2, 2, 2, 2, 2, 2, /* *, *?, +, +?, ?, ?? ** minima in */ \
644     4, 4, 4, /* upto, minupto, exact ** UTF-8 mode */ \
645     /* Negative single-char repeats - only for chars < 256 */ \
646     2, 2, 2, 2, 2, 2, /* NOT *, *?, +, +?, ?, ?? */ \
647     4, 4, 4, /* NOT upto, minupto, exact */ \
648     /* Positive type repeats */ \
649     2, 2, 2, 2, 2, 2, /* Type *, *?, +, +?, ?, ?? */ \
650     4, 4, 4, /* Type upto, minupto, exact */ \
651     /* Character class & ref repeats */ \
652     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ \
653     5, 5, /* CRRANGE, CRMINRANGE */ \
654     33, /* CLASS */ \
655     33, /* NCLASS */ \
656     0, /* XCLASS - variable length */ \
657     3, /* REF */ \
658     1+LINK_SIZE, /* RECURSE */ \
659     2+2*LINK_SIZE, /* CALLOUT */ \
660     1+LINK_SIZE, /* Alt */ \
661     1+LINK_SIZE, /* Ket */ \
662     1+LINK_SIZE, /* KetRmax */ \
663     1+LINK_SIZE, /* KetRmin */ \
664     1+LINK_SIZE, /* Assert */ \
665     1+LINK_SIZE, /* Assert not */ \
666     1+LINK_SIZE, /* Assert behind */ \
667     1+LINK_SIZE, /* Assert behind not */ \
668     1+LINK_SIZE, /* Reverse */ \
669     1+LINK_SIZE, /* Once */ \
670     1+LINK_SIZE, /* COND */ \
671     3, /* CREF */ \
672     1, 1, /* BRAZERO, BRAMINZERO */ \
673     3, /* BRANUMBER */ \
674     1+LINK_SIZE /* BRA */ \
675    
676    
677     /* A magic value for OP_CREF to indicate the "in recursion" condition. */
678    
679     #define CREF_RECURSE 0xffff
680    
681     /* Error code numbers. They are given names so that they can more easily be
682     tracked. */
683    
684     enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9,
685     ERR10, ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19,
686     ERR20, ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29,
687     ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39,
688     ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47 };
689    
690     /* The real format of the start of the pcre block; the index of names and the
691     code vector run on as long as necessary after the end. We store an explicit
692     offset to the name table so that if a regex is compiled on one host, saved, and
693     then run on another where the size of pointers is different, all might still
694     be well. For the case of compiled-on-4 and run-on-8, we include an extra
695     pointer that is always NULL. For future-proofing, a few dummy fields were
696     originally included - even though you can never get this planning right - but
697     there is only one left now.
698    
699     NOTE NOTE NOTE:
700     Because people can now save and re-use compiled patterns, any additions to this
701     structure should be made at the end, and something earlier (e.g. a new
702     flag in the options or one of the dummy fields) should indicate that the new
703     fields are present. Currently PCRE always sets the dummy fields to zero.
704     NOTE NOTE NOTE:
705     */
706    
707     typedef struct real_pcre {
708     pcre_uint32 magic_number;
709     pcre_uint32 size; /* Total that was malloced */
710     pcre_uint32 options;
711     pcre_uint32 dummy1; /* For future use, maybe */
712    
713     pcre_uint16 top_bracket;
714     pcre_uint16 top_backref;
715     pcre_uint16 first_byte;
716     pcre_uint16 req_byte;
717     pcre_uint16 name_table_offset; /* Offset to name table that follows */
718     pcre_uint16 name_entry_size; /* Size of any name items */
719     pcre_uint16 name_count; /* Number of name items */
720     pcre_uint16 ref_count; /* Reference count */
721    
722     const unsigned char *tables; /* Pointer to tables or NULL for std */
723     const unsigned char *nullpad; /* NULL padding */
724     } real_pcre;
725    
726     /* The format of the block used to store data from pcre_study(). The same
727     remark (see NOTE above) about extending this structure applies. */
728    
729     typedef struct pcre_study_data {
730     pcre_uint32 size; /* Total that was malloced */
731     pcre_uint32 options;
732     uschar start_bits[32];
733     } pcre_study_data;
734    
735     /* Structure for passing "static" information around between the functions
736     doing the compiling, so that they are thread-safe. */
737    
738     typedef struct compile_data { /* Per-compilation state handed between the compiling functions instead of using statics */
739     const uschar *lcc; /* Points to lower casing table */
740     const uschar *fcc; /* Points to case-flipping table */
741     const uschar *cbits; /* Points to character type table */
742     const uschar *ctypes; /* Points to table of type maps */
743     const uschar *start_code; /* The start of the compiled code */
744     const uschar *start_pattern; /* The start of the pattern */
745     uschar *name_table; /* The name/number table */
746     int names_found; /* Number of name table entries so far */
747     int name_entry_size; /* Size of each name table entry */
748     int top_backref; /* Maximum back reference */
749     unsigned int backref_map; /* Bitmap of low back refs */
750     int req_varyopt; /* "After variable item" flag for reqbyte */
751     BOOL nopartial; /* Set TRUE if partial won't work */
752     } compile_data;
753    
754     /* Structure for maintaining a chain of pointers to the currently incomplete
755     branches, for testing for left recursion. */
756    
757     typedef struct branch_chain { /* One link in the chain of currently incomplete branches */
758     struct branch_chain *outer; /* Next link in the chain; NOTE(review): presumably the enclosing branch -- confirm */
759     uschar *current; /* Pointer to the incomplete branch this link records */
760     } branch_chain;
761    
762     /* Structure for items in a linked list that represents an explicit recursive
763     call within the pattern. */
764    
765     typedef struct recursion_info { /* Linked-list record describing one explicit recursive call within the pattern */
766     struct recursion_info *prevrec; /* Previous recursion record (or NULL) */
767     int group_num; /* Number of group that was called */
768     const uschar *after_call; /* "Return value": points after the call in the expr */
769     const uschar *save_start; /* Old value of md->start_match */
770     int *offset_save; /* Pointer to start of saved offsets */
771     int saved_max; /* Number of saved offsets */
772     } recursion_info;
773    
774     /* When compiling in a mode that doesn't use recursive calls to match(),
775     a structure is used to remember local variables on the heap. It is defined in
776     pcre_exec.c, close to the match() function, so that it is easy to keep it in step
777     with any changes to that function's local variables. However, the pointer to the
778     current frame must be saved in some "static" place over a longjmp(). We declare
779     the structure here so that we can put a pointer in the match_data structure.
780     NOTE: This isn't used for a "normal" compilation of pcre. */
781    
782     struct heapframe; /* Incomplete type: this header only needs pointers to it (see match_data.thisframe) */
783    
784     /* Structure for passing "static" information around between the functions
785     doing traditional NFA matching, so that they are thread-safe. */
786    
787     typedef struct match_data { /* Per-match state handed between the traditional NFA matching functions instead of using statics */
788     unsigned long int match_call_count; /* NOTE(review): presumably the number of match() calls made so far -- confirm */
789     unsigned long int match_limit; /* NOTE(review): presumably the ceiling for match_call_count -- confirm */
790     int *offset_vector; /* Offset vector */
791     int offset_end; /* One past the end */
792     int offset_max; /* The maximum usable for return data */
793     const uschar *lcc; /* Points to lower casing table */
794     const uschar *ctypes; /* Points to table of type maps */
795     BOOL offset_overflow; /* Set if too many extractions */
796     BOOL notbol; /* NOTBOL flag */
797     BOOL noteol; /* NOTEOL flag */
798     BOOL utf8; /* UTF8 flag */
799     BOOL endonly; /* Dollar not before final \n */
800     BOOL notempty; /* Empty string match not wanted */
801     BOOL partial; /* PARTIAL flag */
802     BOOL hitend; /* Hit the end of the subject at some point */
803     const uschar *start_code; /* For use when recursing */
804     const uschar *start_subject; /* Start of the subject string */
805     const uschar *end_subject; /* End of the subject string */
806     const uschar *start_match; /* Start of this match attempt */
807     const uschar *end_match_ptr; /* Subject position at end match */
808     int end_offset_top; /* Highwater mark at end of match */
809     int capture_last; /* Most recent capture number */
810     int start_offset; /* The start offset value */
811     recursion_info *recursive; /* Linked list of recursion data */
812     void *callout_data; /* To pass back to callouts */
813     struct heapframe *thisframe; /* Current heap frame; used only when compiling for no recursion */
814     } match_data;
815    
816     /* A similar structure is used for the same purpose by the DFA matching
817     functions. */
818    
819     typedef struct dfa_match_data { /* Per-match state handed between the DFA matching functions */
820     const uschar *start_code; /* Start of the compiled pattern */
821     const uschar *start_subject; /* Start of the subject string */
822     const uschar *end_subject; /* End of subject string */
823     const uschar *tables; /* Character tables */
824     int moptions; /* Match options */
825     int poptions; /* Pattern options */
826     void *callout_data; /* To pass back to callouts */
827     } dfa_match_data;
828    
829     /* Bit definitions for entries in the pcre_ctypes table. */
830    
831     #define ctype_space 0x01 /* Each ctype_* value is a distinct single bit, so several can be set in one entry */
832     #define ctype_letter 0x02
833     #define ctype_digit 0x04
834     #define ctype_xdigit 0x08
835     #define ctype_word 0x10 /* alphanumeric or '_' (0x20 and 0x40 are not assigned here) */
836     #define ctype_meta 0x80 /* regexp meta char or zero (end pattern) */
837    
838     /* Offsets for the bitmap tables in pcre_cbits. Each table contains a set
839     of bits for a class map. Some classes are built by combining these tables. */
840    
841     #define cbit_space 0 /* [:space:] or \s; offsets step by 32, one 32-entry map per class */
842     #define cbit_xdigit 32 /* [:xdigit:] */
843     #define cbit_digit 64 /* [:digit:] or \d */
844     #define cbit_upper 96 /* [:upper:] */
845     #define cbit_lower 128 /* [:lower:] */
846     #define cbit_word 160 /* [:word:] or \w */
847     #define cbit_graph 192 /* [:graph:] */
848     #define cbit_print 224 /* [:print:] */
849     #define cbit_punct 256 /* [:punct:] */
850     #define cbit_cntrl 288 /* [:cntrl:] */
851     #define cbit_length 320 /* Length of the cbits table: 10 maps x 32 entries */
852    
853     /* Offsets of the various tables from the base tables pointer, and
854     total length. */
855    
856     #define lcc_offset 0 /* Lower casing table starts at the base pointer */
857     #define fcc_offset 256 /* Case-flipping table */
858     #define cbits_offset 512 /* Class bitmap tables (cbit_length entries) */
859     #define ctypes_offset (cbits_offset + cbit_length) /* = 832; ctypes table follows the bitmaps */
860     #define tables_length (ctypes_offset + 256) /* = 1088; total length of the four tables */
861    
862     /* Layout of the UCP type table that translates property names into codes for
863 nigel 85 pcre_ucp_findchar(). */
864 nigel 77
865     typedef struct { /* One entry pairing a property name with its code */
866     const char *name; /* Property name */
867     int value; /* Code for pcre_ucp_findchar() */
868     } ucp_type_table;
869    
870    
871     /* Internal shared data tables. These are tables that are used by more than one
872     of the exported public functions. They have to be "external" in the C sense,
873     but are not part of the PCRE public API. The data for these tables is in the
874     pcre_tables.c module. */
875    
876     extern const int _pcre_utf8_table1[]; /* The data for all tables declared here is in pcre_tables.c */
877     extern const int _pcre_utf8_table2[];
878     extern const int _pcre_utf8_table3[];
879     extern const uschar _pcre_utf8_table4[];
880    
881     extern const int _pcre_utf8_table1_size; /* NOTE(review): presumably the entry count of _pcre_utf8_table1 -- confirm */
882    
883     extern const ucp_type_table _pcre_utt[]; /* Array of property name/code entries */
884     extern const int _pcre_utt_size; /* NOTE(review): presumably the entry count of _pcre_utt -- confirm */
885    
886     extern const uschar _pcre_default_tables[]; /* NOTE(review): presumably laid out per the *_offset macros in this header -- confirm */
887    
888     extern const uschar _pcre_OP_lengths[]; /* NOTE(review): presumably indexed by opcode -- confirm */
889    
890    
891     /* Internal shared functions. These are functions that are used by more than
892     one of the exported public functions. They have to be "external" in the C
893     sense, but are not part of the PCRE public API. */
894    
895     extern int _pcre_ord2utf8(int, uschar *); /* NOTE(review): by its name, encodes a character value as UTF-8 into the buffer -- confirm */
896     extern real_pcre * _pcre_try_flipped(const real_pcre *, real_pcre *, /* Takes read-only and writable pattern blocks ... */
897     const pcre_study_data *, pcre_study_data *); /* ... and read-only and writable study blocks; NOTE(review): purpose not shown here */
898     extern int _pcre_ucp_findchar(const int, int *, int *); /* NOTE(review): the two int * are presumably outputs -- confirm */
899     extern int _pcre_valid_utf8(const uschar *, int); /* NOTE(review): by its name, checks a string for valid UTF-8 -- confirm return convention */
900     extern BOOL _pcre_xclass(int, const uschar *); /* NOTE(review): by its name, tests a character against extended class data -- confirm */
901    
902 nigel 85 #endif /* NOTE(review): presumably closes this header's include guard, opened earlier in the file -- confirm */
903    
904 nigel 77 /* End of pcre_internal.h */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12