/[pcre]/code/trunk/pcre_internal.h
ViewVC logotype

Contents of /code/trunk/pcre_internal.h

Parent Directory Parent Directory | Revision Log Revision Log


Revision 91 - (hide annotations) (download)
Sat Feb 24 21:41:34 2007 UTC (7 years, 1 month ago) by nigel
File MIME type: text/plain
File size: 38927 byte(s)
Load pcre-6.7 into code/trunk.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5    
6     /* PCRE is a library of functions to support regular expressions whose syntax
7     and semantics are as close as possible to those of the Perl 5 language.
8    
9     Written by Philip Hazel
10 nigel 87 Copyright (c) 1997-2006 University of Cambridge
11 nigel 77
12     -----------------------------------------------------------------------------
13     Redistribution and use in source and binary forms, with or without
14     modification, are permitted provided that the following conditions are met:
15    
16     * Redistributions of source code must retain the above copyright notice,
17     this list of conditions and the following disclaimer.
18    
19     * Redistributions in binary form must reproduce the above copyright
20     notice, this list of conditions and the following disclaimer in the
21     documentation and/or other materials provided with the distribution.
22    
23     * Neither the name of the University of Cambridge nor the names of its
24     contributors may be used to endorse or promote products derived from
25     this software without specific prior written permission.
26    
27     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37     POSSIBILITY OF SUCH DAMAGE.
38     -----------------------------------------------------------------------------
39     */
40    
41     /* This header contains definitions that are shared between the different
42     modules, but which are not relevant to the exported API. This includes some
43     functions whose names all begin with "_pcre_". */
44    
45 nigel 85 #ifndef PCRE_INTERNAL_H
46     #define PCRE_INTERNAL_H
47 nigel 77
48     /* Define DEBUG to get debugging output on stdout. */
49    
50 nigel 85 #if 0
51 nigel 77 #define DEBUG
52 nigel 85 #endif
53 nigel 77
54     /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
55     inline, and there are *still* stupid compilers about that don't like indented
56     pre-processor statements, or at least there were when I first wrote this. After
57     all, it had only been about 10 years then... */
58    
/* DPRINTF takes one argument: a complete, parenthesised printf() argument
list, for example DPRINTF(("count=%d\n", n)). When DEBUG is not defined the
macro expands to nothing, so the argument is never evaluated. */

#ifndef DEBUG
#define DPRINTF(p)
#else
#define DPRINTF(p) printf p
#endif
64    
65    
66     /* Get the definitions provided by running "configure" */
67    
68     #include "config.h"
69    
70     /* Standard C headers plus the external interface definition. The only time
71     setjmp and stdarg are used is when NO_RECURSE is set. */
72    
73     #include <ctype.h>
74     #include <limits.h>
75     #include <setjmp.h>
76     #include <stdarg.h>
77     #include <stddef.h>
78     #include <stdio.h>
79     #include <stdlib.h>
80     #include <string.h>
81    
82     #ifndef PCRE_SPY
83     #define PCRE_DEFINITION /* Win32 __declspec(export) trigger for .dll */
84     #endif
85    
86     /* We need to have types that specify unsigned 16-bit and 32-bit integers. We
87     cannot determine these outside the compilation (e.g. by running a program as
88     part of "configure") because PCRE is often cross-compiled for use on other
89     systems. Instead we make use of the maximum sizes that are available at
90     preprocessor time in standard C environments. */
91    
/* pcre_uint16: an unsigned type whose maximum value is exactly 0xffff.
unsigned short is tried first, then unsigned int. */

#if USHRT_MAX == 0xffff
typedef unsigned short pcre_uint16;
#elif UINT_MAX == 0xffff
typedef unsigned int pcre_uint16;
#else
#error Cannot determine a type for 16-bit unsigned integers
#endif

/* pcre_uint32: an unsigned type whose maximum value is exactly 0xffffffff.
unsigned int is tried first, then unsigned long int. */

#if UINT_MAX == 0xffffffff
typedef unsigned int pcre_uint32;
#elif ULONG_MAX == 0xffffffff
typedef unsigned long int pcre_uint32;
#else
#error Cannot determine a type for 32-bit unsigned integers
#endif
107    
108     /* All character handling must be done as unsigned characters. Otherwise there
109     are problems with top-bit-set characters and functions such as isspace().
110     However, we leave the interface to the outside world as char *, because that
111     should make things easier for callers. We define a short type for unsigned char
112     to save lots of typing. I tried "uchar", but it causes problems on Digital
113     Unix, where it is defined in sys/types, so use "uschar" instead. */
114    
115     typedef unsigned char uschar;
116    
117 nigel 91 /* PCRE is able to support 3 different kinds of newline (CR, LF, CRLF). The
118     following macro is used to package up testing for newlines. NLBLOCK is defined
119     in the various modules to indicate in which datablock the parameters exist. */
120    
/* Yields 1 if the bytes at p form the newline sequence described by NLBLOCK,
and 0 otherwise. The second byte of the subject is inspected only when the
first byte matches and NLBLOCK->nllen is not 1. */

#define IS_NEWLINE(p) \
  ((p)[0] == NLBLOCK->nl[0] && \
   !(NLBLOCK->nllen != 1 && (p)[1] != NLBLOCK->nl[1]))
124    
125 nigel 87 /* When PCRE is compiled as a C++ library, the subject pointer can be replaced
126     with a custom type. This makes it possible, for example, to allow pcre_exec()
127     to process subject strings that are discontinuous by using a smart pointer
128     class. It must always be possible to inspect all of the subject string in
129     pcre_exec() because of the way it backtracks. Two macros are required in the
130     normal case, for sign-unspecified and unsigned char pointers. The former is
131     used for the external interface and appears in pcre.h, which is why its name
132     must begin with PCRE_. */
133    
/* Subject pointer types. By default these are plain (sign-unspecified and
unsigned) const character pointers; if CUSTOM_SUBJECT_PTR is defined, both
macros expand to that type instead. */

#ifndef CUSTOM_SUBJECT_PTR
#define PCRE_SPTR const char *
#define USPTR const unsigned char *
#else
#define PCRE_SPTR CUSTOM_SUBJECT_PTR
#define USPTR CUSTOM_SUBJECT_PTR
#endif
141    
142 nigel 85 /* Include the public PCRE header and the definitions of UCP character property
143     values. */
144 nigel 77
145     #include "pcre.h"
146     #include "ucp.h"
147    
148     /* When compiling for use with the Virtual Pascal compiler, these functions
149     need to have their names changed. PCRE must be compiled with the -DVPCOMPAT
150     option on the command line. */
151    
152     #ifdef VPCOMPAT
153     #define strncmp(s1,s2,m) _strncmp(s1,s2,m)
154     #define memcpy(d,s,n) _memcpy(d,s,n)
155     #define memmove(d,s,n) _memmove(d,s,n)
156     #define memset(s,c,n) _memset(s,c,n)
157     #else /* VPCOMPAT */
158    
159     /* To cope with SunOS4 and other systems that lack memmove() but have bcopy(),
160     define a macro for memmove() if HAVE_MEMMOVE is false, provided that HAVE_BCOPY
161     is set. Otherwise, include an emulating function for those systems that have
162     neither (there are some non-Unix environments where this is the case). This assumes
163     that all calls to memmove are moving strings upwards in store, which is the
164     case in PCRE. */
165    
166     #if ! HAVE_MEMMOVE
167     #undef memmove /* some systems may have a macro */
168     #if HAVE_BCOPY
169     #define memmove(a, b, c) bcopy(b, a, c)
170     #else /* HAVE_BCOPY */
/* Emulation of memmove() for systems that have neither memmove() nor bcopy().

Arguments:
  dest      where to copy to
  src       where to copy from
  n         number of bytes to copy

Returns:    the original value of dest

The two regions may overlap in either direction. When the destination lies
above the source the bytes are copied from the top down; otherwise they are
copied from the bottom up. In both cases every source byte is read before it
can be overwritten. */

static void *
pcre_memmove(unsigned char *dest, const unsigned char *src, size_t n)
{
size_t i;
if (dest > src)
  {
  /* Moving upwards in store: start at the end of both regions. After n
  decrements dest is back at its starting value. */
  dest += n;
  src += n;
  for (i = 0; i < n; ++i) *(--dest) = *(--src);
  }
else
  {
  /* Moving downwards (or not at all): copy in ascending order. */
  for (i = 0; i < n; ++i) dest[i] = src[i];
  }
return dest;
}
180     #define memmove(a, b, c) pcre_memmove(a, b, c)
181     #endif /* not HAVE_BCOPY */
182     #endif /* not HAVE_MEMMOVE */
183     #endif /* not VPCOMPAT */
184    
185    
186     /* PCRE keeps offsets in its compiled code as 2-byte quantities (always stored
187     in big-endian order) by default. These are used, for example, to link from the
188     start of a subpattern to its alternatives and its end. The use of 2 bytes per
189     offset limits the size of the compiled regex to around 64K, which is big enough
190     for almost everybody. However, I received a request for an even bigger limit.
191     For this reason, and also to make the code easier to maintain, the storing and
192     loading of offsets from the byte string is now handled by the macros that are
193     defined here.
194    
195     The macros are controlled by the value of LINK_SIZE. This defaults to 2 in
196     the config.h file, but can be overridden by using -D on the command line. This
197     is automated on Unix systems via the "configure" command. */
198    
199     #if LINK_SIZE == 2
200    
/* LINK_SIZE == 2: offsets occupy two bytes, most significant byte first.

PUT(a,n,d) stores the value d at a[n] and a[n+1]; GET(a,n) reads it back.
Both macros are single, fully parenthesised expressions, and the pointer
argument may itself be an expression such as "code + 1". The arguments are
evaluated more than once, so they must not have side effects. */

#define PUT(a,n,d) \
  ((a)[n] = (d) >> 8, \
   (a)[(n)+1] = (d) & 255)

#define GET(a,n) \
  (((a)[n] << 8) | (a)[(n)+1])

#define MAX_PATTERN_SIZE (1 << 16)
209    
210    
211     #elif LINK_SIZE == 3
212    
/* LINK_SIZE == 3: offsets occupy three bytes, most significant byte first.

PUT(a,n,d) stores the value d at a[n] .. a[n+2]; GET(a,n) reads it back.
Both macros are single, fully parenthesised expressions, and the pointer
argument may itself be an expression such as "code + 1". The arguments are
evaluated more than once, so they must not have side effects. */

#define PUT(a,n,d) \
  ((a)[n] = (d) >> 16, \
   (a)[(n)+1] = (d) >> 8, \
   (a)[(n)+2] = (d) & 255)

#define GET(a,n) \
  (((a)[n] << 16) | ((a)[(n)+1] << 8) | (a)[(n)+2])

#define MAX_PATTERN_SIZE (1 << 24)
222    
223    
224     #elif LINK_SIZE == 4
225    
/* LINK_SIZE == 4: offsets occupy four bytes, most significant byte first.

PUT(a,n,d) stores the value d at a[n] .. a[n+3]; GET(a,n) reads it back.
Both macros are single, fully parenthesised expressions, and the pointer
argument may itself be an expression such as "code + 1". The arguments are
evaluated more than once, so they must not have side effects.

NOTE(review): GET shifts the first byte left by 24 bits after the usual
promotion to int, so it presumably relies on stored values staying below
MAX_PATTERN_SIZE - confirm against the callers. */

#define PUT(a,n,d) \
  ((a)[n] = (d) >> 24, \
   (a)[(n)+1] = (d) >> 16, \
   (a)[(n)+2] = (d) >> 8, \
   (a)[(n)+3] = (d) & 255)

#define GET(a,n) \
  (((a)[n] << 24) | ((a)[(n)+1] << 16) | ((a)[(n)+2] << 8) | (a)[(n)+3])

#define MAX_PATTERN_SIZE (1 << 30)   /* Keep it positive */
236    
237    
238     #else
239     #error LINK_SIZE must be either 2, 3, or 4
240     #endif
241    
242    
243     /* Convenience macro defined in terms of the others */
244    
/* PUTINC(a,n,d) stores d with PUT() and then advances the pointer a by
LINK_SIZE. The expansion is one parenthesised expression whose value is the
updated pointer; a must be a modifiable pointer lvalue. */

#define PUTINC(a,n,d) (PUT(a,n,d), (a) += LINK_SIZE)
246    
247    
248     /* PCRE uses some other 2-byte quantities that do not change when the size of
249     offsets changes. These are used for repeat counts and for other things such as
250     capturing parenthesis numbers in back references. */
251    
/* Fixed two-byte quantities, most significant byte first.

PUT2(a,n,d) stores d at a[n] and a[n+1]; GET2(a,n) reads it back;
PUT2INC(a,n,d) stores and then advances the pointer a by 2.

Each macro expands to exactly one parenthesised expression, so it behaves as
a single statement when followed by a semicolon - in particular as the body
of an unbraced "if" or loop. The arguments are evaluated more than once, so
they must not have side effects. */

#define PUT2(a,n,d) \
  ((a)[n] = (d) >> 8, \
   (a)[(n)+1] = (d) & 255)

#define GET2(a,n) \
  (((a)[n] << 8) | (a)[(n)+1])

#define PUT2INC(a,n,d) (PUT2(a,n,d), (a) += 2)
260    
261    
262     /* When UTF-8 encoding is being used, a character is no longer just a single
263     byte. The macros for character handling generate simple sequences when used in
264     byte-mode, and more complicated ones for UTF-8 characters. */
265    
266     #ifndef SUPPORT_UTF8
/* Byte-mode versions: a character is exactly one byte. Each macro expands to
a complete statement (including its terminating semicolon), so the shape
seen by callers matches the UTF-8 versions. The pointer argument is
parenthesised so that an expression such as "p + 1" is dereferenced as a
whole; for the INC variants it must be a modifiable pointer. GETCHARLEN
leaves len untouched and BACKCHAR expands to nothing. */

#define GETCHAR(c, eptr) c = *(eptr);
#define GETCHARTEST(c, eptr) c = *(eptr);
#define GETCHARINC(c, eptr) c = *(eptr)++;
#define GETCHARINCTEST(c, eptr) c = *(eptr)++;
#define GETCHARLEN(c, eptr, len) c = *(eptr);
#define BACKCHAR(eptr)
273    
274     #else /* SUPPORT_UTF8 */
275    
276     /* Get the next UTF-8 character, not advancing the pointer. This is called when
277     we know we are in UTF-8 mode. */
278    
/* UTF-8 versions. Every macro expands to an assignment statement followed,
where relevant, by an "if" block, so it must be used as a statement. The
macros read _pcre_utf8_table3 and _pcre_utf8_table4, and the ...TEST variants
also read a variable called utf8, all of which must be visible at the point
of use. The pointer argument is parenthesised throughout so that it may be an
expression such as "p + 1"; for the INC variants and BACKCHAR it must be a
modifiable pointer. */

/* Get the next UTF-8 character, not advancing the pointer. This is called when
we know we are in UTF-8 mode. */

#define GETCHAR(c, eptr) \
  c = *(eptr); \
  if ((c & 0xc0) == 0xc0) \
    { \
    int gcii; \
    int gcaa = _pcre_utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int gcss = 6*gcaa; \
    c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
    for (gcii = 1; gcii <= gcaa; gcii++) \
      { \
      gcss -= 6; \
      c |= ((eptr)[gcii] & 0x3f) << gcss; \
      } \
    }

/* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the
pointer. */

#define GETCHARTEST(c, eptr) \
  c = *(eptr); \
  if (utf8 && (c & 0xc0) == 0xc0) \
    { \
    int gcii; \
    int gcaa = _pcre_utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int gcss = 6*gcaa; \
    c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
    for (gcii = 1; gcii <= gcaa; gcii++) \
      { \
      gcss -= 6; \
      c |= ((eptr)[gcii] & 0x3f) << gcss; \
      } \
    }

/* Get the next UTF-8 character, advancing the pointer. This is called when we
know we are in UTF-8 mode. */

#define GETCHARINC(c, eptr) \
  c = *(eptr)++; \
  if ((c & 0xc0) == 0xc0) \
    { \
    int gcaa = _pcre_utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int gcss = 6*gcaa; \
    c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
    while (gcaa-- > 0) \
      { \
      gcss -= 6; \
      c |= (*(eptr)++ & 0x3f) << gcss; \
      } \
    }

/* Get the next character, testing for UTF-8 mode, and advancing the pointer */

#define GETCHARINCTEST(c, eptr) \
  c = *(eptr)++; \
  if (utf8 && (c & 0xc0) == 0xc0) \
    { \
    int gcaa = _pcre_utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int gcss = 6*gcaa; \
    c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
    while (gcaa-- > 0) \
      { \
      gcss -= 6; \
      c |= (*(eptr)++ & 0x3f) << gcss; \
      } \
    }

/* Get the next UTF-8 character, not advancing the pointer, incrementing length
if there are extra bytes. This is called when we know we are in UTF-8 mode. */

#define GETCHARLEN(c, eptr, len) \
  c = *(eptr); \
  if ((c & 0xc0) == 0xc0) \
    { \
    int gcii; \
    int gcaa = _pcre_utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int gcss = 6*gcaa; \
    c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
    for (gcii = 1; gcii <= gcaa; gcii++) \
      { \
      gcss -= 6; \
      c |= ((eptr)[gcii] & 0x3f) << gcss; \
      } \
    (len) += gcaa; \
    }

/* If the pointer is not at the start of a character, move it back until
it is. Called only in UTF-8 mode. */

#define BACKCHAR(eptr) while((*(eptr) & 0xc0) == 0x80) (eptr)--;
368    
369     #endif
370    
371    
372     /* In case there is no definition of offsetof() provided - though any proper
373     Standard C system should have one. */
374    
375     #ifndef offsetof
376     #define offsetof(p_type,field) ((size_t)&(((p_type *)0)->field))
377     #endif
378    
379    
380     /* These are the public options that can change during matching. */
381    
382     #define PCRE_IMS (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL)
383    
384 nigel 91 /* Private options flags start at the most significant end of the four bytes.
385     The public options defined in pcre.h start at the least significant end. Make
386     sure they don't overlap! The bits are getting a bit scarce now -- when we run
387     out, there is a dummy word in the structure that could be used for the private
388     bits. */
389 nigel 77
390 nigel 91 #define PCRE_NOPARTIAL 0x80000000 /* can't use partial with this regex */
391 nigel 77 #define PCRE_FIRSTSET 0x40000000 /* first_byte is set */
392     #define PCRE_REQCHSET 0x20000000 /* req_byte is set */
393     #define PCRE_STARTLINE 0x10000000 /* start after \n for multiline */
394 nigel 91 #define PCRE_JCHANGED 0x08000000 /* j option changes within regex */
395 nigel 77
396     /* Options for the "extra" block produced by pcre_study(). */
397    
398     #define PCRE_STUDY_MAPPED 0x01 /* a map of starting chars exists */
399    
400     /* Masks for identifying the public options that are permitted at compile
401     time, run time, or study time, respectively. */
402    
403     #define PUBLIC_OPTIONS \
404     (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \
405     PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \
406 nigel 91 PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE| \
407     PCRE_DUPNAMES|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF)
408 nigel 77
409     #define PUBLIC_EXEC_OPTIONS \
410     (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \
411 nigel 91 PCRE_PARTIAL|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF)
412 nigel 77
413     #define PUBLIC_DFA_EXEC_OPTIONS \
414     (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \
415 nigel 91 PCRE_PARTIAL|PCRE_DFA_SHORTEST|PCRE_DFA_RESTART|PCRE_NEWLINE_CR| \
416     PCRE_NEWLINE_LF)
417 nigel 77
418     #define PUBLIC_STUDY_OPTIONS 0 /* None defined */
419    
420     /* Magic number to provide a small check against being handed junk. Also used
421     to detect whether a pattern was compiled on a host of different endianness. */
422    
423     #define MAGIC_NUMBER 0x50435245UL /* 'PCRE' */
424    
425     /* Negative values for the firstchar and reqchar variables */
426    
427     #define REQ_UNSET (-2)
428     #define REQ_NONE (-1)
429    
430     /* The maximum remaining length of subject we are prepared to search for a
431     req_byte match. */
432    
433     #define REQ_BYTE_MAX 1000
434    
435     /* Flags added to firstbyte or reqbyte; a "non-literal" item is either a
436     variable-length repeat, or anything other than literal characters. */
437    
438     #define REQ_CASELESS 0x0100 /* indicates caselessness */
439     #define REQ_VARY 0x0200 /* reqbyte followed non-literal item */
440    
441     /* Miscellaneous definitions */
442    
443     typedef int BOOL;
444    
445     #define FALSE 0
446     #define TRUE 1
447    
448     /* Escape items that are just an encoding of a particular data value. Note that
449     ESC_n is defined as yet another macro, which is set in config.h to either \n
450     (the default) or \r (which some people want). */
451    
452     #ifndef ESC_e
453     #define ESC_e 27
454     #endif
455    
456     #ifndef ESC_f
457     #define ESC_f '\f'
458     #endif
459    
460     #ifndef ESC_n
461     #define ESC_n NEWLINE
462     #endif
463    
464     #ifndef ESC_r
465     #define ESC_r '\r'
466     #endif
467    
468     /* We can't officially use ESC_t because it is a POSIX reserved identifier
469     (presumably because of all the others like size_t). */
470    
471     #ifndef ESC_tee
472     #define ESC_tee '\t'
473     #endif
474    
475 nigel 87 /* Codes for different types of Unicode property */
476    
477     #define PT_ANY 0 /* Any property - matches all chars */
478     #define PT_LAMP 1 /* L& - the union of Lu, Ll, Lt */
479     #define PT_GC 2 /* General characteristic (e.g. L) */
480     #define PT_PC 3 /* Particular characteristic (e.g. Lu) */
481     #define PT_SC 4 /* Script (e.g. Han) */
482    
483     /* Flag bits and data types for the extended class (OP_XCLASS) for classes that
484     contain UTF-8 characters with values greater than 255. */
485    
486     #define XCL_NOT 0x01 /* Flag: this is a negative class */
487     #define XCL_MAP 0x02 /* Flag: a 32-byte map is present */
488    
489     #define XCL_END 0 /* Marks end of individual items */
490     #define XCL_SINGLE 1 /* Single item (one multibyte char) follows */
491     #define XCL_RANGE 2 /* A range (two multibyte chars) follows */
492     #define XCL_PROP 3 /* Unicode property (2-byte property code follows) */
493     #define XCL_NOTPROP 4 /* Unicode inverted property (ditto) */
494    
495 nigel 77 /* These are escaped items that aren't just an encoding of a particular data
496     value such as \n. They must have non-zero values, as check_escape() returns
497     their negation. Also, they must appear in the same order as in the opcode
498     definitions below, up to ESC_z. There's a dummy for OP_ANY because it
499     corresponds to "." rather than an escape sequence. The final one must be
500     ESC_REF as subsequent values are used for \1, \2, \3, etc. There are two
501     tests in the code for an escape greater than ESC_b and less than ESC_Z to
502     detect the types that may be repeated. These are the types that consume
503     characters. If any new escapes are put in between that don't consume a
504     character, that code will have to change. */
505    
/* Escape codes, numbered from 1. The entries up to ESC_z line up, one for
one, with the opcodes OP_SOD .. OP_EOD. ESC_REF must stay last. */

enum {
  ESC_A = 1,   /* pairs with OP_SOD */
  ESC_G,       /* pairs with OP_SOM */
  ESC_B,       /* pairs with OP_NOT_WORD_BOUNDARY */
  ESC_b,       /* pairs with OP_WORD_BOUNDARY */
  ESC_D,       /* pairs with OP_NOT_DIGIT */
  ESC_d,       /* pairs with OP_DIGIT */
  ESC_S,       /* pairs with OP_NOT_WHITESPACE */
  ESC_s,       /* pairs with OP_WHITESPACE */
  ESC_W,       /* pairs with OP_NOT_WORDCHAR */
  ESC_w,       /* pairs with OP_WORDCHAR */
  ESC_dum1,    /* dummy: holds the slot that pairs with OP_ANY */
  ESC_C,       /* pairs with OP_ANYBYTE */
  ESC_P,       /* pairs with OP_NOTPROP */
  ESC_p,       /* pairs with OP_PROP */
  ESC_X,       /* pairs with OP_EXTUNI */
  ESC_Z,       /* pairs with OP_EODN */
  ESC_z,       /* pairs with OP_EOD */
  ESC_E,
  ESC_Q,
  ESC_REF      /* must be last: later values stand for \1, \2, \3, etc. */
};
509    
510     /* Opcode table: OP_BRA must be last, as all values >= it are used for brackets
511     that extract substrings. Starting from 1 (i.e. after OP_END), the values up to
512     OP_EOD must correspond in order to the list of escapes immediately above.
513     Note that whenever this list is updated, the two macro definitions that follow
514     must also be updated to match. */
515    
/* The opcodes. The number at the start of each comment is the value of the
enumerator; the values follow from the order of declaration, starting at
zero. */

enum {
  OP_END = 0,            /* 0 End of pattern */

  /* Values corresponding to backslashed metacharacters */

  OP_SOD,                /* 1 Start of data: \A */
  OP_SOM,                /* 2 Start of match (subject + offset): \G */
  OP_NOT_WORD_BOUNDARY,  /* 3 \B */
  OP_WORD_BOUNDARY,      /* 4 \b */
  OP_NOT_DIGIT,          /* 5 \D */
  OP_DIGIT,              /* 6 \d */
  OP_NOT_WHITESPACE,     /* 7 \S */
  OP_WHITESPACE,         /* 8 \s */
  OP_NOT_WORDCHAR,       /* 9 \W */
  OP_WORDCHAR,           /* 10 \w */
  OP_ANY,                /* 11 Match any character */
  OP_ANYBYTE,            /* 12 Match any byte (\C); different to OP_ANY for UTF-8 */
  OP_NOTPROP,            /* 13 \P (not Unicode property) */
  OP_PROP,               /* 14 \p (Unicode property) */
  OP_EXTUNI,             /* 15 \X (extended Unicode sequence) */
  OP_EODN,               /* 16 End of data or \n at end of data: \Z. */
  OP_EOD,                /* 17 End of data: \z */

  OP_OPT,                /* 18 Set runtime options */
  OP_CIRC,               /* 19 Start of line - varies with multiline switch */
  OP_DOLL,               /* 20 End of line - varies with multiline switch */
  OP_CHAR,               /* 21 Match one character, casefully */
  OP_CHARNC,             /* 22 Match one character, caselessly */
  OP_NOT,                /* 23 Match one character, not the following one */

  /* Repeats of a single character. In this set and the three that follow,
  the maximizing and minimizing versions of each opcode must come in pairs,
  with the minimizing one second. */

  OP_STAR,               /* 24 */
  OP_MINSTAR,            /* 25 */
  OP_PLUS,               /* 26 */
  OP_MINPLUS,            /* 27 */
  OP_QUERY,              /* 28 */
  OP_MINQUERY,           /* 29 */
  OP_UPTO,               /* 30 From 0 to n matches */
  OP_MINUPTO,            /* 31 */
  OP_EXACT,              /* 32 Exactly n matches */

  /* Repeats of a "not" single character */

  OP_NOTSTAR,            /* 33 */
  OP_NOTMINSTAR,         /* 34 */
  OP_NOTPLUS,            /* 35 */
  OP_NOTMINPLUS,         /* 36 */
  OP_NOTQUERY,           /* 37 */
  OP_NOTMINQUERY,        /* 38 */
  OP_NOTUPTO,            /* 39 From 0 to n matches */
  OP_NOTMINUPTO,         /* 40 */
  OP_NOTEXACT,           /* 41 Exactly n matches */

  /* Repeats of a character type such as \d. These codes must be in exactly
  the same order as those above. */

  OP_TYPESTAR,           /* 42 */
  OP_TYPEMINSTAR,        /* 43 */
  OP_TYPEPLUS,           /* 44 */
  OP_TYPEMINPLUS,        /* 45 */
  OP_TYPEQUERY,          /* 46 */
  OP_TYPEMINQUERY,       /* 47 */
  OP_TYPEUPTO,           /* 48 From 0 to n matches */
  OP_TYPEMINUPTO,        /* 49 */
  OP_TYPEEXACT,          /* 50 Exactly n matches */

  /* Repeats of character classes and back references. The first six must be
  in exactly the same order as those above; the two range opcodes are
  different to the three sets above. */

  OP_CRSTAR,             /* 51 */
  OP_CRMINSTAR,          /* 52 */
  OP_CRPLUS,             /* 53 */
  OP_CRMINPLUS,          /* 54 */
  OP_CRQUERY,            /* 55 */
  OP_CRMINQUERY,         /* 56 */
  OP_CRRANGE,            /* 57 */
  OP_CRMINRANGE,         /* 58 */

  OP_CLASS,              /* 59 Match a character class, chars < 256 only */
  OP_NCLASS,             /* 60 Same, but the bitmap was created from a negative
                               class - the difference is relevant only when a
                               UTF-8 character > 255 is encountered. */

  OP_XCLASS,             /* 61 Extended class for handling UTF-8 chars within
                               the class. This does both positive and
                               negative. */

  OP_REF,                /* 62 Match a back reference */
  OP_RECURSE,            /* 63 Match a numbered subpattern (possibly recursive) */
  OP_CALLOUT,            /* 64 Call out to external function if provided */

  OP_ALT,                /* 65 Start of alternation */
  OP_KET,                /* 66 End of group that doesn't have an unbounded repeat */
  OP_KETRMAX,            /* 67 These two must remain together and in this */
  OP_KETRMIN,            /* 68 order. They are for groups that repeat for ever. */

  /* The assertions must come before ONCE and COND */

  OP_ASSERT,             /* 69 Positive lookahead */
  OP_ASSERT_NOT,         /* 70 Negative lookahead */
  OP_ASSERTBACK,         /* 71 Positive lookbehind */
  OP_ASSERTBACK_NOT,     /* 72 Negative lookbehind */
  OP_REVERSE,            /* 73 Move pointer back - used in lookbehind assertions */

  /* ONCE and COND must come after the assertions, with ONCE first, as there's
  a test for >= ONCE for a subpattern that isn't an assertion. */

  OP_ONCE,               /* 74 Once matched, don't back up into the subpattern */
  OP_COND,               /* 75 Conditional group */
  OP_CREF,               /* 76 Used to hold an extraction string number (cond ref) */

  OP_BRAZERO,            /* 77 These two must remain together and in this */
  OP_BRAMINZERO,         /* 78 order. */

  OP_BRANUMBER,          /* 79 Used for extracting brackets whose number is
                               greater than can fit into an opcode. */

  OP_BRA                 /* 80 This and greater values are used for brackets
                               that extract substrings up to EXTRACT_BASIC_MAX.
                               After that, use is made of OP_BRANUMBER. */
};
627    
628     /* WARNING WARNING WARNING: There is an implicit assumption in pcre.c and
629     study.c that all opcodes are less than 128 in value. This makes handling UTF-8
630     character sequences easier. */
631    
632     /* The highest extraction number before we have to start using additional
633     bytes. (Originally PCRE didn't have support for extraction counts higher than
634     this number.) The value is limited by the number of opcodes left after OP_BRA,
635     i.e. 255 - OP_BRA. We actually set it a bit lower to leave room for additional
636     opcodes. */
637    
638     #define EXTRACT_BASIC_MAX 100
639    
640    
641     /* This macro defines textual names for all the opcodes. These are used only
642     for debugging. The macro is referenced only in pcre_printint.c. */
643    
/* Textual names for the opcodes, one string per opcode, in opcode order
(OP_END first, OP_BRA last). The rows below follow the grouping of the
opcode list. */

#define OP_NAME_LIST \
  "End", \
  "\\A", "\\G", "\\B", "\\b", "\\D", "\\d", "\\S", "\\s", "\\W", "\\w", \
  "Any", "Anybyte", "notprop", "prop", "extuni", "\\Z", "\\z", \
  "Opt", "^", "$", "char", "charnc", "not", \
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",  /* single character */ \
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",  /* "not" character */ \
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",  /* character type */ \
  "*", "*?", "+", "+?", "?", "??", "{", "{",       /* class and back ref */ \
  "class", "nclass", "xclass", \
  "Ref", "Recurse", "Callout", \
  "Alt", "Ket", "KetRmax", "KetRmin", \
  "Assert", "Assert not", "AssertB", "AssertB not", "Reverse", \
  "Once", "Cond", "Cond ref", \
  "Brazero", "Braminzero", "Branumber", "Bra"
658    
659    
660     /* This macro defines the length of fixed length operations in the compiled
661     regex. The lengths are used when searching for specific things, and also in the
662     debugging printing of a compiled regex. We use a macro so that it can be
663     defined close to the definitions of the opcodes themselves.
664    
665     As things have been extended, some of these are no longer fixed lengths, but are
666     minima instead. For example, the length of a single-character repeat may vary
667     in UTF-8 mode. The code that uses this table must know about such things. */
668    
/* One length per opcode, in opcode order (OP_END first, OP_BRA last). Entries
that mention LINK_SIZE change with the configured link size. The final entry
carries no line continuation, so the macro ends on its own last line. */

#define OP_LENGTHS \
  1,                             /* End */ \
  1, 1, 1, 1, 1,                 /* \A, \G, \B, \b, \D */ \
  1, 1, 1, 1, 1,                 /* \d, \S, \s, \W, \w */ \
  1, 1,                          /* Any, Anybyte */ \
  3, 3, 1,                       /* NOTPROP, PROP, EXTUNI */ \
  1, 1,                          /* \Z, \z */ \
  2, 1, 1,                       /* Opt, ^, $ */ \
  2,                             /* Char - the minimum length */ \
  2,                             /* Charnc - the minimum length */ \
  2,                             /* not */ \
  /* Positive single-char repeats - these are minima in UTF-8 mode */ \
  2, 2, 2, 2, 2, 2,              /* *, *?, +, +?, ?, ?? */ \
  4, 4, 4,                       /* upto, minupto, exact */ \
  /* Negative single-char repeats - only for chars < 256 */ \
  2, 2, 2, 2, 2, 2,              /* NOT *, *?, +, +?, ?, ?? */ \
  4, 4, 4,                       /* NOT upto, minupto, exact */ \
  /* Positive type repeats */ \
  2, 2, 2, 2, 2, 2,              /* Type *, *?, +, +?, ?, ?? */ \
  4, 4, 4,                       /* Type upto, minupto, exact */ \
  /* Character class & ref repeats */ \
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ?? */ \
  5, 5,                          /* CRRANGE, CRMINRANGE */ \
  33, 33,                        /* CLASS, NCLASS */ \
  0,                             /* XCLASS - variable length */ \
  3,                             /* REF */ \
  1+LINK_SIZE,                   /* RECURSE */ \
  2+2*LINK_SIZE,                 /* CALLOUT */ \
  1+LINK_SIZE, 1+LINK_SIZE,      /* Alt, Ket */ \
  1+LINK_SIZE, 1+LINK_SIZE,      /* KetRmax, KetRmin */ \
  1+LINK_SIZE, 1+LINK_SIZE,      /* Assert, Assert not */ \
  1+LINK_SIZE, 1+LINK_SIZE,      /* Assert behind, Assert behind not */ \
  1+LINK_SIZE,                   /* Reverse */ \
  1+LINK_SIZE,                   /* Once */ \
  1+LINK_SIZE,                   /* COND */ \
  3,                             /* CREF */ \
  1, 1,                          /* BRAZERO, BRAMINZERO */ \
  3,                             /* BRANUMBER */ \
  1+LINK_SIZE                    /* BRA */
711    
712    
713     /* A magic value for OP_CREF to indicate the "in recursion" condition. */
714    
715     #define CREF_RECURSE 0xffff
716    
717     /* Error code numbers. They are given names so that they can more easily be
718     tracked. */
719    
/* Named error numbers: ERRn has the value n. */

enum {
  ERR0 = 0,
         ERR1,  ERR2,  ERR3,  ERR4,  ERR5,  ERR6,  ERR7,  ERR8,  ERR9,
  ERR10, ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19,
  ERR20, ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29,
  ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39,
  ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49,
  ERR50, ERR51
};
726 nigel 77
727     /* The real format of the start of the pcre block; the index of names and the
728     code vector run on as long as necessary after the end. We store an explicit
729     offset to the name table so that if a regex is compiled on one host, saved, and
730     then run on another where the size of pointers is different, all might still
731     be well. For the case of compiled-on-4 and run-on-8, we include an extra
732     pointer that is always NULL. For future-proofing, a few dummy fields were
733     originally included - even though you can never get this planning right - but
734     there is only one left now.
735    
736     NOTE NOTE NOTE:
737     Because people can now save and re-use compiled patterns, any additions to this
738     structure should be made at the end, and something earlier (e.g. a new
739     flag in the options or the remaining dummy field) should indicate that the new
740     fields are present. Currently PCRE always sets the dummy field to zero.
741     NOTE NOTE NOTE:
742     */
743    
744     typedef struct real_pcre { /* Fixed header of a compiled pattern; the name table and code follow it */
745     pcre_uint32 magic_number;
746     pcre_uint32 size; /* Total size that was malloced */
747     pcre_uint32 options;
748     pcre_uint32 dummy1; /* Reserved for future use; currently always set to zero */
749    
750     pcre_uint16 top_bracket;
751     pcre_uint16 top_backref;
752     pcre_uint16 first_byte;
753     pcre_uint16 req_byte;
754     pcre_uint16 name_table_offset; /* Offset to the name table that follows; explicit so it survives a change of pointer size */
755     pcre_uint16 name_entry_size; /* Size of each name item */
756     pcre_uint16 name_count; /* Number of name items */
757     pcre_uint16 ref_count; /* Reference count */
758    
759     const unsigned char *tables; /* Pointer to tables or NULL for std */
760     const unsigned char *nullpad; /* Always NULL; padding for the compiled-on-4, run-on-8 pointer size case */
761     } real_pcre;
762    
763     /* The format of the block used to store data from pcre_study(). The same
764     remark (see NOTE above) about extending this structure applies. */
765    
766     typedef struct pcre_study_data { /* Block that stores the data from pcre_study() */
767     pcre_uint32 size; /* Total size that was malloced */
768     pcre_uint32 options;
769     uschar start_bits[32]; /* NOTE(review): 32 entries - presumably a 256-bit map, one bit per byte value; confirm */
770     } pcre_study_data;
771    
772     /* Structure for passing "static" information around between the functions
773     doing the compiling, so that they are thread-safe. */
774    
775     typedef struct compile_data { /* Compile-time state handed between functions instead of being held in statics */
776     const uschar *lcc; /* Points to lower casing table */
777     const uschar *fcc; /* Points to case-flipping table */
778     const uschar *cbits; /* Points to character type table */
779     const uschar *ctypes; /* Points to table of type maps */
780     const uschar *start_code; /* The start of the compiled code */
781     const uschar *start_pattern; /* The start of the pattern */
782     uschar *name_table; /* The name/number table */
783     int names_found; /* Number of name table entries so far */
784     int name_entry_size; /* Size of each name table entry */
785     int top_backref; /* Maximum back reference */
786     unsigned int backref_map; /* Bitmap of low back refs; holds at most as many as unsigned int has bits */
787     int req_varyopt; /* "After variable item" flag for reqbyte */
788     BOOL nopartial; /* Set TRUE if partial won't work */
789 nigel 91 int nllen; /* Length of the newline string in nl: 1 or 2 */
790     uschar nl[4]; /* Newline string; nllen gives its length */
791 nigel 77 } compile_data;
792    
793     /* Structure for maintaining a chain of pointers to the currently incomplete
794     branches, for testing for left recursion. */
795    
796     typedef struct branch_chain { /* One link in the chain of currently incomplete branches */
797     struct branch_chain *outer; /* Next link in the chain; presumably the enclosing branch - TODO confirm */
798     uschar *current; /* Pointer to the incomplete branch recorded by this link */
799     } branch_chain;
800    
801     /* Structure for items in a linked list that represents an explicit recursive
802     call within the pattern. */
803    
804     typedef struct recursion_info { /* One list item per explicit recursive call; items are linked through prevrec */
805     struct recursion_info *prevrec; /* Previous recursion record (or NULL) */
806     int group_num; /* Number of group that was called */
807     const uschar *after_call; /* "Return value": points after the call in the expr */
808 nigel 87 USPTR save_start; /* Old value of md->start_match */
809 nigel 77 int *offset_save; /* Pointer to start of saved offsets */
810     int saved_max; /* Number of saved offsets */
811     } recursion_info;
812    
813     /* When compiling in a mode that doesn't use recursive calls to match(),
814     a structure is used to remember local variables on the heap. It is defined in
815 nigel 91 pcre_exec.c, close to the match() function, so that it is easy to keep it in
816     step with any changes of local variable. However, the pointer to the current
817     frame must be saved in some "static" place over a longjmp(). We declare the
818     structure here so that we can put a pointer in the match_data structure. NOTE:
819     This isn't used for a "normal" compilation of pcre. */
820 nigel 77
821     struct heapframe; /* Declaration only; the full definition is in pcre_exec.c */
822    
823     /* Structure for passing "static" information around between the functions
824     doing traditional NFA matching, so that they are thread-safe. */
825    
826     typedef struct match_data { /* Match-time state handed between functions instead of being held in statics */
827 nigel 87 unsigned long int match_call_count; /* Count of match calls, as the name says */
828     unsigned long int match_limit; /* Match limit, as the name says; presumably the cap on match_call_count - confirm */
829     unsigned long int match_limit_recursion; /* Recursion limit, as the name says; exact semantics not visible here */
830 nigel 77 int *offset_vector; /* Offset vector */
831     int offset_end; /* One past the end */
832     int offset_max; /* The maximum usable for return data */
833 nigel 91 int nllen; /* Length of the newline string in nl: 1 or 2 */
834     uschar nl[4]; /* Newline string; nllen gives its length */
835 nigel 77 const uschar *lcc; /* Points to lower casing table */
836     const uschar *ctypes; /* Points to table of type maps */
837     BOOL offset_overflow; /* Set if too many extractions */
838     BOOL notbol; /* NOTBOL flag */
839     BOOL noteol; /* NOTEOL flag */
840     BOOL utf8; /* UTF8 flag */
841     BOOL endonly; /* Dollar not before final \n */
842     BOOL notempty; /* Empty string match not wanted */
843     BOOL partial; /* PARTIAL flag */
844     BOOL hitend; /* Hit the end of the subject at some point */
845     const uschar *start_code; /* For use when recursing */
846 nigel 87 USPTR start_subject; /* Start of the subject string */
847     USPTR end_subject; /* End of the subject string */
848     USPTR start_match; /* Start of this match attempt */
849     USPTR end_match_ptr; /* Subject position at end match */
850 nigel 77 int end_offset_top; /* Highwater mark at end of match */
851     int capture_last; /* Most recent capture number */
852     int start_offset; /* The start offset value */
853     recursion_info *recursive; /* Linked list of recursion data */
854     void *callout_data; /* To pass back to callouts */
855     struct heapframe *thisframe; /* Current heap frame; used only when compiling for no recursion */
856     } match_data;
857    
858     /* A similar structure is used for the same purpose by the DFA matching
859     functions. */
860    
861     typedef struct dfa_match_data { /* State handed between the DFA matching functions */
862     const uschar *start_code; /* Start of the compiled pattern */
863     const uschar *start_subject; /* Start of the subject string */
864     const uschar *end_subject; /* End of subject string */
865     const uschar *tables; /* Character tables */
866     int moptions; /* Match options */
867     int poptions; /* Pattern options */
868 nigel 91 int nllen; /* Length of the newline string in nl: 1 or 2 */
869     uschar nl[4]; /* Newline string; nllen gives its length */
870 nigel 77 void *callout_data; /* To pass back to callouts */
871     } dfa_match_data;
872    
873     /* Bit definitions for entries in the pcre_ctypes table. */
874    
875     #define ctype_space 0x01
876     #define ctype_letter 0x02
877     #define ctype_digit 0x04
878     #define ctype_xdigit 0x08
879     #define ctype_word 0x10 /* alphanumeric or '_' */
880     #define ctype_meta 0x80 /* regexp meta char or zero (end pattern); bits 0x20 and 0x40 are not assigned here */
881    
882     /* Offsets for the bitmap tables in pcre_cbits. Each table contains a set
883     of bits for a class map. Some classes are built by combining these tables. */
884    
885     #define cbit_space 0 /* [:space:] or \s */
886     #define cbit_xdigit 32 /* [:xdigit:] */
887     #define cbit_digit 64 /* [:digit:] or \d */
888     #define cbit_upper 96 /* [:upper:] */
889     #define cbit_lower 128 /* [:lower:] */
890     #define cbit_word 160 /* [:word:] or \w */
891     #define cbit_graph 192 /* [:graph:] */
892     #define cbit_print 224 /* [:print:] */
893     #define cbit_punct 256 /* [:punct:] */
894     #define cbit_cntrl 288 /* [:cntrl:] */
895     #define cbit_length 320 /* Length of the cbits table: the 10 maps above, spaced 32 apart */
896    
897     /* Offsets of the various tables from the base tables pointer, and
898     total length. */
899    
900     #define lcc_offset 0 /* First table starts at the base tables pointer */
901     #define fcc_offset 256 /* 256 past lcc_offset */
902     #define cbits_offset 512 /* 256 past fcc_offset */
903     #define ctypes_offset (cbits_offset + cbit_length) /* 512 + 320 = 832 */
904     #define tables_length (ctypes_offset + 256) /* 832 + 256 = 1088 in total */
905    
906 nigel 87 /* Layout of the UCP type table that translates property names into types and
907     codes. */
908 nigel 77
909     typedef struct { /* One entry of the table translating a property name into a type and code */
910     const char *name; /* Property name */
911 nigel 87 pcre_uint16 type; /* Type the name translates to */
912     pcre_uint16 value; /* Presumably the code the name translates to - confirm */
913 nigel 77 } ucp_type_table;
914    
915    
916     /* Internal shared data tables. These are tables that are used by more than one
917     of the exported public functions. They have to be "external" in the C sense,
918     but are not part of the PCRE public API. The data for these tables is in the
919     pcre_tables.c module. */
920    
921     extern const int _pcre_utf8_table1[];
922     extern const int _pcre_utf8_table2[];
923     extern const int _pcre_utf8_table3[];
924     extern const uschar _pcre_utf8_table4[];
925    
926     extern const int _pcre_utf8_table1_size; /* Presumably the number of entries in _pcre_utf8_table1 - confirm in pcre_tables.c */
927    
928     extern const ucp_type_table _pcre_utt[]; /* Array of ucp_type_table entries */
929     extern const int _pcre_utt_size; /* Presumably the number of entries in _pcre_utt - confirm in pcre_tables.c */
930    
931     extern const uschar _pcre_default_tables[];
932    
933     extern const uschar _pcre_OP_lengths[]; /* NOTE(review): looks like the per-opcode length table - confirm */
934    
935    
936     /* Internal shared functions. These are functions that are used by more than
937     one of the exported public functions. They have to be "external" in the C
938     sense, but are not part of the PCRE public API. */
939    
940     extern int _pcre_ord2utf8(int, uschar *); /* NOTE(review): the non-const uschar * is presumably an output buffer - confirm */
941     extern real_pcre * _pcre_try_flipped(const real_pcre *, real_pcre *,
942     const pcre_study_data *, pcre_study_data *); /* Each const block is paired with a writable one of the same type */
943 nigel 91 extern int _pcre_ucp_findprop(const unsigned int, int *, int *); /* NOTE(review): the two int * are presumably outputs - confirm */
944 nigel 87 extern int _pcre_ucp_othercase(const int);
945 nigel 77 extern int _pcre_valid_utf8(const uschar *, int); /* NOTE(review): the int is presumably a length for the string - confirm */
946     extern BOOL _pcre_xclass(int, const uschar *);
947    
948 nigel 85 #endif
949    
950 nigel 77 /* End of pcre_internal.h */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12