/[pcre]/code/trunk/pcre.c
ViewVC logotype

Contents of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 51 - (hide annotations) (download)
Sat Feb 24 21:39:37 2007 UTC (6 years, 2 months ago) by nigel
File MIME type: text/plain
File size: 152081 byte(s)
Load pcre-3.4 into code/trunk.

1 nigel 3 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /*
6     This is a library of functions to support regular expressions whose syntax
7     and semantics are as close as possible to those of the Perl 5 language. See
8     the file Tech.Notes for some information on the internals.
9    
10     Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12 nigel 43 Copyright (c) 1997-2000 University of Cambridge
13 nigel 3
14     -----------------------------------------------------------------------------
15     Permission is granted to anyone to use this software for any purpose on any
16     computer system, and to redistribute it freely, subject to the following
17     restrictions:
18    
19     1. This software is distributed in the hope that it will be useful,
20     but WITHOUT ANY WARRANTY; without even the implied warranty of
21     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22    
23     2. The origin of this software must not be misrepresented, either by
24     explicit claim or by omission.
25    
26     3. Altered versions must be plainly marked as such, and must not be
27     misrepresented as being the original software.
28 nigel 29
29     4. If PCRE is embedded in any software that is released under the GNU
30     General Purpose Licence (GPL), then the terms of that licence shall
31     supersede any condition above with which it is incompatible.
32 nigel 3 -----------------------------------------------------------------------------
33     */
34    
35    
36     /* Define DEBUG to get debugging output on stdout. */
37    
38     /* #define DEBUG */
39    
40 nigel 23 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
41     inline, and there are *still* stupid compilers about that don't like indented
42     pre-processor statements. I suppose it's only been 10 years... */
43 nigel 3
44 nigel 9 #ifdef DEBUG
45     #define DPRINTF(p) printf p
46     #else
47     #define DPRINTF(p) /*nothing*/
48     #endif
49    
50 nigel 3 /* Include the internals header, which itself includes Standard C headers plus
51     the external pcre header. */
52    
53     #include "internal.h"
54    
55    
56 nigel 15 /* Allow compilation as C++ source code, should anybody want to do that. */
57    
58     #ifdef __cplusplus
59     #define class pcre_class
60     #endif
61    
62    
63 nigel 23 /* Number of items on the nested bracket stacks at compile time. This should
64     not be set greater than 200. */
65    
66     #define BRASTACK_SIZE 200
67    
68    
69 nigel 49 /* The number of bytes in a literal character string above which we can't add
70     any more is different when UTF-8 characters may be encountered. */
71    
72     #ifdef SUPPORT_UTF8
73     #define MAXLIT 250
74     #else
75     #define MAXLIT 255
76     #endif
77    
78    
79 nigel 3 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
80    
81 nigel 15 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
82     static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
83 nigel 3
84 nigel 13 /* Text forms of OP_ values and things, for debugging (not all used) */
85 nigel 3
86     #ifdef DEBUG
87 nigel 7 static const char *OP_names[] = {
88     "End", "\\A", "\\B", "\\b", "\\D", "\\d",
89 nigel 23 "\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",
90     "Opt", "^", "$", "Any", "chars", "not",
91 nigel 3 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
92     "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
93     "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
94     "*", "*?", "+", "+?", "?", "??", "{", "{",
95 nigel 43 "class", "Ref", "Recurse",
96 nigel 23 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
97     "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
98 nigel 3 "Brazero", "Braminzero", "Bra"
99     };
100     #endif
101    
102     /* Table for handling escaped characters in the range '0'-'z'. Positive returns
103     are simple data values; negative values are for special things like \d and so
104     on. Zero means further processing is needed (for things like \x), or the escape
105     is invalid. */
106    
107 nigel 15 static const short int escapes[] = {
108 nigel 3 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
109     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
110     '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
111     0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
112     0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
113     0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
114     '`', 7, -ESC_b, 0, -ESC_d, 27, '\f', 0, /* ` - g */
115     0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
116     0, 0, '\r', -ESC_s, '\t', 0, 0, -ESC_w, /* p - w */
117 nigel 23 0, 0, -ESC_z /* x - z */
118 nigel 3 };
119    
120 nigel 43 /* Tables of names of POSIX character classes and their lengths. The list is
121     terminated by a zero length entry. The first three must be alpha, upper, lower,
122     as this is assumed for handling case independence. */
123    
124     static const char *posix_names[] = {
125     "alpha", "lower", "upper",
126     "alnum", "ascii", "cntrl", "digit", "graph",
127     "print", "punct", "space", "word", "xdigit" };
128    
129     static const uschar posix_name_lengths[] = {
130     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
131    
132     /* Table of class bit maps for each POSIX class; up to three may be combined
133     to form the class. */
134    
135     static const int posix_class_maps[] = {
136     cbit_lower, cbit_upper, -1, /* alpha */
137     cbit_lower, -1, -1, /* lower */
138     cbit_upper, -1, -1, /* upper */
139     cbit_digit, cbit_lower, cbit_upper, /* alnum */
140     cbit_print, cbit_cntrl, -1, /* ascii */
141     cbit_cntrl, -1, -1, /* cntrl */
142     cbit_digit, -1, -1, /* digit */
143     cbit_graph, -1, -1, /* graph */
144     cbit_print, -1, -1, /* print */
145     cbit_punct, -1, -1, /* punct */
146     cbit_space, -1, -1, /* space */
147     cbit_word, -1, -1, /* word */
148     cbit_xdigit,-1, -1 /* xdigit */
149     };
150    
151    
152 nigel 3 /* Definition to allow mutual recursion */
153    
154 nigel 13 static BOOL
155 nigel 23 compile_regex(int, int, int *, uschar **, const uschar **, const char **,
156 nigel 37 BOOL, int, int *, int *, compile_data *);
157 nigel 3
158 nigel 47 /* Structure for building a chain of data that actually lives on the
159     stack, for holding the values of the subject pointer at the start of each
160     subpattern, so as to detect when an empty string has been matched by a
161     subpattern - to break infinite loops. */
162 nigel 3
163 nigel 47 typedef struct eptrblock {
164     struct eptrblock *prev;
165     const uschar *saved_eptr;
166     } eptrblock;
167 nigel 3
168 nigel 47 /* Flag bits for the match() function */
169    
170     #define match_condassert 0x01 /* Called to check a condition assertion */
171     #define match_isgroup 0x02 /* Set if start of bracketed group */
172    
173    
174    
175 nigel 3 /*************************************************
176     * Global variables *
177     *************************************************/
178    
179     /* PCRE is thread-clean and doesn't use any global variables in the normal
180     sense. However, it calls memory allocation and free functions via the two
181     indirections below, which are can be changed by the caller, but are shared
182     between all threads. */
183    
184     void *(*pcre_malloc)(size_t) = malloc;
185     void (*pcre_free)(void *) = free;
186    
187    
188    
189 nigel 49 /*************************************************
190     * Macros and tables for character handling *
191     *************************************************/
192 nigel 3
193 nigel 49 /* When UTF-8 encoding is being used, a character is no longer just a single
194     byte. The macros for character handling generate simple sequences when used in
195     byte-mode, and more complicated ones for UTF-8 characters. */
196    
197     #ifndef SUPPORT_UTF8
198     #define GETCHARINC(c, eptr) c = *eptr++;
199     #define GETCHARLEN(c, eptr, len) c = *eptr;
200     #define BACKCHAR(eptr)
201    
202     #else /* SUPPORT_UTF8 */
203    
204     /* Get the next UTF-8 character, advancing the pointer */
205    
206     #define GETCHARINC(c, eptr) \
207     c = *eptr++; \
208     if (md->utf8 && (c & 0xc0) == 0xc0) \
209     { \
210     int a = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
211     int s = 6 - a; /* Amount to shift next byte */ \
212     c &= utf8_table3[a]; /* Low order bits from first byte */ \
213     while (a-- > 0) \
214     { \
215     c |= (*eptr++ & 0x3f) << s; \
216     s += 6; \
217     } \
218     }
219    
220     /* Get the next UTF-8 character, not advancing the pointer, setting length */
221    
222     #define GETCHARLEN(c, eptr, len) \
223     c = *eptr; \
224     len = 1; \
225     if (md->utf8 && (c & 0xc0) == 0xc0) \
226     { \
227     int i; \
228     int a = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
229     int s = 6 - a; /* Amount to shift next byte */ \
230     c &= utf8_table3[a]; /* Low order bits from first byte */ \
231     for (i = 1; i <= a; i++) \
232     { \
233     c |= (eptr[i] & 0x3f) << s; \
234     s += 6; \
235     } \
236     len += a; \
237     }
238    
239     /* If the pointer is not at the start of a character, move it back until
240     it is. */
241    
242     #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
243    
244     #endif
245    
246    
247    
248 nigel 3 /*************************************************
249 nigel 25 * Default character tables *
250     *************************************************/
251    
252     /* A default set of character tables is included in the PCRE binary. Its source
253     is built by the maketables auxiliary program, which uses the default C ctypes
254     functions, and put in the file chartables.c. These tables are used by PCRE
255     whenever the caller of pcre_compile() does not provide an alternate set of
256     tables. */
257    
258     #include "chartables.c"
259    
260    
261    
262 nigel 49 #ifdef SUPPORT_UTF8
263 nigel 25 /*************************************************
264 nigel 49 * Tables for UTF-8 support *
265     *************************************************/
266    
267     /* These are the breakpoints for different numbers of bytes in a UTF-8
268     character. */
269    
270     static int utf8_table1[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
271    
272     /* These are the indicator bits and the mask for the data bits to set in the
273     first byte of a character, indexed by the number of additional bytes. */
274    
275     static int utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
276     static int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
277    
278     /* Table of the number of extra characters, indexed by the first character
279     masked with 0x3f. The highest number for a valid UTF-8 character is in fact
280     0x3d. */
281    
282     static uschar utf8_table4[] = {
283     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
284     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
285     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
286     3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
287    
288    
289     /*************************************************
290     * Convert character value to UTF-8 *
291     *************************************************/
292    
293     /* This function takes an integer value in the range 0 - 0x7fffffff
294     and encodes it as a UTF-8 character in 0 to 6 bytes.
295    
296     Arguments:
297     cvalue the character value
298     buffer pointer to buffer for result - at least 6 bytes long
299    
300     Returns: number of characters placed in the buffer
301     */
302    
303     static int
304     ord2utf8(int cvalue, uschar *buffer)
305     {
306     register int i, j;
307     for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
308     if (cvalue <= utf8_table1[i]) break;
309     *buffer++ = utf8_table2[i] | (cvalue & utf8_table3[i]);
310     cvalue >>= 6 - i;
311     for (j = 0; j < i; j++)
312     {
313     *buffer++ = 0x80 | (cvalue & 0x3f);
314     cvalue >>= 6;
315     }
316     return i + 1;
317     }
318     #endif
319    
320    
321    
322     /*************************************************
323 nigel 3 * Return version string *
324     *************************************************/
325    
326 nigel 39 #define STRING(a) # a
327     #define XSTRING(s) STRING(s)
328    
329 nigel 7 const char *
330 nigel 3 pcre_version(void)
331     {
332 nigel 39 return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
333 nigel 3 }
334    
335    
336    
337    
338     /*************************************************
339 nigel 43 * (Obsolete) Return info about compiled pattern *
340 nigel 3 *************************************************/
341    
342 nigel 43 /* This is the original "info" function. It picks potentially useful data out
343     of the private structure, but its interface was too rigid. It remains for
344     backwards compatibility. The public options are passed back in an int - though
345     the re->options field has been expanded to a long int, all the public options
346 nigel 37 at the low end of it, and so even on 16-bit systems this will still be OK.
347     Therefore, I haven't changed the API for pcre_info().
348 nigel 3
349     Arguments:
350     external_re points to compiled code
351     optptr where to pass back the options
352     first_char where to pass back the first character,
353     or -1 if multiline and all branches start ^,
354     or -2 otherwise
355    
356 nigel 43 Returns: number of capturing subpatterns
357 nigel 3 or negative values on error
358     */
359    
360     int
361     pcre_info(const pcre *external_re, int *optptr, int *first_char)
362     {
363 nigel 7 const real_pcre *re = (const real_pcre *)external_re;
364 nigel 3 if (re == NULL) return PCRE_ERROR_NULL;
365     if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
366 nigel 37 if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
367 nigel 3 if (first_char != NULL)
368     *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
369     ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
370     return re->top_bracket;
371     }
372    
373    
374    
375 nigel 43 /*************************************************
376     * Return info about compiled pattern *
377     *************************************************/
378 nigel 3
379 nigel 43 /* This is a newer "info" function which has an extensible interface so
380     that additional items can be added compatibly.
381    
382     Arguments:
383     external_re points to compiled code
384     external_study points to study data, or NULL
385     what what information is required
386     where where to put the information
387    
388     Returns: 0 if data returned, negative on error
389     */
390    
391     int
392     pcre_fullinfo(const pcre *external_re, const pcre_extra *study_data, int what,
393     void *where)
394     {
395     const real_pcre *re = (const real_pcre *)external_re;
396     const real_pcre_extra *study = (const real_pcre_extra *)study_data;
397    
398     if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
399     if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
400    
401     switch (what)
402     {
403     case PCRE_INFO_OPTIONS:
404     *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
405     break;
406    
407     case PCRE_INFO_SIZE:
408     *((size_t *)where) = re->size;
409     break;
410    
411     case PCRE_INFO_CAPTURECOUNT:
412     *((int *)where) = re->top_bracket;
413     break;
414    
415     case PCRE_INFO_BACKREFMAX:
416     *((int *)where) = re->top_backref;
417     break;
418    
419     case PCRE_INFO_FIRSTCHAR:
420     *((int *)where) =
421     ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
422     ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
423     break;
424    
425     case PCRE_INFO_FIRSTTABLE:
426     *((const uschar **)where) =
427     (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
428     study->start_bits : NULL;
429     break;
430    
431     case PCRE_INFO_LASTLITERAL:
432     *((int *)where) =
433     ((re->options & PCRE_REQCHSET) != 0)? re->req_char : -1;
434     break;
435    
436     default: return PCRE_ERROR_BADOPTION;
437     }
438    
439     return 0;
440     }
441    
442    
443    
444 nigel 3 #ifdef DEBUG
445     /*************************************************
446     * Debugging function to print chars *
447     *************************************************/
448    
449     /* Print a sequence of chars in printable format, stopping at the end of the
450     subject if the requested.
451    
452     Arguments:
453     p points to characters
454     length number to print
455     is_subject TRUE if printing from within md->start_subject
456     md pointer to matching data block, if is_subject is TRUE
457    
458     Returns: nothing
459     */
460    
461 nigel 9 static void
462     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
463 nigel 3 {
464     int c;
465     if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
466     while (length-- > 0)
467     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
468     }
469     #endif
470    
471    
472    
473    
474     /*************************************************
475     * Handle escapes *
476     *************************************************/
477    
478     /* This function is called when a \ has been encountered. It either returns a
479     positive value for a simple escape such as \n, or a negative value which
480 nigel 49 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
481     a positive value greater than 255 may be returned. On entry, ptr is pointing at
482     the \. On exit, it is on the final character of the escape sequence.
483 nigel 3
484     Arguments:
485     ptrptr points to the pattern position pointer
486     errorptr points to the pointer to the error message
487     bracount number of previous extracting brackets
488     options the options bits
489     isclass TRUE if inside a character class
490 nigel 25 cd pointer to char tables block
491 nigel 3
492     Returns: zero or positive => a data character
493     negative => a special escape sequence
494     on error, errorptr is set
495     */
496    
497     static int
498 nigel 7 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
499 nigel 25 int options, BOOL isclass, compile_data *cd)
500 nigel 3 {
501 nigel 7 const uschar *ptr = *ptrptr;
502 nigel 43 int c, i;
503 nigel 3
504 nigel 49 /* If backslash is at the end of the pattern, it's an error. */
505    
506     c = *(++ptr);
507 nigel 3 if (c == 0) *errorptr = ERR1;
508    
509     /* Digits or letters may have special meaning; all others are literals. */
510    
511     else if (c < '0' || c > 'z') {}
512    
513     /* Do an initial lookup in a table. A non-zero result is something that can be
514     returned immediately. Otherwise further processing may be required. */
515    
516     else if ((i = escapes[c - '0']) != 0) c = i;
517    
518     /* Escapes that need further processing, or are illegal. */
519    
520     else
521     {
522 nigel 7 const uschar *oldptr;
523 nigel 3 switch (c)
524     {
525     /* The handling of escape sequences consisting of a string of digits
526     starting with one that is not zero is not straightforward. By experiment,
527     the way Perl works seems to be as follows:
528    
529     Outside a character class, the digits are read as a decimal number. If the
530     number is less than 10, or if there are that many previous extracting
531     left brackets, then it is a back reference. Otherwise, up to three octal
532     digits are read to form an escaped byte. Thus \123 is likely to be octal
533     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
534     value is greater than 377, the least significant 8 bits are taken. Inside a
535     character class, \ followed by a digit is always an octal number. */
536    
537     case '1': case '2': case '3': case '4': case '5':
538     case '6': case '7': case '8': case '9':
539    
540     if (!isclass)
541     {
542     oldptr = ptr;
543     c -= '0';
544 nigel 25 while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
545 nigel 3 c = c * 10 + *(++ptr) - '0';
546     if (c < 10 || c <= bracount)
547     {
548     c = -(ESC_REF + c);
549     break;
550     }
551     ptr = oldptr; /* Put the pointer back and fall through */
552     }
553    
554     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
555     generates a binary zero byte and treats the digit as a following literal.
556     Thus we have to pull back the pointer by one. */
557    
558     if ((c = *ptr) >= '8')
559     {
560     ptr--;
561     c = 0;
562     break;
563     }
564    
565     /* \0 always starts an octal number, but we may drop through to here with a
566 nigel 49 larger first octal digit. */
567 nigel 3
568     case '0':
569     c -= '0';
570 nigel 25 while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
571 nigel 3 ptr[1] != '8' && ptr[1] != '9')
572     c = c * 8 + *(++ptr) - '0';
573 nigel 49 c &= 255; /* Take least significant 8 bits */
574 nigel 3 break;
575    
576 nigel 49 /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
577     which can be greater than 0xff, but only if the ddd are hex digits. */
578 nigel 3
579     case 'x':
580 nigel 49 #ifdef SUPPORT_UTF8
581     if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
582     {
583     const uschar *pt = ptr + 2;
584     register int count = 0;
585     c = 0;
586     while ((cd->ctypes[*pt] & ctype_xdigit) != 0)
587     {
588     count++;
589     c = c * 16 + cd->lcc[*pt] -
590     (((cd->ctypes[*pt] & ctype_digit) != 0)? '0' : 'W');
591     pt++;
592     }
593     if (*pt == '}')
594     {
595     if (c < 0 || count > 8) *errorptr = ERR34;
596     ptr = pt;
597     break;
598     }
599     /* If the sequence of hex digits does not end with '}', then we don't
600     recognize this construct; fall through to the normal \x handling. */
601     }
602     #endif
603    
604     /* Read just a single hex char */
605    
606 nigel 3 c = 0;
607 nigel 25 while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
608 nigel 3 {
609     ptr++;
610 nigel 25 c = c * 16 + cd->lcc[*ptr] -
611     (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
612 nigel 3 }
613     break;
614    
615 nigel 49 /* Other special escapes not starting with a digit are straightforward */
616    
617 nigel 3 case 'c':
618     c = *(++ptr);
619     if (c == 0)
620     {
621     *errorptr = ERR2;
622     return 0;
623     }
624    
625     /* A letter is upper-cased; then the 0x40 bit is flipped */
626    
627 nigel 25 if (c >= 'a' && c <= 'z') c = cd->fcc[c];
628 nigel 3 c ^= 0x40;
629     break;
630    
631     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
632     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
633 nigel 25 for Perl compatibility, it is a literal. This code looks a bit odd, but
634     there used to be some cases other than the default, and there may be again
635     in future, so I haven't "optimized" it. */
636 nigel 3
637     default:
638     if ((options & PCRE_EXTRA) != 0) switch(c)
639     {
640     default:
641     *errorptr = ERR3;
642     break;
643     }
644     break;
645     }
646     }
647    
648     *ptrptr = ptr;
649     return c;
650     }
651    
652    
653    
654     /*************************************************
655     * Check for counted repeat *
656     *************************************************/
657    
658     /* This function is called when a '{' is encountered in a place where it might
659     start a quantifier. It looks ahead to see if it really is a quantifier or not.
660     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
661     where the ddds are digits.
662    
663     Arguments:
664     p pointer to the first char after '{'
665 nigel 25 cd pointer to char tables block
666 nigel 3
667     Returns: TRUE or FALSE
668     */
669    
670     static BOOL
671 nigel 25 is_counted_repeat(const uschar *p, compile_data *cd)
672 nigel 3 {
673 nigel 25 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
674     while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
675 nigel 3 if (*p == '}') return TRUE;
676    
677     if (*p++ != ',') return FALSE;
678     if (*p == '}') return TRUE;
679    
680 nigel 25 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
681     while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
682 nigel 3 return (*p == '}');
683     }
684    
685    
686    
687     /*************************************************
688     * Read repeat counts *
689     *************************************************/
690    
691     /* Read an item of the form {n,m} and return the values. This is called only
692     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
693     so the syntax is guaranteed to be correct, but we need to check the values.
694    
695     Arguments:
696     p pointer to first char after '{'
697     minp pointer to int for min
698     maxp pointer to int for max
699     returned as -1 if no max
700     errorptr points to pointer to error message
701 nigel 25 cd pointer to character tables clock
702 nigel 3
703     Returns: pointer to '}' on success;
704     current ptr on error, with errorptr set
705     */
706    
707 nigel 7 static const uschar *
708 nigel 25 read_repeat_counts(const uschar *p, int *minp, int *maxp,
709     const char **errorptr, compile_data *cd)
710 nigel 3 {
711     int min = 0;
712     int max = -1;
713    
714 nigel 25 while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
715 nigel 3
716     if (*p == '}') max = min; else
717     {
718     if (*(++p) != '}')
719     {
720     max = 0;
721 nigel 25 while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
722 nigel 3 if (max < min)
723     {
724     *errorptr = ERR4;
725     return p;
726     }
727     }
728     }
729    
730     /* Do paranoid checks, then fill in the required variables, and pass back the
731     pointer to the terminating '}'. */
732    
733     if (min > 65535 || max > 65535)
734     *errorptr = ERR5;
735     else
736     {
737     *minp = min;
738     *maxp = max;
739     }
740     return p;
741     }
742    
743    
744    
745     /*************************************************
746 nigel 23 * Find the fixed length of a pattern *
747     *************************************************/
748    
749     /* Scan a pattern and compute the fixed length of subject that will match it,
750     if the length is fixed. This is needed for dealing with backward assertions.
751    
752     Arguments:
753     code points to the start of the pattern (the bracket)
754 nigel 49 options the compiling options
755 nigel 23
756     Returns: the fixed length, or -1 if there is no fixed length
757     */
758    
759     static int
760 nigel 49 find_fixedlength(uschar *code, int options)
761 nigel 23 {
762     int length = -1;
763    
764     register int branchlength = 0;
765     register uschar *cc = code + 3;
766    
767     /* Scan along the opcodes for this branch. If we get to the end of the
768     branch, check the length against that of the other branches. */
769    
770     for (;;)
771     {
772     int d;
773     register int op = *cc;
774     if (op >= OP_BRA) op = OP_BRA;
775    
776     switch (op)
777     {
778     case OP_BRA:
779     case OP_ONCE:
780     case OP_COND:
781 nigel 49 d = find_fixedlength(cc, options);
782 nigel 23 if (d < 0) return -1;
783     branchlength += d;
784     do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
785     cc += 3;
786     break;
787    
788     /* Reached end of a branch; if it's a ket it is the end of a nested
789     call. If it's ALT it is an alternation in a nested call. If it is
790     END it's the end of the outer call. All can be handled by the same code. */
791    
792     case OP_ALT:
793     case OP_KET:
794     case OP_KETRMAX:
795     case OP_KETRMIN:
796     case OP_END:
797     if (length < 0) length = branchlength;
798     else if (length != branchlength) return -1;
799     if (*cc != OP_ALT) return length;
800     cc += 3;
801     branchlength = 0;
802     break;
803    
804     /* Skip over assertive subpatterns */
805    
806     case OP_ASSERT:
807     case OP_ASSERT_NOT:
808     case OP_ASSERTBACK:
809     case OP_ASSERTBACK_NOT:
810     do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
811     cc += 3;
812     break;
813    
814     /* Skip over things that don't match chars */
815    
816     case OP_REVERSE:
817     cc++;
818 nigel 37 /* Fall through */
819 nigel 23
820     case OP_CREF:
821     case OP_OPT:
822     cc++;
823     /* Fall through */
824    
825     case OP_SOD:
826     case OP_EOD:
827     case OP_EODN:
828     case OP_CIRC:
829     case OP_DOLL:
830     case OP_NOT_WORD_BOUNDARY:
831     case OP_WORD_BOUNDARY:
832     cc++;
833     break;
834    
835 nigel 49 /* Handle char strings. In UTF-8 mode we must count characters, not bytes.
836     This requires a scan of the string, unfortunately. We assume valid UTF-8
837     strings, so all we do is reduce the length by one for byte whose bits are
838     10xxxxxx. */
839 nigel 23
840     case OP_CHARS:
841     branchlength += *(++cc);
842 nigel 49 #ifdef SUPPORT_UTF8
843     for (d = 1; d <= *cc; d++)
844     if ((cc[d] & 0xc0) == 0x80) branchlength--;
845     #endif
846 nigel 23 cc += *cc + 1;
847     break;
848    
849     /* Handle exact repetitions */
850    
851     case OP_EXACT:
852     case OP_TYPEEXACT:
853     branchlength += (cc[1] << 8) + cc[2];
854     cc += 4;
855     break;
856    
857     /* Handle single-char matchers */
858    
859     case OP_NOT_DIGIT:
860     case OP_DIGIT:
861     case OP_NOT_WHITESPACE:
862     case OP_WHITESPACE:
863     case OP_NOT_WORDCHAR:
864     case OP_WORDCHAR:
865     case OP_ANY:
866     branchlength++;
867     cc++;
868     break;
869    
870    
871     /* Check a class for variable quantification */
872    
873     case OP_CLASS:
874     cc += (*cc == OP_REF)? 2 : 33;
875    
876     switch (*cc)
877     {
878     case OP_CRSTAR:
879     case OP_CRMINSTAR:
880     case OP_CRQUERY:
881     case OP_CRMINQUERY:
882     return -1;
883    
884     case OP_CRRANGE:
885     case OP_CRMINRANGE:
886     if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;
887     branchlength += (cc[1] << 8) + cc[2];
888     cc += 5;
889     break;
890    
891     default:
892     branchlength++;
893     }
894     break;
895    
896     /* Anything else is variable length */
897    
898     default:
899     return -1;
900     }
901     }
902     /* Control never gets here */
903     }
904    
905    
906    
907    
908     /*************************************************
909 nigel 43 * Check for POSIX class syntax *
910     *************************************************/
911    
912     /* This function is called when the sequence "[:" or "[." or "[=" is
913     encountered in a character class. It checks whether this is followed by an
914     optional ^ and then a sequence of letters, terminated by a matching ":]" or
915     ".]" or "=]".
916    
917     Argument:
918     ptr pointer to the initial [
919     endptr where to return the end pointer
920     cd pointer to compile data
921    
922     Returns: TRUE or FALSE
923     */
924    
925     static BOOL
926     check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
927     {
928     int terminator; /* Don't combine these lines; the Solaris cc */
929     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
930     if (*(++ptr) == '^') ptr++;
931     while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
932     if (*ptr == terminator && ptr[1] == ']')
933     {
934     *endptr = ptr;
935     return TRUE;
936     }
937     return FALSE;
938     }
939    
940    
941    
942    
943     /*************************************************
944     * Check POSIX class name *
945     *************************************************/
946    
947     /* This function is called to check the name given in a POSIX-style class entry
948     such as [:alnum:].
949    
950     Arguments:
951     ptr points to the first letter
952     len the length of the name
953    
954     Returns: a value representing the name, or -1 if unknown
955     */
956    
957     static int
958     check_posix_name(const uschar *ptr, int len)
959     {
960     register int yield = 0;
961     while (posix_name_lengths[yield] != 0)
962     {
963     if (len == posix_name_lengths[yield] &&
964     strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
965     yield++;
966     }
967     return -1;
968     }
969    
970    
971    
972    
973     /*************************************************
974 nigel 3 * Compile one branch *
975     *************************************************/
976    
977     /* Scan the pattern, compiling it into the code vector.
978    
979     Arguments:
980 nigel 25 options the option bits
981     brackets points to number of brackets used
982     code points to the pointer to the current code point
983     ptrptr points to the current pattern pointer
984     errorptr points to pointer to error message
985     optchanged set to the value of the last OP_OPT item compiled
986 nigel 37 reqchar set to the last literal character required, else -1
987     countlits set to count of mandatory literal characters
988 nigel 25 cd contains pointers to tables
989 nigel 3
990 nigel 25 Returns: TRUE on success
991     FALSE, with *errorptr set on error
992 nigel 3 */
993    
994     static BOOL
995 nigel 7 compile_branch(int options, int *brackets, uschar **codeptr,
996 nigel 25 const uschar **ptrptr, const char **errorptr, int *optchanged,
997 nigel 37 int *reqchar, int *countlits, compile_data *cd)
998 nigel 3 {
999     int repeat_type, op_type;
1000     int repeat_min, repeat_max;
1001     int bravalue, length;
1002 nigel 19 int greedy_default, greedy_non_default;
1003 nigel 37 int prevreqchar;
1004     int condcount = 0;
1005     int subcountlits = 0;
1006 nigel 3 register int c;
1007     register uschar *code = *codeptr;
1008 nigel 23 uschar *tempcode;
1009 nigel 7 const uschar *ptr = *ptrptr;
1010 nigel 23 const uschar *tempptr;
1011 nigel 3 uschar *previous = NULL;
1012     uschar class[32];
1013    
1014 nigel 19 /* Set up the default and non-default settings for greediness */
1015    
1016     greedy_default = ((options & PCRE_UNGREEDY) != 0);
1017     greedy_non_default = greedy_default ^ 1;
1018    
1019 nigel 37 /* Initialize no required char, and count of literals */
1020    
1021     *reqchar = prevreqchar = -1;
1022     *countlits = 0;
1023    
1024 nigel 3 /* Switch on next character until the end of the branch */
1025    
1026     for (;; ptr++)
1027     {
1028     BOOL negate_class;
1029 nigel 23 int class_charcount;
1030     int class_lastchar;
1031     int newoptions;
1032     int condref;
1033 nigel 37 int subreqchar;
1034 nigel 3
1035     c = *ptr;
1036     if ((options & PCRE_EXTENDED) != 0)
1037     {
1038 nigel 25 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1039 nigel 3 if (c == '#')
1040     {
1041 nigel 47 /* The space before the ; is to avoid a warning on a silly compiler
1042     on the Macintosh. */
1043     while ((c = *(++ptr)) != 0 && c != '\n') ;
1044 nigel 3 continue;
1045     }
1046     }
1047    
1048     switch(c)
1049     {
1050     /* The branch terminates at end of string, |, or ). */
1051    
1052     case 0:
1053     case '|':
1054     case ')':
1055     *codeptr = code;
1056     *ptrptr = ptr;
1057     return TRUE;
1058    
1059     /* Handle single-character metacharacters */
1060    
1061     case '^':
1062     previous = NULL;
1063     *code++ = OP_CIRC;
1064     break;
1065    
1066     case '$':
1067     previous = NULL;
1068     *code++ = OP_DOLL;
1069     break;
1070    
1071     case '.':
1072     previous = code;
1073     *code++ = OP_ANY;
1074     break;
1075    
1076     /* Character classes. These always build a 32-byte bitmap of the permitted
1077     characters, except in the special case where there is only one character.
1078     For negated classes, we build the map as usual, then invert it at the end.
1079     */
1080    
1081     case '[':
1082     previous = code;
1083 nigel 23 *code++ = OP_CLASS;
1084 nigel 3
1085 nigel 23 /* If the first character is '^', set the negation flag and skip it. */
1086 nigel 3
1087     if ((c = *(++ptr)) == '^')
1088     {
1089     negate_class = TRUE;
1090     c = *(++ptr);
1091     }
1092 nigel 23 else negate_class = FALSE;
1093 nigel 3
1094     /* Keep a count of chars so that we can optimize the case of just a single
1095     character. */
1096    
1097     class_charcount = 0;
1098     class_lastchar = -1;
1099    
1100     /* Initialize the 32-char bit map to all zeros. We have to build the
1101     map in a temporary bit of store, in case the class contains only 1
1102     character, because in that case the compiled code doesn't use the
1103     bit map. */
1104    
1105     memset(class, 0, 32 * sizeof(uschar));
1106    
1107     /* Process characters until ] is reached. By writing this as a "do" it
1108     means that an initial ] is taken as a data character. */
1109    
1110     do
1111     {
1112     if (c == 0)
1113     {
1114     *errorptr = ERR6;
1115     goto FAILED;
1116     }
1117    
1118 nigel 43 /* Handle POSIX class names. Perl allows a negation extension of the
1119     form [:^name]. A square bracket that doesn't match the syntax is
1120     treated as a literal. We also recognize the POSIX constructions
1121     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1122     5.6 does. */
1123    
1124     if (c == '[' &&
1125     (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1126     check_posix_syntax(ptr, &tempptr, cd))
1127     {
1128     BOOL local_negate = FALSE;
1129     int posix_class, i;
1130     register const uschar *cbits = cd->cbits;
1131    
1132     if (ptr[1] != ':')
1133     {
1134     *errorptr = ERR31;
1135     goto FAILED;
1136     }
1137    
1138     ptr += 2;
1139     if (*ptr == '^')
1140     {
1141     local_negate = TRUE;
1142     ptr++;
1143     }
1144    
1145     posix_class = check_posix_name(ptr, tempptr - ptr);
1146     if (posix_class < 0)
1147     {
1148     *errorptr = ERR30;
1149     goto FAILED;
1150     }
1151    
1152     /* If matching is caseless, upper and lower are converted to
1153     alpha. This relies on the fact that the class table starts with
1154     alpha, lower, upper as the first 3 entries. */
1155    
1156     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1157     posix_class = 0;
1158    
1159     /* Or into the map we are building up to 3 of the static class
1160     tables, or their negations. */
1161    
1162     posix_class *= 3;
1163     for (i = 0; i < 3; i++)
1164     {
1165     int taboffset = posix_class_maps[posix_class + i];
1166     if (taboffset < 0) break;
1167     if (local_negate)
1168     for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
1169     else
1170     for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
1171     }
1172    
1173     ptr = tempptr + 1;
1174     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
1175     continue;
1176     }
1177    
1178 nigel 3 /* Backslash may introduce a single character, or it may introduce one
1179     of the specials, which just set a flag. Escaped items are checked for
1180     validity in the pre-compiling pass. The sequence \b is a special case.
1181 nigel 7 Inside a class (and only there) it is treated as backspace. Elsewhere
1182 nigel 3 it marks a word boundary. Other escapes have preset maps ready to
1183     or into the one we are building. We assume they have more than one
1184     character in them, so set class_count bigger than one. */
1185    
1186     if (c == '\\')
1187     {
1188 nigel 25 c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1189 nigel 3 if (-c == ESC_b) c = '\b';
1190     else if (c < 0)
1191     {
1192 nigel 25 register const uschar *cbits = cd->cbits;
1193 nigel 3 class_charcount = 10;
1194     switch (-c)
1195     {
1196     case ESC_d:
1197 nigel 25 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
1198 nigel 3 continue;
1199    
1200     case ESC_D:
1201 nigel 25 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
1202 nigel 3 continue;
1203    
1204     case ESC_w:
1205 nigel 43 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
1206 nigel 3 continue;
1207    
1208     case ESC_W:
1209 nigel 43 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
1210 nigel 3 continue;
1211    
1212     case ESC_s:
1213 nigel 25 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
1214 nigel 3 continue;
1215    
1216     case ESC_S:
1217 nigel 25 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
1218 nigel 3 continue;
1219    
1220     default:
1221     *errorptr = ERR7;
1222     goto FAILED;
1223     }
1224     }
1225 nigel 49
1226     /* Fall through if single character, but don't at present allow
1227     chars > 255 in UTF-8 mode. */
1228    
1229     #ifdef SUPPORT_UTF8
1230     if (c > 255)
1231     {
1232     *errorptr = ERR33;
1233     goto FAILED;
1234     }
1235     #endif
1236 nigel 3 }
1237    
1238     /* A single character may be followed by '-' to form a range. However,
1239     Perl does not permit ']' to be the end of the range. A '-' character
1240     here is treated as a literal. */
1241    
1242     if (ptr[1] == '-' && ptr[2] != ']')
1243     {
1244     int d;
1245     ptr += 2;
1246     d = *ptr;
1247    
1248     if (d == 0)
1249     {
1250     *errorptr = ERR6;
1251     goto FAILED;
1252     }
1253    
1254     /* The second part of a range can be a single-character escape, but
1255 nigel 49 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1256     in such circumstances. */
1257 nigel 3
1258     if (d == '\\')
1259     {
1260 nigel 49 const uschar *oldptr = ptr;
1261 nigel 25 d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1262 nigel 49
1263     #ifdef SUPPORT_UTF8
1264     if (d > 255)
1265     {
1266     *errorptr = ERR33;
1267     goto FAILED;
1268     }
1269     #endif
1270     /* \b is backslash; any other special means the '-' was literal */
1271    
1272 nigel 3 if (d < 0)
1273     {
1274     if (d == -ESC_b) d = '\b'; else
1275     {
1276 nigel 49 ptr = oldptr - 2;
1277     goto SINGLE_CHARACTER; /* A few lines below */
1278 nigel 3 }
1279     }
1280     }
1281    
1282     if (d < c)
1283     {
1284     *errorptr = ERR8;
1285     goto FAILED;
1286     }
1287    
1288     for (; c <= d; c++)
1289     {
1290     class[c/8] |= (1 << (c&7));
1291     if ((options & PCRE_CASELESS) != 0)
1292     {
1293 nigel 25 int uc = cd->fcc[c]; /* flip case */
1294 nigel 3 class[uc/8] |= (1 << (uc&7));
1295     }
1296     class_charcount++; /* in case a one-char range */
1297     class_lastchar = c;
1298     }
1299     continue; /* Go get the next char in the class */
1300     }
1301    
1302     /* Handle a lone single character - we can get here for a normal
1303     non-escape char, or after \ that introduces a single character. */
1304    
1305 nigel 49 SINGLE_CHARACTER:
1306    
1307 nigel 3 class [c/8] |= (1 << (c&7));
1308     if ((options & PCRE_CASELESS) != 0)
1309     {
1310 nigel 25 c = cd->fcc[c]; /* flip case */
1311 nigel 3 class[c/8] |= (1 << (c&7));
1312     }
1313     class_charcount++;
1314     class_lastchar = c;
1315     }
1316    
1317     /* Loop until ']' reached; the check for end of string happens inside the
1318     loop. This "while" is the end of the "do" above. */
1319    
1320     while ((c = *(++ptr)) != ']');
1321    
1322     /* If class_charcount is 1 and class_lastchar is not negative, we saw
1323     precisely one character. This doesn't need the whole 32-byte bit map.
1324     We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
1325     it's negative. */
1326    
1327     if (class_charcount == 1 && class_lastchar >= 0)
1328     {
1329     if (negate_class)
1330     {
1331     code[-1] = OP_NOT;
1332     }
1333     else
1334     {
1335     code[-1] = OP_CHARS;
1336     *code++ = 1;
1337     }
1338     *code++ = class_lastchar;
1339     }
1340    
1341     /* Otherwise, negate the 32-byte map if necessary, and copy it into
1342     the code vector. */
1343    
1344     else
1345     {
1346     if (negate_class)
1347     for (c = 0; c < 32; c++) code[c] = ~class[c];
1348     else
1349     memcpy(code, class, 32);
1350     code += 32;
1351     }
1352     break;
1353    
1354     /* Various kinds of repeat */
1355    
1356     case '{':
1357 nigel 25 if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
1358     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
1359 nigel 3 if (*errorptr != NULL) goto FAILED;
1360     goto REPEAT;
1361    
1362     case '*':
1363     repeat_min = 0;
1364     repeat_max = -1;
1365     goto REPEAT;
1366    
1367     case '+':
1368     repeat_min = 1;
1369     repeat_max = -1;
1370     goto REPEAT;
1371    
1372     case '?':
1373     repeat_min = 0;
1374     repeat_max = 1;
1375    
1376     REPEAT:
1377     if (previous == NULL)
1378     {
1379     *errorptr = ERR9;
1380     goto FAILED;
1381     }
1382    
1383 nigel 19 /* If the next character is '?' this is a minimizing repeat, by default,
1384     but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
1385 nigel 3 next character. */
1386    
1387 nigel 19 if (ptr[1] == '?')
1388     { repeat_type = greedy_non_default; ptr++; }
1389     else repeat_type = greedy_default;
1390 nigel 3
1391     /* If previous was a string of characters, chop off the last one and use it
1392     as the subject of the repeat. If there was only one character, we can
1393 nigel 37 abolish the previous item altogether. A repeat with a zero minimum wipes
1394     out any reqchar setting, backing up to the previous value. We must also
1395     adjust the countlits value. */
1396 nigel 3
1397 nigel 37 if (*previous == OP_CHARS)
1398 nigel 3 {
1399     int len = previous[1];
1400 nigel 37
1401     if (repeat_min == 0) *reqchar = prevreqchar;
1402     *countlits += repeat_min - 1;
1403    
1404 nigel 3 if (len == 1)
1405     {
1406     c = previous[2];
1407     code = previous;
1408     }
1409     else
1410     {
1411     c = previous[len+1];
1412     previous[1]--;
1413     code--;
1414     }
1415     op_type = 0; /* Use single-char op codes */
1416     goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
1417     }
1418    
1419     /* If previous was a single negated character ([^a] or similar), we use
1420     one of the special opcodes, replacing it. The code is shared with single-
1421     character repeats by adding a suitable offset into repeat_type. */
1422    
1423     else if ((int)*previous == OP_NOT)
1424     {
1425     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
1426     c = previous[1];
1427     code = previous;
1428     goto OUTPUT_SINGLE_REPEAT;
1429     }
1430    
1431     /* If previous was a character type match (\d or similar), abolish it and
1432     create a suitable repeat item. The code is shared with single-character
1433     repeats by adding a suitable offset into repeat_type. */
1434    
1435 nigel 23 else if ((int)*previous < OP_EODN || *previous == OP_ANY)
1436 nigel 3 {
1437     op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
1438     c = *previous;
1439     code = previous;
1440    
1441     OUTPUT_SINGLE_REPEAT:
1442    
1443 nigel 37 /* If the maximum is zero then the minimum must also be zero; Perl allows
1444     this case, so we do too - by simply omitting the item altogether. */
1445    
1446     if (repeat_max == 0) goto END_REPEAT;
1447    
1448     /* Combine the op_type with the repeat_type */
1449    
1450     repeat_type += op_type;
1451    
1452 nigel 3 /* A minimum of zero is handled either as the special case * or ?, or as
1453     an UPTO, with the maximum given. */
1454    
1455     if (repeat_min == 0)
1456     {
1457     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1458     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1459     else
1460     {
1461     *code++ = OP_UPTO + repeat_type;
1462     *code++ = repeat_max >> 8;
1463     *code++ = (repeat_max & 255);
1464     }
1465     }
1466    
1467     /* The case {1,} is handled as the special case + */
1468    
1469     else if (repeat_min == 1 && repeat_max == -1)
1470     *code++ = OP_PLUS + repeat_type;
1471    
1472     /* The case {n,n} is just an EXACT, while the general case {n,m} is
1473     handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1474    
1475     else
1476     {
1477     if (repeat_min != 1)
1478     {
1479     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1480     *code++ = repeat_min >> 8;
1481     *code++ = (repeat_min & 255);
1482     }
1483    
1484     /* If the mininum is 1 and the previous item was a character string,
1485     we either have to put back the item that got cancelled if the string
1486     length was 1, or add the character back onto the end of a longer
1487 nigel 21 string. For a character type nothing need be done; it will just get
1488     put back naturally. Note that the final character is always going to
1489     get added below. */
1490 nigel 3
1491     else if (*previous == OP_CHARS)
1492     {
1493     if (code == previous) code += 2; else previous[1]++;
1494     }
1495    
1496 nigel 21 /* For a single negated character we also have to put back the
1497     item that got cancelled. */
1498    
1499     else if (*previous == OP_NOT) code++;
1500    
1501 nigel 9 /* If the maximum is unlimited, insert an OP_STAR. */
1502 nigel 3
1503 nigel 9 if (repeat_max < 0)
1504 nigel 3 {
1505     *code++ = c;
1506 nigel 9 *code++ = OP_STAR + repeat_type;
1507     }
1508    
1509     /* Else insert an UPTO if the max is greater than the min. */
1510    
1511     else if (repeat_max != repeat_min)
1512     {
1513     *code++ = c;
1514 nigel 3 repeat_max -= repeat_min;
1515     *code++ = OP_UPTO + repeat_type;
1516     *code++ = repeat_max >> 8;
1517     *code++ = (repeat_max & 255);
1518     }
1519     }
1520    
1521     /* The character or character type itself comes last in all cases. */
1522    
1523     *code++ = c;
1524     }
1525    
1526     /* If previous was a character class or a back reference, we put the repeat
1527 nigel 37 stuff after it, but just skip the item if the repeat was {0,0}. */
1528 nigel 3
1529 nigel 23 else if (*previous == OP_CLASS || *previous == OP_REF)
1530 nigel 3 {
1531 nigel 37 if (repeat_max == 0)
1532     {
1533     code = previous;
1534     goto END_REPEAT;
1535     }
1536 nigel 3 if (repeat_min == 0 && repeat_max == -1)
1537     *code++ = OP_CRSTAR + repeat_type;
1538     else if (repeat_min == 1 && repeat_max == -1)
1539     *code++ = OP_CRPLUS + repeat_type;
1540     else if (repeat_min == 0 && repeat_max == 1)
1541     *code++ = OP_CRQUERY + repeat_type;
1542     else
1543     {
1544     *code++ = OP_CRRANGE + repeat_type;
1545     *code++ = repeat_min >> 8;
1546     *code++ = repeat_min & 255;
1547     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1548     *code++ = repeat_max >> 8;
1549     *code++ = repeat_max & 255;
1550     }
1551     }
1552    
1553     /* If previous was a bracket group, we may have to replicate it in certain
1554 nigel 23 cases. */
1555 nigel 3
1556 nigel 23 else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
1557     (int)*previous == OP_COND)
1558 nigel 3 {
1559 nigel 31 register int i;
1560     int ketoffset = 0;
1561 nigel 9 int len = code - previous;
1562 nigel 31 uschar *bralink = NULL;
1563 nigel 3
1564 nigel 23 /* If the maximum repeat count is unlimited, find the end of the bracket
1565     by scanning through from the start, and compute the offset back to it
1566     from the current code pointer. There may be an OP_OPT setting following
1567     the final KET, so we can't find the end just by going back from the code
1568     pointer. */
1569    
1570     if (repeat_max == -1)
1571 nigel 3 {
1572 nigel 23 register uschar *ket = previous;
1573     do ket += (ket[1] << 8) + ket[2]; while (*ket != OP_KET);
1574     ketoffset = code - ket;
1575 nigel 3 }
1576    
1577 nigel 31 /* The case of a zero minimum is special because of the need to stick
1578     OP_BRAZERO in front of it, and because the group appears once in the
1579     data, whereas in other cases it appears the minimum number of times. For
1580     this reason, it is simplest to treat this case separately, as otherwise
1581     the code gets far too mess. There are several special subcases when the
1582     minimum is zero. */
1583    
1584     if (repeat_min == 0)
1585     {
1586 nigel 37 /* If we set up a required char from the bracket, we must back off
1587     to the previous value and reset the countlits value too. */
1588    
1589     if (subcountlits > 0)
1590     {
1591     *reqchar = prevreqchar;
1592     *countlits -= subcountlits;
1593     }
1594    
1595 nigel 31 /* If the maximum is also zero, we just omit the group from the output
1596     altogether. */
1597    
1598     if (repeat_max == 0)
1599     {
1600     code = previous;
1601 nigel 37 goto END_REPEAT;
1602 nigel 31 }
1603    
1604     /* If the maximum is 1 or unlimited, we just have to stick in the
1605     BRAZERO and do no more at this point. */
1606    
1607     if (repeat_max <= 1)
1608     {
1609     memmove(previous+1, previous, len);
1610     code++;
1611     *previous++ = OP_BRAZERO + repeat_type;
1612     }
1613    
1614     /* If the maximum is greater than 1 and limited, we have to replicate
1615     in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1616     The first one has to be handled carefully because it's the original
1617     copy, which has to be moved up. The remainder can be handled by code
1618     that is common with the non-zero minimum case below. We just have to
1619     adjust the value or repeat_max, since one less copy is required. */
1620    
1621     else
1622     {
1623     int offset;
1624     memmove(previous+4, previous, len);
1625     code += 4;
1626     *previous++ = OP_BRAZERO + repeat_type;
1627     *previous++ = OP_BRA;
1628    
1629     /* We chain together the bracket offset fields that have to be
1630     filled in later when the ends of the brackets are reached. */
1631    
1632     offset = (bralink == NULL)? 0 : previous - bralink;
1633     bralink = previous;
1634     *previous++ = offset >> 8;
1635     *previous++ = offset & 255;
1636     }
1637    
1638     repeat_max--;
1639     }
1640    
1641     /* If the minimum is greater than zero, replicate the group as many
1642     times as necessary, and adjust the maximum to the number of subsequent
1643     copies that we need. */
1644    
1645     else
1646     {
1647     for (i = 1; i < repeat_min; i++)
1648     {
1649     memcpy(code, previous, len);
1650     code += len;
1651     }
1652     if (repeat_max > 0) repeat_max -= repeat_min;
1653     }
1654    
1655     /* This code is common to both the zero and non-zero minimum cases. If
1656     the maximum is limited, it replicates the group in a nested fashion,
1657     remembering the bracket starts on a stack. In the case of a zero minimum,
1658     the first one was set up above. In all cases the repeat_max now specifies
1659     the number of additional copies needed. */
1660    
1661     if (repeat_max >= 0)
1662     {
1663     for (i = repeat_max - 1; i >= 0; i--)
1664     {
1665     *code++ = OP_BRAZERO + repeat_type;
1666    
1667     /* All but the final copy start a new nesting, maintaining the
1668     chain of brackets outstanding. */
1669    
1670     if (i != 0)
1671     {
1672     int offset;
1673     *code++ = OP_BRA;
1674     offset = (bralink == NULL)? 0 : code - bralink;
1675     bralink = code;
1676     *code++ = offset >> 8;
1677     *code++ = offset & 255;
1678     }
1679    
1680     memcpy(code, previous, len);
1681     code += len;
1682     }
1683    
1684     /* Now chain through the pending brackets, and fill in their length
1685     fields (which are holding the chain links pro tem). */
1686    
1687     while (bralink != NULL)
1688     {
1689     int oldlinkoffset;
1690     int offset = code - bralink + 1;
1691     uschar *bra = code - offset;
1692     oldlinkoffset = (bra[1] << 8) + bra[2];
1693     bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
1694     *code++ = OP_KET;
1695     *code++ = bra[1] = offset >> 8;
1696     *code++ = bra[2] = (offset & 255);
1697     }
1698     }
1699    
1700     /* If the maximum is unlimited, set a repeater in the final copy. We
1701     can't just offset backwards from the current code point, because we
1702     don't know if there's been an options resetting after the ket. The
1703     correct offset was computed above. */
1704    
1705     else code[-ketoffset] = OP_KETRMAX + repeat_type;
1706 nigel 3 }
1707    
1708     /* Else there's some kind of shambles */
1709    
1710     else
1711     {
1712     *errorptr = ERR11;
1713     goto FAILED;
1714     }
1715    
1716     /* In all case we no longer have a previous item. */
1717    
1718 nigel 37 END_REPEAT:
1719 nigel 3 previous = NULL;
1720     break;
1721    
1722    
1723 nigel 23 /* Start of nested bracket sub-expression, or comment or lookahead or
1724     lookbehind or option setting or condition. First deal with special things
1725     that can come after a bracket; all are introduced by ?, and the appearance
1726     of any of them means that this is not a referencing group. They were
1727     checked for validity in the first pass over the string, so we don't have to
1728     check for syntax errors here. */
1729 nigel 3
1730     case '(':
1731 nigel 23 newoptions = options;
1732     condref = -1;
1733    
1734 nigel 3 if (*(++ptr) == '?')
1735     {
1736 nigel 23 int set, unset;
1737     int *optset;
1738 nigel 3
1739     switch (*(++ptr))
1740     {
1741 nigel 23 case '#': /* Comment; skip to ket */
1742 nigel 3 ptr++;
1743     while (*ptr != ')') ptr++;
1744     continue;
1745    
1746     case ':': /* Non-extracting bracket */
1747 nigel 23 bravalue = OP_BRA;
1748 nigel 3 ptr++;
1749     break;
1750    
1751 nigel 23 case '(':
1752     bravalue = OP_COND; /* Conditional group */
1753 nigel 25 if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
1754 nigel 23 {
1755     condref = *ptr - '0';
1756     while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
1757 nigel 51 if (condref == 0)
1758     {
1759     *errorptr = ERR35;
1760     goto FAILED;
1761     }
1762 nigel 23 ptr++;
1763     }
1764     else ptr--;
1765     break;
1766    
1767     case '=': /* Positive lookahead */
1768 nigel 3 bravalue = OP_ASSERT;
1769     ptr++;
1770     break;
1771    
1772 nigel 23 case '!': /* Negative lookahead */
1773 nigel 3 bravalue = OP_ASSERT_NOT;
1774     ptr++;
1775     break;
1776    
1777 nigel 23 case '<': /* Lookbehinds */
1778     switch (*(++ptr))
1779 nigel 3 {
1780 nigel 23 case '=': /* Positive lookbehind */
1781     bravalue = OP_ASSERTBACK;
1782 nigel 3 ptr++;
1783     break;
1784 nigel 23
1785     case '!': /* Negative lookbehind */
1786     bravalue = OP_ASSERTBACK_NOT;
1787     ptr++;
1788     break;
1789    
1790     default: /* Syntax error */
1791     *errorptr = ERR24;
1792     goto FAILED;
1793 nigel 3 }
1794 nigel 23 break;
1795 nigel 3
1796 nigel 23 case '>': /* One-time brackets */
1797     bravalue = OP_ONCE;
1798     ptr++;
1799     break;
1800    
1801 nigel 43 case 'R': /* Pattern recursion */
1802     *code++ = OP_RECURSE;
1803     ptr++;
1804     continue;
1805    
1806 nigel 23 default: /* Option setting */
1807     set = unset = 0;
1808     optset = &set;
1809    
1810     while (*ptr != ')' && *ptr != ':')
1811     {
1812     switch (*ptr++)
1813     {
1814     case '-': optset = &unset; break;
1815    
1816     case 'i': *optset |= PCRE_CASELESS; break;
1817     case 'm': *optset |= PCRE_MULTILINE; break;
1818     case 's': *optset |= PCRE_DOTALL; break;
1819     case 'x': *optset |= PCRE_EXTENDED; break;
1820     case 'U': *optset |= PCRE_UNGREEDY; break;
1821     case 'X': *optset |= PCRE_EXTRA; break;
1822    
1823     default:
1824     *errorptr = ERR12;
1825     goto FAILED;
1826     }
1827     }
1828    
1829     /* Set up the changed option bits, but don't change anything yet. */
1830    
1831     newoptions = (options | set) & (~unset);
1832    
1833     /* If the options ended with ')' this is not the start of a nested
1834     group with option changes, so the options change at this level. At top
1835     level there is nothing else to be done (the options will in fact have
1836     been set from the start of compiling as a result of the first pass) but
1837     at an inner level we must compile code to change the ims options if
1838     necessary, and pass the new setting back so that it can be put at the
1839     start of any following branches, and when this group ends, a resetting
1840     item can be compiled. */
1841    
1842     if (*ptr == ')')
1843     {
1844     if ((options & PCRE_INGROUP) != 0 &&
1845     (options & PCRE_IMS) != (newoptions & PCRE_IMS))
1846     {
1847     *code++ = OP_OPT;
1848     *code++ = *optchanged = newoptions & PCRE_IMS;
1849     }
1850     options = newoptions; /* Change options at this level */
1851     previous = NULL; /* This item can't be repeated */
1852     continue; /* It is complete */
1853     }
1854    
1855     /* If the options ended with ':' we are heading into a nested group
1856     with possible change of options. Such groups are non-capturing and are
1857     not assertions of any kind. All we need to do is skip over the ':';
1858     the newoptions value is handled below. */
1859    
1860     bravalue = OP_BRA;
1861     ptr++;
1862 nigel 3 }
1863     }
1864    
1865 nigel 23 /* Else we have a referencing group; adjust the opcode. */
1866 nigel 3
1867     else
1868     {
1869     if (++(*brackets) > EXTRACT_MAX)
1870     {
1871     *errorptr = ERR13;
1872     goto FAILED;
1873     }
1874     bravalue = OP_BRA + *brackets;
1875     }
1876    
1877 nigel 23 /* Process nested bracketed re. Assertions may not be repeated, but other
1878     kinds can be. We copy code into a non-register variable in order to be able
1879     to pass its address because some compilers complain otherwise. Pass in a
1880     new setting for the ims options if they have changed. */
1881 nigel 3
1882 nigel 23 previous = (bravalue >= OP_ONCE)? code : NULL;
1883 nigel 3 *code = bravalue;
1884 nigel 23 tempcode = code;
1885    
1886     if (!compile_regex(
1887     options | PCRE_INGROUP, /* Set for all nested groups */
1888     ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
1889     newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
1890     brackets, /* Bracket level */
1891     &tempcode, /* Where to put code (updated) */
1892     &ptr, /* Input pointer (updated) */
1893     errorptr, /* Where to put an error message */
1894     (bravalue == OP_ASSERTBACK ||
1895     bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1896 nigel 25 condref, /* Condition reference number */
1897 nigel 37 &subreqchar, /* For possible last char */
1898     &subcountlits, /* For literal count */
1899 nigel 25 cd)) /* Tables block */
1900 nigel 23 goto FAILED;
1901    
1902     /* At the end of compiling, code is still pointing to the start of the
1903     group, while tempcode has been updated to point past the end of the group
1904     and any option resetting that may follow it. The pattern pointer (ptr)
1905     is on the bracket. */
1906    
1907     /* If this is a conditional bracket, check that there are no more than
1908     two branches in the group. */
1909    
1910     if (bravalue == OP_COND)
1911 nigel 3 {
1912 nigel 23 uschar *tc = code;
1913 nigel 37 condcount = 0;
1914 nigel 23
1915     do {
1916 nigel 37 condcount++;
1917 nigel 23 tc += (tc[1] << 8) | tc[2];
1918     }
1919     while (*tc != OP_KET);
1920    
1921 nigel 37 if (condcount > 2)
1922 nigel 23 {
1923     *errorptr = ERR27;
1924 nigel 3 goto FAILED;
1925 nigel 23 }
1926 nigel 3 }
1927    
1928 nigel 37 /* Handle updating of the required character. If the subpattern didn't
1929     set one, leave it as it was. Otherwise, update it for normal brackets of
1930     all kinds, forward assertions, and conditions with two branches. Don't
1931     update the literal count for forward assertions, however. If the bracket
1932     is followed by a quantifier with zero repeat, we have to back off. Hence
1933     the definition of prevreqchar and subcountlits outside the main loop so
1934     that they can be accessed for the back off. */
1935    
1936     if (subreqchar > 0 &&
1937     (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT ||
1938     (bravalue == OP_COND && condcount == 2)))
1939     {
1940     prevreqchar = *reqchar;
1941     *reqchar = subreqchar;
1942     if (bravalue != OP_ASSERT) *countlits += subcountlits;
1943     }
1944    
1945 nigel 23 /* Now update the main code pointer to the end of the group. */
1946    
1947     code = tempcode;
1948    
1949     /* Error if hit end of pattern */
1950    
1951 nigel 3 if (*ptr != ')')
1952     {
1953     *errorptr = ERR14;
1954     goto FAILED;
1955     }
1956     break;
1957    
1958     /* Check \ for being a real metacharacter; if not, fall through and handle
1959     it as a data character at the start of a string. Escape items are checked
1960     for validity in the pre-compiling pass. */
1961    
1962     case '\\':
1963 nigel 23 tempptr = ptr;
1964 nigel 25 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1965 nigel 3
1966     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1967     are arranged to be the negation of the corresponding OP_values. For the
1968     back references, the values are ESC_REF plus the reference number. Only
1969     back references and those types that consume a character may be repeated.
1970     We can test for values between ESC_b and ESC_Z for the latter; this may
1971     have to change if any new ones are ever created. */
1972    
1973     if (c < 0)
1974     {
1975     if (-c >= ESC_REF)
1976     {
1977     previous = code;
1978     *code++ = OP_REF;
1979 nigel 23 *code++ = -c - ESC_REF;
1980 nigel 3 }
1981     else
1982     {
1983 nigel 23 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
1984 nigel 3 *code++ = -c;
1985     }
1986     continue;
1987     }
1988    
1989 nigel 7 /* Data character: reset and fall through */
1990 nigel 3
1991 nigel 23 ptr = tempptr;
1992 nigel 3 c = '\\';
1993    
1994     /* Handle a run of data characters until a metacharacter is encountered.
1995     The first character is guaranteed not to be whitespace or # when the
1996     extended flag is set. */
1997    
1998     NORMAL_CHAR:
1999     default:
2000     previous = code;
2001     *code = OP_CHARS;
2002     code += 2;
2003     length = 0;
2004    
2005     do
2006     {
2007     if ((options & PCRE_EXTENDED) != 0)
2008     {
2009 nigel 25 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2010 nigel 3 if (c == '#')
2011     {
2012 nigel 47 /* The space before the ; is to avoid a warning on a silly compiler
2013     on the Macintosh. */
2014     while ((c = *(++ptr)) != 0 && c != '\n') ;
2015 nigel 3 if (c == 0) break;
2016     continue;
2017     }
2018     }
2019    
2020     /* Backslash may introduce a data char or a metacharacter. Escaped items
2021     are checked for validity in the pre-compiling pass. Stop the string
2022     before a metaitem. */
2023    
2024     if (c == '\\')
2025     {
2026 nigel 23 tempptr = ptr;
2027 nigel 25 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
2028 nigel 23 if (c < 0) { ptr = tempptr; break; }
2029 nigel 49
2030     /* If a character is > 127 in UTF-8 mode, we have to turn it into
2031     two or more characters in the UTF-8 encoding. */
2032    
2033     #ifdef SUPPORT_UTF8
2034     if (c > 127 && (options & PCRE_UTF8) != 0)
2035     {
2036     uschar buffer[8];
2037     int len = ord2utf8(c, buffer);
2038     for (c = 0; c < len; c++) *code++ = buffer[c];
2039     length += len;
2040     continue;
2041     }
2042     #endif
2043 nigel 3 }
2044    
2045     /* Ordinary character or single-char escape */
2046    
2047     *code++ = c;
2048     length++;
2049     }
2050    
2051     /* This "while" is the end of the "do" above. */
2052    
2053 nigel 49 while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
2054 nigel 3
2055 nigel 37 /* Update the last character and the count of literals */
2056    
2057     prevreqchar = (length > 1)? code[-2] : *reqchar;
2058     *reqchar = code[-1];
2059     *countlits += length;
2060    
2061 nigel 3 /* Compute the length and set it in the data vector, and advance to
2062     the next state. */
2063    
2064     previous[1] = length;
2065 nigel 49 if (length < MAXLIT) ptr--;
2066 nigel 3 break;
2067     }
2068     } /* end of big loop */
2069    
2070     /* Control never reaches here by falling through, only by a goto for all the
2071     error states. Pass back the position in the pattern so that it can be displayed
2072     to the user for diagnosing the error. */
2073    
2074     FAILED:
2075     *ptrptr = ptr;
2076     return FALSE;
2077     }
2078    
2079    
2080    
2081    
2082     /*************************************************
2083     * Compile sequence of alternatives *
2084     *************************************************/
2085    
2086     /* On entry, ptr is pointing past the bracket character, but on return
2087     it points to the closing bracket, or vertical bar, or end of string.
2088     The code variable is pointing at the byte into which the BRA operator has been
2089 nigel 23 stored. If the ims options are changed at the start (for a (?ims: group) or
2090     during any branch, we need to insert an OP_OPT item at the start of every
2091     following branch to ensure they get set correctly at run time, and also pass
2092     the new options into every subsequent branch compile.
2093 nigel 3
2094     Argument:
2095 nigel 23 options the option bits
2096     optchanged new ims options to set as if (?ims) were at the start, or -1
2097     for no change
2098     brackets -> int containing the number of extracting brackets used
2099     codeptr -> the address of the current code pointer
2100     ptrptr -> the address of the current pattern pointer
2101     errorptr -> pointer to error message
2102     lookbehind TRUE if this is a lookbehind assertion
2103 nigel 51 condref >= 0 for OPT_CREF setting at start of conditional group
2104 nigel 37 reqchar -> place to put the last required character, or a negative number
2105     countlits -> place to put the shortest literal count of any branch
2106 nigel 25 cd points to the data block with tables pointers
2107 nigel 3
2108 nigel 23 Returns: TRUE on success
2109 nigel 3 */
2110    
2111     static BOOL
2112 nigel 23 compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
2113 nigel 25 const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,
2114 nigel 37 int *reqchar, int *countlits, compile_data *cd)
2115 nigel 3 {
2116 nigel 7 const uschar *ptr = *ptrptr;
2117 nigel 3 uschar *code = *codeptr;
2118 nigel 23 uschar *last_branch = code;
2119 nigel 3 uschar *start_bracket = code;
2120 nigel 23 uschar *reverse_count = NULL;
2121     int oldoptions = options & PCRE_IMS;
2122 nigel 37 int branchreqchar, branchcountlits;
2123 nigel 3
2124 nigel 37 *reqchar = -1;
2125     *countlits = INT_MAX;
2126 nigel 23 code += 3;
2127    
2128     /* At the start of a reference-based conditional group, insert the reference
2129     number as an OP_CREF item. */
2130    
2131 nigel 51 if (condref >= 0)
2132 nigel 23 {
2133     *code++ = OP_CREF;
2134     *code++ = condref;
2135     }
2136    
2137     /* Loop for each alternative branch */
2138    
2139 nigel 3 for (;;)
2140     {
2141     int length;
2142    
2143 nigel 23 /* Handle change of options */
2144    
2145     if (optchanged >= 0)
2146 nigel 3 {
2147 nigel 23 *code++ = OP_OPT;
2148     *code++ = optchanged;
2149     options = (options & ~PCRE_IMS) | optchanged;
2150     }
2151    
2152     /* Set up dummy OP_REVERSE if lookbehind assertion */
2153    
2154     if (lookbehind)
2155     {
2156     *code++ = OP_REVERSE;
2157     reverse_count = code;
2158     *code++ = 0;
2159     *code++ = 0;
2160     }
2161    
2162     /* Now compile the branch */
2163    
2164 nigel 37 if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged,
2165     &branchreqchar, &branchcountlits, cd))
2166 nigel 23 {
2167 nigel 3 *ptrptr = ptr;
2168     return FALSE;
2169     }
2170    
2171     /* Fill in the length of the last branch */
2172    
2173     length = code - last_branch;
2174     last_branch[1] = length >> 8;
2175     last_branch[2] = length & 255;
2176    
2177 nigel 37 /* Save the last required character if all branches have the same; a current
2178     value of -1 means unset, while -2 means "previous branch had no last required
2179     char". */
2180    
2181     if (*reqchar != -2)
2182     {
2183     if (branchreqchar >= 0)
2184     {
2185     if (*reqchar == -1) *reqchar = branchreqchar;
2186     else if (*reqchar != branchreqchar) *reqchar = -2;
2187     }
2188     else *reqchar = -2;
2189     }
2190    
2191     /* Keep the shortest literal count */
2192    
2193     if (branchcountlits < *countlits) *countlits = branchcountlits;
2194     DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits));
2195    
2196 nigel 23 /* If lookbehind, check that this branch matches a fixed-length string,
2197     and put the length into the OP_REVERSE item. Temporarily mark the end of
2198     the branch with OP_END. */
2199    
2200     if (lookbehind)
2201     {
2202     *code = OP_END;
2203 nigel 49 length = find_fixedlength(last_branch, options);
2204 nigel 23 DPRINTF(("fixed length = %d\n", length));
2205     if (length < 0)
2206     {
2207     *errorptr = ERR25;
2208     *ptrptr = ptr;
2209     return FALSE;
2210     }
2211     reverse_count[0] = (length >> 8);
2212     reverse_count[1] = length & 255;
2213     }
2214    
2215 nigel 3 /* Reached end of expression, either ')' or end of pattern. Insert a
2216     terminating ket and the length of the whole bracketed item, and return,
2217 nigel 23 leaving the pointer at the terminating char. If any of the ims options
2218     were changed inside the group, compile a resetting op-code following. */
2219 nigel 3
2220     if (*ptr != '|')
2221     {
2222     length = code - start_bracket;
2223     *code++ = OP_KET;
2224     *code++ = length >> 8;
2225     *code++ = length & 255;
2226 nigel 23 if (optchanged >= 0)
2227     {
2228     *code++ = OP_OPT;
2229     *code++ = oldoptions;
2230     }
2231 nigel 3 *codeptr = code;
2232     *ptrptr = ptr;
2233     return TRUE;
2234     }
2235    
2236     /* Another branch follows; insert an "or" node and advance the pointer. */
2237    
2238     *code = OP_ALT;
2239 nigel 23 last_branch = code;
2240     code += 3;
2241 nigel 3 ptr++;
2242     }
2243     /* Control never reaches here */
2244     }
2245    
2246    
2247    
2248 nigel 23
2249 nigel 3 /*************************************************
2250 nigel 23 * Find first significant op code *
2251     *************************************************/
2252    
2253     /* This is called by several functions that scan a compiled expression looking
2254     for a fixed first character, or an anchoring op code etc. It skips over things
2255     that do not influence this. For one application, a change of caseless option is
2256     important.
2257    
2258     Arguments:
2259     code pointer to the start of the group
2260     options pointer to external options
2261     optbit the option bit whose changing is significant, or
2262     zero if none are
2263     optstop TRUE to return on option change, otherwise change the options
2264     value and continue
2265    
2266     Returns: pointer to the first significant opcode
2267     */
2268    
2269     static const uschar*
2270     first_significant_code(const uschar *code, int *options, int optbit,
2271     BOOL optstop)
2272     {
2273     for (;;)
2274     {
2275     switch ((int)*code)
2276     {
2277     case OP_OPT:
2278     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
2279     {
2280     if (optstop) return code;
2281     *options = (int)code[1];
2282     }
2283     code += 2;
2284     break;
2285    
2286     case OP_CREF:
2287     code += 2;
2288     break;
2289    
2290 nigel 35 case OP_WORD_BOUNDARY:
2291     case OP_NOT_WORD_BOUNDARY:
2292     code++;
2293     break;
2294    
2295 nigel 23 case OP_ASSERT_NOT:
2296     case OP_ASSERTBACK:
2297     case OP_ASSERTBACK_NOT:
2298     do code += (code[1] << 8) + code[2]; while (*code == OP_ALT);
2299     code += 3;
2300     break;
2301    
2302     default:
2303     return code;
2304     }
2305     }
2306     /* Control never reaches here */
2307     }
2308    
2309    
2310    
2311    
2312     /*************************************************
2313 nigel 3 * Check for anchored expression *
2314     *************************************************/
2315    
2316     /* Try to find out if this is an anchored regular expression. Consider each
2317     alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2318     all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2319     it's anchored. However, if this is a multiline pattern, then only OP_SOD
2320     counts, since OP_CIRC can match in the middle.
2321    
2322 nigel 33 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
2323     because that will try the rest of the pattern at all possible matching points,
2324     so there is no point trying them again.
2325 nigel 3
2326 nigel 23 Arguments:
2327     code points to start of expression (the bracket)
2328     options points to the options setting
2329    
2330     Returns: TRUE or FALSE
2331 nigel 3 */
2332    
2333     static BOOL
2334 nigel 23 is_anchored(register const uschar *code, int *options)
2335 nigel 3 {
2336     do {
2337 nigel 23 const uschar *scode = first_significant_code(code + 3, options,
2338     PCRE_MULTILINE, FALSE);
2339     register int op = *scode;
2340     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2341     { if (!is_anchored(scode, options)) return FALSE; }
2342 nigel 33 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
2343     (*options & PCRE_DOTALL) != 0)
2344 nigel 23 { if (scode[1] != OP_ANY) return FALSE; }
2345     else if (op != OP_SOD &&
2346     ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
2347     return FALSE;
2348 nigel 3 code += (code[1] << 8) + code[2];
2349     }
2350     while (*code == OP_ALT);
2351     return TRUE;
2352     }
2353    
2354    
2355    
2356     /*************************************************
2357 nigel 33 * Check for starting with ^ or .* *
2358 nigel 3 *************************************************/
2359    
2360 nigel 33 /* This is called to find out if every branch starts with ^ or .* so that
2361     "first char" processing can be done to speed things up in multiline
2362     matching and for non-DOTALL patterns that start with .* (which must start at
2363     the beginning or after \n).
2364 nigel 3
2365     Argument: points to start of expression (the bracket)
2366     Returns: TRUE or FALSE
2367     */
2368    
2369     static BOOL
2370 nigel 7 is_startline(const uschar *code)
2371 nigel 3 {
2372     do {
2373 nigel 23 const uschar *scode = first_significant_code(code + 3, NULL, 0, FALSE);
2374     register int op = *scode;
2375     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2376     { if (!is_startline(scode)) return FALSE; }
2377 nigel 33 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2378     { if (scode[1] != OP_ANY) return FALSE; }
2379 nigel 23 else if (op != OP_CIRC) return FALSE;
2380 nigel 3 code += (code[1] << 8) + code[2];
2381     }
2382     while (*code == OP_ALT);
2383     return TRUE;
2384     }
2385    
2386    
2387    
2388     /*************************************************
2389     * Check for fixed first char *
2390     *************************************************/
2391    
2392     /* Try to find out if there is a fixed first character. This is called for
2393     unanchored expressions, as it speeds up their processing quite considerably.
2394     Consider each alternative branch. If they all start with the same char, or with
2395     a bracket all of whose alternatives start with the same char (recurse ad lib),
2396     then we return that char, otherwise -1.
2397    
2398 nigel 23 Arguments:
2399     code points to start of expression (the bracket)
2400     options pointer to the options (used to check casing changes)
2401    
2402     Returns: -1 or the fixed first char
2403 nigel 3 */
2404    
2405     static int
2406 nigel 23 find_firstchar(const uschar *code, int *options)
2407 nigel 3 {
2408     register int c = -1;
2409 nigel 23 do {
2410     int d;
2411     const uschar *scode = first_significant_code(code + 3, options,
2412     PCRE_CASELESS, TRUE);
2413     register int op = *scode;
2414 nigel 3
2415 nigel 23 if (op >= OP_BRA) op = OP_BRA;
2416 nigel 3
2417 nigel 23 switch(op)
2418     {
2419     default:
2420     return -1;
2421 nigel 3
2422 nigel 23 case OP_BRA:
2423     case OP_ASSERT:
2424     case OP_ONCE:
2425     case OP_COND:
2426     if ((d = find_firstchar(scode, options)) < 0) return -1;
2427     if (c < 0) c = d; else if (c != d) return -1;
2428     break;
2429 nigel 3
2430 nigel 23 case OP_EXACT: /* Fall through */
2431     scode++;
2432 nigel 3
2433 nigel 23 case OP_CHARS: /* Fall through */
2434     scode++;
2435    
2436     case OP_PLUS:
2437     case OP_MINPLUS:
2438     if (c < 0) c = scode[1]; else if (c != scode[1]) return -1;
2439     break;
2440     }
2441    
2442     code += (code[1] << 8) + code[2];
2443     }
2444 nigel 3 while (*code == OP_ALT);
2445     return c;
2446     }
2447    
2448    
2449    
2450 nigel 23
2451    
2452 nigel 3 /*************************************************
2453     * Compile a Regular Expression *
2454     *************************************************/
2455    
2456     /* This function takes a string and returns a pointer to a block of store
2457     holding a compiled version of the expression.
2458    
2459     Arguments:
2460     pattern the regular expression
2461     options various option bits
2462     errorptr pointer to pointer to error text
2463     erroroffset ptr offset in pattern where error was detected
2464 nigel 25 tables pointer to character tables or NULL
2465 nigel 3
2466     Returns: pointer to compiled data block, or NULL on error,
2467     with errorptr and erroroffset set
2468     */
2469    
2470     pcre *
2471 nigel 7 pcre_compile(const char *pattern, int options, const char **errorptr,
2472 nigel 25 int *erroroffset, const unsigned char *tables)
2473 nigel 3 {
2474     real_pcre *re;
2475     int length = 3; /* For initial BRA plus length */
2476     int runlength;
2477 nigel 43 int c, reqchar, countlits;
2478 nigel 3 int bracount = 0;
2479     int top_backref = 0;
2480 nigel 23 int branch_extra = 0;
2481     int branch_newextra;
2482 nigel 7 unsigned int brastackptr = 0;
2483 nigel 43 size_t size;
2484 nigel 7 uschar *code;
2485     const uschar *ptr;
2486 nigel 25 compile_data compile_block;
2487 nigel 23 int brastack[BRASTACK_SIZE];
2488     uschar bralenstack[BRASTACK_SIZE];
2489 nigel 3
2490     #ifdef DEBUG
2491     uschar *code_base, *code_end;
2492     #endif
2493    
2494 nigel 49 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
2495    
2496     #ifndef SUPPORT_UTF8
2497     if ((options & PCRE_UTF8) != 0)
2498     {
2499     *errorptr = ERR32;
2500     return NULL;
2501     }
2502     #endif
2503    
2504 nigel 3 /* We can't pass back an error message if errorptr is NULL; I guess the best we
2505     can do is just return NULL. */
2506    
2507     if (errorptr == NULL) return NULL;
2508     *errorptr = NULL;
2509    
2510     /* However, we can give a message for this error */
2511    
2512     if (erroroffset == NULL)
2513     {
2514     *errorptr = ERR16;
2515     return NULL;
2516     }
2517     *erroroffset = 0;
2518    
2519     if ((options & ~PUBLIC_OPTIONS) != 0)
2520     {
2521     *errorptr = ERR17;
2522     return NULL;
2523     }
2524    
2525 nigel 25 /* Set up pointers to the individual character tables */
2526    
2527     if (tables == NULL) tables = pcre_default_tables;
2528     compile_block.lcc = tables + lcc_offset;
2529     compile_block.fcc = tables + fcc_offset;
2530     compile_block.cbits = tables + cbits_offset;
2531     compile_block.ctypes = tables + ctypes_offset;
2532    
2533     /* Reflect pattern for debugging output */
2534    
2535 nigel 9 DPRINTF(("------------------------------------------------------------------\n"));
2536     DPRINTF(("%s\n", pattern));
2537 nigel 3
2538     /* The first thing to do is to make a pass over the pattern to compute the
2539     amount of store required to hold the compiled code. This does not have to be
2540     perfect as long as errors are overestimates. At the same time we can detect any
2541     internal flag settings. Make an attempt to correct for any counted white space
2542     if an "extended" flag setting appears late in the pattern. We can't be so
2543     clever for #-comments. */
2544    
2545 nigel 7 ptr = (const uschar *)(pattern - 1);
2546 nigel 3 while ((c = *(++ptr)) != 0)
2547     {
2548     int min, max;
2549     int class_charcount;
2550    
2551 nigel 23 if ((options & PCRE_EXTENDED) != 0)
2552 nigel 3 {
2553 nigel 25 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2554 nigel 23 if (c == '#')
2555     {
2556 nigel 47 /* The space before the ; is to avoid a warning on a silly compiler
2557     on the Macintosh. */
2558     while ((c = *(++ptr)) != 0 && c != '\n') ;
2559 nigel 23 continue;
2560     }
2561 nigel 3 }
2562    
2563     switch(c)
2564     {
2565     /* A backslashed item may be an escaped "normal" character or a
2566     character type. For a "normal" character, put the pointers and
2567     character back so that tests for whitespace etc. in the input
2568     are done correctly. */
2569    
2570     case '\\':
2571     {
2572 nigel 7 const uschar *save_ptr = ptr;
2573 nigel 25 c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
2574 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2575     if (c >= 0)
2576     {
2577     ptr = save_ptr;
2578     c = '\\';
2579     goto NORMAL_CHAR;
2580     }
2581     }
2582     length++;
2583    
2584     /* A back reference needs an additional char, plus either one or 5
2585     bytes for a repeat. We also need to keep the value of the highest
2586     back reference. */
2587    
2588     if (c <= -ESC_REF)
2589     {
2590     int refnum = -c - ESC_REF;
2591     if (refnum > top_backref) top_backref = refnum;
2592     length++; /* For single back reference */
2593 nigel 25 if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2594 nigel 3 {
2595 nigel 25 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2596 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2597     if ((min == 0 && (max == 1 || max == -1)) ||
2598     (min == 1 && max == -1))
2599     length++;
2600     else length += 5;
2601     if (ptr[1] == '?') ptr++;
2602     }
2603     }
2604     continue;
2605    
2606     case '^':
2607     case '.':
2608     case '$':
2609     case '*': /* These repeats won't be after brackets; */
2610     case '+': /* those are handled separately */
2611     case '?':
2612     length++;
2613     continue;
2614    
2615     /* This covers the cases of repeats after a single char, metachar, class,
2616     or back reference. */
2617    
2618     case '{':
2619 nigel 25 if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
2620     ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
2621 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2622     if ((min == 0 && (max == 1 || max == -1)) ||
2623     (min == 1 && max == -1))
2624     length++;
2625     else
2626     {
2627     length--; /* Uncount the original char or metachar */
2628     if (min == 1) length++; else if (min > 0) length += 4;
2629     if (max > 0) length += 4; else length += 2;
2630     }
2631     if (ptr[1] == '?') ptr++;
2632     continue;
2633    
2634 nigel 23 /* An alternation contains an offset to the next branch or ket. If any ims
2635     options changed in the previous branch(es), and/or if we are in a
2636     lookbehind assertion, extra space will be needed at the start of the
2637     branch. This is handled by branch_extra. */
2638    
2639 nigel 3 case '|':
2640 nigel 23 length += 3 + branch_extra;
2641 nigel 3 continue;
2642    
2643     /* A character class uses 33 characters. Don't worry about character types
2644     that aren't allowed in classes - they'll get picked up during the compile.
2645     A character class that contains only one character uses 2 or 3 bytes,
2646     depending on whether it is negated or not. Notice this where we can. */
2647    
2648     case '[':
2649     class_charcount = 0;
2650     if (*(++ptr) == '^') ptr++;
2651     do
2652     {
2653     if (*ptr == '\\')
2654     {
2655 nigel 25 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
2656     &compile_block);
2657 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2658 nigel 9 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2659 nigel 3 }
2660     else class_charcount++;
2661     ptr++;
2662     }
2663     while (*ptr != 0 && *ptr != ']');
2664    
2665     /* Repeats for negated single chars are handled by the general code */
2666    
2667     if (class_charcount == 1) length += 3; else
2668     {
2669     length += 33;
2670    
2671     /* A repeat needs either 1 or 5 bytes. */
2672    
2673 nigel 25 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2674 nigel 3 {
2675 nigel 25 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2676 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2677     if ((min == 0 && (max == 1 || max == -1)) ||
2678     (min == 1 && max == -1))
2679     length++;
2680     else length += 5;
2681     if (ptr[1] == '?') ptr++;
2682     }
2683     }
2684     continue;
2685    
2686     /* Brackets may be genuine groups or special things */
2687    
2688     case '(':
2689 nigel 23 branch_newextra = 0;
2690 nigel 3
2691     /* Handle special forms of bracket, which all start (? */
2692    
2693 nigel 23 if (ptr[1] == '?')
2694 nigel 3 {
2695 nigel 23 int set, unset;
2696     int *optset;
2697    
2698     switch (c = ptr[2])
2699 nigel 3 {
2700 nigel 23 /* Skip over comments entirely */
2701     case '#':
2702     ptr += 3;
2703     while (*ptr != 0 && *ptr != ')') ptr++;
2704     if (*ptr == 0)
2705     {
2706     *errorptr = ERR18;
2707     goto PCRE_ERROR_RETURN;
2708     }
2709     continue;
2710 nigel 3
2711 nigel 23 /* Non-referencing groups and lookaheads just move the pointer on, and
2712     then behave like a non-special bracket, except that they don't increment
2713     the count of extracting brackets. Ditto for the "once only" bracket,
2714     which is in Perl from version 5.005. */
2715 nigel 3
2716 nigel 23 case ':':
2717     case '=':
2718     case '!':
2719     case '>':
2720 nigel 3 ptr += 2;
2721     break;
2722    
2723 nigel 43 /* A recursive call to the regex is an extension, to provide the
2724     facility which can be obtained by $(?p{perl-code}) in Perl 5.6. */
2725    
2726     case 'R':
2727     if (ptr[3] != ')')
2728     {
2729     *errorptr = ERR29;
2730     goto PCRE_ERROR_RETURN;
2731     }
2732     ptr += 3;
2733     length += 1;
2734     break;
2735    
2736 nigel 23 /* Lookbehinds are in Perl from version 5.005 */
2737 nigel 3
2738 nigel 23 case '<':
2739     if (ptr[3] == '=' || ptr[3] == '!')
2740 nigel 3 {
2741 nigel 23 ptr += 3;
2742     branch_newextra = 3;
2743     length += 3; /* For the first branch */
2744     break;
2745 nigel 3 }
2746 nigel 23 *errorptr = ERR24;
2747     goto PCRE_ERROR_RETURN;
2748    
2749     /* Conditionals are in Perl from version 5.005. The bracket must either
2750     be followed by a number (for bracket reference) or by an assertion
2751     group. */
2752    
2753     case '(':
2754 nigel 25 if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
2755 nigel 3 {
2756 nigel 23 ptr += 4;
2757     length += 2;
2758 nigel 25 while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
2759 nigel 23 if (*ptr != ')')
2760     {
2761     *errorptr = ERR26;
2762     goto PCRE_ERROR_RETURN;
2763     }
2764 nigel 3 }
2765 nigel 23 else /* An assertion must follow */
2766 nigel 3 {
2767 nigel 23 ptr++; /* Can treat like ':' as far as spacing is concerned */
2768 nigel 47 if (ptr[2] != '?' ||
2769     (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
2770 nigel 23 {
2771     ptr += 2; /* To get right offset in message */
2772     *errorptr = ERR28;
2773     goto PCRE_ERROR_RETURN;
2774     }
2775 nigel 3 }
2776 nigel 23 break;
2777    
2778     /* Else loop checking valid options until ) is met. Anything else is an
2779     error. If we are without any brackets, i.e. at top level, the settings
2780     act as if specified in the options, so massage the options immediately.
2781     This is for backward compatibility with Perl 5.004. */
2782    
2783     default:
2784     set = unset = 0;
2785     optset = &set;
2786     ptr += 2;
2787    
2788     for (;; ptr++)
2789 nigel 3 {
2790 nigel 23 c = *ptr;
2791     switch (c)
2792     {
2793     case 'i':
2794     *optset |= PCRE_CASELESS;
2795     continue;
2796    
2797     case 'm':
2798     *optset |= PCRE_MULTILINE;
2799     continue;
2800    
2801     case 's':
2802     *optset |= PCRE_DOTALL;
2803     continue;
2804    
2805     case 'x':
2806     *optset |= PCRE_EXTENDED;
2807     continue;
2808    
2809     case 'X':
2810     *optset |= PCRE_EXTRA;
2811     continue;
2812    
2813     case 'U':
2814     *optset |= PCRE_UNGREEDY;
2815     continue;
2816    
2817     case '-':
2818     optset = &unset;
2819     continue;
2820    
2821     /* A termination by ')' indicates an options-setting-only item;
2822     this is global at top level; otherwise nothing is done here and
2823     it is handled during the compiling process on a per-bracket-group
2824     basis. */
2825    
2826     case ')':
2827     if (brastackptr == 0)
2828     {
2829     options = (options | set) & (~unset);
2830     set = unset = 0; /* To save length */
2831     }
2832     /* Fall through */
2833    
2834     /* A termination by ':' indicates the start of a nested group with
2835     the given options set. This is again handled at compile time, but
2836     we must allow for compiled space if any of the ims options are
2837     set. We also have to allow for resetting space at the end of
2838     the group, which is why 4 is added to the length and not just 2.
2839     If there are several changes of options within the same group, this
2840     will lead to an over-estimate on the length, but this shouldn't
2841     matter very much. We also have to allow for resetting options at
2842     the start of any alternations, which we do by setting
2843 nigel 37 branch_newextra to 2. Finally, we record whether the case-dependent
2844     flag ever changes within the regex. This is used by the "required
2845     character" code. */
2846 nigel 23
2847     case ':':
2848     if (((set|unset) & PCRE_IMS) != 0)
2849     {
2850     length += 4;
2851     branch_newextra = 2;
2852 nigel 37 if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
2853 nigel 23 }
2854     goto END_OPTIONS;
2855    
2856     /* Unrecognized option character */
2857    
2858     default:
2859     *errorptr = ERR12;
2860     goto PCRE_ERROR_RETURN;
2861     }
2862 nigel 3 }
2863 nigel 23
2864     /* If we hit a closing bracket, that's it - this is a freestanding
2865     option-setting. We need to ensure that branch_extra is updated if
2866     necessary. The only values branch_newextra can have here are 0 or 2.
2867     If the value is 2, then branch_extra must either be 2 or 5, depending
2868     on whether this is a lookbehind group or not. */
2869    
2870     END_OPTIONS:
2871     if (c == ')')
2872 nigel 19 {
2873 nigel 23 if (branch_newextra == 2 && (branch_extra == 0 || branch_extra == 3))
2874     branch_extra += branch_newextra;
2875 nigel 19 continue;
2876     }
2877 nigel 3
2878 nigel 23 /* If options were terminated by ':' control comes here. Fall through
2879     to handle the group below. */
2880 nigel 3 }
2881     }
2882    
2883     /* Extracting brackets must be counted so we can process escapes in a
2884     Perlish way. */
2885    
2886     else bracount++;
2887    
2888     /* Non-special forms of bracket. Save length for computing whole length
2889 nigel 23 at end if there's a repeat that requires duplication of the group. Also
2890     save the current value of branch_extra, and start the new group with
2891     the new value. If non-zero, this will either be 2 for a (?imsx: group, or 3
2892     for a lookbehind assertion. */
2893 nigel 3
2894     if (brastackptr >= sizeof(brastack)/sizeof(int))
2895     {
2896     *errorptr = ERR19;
2897     goto PCRE_ERROR_RETURN;
2898     }
2899    
2900 nigel 23 bralenstack[brastackptr] = branch_extra;
2901     branch_extra = branch_newextra;
2902    
2903 nigel 3 brastack[brastackptr++] = length;
2904     length += 3;
2905     continue;
2906    
2907     /* Handle ket. Look for subsequent max/min; for certain sets of values we
2908 nigel 9 have to replicate this bracket up to that many times. If brastackptr is
2909     0 this is an unmatched bracket which will generate an error, but take care
2910 nigel 23 not to try to access brastack[-1] when computing the length and restoring
2911     the branch_extra value. */
2912 nigel 3
2913     case ')':
2914     length += 3;
2915     {
2916 nigel 9 int minval = 1;
2917     int maxval = 1;
2918 nigel 23 int duplength;
2919 nigel 3
2920 nigel 23 if (brastackptr > 0)
2921     {
2922     duplength = length - brastack[--brastackptr];
2923     branch_extra = bralenstack[brastackptr];
2924     }
2925     else duplength = 0;
2926    
2927 nigel 3 /* Leave ptr at the final char; for read_repeat_counts this happens
2928     automatically; for the others we need an increment. */
2929    
2930 nigel 25 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
2931 nigel 3 {
2932 nigel 25 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr,
2933     &compile_block);
2934 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2935     }
2936 nigel 9 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2937     else if (c == '+') { maxval = -1; ptr++; }
2938     else if (c == '?') { minval = 0; ptr++; }
2939 nigel 3
2940 nigel 31 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2941     group, and if the maximum is greater than zero, we have to replicate
2942     maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2943     bracket set - hence the 7. */
2944 nigel 3
2945 nigel 31 if (minval == 0)
2946     {
2947     length++;
2948     if (maxval > 0) length += (maxval - 1) * (duplength + 7);
2949     }
2950    
2951     /* When the minimum is greater than zero, 1 we have to replicate up to
2952     minval-1 times, with no additions required in the copies. Then, if
2953     there is a limited maximum we have to replicate up to maxval-1 times
2954     allowing for a BRAZERO item before each optional copy and nesting
2955     brackets for all but one of the optional copies. */
2956    
2957     else
2958     {
2959     length += (minval - 1) * duplength;
2960     if (maxval > minval) /* Need this test as maxval=-1 means no limit */
2961     length += (maxval - minval) * (duplength + 7) - 6;
2962     }
2963 nigel 3 }
2964     continue;
2965    
2966     /* Non-special character. For a run of such characters the length required
2967     is the number of characters + 2, except that the maximum run length is 255.
2968     We won't get a skipped space or a non-data escape or the start of a #
2969     comment as the first character, so the length can't be zero. */
2970    
2971     NORMAL_CHAR:
2972     default:
2973     length += 2;
2974     runlength = 0;
2975     do
2976     {
2977 nigel 23 if ((options & PCRE_EXTENDED) != 0)
2978 nigel 3 {
2979 nigel 25 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2980 nigel 23 if (c == '#')
2981     {
2982 nigel 47 /* The space before the ; is to avoid a warning on a silly compiler
2983     on the Macintosh. */
2984     while ((c = *(++ptr)) != 0 && c != '\n') ;
2985 nigel 23 continue;
2986     }
2987 nigel 3 }
2988    
2989     /* Backslash may introduce a data char or a metacharacter; stop the
2990     string before the latter. */
2991    
2992     if (c == '\\')
2993     {
2994 nigel 7 const uschar *saveptr = ptr;
2995 nigel 25 c = check_escape(&ptr, errorptr, bracount, options, FALSE,
2996     &compile_block);
2997 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2998     if (c < 0) { ptr = saveptr; break; }
2999 nigel 49
3000     #ifdef SUPPORT_UTF8
3001     if (c > 127 && (options & PCRE_UTF8) != 0)
3002     {
3003     int i;
3004     for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
3005     if (c <= utf8_table1[i]) break;
3006     runlength += i;
3007     }
3008     #endif
3009 nigel 3 }
3010    
3011     /* Ordinary character or single-char escape */
3012    
3013     runlength++;
3014     }
3015    
3016     /* This "while" is the end of the "do" above. */
3017    
3018 nigel 49 while (runlength < MAXLIT &&
3019 nigel 25 (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
3020 nigel 3
3021     ptr--;
3022     length += runlength;
3023     continue;
3024     }
3025     }
3026    
3027     length += 4; /* For final KET and END */
3028    
3029     if (length > 65539)
3030     {
3031     *errorptr = ERR20;
3032     return NULL;
3033     }
3034    
3035     /* Compute the size of data block needed and get it, either from malloc or
3036 nigel 9 externally provided function. We specify "code[0]" in the offsetof() expression
3037     rather than just "code", because it has been reported that one broken compiler
3038     fails on "code" because it is also an independent variable. It should make no
3039     difference to the value of the offsetof(). */
3040 nigel 3
3041 nigel 9 size = length + offsetof(real_pcre, code[0]);
3042 nigel 3 re = (real_pcre *)(pcre_malloc)(size);
3043    
3044     if (re == NULL)
3045     {
3046     *errorptr = ERR21;
3047     return NULL;
3048     }
3049    
3050 nigel 43 /* Put in the magic number, and save the size, options, and table pointer */
3051 nigel 9
3052 nigel 3 re->magic_number = MAGIC_NUMBER;
3053 nigel 43 re->size = size;
3054 nigel 3 re->options = options;
3055 nigel 25 re->tables = tables;
3056 nigel 3
3057     /* Set up a starting, non-extracting bracket, then compile the expression. On
3058     error, *errorptr will be set non-NULL, so we don't need to look at the result
3059     of the function here. */
3060    
3061 nigel 7 ptr = (const uschar *)pattern;
3062 nigel 3 code = re->code;
3063     *code = OP_BRA;
3064     bracount = 0;
3065 nigel 25 (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,
3066 nigel 37 &reqchar, &countlits, &compile_block);
3067 nigel 3 re->top_bracket = bracount;
3068     re->top_backref = top_backref;
3069    
3070     /* If not reached end of pattern on success, there's an excess bracket. */
3071    
3072     if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
3073    
3074     /* Fill in the terminating state and check for disastrous overflow, but
3075     if debugging, leave the test till after things are printed out. */
3076    
3077     *code++ = OP_END;
3078    
3079     #ifndef DEBUG
3080     if (code - re->code > length) *errorptr = ERR23;
3081     #endif
3082    
3083 nigel 23 /* Give an error if there's back reference to a non-existent capturing
3084     subpattern. */
3085    
3086     if (top_backref > re->top_bracket) *errorptr = ERR15;
3087    
3088 nigel 3 /* Failed to compile */
3089    
3090     if (*errorptr != NULL)
3091     {
3092     (pcre_free)(re);
3093     PCRE_ERROR_RETURN:
3094 nigel 7 *erroroffset = ptr - (const uschar *)pattern;
3095 nigel 3 return NULL;
3096     }
3097    
3098 nigel 33 /* If the anchored option was not passed, set flag if we can determine that the
3099     pattern is anchored by virtue of ^ characters or \A or anything else (such as
3100     starting with .* when DOTALL is set).
3101 nigel 3
3102 nigel 33 Otherwise, see if we can determine what the first character has to be, because
3103     that speeds up unanchored matches no end. If not, see if we can set the
3104     PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
3105     start with ^. and also when all branches start with .* for non-DOTALL matches.
3106     */
3107    
3108 nigel 3 if ((options & PCRE_ANCHORED) == 0)
3109     {
3110 nigel 23 int temp_options = options;
3111     if (is_anchored(re->code, &temp_options))
3112 nigel 3 re->options |= PCRE_ANCHORED;
3113     else
3114     {
3115 nigel 23 int ch = find_firstchar(re->code, &temp_options);
3116 nigel 9 if (ch >= 0)
3117 nigel 3 {
3118 nigel 9 re->first_char = ch;
3119 nigel 3 re->options |= PCRE_FIRSTSET;
3120     }
3121     else if (is_startline(re->code))
3122     re->options |= PCRE_STARTLINE;
3123     }
3124     }
3125    
3126 nigel 37 /* Save the last required character if there are at least two literal
3127     characters on all paths, or if there is no first character setting. */
3128    
3129     if (reqchar >= 0 && (countlits > 1 || (re->options & PCRE_FIRSTSET) == 0))
3130     {
3131     re->req_char = reqchar;
3132     re->options |= PCRE_REQCHSET;
3133     }
3134    
3135 nigel 3 /* Print out the compiled data for debugging */
3136    
3137     #ifdef DEBUG
3138    
3139 nigel 23 printf("Length = %d top_bracket = %d top_backref = %d\n",
3140 nigel 3 length, re->top_bracket, re->top_backref);
3141    
3142     if (re->options != 0)
3143     {
3144 nigel 37 printf("%s%s%s%s%s%s%s%s%s\n",
3145 nigel 3 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
3146     ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
3147 nigel 37 ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
3148 nigel 3 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
3149     ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
3150     ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
3151     ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
3152 nigel 19 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
3153     ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
3154 nigel 3 }
3155    
3156     if ((re->options & PCRE_FIRSTSET) != 0)
3157     {
3158     if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
3159     else printf("First char = \\x%02x\n", re->first_char);
3160     }
3161    
3162 nigel 37 if ((re->options & PCRE_REQCHSET) != 0)
3163     {
3164     if (isprint(re->req_char)) printf("Req char = %c\n", re->req_char);
3165     else printf("Req char = \\x%02x\n", re->req_char);
3166     }
3167    
3168 nigel 3 code_end = code;
3169     code_base = code = re->code;
3170    
3171     while (code < code_end)
3172     {
3173     int charlength;
3174    
3175     printf("%3d ", code - code_base);
3176    
3177     if (*code >= OP_BRA)
3178     {
3179     printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
3180     code += 2;
3181     }
3182    
3183     else switch(*code)
3184     {
3185 nigel 23 case OP_OPT:
3186     printf(" %.2x %s", code[1], OP_names[*code]);
3187     code++;
3188     break;
3189    
3190     case OP_COND:
3191     printf("%3d Cond", (code[1] << 8) + code[2]);
3192     code += 2;
3193     break;
3194    
3195     case OP_CREF:
3196     printf(" %.2d %s", code[1], OP_names[*code]);
3197     code++;
3198     break;
3199    
3200 nigel 3 case OP_CHARS:
3201     charlength = *(++code);
3202     printf("%3d ", charlength);
3203     while (charlength-- > 0)
3204     if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
3205     break;
3206    
3207     case OP_KETRMAX:
3208     case OP_KETRMIN:
3209     case OP_ALT:
3210     case OP_KET:
3211     case OP_ASSERT:
3212     case OP_ASSERT_NOT:
3213 nigel 23 case OP_ASSERTBACK:
3214     case OP_ASSERTBACK_NOT:
3215 nigel 3 case OP_ONCE:
3216     printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
3217     code += 2;
3218     break;
3219    
3220 nigel 23 case OP_REVERSE:
3221     printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
3222     code += 2;
3223     break;
3224    
3225 nigel 3 case OP_STAR:
3226     case OP_MINSTAR:
3227     case OP_PLUS:
3228     case OP_MINPLUS:
3229     case OP_QUERY:
3230     case OP_MINQUERY:
3231     case OP_TYPESTAR:
3232     case OP_TYPEMINSTAR:
3233     case OP_TYPEPLUS:
3234     case OP_TYPEMINPLUS:
3235     case OP_TYPEQUERY:
3236     case OP_TYPEMINQUERY:
3237     if (*code >= OP_TYPESTAR)
3238     printf(" %s", OP_names[code[1]]);
3239     else if (isprint(c = code[1])) printf(" %c", c);
3240     else printf(" \\x%02x", c);
3241     printf("%s", OP_names[*code++]);
3242     break;
3243    
3244     case OP_EXACT:
3245     case OP_UPTO:
3246     case OP_MINUPTO:
3247     if (isprint(c = code[3])) printf(" %c{", c);
3248     else printf(" \\x%02x{", c);
3249 nigel 11 if (*code != OP_EXACT) printf("0,");
3250 nigel 3 printf("%d}", (code[1] << 8) + code[2]);
3251     if (*code == OP_MINUPTO) printf("?");
3252     code += 3;
3253     break;
3254    
3255     case OP_TYPEEXACT:
3256     case OP_TYPEUPTO:
3257     case OP_TYPEMINUPTO:
3258     printf(" %s{", OP_names[code[3]]);
3259     if (*code != OP_TYPEEXACT) printf(",");
3260     printf("%d}", (code[1] << 8) + code[2]);
3261     if (*code == OP_TYPEMINUPTO) printf("?");
3262     code += 3;
3263     break;
3264    
3265     case OP_NOT:
3266     if (isprint(c = *(++code))) printf(" [^%c]", c);
3267     else printf(" [^\\x%02x]", c);
3268     break;
3269    
3270     case OP_NOTSTAR:
3271     case OP_NOTMINSTAR:
3272     case OP_NOTPLUS:
3273     case OP_NOTMINPLUS:
3274     case OP_NOTQUERY:
3275     case OP_NOTMINQUERY:
3276     if (isprint(c = code[1])) printf(" [^%c]", c);
3277     else printf(" [^\\x%02x]", c);
3278     printf("%s", OP_names[*code++]);
3279     break;
3280    
3281     case OP_NOTEXACT:
3282     case OP_NOTUPTO:
3283     case OP_NOTMINUPTO:
3284     if (isprint(c = code[3])) printf(" [^%c]{", c);
3285     else printf(" [^\\x%02x]{", c);
3286     if (*code != OP_NOTEXACT) printf(",");
3287     printf("%d}", (code[1] << 8) + code[2]);
3288     if (*code == OP_NOTMINUPTO) printf("?");
3289     code += 3;
3290     break;
3291    
3292     case OP_REF:
3293     printf(" \\%d", *(++code));
3294 nigel 9 code ++;
3295     goto CLASS_REF_REPEAT;
3296 nigel 3
3297     case OP_CLASS:
3298     {
3299     int i, min, max;
3300 nigel 23 code++;
3301     printf(" [");
3302 nigel 3
3303     for (i = 0; i < 256; i++)
3304     {
3305     if ((code[i/8] & (1 << (i&7))) != 0)
3306     {
3307     int j;
3308     for (j = i+1; j < 256; j++)
3309     if ((code[j/8] & (1 << (j&7))) == 0) break;
3310     if (i == '-' || i == ']') printf("\\");
3311     if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
3312     if (--j > i)
3313     {
3314     printf("-");
3315     if (j == '-' || j == ']') printf("\\");
3316     if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
3317     }
3318     i = j;
3319     }
3320     }
3321     printf("]");
3322     code += 32;
3323    
3324 nigel 9 CLASS_REF_REPEAT:
3325    
3326 nigel 3 switch(*code)
3327     {
3328     case OP_CRSTAR:
3329     case OP_CRMINSTAR:
3330     case OP_CRPLUS:
3331     case OP_CRMINPLUS:
3332     case OP_CRQUERY:
3333     case OP_CRMINQUERY:
3334     printf("%s", OP_names[*code]);
3335     break;
3336    
3337     case OP_CRRANGE:
3338     case OP_CRMINRANGE:
3339     min = (code[1] << 8) + code[2];
3340     max = (code[3] << 8) + code[4];
3341     if (max == 0) printf("{%d,}", min);
3342     else printf("{%d,%d}", min, max);
3343     if (*code == OP_CRMINRANGE) printf("?");
3344     code += 4;
3345     break;
3346    
3347     default:
3348     code--;
3349     }
3350     }
3351     break;
3352    
3353     /* Anything else is just a one-node item */
3354    
3355     default:
3356     printf(" %s", OP_names[*code]);
3357     break;
3358     }
3359    
3360     code++;
3361     printf("\n");
3362     }
3363     printf("------------------------------------------------------------------\n");
3364    
3365     /* This check is done here in the debugging case so that the code that
3366     was compiled can be seen. */
3367    
3368     if (code - re->code > length)
3369     {
3370     *errorptr = ERR23;
3371     (pcre_free)(re);
3372     *erroroffset = ptr - (uschar *)pattern;
3373     return NULL;
3374     }
3375     #endif
3376    
3377     return (pcre *)re;
3378     }
3379    
3380    
3381    
3382     /*************************************************
3383     * Match a back-reference *
3384     *************************************************/
3385    
3386 nigel 23 /* If a back reference hasn't been set, the length that is passed is greater
3387     than the number of characters left in the string, so the match fails.
3388 nigel 3
3389     Arguments:
3390 nigel 23 offset index into the offset vector
3391 nigel 3 eptr points into the subject
3392     length length to be matched
3393     md points to match data block
3394 nigel 23 ims the ims flags
3395 nigel 3
3396     Returns: TRUE if matched
3397     */
3398    
3399     static BOOL
3400 nigel 23 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
3401 nigel 37 unsigned long int ims)
3402 nigel 3 {
3403 nigel 23 const uschar *p = md->start_subject + md->offset_vector[offset];
3404 nigel 3
3405     #ifdef DEBUG
3406     if (eptr >= md->end_subject)
3407     printf("matching subject <null>");
3408     else
3409     {
3410     printf("matching subject ");
3411     pchars(eptr, length, TRUE, md);
3412     }
3413     printf(" against backref ");
3414     pchars(p, length, FALSE, md);
3415     printf("\n");
3416     #endif
3417    
3418     /* Always fail if not enough characters left */
3419    
3420 nigel 23 if (length > md->end_subject - eptr) return FALSE;
3421 nigel 3
3422     /* Separate the caselesss case for speed */
3423    
3424 nigel 23 if ((ims & PCRE_CASELESS) != 0)
3425 nigel 25 {
3426     while (length-- > 0)
3427     if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
3428     }
3429 nigel 3 else
3430     { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
3431    
3432     return TRUE;
3433     }
3434    
3435    
3436    
3437     /*************************************************
3438     * Match from current position *
3439     *************************************************/
3440    
3441 nigel 23 /* On entry ecode points to the first opcode, and eptr to the first character
3442     in the subject string, while eptrb holds the value of eptr at the start of the
3443     last bracketed group - used for breaking infinite loops matching zero-length
3444     strings.
3445 nigel 3
3446     Arguments:
3447     eptr pointer in subject
3448     ecode position in code
3449     offset_top current top pointer
3450     md pointer to "static" info for the match
3451 nigel 23 ims current /i, /m, and /s options
3452 nigel 47 eptrb pointer to chain of blocks containing eptr at start of
3453     brackets - for testing for empty matches
3454     flags can contain
3455     match_condassert - this is an assertion condition
3456     match_isgroup - this is the start of a bracketed group
3457 nigel 3
3458     Returns: TRUE if matched
3459     */
3460    
3461     static BOOL
3462 nigel 23 match(register const uschar *eptr, register const uschar *ecode,
3463 nigel 47 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
3464     int flags)
3465 nigel 3 {
3466 nigel 37 unsigned long int original_ims = ims; /* Save for resetting on ')' */
3467 nigel 47 eptrblock newptrb;
3468 nigel 23
3469 nigel 47 /* At the start of a bracketed group, add the current subject pointer to the
3470     stack of such pointers, to be re-instated at the end of the group when we hit
3471     the closing ket. When match() is called in other circumstances, we don't add to
3472     the stack. */
3473    
3474     if ((flags & match_isgroup) != 0)
3475     {
3476     newptrb.prev = eptrb;
3477     newptrb.saved_eptr = eptr;
3478     eptrb = &newptrb;
3479     }
3480    
3481     /* Now start processing the operations. */
3482    
3483 nigel 3 for (;;)
3484     {
3485 nigel 23 int op = (int)*ecode;
3486 nigel 3 int min, max, ctype;
3487     register int i;
3488     register int c;
3489 nigel 7 BOOL minimize = FALSE;
3490 nigel 3
3491 nigel 23 /* Opening capturing bracket. If there is space in the offset vector, save
3492     the current subject position in the working slot at the top of the vector. We
3493     mustn't change the current values of the data slot, because they may be set
3494     from a previous iteration of this group, and be referred to by a reference
3495     inside the group.
3496 nigel 3
3497 nigel 23 If the bracket fails to match, we need to restore this value and also the
3498     values of the final offsets, in case they were set by a previous iteration of
3499     the same bracket.
3500    
3501     If there isn't enough space in the offset vector, treat this as if it were a
3502     non-capturing bracket. Don't worry about setting the flag for the error case
3503     here; that is handled in the code for KET. */
3504    
3505     if (op > OP_BRA)
3506 nigel 3 {
3507 nigel 23 int number = op - OP_BRA;
3508     int offset = number << 1;
3509 nigel 3
3510 nigel 31 #ifdef DEBUG
3511     printf("start bracket %d subject=", number);
3512     pchars(eptr, 16, TRUE, md);
3513     printf("\n");
3514     #endif
3515 nigel 3
3516 nigel 23 if (offset < md->offset_max)
3517 nigel 3 {
3518 nigel 23 int save_offset1 = md->offset_vector[offset];
3519     int save_offset2 = md->offset_vector[offset+1];
3520     int save_offset3 = md->offset_vector[md->offset_end - number];
3521 nigel 3
3522 nigel 23 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
3523     md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
3524    
3525     do
3526     {
3527 nigel 47 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3528     return TRUE;
3529 nigel 23 ecode += (ecode[1] << 8) + ecode[2];
3530     }
3531     while (*ecode == OP_ALT);
3532    
3533     DPRINTF(("bracket %d failed\n", number));
3534    
3535     md->offset_vector[offset] = save_offset1;
3536     md->offset_vector[offset+1] = save_offset2;
3537     md->offset_vector[md->offset_end - number] = save_offset3;
3538     return FALSE;
3539 nigel 3 }
3540    
3541 nigel 23 /* Insufficient room for saving captured contents */
3542 nigel 3
3543 nigel 23 else op = OP_BRA;
3544     }
3545    
3546     /* Other types of node can be handled by a switch */
3547    
3548     switch(op)
3549     {
3550     case OP_BRA: /* Non-capturing bracket: optimized */
3551     DPRINTF(("start bracket 0\n"));
3552 nigel 3 do
3553     {
3554 nigel 47 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3555     return TRUE;
3556 nigel 3 ecode += (ecode[1] << 8) + ecode[2];
3557     }
3558     while (*ecode == OP_ALT);
3559 nigel 23 DPRINTF(("bracket 0 failed\n"));
3560     return FALSE;
3561 nigel 3
3562 nigel 23 /* Conditional group: compilation checked that there are no more than
3563     two branches. If the condition is false, skipping the first branch takes us
3564     past the end if there is only one branch, but that's OK because that is
3565     exactly what going to the ket would do. */
3566 nigel 3
3567 nigel 23 case OP_COND:
3568     if (ecode[3] == OP_CREF) /* Condition is extraction test */
3569 nigel 3 {
3570 nigel 23 int offset = ecode[4] << 1; /* Doubled reference number */
3571     return match(eptr,
3572     ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
3573     5 : 3 + (ecode[1] << 8) + ecode[2]),
3574 nigel 47 offset_top, md, ims, eptrb, match_isgroup);
3575 nigel 3 }
3576    
3577 nigel 23 /* The condition is an assertion. Call match() to evaluate it - setting
3578     the final argument TRUE causes it to stop at the end of an assertion. */
3579 nigel 3
3580 nigel 23 else
3581     {
3582 nigel 47 if (match(eptr, ecode+3, offset_top, md, ims, NULL,
3583     match_condassert | match_isgroup))
3584 nigel 23 {
3585     ecode += 3 + (ecode[4] << 8) + ecode[5];
3586     while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
3587     }
3588     else ecode += (ecode[1] << 8) + ecode[2];
3589 nigel 47 return match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup);
3590 nigel 23 }
3591     /* Control never reaches here */
3592 nigel 3
3593 nigel 23 /* Skip over conditional reference data if encountered (should not be) */
3594    
3595     case OP_CREF:
3596     ecode += 2;
3597     break;
3598    
3599 nigel 37 /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
3600     an empty string - recursion will then try other alternatives, if any. */
3601 nigel 23
3602 nigel 3 case OP_END:
3603 nigel 37 if (md->notempty && eptr == md->start_match) return FALSE;
3604 nigel 3 md->end_match_ptr = eptr; /* Record where we ended */
3605     md->end_offset_top = offset_top; /* and how many extracts were taken */
3606     return TRUE;
3607    
3608 nigel 23 /* Change option settings */
3609 nigel 3
3610 nigel 23 case OP_OPT:
3611     ims = ecode[1];
3612     ecode += 2;
3613 nigel 39 DPRINTF(("ims set to %02lx\n", ims));
3614 nigel 23 break;
3615 nigel 3
3616     /* Assertion brackets. Check the alternative branches in turn - the
3617     matching won't pass the KET for an assertion. If any one branch matches,
3618 nigel 23 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
3619     start of each branch to move the current point backwards, so the code at
3620     this level is identical to the lookahead case. */
3621 nigel 3
3622     case OP_ASSERT:
3623 nigel 23 case OP_ASSERTBACK:
3624 nigel 3 do
3625     {
3626 nigel 47 if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup)) break;
3627 nigel 3 ecode += (ecode[1] << 8) + ecode[2];
3628     }
3629     while (*ecode == OP_ALT);
3630     if (*ecode == OP_KET) return FALSE;
3631    
3632 nigel 23 /* If checking an assertion for a condition, return TRUE. */
3633    
3634 nigel 47 if ((flags & match_condassert) != 0) return TRUE;
3635 nigel 23
3636 nigel 3 /* Continue from after the assertion, updating the offsets high water
3637     mark, since extracts may have been taken during the assertion. */
3638    
3639     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3640     ecode += 3;
3641     offset_top = md->end_offset_top;
3642     continue;
3643    
3644     /* Negative assertion: all branches must fail to match */
3645    
3646     case OP_ASSERT_NOT:
3647 nigel 23 case OP_ASSERTBACK_NOT:
3648 nigel 3 do
3649     {
3650 nigel 47 if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup))
3651     return FALSE;
3652 nigel 3 ecode += (ecode[1] << 8) + ecode[2];
3653     }
3654     while (*ecode == OP_ALT);
3655 nigel 23
3656 nigel 47 if ((flags & match_condassert) != 0) return TRUE;
3657    
3658 nigel 3 ecode += 3;
3659     continue;
3660    
3661 nigel 23 /* Move the subject pointer back. This occurs only at the start of
3662     each branch of a lookbehind assertion. If we are too close to the start to
3663 nigel 49 move back, this match function fails. When working with UTF-8 we move
3664     back a number of characters, not bytes. */
3665 nigel 23
3666     case OP_REVERSE:
3667 nigel 49 #ifdef SUPPORT_UTF8
3668     c = (ecode[1] << 8) + ecode[2];
3669     for (i = 0; i < c; i++)
3670     {
3671     eptr--;
3672     BACKCHAR(eptr)
3673     }
3674     #else
3675 nigel 23 eptr -= (ecode[1] << 8) + ecode[2];
3676 nigel 49 #endif
3677    
3678 nigel 23 if (eptr < md->start_subject) return FALSE;
3679     ecode += 3;
3680     break;
3681    
3682 nigel 43 /* Recursion matches the current regex, nested. If there are any capturing
3683     brackets started but not finished, we have to save their starting points
3684     and reinstate them after the recursion. However, we don't know how many
3685     such there are (offset_top records the completed total) so we just have
3686     to save all the potential data. There may be up to 99 such values, which
3687     is a bit large to put on the stack, but using malloc for small numbers
3688     seems expensive. As a compromise, the stack is used when there are fewer
3689     than 16 values to store; otherwise malloc is used. A problem is what to do
3690     if the malloc fails ... there is no way of returning to the top level with
3691     an error. Save the top 15 values on the stack, and accept that the rest
3692     may be wrong. */
3693 nigel 23
3694 nigel 43 case OP_RECURSE:
3695     {
3696     BOOL rc;
3697     int *save;
3698     int stacksave[15];
3699    
3700     c = md->offset_max;
3701    
3702     if (c < 16) save = stacksave; else
3703     {
3704     save = (int *)(pcre_malloc)((c+1) * sizeof(int));
3705     if (save == NULL)
3706     {
3707     save = stacksave;
3708     c = 15;
3709     }
3710     }
3711    
3712     for (i = 1; i <= c; i++)
3713     save[i] = md->offset_vector[md->offset_end - i];
3714 nigel 47 rc = match(eptr, md->start_pattern, offset_top, md, ims, eptrb,
3715     match_isgroup);
3716 nigel 43 for (i = 1; i <= c; i++)
3717     md->offset_vector[md->offset_end - i] = save[i];
3718     if (save != stacksave) (pcre_free)(save);
3719     if (!rc) return FALSE;
3720    
3721     /* In case the recursion has set more capturing values, save the final
3722     number, then move along the subject till after the recursive match,
3723     and advance one byte in the pattern code. */
3724    
3725     offset_top = md->end_offset_top;
3726     eptr = md->end_match_ptr;
3727     ecode++;
3728     }
3729     break;
3730    
3731 nigel 3 /* "Once" brackets are like assertion brackets except that after a match,
3732     the point in the subject string is not moved back. Thus there can never be
3733 nigel 5 a move back into the brackets. Check the alternative branches in turn - the
3734 nigel 3 matching won't pass the KET for this kind of subpattern. If any one branch
3735 nigel 23 matches, we carry on as at the end of a normal bracket, leaving the subject
3736     pointer. */
3737 nigel 3
3738     case OP_ONCE:
3739     {
3740 nigel 23 const uschar *prev = ecode;
3741 nigel 47 const uschar *saved_eptr = eptr;
3742 nigel 3
3743 nigel 23 do
3744     {
3745 nigel 47 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3746     break;
3747 nigel 23 ecode += (ecode[1] << 8) + ecode[2];
3748     }
3749     while (*ecode == OP_ALT);
3750 nigel 3
3751 nigel 23 /* If hit the end of the group (which could be repeated), fail */
3752 nigel 3
3753 nigel 23 if (*ecode != OP_ONCE && *ecode != OP_ALT) return FALSE;
3754    
3755     /* Continue as from after the assertion, updating the offsets high water
3756     mark, since extracts may have been taken. */
3757    
3758     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3759    
3760     offset_top = md->end_offset_top;
3761     eptr = md->end_match_ptr;
3762    
3763     /* For a non-repeating ket, just continue at this level. This also
3764     happens for a repeating ket if no characters were matched in the group.
3765     This is the forcible breaking of infinite loops as implemented in Perl
3766     5.005. If there is an options reset, it will get obeyed in the normal
3767     course of events. */
3768    
3769 nigel 47 if (*ecode == OP_KET || eptr == saved_eptr)
3770 nigel 23 {
3771     ecode += 3;
3772     break;
3773     }
3774    
3775     /* The repeating kets try the rest of the pattern or restart from the
3776     preceding bracket, in the appropriate order. We need to reset any options
3777     that changed within the bracket before re-running it, so check the next
3778     opcode. */
3779    
3780     if (ecode[3] == OP_OPT)
3781     {
3782     ims = (ims & ~PCRE_IMS) | ecode[4];
3783 nigel 39 DPRINTF(("ims set to %02lx at group repeat\n", ims));
3784 nigel 23 }
3785    
3786     if (*ecode == OP_KETRMIN)
3787     {
3788 nigel 47 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3789     match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3790     return TRUE;
3791 nigel 23 }
3792     else /* OP_KETRMAX */
3793     {
3794 nigel 47 if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3795     match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3796 nigel 23 }
3797     }
3798     return FALSE;
3799    
3800 nigel 3 /* An alternation is the end of a branch; scan along to find the end of the
3801     bracketed group and go to there. */
3802    
3803     case OP_ALT:
3804     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3805     break;
3806    
3807     /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3808     that it may occur zero times. It may repeat infinitely, or not at all -
3809     i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3810     repeat limits are compiled as a number of copies, with the optional ones
3811     preceded by BRAZERO or BRAMINZERO. */
3812    
3813     case OP_BRAZERO:
3814     {
3815 nigel 7 const uschar *next = ecode+1;
3816 nigel 47 if (match(eptr, next, offset_top, md, ims, eptrb, match_isgroup))
3817     return TRUE;
3818 nigel 3 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3819     ecode = next + 3;
3820     }
3821     break;
3822    
3823     case OP_BRAMINZERO:
3824     {
3825 nigel 7 const uschar *next = ecode+1;
3826 nigel 3 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3827 nigel 47 if (match(eptr, next+3, offset_top, md, ims, eptrb, match_isgroup))
3828     return TRUE;
3829 nigel 3 ecode++;
3830     }
3831 nigel 23 break;
3832 nigel 3
3833     /* End of a group, repeated or non-repeating. If we are at the end of
3834     an assertion "group", stop matching and return TRUE, but record the
3835 nigel 23 current high water mark for use by positive assertions. Do this also
3836     for the "once" (not-backup up) groups. */
3837 nigel 3
3838     case OP_KET:
3839     case OP_KETRMIN:
3840     case OP_KETRMAX:
3841     {
3842 nigel 7 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3843 nigel 47 const uschar *saved_eptr = eptrb->saved_eptr;
3844 nigel 3
3845 nigel 47 eptrb = eptrb->prev; /* Back up the stack of bracket start pointers */
3846    
3847 nigel 23 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
3848     *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
3849     *prev == OP_ONCE)
3850 nigel 3 {
3851     md->end_match_ptr = eptr; /* For ONCE */
3852     md->end_offset_top = offset_top;
3853     return TRUE;
3854     }
3855    
3856 nigel 23 /* In all other cases except a conditional group we have to check the
3857     group number back at the start and if necessary complete handling an
3858     extraction by setting the offsets and bumping the high water mark. */
3859 nigel 3
3860 nigel 23 if (*prev != OP_COND)
3861     {
3862     int number = *prev - OP_BRA;
3863     int offset = number << 1;
3864 nigel 3
3865 nigel 47 #ifdef DEBUG
3866     printf("end bracket %d", number);
3867     printf("\n");
3868     #endif
3869 nigel 3
3870 nigel 23 if (number > 0)
3871 nigel 3 {
3872 nigel 23 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
3873     {
3874     md->offset_vector[offset] =
3875     md->offset_vector[md->offset_end - number];
3876     md->offset_vector[offset+1] = eptr - md->start_subject;
3877     if (offset_top <= offset) offset_top = offset + 2;
3878     }
3879 nigel 3 }
3880     }
3881    
3882 nigel 23 /* Reset the value of the ims flags, in case they got changed during
3883     the group. */
3884 nigel 3
3885 nigel 23 ims = original_ims;
3886 nigel 39 DPRINTF(("ims reset to %02lx\n", ims));
3887 nigel 23
3888     /* For a non-repeating ket, just continue at this level. This also
3889     happens for a repeating ket if no characters were matched in the group.
3890     This is the forcible breaking of infinite loops as implemented in Perl
3891     5.005. If there is an options reset, it will get obeyed in the normal
3892     course of events. */
3893    
3894 nigel 47 if (*ecode == OP_KET || eptr == saved_eptr)
3895 nigel 3 {
3896     ecode += 3;
3897     break;
3898     }
3899    
3900     /* The repeating kets try the rest of the pattern or restart from the
3901     preceding bracket, in the appropriate order. */
3902    
3903     if (*ecode == OP_KETRMIN)
3904     {
3905 nigel 47 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3906     match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3907     return TRUE;
3908 nigel 3 }
3909     else /* OP_KETRMAX */
3910     {
3911 nigel 47 if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3912     match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3913 nigel 3 }
3914     }
3915     return FALSE;
3916    
3917     /* Start of subject unless notbol, or after internal newline if multiline */
3918    
3919     case OP_CIRC:
3920     if (md->notbol && eptr == md->start_subject) return FALSE;
3921 nigel 23 if ((ims & PCRE_MULTILINE) != 0)
3922 nigel 3 {
3923     if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;
3924     ecode++;
3925     break;
3926     }
3927     /* ... else fall through */
3928    
3929     /* Start of subject assertion */
3930    
3931     case OP_SOD:
3932     if (eptr != md->start_subject) return FALSE;
3933     ecode++;
3934     break;
3935    
3936 nigel 23 /* Assert before internal newline if multiline, or before a terminating
3937     newline unless endonly is set, else end of subject unless noteol is set. */
3938 nigel 3
3939     case OP_DOLL:
3940 nigel 23 if ((ims & PCRE_MULTILINE) != 0)
3941 nigel 3 {
3942 nigel 23 if (eptr < md->end_subject) { if (*eptr != '\n') return FALSE; }
3943     else { if (md->noteol) return FALSE; }
3944 nigel 3 ecode++;
3945     break;
3946     }
3947 nigel 23 else
3948 nigel 3 {
3949 nigel 23 if (md->noteol) return FALSE;
3950     if (!md->endonly)
3951     {
3952     if (eptr < md->end_subject - 1 ||
3953     (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3954    
3955     ecode++;
3956     break;
3957     }
3958 nigel 3 }
3959     /* ... else fall through */
3960    
3961 nigel 23 /* End of subject assertion (\z) */
3962 nigel 3
3963     case OP_EOD:
3964     if (eptr < md->end_subject) return FALSE;
3965     ecode++;
3966     break;
3967    
3968 nigel 23 /* End of subject or ending \n assertion (\Z) */
3969    
3970     case OP_EODN:
3971     if (eptr < md->end_subject - 1 ||
3972     (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3973     ecode++;
3974     break;
3975    
3976 nigel 3 /* Word boundary assertions */
3977    
3978     case OP_NOT_WORD_BOUNDARY:
3979     case OP_WORD_BOUNDARY:
3980     {
3981     BOOL prev_is_word = (eptr != md->start_subject) &&
3982 nigel 25 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
3983 nigel 3 BOOL cur_is_word = (eptr < md->end_subject) &&
3984 nigel 25 ((md->ctypes[*eptr] & ctype_word) != 0);
3985 nigel 3 if ((*ecode++ == OP_WORD_BOUNDARY)?
3986     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3987     return FALSE;
3988     }
3989     break;
3990    
3991     /* Match a single character type; inline for speed */
3992    
3993     case OP_ANY:
3994 nigel 23 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == '\n')
3995     return FALSE;
3996 nigel 3 if (eptr++ >= md->end_subject) return FALSE;
3997 nigel 49 #ifdef SUPPORT_UTF8
3998     if (md->utf8)
3999     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4000     #endif
4001 nigel 3 ecode++;
4002     break;
4003    
4004     case OP_NOT_DIGIT:
4005 nigel 25 if (eptr >= md->end_subject ||
4006     (md->ctypes[*eptr++] & ctype_digit) != 0)
4007 nigel 3 return FALSE;
4008     ecode++;
4009     break;
4010    
4011     case OP_DIGIT:
4012 nigel 25 if (eptr >= md->end_subject ||
4013     (md->ctypes[*eptr++] & ctype_digit) == 0)
4014 nigel 3 return FALSE;
4015     ecode++;
4016     break;
4017    
4018     case OP_NOT_WHITESPACE:
4019 nigel 25 if (eptr >= md->end_subject ||
4020     (md->ctypes[*eptr++] & ctype_space) != 0)
4021 nigel 3 return FALSE;
4022     ecode++;
4023     break;
4024    
4025     case OP_WHITESPACE:
4026 nigel 25 if (eptr >= md->end_subject ||
4027     (md->ctypes[*eptr++] & ctype_space) == 0)
4028 nigel 3 return FALSE;
4029     ecode++;
4030     break;
4031    
4032     case OP_NOT_WORDCHAR:
4033 nigel 25 if (eptr >= md->end_subject ||
4034     (md->ctypes[*eptr++] & ctype_word) != 0)
4035 nigel 3 return FALSE;
4036     ecode++;
4037     break;
4038    
4039     case OP_WORDCHAR:
4040 nigel 25 if (eptr >= md->end_subject ||
4041     (md->ctypes[*eptr++] & ctype_word) == 0)
4042 nigel 3 return FALSE;
4043     ecode++;
4044     break;
4045    
4046     /* Match a back reference, possibly repeatedly. Look past the end of the
4047     item to see if there is repeat information following. The code is similar
4048     to that for character classes, but repeated for efficiency. Then obey
4049     similar code to character type repeats - written out again for speed.
4050     However, if the referenced string is the empty string, always treat
4051     it as matched, any number of times (otherwise there could be infinite
4052     loops). */
4053    
4054     case OP_REF:
4055     {
4056     int length;
4057 nigel 23 int offset = ecode[1] << 1; /* Doubled reference number */
4058 nigel 3 ecode += 2; /* Advance past the item */
4059    
4060 nigel 23 /* If the reference is unset, set the length to be longer than the amount
4061     of subject left; this ensures that every attempt at a match fails. We
4062     can't just fail here, because of the possibility of quantifiers with zero
4063     minima. */
4064 nigel 3
4065 nigel 23 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
4066     md->end_subject - eptr + 1 :
4067     md->offset_vector[offset+1] - md->offset_vector[offset];
4068 nigel 3
4069 nigel 23 /* Set up for repetition, or handle the non-repeated case */
4070    
4071 nigel 3 switch (*ecode)
4072     {
4073     case OP_CRSTAR:
4074     case OP_CRMINSTAR:
4075     case OP_CRPLUS:
4076     case OP_CRMINPLUS:
4077     case OP_CRQUERY:
4078     case OP_CRMINQUERY:
4079     c = *ecode++ - OP_CRSTAR;
4080     minimize = (c & 1) != 0;
4081     min = rep_min[c]; /* Pick up values from tables; */
4082     max = rep_max[c]; /* zero for max => infinity */
4083     if (max == 0) max = INT_MAX;
4084     break;
4085    
4086     case OP_CRRANGE:
4087     case OP_CRMINRANGE:
4088     minimize = (*ecode == OP_CRMINRANGE);
4089     min = (ecode[1] << 8) + ecode[2];
4090     max = (ecode[3] << 8) + ecode[4];
4091     if (max == 0) max = INT_MAX;
4092     ecode += 5;
4093     break;
4094    
4095     default: /* No repeat follows */
4096 nigel 23 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
4097 nigel 3 eptr += length;
4098     continue; /* With the main loop */
4099     }
4100    
4101     /* If the length of the reference is zero, just continue with the
4102     main loop. */
4103    
4104     if (length == 0) continue;
4105    
4106     /* First, ensure the minimum number of matches are present. We get back
4107     the length of the reference string explicitly rather than passing the
4108     address of eptr, so that eptr can be a register variable. */
4109    
4110     for (i = 1; i <= min; i++)
4111     {
4112 nigel 23 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
4113 nigel 3 eptr += length;
4114     }
4115    
4116     /* If min = max, continue at the same level without recursion.
4117     They are not both allowed to be zero. */
4118    
4119     if (min == max) continue;
4120    
4121     /* If minimizing, keep trying and advancing the pointer */
4122    
4123     if (minimize)
4124     {
4125     for (i = min;; i++)
4126     {
4127 nigel 47 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4128 nigel 23 return TRUE;
4129     if (i >= max || !match_ref(offset, eptr, length, md, ims))
4130 nigel 3 return FALSE;
4131     eptr += length;
4132     }
4133     /* Control never gets here */
4134     }
4135    
4136     /* If maximizing, find the longest string and work backwards */
4137    
4138     else
4139     {
4140 nigel 7 const uschar *pp = eptr;
4141 nigel 3 for (i = min; i < max; i++)
4142     {
4143 nigel 23 if (!match_ref(offset, eptr, length, md, ims)) break;
4144 nigel 3 eptr += length;
4145     }
4146     while (eptr >= pp)
4147     {
4148 nigel 47 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4149 nigel 23 return TRUE;
4150 nigel 3 eptr -= length;
4151     }
4152     return FALSE;
4153     }
4154     }
4155     /* Control never gets here */
4156    
4157 nigel 23
4158    
4159 nigel 3 /* Match a character class, possibly repeatedly. Look past the end of the
4160     item to see if there is repeat information following. Then obey similar
4161 nigel 23 code to character type repeats - written out again for speed. */
4162 nigel 3
4163     case OP_CLASS:
4164     {
4165 nigel 7 const uschar *data = ecode + 1; /* Save for matching */
4166     ecode += 33; /* Advance past the item */
4167 nigel 3
4168     switch (*ecode)
4169     {
4170     case OP_CRSTAR:
4171     case OP_CRMINSTAR:
4172     case OP_CRPLUS:
4173     case OP_CRMINPLUS:
4174     case OP_CRQUERY:
4175     case OP_CRMINQUERY:
4176     c = *ecode++ - OP_CRSTAR;
4177     minimize = (c & 1) != 0;
4178     min = rep_min[c]; /* Pick up values from tables; */
4179     max = rep_max[c]; /* zero for max => infinity */
4180     if (max == 0) max = INT_MAX;
4181     break;
4182    
4183     case OP_CRRANGE:
4184     case OP_CRMINRANGE:
4185     minimize = (*ecode == OP_CRMINRANGE);
4186