/[pcre]/code/tags/pcre-4.0/pcre.c
ViewVC logotype

Contents of /code/tags/pcre-4.0/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 23 - (hide annotations) (download)
Sat Feb 24 21:38:41 2007 UTC (7 years, 7 months ago) by nigel
Original Path: code/trunk/pcre.c
File MIME type: text/plain
File size: 119644 byte(s)
Load pcre-2.00 into code/trunk.

1 nigel 3 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /*
6     This is a library of functions to support regular expressions whose syntax
7     and semantics are as close as possible to those of the Perl 5 language. See
8     the file Tech.Notes for some information on the internals.
9    
10     Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12 nigel 15 Copyright (c) 1998 University of Cambridge
13 nigel 3
14     -----------------------------------------------------------------------------
15     Permission is granted to anyone to use this software for any purpose on any
16     computer system, and to redistribute it freely, subject to the following
17     restrictions:
18    
19     1. This software is distributed in the hope that it will be useful,
20     but WITHOUT ANY WARRANTY; without even the implied warranty of
21     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22    
23     2. The origin of this software must not be misrepresented, either by
24     explicit claim or by omission.
25    
26     3. Altered versions must be plainly marked as such, and must not be
27     misrepresented as being the original software.
28     -----------------------------------------------------------------------------
29     */
30    
31    
32     /* Define DEBUG to get debugging output on stdout. */
33    
34     /* #define DEBUG */
35    
36 nigel 23 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
37     inline, and there are *still* stupid compilers about that don't like indented
38     pre-processor statements. I suppose it's only been 10 years... */
39 nigel 3
40 nigel 9 #ifdef DEBUG
41     #define DPRINTF(p) printf p
42     #else
43     #define DPRINTF(p) /*nothing*/
44     #endif
45    
46 nigel 3 /* Include the internals header, which itself includes Standard C headers plus
47     the external pcre header. */
48    
49     #include "internal.h"
50    
51    
52 nigel 15 /* Allow compilation as C++ source code, should anybody want to do that. */
53    
54     #ifdef __cplusplus
55     #define class pcre_class
56     #endif
57    
58    
59 nigel 23 /* Number of items on the nested bracket stacks at compile time. This should
60     not be set greater than 200. */
61    
62     #define BRASTACK_SIZE 200
63    
64    
65 nigel 3 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
66    
67 nigel 15 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
68     static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
69 nigel 3
70 nigel 13 /* Text forms of OP_ values and things, for debugging (not all used) */
71 nigel 3
72     #ifdef DEBUG
73 nigel 7 static const char *OP_names[] = {
74     "End", "\\A", "\\B", "\\b", "\\D", "\\d",
75 nigel 23 "\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",
76     "Opt", "^", "$", "Any", "chars", "not",
77 nigel 3 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
78     "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
79     "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
80     "*", "*?", "+", "+?", "?", "??", "{", "{",
81 nigel 23 "class", "Ref",
82     "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
83     "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
84 nigel 3 "Brazero", "Braminzero", "Bra"
85     };
86     #endif
87    
88     /* Table for handling escaped characters in the range '0'-'z'. Positive returns
89     are simple data values; negative values are for special things like \d and so
90     on. Zero means further processing is needed (for things like \x), or the escape
91     is invalid. */
92    
93 nigel 15 static const short int escapes[] = {
94 nigel 3 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
95     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
96     '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
97     0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
98     0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
99     0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
100     '`', 7, -ESC_b, 0, -ESC_d, 27, '\f', 0, /* ` - g */
101     0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
102     0, 0, '\r', -ESC_s, '\t', 0, 0, -ESC_w, /* p - w */
103 nigel 23 0, 0, -ESC_z /* x - z */
104 nigel 3 };
105    
106     /* Definition to allow mutual recursion */
107    
108 nigel 13 static BOOL
109 nigel 23 compile_regex(int, int, int *, uschar **, const uschar **, const char **,
110     BOOL, int);
111 nigel 3
112     /* Structure for passing "static" information around between the functions
113     doing the matching, so that they are thread-safe. */
114    
115     typedef struct match_data {
116     int errorcode; /* As it says */
117     int *offset_vector; /* Offset vector */
118     int offset_end; /* One past the end */
119 nigel 23 int offset_max; /* The maximum usable for return data */
120 nigel 3 BOOL offset_overflow; /* Set if too many extractions */
121     BOOL notbol; /* NOTBOL flag */
122     BOOL noteol; /* NOTEOL flag */
123     BOOL endonly; /* Dollar not before final \n */
124 nigel 7 const uschar *start_subject; /* Start of the subject string */
125     const uschar *end_subject; /* End of the subject string */
126     const uschar *end_match_ptr; /* Subject position at end match */
127 nigel 3 int end_offset_top; /* Highwater mark at end of match */
128     } match_data;
129    
130    
131    
132     /*************************************************
133     * Global variables *
134     *************************************************/
135    
136     /* PCRE is thread-clean and doesn't use any global variables in the normal
137     sense. However, it calls memory allocation and free functions via the two
138     indirections below, which are can be changed by the caller, but are shared
139     between all threads. */
140    
141     void *(*pcre_malloc)(size_t) = malloc;
142     void (*pcre_free)(void *) = free;
143    
144    
145    
146    
147     /*************************************************
148     * Return version string *
149     *************************************************/
150    
151 nigel 7 const char *
152 nigel 3 pcre_version(void)
153     {
154     return PCRE_VERSION;
155     }
156    
157    
158    
159    
160     /*************************************************
161     * Return info about a compiled pattern *
162     *************************************************/
163    
164     /* This function picks potentially useful data out of the private
165     structure.
166    
167     Arguments:
168     external_re points to compiled code
169     optptr where to pass back the options
170     first_char where to pass back the first character,
171     or -1 if multiline and all branches start ^,
172     or -2 otherwise
173    
174     Returns: number of identifying extraction brackets
175     or negative values on error
176     */
177    
178     int
179     pcre_info(const pcre *external_re, int *optptr, int *first_char)
180     {
181 nigel 7 const real_pcre *re = (const real_pcre *)external_re;
182 nigel 3 if (re == NULL) return PCRE_ERROR_NULL;
183     if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
184     if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
185     if (first_char != NULL)
186     *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
187     ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
188     return re->top_bracket;
189     }
190    
191    
192    
193    
194     #ifdef DEBUG
195     /*************************************************
196     * Debugging function to print chars *
197     *************************************************/
198    
199     /* Print a sequence of chars in printable format, stopping at the end of the
200     subject if the requested.
201    
202     Arguments:
203     p points to characters
204     length number to print
205     is_subject TRUE if printing from within md->start_subject
206     md pointer to matching data block, if is_subject is TRUE
207    
208     Returns: nothing
209     */
210    
211 nigel 9 static void
212     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
213 nigel 3 {
214     int c;
215     if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
216     while (length-- > 0)
217     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
218     }
219     #endif
220    
221    
222    
223    
224     /*************************************************
225     * Handle escapes *
226     *************************************************/
227    
228     /* This function is called when a \ has been encountered. It either returns a
229     positive value for a simple escape such as \n, or a negative value which
230     encodes one of the more complicated things such as \d. On entry, ptr is
231     pointing at the \. On exit, it is on the final character of the escape
232     sequence.
233    
234     Arguments:
235     ptrptr points to the pattern position pointer
236     errorptr points to the pointer to the error message
237     bracount number of previous extracting brackets
238     options the options bits
239     isclass TRUE if inside a character class
240    
241     Returns: zero or positive => a data character
242     negative => a special escape sequence
243     on error, errorptr is set
244     */
245    
246     static int
247 nigel 7 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
248     int options, BOOL isclass)
249 nigel 3 {
250 nigel 7 const uschar *ptr = *ptrptr;
251 nigel 3 int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
252     int i;
253    
254     if (c == 0) *errorptr = ERR1;
255    
256     /* Digits or letters may have special meaning; all others are literals. */
257    
258     else if (c < '0' || c > 'z') {}
259    
260     /* Do an initial lookup in a table. A non-zero result is something that can be
261     returned immediately. Otherwise further processing may be required. */
262    
263     else if ((i = escapes[c - '0']) != 0) c = i;
264    
265     /* Escapes that need further processing, or are illegal. */
266    
267     else
268     {
269 nigel 7 const uschar *oldptr;
270 nigel 3 switch (c)
271     {
272     /* The handling of escape sequences consisting of a string of digits
273     starting with one that is not zero is not straightforward. By experiment,
274     the way Perl works seems to be as follows:
275    
276     Outside a character class, the digits are read as a decimal number. If the
277     number is less than 10, or if there are that many previous extracting
278     left brackets, then it is a back reference. Otherwise, up to three octal
279     digits are read to form an escaped byte. Thus \123 is likely to be octal
280     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
281     value is greater than 377, the least significant 8 bits are taken. Inside a
282     character class, \ followed by a digit is always an octal number. */
283    
284     case '1': case '2': case '3': case '4': case '5':
285     case '6': case '7': case '8': case '9':
286    
287     if (!isclass)
288     {
289     oldptr = ptr;
290     c -= '0';
291     while ((pcre_ctypes[ptr[1]] & ctype_digit) != 0)
292     c = c * 10 + *(++ptr) - '0';
293     if (c < 10 || c <= bracount)
294     {
295     c = -(ESC_REF + c);
296     break;
297     }
298     ptr = oldptr; /* Put the pointer back and fall through */
299     }
300    
301     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
302     generates a binary zero byte and treats the digit as a following literal.
303     Thus we have to pull back the pointer by one. */
304    
305     if ((c = *ptr) >= '8')
306     {
307     ptr--;
308     c = 0;
309     break;
310     }
311    
312     /* \0 always starts an octal number, but we may drop through to here with a
313     larger first octal digit */
314    
315     case '0':
316     c -= '0';
317     while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 &&
318     ptr[1] != '8' && ptr[1] != '9')
319     c = c * 8 + *(++ptr) - '0';
320     break;
321    
322     /* Special escapes not starting with a digit are straightforward */
323    
324     case 'x':
325     c = 0;
326     while (i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0)
327     {
328     ptr++;
329     c = c * 16 + pcre_lcc[*ptr] -
330     (((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
331     }
332     break;
333    
334     case 'c':
335     c = *(++ptr);
336     if (c == 0)
337     {
338     *errorptr = ERR2;
339     return 0;
340     }
341    
342     /* A letter is upper-cased; then the 0x40 bit is flipped */
343    
344     if (c >= 'a' && c <= 'z') c = pcre_fcc[c];
345     c ^= 0x40;
346     break;
347    
348     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
349     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
350     for Perl compatibility, it is a literal. */
351    
352     default:
353     if ((options & PCRE_EXTRA) != 0) switch(c)
354     {
355     default:
356     *errorptr = ERR3;
357     break;
358     }
359     break;
360     }
361     }
362    
363     *ptrptr = ptr;
364     return c;
365     }
366    
367    
368    
369     /*************************************************
370     * Check for counted repeat *
371     *************************************************/
372    
373     /* This function is called when a '{' is encountered in a place where it might
374     start a quantifier. It looks ahead to see if it really is a quantifier or not.
375     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
376     where the ddds are digits.
377    
378     Arguments:
379     p pointer to the first char after '{'
380    
381     Returns: TRUE or FALSE
382     */
383    
384     static BOOL
385 nigel 7 is_counted_repeat(const uschar *p)
386 nigel 3 {
387     if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
388     while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
389     if (*p == '}') return TRUE;
390    
391     if (*p++ != ',') return FALSE;
392     if (*p == '}') return TRUE;
393    
394     if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
395     while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
396     return (*p == '}');
397     }
398    
399    
400    
401     /*************************************************
402     * Read repeat counts *
403     *************************************************/
404    
405     /* Read an item of the form {n,m} and return the values. This is called only
406     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
407     so the syntax is guaranteed to be correct, but we need to check the values.
408    
409     Arguments:
410     p pointer to first char after '{'
411     minp pointer to int for min
412     maxp pointer to int for max
413     returned as -1 if no max
414     errorptr points to pointer to error message
415    
416     Returns: pointer to '}' on success;
417     current ptr on error, with errorptr set
418     */
419    
420 nigel 7 static const uschar *
421     read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
422 nigel 3 {
423     int min = 0;
424     int max = -1;
425    
426     while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
427    
428     if (*p == '}') max = min; else
429     {
430     if (*(++p) != '}')
431     {
432     max = 0;
433     while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
434     if (max < min)
435     {
436     *errorptr = ERR4;
437     return p;
438     }
439     }
440     }
441    
442     /* Do paranoid checks, then fill in the required variables, and pass back the
443     pointer to the terminating '}'. */
444    
445     if (min > 65535 || max > 65535)
446     *errorptr = ERR5;
447     else
448     {
449     *minp = min;
450     *maxp = max;
451     }
452     return p;
453     }
454    
455    
456    
457     /*************************************************
458 nigel 23 * Find the fixed length of a pattern *
459     *************************************************/
460    
461     /* Scan a pattern and compute the fixed length of subject that will match it,
462     if the length is fixed. This is needed for dealing with backward assertions.
463    
464     Arguments:
465     code points to the start of the pattern (the bracket)
466    
467     Returns: the fixed length, or -1 if there is no fixed length
468     */
469    
470     static int
471     find_fixedlength(uschar *code)
472     {
473     int length = -1;
474    
475     register int branchlength = 0;
476     register uschar *cc = code + 3;
477    
478     /* Scan along the opcodes for this branch. If we get to the end of the
479     branch, check the length against that of the other branches. */
480    
481     for (;;)
482     {
483     int d;
484     register int op = *cc;
485     if (op >= OP_BRA) op = OP_BRA;
486    
487     switch (op)
488     {
489     case OP_BRA:
490     case OP_ONCE:
491     case OP_COND:
492     d = find_fixedlength(cc);
493     if (d < 0) return -1;
494     branchlength += d;
495     do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
496     cc += 3;
497     break;
498    
499     /* Reached end of a branch; if it's a ket it is the end of a nested
500     call. If it's ALT it is an alternation in a nested call. If it is
501     END it's the end of the outer call. All can be handled by the same code. */
502    
503     case OP_ALT:
504     case OP_KET:
505     case OP_KETRMAX:
506     case OP_KETRMIN:
507     case OP_END:
508     if (length < 0) length = branchlength;
509     else if (length != branchlength) return -1;
510     if (*cc != OP_ALT) return length;
511     cc += 3;
512     branchlength = 0;
513     break;
514    
515     /* Skip over assertive subpatterns */
516    
517     case OP_ASSERT:
518     case OP_ASSERT_NOT:
519     case OP_ASSERTBACK:
520     case OP_ASSERTBACK_NOT:
521     do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
522     cc += 3;
523     break;
524    
525     /* Skip over things that don't match chars */
526    
527     case OP_REVERSE:
528     cc++;
529    
530     case OP_CREF:
531     case OP_OPT:
532     cc++;
533     /* Fall through */
534    
535     case OP_SOD:
536     case OP_EOD:
537     case OP_EODN:
538     case OP_CIRC:
539     case OP_DOLL:
540     case OP_NOT_WORD_BOUNDARY:
541     case OP_WORD_BOUNDARY:
542     cc++;
543     break;
544    
545     /* Handle char strings */
546    
547     case OP_CHARS:
548     branchlength += *(++cc);
549     cc += *cc + 1;
550     break;
551    
552     /* Handle exact repetitions */
553    
554     case OP_EXACT:
555     case OP_TYPEEXACT:
556     branchlength += (cc[1] << 8) + cc[2];
557     cc += 4;
558     break;
559    
560     /* Handle single-char matchers */
561    
562     case OP_NOT_DIGIT:
563     case OP_DIGIT:
564     case OP_NOT_WHITESPACE:
565     case OP_WHITESPACE:
566     case OP_NOT_WORDCHAR:
567     case OP_WORDCHAR:
568     case OP_ANY:
569     branchlength++;
570     cc++;
571     break;
572    
573    
574     /* Check a class for variable quantification */
575    
576     case OP_CLASS:
577     cc += (*cc == OP_REF)? 2 : 33;
578    
579     switch (*cc)
580     {
581     case OP_CRSTAR:
582     case OP_CRMINSTAR:
583     case OP_CRQUERY:
584     case OP_CRMINQUERY:
585     return -1;
586    
587     case OP_CRRANGE:
588     case OP_CRMINRANGE:
589     if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;
590     branchlength += (cc[1] << 8) + cc[2];
591     cc += 5;
592     break;
593    
594     default:
595     branchlength++;
596     }
597     break;
598    
599     /* Anything else is variable length */
600    
601     default:
602     return -1;
603     }
604     }
605     /* Control never gets here */
606     }
607    
608    
609    
610    
611     /*************************************************
612 nigel 3 * Compile one branch *
613     *************************************************/
614    
615     /* Scan the pattern, compiling it into the code vector.
616    
617     Arguments:
618 nigel 23 options the option bits
619     brackets points to number of brackets used
620     code points to the pointer to the current code point
621     ptrptr points to the current pattern pointer
622     errorptr points to pointer to error message
623     optchanged set to the value of the last OP_OPT item compiled
624 nigel 3
625 nigel 23 Returns: TRUE on success
626     FALSE, with *errorptr set on error
627 nigel 3 */
628    
629     static BOOL
630 nigel 7 compile_branch(int options, int *brackets, uschar **codeptr,
631 nigel 23 const uschar **ptrptr, const char **errorptr, int *optchanged)
632 nigel 3 {
633     int repeat_type, op_type;
634     int repeat_min, repeat_max;
635     int bravalue, length;
636 nigel 19 int greedy_default, greedy_non_default;
637 nigel 3 register int c;
638     register uschar *code = *codeptr;
639 nigel 23 uschar *tempcode;
640 nigel 7 const uschar *ptr = *ptrptr;
641 nigel 23 const uschar *tempptr;
642 nigel 3 uschar *previous = NULL;
643     uschar class[32];
644    
645 nigel 19 /* Set up the default and non-default settings for greediness */
646    
647     greedy_default = ((options & PCRE_UNGREEDY) != 0);
648     greedy_non_default = greedy_default ^ 1;
649    
650 nigel 3 /* Switch on next character until the end of the branch */
651    
652     for (;; ptr++)
653     {
654     BOOL negate_class;
655 nigel 23 int class_charcount;
656     int class_lastchar;
657     int newoptions;
658     int condref;
659 nigel 3
660     c = *ptr;
661     if ((options & PCRE_EXTENDED) != 0)
662     {
663     if ((pcre_ctypes[c] & ctype_space) != 0) continue;
664     if (c == '#')
665     {
666     while ((c = *(++ptr)) != 0 && c != '\n');
667     continue;
668     }
669     }
670    
671     switch(c)
672     {
673     /* The branch terminates at end of string, |, or ). */
674    
675     case 0:
676     case '|':
677     case ')':
678     *codeptr = code;
679     *ptrptr = ptr;
680     return TRUE;
681    
682     /* Handle single-character metacharacters */
683    
684     case '^':
685     previous = NULL;
686     *code++ = OP_CIRC;
687     break;
688    
689     case '$':
690     previous = NULL;
691     *code++ = OP_DOLL;
692     break;
693    
694     case '.':
695     previous = code;
696     *code++ = OP_ANY;
697     break;
698    
699     /* Character classes. These always build a 32-byte bitmap of the permitted
700     characters, except in the special case where there is only one character.
701     For negated classes, we build the map as usual, then invert it at the end.
702     */
703    
704     case '[':
705     previous = code;
706 nigel 23 *code++ = OP_CLASS;
707 nigel 3
708 nigel 23 /* If the first character is '^', set the negation flag and skip it. */
709 nigel 3
710     if ((c = *(++ptr)) == '^')
711     {
712     negate_class = TRUE;
713     c = *(++ptr);
714     }
715 nigel 23 else negate_class = FALSE;
716 nigel 3
717     /* Keep a count of chars so that we can optimize the case of just a single
718     character. */
719    
720     class_charcount = 0;
721     class_lastchar = -1;
722    
723     /* Initialize the 32-char bit map to all zeros. We have to build the
724     map in a temporary bit of store, in case the class contains only 1
725     character, because in that case the compiled code doesn't use the
726     bit map. */
727    
728     memset(class, 0, 32 * sizeof(uschar));
729    
730     /* Process characters until ] is reached. By writing this as a "do" it
731     means that an initial ] is taken as a data character. */
732    
733     do
734     {
735     if (c == 0)
736     {
737     *errorptr = ERR6;
738     goto FAILED;
739     }
740    
741     /* Backslash may introduce a single character, or it may introduce one
742     of the specials, which just set a flag. Escaped items are checked for
743     validity in the pre-compiling pass. The sequence \b is a special case.
744 nigel 7 Inside a class (and only there) it is treated as backspace. Elsewhere
745 nigel 3 it marks a word boundary. Other escapes have preset maps ready to
746     or into the one we are building. We assume they have more than one
747     character in them, so set class_count bigger than one. */
748    
749     if (c == '\\')
750     {
751     c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
752     if (-c == ESC_b) c = '\b';
753     else if (c < 0)
754     {
755     class_charcount = 10;
756     switch (-c)
757     {
758     case ESC_d:
759     for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit];
760     continue;
761    
762     case ESC_D:
763     for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit];
764     continue;
765    
766     case ESC_w:
767     for (c = 0; c < 32; c++)
768     class[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
769     continue;
770    
771     case ESC_W:
772     for (c = 0; c < 32; c++)
773     class[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
774     continue;
775    
776     case ESC_s:
777     for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space];
778     continue;
779    
780     case ESC_S:
781     for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space];
782     continue;
783    
784     default:
785     *errorptr = ERR7;
786     goto FAILED;
787     }
788     }
789     /* Fall through if single character */
790     }
791    
792     /* A single character may be followed by '-' to form a range. However,
793     Perl does not permit ']' to be the end of the range. A '-' character
794     here is treated as a literal. */
795    
796     if (ptr[1] == '-' && ptr[2] != ']')
797     {
798     int d;
799     ptr += 2;
800     d = *ptr;
801    
802     if (d == 0)
803     {
804     *errorptr = ERR6;
805     goto FAILED;
806     }
807    
808     /* The second part of a range can be a single-character escape, but
809     not any of the other escapes. */
810    
811     if (d == '\\')
812     {
813     d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
814     if (d < 0)
815     {
816     if (d == -ESC_b) d = '\b'; else
817     {
818     *errorptr = ERR7;
819     goto FAILED;
820     }
821     }
822     }
823    
824     if (d < c)
825     {
826     *errorptr = ERR8;
827     goto FAILED;
828     }
829    
830     for (; c <= d; c++)
831     {
832     class[c/8] |= (1 << (c&7));
833     if ((options & PCRE_CASELESS) != 0)
834     {
835     int uc = pcre_fcc[c]; /* flip case */
836     class[uc/8] |= (1 << (uc&7));
837     }
838     class_charcount++; /* in case a one-char range */
839     class_lastchar = c;
840     }
841     continue; /* Go get the next char in the class */
842     }
843    
844     /* Handle a lone single character - we can get here for a normal
845     non-escape char, or after \ that introduces a single character. */
846    
847     class [c/8] |= (1 << (c&7));
848     if ((options & PCRE_CASELESS) != 0)
849     {
850     c = pcre_fcc[c]; /* flip case */
851     class[c/8] |= (1 << (c&7));
852     }
853     class_charcount++;
854     class_lastchar = c;
855     }
856    
857     /* Loop until ']' reached; the check for end of string happens inside the
858     loop. This "while" is the end of the "do" above. */
859    
860     while ((c = *(++ptr)) != ']');
861    
862     /* If class_charcount is 1 and class_lastchar is not negative, we saw
863     precisely one character. This doesn't need the whole 32-byte bit map.
864     We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
865     it's negative. */
866    
867     if (class_charcount == 1 && class_lastchar >= 0)
868     {
869     if (negate_class)
870     {
871     code[-1] = OP_NOT;
872     }
873     else
874     {
875     code[-1] = OP_CHARS;
876     *code++ = 1;
877     }
878     *code++ = class_lastchar;
879     }
880    
881     /* Otherwise, negate the 32-byte map if necessary, and copy it into
882     the code vector. */
883    
884     else
885     {
886     if (negate_class)
887     for (c = 0; c < 32; c++) code[c] = ~class[c];
888     else
889     memcpy(code, class, 32);
890     code += 32;
891     }
892     break;
893    
894     /* Various kinds of repeat */
895    
896     case '{':
897     if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
898     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
899     if (*errorptr != NULL) goto FAILED;
900     goto REPEAT;
901    
902     case '*':
903     repeat_min = 0;
904     repeat_max = -1;
905     goto REPEAT;
906    
907     case '+':
908     repeat_min = 1;
909     repeat_max = -1;
910     goto REPEAT;
911    
912     case '?':
913     repeat_min = 0;
914     repeat_max = 1;
915    
916     REPEAT:
917     if (previous == NULL)
918     {
919     *errorptr = ERR9;
920     goto FAILED;
921     }
922    
923 nigel 19 /* If the next character is '?' this is a minimizing repeat, by default,
924     but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
925 nigel 3 next character. */
926    
927 nigel 19 if (ptr[1] == '?')
928     { repeat_type = greedy_non_default; ptr++; }
929     else repeat_type = greedy_default;
930 nigel 3
931     /* If the maximum is zero then the minimum must also be zero; Perl allows
932     this case, so we do too - by simply omitting the item altogether. */
933    
934     if (repeat_max == 0) code = previous;
935    
936     /* If previous was a string of characters, chop off the last one and use it
937     as the subject of the repeat. If there was only one character, we can
938     abolish the previous item altogether. */
939    
940     else if (*previous == OP_CHARS)
941     {
942     int len = previous[1];
943     if (len == 1)
944     {
945     c = previous[2];
946     code = previous;
947     }
948     else
949     {
950     c = previous[len+1];
951     previous[1]--;
952     code--;
953     }
954     op_type = 0; /* Use single-char op codes */
955     goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
956     }
957    
958     /* If previous was a single negated character ([^a] or similar), we use
959     one of the special opcodes, replacing it. The code is shared with single-
960     character repeats by adding a suitable offset into repeat_type. */
961    
962     else if ((int)*previous == OP_NOT)
963     {
964     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
965     c = previous[1];
966     code = previous;
967     goto OUTPUT_SINGLE_REPEAT;
968     }
969    
970     /* If previous was a character type match (\d or similar), abolish it and
971     create a suitable repeat item. The code is shared with single-character
972     repeats by adding a suitable offset into repeat_type. */
973    
974 nigel 23 else if ((int)*previous < OP_EODN || *previous == OP_ANY)
975 nigel 3 {
976     op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
977     c = *previous;
978     code = previous;
979    
980     OUTPUT_SINGLE_REPEAT:
981     repeat_type += op_type; /* Combine both values for many cases */
982    
983     /* A minimum of zero is handled either as the special case * or ?, or as
984     an UPTO, with the maximum given. */
985    
986     if (repeat_min == 0)
987     {
988     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
989     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
990     else
991     {
992     *code++ = OP_UPTO + repeat_type;
993     *code++ = repeat_max >> 8;
994     *code++ = (repeat_max & 255);
995     }
996     }
997    
998     /* The case {1,} is handled as the special case + */
999    
1000     else if (repeat_min == 1 && repeat_max == -1)
1001     *code++ = OP_PLUS + repeat_type;
1002    
1003     /* The case {n,n} is just an EXACT, while the general case {n,m} is
1004     handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1005    
1006     else
1007     {
1008     if (repeat_min != 1)
1009     {
1010     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1011     *code++ = repeat_min >> 8;
1012     *code++ = (repeat_min & 255);
1013     }
1014    
1015     /* If the mininum is 1 and the previous item was a character string,
1016     we either have to put back the item that got cancelled if the string
1017     length was 1, or add the character back onto the end of a longer
1018 nigel 21 string. For a character type nothing need be done; it will just get
1019     put back naturally. Note that the final character is always going to
1020     get added below. */
1021 nigel 3
1022     else if (*previous == OP_CHARS)
1023     {
1024     if (code == previous) code += 2; else previous[1]++;
1025     }
1026    
1027 nigel 21 /* For a single negated character we also have to put back the
1028     item that got cancelled. */
1029    
1030     else if (*previous == OP_NOT) code++;
1031    
1032 nigel 9 /* If the maximum is unlimited, insert an OP_STAR. */
1033 nigel 3
1034 nigel 9 if (repeat_max < 0)
1035 nigel 3 {
1036     *code++ = c;
1037 nigel 9 *code++ = OP_STAR + repeat_type;
1038     }
1039    
1040     /* Else insert an UPTO if the max is greater than the min. */
1041    
1042     else if (repeat_max != repeat_min)
1043     {
1044     *code++ = c;
1045 nigel 3 repeat_max -= repeat_min;
1046     *code++ = OP_UPTO + repeat_type;
1047     *code++ = repeat_max >> 8;
1048     *code++ = (repeat_max & 255);
1049     }
1050     }
1051    
1052     /* The character or character type itself comes last in all cases. */
1053    
1054     *code++ = c;
1055     }
1056    
1057     /* If previous was a character class or a back reference, we put the repeat
1058     stuff after it. */
1059    
1060 nigel 23 else if (*previous == OP_CLASS || *previous == OP_REF)
1061 nigel 3 {
1062     if (repeat_min == 0 && repeat_max == -1)
1063     *code++ = OP_CRSTAR + repeat_type;
1064     else if (repeat_min == 1 && repeat_max == -1)
1065     *code++ = OP_CRPLUS + repeat_type;
1066     else if (repeat_min == 0 && repeat_max == 1)
1067     *code++ = OP_CRQUERY + repeat_type;
1068     else
1069     {
1070     *code++ = OP_CRRANGE + repeat_type;
1071     *code++ = repeat_min >> 8;
1072     *code++ = repeat_min & 255;
1073     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1074     *code++ = repeat_max >> 8;
1075     *code++ = repeat_max & 255;
1076     }
1077     }
1078    
1079     /* If previous was a bracket group, we may have to replicate it in certain
1080 nigel 23 cases. */
1081 nigel 3
1082 nigel 23 else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
1083     (int)*previous == OP_COND)
1084 nigel 3 {
1085 nigel 23 int i, ketoffset = 0;
1086 nigel 9 int len = code - previous;
1087 nigel 3
1088 nigel 23 /* If the maximum repeat count is unlimited, find the end of the bracket
1089     by scanning through from the start, and compute the offset back to it
1090     from the current code pointer. There may be an OP_OPT setting following
1091     the final KET, so we can't find the end just by going back from the code
1092     pointer. */
1093    
1094     if (repeat_max == -1)
1095 nigel 3 {
1096 nigel 23 register uschar *ket = previous;
1097     do ket += (ket[1] << 8) + ket[2]; while (*ket != OP_KET);
1098     ketoffset = code - ket;
1099 nigel 3 }
1100    
1101     /* If the minimum is greater than zero, and the maximum is unlimited or
1102     equal to the minimum, the first copy remains where it is, and is
1103     replicated up to the minimum number of times. This case includes the +
1104     repeat, but of course no replication is needed in that case. */
1105    
1106     if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1107     {
1108     for (i = 1; i < repeat_min; i++)
1109     {
1110 nigel 9 memcpy(code, previous, len);
1111     code += len;
1112 nigel 3 }
1113     }
1114    
1115     /* If the minimum is zero, stick BRAZERO in front of the first copy.
1116     Then, if there is a fixed upper limit, replicated up to that many times,
1117     sticking BRAZERO in front of all the optional ones. */
1118    
1119     else
1120     {
1121     if (repeat_min == 0)
1122     {
1123 nigel 9 memmove(previous+1, previous, len);
1124 nigel 3 code++;
1125     *previous++ = OP_BRAZERO + repeat_type;
1126     }
1127    
1128     for (i = 1; i < repeat_min; i++)
1129     {
1130 nigel 9 memcpy(code, previous, len);
1131     code += len;
1132 nigel 3 }
1133    
1134     for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1135     {
1136     *code++ = OP_BRAZERO + repeat_type;
1137 nigel 9 memcpy(code, previous, len);
1138     code += len;
1139 nigel 3 }
1140     }
1141    
1142 nigel 23 /* If the maximum is unlimited, set a repeater in the final copy. We
1143     can't just offset backwards from the current code point, because we
1144     don't know if there's been an options resetting after the ket. The
1145     correct offset was computed above. */
1146 nigel 3
1147 nigel 23 if (repeat_max == -1) code[-ketoffset] = OP_KETRMAX + repeat_type;
1148 nigel 3 }
1149    
1150     /* Else there's some kind of shambles */
1151    
1152     else
1153     {
1154     *errorptr = ERR11;
1155     goto FAILED;
1156     }
1157    
1158     /* In all case we no longer have a previous item. */
1159    
1160     previous = NULL;
1161     break;
1162    
1163    
1164 nigel 23 /* Start of nested bracket sub-expression, or comment or lookahead or
1165     lookbehind or option setting or condition. First deal with special things
1166     that can come after a bracket; all are introduced by ?, and the appearance
1167     of any of them means that this is not a referencing group. They were
1168     checked for validity in the first pass over the string, so we don't have to
1169     check for syntax errors here. */
1170 nigel 3
1171     case '(':
1172 nigel 23 newoptions = options;
1173     condref = -1;
1174    
1175 nigel 3 if (*(++ptr) == '?')
1176     {
1177 nigel 23 int set, unset;
1178     int *optset;
1179 nigel 3
1180     switch (*(++ptr))
1181     {
1182 nigel 23 case '#': /* Comment; skip to ket */
1183 nigel 3 ptr++;
1184     while (*ptr != ')') ptr++;
1185     continue;
1186    
1187     case ':': /* Non-extracting bracket */
1188 nigel 23 bravalue = OP_BRA;
1189 nigel 3 ptr++;
1190     break;
1191    
1192 nigel 23 case '(':
1193     bravalue = OP_COND; /* Conditional group */
1194     if ((pcre_ctypes[*(++ptr)] & ctype_digit) != 0)
1195     {
1196     condref = *ptr - '0';
1197     while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
1198     ptr++;
1199     }
1200     else ptr--;
1201     break;
1202    
1203     case '=': /* Positive lookahead */
1204 nigel 3 bravalue = OP_ASSERT;
1205     ptr++;
1206     break;
1207    
1208 nigel 23 case '!': /* Negative lookahead */
1209 nigel 3 bravalue = OP_ASSERT_NOT;
1210     ptr++;
1211     break;
1212    
1213 nigel 23 case '<': /* Lookbehinds */
1214     switch (*(++ptr))
1215 nigel 3 {
1216 nigel 23 case '=': /* Positive lookbehind */
1217     bravalue = OP_ASSERTBACK;
1218 nigel 3 ptr++;
1219     break;
1220 nigel 23
1221     case '!': /* Negative lookbehind */
1222     bravalue = OP_ASSERTBACK_NOT;
1223     ptr++;
1224     break;
1225    
1226     default: /* Syntax error */
1227     *errorptr = ERR24;
1228     goto FAILED;
1229 nigel 3 }
1230 nigel 23 break;
1231 nigel 3
1232 nigel 23 case '>': /* One-time brackets */
1233     bravalue = OP_ONCE;
1234     ptr++;
1235     break;
1236    
1237     default: /* Option setting */
1238     set = unset = 0;
1239     optset = &set;
1240    
1241     while (*ptr != ')' && *ptr != ':')
1242     {
1243     switch (*ptr++)
1244     {
1245     case '-': optset = &unset; break;
1246    
1247     case 'i': *optset |= PCRE_CASELESS; break;
1248     case 'm': *optset |= PCRE_MULTILINE; break;
1249     case 's': *optset |= PCRE_DOTALL; break;
1250     case 'x': *optset |= PCRE_EXTENDED; break;
1251     case 'U': *optset |= PCRE_UNGREEDY; break;
1252     case 'X': *optset |= PCRE_EXTRA; break;
1253    
1254     default:
1255     *errorptr = ERR12;
1256     goto FAILED;
1257     }
1258     }
1259    
1260     /* Set up the changed option bits, but don't change anything yet. */
1261    
1262     newoptions = (options | set) & (~unset);
1263    
1264     /* If the options ended with ')' this is not the start of a nested
1265     group with option changes, so the options change at this level. At top
1266     level there is nothing else to be done (the options will in fact have
1267     been set from the start of compiling as a result of the first pass) but
1268     at an inner level we must compile code to change the ims options if
1269     necessary, and pass the new setting back so that it can be put at the
1270     start of any following branches, and when this group ends, a resetting
1271     item can be compiled. */
1272    
1273     if (*ptr == ')')
1274     {
1275     if ((options & PCRE_INGROUP) != 0 &&
1276     (options & PCRE_IMS) != (newoptions & PCRE_IMS))
1277     {
1278     *code++ = OP_OPT;
1279     *code++ = *optchanged = newoptions & PCRE_IMS;
1280     }
1281     options = newoptions; /* Change options at this level */
1282     previous = NULL; /* This item can't be repeated */
1283     continue; /* It is complete */
1284     }
1285    
1286     /* If the options ended with ':' we are heading into a nested group
1287     with possible change of options. Such groups are non-capturing and are
1288     not assertions of any kind. All we need to do is skip over the ':';
1289     the newoptions value is handled below. */
1290    
1291     bravalue = OP_BRA;
1292     ptr++;
1293 nigel 3 }
1294     }
1295    
1296 nigel 23 /* Else we have a referencing group; adjust the opcode. */
1297 nigel 3
1298     else
1299     {
1300     if (++(*brackets) > EXTRACT_MAX)
1301     {
1302     *errorptr = ERR13;
1303     goto FAILED;
1304     }
1305     bravalue = OP_BRA + *brackets;
1306     }
1307    
1308 nigel 23 /* Process nested bracketed re. Assertions may not be repeated, but other
1309     kinds can be. We copy code into a non-register variable in order to be able
1310     to pass its address because some compilers complain otherwise. Pass in a
1311     new setting for the ims options if they have changed. */
1312 nigel 3
1313 nigel 23 previous = (bravalue >= OP_ONCE)? code : NULL;
1314 nigel 3 *code = bravalue;
1315 nigel 23 tempcode = code;
1316    
1317     if (!compile_regex(
1318     options | PCRE_INGROUP, /* Set for all nested groups */
1319     ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
1320     newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
1321     brackets, /* Bracket level */
1322     &tempcode, /* Where to put code (updated) */
1323     &ptr, /* Input pointer (updated) */
1324     errorptr, /* Where to put an error message */
1325     (bravalue == OP_ASSERTBACK ||
1326     bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1327     condref)) /* Condition reference number */
1328     goto FAILED;
1329    
1330     /* At the end of compiling, code is still pointing to the start of the
1331     group, while tempcode has been updated to point past the end of the group
1332     and any option resetting that may follow it. The pattern pointer (ptr)
1333     is on the bracket. */
1334    
1335     /* If this is a conditional bracket, check that there are no more than
1336     two branches in the group. */
1337    
1338     if (bravalue == OP_COND)
1339 nigel 3 {
1340 nigel 23 int branchcount = 0;
1341     uschar *tc = code;
1342    
1343     do {
1344     branchcount++;
1345     tc += (tc[1] << 8) | tc[2];
1346     }
1347     while (*tc != OP_KET);
1348    
1349     if (branchcount > 2)
1350     {
1351     *errorptr = ERR27;
1352 nigel 3 goto FAILED;
1353 nigel 23 }
1354 nigel 3 }
1355    
1356 nigel 23 /* Now update the main code pointer to the end of the group. */
1357    
1358     code = tempcode;
1359    
1360     /* Error if hit end of pattern */
1361    
1362 nigel 3 if (*ptr != ')')
1363     {
1364     *errorptr = ERR14;
1365     goto FAILED;
1366     }
1367     break;
1368    
1369     /* Check \ for being a real metacharacter; if not, fall through and handle
1370     it as a data character at the start of a string. Escape items are checked
1371     for validity in the pre-compiling pass. */
1372    
1373     case '\\':
1374 nigel 23 tempptr = ptr;
1375 nigel 3 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
1376    
1377     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1378     are arranged to be the negation of the corresponding OP_values. For the
1379     back references, the values are ESC_REF plus the reference number. Only
1380     back references and those types that consume a character may be repeated.
1381     We can test for values between ESC_b and ESC_Z for the latter; this may
1382     have to change if any new ones are ever created. */
1383    
1384     if (c < 0)
1385     {
1386     if (-c >= ESC_REF)
1387     {
1388     previous = code;
1389     *code++ = OP_REF;
1390 nigel 23 *code++ = -c - ESC_REF;
1391 nigel 3 }
1392     else
1393     {
1394 nigel 23 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
1395 nigel 3 *code++ = -c;
1396     }
1397     continue;
1398     }
1399    
1400 nigel 7 /* Data character: reset and fall through */
1401 nigel 3
1402 nigel 23 ptr = tempptr;
1403 nigel 3 c = '\\';
1404    
1405     /* Handle a run of data characters until a metacharacter is encountered.
1406     The first character is guaranteed not to be whitespace or # when the
1407     extended flag is set. */
1408    
1409     NORMAL_CHAR:
1410     default:
1411     previous = code;
1412     *code = OP_CHARS;
1413     code += 2;
1414     length = 0;
1415    
1416     do
1417     {
1418     if ((options & PCRE_EXTENDED) != 0)
1419     {
1420     if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1421     if (c == '#')
1422     {
1423     while ((c = *(++ptr)) != 0 && c != '\n');
1424     if (c == 0) break;
1425     continue;
1426     }
1427     }
1428    
1429     /* Backslash may introduce a data char or a metacharacter. Escaped items
1430     are checked for validity in the pre-compiling pass. Stop the string
1431     before a metaitem. */
1432    
1433     if (c == '\\')
1434     {
1435 nigel 23 tempptr = ptr;
1436 nigel 3 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
1437 nigel 23 if (c < 0) { ptr = tempptr; break; }
1438 nigel 3 }
1439    
1440     /* Ordinary character or single-char escape */
1441    
1442     *code++ = c;
1443     length++;
1444     }
1445    
1446     /* This "while" is the end of the "do" above. */
1447    
1448     while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
1449    
1450     /* Compute the length and set it in the data vector, and advance to
1451     the next state. */
1452    
1453     previous[1] = length;
1454 nigel 15 if (length < 255) ptr--;
1455 nigel 3 break;
1456     }
1457     } /* end of big loop */
1458    
1459     /* Control never reaches here by falling through, only by a goto for all the
1460     error states. Pass back the position in the pattern so that it can be displayed
1461     to the user for diagnosing the error. */
1462    
1463     FAILED:
1464     *ptrptr = ptr;
1465     return FALSE;
1466     }
1467    
1468    
1469    
1470    
1471     /*************************************************
1472     * Compile sequence of alternatives *
1473     *************************************************/
1474    
1475     /* On entry, ptr is pointing past the bracket character, but on return
1476     it points to the closing bracket, or vertical bar, or end of string.
1477     The code variable is pointing at the byte into which the BRA operator has been
1478 nigel 23 stored. If the ims options are changed at the start (for a (?ims: group) or
1479     during any branch, we need to insert an OP_OPT item at the start of every
1480     following branch to ensure they get set correctly at run time, and also pass
1481     the new options into every subsequent branch compile.
1482 nigel 3
1483     Argument:
1484 nigel 23 options the option bits
1485     optchanged new ims options to set as if (?ims) were at the start, or -1
1486     for no change
1487     brackets -> int containing the number of extracting brackets used
1488     codeptr -> the address of the current code pointer
1489     ptrptr -> the address of the current pattern pointer
1490     errorptr -> pointer to error message
1491     lookbehind TRUE if this is a lookbehind assertion
1492     condref > 0 for OPT_CREF setting at start of conditional group
1493 nigel 3
1494 nigel 23 Returns: TRUE on success
1495 nigel 3 */
1496    
1497     static BOOL
1498 nigel 23 compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
1499     const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref)
1500 nigel 3 {
1501 nigel 7 const uschar *ptr = *ptrptr;
1502 nigel 3 uschar *code = *codeptr;
1503 nigel 23 uschar *last_branch = code;
1504 nigel 3 uschar *start_bracket = code;
1505 nigel 23 uschar *reverse_count = NULL;
1506     int oldoptions = options & PCRE_IMS;
1507 nigel 3
1508 nigel 23 code += 3;
1509    
1510     /* At the start of a reference-based conditional group, insert the reference
1511     number as an OP_CREF item. */
1512    
1513     if (condref > 0)
1514     {
1515     *code++ = OP_CREF;
1516     *code++ = condref;
1517     }
1518    
1519     /* Loop for each alternative branch */
1520    
1521 nigel 3 for (;;)
1522     {
1523     int length;
1524    
1525 nigel 23 /* Handle change of options */
1526    
1527     if (optchanged >= 0)
1528 nigel 3 {
1529 nigel 23 *code++ = OP_OPT;
1530     *code++ = optchanged;
1531     options = (options & ~PCRE_IMS) | optchanged;
1532     }
1533    
1534     /* Set up dummy OP_REVERSE if lookbehind assertion */
1535    
1536     if (lookbehind)
1537     {
1538     *code++ = OP_REVERSE;
1539     reverse_count = code;
1540     *code++ = 0;
1541     *code++ = 0;
1542     }
1543    
1544     /* Now compile the branch */
1545    
1546     if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged))
1547     {
1548 nigel 3 *ptrptr = ptr;
1549     return FALSE;
1550     }
1551    
1552     /* Fill in the length of the last branch */
1553    
1554     length = code - last_branch;
1555     last_branch[1] = length >> 8;
1556     last_branch[2] = length & 255;
1557    
1558 nigel 23 /* If lookbehind, check that this branch matches a fixed-length string,
1559     and put the length into the OP_REVERSE item. Temporarily mark the end of
1560     the branch with OP_END. */
1561    
1562     if (lookbehind)
1563     {
1564     *code = OP_END;
1565     length = find_fixedlength(last_branch);
1566     DPRINTF(("fixed length = %d\n", length));
1567     if (length < 0)
1568     {
1569     *errorptr = ERR25;
1570     *ptrptr = ptr;
1571     return FALSE;
1572     }
1573     reverse_count[0] = (length >> 8);
1574     reverse_count[1] = length & 255;
1575     }
1576    
1577 nigel 3 /* Reached end of expression, either ')' or end of pattern. Insert a
1578     terminating ket and the length of the whole bracketed item, and return,
1579 nigel 23 leaving the pointer at the terminating char. If any of the ims options
1580     were changed inside the group, compile a resetting op-code following. */
1581 nigel 3
1582     if (*ptr != '|')
1583     {
1584     length = code - start_bracket;
1585     *code++ = OP_KET;
1586     *code++ = length >> 8;
1587     *code++ = length & 255;
1588 nigel 23 if (optchanged >= 0)
1589     {
1590     *code++ = OP_OPT;
1591     *code++ = oldoptions;
1592     }
1593 nigel 3 *codeptr = code;
1594     *ptrptr = ptr;
1595     return TRUE;
1596     }
1597    
1598     /* Another branch follows; insert an "or" node and advance the pointer. */
1599    
1600     *code = OP_ALT;
1601 nigel 23 last_branch = code;
1602     code += 3;
1603 nigel 3 ptr++;
1604     }
1605     /* Control never reaches here */
1606     }
1607    
1608    
1609    
1610 nigel 23
1611 nigel 3 /*************************************************
1612 nigel 23 * Find first significant op code *
1613     *************************************************/
1614    
1615     /* This is called by several functions that scan a compiled expression looking
1616     for a fixed first character, or an anchoring op code etc. It skips over things
1617     that do not influence this. For one application, a change of caseless option is
1618     important.
1619    
1620     Arguments:
1621     code pointer to the start of the group
1622     options pointer to external options
1623     optbit the option bit whose changing is significant, or
1624     zero if none are
1625     optstop TRUE to return on option change, otherwise change the options
1626     value and continue
1627    
1628     Returns: pointer to the first significant opcode
1629     */
1630    
1631     static const uschar*
1632     first_significant_code(const uschar *code, int *options, int optbit,
1633     BOOL optstop)
1634     {
1635     for (;;)
1636     {
1637     switch ((int)*code)
1638     {
1639     case OP_OPT:
1640     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1641     {
1642     if (optstop) return code;
1643     *options = (int)code[1];
1644     }
1645     code += 2;
1646     break;
1647    
1648     case OP_CREF:
1649     code += 2;
1650     break;
1651    
1652     case OP_ASSERT_NOT:
1653     case OP_ASSERTBACK:
1654     case OP_ASSERTBACK_NOT:
1655     do code += (code[1] << 8) + code[2]; while (*code == OP_ALT);
1656     code += 3;
1657     break;
1658    
1659     default:
1660     return code;
1661     }
1662     }
1663     /* Control never reaches here */
1664     }
1665    
1666    
1667    
1668    
1669     /*************************************************
1670 nigel 3 * Check for anchored expression *
1671     *************************************************/
1672    
1673     /* Try to find out if this is an anchored regular expression. Consider each
1674     alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
1675     all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
1676     it's anchored. However, if this is a multiline pattern, then only OP_SOD
1677     counts, since OP_CIRC can match in the middle.
1678    
1679     A branch is also implicitly anchored if it starts with .* because that will try
1680     the rest of the pattern at all possible matching points, so there is no point
1681     trying them again.
1682    
1683 nigel 23 Arguments:
1684     code points to start of expression (the bracket)
1685     options points to the options setting
1686    
1687     Returns: TRUE or FALSE
1688 nigel 3 */
1689    
1690     static BOOL
1691 nigel 23 is_anchored(register const uschar *code, int *options)
1692 nigel 3 {
1693     do {
1694 nigel 23 const uschar *scode = first_significant_code(code + 3, options,
1695     PCRE_MULTILINE, FALSE);
1696     register int op = *scode;
1697     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
1698     { if (!is_anchored(scode, options)) return FALSE; }
1699 nigel 3 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
1700 nigel 23 { if (scode[1] != OP_ANY) return FALSE; }
1701     else if (op != OP_SOD &&
1702     ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
1703     return FALSE;
1704 nigel 3 code += (code[1] << 8) + code[2];
1705     }
1706     while (*code == OP_ALT);
1707     return TRUE;
1708     }
1709    
1710    
1711    
1712     /*************************************************
1713     * Check for start with \n line expression *
1714     *************************************************/
1715    
1716     /* This is called for multiline expressions to try to find out if every branch
1717     starts with ^ so that "first char" processing can be done to speed things up.
1718    
1719     Argument: points to start of expression (the bracket)
1720     Returns: TRUE or FALSE
1721     */
1722    
1723     static BOOL
1724 nigel 7 is_startline(const uschar *code)
1725 nigel 3 {
1726     do {
1727 nigel 23 const uschar *scode = first_significant_code(code + 3, NULL, 0, FALSE);
1728     register int op = *scode;
1729     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
1730     { if (!is_startline(scode)) return FALSE; }
1731     else if (op != OP_CIRC) return FALSE;
1732 nigel 3 code += (code[1] << 8) + code[2];
1733     }
1734     while (*code == OP_ALT);
1735     return TRUE;
1736     }
1737    
1738    
1739    
1740     /*************************************************
1741     * Check for fixed first char *
1742     *************************************************/
1743    
1744     /* Try to find out if there is a fixed first character. This is called for
1745     unanchored expressions, as it speeds up their processing quite considerably.
1746     Consider each alternative branch. If they all start with the same char, or with
1747     a bracket all of whose alternatives start with the same char (recurse ad lib),
1748     then we return that char, otherwise -1.
1749    
1750 nigel 23 Arguments:
1751     code points to start of expression (the bracket)
1752     options pointer to the options (used to check casing changes)
1753    
1754     Returns: -1 or the fixed first char
1755 nigel 3 */
1756    
1757     static int
1758 nigel 23 find_firstchar(const uschar *code, int *options)
1759 nigel 3 {
1760     register int c = -1;
1761 nigel 23 do {
1762     int d;
1763     const uschar *scode = first_significant_code(code + 3, options,
1764     PCRE_CASELESS, TRUE);
1765     register int op = *scode;
1766 nigel 3
1767 nigel 23 if (op >= OP_BRA) op = OP_BRA;
1768 nigel 3
1769 nigel 23 switch(op)
1770     {
1771     default:
1772     return -1;
1773 nigel 3
1774 nigel 23 case OP_BRA:
1775     case OP_ASSERT:
1776     case OP_ONCE:
1777     case OP_COND:
1778     if ((d = find_firstchar(scode, options)) < 0) return -1;
1779     if (c < 0) c = d; else if (c != d) return -1;
1780     break;
1781 nigel 3
1782 nigel 23 case OP_EXACT: /* Fall through */
1783     scode++;
1784 nigel 3
1785 nigel 23 case OP_CHARS: /* Fall through */
1786     scode++;
1787    
1788     case OP_PLUS:
1789     case OP_MINPLUS:
1790     if (c < 0) c = scode[1]; else if (c != scode[1]) return -1;
1791     break;
1792     }
1793    
1794     code += (code[1] << 8) + code[2];
1795     }
1796 nigel 3 while (*code == OP_ALT);
1797     return c;
1798     }
1799    
1800    
1801    
1802 nigel 23
1803    
1804 nigel 3 /*************************************************
1805     * Compile a Regular Expression *
1806     *************************************************/
1807    
1808     /* This function takes a string and returns a pointer to a block of store
1809     holding a compiled version of the expression.
1810    
1811     Arguments:
1812     pattern the regular expression
1813     options various option bits
1814     errorptr pointer to pointer to error text
1815     erroroffset ptr offset in pattern where error was detected
1816    
1817     Returns: pointer to compiled data block, or NULL on error,
1818     with errorptr and erroroffset set
1819     */
1820    
1821     pcre *
1822 nigel 7 pcre_compile(const char *pattern, int options, const char **errorptr,
1823 nigel 3 int *erroroffset)
1824     {
1825     real_pcre *re;
1826     int length = 3; /* For initial BRA plus length */
1827     int runlength;
1828     int c, size;
1829     int bracount = 0;
1830     int top_backref = 0;
1831 nigel 23 int branch_extra = 0;
1832     int branch_newextra;
1833 nigel 7 unsigned int brastackptr = 0;
1834     uschar *code;
1835     const uschar *ptr;
1836 nigel 23 int brastack[BRASTACK_SIZE];
1837     uschar bralenstack[BRASTACK_SIZE];
1838 nigel 3
1839     #ifdef DEBUG
1840     uschar *code_base, *code_end;
1841     #endif
1842    
1843     /* We can't pass back an error message if errorptr is NULL; I guess the best we
1844     can do is just return NULL. */
1845    
1846     if (errorptr == NULL) return NULL;
1847     *errorptr = NULL;
1848    
1849     /* However, we can give a message for this error */
1850    
1851     if (erroroffset == NULL)
1852     {
1853     *errorptr = ERR16;
1854     return NULL;
1855     }
1856     *erroroffset = 0;
1857    
1858     if ((options & ~PUBLIC_OPTIONS) != 0)
1859     {
1860     *errorptr = ERR17;
1861     return NULL;
1862     }
1863    
1864 nigel 9 DPRINTF(("------------------------------------------------------------------\n"));
1865     DPRINTF(("%s\n", pattern));
1866 nigel 3
1867     /* The first thing to do is to make a pass over the pattern to compute the
1868     amount of store required to hold the compiled code. This does not have to be
1869     perfect as long as errors are overestimates. At the same time we can detect any
1870     internal flag settings. Make an attempt to correct for any counted white space
1871     if an "extended" flag setting appears late in the pattern. We can't be so
1872     clever for #-comments. */
1873    
1874 nigel 7 ptr = (const uschar *)(pattern - 1);
1875 nigel 3 while ((c = *(++ptr)) != 0)
1876     {
1877     int min, max;
1878     int class_charcount;
1879    
1880 nigel 23 if ((options & PCRE_EXTENDED) != 0)
1881 nigel 3 {
1882 nigel 23 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1883     if (c == '#')
1884     {
1885     while ((c = *(++ptr)) != 0 && c != '\n');
1886     continue;
1887     }
1888 nigel 3 }
1889    
1890     switch(c)
1891     {
1892     /* A backslashed item may be an escaped "normal" character or a
1893     character type. For a "normal" character, put the pointers and
1894     character back so that tests for whitespace etc. in the input
1895     are done correctly. */
1896    
1897     case '\\':
1898     {
1899 nigel 7 const uschar *save_ptr = ptr;
1900 nigel 3 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
1901     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1902     if (c >= 0)
1903     {
1904     ptr = save_ptr;
1905     c = '\\';
1906     goto NORMAL_CHAR;
1907     }
1908     }
1909     length++;
1910    
1911     /* A back reference needs an additional char, plus either one or 5
1912     bytes for a repeat. We also need to keep the value of the highest
1913     back reference. */
1914    
1915     if (c <= -ESC_REF)
1916     {
1917     int refnum = -c - ESC_REF;
1918     if (refnum > top_backref) top_backref = refnum;
1919     length++; /* For single back reference */
1920     if (ptr[1] == '{' && is_counted_repeat(ptr+2))
1921     {
1922     ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
1923     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1924     if ((min == 0 && (max == 1 || max == -1)) ||
1925     (min == 1 && max == -1))
1926     length++;
1927     else length += 5;
1928     if (ptr[1] == '?') ptr++;
1929     }
1930     }
1931     continue;
1932    
1933     case '^':
1934     case '.':
1935     case '$':
1936     case '*': /* These repeats won't be after brackets; */
1937     case '+': /* those are handled separately */
1938     case '?':
1939     length++;
1940     continue;
1941    
1942     /* This covers the cases of repeats after a single char, metachar, class,
1943     or back reference. */
1944    
1945     case '{':
1946     if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
1947     ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
1948     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1949     if ((min == 0 && (max == 1 || max == -1)) ||
1950     (min == 1 && max == -1))
1951     length++;
1952     else
1953     {
1954     length--; /* Uncount the original char or metachar */
1955     if (min == 1) length++; else if (min > 0) length += 4;
1956     if (max > 0) length += 4; else length += 2;
1957     }
1958     if (ptr[1] == '?') ptr++;
1959     continue;
1960    
1961 nigel 23 /* An alternation contains an offset to the next branch or ket. If any ims
1962     options changed in the previous branch(es), and/or if we are in a
1963     lookbehind assertion, extra space will be needed at the start of the
1964     branch. This is handled by branch_extra. */
1965    
1966 nigel 3 case '|':
1967 nigel 23 length += 3 + branch_extra;
1968 nigel 3 continue;
1969    
1970     /* A character class uses 33 characters. Don't worry about character types
1971     that aren't allowed in classes - they'll get picked up during the compile.
1972     A character class that contains only one character uses 2 or 3 bytes,
1973     depending on whether it is negated or not. Notice this where we can. */
1974    
1975     case '[':
1976     class_charcount = 0;
1977     if (*(++ptr) == '^') ptr++;
1978     do
1979     {
1980     if (*ptr == '\\')
1981     {
1982 nigel 9 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE);
1983 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1984 nigel 9 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
1985 nigel 3 }
1986     else class_charcount++;
1987     ptr++;
1988     }
1989     while (*ptr != 0 && *ptr != ']');
1990    
1991     /* Repeats for negated single chars are handled by the general code */
1992    
1993     if (class_charcount == 1) length += 3; else
1994     {
1995     length += 33;
1996    
1997     /* A repeat needs either 1 or 5 bytes. */
1998    
1999 nigel 9 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
2000 nigel 3 {
2001     ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2002     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2003     if ((min == 0 && (max == 1 || max == -1)) ||
2004     (min == 1 && max == -1))
2005     length++;
2006     else length += 5;
2007     if (ptr[1] == '?') ptr++;
2008     }
2009     }
2010     continue;
2011    
2012     /* Brackets may be genuine groups or special things */
2013    
2014     case '(':
2015 nigel 23 branch_newextra = 0;
2016 nigel 3
2017     /* Handle special forms of bracket, which all start (? */
2018    
2019 nigel 23 if (ptr[1] == '?')
2020 nigel 3 {
2021 nigel 23 int set, unset;
2022     int *optset;
2023    
2024     switch (c = ptr[2])
2025 nigel 3 {
2026 nigel 23 /* Skip over comments entirely */
2027     case '#':
2028     ptr += 3;
2029     while (*ptr != 0 && *ptr != ')') ptr++;
2030     if (*ptr == 0)
2031     {
2032     *errorptr = ERR18;
2033     goto PCRE_ERROR_RETURN;
2034     }
2035     continue;
2036 nigel 3
2037 nigel 23 /* Non-referencing groups and lookaheads just move the pointer on, and
2038     then behave like a non-special bracket, except that they don't increment
2039     the count of extracting brackets. Ditto for the "once only" bracket,
2040     which is in Perl from version 5.005. */
2041 nigel 3
2042 nigel 23 case ':':
2043     case '=':
2044     case '!':
2045     case '>':
2046 nigel 3 ptr += 2;
2047     break;
2048    
2049 nigel 23 /* Lookbehinds are in Perl from version 5.005 */
2050 nigel 3
2051 nigel 23 case '<':
2052     if (ptr[3] == '=' || ptr[3] == '!')
2053 nigel 3 {
2054 nigel 23 ptr += 3;
2055     branch_newextra = 3;
2056     length += 3; /* For the first branch */
2057     break;
2058 nigel 3 }
2059 nigel 23 *errorptr = ERR24;
2060     goto PCRE_ERROR_RETURN;
2061    
2062     /* Conditionals are in Perl from version 5.005. The bracket must either
2063     be followed by a number (for bracket reference) or by an assertion
2064     group. */
2065    
2066     case '(':
2067     if ((pcre_ctypes[ptr[3]] & ctype_digit) != 0)
2068 nigel 3 {
2069 nigel 23 ptr += 4;
2070     length += 2;
2071     while ((pcre_ctypes[*ptr] & ctype_digit) != 0) ptr++;
2072     if (*ptr != ')')
2073     {
2074     *errorptr = ERR26;
2075     goto PCRE_ERROR_RETURN;
2076     }
2077 nigel 3 }
2078 nigel 23 else /* An assertion must follow */
2079 nigel 3 {
2080 nigel 23 ptr++; /* Can treat like ':' as far as spacing is concerned */
2081    
2082     if (ptr[2] != '?' || strchr("=!<", ptr[3]) == NULL)
2083     {
2084     ptr += 2; /* To get right offset in message */
2085     *errorptr = ERR28;
2086     goto PCRE_ERROR_RETURN;
2087     }
2088 nigel 3 }
2089 nigel 23 break;
2090    
2091     /* Else loop checking valid options until ) is met. Anything else is an
2092     error. If we are without any brackets, i.e. at top level, the settings
2093     act as if specified in the options, so massage the options immediately.
2094     This is for backward compatibility with Perl 5.004. */
2095    
2096     default:
2097     set = unset = 0;
2098     optset = &set;
2099     ptr += 2;
2100    
2101     for (;; ptr++)
2102 nigel 3 {
2103 nigel 23 c = *ptr;
2104     switch (c)
2105     {
2106     case 'i':
2107     *optset |= PCRE_CASELESS;
2108     continue;
2109    
2110     case 'm':
2111     *optset |= PCRE_MULTILINE;
2112     continue;
2113    
2114     case 's':
2115     *optset |= PCRE_DOTALL;
2116     continue;
2117    
2118     case 'x':
2119     *optset |= PCRE_EXTENDED;
2120     continue;
2121    
2122     case 'X':
2123     *optset |= PCRE_EXTRA;
2124     continue;
2125    
2126     case 'U':
2127     *optset |= PCRE_UNGREEDY;
2128     continue;
2129    
2130     case '-':
2131     optset = &unset;
2132     continue;
2133    
2134     /* A termination by ')' indicates an options-setting-only item;
2135     this is global at top level; otherwise nothing is done here and
2136     it is handled during the compiling process on a per-bracket-group
2137     basis. */
2138    
2139     case ')':
2140     if (brastackptr == 0)
2141     {
2142     options = (options | set) & (~unset);
2143     set = unset = 0; /* To save length */
2144     }
2145     /* Fall through */
2146    
2147     /* A termination by ':' indicates the start of a nested group with
2148     the given options set. This is again handled at compile time, but
2149     we must allow for compiled space if any of the ims options are
2150     set. We also have to allow for resetting space at the end of
2151     the group, which is why 4 is added to the length and not just 2.
2152     If there are several changes of options within the same group, this
2153     will lead to an over-estimate on the length, but this shouldn't
2154     matter very much. We also have to allow for resetting options at
2155     the start of any alternations, which we do by setting
2156     branch_newextra to 2. */
2157    
2158     case ':':
2159     if (((set|unset) & PCRE_IMS) != 0)
2160     {
2161     length += 4;
2162     branch_newextra = 2;
2163     }
2164     goto END_OPTIONS;
2165    
2166     /* Unrecognized option character */
2167    
2168     default:
2169     *errorptr = ERR12;
2170     goto PCRE_ERROR_RETURN;
2171     }
2172 nigel 3 }
2173 nigel 23
2174     /* If we hit a closing bracket, that's it - this is a freestanding
2175     option-setting. We need to ensure that branch_extra is updated if
2176     necessary. The only values branch_newextra can have here are 0 or 2.
2177     If the value is 2, then branch_extra must either be 2 or 5, depending
2178     on whether this is a lookbehind group or not. */
2179    
2180     END_OPTIONS:
2181     if (c == ')')
2182 nigel 19 {
2183 nigel 23 if (branch_newextra == 2 && (branch_extra == 0 || branch_extra == 3))
2184     branch_extra += branch_newextra;
2185 nigel 19 continue;
2186     }
2187 nigel 3
2188 nigel 23 /* If options were terminated by ':' control comes here. Fall through
2189     to handle the group below. */
2190 nigel 3 }
2191     }
2192    
2193     /* Extracting brackets must be counted so we can process escapes in a
2194     Perlish way. */
2195    
2196     else bracount++;
2197    
2198     /* Non-special forms of bracket. Save length for computing whole length
2199 nigel 23 at end if there's a repeat that requires duplication of the group. Also
2200     save the current value of branch_extra, and start the new group with
2201     the new value. If non-zero, this will either be 2 for a (?imsx: group, or 3
2202     for a lookbehind assertion. */
2203 nigel 3
2204     if (brastackptr >= sizeof(brastack)/sizeof(int))
2205     {
2206     *errorptr = ERR19;
2207     goto PCRE_ERROR_RETURN;
2208     }
2209    
2210 nigel 23 bralenstack[brastackptr] = branch_extra;
2211     branch_extra = branch_newextra;
2212    
2213 nigel 3 brastack[brastackptr++] = length;
2214     length += 3;
2215     continue;
2216    
2217     /* Handle ket. Look for subsequent max/min; for certain sets of values we
2218 nigel 9 have to replicate this bracket up to that many times. If brastackptr is
2219     0 this is an unmatched bracket which will generate an error, but take care
2220 nigel 23 not to try to access brastack[-1] when computing the length and restoring
2221     the branch_extra value. */
2222 nigel 3
2223     case ')':
2224     length += 3;
2225     {
2226 nigel 9 int minval = 1;
2227     int maxval = 1;
2228 nigel 23 int duplength;
2229 nigel 3
2230 nigel 23 if (brastackptr > 0)
2231     {
2232     duplength = length - brastack[--brastackptr];
2233     branch_extra = bralenstack[brastackptr];
2234     }
2235     else duplength = 0;
2236    
2237 nigel 3 /* Leave ptr at the final char; for read_repeat_counts this happens
2238     automatically; for the others we need an increment. */
2239    
2240     if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
2241     {
2242 nigel 9 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr);
2243 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2244     }
2245 nigel 9 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2246     else if (c == '+') { maxval = -1; ptr++; }
2247     else if (c == '?') { minval = 0; ptr++; }
2248 nigel 3
2249 nigel 9 /* If there is a minimum > 1 we have to replicate up to minval-1 times;
2250     if there is a limited maximum we have to replicate up to maxval-1 times
2251     and allow for a BRAZERO item before each optional copy, as we also have
2252     to do before the first copy if the minimum is zero. */
2253 nigel 3
2254 nigel 9 if (minval == 0) length++;
2255     else if (minval > 1) length += (minval - 1) * duplength;
2256     if (maxval > minval) length += (maxval - minval) * (duplength + 1);
2257 nigel 3 }
2258     continue;
2259    
2260     /* Non-special character. For a run of such characters the length required
2261     is the number of characters + 2, except that the maximum run length is 255.
2262     We won't get a skipped space or a non-data escape or the start of a #
2263     comment as the first character, so the length can't be zero. */
2264    
2265     NORMAL_CHAR:
2266     default:
2267     length += 2;
2268     runlength = 0;
2269     do
2270     {
2271 nigel 23 if ((options & PCRE_EXTENDED) != 0)
2272 nigel 3 {
2273 nigel 23 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
2274     if (c == '#')
2275     {
2276     while ((c = *(++ptr)) != 0 && c != '\n');
2277     continue;
2278     }
2279 nigel 3 }
2280    
2281     /* Backslash may introduce a data char or a metacharacter; stop the
2282     string before the latter. */
2283    
2284     if (c == '\\')
2285     {
2286 nigel 7 const uschar *saveptr = ptr;
2287 nigel 3 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
2288     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2289     if (c < 0) { ptr = saveptr; break; }
2290     }
2291    
2292     /* Ordinary character or single-char escape */
2293    
2294     runlength++;
2295     }
2296    
2297     /* This "while" is the end of the "do" above. */
2298    
2299     while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
2300    
2301     ptr--;
2302     length += runlength;
2303     continue;
2304     }
2305     }
2306    
2307     length += 4; /* For final KET and END */
2308    
2309     if (length > 65539)
2310     {
2311     *errorptr = ERR20;
2312     return NULL;
2313     }
2314    
2315     /* Compute the size of data block needed and get it, either from malloc or
2316 nigel 9 externally provided function. We specify "code[0]" in the offsetof() expression
2317     rather than just "code", because it has been reported that one broken compiler
2318     fails on "code" because it is also an independent variable. It should make no
2319     difference to the value of the offsetof(). */
2320 nigel 3
2321 nigel 9 size = length + offsetof(real_pcre, code[0]);
2322 nigel 3 re = (real_pcre *)(pcre_malloc)(size);
2323    
2324     if (re == NULL)
2325     {
2326     *errorptr = ERR21;
2327     return NULL;
2328     }
2329    
2330 nigel 9 /* Put in the magic number and the options. */
2331    
2332 nigel 3 re->magic_number = MAGIC_NUMBER;
2333     re->options = options;
2334    
2335     /* Set up a starting, non-extracting bracket, then compile the expression. On
2336     error, *errorptr will be set non-NULL, so we don't need to look at the result
2337     of the function here. */
2338    
2339 nigel 7 ptr = (const uschar *)pattern;
2340 nigel 3 code = re->code;
2341     *code = OP_BRA;
2342     bracount = 0;
2343 nigel 23 (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1);
2344 nigel 3 re->top_bracket = bracount;
2345     re->top_backref = top_backref;
2346    
2347     /* If not reached end of pattern on success, there's an excess bracket. */
2348    
2349     if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
2350    
2351     /* Fill in the terminating state and check for disastrous overflow, but
2352     if debugging, leave the test till after things are printed out. */
2353    
2354     *code++ = OP_END;
2355    
2356     #ifndef DEBUG
2357     if (code - re->code > length) *errorptr = ERR23;
2358     #endif
2359    
2360 nigel 23 /* Give an error if there's back reference to a non-existent capturing
2361     subpattern. */
2362    
2363     if (top_backref > re->top_bracket) *errorptr = ERR15;
2364    
2365 nigel 3 /* Failed to compile */
2366    
2367     if (*errorptr != NULL)
2368     {
2369     (pcre_free)(re);
2370     PCRE_ERROR_RETURN:
2371 nigel 7 *erroroffset = ptr - (const uschar *)pattern;
2372 nigel 3 return NULL;
2373     }
2374    
2375     /* If the anchored option was not passed, set flag if we can determine that it
2376     is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
2377     we can determine what the first character has to be, because that speeds up
2378     unanchored matches no end. In the case of multiline matches, an alternative is
2379     to set the PCRE_STARTLINE flag if all branches start with ^. */
2380    
2381     if ((options & PCRE_ANCHORED) == 0)
2382     {
2383 nigel 23 int temp_options = options;
2384     if (is_anchored(re->code, &temp_options))
2385 nigel 3 re->options |= PCRE_ANCHORED;
2386     else
2387     {
2388 nigel 23 int ch = find_firstchar(re->code, &temp_options);
2389 nigel 9 if (ch >= 0)
2390 nigel 3 {
2391 nigel 9 re->first_char = ch;
2392 nigel 3 re->options |= PCRE_FIRSTSET;
2393     }
2394     else if (is_startline(re->code))
2395     re->options |= PCRE_STARTLINE;
2396     }
2397     }
2398    
2399     /* Print out the compiled data for debugging */
2400    
2401     #ifdef DEBUG
2402    
2403 nigel 23 printf("Length = %d top_bracket = %d top_backref = %d\n",
2404 nigel 3 length, re->top_bracket, re->top_backref);
2405    
2406     if (re->options != 0)
2407     {
2408 nigel 19 printf("%s%s%s%s%s%s%s%s\n",
2409 nigel 3 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2410     ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2411     ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2412     ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2413     ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
2414     ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
2415 nigel 19 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
2416     ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
2417 nigel 3 }
2418    
2419     if ((re->options & PCRE_FIRSTSET) != 0)
2420     {
2421     if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
2422     else printf("First char = \\x%02x\n", re->first_char);
2423     }
2424    
2425     code_end = code;
2426     code_base = code = re->code;
2427    
2428     while (code < code_end)
2429     {
2430     int charlength;
2431    
2432     printf("%3d ", code - code_base);
2433    
2434     if (*code >= OP_BRA)
2435     {
2436     printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
2437     code += 2;
2438     }
2439    
2440     else switch(*code)
2441     {
2442 nigel 23 case OP_OPT:
2443     printf(" %.2x %s", code[1], OP_names[*code]);
2444     code++;
2445     break;
2446    
2447     case OP_COND:
2448     printf("%3d Cond", (code[1] << 8) + code[2]);
2449     code += 2;
2450     break;
2451    
2452     case OP_CREF:
2453     printf(" %.2d %s", code[1], OP_names[*code]);
2454     code++;
2455     break;
2456    
2457 nigel 3 case OP_CHARS:
2458     charlength = *(++code);
2459     printf("%3d ", charlength);
2460     while (charlength-- > 0)
2461     if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2462     break;
2463    
2464     case OP_KETRMAX:
2465     case OP_KETRMIN:
2466     case OP_ALT:
2467     case OP_KET:
2468     case OP_ASSERT:
2469     case OP_ASSERT_NOT:
2470 nigel 23 case OP_ASSERTBACK:
2471     case OP_ASSERTBACK_NOT:
2472 nigel 3 case OP_ONCE:
2473     printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2474     code += 2;
2475     break;
2476    
2477 nigel 23 case OP_REVERSE:
2478     printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2479     code += 2;
2480     break;
2481    
2482 nigel 3 case OP_STAR:
2483     case OP_MINSTAR:
2484     case OP_PLUS:
2485     case OP_MINPLUS:
2486     case OP_QUERY:
2487     case OP_MINQUERY:
2488     case OP_TYPESTAR:
2489     case OP_TYPEMINSTAR:
2490     case OP_TYPEPLUS:
2491     case OP_TYPEMINPLUS:
2492     case OP_TYPEQUERY:
2493     case OP_TYPEMINQUERY:
2494     if (*code >= OP_TYPESTAR)
2495     printf(" %s", OP_names[code[1]]);
2496     else if (isprint(c = code[1])) printf(" %c", c);
2497     else printf(" \\x%02x", c);
2498     printf("%s", OP_names[*code++]);
2499     break;
2500    
2501     case OP_EXACT:
2502     case OP_UPTO:
2503     case OP_MINUPTO:
2504     if (isprint(c = code[3])) printf(" %c{", c);
2505     else printf(" \\x%02x{", c);
2506 nigel 11 if (*code != OP_EXACT) printf("0,");
2507 nigel 3 printf("%d}", (code[1] << 8) + code[2]);
2508     if (*code == OP_MINUPTO) printf("?");
2509     code += 3;
2510     break;
2511    
2512     case OP_TYPEEXACT:
2513     case OP_TYPEUPTO:
2514     case OP_TYPEMINUPTO:
2515     printf(" %s{", OP_names[code[3]]);
2516     if (*code != OP_TYPEEXACT) printf(",");
2517     printf("%d}", (code[1] << 8) + code[2]);
2518     if (*code == OP_TYPEMINUPTO) printf("?");
2519     code += 3;
2520     break;
2521    
2522     case OP_NOT:
2523     if (isprint(c = *(++code))) printf(" [^%c]", c);
2524     else printf(" [^\\x%02x]", c);
2525     break;
2526    
2527     case OP_NOTSTAR:
2528     case OP_NOTMINSTAR:
2529     case OP_NOTPLUS:
2530     case OP_NOTMINPLUS:
2531     case OP_NOTQUERY:
2532     case OP_NOTMINQUERY:
2533     if (isprint(c = code[1])) printf(" [^%c]", c);
2534     else printf(" [^\\x%02x]", c);
2535     printf("%s", OP_names[*code++]);
2536     break;
2537    
2538     case OP_NOTEXACT:
2539     case OP_NOTUPTO:
2540     case OP_NOTMINUPTO:
2541     if (isprint(c = code[3])) printf(" [^%c]{", c);
2542     else printf(" [^\\x%02x]{", c);
2543     if (*code != OP_NOTEXACT) printf(",");
2544     printf("%d}", (code[1] << 8) + code[2]);
2545     if (*code == OP_NOTMINUPTO) printf("?");
2546     code += 3;
2547     break;
2548    
2549     case OP_REF:
2550     printf(" \\%d", *(++code));
2551 nigel 9 code ++;
2552     goto CLASS_REF_REPEAT;
2553 nigel 3
2554     case OP_CLASS:
2555     {
2556     int i, min, max;
2557 nigel 23 code++;
2558     printf(" [");
2559 nigel 3
2560     for (i = 0; i < 256; i++)
2561     {
2562     if ((code[i/8] & (1 << (i&7))) != 0)
2563     {
2564     int j;
2565     for (j = i+1; j < 256; j++)
2566     if ((code[j/8] & (1 << (j&7))) == 0) break;
2567     if (i == '-' || i == ']') printf("\\");
2568     if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2569     if (--j > i)
2570     {
2571     printf("-");
2572     if (j == '-' || j == ']') printf("\\");
2573     if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2574     }
2575     i = j;
2576     }
2577     }
2578     printf("]");
2579     code += 32;
2580    
2581 nigel 9 CLASS_REF_REPEAT:
2582    
2583 nigel 3 switch(*code)
2584     {
2585     case OP_CRSTAR:
2586     case OP_CRMINSTAR:
2587     case OP_CRPLUS:
2588     case OP_CRMINPLUS:
2589     case OP_CRQUERY:
2590     case OP_CRMINQUERY:
2591     printf("%s", OP_names[*code]);
2592     break;
2593    
2594     case OP_CRRANGE:
2595     case OP_CRMINRANGE:
2596     min = (code[1] << 8) + code[2];
2597     max = (code[3] << 8) + code[4];
2598     if (max == 0) printf("{%d,}", min);
2599     else printf("{%d,%d}", min, max);
2600     if (*code == OP_CRMINRANGE) printf("?");
2601     code += 4;
2602     break;
2603    
2604     default:
2605     code--;
2606     }
2607     }
2608     break;
2609    
2610     /* Anything else is just a one-node item */
2611    
2612     default:
2613     printf(" %s", OP_names[*code]);
2614     break;
2615     }
2616    
2617     code++;
2618     printf("\n");
2619     }
2620     printf("------------------------------------------------------------------\n");
2621    
2622     /* This check is done here in the debugging case so that the code that
2623     was compiled can be seen. */
2624    
2625     if (code - re->code > length)
2626     {
2627     *errorptr = ERR23;
2628     (pcre_free)(re);
2629     *erroroffset = ptr - (uschar *)pattern;
2630     return NULL;
2631     }
2632     #endif
2633    
2634     return (pcre *)re;
2635     }
2636    
2637    
2638    
2639     /*************************************************
2640     * Match a character type *
2641     *************************************************/
2642    
2643     /* Not used in all the places it might be as it's sometimes faster
2644     to put the code inline.
2645    
2646     Arguments:
2647     type the character type
2648     c the character
2649     dotall the dotall flag
2650    
2651     Returns: TRUE if character is of the type
2652     */
2653    
2654     static BOOL
2655     match_type(int type, int c, BOOL dotall)
2656     {
2657    
2658     #ifdef DEBUG
2659     if (isprint(c)) printf("matching subject %c against ", c);
2660     else printf("matching subject \\x%02x against ", c);
2661     printf("%s\n", OP_names[type]);
2662     #endif
2663    
2664     switch(type)
2665     {
2666     case OP_ANY: return dotall || c != '\n';
2667     case OP_NOT_DIGIT: return (pcre_ctypes[c] & ctype_digit) == 0;
2668     case OP_DIGIT: return (pcre_ctypes[c] & ctype_digit) != 0;
2669     case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;
2670     case OP_WHITESPACE: return (pcre_ctypes[c] & ctype_space) != 0;
2671     case OP_NOT_WORDCHAR: return (pcre_ctypes[c] & ctype_word) == 0;
2672     case OP_WORDCHAR: return (pcre_ctypes[c] & ctype_word) != 0;
2673     }
2674     return FALSE;
2675     }
2676    
2677    
2678    
2679     /*************************************************
2680     * Match a back-reference *
2681     *************************************************/
2682    
2683 nigel 23 /* If a back reference hasn't been set, the length that is passed is greater
2684     than the number of characters left in the string, so the match fails.
2685 nigel 3
2686     Arguments:
2687 nigel 23 offset index into the offset vector
2688 nigel 3 eptr points into the subject
2689     length length to be matched
2690     md points to match data block
2691 nigel 23 ims the ims flags
2692 nigel 3
2693     Returns: TRUE if matched
2694     */
2695    
2696     static BOOL
2697 nigel 23 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
2698     int ims)
2699 nigel 3 {
2700 nigel 23 const uschar *p = md->start_subject + md->offset_vector[offset];
2701 nigel 3
2702     #ifdef DEBUG
2703     if (eptr >= md->end_subject)
2704     printf("matching subject <null>");
2705     else
2706     {
2707     printf("matching subject ");
2708     pchars(eptr, length, TRUE, md);
2709     }
2710     printf(" against backref ");
2711     pchars(p, length, FALSE, md);
2712     printf("\n");
2713     #endif
2714    
2715     /* Always fail if not enough characters left */
2716    
2717 nigel 23 if (length > md->end_subject - eptr) return FALSE;
2718 nigel 3
2719     /* Separate the caselesss case for speed */
2720    
2721 nigel 23 if ((ims & PCRE_CASELESS) != 0)
2722 nigel 3 { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; }
2723     else
2724     { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
2725    
2726     return TRUE;
2727     }
2728    
2729    
2730    
2731     /*************************************************
2732     * Match from current position *
2733     *************************************************/
2734    
2735 nigel 23 /* On entry ecode points to the first opcode, and eptr to the first character
2736     in the subject string, while eptrb holds the value of eptr at the start of the
2737     last bracketed group - used for breaking infinite loops matching zero-length
2738     strings.
2739 nigel 3
2740     Arguments:
2741     eptr pointer in subject
2742     ecode position in code
2743     offset_top current top pointer
2744     md pointer to "static" info for the match
2745 nigel 23 ims current /i, /m, and /s options
2746     condassert TRUE if called to check a condition assertion
2747     eptrb eptr at start of last bracket
2748 nigel 3
2749     Returns: TRUE if matched
2750     */
2751    
2752     static BOOL
2753 nigel 23 match(register const uschar *eptr, register const uschar *ecode,
2754     int offset_top, match_data *md, int ims, BOOL condassert, const uschar *eptrb)
2755 nigel 3 {
2756 nigel 23 int original_ims = ims; /* Save for resetting on ')' */
2757    
2758 nigel 3 for (;;)
2759     {
2760 nigel 23 int op = (int)*ecode;
2761 nigel 3 int min, max, ctype;
2762     register int i;
2763     register int c;
2764 nigel 7 BOOL minimize = FALSE;
2765 nigel 3
2766 nigel 23 /* Opening capturing bracket. If there is space in the offset vector, save
2767     the current subject position in the working slot at the top of the vector. We
2768     mustn't change the current values of the data slot, because they may be set
2769     from a previous iteration of this group, and be referred to by a reference
2770     inside the group.
2771 nigel 3
2772 nigel 23 If the bracket fails to match, we need to restore this value and also the
2773     values of the final offsets, in case they were set by a previous iteration of
2774     the same bracket.
2775    
2776     If there isn't enough space in the offset vector, treat this as if it were a
2777     non-capturing bracket. Don't worry about setting the flag for the error case
2778     here; that is handled in the code for KET. */
2779    
2780     if (op > OP_BRA)
2781 nigel 3 {
2782 nigel 23 int number = op - OP_BRA;
2783     int offset = number << 1;
2784 nigel 3
2785 nigel 23 DPRINTF(("start bracket %d\n", number));
2786 nigel 3
2787 nigel 23 if (offset < md->offset_max)
2788 nigel 3 {
2789 nigel 23 int save_offset1 = md->offset_vector[offset];
2790     int save_offset2 = md->offset_vector[offset+1];
2791     int save_offset3 = md->offset_vector[md->offset_end - number];
2792 nigel 3
2793 nigel 23 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
2794     md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
2795    
2796     do
2797     {
2798     if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
2799     ecode += (ecode[1] << 8) + ecode[2];
2800     }
2801     while (*ecode == OP_ALT);
2802    
2803     DPRINTF(("bracket %d failed\n", number));
2804    
2805     md->offset_vector[offset] = save_offset1;
2806     md->offset_vector[offset+1] = save_offset2;
2807     md->offset_vector[md->offset_end - number] = save_offset3;
2808     return FALSE;
2809 nigel 3 }
2810    
2811 nigel 23 /* Insufficient room for saving captured contents */
2812 nigel 3
2813 nigel 23 else op = OP_BRA;
2814     }
2815    
2816     /* Other types of node can be handled by a switch */
2817    
2818     switch(op)
2819     {
2820     case OP_BRA: /* Non-capturing bracket: optimized */
2821     DPRINTF(("start bracket 0\n"));
2822 nigel 3 do
2823     {
2824 nigel 23 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
2825 nigel 3 ecode += (ecode[1] << 8) + ecode[2];
2826     }
2827     while (*ecode == OP_ALT);
2828 nigel 23 DPRINTF(("bracket 0 failed\n"));
2829     return FALSE;
2830 nigel 3
2831 nigel 23 /* Conditional group: compilation checked that there are no more than
2832     two branches. If the condition is false, skipping the first branch takes us
2833     past the end if there is only one branch, but that's OK because that is
2834     exactly what going to the ket would do. */
2835 nigel 3
2836 nigel 23 case OP_COND:
2837     if (ecode[3] == OP_CREF) /* Condition is extraction test */
2838 nigel 3 {
2839 nigel 23 int offset = ecode[4] << 1; /* Doubled reference number */
2840     return match(eptr,
2841     ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
2842     5 : 3 + (ecode[1] << 8) + ecode[2]),
2843     offset_top, md, ims, FALSE, eptr);
2844 nigel 3 }
2845    
2846 nigel 23 /* The condition is an assertion. Call match() to evaluate it - setting
2847     the final argument TRUE causes it to stop at the end of an assertion. */
2848 nigel 3
2849 nigel 23 else
2850     {
2851     if (match(eptr, ecode+3, offset_top, md, ims, TRUE, NULL))
2852     {
2853     ecode += 3 + (ecode[4] << 8) + ecode[5];
2854     while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
2855     }
2856     else ecode += (ecode[1] << 8) + ecode[2];
2857     return match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr);
2858     }
2859     /* Control never reaches here */
2860 nigel 3
2861 nigel 23 /* Skip over conditional reference data if encountered (should not be) */
2862    
2863     case OP_CREF:
2864     ecode += 2;
2865     break;
2866    
2867     /* End of the pattern */
2868    
2869 nigel 3 case OP_END:
2870     md->end_match_ptr = eptr; /* Record where we ended */
2871     md->end_offset_top = offset_top; /* and how many extracts were taken */
2872     return TRUE;
2873    
2874 nigel 23 /* Change option settings */
2875 nigel 3
2876 nigel 23 case OP_OPT:
2877     ims = ecode[1];
2878     ecode += 2;
2879     DPRINTF(("ims set to %02x\n", ims));
2880     break;
2881 nigel 3
2882     /* Assertion brackets. Check the alternative branches in turn - the
2883     matching won't pass the KET for an assertion. If any one branch matches,
2884 nigel 23 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
2885     start of each branch to move the current point backwards, so the code at
2886     this level is identical to the lookahead case. */
2887 nigel 3
2888     case OP_ASSERT:
2889 nigel 23 case OP_ASSERTBACK:
2890 nigel 3 do
2891     {
2892 nigel 23 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) break;
2893 nigel 3 ecode += (ecode[1] << 8) + ecode[2];
2894     }
2895     while (*ecode == OP_ALT);
2896     if (*ecode == OP_KET) return FALSE;
2897    
2898 nigel 23 /* If checking an assertion for a condition, return TRUE. */
2899    
2900     if (condassert) return TRUE;
2901    
2902 nigel 3 /* Continue from after the assertion, updating the offsets high water
2903     mark, since extracts may have been taken during the assertion. */
2904    
2905     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2906     ecode += 3;
2907     offset_top = md->end_offset_top;
2908     continue;
2909    
2910     /* Negative assertion: all branches must fail to match */
2911    
2912     case OP_ASSERT_NOT:
2913 nigel 23 case OP_ASSERTBACK_NOT:
2914 nigel 3 do
2915     {
2916 nigel 23 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) return FALSE;
2917 nigel 3 ecode += (ecode[1] << 8) + ecode[2];
2918     }
2919     while (*ecode == OP_ALT);
2920 nigel 23
2921     if (condassert) return TRUE;
2922 nigel 3 ecode += 3;
2923     continue;
2924    
2925 nigel 23 /* Move the subject pointer back. This occurs only at the start of
2926     each branch of a lookbehind assertion. If we are too close to the start to
2927     move back, this match function fails. */
2928    
2929     case OP_REVERSE:
2930     eptr -= (ecode[1] << 8) + ecode[2];
2931     if (eptr < md->start_subject) return FALSE;
2932     ecode += 3;
2933     break;
2934    
2935    
2936 nigel 3 /* "Once" brackets are like assertion brackets except that after a match,
2937     the point in the subject string is not moved back. Thus there can never be
2938 nigel 5 a move back into the brackets. Check the alternative branches in turn - the
2939 nigel 3 matching won't pass the KET for this kind of subpattern. If any one branch
2940 nigel 23 matches, we carry on as at the end of a normal bracket, leaving the subject
2941     pointer. */
2942 nigel 3
2943     case OP_ONCE:
2944     {
2945 nigel 23 const uschar *prev = ecode;
2946 nigel 3
2947 nigel 23 do
2948     {
2949     if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) break;
2950     ecode += (ecode[1] << 8) + ecode[2];
2951     }
2952     while (*ecode == OP_ALT);
2953 nigel 3
2954 nigel 23 /* If hit the end of the group (which could be repeated), fail */
2955 nigel 3
2956 nigel 23 if (*ecode != OP_ONCE && *ecode != OP_ALT) return FALSE;
2957    
2958     /* Continue as from after the assertion, updating the offsets high water
2959     mark, since extracts may have been taken. */
2960    
2961     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2962    
2963     offset_top = md->end_offset_top;
2964     eptr = md->end_match_ptr;
2965    
2966     /* For a non-repeating ket, just continue at this level. This also
2967     happens for a repeating ket if no characters were matched in the group.
2968     This is the forcible breaking of infinite loops as implemented in Perl
2969     5.005. If there is an options reset, it will get obeyed in the normal
2970     course of events. */
2971    
2972     if (*ecode == OP_KET || eptr == eptrb)
2973     {
2974     ecode += 3;
2975     break;
2976     }
2977    
2978     /* The repeating kets try the rest of the pattern or restart from the
2979     preceding bracket, in the appropriate order. We need to reset any options
2980     that changed within the bracket before re-running it, so check the next
2981     opcode. */
2982    
2983     if (ecode[3] == OP_OPT)
2984     {
2985     ims = (ims & ~PCRE_IMS) | ecode[4];
2986     DPRINTF(("ims set to %02x at group repeat\n", ims));
2987     }
2988    
2989     if (*ecode == OP_KETRMIN)
2990     {
2991     if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||
2992     match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;
2993     }
2994     else /* OP_KETRMAX */
2995     {
2996     if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||
2997     match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
2998     }
2999     }
3000     return FALSE;
3001    
3002 nigel 3 /* An alternation is the end of a branch; scan along to find the end of the
3003     bracketed group and go to there. */
3004    
3005     case OP_ALT:
3006     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3007     break;
3008    
3009     /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3010     that it may occur zero times. It may repeat infinitely, or not at all -
3011     i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3012     repeat limits are compiled as a number of copies, with the optional ones
3013     preceded by BRAZERO or BRAMINZERO. */
3014    
3015     case OP_BRAZERO:
3016     {
3017 nigel 7 const uschar *next = ecode+1;
3018 nigel 23 if (match(eptr, next, offset_top, md, ims, FALSE, eptr)) return TRUE;
3019 nigel 3 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3020     ecode = next + 3;
3021     }
3022     break;
3023    
3024     case OP_BRAMINZERO:
3025     {
3026 nigel 7 const uschar *next = ecode+1;
3027 nigel 3 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3028 nigel 23 if (match(eptr, next+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3029 nigel 3 ecode++;
3030     }
3031 nigel 23 break;
3032 nigel 3
3033     /* End of a group, repeated or non-repeating. If we are at the end of
3034     an assertion "group", stop matching and return TRUE, but record the
3035 nigel 23 current high water mark for use by positive assertions. Do this also
3036     for the "once" (not-backup up) groups. */
3037 nigel 3
3038     case OP_KET:
3039     case OP_KETRMIN:
3040     case OP_KETRMAX:
3041     {
3042 nigel 7 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3043 nigel 3
3044 nigel 23 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
3045     *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
3046     *prev == OP_ONCE)
3047 nigel 3 {
3048     md->end_match_ptr = eptr; /* For ONCE */
3049     md->end_offset_top = offset_top;
3050     return TRUE;
3051     }
3052    
3053 nigel 23 /* In all other cases except a conditional group we have to check the
3054     group number back at the start and if necessary complete handling an
3055     extraction by setting the offsets and bumping the high water mark. */
3056 nigel 3
3057 nigel 23 if (*prev != OP_COND)
3058     {
3059     int number = *prev - OP_BRA;
3060     int offset = number << 1;
3061 nigel 3
3062 nigel 23 DPRINTF(("end bracket %d\n", number));
3063 nigel 3
3064 nigel 23 if (number > 0)
3065 nigel 3 {
3066 nigel 23 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
3067     {
3068     md->offset_vector[offset] =
3069     md->offset_vector[md->offset_end - number];
3070     md->offset_vector[offset+1] = eptr - md->start_subject;
3071     if (offset_top <= offset) offset_top = offset + 2;
3072     }
3073 nigel 3 }
3074     }
3075    
3076 nigel 23 /* Reset the value of the ims flags, in case they got changed during
3077     the group. */
3078 nigel 3
3079 nigel 23 ims = original_ims;
3080     DPRINTF(("ims reset to %02x\n", ims));
3081    
3082     /* For a non-repeating ket, just continue at this level. This also
3083     happens for a repeating ket if no characters were matched in the group.
3084     This is the forcible breaking of infinite loops as implemented in Perl
3085     5.005. If there is an options reset, it will get obeyed in the normal
3086     course of events. */
3087    
3088     if (*ecode == OP_KET || eptr == eptrb)
3089 nigel 3 {
3090     ecode += 3;
3091     break;
3092     }
3093    
3094     /* The repeating kets try the rest of the pattern or restart from the
3095     preceding bracket, in the appropriate order. */
3096    
3097     if (*ecode == OP_KETRMIN)
3098     {
3099 nigel 23 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||
3100     match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;
3101 nigel 3 }
3102     else /* OP_KETRMAX */
3103     {
3104 nigel 23 if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||
3105     match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3106 nigel 3 }
3107     }
3108     return FALSE;
3109    
3110     /* Start of subject unless notbol, or after internal newline if multiline */
3111    
3112     case OP_CIRC:
3113     if (md->notbol && eptr == md->start_subject) return FALSE;
3114 nigel 23 if ((ims & PCRE_MULTILINE) != 0)
3115 nigel 3 {
3116     if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;
3117     ecode++;
3118     break;
3119     }
3120     /* ... else fall through */
3121    
3122     /* Start of subject assertion */
3123    
3124     case OP_SOD:
3125     if (eptr != md->start_subject) return FALSE;
3126     ecode++;
3127     break;
3128    
3129 nigel 23 /* Assert before internal newline if multiline, or before a terminating
3130     newline unless endonly is set, else end of subject unless noteol is set. */
3131 nigel 3
3132     case OP_DOLL:
3133 nigel 23 if ((ims & PCRE_MULTILINE) != 0)
3134 nigel 3 {
3135 nigel 23 if (eptr < md->end_subject) { if (*eptr != '\n') return FALSE; }
3136     else { if (md->noteol) return FALSE; }
3137 nigel 3 ecode++;
3138     break;
3139     }
3140 nigel 23 else
3141 nigel 3 {
3142 nigel 23 if (md->noteol) return FALSE;
3143     if (!md->endonly)
3144     {
3145     if (eptr < md->end_subject - 1 ||
3146     (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3147    
3148     ecode++;
3149     break;
3150     }
3151 nigel 3 }
3152     /* ... else fall through */
3153    
3154 nigel 23 /* End of subject assertion (\z) */
3155 nigel 3
3156     case OP_EOD:
3157     if (eptr < md->end_subject) return FALSE;
3158     ecode++;
3159     break;
3160    
3161 nigel 23 /* End of subject or ending \n assertion (\Z) */
3162    
3163     case OP_EODN:
3164     if (eptr < md->end_subject - 1 ||
3165     (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3166     ecode++;
3167     break;
3168    
3169 nigel 3 /* Word boundary assertions */
3170    
3171     case OP_NOT_WORD_BOUNDARY:
3172     case OP_WORD_BOUNDARY:
3173     {
3174     BOOL prev_is_word = (eptr != md->start_subject) &&
3175     ((pcre_ctypes[eptr[-1]] & ctype_word) != 0);
3176     BOOL cur_is_word = (eptr < md->end_subject) &&
3177     ((pcre_ctypes[*eptr] & ctype_word) != 0);
3178     if ((*ecode++ == OP_WORD_BOUNDARY)?
3179     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3180     return FALSE;
3181     }
3182     break;
3183    
3184     /* Match a single character type; inline for speed */
3185    
3186     case OP_ANY:
3187 nigel 23 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == '\n')
3188     return FALSE;
3189 nigel 3 if (eptr++ >= md->end_subject) return FALSE;
3190     ecode++;
3191     break;
3192    
3193     case OP_NOT_DIGIT:
3194     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) != 0)
3195     return FALSE;
3196     ecode++;
3197     break;
3198    
3199     case OP_DIGIT:
3200     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) == 0)
3201     return FALSE;
3202     ecode++;
3203     break;
3204    
3205     case OP_NOT_WHITESPACE:
3206     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) != 0)
3207     return FALSE;
3208     ecode++;
3209     break;
3210    
3211     case OP_WHITESPACE:
3212     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) == 0)
3213     return FALSE;
3214     ecode++;
3215     break;
3216    
3217     case OP_NOT_WORDCHAR:
3218     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) != 0)
3219     return FALSE;
3220     ecode++;
3221     break;
3222    
3223     case OP_WORDCHAR:
3224     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) == 0)
3225     return FALSE;
3226     ecode++;
3227     break;
3228    
3229     /* Match a back reference, possibly repeatedly. Look past the end of the
3230     item to see if there is repeat information following. The code is similar
3231     to that for character classes, but repeated for efficiency. Then obey
3232     similar code to character type repeats - written out again for speed.
3233     However, if the referenced string is the empty string, always treat
3234     it as matched, any number of times (otherwise there could be infinite
3235     loops). */
3236    
3237     case OP_REF:
3238     {
3239     int length;
3240 nigel 23 int offset = ecode[1] << 1; /* Doubled reference number */
3241 nigel 3 ecode += 2; /* Advance past the item */
3242    
3243 nigel 23 /* If the reference is unset, set the length to be longer than the amount
3244     of subject left; this ensures that every attempt at a match fails. We
3245     can't just fail here, because of the possibility of quantifiers with zero
3246     minima. */
3247 nigel 3
3248 nigel 23 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
3249     md->end_subject - eptr + 1 :
3250     md->offset_vector[offset+1] - md->offset_vector[offset];
3251 nigel 3
3252 nigel 23 /* Set up for repetition, or handle the non-repeated case */
3253    
3254 nigel 3 switch (*ecode)
3255     {
3256     case OP_CRSTAR:
3257     case OP_CRMINSTAR:
3258     case OP_CRPLUS:
3259     case OP_CRMINPLUS:
3260     case OP_CRQUERY:
3261     case OP_CRMINQUERY:
3262     c = *ecode++ - OP_CRSTAR;
3263     minimize = (c & 1) != 0;
3264     min = rep_min[c]; /* Pick up values from tables; */
3265     max = rep_max[c]; /* zero for max => infinity */
3266     if (max == 0) max = INT_MAX;
3267     break;
3268    
3269     case OP_CRRANGE:
3270     case OP_CRMINRANGE:
3271     minimize = (*ecode == OP_CRMINRANGE);
3272     min = (ecode[1] << 8) + ecode[2];
3273     max = (ecode[3] << 8) + ecode[4];
3274     if (max == 0) max = INT_MAX;
3275     ecode += 5;
3276     break;
3277    
3278     default: /* No repeat follows */
3279 nigel 23 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
3280 nigel 3 eptr += length;
3281     continue; /* With the main loop */
3282     }
3283    
3284     /* If the length of the reference is zero, just continue with the
3285     main loop. */
3286    
3287     if (length == 0) continue;
3288    
3289     /* First, ensure the minimum number of matches are present. We get back
3290     the length of the reference string explicitly rather than passing the
3291     address of eptr, so that eptr can be a register variable. */
3292    
3293     for (i = 1; i <= min; i++)
3294     {
3295 nigel 23 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
3296 nigel 3 eptr += length;
3297     }
3298    
3299     /* If min = max, continue at the same level without recursion.
3300     They are not both allowed to be zero. */
3301    
3302     if (min == max) continue;
3303    
3304     /* If minimizing, keep trying and advancing the pointer */
3305    
3306     if (minimize)
3307     {
3308     for (i = min;; i++)
3309     {
3310 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3311     return TRUE;
3312     if (i >= max || !match_ref(offset, eptr, length, md, ims))
3313 nigel 3 return FALSE;
3314     eptr += length;
3315     }
3316     /* Control never gets here */
3317     }
3318    
3319     /* If maximizing, find the longest string and work backwards */
3320    
3321     else
3322     {
3323 nigel 7 const uschar *pp = eptr;
3324 nigel 3 for (i = min; i < max; i++)
3325     {
3326 nigel 23 if (!match_ref(offset, eptr, length, md, ims)) break;
3327 nigel 3 eptr += length;
3328     }
3329     while (eptr >= pp)
3330     {
3331 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3332     return TRUE;
3333 nigel 3 eptr -= length;
3334     }
3335     return FALSE;
3336     }
3337     }
3338     /* Control never gets here */
3339    
3340 nigel 23
3341    
3342 nigel 3 /* Match a character class, possibly repeatedly. Look past the end of the
3343     item to see if there is repeat information following. Then obey similar
3344 nigel 23 code to character type repeats - written out again for speed. */
3345 nigel 3
3346     case OP_CLASS:
3347     {
3348 nigel 7 const uschar *data = ecode + 1; /* Save for matching */
3349     ecode += 33; /* Advance past the item */
3350 nigel 3
3351     switch (*ecode)
3352     {
3353     case OP_CRSTAR:
3354     case OP_CRMINSTAR:
3355     case OP_CRPLUS:
3356     case OP_CRMINPLUS:
3357     case OP_CRQUERY:
3358     case OP_CRMINQUERY:
3359     c = *ecode++ - OP_CRSTAR;
3360     minimize = (c & 1) != 0;
3361     min = rep_min[c]; /* Pick up values from tables; */
3362     max = rep_max[c]; /* zero for max => infinity */
3363     if (max == 0) max = INT_MAX;
3364     break;
3365    
3366     case OP_CRRANGE:
3367     case OP_CRMINRANGE:
3368     minimize = (*ecode == OP_CRMINRANGE);
3369     min = (ecode[1] << 8) + ecode[2];
3370     max = (ecode[3] << 8) + ecode[4];
3371     if (max == 0) max = INT_MAX;
3372     ecode += 5;
3373     break;
3374    
3375     default: /* No repeat follows */
3376 nigel 13 min = max = 1;
3377     break;
3378 nigel 3 }
3379    
3380     /* First, ensure the minimum number of matches are present. */
3381    
3382     for (i = 1; i <= min; i++)
3383     {
3384     if (eptr >= md->end_subject) return FALSE;
3385     c = *eptr++;
3386 nigel 23 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3387 nigel 3 return FALSE;
3388     }
3389    
3390     /* If max == min we can continue with the main loop without the
3391     need to recurse. */
3392    
3393     if (min == max) continue;
3394    
3395     /* If minimizing, keep testing the rest of the expression and advancing
3396     the pointer while it matches the class. */
3397    
3398     if (minimize)
3399     {
3400     for (i = min;; i++)
3401     {
3402 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3403     return TRUE;
3404 nigel 3 if (i >= max || eptr >= md->end_subject) return FALSE;
3405     c = *eptr++;
3406 nigel 23 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3407 nigel 3 return FALSE;
3408     }
3409     /* Control never gets here */
3410     }
3411    
3412     /* If maximizing, find the longest possible run, then work backwards. */
3413    
3414     else
3415     {
3416 nigel 7 const uschar *pp = eptr;
3417 nigel 3 for (i = min; i < max; eptr++, i++)
3418     {
3419     if (eptr >= md->end_subject) break;
3420     c = *eptr;
3421 nigel 23 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3422 nigel 3 break;
3423     }
3424    
3425     while (eptr >= pp)
3426 nigel 23 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3427     return TRUE;
3428 nigel 3 return FALSE;
3429     }
3430     }
3431     /* Control never gets here */
3432    
3433     /* Match a run of characters */
3434    
3435     case OP_CHARS:
3436     {
3437     register int length = ecode[1];
3438     ecode += 2;
3439    
3440 nigel 9 #ifdef DEBUG /* Sigh. Some compilers never learn. */
3441 nigel 3 if (eptr >= md->end_subject)
3442     printf("matching subject <null> against pattern ");
3443     else
3444     {
3445     printf("matching subject ");
3446     pchars(eptr, length, TRUE, md);
3447     printf(" against pattern ");
3448     }
3449     pchars(ecode, length, FALSE, md);
3450     printf("\n");
3451 nigel 9 #endif
3452 nigel 3
3453     if (length > md->end_subject - eptr) return FALSE;
3454 nigel 23 if ((ims & PCRE_CASELESS) != 0)
3455 nigel 3 {
3456     while (length-- > 0) if (pcre_lcc[*ecode++] != pcre_lcc[*eptr++]) return FALSE;
3457     }
3458     else
3459     {
3460     while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
3461     }
3462     }
3463     break;
3464    
3465     /* Match a single character repeatedly; different opcodes share code. */
3466    
3467     case OP_EXACT:
3468     min = max = (ecode[1] << 8) + ecode[2];
3469     ecode += 3;
3470     goto REPEATCHAR;
3471    
3472     case OP_UPTO:
3473     case OP_MINUPTO:
3474     min = 0;
3475     max = (ecode[1] << 8) + ecode[2];
3476     minimize = *ecode == OP_MINUPTO;
3477     ecode += 3;
3478     goto REPEATCHAR;
3479    
3480     case OP_STAR:
3481     case OP_MINSTAR:
3482     case OP_PLUS:
3483     case OP_MINPLUS:
3484     case OP_QUERY:
3485     case OP_MINQUERY:
3486     c = *ecode++ - OP_STAR;
3487     minimize = (c & 1) != 0;
3488     min = rep_min[c]; /* Pick up values from tables; */
3489     max = rep_max[c]; /* zero for max => infinity */
3490     if (max == 0) max = INT_MAX;
3491    
3492     /* Common code for all repeated single-character matches. We can give
3493     up quickly if there are fewer than the minimum number of characters left in
3494     the subject. */
3495    
3496     REPEATCHAR:
3497     if (min > md->end_subject - eptr) return FALSE;
3498     c = *ecode++;
3499    
3500     /* The code is duplicated for the caseless and caseful cases, for speed,
3501     since matching characters is likely to be quite common. First, ensure the
3502     minimum number of matches are present. If min = max, continue at the same
3503     level without recursing. Otherwise, if minimizing, keep trying the rest of
3504     the expression and advancing one matching character if failing, up to the
3505     maximum. Alternatively, if maximizing, find the maximum number of
3506     characters and work backwards. */
3507    
3508 nigel 9 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
3509     max, eptr));
3510 nigel 3
3511 nigel 23 if ((ims & PCRE_CASELESS) != 0)
3512 nigel 3 {
3513     c = pcre_lcc[c];
3514     for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) return FALSE;
3515     if (min == max) continue;
3516     if (minimize)
3517     {
3518     for (i = min;; i++)
3519     {
3520 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3521     return TRUE;
3522 nigel 3 if (i >= max || eptr >= md->end_subject || c != pcre_lcc[*eptr++])
3523     return FALSE;
3524     }
3525     /* Control never gets here */
3526     }
3527     else
3528     {
3529 nigel 7 const uschar *pp = eptr;
3530 nigel 3 for (i = min; i < max; i++)
3531     {
3532     if (eptr >= md->end_subject || c != pcre_lcc[*eptr]) break;
3533     eptr++;
3534     }
3535     while (eptr >= pp)
3536 nigel 23 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3537     return TRUE;
3538 nigel 3 return FALSE;
3539     }
3540     /* Control never gets here */
3541     }
3542    
3543     /* Caseful comparisons */
3544    
3545     else
3546     {
3547     for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
3548     if (min == max) continue;
3549     if (minimize)
3550     {
3551     for (i = min;; i++)
3552     {
3553 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3554     return TRUE;
3555 nigel 3 if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
3556     }
3557     /* Control never gets here */
3558     }
3559     else
3560     {
3561 nigel 7 const uschar *pp = eptr;
3562 nigel 3 for (i = min; i < max; i++)
3563     {
3564     if (eptr >= md->end_subject || c != *eptr) break;
3565     eptr++;
3566     }
3567     while (eptr >= pp)
3568 nigel 23 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3569     return TRUE;
3570 nigel 3 return FALSE;
3571     }
3572     }
3573     /* Control never gets here */
3574    
3575     /* Match a negated single character */
3576    
3577     case OP_NOT:
3578 nigel 9 if (eptr >= md->end_subject) return FALSE;
3579 nigel 3 ecode++;
3580 nigel 23 if ((ims & PCRE_CASELESS) != 0)
3581 nigel 3 {
3582     if (pcre_lcc[*ecode++] == pcre_lcc[*eptr++]) return FALSE;
3583     }
3584     else
3585     {
3586     if (*ecode++ == *eptr++) return FALSE;
3587     }
3588     break;
3589    
3590     /* Match a negated single character repeatedly. This is almost a repeat of
3591     the code for a repeated single character, but I haven't found a nice way of
3592     commoning these up that doesn't require a test of the positive/negative
3593     option for each character match. Maybe that wouldn't add very much to the
3594     time taken, but character matching *is* what this is all about... */
3595    
3596     case OP_NOTEXACT:
3597     min = max = (ecode[1] << 8) + ecode[2];
3598     ecode += 3;
3599     goto REPEATNOTCHAR;
3600    
3601     case OP_NOTUPTO:
3602     case OP_NOTMINUPTO:
3603     min = 0;
3604     max = (ecode[1] << 8) + ecode[2];
3605     minimize = *ecode == OP_NOTMINUPTO;
3606     ecode += 3;
3607     goto REPEATNOTCHAR;
3608    
3609     case OP_NOTSTAR:
3610     case OP_NOTMINSTAR:
3611     case OP_NOTPLUS:
3612     case OP_NOTMINPLUS:
3613     case OP_NOTQUERY:
3614     case OP_NOTMINQUERY:
3615     c = *ecode++ - OP_NOTSTAR;
3616     minimize = (c & 1) != 0;
3617     min = rep_min[c]; /* Pick up values from tables; */
3618     max = rep_max[c]; /* zero for max => infinity */
3619     if (max == 0) max = INT_MAX;
3620    
3621     /* Common code for all repeated single-character matches. We can give
3622     up quickly if there are fewer than the minimum number of characters left in
3623     the subject. */
3624    
3625     REPEATNOTCHAR:
3626     if (min > md->end_subject - eptr) return FALSE;
3627     c = *ecode++;
3628    
3629     /* The code is duplicated for the caseless and caseful cases, for speed,
3630     since matching characters is likely to be quite common. First, ensure the
3631     minimum number of matches are present. If min = max, continue at the same
3632     level without recursing. Otherwise, if minimizing, keep trying the rest of
3633     the expression and advancing one matching character if failing, up to the
3634     maximum. Alternatively, if maximizing, find the maximum number of
3635     characters and work backwards. */
3636    
3637 nigel 9 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
3638     max, eptr));
3639 nigel 3
3640 nigel 23 if ((ims & PCRE_CASELESS) != 0)
3641 nigel 3 {
3642     c = pcre_lcc[c];
3643     for (i = 1; i <= min; i++) if (c == pcre_lcc[*eptr++]) return FALSE;
3644     if (min == max) continue;
3645     if (minimize)
3646     {
3647     for (i = min;; i++)
3648     {
3649 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3650     return TRUE;
3651 nigel 3 if (i >= max || eptr >= md->end_subject || c == pcre_lcc[*eptr++])
3652     return FALSE;
3653     }
3654     /* Control never gets here */
3655     }
3656     else
3657     {
3658 nigel 7 const uschar *pp = eptr;
3659 nigel 3 for (i = min; i < max; i++)
3660     {
3661     if (eptr >= md->end_subject || c == pcre_lcc[*eptr]) break;
3662     eptr++;
3663     }
3664     while (eptr >= pp)
3665 nigel 23 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3666     return TRUE;
3667 nigel 3 return FALSE;
3668     }
3669     /* Control never gets here */
3670     }
3671    
3672     /* Caseful comparisons */
3673    
3674     else
3675     {
3676     for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
3677     if (min == max) continue;
3678     if (minimize)
3679     {
3680     for (i = min;; i++)
3681     {
3682 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3683     return TRUE;
3684 nigel 3 if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
3685     }
3686     /* Control never gets here */
3687     }
3688     else
3689     {
3690 nigel 7 const uschar *pp = eptr;
3691 nigel 3 for (i = min; i < max; i++)
3692     {
3693     if (eptr >= md->end_subject || c == *eptr) break;
3694     eptr++;
3695     }
3696     while (eptr >= pp)
3697 nigel 23 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3698     return TRUE;
3699 nigel 3 return FALSE;
3700     }
3701     }
3702     /* Control never gets here */
3703    
3704     /* Match a single character type repeatedly; several different opcodes
3705     share code. This is very similar to the code for single characters, but we
3706     repeat it in the interests of efficiency. */
3707    
3708     case OP_TYPEEXACT:
3709     min = max = (ecode[1] << 8) + ecode[2];
3710     minimize = TRUE;
3711     ecode += 3;
3712     goto REPEATTYPE;
3713    
3714     case OP_TYPEUPTO:
3715     case OP_TYPEMINUPTO:
3716     min = 0;
3717     max = (ecode[1] << 8) + ecode[2];
3718     minimize = *ecode == OP_TYPEMINUPTO;
3719     ecode += 3;
3720     goto REPEATTYPE;
3721    
3722     case OP_TYPESTAR:
3723     case OP_TYPEMINSTAR:
3724     case OP_TYPEPLUS:
3725     case OP_TYPEMINPLUS:
3726     case OP_TYPEQUERY:
3727     case OP_TYPEMINQUERY:
3728     c = *ecode++ - OP_TYPESTAR;
3729     minimize = (c & 1) != 0;
3730     min = rep_min[c]; /* Pick up values from tables; */
3731     max = rep_max[c]; /* zero for max => infinity */
3732     if (max == 0) max = INT_MAX;
3733    
3734     /* Common code for all repeated single character type matches */
3735    
3736     REPEATTYPE:
3737     ctype = *ecode++; /* Code for the character type */
3738    
3739     /* First, ensure the minimum number of matches are present. Use inline
3740     code for maximizing the speed, and do the type test once at the start
3741     (i.e. keep it out of the loop). Also test that there are at least the
3742     minimum number of characters before we start. */
3743    
3744     if (min > md->end_subject - eptr) return FALSE;
3745     if (min > 0) switch(ctype)
3746     {
3747     case OP_ANY:
3748 nigel 23 if ((ims & PCRE_DOTALL) == 0)
3749 nigel 3 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }
3750     else eptr += min;
3751     break;
3752    
3753     case OP_NOT_DIGIT:
3754     for (i = 1; i <= min; i++)
3755     if ((pcre_ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
3756     break;
3757    
3758     case OP_DIGIT:
3759     for (i = 1; i <= min; i++)
3760     if ((pcre_ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
3761     break;
3762    
3763     case OP_NOT_WHITESPACE:
3764     for (i = 1; i <= min; i++)
3765     if ((pcre_ctypes[*eptr++] & ctype_space) != 0) return FALSE;
3766     break;
3767    
3768     case OP_WHITESPACE:
3769     for (i = 1; i <= min; i++)
3770     if ((pcre_ctypes[*eptr++] & ctype_space) == 0) return FALSE;
3771     break;
3772    
3773     case OP_NOT_WORDCHAR:
3774     for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) != 0)
3775     return FALSE;
3776     break;
3777    
3778     case OP_WORDCHAR:
3779     for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) == 0)
3780     return FALSE;
3781     break;
3782     }
3783    
3784     /* If min = max, continue at the same level without recursing */
3785    
3786     if (min == max) continue;
3787    
3788     /* If minimizing, we have to test the rest of the pattern before each
3789     subsequent match, so inlining isn't much help; just use the function. */
3790    
3791     if (minimize)
3792     {
3793     for (i = min;; i++)
3794     {
3795 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb)) return TRUE;
3796 nigel 3 if (i >= max || eptr >= md->end_subject ||
3797 nigel 23 !match_type(ctype, *eptr++, (ims & PCRE_DOTALL) != 0))
3798 nigel 3 return FALSE;
3799     }
3800     /* Control never gets here */
3801     }
3802    
3803     /* If maximizing it is worth using inline code for speed, doing the type
3804     test once at the start (i.e. keep it out of the loop). */
3805    
3806     else
3807     {
3808 nigel 7 const uschar *pp = eptr;
3809 nigel 3 switch(ctype)
3810     {
3811     case OP_ANY:
3812 nigel 23 if ((ims & PCRE_DOTALL) == 0)
3813 nigel 3 {
3814     for (i = min; i < max; i++)
3815     {
3816     if (eptr >= md->end_subject || *eptr == '\n') break;
3817     eptr++;
3818     }
3819     }
3820     else
3821     {
3822     c = max - min;
3823     if (c > md->end_subject - eptr) c = md->end_subject - eptr;
3824     eptr += c;
3825     }
3826     break;
3827    
3828     case OP_NOT_DIGIT:
3829     for (i = min; i < max; i++)
3830     {
3831     if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) != 0)
3832     break;
3833     eptr++;
3834     }
3835     break;
3836    
3837     case OP_DIGIT:
3838     for (i = min; i < max; i++)
3839     {
3840     if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) == 0)
3841     break;
3842     eptr++;
3843     }
3844     break;
3845    
3846     case OP_NOT_WHITESPACE:
3847     for (i = min; i < max; i++)
3848     {
3849     if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) != 0)
3850     break;
3851     eptr++;
3852     }
3853     break;
3854    
3855     case OP_WHITESPACE:
3856     for (i = min; i < max; i++)
3857     {
3858     if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) == 0)
3859     break;
3860     eptr++;
3861     }
3862     break;
3863    
3864     case OP_NOT_WORDCHAR:
3865     for (i = min; i < max; i++)
3866     {
3867     if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) != 0)
3868     break;
3869     eptr++;
3870     }
3871     break;
3872    
3873     case OP_WORDCHAR:
3874     for (i = min; i < max; i++)
3875     {
3876     if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) == 0)
3877     break;
3878     eptr++;
3879     }
3880     break;
3881     }
3882    
3883     while (eptr >= pp)
3884 nigel 23 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3885     return TRUE;
3886 nigel 3 return FALSE;
3887     }
3888     /* Control never gets here */
3889    
3890     /* There's been some horrible disaster. */
3891    
3892     default:
3893 nigel 9 DPRINTF(("Unknown opcode %d\n", *ecode));
3894 nigel 3 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
3895     return FALSE;
3896     }
3897    
3898     /* Do not stick any code in here without much thought; it is assumed
3899     that "continue" in the code above comes out to here to repeat the main
3900     loop. */
3901    
3902     } /* End of main loop */
3903     /* Control never reaches here */
3904     }
3905    
3906    
3907    
3908 nigel 9
3909     /*************************************************
3910 nigel 3 * Execute a Regular Expression *
3911     *************************************************/
3912    
3913     /* This function applies a compiled re to a subject string and picks out
3914     portions of the string if it matches. Two elements in the vector are set for
3915     each substring: the offsets to the start and end of the substring.
3916    
3917     Arguments:
3918     external_re points to the compiled expression
3919     external_extra points to "hints" from pcre_study() or is NULL
3920     subject points to the subject string
3921     length length of subject string (may contain binary zeros)
3922     options option bits
3923     offsets points to a vector of ints to be filled in with offsets
3924     offsetcount the number of elements in the vector
3925    
3926     Returns: > 0 => success; value is the number of elements filled in
3927     = 0 => success, but offsets is not big enough
3928     -1 => failed to match
3929     < -1 => some kind of unexpected problem
3930     */
3931    
3932     int
3933     pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
3934     const char *subject, int length, int options, int *offsets, int offsetcount)
3935     {
3936 nigel 11 int resetcount, ocount;
3937 nigel 3 int first_char = -1;
3938 nigel 23 int ims = 0;
3939 nigel 3 match_data match_block;
3940 nigel 7 const uschar *start_bits = NULL;
3941 nigel 9 const uschar *start_match = (const uschar *)subject;
3942 nigel 7 const uschar *end_subject;
3943     const real_pcre *re = (const real_pcre *)external_re;
3944     const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
3945 nigel 11 BOOL using_temporary_offsets = FALSE;
3946 nigel 3 BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
3947     BOOL startline = (re->options & PCRE_STARTLINE) != 0;
3948    
3949     if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3950    
3951     if (re == NULL || subject == NULL ||
3952     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3953     if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
3954    
3955 nigel 7 match_block.start_subject = (const uschar *)subject;
3956 nigel 3 match_block.end_subject = match_block.start_subject + length;
3957     end_subject = match_block.end_subject;
3958    
3959 nigel 23 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
3960 nigel 3
3961     match_block.notbol = (options & PCRE_NOTBOL) != 0;
3962     match_block.noteol = (options & PCRE_NOTEOL) != 0;
3963    
3964     match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
3965    
3966 nigel 23 /* The ims options can vary during the matching as a result of the presence
3967     of (?ims) items in the pattern. They are kept in a local variable so that
3968     restoring at the exit of a group is easy. */
3969    
3970     ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
3971    
3972 nigel 3 /* If the expression has got more back references than the offsets supplied can
3973     hold, we get a temporary bit of working store to use during the matching.
3974 nigel 11 Otherwise, we can use the vector supplied, rounding down its size to a multiple
3975 nigel 23 of 3. */
3976 nigel 3
3977 nigel 23 ocount = offsetcount - (offsetcount % 3);
3978    
3979     if (re->top_backref > 0 && re->top_backref >= ocount/3)
3980 nigel 3 {
3981 nigel 23 ocount = re->top_backref * 3 + 3;
3982 nigel 15 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
3983 nigel 3 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
3984 nigel 11 using_temporary_offsets = TRUE;
3985 nigel 9 DPRINTF(("Got memory to hold back references\n"));
3986 nigel 3 }
3987     else match_block.offset_vector = offsets;
3988    
3989     match_block.offset_end = ocount;
3990 nigel 23 match_block.offset_max = (2*ocount)/3;
3991 nigel 3 match_block.offset_overflow = FALSE;
3992    
3993     /* Compute the minimum number of offsets that we need to reset each time. Doing
3994     this makes a huge difference to execution time when there aren't many brackets
3995     in the pattern. */
3996    
3997     resetcount = 2 + re->top_bracket * 2;
3998     if (resetcount > offsetcount) resetcount = ocount;
3999    
4000     /* Set up the first character to match, if available. The first_char value is
4001     never set for an anchored regular expression, but the anchoring may be forced
4002     at run time, so we have to test for anchoring. The first char may be unset for
4003     an unanchored pattern, of course. If there's no first char and the pattern was
4004 nigel 23 studied, there may be a bitmap of possible first characters. */
4005 nigel 3
4006     if (!anchored)
4007     {
4008     if ((re->options & PCRE_FIRSTSET) != 0)
4009     {
4010     first_char = re->first_char;
4011 nigel 23 if ((ims & PCRE_CASELESS) != 0) first_char = pcre_lcc[first_char];
4012 nigel 3 }
4013     else
4014     if (!startline && extra != NULL &&
4015 nigel 23 (extra->options & PCRE_STUDY_MAPPED) != 0)
4016 nigel 3 start_bits = extra->start_bits;
4017     }
4018    
4019     /* Loop for unanchored matches; for anchored regexps the loop runs just once. */
4020    
4021     do
4022     {
4023 nigel 9 int rc;
4024 nigel 3 register int *iptr = match_block.offset_vector;
4025     register int *iend = iptr + resetcount;
4026    
4027     /* Reset the maximum number of extractions we might see. */
4028    
4029     while (iptr < iend) *iptr++ = -1;
4030    
4031     /* Advance to a unique first char if possible */
4032    
4033     if (first_char >= 0)
4034     {
4035 nigel 23 if ((ims & PCRE_CASELESS) != 0)
4036 nigel 3 while (start_match < end_subject && pcre_lcc[*start_match] != first_char)
4037     start_match++;
4038     else
4039     while (start_match < end_subject && *start_match != first_char)
4040     start_match++;
4041     }
4042    
4043     /* Or to just after \n for a multiline match if possible */
4044    
4045     else if (startline)
4046     {
4047     if (start_match > match_block.start_subject)
4048     {
4049     while (start_match < end_subject && start_match[-1] != '\n')
4050     start_match++;
4051     }
4052     }
4053    
4054     /* Or to a non-unique first char */
4055    
4056     else if (start_bits != NULL)
4057     {
4058     while (start_match < end_subject)
4059     {
4060     register int c = *start_match;
4061     if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
4062     }
4063     }
4064    
4065 nigel 9 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4066 nigel 3 printf(">>>> Match against: ");
4067     pchars(start_match, end_subject - start_match, TRUE, &match_block);
4068     printf("\n");
4069 nigel 9 #endif
4070 nigel 3
4071     /* When a match occurs, substrings will be set for all internal extractions;
4072     we just need to set up the whole thing as substring 0 before returning. If
4073     there were too many extractions, set the return code to zero. In the case
4074     where we had to get some local store to hold offsets for backreferences, copy
4075     those back references that we can. In this case there need not be overflow
4076 nigel 23 if certain parts of the pattern were not used. */
4077 nigel 3
4078 nigel 23 if (!match(start_match, re->code, 2, &match_block, ims, FALSE, start_match))
4079     continue;
4080 nigel 3
4081 nigel 9 /* Copy the offset information from temporary store if necessary */
4082    
4083 nigel 11 if (using_temporary_offsets)
4084 nigel 3 {
4085 nigel 9 if (offsetcount >= 4)
4086 nigel 3 {
4087 nigel 9 memcpy(offsets + 2, match_block.offset_vector + 2,
4088     (offsetcount - 2) * sizeof(int));
4089 nigel 11 DPRINTF(("Copied offsets from temporary memory\n"));
4090 nigel 3 }
4091 nigel 9 if (match_block.end_offset_top > offsetcount)
4092     match_block.offset_overflow = TRUE;
4093 nigel 3
4094 nigel 9 DPRINTF(("Freeing temporary memory\n"));
4095     (pcre_free)(match_block.offset_vector);
4096     }
4097 nigel 3
4098 nigel 9 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
4099 nigel 3
4100 nigel 9 if (match_block.offset_end < 2) rc = 0; else
4101     {
4102     offsets[0] = start_match - match_block.start_subject;
4103     offsets[1] = match_block.end_match_ptr - match_block.start_subject;
4104 nigel 3 }
4105 nigel 9
4106     DPRINTF((">>>> returning %d\n", rc));
4107     return rc;
4108 nigel 3 }
4109     while (!anchored &&
4110     match_block.errorcode == PCRE_ERROR_NOMATCH &&
4111     start_match++ < end_subject);
4112    
4113 nigel 11 if (using_temporary_offsets)
4114     {
4115     DPRINTF(("Freeing temporary memory\n"));
4116     (pcre_free)(match_block.offset_vector);
4117     }
4118    
4119 nigel 9 DPRINTF((">>>> returning %d\n", match_block.errorcode));
4120 nigel 3
4121     return match_block.errorcode;
4122     }
4123    
4124     /* End of pcre.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12