/[pcre]/code/trunk/pcre.c
ViewVC logotype

Contents of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 5 - (hide annotations) (download)
Sat Feb 24 21:38:05 2007 UTC (7 years, 7 months ago) by nigel
File MIME type: text/plain
File size: 99029 byte(s)
Load pcre-1.01 into code/trunk.

1 nigel 3 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /*
6     This is a library of functions to support regular expressions whose syntax
7     and semantics are as close as possible to those of the Perl 5 language. See
8     the file Tech.Notes for some information on the internals.
9    
10     Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12     Copyright (c) 1997 University of Cambridge
13    
14     -----------------------------------------------------------------------------
15     Permission is granted to anyone to use this software for any purpose on any
16     computer system, and to redistribute it freely, subject to the following
17     restrictions:
18    
19     1. This software is distributed in the hope that it will be useful,
20     but WITHOUT ANY WARRANTY; without even the implied warranty of
21     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22    
23     2. The origin of this software must not be misrepresented, either by
24     explicit claim or by omission.
25    
26     3. Altered versions must be plainly marked as such, and must not be
27     misrepresented as being the original software.
28     -----------------------------------------------------------------------------
29     */
30    
31    
32     /* Define DEBUG to get debugging output on stdout. */
33    
34     /* #define DEBUG */
35    
36    
37     /* Include the internals header, which itself includes Standard C headers plus
38     the external pcre header. */
39    
40     #include "internal.h"
41    
42    
43     /* Min and max values for the common repeats; for the maxima, 0 => infinity */
44    
45     static char rep_min[] = { 0, 0, 1, 1, 0, 0 };
46     static char rep_max[] = { 0, 0, 0, 0, 1, 1 };
47    
48     /* Text forms of OP_ values and things, for debugging */
49    
50     #ifdef DEBUG
51     static char *OP_names[] = { "End", "\\A", "\\B", "\\b", "\\D", "\\d",
52     "\\S", "\\s", "\\W", "\\w", "Cut", "\\Z", "^", "$", "Any", "chars",
53     "not",
54     "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
55     "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
56     "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
57     "*", "*?", "+", "+?", "?", "??", "{", "{",
58     "class", "Ref",
59     "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",
60     "Brazero", "Braminzero", "Bra"
61     };
62     #endif
63    
64     /* Table for handling escaped characters in the range '0'-'z'. Positive returns
65     are simple data values; negative values are for special things like \d and so
66     on. Zero means further processing is needed (for things like \x), or the escape
67     is invalid. */
68    
69     static short int escapes[] = {
70     0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
71     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
72     '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
73     0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
74     0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
75     0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
76     '`', 7, -ESC_b, 0, -ESC_d, 27, '\f', 0, /* ` - g */
77     0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
78     0, 0, '\r', -ESC_s, '\t', 0, 0, -ESC_w, /* p - w */
79     0, 0, 0 /* x - z */
80     };
81    
82     /* Definition to allow mutual recursion */
83    
84     static BOOL compile_regex(int, int *,uschar **,uschar **,char **);
85    
86     /* Structure for passing "static" information around between the functions
87     doing the matching, so that they are thread-safe. */
88    
89     typedef struct match_data {
90     int errorcode; /* As it says */
91     int *offset_vector; /* Offset vector */
92     int offset_end; /* One past the end */
93     BOOL offset_overflow; /* Set if too many extractions */
94     BOOL caseless; /* Case-independent flag */
95     BOOL runtime_caseless; /* Caseless forced at run time */
96     BOOL multiline; /* Multiline flag */
97     BOOL notbol; /* NOTBOL flag */
98     BOOL noteol; /* NOTEOL flag */
99     BOOL dotall; /* Dot matches any char */
100     BOOL endonly; /* Dollar not before final \n */
101     uschar *start_subject; /* Start of the subject string */
102     uschar *end_subject; /* End of the subject string */
103     jmp_buf fail_env; /* Environment for longjump() break out */
104     uschar *end_match_ptr; /* Subject position at end match */
105     int end_offset_top; /* Highwater mark at end of match */
106     } match_data;
107    
108    
109    
110     /*************************************************
111     * Global variables *
112     *************************************************/
113    
114     /* PCRE is thread-clean and doesn't use any global variables in the normal
115     sense. However, it calls memory allocation and free functions via the two
116     indirections below, which are can be changed by the caller, but are shared
117     between all threads. */
118    
119     void *(*pcre_malloc)(size_t) = malloc;
120     void (*pcre_free)(void *) = free;
121    
122    
123    
124    
125     /*************************************************
126     * Return version string *
127     *************************************************/
128    
129     char *
130     pcre_version(void)
131     {
132     return PCRE_VERSION;
133     }
134    
135    
136    
137    
138     /*************************************************
139     * Return info about a compiled pattern *
140     *************************************************/
141    
142     /* This function picks potentially useful data out of the private
143     structure.
144    
145     Arguments:
146     external_re points to compiled code
147     optptr where to pass back the options
148     first_char where to pass back the first character,
149     or -1 if multiline and all branches start ^,
150     or -2 otherwise
151    
152     Returns: number of identifying extraction brackets
153     or negative values on error
154     */
155    
156     int
157     pcre_info(const pcre *external_re, int *optptr, int *first_char)
158     {
159     real_pcre *re = (real_pcre *)external_re;
160     if (re == NULL) return PCRE_ERROR_NULL;
161     if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
162     if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
163     if (first_char != NULL)
164     *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
165     ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
166     return re->top_bracket;
167     }
168    
169    
170    
171    
172     #ifdef DEBUG
173     /*************************************************
174     * Debugging function to print chars *
175     *************************************************/
176    
177     /* Print a sequence of chars in printable format, stopping at the end of the
178     subject if the requested.
179    
180     Arguments:
181     p points to characters
182     length number to print
183     is_subject TRUE if printing from within md->start_subject
184     md pointer to matching data block, if is_subject is TRUE
185    
186     Returns: nothing
187     */
188    
189     static pchars(uschar *p, int length, BOOL is_subject, match_data *md)
190     {
191     int c;
192     if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
193     while (length-- > 0)
194     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
195     }
196     #endif
197    
198    
199    
200    
201     /*************************************************
202     * Check subpattern for empty operand *
203     *************************************************/
204    
205     /* This function checks a bracketed subpattern to see if any of the paths
206     through it could match an empty string. This is used to diagnose an error if
207     such a subpattern is followed by a quantifier with an unlimited upper bound.
208    
209     Argument:
210     code points to the opening bracket
211    
212     Returns: TRUE or FALSE
213     */
214    
215     static BOOL
216     could_be_empty(uschar *code)
217     {
218     do {
219     uschar *cc = code + 3;
220    
221     /* Scan along the opcodes for this branch; as soon as we find something
222     that matches a non-empty string, break out and advance to test the next
223     branch. If we get to the end of the branch, return TRUE for the whole
224     sub-expression. */
225    
226     for (;;)
227     {
228     /* Test an embedded subpattern; if it could not be empty, break the
229     loop. Otherwise carry on in the branch. */
230    
231 nigel 5 if ((int)(*cc) >= OP_BRA || (int)(*cc) == OP_ONCE)
232 nigel 3 {
233     if (!could_be_empty(cc)) break;
234     do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
235     cc += 3;
236     }
237    
238     else switch (*cc)
239     {
240     /* Reached end of a branch: the subpattern may match the empty string */
241    
242     case OP_ALT:
243     case OP_KET:
244     case OP_KETRMAX:
245     case OP_KETRMIN:
246     return TRUE;
247    
248     /* Skip over assertive subpatterns */
249    
250     case OP_ASSERT:
251     case OP_ASSERT_NOT:
252     do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
253     cc += 3;
254     break;
255    
256     /* Skip over things that don't match chars */
257    
258     case OP_SOD:
259     case OP_EOD:
260     case OP_CIRC:
261     case OP_DOLL:
262     case OP_BRAZERO:
263     case OP_BRAMINZERO:
264     case OP_NOT_WORD_BOUNDARY:
265     case OP_WORD_BOUNDARY:
266     cc++;
267     break;
268    
269     /* Skip over simple repeats with zero lower bound */
270    
271     case OP_STAR:
272     case OP_MINSTAR:
273     case OP_QUERY:
274     case OP_MINQUERY:
275 nigel 5 case OP_NOTSTAR:
276     case OP_NOTMINSTAR:
277     case OP_NOTQUERY:
278     case OP_NOTMINQUERY:
279 nigel 3 case OP_TYPESTAR:
280     case OP_TYPEMINSTAR:
281     case OP_TYPEQUERY:
282     case OP_TYPEMINQUERY:
283     cc += 2;
284     break;
285    
286     /* Skip over UPTOs (lower bound is zero) */
287    
288     case OP_UPTO:
289     case OP_MINUPTO:
290     case OP_TYPEUPTO:
291     case OP_TYPEMINUPTO:
292     cc += 4;
293     break;
294    
295     /* Check a class or a back reference for a zero minimum */
296    
297     case OP_CLASS:
298     case OP_REF:
299 nigel 5 cc += (*cc == OP_REF)? 2 : 33;
300 nigel 3
301     switch (*cc)
302     {
303     case OP_CRSTAR:
304     case OP_CRMINSTAR:
305     case OP_CRQUERY:
306     case OP_CRMINQUERY:
307     cc++;
308     break;
309    
310     case OP_CRRANGE:
311     case OP_CRMINRANGE:
312     if ((cc[1] << 8) + cc[2] != 0) goto NEXT_BRANCH;
313     cc += 3;
314     break;
315    
316     default:
317     goto NEXT_BRANCH;
318     }
319     break;
320    
321     /* Anything else matches at least one character */
322    
323     default:
324     goto NEXT_BRANCH;
325     }
326     }
327    
328     NEXT_BRANCH:
329     code += (code[1] << 8) + code[2];
330     }
331     while (*code == OP_ALT);
332    
333     /* No branches match the empty string */
334    
335     return FALSE;
336     }
337    
338    
339    
340     /*************************************************
341     * Handle escapes *
342     *************************************************/
343    
344     /* This function is called when a \ has been encountered. It either returns a
345     positive value for a simple escape such as \n, or a negative value which
346     encodes one of the more complicated things such as \d. On entry, ptr is
347     pointing at the \. On exit, it is on the final character of the escape
348     sequence.
349    
350     Arguments:
351     ptrptr points to the pattern position pointer
352     errorptr points to the pointer to the error message
353     bracount number of previous extracting brackets
354     options the options bits
355     isclass TRUE if inside a character class
356    
357     Returns: zero or positive => a data character
358     negative => a special escape sequence
359     on error, errorptr is set
360     */
361    
362     static int
363     check_escape(uschar **ptrptr, char **errorptr, int bracount, int options,
364     BOOL isclass)
365     {
366     uschar *ptr = *ptrptr;
367     int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
368     int i;
369    
370     if (c == 0) *errorptr = ERR1;
371    
372     /* Digits or letters may have special meaning; all others are literals. */
373    
374     else if (c < '0' || c > 'z') {}
375    
376     /* Do an initial lookup in a table. A non-zero result is something that can be
377     returned immediately. Otherwise further processing may be required. */
378    
379     else if ((i = escapes[c - '0']) != 0) c = i;
380    
381     /* Escapes that need further processing, or are illegal. */
382    
383     else
384     {
385     uschar *oldptr;
386     switch (c)
387     {
388     /* The handling of escape sequences consisting of a string of digits
389     starting with one that is not zero is not straightforward. By experiment,
390     the way Perl works seems to be as follows:
391    
392     Outside a character class, the digits are read as a decimal number. If the
393     number is less than 10, or if there are that many previous extracting
394     left brackets, then it is a back reference. Otherwise, up to three octal
395     digits are read to form an escaped byte. Thus \123 is likely to be octal
396     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
397     value is greater than 377, the least significant 8 bits are taken. Inside a
398     character class, \ followed by a digit is always an octal number. */
399    
400     case '1': case '2': case '3': case '4': case '5':
401     case '6': case '7': case '8': case '9':
402    
403     if (!isclass)
404     {
405     oldptr = ptr;
406     c -= '0';
407     while ((pcre_ctypes[ptr[1]] & ctype_digit) != 0)
408     c = c * 10 + *(++ptr) - '0';
409     if (c < 10 || c <= bracount)
410     {
411     c = -(ESC_REF + c);
412     break;
413     }
414     ptr = oldptr; /* Put the pointer back and fall through */
415     }
416    
417     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
418     generates a binary zero byte and treats the digit as a following literal.
419     Thus we have to pull back the pointer by one. */
420    
421     if ((c = *ptr) >= '8')
422     {
423     ptr--;
424     c = 0;
425     break;
426     }
427    
428     /* \0 always starts an octal number, but we may drop through to here with a
429     larger first octal digit */
430    
431     case '0':
432     c -= '0';
433     while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 &&
434     ptr[1] != '8' && ptr[1] != '9')
435     c = c * 8 + *(++ptr) - '0';
436     break;
437    
438     /* Special escapes not starting with a digit are straightforward */
439    
440     case 'x':
441     c = 0;
442     while (i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0)
443     {
444     ptr++;
445     c = c * 16 + pcre_lcc[*ptr] -
446     (((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
447     }
448     break;
449    
450     case 'c':
451     c = *(++ptr);
452     if (c == 0)
453     {
454     *errorptr = ERR2;
455     return 0;
456     }
457    
458     /* A letter is upper-cased; then the 0x40 bit is flipped */
459    
460     if (c >= 'a' && c <= 'z') c = pcre_fcc[c];
461     c ^= 0x40;
462     break;
463    
464     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
465     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
466     for Perl compatibility, it is a literal. */
467    
468     default:
469     if ((options & PCRE_EXTRA) != 0) switch(c)
470     {
471     case 'X':
472     c = -ESC_X; /* This could be a lookup if it ever got into Perl */
473     break;
474    
475     default:
476     *errorptr = ERR3;
477     break;
478     }
479     break;
480     }
481     }
482    
483     *ptrptr = ptr;
484     return c;
485     }
486    
487    
488    
489     /*************************************************
490     * Check for counted repeat *
491     *************************************************/
492    
493     /* This function is called when a '{' is encountered in a place where it might
494     start a quantifier. It looks ahead to see if it really is a quantifier or not.
495     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
496     where the ddds are digits.
497    
498     Arguments:
499     p pointer to the first char after '{'
500    
501     Returns: TRUE or FALSE
502     */
503    
504     static BOOL
505     is_counted_repeat(uschar *p)
506     {
507     if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
508     while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
509     if (*p == '}') return TRUE;
510    
511     if (*p++ != ',') return FALSE;
512     if (*p == '}') return TRUE;
513    
514     if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
515     while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
516     return (*p == '}');
517     }
518    
519    
520    
521     /*************************************************
522     * Read repeat counts *
523     *************************************************/
524    
525     /* Read an item of the form {n,m} and return the values. This is called only
526     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
527     so the syntax is guaranteed to be correct, but we need to check the values.
528    
529     Arguments:
530     p pointer to first char after '{'
531     minp pointer to int for min
532     maxp pointer to int for max
533     returned as -1 if no max
534     errorptr points to pointer to error message
535    
536     Returns: pointer to '}' on success;
537     current ptr on error, with errorptr set
538     */
539    
540     static uschar *
541     read_repeat_counts(uschar *p, int *minp, int *maxp, char **errorptr)
542     {
543     int min = 0;
544     int max = -1;
545    
546     while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
547    
548     if (*p == '}') max = min; else
549     {
550     if (*(++p) != '}')
551     {
552     max = 0;
553     while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
554     if (max < min)
555     {
556     *errorptr = ERR4;
557     return p;
558     }
559     }
560     }
561    
562     /* Do paranoid checks, then fill in the required variables, and pass back the
563     pointer to the terminating '}'. */
564    
565     if (min > 65535 || max > 65535)
566     *errorptr = ERR5;
567     else
568     {
569     *minp = min;
570     *maxp = max;
571     }
572     return p;
573     }
574    
575    
576    
577     /*************************************************
578     * Compile one branch *
579     *************************************************/
580    
581     /* Scan the pattern, compiling it into the code vector.
582    
583     Arguments:
584     options the option bits
585     bracket points to number of brackets used
586     code points to the pointer to the current code point
587     ptrptr points to the current pattern pointer
588     errorptr points to pointer to error message
589    
590     Returns: TRUE on success
591     FALSE, with *errorptr set on error
592     */
593    
594     static BOOL
595     compile_branch(int options, int *brackets, uschar **codeptr, uschar **ptrptr,
596     char **errorptr)
597     {
598     int repeat_type, op_type;
599     int repeat_min, repeat_max;
600     int bravalue, length;
601     register int c;
602     register uschar *code = *codeptr;
603     uschar *ptr = *ptrptr;
604     uschar *previous = NULL;
605     uschar *oldptr;
606     uschar class[32];
607    
608     /* Switch on next character until the end of the branch */
609    
610     for (;; ptr++)
611     {
612     BOOL negate_class;
613     int class_charcount;
614     int class_lastchar;
615    
616     c = *ptr;
617     if ((options & PCRE_EXTENDED) != 0)
618     {
619     if ((pcre_ctypes[c] & ctype_space) != 0) continue;
620     if (c == '#')
621     {
622     while ((c = *(++ptr)) != 0 && c != '\n');
623     continue;
624     }
625     }
626    
627     switch(c)
628     {
629     /* The branch terminates at end of string, |, or ). */
630    
631     case 0:
632     case '|':
633     case ')':
634     *codeptr = code;
635     *ptrptr = ptr;
636     return TRUE;
637    
638     /* Handle single-character metacharacters */
639    
640     case '^':
641     previous = NULL;
642     *code++ = OP_CIRC;
643     break;
644    
645     case '$':
646     previous = NULL;
647     *code++ = OP_DOLL;
648     break;
649    
650     case '.':
651     previous = code;
652     *code++ = OP_ANY;
653     break;
654    
655     /* Character classes. These always build a 32-byte bitmap of the permitted
656     characters, except in the special case where there is only one character.
657     For negated classes, we build the map as usual, then invert it at the end.
658     */
659    
660     case '[':
661     previous = code;
662     *code++ = OP_CLASS;
663    
664     /* If the first character is '^', set the negation flag */
665    
666     if ((c = *(++ptr)) == '^')
667     {
668     negate_class = TRUE;
669     c = *(++ptr);
670     }
671     else negate_class = FALSE;
672    
673     /* Keep a count of chars so that we can optimize the case of just a single
674     character. */
675    
676     class_charcount = 0;
677     class_lastchar = -1;
678    
679     /* Initialize the 32-char bit map to all zeros. We have to build the
680     map in a temporary bit of store, in case the class contains only 1
681     character, because in that case the compiled code doesn't use the
682     bit map. */
683    
684     memset(class, 0, 32 * sizeof(uschar));
685    
686     /* Process characters until ] is reached. By writing this as a "do" it
687     means that an initial ] is taken as a data character. */
688    
689     do
690     {
691     if (c == 0)
692     {
693     *errorptr = ERR6;
694     goto FAILED;
695     }
696    
697     /* Backslash may introduce a single character, or it may introduce one
698     of the specials, which just set a flag. Escaped items are checked for
699     validity in the pre-compiling pass. The sequence \b is a special case.
700     Inside a class (and only there) it is treated as backslash. Elsewhere
701     it marks a word boundary. Other escapes have preset maps ready to
702     or into the one we are building. We assume they have more than one
703     character in them, so set class_count bigger than one. */
704    
705     if (c == '\\')
706     {
707     c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
708     if (-c == ESC_b) c = '\b';
709     else if (c < 0)
710     {
711     class_charcount = 10;
712     switch (-c)
713     {
714     case ESC_d:
715     for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit];
716     continue;
717    
718     case ESC_D:
719     for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit];
720     continue;
721    
722     case ESC_w:
723     for (c = 0; c < 32; c++)
724     class[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
725     continue;
726    
727     case ESC_W:
728     for (c = 0; c < 32; c++)
729     class[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
730     continue;
731    
732     case ESC_s:
733     for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space];
734     continue;
735    
736     case ESC_S:
737     for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space];
738     continue;
739    
740     default:
741     *errorptr = ERR7;
742     goto FAILED;
743     }
744     }
745     /* Fall through if single character */
746     }
747    
748     /* A single character may be followed by '-' to form a range. However,
749     Perl does not permit ']' to be the end of the range. A '-' character
750     here is treated as a literal. */
751    
752     if (ptr[1] == '-' && ptr[2] != ']')
753     {
754     int d;
755     ptr += 2;
756     d = *ptr;
757    
758     if (d == 0)
759     {
760     *errorptr = ERR6;
761     goto FAILED;
762     }
763    
764     /* The second part of a range can be a single-character escape, but
765     not any of the other escapes. */
766    
767     if (d == '\\')
768     {
769     d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
770     if (d < 0)
771     {
772     if (d == -ESC_b) d = '\b'; else
773     {
774     *errorptr = ERR7;
775     goto FAILED;
776     }
777     }
778     }
779    
780     if (d < c)
781     {
782     *errorptr = ERR8;
783     goto FAILED;
784     }
785    
786     for (; c <= d; c++)
787     {
788     class[c/8] |= (1 << (c&7));
789     if ((options & PCRE_CASELESS) != 0)
790     {
791     int uc = pcre_fcc[c]; /* flip case */
792     class[uc/8] |= (1 << (uc&7));
793     }
794     class_charcount++; /* in case a one-char range */
795     class_lastchar = c;
796     }
797     continue; /* Go get the next char in the class */
798     }
799    
800     /* Handle a lone single character - we can get here for a normal
801     non-escape char, or after \ that introduces a single character. */
802    
803     class [c/8] |= (1 << (c&7));
804     if ((options & PCRE_CASELESS) != 0)
805     {
806     c = pcre_fcc[c]; /* flip case */
807     class[c/8] |= (1 << (c&7));
808     }
809     class_charcount++;
810     class_lastchar = c;
811     }
812    
813     /* Loop until ']' reached; the check for end of string happens inside the
814     loop. This "while" is the end of the "do" above. */
815    
816     while ((c = *(++ptr)) != ']');
817    
818     /* If class_charcount is 1 and class_lastchar is not negative, we saw
819     precisely one character. This doesn't need the whole 32-byte bit map.
820     We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
821     it's negative. */
822    
823     if (class_charcount == 1 && class_lastchar >= 0)
824     {
825     if (negate_class)
826     {
827     code[-1] = OP_NOT;
828     }
829     else
830     {
831     code[-1] = OP_CHARS;
832     *code++ = 1;
833     }
834     *code++ = class_lastchar;
835     }
836    
837     /* Otherwise, negate the 32-byte map if necessary, and copy it into
838     the code vector. */
839    
840     else
841     {
842     if (negate_class)
843     for (c = 0; c < 32; c++) code[c] = ~class[c];
844     else
845     memcpy(code, class, 32);
846     code += 32;
847     }
848     break;
849    
850     /* Various kinds of repeat */
851    
852     case '{':
853     if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
854     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
855     if (*errorptr != NULL) goto FAILED;
856     goto REPEAT;
857    
858     case '*':
859     repeat_min = 0;
860     repeat_max = -1;
861     goto REPEAT;
862    
863     case '+':
864     repeat_min = 1;
865     repeat_max = -1;
866     goto REPEAT;
867    
868     case '?':
869     repeat_min = 0;
870     repeat_max = 1;
871    
872     REPEAT:
873     if (previous == NULL)
874     {
875     *errorptr = ERR9;
876     goto FAILED;
877     }
878    
879     /* If the next character is '?' this is a minimizing repeat. Advance to the
880     next character. */
881    
882     if (ptr[1] == '?') { repeat_type = 1; ptr++; } else repeat_type = 0;
883    
884     /* If the maximum is zero then the minimum must also be zero; Perl allows
885     this case, so we do too - by simply omitting the item altogether. */
886    
887     if (repeat_max == 0) code = previous;
888    
889     /* If previous was a string of characters, chop off the last one and use it
890     as the subject of the repeat. If there was only one character, we can
891     abolish the previous item altogether. */
892    
893     else if (*previous == OP_CHARS)
894     {
895     int len = previous[1];
896     if (len == 1)
897     {
898     c = previous[2];
899     code = previous;
900     }
901     else
902     {
903     c = previous[len+1];
904     previous[1]--;
905     code--;
906     }
907     op_type = 0; /* Use single-char op codes */
908     goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
909     }
910    
911     /* If previous was a single negated character ([^a] or similar), we use
912     one of the special opcodes, replacing it. The code is shared with single-
913     character repeats by adding a suitable offset into repeat_type. */
914    
915     else if ((int)*previous == OP_NOT)
916     {
917     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
918     c = previous[1];
919     code = previous;
920     goto OUTPUT_SINGLE_REPEAT;
921     }
922    
923     /* If previous was a character type match (\d or similar), abolish it and
924     create a suitable repeat item. The code is shared with single-character
925     repeats by adding a suitable offset into repeat_type. */
926    
927     else if ((int)*previous < OP_EOD || *previous == OP_ANY)
928     {
929     op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
930     c = *previous;
931     code = previous;
932    
933     OUTPUT_SINGLE_REPEAT:
934     repeat_type += op_type; /* Combine both values for many cases */
935    
936     /* A minimum of zero is handled either as the special case * or ?, or as
937     an UPTO, with the maximum given. */
938    
939     if (repeat_min == 0)
940     {
941     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
942     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
943     else
944     {
945     *code++ = OP_UPTO + repeat_type;
946     *code++ = repeat_max >> 8;
947     *code++ = (repeat_max & 255);
948     }
949     }
950    
951     /* The case {1,} is handled as the special case + */
952    
953     else if (repeat_min == 1 && repeat_max == -1)
954     *code++ = OP_PLUS + repeat_type;
955    
956     /* The case {n,n} is just an EXACT, while the general case {n,m} is
957     handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
958    
959     else
960     {
961     if (repeat_min != 1)
962     {
963     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
964     *code++ = repeat_min >> 8;
965     *code++ = (repeat_min & 255);
966     }
967    
968     /* If the mininum is 1 and the previous item was a character string,
969     we either have to put back the item that got cancelled if the string
970     length was 1, or add the character back onto the end of a longer
971     string. For a character type nothing need be done; it will just get put
972     back naturally. */
973    
974     else if (*previous == OP_CHARS)
975     {
976     if (code == previous) code += 2; else previous[1]++;
977     }
978    
979     /* Insert an UPTO if the max is greater than the min. */
980    
981     if (repeat_max != repeat_min)
982     {
983     *code++ = c;
984     repeat_max -= repeat_min;
985     *code++ = OP_UPTO + repeat_type;
986     *code++ = repeat_max >> 8;
987     *code++ = (repeat_max & 255);
988     }
989     }
990    
991     /* The character or character type itself comes last in all cases. */
992    
993     *code++ = c;
994     }
995    
996     /* If previous was a character class or a back reference, we put the repeat
997     stuff after it. */
998    
999     else if (*previous == OP_CLASS || *previous == OP_REF)
1000     {
1001     if (repeat_min == 0 && repeat_max == -1)
1002     *code++ = OP_CRSTAR + repeat_type;
1003     else if (repeat_min == 1 && repeat_max == -1)
1004     *code++ = OP_CRPLUS + repeat_type;
1005     else if (repeat_min == 0 && repeat_max == 1)
1006     *code++ = OP_CRQUERY + repeat_type;
1007     else
1008     {
1009     *code++ = OP_CRRANGE + repeat_type;
1010     *code++ = repeat_min >> 8;
1011     *code++ = repeat_min & 255;
1012     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1013     *code++ = repeat_max >> 8;
1014     *code++ = repeat_max & 255;
1015     }
1016     }
1017    
1018     /* If previous was a bracket group, we may have to replicate it in certain
1019     cases. If the maximum repeat count is unlimited, check that the bracket
1020     group cannot match the empty string, and diagnose an error if it can. */
1021    
1022     else if ((int)*previous >= OP_BRA)
1023     {
1024     int i;
1025     int length = code - previous;
1026    
1027     if (repeat_max == -1 && could_be_empty(previous))
1028     {
1029     *errorptr = ERR10;
1030     goto FAILED;
1031     }
1032    
1033     /* If the minimum is greater than zero, and the maximum is unlimited or
1034     equal to the minimum, the first copy remains where it is, and is
1035     replicated up to the minimum number of times. This case includes the +
1036     repeat, but of course no replication is needed in that case. */
1037    
1038     if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1039     {
1040     for (i = 1; i < repeat_min; i++)
1041     {
1042     memcpy(code, previous, length);
1043     code += length;
1044     }
1045     }
1046    
1047     /* If the minimum is zero, stick BRAZERO in front of the first copy.
1048     Then, if there is a fixed upper limit, replicated up to that many times,
1049     sticking BRAZERO in front of all the optional ones. */
1050    
1051     else
1052     {
1053     if (repeat_min == 0)
1054     {
1055     memmove(previous+1, previous, length);
1056     code++;
1057     *previous++ = OP_BRAZERO + repeat_type;
1058     }
1059    
1060     for (i = 1; i < repeat_min; i++)
1061     {
1062     memcpy(code, previous, length);
1063     code += length;
1064     }
1065    
1066     for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1067     {
1068     *code++ = OP_BRAZERO + repeat_type;
1069     memcpy(code, previous, length);
1070     code += length;
1071     }
1072     }
1073    
1074     /* If the maximum is unlimited, set a repeater in the final copy. */
1075    
1076     if (repeat_max == -1) code[-3] = OP_KETRMAX + repeat_type;
1077     }
1078    
1079     /* Else there's some kind of shambles */
1080    
1081     else
1082     {
1083     *errorptr = ERR11;
1084     goto FAILED;
1085     }
1086    
1087     /* In all case we no longer have a previous item. */
1088    
1089     previous = NULL;
1090     break;
1091    
1092    
1093     /* Start of nested bracket sub-expression, or comment or lookahead.
1094     First deal with special things that can come after a bracket; all are
1095     introduced by ?, and the appearance of any of them means that this is not a
1096     referencing group. They were checked for validity in the first pass over
1097     the string, so we don't have to check for syntax errors here. */
1098    
1099     case '(':
1100     previous = code; /* Only real brackets can be repeated */
1101     if (*(++ptr) == '?')
1102     {
1103     bravalue = OP_BRA;
1104    
1105     switch (*(++ptr))
1106     {
1107     case '#':
1108     case 'i':
1109     case 'm':
1110     case 's':
1111     case 'x':
1112     ptr++;
1113     while (*ptr != ')') ptr++;
1114     previous = NULL;
1115     continue;
1116    
1117     case ':': /* Non-extracting bracket */
1118     ptr++;
1119     break;
1120    
1121     case '=': /* Assertions can't be repeated */
1122     bravalue = OP_ASSERT;
1123     ptr++;
1124     previous = NULL;
1125     break;
1126    
1127     case '!':
1128     bravalue = OP_ASSERT_NOT;
1129     ptr++;
1130     previous = NULL;
1131     break;
1132    
1133     case '>': /* "Match once" brackets */
1134     if ((options & PCRE_EXTRA) != 0) /* Not yet standard */
1135     {
1136     bravalue = OP_ONCE;
1137     ptr++;
1138     previous = NULL;
1139     break;
1140     }
1141     /* Else fall through */
1142    
1143     default:
1144     *errorptr = ERR12;
1145     goto FAILED;
1146     }
1147     }
1148    
1149     /* Else we have a referencing group */
1150    
1151     else
1152     {
1153     if (++(*brackets) > EXTRACT_MAX)
1154     {
1155     *errorptr = ERR13;
1156     goto FAILED;
1157     }
1158     bravalue = OP_BRA + *brackets;
1159     }
1160    
1161     /* Process nested bracketed re; at end pointer is on the bracket. We copy
1162     code into a non-register variable in order to be able to pass its address
1163     because some compilers complain otherwise. */
1164    
1165     *code = bravalue;
1166     {
1167     uschar *mcode = code;
1168     if (!compile_regex(options, brackets, &mcode, &ptr, errorptr))
1169     goto FAILED;
1170     code = mcode;
1171     }
1172    
1173     if (*ptr != ')')
1174     {
1175     *errorptr = ERR14;
1176     goto FAILED;
1177     }
1178     break;
1179    
1180     /* Check \ for being a real metacharacter; if not, fall through and handle
1181     it as a data character at the start of a string. Escape items are checked
1182     for validity in the pre-compiling pass. */
1183    
1184     case '\\':
1185     oldptr = ptr;
1186     c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
1187    
1188     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1189     are arranged to be the negation of the corresponding OP_values. For the
1190     back references, the values are ESC_REF plus the reference number. Only
1191     back references and those types that consume a character may be repeated.
1192     We can test for values between ESC_b and ESC_Z for the latter; this may
1193     have to change if any new ones are ever created. */
1194    
1195     if (c < 0)
1196     {
1197     if (-c >= ESC_REF)
1198     {
1199     int refnum = -c - ESC_REF;
1200     if (*brackets < refnum)
1201     {
1202     *errorptr = ERR15;
1203     goto FAILED;
1204     }
1205     previous = code;
1206     *code++ = OP_REF;
1207     *code++ = refnum;
1208     }
1209     else
1210     {
1211     previous = (-c > ESC_b && -c < ESC_X)? code : NULL;
1212     *code++ = -c;
1213     }
1214     continue;
1215     }
1216    
1217     /* Reset and fall through */
1218    
1219     ptr = oldptr;
1220     c = '\\';
1221    
1222     /* Handle a run of data characters until a metacharacter is encountered.
1223     The first character is guaranteed not to be whitespace or # when the
1224     extended flag is set. */
1225    
1226     NORMAL_CHAR:
1227     default:
1228     previous = code;
1229     *code = OP_CHARS;
1230     code += 2;
1231     length = 0;
1232    
1233     do
1234     {
1235     if ((options & PCRE_EXTENDED) != 0)
1236     {
1237     if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1238     if (c == '#')
1239     {
1240     while ((c = *(++ptr)) != 0 && c != '\n');
1241     if (c == 0) break;
1242     continue;
1243     }
1244     }
1245    
1246     /* Backslash may introduce a data char or a metacharacter. Escaped items
1247     are checked for validity in the pre-compiling pass. Stop the string
1248     before a metaitem. */
1249    
1250     if (c == '\\')
1251     {
1252     oldptr = ptr;
1253     c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
1254     if (c < 0) { ptr = oldptr; break; }
1255     }
1256    
1257     /* Ordinary character or single-char escape */
1258    
1259     *code++ = c;
1260     length++;
1261     }
1262    
1263     /* This "while" is the end of the "do" above. */
1264    
1265     while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
1266    
1267     /* Compute the length and set it in the data vector, and advance to
1268     the next state. */
1269    
1270     previous[1] = length;
1271     ptr--;
1272     break;
1273     }
1274     } /* end of big loop */
1275    
1276     /* Control never reaches here by falling through, only by a goto for all the
1277     error states. Pass back the position in the pattern so that it can be displayed
1278     to the user for diagnosing the error. */
1279    
1280     FAILED:
1281     *ptrptr = ptr;
1282     return FALSE;
1283     }
1284    
1285    
1286    
1287    
1288     /*************************************************
1289     * Compile sequence of alternatives *
1290     *************************************************/
1291    
1292     /* On entry, ptr is pointing past the bracket character, but on return
1293     it points to the closing bracket, or vertical bar, or end of string.
1294     The code variable is pointing at the byte into which the BRA operator has been
1295     stored.
1296    
1297     Argument:
1298     options the option bits
1299     brackets -> int containing the number of extracting brackets used
1300     codeptr -> the address of the current code pointer
1301     ptrptr -> the address of the current pattern pointer
1302     errorptr -> pointer to error message
1303    
1304     Returns: TRUE on success
1305     */
1306    
1307     static BOOL
1308     compile_regex(int options, int *brackets, uschar **codeptr, uschar **ptrptr,
1309     char **errorptr)
1310     {
1311     uschar *ptr = *ptrptr;
1312     uschar *code = *codeptr;
1313     uschar *start_bracket = code;
1314    
1315     for (;;)
1316     {
1317     int length;
1318     uschar *last_branch = code;
1319    
1320     code += 3;
1321     if (!compile_branch(options, brackets, &code, &ptr, errorptr))
1322     {
1323     *ptrptr = ptr;
1324     return FALSE;
1325     }
1326    
1327     /* Fill in the length of the last branch */
1328    
1329     length = code - last_branch;
1330     last_branch[1] = length >> 8;
1331     last_branch[2] = length & 255;
1332    
1333     /* Reached end of expression, either ')' or end of pattern. Insert a
1334     terminating ket and the length of the whole bracketed item, and return,
1335     leaving the pointer at the terminating char. */
1336    
1337     if (*ptr != '|')
1338     {
1339     length = code - start_bracket;
1340     *code++ = OP_KET;
1341     *code++ = length >> 8;
1342     *code++ = length & 255;
1343     *codeptr = code;
1344     *ptrptr = ptr;
1345     return TRUE;
1346     }
1347    
1348     /* Another branch follows; insert an "or" node and advance the pointer. */
1349    
1350     *code = OP_ALT;
1351     ptr++;
1352     }
1353     /* Control never reaches here */
1354     }
1355    
1356    
1357    
1358     /*************************************************
1359     * Check for anchored expression *
1360     *************************************************/
1361    
1362     /* Try to find out if this is an anchored regular expression. Consider each
1363     alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
1364     all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
1365     it's anchored. However, if this is a multiline pattern, then only OP_SOD
1366     counts, since OP_CIRC can match in the middle.
1367    
1368     A branch is also implicitly anchored if it starts with .* because that will try
1369     the rest of the pattern at all possible matching points, so there is no point
1370     trying them again.
1371    
1372     Argument: points to start of expression (the bracket)
1373     Returns: TRUE or FALSE
1374     */
1375    
1376     static BOOL
1377     is_anchored(register uschar *code, BOOL multiline)
1378     {
1379     do {
1380     int op = (int)code[3];
1381     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE)
1382     { if (!is_anchored(code+3, multiline)) return FALSE; }
1383     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
1384     { if (code[4] != OP_ANY) return FALSE; }
1385     else if (op != OP_SOD && (multiline || op != OP_CIRC)) return FALSE;
1386     code += (code[1] << 8) + code[2];
1387     }
1388     while (*code == OP_ALT);
1389     return TRUE;
1390     }
1391    
1392    
1393    
1394     /*************************************************
1395     * Check for start with \n line expression *
1396     *************************************************/
1397    
1398     /* This is called for multiline expressions to try to find out if every branch
1399     starts with ^ so that "first char" processing can be done to speed things up.
1400    
1401     Argument: points to start of expression (the bracket)
1402     Returns: TRUE or FALSE
1403     */
1404    
1405     static BOOL
1406     is_startline(uschar *code)
1407     {
1408     do {
1409     if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
1410     { if (!is_startline(code+3)) return FALSE; }
1411     else if (code[3] != OP_CIRC) return FALSE;
1412     code += (code[1] << 8) + code[2];
1413     }
1414     while (*code == OP_ALT);
1415     return TRUE;
1416     }
1417    
1418    
1419    
1420     /*************************************************
1421     * Check for fixed first char *
1422     *************************************************/
1423    
1424     /* Try to find out if there is a fixed first character. This is called for
1425     unanchored expressions, as it speeds up their processing quite considerably.
1426     Consider each alternative branch. If they all start with the same char, or with
1427     a bracket all of whose alternatives start with the same char (recurse ad lib),
1428     then we return that char, otherwise -1.
1429    
1430     Argument: points to start of expression (the bracket)
1431     Returns: -1 or the fixed first char
1432     */
1433    
1434     static int
1435     find_firstchar(uschar *code)
1436     {
1437     register int c = -1;
1438     do
1439     {
1440     register int charoffset = 4;
1441    
1442     if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
1443     {
1444     register int d;
1445     if ((d = find_firstchar(code+3)) < 0) return -1;
1446     if (c < 0) c = d; else if (c != d) return -1;
1447     }
1448    
1449     else switch(code[3])
1450     {
1451     default:
1452     return -1;
1453    
1454     case OP_EXACT: /* Fall through */
1455     charoffset++;
1456    
1457     case OP_CHARS: /* Fall through */
1458     charoffset++;
1459    
1460     case OP_PLUS:
1461     case OP_MINPLUS:
1462     if (c < 0) c = code[charoffset]; else if (c != code[charoffset]) return -1;
1463     break;
1464     }
1465     code += (code[1] << 8) + code[2];
1466     }
1467     while (*code == OP_ALT);
1468     return c;
1469     }
1470    
1471    
1472    
1473     /*************************************************
1474     * Compile a Regular Expression *
1475     *************************************************/
1476    
1477     /* This function takes a string and returns a pointer to a block of store
1478     holding a compiled version of the expression.
1479    
1480     Arguments:
1481     pattern the regular expression
1482     options various option bits
1483     errorptr pointer to pointer to error text
1484     erroroffset ptr offset in pattern where error was detected
1485    
1486     Returns: pointer to compiled data block, or NULL on error,
1487     with errorptr and erroroffset set
1488     */
1489    
1490     pcre *
1491     pcre_compile(const char *pattern, int options, char **errorptr,
1492     int *erroroffset)
1493     {
1494     real_pcre *re;
1495     int spaces = 0;
1496     int length = 3; /* For initial BRA plus length */
1497     int runlength;
1498     int c, size;
1499     int bracount = 0;
1500     int brastack[200];
1501     int brastackptr = 0;
1502     int top_backref = 0;
1503     uschar *code, *ptr;
1504    
1505     #ifdef DEBUG
1506     uschar *code_base, *code_end;
1507     #endif
1508    
1509     /* We can't pass back an error message if errorptr is NULL; I guess the best we
1510     can do is just return NULL. */
1511    
1512     if (errorptr == NULL) return NULL;
1513     *errorptr = NULL;
1514    
1515     /* However, we can give a message for this error */
1516    
1517     if (erroroffset == NULL)
1518     {
1519     *errorptr = ERR16;
1520     return NULL;
1521     }
1522     *erroroffset = 0;
1523    
1524     if ((options & ~PUBLIC_OPTIONS) != 0)
1525     {
1526     *errorptr = ERR17;
1527     return NULL;
1528     }
1529    
1530     #ifdef DEBUG
1531     printf("------------------------------------------------------------------\n");
1532     printf("%s\n", pattern);
1533     #endif
1534    
1535     /* The first thing to do is to make a pass over the pattern to compute the
1536     amount of store required to hold the compiled code. This does not have to be
1537     perfect as long as errors are overestimates. At the same time we can detect any
1538     internal flag settings. Make an attempt to correct for any counted white space
1539     if an "extended" flag setting appears late in the pattern. We can't be so
1540     clever for #-comments. */
1541    
1542     ptr = (uschar *)(pattern - 1);
1543     while ((c = *(++ptr)) != 0)
1544     {
1545     int min, max;
1546     int class_charcount;
1547    
1548     if ((pcre_ctypes[c] & ctype_space) != 0)
1549     {
1550     if ((options & PCRE_EXTENDED) != 0) continue;
1551     spaces++;
1552     }
1553    
1554     if (c == '#' && (options & PCRE_EXTENDED) != 0)
1555     {
1556     while ((c = *(++ptr)) != 0 && c != '\n');
1557     continue;
1558     }
1559    
1560     switch(c)
1561     {
1562     /* A backslashed item may be an escaped "normal" character or a
1563     character type. For a "normal" character, put the pointers and
1564     character back so that tests for whitespace etc. in the input
1565     are done correctly. */
1566    
1567     case '\\':
1568     {
1569     uschar *save_ptr = ptr;
1570     c = check_escape(&ptr, errorptr, bracount, options, FALSE);
1571     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1572     if (c >= 0)
1573     {
1574     ptr = save_ptr;
1575     c = '\\';
1576     goto NORMAL_CHAR;
1577     }
1578     }
1579     length++;
1580    
1581     /* A back reference needs an additional char, plus either one or 5
1582     bytes for a repeat. We also need to keep the value of the highest
1583     back reference. */
1584    
1585     if (c <= -ESC_REF)
1586     {
1587     int refnum = -c - ESC_REF;
1588     if (refnum > top_backref) top_backref = refnum;
1589     length++; /* For single back reference */
1590     if (ptr[1] == '{' && is_counted_repeat(ptr+2))
1591     {
1592     ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
1593     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1594     if ((min == 0 && (max == 1 || max == -1)) ||
1595     (min == 1 && max == -1))
1596     length++;
1597     else length += 5;
1598     if (ptr[1] == '?') ptr++;
1599     }
1600     }
1601     continue;
1602    
1603     case '^':
1604     case '.':
1605     case '$':
1606     case '*': /* These repeats won't be after brackets; */
1607     case '+': /* those are handled separately */
1608     case '?':
1609     length++;
1610     continue;
1611    
1612     /* This covers the cases of repeats after a single char, metachar, class,
1613     or back reference. */
1614    
1615     case '{':
1616     if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
1617     ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
1618     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1619     if ((min == 0 && (max == 1 || max == -1)) ||
1620     (min == 1 && max == -1))
1621     length++;
1622     else
1623     {
1624     length--; /* Uncount the original char or metachar */
1625     if (min == 1) length++; else if (min > 0) length += 4;
1626     if (max > 0) length += 4; else length += 2;
1627     }
1628     if (ptr[1] == '?') ptr++;
1629     continue;
1630    
1631     /* An alternation contains an offset to the next branch or ket. */
1632     case '|':
1633     length += 3;
1634     continue;
1635    
1636     /* A character class uses 33 characters. Don't worry about character types
1637     that aren't allowed in classes - they'll get picked up during the compile.
1638     A character class that contains only one character uses 2 or 3 bytes,
1639     depending on whether it is negated or not. Notice this where we can. */
1640    
1641     case '[':
1642     class_charcount = 0;
1643     if (*(++ptr) == '^') ptr++;
1644     do
1645     {
1646     if (*ptr == '\\')
1647     {
1648     int c = check_escape(&ptr, errorptr, bracount, options, TRUE);
1649     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1650     if (-c == ESC_b) class_charcount++; else class_charcount = 10;
1651     }
1652     else class_charcount++;
1653     ptr++;
1654     }
1655     while (*ptr != 0 && *ptr != ']');
1656    
1657     /* Repeats for negated single chars are handled by the general code */
1658    
1659     if (class_charcount == 1) length += 3; else
1660     {
1661     length += 33;
1662    
1663     /* A repeat needs either 1 or 5 bytes. */
1664    
1665     if (ptr[1] == '{' && is_counted_repeat(ptr+2))
1666     {
1667     ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
1668     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1669     if ((min == 0 && (max == 1 || max == -1)) ||
1670     (min == 1 && max == -1))
1671     length++;
1672     else length += 5;
1673     if (ptr[1] == '?') ptr++;
1674     }
1675     }
1676     continue;
1677    
1678     /* Brackets may be genuine groups or special things */
1679    
1680     case '(':
1681    
1682     /* Handle special forms of bracket, which all start (? */
1683    
1684     if (ptr[1] == '?') switch (c = ptr[2])
1685     {
1686     /* Skip over comments entirely */
1687     case '#':
1688     ptr += 3;
1689     while (*ptr != 0 && *ptr != ')') ptr++;
1690     if (*ptr == 0)
1691     {
1692     *errorptr = ERR18;
1693     goto PCRE_ERROR_RETURN;
1694     }
1695     continue;
1696    
1697     /* Non-referencing groups and lookaheads just move the pointer on, and
1698     then behave like a non-special bracket, except that they don't increment
1699     the count of extracting brackets. */
1700    
1701     case ':':
1702     case '=':
1703     case '!':
1704     ptr += 2;
1705     break;
1706    
1707     /* Ditto for the "once only" bracket, allowed only if the extra bit
1708     is set. */
1709    
1710     case '>':
1711     if ((options & PCRE_EXTRA) != 0)
1712     {
1713     ptr += 2;
1714     break;
1715     }
1716     /* Else fall thourh */
1717    
1718     /* Else loop setting valid options until ) is met. Anything else is an
1719     error. */
1720    
1721     default:
1722     ptr += 2;
1723     for (;; ptr++)
1724     {
1725     if ((c = *ptr) == 'i')
1726     {
1727     options |= PCRE_CASELESS;
1728     continue;
1729     }
1730     else if ((c = *ptr) == 'm')
1731     {
1732     options |= PCRE_MULTILINE;
1733     continue;
1734     }
1735     else if (c == 's')
1736     {
1737     options |= PCRE_DOTALL;
1738     continue;
1739     }
1740     else if (c == 'x')
1741     {
1742     options |= PCRE_EXTENDED;
1743     length -= spaces; /* Already counted spaces */
1744     continue;
1745     }
1746     else if (c == ')') break;
1747    
1748     *errorptr = ERR12;
1749     goto PCRE_ERROR_RETURN;
1750     }
1751     continue; /* End of this bracket handling */
1752     }
1753    
1754     /* Extracting brackets must be counted so we can process escapes in a
1755     Perlish way. */
1756    
1757     else bracount++;
1758    
1759     /* Non-special forms of bracket. Save length for computing whole length
1760     at end if there's a repeat that requires duplication of the group. */
1761    
1762     if (brastackptr >= sizeof(brastack)/sizeof(int))
1763     {
1764     *errorptr = ERR19;
1765     goto PCRE_ERROR_RETURN;
1766     }
1767    
1768     brastack[brastackptr++] = length;
1769     length += 3;
1770     continue;
1771    
1772     /* Handle ket. Look for subsequent max/min; for certain sets of values we
1773     have to replicate this bracket up to that many times. */
1774    
1775     case ')':
1776     length += 3;
1777     {
1778     int min = 1;
1779     int max = 1;
1780     int duplength = length - brastack[--brastackptr];
1781    
1782     /* Leave ptr at the final char; for read_repeat_counts this happens
1783     automatically; for the others we need an increment. */
1784    
1785     if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
1786     {
1787     ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
1788     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1789     }
1790     else if (c == '*') { min = 0; max = -1; ptr++; }
1791     else if (c == '+') { max = -1; ptr++; }
1792     else if (c == '?') { min = 0; ptr++; }
1793    
1794     /* If there is a minimum > 1 we have to replicate up to min-1 times; if
1795     there is a limited maximum we have to replicate up to max-1 times and
1796     allow for a BRAZERO item before each optional copy, as we also have to
1797     do before the first copy if the minimum is zero. */
1798    
1799     if (min == 0) length++;
1800     else if (min > 1) length += (min - 1) * duplength;
1801     if (max > min) length += (max - min) * (duplength + 1);
1802     }
1803    
1804     continue;
1805    
1806     /* Non-special character. For a run of such characters the length required
1807     is the number of characters + 2, except that the maximum run length is 255.
1808     We won't get a skipped space or a non-data escape or the start of a #
1809     comment as the first character, so the length can't be zero. */
1810    
1811     NORMAL_CHAR:
1812     default:
1813     length += 2;
1814     runlength = 0;
1815     do
1816     {
1817     if ((pcre_ctypes[c] & ctype_space) != 0)
1818     {
1819     if ((options & PCRE_EXTENDED) != 0) continue;
1820     spaces++;
1821     }
1822    
1823     if (c == '#' && (options & PCRE_EXTENDED) != 0)
1824     {
1825     while ((c = *(++ptr)) != 0 && c != '\n');
1826     continue;
1827     }
1828    
1829     /* Backslash may introduce a data char or a metacharacter; stop the
1830     string before the latter. */
1831    
1832     if (c == '\\')
1833     {
1834     uschar *saveptr = ptr;
1835     c = check_escape(&ptr, errorptr, bracount, options, FALSE);
1836     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1837     if (c < 0) { ptr = saveptr; break; }
1838     }
1839    
1840     /* Ordinary character or single-char escape */
1841    
1842     runlength++;
1843     }
1844    
1845     /* This "while" is the end of the "do" above. */
1846    
1847     while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
1848    
1849     ptr--;
1850     length += runlength;
1851     continue;
1852     }
1853     }
1854    
1855     length += 4; /* For final KET and END */
1856    
1857     if (length > 65539)
1858     {
1859     *errorptr = ERR20;
1860     return NULL;
1861     }
1862    
1863     /* Compute the size of data block needed and get it, either from malloc or
1864     externally provided function. Put in the magic number and the options. */
1865    
1866     size = length + offsetof(real_pcre, code);
1867     re = (real_pcre *)(pcre_malloc)(size);
1868    
1869     if (re == NULL)
1870     {
1871     *errorptr = ERR21;
1872     return NULL;
1873     }
1874    
1875     re->magic_number = MAGIC_NUMBER;
1876     re->options = options;
1877    
1878     /* Set up a starting, non-extracting bracket, then compile the expression. On
1879     error, *errorptr will be set non-NULL, so we don't need to look at the result
1880     of the function here. */
1881    
1882     ptr = (uschar *)pattern;
1883     code = re->code;
1884     *code = OP_BRA;
1885     bracount = 0;
1886     (void)compile_regex(options, &bracount, &code, &ptr, errorptr);
1887     re->top_bracket = bracount;
1888     re->top_backref = top_backref;
1889    
1890     /* If not reached end of pattern on success, there's an excess bracket. */
1891    
1892     if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
1893    
1894     /* Fill in the terminating state and check for disastrous overflow, but
1895     if debugging, leave the test till after things are printed out. */
1896    
1897     *code++ = OP_END;
1898    
1899     #ifndef DEBUG
1900     if (code - re->code > length) *errorptr = ERR23;
1901     #endif
1902    
1903     /* Failed to compile */
1904    
1905     if (*errorptr != NULL)
1906     {
1907     (pcre_free)(re);
1908     PCRE_ERROR_RETURN:
1909     *erroroffset = ptr - (uschar *)pattern;
1910     return NULL;
1911     }
1912    
1913     /* If the anchored option was not passed, set flag if we can determine that it
1914     is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
1915     we can determine what the first character has to be, because that speeds up
1916     unanchored matches no end. In the case of multiline matches, an alternative is
1917     to set the PCRE_STARTLINE flag if all branches start with ^. */
1918    
1919     if ((options & PCRE_ANCHORED) == 0)
1920     {
1921     if (is_anchored(re->code, (options & PCRE_MULTILINE) != 0))
1922     re->options |= PCRE_ANCHORED;
1923     else
1924     {
1925     int c = find_firstchar(re->code);
1926     if (c >= 0)
1927     {
1928     re->first_char = c;
1929     re->options |= PCRE_FIRSTSET;
1930     }
1931     else if (is_startline(re->code))
1932     re->options |= PCRE_STARTLINE;
1933     }
1934     }
1935    
1936     /* Print out the compiled data for debugging */
1937    
1938     #ifdef DEBUG
1939    
1940     printf("Length = %d top_bracket = %d top_backref=%d\n",
1941     length, re->top_bracket, re->top_backref);
1942    
1943     if (re->options != 0)
1944     {
1945     printf("%s%s%s%s%s%s%s\n",
1946     ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
1947     ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
1948     ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
1949     ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
1950     ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
1951     ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
1952     ((re->options & PCRE_EXTRA) != 0)? "extra " : "");
1953     }
1954    
1955     if ((re->options & PCRE_FIRSTSET) != 0)
1956     {
1957     if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
1958     else printf("First char = \\x%02x\n", re->first_char);
1959     }
1960    
1961     code_end = code;
1962     code_base = code = re->code;
1963    
1964     while (code < code_end)
1965     {
1966     int charlength;
1967    
1968     printf("%3d ", code - code_base);
1969    
1970     if (*code >= OP_BRA)
1971     {
1972     printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
1973     code += 2;
1974     }
1975    
1976     else switch(*code)
1977     {
1978     case OP_CHARS:
1979     charlength = *(++code);
1980     printf("%3d ", charlength);
1981     while (charlength-- > 0)
1982     if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
1983     break;
1984    
1985     case OP_KETRMAX:
1986     case OP_KETRMIN:
1987     case OP_ALT:
1988     case OP_KET:
1989     case OP_ASSERT:
1990     case OP_ASSERT_NOT:
1991     case OP_ONCE:
1992     printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
1993     code += 2;
1994     break;
1995    
1996     case OP_STAR:
1997     case OP_MINSTAR:
1998     case OP_PLUS:
1999     case OP_MINPLUS:
2000     case OP_QUERY:
2001     case OP_MINQUERY:
2002     case OP_TYPESTAR:
2003     case OP_TYPEMINSTAR:
2004     case OP_TYPEPLUS:
2005     case OP_TYPEMINPLUS:
2006     case OP_TYPEQUERY:
2007     case OP_TYPEMINQUERY:
2008     if (*code >= OP_TYPESTAR)
2009     printf(" %s", OP_names[code[1]]);
2010     else if (isprint(c = code[1])) printf(" %c", c);
2011     else printf(" \\x%02x", c);
2012     printf("%s", OP_names[*code++]);
2013     break;
2014    
2015     case OP_EXACT:
2016     case OP_UPTO:
2017     case OP_MINUPTO:
2018     if (isprint(c = code[3])) printf(" %c{", c);
2019     else printf(" \\x%02x{", c);
2020     if (*code != OP_EXACT) printf(",");
2021     printf("%d}", (code[1] << 8) + code[2]);
2022     if (*code == OP_MINUPTO) printf("?");
2023     code += 3;
2024     break;
2025    
2026     case OP_TYPEEXACT:
2027     case OP_TYPEUPTO:
2028     case OP_TYPEMINUPTO:
2029     printf(" %s{", OP_names[code[3]]);
2030     if (*code != OP_TYPEEXACT) printf(",");
2031     printf("%d}", (code[1] << 8) + code[2]);
2032     if (*code == OP_TYPEMINUPTO) printf("?");
2033     code += 3;
2034     break;
2035    
2036     case OP_NOT:
2037     if (isprint(c = *(++code))) printf(" [^%c]", c);
2038     else printf(" [^\\x%02x]", c);
2039     break;
2040    
2041     case OP_NOTSTAR:
2042     case OP_NOTMINSTAR:
2043     case OP_NOTPLUS:
2044     case OP_NOTMINPLUS:
2045     case OP_NOTQUERY:
2046     case OP_NOTMINQUERY:
2047     if (isprint(c = code[1])) printf(" [^%c]", c);
2048     else printf(" [^\\x%02x]", c);
2049     printf("%s", OP_names[*code++]);
2050     break;
2051    
2052     case OP_NOTEXACT:
2053     case OP_NOTUPTO:
2054     case OP_NOTMINUPTO:
2055     if (isprint(c = code[3])) printf(" [^%c]{", c);
2056     else printf(" [^\\x%02x]{", c);
2057     if (*code != OP_NOTEXACT) printf(",");
2058     printf("%d}", (code[1] << 8) + code[2]);
2059     if (*code == OP_NOTMINUPTO) printf("?");
2060     code += 3;
2061     break;
2062    
2063     case OP_REF:
2064     printf(" \\%d", *(++code));
2065     break;
2066    
2067     case OP_CLASS:
2068     {
2069     int i, min, max;
2070    
2071     code++;
2072     printf(" [");
2073    
2074     for (i = 0; i < 256; i++)
2075     {
2076     if ((code[i/8] & (1 << (i&7))) != 0)
2077     {
2078     int j;
2079     for (j = i+1; j < 256; j++)
2080     if ((code[j/8] & (1 << (j&7))) == 0) break;
2081     if (i == '-' || i == ']') printf("\\");
2082     if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2083     if (--j > i)
2084     {
2085     printf("-");
2086     if (j == '-' || j == ']') printf("\\");
2087     if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2088     }
2089     i = j;
2090     }
2091     }
2092     printf("]");
2093     code += 32;
2094    
2095     switch(*code)
2096     {
2097     case OP_CRSTAR:
2098     case OP_CRMINSTAR:
2099     case OP_CRPLUS:
2100     case OP_CRMINPLUS:
2101     case OP_CRQUERY:
2102     case OP_CRMINQUERY:
2103     printf("%s", OP_names[*code]);
2104     break;
2105    
2106     case OP_CRRANGE:
2107     case OP_CRMINRANGE:
2108     min = (code[1] << 8) + code[2];
2109     max = (code[3] << 8) + code[4];
2110     if (max == 0) printf("{%d,}", min);
2111     else printf("{%d,%d}", min, max);
2112     if (*code == OP_CRMINRANGE) printf("?");
2113     code += 4;
2114     break;
2115    
2116     default:
2117     code--;
2118     }
2119     }
2120     break;
2121    
2122     /* Anything else is just a one-node item */
2123    
2124     default:
2125     printf(" %s", OP_names[*code]);
2126     break;
2127     }
2128    
2129     code++;
2130     printf("\n");
2131     }
2132     printf("------------------------------------------------------------------\n");
2133    
2134     /* This check is done here in the debugging case so that the code that
2135     was compiled can be seen. */
2136    
2137     if (code - re->code > length)
2138     {
2139     *errorptr = ERR23;
2140     (pcre_free)(re);
2141     *erroroffset = ptr - (uschar *)pattern;
2142     return NULL;
2143     }
2144     #endif
2145    
2146     return (pcre *)re;
2147     }
2148    
2149    
2150    
2151     /*************************************************
2152     * Match a character type *
2153     *************************************************/
2154    
2155     /* Not used in all the places it might be as it's sometimes faster
2156     to put the code inline.
2157    
2158     Arguments:
2159     type the character type
2160     c the character
2161     dotall the dotall flag
2162    
2163     Returns: TRUE if character is of the type
2164     */
2165    
2166     static BOOL
2167     match_type(int type, int c, BOOL dotall)
2168     {
2169    
2170     #ifdef DEBUG
2171     if (isprint(c)) printf("matching subject %c against ", c);
2172     else printf("matching subject \\x%02x against ", c);
2173     printf("%s\n", OP_names[type]);
2174     #endif
2175    
2176     switch(type)
2177     {
2178     case OP_ANY: return dotall || c != '\n';
2179     case OP_NOT_DIGIT: return (pcre_ctypes[c] & ctype_digit) == 0;
2180     case OP_DIGIT: return (pcre_ctypes[c] & ctype_digit) != 0;
2181     case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;
2182     case OP_WHITESPACE: return (pcre_ctypes[c] & ctype_space) != 0;
2183     case OP_NOT_WORDCHAR: return (pcre_ctypes[c] & ctype_word) == 0;
2184     case OP_WORDCHAR: return (pcre_ctypes[c] & ctype_word) != 0;
2185     }
2186     return FALSE;
2187     }
2188    
2189    
2190    
2191     /*************************************************
2192     * Match a back-reference *
2193     *************************************************/
2194    
2195     /* If a back reference hasn't been set, the match fails.
2196    
2197     Arguments:
2198     number reference number
2199     eptr points into the subject
2200     length length to be matched
2201     md points to match data block
2202    
2203     Returns: TRUE if matched
2204     */
2205    
2206     static BOOL
2207     match_ref(int number, register uschar *eptr, int length, match_data *md)
2208     {
2209     uschar *p = md->start_subject + md->offset_vector[number];
2210    
2211     #ifdef DEBUG
2212     if (eptr >= md->end_subject)
2213     printf("matching subject <null>");
2214     else
2215     {
2216     printf("matching subject ");
2217     pchars(eptr, length, TRUE, md);
2218     }
2219     printf(" against backref ");
2220     pchars(p, length, FALSE, md);
2221     printf("\n");
2222     #endif
2223    
2224     /* Always fail if not enough characters left */
2225    
2226     if (length > md->end_subject - p) return FALSE;
2227    
2228     /* Separate the caselesss case for speed */
2229    
2230     if (md->caseless)
2231     { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; }
2232     else
2233     { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
2234    
2235     return TRUE;
2236     }
2237    
2238    
2239    
2240     /*************************************************
2241     * Match from current position *
2242     *************************************************/
2243    
2244     /* On entry ecode points to the first opcode, and eptr to the first character.
2245    
2246     Arguments:
2247     eptr pointer in subject
2248     ecode position in code
2249     offset_top current top pointer
2250     md pointer to "static" info for the match
2251    
2252     Returns: TRUE if matched
2253     */
2254    
2255     static BOOL
2256     match(register uschar *eptr, register uschar *ecode, int offset_top,
2257     match_data *md)
2258     {
2259     for (;;)
2260     {
2261     int min, max, ctype;
2262     register int i;
2263     register int c;
2264     BOOL minimize;
2265    
2266     /* Opening bracket. Check the alternative branches in turn, failing if none
2267     match. We have to set the start offset if required and there is space
2268     in the offset vector so that it is available for subsequent back references
2269     if the bracket matches. However, if the bracket fails, we must put back the
2270     previous value of both offsets in case they were set by a previous copy of
2271     the same bracket. Don't worry about setting the flag for the error case here;
2272     that is handled in the code for KET. */
2273    
2274     if ((int)*ecode >= OP_BRA)
2275     {
2276     int number = (*ecode - OP_BRA) << 1;
2277     int save_offset1, save_offset2;
2278    
2279     #ifdef DEBUG
2280     printf("start bracket %d\n", number/2);
2281     #endif
2282    
2283     if (number > 0 && number < md->offset_end)
2284     {
2285     save_offset1 = md->offset_vector[number];
2286     save_offset2 = md->offset_vector[number+1];
2287     md->offset_vector[number] = eptr - md->start_subject;
2288    
2289     #ifdef DEBUG
2290     printf("saving %d %d\n", save_offset1, save_offset2);
2291     #endif
2292     }
2293    
2294     /* Recurse for all the alternatives. */
2295    
2296     do
2297     {
2298     if (match(eptr, ecode+3, offset_top, md)) return TRUE;
2299     ecode += (ecode[1] << 8) + ecode[2];
2300     }
2301     while (*ecode == OP_ALT);
2302    
2303     #ifdef DEBUG
2304     printf("bracket %d failed\n", number/2);
2305     #endif
2306    
2307     if (number > 0 && number < md->offset_end)
2308     {
2309     md->offset_vector[number] = save_offset1;
2310     md->offset_vector[number+1] = save_offset2;
2311     }
2312    
2313     return FALSE;
2314     }
2315    
2316     /* Other types of node can be handled by a switch */
2317    
2318     switch(*ecode)
2319     {
2320     case OP_END:
2321     md->end_match_ptr = eptr; /* Record where we ended */
2322     md->end_offset_top = offset_top; /* and how many extracts were taken */
2323     return TRUE;
2324    
2325     /* The equivalent of Prolog's "cut" - if the rest doesn't match, the
2326     whole thing doesn't match, so we have to get out via a longjmp(). */
2327    
2328     case OP_CUT:
2329     if (match(eptr, ecode+1, offset_top, md)) return TRUE;
2330     longjmp(md->fail_env, 1);
2331    
2332     /* Assertion brackets. Check the alternative branches in turn - the
2333     matching won't pass the KET for an assertion. If any one branch matches,
2334     the assertion is true. */
2335    
2336     case OP_ASSERT:
2337     do
2338     {
2339     if (match(eptr, ecode+3, offset_top, md)) break;
2340     ecode += (ecode[1] << 8) + ecode[2];
2341     }
2342     while (*ecode == OP_ALT);
2343     if (*ecode == OP_KET) return FALSE;
2344    
2345     /* Continue from after the assertion, updating the offsets high water
2346     mark, since extracts may have been taken during the assertion. */
2347    
2348     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2349     ecode += 3;
2350     offset_top = md->end_offset_top;
2351     continue;
2352    
2353     /* Negative assertion: all branches must fail to match */
2354    
2355     case OP_ASSERT_NOT:
2356     do
2357     {
2358     if (match(eptr, ecode+3, offset_top, md)) return FALSE;
2359     ecode += (ecode[1] << 8) + ecode[2];
2360     }
2361     while (*ecode == OP_ALT);
2362     ecode += 3;
2363     continue;
2364    
2365     /* "Once" brackets are like assertion brackets except that after a match,
2366     the point in the subject string is not moved back. Thus there can never be
2367 nigel 5 a move back into the brackets. Check the alternative branches in turn - the
2368 nigel 3 matching won't pass the KET for this kind of subpattern. If any one branch
2369     matches, we carry on, leaving the subject pointer. */
2370    
2371     case OP_ONCE:
2372     do
2373     {
2374     if (match(eptr, ecode+3, offset_top, md)) break;
2375     ecode += (ecode[1] << 8) + ecode[2];
2376     }
2377     while (*ecode == OP_ALT);
2378     if (*ecode == OP_KET) return FALSE;
2379    
2380     /* Continue as from after the assertion, updating the offsets high water
2381     mark, since extracts may have been taken. */
2382    
2383     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2384     ecode += 3;
2385     offset_top = md->end_offset_top;
2386     eptr = md->end_match_ptr;
2387     continue;
2388    
2389     /* An alternation is the end of a branch; scan along to find the end of the
2390     bracketed group and go to there. */
2391    
2392     case OP_ALT:
2393     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2394     break;
2395    
2396     /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
2397     that it may occur zero times. It may repeat infinitely, or not at all -
2398     i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
2399     repeat limits are compiled as a number of copies, with the optional ones
2400     preceded by BRAZERO or BRAMINZERO. */
2401    
2402     case OP_BRAZERO:
2403     {
2404     uschar *next = ecode+1;
2405     if (match(eptr, next, offset_top, md)) return TRUE;
2406     do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
2407     ecode = next + 3;
2408     }
2409     break;
2410    
2411     case OP_BRAMINZERO:
2412     {
2413     uschar *next = ecode+1;
2414     do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
2415     if (match(eptr, next+3, offset_top, md)) return TRUE;
2416     ecode++;
2417     }
2418     break;;
2419    
2420     /* End of a group, repeated or non-repeating. If we are at the end of
2421     an assertion "group", stop matching and return TRUE, but record the
2422     current high water mark for use by positive assertions. */
2423    
2424     case OP_KET:
2425     case OP_KETRMIN:
2426     case OP_KETRMAX:
2427     {
2428     int number;
2429     uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
2430    
2431     if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || *prev == OP_ONCE)
2432     {
2433     md->end_match_ptr = eptr; /* For ONCE */
2434     md->end_offset_top = offset_top;
2435     return TRUE;
2436     }
2437    
2438     /* In all other cases we have to check the group number back at the
2439     start and if necessary complete handling an extraction by setting the
2440     final offset and bumping the high water mark. */
2441    
2442     number = (*prev - OP_BRA) << 1;
2443    
2444     #ifdef DEBUG
2445     printf("end bracket %d\n", number/2);
2446     #endif
2447    
2448     if (number > 0)
2449     {
2450     if (number >= md->offset_end) md->offset_overflow = TRUE; else
2451     {
2452     md->offset_vector[number+1] = eptr - md->start_subject;
2453     if (offset_top <= number) offset_top = number + 2;
2454     }
2455     }
2456    
2457     /* For a non-repeating ket, just advance to the next node and continue at
2458     this level. */
2459    
2460     if (*ecode == OP_KET)
2461     {
2462     ecode += 3;
2463     break;
2464     }
2465    
2466     /* The repeating kets try the rest of the pattern or restart from the
2467     preceding bracket, in the appropriate order. */
2468    
2469     if (*ecode == OP_KETRMIN)
2470     {
2471     if (match(eptr, ecode+3, offset_top, md) ||
2472     match(eptr, prev, offset_top, md)) return TRUE;
2473     }
2474     else /* OP_KETRMAX */
2475     {
2476     if (match(eptr, prev, offset_top, md) ||
2477     match(eptr, ecode+3, offset_top, md)) return TRUE;
2478     }
2479     }
2480     return FALSE;
2481    
2482     /* Start of subject unless notbol, or after internal newline if multiline */
2483    
2484     case OP_CIRC:
2485     if (md->notbol && eptr == md->start_subject) return FALSE;
2486     if (md->multiline)
2487     {
2488     if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;
2489     ecode++;
2490     break;
2491     }
2492     /* ... else fall through */
2493    
2494     /* Start of subject assertion */
2495    
2496     case OP_SOD:
2497     if (eptr != md->start_subject) return FALSE;
2498     ecode++;
2499     break;
2500    
2501     /* Assert before internal newline if multiline, or before
2502     a terminating newline unless endonly is set, else end of subject unless
2503     noteol is set. */
2504    
2505     case OP_DOLL:
2506     if (md->noteol && eptr >= md->end_subject) return FALSE;
2507     if (md->multiline)
2508     {
2509     if (eptr < md->end_subject && *eptr != '\n') return FALSE;
2510     ecode++;
2511     break;
2512     }
2513     else if (!md->endonly)
2514     {
2515     if (eptr < md->end_subject - 1 ||
2516     (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
2517     ecode++;
2518     break;
2519     }
2520     /* ... else fall through */
2521    
2522     /* End of subject assertion */
2523    
2524     case OP_EOD:
2525     if (eptr < md->end_subject) return FALSE;
2526     ecode++;
2527     break;
2528    
2529     /* Word boundary assertions */
2530    
2531     case OP_NOT_WORD_BOUNDARY:
2532     case OP_WORD_BOUNDARY:
2533     {
2534     BOOL prev_is_word = (eptr != md->start_subject) &&
2535     ((pcre_ctypes[eptr[-1]] & ctype_word) != 0);
2536     BOOL cur_is_word = (eptr < md->end_subject) &&
2537     ((pcre_ctypes[*eptr] & ctype_word) != 0);
2538     if ((*ecode++ == OP_WORD_BOUNDARY)?
2539     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2540     return FALSE;
2541     }
2542     break;
2543    
2544     /* Match a single character type; inline for speed */
2545    
2546     case OP_ANY:
2547     if (!md->dotall && eptr < md->end_subject && *eptr == '\n') return FALSE;
2548     if (eptr++ >= md->end_subject) return FALSE;
2549     ecode++;
2550     break;
2551    
2552     case OP_NOT_DIGIT:
2553     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) != 0)
2554     return FALSE;
2555     ecode++;
2556     break;
2557    
2558     case OP_DIGIT:
2559     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) == 0)
2560     return FALSE;
2561     ecode++;
2562     break;
2563    
2564     case OP_NOT_WHITESPACE:
2565     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) != 0)
2566     return FALSE;
2567     ecode++;
2568     break;
2569    
2570     case OP_WHITESPACE:
2571     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) == 0)
2572     return FALSE;
2573     ecode++;
2574     break;
2575    
2576     case OP_NOT_WORDCHAR:
2577     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) != 0)
2578     return FALSE;
2579     ecode++;
2580     break;
2581    
2582     case OP_WORDCHAR:
2583     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) == 0)
2584     return FALSE;
2585     ecode++;
2586     break;
2587    
2588     /* Match a back reference, possibly repeatedly. Look past the end of the
2589     item to see if there is repeat information following. The code is similar
2590     to that for character classes, but repeated for efficiency. Then obey
2591     similar code to character type repeats - written out again for speed.
2592     However, if the referenced string is the empty string, always treat
2593     it as matched, any number of times (otherwise there could be infinite
2594     loops). */
2595    
2596     case OP_REF:
2597     {
2598     int length;
2599     int number = ecode[1] << 1; /* Doubled reference number */
2600     ecode += 2; /* Advance past the item */
2601    
2602     if (number >= offset_top || md->offset_vector[number] < 0)
2603     {
2604     md->errorcode = PCRE_ERROR_BADREF;
2605     return FALSE;
2606     }
2607    
2608     length = md->offset_vector[number+1] - md->offset_vector[number];
2609    
2610     switch (*ecode)
2611     {
2612     case OP_CRSTAR:
2613     case OP_CRMINSTAR:
2614     case OP_CRPLUS:
2615     case OP_CRMINPLUS:
2616     case OP_CRQUERY:
2617     case OP_CRMINQUERY:
2618     c = *ecode++ - OP_CRSTAR;
2619     minimize = (c & 1) != 0;
2620     min = rep_min[c]; /* Pick up values from tables; */
2621     max = rep_max[c]; /* zero for max => infinity */
2622     if (max == 0) max = INT_MAX;
2623     break;
2624    
2625     case OP_CRRANGE:
2626     case OP_CRMINRANGE:
2627     minimize = (*ecode == OP_CRMINRANGE);
2628     min = (ecode[1] << 8) + ecode[2];
2629     max = (ecode[3] << 8) + ecode[4];
2630     if (max == 0) max = INT_MAX;
2631     ecode += 5;
2632     break;
2633    
2634     default: /* No repeat follows */
2635     if (!match_ref(number, eptr, length, md)) return FALSE;
2636     eptr += length;
2637     continue; /* With the main loop */
2638     }
2639    
2640     /* If the length of the reference is zero, just continue with the
2641     main loop. */
2642    
2643     if (length == 0) continue;
2644    
2645     /* First, ensure the minimum number of matches are present. We get back
2646     the length of the reference string explicitly rather than passing the
2647     address of eptr, so that eptr can be a register variable. */
2648    
2649     for (i = 1; i <= min; i++)
2650     {
2651     if (!match_ref(number, eptr, length, md)) return FALSE;
2652     eptr += length;
2653     }
2654    
2655     /* If min = max, continue at the same level without recursion.
2656     They are not both allowed to be zero. */
2657    
2658     if (min == max) continue;
2659    
2660     /* If minimizing, keep trying and advancing the pointer */
2661    
2662     if (minimize)
2663     {
2664     for (i = min;; i++)
2665     {
2666     if (match(eptr, ecode, offset_top, md)) return TRUE;
2667     if (i >= max || !match_ref(number, eptr, length, md))
2668     return FALSE;
2669     eptr += length;
2670     }
2671     /* Control never gets here */
2672     }
2673    
2674     /* If maximizing, find the longest string and work backwards */
2675    
2676     else
2677     {
2678     uschar *pp = eptr;
2679     for (i = min; i < max; i++)
2680     {
2681     if (!match_ref(number, eptr, length, md)) break;
2682     eptr += length;
2683     }
2684     while (eptr >= pp)
2685     {
2686     if (match(eptr, ecode, offset_top, md)) return TRUE;
2687     eptr -= length;
2688     }
2689     return FALSE;
2690     }
2691     }
2692     /* Control never gets here */
2693    
2694     /* Match a character class, possibly repeatedly. Look past the end of the
2695     item to see if there is repeat information following. Then obey similar
2696     code to character type repeats - written out again for speed. If caseless
2697     matching was set at runtime but not at compile time, we have to check both
2698     versions of a character. */
2699    
2700     case OP_CLASS:
2701     {
2702     uschar *data = ecode + 1; /* Save for matching */
2703     ecode += 33; /* Advance past the item */
2704    
2705     switch (*ecode)
2706     {
2707     case OP_CRSTAR:
2708     case OP_CRMINSTAR:
2709     case OP_CRPLUS:
2710     case OP_CRMINPLUS:
2711     case OP_CRQUERY:
2712     case OP_CRMINQUERY:
2713     c = *ecode++ - OP_CRSTAR;
2714     minimize = (c & 1) != 0;
2715     min = rep_min[c]; /* Pick up values from tables; */
2716     max = rep_max[c]; /* zero for max => infinity */
2717     if (max == 0) max = INT_MAX;
2718     break;
2719    
2720     case OP_CRRANGE:
2721     case OP_CRMINRANGE:
2722     minimize = (*ecode == OP_CRMINRANGE);
2723     min = (ecode[1] << 8) + ecode[2];
2724     max = (ecode[3] << 8) + ecode[4];
2725     if (max == 0) max = INT_MAX;
2726     ecode += 5;
2727     break;
2728    
2729     default: /* No repeat follows */
2730     if (eptr >= md->end_subject) return FALSE;
2731     c = *eptr++;
2732     if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
2733     if (md->runtime_caseless)
2734     {
2735     c = pcre_fcc[c];
2736     if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
2737     }
2738     return FALSE;
2739     }
2740    
2741     /* First, ensure the minimum number of matches are present. */
2742    
2743     for (i = 1; i <= min; i++)
2744     {
2745     if (eptr >= md->end_subject) return FALSE;
2746     c = *eptr++;
2747     if ((data[c/8] & (1 << (c&7))) != 0) continue;
2748     if (md->runtime_caseless)
2749     {
2750     c = pcre_fcc[c];
2751     if ((data[c/8] & (1 << (c&7))) != 0) continue;
2752     }
2753     return FALSE;
2754     }
2755    
2756     /* If max == min we can continue with the main loop without the
2757     need to recurse. */
2758    
2759     if (min == max) continue;
2760    
2761     /* If minimizing, keep testing the rest of the expression and advancing
2762     the pointer while it matches the class. */
2763    
2764     if (minimize)
2765     {
2766     for (i = min;; i++)
2767     {
2768     if (match(eptr, ecode, offset_top, md)) return TRUE;
2769     if (i >= max || eptr >= md->end_subject) return FALSE;
2770     c = *eptr++;
2771     if ((data[c/8] & (1 << (c&7))) != 0) continue;
2772     if (md->runtime_caseless)
2773     {
2774     c = pcre_fcc[c];
2775     if ((data[c/8] & (1 << (c&7))) != 0) continue;
2776     }
2777     return FALSE;
2778     }
2779     /* Control never gets here */
2780     }
2781    
2782     /* If maximizing, find the longest possible run, then work backwards. */
2783    
2784     else
2785     {
2786     uschar *pp = eptr;
2787     for (i = min; i < max; eptr++, i++)
2788     {
2789     if (eptr >= md->end_subject) break;
2790     c = *eptr;
2791     if ((data[c/8] & (1 << (c&7))) != 0) continue;
2792     if (md->runtime_caseless)
2793     {
2794     c = pcre_fcc[c];
2795     if ((data[c/8] & (1 << (c&7))) != 0) continue;
2796     }
2797     break;
2798     }
2799    
2800     while (eptr >= pp)
2801     if (match(eptr--, ecode, offset_top, md)) return TRUE;
2802     return FALSE;
2803     }
2804     }
2805     /* Control never gets here */
2806    
2807     /* Match a run of characters */
2808    
2809     case OP_CHARS:
2810     {
2811     register int length = ecode[1];
2812     ecode += 2;
2813    
2814     #ifdef DEBUG
2815     if (eptr >= md->end_subject)
2816     printf("matching subject <null> against pattern ");
2817     else
2818     {
2819     printf("matching subject ");
2820     pchars(eptr, length, TRUE, md);
2821     printf(" against pattern ");
2822     }
2823     pchars(ecode, length, FALSE, md);
2824     printf("\n");
2825     #endif
2826    
2827     if (length > md->end_subject - eptr) return FALSE;
2828     if (md->caseless)
2829     {
2830     while (length-- > 0) if (pcre_lcc[*ecode++] != pcre_lcc[*eptr++]) return FALSE;
2831     }
2832     else
2833     {
2834     while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
2835     }
2836     }
2837     break;
2838    
2839     /* Match a single character repeatedly; different opcodes share code. */
2840    
2841     case OP_EXACT:
2842     min = max = (ecode[1] << 8) + ecode[2];
2843     ecode += 3;
2844     goto REPEATCHAR;
2845    
2846     case OP_UPTO:
2847     case OP_MINUPTO:
2848     min = 0;
2849     max = (ecode[1] << 8) + ecode[2];
2850     minimize = *ecode == OP_MINUPTO;
2851     ecode += 3;
2852     goto REPEATCHAR;
2853    
2854     case OP_STAR:
2855     case OP_MINSTAR:
2856     case OP_PLUS:
2857     case OP_MINPLUS:
2858     case OP_QUERY:
2859     case OP_MINQUERY:
2860     c = *ecode++ - OP_STAR;
2861     minimize = (c & 1) != 0;
2862     min = rep_min[c]; /* Pick up values from tables; */
2863     max = rep_max[c]; /* zero for max => infinity */
2864     if (max == 0) max = INT_MAX;
2865    
2866     /* Common code for all repeated single-character matches. We can give
2867     up quickly if there are fewer than the minimum number of characters left in
2868     the subject. */
2869    
2870     REPEATCHAR:
2871     if (min > md->end_subject - eptr) return FALSE;
2872     c = *ecode++;
2873    
2874     /* The code is duplicated for the caseless and caseful cases, for speed,
2875     since matching characters is likely to be quite common. First, ensure the
2876     minimum number of matches are present. If min = max, continue at the same
2877     level without recursing. Otherwise, if minimizing, keep trying the rest of
2878     the expression and advancing one matching character if failing, up to the
2879     maximum. Alternatively, if maximizing, find the maximum number of
2880     characters and work backwards. */
2881    
2882     #ifdef DEBUG
2883     printf("matching %c{%d,%d} against subject %.*s\n", c, min, max,
2884     max, eptr);
2885     #endif
2886    
2887     if (md->caseless)
2888     {
2889     c = pcre_lcc[c];
2890     for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) return FALSE;
2891     if (min == max) continue;
2892     if (minimize)
2893     {
2894     for (i = min;; i++)
2895     {
2896     if (match(eptr, ecode, offset_top, md)) return TRUE;
2897     if (i >= max || eptr >= md->end_subject || c != pcre_lcc[*eptr++])
2898     return FALSE;
2899     }
2900     /* Control never gets here */
2901     }
2902     else
2903     {
2904     uschar *pp = eptr;
2905     for (i = min; i < max; i++)
2906     {
2907     if (eptr >= md->end_subject || c != pcre_lcc[*eptr]) break;
2908     eptr++;
2909     }
2910     while (eptr >= pp)
2911     if (match(eptr--, ecode, offset_top, md)) return TRUE;
2912     return FALSE;
2913     }
2914     /* Control never gets here */
2915     }
2916    
2917     /* Caseful comparisons */
2918    
2919     else
2920     {
2921     for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
2922     if (min == max) continue;
2923     if (minimize)
2924     {
2925     for (i = min;; i++)
2926     {
2927     if (match(eptr, ecode, offset_top, md)) return TRUE;
2928     if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
2929     }
2930     /* Control never gets here */
2931     }
2932     else
2933     {
2934     uschar *pp = eptr;
2935     for (i = min; i < max; i++)
2936     {
2937     if (eptr >= md->end_subject || c != *eptr) break;
2938     eptr++;
2939     }
2940     while (eptr >= pp)
2941     if (match(eptr--, ecode, offset_top, md)) return TRUE;
2942     return FALSE;
2943     }
2944     }
2945     /* Control never gets here */
2946    
2947     /* Match a negated single character */
2948    
2949     case OP_NOT:
2950     if (eptr > md->end_subject) return FALSE;
2951     ecode++;
2952     if (md->caseless)
2953     {
2954     if (pcre_lcc[*ecode++] == pcre_lcc[*eptr++]) return FALSE;
2955     }
2956     else
2957     {
2958     if (*ecode++ == *eptr++) return FALSE;
2959     }
2960     break;
2961    
2962     /* Match a negated single character repeatedly. This is almost a repeat of
2963     the code for a repeated single character, but I haven't found a nice way of
2964     commoning these up that doesn't require a test of the positive/negative
2965     option for each character match. Maybe that wouldn't add very much to the
2966     time taken, but character matching *is* what this is all about... */
2967    
2968     case OP_NOTEXACT:
2969     min = max = (ecode[1] << 8) + ecode[2];
2970     ecode += 3;
2971     goto REPEATNOTCHAR;
2972    
2973     case OP_NOTUPTO:
2974     case OP_NOTMINUPTO:
2975     min = 0;
2976     max = (ecode[1] << 8) + ecode[2];
2977     minimize = *ecode == OP_NOTMINUPTO;
2978     ecode += 3;
2979     goto REPEATNOTCHAR;
2980    
2981     case OP_NOTSTAR:
2982     case OP_NOTMINSTAR:
2983     case OP_NOTPLUS:
2984     case OP_NOTMINPLUS:
2985     case OP_NOTQUERY:
2986     case OP_NOTMINQUERY:
2987     c = *ecode++ - OP_NOTSTAR;
2988     minimize = (c & 1) != 0;
2989     min = rep_min[c]; /* Pick up values from tables; */
2990     max = rep_max[c]; /* zero for max => infinity */
2991     if (max == 0) max = INT_MAX;
2992    
2993     /* Common code for all repeated single-character matches. We can give
2994     up quickly if there are fewer than the minimum number of characters left in
2995     the subject. */
2996    
2997     REPEATNOTCHAR:
2998     if (min > md->end_subject - eptr) return FALSE;
2999     c = *ecode++;
3000    
3001     /* The code is duplicated for the caseless and caseful cases, for speed,
3002     since matching characters is likely to be quite common. First, ensure the
3003     minimum number of matches are present. If min = max, continue at the same
3004     level without recursing. Otherwise, if minimizing, keep trying the rest of
3005     the expression and advancing one matching character if failing, up to the
3006     maximum. Alternatively, if maximizing, find the maximum number of
3007     characters and work backwards. */
3008    
3009     #ifdef DEBUG
3010     printf("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
3011     max, eptr);
3012     #endif
3013    
3014     if (md->caseless)
3015     {
3016     c = pcre_lcc[c];
3017     for (i = 1; i <= min; i++) if (c == pcre_lcc[*eptr++]) return FALSE;
3018     if (min == max) continue;
3019     if (minimize)
3020     {
3021     for (i = min;; i++)
3022     {
3023     if (match(eptr, ecode, offset_top, md)) return TRUE;
3024     if (i >= max || eptr >= md->end_subject || c == pcre_lcc[*eptr++])
3025     return FALSE;
3026     }
3027     /* Control never gets here */
3028     }
3029     else
3030     {
3031     uschar *pp = eptr;
3032     for (i = min; i < max; i++)
3033     {
3034     if (eptr >= md->end_subject || c == pcre_lcc[*eptr]) break;
3035     eptr++;
3036     }
3037     while (eptr >= pp)
3038     if (match(eptr--, ecode, offset_top, md)) return TRUE;
3039     return FALSE;
3040     }
3041     /* Control never gets here */
3042     }
3043    
3044     /* Caseful comparisons */
3045    
3046     else
3047     {
3048     for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
3049     if (min == max) continue;
3050     if (minimize)
3051     {
3052     for (i = min;; i++)
3053     {
3054     if (match(eptr, ecode, offset_top, md)) return TRUE;
3055     if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
3056     }
3057     /* Control never gets here */
3058     }
3059     else
3060     {
3061     uschar *pp = eptr;
3062     for (i = min; i < max; i++)
3063     {
3064     if (eptr >= md->end_subject || c == *eptr) break;
3065     eptr++;
3066     }
3067     while (eptr >= pp)
3068     if (match(eptr--, ecode, offset_top, md)) return TRUE;
3069     return FALSE;
3070     }
3071     }
3072     /* Control never gets here */
3073    
3074     /* Match a single character type repeatedly; several different opcodes
3075     share code. This is very similar to the code for single characters, but we
3076     repeat it in the interests of efficiency. */
3077    
3078     case OP_TYPEEXACT:
3079     min = max = (ecode[1] << 8) + ecode[2];
3080     minimize = TRUE;
3081     ecode += 3;
3082     goto REPEATTYPE;
3083    
3084     case OP_TYPEUPTO:
3085     case OP_TYPEMINUPTO:
3086     min = 0;
3087     max = (ecode[1] << 8) + ecode[2];
3088     minimize = *ecode == OP_TYPEMINUPTO;
3089     ecode += 3;
3090     goto REPEATTYPE;
3091    
3092     case OP_TYPESTAR:
3093     case OP_TYPEMINSTAR:
3094     case OP_TYPEPLUS:
3095     case OP_TYPEMINPLUS:
3096     case OP_TYPEQUERY:
3097     case OP_TYPEMINQUERY:
3098     c = *ecode++ - OP_TYPESTAR;
3099     minimize = (c & 1) != 0;
3100     min = rep_min[c]; /* Pick up values from tables; */
3101     max = rep_max[c]; /* zero for max => infinity */
3102     if (max == 0) max = INT_MAX;
3103    
3104     /* Common code for all repeated single character type matches */
3105    
3106     REPEATTYPE:
3107     ctype = *ecode++; /* Code for the character type */
3108    
3109     /* First, ensure the minimum number of matches are present. Use inline
3110     code for maximizing the speed, and do the type test once at the start
3111     (i.e. keep it out of the loop). Also test that there are at least the
3112     minimum number of characters before we start. */
3113    
3114     if (min > md->end_subject - eptr) return FALSE;
3115     if (min > 0) switch(ctype)
3116     {
3117     case OP_ANY:
3118     if (!md->dotall)
3119     { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }
3120     else eptr += min;
3121     break;
3122    
3123     case OP_NOT_DIGIT:
3124     for (i = 1; i <= min; i++)
3125     if ((pcre_ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
3126     break;
3127    
3128     case OP_DIGIT:
3129     for (i = 1; i <= min; i++)
3130     if ((pcre_ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
3131     break;
3132    
3133     case OP_NOT_WHITESPACE:
3134     for (i = 1; i <= min; i++)
3135     if ((pcre_ctypes[*eptr++] & ctype_space) != 0) return FALSE;
3136     break;
3137    
3138     case OP_WHITESPACE:
3139     for (i = 1; i <= min; i++)
3140     if ((pcre_ctypes[*eptr++] & ctype_space) == 0) return FALSE;
3141     break;
3142    
3143     case OP_NOT_WORDCHAR:
3144     for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) != 0)
3145     return FALSE;
3146     break;
3147    
3148     case OP_WORDCHAR:
3149     for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) == 0)
3150     return FALSE;
3151     break;
3152     }
3153    
3154     /* If min = max, continue at the same level without recursing */
3155    
3156     if (min == max) continue;
3157    
3158     /* If minimizing, we have to test the rest of the pattern before each
3159     subsequent match, so inlining isn't much help; just use the function. */
3160    
3161     if (minimize)
3162     {
3163     for (i = min;; i++)
3164     {
3165     if (match(eptr, ecode, offset_top, md)) return TRUE;
3166     if (i >= max || eptr >= md->end_subject ||
3167     !match_type(ctype, *eptr++, md->dotall))
3168     return FALSE;
3169     }
3170     /* Control never gets here */
3171     }
3172    
3173     /* If maximizing it is worth using inline code for speed, doing the type
3174     test once at the start (i.e. keep it out of the loop). */
3175    
3176     else
3177     {
3178     uschar *pp = eptr;
3179     switch(ctype)
3180     {
3181     case OP_ANY:
3182     if (!md->dotall)
3183     {
3184     for (i = min; i < max; i++)
3185     {
3186     if (eptr >= md->end_subject || *eptr == '\n') break;
3187     eptr++;
3188     }
3189     }
3190     else
3191     {
3192     c = max - min;
3193     if (c > md->end_subject - eptr) c = md->end_subject - eptr;
3194     eptr += c;
3195     }
3196     break;
3197    
3198     case OP_NOT_DIGIT:
3199     for (i = min; i < max; i++)
3200     {
3201     if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) != 0)
3202     break;
3203     eptr++;
3204     }
3205     break;
3206    
3207     case OP_DIGIT:
3208     for (i = min; i < max; i++)
3209     {
3210     if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) == 0)
3211     break;
3212     eptr++;
3213     }
3214     break;
3215    
3216     case OP_NOT_WHITESPACE:
3217     for (i = min; i < max; i++)
3218     {
3219     if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) != 0)
3220     break;
3221     eptr++;
3222     }
3223     break;
3224    
3225     case OP_WHITESPACE:
3226     for (i = min; i < max; i++)
3227     {
3228     if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) == 0)
3229     break;
3230     eptr++;
3231     }
3232     break;
3233    
3234     case OP_NOT_WORDCHAR:
3235     for (i = min; i < max; i++)
3236     {
3237     if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) != 0)
3238     break;
3239     eptr++;
3240     }
3241     break;
3242    
3243     case OP_WORDCHAR:
3244     for (i = min; i < max; i++)
3245     {
3246     if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) == 0)
3247     break;
3248     eptr++;
3249     }
3250     break;
3251     }
3252    
3253     while (eptr >= pp)
3254     if (match(eptr--, ecode, offset_top, md)) return TRUE;
3255     return FALSE;
3256     }
3257     /* Control never gets here */
3258    
3259     /* There's been some horrible disaster. */
3260    
3261     default:
3262     #ifdef DEBUG
3263     printf("Unknown opcode %d\n", *ecode);
3264     #endif
3265     md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
3266     return FALSE;
3267     }
3268    
3269     /* Do not stick any code in here without much thought; it is assumed
3270     that "continue" in the code above comes out to here to repeat the main
3271     loop. */
3272    
3273     } /* End of main loop */
3274     /* Control never reaches here */
3275     }
3276    
3277    
3278    
3279     /*************************************************
3280     * Execute a Regular Expression *
3281     *************************************************/
3282    
3283     /* This function applies a compiled re to a subject string and picks out
3284     portions of the string if it matches. Two elements in the vector are set for
3285     each substring: the offsets to the start and end of the substring.
3286    
3287     Arguments:
3288     external_re points to the compiled expression
3289     external_extra points to "hints" from pcre_study() or is NULL
3290     subject points to the subject string
3291     length length of subject string (may contain binary zeros)
3292     options option bits
3293     offsets points to a vector of ints to be filled in with offsets
3294     offsetcount the number of elements in the vector
3295    
3296     Returns: > 0 => success; value is the number of elements filled in
3297     = 0 => success, but offsets is not big enough
3298     -1 => failed to match
3299     < -1 => some kind of unexpected problem
3300     */
3301    
3302     int
3303     pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
3304     const char *subject, int length, int options, int *offsets, int offsetcount)
3305     {
3306     int resetcount;
3307     int ocount = offsetcount;
3308     int first_char = -1;
3309     match_data match_block;
3310     uschar *start_bits = NULL;
3311     uschar *start_match = (uschar *)subject;
3312     uschar *end_subject;
3313     real_pcre *re = (real_pcre *)external_re;
3314     real_pcre_extra *extra = (real_pcre_extra *)external_extra;
3315     BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
3316     BOOL startline = (re->options & PCRE_STARTLINE) != 0;
3317    
3318     if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3319    
3320     if (re == NULL || subject == NULL ||
3321     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3322     if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
3323    
3324     match_block.start_subject = (uschar *)subject;
3325     match_block.end_subject = match_block.start_subject + length;
3326     end_subject = match_block.end_subject;
3327    
3328     match_block.caseless = ((re->options | options) & PCRE_CASELESS) != 0;
3329     match_block.runtime_caseless = match_block.caseless &&
3330     (re->options & PCRE_CASELESS) == 0;
3331    
3332     match_block.multiline = ((re->options | options) & PCRE_MULTILINE) != 0;
3333     match_block.dotall = ((re->options | options) & PCRE_DOTALL) != 0;
3334     match_block.endonly = ((re->options | options) & PCRE_DOLLAR_ENDONLY) != 0;
3335    
3336     match_block.notbol = (options & PCRE_NOTBOL) != 0;
3337     match_block.noteol = (options & PCRE_NOTEOL) != 0;
3338    
3339     match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
3340    
3341     /* If the expression has got more back references than the offsets supplied can
3342     hold, we get a temporary bit of working store to use during the matching.
3343     Otherwise, we can use the vector supplied, rounding down the size of it to a
3344     multiple of 2. */
3345    
3346     ocount &= (-2);
3347     if (re->top_backref > 0 && re->top_backref + 1 >= ocount/2)
3348     {
3349     ocount = re->top_backref * 2 + 2;
3350     match_block.offset_vector = (pcre_malloc)(ocount * sizeof(int));
3351     if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
3352     #ifdef DEBUG
3353     printf("Got memory to hold back references\n");
3354     #endif
3355     }
3356     else match_block.offset_vector = offsets;
3357    
3358     match_block.offset_end = ocount;
3359     match_block.offset_overflow = FALSE;
3360    
3361     /* Compute the minimum number of offsets that we need to reset each time. Doing
3362     this makes a huge difference to execution time when there aren't many brackets
3363     in the pattern. */
3364    
3365     resetcount = 2 + re->top_bracket * 2;
3366     if (resetcount > offsetcount) resetcount = ocount;
3367    
3368     /* If MULTILINE is set at exec time but was not set at compile time, and the
3369     anchored flag is set, we must re-check because a setting provoked by ^ in the
3370     pattern is not right in multi-line mode. Calling is_anchored() again here does
3371     the right check, because multiline is now set. If it now yields FALSE, the
3372     expression must have had ^ starting some of its branches. Check to see if
3373     that is true for *all* branches, and if so, set the startline flag. */
3374    
3375     if (match_block. multiline && anchored && (re->options & PCRE_MULTILINE) == 0 &&
3376     !is_anchored(re->code, match_block.multiline))
3377     {
3378     anchored = FALSE;
3379     if (is_startline(re->code)) startline = TRUE;
3380     }
3381    
3382     /* Set up the first character to match, if available. The first_char value is
3383     never set for an anchored regular expression, but the anchoring may be forced
3384     at run time, so we have to test for anchoring. The first char may be unset for
3385     an unanchored pattern, of course. If there's no first char and the pattern was
3386     studied, the may be a bitmap of possible first characters. However, we can
3387     use this only if the caseless state of the studying was correct. */
3388    
3389     if (!anchored)
3390     {
3391     if ((re->options & PCRE_FIRSTSET) != 0)
3392     {
3393     first_char = re->first_char;
3394     if (match_block.caseless) first_char = pcre_lcc[first_char];
3395     }
3396     else
3397     if (!startline && extra != NULL &&
3398     (extra->options & PCRE_STUDY_MAPPED) != 0 &&
3399     ((extra->options & PCRE_STUDY_CASELESS) != 0) == match_block.caseless)
3400     start_bits = extra->start_bits;
3401     }
3402    
3403     /* Loop for unanchored matches; for anchored regexps the loop runs just once. */
3404    
3405     do
3406     {
3407     register int *iptr = match_block.offset_vector;
3408     register int *iend = iptr + resetcount;
3409    
3410     /* Reset the maximum number of extractions we might see. */
3411    
3412     while (iptr < iend) *iptr++ = -1;
3413    
3414     /* Advance to a unique first char if possible */
3415    
3416     if (first_char >= 0)
3417     {
3418     if (match_block.caseless)
3419     while (start_match < end_subject && pcre_lcc[*start_match] != first_char)
3420     start_match++;
3421     else
3422     while (start_match < end_subject && *start_match != first_char)
3423     start_match++;
3424     }
3425    
3426     /* Or to just after \n for a multiline match if possible */
3427    
3428     else if (startline)
3429     {
3430     if (start_match > match_block.start_subject)
3431     {
3432     while (start_match < end_subject && start_match[-1] != '\n')
3433     start_match++;
3434     }
3435     }
3436    
3437     /* Or to a non-unique first char */
3438    
3439     else if (start_bits != NULL)
3440     {
3441     while (start_match < end_subject)
3442     {
3443     register int c = *start_match;
3444     if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
3445     }
3446     }
3447    
3448     #ifdef DEBUG
3449     printf(">>>> Match against: ");
3450     pchars(start_match, end_subject - start_match, TRUE, &match_block);
3451     printf("\n");
3452     #endif
3453    
3454     /* When a match occurs, substrings will be set for all internal extractions;
3455     we just need to set up the whole thing as substring 0 before returning. If
3456     there were too many extractions, set the return code to zero. In the case
3457     where we had to get some local store to hold offsets for backreferences, copy
3458     those back references that we can. In this case there need not be overflow
3459     if certain parts of the pattern were not used.
3460    
3461     Before starting the match, we have to set up a longjmp() target to enable
3462     the "cut" operation to fail a match completely without backtracking. */
3463    
3464     if (setjmp(match_block.fail_env) == 0 &&
3465     match(start_match, re->code, 2, &match_block))
3466     {
3467     int rc;
3468    
3469     if (ocount != offsetcount)
3470     {
3471     if (offsetcount >= 4)
3472     {
3473     memcpy(offsets + 2, match_block.offset_vector + 2,
3474     (offsetcount - 2) * sizeof(int));
3475     #ifdef DEBUG
3476     printf("Copied offsets; freeing temporary memory\n");
3477     #endif
3478     }
3479     if (match_block.end_offset_top > offsetcount)
3480     match_block.offset_overflow = TRUE;
3481    
3482     #ifdef DEBUG
3483     printf("Freeing temporary memory\n");
3484     #endif
3485    
3486     (pcre_free)(match_block.offset_vector);
3487     }
3488    
3489     rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
3490    
3491     if (match_block.offset_end < 2) rc = 0; else
3492     {
3493     offsets[0] = start_match - match_block.start_subject;
3494     offsets[1] = match_block.end_match_ptr - match_block.start_subject;
3495     }
3496    
3497     #ifdef DEBUG
3498     printf(">>>> returning %d\n", rc);
3499     #endif
3500     return rc;
3501     }
3502     }
3503     while (!anchored &&
3504     match_block.errorcode == PCRE_ERROR_NOMATCH &&
3505     start_match++ < end_subject);
3506    
3507     #ifdef DEBUG
3508     printf(">>>> returning %d\n", match_block.errorcode);
3509     #endif
3510    
3511     return match_block.errorcode;
3512     }
3513    
3514     /* End of pcre.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12