/[pcre]/code/tags/pcre-1.00/pcre.c
ViewVC logotype

Contents of /code/tags/pcre-1.00/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 3 - (hide annotations) (download)
Sat Feb 24 21:38:01 2007 UTC (7 years, 6 months ago) by nigel
Original Path: code/trunk/pcre.c
File MIME type: text/plain
File size: 98918 byte(s)
Load pcre-1.00 into code/trunk.

1 nigel 3 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /*
6     This is a library of functions to support regular expressions whose syntax
7     and semantics are as close as possible to those of the Perl 5 language. See
8     the file Tech.Notes for some information on the internals.
9    
10     Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12     Copyright (c) 1997 University of Cambridge
13    
14     -----------------------------------------------------------------------------
15     Permission is granted to anyone to use this software for any purpose on any
16     computer system, and to redistribute it freely, subject to the following
17     restrictions:
18    
19     1. This software is distributed in the hope that it will be useful,
20     but WITHOUT ANY WARRANTY; without even the implied warranty of
21     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22    
23     2. The origin of this software must not be misrepresented, either by
24     explicit claim or by omission.
25    
26     3. Altered versions must be plainly marked as such, and must not be
27     misrepresented as being the original software.
28     -----------------------------------------------------------------------------
29     */
30    
31    
32     /* Define DEBUG to get debugging output on stdout. */
33    
34     /* #define DEBUG */
35    
36    
37     /* Include the internals header, which itself includes Standard C headers plus
38     the external pcre header. */
39    
40     #include "internal.h"
41    
42    
43     /* Min and max values for the common repeats; for the maxima, 0 => infinity */
44    
45     static char rep_min[] = { 0, 0, 1, 1, 0, 0 };
46     static char rep_max[] = { 0, 0, 0, 0, 1, 1 };
47    
48     /* Text forms of OP_ values and things, for debugging */
49    
50     #ifdef DEBUG
51     static char *OP_names[] = { "End", "\\A", "\\B", "\\b", "\\D", "\\d",
52     "\\S", "\\s", "\\W", "\\w", "Cut", "\\Z", "^", "$", "Any", "chars",
53     "not",
54     "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
55     "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
56     "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
57     "*", "*?", "+", "+?", "?", "??", "{", "{",
58     "class", "Ref",
59     "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",
60     "Brazero", "Braminzero", "Bra"
61     };
62     #endif
63    
64     /* Table for handling escaped characters in the range '0'-'z'. Positive returns
65     are simple data values; negative values are for special things like \d and so
66     on. Zero means further processing is needed (for things like \x), or the escape
67     is invalid. */
68    
69     static short int escapes[] = {
70     0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
71     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
72     '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
73     0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
74     0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
75     0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
76     '`', 7, -ESC_b, 0, -ESC_d, 27, '\f', 0, /* ` - g */
77     0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
78     0, 0, '\r', -ESC_s, '\t', 0, 0, -ESC_w, /* p - w */
79     0, 0, 0 /* x - z */
80     };
81    
82     /* Definition to allow mutual recursion */
83    
84     static BOOL compile_regex(int, int *,uschar **,uschar **,char **);
85    
86     /* Structure for passing "static" information around between the functions
87     doing the matching, so that they are thread-safe. */
88    
89     typedef struct match_data {
90     int errorcode; /* As it says */
91     int *offset_vector; /* Offset vector */
92     int offset_end; /* One past the end */
93     BOOL offset_overflow; /* Set if too many extractions */
94     BOOL caseless; /* Case-independent flag */
95     BOOL runtime_caseless; /* Caseless forced at run time */
96     BOOL multiline; /* Multiline flag */
97     BOOL notbol; /* NOTBOL flag */
98     BOOL noteol; /* NOTEOL flag */
99     BOOL dotall; /* Dot matches any char */
100     BOOL endonly; /* Dollar not before final \n */
101     uschar *start_subject; /* Start of the subject string */
102     uschar *end_subject; /* End of the subject string */
103     jmp_buf fail_env; /* Environment for longjump() break out */
104     uschar *end_match_ptr; /* Subject position at end match */
105     int end_offset_top; /* Highwater mark at end of match */
106     } match_data;
107    
108    
109    
110     /*************************************************
111     * Global variables *
112     *************************************************/
113    
114     /* PCRE is thread-clean and doesn't use any global variables in the normal
115     sense. However, it calls memory allocation and free functions via the two
116     indirections below, which are can be changed by the caller, but are shared
117     between all threads. */
118    
119     void *(*pcre_malloc)(size_t) = malloc;
120     void (*pcre_free)(void *) = free;
121    
122    
123    
124    
125     /*************************************************
126     * Return version string *
127     *************************************************/
128    
129     char *
130     pcre_version(void)
131     {
132     return PCRE_VERSION;
133     }
134    
135    
136    
137    
138     /*************************************************
139     * Return info about a compiled pattern *
140     *************************************************/
141    
142     /* This function picks potentially useful data out of the private
143     structure.
144    
145     Arguments:
146     external_re points to compiled code
147     optptr where to pass back the options
148     first_char where to pass back the first character,
149     or -1 if multiline and all branches start ^,
150     or -2 otherwise
151    
152     Returns: number of identifying extraction brackets
153     or negative values on error
154     */
155    
156     int
157     pcre_info(const pcre *external_re, int *optptr, int *first_char)
158     {
159     real_pcre *re = (real_pcre *)external_re;
160     if (re == NULL) return PCRE_ERROR_NULL;
161     if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
162     if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
163     if (first_char != NULL)
164     *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
165     ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
166     return re->top_bracket;
167     }
168    
169    
170    
171    
172     #ifdef DEBUG
173     /*************************************************
174     * Debugging function to print chars *
175     *************************************************/
176    
177     /* Print a sequence of chars in printable format, stopping at the end of the
178     subject if the requested.
179    
180     Arguments:
181     p points to characters
182     length number to print
183     is_subject TRUE if printing from within md->start_subject
184     md pointer to matching data block, if is_subject is TRUE
185    
186     Returns: nothing
187     */
188    
189     static pchars(uschar *p, int length, BOOL is_subject, match_data *md)
190     {
191     int c;
192     if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
193     while (length-- > 0)
194     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
195     }
196     #endif
197    
198    
199    
200    
201     /*************************************************
202     * Check subpattern for empty operand *
203     *************************************************/
204    
205     /* This function checks a bracketed subpattern to see if any of the paths
206     through it could match an empty string. This is used to diagnose an error if
207     such a subpattern is followed by a quantifier with an unlimited upper bound.
208    
209     Argument:
210     code points to the opening bracket
211    
212     Returns: TRUE or FALSE
213     */
214    
215     static BOOL
216     could_be_empty(uschar *code)
217     {
218     do {
219     uschar *cc = code + 3;
220    
221     /* Scan along the opcodes for this branch; as soon as we find something
222     that matches a non-empty string, break out and advance to test the next
223     branch. If we get to the end of the branch, return TRUE for the whole
224     sub-expression. */
225    
226     for (;;)
227     {
228     /* Test an embedded subpattern; if it could not be empty, break the
229     loop. Otherwise carry on in the branch. */
230    
231     if ((int)(*cc) >= OP_BRA)
232     {
233     if (!could_be_empty(cc)) break;
234     do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
235     cc += 3;
236     }
237    
238     else switch (*cc)
239     {
240     /* Reached end of a branch: the subpattern may match the empty string */
241    
242     case OP_ALT:
243     case OP_KET:
244     case OP_KETRMAX:
245     case OP_KETRMIN:
246     return TRUE;
247    
248     /* Skip over assertive subpatterns */
249    
250     case OP_ASSERT:
251     case OP_ASSERT_NOT:
252     do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
253     cc += 3;
254     break;
255    
256     /* Skip over things that don't match chars */
257    
258     case OP_SOD:
259     case OP_EOD:
260     case OP_CIRC:
261     case OP_DOLL:
262     case OP_BRAZERO:
263     case OP_BRAMINZERO:
264     case OP_NOT_WORD_BOUNDARY:
265     case OP_WORD_BOUNDARY:
266     cc++;
267     break;
268    
269     /* Skip over simple repeats with zero lower bound */
270    
271     case OP_STAR:
272     case OP_MINSTAR:
273     case OP_QUERY:
274     case OP_MINQUERY:
275     case OP_TYPESTAR:
276     case OP_TYPEMINSTAR:
277     case OP_TYPEQUERY:
278     case OP_TYPEMINQUERY:
279     cc += 2;
280     break;
281    
282     /* Skip over UPTOs (lower bound is zero) */
283    
284     case OP_UPTO:
285     case OP_MINUPTO:
286     case OP_TYPEUPTO:
287     case OP_TYPEMINUPTO:
288     cc += 4;
289     break;
290    
291     /* Check a class or a back reference for a zero minimum */
292    
293     case OP_CLASS:
294     case OP_REF:
295     cc += (*cc == OP_REF)? 2 : 4 + 2 * cc[2] + cc[3];
296    
297     switch (*cc)
298     {
299     case OP_CRSTAR:
300     case OP_CRMINSTAR:
301     case OP_CRQUERY:
302     case OP_CRMINQUERY:
303     cc++;
304     break;
305    
306     case OP_CRRANGE:
307     case OP_CRMINRANGE:
308     if ((cc[1] << 8) + cc[2] != 0) goto NEXT_BRANCH;
309     cc += 3;
310     break;
311    
312     default:
313     goto NEXT_BRANCH;
314     }
315     break;
316    
317     /* Anything else matches at least one character */
318    
319     default:
320     goto NEXT_BRANCH;
321     }
322     }
323    
324     NEXT_BRANCH:
325     code += (code[1] << 8) + code[2];
326     }
327     while (*code == OP_ALT);
328    
329     /* No branches match the empty string */
330    
331     return FALSE;
332     }
333    
334    
335    
336     /*************************************************
337     * Handle escapes *
338     *************************************************/
339    
340     /* This function is called when a \ has been encountered. It either returns a
341     positive value for a simple escape such as \n, or a negative value which
342     encodes one of the more complicated things such as \d. On entry, ptr is
343     pointing at the \. On exit, it is on the final character of the escape
344     sequence.
345    
346     Arguments:
347     ptrptr points to the pattern position pointer
348     errorptr points to the pointer to the error message
349     bracount number of previous extracting brackets
350     options the options bits
351     isclass TRUE if inside a character class
352    
353     Returns: zero or positive => a data character
354     negative => a special escape sequence
355     on error, errorptr is set
356     */
357    
358     static int
359     check_escape(uschar **ptrptr, char **errorptr, int bracount, int options,
360     BOOL isclass)
361     {
362     uschar *ptr = *ptrptr;
363     int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
364     int i;
365    
366     if (c == 0) *errorptr = ERR1;
367    
368     /* Digits or letters may have special meaning; all others are literals. */
369    
370     else if (c < '0' || c > 'z') {}
371    
372     /* Do an initial lookup in a table. A non-zero result is something that can be
373     returned immediately. Otherwise further processing may be required. */
374    
375     else if ((i = escapes[c - '0']) != 0) c = i;
376    
377     /* Escapes that need further processing, or are illegal. */
378    
379     else
380     {
381     uschar *oldptr;
382     switch (c)
383     {
384     /* The handling of escape sequences consisting of a string of digits
385     starting with one that is not zero is not straightforward. By experiment,
386     the way Perl works seems to be as follows:
387    
388     Outside a character class, the digits are read as a decimal number. If the
389     number is less than 10, or if there are that many previous extracting
390     left brackets, then it is a back reference. Otherwise, up to three octal
391     digits are read to form an escaped byte. Thus \123 is likely to be octal
392     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
393     value is greater than 377, the least significant 8 bits are taken. Inside a
394     character class, \ followed by a digit is always an octal number. */
395    
396     case '1': case '2': case '3': case '4': case '5':
397     case '6': case '7': case '8': case '9':
398    
399     if (!isclass)
400     {
401     oldptr = ptr;
402     c -= '0';
403     while ((pcre_ctypes[ptr[1]] & ctype_digit) != 0)
404     c = c * 10 + *(++ptr) - '0';
405     if (c < 10 || c <= bracount)
406     {
407     c = -(ESC_REF + c);
408     break;
409     }
410     ptr = oldptr; /* Put the pointer back and fall through */
411     }
412    
413     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
414     generates a binary zero byte and treats the digit as a following literal.
415     Thus we have to pull back the pointer by one. */
416    
417     if ((c = *ptr) >= '8')
418     {
419     ptr--;
420     c = 0;
421     break;
422     }
423    
424     /* \0 always starts an octal number, but we may drop through to here with a
425     larger first octal digit */
426    
427     case '0':
428     c -= '0';
429     while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 &&
430     ptr[1] != '8' && ptr[1] != '9')
431     c = c * 8 + *(++ptr) - '0';
432     break;
433    
434     /* Special escapes not starting with a digit are straightforward */
435    
436     case 'x':
437     c = 0;
438     while (i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0)
439     {
440     ptr++;
441     c = c * 16 + pcre_lcc[*ptr] -
442     (((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
443     }
444     break;
445    
446     case 'c':
447     c = *(++ptr);
448     if (c == 0)
449     {
450     *errorptr = ERR2;
451     return 0;
452     }
453    
454     /* A letter is upper-cased; then the 0x40 bit is flipped */
455    
456     if (c >= 'a' && c <= 'z') c = pcre_fcc[c];
457     c ^= 0x40;
458     break;
459    
460     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
461     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
462     for Perl compatibility, it is a literal. */
463    
464     default:
465     if ((options & PCRE_EXTRA) != 0) switch(c)
466     {
467     case 'X':
468     c = -ESC_X; /* This could be a lookup if it ever got into Perl */
469     break;
470    
471     default:
472     *errorptr = ERR3;
473     break;
474     }
475     break;
476     }
477     }
478    
479     *ptrptr = ptr;
480     return c;
481     }
482    
483    
484    
485     /*************************************************
486     * Check for counted repeat *
487     *************************************************/
488    
489     /* This function is called when a '{' is encountered in a place where it might
490     start a quantifier. It looks ahead to see if it really is a quantifier or not.
491     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
492     where the ddds are digits.
493    
494     Arguments:
495     p pointer to the first char after '{'
496    
497     Returns: TRUE or FALSE
498     */
499    
500     static BOOL
501     is_counted_repeat(uschar *p)
502     {
503     if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
504     while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
505     if (*p == '}') return TRUE;
506    
507     if (*p++ != ',') return FALSE;
508     if (*p == '}') return TRUE;
509    
510     if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
511     while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
512     return (*p == '}');
513     }
514    
515    
516    
517     /*************************************************
518     * Read repeat counts *
519     *************************************************/
520    
521     /* Read an item of the form {n,m} and return the values. This is called only
522     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
523     so the syntax is guaranteed to be correct, but we need to check the values.
524    
525     Arguments:
526     p pointer to first char after '{'
527     minp pointer to int for min
528     maxp pointer to int for max
529     returned as -1 if no max
530     errorptr points to pointer to error message
531    
532     Returns: pointer to '}' on success;
533     current ptr on error, with errorptr set
534     */
535    
536     static uschar *
537     read_repeat_counts(uschar *p, int *minp, int *maxp, char **errorptr)
538     {
539     int min = 0;
540     int max = -1;
541    
542     while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
543    
544     if (*p == '}') max = min; else
545     {
546     if (*(++p) != '}')
547     {
548     max = 0;
549     while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
550     if (max < min)
551     {
552     *errorptr = ERR4;
553     return p;
554     }
555     }
556     }
557    
558     /* Do paranoid checks, then fill in the required variables, and pass back the
559     pointer to the terminating '}'. */
560    
561     if (min > 65535 || max > 65535)
562     *errorptr = ERR5;
563     else
564     {
565     *minp = min;
566     *maxp = max;
567     }
568     return p;
569     }
570    
571    
572    
573     /*************************************************
574     * Compile one branch *
575     *************************************************/
576    
577     /* Scan the pattern, compiling it into the code vector.
578    
579     Arguments:
580     options the option bits
581     bracket points to number of brackets used
582     code points to the pointer to the current code point
583     ptrptr points to the current pattern pointer
584     errorptr points to pointer to error message
585    
586     Returns: TRUE on success
587     FALSE, with *errorptr set on error
588     */
589    
590     static BOOL
591     compile_branch(int options, int *brackets, uschar **codeptr, uschar **ptrptr,
592     char **errorptr)
593     {
594     int repeat_type, op_type;
595     int repeat_min, repeat_max;
596     int bravalue, length;
597     register int c;
598     register uschar *code = *codeptr;
599     uschar *ptr = *ptrptr;
600     uschar *previous = NULL;
601     uschar *oldptr;
602     uschar class[32];
603    
604     /* Switch on next character until the end of the branch */
605    
606     for (;; ptr++)
607     {
608     BOOL negate_class;
609     int class_charcount;
610     int class_lastchar;
611    
612     c = *ptr;
613     if ((options & PCRE_EXTENDED) != 0)
614     {
615     if ((pcre_ctypes[c] & ctype_space) != 0) continue;
616     if (c == '#')
617     {
618     while ((c = *(++ptr)) != 0 && c != '\n');
619     continue;
620     }
621     }
622    
623     switch(c)
624     {
625     /* The branch terminates at end of string, |, or ). */
626    
627     case 0:
628     case '|':
629     case ')':
630     *codeptr = code;
631     *ptrptr = ptr;
632     return TRUE;
633    
634     /* Handle single-character metacharacters */
635    
636     case '^':
637     previous = NULL;
638     *code++ = OP_CIRC;
639     break;
640    
641     case '$':
642     previous = NULL;
643     *code++ = OP_DOLL;
644     break;
645    
646     case '.':
647     previous = code;
648     *code++ = OP_ANY;
649     break;
650    
651     /* Character classes. These always build a 32-byte bitmap of the permitted
652     characters, except in the special case where there is only one character.
653     For negated classes, we build the map as usual, then invert it at the end.
654     */
655    
656     case '[':
657     previous = code;
658     *code++ = OP_CLASS;
659    
660     /* If the first character is '^', set the negation flag */
661    
662     if ((c = *(++ptr)) == '^')
663     {
664     negate_class = TRUE;
665     c = *(++ptr);
666     }
667     else negate_class = FALSE;
668    
669     /* Keep a count of chars so that we can optimize the case of just a single
670     character. */
671    
672     class_charcount = 0;
673     class_lastchar = -1;
674    
675     /* Initialize the 32-char bit map to all zeros. We have to build the
676     map in a temporary bit of store, in case the class contains only 1
677     character, because in that case the compiled code doesn't use the
678     bit map. */
679    
680     memset(class, 0, 32 * sizeof(uschar));
681    
682     /* Process characters until ] is reached. By writing this as a "do" it
683     means that an initial ] is taken as a data character. */
684    
685     do
686     {
687     if (c == 0)
688     {
689     *errorptr = ERR6;
690     goto FAILED;
691     }
692    
693     /* Backslash may introduce a single character, or it may introduce one
694     of the specials, which just set a flag. Escaped items are checked for
695     validity in the pre-compiling pass. The sequence \b is a special case.
696     Inside a class (and only there) it is treated as backslash. Elsewhere
697     it marks a word boundary. Other escapes have preset maps ready to
698     or into the one we are building. We assume they have more than one
699     character in them, so set class_count bigger than one. */
700    
701     if (c == '\\')
702     {
703     c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
704     if (-c == ESC_b) c = '\b';
705     else if (c < 0)
706     {
707     class_charcount = 10;
708     switch (-c)
709     {
710     case ESC_d:
711     for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit];
712     continue;
713    
714     case ESC_D:
715     for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit];
716     continue;
717    
718     case ESC_w:
719     for (c = 0; c < 32; c++)
720     class[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
721     continue;
722    
723     case ESC_W:
724     for (c = 0; c < 32; c++)
725     class[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
726     continue;
727    
728     case ESC_s:
729     for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space];
730     continue;
731    
732     case ESC_S:
733     for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space];
734     continue;
735    
736     default:
737     *errorptr = ERR7;
738     goto FAILED;
739     }
740     }
741     /* Fall through if single character */
742     }
743    
744     /* A single character may be followed by '-' to form a range. However,
745     Perl does not permit ']' to be the end of the range. A '-' character
746     here is treated as a literal. */
747    
748     if (ptr[1] == '-' && ptr[2] != ']')
749     {
750     int d;
751     ptr += 2;
752     d = *ptr;
753    
754     if (d == 0)
755     {
756     *errorptr = ERR6;
757     goto FAILED;
758     }
759    
760     /* The second part of a range can be a single-character escape, but
761     not any of the other escapes. */
762    
763     if (d == '\\')
764     {
765     d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
766     if (d < 0)
767     {
768     if (d == -ESC_b) d = '\b'; else
769     {
770     *errorptr = ERR7;
771     goto FAILED;
772     }
773     }
774     }
775    
776     if (d < c)
777     {
778     *errorptr = ERR8;
779     goto FAILED;
780     }
781    
782     for (; c <= d; c++)
783     {
784     class[c/8] |= (1 << (c&7));
785     if ((options & PCRE_CASELESS) != 0)
786     {
787     int uc = pcre_fcc[c]; /* flip case */
788     class[uc/8] |= (1 << (uc&7));
789     }
790     class_charcount++; /* in case a one-char range */
791     class_lastchar = c;
792     }
793     continue; /* Go get the next char in the class */
794     }
795    
796     /* Handle a lone single character - we can get here for a normal
797     non-escape char, or after \ that introduces a single character. */
798    
799     class [c/8] |= (1 << (c&7));
800     if ((options & PCRE_CASELESS) != 0)
801     {
802     c = pcre_fcc[c]; /* flip case */
803     class[c/8] |= (1 << (c&7));
804     }
805     class_charcount++;
806     class_lastchar = c;
807     }
808    
809     /* Loop until ']' reached; the check for end of string happens inside the
810     loop. This "while" is the end of the "do" above. */
811    
812     while ((c = *(++ptr)) != ']');
813    
814     /* If class_charcount is 1 and class_lastchar is not negative, we saw
815     precisely one character. This doesn't need the whole 32-byte bit map.
816     We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
817     it's negative. */
818    
819     if (class_charcount == 1 && class_lastchar >= 0)
820     {
821     if (negate_class)
822     {
823     code[-1] = OP_NOT;
824     }
825     else
826     {
827     code[-1] = OP_CHARS;
828     *code++ = 1;
829     }
830     *code++ = class_lastchar;
831     }
832    
833     /* Otherwise, negate the 32-byte map if necessary, and copy it into
834     the code vector. */
835    
836     else
837     {
838     if (negate_class)
839     for (c = 0; c < 32; c++) code[c] = ~class[c];
840     else
841     memcpy(code, class, 32);
842     code += 32;
843     }
844     break;
845    
846     /* Various kinds of repeat */
847    
848     case '{':
849     if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
850     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
851     if (*errorptr != NULL) goto FAILED;
852     goto REPEAT;
853    
854     case '*':
855     repeat_min = 0;
856     repeat_max = -1;
857     goto REPEAT;
858    
859     case '+':
860     repeat_min = 1;
861     repeat_max = -1;
862     goto REPEAT;
863    
864     case '?':
865     repeat_min = 0;
866     repeat_max = 1;
867    
868     REPEAT:
869     if (previous == NULL)
870     {
871     *errorptr = ERR9;
872     goto FAILED;
873     }
874    
875     /* If the next character is '?' this is a minimizing repeat. Advance to the
876     next character. */
877    
878     if (ptr[1] == '?') { repeat_type = 1; ptr++; } else repeat_type = 0;
879    
880     /* If the maximum is zero then the minimum must also be zero; Perl allows
881     this case, so we do too - by simply omitting the item altogether. */
882    
883     if (repeat_max == 0) code = previous;
884    
885     /* If previous was a string of characters, chop off the last one and use it
886     as the subject of the repeat. If there was only one character, we can
887     abolish the previous item altogether. */
888    
889     else if (*previous == OP_CHARS)
890     {
891     int len = previous[1];
892     if (len == 1)
893     {
894     c = previous[2];
895     code = previous;
896     }
897     else
898     {
899     c = previous[len+1];
900     previous[1]--;
901     code--;
902     }
903     op_type = 0; /* Use single-char op codes */
904     goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
905     }
906    
907     /* If previous was a single negated character ([^a] or similar), we use
908     one of the special opcodes, replacing it. The code is shared with single-
909     character repeats by adding a suitable offset into repeat_type. */
910    
911     else if ((int)*previous == OP_NOT)
912     {
913     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
914     c = previous[1];
915     code = previous;
916     goto OUTPUT_SINGLE_REPEAT;
917     }
918    
919     /* If previous was a character type match (\d or similar), abolish it and
920     create a suitable repeat item. The code is shared with single-character
921     repeats by adding a suitable offset into repeat_type. */
922    
923     else if ((int)*previous < OP_EOD || *previous == OP_ANY)
924     {
925     op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
926     c = *previous;
927     code = previous;
928    
929     OUTPUT_SINGLE_REPEAT:
930     repeat_type += op_type; /* Combine both values for many cases */
931    
932     /* A minimum of zero is handled either as the special case * or ?, or as
933     an UPTO, with the maximum given. */
934    
935     if (repeat_min == 0)
936     {
937     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
938     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
939     else
940     {
941     *code++ = OP_UPTO + repeat_type;
942     *code++ = repeat_max >> 8;
943     *code++ = (repeat_max & 255);
944     }
945     }
946    
947     /* The case {1,} is handled as the special case + */
948    
949     else if (repeat_min == 1 && repeat_max == -1)
950     *code++ = OP_PLUS + repeat_type;
951    
952     /* The case {n,n} is just an EXACT, while the general case {n,m} is
953     handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
954    
955     else
956     {
957     if (repeat_min != 1)
958     {
959     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
960     *code++ = repeat_min >> 8;
961     *code++ = (repeat_min & 255);
962     }
963    
964     /* If the mininum is 1 and the previous item was a character string,
965     we either have to put back the item that got cancelled if the string
966     length was 1, or add the character back onto the end of a longer
967     string. For a character type nothing need be done; it will just get put
968     back naturally. */
969    
970     else if (*previous == OP_CHARS)
971     {
972     if (code == previous) code += 2; else previous[1]++;
973     }
974    
975     /* Insert an UPTO if the max is greater than the min. */
976    
977     if (repeat_max != repeat_min)
978     {
979     *code++ = c;
980     repeat_max -= repeat_min;
981     *code++ = OP_UPTO + repeat_type;
982     *code++ = repeat_max >> 8;
983     *code++ = (repeat_max & 255);
984     }
985     }
986    
987     /* The character or character type itself comes last in all cases. */
988    
989     *code++ = c;
990     }
991    
992     /* If previous was a character class or a back reference, we put the repeat
993     stuff after it. */
994    
995     else if (*previous == OP_CLASS || *previous == OP_REF)
996     {
997     if (repeat_min == 0 && repeat_max == -1)
998     *code++ = OP_CRSTAR + repeat_type;
999     else if (repeat_min == 1 && repeat_max == -1)
1000     *code++ = OP_CRPLUS + repeat_type;
1001     else if (repeat_min == 0 && repeat_max == 1)
1002     *code++ = OP_CRQUERY + repeat_type;
1003     else
1004     {
1005     *code++ = OP_CRRANGE + repeat_type;
1006     *code++ = repeat_min >> 8;
1007     *code++ = repeat_min & 255;
1008     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1009     *code++ = repeat_max >> 8;
1010     *code++ = repeat_max & 255;
1011     }
1012     }
1013    
1014     /* If previous was a bracket group, we may have to replicate it in certain
1015     cases. If the maximum repeat count is unlimited, check that the bracket
1016     group cannot match the empty string, and diagnose an error if it can. */
1017    
1018     else if ((int)*previous >= OP_BRA)
1019     {
1020     int i;
1021     int length = code - previous;
1022    
1023     if (repeat_max == -1 && could_be_empty(previous))
1024     {
1025     *errorptr = ERR10;
1026     goto FAILED;
1027     }
1028    
1029     /* If the minimum is greater than zero, and the maximum is unlimited or
1030     equal to the minimum, the first copy remains where it is, and is
1031     replicated up to the minimum number of times. This case includes the +
1032     repeat, but of course no replication is needed in that case. */
1033    
1034     if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1035     {
1036     for (i = 1; i < repeat_min; i++)
1037     {
1038     memcpy(code, previous, length);
1039     code += length;
1040     }
1041     }
1042    
1043     /* If the minimum is zero, stick BRAZERO in front of the first copy.
1044     Then, if there is a fixed upper limit, replicated up to that many times,
1045     sticking BRAZERO in front of all the optional ones. */
1046    
1047     else
1048     {
1049     if (repeat_min == 0)
1050     {
1051     memmove(previous+1, previous, length);
1052     code++;
1053     *previous++ = OP_BRAZERO + repeat_type;
1054     }
1055    
1056     for (i = 1; i < repeat_min; i++)
1057     {
1058     memcpy(code, previous, length);
1059     code += length;
1060     }
1061    
1062     for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1063     {
1064     *code++ = OP_BRAZERO + repeat_type;
1065     memcpy(code, previous, length);
1066     code += length;
1067     }
1068     }
1069    
1070     /* If the maximum is unlimited, set a repeater in the final copy. */
1071    
1072     if (repeat_max == -1) code[-3] = OP_KETRMAX + repeat_type;
1073     }
1074    
1075     /* Else there's some kind of shambles */
1076    
1077     else
1078     {
1079     *errorptr = ERR11;
1080     goto FAILED;
1081     }
1082    
1083     /* In all case we no longer have a previous item. */
1084    
1085     previous = NULL;
1086     break;
1087    
1088    
1089     /* Start of nested bracket sub-expression, or comment or lookahead.
1090     First deal with special things that can come after a bracket; all are
1091     introduced by ?, and the appearance of any of them means that this is not a
1092     referencing group. They were checked for validity in the first pass over
1093     the string, so we don't have to check for syntax errors here. */
1094    
1095     case '(':
1096     previous = code; /* Only real brackets can be repeated */
1097     if (*(++ptr) == '?')
1098     {
1099     bravalue = OP_BRA;
1100    
1101     switch (*(++ptr))
1102     {
1103     case '#':
1104     case 'i':
1105     case 'm':
1106     case 's':
1107     case 'x':
1108     ptr++;
1109     while (*ptr != ')') ptr++;
1110     previous = NULL;
1111     continue;
1112    
1113     case ':': /* Non-extracting bracket */
1114     ptr++;
1115     break;
1116    
1117     case '=': /* Assertions can't be repeated */
1118     bravalue = OP_ASSERT;
1119     ptr++;
1120     previous = NULL;
1121     break;
1122    
1123     case '!':
1124     bravalue = OP_ASSERT_NOT;
1125     ptr++;
1126     previous = NULL;
1127     break;
1128    
1129     case '>': /* "Match once" brackets */
1130     if ((options & PCRE_EXTRA) != 0) /* Not yet standard */
1131     {
1132     bravalue = OP_ONCE;
1133     ptr++;
1134     previous = NULL;
1135     break;
1136     }
1137     /* Else fall through */
1138    
1139     default:
1140     *errorptr = ERR12;
1141     goto FAILED;
1142     }
1143     }
1144    
1145     /* Else we have a referencing group */
1146    
1147     else
1148     {
1149     if (++(*brackets) > EXTRACT_MAX)
1150     {
1151     *errorptr = ERR13;
1152     goto FAILED;
1153     }
1154     bravalue = OP_BRA + *brackets;
1155     }
1156    
1157     /* Process nested bracketed re; at end pointer is on the bracket. We copy
1158     code into a non-register variable in order to be able to pass its address
1159     because some compilers complain otherwise. */
1160    
1161     *code = bravalue;
1162     {
1163     uschar *mcode = code;
1164     if (!compile_regex(options, brackets, &mcode, &ptr, errorptr))
1165     goto FAILED;
1166     code = mcode;
1167     }
1168    
1169     if (*ptr != ')')
1170     {
1171     *errorptr = ERR14;
1172     goto FAILED;
1173     }
1174     break;
1175    
1176     /* Check \ for being a real metacharacter; if not, fall through and handle
1177     it as a data character at the start of a string. Escape items are checked
1178     for validity in the pre-compiling pass. */
1179    
1180     case '\\':
1181     oldptr = ptr;
1182     c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
1183    
1184     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1185     are arranged to be the negation of the corresponding OP_values. For the
1186     back references, the values are ESC_REF plus the reference number. Only
1187     back references and those types that consume a character may be repeated.
1188     We can test for values between ESC_b and ESC_Z for the latter; this may
1189     have to change if any new ones are ever created. */
1190    
1191     if (c < 0)
1192     {
1193     if (-c >= ESC_REF)
1194     {
1195     int refnum = -c - ESC_REF;
1196     if (*brackets < refnum)
1197     {
1198     *errorptr = ERR15;
1199     goto FAILED;
1200     }
1201     previous = code;
1202     *code++ = OP_REF;
1203     *code++ = refnum;
1204     }
1205     else
1206     {
1207     previous = (-c > ESC_b && -c < ESC_X)? code : NULL;
1208     *code++ = -c;
1209     }
1210     continue;
1211     }
1212    
1213     /* Reset and fall through */
1214    
1215     ptr = oldptr;
1216     c = '\\';
1217    
1218     /* Handle a run of data characters until a metacharacter is encountered.
1219     The first character is guaranteed not to be whitespace or # when the
1220     extended flag is set. */
1221    
1222     NORMAL_CHAR:
1223     default:
1224     previous = code;
1225     *code = OP_CHARS;
1226     code += 2;
1227     length = 0;
1228    
1229     do
1230     {
1231     if ((options & PCRE_EXTENDED) != 0)
1232     {
1233     if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1234     if (c == '#')
1235     {
1236     while ((c = *(++ptr)) != 0 && c != '\n');
1237     if (c == 0) break;
1238     continue;
1239     }
1240     }
1241    
1242     /* Backslash may introduce a data char or a metacharacter. Escaped items
1243     are checked for validity in the pre-compiling pass. Stop the string
1244     before a metaitem. */
1245    
1246     if (c == '\\')
1247     {
1248     oldptr = ptr;
1249     c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
1250     if (c < 0) { ptr = oldptr; break; }
1251     }
1252    
1253     /* Ordinary character or single-char escape */
1254    
1255     *code++ = c;
1256     length++;
1257     }
1258    
1259     /* This "while" is the end of the "do" above. */
1260    
1261     while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
1262    
1263     /* Compute the length and set it in the data vector, and advance to
1264     the next state. */
1265    
1266     previous[1] = length;
1267     ptr--;
1268     break;
1269     }
1270     } /* end of big loop */
1271    
1272     /* Control never reaches here by falling through, only by a goto for all the
1273     error states. Pass back the position in the pattern so that it can be displayed
1274     to the user for diagnosing the error. */
1275    
1276     FAILED:
1277     *ptrptr = ptr;
1278     return FALSE;
1279     }
1280    
1281    
1282    
1283    
1284     /*************************************************
1285     * Compile sequence of alternatives *
1286     *************************************************/
1287    
1288     /* On entry, ptr is pointing past the bracket character, but on return
1289     it points to the closing bracket, or vertical bar, or end of string.
1290     The code variable is pointing at the byte into which the BRA operator has been
1291     stored.
1292    
1293     Argument:
1294     options the option bits
1295     brackets -> int containing the number of extracting brackets used
1296     codeptr -> the address of the current code pointer
1297     ptrptr -> the address of the current pattern pointer
1298     errorptr -> pointer to error message
1299    
1300     Returns: TRUE on success
1301     */
1302    
1303     static BOOL
1304     compile_regex(int options, int *brackets, uschar **codeptr, uschar **ptrptr,
1305     char **errorptr)
1306     {
1307     uschar *ptr = *ptrptr;
1308     uschar *code = *codeptr;
1309     uschar *start_bracket = code;
1310    
1311     for (;;)
1312     {
1313     int length;
1314     uschar *last_branch = code;
1315    
1316     code += 3;
1317     if (!compile_branch(options, brackets, &code, &ptr, errorptr))
1318     {
1319     *ptrptr = ptr;
1320     return FALSE;
1321     }
1322    
1323     /* Fill in the length of the last branch */
1324    
1325     length = code - last_branch;
1326     last_branch[1] = length >> 8;
1327     last_branch[2] = length & 255;
1328    
1329     /* Reached end of expression, either ')' or end of pattern. Insert a
1330     terminating ket and the length of the whole bracketed item, and return,
1331     leaving the pointer at the terminating char. */
1332    
1333     if (*ptr != '|')
1334     {
1335     length = code - start_bracket;
1336     *code++ = OP_KET;
1337     *code++ = length >> 8;
1338     *code++ = length & 255;
1339     *codeptr = code;
1340     *ptrptr = ptr;
1341     return TRUE;
1342     }
1343    
1344     /* Another branch follows; insert an "or" node and advance the pointer. */
1345    
1346     *code = OP_ALT;
1347     ptr++;
1348     }
1349     /* Control never reaches here */
1350     }
1351    
1352    
1353    
1354     /*************************************************
1355     * Check for anchored expression *
1356     *************************************************/
1357    
1358     /* Try to find out if this is an anchored regular expression. Consider each
1359     alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
1360     all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
1361     it's anchored. However, if this is a multiline pattern, then only OP_SOD
1362     counts, since OP_CIRC can match in the middle.
1363    
1364     A branch is also implicitly anchored if it starts with .* because that will try
1365     the rest of the pattern at all possible matching points, so there is no point
1366     trying them again.
1367    
1368     Argument: points to start of expression (the bracket)
1369     Returns: TRUE or FALSE
1370     */
1371    
1372     static BOOL
1373     is_anchored(register uschar *code, BOOL multiline)
1374     {
1375     do {
1376     int op = (int)code[3];
1377     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE)
1378     { if (!is_anchored(code+3, multiline)) return FALSE; }
1379     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
1380     { if (code[4] != OP_ANY) return FALSE; }
1381     else if (op != OP_SOD && (multiline || op != OP_CIRC)) return FALSE;
1382     code += (code[1] << 8) + code[2];
1383     }
1384     while (*code == OP_ALT);
1385     return TRUE;
1386     }
1387    
1388    
1389    
1390     /*************************************************
1391     * Check for start with \n line expression *
1392     *************************************************/
1393    
1394     /* This is called for multiline expressions to try to find out if every branch
1395     starts with ^ so that "first char" processing can be done to speed things up.
1396    
1397     Argument: points to start of expression (the bracket)
1398     Returns: TRUE or FALSE
1399     */
1400    
1401     static BOOL
1402     is_startline(uschar *code)
1403     {
1404     do {
1405     if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
1406     { if (!is_startline(code+3)) return FALSE; }
1407     else if (code[3] != OP_CIRC) return FALSE;
1408     code += (code[1] << 8) + code[2];
1409     }
1410     while (*code == OP_ALT);
1411     return TRUE;
1412     }
1413    
1414    
1415    
1416     /*************************************************
1417     * Check for fixed first char *
1418     *************************************************/
1419    
1420     /* Try to find out if there is a fixed first character. This is called for
1421     unanchored expressions, as it speeds up their processing quite considerably.
1422     Consider each alternative branch. If they all start with the same char, or with
1423     a bracket all of whose alternatives start with the same char (recurse ad lib),
1424     then we return that char, otherwise -1.
1425    
1426     Argument: points to start of expression (the bracket)
1427     Returns: -1 or the fixed first char
1428     */
1429    
1430     static int
1431     find_firstchar(uschar *code)
1432     {
1433     register int c = -1;
1434     do
1435     {
1436     register int charoffset = 4;
1437    
1438     if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
1439     {
1440     register int d;
1441     if ((d = find_firstchar(code+3)) < 0) return -1;
1442     if (c < 0) c = d; else if (c != d) return -1;
1443     }
1444    
1445     else switch(code[3])
1446     {
1447     default:
1448     return -1;
1449    
1450     case OP_EXACT: /* Fall through */
1451     charoffset++;
1452    
1453     case OP_CHARS: /* Fall through */
1454     charoffset++;
1455    
1456     case OP_PLUS:
1457     case OP_MINPLUS:
1458     if (c < 0) c = code[charoffset]; else if (c != code[charoffset]) return -1;
1459     break;
1460     }
1461     code += (code[1] << 8) + code[2];
1462     }
1463     while (*code == OP_ALT);
1464     return c;
1465     }
1466    
1467    
1468    
1469     /*************************************************
1470     * Compile a Regular Expression *
1471     *************************************************/
1472    
1473     /* This function takes a string and returns a pointer to a block of store
1474     holding a compiled version of the expression.
1475    
1476     Arguments:
1477     pattern the regular expression
1478     options various option bits
1479     errorptr pointer to pointer to error text
1480     erroroffset ptr offset in pattern where error was detected
1481    
1482     Returns: pointer to compiled data block, or NULL on error,
1483     with errorptr and erroroffset set
1484     */
1485    
1486     pcre *
1487     pcre_compile(const char *pattern, int options, char **errorptr,
1488     int *erroroffset)
1489     {
1490     real_pcre *re;
1491     int spaces = 0;
1492     int length = 3; /* For initial BRA plus length */
1493     int runlength;
1494     int c, size;
1495     int bracount = 0;
1496     int brastack[200];
1497     int brastackptr = 0;
1498     int top_backref = 0;
1499     uschar *code, *ptr;
1500    
1501     #ifdef DEBUG
1502     uschar *code_base, *code_end;
1503     #endif
1504    
1505     /* We can't pass back an error message if errorptr is NULL; I guess the best we
1506     can do is just return NULL. */
1507    
1508     if (errorptr == NULL) return NULL;
1509     *errorptr = NULL;
1510    
1511     /* However, we can give a message for this error */
1512    
1513     if (erroroffset == NULL)
1514     {
1515     *errorptr = ERR16;
1516     return NULL;
1517     }
1518     *erroroffset = 0;
1519    
1520     if ((options & ~PUBLIC_OPTIONS) != 0)
1521     {
1522     *errorptr = ERR17;
1523     return NULL;
1524     }
1525    
1526     #ifdef DEBUG
1527     printf("------------------------------------------------------------------\n");
1528     printf("%s\n", pattern);
1529     #endif
1530    
1531     /* The first thing to do is to make a pass over the pattern to compute the
1532     amount of store required to hold the compiled code. This does not have to be
1533     perfect as long as errors are overestimates. At the same time we can detect any
1534     internal flag settings. Make an attempt to correct for any counted white space
1535     if an "extended" flag setting appears late in the pattern. We can't be so
1536     clever for #-comments. */
1537    
1538     ptr = (uschar *)(pattern - 1);
1539     while ((c = *(++ptr)) != 0)
1540     {
1541     int min, max;
1542     int class_charcount;
1543    
1544     if ((pcre_ctypes[c] & ctype_space) != 0)
1545     {
1546     if ((options & PCRE_EXTENDED) != 0) continue;
1547     spaces++;
1548     }
1549    
1550     if (c == '#' && (options & PCRE_EXTENDED) != 0)
1551     {
1552     while ((c = *(++ptr)) != 0 && c != '\n');
1553     continue;
1554     }
1555    
1556     switch(c)
1557     {
1558     /* A backslashed item may be an escaped "normal" character or a
1559     character type. For a "normal" character, put the pointers and
1560     character back so that tests for whitespace etc. in the input
1561     are done correctly. */
1562    
1563     case '\\':
1564     {
1565     uschar *save_ptr = ptr;
1566     c = check_escape(&ptr, errorptr, bracount, options, FALSE);
1567     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1568     if (c >= 0)
1569     {
1570     ptr = save_ptr;
1571     c = '\\';
1572     goto NORMAL_CHAR;
1573     }
1574     }
1575     length++;
1576    
1577     /* A back reference needs an additional char, plus either one or 5
1578     bytes for a repeat. We also need to keep the value of the highest
1579     back reference. */
1580    
1581     if (c <= -ESC_REF)
1582     {
1583     int refnum = -c - ESC_REF;
1584     if (refnum > top_backref) top_backref = refnum;
1585     length++; /* For single back reference */
1586     if (ptr[1] == '{' && is_counted_repeat(ptr+2))
1587     {
1588     ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
1589     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1590     if ((min == 0 && (max == 1 || max == -1)) ||
1591     (min == 1 && max == -1))
1592     length++;
1593     else length += 5;
1594     if (ptr[1] == '?') ptr++;
1595     }
1596     }
1597     continue;
1598    
1599     case '^':
1600     case '.':
1601     case '$':
1602     case '*': /* These repeats won't be after brackets; */
1603     case '+': /* those are handled separately */
1604     case '?':
1605     length++;
1606     continue;
1607    
1608     /* This covers the cases of repeats after a single char, metachar, class,
1609     or back reference. */
1610    
1611     case '{':
1612     if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
1613     ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
1614     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1615     if ((min == 0 && (max == 1 || max == -1)) ||
1616     (min == 1 && max == -1))
1617     length++;
1618     else
1619     {
1620     length--; /* Uncount the original char or metachar */
1621     if (min == 1) length++; else if (min > 0) length += 4;
1622     if (max > 0) length += 4; else length += 2;
1623     }
1624     if (ptr[1] == '?') ptr++;
1625     continue;
1626    
1627     /* An alternation contains an offset to the next branch or ket. */
1628     case '|':
1629     length += 3;
1630     continue;
1631    
1632     /* A character class uses 33 characters. Don't worry about character types
1633     that aren't allowed in classes - they'll get picked up during the compile.
1634     A character class that contains only one character uses 2 or 3 bytes,
1635     depending on whether it is negated or not. Notice this where we can. */
1636    
1637     case '[':
1638     class_charcount = 0;
1639     if (*(++ptr) == '^') ptr++;
1640     do
1641     {
1642     if (*ptr == '\\')
1643     {
1644     int c = check_escape(&ptr, errorptr, bracount, options, TRUE);
1645     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1646     if (-c == ESC_b) class_charcount++; else class_charcount = 10;
1647     }
1648     else class_charcount++;
1649     ptr++;
1650     }
1651     while (*ptr != 0 && *ptr != ']');
1652    
1653     /* Repeats for negated single chars are handled by the general code */
1654    
1655     if (class_charcount == 1) length += 3; else
1656     {
1657     length += 33;
1658    
1659     /* A repeat needs either 1 or 5 bytes. */
1660    
1661     if (ptr[1] == '{' && is_counted_repeat(ptr+2))
1662     {
1663     ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
1664     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1665     if ((min == 0 && (max == 1 || max == -1)) ||
1666     (min == 1 && max == -1))
1667     length++;
1668     else length += 5;
1669     if (ptr[1] == '?') ptr++;
1670     }
1671     }
1672     continue;
1673    
1674     /* Brackets may be genuine groups or special things */
1675    
1676     case '(':
1677    
1678     /* Handle special forms of bracket, which all start (? */
1679    
1680     if (ptr[1] == '?') switch (c = ptr[2])
1681     {
1682     /* Skip over comments entirely */
1683     case '#':
1684     ptr += 3;
1685     while (*ptr != 0 && *ptr != ')') ptr++;
1686     if (*ptr == 0)
1687     {
1688     *errorptr = ERR18;
1689     goto PCRE_ERROR_RETURN;
1690     }
1691     continue;
1692    
1693     /* Non-referencing groups and lookaheads just move the pointer on, and
1694     then behave like a non-special bracket, except that they don't increment
1695     the count of extracting brackets. */
1696    
1697     case ':':
1698     case '=':
1699     case '!':
1700     ptr += 2;
1701     break;
1702    
1703     /* Ditto for the "once only" bracket, allowed only if the extra bit
1704     is set. */
1705    
1706     case '>':
1707     if ((options & PCRE_EXTRA) != 0)
1708     {
1709     ptr += 2;
1710     break;
1711     }
1712     /* Else fall thourh */
1713    
1714     /* Else loop setting valid options until ) is met. Anything else is an
1715     error. */
1716    
1717     default:
1718     ptr += 2;
1719     for (;; ptr++)
1720     {
1721     if ((c = *ptr) == 'i')
1722     {
1723     options |= PCRE_CASELESS;
1724     continue;
1725     }
1726     else if ((c = *ptr) == 'm')
1727     {
1728     options |= PCRE_MULTILINE;
1729     continue;
1730     }
1731     else if (c == 's')
1732     {
1733     options |= PCRE_DOTALL;
1734     continue;
1735     }
1736     else if (c == 'x')
1737     {
1738     options |= PCRE_EXTENDED;
1739     length -= spaces; /* Already counted spaces */
1740     continue;
1741     }
1742     else if (c == ')') break;
1743    
1744     *errorptr = ERR12;
1745     goto PCRE_ERROR_RETURN;
1746     }
1747     continue; /* End of this bracket handling */
1748     }
1749    
1750     /* Extracting brackets must be counted so we can process escapes in a
1751     Perlish way. */
1752    
1753     else bracount++;
1754    
1755     /* Non-special forms of bracket. Save length for computing whole length
1756     at end if there's a repeat that requires duplication of the group. */
1757    
1758     if (brastackptr >= sizeof(brastack)/sizeof(int))
1759     {
1760     *errorptr = ERR19;
1761     goto PCRE_ERROR_RETURN;
1762     }
1763    
1764     brastack[brastackptr++] = length;
1765     length += 3;
1766     continue;
1767    
1768     /* Handle ket. Look for subsequent max/min; for certain sets of values we
1769     have to replicate this bracket up to that many times. */
1770    
1771     case ')':
1772     length += 3;
1773     {
1774     int min = 1;
1775     int max = 1;
1776     int duplength = length - brastack[--brastackptr];
1777    
1778     /* Leave ptr at the final char; for read_repeat_counts this happens
1779     automatically; for the others we need an increment. */
1780    
1781     if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
1782     {
1783     ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
1784     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1785     }
1786     else if (c == '*') { min = 0; max = -1; ptr++; }
1787     else if (c == '+') { max = -1; ptr++; }
1788     else if (c == '?') { min = 0; ptr++; }
1789    
1790     /* If there is a minimum > 1 we have to replicate up to min-1 times; if
1791     there is a limited maximum we have to replicate up to max-1 times and
1792     allow for a BRAZERO item before each optional copy, as we also have to
1793     do before the first copy if the minimum is zero. */
1794    
1795     if (min == 0) length++;
1796     else if (min > 1) length += (min - 1) * duplength;
1797     if (max > min) length += (max - min) * (duplength + 1);
1798     }
1799    
1800     continue;
1801    
1802     /* Non-special character. For a run of such characters the length required
1803     is the number of characters + 2, except that the maximum run length is 255.
1804     We won't get a skipped space or a non-data escape or the start of a #
1805     comment as the first character, so the length can't be zero. */
1806    
1807     NORMAL_CHAR:
1808     default:
1809     length += 2;
1810     runlength = 0;
1811     do
1812     {
1813     if ((pcre_ctypes[c] & ctype_space) != 0)
1814     {
1815     if ((options & PCRE_EXTENDED) != 0) continue;
1816     spaces++;
1817     }
1818    
1819     if (c == '#' && (options & PCRE_EXTENDED) != 0)
1820     {
1821     while ((c = *(++ptr)) != 0 && c != '\n');
1822     continue;
1823     }
1824    
1825     /* Backslash may introduce a data char or a metacharacter; stop the
1826     string before the latter. */
1827    
1828     if (c == '\\')
1829     {
1830     uschar *saveptr = ptr;
1831     c = check_escape(&ptr, errorptr, bracount, options, FALSE);
1832     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1833     if (c < 0) { ptr = saveptr; break; }
1834     }
1835    
1836     /* Ordinary character or single-char escape */
1837    
1838     runlength++;
1839     }
1840    
1841     /* This "while" is the end of the "do" above. */
1842    
1843     while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
1844    
1845     ptr--;
1846     length += runlength;
1847     continue;
1848     }
1849     }
1850    
1851     length += 4; /* For final KET and END */
1852    
1853     if (length > 65539)
1854     {
1855     *errorptr = ERR20;
1856     return NULL;
1857     }
1858    
1859     /* Compute the size of data block needed and get it, either from malloc or
1860     externally provided function. Put in the magic number and the options. */
1861    
1862     size = length + offsetof(real_pcre, code);
1863     re = (real_pcre *)(pcre_malloc)(size);
1864    
1865     if (re == NULL)
1866     {
1867     *errorptr = ERR21;
1868     return NULL;
1869     }
1870    
1871     re->magic_number = MAGIC_NUMBER;
1872     re->options = options;
1873    
1874     /* Set up a starting, non-extracting bracket, then compile the expression. On
1875     error, *errorptr will be set non-NULL, so we don't need to look at the result
1876     of the function here. */
1877    
1878     ptr = (uschar *)pattern;
1879     code = re->code;
1880     *code = OP_BRA;
1881     bracount = 0;
1882     (void)compile_regex(options, &bracount, &code, &ptr, errorptr);
1883     re->top_bracket = bracount;
1884     re->top_backref = top_backref;
1885    
1886     /* If not reached end of pattern on success, there's an excess bracket. */
1887    
1888     if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
1889    
1890     /* Fill in the terminating state and check for disastrous overflow, but
1891     if debugging, leave the test till after things are printed out. */
1892    
1893     *code++ = OP_END;
1894    
1895     #ifndef DEBUG
1896     if (code - re->code > length) *errorptr = ERR23;
1897     #endif
1898    
1899     /* Failed to compile */
1900    
1901     if (*errorptr != NULL)
1902     {
1903     (pcre_free)(re);
1904     PCRE_ERROR_RETURN:
1905     *erroroffset = ptr - (uschar *)pattern;
1906     return NULL;
1907     }
1908    
1909     /* If the anchored option was not passed, set flag if we can determine that it
1910     is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
1911     we can determine what the first character has to be, because that speeds up
1912     unanchored matches no end. In the case of multiline matches, an alternative is
1913     to set the PCRE_STARTLINE flag if all branches start with ^. */
1914    
1915     if ((options & PCRE_ANCHORED) == 0)
1916     {
1917     if (is_anchored(re->code, (options & PCRE_MULTILINE) != 0))
1918     re->options |= PCRE_ANCHORED;
1919     else
1920     {
1921     int c = find_firstchar(re->code);
1922     if (c >= 0)
1923     {
1924     re->first_char = c;
1925     re->options |= PCRE_FIRSTSET;
1926     }
1927     else if (is_startline(re->code))
1928     re->options |= PCRE_STARTLINE;
1929     }
1930     }
1931    
1932     /* Print out the compiled data for debugging */
1933    
1934     #ifdef DEBUG
1935    
1936     printf("Length = %d top_bracket = %d top_backref=%d\n",
1937     length, re->top_bracket, re->top_backref);
1938    
1939     if (re->options != 0)
1940     {
1941     printf("%s%s%s%s%s%s%s\n",
1942     ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
1943     ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
1944     ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
1945     ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
1946     ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
1947     ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
1948     ((re->options & PCRE_EXTRA) != 0)? "extra " : "");
1949     }
1950    
1951     if ((re->options & PCRE_FIRSTSET) != 0)
1952     {
1953     if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
1954     else printf("First char = \\x%02x\n", re->first_char);
1955     }
1956    
1957     code_end = code;
1958     code_base = code = re->code;
1959    
1960     while (code < code_end)
1961     {
1962     int charlength;
1963    
1964     printf("%3d ", code - code_base);
1965    
1966     if (*code >= OP_BRA)
1967     {
1968     printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
1969     code += 2;
1970     }
1971    
1972     else switch(*code)
1973     {
1974     case OP_CHARS:
1975     charlength = *(++code);
1976     printf("%3d ", charlength);
1977     while (charlength-- > 0)
1978     if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
1979     break;
1980    
1981     case OP_KETRMAX:
1982     case OP_KETRMIN:
1983     case OP_ALT:
1984     case OP_KET:
1985     case OP_ASSERT:
1986     case OP_ASSERT_NOT:
1987     case OP_ONCE:
1988     printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
1989     code += 2;
1990     break;
1991    
1992     case OP_STAR:
1993     case OP_MINSTAR:
1994     case OP_PLUS:
1995     case OP_MINPLUS:
1996     case OP_QUERY:
1997     case OP_MINQUERY:
1998     case OP_TYPESTAR:
1999     case OP_TYPEMINSTAR:
2000     case OP_TYPEPLUS:
2001     case OP_TYPEMINPLUS:
2002     case OP_TYPEQUERY:
2003     case OP_TYPEMINQUERY:
2004     if (*code >= OP_TYPESTAR)
2005     printf(" %s", OP_names[code[1]]);
2006     else if (isprint(c = code[1])) printf(" %c", c);
2007     else printf(" \\x%02x", c);
2008     printf("%s", OP_names[*code++]);
2009     break;
2010    
2011     case OP_EXACT:
2012     case OP_UPTO:
2013     case OP_MINUPTO:
2014     if (isprint(c = code[3])) printf(" %c{", c);
2015     else printf(" \\x%02x{", c);
2016     if (*code != OP_EXACT) printf(",");
2017     printf("%d}", (code[1] << 8) + code[2]);
2018     if (*code == OP_MINUPTO) printf("?");
2019     code += 3;
2020     break;
2021    
2022     case OP_TYPEEXACT:
2023     case OP_TYPEUPTO:
2024     case OP_TYPEMINUPTO:
2025     printf(" %s{", OP_names[code[3]]);
2026     if (*code != OP_TYPEEXACT) printf(",");
2027     printf("%d}", (code[1] << 8) + code[2]);
2028     if (*code == OP_TYPEMINUPTO) printf("?");
2029     code += 3;
2030     break;
2031    
2032     case OP_NOT:
2033     if (isprint(c = *(++code))) printf(" [^%c]", c);
2034     else printf(" [^\\x%02x]", c);
2035     break;
2036    
2037     case OP_NOTSTAR:
2038     case OP_NOTMINSTAR:
2039     case OP_NOTPLUS:
2040     case OP_NOTMINPLUS:
2041     case OP_NOTQUERY:
2042     case OP_NOTMINQUERY:
2043     if (isprint(c = code[1])) printf(" [^%c]", c);
2044     else printf(" [^\\x%02x]", c);
2045     printf("%s", OP_names[*code++]);
2046     break;
2047    
2048     case OP_NOTEXACT:
2049     case OP_NOTUPTO:
2050     case OP_NOTMINUPTO:
2051     if (isprint(c = code[3])) printf(" [^%c]{", c);
2052     else printf(" [^\\x%02x]{", c);
2053     if (*code != OP_NOTEXACT) printf(",");
2054     printf("%d}", (code[1] << 8) + code[2]);
2055     if (*code == OP_NOTMINUPTO) printf("?");
2056     code += 3;
2057     break;
2058    
2059     case OP_REF:
2060     printf(" \\%d", *(++code));
2061     break;
2062    
2063     case OP_CLASS:
2064     {
2065     int i, min, max;
2066    
2067     code++;
2068     printf(" [");
2069    
2070     for (i = 0; i < 256; i++)
2071     {
2072     if ((code[i/8] & (1 << (i&7))) != 0)
2073     {
2074     int j;
2075     for (j = i+1; j < 256; j++)
2076     if ((code[j/8] & (1 << (j&7))) == 0) break;
2077     if (i == '-' || i == ']') printf("\\");
2078     if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2079     if (--j > i)
2080     {
2081     printf("-");
2082     if (j == '-' || j == ']') printf("\\");
2083     if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2084     }
2085     i = j;
2086     }
2087     }
2088     printf("]");
2089     code += 32;
2090    
2091     switch(*code)
2092     {
2093     case OP_CRSTAR:
2094     case OP_CRMINSTAR:
2095     case OP_CRPLUS:
2096     case OP_CRMINPLUS:
2097     case OP_CRQUERY:
2098     case OP_CRMINQUERY:
2099     printf("%s", OP_names[*code]);
2100     break;
2101    
2102     case OP_CRRANGE:
2103     case OP_CRMINRANGE:
2104     min = (code[1] << 8) + code[2];
2105     max = (code[3] << 8) + code[4];
2106     if (max == 0) printf("{%d,}", min);
2107     else printf("{%d,%d}", min, max);
2108     if (*code == OP_CRMINRANGE) printf("?");
2109     code += 4;
2110     break;
2111    
2112     default:
2113     code--;
2114     }
2115     }
2116     break;
2117    
2118     /* Anything else is just a one-node item */
2119    
2120     default:
2121     printf(" %s", OP_names[*code]);
2122     break;
2123     }
2124    
2125     code++;
2126     printf("\n");
2127     }
2128     printf("------------------------------------------------------------------\n");
2129    
2130     /* This check is done here in the debugging case so that the code that
2131     was compiled can be seen. */
2132    
2133     if (code - re->code > length)
2134     {
2135     *errorptr = ERR23;
2136     (pcre_free)(re);
2137     *erroroffset = ptr - (uschar *)pattern;
2138     return NULL;
2139     }
2140     #endif
2141    
2142     return (pcre *)re;
2143     }
2144    
2145    
2146    
2147     /*************************************************
2148     * Match a character type *
2149     *************************************************/
2150    
2151     /* Not used in all the places it might be as it's sometimes faster
2152     to put the code inline.
2153    
2154     Arguments:
2155     type the character type
2156     c the character
2157     dotall the dotall flag
2158    
2159     Returns: TRUE if character is of the type
2160     */
2161    
2162     static BOOL
2163     match_type(int type, int c, BOOL dotall)
2164     {
2165    
2166     #ifdef DEBUG
2167     if (isprint(c)) printf("matching subject %c against ", c);
2168     else printf("matching subject \\x%02x against ", c);
2169     printf("%s\n", OP_names[type]);
2170     #endif
2171    
2172     switch(type)
2173     {
2174     case OP_ANY: return dotall || c != '\n';
2175     case OP_NOT_DIGIT: return (pcre_ctypes[c] & ctype_digit) == 0;
2176     case OP_DIGIT: return (pcre_ctypes[c] & ctype_digit) != 0;
2177     case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;
2178     case OP_WHITESPACE: return (pcre_ctypes[c] & ctype_space) != 0;
2179     case OP_NOT_WORDCHAR: return (pcre_ctypes[c] & ctype_word) == 0;
2180     case OP_WORDCHAR: return (pcre_ctypes[c] & ctype_word) != 0;
2181     }
2182     return FALSE;
2183     }
2184    
2185    
2186    
2187     /*************************************************
2188     * Match a back-reference *
2189     *************************************************/
2190    
2191     /* If a back reference hasn't been set, the match fails.
2192    
2193     Arguments:
2194     number reference number
2195     eptr points into the subject
2196     length length to be matched
2197     md points to match data block
2198    
2199     Returns: TRUE if matched
2200     */
2201    
2202     static BOOL
2203     match_ref(int number, register uschar *eptr, int length, match_data *md)
2204     {
2205     uschar *p = md->start_subject + md->offset_vector[number];
2206    
2207     #ifdef DEBUG
2208     if (eptr >= md->end_subject)
2209     printf("matching subject <null>");
2210     else
2211     {
2212     printf("matching subject ");
2213     pchars(eptr, length, TRUE, md);
2214     }
2215     printf(" against backref ");
2216     pchars(p, length, FALSE, md);
2217     printf("\n");
2218     #endif
2219    
2220     /* Always fail if not enough characters left */
2221    
2222     if (length > md->end_subject - p) return FALSE;
2223    
2224     /* Separate the caselesss case for speed */
2225    
2226     if (md->caseless)
2227     { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; }
2228     else
2229     { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
2230    
2231     return TRUE;
2232     }
2233    
2234    
2235    
2236     /*************************************************
2237     * Match from current position *
2238     *************************************************/
2239    
2240     /* On entry ecode points to the first opcode, and eptr to the first character.
2241    
2242     Arguments:
2243     eptr pointer in subject
2244     ecode position in code
2245     offset_top current top pointer
2246     md pointer to "static" info for the match
2247    
2248     Returns: TRUE if matched
2249     */
2250    
2251     static BOOL
2252     match(register uschar *eptr, register uschar *ecode, int offset_top,
2253     match_data *md)
2254     {
2255     for (;;)
2256     {
2257     int min, max, ctype;
2258     register int i;
2259     register int c;
2260     BOOL minimize;
2261    
2262     /* Opening bracket. Check the alternative branches in turn, failing if none
2263     match. We have to set the start offset if required and there is space
2264     in the offset vector so that it is available for subsequent back references
2265     if the bracket matches. However, if the bracket fails, we must put back the
2266     previous value of both offsets in case they were set by a previous copy of
2267     the same bracket. Don't worry about setting the flag for the error case here;
2268     that is handled in the code for KET. */
2269    
2270     if ((int)*ecode >= OP_BRA)
2271     {
2272     int number = (*ecode - OP_BRA) << 1;
2273     int save_offset1, save_offset2;
2274    
2275     #ifdef DEBUG
2276     printf("start bracket %d\n", number/2);
2277     #endif
2278    
2279     if (number > 0 && number < md->offset_end)
2280     {
2281     save_offset1 = md->offset_vector[number];
2282     save_offset2 = md->offset_vector[number+1];
2283     md->offset_vector[number] = eptr - md->start_subject;
2284    
2285     #ifdef DEBUG
2286     printf("saving %d %d\n", save_offset1, save_offset2);
2287     #endif
2288     }
2289    
2290     /* Recurse for all the alternatives. */
2291    
2292     do
2293     {
2294     if (match(eptr, ecode+3, offset_top, md)) return TRUE;
2295     ecode += (ecode[1] << 8) + ecode[2];
2296     }
2297     while (*ecode == OP_ALT);
2298    
2299     #ifdef DEBUG
2300     printf("bracket %d failed\n", number/2);
2301     #endif
2302    
2303     if (number > 0 && number < md->offset_end)
2304     {
2305     md->offset_vector[number] = save_offset1;
2306     md->offset_vector[number+1] = save_offset2;
2307     }
2308    
2309     return FALSE;
2310     }
2311    
2312     /* Other types of node can be handled by a switch */
2313    
2314     switch(*ecode)
2315     {
2316     case OP_END:
2317     md->end_match_ptr = eptr; /* Record where we ended */
2318     md->end_offset_top = offset_top; /* and how many extracts were taken */
2319     return TRUE;
2320    
2321     /* The equivalent of Prolog's "cut" - if the rest doesn't match, the
2322     whole thing doesn't match, so we have to get out via a longjmp(). */
2323    
2324     case OP_CUT:
2325     if (match(eptr, ecode+1, offset_top, md)) return TRUE;
2326     longjmp(md->fail_env, 1);
2327    
2328     /* Assertion brackets. Check the alternative branches in turn - the
2329     matching won't pass the KET for an assertion. If any one branch matches,
2330     the assertion is true. */
2331    
2332     case OP_ASSERT:
2333     do
2334     {
2335     if (match(eptr, ecode+3, offset_top, md)) break;
2336     ecode += (ecode[1] << 8) + ecode[2];
2337     }
2338     while (*ecode == OP_ALT);
2339     if (*ecode == OP_KET) return FALSE;
2340    
2341     /* Continue from after the assertion, updating the offsets high water
2342     mark, since extracts may have been taken during the assertion. */
2343    
2344     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2345     ecode += 3;
2346     offset_top = md->end_offset_top;
2347     continue;
2348    
2349     /* Negative assertion: all branches must fail to match */
2350    
2351     case OP_ASSERT_NOT:
2352     do
2353     {
2354     if (match(eptr, ecode+3, offset_top, md)) return FALSE;
2355     ecode += (ecode[1] << 8) + ecode[2];
2356     }
2357     while (*ecode == OP_ALT);
2358     ecode += 3;
2359     continue;
2360    
2361     /* "Once" brackets are like assertion brackets except that after a match,
2362     the point in the subject string is not moved back. Thus there can never be
2363     a back into the brackets. Check the alternative branches in turn - the
2364     matching won't pass the KET for this kind of subpattern. If any one branch
2365     matches, we carry on, leaving the subject pointer. */
2366    
2367     case OP_ONCE:
2368     do
2369     {
2370     if (match(eptr, ecode+3, offset_top, md)) break;
2371     ecode += (ecode[1] << 8) + ecode[2];
2372     }
2373     while (*ecode == OP_ALT);
2374     if (*ecode == OP_KET) return FALSE;
2375    
2376     /* Continue as from after the assertion, updating the offsets high water
2377     mark, since extracts may have been taken. */
2378    
2379     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2380     ecode += 3;
2381     offset_top = md->end_offset_top;
2382     eptr = md->end_match_ptr;
2383     continue;
2384    
2385     /* An alternation is the end of a branch; scan along to find the end of the
2386     bracketed group and go to there. */
2387    
2388     case OP_ALT:
2389     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2390     break;
2391    
2392     /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
2393     that it may occur zero times. It may repeat infinitely, or not at all -
2394     i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
2395     repeat limits are compiled as a number of copies, with the optional ones
2396     preceded by BRAZERO or BRAMINZERO. */
2397    
2398     case OP_BRAZERO:
2399     {
2400     uschar *next = ecode+1;
2401     if (match(eptr, next, offset_top, md)) return TRUE;
2402     do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
2403     ecode = next + 3;
2404     }
2405     break;
2406    
2407     case OP_BRAMINZERO:
2408     {
2409     uschar *next = ecode+1;
2410     do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
2411     if (match(eptr, next+3, offset_top, md)) return TRUE;
2412     ecode++;
2413     }
2414     break;;
2415    
2416     /* End of a group, repeated or non-repeating. If we are at the end of
2417     an assertion "group", stop matching and return TRUE, but record the
2418     current high water mark for use by positive assertions. */
2419    
2420     case OP_KET:
2421     case OP_KETRMIN:
2422     case OP_KETRMAX:
2423     {
2424     int number;
2425     uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
2426    
2427     if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || *prev == OP_ONCE)
2428     {
2429     md->end_match_ptr = eptr; /* For ONCE */
2430     md->end_offset_top = offset_top;
2431     return TRUE;
2432     }
2433    
2434     /* In all other cases we have to check the group number back at the
2435     start and if necessary complete handling an extraction by setting the
2436     final offset and bumping the high water mark. */
2437    
2438     number = (*prev - OP_BRA) << 1;
2439    
2440     #ifdef DEBUG
2441     printf("end bracket %d\n", number/2);
2442     #endif
2443    
2444     if (number > 0)
2445     {
2446     if (number >= md->offset_end) md->offset_overflow = TRUE; else
2447     {
2448     md->offset_vector[number+1] = eptr - md->start_subject;
2449     if (offset_top <= number) offset_top = number + 2;
2450     }
2451     }
2452    
2453     /* For a non-repeating ket, just advance to the next node and continue at
2454     this level. */
2455    
2456     if (*ecode == OP_KET)
2457     {
2458     ecode += 3;
2459     break;
2460     }
2461    
2462     /* The repeating kets try the rest of the pattern or restart from the
2463     preceding bracket, in the appropriate order. */
2464    
2465     if (*ecode == OP_KETRMIN)
2466     {
2467     if (match(eptr, ecode+3, offset_top, md) ||
2468     match(eptr, prev, offset_top, md)) return TRUE;
2469     }
2470     else /* OP_KETRMAX */
2471     {
2472     if (match(eptr, prev, offset_top, md) ||
2473     match(eptr, ecode+3, offset_top, md)) return TRUE;
2474     }
2475     }
2476     return FALSE;
2477    
2478     /* Start of subject unless notbol, or after internal newline if multiline */
2479    
2480     case OP_CIRC:
2481     if (md->notbol && eptr == md->start_subject) return FALSE;
2482     if (md->multiline)
2483     {
2484     if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;
2485     ecode++;
2486     break;
2487     }
2488     /* ... else fall through */
2489    
2490     /* Start of subject assertion */
2491    
2492     case OP_SOD:
2493     if (eptr != md->start_subject) return FALSE;
2494     ecode++;
2495     break;
2496    
2497     /* Assert before internal newline if multiline, or before
2498     a terminating newline unless endonly is set, else end of subject unless
2499     noteol is set. */
2500    
2501     case OP_DOLL:
2502     if (md->noteol && eptr >= md->end_subject) return FALSE;
2503     if (md->multiline)
2504     {
2505     if (eptr < md->end_subject && *eptr != '\n') return FALSE;
2506     ecode++;
2507     break;
2508     }
2509     else if (!md->endonly)
2510     {
2511     if (eptr < md->end_subject - 1 ||
2512     (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
2513     ecode++;
2514     break;
2515     }
2516     /* ... else fall through */
2517    
2518     /* End of subject assertion */
2519    
2520     case OP_EOD:
2521     if (eptr < md->end_subject) return FALSE;
2522     ecode++;
2523     break;
2524    
2525     /* Word boundary assertions */
2526    
2527     case OP_NOT_WORD_BOUNDARY:
2528     case OP_WORD_BOUNDARY:
2529     {
2530     BOOL prev_is_word = (eptr != md->start_subject) &&
2531     ((pcre_ctypes[eptr[-1]] & ctype_word) != 0);
2532     BOOL cur_is_word = (eptr < md->end_subject) &&
2533     ((pcre_ctypes[*eptr] & ctype_word) != 0);
2534     if ((*ecode++ == OP_WORD_BOUNDARY)?
2535     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2536     return FALSE;
2537     }
2538     break;
2539    
2540     /* Match a single character type; inline for speed */
2541    
2542     case OP_ANY:
2543     if (!md->dotall && eptr < md->end_subject && *eptr == '\n') return FALSE;
2544     if (eptr++ >= md->end_subject) return FALSE;
2545     ecode++;
2546     break;
2547    
2548     case OP_NOT_DIGIT:
2549     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) != 0)
2550     return FALSE;
2551     ecode++;
2552     break;
2553    
2554     case OP_DIGIT:
2555     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) == 0)
2556     return FALSE;
2557     ecode++;
2558     break;
2559    
2560     case OP_NOT_WHITESPACE:
2561     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) != 0)
2562     return FALSE;
2563     ecode++;
2564     break;
2565    
2566     case OP_WHITESPACE:
2567     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) == 0)
2568     return FALSE;
2569     ecode++;
2570     break;
2571    
2572     case OP_NOT_WORDCHAR:
2573     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) != 0)
2574     return FALSE;
2575     ecode++;
2576     break;
2577    
2578     case OP_WORDCHAR:
2579     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) == 0)
2580     return FALSE;
2581     ecode++;
2582     break;
2583    
2584     /* Match a back reference, possibly repeatedly. Look past the end of the
2585     item to see if there is repeat information following. The code is similar
2586     to that for character classes, but repeated for efficiency. Then obey
2587     similar code to character type repeats - written out again for speed.
2588     However, if the referenced string is the empty string, always treat
2589     it as matched, any number of times (otherwise there could be infinite
2590     loops). */
2591    
2592     case OP_REF:
2593     {
2594     int length;
2595     int number = ecode[1] << 1; /* Doubled reference number */
2596     ecode += 2; /* Advance past the item */
2597    
2598     if (number >= offset_top || md->offset_vector[number] < 0)
2599     {
2600     md->errorcode = PCRE_ERROR_BADREF;
2601     return FALSE;
2602     }
2603    
2604     length = md->offset_vector[number+1] - md->offset_vector[number];
2605    
2606     switch (*ecode)
2607     {
2608     case OP_CRSTAR:
2609     case OP_CRMINSTAR:
2610     case OP_CRPLUS:
2611     case OP_CRMINPLUS:
2612     case OP_CRQUERY:
2613     case OP_CRMINQUERY:
2614     c = *ecode++ - OP_CRSTAR;
2615     minimize = (c & 1) != 0;
2616     min = rep_min[c]; /* Pick up values from tables; */
2617     max = rep_max[c]; /* zero for max => infinity */
2618     if (max == 0) max = INT_MAX;
2619     break;
2620    
2621     case OP_CRRANGE:
2622     case OP_CRMINRANGE:
2623     minimize = (*ecode == OP_CRMINRANGE);
2624     min = (ecode[1] << 8) + ecode[2];
2625     max = (ecode[3] << 8) + ecode[4];
2626     if (max == 0) max = INT_MAX;
2627     ecode += 5;
2628     break;
2629    
2630     default: /* No repeat follows */
2631     if (!match_ref(number, eptr, length, md)) return FALSE;
2632     eptr += length;
2633     continue; /* With the main loop */
2634     }
2635    
2636     /* If the length of the reference is zero, just continue with the
2637     main loop. */
2638    
2639     if (length == 0) continue;
2640    
2641     /* First, ensure the minimum number of matches are present. We get back
2642     the length of the reference string explicitly rather than passing the
2643     address of eptr, so that eptr can be a register variable. */
2644    
2645     for (i = 1; i <= min; i++)
2646     {
2647     if (!match_ref(number, eptr, length, md)) return FALSE;
2648     eptr += length;
2649     }
2650    
2651     /* If min = max, continue at the same level without recursion.
2652     They are not both allowed to be zero. */
2653    
2654     if (min == max) continue;
2655    
2656     /* If minimizing, keep trying and advancing the pointer */
2657    
2658     if (minimize)
2659     {
2660     for (i = min;; i++)
2661     {
2662     if (match(eptr, ecode, offset_top, md)) return TRUE;
2663     if (i >= max || !match_ref(number, eptr, length, md))
2664     return FALSE;
2665     eptr += length;
2666     }
2667     /* Control never gets here */
2668     }
2669    
2670     /* If maximizing, find the longest string and work backwards */
2671    
2672     else
2673     {
2674     uschar *pp = eptr;
2675     for (i = min; i < max; i++)
2676     {
2677     if (!match_ref(number, eptr, length, md)) break;
2678     eptr += length;
2679     }
2680     while (eptr >= pp)
2681     {
2682     if (match(eptr, ecode, offset_top, md)) return TRUE;
2683     eptr -= length;
2684     }
2685     return FALSE;
2686     }
2687     }
2688     /* Control never gets here */
2689    
2690     /* Match a character class, possibly repeatedly. Look past the end of the
2691     item to see if there is repeat information following. Then obey similar
2692     code to character type repeats - written out again for speed. If caseless
2693     matching was set at runtime but not at compile time, we have to check both
2694     versions of a character. */
2695    
2696     case OP_CLASS:
2697     {
2698     uschar *data = ecode + 1; /* Save for matching */
2699     ecode += 33; /* Advance past the item */
2700    
2701     switch (*ecode)
2702     {
2703     case OP_CRSTAR:
2704     case OP_CRMINSTAR:
2705     case OP_CRPLUS:
2706     case OP_CRMINPLUS:
2707     case OP_CRQUERY:
2708     case OP_CRMINQUERY:
2709     c = *ecode++ - OP_CRSTAR;
2710     minimize = (c & 1) != 0;
2711     min = rep_min[c]; /* Pick up values from tables; */
2712     max = rep_max[c]; /* zero for max => infinity */
2713     if (max == 0) max = INT_MAX;
2714     break;
2715    
2716     case OP_CRRANGE:
2717     case OP_CRMINRANGE:
2718     minimize = (*ecode == OP_CRMINRANGE);
2719     min = (ecode[1] << 8) + ecode[2];
2720     max = (ecode[3] << 8) + ecode[4];
2721     if (max == 0) max = INT_MAX;
2722     ecode += 5;
2723     break;
2724    
2725     default: /* No repeat follows */
2726     if (eptr >= md->end_subject) return FALSE;
2727     c = *eptr++;
2728     if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
2729     if (md->runtime_caseless)
2730     {
2731     c = pcre_fcc[c];
2732     if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
2733     }
2734     return FALSE;
2735     }
2736    
2737     /* First, ensure the minimum number of matches are present. */
2738    
2739     for (i = 1; i <= min; i++)
2740     {
2741     if (eptr >= md->end_subject) return FALSE;
2742     c = *eptr++;
2743     if ((data[c/8] & (1 << (c&7))) != 0) continue;
2744     if (md->runtime_caseless)
2745     {
2746     c = pcre_fcc[c];
2747     if ((data[c/8] & (1 << (c&7))) != 0) continue;
2748     }
2749     return FALSE;
2750     }
2751    
2752     /* If max == min we can continue with the main loop without the
2753     need to recurse. */
2754    
2755     if (min == max) continue;
2756    
2757     /* If minimizing, keep testing the rest of the expression and advancing
2758     the pointer while it matches the class. */
2759    
2760     if (minimize)
2761     {
2762     for (i = min;; i++)
2763     {
2764     if (match(eptr, ecode, offset_top, md)) return TRUE;
2765     if (i >= max || eptr >= md->end_subject) return FALSE;
2766     c = *eptr++;
2767     if ((data[c/8] & (1 << (c&7))) != 0) continue;
2768     if (md->runtime_caseless)
2769     {
2770     c = pcre_fcc[c];
2771     if ((data[c/8] & (1 << (c&7))) != 0) continue;
2772     }
2773     return FALSE;
2774     }
2775     /* Control never gets here */
2776     }
2777    
2778     /* If maximizing, find the longest possible run, then work backwards. */
2779    
2780     else
2781     {
2782     uschar *pp = eptr;
2783     for (i = min; i < max; eptr++, i++)
2784     {
2785     if (eptr >= md->end_subject) break;
2786     c = *eptr;
2787     if ((data[c/8] & (1 << (c&7))) != 0) continue;
2788     if (md->runtime_caseless)
2789     {
2790     c = pcre_fcc[c];
2791     if ((data[c/8] & (1 << (c&7))) != 0) continue;
2792     }
2793     break;
2794     }
2795    
2796     while (eptr >= pp)
2797     if (match(eptr--, ecode, offset_top, md)) return TRUE;
2798     return FALSE;
2799     }
2800     }
2801     /* Control never gets here */
2802    
2803     /* Match a run of characters */
2804    
2805     case OP_CHARS:
2806     {
2807     register int length = ecode[1];
2808     ecode += 2;
2809    
2810     #ifdef DEBUG
2811     if (eptr >= md->end_subject)
2812     printf("matching subject <null> against pattern ");
2813     else
2814     {
2815     printf("matching subject ");
2816     pchars(eptr, length, TRUE, md);
2817     printf(" against pattern ");
2818     }
2819     pchars(ecode, length, FALSE, md);
2820     printf("\n");
2821     #endif
2822    
2823     if (length > md->end_subject - eptr) return FALSE;
2824     if (md->caseless)
2825     {
2826     while (length-- > 0) if (pcre_lcc[*ecode++] != pcre_lcc[*eptr++]) return FALSE;
2827     }
2828     else
2829     {
2830     while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
2831     }
2832     }
2833     break;
2834    
2835     /* Match a single character repeatedly; different opcodes share code. */
2836    
2837     case OP_EXACT:
2838     min = max = (ecode[1] << 8) + ecode[2];
2839     ecode += 3;
2840     goto REPEATCHAR;
2841    
2842     case OP_UPTO:
2843     case OP_MINUPTO:
2844     min = 0;
2845     max = (ecode[1] << 8) + ecode[2];
2846     minimize = *ecode == OP_MINUPTO;
2847     ecode += 3;
2848     goto REPEATCHAR;
2849    
2850     case OP_STAR:
2851     case OP_MINSTAR:
2852     case OP_PLUS:
2853     case OP_MINPLUS:
2854     case OP_QUERY:
2855     case OP_MINQUERY:
2856     c = *ecode++ - OP_STAR;
2857     minimize = (c & 1) != 0;
2858     min = rep_min[c]; /* Pick up values from tables; */
2859     max = rep_max[c]; /* zero for max => infinity */
2860     if (max == 0) max = INT_MAX;
2861    
2862     /* Common code for all repeated single-character matches. We can give
2863     up quickly if there are fewer than the minimum number of characters left in
2864     the subject. */
2865    
2866     REPEATCHAR:
2867     if (min > md->end_subject - eptr) return FALSE;
2868     c = *ecode++;
2869    
2870     /* The code is duplicated for the caseless and caseful cases, for speed,
2871     since matching characters is likely to be quite common. First, ensure the
2872     minimum number of matches are present. If min = max, continue at the same
2873     level without recursing. Otherwise, if minimizing, keep trying the rest of
2874     the expression and advancing one matching character if failing, up to the
2875     maximum. Alternatively, if maximizing, find the maximum number of
2876     characters and work backwards. */
2877    
2878     #ifdef DEBUG
2879     printf("matching %c{%d,%d} against subject %.*s\n", c, min, max,
2880     max, eptr);
2881     #endif
2882    
2883     if (md->caseless)
2884     {
2885     c = pcre_lcc[c];
2886     for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) return FALSE;
2887     if (min == max) continue;
2888     if (minimize)
2889     {
2890     for (i = min;; i++)
2891     {
2892     if (match(eptr, ecode, offset_top, md)) return TRUE;
2893     if (i >= max || eptr >= md->end_subject || c != pcre_lcc[*eptr++])
2894     return FALSE;
2895     }
2896     /* Control never gets here */
2897     }
2898     else
2899     {
2900     uschar *pp = eptr;
2901     for (i = min; i < max; i++)
2902     {
2903     if (eptr >= md->end_subject || c != pcre_lcc[*eptr]) break;
2904     eptr++;
2905     }
2906     while (eptr >= pp)
2907     if (match(eptr--, ecode, offset_top, md)) return TRUE;
2908     return FALSE;
2909     }
2910     /* Control never gets here */
2911     }
2912    
2913     /* Caseful comparisons */
2914    
2915     else
2916     {
2917     for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
2918     if (min == max) continue;
2919     if (minimize)
2920     {
2921     for (i = min;; i++)
2922     {
2923     if (match(eptr, ecode, offset_top, md)) return TRUE;
2924     if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
2925     }
2926     /* Control never gets here */
2927     }
2928     else
2929     {
2930     uschar *pp = eptr;
2931     for (i = min; i < max; i++)
2932     {
2933     if (eptr >= md->end_subject || c != *eptr) break;
2934     eptr++;
2935     }
2936     while (eptr >= pp)
2937     if (match(eptr--, ecode, offset_top, md)) return TRUE;
2938     return FALSE;
2939     }
2940     }
2941     /* Control never gets here */
2942    
2943     /* Match a negated single character */
2944    
2945     case OP_NOT:
2946     if (eptr > md->end_subject) return FALSE;
2947     ecode++;
2948     if (md->caseless)
2949     {
2950     if (pcre_lcc[*ecode++] == pcre_lcc[*eptr++]) return FALSE;
2951     }
2952     else
2953     {
2954     if (*ecode++ == *eptr++) return FALSE;
2955     }
2956     break;
2957    
2958     /* Match a negated single character repeatedly. This is almost a repeat of
2959     the code for a repeated single character, but I haven't found a nice way of
2960     commoning these up that doesn't require a test of the positive/negative
2961     option for each character match. Maybe that wouldn't add very much to the
2962     time taken, but character matching *is* what this is all about... */
2963    
2964     case OP_NOTEXACT:
2965     min = max = (ecode[1] << 8) + ecode[2];
2966     ecode += 3;
2967     goto REPEATNOTCHAR;
2968    
2969     case OP_NOTUPTO:
2970     case OP_NOTMINUPTO:
2971     min = 0;
2972     max = (ecode[1] << 8) + ecode[2];
2973     minimize = *ecode == OP_NOTMINUPTO;
2974     ecode += 3;
2975     goto REPEATNOTCHAR;
2976    
2977     case OP_NOTSTAR:
2978     case OP_NOTMINSTAR:
2979     case OP_NOTPLUS:
2980     case OP_NOTMINPLUS:
2981     case OP_NOTQUERY:
2982     case OP_NOTMINQUERY:
2983     c = *ecode++ - OP_NOTSTAR;
2984     minimize = (c & 1) != 0;
2985     min = rep_min[c]; /* Pick up values from tables; */
2986     max = rep_max[c]; /* zero for max => infinity */
2987     if (max == 0) max = INT_MAX;
2988    
2989     /* Common code for all repeated single-character matches. We can give
2990     up quickly if there are fewer than the minimum number of characters left in
2991     the subject. */
2992    
2993     REPEATNOTCHAR:
2994     if (min > md->end_subject - eptr) return FALSE;
2995     c = *ecode++;
2996    
2997     /* The code is duplicated for the caseless and caseful cases, for speed,
2998     since matching characters is likely to be quite common. First, ensure the
2999     minimum number of matches are present. If min = max, continue at the same
3000     level without recursing. Otherwise, if minimizing, keep trying the rest of
3001     the expression and advancing one matching character if failing, up to the
3002     maximum. Alternatively, if maximizing, find the maximum number of
3003     characters and work backwards. */
3004    
3005     #ifdef DEBUG
3006     printf("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
3007     max, eptr);
3008     #endif
3009    
3010     if (md->caseless)
3011     {
3012     c = pcre_lcc[c];
3013     for (i = 1; i <= min; i++) if (c == pcre_lcc[*eptr++]) return FALSE;
3014     if (min == max) continue;
3015     if (minimize)
3016     {
3017     for (i = min;; i++)
3018     {
3019     if (match(eptr, ecode, offset_top, md)) return TRUE;
3020     if (i >= max || eptr >= md->end_subject || c == pcre_lcc[*eptr++])
3021     return FALSE;
3022     }
3023     /* Control never gets here */
3024     }
3025     else
3026     {
3027     uschar *pp = eptr;
3028     for (i = min; i < max; i++)
3029     {
3030     if (eptr >= md->end_subject || c == pcre_lcc[*eptr]) break;
3031     eptr++;
3032     }
3033     while (eptr >= pp)
3034     if (match(eptr--, ecode, offset_top, md)) return TRUE;
3035     return FALSE;
3036     }
3037     /* Control never gets here */
3038     }
3039    
3040     /* Caseful comparisons */
3041    
3042     else
3043     {
3044     for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
3045     if (min == max) continue;
3046     if (minimize)
3047     {
3048     for (i = min;; i++)
3049     {
3050     if (match(eptr, ecode, offset_top, md)) return TRUE;
3051     if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
3052     }
3053     /* Control never gets here */
3054     }
3055     else
3056     {
3057     uschar *pp = eptr;
3058     for (i = min; i < max; i++)
3059     {
3060     if (eptr >= md->end_subject || c == *eptr) break;
3061     eptr++;
3062     }
3063     while (eptr >= pp)
3064     if (match(eptr--, ecode, offset_top, md)) return TRUE;
3065     return FALSE;
3066     }
3067     }
3068     /* Control never gets here */
3069    
3070     /* Match a single character type repeatedly; several different opcodes
3071     share code. This is very similar to the code for single characters, but we
3072     repeat it in the interests of efficiency. */
3073    
3074     case OP_TYPEEXACT:
3075     min = max = (ecode[1] << 8) + ecode[2];
3076     minimize = TRUE;
3077     ecode += 3;
3078     goto REPEATTYPE;
3079    
3080     case OP_TYPEUPTO:
3081     case OP_TYPEMINUPTO:
3082     min = 0;
3083     max = (ecode[1] << 8) + ecode[2];
3084     minimize = *ecode == OP_TYPEMINUPTO;
3085     ecode += 3;
3086     goto REPEATTYPE;
3087    
3088     case OP_TYPESTAR:
3089     case OP_TYPEMINSTAR:
3090     case OP_TYPEPLUS:
3091     case OP_TYPEMINPLUS:
3092     case OP_TYPEQUERY:
3093     case OP_TYPEMINQUERY:
3094     c = *ecode++ - OP_TYPESTAR;
3095     minimize = (c & 1) != 0;
3096     min = rep_min[c]; /* Pick up values from tables; */
3097     max = rep_max[c]; /* zero for max => infinity */
3098     if (max == 0) max = INT_MAX;
3099    
3100     /* Common code for all repeated single character type matches */
3101    
3102     REPEATTYPE:
3103     ctype = *ecode++; /* Code for the character type */
3104    
3105     /* First, ensure the minimum number of matches are present. Use inline
3106     code for maximizing the speed, and do the type test once at the start
3107     (i.e. keep it out of the loop). Also test that there are at least the
3108     minimum number of characters before we start. */
3109    
3110     if (min > md->end_subject - eptr) return FALSE;
3111     if (min > 0) switch(ctype)
3112     {
3113     case OP_ANY:
3114     if (!md->dotall)
3115     { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }
3116     else eptr += min;
3117     break;
3118    
3119     case OP_NOT_DIGIT:
3120     for (i = 1; i <= min; i++)
3121     if ((pcre_ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
3122     break;
3123    
3124     case OP_DIGIT:
3125     for (i = 1; i <= min; i++)
3126     if ((pcre_ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
3127     break;
3128    
3129     case OP_NOT_WHITESPACE:
3130     for (i = 1; i <= min; i++)
3131     if ((pcre_ctypes[*eptr++] & ctype_space) != 0) return FALSE;
3132     break;
3133    
3134     case OP_WHITESPACE:
3135     for (i = 1; i <= min; i++)
3136     if ((pcre_ctypes[*eptr++] & ctype_space) == 0) return FALSE;
3137     break;
3138    
3139     case OP_NOT_WORDCHAR:
3140     for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) != 0)
3141     return FALSE;
3142     break;
3143    
3144     case OP_WORDCHAR:
3145     for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) == 0)
3146     return FALSE;
3147     break;
3148     }
3149    
3150     /* If min = max, continue at the same level without recursing */
3151    
3152     if (min == max) continue;
3153    
3154     /* If minimizing, we have to test the rest of the pattern before each
3155     subsequent match, so inlining isn't much help; just use the function. */
3156    
3157     if (minimize)
3158     {
3159     for (i = min;; i++)
3160     {
3161     if (match(eptr, ecode, offset_top, md)) return TRUE;
3162     if (i >= max || eptr >= md->end_subject ||
3163     !match_type(ctype, *eptr++, md->dotall))
3164     return FALSE;
3165     }
3166     /* Control never gets here */
3167     }
3168    
3169     /* If maximizing it is worth using inline code for speed, doing the type
3170     test once at the start (i.e. keep it out of the loop). */
3171    
3172     else
3173     {
3174     uschar *pp = eptr;
3175     switch(ctype)
3176     {
3177     case OP_ANY:
3178     if (!md->dotall)
3179     {
3180     for (i = min; i < max; i++)
3181     {
3182     if (eptr >= md->end_subject || *eptr == '\n') break;
3183     eptr++;
3184     }
3185     }
3186     else
3187     {
3188     c = max - min;
3189     if (c > md->end_subject - eptr) c = md->end_subject - eptr;
3190     eptr += c;
3191     }
3192     break;
3193    
3194     case OP_NOT_DIGIT:
3195     for (i = min; i < max; i++)
3196     {
3197     if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) != 0)
3198     break;
3199     eptr++;
3200     }
3201     break;
3202    
3203     case OP_DIGIT:
3204     for (i = min; i < max; i++)
3205     {
3206     if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) == 0)
3207     break;
3208     eptr++;
3209     }
3210     break;
3211    
3212     case OP_NOT_WHITESPACE:
3213     for (i = min; i < max; i++)
3214     {
3215     if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) != 0)
3216     break;
3217     eptr++;
3218     }
3219     break;
3220    
3221     case OP_WHITESPACE:
3222     for (i = min; i < max; i++)
3223     {
3224     if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) == 0)
3225     break;
3226     eptr++;
3227     }
3228     break;
3229    
3230     case OP_NOT_WORDCHAR:
3231     for (i = min; i < max; i++)
3232     {
3233     if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) != 0)
3234     break;
3235     eptr++;
3236     }
3237     break;
3238    
3239     case OP_WORDCHAR:
3240     for (i = min; i < max; i++)
3241     {
3242     if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) == 0)
3243     break;
3244     eptr++;
3245     }
3246     break;
3247     }
3248    
3249     while (eptr >= pp)
3250     if (match(eptr--, ecode, offset_top, md)) return TRUE;
3251     return FALSE;
3252     }
3253     /* Control never gets here */
3254    
3255     /* There's been some horrible disaster. */
3256    
3257     default:
3258     #ifdef DEBUG
3259     printf("Unknown opcode %d\n", *ecode);
3260     #endif
3261     md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
3262     return FALSE;
3263     }
3264    
3265     /* Do not stick any code in here without much thought; it is assumed
3266     that "continue" in the code above comes out to here to repeat the main
3267     loop. */
3268    
3269     } /* End of main loop */
3270     /* Control never reaches here */
3271     }
3272    
3273    
3274    
3275     /*************************************************
3276     * Execute a Regular Expression *
3277     *************************************************/
3278    
3279     /* This function applies a compiled re to a subject string and picks out
3280     portions of the string if it matches. Two elements in the vector are set for
3281     each substring: the offsets to the start and end of the substring.
3282    
3283     Arguments:
3284     external_re points to the compiled expression
3285     external_extra points to "hints" from pcre_study() or is NULL
3286     subject points to the subject string
3287     length length of subject string (may contain binary zeros)
3288     options option bits
3289     offsets points to a vector of ints to be filled in with offsets
3290     offsetcount the number of elements in the vector
3291    
3292     Returns: > 0 => success; value is the number of elements filled in
3293     = 0 => success, but offsets is not big enough
3294     -1 => failed to match
3295     < -1 => some kind of unexpected problem
3296     */
3297    
3298     int
3299     pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
3300     const char *subject, int length, int options, int *offsets, int offsetcount)
3301     {
3302     int resetcount;
3303     int ocount = offsetcount;
3304     int first_char = -1;
3305     match_data match_block;
3306     uschar *start_bits = NULL;
3307     uschar *start_match = (uschar *)subject;
3308     uschar *end_subject;
3309     real_pcre *re = (real_pcre *)external_re;
3310     real_pcre_extra *extra = (real_pcre_extra *)external_extra;
3311     BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
3312     BOOL startline = (re->options & PCRE_STARTLINE) != 0;
3313    
3314     if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3315    
3316     if (re == NULL || subject == NULL ||
3317     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3318     if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
3319    
3320     match_block.start_subject = (uschar *)subject;
3321     match_block.end_subject = match_block.start_subject + length;
3322     end_subject = match_block.end_subject;
3323    
3324     match_block.caseless = ((re->options | options) & PCRE_CASELESS) != 0;
3325     match_block.runtime_caseless = match_block.caseless &&
3326     (re->options & PCRE_CASELESS) == 0;
3327    
3328     match_block.multiline = ((re->options | options) & PCRE_MULTILINE) != 0;
3329     match_block.dotall = ((re->options | options) & PCRE_DOTALL) != 0;
3330     match_block.endonly = ((re->options | options) & PCRE_DOLLAR_ENDONLY) != 0;
3331    
3332     match_block.notbol = (options & PCRE_NOTBOL) != 0;
3333     match_block.noteol = (options & PCRE_NOTEOL) != 0;
3334    
3335     match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
3336    
3337     /* If the expression has got more back references than the offsets supplied can
3338     hold, we get a temporary bit of working store to use during the matching.
3339     Otherwise, we can use the vector supplied, rounding down the size of it to a
3340     multiple of 2. */
3341    
3342     ocount &= (-2);
3343     if (re->top_backref > 0 && re->top_backref + 1 >= ocount/2)
3344     {
3345     ocount = re->top_backref * 2 + 2;
3346     match_block.offset_vector = (pcre_malloc)(ocount * sizeof(int));
3347     if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
3348     #ifdef DEBUG
3349     printf("Got memory to hold back references\n");
3350     #endif
3351     }
3352     else match_block.offset_vector = offsets;
3353    
3354     match_block.offset_end = ocount;
3355     match_block.offset_overflow = FALSE;
3356    
3357     /* Compute the minimum number of offsets that we need to reset each time. Doing
3358     this makes a huge difference to execution time when there aren't many brackets
3359     in the pattern. */
3360    
3361     resetcount = 2 + re->top_bracket * 2;
3362     if (resetcount > offsetcount) resetcount = ocount;
3363    
3364     /* If MULTILINE is set at exec time but was not set at compile time, and the
3365     anchored flag is set, we must re-check because a setting provoked by ^ in the
3366     pattern is not right in multi-line mode. Calling is_anchored() again here does
3367     the right check, because multiline is now set. If it now yields FALSE, the
3368     expression must have had ^ starting some of its branches. Check to see if
3369     that is true for *all* branches, and if so, set the startline flag. */
3370    
3371     if (match_block. multiline && anchored && (re->options & PCRE_MULTILINE) == 0 &&
3372     !is_anchored(re->code, match_block.multiline))
3373     {
3374     anchored = FALSE;
3375     if (is_startline(re->code)) startline = TRUE;
3376     }
3377    
3378     /* Set up the first character to match, if available. The first_char value is
3379     never set for an anchored regular expression, but the anchoring may be forced
3380     at run time, so we have to test for anchoring. The first char may be unset for
3381     an unanchored pattern, of course. If there's no first char and the pattern was
3382     studied, the may be a bitmap of possible first characters. However, we can
3383     use this only if the caseless state of the studying was correct. */
3384    
3385     if (!anchored)
3386     {
3387     if ((re->options & PCRE_FIRSTSET) != 0)
3388     {
3389     first_char = re->first_char;
3390     if (match_block.caseless) first_char = pcre_lcc[first_char];
3391     }
3392     else
3393     if (!startline && extra != NULL &&
3394     (extra->options & PCRE_STUDY_MAPPED) != 0 &&
3395     ((extra->options & PCRE_STUDY_CASELESS) != 0) == match_block.caseless)
3396     start_bits = extra->start_bits;
3397     }
3398    
3399     /* Loop for unanchored matches; for anchored regexps the loop runs just once. */
3400    
3401     do
3402     {
3403     register int *iptr = match_block.offset_vector;
3404     register int *iend = iptr + resetcount;
3405    
3406     /* Reset the maximum number of extractions we might see. */
3407    
3408     while (iptr < iend) *iptr++ = -1;
3409    
3410     /* Advance to a unique first char if possible */
3411    
3412     if (first_char >= 0)
3413     {
3414     if (match_block.caseless)
3415     while (start_match < end_subject && pcre_lcc[*start_match] != first_char)
3416     start_match++;
3417     else
3418     while (start_match < end_subject && *start_match != first_char)
3419     start_match++;
3420     }
3421    
3422     /* Or to just after \n for a multiline match if possible */
3423    
3424     else if (startline)
3425     {
3426     if (start_match > match_block.start_subject)
3427     {
3428     while (start_match < end_subject && start_match[-1] != '\n')
3429     start_match++;
3430     }
3431     }
3432    
3433     /* Or to a non-unique first char */
3434    
3435     else if (start_bits != NULL)
3436     {
3437     while (start_match < end_subject)
3438     {
3439     register int c = *start_match;
3440     if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
3441     }
3442     }
3443    
3444     #ifdef DEBUG
3445     printf(">>>> Match against: ");
3446     pchars(start_match, end_subject - start_match, TRUE, &match_block);
3447     printf("\n");
3448     #endif
3449    
3450     /* When a match occurs, substrings will be set for all internal extractions;
3451     we just need to set up the whole thing as substring 0 before returning. If
3452     there were too many extractions, set the return code to zero. In the case
3453     where we had to get some local store to hold offsets for backreferences, copy
3454     those back references that we can. In this case there need not be overflow
3455     if certain parts of the pattern were not used.
3456    
3457     Before starting the match, we have to set up a longjmp() target to enable
3458     the "cut" operation to fail a match completely without backtracking. */
3459    
3460     if (setjmp(match_block.fail_env) == 0 &&
3461     match(start_match, re->code, 2, &match_block))
3462     {
3463     int rc;
3464    
3465     if (ocount != offsetcount)
3466     {
3467     if (offsetcount >= 4)
3468     {
3469     memcpy(offsets + 2, match_block.offset_vector + 2,
3470     (offsetcount - 2) * sizeof(int));
3471     #ifdef DEBUG
3472     printf("Copied offsets; freeing temporary memory\n");
3473     #endif
3474     }
3475     if (match_block.end_offset_top > offsetcount)
3476     match_block.offset_overflow = TRUE;
3477    
3478     #ifdef DEBUG
3479     printf("Freeing temporary memory\n");
3480     #endif
3481    
3482     (pcre_free)(match_block.offset_vector);
3483     }
3484    
3485     rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
3486    
3487     if (match_block.offset_end < 2) rc = 0; else
3488     {
3489     offsets[0] = start_match - match_block.start_subject;
3490     offsets[1] = match_block.end_match_ptr - match_block.start_subject;
3491     }
3492    
3493     #ifdef DEBUG
3494     printf(">>>> returning %d\n", rc);
3495     #endif
3496     return rc;
3497     }
3498     }
3499     while (!anchored &&
3500     match_block.errorcode == PCRE_ERROR_NOMATCH &&
3501     start_match++ < end_subject);
3502    
3503     #ifdef DEBUG
3504     printf(">>>> returning %d\n", match_block.errorcode);
3505     #endif
3506    
3507     return match_block.errorcode;
3508     }
3509    
3510     /* End of pcre.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12