/[pcre]/code/trunk/pcre.c
ViewVC logotype

Contents of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 7 - (hide annotations) (download)
Sat Feb 24 21:38:09 2007 UTC (7 years, 1 month ago) by nigel
File MIME type: text/plain
File size: 99406 byte(s)
Load pcre-1.02 into code/trunk.

1 nigel 3 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /*
6     This is a library of functions to support regular expressions whose syntax
7     and semantics are as close as possible to those of the Perl 5 language. See
8     the file Tech.Notes for some information on the internals.
9    
10     Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12     Copyright (c) 1997 University of Cambridge
13    
14     -----------------------------------------------------------------------------
15     Permission is granted to anyone to use this software for any purpose on any
16     computer system, and to redistribute it freely, subject to the following
17     restrictions:
18    
19     1. This software is distributed in the hope that it will be useful,
20     but WITHOUT ANY WARRANTY; without even the implied warranty of
21     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22    
23     2. The origin of this software must not be misrepresented, either by
24     explicit claim or by omission.
25    
26     3. Altered versions must be plainly marked as such, and must not be
27     misrepresented as being the original software.
28     -----------------------------------------------------------------------------
29     */
30    
31    
32     /* Define DEBUG to get debugging output on stdout. */
33    
34     /* #define DEBUG */
35    
36    
37     /* Include the internals header, which itself includes Standard C headers plus
38     the external pcre header. */
39    
40     #include "internal.h"
41    
42    
43     /* Min and max values for the common repeats; for the maxima, 0 => infinity */
44    
45     static char rep_min[] = { 0, 0, 1, 1, 0, 0 };
46     static char rep_max[] = { 0, 0, 0, 0, 1, 1 };
47    
48     /* Text forms of OP_ values and things, for debugging */
49    
50     #ifdef DEBUG
51 nigel 7 static const char *OP_names[] = {
52     "End", "\\A", "\\B", "\\b", "\\D", "\\d",
53 nigel 3 "\\S", "\\s", "\\W", "\\w", "Cut", "\\Z", "^", "$", "Any", "chars",
54     "not",
55     "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
56     "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
57     "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
58     "*", "*?", "+", "+?", "?", "??", "{", "{",
59     "class", "Ref",
60     "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",
61     "Brazero", "Braminzero", "Bra"
62     };
63     #endif
64    
65     /* Table for handling escaped characters in the range '0'-'z'. Positive returns
66     are simple data values; negative values are for special things like \d and so
67     on. Zero means further processing is needed (for things like \x), or the escape
68     is invalid. */
69    
70     static short int escapes[] = {
71     0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
72     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
73     '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
74     0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
75     0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
76     0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
77     '`', 7, -ESC_b, 0, -ESC_d, 27, '\f', 0, /* ` - g */
78     0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
79     0, 0, '\r', -ESC_s, '\t', 0, 0, -ESC_w, /* p - w */
80     0, 0, 0 /* x - z */
81     };
82    
83     /* Definition to allow mutual recursion */
84    
85 nigel 7 static BOOL compile_regex(int, int *, uschar **, const uschar **, const char **);
86 nigel 3
87     /* Structure for passing "static" information around between the functions
88     doing the matching, so that they are thread-safe. */
89    
90     typedef struct match_data {
91     int errorcode; /* As it says */
92     int *offset_vector; /* Offset vector */
93     int offset_end; /* One past the end */
94     BOOL offset_overflow; /* Set if too many extractions */
95     BOOL caseless; /* Case-independent flag */
96     BOOL runtime_caseless; /* Caseless forced at run time */
97     BOOL multiline; /* Multiline flag */
98     BOOL notbol; /* NOTBOL flag */
99     BOOL noteol; /* NOTEOL flag */
100     BOOL dotall; /* Dot matches any char */
101     BOOL endonly; /* Dollar not before final \n */
102 nigel 7 const uschar *start_subject; /* Start of the subject string */
103     const uschar *end_subject; /* End of the subject string */
104 nigel 3 jmp_buf fail_env; /* Environment for longjump() break out */
105 nigel 7 const uschar *end_match_ptr; /* Subject position at end match */
106 nigel 3 int end_offset_top; /* Highwater mark at end of match */
107     } match_data;
108    
109    
110    
111     /*************************************************
112     * Global variables *
113     *************************************************/
114    
115     /* PCRE is thread-clean and doesn't use any global variables in the normal
116     sense. However, it calls memory allocation and free functions via the two
117     indirections below, which are can be changed by the caller, but are shared
118     between all threads. */
119    
120     void *(*pcre_malloc)(size_t) = malloc;
121     void (*pcre_free)(void *) = free;
122    
123    
124    
125    
126     /*************************************************
127     * Return version string *
128     *************************************************/
129    
130 nigel 7 const char *
131 nigel 3 pcre_version(void)
132     {
133     return PCRE_VERSION;
134     }
135    
136    
137    
138    
139     /*************************************************
140     * Return info about a compiled pattern *
141     *************************************************/
142    
143     /* This function picks potentially useful data out of the private
144     structure.
145    
146     Arguments:
147     external_re points to compiled code
148     optptr where to pass back the options
149     first_char where to pass back the first character,
150     or -1 if multiline and all branches start ^,
151     or -2 otherwise
152    
153     Returns: number of identifying extraction brackets
154     or negative values on error
155     */
156    
157     int
158     pcre_info(const pcre *external_re, int *optptr, int *first_char)
159     {
160 nigel 7 const real_pcre *re = (const real_pcre *)external_re;
161 nigel 3 if (re == NULL) return PCRE_ERROR_NULL;
162     if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
163     if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
164     if (first_char != NULL)
165     *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
166     ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
167     return re->top_bracket;
168     }
169    
170    
171    
172    
173     #ifdef DEBUG
174     /*************************************************
175     * Debugging function to print chars *
176     *************************************************/
177    
178     /* Print a sequence of chars in printable format, stopping at the end of the
179     subject if the requested.
180    
181     Arguments:
182     p points to characters
183     length number to print
184     is_subject TRUE if printing from within md->start_subject
185     md pointer to matching data block, if is_subject is TRUE
186    
187     Returns: nothing
188     */
189    
190     static pchars(uschar *p, int length, BOOL is_subject, match_data *md)
191     {
192     int c;
193     if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
194     while (length-- > 0)
195     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
196     }
197     #endif
198    
199    
200    
201    
202     /*************************************************
203     * Check subpattern for empty operand *
204     *************************************************/
205    
206     /* This function checks a bracketed subpattern to see if any of the paths
207     through it could match an empty string. This is used to diagnose an error if
208     such a subpattern is followed by a quantifier with an unlimited upper bound.
209    
210     Argument:
211     code points to the opening bracket
212    
213     Returns: TRUE or FALSE
214     */
215    
216     static BOOL
217     could_be_empty(uschar *code)
218     {
219     do {
220     uschar *cc = code + 3;
221    
222     /* Scan along the opcodes for this branch; as soon as we find something
223     that matches a non-empty string, break out and advance to test the next
224     branch. If we get to the end of the branch, return TRUE for the whole
225     sub-expression. */
226    
227     for (;;)
228     {
229     /* Test an embedded subpattern; if it could not be empty, break the
230     loop. Otherwise carry on in the branch. */
231    
232 nigel 5 if ((int)(*cc) >= OP_BRA || (int)(*cc) == OP_ONCE)
233 nigel 3 {
234     if (!could_be_empty(cc)) break;
235     do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
236     cc += 3;
237     }
238    
239     else switch (*cc)
240     {
241     /* Reached end of a branch: the subpattern may match the empty string */
242    
243     case OP_ALT:
244     case OP_KET:
245     case OP_KETRMAX:
246     case OP_KETRMIN:
247     return TRUE;
248    
249     /* Skip over assertive subpatterns */
250    
251     case OP_ASSERT:
252     case OP_ASSERT_NOT:
253     do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
254     cc += 3;
255     break;
256    
257     /* Skip over things that don't match chars */
258    
259     case OP_SOD:
260     case OP_EOD:
261     case OP_CIRC:
262     case OP_DOLL:
263     case OP_BRAZERO:
264     case OP_BRAMINZERO:
265     case OP_NOT_WORD_BOUNDARY:
266     case OP_WORD_BOUNDARY:
267     cc++;
268     break;
269    
270     /* Skip over simple repeats with zero lower bound */
271    
272     case OP_STAR:
273     case OP_MINSTAR:
274     case OP_QUERY:
275     case OP_MINQUERY:
276 nigel 5 case OP_NOTSTAR:
277     case OP_NOTMINSTAR:
278     case OP_NOTQUERY:
279     case OP_NOTMINQUERY:
280 nigel 3 case OP_TYPESTAR:
281     case OP_TYPEMINSTAR:
282     case OP_TYPEQUERY:
283     case OP_TYPEMINQUERY:
284     cc += 2;
285     break;
286    
287     /* Skip over UPTOs (lower bound is zero) */
288    
289     case OP_UPTO:
290     case OP_MINUPTO:
291     case OP_TYPEUPTO:
292     case OP_TYPEMINUPTO:
293     cc += 4;
294     break;
295    
296     /* Check a class or a back reference for a zero minimum */
297    
298     case OP_CLASS:
299     case OP_REF:
300 nigel 5 cc += (*cc == OP_REF)? 2 : 33;
301 nigel 3
302     switch (*cc)
303     {
304     case OP_CRSTAR:
305     case OP_CRMINSTAR:
306     case OP_CRQUERY:
307     case OP_CRMINQUERY:
308     cc++;
309     break;
310    
311     case OP_CRRANGE:
312     case OP_CRMINRANGE:
313     if ((cc[1] << 8) + cc[2] != 0) goto NEXT_BRANCH;
314     cc += 3;
315     break;
316    
317     default:
318     goto NEXT_BRANCH;
319     }
320     break;
321    
322     /* Anything else matches at least one character */
323    
324     default:
325     goto NEXT_BRANCH;
326     }
327     }
328    
329     NEXT_BRANCH:
330     code += (code[1] << 8) + code[2];
331     }
332     while (*code == OP_ALT);
333    
334     /* No branches match the empty string */
335    
336     return FALSE;
337     }
338    
339    
340    
341     /*************************************************
342     * Handle escapes *
343     *************************************************/
344    
345     /* This function is called when a \ has been encountered. It either returns a
346     positive value for a simple escape such as \n, or a negative value which
347     encodes one of the more complicated things such as \d. On entry, ptr is
348     pointing at the \. On exit, it is on the final character of the escape
349     sequence.
350    
351     Arguments:
352     ptrptr points to the pattern position pointer
353     errorptr points to the pointer to the error message
354     bracount number of previous extracting brackets
355     options the options bits
356     isclass TRUE if inside a character class
357    
358     Returns: zero or positive => a data character
359     negative => a special escape sequence
360     on error, errorptr is set
361     */
362    
363     static int
364 nigel 7 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
365     int options, BOOL isclass)
366 nigel 3 {
367 nigel 7 const uschar *ptr = *ptrptr;
368 nigel 3 int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
369     int i;
370    
371     if (c == 0) *errorptr = ERR1;
372    
373     /* Digits or letters may have special meaning; all others are literals. */
374    
375     else if (c < '0' || c > 'z') {}
376    
377     /* Do an initial lookup in a table. A non-zero result is something that can be
378     returned immediately. Otherwise further processing may be required. */
379    
380     else if ((i = escapes[c - '0']) != 0) c = i;
381    
382     /* Escapes that need further processing, or are illegal. */
383    
384     else
385     {
386 nigel 7 const uschar *oldptr;
387 nigel 3 switch (c)
388     {
389     /* The handling of escape sequences consisting of a string of digits
390     starting with one that is not zero is not straightforward. By experiment,
391     the way Perl works seems to be as follows:
392    
393     Outside a character class, the digits are read as a decimal number. If the
394     number is less than 10, or if there are that many previous extracting
395     left brackets, then it is a back reference. Otherwise, up to three octal
396     digits are read to form an escaped byte. Thus \123 is likely to be octal
397     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
398     value is greater than 377, the least significant 8 bits are taken. Inside a
399     character class, \ followed by a digit is always an octal number. */
400    
401     case '1': case '2': case '3': case '4': case '5':
402     case '6': case '7': case '8': case '9':
403    
404     if (!isclass)
405     {
406     oldptr = ptr;
407     c -= '0';
408     while ((pcre_ctypes[ptr[1]] & ctype_digit) != 0)
409     c = c * 10 + *(++ptr) - '0';
410     if (c < 10 || c <= bracount)
411     {
412     c = -(ESC_REF + c);
413     break;
414     }
415     ptr = oldptr; /* Put the pointer back and fall through */
416     }
417    
418     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
419     generates a binary zero byte and treats the digit as a following literal.
420     Thus we have to pull back the pointer by one. */
421    
422     if ((c = *ptr) >= '8')
423     {
424     ptr--;
425     c = 0;
426     break;
427     }
428    
429     /* \0 always starts an octal number, but we may drop through to here with a
430     larger first octal digit */
431    
432     case '0':
433     c -= '0';
434     while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 &&
435     ptr[1] != '8' && ptr[1] != '9')
436     c = c * 8 + *(++ptr) - '0';
437     break;
438    
439     /* Special escapes not starting with a digit are straightforward */
440    
441     case 'x':
442     c = 0;
443     while (i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0)
444     {
445     ptr++;
446     c = c * 16 + pcre_lcc[*ptr] -
447     (((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
448     }
449     break;
450    
451     case 'c':
452     c = *(++ptr);
453     if (c == 0)
454     {
455     *errorptr = ERR2;
456     return 0;
457     }
458    
459     /* A letter is upper-cased; then the 0x40 bit is flipped */
460    
461     if (c >= 'a' && c <= 'z') c = pcre_fcc[c];
462     c ^= 0x40;
463     break;
464    
465     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
466     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
467     for Perl compatibility, it is a literal. */
468    
469     default:
470     if ((options & PCRE_EXTRA) != 0) switch(c)
471     {
472     case 'X':
473     c = -ESC_X; /* This could be a lookup if it ever got into Perl */
474     break;
475    
476     default:
477     *errorptr = ERR3;
478     break;
479     }
480     break;
481     }
482     }
483    
484     *ptrptr = ptr;
485     return c;
486     }
487    
488    
489    
490     /*************************************************
491     * Check for counted repeat *
492     *************************************************/
493    
494     /* This function is called when a '{' is encountered in a place where it might
495     start a quantifier. It looks ahead to see if it really is a quantifier or not.
496     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
497     where the ddds are digits.
498    
499     Arguments:
500     p pointer to the first char after '{'
501    
502     Returns: TRUE or FALSE
503     */
504    
505     static BOOL
506 nigel 7 is_counted_repeat(const uschar *p)
507 nigel 3 {
508     if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
509     while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
510     if (*p == '}') return TRUE;
511    
512     if (*p++ != ',') return FALSE;
513     if (*p == '}') return TRUE;
514    
515     if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
516     while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
517     return (*p == '}');
518     }
519    
520    
521    
522     /*************************************************
523     * Read repeat counts *
524     *************************************************/
525    
526     /* Read an item of the form {n,m} and return the values. This is called only
527     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
528     so the syntax is guaranteed to be correct, but we need to check the values.
529    
530     Arguments:
531     p pointer to first char after '{'
532     minp pointer to int for min
533     maxp pointer to int for max
534     returned as -1 if no max
535     errorptr points to pointer to error message
536    
537     Returns: pointer to '}' on success;
538     current ptr on error, with errorptr set
539     */
540    
541 nigel 7 static const uschar *
542     read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
543 nigel 3 {
544     int min = 0;
545     int max = -1;
546    
547     while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
548    
549     if (*p == '}') max = min; else
550     {
551     if (*(++p) != '}')
552     {
553     max = 0;
554     while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
555     if (max < min)
556     {
557     *errorptr = ERR4;
558     return p;
559     }
560     }
561     }
562    
563     /* Do paranoid checks, then fill in the required variables, and pass back the
564     pointer to the terminating '}'. */
565    
566     if (min > 65535 || max > 65535)
567     *errorptr = ERR5;
568     else
569     {
570     *minp = min;
571     *maxp = max;
572     }
573     return p;
574     }
575    
576    
577    
578     /*************************************************
579     * Compile one branch *
580     *************************************************/
581    
582     /* Scan the pattern, compiling it into the code vector.
583    
584     Arguments:
585     options the option bits
586     bracket points to number of brackets used
587     code points to the pointer to the current code point
588     ptrptr points to the current pattern pointer
589     errorptr points to pointer to error message
590    
591     Returns: TRUE on success
592     FALSE, with *errorptr set on error
593     */
594    
595     static BOOL
596 nigel 7 compile_branch(int options, int *brackets, uschar **codeptr,
597     const uschar **ptrptr, const char **errorptr)
598 nigel 3 {
599     int repeat_type, op_type;
600     int repeat_min, repeat_max;
601     int bravalue, length;
602     register int c;
603     register uschar *code = *codeptr;
604 nigel 7 const uschar *ptr = *ptrptr;
605     const uschar *oldptr;
606 nigel 3 uschar *previous = NULL;
607     uschar class[32];
608    
609     /* Switch on next character until the end of the branch */
610    
611     for (;; ptr++)
612     {
613     BOOL negate_class;
614     int class_charcount;
615     int class_lastchar;
616    
617     c = *ptr;
618     if ((options & PCRE_EXTENDED) != 0)
619     {
620     if ((pcre_ctypes[c] & ctype_space) != 0) continue;
621     if (c == '#')
622     {
623     while ((c = *(++ptr)) != 0 && c != '\n');
624     continue;
625     }
626     }
627    
628     switch(c)
629     {
630     /* The branch terminates at end of string, |, or ). */
631    
632     case 0:
633     case '|':
634     case ')':
635     *codeptr = code;
636     *ptrptr = ptr;
637     return TRUE;
638    
639     /* Handle single-character metacharacters */
640    
641     case '^':
642     previous = NULL;
643     *code++ = OP_CIRC;
644     break;
645    
646     case '$':
647     previous = NULL;
648     *code++ = OP_DOLL;
649     break;
650    
651     case '.':
652     previous = code;
653     *code++ = OP_ANY;
654     break;
655    
656     /* Character classes. These always build a 32-byte bitmap of the permitted
657     characters, except in the special case where there is only one character.
658     For negated classes, we build the map as usual, then invert it at the end.
659     */
660    
661     case '[':
662     previous = code;
663     *code++ = OP_CLASS;
664    
665     /* If the first character is '^', set the negation flag */
666    
667     if ((c = *(++ptr)) == '^')
668     {
669     negate_class = TRUE;
670     c = *(++ptr);
671     }
672     else negate_class = FALSE;
673    
674     /* Keep a count of chars so that we can optimize the case of just a single
675     character. */
676    
677     class_charcount = 0;
678     class_lastchar = -1;
679    
680     /* Initialize the 32-char bit map to all zeros. We have to build the
681     map in a temporary bit of store, in case the class contains only 1
682     character, because in that case the compiled code doesn't use the
683     bit map. */
684    
685     memset(class, 0, 32 * sizeof(uschar));
686    
687     /* Process characters until ] is reached. By writing this as a "do" it
688     means that an initial ] is taken as a data character. */
689    
690     do
691     {
692     if (c == 0)
693     {
694     *errorptr = ERR6;
695     goto FAILED;
696     }
697    
698     /* Backslash may introduce a single character, or it may introduce one
699     of the specials, which just set a flag. Escaped items are checked for
700     validity in the pre-compiling pass. The sequence \b is a special case.
701 nigel 7 Inside a class (and only there) it is treated as backspace. Elsewhere
702 nigel 3 it marks a word boundary. Other escapes have preset maps ready to
703     or into the one we are building. We assume they have more than one
704     character in them, so set class_count bigger than one. */
705    
706     if (c == '\\')
707     {
708     c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
709     if (-c == ESC_b) c = '\b';
710     else if (c < 0)
711     {
712     class_charcount = 10;
713     switch (-c)
714     {
715     case ESC_d:
716     for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit];
717     continue;
718    
719     case ESC_D:
720     for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit];
721     continue;
722    
723     case ESC_w:
724     for (c = 0; c < 32; c++)
725     class[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
726     continue;
727    
728     case ESC_W:
729     for (c = 0; c < 32; c++)
730     class[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
731     continue;
732    
733     case ESC_s:
734     for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space];
735     continue;
736    
737     case ESC_S:
738     for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space];
739     continue;
740    
741     default:
742     *errorptr = ERR7;
743     goto FAILED;
744     }
745     }
746     /* Fall through if single character */
747     }
748    
749     /* A single character may be followed by '-' to form a range. However,
750     Perl does not permit ']' to be the end of the range. A '-' character
751     here is treated as a literal. */
752    
753     if (ptr[1] == '-' && ptr[2] != ']')
754     {
755     int d;
756     ptr += 2;
757     d = *ptr;
758    
759     if (d == 0)
760     {
761     *errorptr = ERR6;
762     goto FAILED;
763     }
764    
765     /* The second part of a range can be a single-character escape, but
766     not any of the other escapes. */
767    
768     if (d == '\\')
769     {
770     d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
771     if (d < 0)
772     {
773     if (d == -ESC_b) d = '\b'; else
774     {
775     *errorptr = ERR7;
776     goto FAILED;
777     }
778     }
779     }
780    
781     if (d < c)
782     {
783     *errorptr = ERR8;
784     goto FAILED;
785     }
786    
787     for (; c <= d; c++)
788     {
789     class[c/8] |= (1 << (c&7));
790     if ((options & PCRE_CASELESS) != 0)
791     {
792     int uc = pcre_fcc[c]; /* flip case */
793     class[uc/8] |= (1 << (uc&7));
794     }
795     class_charcount++; /* in case a one-char range */
796     class_lastchar = c;
797     }
798     continue; /* Go get the next char in the class */
799     }
800    
801     /* Handle a lone single character - we can get here for a normal
802     non-escape char, or after \ that introduces a single character. */
803    
804     class [c/8] |= (1 << (c&7));
805     if ((options & PCRE_CASELESS) != 0)
806     {
807     c = pcre_fcc[c]; /* flip case */
808     class[c/8] |= (1 << (c&7));
809     }
810     class_charcount++;
811     class_lastchar = c;
812     }
813    
814     /* Loop until ']' reached; the check for end of string happens inside the
815     loop. This "while" is the end of the "do" above. */
816    
817     while ((c = *(++ptr)) != ']');
818    
819     /* If class_charcount is 1 and class_lastchar is not negative, we saw
820     precisely one character. This doesn't need the whole 32-byte bit map.
821     We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
822     it's negative. */
823    
824     if (class_charcount == 1 && class_lastchar >= 0)
825     {
826     if (negate_class)
827     {
828     code[-1] = OP_NOT;
829     }
830     else
831     {
832     code[-1] = OP_CHARS;
833     *code++ = 1;
834     }
835     *code++ = class_lastchar;
836     }
837    
838     /* Otherwise, negate the 32-byte map if necessary, and copy it into
839     the code vector. */
840    
841     else
842     {
843     if (negate_class)
844     for (c = 0; c < 32; c++) code[c] = ~class[c];
845     else
846     memcpy(code, class, 32);
847     code += 32;
848     }
849     break;
850    
851     /* Various kinds of repeat */
852    
853     case '{':
854     if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
855     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
856     if (*errorptr != NULL) goto FAILED;
857     goto REPEAT;
858    
859     case '*':
860     repeat_min = 0;
861     repeat_max = -1;
862     goto REPEAT;
863    
864     case '+':
865     repeat_min = 1;
866     repeat_max = -1;
867     goto REPEAT;
868    
869     case '?':
870     repeat_min = 0;
871     repeat_max = 1;
872    
873     REPEAT:
874     if (previous == NULL)
875     {
876     *errorptr = ERR9;
877     goto FAILED;
878     }
879    
880     /* If the next character is '?' this is a minimizing repeat. Advance to the
881     next character. */
882    
883     if (ptr[1] == '?') { repeat_type = 1; ptr++; } else repeat_type = 0;
884    
885     /* If the maximum is zero then the minimum must also be zero; Perl allows
886     this case, so we do too - by simply omitting the item altogether. */
887    
888     if (repeat_max == 0) code = previous;
889    
890     /* If previous was a string of characters, chop off the last one and use it
891     as the subject of the repeat. If there was only one character, we can
892     abolish the previous item altogether. */
893    
894     else if (*previous == OP_CHARS)
895     {
896     int len = previous[1];
897     if (len == 1)
898     {
899     c = previous[2];
900     code = previous;
901     }
902     else
903     {
904     c = previous[len+1];
905     previous[1]--;
906     code--;
907     }
908     op_type = 0; /* Use single-char op codes */
909     goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
910     }
911    
912     /* If previous was a single negated character ([^a] or similar), we use
913     one of the special opcodes, replacing it. The code is shared with single-
914     character repeats by adding a suitable offset into repeat_type. */
915    
916     else if ((int)*previous == OP_NOT)
917     {
918     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
919     c = previous[1];
920     code = previous;
921     goto OUTPUT_SINGLE_REPEAT;
922     }
923    
924     /* If previous was a character type match (\d or similar), abolish it and
925     create a suitable repeat item. The code is shared with single-character
926     repeats by adding a suitable offset into repeat_type. */
927    
928     else if ((int)*previous < OP_EOD || *previous == OP_ANY)
929     {
930     op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
931     c = *previous;
932     code = previous;
933    
934     OUTPUT_SINGLE_REPEAT:
935     repeat_type += op_type; /* Combine both values for many cases */
936    
937     /* A minimum of zero is handled either as the special case * or ?, or as
938     an UPTO, with the maximum given. */
939    
940     if (repeat_min == 0)
941     {
942     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
943     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
944     else
945     {
946     *code++ = OP_UPTO + repeat_type;
947     *code++ = repeat_max >> 8;
948     *code++ = (repeat_max & 255);
949     }
950     }
951    
952     /* The case {1,} is handled as the special case + */
953    
954     else if (repeat_min == 1 && repeat_max == -1)
955     *code++ = OP_PLUS + repeat_type;
956    
957     /* The case {n,n} is just an EXACT, while the general case {n,m} is
958     handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
959    
960     else
961     {
962     if (repeat_min != 1)
963     {
964     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
965     *code++ = repeat_min >> 8;
966     *code++ = (repeat_min & 255);
967     }
968    
969     /* If the mininum is 1 and the previous item was a character string,
970     we either have to put back the item that got cancelled if the string
971     length was 1, or add the character back onto the end of a longer
972     string. For a character type nothing need be done; it will just get put
973     back naturally. */
974    
975     else if (*previous == OP_CHARS)
976     {
977     if (code == previous) code += 2; else previous[1]++;
978     }
979    
980     /* Insert an UPTO if the max is greater than the min. */
981    
982     if (repeat_max != repeat_min)
983     {
984     *code++ = c;
985     repeat_max -= repeat_min;
986     *code++ = OP_UPTO + repeat_type;
987     *code++ = repeat_max >> 8;
988     *code++ = (repeat_max & 255);
989     }
990     }
991    
992     /* The character or character type itself comes last in all cases. */
993    
994     *code++ = c;
995     }
996    
997     /* If previous was a character class or a back reference, we put the repeat
998     stuff after it. */
999    
1000     else if (*previous == OP_CLASS || *previous == OP_REF)
1001     {
1002     if (repeat_min == 0 && repeat_max == -1)
1003     *code++ = OP_CRSTAR + repeat_type;
1004     else if (repeat_min == 1 && repeat_max == -1)
1005     *code++ = OP_CRPLUS + repeat_type;
1006     else if (repeat_min == 0 && repeat_max == 1)
1007     *code++ = OP_CRQUERY + repeat_type;
1008     else
1009     {
1010     *code++ = OP_CRRANGE + repeat_type;
1011     *code++ = repeat_min >> 8;
1012     *code++ = repeat_min & 255;
1013     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1014     *code++ = repeat_max >> 8;
1015     *code++ = repeat_max & 255;
1016     }
1017     }
1018    
1019     /* If previous was a bracket group, we may have to replicate it in certain
1020     cases. If the maximum repeat count is unlimited, check that the bracket
1021     group cannot match the empty string, and diagnose an error if it can. */
1022    
1023     else if ((int)*previous >= OP_BRA)
1024     {
1025     int i;
1026     int length = code - previous;
1027    
1028     if (repeat_max == -1 && could_be_empty(previous))
1029     {
1030     *errorptr = ERR10;
1031     goto FAILED;
1032     }
1033    
1034     /* If the minimum is greater than zero, and the maximum is unlimited or
1035     equal to the minimum, the first copy remains where it is, and is
1036     replicated up to the minimum number of times. This case includes the +
1037     repeat, but of course no replication is needed in that case. */
1038    
1039     if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1040     {
1041     for (i = 1; i < repeat_min; i++)
1042     {
1043     memcpy(code, previous, length);
1044     code += length;
1045     }
1046     }
1047    
1048     /* If the minimum is zero, stick BRAZERO in front of the first copy.
1049     Then, if there is a fixed upper limit, replicated up to that many times,
1050     sticking BRAZERO in front of all the optional ones. */
1051    
1052     else
1053     {
1054     if (repeat_min == 0)
1055     {
1056     memmove(previous+1, previous, length);
1057     code++;
1058     *previous++ = OP_BRAZERO + repeat_type;
1059     }
1060    
1061     for (i = 1; i < repeat_min; i++)
1062     {
1063     memcpy(code, previous, length);
1064     code += length;
1065     }
1066    
1067     for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1068     {
1069     *code++ = OP_BRAZERO + repeat_type;
1070     memcpy(code, previous, length);
1071     code += length;
1072     }
1073     }
1074    
1075     /* If the maximum is unlimited, set a repeater in the final copy. */
1076    
1077     if (repeat_max == -1) code[-3] = OP_KETRMAX + repeat_type;
1078     }
1079    
1080     /* Else there's some kind of shambles */
1081    
1082     else
1083     {
1084     *errorptr = ERR11;
1085     goto FAILED;
1086     }
1087    
1088     /* In all case we no longer have a previous item. */
1089    
1090     previous = NULL;
1091     break;
1092    
1093    
1094     /* Start of nested bracket sub-expression, or comment or lookahead.
1095     First deal with special things that can come after a bracket; all are
1096     introduced by ?, and the appearance of any of them means that this is not a
1097     referencing group. They were checked for validity in the first pass over
1098     the string, so we don't have to check for syntax errors here. */
1099    
1100     case '(':
1101     previous = code; /* Only real brackets can be repeated */
1102     if (*(++ptr) == '?')
1103     {
1104     bravalue = OP_BRA;
1105    
1106     switch (*(++ptr))
1107     {
1108     case '#':
1109     case 'i':
1110     case 'm':
1111     case 's':
1112     case 'x':
1113     ptr++;
1114     while (*ptr != ')') ptr++;
1115     previous = NULL;
1116     continue;
1117    
1118     case ':': /* Non-extracting bracket */
1119     ptr++;
1120     break;
1121    
1122     case '=': /* Assertions can't be repeated */
1123     bravalue = OP_ASSERT;
1124     ptr++;
1125     previous = NULL;
1126     break;
1127    
1128     case '!':
1129     bravalue = OP_ASSERT_NOT;
1130     ptr++;
1131     previous = NULL;
1132     break;
1133    
1134     case '>': /* "Match once" brackets */
1135     if ((options & PCRE_EXTRA) != 0) /* Not yet standard */
1136     {
1137     bravalue = OP_ONCE;
1138     ptr++;
1139     previous = NULL;
1140     break;
1141     }
1142     /* Else fall through */
1143    
1144     default:
1145     *errorptr = ERR12;
1146     goto FAILED;
1147     }
1148     }
1149    
1150     /* Else we have a referencing group */
1151    
1152     else
1153     {
1154     if (++(*brackets) > EXTRACT_MAX)
1155     {
1156     *errorptr = ERR13;
1157     goto FAILED;
1158     }
1159     bravalue = OP_BRA + *brackets;
1160     }
1161    
1162     /* Process nested bracketed re; at end pointer is on the bracket. We copy
1163     code into a non-register variable in order to be able to pass its address
1164     because some compilers complain otherwise. */
1165    
1166     *code = bravalue;
1167     {
1168     uschar *mcode = code;
1169     if (!compile_regex(options, brackets, &mcode, &ptr, errorptr))
1170     goto FAILED;
1171     code = mcode;
1172     }
1173    
1174     if (*ptr != ')')
1175     {
1176     *errorptr = ERR14;
1177     goto FAILED;
1178     }
1179     break;
1180    
1181     /* Check \ for being a real metacharacter; if not, fall through and handle
1182     it as a data character at the start of a string. Escape items are checked
1183     for validity in the pre-compiling pass. */
1184    
1185     case '\\':
1186     oldptr = ptr;
1187     c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
1188    
1189     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1190     are arranged to be the negation of the corresponding OP_values. For the
1191     back references, the values are ESC_REF plus the reference number. Only
1192     back references and those types that consume a character may be repeated.
1193     We can test for values between ESC_b and ESC_Z for the latter; this may
1194     have to change if any new ones are ever created. */
1195    
1196     if (c < 0)
1197     {
1198     if (-c >= ESC_REF)
1199     {
1200     int refnum = -c - ESC_REF;
1201     if (*brackets < refnum)
1202     {
1203     *errorptr = ERR15;
1204     goto FAILED;
1205     }
1206     previous = code;
1207     *code++ = OP_REF;
1208     *code++ = refnum;
1209     }
1210     else
1211     {
1212     previous = (-c > ESC_b && -c < ESC_X)? code : NULL;
1213     *code++ = -c;
1214     }
1215     continue;
1216     }
1217    
1218 nigel 7 /* Data character: reset and fall through */
1219 nigel 3
1220     ptr = oldptr;
1221     c = '\\';
1222    
1223     /* Handle a run of data characters until a metacharacter is encountered.
1224     The first character is guaranteed not to be whitespace or # when the
1225     extended flag is set. */
1226    
1227     NORMAL_CHAR:
1228     default:
1229     previous = code;
1230     *code = OP_CHARS;
1231     code += 2;
1232     length = 0;
1233    
1234     do
1235     {
1236     if ((options & PCRE_EXTENDED) != 0)
1237     {
1238     if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1239     if (c == '#')
1240     {
1241     while ((c = *(++ptr)) != 0 && c != '\n');
1242     if (c == 0) break;
1243     continue;
1244     }
1245     }
1246    
1247     /* Backslash may introduce a data char or a metacharacter. Escaped items
1248     are checked for validity in the pre-compiling pass. Stop the string
1249     before a metaitem. */
1250    
1251     if (c == '\\')
1252     {
1253     oldptr = ptr;
1254     c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
1255     if (c < 0) { ptr = oldptr; break; }
1256     }
1257    
1258     /* Ordinary character or single-char escape */
1259    
1260     *code++ = c;
1261     length++;
1262     }
1263    
1264     /* This "while" is the end of the "do" above. */
1265    
1266     while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
1267    
1268     /* Compute the length and set it in the data vector, and advance to
1269     the next state. */
1270    
1271     previous[1] = length;
1272     ptr--;
1273     break;
1274     }
1275     } /* end of big loop */
1276    
1277     /* Control never reaches here by falling through, only by a goto for all the
1278     error states. Pass back the position in the pattern so that it can be displayed
1279     to the user for diagnosing the error. */
1280    
1281     FAILED:
1282     *ptrptr = ptr;
1283     return FALSE;
1284     }
1285    
1286    
1287    
1288    
1289     /*************************************************
1290     * Compile sequence of alternatives *
1291     *************************************************/
1292    
1293     /* On entry, ptr is pointing past the bracket character, but on return
1294     it points to the closing bracket, or vertical bar, or end of string.
1295     The code variable is pointing at the byte into which the BRA operator has been
1296     stored.
1297    
1298     Argument:
1299     options the option bits
1300     brackets -> int containing the number of extracting brackets used
1301     codeptr -> the address of the current code pointer
1302     ptrptr -> the address of the current pattern pointer
1303     errorptr -> pointer to error message
1304    
1305     Returns: TRUE on success
1306     */
1307    
1308     static BOOL
1309 nigel 7 compile_regex(int options, int *brackets, uschar **codeptr,
1310     const uschar **ptrptr, const char **errorptr)
1311 nigel 3 {
1312 nigel 7 const uschar *ptr = *ptrptr;
1313 nigel 3 uschar *code = *codeptr;
1314     uschar *start_bracket = code;
1315    
1316     for (;;)
1317     {
1318     int length;
1319     uschar *last_branch = code;
1320    
1321     code += 3;
1322     if (!compile_branch(options, brackets, &code, &ptr, errorptr))
1323     {
1324     *ptrptr = ptr;
1325     return FALSE;
1326     }
1327    
1328     /* Fill in the length of the last branch */
1329    
1330     length = code - last_branch;
1331     last_branch[1] = length >> 8;
1332     last_branch[2] = length & 255;
1333    
1334     /* Reached end of expression, either ')' or end of pattern. Insert a
1335     terminating ket and the length of the whole bracketed item, and return,
1336     leaving the pointer at the terminating char. */
1337    
1338     if (*ptr != '|')
1339     {
1340     length = code - start_bracket;
1341     *code++ = OP_KET;
1342     *code++ = length >> 8;
1343     *code++ = length & 255;
1344     *codeptr = code;
1345     *ptrptr = ptr;
1346     return TRUE;
1347     }
1348    
1349     /* Another branch follows; insert an "or" node and advance the pointer. */
1350    
1351     *code = OP_ALT;
1352     ptr++;
1353     }
1354     /* Control never reaches here */
1355     }
1356    
1357    
1358    
1359     /*************************************************
1360     * Check for anchored expression *
1361     *************************************************/
1362    
1363     /* Try to find out if this is an anchored regular expression. Consider each
1364     alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
1365     all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
1366     it's anchored. However, if this is a multiline pattern, then only OP_SOD
1367     counts, since OP_CIRC can match in the middle.
1368    
1369     A branch is also implicitly anchored if it starts with .* because that will try
1370     the rest of the pattern at all possible matching points, so there is no point
1371     trying them again.
1372    
1373     Argument: points to start of expression (the bracket)
1374     Returns: TRUE or FALSE
1375     */
1376    
1377     static BOOL
1378 nigel 7 is_anchored(register const uschar *code, BOOL multiline)
1379 nigel 3 {
1380     do {
1381     int op = (int)code[3];
1382     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE)
1383     { if (!is_anchored(code+3, multiline)) return FALSE; }
1384     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
1385     { if (code[4] != OP_ANY) return FALSE; }
1386     else if (op != OP_SOD && (multiline || op != OP_CIRC)) return FALSE;
1387     code += (code[1] << 8) + code[2];
1388     }
1389     while (*code == OP_ALT);
1390     return TRUE;
1391     }
1392    
1393    
1394    
1395     /*************************************************
1396     * Check for start with \n line expression *
1397     *************************************************/
1398    
1399     /* This is called for multiline expressions to try to find out if every branch
1400     starts with ^ so that "first char" processing can be done to speed things up.
1401    
1402     Argument: points to start of expression (the bracket)
1403     Returns: TRUE or FALSE
1404     */
1405    
1406     static BOOL
1407 nigel 7 is_startline(const uschar *code)
1408 nigel 3 {
1409     do {
1410     if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
1411     { if (!is_startline(code+3)) return FALSE; }
1412     else if (code[3] != OP_CIRC) return FALSE;
1413     code += (code[1] << 8) + code[2];
1414     }
1415     while (*code == OP_ALT);
1416     return TRUE;
1417     }
1418    
1419    
1420    
1421     /*************************************************
1422     * Check for fixed first char *
1423     *************************************************/
1424    
1425     /* Try to find out if there is a fixed first character. This is called for
1426     unanchored expressions, as it speeds up their processing quite considerably.
1427     Consider each alternative branch. If they all start with the same char, or with
1428     a bracket all of whose alternatives start with the same char (recurse ad lib),
1429     then we return that char, otherwise -1.
1430    
1431     Argument: points to start of expression (the bracket)
1432     Returns: -1 or the fixed first char
1433     */
1434    
1435     static int
1436     find_firstchar(uschar *code)
1437     {
1438     register int c = -1;
1439     do
1440     {
1441     register int charoffset = 4;
1442    
1443     if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
1444     {
1445     register int d;
1446     if ((d = find_firstchar(code+3)) < 0) return -1;
1447     if (c < 0) c = d; else if (c != d) return -1;
1448     }
1449    
1450     else switch(code[3])
1451     {
1452     default:
1453     return -1;
1454    
1455     case OP_EXACT: /* Fall through */
1456     charoffset++;
1457    
1458     case OP_CHARS: /* Fall through */
1459     charoffset++;
1460    
1461     case OP_PLUS:
1462     case OP_MINPLUS:
1463     if (c < 0) c = code[charoffset]; else if (c != code[charoffset]) return -1;
1464     break;
1465     }
1466     code += (code[1] << 8) + code[2];
1467     }
1468     while (*code == OP_ALT);
1469     return c;
1470     }
1471    
1472    
1473    
1474     /*************************************************
1475     * Compile a Regular Expression *
1476     *************************************************/
1477    
1478     /* This function takes a string and returns a pointer to a block of store
1479     holding a compiled version of the expression.
1480    
1481     Arguments:
1482     pattern the regular expression
1483     options various option bits
1484     errorptr pointer to pointer to error text
1485     erroroffset ptr offset in pattern where error was detected
1486    
1487     Returns: pointer to compiled data block, or NULL on error,
1488     with errorptr and erroroffset set
1489     */
1490    
1491     pcre *
1492 nigel 7 pcre_compile(const char *pattern, int options, const char **errorptr,
1493 nigel 3 int *erroroffset)
1494     {
1495     real_pcre *re;
1496     int spaces = 0;
1497     int length = 3; /* For initial BRA plus length */
1498     int runlength;
1499     int c, size;
1500     int bracount = 0;
1501     int brastack[200];
1502     int top_backref = 0;
1503 nigel 7 unsigned int brastackptr = 0;
1504     uschar *code;
1505     const uschar *ptr;
1506 nigel 3
1507     #ifdef DEBUG
1508     uschar *code_base, *code_end;
1509     #endif
1510    
1511     /* We can't pass back an error message if errorptr is NULL; I guess the best we
1512     can do is just return NULL. */
1513    
1514     if (errorptr == NULL) return NULL;
1515     *errorptr = NULL;
1516    
1517     /* However, we can give a message for this error */
1518    
1519     if (erroroffset == NULL)
1520     {
1521     *errorptr = ERR16;
1522     return NULL;
1523     }
1524     *erroroffset = 0;
1525    
1526     if ((options & ~PUBLIC_OPTIONS) != 0)
1527     {
1528     *errorptr = ERR17;
1529     return NULL;
1530     }
1531    
1532     #ifdef DEBUG
1533     printf("------------------------------------------------------------------\n");
1534     printf("%s\n", pattern);
1535     #endif
1536    
1537     /* The first thing to do is to make a pass over the pattern to compute the
1538     amount of store required to hold the compiled code. This does not have to be
1539     perfect as long as errors are overestimates. At the same time we can detect any
1540     internal flag settings. Make an attempt to correct for any counted white space
1541     if an "extended" flag setting appears late in the pattern. We can't be so
1542     clever for #-comments. */
1543    
1544 nigel 7 ptr = (const uschar *)(pattern - 1);
1545 nigel 3 while ((c = *(++ptr)) != 0)
1546     {
1547     int min, max;
1548     int class_charcount;
1549    
1550     if ((pcre_ctypes[c] & ctype_space) != 0)
1551     {
1552     if ((options & PCRE_EXTENDED) != 0) continue;
1553     spaces++;
1554     }
1555    
1556     if (c == '#' && (options & PCRE_EXTENDED) != 0)
1557     {
1558     while ((c = *(++ptr)) != 0 && c != '\n');
1559     continue;
1560     }
1561    
1562     switch(c)
1563     {
1564     /* A backslashed item may be an escaped "normal" character or a
1565     character type. For a "normal" character, put the pointers and
1566     character back so that tests for whitespace etc. in the input
1567     are done correctly. */
1568    
1569     case '\\':
1570     {
1571 nigel 7 const uschar *save_ptr = ptr;
1572 nigel 3 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
1573     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1574     if (c >= 0)
1575     {
1576     ptr = save_ptr;
1577     c = '\\';
1578     goto NORMAL_CHAR;
1579     }
1580     }
1581     length++;
1582    
1583     /* A back reference needs an additional char, plus either one or 5
1584     bytes for a repeat. We also need to keep the value of the highest
1585     back reference. */
1586    
1587     if (c <= -ESC_REF)
1588     {
1589     int refnum = -c - ESC_REF;
1590     if (refnum > top_backref) top_backref = refnum;
1591     length++; /* For single back reference */
1592     if (ptr[1] == '{' && is_counted_repeat(ptr+2))
1593     {
1594     ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
1595     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1596     if ((min == 0 && (max == 1 || max == -1)) ||
1597     (min == 1 && max == -1))
1598     length++;
1599     else length += 5;
1600     if (ptr[1] == '?') ptr++;
1601     }
1602     }
1603     continue;
1604    
1605     case '^':
1606     case '.':
1607     case '$':
1608     case '*': /* These repeats won't be after brackets; */
1609     case '+': /* those are handled separately */
1610     case '?':
1611     length++;
1612     continue;
1613    
1614     /* This covers the cases of repeats after a single char, metachar, class,
1615     or back reference. */
1616    
1617     case '{':
1618     if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
1619     ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
1620     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1621     if ((min == 0 && (max == 1 || max == -1)) ||
1622     (min == 1 && max == -1))
1623     length++;
1624     else
1625     {
1626     length--; /* Uncount the original char or metachar */
1627     if (min == 1) length++; else if (min > 0) length += 4;
1628     if (max > 0) length += 4; else length += 2;
1629     }
1630     if (ptr[1] == '?') ptr++;
1631     continue;
1632    
1633     /* An alternation contains an offset to the next branch or ket. */
1634     case '|':
1635     length += 3;
1636     continue;
1637    
1638     /* A character class uses 33 characters. Don't worry about character types
1639     that aren't allowed in classes - they'll get picked up during the compile.
1640     A character class that contains only one character uses 2 or 3 bytes,
1641     depending on whether it is negated or not. Notice this where we can. */
1642    
1643     case '[':
1644     class_charcount = 0;
1645     if (*(++ptr) == '^') ptr++;
1646     do
1647     {
1648     if (*ptr == '\\')
1649     {
1650     int c = check_escape(&ptr, errorptr, bracount, options, TRUE);
1651     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1652     if (-c == ESC_b) class_charcount++; else class_charcount = 10;
1653     }
1654     else class_charcount++;
1655     ptr++;
1656     }
1657     while (*ptr != 0 && *ptr != ']');
1658    
1659     /* Repeats for negated single chars are handled by the general code */
1660    
1661     if (class_charcount == 1) length += 3; else
1662     {
1663     length += 33;
1664    
1665     /* A repeat needs either 1 or 5 bytes. */
1666    
1667     if (ptr[1] == '{' && is_counted_repeat(ptr+2))
1668     {
1669     ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
1670     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1671     if ((min == 0 && (max == 1 || max == -1)) ||
1672     (min == 1 && max == -1))
1673     length++;
1674     else length += 5;
1675     if (ptr[1] == '?') ptr++;
1676     }
1677     }
1678     continue;
1679    
1680     /* Brackets may be genuine groups or special things */
1681    
1682     case '(':
1683    
1684     /* Handle special forms of bracket, which all start (? */
1685    
1686     if (ptr[1] == '?') switch (c = ptr[2])
1687     {
1688     /* Skip over comments entirely */
1689     case '#':
1690     ptr += 3;
1691     while (*ptr != 0 && *ptr != ')') ptr++;
1692     if (*ptr == 0)
1693     {
1694     *errorptr = ERR18;
1695     goto PCRE_ERROR_RETURN;
1696     }
1697     continue;
1698    
1699     /* Non-referencing groups and lookaheads just move the pointer on, and
1700     then behave like a non-special bracket, except that they don't increment
1701     the count of extracting brackets. */
1702    
1703     case ':':
1704     case '=':
1705     case '!':
1706     ptr += 2;
1707     break;
1708    
1709     /* Ditto for the "once only" bracket, allowed only if the extra bit
1710     is set. */
1711    
1712     case '>':
1713     if ((options & PCRE_EXTRA) != 0)
1714     {
1715     ptr += 2;
1716     break;
1717     }
1718     /* Else fall thourh */
1719    
1720     /* Else loop setting valid options until ) is met. Anything else is an
1721     error. */
1722    
1723     default:
1724     ptr += 2;
1725     for (;; ptr++)
1726     {
1727     if ((c = *ptr) == 'i')
1728     {
1729     options |= PCRE_CASELESS;
1730     continue;
1731     }
1732     else if ((c = *ptr) == 'm')
1733     {
1734     options |= PCRE_MULTILINE;
1735     continue;
1736     }
1737     else if (c == 's')
1738     {
1739     options |= PCRE_DOTALL;
1740     continue;
1741     }
1742     else if (c == 'x')
1743     {
1744     options |= PCRE_EXTENDED;
1745     length -= spaces; /* Already counted spaces */
1746     continue;
1747     }
1748     else if (c == ')') break;
1749    
1750     *errorptr = ERR12;
1751     goto PCRE_ERROR_RETURN;
1752     }
1753     continue; /* End of this bracket handling */
1754     }
1755    
1756     /* Extracting brackets must be counted so we can process escapes in a
1757     Perlish way. */
1758    
1759     else bracount++;
1760    
1761     /* Non-special forms of bracket. Save length for computing whole length
1762     at end if there's a repeat that requires duplication of the group. */
1763    
1764     if (brastackptr >= sizeof(brastack)/sizeof(int))
1765     {
1766     *errorptr = ERR19;
1767     goto PCRE_ERROR_RETURN;
1768     }
1769    
1770     brastack[brastackptr++] = length;
1771     length += 3;
1772     continue;
1773    
1774     /* Handle ket. Look for subsequent max/min; for certain sets of values we
1775     have to replicate this bracket up to that many times. */
1776    
1777     case ')':
1778     length += 3;
1779     {
1780     int min = 1;
1781     int max = 1;
1782     int duplength = length - brastack[--brastackptr];
1783    
1784     /* Leave ptr at the final char; for read_repeat_counts this happens
1785     automatically; for the others we need an increment. */
1786    
1787     if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
1788     {
1789     ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
1790     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1791     }
1792     else if (c == '*') { min = 0; max = -1; ptr++; }
1793     else if (c == '+') { max = -1; ptr++; }
1794     else if (c == '?') { min = 0; ptr++; }
1795    
1796     /* If there is a minimum > 1 we have to replicate up to min-1 times; if
1797     there is a limited maximum we have to replicate up to max-1 times and
1798     allow for a BRAZERO item before each optional copy, as we also have to
1799     do before the first copy if the minimum is zero. */
1800    
1801     if (min == 0) length++;
1802     else if (min > 1) length += (min - 1) * duplength;
1803     if (max > min) length += (max - min) * (duplength + 1);
1804     }
1805    
1806     continue;
1807    
1808     /* Non-special character. For a run of such characters the length required
1809     is the number of characters + 2, except that the maximum run length is 255.
1810     We won't get a skipped space or a non-data escape or the start of a #
1811     comment as the first character, so the length can't be zero. */
1812    
1813     NORMAL_CHAR:
1814     default:
1815     length += 2;
1816     runlength = 0;
1817     do
1818     {
1819     if ((pcre_ctypes[c] & ctype_space) != 0)
1820     {
1821     if ((options & PCRE_EXTENDED) != 0) continue;
1822     spaces++;
1823     }
1824    
1825     if (c == '#' && (options & PCRE_EXTENDED) != 0)
1826     {
1827     while ((c = *(++ptr)) != 0 && c != '\n');
1828     continue;
1829     }
1830    
1831     /* Backslash may introduce a data char or a metacharacter; stop the
1832     string before the latter. */
1833    
1834     if (c == '\\')
1835     {
1836 nigel 7 const uschar *saveptr = ptr;
1837 nigel 3 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
1838     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1839     if (c < 0) { ptr = saveptr; break; }
1840     }
1841    
1842     /* Ordinary character or single-char escape */
1843    
1844     runlength++;
1845     }
1846    
1847     /* This "while" is the end of the "do" above. */
1848    
1849     while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
1850    
1851     ptr--;
1852     length += runlength;
1853     continue;
1854     }
1855     }
1856    
1857     length += 4; /* For final KET and END */
1858    
1859     if (length > 65539)
1860     {
1861     *errorptr = ERR20;
1862     return NULL;
1863     }
1864    
1865     /* Compute the size of data block needed and get it, either from malloc or
1866     externally provided function. Put in the magic number and the options. */
1867    
1868     size = length + offsetof(real_pcre, code);
1869     re = (real_pcre *)(pcre_malloc)(size);
1870    
1871     if (re == NULL)
1872     {
1873     *errorptr = ERR21;
1874     return NULL;
1875     }
1876    
1877     re->magic_number = MAGIC_NUMBER;
1878     re->options = options;
1879    
1880     /* Set up a starting, non-extracting bracket, then compile the expression. On
1881     error, *errorptr will be set non-NULL, so we don't need to look at the result
1882     of the function here. */
1883    
1884 nigel 7 ptr = (const uschar *)pattern;
1885 nigel 3 code = re->code;
1886     *code = OP_BRA;
1887     bracount = 0;
1888     (void)compile_regex(options, &bracount, &code, &ptr, errorptr);
1889     re->top_bracket = bracount;
1890     re->top_backref = top_backref;
1891    
1892     /* If not reached end of pattern on success, there's an excess bracket. */
1893    
1894     if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
1895    
1896     /* Fill in the terminating state and check for disastrous overflow, but
1897     if debugging, leave the test till after things are printed out. */
1898    
1899     *code++ = OP_END;
1900    
1901     #ifndef DEBUG
1902     if (code - re->code > length) *errorptr = ERR23;
1903     #endif
1904    
1905     /* Failed to compile */
1906    
1907     if (*errorptr != NULL)
1908     {
1909     (pcre_free)(re);
1910     PCRE_ERROR_RETURN:
1911 nigel 7 *erroroffset = ptr - (const uschar *)pattern;
1912 nigel 3 return NULL;
1913     }
1914    
1915     /* If the anchored option was not passed, set flag if we can determine that it
1916     is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
1917     we can determine what the first character has to be, because that speeds up
1918     unanchored matches no end. In the case of multiline matches, an alternative is
1919     to set the PCRE_STARTLINE flag if all branches start with ^. */
1920    
1921     if ((options & PCRE_ANCHORED) == 0)
1922     {
1923     if (is_anchored(re->code, (options & PCRE_MULTILINE) != 0))
1924     re->options |= PCRE_ANCHORED;
1925     else
1926     {
1927     int c = find_firstchar(re->code);
1928     if (c >= 0)
1929     {
1930     re->first_char = c;
1931     re->options |= PCRE_FIRSTSET;
1932     }
1933     else if (is_startline(re->code))
1934     re->options |= PCRE_STARTLINE;
1935     }
1936     }
1937    
1938     /* Print out the compiled data for debugging */
1939    
1940     #ifdef DEBUG
1941    
1942     printf("Length = %d top_bracket = %d top_backref=%d\n",
1943     length, re->top_bracket, re->top_backref);
1944    
1945     if (re->options != 0)
1946     {
1947     printf("%s%s%s%s%s%s%s\n",
1948     ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
1949     ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
1950     ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
1951     ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
1952     ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
1953     ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
1954     ((re->options & PCRE_EXTRA) != 0)? "extra " : "");
1955     }
1956    
1957     if ((re->options & PCRE_FIRSTSET) != 0)
1958     {
1959     if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
1960     else printf("First char = \\x%02x\n", re->first_char);
1961     }
1962    
1963     code_end = code;
1964     code_base = code = re->code;
1965    
1966     while (code < code_end)
1967     {
1968     int charlength;
1969    
1970     printf("%3d ", code - code_base);
1971    
1972     if (*code >= OP_BRA)
1973     {
1974     printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
1975     code += 2;
1976     }
1977    
1978     else switch(*code)
1979     {
1980     case OP_CHARS:
1981     charlength = *(++code);
1982     printf("%3d ", charlength);
1983     while (charlength-- > 0)
1984     if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
1985     break;
1986    
1987     case OP_KETRMAX:
1988     case OP_KETRMIN:
1989     case OP_ALT:
1990     case OP_KET:
1991     case OP_ASSERT:
1992     case OP_ASSERT_NOT:
1993     case OP_ONCE:
1994     printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
1995     code += 2;
1996     break;
1997    
1998     case OP_STAR:
1999     case OP_MINSTAR:
2000     case OP_PLUS:
2001     case OP_MINPLUS:
2002     case OP_QUERY:
2003     case OP_MINQUERY:
2004     case OP_TYPESTAR:
2005     case OP_TYPEMINSTAR:
2006     case OP_TYPEPLUS:
2007     case OP_TYPEMINPLUS:
2008     case OP_TYPEQUERY:
2009     case OP_TYPEMINQUERY:
2010     if (*code >= OP_TYPESTAR)
2011     printf(" %s", OP_names[code[1]]);
2012     else if (isprint(c = code[1])) printf(" %c", c);
2013     else printf(" \\x%02x", c);
2014     printf("%s", OP_names[*code++]);
2015     break;
2016    
2017     case OP_EXACT:
2018     case OP_UPTO:
2019     case OP_MINUPTO:
2020     if (isprint(c = code[3])) printf(" %c{", c);
2021     else printf(" \\x%02x{", c);
2022     if (*code != OP_EXACT) printf(",");
2023     printf("%d}", (code[1] << 8) + code[2]);
2024     if (*code == OP_MINUPTO) printf("?");
2025     code += 3;
2026     break;
2027    
2028     case OP_TYPEEXACT:
2029     case OP_TYPEUPTO:
2030     case OP_TYPEMINUPTO:
2031     printf(" %s{", OP_names[code[3]]);
2032     if (*code != OP_TYPEEXACT) printf(",");
2033     printf("%d}", (code[1] << 8) + code[2]);
2034     if (*code == OP_TYPEMINUPTO) printf("?");
2035     code += 3;
2036     break;
2037    
2038     case OP_NOT:
2039     if (isprint(c = *(++code))) printf(" [^%c]", c);
2040     else printf(" [^\\x%02x]", c);
2041     break;
2042    
2043     case OP_NOTSTAR:
2044     case OP_NOTMINSTAR:
2045     case OP_NOTPLUS:
2046     case OP_NOTMINPLUS:
2047     case OP_NOTQUERY:
2048     case OP_NOTMINQUERY:
2049     if (isprint(c = code[1])) printf(" [^%c]", c);
2050     else printf(" [^\\x%02x]", c);
2051     printf("%s", OP_names[*code++]);
2052     break;
2053    
2054     case OP_NOTEXACT:
2055     case OP_NOTUPTO:
2056     case OP_NOTMINUPTO:
2057     if (isprint(c = code[3])) printf(" [^%c]{", c);
2058     else printf(" [^\\x%02x]{", c);
2059     if (*code != OP_NOTEXACT) printf(",");
2060     printf("%d}", (code[1] << 8) + code[2]);
2061     if (*code == OP_NOTMINUPTO) printf("?");
2062     code += 3;
2063     break;
2064    
2065     case OP_REF:
2066     printf(" \\%d", *(++code));
2067     break;
2068    
2069     case OP_CLASS:
2070     {
2071     int i, min, max;
2072    
2073     code++;
2074     printf(" [");
2075    
2076     for (i = 0; i < 256; i++)
2077     {
2078     if ((code[i/8] & (1 << (i&7))) != 0)
2079     {
2080     int j;
2081     for (j = i+1; j < 256; j++)
2082     if ((code[j/8] & (1 << (j&7))) == 0) break;
2083     if (i == '-' || i == ']') printf("\\");
2084     if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2085     if (--j > i)
2086     {
2087     printf("-");
2088     if (j == '-' || j == ']') printf("\\");
2089     if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2090     }
2091     i = j;
2092     }
2093     }
2094     printf("]");
2095     code += 32;
2096    
2097     switch(*code)
2098     {
2099     case OP_CRSTAR:
2100     case OP_CRMINSTAR:
2101     case OP_CRPLUS:
2102     case OP_CRMINPLUS:
2103     case OP_CRQUERY:
2104     case OP_CRMINQUERY:
2105     printf("%s", OP_names[*code]);
2106     break;
2107    
2108     case OP_CRRANGE:
2109     case OP_CRMINRANGE:
2110     min = (code[1] << 8) + code[2];
2111     max = (code[3] << 8) + code[4];
2112     if (max == 0) printf("{%d,}", min);
2113     else printf("{%d,%d}", min, max);
2114     if (*code == OP_CRMINRANGE) printf("?");
2115     code += 4;
2116     break;
2117    
2118     default:
2119     code--;
2120     }
2121     }
2122     break;
2123    
2124     /* Anything else is just a one-node item */
2125    
2126     default:
2127     printf(" %s", OP_names[*code]);
2128     break;
2129     }
2130    
2131     code++;
2132     printf("\n");
2133     }
2134     printf("------------------------------------------------------------------\n");
2135    
2136     /* This check is done here in the debugging case so that the code that
2137     was compiled can be seen. */
2138    
2139     if (code - re->code > length)
2140     {
2141     *errorptr = ERR23;
2142     (pcre_free)(re);
2143     *erroroffset = ptr - (uschar *)pattern;
2144     return NULL;
2145     }
2146     #endif
2147    
2148     return (pcre *)re;
2149     }
2150    
2151    
2152    
2153     /*************************************************
2154     * Match a character type *
2155     *************************************************/
2156    
2157     /* Not used in all the places it might be as it's sometimes faster
2158     to put the code inline.
2159    
2160     Arguments:
2161     type the character type
2162     c the character
2163     dotall the dotall flag
2164    
2165     Returns: TRUE if character is of the type
2166     */
2167    
2168     static BOOL
2169     match_type(int type, int c, BOOL dotall)
2170     {
2171    
2172     #ifdef DEBUG
2173     if (isprint(c)) printf("matching subject %c against ", c);
2174     else printf("matching subject \\x%02x against ", c);
2175     printf("%s\n", OP_names[type]);
2176     #endif
2177    
2178     switch(type)
2179     {
2180     case OP_ANY: return dotall || c != '\n';
2181     case OP_NOT_DIGIT: return (pcre_ctypes[c] & ctype_digit) == 0;
2182     case OP_DIGIT: return (pcre_ctypes[c] & ctype_digit) != 0;
2183     case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;
2184     case OP_WHITESPACE: return (pcre_ctypes[c] & ctype_space) != 0;
2185     case OP_NOT_WORDCHAR: return (pcre_ctypes[c] & ctype_word) == 0;
2186     case OP_WORDCHAR: return (pcre_ctypes[c] & ctype_word) != 0;
2187     }
2188     return FALSE;
2189     }
2190    
2191    
2192    
2193     /*************************************************
2194     * Match a back-reference *
2195     *************************************************/
2196    
2197     /* If a back reference hasn't been set, the match fails.
2198    
2199     Arguments:
2200     number reference number
2201     eptr points into the subject
2202     length length to be matched
2203     md points to match data block
2204    
2205     Returns: TRUE if matched
2206     */
2207    
2208     static BOOL
2209 nigel 7 match_ref(int number, register const uschar *eptr, int length, match_data *md)
2210 nigel 3 {
2211 nigel 7 const uschar *p = md->start_subject + md->offset_vector[number];
2212 nigel 3
2213     #ifdef DEBUG
2214     if (eptr >= md->end_subject)
2215     printf("matching subject <null>");
2216     else
2217     {
2218     printf("matching subject ");
2219     pchars(eptr, length, TRUE, md);
2220     }
2221     printf(" against backref ");
2222     pchars(p, length, FALSE, md);
2223     printf("\n");
2224     #endif
2225    
2226     /* Always fail if not enough characters left */
2227    
2228     if (length > md->end_subject - p) return FALSE;
2229    
2230     /* Separate the caselesss case for speed */
2231    
2232     if (md->caseless)
2233     { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; }
2234     else
2235     { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
2236    
2237     return TRUE;
2238     }
2239    
2240    
2241    
2242     /*************************************************
2243     * Match from current position *
2244     *************************************************/
2245    
2246     /* On entry ecode points to the first opcode, and eptr to the first character.
2247    
2248     Arguments:
2249     eptr pointer in subject
2250     ecode position in code
2251     offset_top current top pointer
2252     md pointer to "static" info for the match
2253    
2254     Returns: TRUE if matched
2255     */
2256    
2257     static BOOL
2258 nigel 7 match(register const uschar *eptr, register const uschar *ecode, int offset_top,
2259 nigel 3 match_data *md)
2260     {
2261     for (;;)
2262     {
2263     int min, max, ctype;
2264     register int i;
2265     register int c;
2266 nigel 7 BOOL minimize = FALSE;
2267 nigel 3
2268     /* Opening bracket. Check the alternative branches in turn, failing if none
2269     match. We have to set the start offset if required and there is space
2270     in the offset vector so that it is available for subsequent back references
2271     if the bracket matches. However, if the bracket fails, we must put back the
2272     previous value of both offsets in case they were set by a previous copy of
2273     the same bracket. Don't worry about setting the flag for the error case here;
2274     that is handled in the code for KET. */
2275    
2276     if ((int)*ecode >= OP_BRA)
2277     {
2278     int number = (*ecode - OP_BRA) << 1;
2279 nigel 7 int save_offset1 = 0, save_offset2 = 0;
2280 nigel 3
2281     #ifdef DEBUG
2282     printf("start bracket %d\n", number/2);
2283     #endif
2284    
2285     if (number > 0 && number < md->offset_end)
2286     {
2287     save_offset1 = md->offset_vector[number];
2288     save_offset2 = md->offset_vector[number+1];
2289     md->offset_vector[number] = eptr - md->start_subject;
2290    
2291     #ifdef DEBUG
2292     printf("saving %d %d\n", save_offset1, save_offset2);
2293     #endif
2294     }
2295    
2296     /* Recurse for all the alternatives. */
2297    
2298     do
2299     {
2300     if (match(eptr, ecode+3, offset_top, md)) return TRUE;
2301     ecode += (ecode[1] << 8) + ecode[2];
2302     }
2303     while (*ecode == OP_ALT);
2304    
2305     #ifdef DEBUG
2306     printf("bracket %d failed\n", number/2);
2307     #endif
2308    
2309     if (number > 0 && number < md->offset_end)
2310     {
2311     md->offset_vector[number] = save_offset1;
2312     md->offset_vector[number+1] = save_offset2;
2313     }
2314    
2315     return FALSE;
2316     }
2317    
2318     /* Other types of node can be handled by a switch */
2319    
2320     switch(*ecode)
2321     {
2322     case OP_END:
2323     md->end_match_ptr = eptr; /* Record where we ended */
2324     md->end_offset_top = offset_top; /* and how many extracts were taken */
2325     return TRUE;
2326    
2327     /* The equivalent of Prolog's "cut" - if the rest doesn't match, the
2328     whole thing doesn't match, so we have to get out via a longjmp(). */
2329    
2330     case OP_CUT:
2331     if (match(eptr, ecode+1, offset_top, md)) return TRUE;
2332     longjmp(md->fail_env, 1);
2333    
2334     /* Assertion brackets. Check the alternative branches in turn - the
2335     matching won't pass the KET for an assertion. If any one branch matches,
2336     the assertion is true. */
2337    
2338     case OP_ASSERT:
2339     do
2340     {
2341     if (match(eptr, ecode+3, offset_top, md)) break;
2342     ecode += (ecode[1] << 8) + ecode[2];
2343     }
2344     while (*ecode == OP_ALT);
2345     if (*ecode == OP_KET) return FALSE;
2346    
2347     /* Continue from after the assertion, updating the offsets high water
2348     mark, since extracts may have been taken during the assertion. */
2349    
2350     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2351     ecode += 3;
2352     offset_top = md->end_offset_top;
2353     continue;
2354    
2355     /* Negative assertion: all branches must fail to match */
2356    
2357     case OP_ASSERT_NOT:
2358     do
2359     {
2360     if (match(eptr, ecode+3, offset_top, md)) return FALSE;
2361     ecode += (ecode[1] << 8) + ecode[2];
2362     }
2363     while (*ecode == OP_ALT);
2364     ecode += 3;
2365     continue;
2366    
2367     /* "Once" brackets are like assertion brackets except that after a match,
2368     the point in the subject string is not moved back. Thus there can never be
2369 nigel 5 a move back into the brackets. Check the alternative branches in turn - the
2370 nigel 3 matching won't pass the KET for this kind of subpattern. If any one branch
2371     matches, we carry on, leaving the subject pointer. */
2372    
2373     case OP_ONCE:
2374     do
2375     {
2376     if (match(eptr, ecode+3, offset_top, md)) break;
2377     ecode += (ecode[1] << 8) + ecode[2];
2378     }
2379     while (*ecode == OP_ALT);
2380     if (*ecode == OP_KET) return FALSE;
2381    
2382     /* Continue as from after the assertion, updating the offsets high water
2383     mark, since extracts may have been taken. */
2384    
2385     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2386     ecode += 3;
2387     offset_top = md->end_offset_top;
2388     eptr = md->end_match_ptr;
2389     continue;
2390    
2391     /* An alternation is the end of a branch; scan along to find the end of the
2392     bracketed group and go to there. */
2393    
2394     case OP_ALT:
2395     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2396     break;
2397    
2398     /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
2399     that it may occur zero times. It may repeat infinitely, or not at all -
2400     i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
2401     repeat limits are compiled as a number of copies, with the optional ones
2402     preceded by BRAZERO or BRAMINZERO. */
2403    
2404     case OP_BRAZERO:
2405     {
2406 nigel 7 const uschar *next = ecode+1;
2407 nigel 3 if (match(eptr, next, offset_top, md)) return TRUE;
2408     do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
2409     ecode = next + 3;
2410     }
2411     break;
2412    
2413     case OP_BRAMINZERO:
2414     {
2415 nigel 7 const uschar *next = ecode+1;
2416 nigel 3 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
2417     if (match(eptr, next+3, offset_top, md)) return TRUE;
2418     ecode++;
2419     }
2420     break;;
2421    
2422     /* End of a group, repeated or non-repeating. If we are at the end of
2423     an assertion "group", stop matching and return TRUE, but record the
2424     current high water mark for use by positive assertions. */
2425    
2426     case OP_KET:
2427     case OP_KETRMIN:
2428     case OP_KETRMAX:
2429     {
2430     int number;
2431 nigel 7 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
2432 nigel 3
2433     if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || *prev == OP_ONCE)
2434     {
2435     md->end_match_ptr = eptr; /* For ONCE */
2436     md->end_offset_top = offset_top;
2437     return TRUE;
2438     }
2439    
2440     /* In all other cases we have to check the group number back at the
2441     start and if necessary complete handling an extraction by setting the
2442     final offset and bumping the high water mark. */
2443    
2444     number = (*prev - OP_BRA) << 1;
2445    
2446     #ifdef DEBUG
2447     printf("end bracket %d\n", number/2);
2448     #endif
2449    
2450     if (number > 0)
2451     {
2452     if (number >= md->offset_end) md->offset_overflow = TRUE; else
2453     {
2454     md->offset_vector[number+1] = eptr - md->start_subject;
2455     if (offset_top <= number) offset_top = number + 2;
2456     }
2457     }
2458    
2459     /* For a non-repeating ket, just advance to the next node and continue at
2460     this level. */
2461    
2462     if (*ecode == OP_KET)
2463     {
2464     ecode += 3;
2465     break;
2466     }
2467    
2468     /* The repeating kets try the rest of the pattern or restart from the
2469     preceding bracket, in the appropriate order. */
2470    
2471     if (*ecode == OP_KETRMIN)
2472     {
2473     if (match(eptr, ecode+3, offset_top, md) ||
2474     match(eptr, prev, offset_top, md)) return TRUE;
2475     }
2476     else /* OP_KETRMAX */
2477     {
2478     if (match(eptr, prev, offset_top, md) ||
2479     match(eptr, ecode+3, offset_top, md)) return TRUE;
2480     }
2481     }
2482     return FALSE;
2483    
2484     /* Start of subject unless notbol, or after internal newline if multiline */
2485    
2486     case OP_CIRC:
2487     if (md->notbol && eptr == md->start_subject) return FALSE;
2488     if (md->multiline)
2489     {
2490     if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;
2491     ecode++;
2492     break;
2493     }
2494     /* ... else fall through */
2495    
2496     /* Start of subject assertion */
2497    
2498     case OP_SOD:
2499     if (eptr != md->start_subject) return FALSE;
2500     ecode++;
2501     break;
2502    
2503     /* Assert before internal newline if multiline, or before
2504     a terminating newline unless endonly is set, else end of subject unless
2505     noteol is set. */
2506    
2507     case OP_DOLL:
2508     if (md->noteol && eptr >= md->end_subject) return FALSE;
2509     if (md->multiline)
2510     {
2511     if (eptr < md->end_subject && *eptr != '\n') return FALSE;
2512     ecode++;
2513     break;
2514     }
2515     else if (!md->endonly)
2516     {
2517     if (eptr < md->end_subject - 1 ||
2518     (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
2519     ecode++;
2520     break;
2521     }
2522     /* ... else fall through */
2523    
2524     /* End of subject assertion */
2525    
2526     case OP_EOD:
2527     if (eptr < md->end_subject) return FALSE;
2528     ecode++;
2529     break;
2530    
2531     /* Word boundary assertions */
2532    
2533     case OP_NOT_WORD_BOUNDARY:
2534     case OP_WORD_BOUNDARY:
2535     {
2536     BOOL prev_is_word = (eptr != md->start_subject) &&
2537     ((pcre_ctypes[eptr[-1]] & ctype_word) != 0);
2538     BOOL cur_is_word = (eptr < md->end_subject) &&
2539     ((pcre_ctypes[*eptr] & ctype_word) != 0);
2540     if ((*ecode++ == OP_WORD_BOUNDARY)?
2541     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2542     return FALSE;
2543     }
2544     break;
2545    
2546     /* Match a single character type; inline for speed */
2547    
2548     case OP_ANY:
2549     if (!md->dotall && eptr < md->end_subject && *eptr == '\n') return FALSE;
2550     if (eptr++ >= md->end_subject) return FALSE;
2551     ecode++;
2552     break;
2553    
2554     case OP_NOT_DIGIT:
2555     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) != 0)
2556     return FALSE;
2557     ecode++;
2558     break;
2559    
2560     case OP_DIGIT:
2561     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) == 0)
2562     return FALSE;
2563     ecode++;
2564     break;
2565    
2566     case OP_NOT_WHITESPACE:
2567     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) != 0)
2568     return FALSE;
2569     ecode++;
2570     break;
2571    
2572     case OP_WHITESPACE:
2573     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) == 0)
2574     return FALSE;
2575     ecode++;
2576     break;
2577    
2578     case OP_NOT_WORDCHAR:
2579     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) != 0)
2580     return FALSE;
2581     ecode++;
2582     break;
2583    
2584     case OP_WORDCHAR:
2585     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) == 0)
2586     return FALSE;
2587     ecode++;
2588     break;
2589    
2590     /* Match a back reference, possibly repeatedly. Look past the end of the
2591     item to see if there is repeat information following. The code is similar
2592     to that for character classes, but repeated for efficiency. Then obey
2593     similar code to character type repeats - written out again for speed.
2594     However, if the referenced string is the empty string, always treat
2595     it as matched, any number of times (otherwise there could be infinite
2596     loops). */
2597    
2598     case OP_REF:
2599     {
2600     int length;
2601     int number = ecode[1] << 1; /* Doubled reference number */
2602     ecode += 2; /* Advance past the item */
2603    
2604     if (number >= offset_top || md->offset_vector[number] < 0)
2605     {
2606     md->errorcode = PCRE_ERROR_BADREF;
2607     return FALSE;
2608     }
2609    
2610     length = md->offset_vector[number+1] - md->offset_vector[number];
2611    
2612     switch (*ecode)
2613     {
2614     case OP_CRSTAR:
2615     case OP_CRMINSTAR:
2616     case OP_CRPLUS:
2617     case OP_CRMINPLUS:
2618     case OP_CRQUERY:
2619     case OP_CRMINQUERY:
2620     c = *ecode++ - OP_CRSTAR;
2621     minimize = (c & 1) != 0;
2622     min = rep_min[c]; /* Pick up values from tables; */
2623     max = rep_max[c]; /* zero for max => infinity */
2624     if (max == 0) max = INT_MAX;
2625     break;
2626    
2627     case OP_CRRANGE:
2628     case OP_CRMINRANGE:
2629     minimize = (*ecode == OP_CRMINRANGE);
2630     min = (ecode[1] << 8) + ecode[2];
2631     max = (ecode[3] << 8) + ecode[4];
2632     if (max == 0) max = INT_MAX;
2633     ecode += 5;
2634     break;
2635    
2636     default: /* No repeat follows */
2637     if (!match_ref(number, eptr, length, md)) return FALSE;
2638     eptr += length;
2639     continue; /* With the main loop */
2640     }
2641    
2642     /* If the length of the reference is zero, just continue with the
2643     main loop. */
2644    
2645     if (length == 0) continue;
2646    
2647     /* First, ensure the minimum number of matches are present. We get back
2648     the length of the reference string explicitly rather than passing the
2649     address of eptr, so that eptr can be a register variable. */
2650    
2651     for (i = 1; i <= min; i++)
2652     {
2653     if (!match_ref(number, eptr, length, md)) return FALSE;
2654     eptr += length;
2655     }
2656    
2657     /* If min = max, continue at the same level without recursion.
2658     They are not both allowed to be zero. */
2659    
2660     if (min == max) continue;
2661    
2662     /* If minimizing, keep trying and advancing the pointer */
2663    
2664     if (minimize)
2665     {
2666     for (i = min;; i++)
2667     {
2668     if (match(eptr, ecode, offset_top, md)) return TRUE;
2669     if (i >= max || !match_ref(number, eptr, length, md))
2670     return FALSE;
2671     eptr += length;
2672     }
2673     /* Control never gets here */
2674     }
2675    
2676     /* If maximizing, find the longest string and work backwards */
2677    
2678     else
2679     {
2680 nigel 7 const uschar *pp = eptr;
2681 nigel 3 for (i = min; i < max; i++)
2682     {
2683     if (!match_ref(number, eptr, length, md)) break;
2684     eptr += length;
2685     }
2686     while (eptr >= pp)
2687     {
2688     if (match(eptr, ecode, offset_top, md)) return TRUE;
2689     eptr -= length;
2690     }
2691     return FALSE;
2692     }
2693     }
2694     /* Control never gets here */
2695    
2696     /* Match a character class, possibly repeatedly. Look past the end of the
2697     item to see if there is repeat information following. Then obey similar
2698     code to character type repeats - written out again for speed. If caseless
2699     matching was set at runtime but not at compile time, we have to check both
2700     versions of a character. */
2701    
2702     case OP_CLASS:
2703     {
2704 nigel 7 const uschar *data = ecode + 1; /* Save for matching */
2705     ecode += 33; /* Advance past the item */
2706 nigel 3
2707     switch (*ecode)
2708     {
2709     case OP_CRSTAR:
2710     case OP_CRMINSTAR:
2711     case OP_CRPLUS:
2712     case OP_CRMINPLUS:
2713     case OP_CRQUERY:
2714     case OP_CRMINQUERY:
2715     c = *ecode++ - OP_CRSTAR;
2716     minimize = (c & 1) != 0;
2717     min = rep_min[c]; /* Pick up values from tables; */
2718     max = rep_max[c]; /* zero for max => infinity */
2719     if (max == 0) max = INT_MAX;
2720     break;
2721    
2722     case OP_CRRANGE:
2723     case OP_CRMINRANGE:
2724     minimize = (*ecode == OP_CRMINRANGE);
2725     min = (ecode[1] << 8) + ecode[2];
2726     max = (ecode[3] << 8) + ecode[4];
2727     if (max == 0) max = INT_MAX;
2728     ecode += 5;
2729     break;
2730    
2731     default: /* No repeat follows */
2732     if (eptr >= md->end_subject) return FALSE;
2733     c = *eptr++;
2734     if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
2735     if (md->runtime_caseless)
2736     {
2737     c = pcre_fcc[c];
2738     if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
2739     }
2740     return FALSE;
2741     }
2742    
2743     /* First, ensure the minimum number of matches are present. */
2744    
2745     for (i = 1; i <= min; i++)
2746     {
2747     if (eptr >= md->end_subject) return FALSE;
2748     c = *eptr++;
2749     if ((data[c/8] & (1 << (c&7))) != 0) continue;
2750     if (md->runtime_caseless)
2751     {
2752     c = pcre_fcc[c];
2753     if ((data[c/8] & (1 << (c&7))) != 0) continue;
2754     }
2755     return FALSE;
2756     }
2757    
2758     /* If max == min we can continue with the main loop without the
2759     need to recurse. */
2760    
2761     if (min == max) continue;
2762    
2763     /* If minimizing, keep testing the rest of the expression and advancing
2764     the pointer while it matches the class. */
2765    
2766     if (minimize)
2767     {
2768     for (i = min;; i++)
2769     {
2770     if (match(eptr, ecode, offset_top, md)) return TRUE;
2771     if (i >= max || eptr >= md->end_subject) return FALSE;
2772     c = *eptr++;
2773     if ((data[c/8] & (1 << (c&7))) != 0) continue;
2774     if (md->runtime_caseless)
2775     {
2776     c = pcre_fcc[c];
2777     if ((data[c/8] & (1 << (c&7))) != 0) continue;
2778     }
2779     return FALSE;
2780     }
2781     /* Control never gets here */
2782     }
2783    
2784     /* If maximizing, find the longest possible run, then work backwards. */
2785    
2786     else
2787     {
2788 nigel 7 const uschar *pp = eptr;
2789 nigel 3 for (i = min; i < max; eptr++, i++)
2790     {
2791     if (eptr >= md->end_subject) break;
2792     c = *eptr;
2793     if ((data[c/8] & (1 << (c&7))) != 0) continue;
2794     if (md->runtime_caseless)
2795     {
2796     c = pcre_fcc[c];
2797     if ((data[c/8] & (1 << (c&7))) != 0) continue;
2798     }
2799     break;
2800     }
2801    
2802     while (eptr >= pp)
2803     if (match(eptr--, ecode, offset_top, md)) return TRUE;
2804     return FALSE;
2805     }
2806     }
2807     /* Control never gets here */
2808    
2809     /* Match a run of characters */
2810    
2811     case OP_CHARS:
2812     {
2813     register int length = ecode[1];
2814     ecode += 2;
2815    
2816     #ifdef DEBUG
2817     if (eptr >= md->end_subject)
2818     printf("matching subject <null> against pattern ");
2819     else
2820     {
2821     printf("matching subject ");
2822     pchars(eptr, length, TRUE, md);
2823     printf(" against pattern ");
2824     }
2825     pchars(ecode, length, FALSE, md);
2826     printf("\n");
2827     #endif
2828    
2829     if (length > md->end_subject - eptr) return FALSE;
2830     if (md->caseless)
2831     {
2832     while (length-- > 0) if (pcre_lcc[*ecode++] != pcre_lcc[*eptr++]) return FALSE;
2833     }
2834     else
2835     {
2836     while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
2837     }
2838     }
2839     break;
2840    
2841     /* Match a single character repeatedly; different opcodes share code. */
2842    
2843     case OP_EXACT:
2844     min = max = (ecode[1] << 8) + ecode[2];
2845     ecode += 3;
2846     goto REPEATCHAR;
2847    
2848     case OP_UPTO:
2849     case OP_MINUPTO:
2850     min = 0;
2851     max = (ecode[1] << 8) + ecode[2];
2852     minimize = *ecode == OP_MINUPTO;
2853     ecode += 3;
2854     goto REPEATCHAR;
2855    
2856     case OP_STAR:
2857     case OP_MINSTAR:
2858     case OP_PLUS:
2859     case OP_MINPLUS:
2860     case OP_QUERY:
2861     case OP_MINQUERY:
2862     c = *ecode++ - OP_STAR;
2863     minimize = (c & 1) != 0;
2864     min = rep_min[c]; /* Pick up values from tables; */
2865     max = rep_max[c]; /* zero for max => infinity */
2866     if (max == 0) max = INT_MAX;
2867    
2868     /* Common code for all repeated single-character matches. We can give
2869     up quickly if there are fewer than the minimum number of characters left in
2870     the subject. */
2871    
2872     REPEATCHAR:
2873     if (min > md->end_subject - eptr) return FALSE;
2874     c = *ecode++;
2875    
2876     /* The code is duplicated for the caseless and caseful cases, for speed,
2877     since matching characters is likely to be quite common. First, ensure the
2878     minimum number of matches are present. If min = max, continue at the same
2879     level without recursing. Otherwise, if minimizing, keep trying the rest of
2880     the expression and advancing one matching character if failing, up to the
2881     maximum. Alternatively, if maximizing, find the maximum number of
2882     characters and work backwards. */
2883    
2884     #ifdef DEBUG
2885     printf("matching %c{%d,%d} against subject %.*s\n", c, min, max,
2886     max, eptr);
2887     #endif
2888    
2889     if (md->caseless)
2890     {
2891     c = pcre_lcc[c];
2892     for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) return FALSE;
2893     if (min == max) continue;
2894     if (minimize)
2895     {
2896     for (i = min;; i++)
2897     {
2898     if (match(eptr, ecode, offset_top, md)) return TRUE;
2899     if (i >= max || eptr >= md->end_subject || c != pcre_lcc[*eptr++])
2900     return FALSE;
2901     }
2902     /* Control never gets here */
2903     }
2904     else
2905     {
2906 nigel 7 const uschar *pp = eptr;
2907 nigel 3 for (i = min; i < max; i++)
2908     {
2909     if (eptr >= md->end_subject || c != pcre_lcc[*eptr]) break;
2910     eptr++;
2911     }
2912     while (eptr >= pp)
2913     if (match(eptr--, ecode, offset_top, md)) return TRUE;
2914     return FALSE;
2915     }
2916     /* Control never gets here */
2917     }
2918    
2919     /* Caseful comparisons */
2920    
2921     else
2922     {
2923     for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
2924     if (min == max) continue;
2925     if (minimize)
2926     {
2927     for (i = min;; i++)
2928     {
2929     if (match(eptr, ecode, offset_top, md)) return TRUE;
2930     if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
2931     }
2932     /* Control never gets here */
2933     }
2934     else
2935     {
2936 nigel 7 const uschar *pp = eptr;
2937 nigel 3 for (i = min; i < max; i++)
2938     {
2939     if (eptr >= md->end_subject || c != *eptr) break;
2940     eptr++;
2941     }
2942     while (eptr >= pp)
2943     if (match(eptr--, ecode, offset_top, md)) return TRUE;
2944     return FALSE;
2945     }
2946     }
2947     /* Control never gets here */
2948    
2949     /* Match a negated single character */
2950    
2951     case OP_NOT:
2952     if (eptr > md->end_subject) return FALSE;
2953     ecode++;
2954     if (md->caseless)
2955     {
2956     if (pcre_lcc[*ecode++] == pcre_lcc[*eptr++]) return FALSE;
2957     }
2958     else
2959     {
2960     if (*ecode++ == *eptr++) return FALSE;
2961     }
2962     break;
2963    
2964     /* Match a negated single character repeatedly. This is almost a repeat of
2965     the code for a repeated single character, but I haven't found a nice way of
2966     commoning these up that doesn't require a test of the positive/negative
2967     option for each character match. Maybe that wouldn't add very much to the
2968     time taken, but character matching *is* what this is all about... */
2969    
2970     case OP_NOTEXACT:
2971     min = max = (ecode[1] << 8) + ecode[2];
2972     ecode += 3;
2973     goto REPEATNOTCHAR;
2974    
2975     case OP_NOTUPTO:
2976     case OP_NOTMINUPTO:
2977     min = 0;
2978     max = (ecode[1] << 8) + ecode[2];
2979     minimize = *ecode == OP_NOTMINUPTO;
2980     ecode += 3;
2981     goto REPEATNOTCHAR;
2982    
2983     case OP_NOTSTAR:
2984     case OP_NOTMINSTAR:
2985     case OP_NOTPLUS:
2986     case OP_NOTMINPLUS:
2987     case OP_NOTQUERY:
2988     case OP_NOTMINQUERY:
2989     c = *ecode++ - OP_NOTSTAR;
2990     minimize = (c & 1) != 0;
2991     min = rep_min[c]; /* Pick up values from tables; */
2992     max = rep_max[c]; /* zero for max => infinity */
2993     if (max == 0) max = INT_MAX;
2994    
2995     /* Common code for all repeated single-character matches. We can give
2996     up quickly if there are fewer than the minimum number of characters left in
2997     the subject. */
2998    
2999     REPEATNOTCHAR:
3000     if (min > md->end_subject - eptr) return FALSE;
3001     c = *ecode++;
3002    
3003     /* The code is duplicated for the caseless and caseful cases, for speed,
3004     since matching characters is likely to be quite common. First, ensure the
3005     minimum number of matches are present. If min = max, continue at the same
3006     level without recursing. Otherwise, if minimizing, keep trying the rest of
3007     the expression and advancing one matching character if failing, up to the
3008     maximum. Alternatively, if maximizing, find the maximum number of
3009     characters and work backwards. */
3010    
3011     #ifdef DEBUG
3012     printf("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
3013     max, eptr);
3014     #endif
3015    
3016     if (md->caseless)
3017     {
3018     c = pcre_lcc[c];
3019     for (i = 1; i <= min; i++) if (c == pcre_lcc[*eptr++]) return FALSE;
3020     if (min == max) continue;
3021     if (minimize)
3022     {
3023     for (i = min;; i++)
3024     {
3025     if (match(eptr, ecode, offset_top, md)) return TRUE;
3026     if (i >= max || eptr >= md->end_subject || c == pcre_lcc[*eptr++])
3027     return FALSE;
3028     }
3029     /* Control never gets here */
3030     }
3031     else
3032     {
3033 nigel 7 const uschar *pp = eptr;
3034 nigel 3 for (i = min; i < max; i++)
3035     {
3036     if (eptr >= md->end_subject || c == pcre_lcc[*eptr]) break;
3037     eptr++;
3038     }
3039     while (eptr >= pp)
3040     if (match(eptr--, ecode, offset_top, md)) return TRUE;
3041     return FALSE;
3042     }
3043     /* Control never gets here */
3044     }
3045    
3046     /* Caseful comparisons */
3047    
3048     else
3049     {
3050     for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
3051     if (min == max) continue;
3052     if (minimize)
3053     {
3054     for (i = min;; i++)
3055     {
3056     if (match(eptr, ecode, offset_top, md)) return TRUE;
3057     if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
3058     }
3059     /* Control never gets here */
3060     }
3061     else
3062     {
3063 nigel 7 const uschar *pp = eptr;
3064 nigel 3 for (i = min; i < max; i++)
3065     {
3066     if (eptr >= md->end_subject || c == *eptr) break;
3067     eptr++;
3068     }
3069     while (eptr >= pp)
3070     if (match(eptr--, ecode, offset_top, md)) return TRUE;
3071     return FALSE;
3072     }
3073     }
3074     /* Control never gets here */
3075    
3076     /* Match a single character type repeatedly; several different opcodes
3077     share code. This is very similar to the code for single characters, but we
3078     repeat it in the interests of efficiency. */
3079    
3080     case OP_TYPEEXACT:
3081     min = max = (ecode[1] << 8) + ecode[2];
3082     minimize = TRUE;
3083     ecode += 3;
3084     goto REPEATTYPE;
3085    
3086     case OP_TYPEUPTO:
3087     case OP_TYPEMINUPTO:
3088     min = 0;
3089     max = (ecode[1] << 8) + ecode[2];
3090     minimize = *ecode == OP_TYPEMINUPTO;
3091     ecode += 3;
3092     goto REPEATTYPE;
3093    
3094     case OP_TYPESTAR:
3095     case OP_TYPEMINSTAR:
3096     case OP_TYPEPLUS:
3097     case OP_TYPEMINPLUS:
3098     case OP_TYPEQUERY:
3099     case OP_TYPEMINQUERY:
3100     c = *ecode++ - OP_TYPESTAR;
3101     minimize = (c & 1) != 0;
3102     min = rep_min[c]; /* Pick up values from tables; */
3103     max = rep_max[c]; /* zero for max => infinity */
3104     if (max == 0) max = INT_MAX;
3105    
3106     /* Common code for all repeated single character type matches */
3107    
3108     REPEATTYPE:
3109     ctype = *ecode++; /* Code for the character type */
3110    
3111     /* First, ensure the minimum number of matches are present. Use inline
3112     code for maximizing the speed, and do the type test once at the start
3113     (i.e. keep it out of the loop). Also test that there are at least the
3114     minimum number of characters before we start. */
3115    
3116     if (min > md->end_subject - eptr) return FALSE;
3117     if (min > 0) switch(ctype)
3118     {
3119     case OP_ANY:
3120     if (!md->dotall)
3121     { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }
3122     else eptr += min;
3123     break;
3124    
3125     case OP_NOT_DIGIT:
3126     for (i = 1; i <= min; i++)
3127     if ((pcre_ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
3128     break;
3129    
3130     case OP_DIGIT:
3131     for (i = 1; i <= min; i++)
3132     if ((pcre_ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
3133     break;
3134    
3135     case OP_NOT_WHITESPACE:
3136     for (i = 1; i <= min; i++)
3137     if ((pcre_ctypes[*eptr++] & ctype_space) != 0) return FALSE;
3138     break;
3139    
3140     case OP_WHITESPACE:
3141     for (i = 1; i <= min; i++)
3142     if ((pcre_ctypes[*eptr++] & ctype_space) == 0) return FALSE;
3143     break;
3144    
3145     case OP_NOT_WORDCHAR:
3146     for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) != 0)
3147     return FALSE;
3148     break;
3149    
3150     case OP_WORDCHAR:
3151     for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) == 0)
3152     return FALSE;
3153     break;
3154     }
3155    
3156     /* If min = max, continue at the same level without recursing */
3157    
3158     if (min == max) continue;
3159    
3160     /* If minimizing, we have to test the rest of the pattern before each
3161     subsequent match, so inlining isn't much help; just use the function. */
3162    
3163     if (minimize)
3164     {
3165     for (i = min;; i++)
3166     {
3167     if (match(eptr, ecode, offset_top, md)) return TRUE;
3168     if (i >= max || eptr >= md->end_subject ||
3169     !match_type(ctype, *eptr++, md->dotall))
3170     return FALSE;
3171     }
3172     /* Control never gets here */
3173     }
3174    
3175     /* If maximizing it is worth using inline code for speed, doing the type
3176     test once at the start (i.e. keep it out of the loop). */
3177    
3178     else
3179     {
3180 nigel 7 const uschar *pp = eptr;
3181 nigel 3 switch(ctype)
3182     {
3183     case OP_ANY:
3184     if (!md->dotall)
3185     {
3186     for (i = min; i < max; i++)
3187     {
3188     if (eptr >= md->end_subject || *eptr == '\n') break;
3189     eptr++;
3190     }
3191     }
3192     else
3193     {
3194     c = max - min;
3195     if (c > md->end_subject - eptr) c = md->end_subject - eptr;
3196     eptr += c;
3197     }
3198     break;
3199    
3200     case OP_NOT_DIGIT:
3201     for (i = min; i < max; i++)
3202     {
3203     if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) != 0)
3204     break;
3205     eptr++;
3206     }
3207     break;
3208    
3209     case OP_DIGIT:
3210     for (i = min; i < max; i++)
3211     {
3212     if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) == 0)
3213     break;
3214     eptr++;
3215     }
3216     break;
3217    
3218     case OP_NOT_WHITESPACE:
3219     for (i = min; i < max; i++)
3220     {
3221     if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) != 0)
3222     break;
3223     eptr++;
3224     }
3225     break;
3226    
3227     case OP_WHITESPACE:
3228     for (i = min; i < max; i++)
3229     {
3230     if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) == 0)
3231     break;
3232     eptr++;
3233     }
3234     break;
3235    
3236     case OP_NOT_WORDCHAR:
3237     for (i = min; i < max; i++)
3238     {
3239     if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) != 0)
3240     break;
3241     eptr++;
3242     }
3243     break;
3244    
3245     case OP_WORDCHAR:
3246     for (i = min; i < max; i++)
3247     {
3248     if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) == 0)
3249     break;
3250     eptr++;
3251     }
3252     break;
3253     }
3254    
3255     while (eptr >= pp)
3256     if (match(eptr--, ecode, offset_top, md)) return TRUE;
3257     return FALSE;
3258     }
3259     /* Control never gets here */
3260    
3261     /* There's been some horrible disaster. */
3262    
3263     default:
3264     #ifdef DEBUG
3265     printf("Unknown opcode %d\n", *ecode);
3266     #endif
3267     md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
3268     return FALSE;
3269     }
3270    
3271     /* Do not stick any code in here without much thought; it is assumed
3272     that "continue" in the code above comes out to here to repeat the main
3273     loop. */
3274    
3275     } /* End of main loop */
3276     /* Control never reaches here */
3277     }
3278    
3279    
3280    
3281     /*************************************************
3282     * Execute a Regular Expression *
3283     *************************************************/
3284    
3285     /* This function applies a compiled re to a subject string and picks out
3286     portions of the string if it matches. Two elements in the vector are set for
3287     each substring: the offsets to the start and end of the substring.
3288    
3289     Arguments:
3290     external_re points to the compiled expression
3291     external_extra points to "hints" from pcre_study() or is NULL
3292     subject points to the subject string
3293     length length of subject string (may contain binary zeros)
3294     options option bits
3295     offsets points to a vector of ints to be filled in with offsets
3296     offsetcount the number of elements in the vector
3297    
3298     Returns: > 0 => success; value is the number of elements filled in
3299     = 0 => success, but offsets is not big enough
3300     -1 => failed to match
3301     < -1 => some kind of unexpected problem
3302     */
3303    
3304     int
3305     pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
3306     const char *subject, int length, int options, int *offsets, int offsetcount)
3307     {
3308     int resetcount;
3309     int ocount = offsetcount;
3310     int first_char = -1;
3311     match_data match_block;
3312 nigel 7 const uschar *start_bits = NULL;
3313     const uschar *start_match = (uschar *)subject;
3314     const uschar *end_subject;
3315     const real_pcre *re = (const real_pcre *)external_re;
3316     const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
3317 nigel 3 BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
3318     BOOL startline = (re->options & PCRE_STARTLINE) != 0;
3319    
3320     if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3321    
3322     if (re == NULL || subject == NULL ||
3323     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3324     if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
3325    
3326 nigel 7 match_block.start_subject = (const uschar *)subject;
3327 nigel 3 match_block.end_subject = match_block.start_subject + length;
3328     end_subject = match_block.end_subject;
3329    
3330     match_block.caseless = ((re->options | options) & PCRE_CASELESS) != 0;
3331     match_block.runtime_caseless = match_block.caseless &&
3332     (re->options & PCRE_CASELESS) == 0;
3333    
3334     match_block.multiline = ((re->options | options) & PCRE_MULTILINE) != 0;
3335     match_block.dotall = ((re->options | options) & PCRE_DOTALL) != 0;
3336     match_block.endonly = ((re->options | options) & PCRE_DOLLAR_ENDONLY) != 0;
3337    
3338     match_block.notbol = (options & PCRE_NOTBOL) != 0;
3339     match_block.noteol = (options & PCRE_NOTEOL) != 0;
3340    
3341     match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
3342    
3343     /* If the expression has got more back references than the offsets supplied can
3344     hold, we get a temporary bit of working store to use during the matching.
3345     Otherwise, we can use the vector supplied, rounding down the size of it to a
3346     multiple of 2. */
3347    
3348     ocount &= (-2);
3349     if (re->top_backref > 0 && re->top_backref + 1 >= ocount/2)
3350     {
3351     ocount = re->top_backref * 2 + 2;
3352     match_block.offset_vector = (pcre_malloc)(ocount * sizeof(int));
3353     if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
3354     #ifdef DEBUG
3355     printf("Got memory to hold back references\n");
3356     #endif
3357     }
3358     else match_block.offset_vector = offsets;
3359    
3360     match_block.offset_end = ocount;
3361     match_block.offset_overflow = FALSE;
3362    
3363     /* Compute the minimum number of offsets that we need to reset each time. Doing
3364     this makes a huge difference to execution time when there aren't many brackets
3365     in the pattern. */
3366    
3367     resetcount = 2 + re->top_bracket * 2;
3368     if (resetcount > offsetcount) resetcount = ocount;
3369    
3370     /* If MULTILINE is set at exec time but was not set at compile time, and the
3371     anchored flag is set, we must re-check because a setting provoked by ^ in the
3372     pattern is not right in multi-line mode. Calling is_anchored() again here does
3373     the right check, because multiline is now set. If it now yields FALSE, the
3374     expression must have had ^ starting some of its branches. Check to see if
3375     that is true for *all* branches, and if so, set the startline flag. */
3376    
3377     if (match_block. multiline && anchored && (re->options & PCRE_MULTILINE) == 0 &&
3378     !is_anchored(re->code, match_block.multiline))
3379     {
3380     anchored = FALSE;
3381     if (is_startline(re->code)) startline = TRUE;
3382     }
3383    
3384     /* Set up the first character to match, if available. The first_char value is
3385     never set for an anchored regular expression, but the anchoring may be forced
3386     at run time, so we have to test for anchoring. The first char may be unset for
3387     an unanchored pattern, of course. If there's no first char and the pattern was
3388     studied, the may be a bitmap of possible first characters. However, we can
3389     use this only if the caseless state of the studying was correct. */
3390    
3391     if (!anchored)
3392     {
3393     if ((re->options & PCRE_FIRSTSET) != 0)
3394     {
3395     first_char = re->first_char;
3396     if (match_block.caseless) first_char = pcre_lcc[first_char];
3397     }
3398     else
3399     if (!startline && extra != NULL &&
3400     (extra->options & PCRE_STUDY_MAPPED) != 0 &&
3401     ((extra->options & PCRE_STUDY_CASELESS) != 0) == match_block.caseless)
3402     start_bits = extra->start_bits;
3403     }
3404    
3405     /* Loop for unanchored matches; for anchored regexps the loop runs just once. */
3406    
3407     do
3408     {
3409     register int *iptr = match_block.offset_vector;
3410     register int *iend = iptr + resetcount;
3411    
3412     /* Reset the maximum number of extractions we might see. */
3413    
3414     while (iptr < iend) *iptr++ = -1;
3415    
3416     /* Advance to a unique first char if possible */
3417    
3418     if (first_char >= 0)
3419     {
3420     if (match_block.caseless)
3421     while (start_match < end_subject && pcre_lcc[*start_match] != first_char)
3422     start_match++;
3423     else
3424     while (start_match < end_subject && *start_match != first_char)
3425     start_match++;
3426     }
3427    
3428     /* Or to just after \n for a multiline match if possible */
3429    
3430     else if (startline)
3431     {
3432     if (start_match > match_block.start_subject)
3433     {
3434     while (start_match < end_subject && start_match[-1] != '\n')
3435     start_match++;
3436     }
3437     }
3438    
3439     /* Or to a non-unique first char */
3440    
3441     else if (start_bits != NULL)
3442     {
3443     while (start_match < end_subject)
3444     {
3445     register int c = *start_match;
3446     if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
3447     }
3448     }
3449    
3450     #ifdef DEBUG
3451     printf(">>>> Match against: ");
3452     pchars(start_match, end_subject - start_match, TRUE, &match_block);
3453     printf("\n");
3454     #endif
3455    
3456     /* When a match occurs, substrings will be set for all internal extractions;
3457     we just need to set up the whole thing as substring 0 before returning. If
3458     there were too many extractions, set the return code to zero. In the case
3459     where we had to get some local store to hold offsets for backreferences, copy
3460     those back references that we can. In this case there need not be overflow
3461     if certain parts of the pattern were not used.
3462    
3463     Before starting the match, we have to set up a longjmp() target to enable
3464     the "cut" operation to fail a match completely without backtracking. */
3465    
3466     if (setjmp(match_block.fail_env) == 0 &&
3467     match(start_match, re->code, 2, &match_block))
3468     {
3469     int rc;
3470    
3471     if (ocount != offsetcount)
3472     {
3473     if (offsetcount >= 4)
3474     {
3475     memcpy(offsets + 2, match_block.offset_vector + 2,
3476     (offsetcount - 2) * sizeof(int));
3477     #ifdef DEBUG
3478     printf("Copied offsets; freeing temporary memory\n");
3479     #endif
3480     }
3481     if (match_block.end_offset_top > offsetcount)
3482     match_block.offset_overflow = TRUE;
3483    
3484     #ifdef DEBUG
3485     printf("Freeing temporary memory\n");
3486     #endif
3487    
3488     (pcre_free)(match_block.offset_vector);
3489     }
3490    
3491     rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
3492    
3493     if (match_block.offset_end < 2) rc = 0; else
3494     {
3495     offsets[0] = start_match - match_block.start_subject;
3496     offsets[1] = match_block.end_match_ptr - match_block.start_subject;
3497     }
3498    
3499     #ifdef DEBUG
3500     printf(">>>> returning %d\n", rc);
3501     #endif
3502     return rc;
3503     }
3504     }
3505     while (!anchored &&
3506     match_block.errorcode == PCRE_ERROR_NOMATCH &&
3507     start_match++ < end_subject);
3508    
3509     #ifdef DEBUG
3510     printf(">>>> returning %d\n", match_block.errorcode);
3511     #endif
3512    
3513     return match_block.errorcode;
3514     }
3515    
3516     /* End of pcre.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12