/[pcre]/code/tags/pcre-2.03/pcre.c
ViewVC logotype

Contents of /code/tags/pcre-2.03/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 9 - (hide annotations) (download)
Sat Feb 24 21:38:13 2007 UTC (7 years, 7 months ago) by nigel
Original Path: code/trunk/pcre.c
File MIME type: text/plain
File size: 101143 byte(s)
Load pcre-1.03 into code/trunk.

1 nigel 3 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /*
6     This is a library of functions to support regular expressions whose syntax
7     and semantics are as close as possible to those of the Perl 5 language. See
8     the file Tech.Notes for some information on the internals.
9    
10     Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12     Copyright (c) 1997 University of Cambridge
13    
14     -----------------------------------------------------------------------------
15     Permission is granted to anyone to use this software for any purpose on any
16     computer system, and to redistribute it freely, subject to the following
17     restrictions:
18    
19     1. This software is distributed in the hope that it will be useful,
20     but WITHOUT ANY WARRANTY; without even the implied warranty of
21     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22    
23     2. The origin of this software must not be misrepresented, either by
24     explicit claim or by omission.
25    
26     3. Altered versions must be plainly marked as such, and must not be
27     misrepresented as being the original software.
28     -----------------------------------------------------------------------------
29     */
30    
31    
32     /* Define DEBUG to get debugging output on stdout. */
33    
34     /* #define DEBUG */
35    
36 nigel 9 /* Use a macro for debugging printing, 'cause that eliminates the the use
37     of #ifdef inline, and there are *still* stupid compilers about that don't like
38     indented pre-processor statements. I suppose it's only been 10 years... */
39 nigel 3
40 nigel 9 #ifdef DEBUG
41     #define DPRINTF(p) printf p
42     #else
43     #define DPRINTF(p) /*nothing*/
44     #endif
45    
46 nigel 3 /* Include the internals header, which itself includes Standard C headers plus
47     the external pcre header. */
48    
49     #include "internal.h"
50    
51    
52     /* Min and max values for the common repeats; for the maxima, 0 => infinity */
53    
54     static char rep_min[] = { 0, 0, 1, 1, 0, 0 };
55     static char rep_max[] = { 0, 0, 0, 0, 1, 1 };
56    
57     /* Text forms of OP_ values and things, for debugging */
58    
59     #ifdef DEBUG
60 nigel 7 static const char *OP_names[] = {
61     "End", "\\A", "\\B", "\\b", "\\D", "\\d",
62 nigel 3 "\\S", "\\s", "\\W", "\\w", "Cut", "\\Z", "^", "$", "Any", "chars",
63     "not",
64     "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
65     "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
66     "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
67     "*", "*?", "+", "+?", "?", "??", "{", "{",
68     "class", "Ref",
69     "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",
70     "Brazero", "Braminzero", "Bra"
71     };
72     #endif
73    
74     /* Table for handling escaped characters in the range '0'-'z'. Positive returns
75     are simple data values; negative values are for special things like \d and so
76     on. Zero means further processing is needed (for things like \x), or the escape
77     is invalid. */
78    
79     static short int escapes[] = {
80     0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
81     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
82     '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
83     0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
84     0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
85     0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
86     '`', 7, -ESC_b, 0, -ESC_d, 27, '\f', 0, /* ` - g */
87     0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
88     0, 0, '\r', -ESC_s, '\t', 0, 0, -ESC_w, /* p - w */
89     0, 0, 0 /* x - z */
90     };
91    
92     /* Definition to allow mutual recursion */
93    
94 nigel 7 static BOOL compile_regex(int, int *, uschar **, const uschar **, const char **);
95 nigel 3
96     /* Structure for passing "static" information around between the functions
97     doing the matching, so that they are thread-safe. */
98    
99     typedef struct match_data {
100     int errorcode; /* As it says */
101     int *offset_vector; /* Offset vector */
102     int offset_end; /* One past the end */
103     BOOL offset_overflow; /* Set if too many extractions */
104     BOOL caseless; /* Case-independent flag */
105     BOOL runtime_caseless; /* Caseless forced at run time */
106     BOOL multiline; /* Multiline flag */
107     BOOL notbol; /* NOTBOL flag */
108     BOOL noteol; /* NOTEOL flag */
109     BOOL dotall; /* Dot matches any char */
110     BOOL endonly; /* Dollar not before final \n */
111 nigel 7 const uschar *start_subject; /* Start of the subject string */
112     const uschar *end_subject; /* End of the subject string */
113 nigel 3 jmp_buf fail_env; /* Environment for longjump() break out */
114 nigel 7 const uschar *end_match_ptr; /* Subject position at end match */
115 nigel 3 int end_offset_top; /* Highwater mark at end of match */
116     } match_data;
117    
118    
119    
120     /*************************************************
121     * Global variables *
122     *************************************************/
123    
124     /* PCRE is thread-clean and doesn't use any global variables in the normal
125     sense. However, it calls memory allocation and free functions via the two
126     indirections below, which are can be changed by the caller, but are shared
127     between all threads. */
128    
129     void *(*pcre_malloc)(size_t) = malloc;
130     void (*pcre_free)(void *) = free;
131    
132    
133    
134    
135     /*************************************************
136     * Return version string *
137     *************************************************/
138    
139 nigel 7 const char *
140 nigel 3 pcre_version(void)
141     {
142     return PCRE_VERSION;
143     }
144    
145    
146    
147    
148     /*************************************************
149     * Return info about a compiled pattern *
150     *************************************************/
151    
152     /* This function picks potentially useful data out of the private
153     structure.
154    
155     Arguments:
156     external_re points to compiled code
157     optptr where to pass back the options
158     first_char where to pass back the first character,
159     or -1 if multiline and all branches start ^,
160     or -2 otherwise
161    
162     Returns: number of identifying extraction brackets
163     or negative values on error
164     */
165    
166     int
167     pcre_info(const pcre *external_re, int *optptr, int *first_char)
168     {
169 nigel 7 const real_pcre *re = (const real_pcre *)external_re;
170 nigel 3 if (re == NULL) return PCRE_ERROR_NULL;
171     if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
172     if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
173     if (first_char != NULL)
174     *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
175     ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
176     return re->top_bracket;
177     }
178    
179    
180    
181    
182     #ifdef DEBUG
183     /*************************************************
184     * Debugging function to print chars *
185     *************************************************/
186    
187     /* Print a sequence of chars in printable format, stopping at the end of the
188     subject if the requested.
189    
190     Arguments:
191     p points to characters
192     length number to print
193     is_subject TRUE if printing from within md->start_subject
194     md pointer to matching data block, if is_subject is TRUE
195    
196     Returns: nothing
197     */
198    
199 nigel 9 static void
200     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
201 nigel 3 {
202     int c;
203     if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
204     while (length-- > 0)
205     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
206     }
207     #endif
208    
209    
210    
211    
212     /*************************************************
213     * Check subpattern for empty operand *
214     *************************************************/
215    
216     /* This function checks a bracketed subpattern to see if any of the paths
217     through it could match an empty string. This is used to diagnose an error if
218     such a subpattern is followed by a quantifier with an unlimited upper bound.
219    
220     Argument:
221     code points to the opening bracket
222    
223     Returns: TRUE or FALSE
224     */
225    
226     static BOOL
227     could_be_empty(uschar *code)
228     {
229     do {
230     uschar *cc = code + 3;
231    
232     /* Scan along the opcodes for this branch; as soon as we find something
233     that matches a non-empty string, break out and advance to test the next
234     branch. If we get to the end of the branch, return TRUE for the whole
235     sub-expression. */
236    
237     for (;;)
238     {
239     /* Test an embedded subpattern; if it could not be empty, break the
240     loop. Otherwise carry on in the branch. */
241    
242 nigel 5 if ((int)(*cc) >= OP_BRA || (int)(*cc) == OP_ONCE)
243 nigel 3 {
244     if (!could_be_empty(cc)) break;
245     do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
246     cc += 3;
247     }
248    
249     else switch (*cc)
250     {
251     /* Reached end of a branch: the subpattern may match the empty string */
252    
253     case OP_ALT:
254     case OP_KET:
255     case OP_KETRMAX:
256     case OP_KETRMIN:
257     return TRUE;
258    
259     /* Skip over assertive subpatterns */
260    
261     case OP_ASSERT:
262     case OP_ASSERT_NOT:
263     do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
264     cc += 3;
265     break;
266    
267     /* Skip over things that don't match chars */
268    
269     case OP_SOD:
270     case OP_EOD:
271     case OP_CIRC:
272     case OP_DOLL:
273     case OP_BRAZERO:
274     case OP_BRAMINZERO:
275     case OP_NOT_WORD_BOUNDARY:
276     case OP_WORD_BOUNDARY:
277     cc++;
278     break;
279    
280     /* Skip over simple repeats with zero lower bound */
281    
282     case OP_STAR:
283     case OP_MINSTAR:
284     case OP_QUERY:
285     case OP_MINQUERY:
286 nigel 5 case OP_NOTSTAR:
287     case OP_NOTMINSTAR:
288     case OP_NOTQUERY:
289     case OP_NOTMINQUERY:
290 nigel 3 case OP_TYPESTAR:
291     case OP_TYPEMINSTAR:
292     case OP_TYPEQUERY:
293     case OP_TYPEMINQUERY:
294     cc += 2;
295     break;
296    
297     /* Skip over UPTOs (lower bound is zero) */
298    
299     case OP_UPTO:
300     case OP_MINUPTO:
301     case OP_TYPEUPTO:
302     case OP_TYPEMINUPTO:
303     cc += 4;
304     break;
305    
306     /* Check a class or a back reference for a zero minimum */
307    
308     case OP_CLASS:
309     case OP_REF:
310 nigel 5 cc += (*cc == OP_REF)? 2 : 33;
311 nigel 3
312     switch (*cc)
313     {
314     case OP_CRSTAR:
315     case OP_CRMINSTAR:
316     case OP_CRQUERY:
317     case OP_CRMINQUERY:
318     cc++;
319     break;
320    
321     case OP_CRRANGE:
322     case OP_CRMINRANGE:
323     if ((cc[1] << 8) + cc[2] != 0) goto NEXT_BRANCH;
324     cc += 3;
325     break;
326    
327     default:
328     goto NEXT_BRANCH;
329     }
330     break;
331    
332     /* Anything else matches at least one character */
333    
334     default:
335     goto NEXT_BRANCH;
336     }
337     }
338    
339     NEXT_BRANCH:
340     code += (code[1] << 8) + code[2];
341     }
342     while (*code == OP_ALT);
343    
344     /* No branches match the empty string */
345    
346     return FALSE;
347     }
348    
349    
350    
351     /*************************************************
352     * Handle escapes *
353     *************************************************/
354    
355     /* This function is called when a \ has been encountered. It either returns a
356     positive value for a simple escape such as \n, or a negative value which
357     encodes one of the more complicated things such as \d. On entry, ptr is
358     pointing at the \. On exit, it is on the final character of the escape
359     sequence.
360    
361     Arguments:
362     ptrptr points to the pattern position pointer
363     errorptr points to the pointer to the error message
364     bracount number of previous extracting brackets
365     options the options bits
366     isclass TRUE if inside a character class
367    
368     Returns: zero or positive => a data character
369     negative => a special escape sequence
370     on error, errorptr is set
371     */
372    
373     static int
374 nigel 7 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
375     int options, BOOL isclass)
376 nigel 3 {
377 nigel 7 const uschar *ptr = *ptrptr;
378 nigel 3 int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
379     int i;
380    
381     if (c == 0) *errorptr = ERR1;
382    
383     /* Digits or letters may have special meaning; all others are literals. */
384    
385     else if (c < '0' || c > 'z') {}
386    
387     /* Do an initial lookup in a table. A non-zero result is something that can be
388     returned immediately. Otherwise further processing may be required. */
389    
390     else if ((i = escapes[c - '0']) != 0) c = i;
391    
392     /* Escapes that need further processing, or are illegal. */
393    
394     else
395     {
396 nigel 7 const uschar *oldptr;
397 nigel 3 switch (c)
398     {
399     /* The handling of escape sequences consisting of a string of digits
400     starting with one that is not zero is not straightforward. By experiment,
401     the way Perl works seems to be as follows:
402    
403     Outside a character class, the digits are read as a decimal number. If the
404     number is less than 10, or if there are that many previous extracting
405     left brackets, then it is a back reference. Otherwise, up to three octal
406     digits are read to form an escaped byte. Thus \123 is likely to be octal
407     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
408     value is greater than 377, the least significant 8 bits are taken. Inside a
409     character class, \ followed by a digit is always an octal number. */
410    
411     case '1': case '2': case '3': case '4': case '5':
412     case '6': case '7': case '8': case '9':
413    
414     if (!isclass)
415     {
416     oldptr = ptr;
417     c -= '0';
418     while ((pcre_ctypes[ptr[1]] & ctype_digit) != 0)
419     c = c * 10 + *(++ptr) - '0';
420     if (c < 10 || c <= bracount)
421     {
422     c = -(ESC_REF + c);
423     break;
424     }
425     ptr = oldptr; /* Put the pointer back and fall through */
426     }
427    
428     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
429     generates a binary zero byte and treats the digit as a following literal.
430     Thus we have to pull back the pointer by one. */
431    
432     if ((c = *ptr) >= '8')
433     {
434     ptr--;
435     c = 0;
436     break;
437     }
438    
439     /* \0 always starts an octal number, but we may drop through to here with a
440     larger first octal digit */
441    
442     case '0':
443     c -= '0';
444     while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 &&
445     ptr[1] != '8' && ptr[1] != '9')
446     c = c * 8 + *(++ptr) - '0';
447     break;
448    
449     /* Special escapes not starting with a digit are straightforward */
450    
451     case 'x':
452     c = 0;
453     while (i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0)
454     {
455     ptr++;
456     c = c * 16 + pcre_lcc[*ptr] -
457     (((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
458     }
459     break;
460    
461     case 'c':
462     c = *(++ptr);
463     if (c == 0)
464     {
465     *errorptr = ERR2;
466     return 0;
467     }
468    
469     /* A letter is upper-cased; then the 0x40 bit is flipped */
470    
471     if (c >= 'a' && c <= 'z') c = pcre_fcc[c];
472     c ^= 0x40;
473     break;
474    
475     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
476     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
477     for Perl compatibility, it is a literal. */
478    
479     default:
480     if ((options & PCRE_EXTRA) != 0) switch(c)
481     {
482     case 'X':
483     c = -ESC_X; /* This could be a lookup if it ever got into Perl */
484     break;
485    
486     default:
487     *errorptr = ERR3;
488     break;
489     }
490     break;
491     }
492     }
493    
494     *ptrptr = ptr;
495     return c;
496     }
497    
498    
499    
500     /*************************************************
501     * Check for counted repeat *
502     *************************************************/
503    
504     /* This function is called when a '{' is encountered in a place where it might
505     start a quantifier. It looks ahead to see if it really is a quantifier or not.
506     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
507     where the ddds are digits.
508    
509     Arguments:
510     p pointer to the first char after '{'
511    
512     Returns: TRUE or FALSE
513     */
514    
515     static BOOL
516 nigel 7 is_counted_repeat(const uschar *p)
517 nigel 3 {
518     if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
519     while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
520     if (*p == '}') return TRUE;
521    
522     if (*p++ != ',') return FALSE;
523     if (*p == '}') return TRUE;
524    
525     if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
526     while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
527     return (*p == '}');
528     }
529    
530    
531    
532     /*************************************************
533     * Read repeat counts *
534     *************************************************/
535    
536     /* Read an item of the form {n,m} and return the values. This is called only
537     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
538     so the syntax is guaranteed to be correct, but we need to check the values.
539    
540     Arguments:
541     p pointer to first char after '{'
542     minp pointer to int for min
543     maxp pointer to int for max
544     returned as -1 if no max
545     errorptr points to pointer to error message
546    
547     Returns: pointer to '}' on success;
548     current ptr on error, with errorptr set
549     */
550    
551 nigel 7 static const uschar *
552     read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
553 nigel 3 {
554     int min = 0;
555     int max = -1;
556    
557     while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
558    
559     if (*p == '}') max = min; else
560     {
561     if (*(++p) != '}')
562     {
563     max = 0;
564     while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
565     if (max < min)
566     {
567     *errorptr = ERR4;
568     return p;
569     }
570     }
571     }
572    
573     /* Do paranoid checks, then fill in the required variables, and pass back the
574     pointer to the terminating '}'. */
575    
576     if (min > 65535 || max > 65535)
577     *errorptr = ERR5;
578     else
579     {
580     *minp = min;
581     *maxp = max;
582     }
583     return p;
584     }
585    
586    
587    
588     /*************************************************
589     * Compile one branch *
590     *************************************************/
591    
592     /* Scan the pattern, compiling it into the code vector.
593    
594     Arguments:
595     options the option bits
596     bracket points to number of brackets used
597     code points to the pointer to the current code point
598     ptrptr points to the current pattern pointer
599     errorptr points to pointer to error message
600    
601     Returns: TRUE on success
602     FALSE, with *errorptr set on error
603     */
604    
605     static BOOL
606 nigel 7 compile_branch(int options, int *brackets, uschar **codeptr,
607     const uschar **ptrptr, const char **errorptr)
608 nigel 3 {
609     int repeat_type, op_type;
610     int repeat_min, repeat_max;
611     int bravalue, length;
612     register int c;
613     register uschar *code = *codeptr;
614 nigel 7 const uschar *ptr = *ptrptr;
615     const uschar *oldptr;
616 nigel 3 uschar *previous = NULL;
617     uschar class[32];
618    
619     /* Switch on next character until the end of the branch */
620    
621     for (;; ptr++)
622     {
623     BOOL negate_class;
624     int class_charcount;
625     int class_lastchar;
626    
627     c = *ptr;
628     if ((options & PCRE_EXTENDED) != 0)
629     {
630     if ((pcre_ctypes[c] & ctype_space) != 0) continue;
631     if (c == '#')
632     {
633     while ((c = *(++ptr)) != 0 && c != '\n');
634     continue;
635     }
636     }
637    
638     switch(c)
639     {
640     /* The branch terminates at end of string, |, or ). */
641    
642     case 0:
643     case '|':
644     case ')':
645     *codeptr = code;
646     *ptrptr = ptr;
647     return TRUE;
648    
649     /* Handle single-character metacharacters */
650    
651     case '^':
652     previous = NULL;
653     *code++ = OP_CIRC;
654     break;
655    
656     case '$':
657     previous = NULL;
658     *code++ = OP_DOLL;
659     break;
660    
661     case '.':
662     previous = code;
663     *code++ = OP_ANY;
664     break;
665    
666     /* Character classes. These always build a 32-byte bitmap of the permitted
667     characters, except in the special case where there is only one character.
668     For negated classes, we build the map as usual, then invert it at the end.
669     */
670    
671     case '[':
672     previous = code;
673     *code++ = OP_CLASS;
674    
675     /* If the first character is '^', set the negation flag */
676    
677     if ((c = *(++ptr)) == '^')
678     {
679     negate_class = TRUE;
680     c = *(++ptr);
681     }
682     else negate_class = FALSE;
683    
684     /* Keep a count of chars so that we can optimize the case of just a single
685     character. */
686    
687     class_charcount = 0;
688     class_lastchar = -1;
689    
690     /* Initialize the 32-char bit map to all zeros. We have to build the
691     map in a temporary bit of store, in case the class contains only 1
692     character, because in that case the compiled code doesn't use the
693     bit map. */
694    
695     memset(class, 0, 32 * sizeof(uschar));
696    
697     /* Process characters until ] is reached. By writing this as a "do" it
698     means that an initial ] is taken as a data character. */
699    
700     do
701     {
702     if (c == 0)
703     {
704     *errorptr = ERR6;
705     goto FAILED;
706     }
707    
708     /* Backslash may introduce a single character, or it may introduce one
709     of the specials, which just set a flag. Escaped items are checked for
710     validity in the pre-compiling pass. The sequence \b is a special case.
711 nigel 7 Inside a class (and only there) it is treated as backspace. Elsewhere
712 nigel 3 it marks a word boundary. Other escapes have preset maps ready to
713     or into the one we are building. We assume they have more than one
714     character in them, so set class_count bigger than one. */
715    
716     if (c == '\\')
717     {
718     c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
719     if (-c == ESC_b) c = '\b';
720     else if (c < 0)
721     {
722     class_charcount = 10;
723     switch (-c)
724     {
725     case ESC_d:
726     for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit];
727     continue;
728    
729     case ESC_D:
730     for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit];
731     continue;
732    
733     case ESC_w:
734     for (c = 0; c < 32; c++)
735     class[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
736     continue;
737    
738     case ESC_W:
739     for (c = 0; c < 32; c++)
740     class[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
741     continue;
742    
743     case ESC_s:
744     for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space];
745     continue;
746    
747     case ESC_S:
748     for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space];
749     continue;
750    
751     default:
752     *errorptr = ERR7;
753     goto FAILED;
754     }
755     }
756     /* Fall through if single character */
757     }
758    
759     /* A single character may be followed by '-' to form a range. However,
760     Perl does not permit ']' to be the end of the range. A '-' character
761     here is treated as a literal. */
762    
763     if (ptr[1] == '-' && ptr[2] != ']')
764     {
765     int d;
766     ptr += 2;
767     d = *ptr;
768    
769     if (d == 0)
770     {
771     *errorptr = ERR6;
772     goto FAILED;
773     }
774    
775     /* The second part of a range can be a single-character escape, but
776     not any of the other escapes. */
777    
778     if (d == '\\')
779     {
780     d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
781     if (d < 0)
782     {
783     if (d == -ESC_b) d = '\b'; else
784     {
785     *errorptr = ERR7;
786     goto FAILED;
787     }
788     }
789     }
790    
791     if (d < c)
792     {
793     *errorptr = ERR8;
794     goto FAILED;
795     }
796    
797     for (; c <= d; c++)
798     {
799     class[c/8] |= (1 << (c&7));
800     if ((options & PCRE_CASELESS) != 0)
801     {
802     int uc = pcre_fcc[c]; /* flip case */
803     class[uc/8] |= (1 << (uc&7));
804     }
805     class_charcount++; /* in case a one-char range */
806     class_lastchar = c;
807     }
808     continue; /* Go get the next char in the class */
809     }
810    
811     /* Handle a lone single character - we can get here for a normal
812     non-escape char, or after \ that introduces a single character. */
813    
814     class [c/8] |= (1 << (c&7));
815     if ((options & PCRE_CASELESS) != 0)
816     {
817     c = pcre_fcc[c]; /* flip case */
818     class[c/8] |= (1 << (c&7));
819     }
820     class_charcount++;
821     class_lastchar = c;
822     }
823    
824     /* Loop until ']' reached; the check for end of string happens inside the
825     loop. This "while" is the end of the "do" above. */
826    
827     while ((c = *(++ptr)) != ']');
828    
829     /* If class_charcount is 1 and class_lastchar is not negative, we saw
830     precisely one character. This doesn't need the whole 32-byte bit map.
831     We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
832     it's negative. */
833    
834     if (class_charcount == 1 && class_lastchar >= 0)
835     {
836     if (negate_class)
837     {
838     code[-1] = OP_NOT;
839     }
840     else
841     {
842     code[-1] = OP_CHARS;
843     *code++ = 1;
844     }
845     *code++ = class_lastchar;
846     }
847    
848     /* Otherwise, negate the 32-byte map if necessary, and copy it into
849     the code vector. */
850    
851     else
852     {
853     if (negate_class)
854     for (c = 0; c < 32; c++) code[c] = ~class[c];
855     else
856     memcpy(code, class, 32);
857     code += 32;
858     }
859     break;
860    
861     /* Various kinds of repeat */
862    
863     case '{':
864     if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
865     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
866     if (*errorptr != NULL) goto FAILED;
867     goto REPEAT;
868    
869     case '*':
870     repeat_min = 0;
871     repeat_max = -1;
872     goto REPEAT;
873    
874     case '+':
875     repeat_min = 1;
876     repeat_max = -1;
877     goto REPEAT;
878    
879     case '?':
880     repeat_min = 0;
881     repeat_max = 1;
882    
883     REPEAT:
884     if (previous == NULL)
885     {
886     *errorptr = ERR9;
887     goto FAILED;
888     }
889    
890     /* If the next character is '?' this is a minimizing repeat. Advance to the
891     next character. */
892    
893     if (ptr[1] == '?') { repeat_type = 1; ptr++; } else repeat_type = 0;
894    
895     /* If the maximum is zero then the minimum must also be zero; Perl allows
896     this case, so we do too - by simply omitting the item altogether. */
897    
898     if (repeat_max == 0) code = previous;
899    
900     /* If previous was a string of characters, chop off the last one and use it
901     as the subject of the repeat. If there was only one character, we can
902     abolish the previous item altogether. */
903    
904     else if (*previous == OP_CHARS)
905     {
906     int len = previous[1];
907     if (len == 1)
908     {
909     c = previous[2];
910     code = previous;
911     }
912     else
913     {
914     c = previous[len+1];
915     previous[1]--;
916     code--;
917     }
918     op_type = 0; /* Use single-char op codes */
919     goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
920     }
921    
922     /* If previous was a single negated character ([^a] or similar), we use
923     one of the special opcodes, replacing it. The code is shared with single-
924     character repeats by adding a suitable offset into repeat_type. */
925    
926     else if ((int)*previous == OP_NOT)
927     {
928     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
929     c = previous[1];
930     code = previous;
931     goto OUTPUT_SINGLE_REPEAT;
932     }
933    
934     /* If previous was a character type match (\d or similar), abolish it and
935     create a suitable repeat item. The code is shared with single-character
936     repeats by adding a suitable offset into repeat_type. */
937    
938     else if ((int)*previous < OP_EOD || *previous == OP_ANY)
939     {
940     op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
941     c = *previous;
942     code = previous;
943    
944     OUTPUT_SINGLE_REPEAT:
945     repeat_type += op_type; /* Combine both values for many cases */
946    
947     /* A minimum of zero is handled either as the special case * or ?, or as
948     an UPTO, with the maximum given. */
949    
950     if (repeat_min == 0)
951     {
952     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
953     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
954     else
955     {
956     *code++ = OP_UPTO + repeat_type;
957     *code++ = repeat_max >> 8;
958     *code++ = (repeat_max & 255);
959     }
960     }
961    
962     /* The case {1,} is handled as the special case + */
963    
964     else if (repeat_min == 1 && repeat_max == -1)
965     *code++ = OP_PLUS + repeat_type;
966    
967     /* The case {n,n} is just an EXACT, while the general case {n,m} is
968     handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
969    
970     else
971     {
972     if (repeat_min != 1)
973     {
974     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
975     *code++ = repeat_min >> 8;
976     *code++ = (repeat_min & 255);
977     }
978    
979     /* If the mininum is 1 and the previous item was a character string,
980     we either have to put back the item that got cancelled if the string
981     length was 1, or add the character back onto the end of a longer
982     string. For a character type nothing need be done; it will just get put
983     back naturally. */
984    
985     else if (*previous == OP_CHARS)
986     {
987     if (code == previous) code += 2; else previous[1]++;
988     }
989    
990 nigel 9 /* If the maximum is unlimited, insert an OP_STAR. */
991 nigel 3
992 nigel 9 if (repeat_max < 0)
993 nigel 3 {
994     *code++ = c;
995 nigel 9 *code++ = OP_STAR + repeat_type;
996     }
997    
998     /* Else insert an UPTO if the max is greater than the min. */
999    
1000     else if (repeat_max != repeat_min)
1001     {
1002     *code++ = c;
1003 nigel 3 repeat_max -= repeat_min;
1004     *code++ = OP_UPTO + repeat_type;
1005     *code++ = repeat_max >> 8;
1006     *code++ = (repeat_max & 255);
1007     }
1008     }
1009    
1010     /* The character or character type itself comes last in all cases. */
1011    
1012     *code++ = c;
1013     }
1014    
1015     /* If previous was a character class or a back reference, we put the repeat
1016     stuff after it. */
1017    
1018     else if (*previous == OP_CLASS || *previous == OP_REF)
1019     {
1020     if (repeat_min == 0 && repeat_max == -1)
1021     *code++ = OP_CRSTAR + repeat_type;
1022     else if (repeat_min == 1 && repeat_max == -1)
1023     *code++ = OP_CRPLUS + repeat_type;
1024     else if (repeat_min == 0 && repeat_max == 1)
1025     *code++ = OP_CRQUERY + repeat_type;
1026     else
1027     {
1028     *code++ = OP_CRRANGE + repeat_type;
1029     *code++ = repeat_min >> 8;
1030     *code++ = repeat_min & 255;
1031     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1032     *code++ = repeat_max >> 8;
1033     *code++ = repeat_max & 255;
1034     }
1035     }
1036    
1037     /* If previous was a bracket group, we may have to replicate it in certain
1038     cases. If the maximum repeat count is unlimited, check that the bracket
1039     group cannot match the empty string, and diagnose an error if it can. */
1040    
1041     else if ((int)*previous >= OP_BRA)
1042     {
1043     int i;
1044 nigel 9 int len = code - previous;
1045 nigel 3
1046     if (repeat_max == -1 && could_be_empty(previous))
1047     {
1048     *errorptr = ERR10;
1049     goto FAILED;
1050     }
1051    
1052     /* If the minimum is greater than zero, and the maximum is unlimited or
1053     equal to the minimum, the first copy remains where it is, and is
1054     replicated up to the minimum number of times. This case includes the +
1055     repeat, but of course no replication is needed in that case. */
1056    
1057     if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1058     {
1059     for (i = 1; i < repeat_min; i++)
1060     {
1061 nigel 9 memcpy(code, previous, len);
1062     code += len;
1063 nigel 3 }
1064     }
1065    
1066     /* If the minimum is zero, stick BRAZERO in front of the first copy.
1067     Then, if there is a fixed upper limit, replicated up to that many times,
1068     sticking BRAZERO in front of all the optional ones. */
1069    
1070     else
1071     {
1072     if (repeat_min == 0)
1073     {
1074 nigel 9 memmove(previous+1, previous, len);
1075 nigel 3 code++;
1076     *previous++ = OP_BRAZERO + repeat_type;
1077     }
1078    
1079     for (i = 1; i < repeat_min; i++)
1080     {
1081 nigel 9 memcpy(code, previous, len);
1082     code += len;
1083 nigel 3 }
1084    
1085     for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1086     {
1087     *code++ = OP_BRAZERO + repeat_type;
1088 nigel 9 memcpy(code, previous, len);
1089     code += len;
1090 nigel 3 }
1091     }
1092    
1093     /* If the maximum is unlimited, set a repeater in the final copy. */
1094    
1095     if (repeat_max == -1) code[-3] = OP_KETRMAX + repeat_type;
1096     }
1097    
1098     /* Else there's some kind of shambles */
1099    
1100     else
1101     {
1102     *errorptr = ERR11;
1103     goto FAILED;
1104     }
1105    
1106     /* In all case we no longer have a previous item. */
1107    
1108     previous = NULL;
1109     break;
1110    
1111    
1112     /* Start of nested bracket sub-expression, or comment or lookahead.
1113     First deal with special things that can come after a bracket; all are
1114     introduced by ?, and the appearance of any of them means that this is not a
1115     referencing group. They were checked for validity in the first pass over
1116     the string, so we don't have to check for syntax errors here. */
1117    
1118     case '(':
1119     previous = code; /* Only real brackets can be repeated */
1120     if (*(++ptr) == '?')
1121     {
1122     bravalue = OP_BRA;
1123    
1124     switch (*(++ptr))
1125     {
1126     case '#':
1127     case 'i':
1128     case 'm':
1129     case 's':
1130     case 'x':
1131     ptr++;
1132     while (*ptr != ')') ptr++;
1133     previous = NULL;
1134     continue;
1135    
1136     case ':': /* Non-extracting bracket */
1137     ptr++;
1138     break;
1139    
1140     case '=': /* Assertions can't be repeated */
1141     bravalue = OP_ASSERT;
1142     ptr++;
1143     previous = NULL;
1144     break;
1145    
1146     case '!':
1147     bravalue = OP_ASSERT_NOT;
1148     ptr++;
1149     previous = NULL;
1150     break;
1151    
1152     case '>': /* "Match once" brackets */
1153     if ((options & PCRE_EXTRA) != 0) /* Not yet standard */
1154     {
1155     bravalue = OP_ONCE;
1156     ptr++;
1157     previous = NULL;
1158     break;
1159     }
1160     /* Else fall through */
1161    
1162     default:
1163     *errorptr = ERR12;
1164     goto FAILED;
1165     }
1166     }
1167    
1168     /* Else we have a referencing group */
1169    
1170     else
1171     {
1172     if (++(*brackets) > EXTRACT_MAX)
1173     {
1174     *errorptr = ERR13;
1175     goto FAILED;
1176     }
1177     bravalue = OP_BRA + *brackets;
1178     }
1179    
1180     /* Process nested bracketed re; at end pointer is on the bracket. We copy
1181     code into a non-register variable in order to be able to pass its address
1182     because some compilers complain otherwise. */
1183    
1184     *code = bravalue;
1185     {
1186     uschar *mcode = code;
1187     if (!compile_regex(options, brackets, &mcode, &ptr, errorptr))
1188     goto FAILED;
1189     code = mcode;
1190     }
1191    
1192     if (*ptr != ')')
1193     {
1194     *errorptr = ERR14;
1195     goto FAILED;
1196     }
1197     break;
1198    
1199     /* Check \ for being a real metacharacter; if not, fall through and handle
1200     it as a data character at the start of a string. Escape items are checked
1201     for validity in the pre-compiling pass. */
1202    
1203     case '\\':
1204     oldptr = ptr;
1205     c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
1206    
1207     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1208     are arranged to be the negation of the corresponding OP_values. For the
1209     back references, the values are ESC_REF plus the reference number. Only
1210     back references and those types that consume a character may be repeated.
1211     We can test for values between ESC_b and ESC_Z for the latter; this may
1212     have to change if any new ones are ever created. */
1213    
1214     if (c < 0)
1215     {
1216     if (-c >= ESC_REF)
1217     {
1218     int refnum = -c - ESC_REF;
1219     if (*brackets < refnum)
1220     {
1221     *errorptr = ERR15;
1222     goto FAILED;
1223     }
1224     previous = code;
1225     *code++ = OP_REF;
1226     *code++ = refnum;
1227     }
1228     else
1229     {
1230     previous = (-c > ESC_b && -c < ESC_X)? code : NULL;
1231     *code++ = -c;
1232     }
1233     continue;
1234     }
1235    
1236 nigel 7 /* Data character: reset and fall through */
1237 nigel 3
1238     ptr = oldptr;
1239     c = '\\';
1240    
1241     /* Handle a run of data characters until a metacharacter is encountered.
1242     The first character is guaranteed not to be whitespace or # when the
1243     extended flag is set. */
1244    
1245     NORMAL_CHAR:
1246     default:
1247     previous = code;
1248     *code = OP_CHARS;
1249     code += 2;
1250     length = 0;
1251    
1252     do
1253     {
1254     if ((options & PCRE_EXTENDED) != 0)
1255     {
1256     if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1257     if (c == '#')
1258     {
1259     while ((c = *(++ptr)) != 0 && c != '\n');
1260     if (c == 0) break;
1261     continue;
1262     }
1263     }
1264    
1265     /* Backslash may introduce a data char or a metacharacter. Escaped items
1266     are checked for validity in the pre-compiling pass. Stop the string
1267     before a metaitem. */
1268    
1269     if (c == '\\')
1270     {
1271     oldptr = ptr;
1272     c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
1273     if (c < 0) { ptr = oldptr; break; }
1274     }
1275    
1276     /* Ordinary character or single-char escape */
1277    
1278     *code++ = c;
1279     length++;
1280     }
1281    
1282     /* This "while" is the end of the "do" above. */
1283    
1284     while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
1285    
1286     /* Compute the length and set it in the data vector, and advance to
1287     the next state. */
1288    
1289     previous[1] = length;
1290     ptr--;
1291     break;
1292     }
1293     } /* end of big loop */
1294    
1295     /* Control never reaches here by falling through, only by a goto for all the
1296     error states. Pass back the position in the pattern so that it can be displayed
1297     to the user for diagnosing the error. */
1298    
1299     FAILED:
1300     *ptrptr = ptr;
1301     return FALSE;
1302     }
1303    
1304    
1305    
1306    
1307     /*************************************************
1308     * Compile sequence of alternatives *
1309     *************************************************/
1310    
1311     /* On entry, ptr is pointing past the bracket character, but on return
1312     it points to the closing bracket, or vertical bar, or end of string.
1313     The code variable is pointing at the byte into which the BRA operator has been
1314     stored.
1315    
1316     Argument:
1317     options the option bits
1318     brackets -> int containing the number of extracting brackets used
1319     codeptr -> the address of the current code pointer
1320     ptrptr -> the address of the current pattern pointer
1321     errorptr -> pointer to error message
1322    
1323     Returns: TRUE on success
1324     */
1325    
1326     static BOOL
1327 nigel 7 compile_regex(int options, int *brackets, uschar **codeptr,
1328     const uschar **ptrptr, const char **errorptr)
1329 nigel 3 {
1330 nigel 7 const uschar *ptr = *ptrptr;
1331 nigel 3 uschar *code = *codeptr;
1332     uschar *start_bracket = code;
1333    
1334     for (;;)
1335     {
1336     int length;
1337     uschar *last_branch = code;
1338    
1339     code += 3;
1340     if (!compile_branch(options, brackets, &code, &ptr, errorptr))
1341     {
1342     *ptrptr = ptr;
1343     return FALSE;
1344     }
1345    
1346     /* Fill in the length of the last branch */
1347    
1348     length = code - last_branch;
1349     last_branch[1] = length >> 8;
1350     last_branch[2] = length & 255;
1351    
1352     /* Reached end of expression, either ')' or end of pattern. Insert a
1353     terminating ket and the length of the whole bracketed item, and return,
1354     leaving the pointer at the terminating char. */
1355    
1356     if (*ptr != '|')
1357     {
1358     length = code - start_bracket;
1359     *code++ = OP_KET;
1360     *code++ = length >> 8;
1361     *code++ = length & 255;
1362     *codeptr = code;
1363     *ptrptr = ptr;
1364     return TRUE;
1365     }
1366    
1367     /* Another branch follows; insert an "or" node and advance the pointer. */
1368    
1369     *code = OP_ALT;
1370     ptr++;
1371     }
1372     /* Control never reaches here */
1373     }
1374    
1375    
1376    
1377     /*************************************************
1378     * Check for anchored expression *
1379     *************************************************/
1380    
1381     /* Try to find out if this is an anchored regular expression. Consider each
1382     alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
1383     all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
1384     it's anchored. However, if this is a multiline pattern, then only OP_SOD
1385     counts, since OP_CIRC can match in the middle.
1386    
1387     A branch is also implicitly anchored if it starts with .* because that will try
1388     the rest of the pattern at all possible matching points, so there is no point
1389     trying them again.
1390    
1391     Argument: points to start of expression (the bracket)
1392     Returns: TRUE or FALSE
1393     */
1394    
1395     static BOOL
1396 nigel 7 is_anchored(register const uschar *code, BOOL multiline)
1397 nigel 3 {
1398     do {
1399     int op = (int)code[3];
1400     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE)
1401     { if (!is_anchored(code+3, multiline)) return FALSE; }
1402     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
1403     { if (code[4] != OP_ANY) return FALSE; }
1404     else if (op != OP_SOD && (multiline || op != OP_CIRC)) return FALSE;
1405     code += (code[1] << 8) + code[2];
1406     }
1407     while (*code == OP_ALT);
1408     return TRUE;
1409     }
1410    
1411    
1412    
1413     /*************************************************
1414     * Check for start with \n line expression *
1415     *************************************************/
1416    
1417     /* This is called for multiline expressions to try to find out if every branch
1418     starts with ^ so that "first char" processing can be done to speed things up.
1419    
1420     Argument: points to start of expression (the bracket)
1421     Returns: TRUE or FALSE
1422     */
1423    
1424     static BOOL
1425 nigel 7 is_startline(const uschar *code)
1426 nigel 3 {
1427     do {
1428     if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
1429     { if (!is_startline(code+3)) return FALSE; }
1430     else if (code[3] != OP_CIRC) return FALSE;
1431     code += (code[1] << 8) + code[2];
1432     }
1433     while (*code == OP_ALT);
1434     return TRUE;
1435     }
1436    
1437    
1438    
1439     /*************************************************
1440     * Check for fixed first char *
1441     *************************************************/
1442    
1443     /* Try to find out if there is a fixed first character. This is called for
1444     unanchored expressions, as it speeds up their processing quite considerably.
1445     Consider each alternative branch. If they all start with the same char, or with
1446     a bracket all of whose alternatives start with the same char (recurse ad lib),
1447     then we return that char, otherwise -1.
1448    
1449     Argument: points to start of expression (the bracket)
1450     Returns: -1 or the fixed first char
1451     */
1452    
1453     static int
1454     find_firstchar(uschar *code)
1455     {
1456     register int c = -1;
1457     do
1458     {
1459     register int charoffset = 4;
1460    
1461     if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
1462     {
1463     register int d;
1464     if ((d = find_firstchar(code+3)) < 0) return -1;
1465     if (c < 0) c = d; else if (c != d) return -1;
1466     }
1467    
1468     else switch(code[3])
1469     {
1470     default:
1471     return -1;
1472    
1473     case OP_EXACT: /* Fall through */
1474     charoffset++;
1475    
1476     case OP_CHARS: /* Fall through */
1477     charoffset++;
1478    
1479     case OP_PLUS:
1480     case OP_MINPLUS:
1481     if (c < 0) c = code[charoffset]; else if (c != code[charoffset]) return -1;
1482     break;
1483     }
1484     code += (code[1] << 8) + code[2];
1485     }
1486     while (*code == OP_ALT);
1487     return c;
1488     }
1489    
1490    
1491    
1492     /*************************************************
1493     * Compile a Regular Expression *
1494     *************************************************/
1495    
1496     /* This function takes a string and returns a pointer to a block of store
1497     holding a compiled version of the expression.
1498    
1499     Arguments:
1500     pattern the regular expression
1501     options various option bits
1502     errorptr pointer to pointer to error text
1503     erroroffset ptr offset in pattern where error was detected
1504    
1505     Returns: pointer to compiled data block, or NULL on error,
1506     with errorptr and erroroffset set
1507     */
1508    
1509     pcre *
1510 nigel 7 pcre_compile(const char *pattern, int options, const char **errorptr,
1511 nigel 3 int *erroroffset)
1512     {
1513     real_pcre *re;
1514     int spaces = 0;
1515     int length = 3; /* For initial BRA plus length */
1516     int runlength;
1517     int c, size;
1518     int bracount = 0;
1519     int brastack[200];
1520     int top_backref = 0;
1521 nigel 7 unsigned int brastackptr = 0;
1522     uschar *code;
1523     const uschar *ptr;
1524 nigel 3
1525     #ifdef DEBUG
1526     uschar *code_base, *code_end;
1527     #endif
1528    
1529     /* We can't pass back an error message if errorptr is NULL; I guess the best we
1530     can do is just return NULL. */
1531    
1532     if (errorptr == NULL) return NULL;
1533     *errorptr = NULL;
1534    
1535     /* However, we can give a message for this error */
1536    
1537     if (erroroffset == NULL)
1538     {
1539     *errorptr = ERR16;
1540     return NULL;
1541     }
1542     *erroroffset = 0;
1543    
1544     if ((options & ~PUBLIC_OPTIONS) != 0)
1545     {
1546     *errorptr = ERR17;
1547     return NULL;
1548     }
1549    
1550 nigel 9 DPRINTF(("------------------------------------------------------------------\n"));
1551     DPRINTF(("%s\n", pattern));
1552 nigel 3
1553     /* The first thing to do is to make a pass over the pattern to compute the
1554     amount of store required to hold the compiled code. This does not have to be
1555     perfect as long as errors are overestimates. At the same time we can detect any
1556     internal flag settings. Make an attempt to correct for any counted white space
1557     if an "extended" flag setting appears late in the pattern. We can't be so
1558     clever for #-comments. */
1559    
1560 nigel 7 ptr = (const uschar *)(pattern - 1);
1561 nigel 3 while ((c = *(++ptr)) != 0)
1562     {
1563     int min, max;
1564     int class_charcount;
1565    
1566     if ((pcre_ctypes[c] & ctype_space) != 0)
1567     {
1568     if ((options & PCRE_EXTENDED) != 0) continue;
1569     spaces++;
1570     }
1571    
1572     if (c == '#' && (options & PCRE_EXTENDED) != 0)
1573     {
1574     while ((c = *(++ptr)) != 0 && c != '\n');
1575     continue;
1576     }
1577    
1578     switch(c)
1579     {
1580     /* A backslashed item may be an escaped "normal" character or a
1581     character type. For a "normal" character, put the pointers and
1582     character back so that tests for whitespace etc. in the input
1583     are done correctly. */
1584    
1585     case '\\':
1586     {
1587 nigel 7 const uschar *save_ptr = ptr;
1588 nigel 3 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
1589     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1590     if (c >= 0)
1591     {
1592     ptr = save_ptr;
1593     c = '\\';
1594     goto NORMAL_CHAR;
1595     }
1596     }
1597     length++;
1598    
1599     /* A back reference needs an additional char, plus either one or 5
1600     bytes for a repeat. We also need to keep the value of the highest
1601     back reference. */
1602    
1603     if (c <= -ESC_REF)
1604     {
1605     int refnum = -c - ESC_REF;
1606     if (refnum > top_backref) top_backref = refnum;
1607     length++; /* For single back reference */
1608     if (ptr[1] == '{' && is_counted_repeat(ptr+2))
1609     {
1610     ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
1611     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1612     if ((min == 0 && (max == 1 || max == -1)) ||
1613     (min == 1 && max == -1))
1614     length++;
1615     else length += 5;
1616     if (ptr[1] == '?') ptr++;
1617     }
1618     }
1619     continue;
1620    
1621     case '^':
1622     case '.':
1623     case '$':
1624     case '*': /* These repeats won't be after brackets; */
1625     case '+': /* those are handled separately */
1626     case '?':
1627     length++;
1628     continue;
1629    
1630     /* This covers the cases of repeats after a single char, metachar, class,
1631     or back reference. */
1632    
1633     case '{':
1634     if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
1635     ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
1636     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1637     if ((min == 0 && (max == 1 || max == -1)) ||
1638     (min == 1 && max == -1))
1639     length++;
1640     else
1641     {
1642     length--; /* Uncount the original char or metachar */
1643     if (min == 1) length++; else if (min > 0) length += 4;
1644     if (max > 0) length += 4; else length += 2;
1645     }
1646     if (ptr[1] == '?') ptr++;
1647     continue;
1648    
1649     /* An alternation contains an offset to the next branch or ket. */
1650     case '|':
1651     length += 3;
1652     continue;
1653    
1654     /* A character class uses 33 characters. Don't worry about character types
1655     that aren't allowed in classes - they'll get picked up during the compile.
1656     A character class that contains only one character uses 2 or 3 bytes,
1657     depending on whether it is negated or not. Notice this where we can. */
1658    
1659     case '[':
1660     class_charcount = 0;
1661     if (*(++ptr) == '^') ptr++;
1662     do
1663     {
1664     if (*ptr == '\\')
1665     {
1666 nigel 9 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE);
1667 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1668 nigel 9 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
1669 nigel 3 }
1670     else class_charcount++;
1671     ptr++;
1672     }
1673     while (*ptr != 0 && *ptr != ']');
1674    
1675     /* Repeats for negated single chars are handled by the general code */
1676    
1677     if (class_charcount == 1) length += 3; else
1678     {
1679     length += 33;
1680    
1681     /* A repeat needs either 1 or 5 bytes. */
1682    
1683 nigel 9 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
1684 nigel 3 {
1685     ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
1686     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1687     if ((min == 0 && (max == 1 || max == -1)) ||
1688     (min == 1 && max == -1))
1689     length++;
1690     else length += 5;
1691     if (ptr[1] == '?') ptr++;
1692     }
1693     }
1694     continue;
1695    
1696     /* Brackets may be genuine groups or special things */
1697    
1698     case '(':
1699    
1700     /* Handle special forms of bracket, which all start (? */
1701    
1702     if (ptr[1] == '?') switch (c = ptr[2])
1703     {
1704     /* Skip over comments entirely */
1705     case '#':
1706     ptr += 3;
1707     while (*ptr != 0 && *ptr != ')') ptr++;
1708     if (*ptr == 0)
1709     {
1710     *errorptr = ERR18;
1711     goto PCRE_ERROR_RETURN;
1712     }
1713     continue;
1714    
1715     /* Non-referencing groups and lookaheads just move the pointer on, and
1716     then behave like a non-special bracket, except that they don't increment
1717     the count of extracting brackets. */
1718    
1719     case ':':
1720     case '=':
1721     case '!':
1722     ptr += 2;
1723     break;
1724    
1725     /* Ditto for the "once only" bracket, allowed only if the extra bit
1726     is set. */
1727    
1728     case '>':
1729     if ((options & PCRE_EXTRA) != 0)
1730     {
1731     ptr += 2;
1732     break;
1733     }
1734     /* Else fall thourh */
1735    
1736     /* Else loop setting valid options until ) is met. Anything else is an
1737     error. */
1738    
1739     default:
1740     ptr += 2;
1741     for (;; ptr++)
1742     {
1743     if ((c = *ptr) == 'i')
1744     {
1745     options |= PCRE_CASELESS;
1746     continue;
1747     }
1748     else if ((c = *ptr) == 'm')
1749     {
1750     options |= PCRE_MULTILINE;
1751     continue;
1752     }
1753     else if (c == 's')
1754     {
1755     options |= PCRE_DOTALL;
1756     continue;
1757     }
1758     else if (c == 'x')
1759     {
1760     options |= PCRE_EXTENDED;
1761     length -= spaces; /* Already counted spaces */
1762     continue;
1763     }
1764     else if (c == ')') break;
1765    
1766     *errorptr = ERR12;
1767     goto PCRE_ERROR_RETURN;
1768     }
1769     continue; /* End of this bracket handling */
1770     }
1771    
1772     /* Extracting brackets must be counted so we can process escapes in a
1773     Perlish way. */
1774    
1775     else bracount++;
1776    
1777     /* Non-special forms of bracket. Save length for computing whole length
1778     at end if there's a repeat that requires duplication of the group. */
1779    
1780     if (brastackptr >= sizeof(brastack)/sizeof(int))
1781     {
1782     *errorptr = ERR19;
1783     goto PCRE_ERROR_RETURN;
1784     }
1785    
1786     brastack[brastackptr++] = length;
1787     length += 3;
1788     continue;
1789    
1790     /* Handle ket. Look for subsequent max/min; for certain sets of values we
1791 nigel 9 have to replicate this bracket up to that many times. If brastackptr is
1792     0 this is an unmatched bracket which will generate an error, but take care
1793     not to try to access brastack[-1]. */
1794 nigel 3
1795     case ')':
1796     length += 3;
1797     {
1798 nigel 9 int minval = 1;
1799     int maxval = 1;
1800     int duplength = (brastackptr > 0)? length - brastack[--brastackptr] : 0;
1801 nigel 3
1802     /* Leave ptr at the final char; for read_repeat_counts this happens
1803     automatically; for the others we need an increment. */
1804    
1805     if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
1806     {
1807 nigel 9 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr);
1808 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1809     }
1810 nigel 9 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
1811     else if (c == '+') { maxval = -1; ptr++; }
1812     else if (c == '?') { minval = 0; ptr++; }
1813 nigel 3
1814 nigel 9 /* If there is a minimum > 1 we have to replicate up to minval-1 times;
1815     if there is a limited maximum we have to replicate up to maxval-1 times
1816     and allow for a BRAZERO item before each optional copy, as we also have
1817     to do before the first copy if the minimum is zero. */
1818 nigel 3
1819 nigel 9 if (minval == 0) length++;
1820     else if (minval > 1) length += (minval - 1) * duplength;
1821     if (maxval > minval) length += (maxval - minval) * (duplength + 1);
1822 nigel 3 }
1823     continue;
1824    
1825     /* Non-special character. For a run of such characters the length required
1826     is the number of characters + 2, except that the maximum run length is 255.
1827     We won't get a skipped space or a non-data escape or the start of a #
1828     comment as the first character, so the length can't be zero. */
1829    
1830     NORMAL_CHAR:
1831     default:
1832     length += 2;
1833     runlength = 0;
1834     do
1835     {
1836     if ((pcre_ctypes[c] & ctype_space) != 0)
1837     {
1838     if ((options & PCRE_EXTENDED) != 0) continue;
1839     spaces++;
1840     }
1841    
1842     if (c == '#' && (options & PCRE_EXTENDED) != 0)
1843     {
1844     while ((c = *(++ptr)) != 0 && c != '\n');
1845     continue;
1846     }
1847    
1848     /* Backslash may introduce a data char or a metacharacter; stop the
1849     string before the latter. */
1850    
1851     if (c == '\\')
1852     {
1853 nigel 7 const uschar *saveptr = ptr;
1854 nigel 3 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
1855     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1856     if (c < 0) { ptr = saveptr; break; }
1857     }
1858    
1859     /* Ordinary character or single-char escape */
1860    
1861     runlength++;
1862     }
1863    
1864     /* This "while" is the end of the "do" above. */
1865    
1866     while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
1867    
1868     ptr--;
1869     length += runlength;
1870     continue;
1871     }
1872     }
1873    
1874     length += 4; /* For final KET and END */
1875    
1876     if (length > 65539)
1877     {
1878     *errorptr = ERR20;
1879     return NULL;
1880     }
1881    
1882     /* Compute the size of data block needed and get it, either from malloc or
1883 nigel 9 externally provided function. We specify "code[0]" in the offsetof() expression
1884     rather than just "code", because it has been reported that one broken compiler
1885     fails on "code" because it is also an independent variable. It should make no
1886     difference to the value of the offsetof(). */
1887 nigel 3
1888 nigel 9 size = length + offsetof(real_pcre, code[0]);
1889 nigel 3 re = (real_pcre *)(pcre_malloc)(size);
1890    
1891     if (re == NULL)
1892     {
1893     *errorptr = ERR21;
1894     return NULL;
1895     }
1896    
1897 nigel 9 /* Put in the magic number and the options. */
1898    
1899 nigel 3 re->magic_number = MAGIC_NUMBER;
1900     re->options = options;
1901    
1902     /* Set up a starting, non-extracting bracket, then compile the expression. On
1903     error, *errorptr will be set non-NULL, so we don't need to look at the result
1904     of the function here. */
1905    
1906 nigel 7 ptr = (const uschar *)pattern;
1907 nigel 3 code = re->code;
1908     *code = OP_BRA;
1909     bracount = 0;
1910     (void)compile_regex(options, &bracount, &code, &ptr, errorptr);
1911     re->top_bracket = bracount;
1912     re->top_backref = top_backref;
1913    
1914     /* If not reached end of pattern on success, there's an excess bracket. */
1915    
1916     if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
1917    
1918     /* Fill in the terminating state and check for disastrous overflow, but
1919     if debugging, leave the test till after things are printed out. */
1920    
1921     *code++ = OP_END;
1922    
1923     #ifndef DEBUG
1924     if (code - re->code > length) *errorptr = ERR23;
1925     #endif
1926    
1927     /* Failed to compile */
1928    
1929     if (*errorptr != NULL)
1930     {
1931     (pcre_free)(re);
1932     PCRE_ERROR_RETURN:
1933 nigel 7 *erroroffset = ptr - (const uschar *)pattern;
1934 nigel 3 return NULL;
1935     }
1936    
1937     /* If the anchored option was not passed, set flag if we can determine that it
1938     is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
1939     we can determine what the first character has to be, because that speeds up
1940     unanchored matches no end. In the case of multiline matches, an alternative is
1941     to set the PCRE_STARTLINE flag if all branches start with ^. */
1942    
1943     if ((options & PCRE_ANCHORED) == 0)
1944     {
1945     if (is_anchored(re->code, (options & PCRE_MULTILINE) != 0))
1946     re->options |= PCRE_ANCHORED;
1947     else
1948     {
1949 nigel 9 int ch = find_firstchar(re->code);
1950     if (ch >= 0)
1951 nigel 3 {
1952 nigel 9 re->first_char = ch;
1953 nigel 3 re->options |= PCRE_FIRSTSET;
1954     }
1955     else if (is_startline(re->code))
1956     re->options |= PCRE_STARTLINE;
1957     }
1958     }
1959    
1960     /* Print out the compiled data for debugging */
1961    
1962     #ifdef DEBUG
1963    
1964     printf("Length = %d top_bracket = %d top_backref=%d\n",
1965     length, re->top_bracket, re->top_backref);
1966    
1967     if (re->options != 0)
1968     {
1969     printf("%s%s%s%s%s%s%s\n",
1970     ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
1971     ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
1972     ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
1973     ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
1974     ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
1975     ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
1976     ((re->options & PCRE_EXTRA) != 0)? "extra " : "");
1977     }
1978    
1979     if ((re->options & PCRE_FIRSTSET) != 0)
1980     {
1981     if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
1982     else printf("First char = \\x%02x\n", re->first_char);
1983     }
1984    
1985     code_end = code;
1986     code_base = code = re->code;
1987    
1988     while (code < code_end)
1989     {
1990     int charlength;
1991    
1992     printf("%3d ", code - code_base);
1993    
1994     if (*code >= OP_BRA)
1995     {
1996     printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
1997     code += 2;
1998     }
1999    
2000     else switch(*code)
2001     {
2002     case OP_CHARS:
2003     charlength = *(++code);
2004     printf("%3d ", charlength);
2005     while (charlength-- > 0)
2006     if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2007     break;
2008    
2009     case OP_KETRMAX:
2010     case OP_KETRMIN:
2011     case OP_ALT:
2012     case OP_KET:
2013     case OP_ASSERT:
2014     case OP_ASSERT_NOT:
2015     case OP_ONCE:
2016     printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2017     code += 2;
2018     break;
2019    
2020     case OP_STAR:
2021     case OP_MINSTAR:
2022     case OP_PLUS:
2023     case OP_MINPLUS:
2024     case OP_QUERY:
2025     case OP_MINQUERY:
2026     case OP_TYPESTAR:
2027     case OP_TYPEMINSTAR:
2028     case OP_TYPEPLUS:
2029     case OP_TYPEMINPLUS:
2030     case OP_TYPEQUERY:
2031     case OP_TYPEMINQUERY:
2032     if (*code >= OP_TYPESTAR)
2033     printf(" %s", OP_names[code[1]]);
2034     else if (isprint(c = code[1])) printf(" %c", c);
2035     else printf(" \\x%02x", c);
2036     printf("%s", OP_names[*code++]);
2037     break;
2038    
2039     case OP_EXACT:
2040     case OP_UPTO:
2041     case OP_MINUPTO:
2042     if (isprint(c = code[3])) printf(" %c{", c);
2043     else printf(" \\x%02x{", c);
2044     if (*code != OP_EXACT) printf(",");
2045     printf("%d}", (code[1] << 8) + code[2]);
2046     if (*code == OP_MINUPTO) printf("?");
2047     code += 3;
2048     break;
2049    
2050     case OP_TYPEEXACT:
2051     case OP_TYPEUPTO:
2052     case OP_TYPEMINUPTO:
2053     printf(" %s{", OP_names[code[3]]);
2054     if (*code != OP_TYPEEXACT) printf(",");
2055     printf("%d}", (code[1] << 8) + code[2]);
2056     if (*code == OP_TYPEMINUPTO) printf("?");
2057     code += 3;
2058     break;
2059    
2060     case OP_NOT:
2061     if (isprint(c = *(++code))) printf(" [^%c]", c);
2062     else printf(" [^\\x%02x]", c);
2063     break;
2064    
2065     case OP_NOTSTAR:
2066     case OP_NOTMINSTAR:
2067     case OP_NOTPLUS:
2068     case OP_NOTMINPLUS:
2069     case OP_NOTQUERY:
2070     case OP_NOTMINQUERY:
2071     if (isprint(c = code[1])) printf(" [^%c]", c);
2072     else printf(" [^\\x%02x]", c);
2073     printf("%s", OP_names[*code++]);
2074     break;
2075    
2076     case OP_NOTEXACT:
2077     case OP_NOTUPTO:
2078     case OP_NOTMINUPTO:
2079     if (isprint(c = code[3])) printf(" [^%c]{", c);
2080     else printf(" [^\\x%02x]{", c);
2081     if (*code != OP_NOTEXACT) printf(",");
2082     printf("%d}", (code[1] << 8) + code[2]);
2083     if (*code == OP_NOTMINUPTO) printf("?");
2084     code += 3;
2085     break;
2086    
2087     case OP_REF:
2088     printf(" \\%d", *(++code));
2089 nigel 9 code ++;
2090     goto CLASS_REF_REPEAT;
2091 nigel 3
2092     case OP_CLASS:
2093     {
2094     int i, min, max;
2095    
2096     code++;
2097     printf(" [");
2098    
2099     for (i = 0; i < 256; i++)
2100     {
2101     if ((code[i/8] & (1 << (i&7))) != 0)
2102     {
2103     int j;
2104     for (j = i+1; j < 256; j++)
2105     if ((code[j/8] & (1 << (j&7))) == 0) break;
2106     if (i == '-' || i == ']') printf("\\");
2107     if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2108     if (--j > i)
2109     {
2110     printf("-");
2111     if (j == '-' || j == ']') printf("\\");
2112     if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2113     }
2114     i = j;
2115     }
2116     }
2117     printf("]");
2118     code += 32;
2119    
2120 nigel 9 CLASS_REF_REPEAT:
2121    
2122 nigel 3 switch(*code)
2123     {
2124     case OP_CRSTAR:
2125     case OP_CRMINSTAR:
2126     case OP_CRPLUS:
2127     case OP_CRMINPLUS:
2128     case OP_CRQUERY:
2129     case OP_CRMINQUERY:
2130     printf("%s", OP_names[*code]);
2131     break;
2132    
2133     case OP_CRRANGE:
2134     case OP_CRMINRANGE:
2135     min = (code[1] << 8) + code[2];
2136     max = (code[3] << 8) + code[4];
2137     if (max == 0) printf("{%d,}", min);
2138     else printf("{%d,%d}", min, max);
2139     if (*code == OP_CRMINRANGE) printf("?");
2140     code += 4;
2141     break;
2142    
2143     default:
2144     code--;
2145     }
2146     }
2147     break;
2148    
2149     /* Anything else is just a one-node item */
2150    
2151     default:
2152     printf(" %s", OP_names[*code]);
2153     break;
2154     }
2155    
2156     code++;
2157     printf("\n");
2158     }
2159     printf("------------------------------------------------------------------\n");
2160    
2161     /* This check is done here in the debugging case so that the code that
2162     was compiled can be seen. */
2163    
2164     if (code - re->code > length)
2165     {
2166     *errorptr = ERR23;
2167     (pcre_free)(re);
2168     *erroroffset = ptr - (uschar *)pattern;
2169     return NULL;
2170     }
2171     #endif
2172    
2173     return (pcre *)re;
2174     }
2175    
2176    
2177    
2178     /*************************************************
2179     * Match a character type *
2180     *************************************************/
2181    
2182     /* Not used in all the places it might be as it's sometimes faster
2183     to put the code inline.
2184    
2185     Arguments:
2186     type the character type
2187     c the character
2188     dotall the dotall flag
2189    
2190     Returns: TRUE if character is of the type
2191     */
2192    
2193     static BOOL
2194     match_type(int type, int c, BOOL dotall)
2195     {
2196    
2197     #ifdef DEBUG
2198     if (isprint(c)) printf("matching subject %c against ", c);
2199     else printf("matching subject \\x%02x against ", c);
2200     printf("%s\n", OP_names[type]);
2201     #endif
2202    
2203     switch(type)
2204     {
2205     case OP_ANY: return dotall || c != '\n';
2206     case OP_NOT_DIGIT: return (pcre_ctypes[c] & ctype_digit) == 0;
2207     case OP_DIGIT: return (pcre_ctypes[c] & ctype_digit) != 0;
2208     case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;
2209     case OP_WHITESPACE: return (pcre_ctypes[c] & ctype_space) != 0;
2210     case OP_NOT_WORDCHAR: return (pcre_ctypes[c] & ctype_word) == 0;
2211     case OP_WORDCHAR: return (pcre_ctypes[c] & ctype_word) != 0;
2212     }
2213     return FALSE;
2214     }
2215    
2216    
2217    
2218     /*************************************************
2219     * Match a back-reference *
2220     *************************************************/
2221    
2222     /* If a back reference hasn't been set, the match fails.
2223    
2224     Arguments:
2225     number reference number
2226     eptr points into the subject
2227     length length to be matched
2228     md points to match data block
2229    
2230     Returns: TRUE if matched
2231     */
2232    
2233     static BOOL
2234 nigel 7 match_ref(int number, register const uschar *eptr, int length, match_data *md)
2235 nigel 3 {
2236 nigel 7 const uschar *p = md->start_subject + md->offset_vector[number];
2237 nigel 3
2238     #ifdef DEBUG
2239     if (eptr >= md->end_subject)
2240     printf("matching subject <null>");
2241     else
2242     {
2243     printf("matching subject ");
2244     pchars(eptr, length, TRUE, md);
2245     }
2246     printf(" against backref ");
2247     pchars(p, length, FALSE, md);
2248     printf("\n");
2249     #endif
2250    
2251     /* Always fail if not enough characters left */
2252    
2253     if (length > md->end_subject - p) return FALSE;
2254    
2255     /* Separate the caselesss case for speed */
2256    
2257     if (md->caseless)
2258     { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; }
2259     else
2260     { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
2261    
2262     return TRUE;
2263     }
2264    
2265    
2266    
2267     /*************************************************
2268     * Match from current position *
2269     *************************************************/
2270    
2271     /* On entry ecode points to the first opcode, and eptr to the first character.
2272    
2273     Arguments:
2274     eptr pointer in subject
2275     ecode position in code
2276     offset_top current top pointer
2277     md pointer to "static" info for the match
2278    
2279     Returns: TRUE if matched
2280     */
2281    
2282     static BOOL
2283 nigel 7 match(register const uschar *eptr, register const uschar *ecode, int offset_top,
2284 nigel 3 match_data *md)
2285     {
2286     for (;;)
2287     {
2288     int min, max, ctype;
2289     register int i;
2290     register int c;
2291 nigel 7 BOOL minimize = FALSE;
2292 nigel 3
2293     /* Opening bracket. Check the alternative branches in turn, failing if none
2294     match. We have to set the start offset if required and there is space
2295     in the offset vector so that it is available for subsequent back references
2296     if the bracket matches. However, if the bracket fails, we must put back the
2297     previous value of both offsets in case they were set by a previous copy of
2298     the same bracket. Don't worry about setting the flag for the error case here;
2299     that is handled in the code for KET. */
2300    
2301     if ((int)*ecode >= OP_BRA)
2302     {
2303     int number = (*ecode - OP_BRA) << 1;
2304 nigel 7 int save_offset1 = 0, save_offset2 = 0;
2305 nigel 3
2306 nigel 9 DPRINTF(("start bracket %d\n", number/2));
2307 nigel 3
2308     if (number > 0 && number < md->offset_end)
2309     {
2310     save_offset1 = md->offset_vector[number];
2311     save_offset2 = md->offset_vector[number+1];
2312     md->offset_vector[number] = eptr - md->start_subject;
2313    
2314 nigel 9 DPRINTF(("saving %d %d\n", save_offset1, save_offset2));
2315 nigel 3 }
2316    
2317     /* Recurse for all the alternatives. */
2318    
2319     do
2320     {
2321     if (match(eptr, ecode+3, offset_top, md)) return TRUE;
2322     ecode += (ecode[1] << 8) + ecode[2];
2323     }
2324     while (*ecode == OP_ALT);
2325    
2326 nigel 9 DPRINTF(("bracket %d failed\n", number/2));
2327 nigel 3
2328     if (number > 0 && number < md->offset_end)
2329     {
2330     md->offset_vector[number] = save_offset1;
2331     md->offset_vector[number+1] = save_offset2;
2332     }
2333    
2334     return FALSE;
2335     }
2336    
2337     /* Other types of node can be handled by a switch */
2338    
2339     switch(*ecode)
2340     {
2341     case OP_END:
2342     md->end_match_ptr = eptr; /* Record where we ended */
2343     md->end_offset_top = offset_top; /* and how many extracts were taken */
2344     return TRUE;
2345    
2346     /* The equivalent of Prolog's "cut" - if the rest doesn't match, the
2347     whole thing doesn't match, so we have to get out via a longjmp(). */
2348    
2349     case OP_CUT:
2350     if (match(eptr, ecode+1, offset_top, md)) return TRUE;
2351     longjmp(md->fail_env, 1);
2352    
2353     /* Assertion brackets. Check the alternative branches in turn - the
2354     matching won't pass the KET for an assertion. If any one branch matches,
2355     the assertion is true. */
2356    
2357     case OP_ASSERT:
2358     do
2359     {
2360     if (match(eptr, ecode+3, offset_top, md)) break;
2361     ecode += (ecode[1] << 8) + ecode[2];
2362     }
2363     while (*ecode == OP_ALT);
2364     if (*ecode == OP_KET) return FALSE;
2365    
2366     /* Continue from after the assertion, updating the offsets high water
2367     mark, since extracts may have been taken during the assertion. */
2368    
2369     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2370     ecode += 3;
2371     offset_top = md->end_offset_top;
2372     continue;
2373    
2374     /* Negative assertion: all branches must fail to match */
2375    
2376     case OP_ASSERT_NOT:
2377     do
2378     {
2379     if (match(eptr, ecode+3, offset_top, md)) return FALSE;
2380     ecode += (ecode[1] << 8) + ecode[2];
2381     }
2382     while (*ecode == OP_ALT);
2383     ecode += 3;
2384     continue;
2385    
2386     /* "Once" brackets are like assertion brackets except that after a match,
2387     the point in the subject string is not moved back. Thus there can never be
2388 nigel 5 a move back into the brackets. Check the alternative branches in turn - the
2389 nigel 3 matching won't pass the KET for this kind of subpattern. If any one branch
2390     matches, we carry on, leaving the subject pointer. */
2391    
2392     case OP_ONCE:
2393     do
2394     {
2395     if (match(eptr, ecode+3, offset_top, md)) break;
2396     ecode += (ecode[1] << 8) + ecode[2];
2397     }
2398     while (*ecode == OP_ALT);
2399     if (*ecode == OP_KET) return FALSE;
2400    
2401     /* Continue as from after the assertion, updating the offsets high water
2402     mark, since extracts may have been taken. */
2403    
2404     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2405     ecode += 3;
2406     offset_top = md->end_offset_top;
2407     eptr = md->end_match_ptr;
2408     continue;
2409    
2410     /* An alternation is the end of a branch; scan along to find the end of the
2411     bracketed group and go to there. */
2412    
2413     case OP_ALT:
2414     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2415     break;
2416    
2417     /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
2418     that it may occur zero times. It may repeat infinitely, or not at all -
2419     i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
2420     repeat limits are compiled as a number of copies, with the optional ones
2421     preceded by BRAZERO or BRAMINZERO. */
2422    
2423     case OP_BRAZERO:
2424     {
2425 nigel 7 const uschar *next = ecode+1;
2426 nigel 3 if (match(eptr, next, offset_top, md)) return TRUE;
2427     do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
2428     ecode = next + 3;
2429     }
2430     break;
2431    
2432     case OP_BRAMINZERO:
2433     {
2434 nigel 7 const uschar *next = ecode+1;
2435 nigel 3 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
2436     if (match(eptr, next+3, offset_top, md)) return TRUE;
2437     ecode++;
2438     }
2439     break;;
2440    
2441     /* End of a group, repeated or non-repeating. If we are at the end of
2442     an assertion "group", stop matching and return TRUE, but record the
2443     current high water mark for use by positive assertions. */
2444    
2445     case OP_KET:
2446     case OP_KETRMIN:
2447     case OP_KETRMAX:
2448     {
2449     int number;
2450 nigel 7 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
2451 nigel 3
2452     if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || *prev == OP_ONCE)
2453     {
2454     md->end_match_ptr = eptr; /* For ONCE */
2455     md->end_offset_top = offset_top;
2456     return TRUE;
2457     }
2458    
2459     /* In all other cases we have to check the group number back at the
2460     start and if necessary complete handling an extraction by setting the
2461     final offset and bumping the high water mark. */
2462    
2463     number = (*prev - OP_BRA) << 1;
2464    
2465 nigel 9 DPRINTF(("end bracket %d\n", number/2));
2466 nigel 3
2467     if (number > 0)
2468     {
2469     if (number >= md->offset_end) md->offset_overflow = TRUE; else
2470     {
2471     md->offset_vector[number+1] = eptr - md->start_subject;
2472     if (offset_top <= number) offset_top = number + 2;
2473     }
2474     }
2475    
2476     /* For a non-repeating ket, just advance to the next node and continue at
2477     this level. */
2478    
2479     if (*ecode == OP_KET)
2480     {
2481     ecode += 3;
2482     break;
2483     }
2484    
2485     /* The repeating kets try the rest of the pattern or restart from the
2486     preceding bracket, in the appropriate order. */
2487    
2488     if (*ecode == OP_KETRMIN)
2489     {
2490     if (match(eptr, ecode+3, offset_top, md) ||
2491     match(eptr, prev, offset_top, md)) return TRUE;
2492     }
2493     else /* OP_KETRMAX */
2494     {
2495     if (match(eptr, prev, offset_top, md) ||
2496     match(eptr, ecode+3, offset_top, md)) return TRUE;
2497     }
2498     }
2499     return FALSE;
2500    
2501     /* Start of subject unless notbol, or after internal newline if multiline */
2502    
2503     case OP_CIRC:
2504     if (md->notbol && eptr == md->start_subject) return FALSE;
2505     if (md->multiline)
2506     {
2507     if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;
2508     ecode++;
2509     break;
2510     }
2511     /* ... else fall through */
2512    
2513     /* Start of subject assertion */
2514    
2515     case OP_SOD:
2516     if (eptr != md->start_subject) return FALSE;
2517     ecode++;
2518     break;
2519    
2520     /* Assert before internal newline if multiline, or before
2521     a terminating newline unless endonly is set, else end of subject unless
2522     noteol is set. */
2523    
2524     case OP_DOLL:
2525     if (md->noteol && eptr >= md->end_subject) return FALSE;
2526     if (md->multiline)
2527     {
2528     if (eptr < md->end_subject && *eptr != '\n') return FALSE;
2529     ecode++;
2530     break;
2531     }
2532     else if (!md->endonly)
2533     {
2534     if (eptr < md->end_subject - 1 ||
2535     (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
2536     ecode++;
2537     break;
2538     }
2539     /* ... else fall through */
2540    
2541     /* End of subject assertion */
2542    
2543     case OP_EOD:
2544     if (eptr < md->end_subject) return FALSE;
2545     ecode++;
2546     break;
2547    
2548     /* Word boundary assertions */
2549    
2550     case OP_NOT_WORD_BOUNDARY:
2551     case OP_WORD_BOUNDARY:
2552     {
2553     BOOL prev_is_word = (eptr != md->start_subject) &&
2554     ((pcre_ctypes[eptr[-1]] & ctype_word) != 0);
2555     BOOL cur_is_word = (eptr < md->end_subject) &&
2556     ((pcre_ctypes[*eptr] & ctype_word) != 0);
2557     if ((*ecode++ == OP_WORD_BOUNDARY)?
2558     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2559     return FALSE;
2560     }
2561     break;
2562    
2563     /* Match a single character type; inline for speed */
2564    
2565     case OP_ANY:
2566     if (!md->dotall && eptr < md->end_subject && *eptr == '\n') return FALSE;
2567     if (eptr++ >= md->end_subject) return FALSE;
2568     ecode++;
2569     break;
2570    
2571     case OP_NOT_DIGIT:
2572     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) != 0)
2573     return FALSE;
2574     ecode++;
2575     break;
2576    
2577     case OP_DIGIT:
2578     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) == 0)
2579     return FALSE;
2580     ecode++;
2581     break;
2582    
2583     case OP_NOT_WHITESPACE:
2584     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) != 0)
2585     return FALSE;
2586     ecode++;
2587     break;
2588    
2589     case OP_WHITESPACE:
2590     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) == 0)
2591     return FALSE;
2592     ecode++;
2593     break;
2594    
2595     case OP_NOT_WORDCHAR:
2596     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) != 0)
2597     return FALSE;
2598     ecode++;
2599     break;
2600    
2601     case OP_WORDCHAR:
2602     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) == 0)
2603     return FALSE;
2604     ecode++;
2605     break;
2606    
2607     /* Match a back reference, possibly repeatedly. Look past the end of the
2608     item to see if there is repeat information following. The code is similar
2609     to that for character classes, but repeated for efficiency. Then obey
2610     similar code to character type repeats - written out again for speed.
2611     However, if the referenced string is the empty string, always treat
2612     it as matched, any number of times (otherwise there could be infinite
2613     loops). */
2614    
2615     case OP_REF:
2616     {
2617     int length;
2618     int number = ecode[1] << 1; /* Doubled reference number */
2619     ecode += 2; /* Advance past the item */
2620    
2621     if (number >= offset_top || md->offset_vector[number] < 0)
2622     {
2623     md->errorcode = PCRE_ERROR_BADREF;
2624     return FALSE;
2625     }
2626    
2627     length = md->offset_vector[number+1] - md->offset_vector[number];
2628    
2629     switch (*ecode)
2630     {
2631     case OP_CRSTAR:
2632     case OP_CRMINSTAR:
2633     case OP_CRPLUS:
2634     case OP_CRMINPLUS:
2635     case OP_CRQUERY:
2636     case OP_CRMINQUERY:
2637     c = *ecode++ - OP_CRSTAR;
2638     minimize = (c & 1) != 0;
2639     min = rep_min[c]; /* Pick up values from tables; */
2640     max = rep_max[c]; /* zero for max => infinity */
2641     if (max == 0) max = INT_MAX;
2642     break;
2643    
2644     case OP_CRRANGE:
2645     case OP_CRMINRANGE:
2646     minimize = (*ecode == OP_CRMINRANGE);
2647     min = (ecode[1] << 8) + ecode[2];
2648     max = (ecode[3] << 8) + ecode[4];
2649     if (max == 0) max = INT_MAX;
2650     ecode += 5;
2651     break;
2652    
2653     default: /* No repeat follows */
2654     if (!match_ref(number, eptr, length, md)) return FALSE;
2655     eptr += length;
2656     continue; /* With the main loop */
2657     }
2658    
2659     /* If the length of the reference is zero, just continue with the
2660     main loop. */
2661    
2662     if (length == 0) continue;
2663    
2664     /* First, ensure the minimum number of matches are present. We get back
2665     the length of the reference string explicitly rather than passing the
2666     address of eptr, so that eptr can be a register variable. */
2667    
2668     for (i = 1; i <= min; i++)
2669     {
2670     if (!match_ref(number, eptr, length, md)) return FALSE;
2671     eptr += length;
2672     }
2673    
2674     /* If min = max, continue at the same level without recursion.
2675     They are not both allowed to be zero. */
2676    
2677     if (min == max) continue;
2678    
2679     /* If minimizing, keep trying and advancing the pointer */
2680    
2681     if (minimize)
2682     {
2683     for (i = min;; i++)
2684     {
2685     if (match(eptr, ecode, offset_top, md)) return TRUE;
2686     if (i >= max || !match_ref(number, eptr, length, md))
2687     return FALSE;
2688     eptr += length;
2689     }
2690     /* Control never gets here */
2691     }
2692    
2693     /* If maximizing, find the longest string and work backwards */
2694    
2695     else
2696     {
2697 nigel 7 const uschar *pp = eptr;
2698 nigel 3 for (i = min; i < max; i++)
2699     {
2700     if (!match_ref(number, eptr, length, md)) break;
2701     eptr += length;
2702     }
2703     while (eptr >= pp)
2704     {
2705     if (match(eptr, ecode, offset_top, md)) return TRUE;
2706     eptr -= length;
2707     }
2708     return FALSE;
2709     }
2710     }
2711     /* Control never gets here */
2712    
2713     /* Match a character class, possibly repeatedly. Look past the end of the
2714     item to see if there is repeat information following. Then obey similar
2715     code to character type repeats - written out again for speed. If caseless
2716     matching was set at runtime but not at compile time, we have to check both
2717     versions of a character. */
2718    
2719     case OP_CLASS:
2720     {
2721 nigel 7 const uschar *data = ecode + 1; /* Save for matching */
2722     ecode += 33; /* Advance past the item */
2723 nigel 3
2724     switch (*ecode)
2725     {
2726     case OP_CRSTAR:
2727     case OP_CRMINSTAR:
2728     case OP_CRPLUS:
2729     case OP_CRMINPLUS:
2730     case OP_CRQUERY:
2731     case OP_CRMINQUERY:
2732     c = *ecode++ - OP_CRSTAR;
2733     minimize = (c & 1) != 0;
2734     min = rep_min[c]; /* Pick up values from tables; */
2735     max = rep_max[c]; /* zero for max => infinity */
2736     if (max == 0) max = INT_MAX;
2737     break;
2738    
2739     case OP_CRRANGE:
2740     case OP_CRMINRANGE:
2741     minimize = (*ecode == OP_CRMINRANGE);
2742     min = (ecode[1] << 8) + ecode[2];
2743     max = (ecode[3] << 8) + ecode[4];
2744     if (max == 0) max = INT_MAX;
2745     ecode += 5;
2746     break;
2747    
2748     default: /* No repeat follows */
2749     if (eptr >= md->end_subject) return FALSE;
2750     c = *eptr++;
2751     if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
2752     if (md->runtime_caseless)
2753     {
2754     c = pcre_fcc[c];
2755     if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
2756     }
2757     return FALSE;
2758     }
2759    
2760     /* First, ensure the minimum number of matches are present. */
2761    
2762     for (i = 1; i <= min; i++)
2763     {
2764     if (eptr >= md->end_subject) return FALSE;
2765     c = *eptr++;
2766     if ((data[c/8] & (1 << (c&7))) != 0) continue;
2767     if (md->runtime_caseless)
2768     {
2769     c = pcre_fcc[c];
2770     if ((data[c/8] & (1 << (c&7))) != 0) continue;
2771     }
2772     return FALSE;
2773     }
2774    
2775     /* If max == min we can continue with the main loop without the
2776     need to recurse. */
2777    
2778     if (min == max) continue;
2779    
2780     /* If minimizing, keep testing the rest of the expression and advancing
2781     the pointer while it matches the class. */
2782    
2783     if (minimize)
2784     {
2785     for (i = min;; i++)
2786     {
2787     if (match(eptr, ecode, offset_top, md)) return TRUE;
2788     if (i >= max || eptr >= md->end_subject) return FALSE;
2789     c = *eptr++;
2790     if ((data[c/8] & (1 << (c&7))) != 0) continue;
2791     if (md->runtime_caseless)
2792     {
2793     c = pcre_fcc[c];
2794     if ((data[c/8] & (1 << (c&7))) != 0) continue;
2795     }
2796     return FALSE;
2797     }
2798     /* Control never gets here */
2799     }
2800    
2801     /* If maximizing, find the longest possible run, then work backwards. */
2802    
2803     else
2804     {
2805 nigel 7 const uschar *pp = eptr;
2806 nigel 3 for (i = min; i < max; eptr++, i++)
2807     {
2808     if (eptr >= md->end_subject) break;
2809     c = *eptr;
2810     if ((data[c/8] & (1 << (c&7))) != 0) continue;
2811     if (md->runtime_caseless)
2812     {
2813     c = pcre_fcc[c];
2814     if ((data[c/8] & (1 << (c&7))) != 0) continue;
2815     }
2816     break;
2817     }
2818    
2819     while (eptr >= pp)
2820     if (match(eptr--, ecode, offset_top, md)) return TRUE;
2821     return FALSE;
2822     }
2823     }
2824     /* Control never gets here */
2825    
2826     /* Match a run of characters */
2827    
2828     case OP_CHARS:
2829     {
2830     register int length = ecode[1];
2831     ecode += 2;
2832    
2833 nigel 9 #ifdef DEBUG /* Sigh. Some compilers never learn. */
2834 nigel 3 if (eptr >= md->end_subject)
2835     printf("matching subject <null> against pattern ");
2836     else
2837     {
2838     printf("matching subject ");
2839     pchars(eptr, length, TRUE, md);
2840     printf(" against pattern ");
2841     }
2842     pchars(ecode, length, FALSE, md);
2843     printf("\n");
2844 nigel 9 #endif
2845 nigel 3
2846     if (length > md->end_subject - eptr) return FALSE;
2847     if (md->caseless)
2848     {
2849     while (length-- > 0) if (pcre_lcc[*ecode++] != pcre_lcc[*eptr++]) return FALSE;
2850     }
2851     else
2852     {
2853     while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
2854     }
2855     }
2856     break;
2857    
2858     /* Match a single character repeatedly; different opcodes share code. */
2859    
2860     case OP_EXACT:
2861     min = max = (ecode[1] << 8) + ecode[2];
2862     ecode += 3;
2863     goto REPEATCHAR;
2864    
2865     case OP_UPTO:
2866     case OP_MINUPTO:
2867     min = 0;
2868     max = (ecode[1] << 8) + ecode[2];
2869     minimize = *ecode == OP_MINUPTO;
2870     ecode += 3;
2871     goto REPEATCHAR;
2872    
2873     case OP_STAR:
2874     case OP_MINSTAR:
2875     case OP_PLUS:
2876     case OP_MINPLUS:
2877     case OP_QUERY:
2878     case OP_MINQUERY:
2879     c = *ecode++ - OP_STAR;
2880     minimize = (c & 1) != 0;
2881     min = rep_min[c]; /* Pick up values from tables; */
2882     max = rep_max[c]; /* zero for max => infinity */
2883     if (max == 0) max = INT_MAX;
2884    
2885     /* Common code for all repeated single-character matches. We can give
2886     up quickly if there are fewer than the minimum number of characters left in
2887     the subject. */
2888    
2889     REPEATCHAR:
2890     if (min > md->end_subject - eptr) return FALSE;
2891     c = *ecode++;
2892    
2893     /* The code is duplicated for the caseless and caseful cases, for speed,
2894     since matching characters is likely to be quite common. First, ensure the
2895     minimum number of matches are present. If min = max, continue at the same
2896     level without recursing. Otherwise, if minimizing, keep trying the rest of
2897     the expression and advancing one matching character if failing, up to the
2898     maximum. Alternatively, if maximizing, find the maximum number of
2899     characters and work backwards. */
2900    
2901 nigel 9 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
2902     max, eptr));
2903 nigel 3
2904     if (md->caseless)
2905     {
2906     c = pcre_lcc[c];
2907     for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) return FALSE;
2908     if (min == max) continue;
2909     if (minimize)
2910     {
2911     for (i = min;; i++)
2912     {
2913     if (match(eptr, ecode, offset_top, md)) return TRUE;
2914     if (i >= max || eptr >= md->end_subject || c != pcre_lcc[*eptr++])
2915     return FALSE;
2916     }
2917     /* Control never gets here */
2918     }
2919     else
2920     {
2921 nigel 7 const uschar *pp = eptr;
2922 nigel 3 for (i = min; i < max; i++)
2923     {
2924     if (eptr >= md->end_subject || c != pcre_lcc[*eptr]) break;
2925     eptr++;
2926     }
2927     while (eptr >= pp)
2928     if (match(eptr--, ecode, offset_top, md)) return TRUE;
2929     return FALSE;
2930     }
2931     /* Control never gets here */
2932     }
2933    
2934     /* Caseful comparisons */
2935    
2936     else
2937     {
2938     for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
2939     if (min == max) continue;
2940     if (minimize)
2941     {
2942     for (i = min;; i++)
2943     {
2944     if (match(eptr, ecode, offset_top, md)) return TRUE;
2945     if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
2946     }
2947     /* Control never gets here */
2948     }
2949     else
2950     {
2951 nigel 7 const uschar *pp = eptr;
2952 nigel 3 for (i = min; i < max; i++)
2953     {
2954     if (eptr >= md->end_subject || c != *eptr) break;
2955     eptr++;
2956     }
2957     while (eptr >= pp)
2958     if (match(eptr--, ecode, offset_top, md)) return TRUE;
2959     return FALSE;
2960     }
2961     }
2962     /* Control never gets here */
2963    
2964     /* Match a negated single character */
2965    
2966     case OP_NOT:
2967 nigel 9 if (eptr >= md->end_subject) return FALSE;
2968 nigel 3 ecode++;
2969     if (md->caseless)
2970     {
2971     if (pcre_lcc[*ecode++] == pcre_lcc[*eptr++]) return FALSE;
2972     }
2973     else
2974     {
2975     if (*ecode++ == *eptr++) return FALSE;
2976     }
2977     break;
2978    
2979     /* Match a negated single character repeatedly. This is almost a repeat of
2980     the code for a repeated single character, but I haven't found a nice way of
2981     commoning these up that doesn't require a test of the positive/negative
2982     option for each character match. Maybe that wouldn't add very much to the
2983     time taken, but character matching *is* what this is all about... */
2984    
2985     case OP_NOTEXACT:
2986     min = max = (ecode[1] << 8) + ecode[2];
2987     ecode += 3;
2988     goto REPEATNOTCHAR;
2989    
2990     case OP_NOTUPTO:
2991     case OP_NOTMINUPTO:
2992     min = 0;
2993     max = (ecode[1] << 8) + ecode[2];
2994     minimize = *ecode == OP_NOTMINUPTO;
2995     ecode += 3;
2996     goto REPEATNOTCHAR;
2997    
2998     case OP_NOTSTAR:
2999     case OP_NOTMINSTAR:
3000     case OP_NOTPLUS:
3001     case OP_NOTMINPLUS:
3002     case OP_NOTQUERY:
3003     case OP_NOTMINQUERY:
3004     c = *ecode++ - OP_NOTSTAR;
3005     minimize = (c & 1) != 0;
3006     min = rep_min[c]; /* Pick up values from tables; */
3007     max = rep_max[c]; /* zero for max => infinity */
3008     if (max == 0) max = INT_MAX;
3009    
3010     /* Common code for all repeated single-character matches. We can give
3011     up quickly if there are fewer than the minimum number of characters left in
3012     the subject. */
3013    
3014     REPEATNOTCHAR:
3015     if (min > md->end_subject - eptr) return FALSE;
3016     c = *ecode++;
3017    
3018     /* The code is duplicated for the caseless and caseful cases, for speed,
3019     since matching characters is likely to be quite common. First, ensure the
3020     minimum number of matches are present. If min = max, continue at the same
3021     level without recursing. Otherwise, if minimizing, keep trying the rest of
3022     the expression and advancing one matching character if failing, up to the
3023     maximum. Alternatively, if maximizing, find the maximum number of
3024     characters and work backwards. */
3025    
3026 nigel 9 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
3027     max, eptr));
3028 nigel 3
3029     if (md->caseless)
3030     {
3031     c = pcre_lcc[c];
3032     for (i = 1; i <= min; i++) if (c == pcre_lcc[*eptr++]) return FALSE;
3033     if (min == max) continue;
3034     if (minimize)
3035     {
3036     for (i = min;; i++)
3037     {
3038     if (match(eptr, ecode, offset_top, md)) return TRUE;
3039     if (i >= max || eptr >= md->end_subject || c == pcre_lcc[*eptr++])
3040     return FALSE;
3041     }
3042     /* Control never gets here */
3043     }
3044     else
3045     {
3046 nigel 7 const uschar *pp = eptr;
3047 nigel 3 for (i = min; i < max; i++)
3048     {
3049     if (eptr >= md->end_subject || c == pcre_lcc[*eptr]) break;
3050     eptr++;
3051     }
3052     while (eptr >= pp)
3053     if (match(eptr--, ecode, offset_top, md)) return TRUE;
3054     return FALSE;
3055     }
3056     /* Control never gets here */
3057     }
3058    
3059     /* Caseful comparisons */
3060    
3061     else
3062     {
3063     for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
3064     if (min == max) continue;
3065     if (minimize)
3066     {
3067     for (i = min;; i++)
3068     {
3069     if (match(eptr, ecode, offset_top, md)) return TRUE;
3070     if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
3071     }
3072     /* Control never gets here */
3073     }
3074     else
3075     {
3076 nigel 7 const uschar *pp = eptr;
3077 nigel 3 for (i = min; i < max; i++)
3078     {
3079     if (eptr >= md->end_subject || c == *eptr) break;
3080     eptr++;
3081     }
3082     while (eptr >= pp)
3083     if (match(eptr--, ecode, offset_top, md)) return TRUE;
3084     return FALSE;
3085     }
3086     }
3087     /* Control never gets here */
3088    
3089     /* Match a single character type repeatedly; several different opcodes
3090     share code. This is very similar to the code for single characters, but we
3091     repeat it in the interests of efficiency. */
3092    
3093     case OP_TYPEEXACT:
3094     min = max = (ecode[1] << 8) + ecode[2];
3095     minimize = TRUE;
3096     ecode += 3;
3097     goto REPEATTYPE;
3098    
3099     case OP_TYPEUPTO:
3100     case OP_TYPEMINUPTO:
3101     min = 0;
3102     max = (ecode[1] << 8) + ecode[2];
3103     minimize = *ecode == OP_TYPEMINUPTO;
3104     ecode += 3;
3105     goto REPEATTYPE;
3106    
3107     case OP_TYPESTAR:
3108     case OP_TYPEMINSTAR:
3109     case OP_TYPEPLUS:
3110     case OP_TYPEMINPLUS:
3111     case OP_TYPEQUERY:
3112     case OP_TYPEMINQUERY:
3113     c = *ecode++ - OP_TYPESTAR;
3114     minimize = (c & 1) != 0;
3115     min = rep_min[c]; /* Pick up values from tables; */
3116     max = rep_max[c]; /* zero for max => infinity */
3117     if (max == 0) max = INT_MAX;
3118    
3119     /* Common code for all repeated single character type matches */
3120    
3121     REPEATTYPE:
3122     ctype = *ecode++; /* Code for the character type */
3123    
3124     /* First, ensure the minimum number of matches are present. Use inline
3125     code for maximizing the speed, and do the type test once at the start
3126     (i.e. keep it out of the loop). Also test that there are at least the
3127     minimum number of characters before we start. */
3128    
3129     if (min > md->end_subject - eptr) return FALSE;
3130     if (min > 0) switch(ctype)
3131     {
3132     case OP_ANY:
3133     if (!md->dotall)
3134     { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }
3135     else eptr += min;
3136     break;
3137    
3138     case OP_NOT_DIGIT:
3139     for (i = 1; i <= min; i++)
3140     if ((pcre_ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
3141     break;
3142    
3143     case OP_DIGIT:
3144     for (i = 1; i <= min; i++)
3145     if ((pcre_ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
3146     break;
3147    
3148     case OP_NOT_WHITESPACE:
3149     for (i = 1; i <= min; i++)
3150     if ((pcre_ctypes[*eptr++] & ctype_space) != 0) return FALSE;
3151     break;
3152    
3153     case OP_WHITESPACE:
3154     for (i = 1; i <= min; i++)
3155     if ((pcre_ctypes[*eptr++] & ctype_space) == 0) return FALSE;
3156     break;
3157    
3158     case OP_NOT_WORDCHAR:
3159     for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) != 0)
3160     return FALSE;
3161     break;
3162    
3163     case OP_WORDCHAR:
3164     for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) == 0)
3165     return FALSE;
3166     break;
3167     }
3168    
3169     /* If min = max, continue at the same level without recursing */
3170    
3171     if (min == max) continue;
3172    
3173     /* If minimizing, we have to test the rest of the pattern before each
3174     subsequent match, so inlining isn't much help; just use the function. */
3175    
3176     if (minimize)
3177     {
3178     for (i = min;; i++)
3179     {
3180     if (match(eptr, ecode, offset_top, md)) return TRUE;
3181     if (i >= max || eptr >= md->end_subject ||
3182     !match_type(ctype, *eptr++, md->dotall))
3183     return FALSE;
3184     }
3185     /* Control never gets here */
3186     }
3187    
3188     /* If maximizing it is worth using inline code for speed, doing the type
3189     test once at the start (i.e. keep it out of the loop). */
3190    
3191     else
3192     {
3193 nigel 7 const uschar *pp = eptr;
3194 nigel 3 switch(ctype)
3195     {
3196     case OP_ANY:
3197     if (!md->dotall)
3198     {
3199     for (i = min; i < max; i++)
3200     {
3201     if (eptr >= md->end_subject || *eptr == '\n') break;
3202     eptr++;
3203     }
3204     }
3205     else
3206     {
3207     c = max - min;
3208     if (c > md->end_subject - eptr) c = md->end_subject - eptr;
3209     eptr += c;
3210     }
3211     break;
3212    
3213     case OP_NOT_DIGIT:
3214     for (i = min; i < max; i++)
3215     {
3216     if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) != 0)
3217     break;
3218     eptr++;
3219     }
3220     break;
3221    
3222     case OP_DIGIT:
3223     for (i = min; i < max; i++)
3224     {
3225     if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) == 0)
3226     break;
3227     eptr++;
3228     }
3229     break;
3230    
3231     case OP_NOT_WHITESPACE:
3232     for (i = min; i < max; i++)
3233     {
3234     if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) != 0)
3235     break;
3236     eptr++;
3237     }
3238     break;
3239    
3240     case OP_WHITESPACE:
3241     for (i = min; i < max; i++)
3242     {
3243     if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) == 0)
3244     break;
3245     eptr++;
3246     }
3247     break;
3248    
3249     case OP_NOT_WORDCHAR:
3250     for (i = min; i < max; i++)
3251     {
3252     if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) != 0)
3253     break;
3254     eptr++;
3255     }
3256     break;
3257    
3258     case OP_WORDCHAR:
3259     for (i = min; i < max; i++)
3260     {
3261     if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) == 0)
3262     break;
3263     eptr++;
3264     }
3265     break;
3266     }
3267    
3268     while (eptr >= pp)
3269     if (match(eptr--, ecode, offset_top, md)) return TRUE;
3270     return FALSE;
3271     }
3272     /* Control never gets here */
3273    
3274     /* There's been some horrible disaster. */
3275    
3276     default:
3277 nigel 9 DPRINTF(("Unknown opcode %d\n", *ecode));
3278 nigel 3 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
3279     return FALSE;
3280     }
3281    
3282     /* Do not stick any code in here without much thought; it is assumed
3283     that "continue" in the code above comes out to here to repeat the main
3284     loop. */
3285    
3286     } /* End of main loop */
3287     /* Control never reaches here */
3288     }
3289    
3290    
3291    
3292     /*************************************************
3293 nigel 9 * Segregate setjmp() *
3294     *************************************************/
3295    
3296     /* The -Wall option of gcc gives warnings for all local variables when setjmp()
3297     is used, even if the coding conforms to the rules of ANSI C. To avoid this, we
3298     hide it in a separate function. This is called only when PCRE_EXTRA is set,
3299     since it's needed only for the extension \X option, and with any luck, a good
3300     compiler will spot the tail recursion and compile it efficiently.
3301    
3302     Arguments: The block containing the match data
3303     Returns: The return from setjump()
3304     */
3305    
3306     static int
3307     my_setjmp(match_data *match_block)
3308     {
3309     return setjmp(match_block->fail_env);
3310     }
3311    
3312    
3313    
3314     /*************************************************
3315 nigel 3 * Execute a Regular Expression *
3316     *************************************************/
3317    
3318     /* This function applies a compiled re to a subject string and picks out
3319     portions of the string if it matches. Two elements in the vector are set for
3320     each substring: the offsets to the start and end of the substring.
3321    
3322     Arguments:
3323     external_re points to the compiled expression
3324     external_extra points to "hints" from pcre_study() or is NULL
3325     subject points to the subject string
3326     length length of subject string (may contain binary zeros)
3327     options option bits
3328     offsets points to a vector of ints to be filled in with offsets
3329     offsetcount the number of elements in the vector
3330    
3331     Returns: > 0 => success; value is the number of elements filled in
3332     = 0 => success, but offsets is not big enough
3333     -1 => failed to match
3334     < -1 => some kind of unexpected problem
3335     */
3336    
3337     int
3338     pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
3339     const char *subject, int length, int options, int *offsets, int offsetcount)
3340     {
3341     int resetcount;
3342     int ocount = offsetcount;
3343     int first_char = -1;
3344     match_data match_block;
3345 nigel 7 const uschar *start_bits = NULL;
3346 nigel 9 const uschar *start_match = (const uschar *)subject;
3347 nigel 7 const uschar *end_subject;
3348     const real_pcre *re = (const real_pcre *)external_re;
3349     const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
3350 nigel 3 BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
3351     BOOL startline = (re->options & PCRE_STARTLINE) != 0;
3352    
3353     if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3354    
3355     if (re == NULL || subject == NULL ||
3356     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3357     if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
3358    
3359 nigel 7 match_block.start_subject = (const uschar *)subject;
3360 nigel 3 match_block.end_subject = match_block.start_subject + length;
3361     end_subject = match_block.end_subject;
3362    
3363     match_block.caseless = ((re->options | options) & PCRE_CASELESS) != 0;
3364     match_block.runtime_caseless = match_block.caseless &&
3365     (re->options & PCRE_CASELESS) == 0;
3366    
3367     match_block.multiline = ((re->options | options) & PCRE_MULTILINE) != 0;
3368     match_block.dotall = ((re->options | options) & PCRE_DOTALL) != 0;
3369     match_block.endonly = ((re->options | options) & PCRE_DOLLAR_ENDONLY) != 0;
3370    
3371     match_block.notbol = (options & PCRE_NOTBOL) != 0;
3372     match_block.noteol = (options & PCRE_NOTEOL) != 0;
3373    
3374     match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
3375    
3376     /* If the expression has got more back references than the offsets supplied can
3377     hold, we get a temporary bit of working store to use during the matching.
3378     Otherwise, we can use the vector supplied, rounding down the size of it to a
3379     multiple of 2. */
3380    
3381     ocount &= (-2);
3382     if (re->top_backref > 0 && re->top_backref + 1 >= ocount/2)
3383     {
3384     ocount = re->top_backref * 2 + 2;
3385     match_block.offset_vector = (pcre_malloc)(ocount * sizeof(int));
3386     if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
3387 nigel 9 DPRINTF(("Got memory to hold back references\n"));
3388 nigel 3 }
3389     else match_block.offset_vector = offsets;
3390    
3391     match_block.offset_end = ocount;
3392     match_block.offset_overflow = FALSE;
3393    
3394     /* Compute the minimum number of offsets that we need to reset each time. Doing
3395     this makes a huge difference to execution time when there aren't many brackets
3396     in the pattern. */
3397    
3398     resetcount = 2 + re->top_bracket * 2;
3399     if (resetcount > offsetcount) resetcount = ocount;
3400    
3401     /* If MULTILINE is set at exec time but was not set at compile time, and the
3402     anchored flag is set, we must re-check because a setting provoked by ^ in the
3403     pattern is not right in multi-line mode. Calling is_anchored() again here does
3404     the right check, because multiline is now set. If it now yields FALSE, the
3405     expression must have had ^ starting some of its branches. Check to see if
3406     that is true for *all* branches, and if so, set the startline flag. */
3407    
3408     if (match_block. multiline && anchored && (re->options & PCRE_MULTILINE) == 0 &&
3409     !is_anchored(re->code, match_block.multiline))
3410     {
3411     anchored = FALSE;
3412     if (is_startline(re->code)) startline = TRUE;
3413     }
3414    
3415     /* Set up the first character to match, if available. The first_char value is
3416     never set for an anchored regular expression, but the anchoring may be forced
3417     at run time, so we have to test for anchoring. The first char may be unset for
3418     an unanchored pattern, of course. If there's no first char and the pattern was
3419     studied, the may be a bitmap of possible first characters. However, we can
3420     use this only if the caseless state of the studying was correct. */
3421    
3422     if (!anchored)
3423     {
3424     if ((re->options & PCRE_FIRSTSET) != 0)
3425     {
3426     first_char = re->first_char;
3427     if (match_block.caseless) first_char = pcre_lcc[first_char];
3428     }
3429     else
3430     if (!startline && extra != NULL &&
3431     (extra->options & PCRE_STUDY_MAPPED) != 0 &&
3432     ((extra->options & PCRE_STUDY_CASELESS) != 0) == match_block.caseless)
3433     start_bits = extra->start_bits;
3434     }
3435    
3436     /* Loop for unanchored matches; for anchored regexps the loop runs just once. */
3437    
3438     do
3439     {
3440 nigel 9 int rc;
3441 nigel 3 register int *iptr = match_block.offset_vector;
3442     register int *iend = iptr + resetcount;
3443    
3444     /* Reset the maximum number of extractions we might see. */
3445    
3446     while (iptr < iend) *iptr++ = -1;
3447    
3448     /* Advance to a unique first char if possible */
3449    
3450     if (first_char >= 0)
3451     {
3452     if (match_block.caseless)
3453     while (start_match < end_subject && pcre_lcc[*start_match] != first_char)
3454     start_match++;
3455     else
3456     while (start_match < end_subject && *start_match != first_char)
3457     start_match++;
3458     }
3459    
3460     /* Or to just after \n for a multiline match if possible */
3461    
3462     else if (startline)
3463     {
3464     if (start_match > match_block.start_subject)
3465     {
3466     while (start_match < end_subject && start_match[-1] != '\n')
3467     start_match++;
3468     }
3469     }
3470    
3471     /* Or to a non-unique first char */
3472    
3473     else if (start_bits != NULL)
3474     {
3475     while (start_match < end_subject)
3476     {
3477     register int c = *start_match;
3478     if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
3479     }
3480     }
3481    
3482 nigel 9 #ifdef DEBUG /* Sigh. Some compilers never learn. */
3483 nigel 3 printf(">>>> Match against: ");
3484     pchars(start_match, end_subject - start_match, TRUE, &match_block);
3485     printf("\n");
3486 nigel 9 #endif
3487 nigel 3
3488     /* When a match occurs, substrings will be set for all internal extractions;
3489     we just need to set up the whole thing as substring 0 before returning. If
3490     there were too many extractions, set the return code to zero. In the case
3491     where we had to get some local store to hold offsets for backreferences, copy
3492     those back references that we can. In this case there need not be overflow
3493     if certain parts of the pattern were not used.
3494    
3495     Before starting the match, we have to set up a longjmp() target to enable
3496 nigel 9 the "cut" operation to fail a match completely without backtracking. This
3497     is done in a separate function to avoid compiler warnings. We need not do
3498     it unless PCRE_EXTRA is set, since only in that case is the "cut" operation
3499     enabled. */
3500 nigel 3
3501 nigel 9 if (((re->options & PCRE_EXTRA) != 0 && my_setjmp(&match_block) != 0) ||
3502     !match(start_match, re->code, 2, &match_block))
3503     continue;
3504    
3505     /* Copy the offset information from temporary store if necessary */
3506    
3507     if (ocount != offsetcount)
3508 nigel 3 {
3509 nigel 9 if (offsetcount >= 4)
3510 nigel 3 {
3511 nigel 9 memcpy(offsets + 2, match_block.offset_vector + 2,
3512     (offsetcount - 2) * sizeof(int));
3513     DPRINTF(("Copied offsets; freeing temporary memory\n"));
3514 nigel 3 }
3515 nigel 9 if (match_block.end_offset_top > offsetcount)
3516     match_block.offset_overflow = TRUE;
3517 nigel 3
3518 nigel 9 DPRINTF(("Freeing temporary memory\n"));
3519     (pcre_free)(match_block.offset_vector);
3520     }
3521 nigel 3
3522 nigel 9 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
3523 nigel 3
3524 nigel 9 if (match_block.offset_end < 2) rc = 0; else
3525     {
3526     offsets[0] = start_match - match_block.start_subject;
3527     offsets[1] = match_block.end_match_ptr - match_block.start_subject;
3528 nigel 3 }
3529 nigel 9
3530     DPRINTF((">>>> returning %d\n", rc));
3531     return rc;
3532 nigel 3 }
3533     while (!anchored &&
3534     match_block.errorcode == PCRE_ERROR_NOMATCH &&
3535     start_match++ < end_subject);
3536    
3537 nigel 9 DPRINTF((">>>> returning %d\n", match_block.errorcode));
3538 nigel 3
3539     return match_block.errorcode;
3540     }
3541    
3542     /* End of pcre.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12