/[pcre]/code/trunk/pcre.c
ViewVC logotype

Contents of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 21 - (hide annotations) (download)
Sat Feb 24 21:38:37 2007 UTC (7 years, 9 months ago) by nigel
File MIME type: text/plain
File size: 104554 byte(s)
Load pcre-1.09 into code/trunk.

1 nigel 3 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /*
6     This is a library of functions to support regular expressions whose syntax
7     and semantics are as close as possible to those of the Perl 5 language. See
8     the file Tech.Notes for some information on the internals.
9    
10     Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12 nigel 15 Copyright (c) 1998 University of Cambridge
13 nigel 3
14     -----------------------------------------------------------------------------
15     Permission is granted to anyone to use this software for any purpose on any
16     computer system, and to redistribute it freely, subject to the following
17     restrictions:
18    
19     1. This software is distributed in the hope that it will be useful,
20     but WITHOUT ANY WARRANTY; without even the implied warranty of
21     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22    
23     2. The origin of this software must not be misrepresented, either by
24     explicit claim or by omission.
25    
26     3. Altered versions must be plainly marked as such, and must not be
27     misrepresented as being the original software.
28     -----------------------------------------------------------------------------
29     */
30    
31    
32     /* Define DEBUG to get debugging output on stdout. */
33    
34     /* #define DEBUG */
35    
36 nigel 9 /* Use a macro for debugging printing, 'cause that eliminates the the use
37     of #ifdef inline, and there are *still* stupid compilers about that don't like
38     indented pre-processor statements. I suppose it's only been 10 years... */
39 nigel 3
40 nigel 9 #ifdef DEBUG
41     #define DPRINTF(p) printf p
42     #else
43     #define DPRINTF(p) /*nothing*/
44     #endif
45    
46 nigel 3 /* Include the internals header, which itself includes Standard C headers plus
47     the external pcre header. */
48    
49     #include "internal.h"
50    
51    
52 nigel 15 /* Allow compilation as C++ source code, should anybody want to do that. */
53    
54     #ifdef __cplusplus
55     #define class pcre_class
56     #endif
57    
58    
59 nigel 3 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
60    
61 nigel 15 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
62     static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
63 nigel 3
64 nigel 13 /* Text forms of OP_ values and things, for debugging (not all used) */
65 nigel 3
66     #ifdef DEBUG
67 nigel 7 static const char *OP_names[] = {
68     "End", "\\A", "\\B", "\\b", "\\D", "\\d",
69 nigel 3 "\\S", "\\s", "\\W", "\\w", "Cut", "\\Z", "^", "$", "Any", "chars",
70     "not",
71     "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
72     "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
73     "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
74     "*", "*?", "+", "+?", "?", "??", "{", "{",
75 nigel 13 "class", "negclass", "Ref",
76 nigel 3 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",
77     "Brazero", "Braminzero", "Bra"
78     };
79     #endif
80    
81     /* Table for handling escaped characters in the range '0'-'z'. Positive returns
82     are simple data values; negative values are for special things like \d and so
83     on. Zero means further processing is needed (for things like \x), or the escape
84     is invalid. */
85    
86 nigel 15 static const short int escapes[] = {
87 nigel 3 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
88     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
89     '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
90     0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
91     0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
92     0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
93     '`', 7, -ESC_b, 0, -ESC_d, 27, '\f', 0, /* ` - g */
94     0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
95     0, 0, '\r', -ESC_s, '\t', 0, 0, -ESC_w, /* p - w */
96     0, 0, 0 /* x - z */
97     };
98    
99     /* Definition to allow mutual recursion */
100    
101 nigel 13 static BOOL
102     compile_regex(int, int *, uschar **, const uschar **, const char **);
103 nigel 3
104     /* Structure for passing "static" information around between the functions
105     doing the matching, so that they are thread-safe. */
106    
107     typedef struct match_data {
108     int errorcode; /* As it says */
109     int *offset_vector; /* Offset vector */
110     int offset_end; /* One past the end */
111     BOOL offset_overflow; /* Set if too many extractions */
112     BOOL caseless; /* Case-independent flag */
113     BOOL runtime_caseless; /* Caseless forced at run time */
114     BOOL multiline; /* Multiline flag */
115     BOOL notbol; /* NOTBOL flag */
116     BOOL noteol; /* NOTEOL flag */
117     BOOL dotall; /* Dot matches any char */
118     BOOL endonly; /* Dollar not before final \n */
119 nigel 7 const uschar *start_subject; /* Start of the subject string */
120     const uschar *end_subject; /* End of the subject string */
121 nigel 3 jmp_buf fail_env; /* Environment for longjump() break out */
122 nigel 7 const uschar *end_match_ptr; /* Subject position at end match */
123 nigel 3 int end_offset_top; /* Highwater mark at end of match */
124     } match_data;
125    
126    
127    
128     /*************************************************
129     * Global variables *
130     *************************************************/
131    
132     /* PCRE is thread-clean and doesn't use any global variables in the normal
133     sense. However, it calls memory allocation and free functions via the two
134     indirections below, which are can be changed by the caller, but are shared
135     between all threads. */
136    
137     void *(*pcre_malloc)(size_t) = malloc;
138     void (*pcre_free)(void *) = free;
139    
140    
141    
142    
143     /*************************************************
144     * Return version string *
145     *************************************************/
146    
147 nigel 7 const char *
148 nigel 3 pcre_version(void)
149     {
150     return PCRE_VERSION;
151     }
152    
153    
154    
155    
156     /*************************************************
157     * Return info about a compiled pattern *
158     *************************************************/
159    
160     /* This function picks potentially useful data out of the private
161     structure.
162    
163     Arguments:
164     external_re points to compiled code
165     optptr where to pass back the options
166     first_char where to pass back the first character,
167     or -1 if multiline and all branches start ^,
168     or -2 otherwise
169    
170     Returns: number of identifying extraction brackets
171     or negative values on error
172     */
173    
174     int
175     pcre_info(const pcre *external_re, int *optptr, int *first_char)
176     {
177 nigel 7 const real_pcre *re = (const real_pcre *)external_re;
178 nigel 3 if (re == NULL) return PCRE_ERROR_NULL;
179     if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
180     if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
181     if (first_char != NULL)
182     *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
183     ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
184     return re->top_bracket;
185     }
186    
187    
188    
189    
190     #ifdef DEBUG
191     /*************************************************
192     * Debugging function to print chars *
193     *************************************************/
194    
195     /* Print a sequence of chars in printable format, stopping at the end of the
196     subject if the requested.
197    
198     Arguments:
199     p points to characters
200     length number to print
201     is_subject TRUE if printing from within md->start_subject
202     md pointer to matching data block, if is_subject is TRUE
203    
204     Returns: nothing
205     */
206    
207 nigel 9 static void
208     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
209 nigel 3 {
210     int c;
211     if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
212     while (length-- > 0)
213     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
214     }
215     #endif
216    
217    
218    
219    
220     /*************************************************
221     * Check subpattern for empty operand *
222     *************************************************/
223    
224     /* This function checks a bracketed subpattern to see if any of the paths
225     through it could match an empty string. This is used to diagnose an error if
226     such a subpattern is followed by a quantifier with an unlimited upper bound.
227    
228     Argument:
229     code points to the opening bracket
230    
231     Returns: TRUE or FALSE
232     */
233    
234     static BOOL
235     could_be_empty(uschar *code)
236     {
237     do {
238     uschar *cc = code + 3;
239    
240     /* Scan along the opcodes for this branch; as soon as we find something
241     that matches a non-empty string, break out and advance to test the next
242     branch. If we get to the end of the branch, return TRUE for the whole
243     sub-expression. */
244    
245     for (;;)
246     {
247     /* Test an embedded subpattern; if it could not be empty, break the
248     loop. Otherwise carry on in the branch. */
249    
250 nigel 5 if ((int)(*cc) >= OP_BRA || (int)(*cc) == OP_ONCE)
251 nigel 3 {
252     if (!could_be_empty(cc)) break;
253     do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
254     cc += 3;
255     }
256    
257     else switch (*cc)
258     {
259     /* Reached end of a branch: the subpattern may match the empty string */
260    
261     case OP_ALT:
262     case OP_KET:
263     case OP_KETRMAX:
264     case OP_KETRMIN:
265     return TRUE;
266    
267 nigel 17 /* Skip over entire bracket groups with zero lower bound */
268    
269     case OP_BRAZERO:
270     case OP_BRAMINZERO:
271     cc++;
272     /* Fall through */
273    
274 nigel 3 /* Skip over assertive subpatterns */
275    
276     case OP_ASSERT:
277     case OP_ASSERT_NOT:
278     do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
279     cc += 3;
280     break;
281    
282     /* Skip over things that don't match chars */
283    
284     case OP_SOD:
285     case OP_EOD:
286     case OP_CIRC:
287     case OP_DOLL:
288     case OP_NOT_WORD_BOUNDARY:
289     case OP_WORD_BOUNDARY:
290     cc++;
291     break;
292    
293     /* Skip over simple repeats with zero lower bound */
294    
295     case OP_STAR:
296     case OP_MINSTAR:
297     case OP_QUERY:
298     case OP_MINQUERY:
299 nigel 5 case OP_NOTSTAR:
300     case OP_NOTMINSTAR:
301     case OP_NOTQUERY:
302     case OP_NOTMINQUERY:
303 nigel 3 case OP_TYPESTAR:
304     case OP_TYPEMINSTAR:
305     case OP_TYPEQUERY:
306     case OP_TYPEMINQUERY:
307     cc += 2;
308     break;
309    
310     /* Skip over UPTOs (lower bound is zero) */
311    
312     case OP_UPTO:
313     case OP_MINUPTO:
314     case OP_TYPEUPTO:
315     case OP_TYPEMINUPTO:
316     cc += 4;
317     break;
318    
319     /* Check a class or a back reference for a zero minimum */
320    
321     case OP_CLASS:
322 nigel 13 case OP_NEGCLASS:
323 nigel 3 case OP_REF:
324 nigel 5 cc += (*cc == OP_REF)? 2 : 33;
325 nigel 3
326     switch (*cc)
327     {
328     case OP_CRSTAR:
329     case OP_CRMINSTAR:
330     case OP_CRQUERY:
331     case OP_CRMINQUERY:
332     cc++;
333     break;
334    
335     case OP_CRRANGE:
336     case OP_CRMINRANGE:
337     if ((cc[1] << 8) + cc[2] != 0) goto NEXT_BRANCH;
338     cc += 3;
339     break;
340    
341     default:
342     goto NEXT_BRANCH;
343     }
344     break;
345    
346     /* Anything else matches at least one character */
347    
348     default:
349     goto NEXT_BRANCH;
350     }
351     }
352    
353     NEXT_BRANCH:
354     code += (code[1] << 8) + code[2];
355     }
356     while (*code == OP_ALT);
357    
358     /* No branches match the empty string */
359    
360     return FALSE;
361     }
362    
363    
364    
365     /*************************************************
366     * Handle escapes *
367     *************************************************/
368    
369     /* This function is called when a \ has been encountered. It either returns a
370     positive value for a simple escape such as \n, or a negative value which
371     encodes one of the more complicated things such as \d. On entry, ptr is
372     pointing at the \. On exit, it is on the final character of the escape
373     sequence.
374    
375     Arguments:
376     ptrptr points to the pattern position pointer
377     errorptr points to the pointer to the error message
378     bracount number of previous extracting brackets
379     options the options bits
380     isclass TRUE if inside a character class
381    
382     Returns: zero or positive => a data character
383     negative => a special escape sequence
384     on error, errorptr is set
385     */
386    
387     static int
388 nigel 7 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
389     int options, BOOL isclass)
390 nigel 3 {
391 nigel 7 const uschar *ptr = *ptrptr;
392 nigel 3 int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
393     int i;
394    
395     if (c == 0) *errorptr = ERR1;
396    
397     /* Digits or letters may have special meaning; all others are literals. */
398    
399     else if (c < '0' || c > 'z') {}
400    
401     /* Do an initial lookup in a table. A non-zero result is something that can be
402     returned immediately. Otherwise further processing may be required. */
403    
404     else if ((i = escapes[c - '0']) != 0) c = i;
405    
406     /* Escapes that need further processing, or are illegal. */
407    
408     else
409     {
410 nigel 7 const uschar *oldptr;
411 nigel 3 switch (c)
412     {
413     /* The handling of escape sequences consisting of a string of digits
414     starting with one that is not zero is not straightforward. By experiment,
415     the way Perl works seems to be as follows:
416    
417     Outside a character class, the digits are read as a decimal number. If the
418     number is less than 10, or if there are that many previous extracting
419     left brackets, then it is a back reference. Otherwise, up to three octal
420     digits are read to form an escaped byte. Thus \123 is likely to be octal
421     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
422     value is greater than 377, the least significant 8 bits are taken. Inside a
423     character class, \ followed by a digit is always an octal number. */
424    
425     case '1': case '2': case '3': case '4': case '5':
426     case '6': case '7': case '8': case '9':
427    
428     if (!isclass)
429     {
430     oldptr = ptr;
431     c -= '0';
432     while ((pcre_ctypes[ptr[1]] & ctype_digit) != 0)
433     c = c * 10 + *(++ptr) - '0';
434     if (c < 10 || c <= bracount)
435     {
436     c = -(ESC_REF + c);
437     break;
438     }
439     ptr = oldptr; /* Put the pointer back and fall through */
440     }
441    
442     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
443     generates a binary zero byte and treats the digit as a following literal.
444     Thus we have to pull back the pointer by one. */
445    
446     if ((c = *ptr) >= '8')
447     {
448     ptr--;
449     c = 0;
450     break;
451     }
452    
453     /* \0 always starts an octal number, but we may drop through to here with a
454     larger first octal digit */
455    
456     case '0':
457     c -= '0';
458     while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 &&
459     ptr[1] != '8' && ptr[1] != '9')
460     c = c * 8 + *(++ptr) - '0';
461     break;
462    
463     /* Special escapes not starting with a digit are straightforward */
464    
465     case 'x':
466     c = 0;
467     while (i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0)
468     {
469     ptr++;
470     c = c * 16 + pcre_lcc[*ptr] -
471     (((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
472     }
473     break;
474    
475     case 'c':
476     c = *(++ptr);
477     if (c == 0)
478     {
479     *errorptr = ERR2;
480     return 0;
481     }
482    
483     /* A letter is upper-cased; then the 0x40 bit is flipped */
484    
485     if (c >= 'a' && c <= 'z') c = pcre_fcc[c];
486     c ^= 0x40;
487     break;
488    
489     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
490     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
491     for Perl compatibility, it is a literal. */
492    
493     default:
494     if ((options & PCRE_EXTRA) != 0) switch(c)
495     {
496     case 'X':
497     c = -ESC_X; /* This could be a lookup if it ever got into Perl */
498     break;
499    
500     default:
501     *errorptr = ERR3;
502     break;
503     }
504     break;
505     }
506     }
507    
508     *ptrptr = ptr;
509     return c;
510     }
511    
512    
513    
514     /*************************************************
515     * Check for counted repeat *
516     *************************************************/
517    
518     /* This function is called when a '{' is encountered in a place where it might
519     start a quantifier. It looks ahead to see if it really is a quantifier or not.
520     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
521     where the ddds are digits.
522    
523     Arguments:
524     p pointer to the first char after '{'
525    
526     Returns: TRUE or FALSE
527     */
528    
529     static BOOL
530 nigel 7 is_counted_repeat(const uschar *p)
531 nigel 3 {
532     if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
533     while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
534     if (*p == '}') return TRUE;
535    
536     if (*p++ != ',') return FALSE;
537     if (*p == '}') return TRUE;
538    
539     if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
540     while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
541     return (*p == '}');
542     }
543    
544    
545    
546     /*************************************************
547     * Read repeat counts *
548     *************************************************/
549    
550     /* Read an item of the form {n,m} and return the values. This is called only
551     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
552     so the syntax is guaranteed to be correct, but we need to check the values.
553    
554     Arguments:
555     p pointer to first char after '{'
556     minp pointer to int for min
557     maxp pointer to int for max
558     returned as -1 if no max
559     errorptr points to pointer to error message
560    
561     Returns: pointer to '}' on success;
562     current ptr on error, with errorptr set
563     */
564    
565 nigel 7 static const uschar *
566     read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
567 nigel 3 {
568     int min = 0;
569     int max = -1;
570    
571     while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
572    
573     if (*p == '}') max = min; else
574     {
575     if (*(++p) != '}')
576     {
577     max = 0;
578     while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
579     if (max < min)
580     {
581     *errorptr = ERR4;
582     return p;
583     }
584     }
585     }
586    
587     /* Do paranoid checks, then fill in the required variables, and pass back the
588     pointer to the terminating '}'. */
589    
590     if (min > 65535 || max > 65535)
591     *errorptr = ERR5;
592     else
593     {
594     *minp = min;
595     *maxp = max;
596     }
597     return p;
598     }
599    
600    
601    
602     /*************************************************
603     * Compile one branch *
604     *************************************************/
605    
606     /* Scan the pattern, compiling it into the code vector.
607    
608     Arguments:
609     options the option bits
610     bracket points to number of brackets used
611     code points to the pointer to the current code point
612     ptrptr points to the current pattern pointer
613     errorptr points to pointer to error message
614    
615     Returns: TRUE on success
616     FALSE, with *errorptr set on error
617     */
618    
619     static BOOL
620 nigel 7 compile_branch(int options, int *brackets, uschar **codeptr,
621     const uschar **ptrptr, const char **errorptr)
622 nigel 3 {
623     int repeat_type, op_type;
624     int repeat_min, repeat_max;
625     int bravalue, length;
626 nigel 19 int greedy_default, greedy_non_default;
627 nigel 3 register int c;
628     register uschar *code = *codeptr;
629 nigel 7 const uschar *ptr = *ptrptr;
630     const uschar *oldptr;
631 nigel 3 uschar *previous = NULL;
632     uschar class[32];
633    
634 nigel 19 /* Set up the default and non-default settings for greediness */
635    
636     greedy_default = ((options & PCRE_UNGREEDY) != 0);
637     greedy_non_default = greedy_default ^ 1;
638    
639 nigel 3 /* Switch on next character until the end of the branch */
640    
641     for (;; ptr++)
642     {
643     BOOL negate_class;
644     int class_charcount;
645     int class_lastchar;
646    
647     c = *ptr;
648     if ((options & PCRE_EXTENDED) != 0)
649     {
650     if ((pcre_ctypes[c] & ctype_space) != 0) continue;
651     if (c == '#')
652     {
653     while ((c = *(++ptr)) != 0 && c != '\n');
654     continue;
655     }
656     }
657    
658     switch(c)
659     {
660     /* The branch terminates at end of string, |, or ). */
661    
662     case 0:
663     case '|':
664     case ')':
665     *codeptr = code;
666     *ptrptr = ptr;
667     return TRUE;
668    
669     /* Handle single-character metacharacters */
670    
671     case '^':
672     previous = NULL;
673     *code++ = OP_CIRC;
674     break;
675    
676     case '$':
677     previous = NULL;
678     *code++ = OP_DOLL;
679     break;
680    
681     case '.':
682     previous = code;
683     *code++ = OP_ANY;
684     break;
685    
686     /* Character classes. These always build a 32-byte bitmap of the permitted
687     characters, except in the special case where there is only one character.
688     For negated classes, we build the map as usual, then invert it at the end.
689     */
690    
691     case '[':
692     previous = code;
693    
694 nigel 13 /* If the first character is '^', set the negation flag, and use a
695     different opcode. This only matters if caseless matching is specified at
696     runtime. */
697 nigel 3
698     if ((c = *(++ptr)) == '^')
699     {
700     negate_class = TRUE;
701 nigel 13 *code++ = OP_NEGCLASS;
702 nigel 3 c = *(++ptr);
703     }
704 nigel 13 else
705     {
706     negate_class = FALSE;
707     *code++ = OP_CLASS;
708     }
709 nigel 3
710     /* Keep a count of chars so that we can optimize the case of just a single
711     character. */
712    
713     class_charcount = 0;
714     class_lastchar = -1;
715    
716     /* Initialize the 32-char bit map to all zeros. We have to build the
717     map in a temporary bit of store, in case the class contains only 1
718     character, because in that case the compiled code doesn't use the
719     bit map. */
720    
721     memset(class, 0, 32 * sizeof(uschar));
722    
723     /* Process characters until ] is reached. By writing this as a "do" it
724     means that an initial ] is taken as a data character. */
725    
726     do
727     {
728     if (c == 0)
729     {
730     *errorptr = ERR6;
731     goto FAILED;
732     }
733    
734     /* Backslash may introduce a single character, or it may introduce one
735     of the specials, which just set a flag. Escaped items are checked for
736     validity in the pre-compiling pass. The sequence \b is a special case.
737 nigel 7 Inside a class (and only there) it is treated as backspace. Elsewhere
738 nigel 3 it marks a word boundary. Other escapes have preset maps ready to
739     or into the one we are building. We assume they have more than one
740     character in them, so set class_count bigger than one. */
741    
742     if (c == '\\')
743     {
744     c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
745     if (-c == ESC_b) c = '\b';
746     else if (c < 0)
747     {
748     class_charcount = 10;
749     switch (-c)
750     {
751     case ESC_d:
752     for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit];
753     continue;
754    
755     case ESC_D:
756     for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit];
757     continue;
758    
759     case ESC_w:
760     for (c = 0; c < 32; c++)
761     class[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
762     continue;
763    
764     case ESC_W:
765     for (c = 0; c < 32; c++)
766     class[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
767     continue;
768    
769     case ESC_s:
770     for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space];
771     continue;
772    
773     case ESC_S:
774     for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space];
775     continue;
776    
777     default:
778     *errorptr = ERR7;
779     goto FAILED;
780     }
781     }
782     /* Fall through if single character */
783     }
784    
785     /* A single character may be followed by '-' to form a range. However,
786     Perl does not permit ']' to be the end of the range. A '-' character
787     here is treated as a literal. */
788    
789     if (ptr[1] == '-' && ptr[2] != ']')
790     {
791     int d;
792     ptr += 2;
793     d = *ptr;
794    
795     if (d == 0)
796     {
797     *errorptr = ERR6;
798     goto FAILED;
799     }
800    
801     /* The second part of a range can be a single-character escape, but
802     not any of the other escapes. */
803    
804     if (d == '\\')
805     {
806     d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
807     if (d < 0)
808     {
809     if (d == -ESC_b) d = '\b'; else
810     {
811     *errorptr = ERR7;
812     goto FAILED;
813     }
814     }
815     }
816    
817     if (d < c)
818     {
819     *errorptr = ERR8;
820     goto FAILED;
821     }
822    
823     for (; c <= d; c++)
824     {
825     class[c/8] |= (1 << (c&7));
826     if ((options & PCRE_CASELESS) != 0)
827     {
828     int uc = pcre_fcc[c]; /* flip case */
829     class[uc/8] |= (1 << (uc&7));
830     }
831     class_charcount++; /* in case a one-char range */
832     class_lastchar = c;
833     }
834     continue; /* Go get the next char in the class */
835     }
836    
837     /* Handle a lone single character - we can get here for a normal
838     non-escape char, or after \ that introduces a single character. */
839    
840     class [c/8] |= (1 << (c&7));
841     if ((options & PCRE_CASELESS) != 0)
842     {
843     c = pcre_fcc[c]; /* flip case */
844     class[c/8] |= (1 << (c&7));
845     }
846     class_charcount++;
847     class_lastchar = c;
848     }
849    
850     /* Loop until ']' reached; the check for end of string happens inside the
851     loop. This "while" is the end of the "do" above. */
852    
853     while ((c = *(++ptr)) != ']');
854    
855     /* If class_charcount is 1 and class_lastchar is not negative, we saw
856     precisely one character. This doesn't need the whole 32-byte bit map.
857     We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
858     it's negative. */
859    
860     if (class_charcount == 1 && class_lastchar >= 0)
861     {
862     if (negate_class)
863     {
864     code[-1] = OP_NOT;
865     }
866     else
867     {
868     code[-1] = OP_CHARS;
869     *code++ = 1;
870     }
871     *code++ = class_lastchar;
872     }
873    
874     /* Otherwise, negate the 32-byte map if necessary, and copy it into
875     the code vector. */
876    
877     else
878     {
879     if (negate_class)
880     for (c = 0; c < 32; c++) code[c] = ~class[c];
881     else
882     memcpy(code, class, 32);
883     code += 32;
884     }
885     break;
886    
887     /* Various kinds of repeat */
888    
889     case '{':
890     if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
891     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
892     if (*errorptr != NULL) goto FAILED;
893     goto REPEAT;
894    
895     case '*':
896     repeat_min = 0;
897     repeat_max = -1;
898     goto REPEAT;
899    
900     case '+':
901     repeat_min = 1;
902     repeat_max = -1;
903     goto REPEAT;
904    
905     case '?':
906     repeat_min = 0;
907     repeat_max = 1;
908    
909     REPEAT:
910     if (previous == NULL)
911     {
912     *errorptr = ERR9;
913     goto FAILED;
914     }
915    
916 nigel 19 /* If the next character is '?' this is a minimizing repeat, by default,
917     but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
918 nigel 3 next character. */
919    
920 nigel 19 if (ptr[1] == '?')
921     { repeat_type = greedy_non_default; ptr++; }
922     else repeat_type = greedy_default;
923 nigel 3
924     /* If the maximum is zero then the minimum must also be zero; Perl allows
925     this case, so we do too - by simply omitting the item altogether. */
926    
927     if (repeat_max == 0) code = previous;
928    
929     /* If previous was a string of characters, chop off the last one and use it
930     as the subject of the repeat. If there was only one character, we can
931     abolish the previous item altogether. */
932    
933     else if (*previous == OP_CHARS)
934     {
935     int len = previous[1];
936     if (len == 1)
937     {
938     c = previous[2];
939     code = previous;
940     }
941     else
942     {
943     c = previous[len+1];
944     previous[1]--;
945     code--;
946     }
947     op_type = 0; /* Use single-char op codes */
948     goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
949     }
950    
951     /* If previous was a single negated character ([^a] or similar), we use
952     one of the special opcodes, replacing it. The code is shared with single-
953     character repeats by adding a suitable offset into repeat_type. */
954    
955     else if ((int)*previous == OP_NOT)
956     {
957     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
958     c = previous[1];
959     code = previous;
960     goto OUTPUT_SINGLE_REPEAT;
961     }
962    
963     /* If previous was a character type match (\d or similar), abolish it and
964     create a suitable repeat item. The code is shared with single-character
965     repeats by adding a suitable offset into repeat_type. */
966    
967     else if ((int)*previous < OP_EOD || *previous == OP_ANY)
968     {
969     op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
970     c = *previous;
971     code = previous;
972    
973     OUTPUT_SINGLE_REPEAT:
974     repeat_type += op_type; /* Combine both values for many cases */
975    
976     /* A minimum of zero is handled either as the special case * or ?, or as
977     an UPTO, with the maximum given. */
978    
979     if (repeat_min == 0)
980     {
981     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
982     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
983     else
984     {
985     *code++ = OP_UPTO + repeat_type;
986     *code++ = repeat_max >> 8;
987     *code++ = (repeat_max & 255);
988     }
989     }
990    
991     /* The case {1,} is handled as the special case + */
992    
993     else if (repeat_min == 1 && repeat_max == -1)
994     *code++ = OP_PLUS + repeat_type;
995    
996     /* The case {n,n} is just an EXACT, while the general case {n,m} is
997     handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
998    
999     else
1000     {
1001     if (repeat_min != 1)
1002     {
1003     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1004     *code++ = repeat_min >> 8;
1005     *code++ = (repeat_min & 255);
1006     }
1007    
1008     /* If the mininum is 1 and the previous item was a character string,
1009     we either have to put back the item that got cancelled if the string
1010     length was 1, or add the character back onto the end of a longer
1011 nigel 21 string. For a character type nothing need be done; it will just get
1012     put back naturally. Note that the final character is always going to
1013     get added below. */
1014 nigel 3
1015     else if (*previous == OP_CHARS)
1016     {
1017     if (code == previous) code += 2; else previous[1]++;
1018     }
1019    
1020 nigel 21 /* For a single negated character we also have to put back the
1021     item that got cancelled. */
1022    
1023     else if (*previous == OP_NOT) code++;
1024    
1025 nigel 9 /* If the maximum is unlimited, insert an OP_STAR. */
1026 nigel 3
1027 nigel 9 if (repeat_max < 0)
1028 nigel 3 {
1029     *code++ = c;
1030 nigel 9 *code++ = OP_STAR + repeat_type;
1031     }
1032    
1033     /* Else insert an UPTO if the max is greater than the min. */
1034    
1035     else if (repeat_max != repeat_min)
1036     {
1037     *code++ = c;
1038 nigel 3 repeat_max -= repeat_min;
1039     *code++ = OP_UPTO + repeat_type;
1040     *code++ = repeat_max >> 8;
1041     *code++ = (repeat_max & 255);
1042     }
1043     }
1044    
1045     /* The character or character type itself comes last in all cases. */
1046    
1047     *code++ = c;
1048     }
1049    
1050     /* If previous was a character class or a back reference, we put the repeat
1051     stuff after it. */
1052    
1053 nigel 13 else if (*previous == OP_CLASS || *previous == OP_NEGCLASS ||
1054     *previous == OP_REF)
1055 nigel 3 {
1056     if (repeat_min == 0 && repeat_max == -1)
1057     *code++ = OP_CRSTAR + repeat_type;
1058     else if (repeat_min == 1 && repeat_max == -1)
1059     *code++ = OP_CRPLUS + repeat_type;
1060     else if (repeat_min == 0 && repeat_max == 1)
1061     *code++ = OP_CRQUERY + repeat_type;
1062     else
1063     {
1064     *code++ = OP_CRRANGE + repeat_type;
1065     *code++ = repeat_min >> 8;
1066     *code++ = repeat_min & 255;
1067     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1068     *code++ = repeat_max >> 8;
1069     *code++ = repeat_max & 255;
1070     }
1071     }
1072    
1073     /* If previous was a bracket group, we may have to replicate it in certain
1074     cases. If the maximum repeat count is unlimited, check that the bracket
1075     group cannot match the empty string, and diagnose an error if it can. */
1076    
1077     else if ((int)*previous >= OP_BRA)
1078     {
1079     int i;
1080 nigel 9 int len = code - previous;
1081 nigel 3
1082     if (repeat_max == -1 && could_be_empty(previous))
1083     {
1084     *errorptr = ERR10;
1085     goto FAILED;
1086     }
1087    
1088     /* If the minimum is greater than zero, and the maximum is unlimited or
1089     equal to the minimum, the first copy remains where it is, and is
1090     replicated up to the minimum number of times. This case includes the +
1091     repeat, but of course no replication is needed in that case. */
1092    
1093     if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1094     {
1095     for (i = 1; i < repeat_min; i++)
1096     {
1097 nigel 9 memcpy(code, previous, len);
1098     code += len;
1099 nigel 3 }
1100     }
1101    
1102     /* If the minimum is zero, stick BRAZERO in front of the first copy.
1103     Then, if there is a fixed upper limit, replicated up to that many times,
1104     sticking BRAZERO in front of all the optional ones. */
1105    
1106     else
1107     {
1108     if (repeat_min == 0)
1109     {
1110 nigel 9 memmove(previous+1, previous, len);
1111 nigel 3 code++;
1112     *previous++ = OP_BRAZERO + repeat_type;
1113     }
1114    
1115     for (i = 1; i < repeat_min; i++)
1116     {
1117 nigel 9 memcpy(code, previous, len);
1118     code += len;
1119 nigel 3 }
1120    
1121     for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1122     {
1123     *code++ = OP_BRAZERO + repeat_type;
1124 nigel 9 memcpy(code, previous, len);
1125     code += len;
1126 nigel 3 }
1127     }
1128    
1129     /* If the maximum is unlimited, set a repeater in the final copy. */
1130    
1131     if (repeat_max == -1) code[-3] = OP_KETRMAX + repeat_type;
1132     }
1133    
1134     /* Else there's some kind of shambles */
1135    
1136     else
1137     {
1138     *errorptr = ERR11;
1139     goto FAILED;
1140     }
1141    
1142     /* In all case we no longer have a previous item. */
1143    
1144     previous = NULL;
1145     break;
1146    
1147    
1148     /* Start of nested bracket sub-expression, or comment or lookahead.
1149     First deal with special things that can come after a bracket; all are
1150     introduced by ?, and the appearance of any of them means that this is not a
1151     referencing group. They were checked for validity in the first pass over
1152     the string, so we don't have to check for syntax errors here. */
1153    
1154     case '(':
1155     previous = code; /* Only real brackets can be repeated */
1156     if (*(++ptr) == '?')
1157     {
1158     bravalue = OP_BRA;
1159    
1160     switch (*(++ptr))
1161     {
1162     case '#':
1163     case 'i':
1164     case 'm':
1165     case 's':
1166     case 'x':
1167 nigel 19 case 'U':
1168     case 'X':
1169 nigel 3 ptr++;
1170     while (*ptr != ')') ptr++;
1171     previous = NULL;
1172     continue;
1173    
1174     case ':': /* Non-extracting bracket */
1175     ptr++;
1176     break;
1177    
1178     case '=': /* Assertions can't be repeated */
1179     bravalue = OP_ASSERT;
1180     ptr++;
1181     previous = NULL;
1182     break;
1183    
1184     case '!':
1185     bravalue = OP_ASSERT_NOT;
1186     ptr++;
1187     previous = NULL;
1188     break;
1189    
1190     case '>': /* "Match once" brackets */
1191     if ((options & PCRE_EXTRA) != 0) /* Not yet standard */
1192     {
1193     bravalue = OP_ONCE;
1194     ptr++;
1195     previous = NULL;
1196     break;
1197     }
1198     /* Else fall through */
1199    
1200     default:
1201     *errorptr = ERR12;
1202     goto FAILED;
1203     }
1204     }
1205    
1206     /* Else we have a referencing group */
1207    
1208     else
1209     {
1210     if (++(*brackets) > EXTRACT_MAX)
1211     {
1212     *errorptr = ERR13;
1213     goto FAILED;
1214     }
1215     bravalue = OP_BRA + *brackets;
1216     }
1217    
1218     /* Process nested bracketed re; at end pointer is on the bracket. We copy
1219     code into a non-register variable in order to be able to pass its address
1220     because some compilers complain otherwise. */
1221    
1222     *code = bravalue;
1223     {
1224     uschar *mcode = code;
1225     if (!compile_regex(options, brackets, &mcode, &ptr, errorptr))
1226     goto FAILED;
1227     code = mcode;
1228     }
1229    
1230     if (*ptr != ')')
1231     {
1232     *errorptr = ERR14;
1233     goto FAILED;
1234     }
1235     break;
1236    
1237     /* Check \ for being a real metacharacter; if not, fall through and handle
1238     it as a data character at the start of a string. Escape items are checked
1239     for validity in the pre-compiling pass. */
1240    
1241     case '\\':
1242     oldptr = ptr;
1243     c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
1244    
1245     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1246     are arranged to be the negation of the corresponding OP_values. For the
1247     back references, the values are ESC_REF plus the reference number. Only
1248     back references and those types that consume a character may be repeated.
1249     We can test for values between ESC_b and ESC_Z for the latter; this may
1250     have to change if any new ones are ever created. */
1251    
1252     if (c < 0)
1253     {
1254     if (-c >= ESC_REF)
1255     {
1256     int refnum = -c - ESC_REF;
1257     if (*brackets < refnum)
1258     {
1259     *errorptr = ERR15;
1260     goto FAILED;
1261     }
1262     previous = code;
1263     *code++ = OP_REF;
1264     *code++ = refnum;
1265     }
1266     else
1267     {
1268     previous = (-c > ESC_b && -c < ESC_X)? code : NULL;
1269     *code++ = -c;
1270     }
1271     continue;
1272     }
1273    
1274 nigel 7 /* Data character: reset and fall through */
1275 nigel 3
1276     ptr = oldptr;
1277     c = '\\';
1278    
1279     /* Handle a run of data characters until a metacharacter is encountered.
1280     The first character is guaranteed not to be whitespace or # when the
1281     extended flag is set. */
1282    
1283     NORMAL_CHAR:
1284     default:
1285     previous = code;
1286     *code = OP_CHARS;
1287     code += 2;
1288     length = 0;
1289    
1290     do
1291     {
1292     if ((options & PCRE_EXTENDED) != 0)
1293     {
1294     if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1295     if (c == '#')
1296     {
1297     while ((c = *(++ptr)) != 0 && c != '\n');
1298     if (c == 0) break;
1299     continue;
1300     }
1301     }
1302    
1303     /* Backslash may introduce a data char or a metacharacter. Escaped items
1304     are checked for validity in the pre-compiling pass. Stop the string
1305     before a metaitem. */
1306    
1307     if (c == '\\')
1308     {
1309     oldptr = ptr;
1310     c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
1311     if (c < 0) { ptr = oldptr; break; }
1312     }
1313    
1314     /* Ordinary character or single-char escape */
1315    
1316     *code++ = c;
1317     length++;
1318     }
1319    
1320     /* This "while" is the end of the "do" above. */
1321    
1322     while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
1323    
1324     /* Compute the length and set it in the data vector, and advance to
1325     the next state. */
1326    
1327     previous[1] = length;
1328 nigel 15 if (length < 255) ptr--;
1329 nigel 3 break;
1330     }
1331     } /* end of big loop */
1332    
1333     /* Control never reaches here by falling through, only by a goto for all the
1334     error states. Pass back the position in the pattern so that it can be displayed
1335     to the user for diagnosing the error. */
1336    
1337     FAILED:
1338     *ptrptr = ptr;
1339     return FALSE;
1340     }
1341    
1342    
1343    
1344    
1345     /*************************************************
1346     * Compile sequence of alternatives *
1347     *************************************************/
1348    
1349     /* On entry, ptr is pointing past the bracket character, but on return
1350     it points to the closing bracket, or vertical bar, or end of string.
1351     The code variable is pointing at the byte into which the BRA operator has been
1352     stored.
1353    
1354     Argument:
1355     options the option bits
1356     brackets -> int containing the number of extracting brackets used
1357     codeptr -> the address of the current code pointer
1358     ptrptr -> the address of the current pattern pointer
1359     errorptr -> pointer to error message
1360    
1361     Returns: TRUE on success
1362     */
1363    
1364     static BOOL
1365 nigel 7 compile_regex(int options, int *brackets, uschar **codeptr,
1366     const uschar **ptrptr, const char **errorptr)
1367 nigel 3 {
1368 nigel 7 const uschar *ptr = *ptrptr;
1369 nigel 3 uschar *code = *codeptr;
1370     uschar *start_bracket = code;
1371    
1372     for (;;)
1373     {
1374     int length;
1375     uschar *last_branch = code;
1376    
1377     code += 3;
1378     if (!compile_branch(options, brackets, &code, &ptr, errorptr))
1379     {
1380     *ptrptr = ptr;
1381     return FALSE;
1382     }
1383    
1384     /* Fill in the length of the last branch */
1385    
1386     length = code - last_branch;
1387     last_branch[1] = length >> 8;
1388     last_branch[2] = length & 255;
1389    
1390     /* Reached end of expression, either ')' or end of pattern. Insert a
1391     terminating ket and the length of the whole bracketed item, and return,
1392     leaving the pointer at the terminating char. */
1393    
1394     if (*ptr != '|')
1395     {
1396     length = code - start_bracket;
1397     *code++ = OP_KET;
1398     *code++ = length >> 8;
1399     *code++ = length & 255;
1400     *codeptr = code;
1401     *ptrptr = ptr;
1402     return TRUE;
1403     }
1404    
1405     /* Another branch follows; insert an "or" node and advance the pointer. */
1406    
1407     *code = OP_ALT;
1408     ptr++;
1409     }
1410     /* Control never reaches here */
1411     }
1412    
1413    
1414    
1415     /*************************************************
1416     * Check for anchored expression *
1417     *************************************************/
1418    
1419     /* Try to find out if this is an anchored regular expression. Consider each
1420     alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
1421     all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
1422     it's anchored. However, if this is a multiline pattern, then only OP_SOD
1423     counts, since OP_CIRC can match in the middle.
1424    
1425     A branch is also implicitly anchored if it starts with .* because that will try
1426     the rest of the pattern at all possible matching points, so there is no point
1427     trying them again.
1428    
1429     Argument: points to start of expression (the bracket)
1430     Returns: TRUE or FALSE
1431     */
1432    
1433     static BOOL
1434 nigel 7 is_anchored(register const uschar *code, BOOL multiline)
1435 nigel 3 {
1436     do {
1437     int op = (int)code[3];
1438     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE)
1439     { if (!is_anchored(code+3, multiline)) return FALSE; }
1440     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
1441     { if (code[4] != OP_ANY) return FALSE; }
1442     else if (op != OP_SOD && (multiline || op != OP_CIRC)) return FALSE;
1443     code += (code[1] << 8) + code[2];
1444     }
1445     while (*code == OP_ALT);
1446     return TRUE;
1447     }
1448    
1449    
1450    
1451     /*************************************************
1452     * Check for start with \n line expression *
1453     *************************************************/
1454    
1455     /* This is called for multiline expressions to try to find out if every branch
1456     starts with ^ so that "first char" processing can be done to speed things up.
1457    
1458     Argument: points to start of expression (the bracket)
1459     Returns: TRUE or FALSE
1460     */
1461    
1462     static BOOL
1463 nigel 7 is_startline(const uschar *code)
1464 nigel 3 {
1465     do {
1466     if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
1467     { if (!is_startline(code+3)) return FALSE; }
1468     else if (code[3] != OP_CIRC) return FALSE;
1469     code += (code[1] << 8) + code[2];
1470     }
1471     while (*code == OP_ALT);
1472     return TRUE;
1473     }
1474    
1475    
1476    
1477     /*************************************************
1478     * Check for fixed first char *
1479     *************************************************/
1480    
1481     /* Try to find out if there is a fixed first character. This is called for
1482     unanchored expressions, as it speeds up their processing quite considerably.
1483     Consider each alternative branch. If they all start with the same char, or with
1484     a bracket all of whose alternatives start with the same char (recurse ad lib),
1485     then we return that char, otherwise -1.
1486    
1487     Argument: points to start of expression (the bracket)
1488     Returns: -1 or the fixed first char
1489     */
1490    
1491     static int
1492     find_firstchar(uschar *code)
1493     {
1494     register int c = -1;
1495     do
1496     {
1497     register int charoffset = 4;
1498    
1499     if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
1500     {
1501     register int d;
1502     if ((d = find_firstchar(code+3)) < 0) return -1;
1503     if (c < 0) c = d; else if (c != d) return -1;
1504     }
1505    
1506     else switch(code[3])
1507     {
1508     default:
1509     return -1;
1510    
1511     case OP_EXACT: /* Fall through */
1512     charoffset++;
1513    
1514     case OP_CHARS: /* Fall through */
1515     charoffset++;
1516    
1517     case OP_PLUS:
1518     case OP_MINPLUS:
1519     if (c < 0) c = code[charoffset]; else if (c != code[charoffset]) return -1;
1520     break;
1521     }
1522     code += (code[1] << 8) + code[2];
1523     }
1524     while (*code == OP_ALT);
1525     return c;
1526     }
1527    
1528    
1529    
1530     /*************************************************
1531     * Compile a Regular Expression *
1532     *************************************************/
1533    
1534     /* This function takes a string and returns a pointer to a block of store
1535     holding a compiled version of the expression.
1536    
1537     Arguments:
1538     pattern the regular expression
1539     options various option bits
1540     errorptr pointer to pointer to error text
1541     erroroffset ptr offset in pattern where error was detected
1542    
1543     Returns: pointer to compiled data block, or NULL on error,
1544     with errorptr and erroroffset set
1545     */
1546    
1547     pcre *
1548 nigel 7 pcre_compile(const char *pattern, int options, const char **errorptr,
1549 nigel 3 int *erroroffset)
1550     {
1551     real_pcre *re;
1552     int spaces = 0;
1553     int length = 3; /* For initial BRA plus length */
1554     int runlength;
1555     int c, size;
1556     int bracount = 0;
1557     int brastack[200];
1558     int top_backref = 0;
1559 nigel 7 unsigned int brastackptr = 0;
1560     uschar *code;
1561     const uschar *ptr;
1562 nigel 3
1563     #ifdef DEBUG
1564     uschar *code_base, *code_end;
1565     #endif
1566    
1567     /* We can't pass back an error message if errorptr is NULL; I guess the best we
1568     can do is just return NULL. */
1569    
1570     if (errorptr == NULL) return NULL;
1571     *errorptr = NULL;
1572    
1573     /* However, we can give a message for this error */
1574    
1575     if (erroroffset == NULL)
1576     {
1577     *errorptr = ERR16;
1578     return NULL;
1579     }
1580     *erroroffset = 0;
1581    
1582     if ((options & ~PUBLIC_OPTIONS) != 0)
1583     {
1584     *errorptr = ERR17;
1585     return NULL;
1586     }
1587    
1588 nigel 9 DPRINTF(("------------------------------------------------------------------\n"));
1589     DPRINTF(("%s\n", pattern));
1590 nigel 3
1591     /* The first thing to do is to make a pass over the pattern to compute the
1592     amount of store required to hold the compiled code. This does not have to be
1593     perfect as long as errors are overestimates. At the same time we can detect any
1594     internal flag settings. Make an attempt to correct for any counted white space
1595     if an "extended" flag setting appears late in the pattern. We can't be so
1596     clever for #-comments. */
1597    
1598 nigel 7 ptr = (const uschar *)(pattern - 1);
1599 nigel 3 while ((c = *(++ptr)) != 0)
1600     {
1601     int min, max;
1602     int class_charcount;
1603    
1604     if ((pcre_ctypes[c] & ctype_space) != 0)
1605     {
1606     if ((options & PCRE_EXTENDED) != 0) continue;
1607     spaces++;
1608     }
1609    
1610     if (c == '#' && (options & PCRE_EXTENDED) != 0)
1611     {
1612     while ((c = *(++ptr)) != 0 && c != '\n');
1613     continue;
1614     }
1615    
1616     switch(c)
1617     {
1618     /* A backslashed item may be an escaped "normal" character or a
1619     character type. For a "normal" character, put the pointers and
1620     character back so that tests for whitespace etc. in the input
1621     are done correctly. */
1622    
1623     case '\\':
1624     {
1625 nigel 7 const uschar *save_ptr = ptr;
1626 nigel 3 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
1627     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1628     if (c >= 0)
1629     {
1630     ptr = save_ptr;
1631     c = '\\';
1632     goto NORMAL_CHAR;
1633     }
1634     }
1635     length++;
1636    
1637     /* A back reference needs an additional char, plus either one or 5
1638     bytes for a repeat. We also need to keep the value of the highest
1639     back reference. */
1640    
1641     if (c <= -ESC_REF)
1642     {
1643     int refnum = -c - ESC_REF;
1644     if (refnum > top_backref) top_backref = refnum;
1645     length++; /* For single back reference */
1646     if (ptr[1] == '{' && is_counted_repeat(ptr+2))
1647     {
1648     ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
1649     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1650     if ((min == 0 && (max == 1 || max == -1)) ||
1651     (min == 1 && max == -1))
1652     length++;
1653     else length += 5;
1654     if (ptr[1] == '?') ptr++;
1655     }
1656     }
1657     continue;
1658    
1659     case '^':
1660     case '.':
1661     case '$':
1662     case '*': /* These repeats won't be after brackets; */
1663     case '+': /* those are handled separately */
1664     case '?':
1665     length++;
1666     continue;
1667    
1668     /* This covers the cases of repeats after a single char, metachar, class,
1669     or back reference. */
1670    
1671     case '{':
1672     if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
1673     ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
1674     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1675     if ((min == 0 && (max == 1 || max == -1)) ||
1676     (min == 1 && max == -1))
1677     length++;
1678     else
1679     {
1680     length--; /* Uncount the original char or metachar */
1681     if (min == 1) length++; else if (min > 0) length += 4;
1682     if (max > 0) length += 4; else length += 2;
1683     }
1684     if (ptr[1] == '?') ptr++;
1685     continue;
1686    
1687     /* An alternation contains an offset to the next branch or ket. */
1688     case '|':
1689     length += 3;
1690     continue;
1691    
1692     /* A character class uses 33 characters. Don't worry about character types
1693     that aren't allowed in classes - they'll get picked up during the compile.
1694     A character class that contains only one character uses 2 or 3 bytes,
1695     depending on whether it is negated or not. Notice this where we can. */
1696    
1697     case '[':
1698     class_charcount = 0;
1699     if (*(++ptr) == '^') ptr++;
1700     do
1701     {
1702     if (*ptr == '\\')
1703     {
1704 nigel 9 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE);
1705 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1706 nigel 9 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
1707 nigel 3 }
1708     else class_charcount++;
1709     ptr++;
1710     }
1711     while (*ptr != 0 && *ptr != ']');
1712    
1713     /* Repeats for negated single chars are handled by the general code */
1714    
1715     if (class_charcount == 1) length += 3; else
1716     {
1717     length += 33;
1718    
1719     /* A repeat needs either 1 or 5 bytes. */
1720    
1721 nigel 9 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
1722 nigel 3 {
1723     ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
1724     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1725     if ((min == 0 && (max == 1 || max == -1)) ||
1726     (min == 1 && max == -1))
1727     length++;
1728     else length += 5;
1729     if (ptr[1] == '?') ptr++;
1730     }
1731     }
1732     continue;
1733    
1734     /* Brackets may be genuine groups or special things */
1735    
1736     case '(':
1737    
1738     /* Handle special forms of bracket, which all start (? */
1739    
1740     if (ptr[1] == '?') switch (c = ptr[2])
1741     {
1742     /* Skip over comments entirely */
1743     case '#':
1744     ptr += 3;
1745     while (*ptr != 0 && *ptr != ')') ptr++;
1746     if (*ptr == 0)
1747     {
1748     *errorptr = ERR18;
1749     goto PCRE_ERROR_RETURN;
1750     }
1751     continue;
1752    
1753     /* Non-referencing groups and lookaheads just move the pointer on, and
1754     then behave like a non-special bracket, except that they don't increment
1755     the count of extracting brackets. */
1756    
1757     case ':':
1758     case '=':
1759     case '!':
1760     ptr += 2;
1761     break;
1762    
1763     /* Ditto for the "once only" bracket, allowed only if the extra bit
1764     is set. */
1765    
1766     case '>':
1767     if ((options & PCRE_EXTRA) != 0)
1768     {
1769     ptr += 2;
1770     break;
1771     }
1772 nigel 19 /* Else fall through */
1773 nigel 3
1774     /* Else loop setting valid options until ) is met. Anything else is an
1775     error. */
1776    
1777     default:
1778     ptr += 2;
1779     for (;; ptr++)
1780     {
1781     if ((c = *ptr) == 'i')
1782     {
1783     options |= PCRE_CASELESS;
1784     continue;
1785     }
1786     else if ((c = *ptr) == 'm')
1787     {
1788     options |= PCRE_MULTILINE;
1789     continue;
1790     }
1791     else if (c == 's')
1792     {
1793     options |= PCRE_DOTALL;
1794     continue;
1795     }
1796     else if (c == 'x')
1797     {
1798     options |= PCRE_EXTENDED;
1799     length -= spaces; /* Already counted spaces */
1800     continue;
1801     }
1802 nigel 19 else if (c == 'X')
1803     {
1804     options |= PCRE_EXTRA;
1805     continue;
1806     }
1807     else if (c == 'U')
1808     {
1809     options |= PCRE_UNGREEDY;
1810     continue;
1811     }
1812 nigel 3 else if (c == ')') break;
1813    
1814     *errorptr = ERR12;
1815     goto PCRE_ERROR_RETURN;
1816     }
1817     continue; /* End of this bracket handling */
1818     }
1819    
1820     /* Extracting brackets must be counted so we can process escapes in a
1821     Perlish way. */
1822    
1823     else bracount++;
1824    
1825     /* Non-special forms of bracket. Save length for computing whole length
1826     at end if there's a repeat that requires duplication of the group. */
1827    
1828     if (brastackptr >= sizeof(brastack)/sizeof(int))
1829     {
1830     *errorptr = ERR19;
1831     goto PCRE_ERROR_RETURN;
1832     }
1833    
1834     brastack[brastackptr++] = length;
1835     length += 3;
1836     continue;
1837    
1838     /* Handle ket. Look for subsequent max/min; for certain sets of values we
1839 nigel 9 have to replicate this bracket up to that many times. If brastackptr is
1840     0 this is an unmatched bracket which will generate an error, but take care
1841     not to try to access brastack[-1]. */
1842 nigel 3
1843     case ')':
1844     length += 3;
1845     {
1846 nigel 9 int minval = 1;
1847     int maxval = 1;
1848     int duplength = (brastackptr > 0)? length - brastack[--brastackptr] : 0;
1849 nigel 3
1850     /* Leave ptr at the final char; for read_repeat_counts this happens
1851     automatically; for the others we need an increment. */
1852    
1853     if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
1854     {
1855 nigel 9 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr);
1856 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1857     }
1858 nigel 9 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
1859     else if (c == '+') { maxval = -1; ptr++; }
1860     else if (c == '?') { minval = 0; ptr++; }
1861 nigel 3
1862 nigel 9 /* If there is a minimum > 1 we have to replicate up to minval-1 times;
1863     if there is a limited maximum we have to replicate up to maxval-1 times
1864     and allow for a BRAZERO item before each optional copy, as we also have
1865     to do before the first copy if the minimum is zero. */
1866 nigel 3
1867 nigel 9 if (minval == 0) length++;
1868     else if (minval > 1) length += (minval - 1) * duplength;
1869     if (maxval > minval) length += (maxval - minval) * (duplength + 1);
1870 nigel 3 }
1871     continue;
1872    
1873     /* Non-special character. For a run of such characters the length required
1874     is the number of characters + 2, except that the maximum run length is 255.
1875     We won't get a skipped space or a non-data escape or the start of a #
1876     comment as the first character, so the length can't be zero. */
1877    
1878     NORMAL_CHAR:
1879     default:
1880     length += 2;
1881     runlength = 0;
1882     do
1883     {
1884     if ((pcre_ctypes[c] & ctype_space) != 0)
1885     {
1886     if ((options & PCRE_EXTENDED) != 0) continue;
1887     spaces++;
1888     }
1889    
1890     if (c == '#' && (options & PCRE_EXTENDED) != 0)
1891     {
1892     while ((c = *(++ptr)) != 0 && c != '\n');
1893     continue;
1894     }
1895    
1896     /* Backslash may introduce a data char or a metacharacter; stop the
1897     string before the latter. */
1898    
1899     if (c == '\\')
1900     {
1901 nigel 7 const uschar *saveptr = ptr;
1902 nigel 3 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
1903     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1904     if (c < 0) { ptr = saveptr; break; }
1905     }
1906    
1907     /* Ordinary character or single-char escape */
1908    
1909     runlength++;
1910     }
1911    
1912     /* This "while" is the end of the "do" above. */
1913    
1914     while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
1915    
1916     ptr--;
1917     length += runlength;
1918     continue;
1919     }
1920     }
1921    
1922     length += 4; /* For final KET and END */
1923    
1924     if (length > 65539)
1925     {
1926     *errorptr = ERR20;
1927     return NULL;
1928     }
1929    
1930     /* Compute the size of data block needed and get it, either from malloc or
1931 nigel 9 externally provided function. We specify "code[0]" in the offsetof() expression
1932     rather than just "code", because it has been reported that one broken compiler
1933     fails on "code" because it is also an independent variable. It should make no
1934     difference to the value of the offsetof(). */
1935 nigel 3
1936 nigel 9 size = length + offsetof(real_pcre, code[0]);
1937 nigel 3 re = (real_pcre *)(pcre_malloc)(size);
1938    
1939     if (re == NULL)
1940     {
1941     *errorptr = ERR21;
1942     return NULL;
1943     }
1944    
1945 nigel 9 /* Put in the magic number and the options. */
1946    
1947 nigel 3 re->magic_number = MAGIC_NUMBER;
1948     re->options = options;
1949    
1950     /* Set up a starting, non-extracting bracket, then compile the expression. On
1951     error, *errorptr will be set non-NULL, so we don't need to look at the result
1952     of the function here. */
1953    
1954 nigel 7 ptr = (const uschar *)pattern;
1955 nigel 3 code = re->code;
1956     *code = OP_BRA;
1957     bracount = 0;
1958     (void)compile_regex(options, &bracount, &code, &ptr, errorptr);
1959     re->top_bracket = bracount;
1960     re->top_backref = top_backref;
1961    
1962     /* If not reached end of pattern on success, there's an excess bracket. */
1963    
1964     if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
1965    
1966     /* Fill in the terminating state and check for disastrous overflow, but
1967     if debugging, leave the test till after things are printed out. */
1968    
1969     *code++ = OP_END;
1970    
1971     #ifndef DEBUG
1972     if (code - re->code > length) *errorptr = ERR23;
1973     #endif
1974    
1975     /* Failed to compile */
1976    
1977     if (*errorptr != NULL)
1978     {
1979     (pcre_free)(re);
1980     PCRE_ERROR_RETURN:
1981 nigel 7 *erroroffset = ptr - (const uschar *)pattern;
1982 nigel 3 return NULL;
1983     }
1984    
1985     /* If the anchored option was not passed, set flag if we can determine that it
1986     is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
1987     we can determine what the first character has to be, because that speeds up
1988     unanchored matches no end. In the case of multiline matches, an alternative is
1989     to set the PCRE_STARTLINE flag if all branches start with ^. */
1990    
1991     if ((options & PCRE_ANCHORED) == 0)
1992     {
1993     if (is_anchored(re->code, (options & PCRE_MULTILINE) != 0))
1994     re->options |= PCRE_ANCHORED;
1995     else
1996     {
1997 nigel 9 int ch = find_firstchar(re->code);
1998     if (ch >= 0)
1999 nigel 3 {
2000 nigel 9 re->first_char = ch;
2001 nigel 3 re->options |= PCRE_FIRSTSET;
2002     }
2003     else if (is_startline(re->code))
2004     re->options |= PCRE_STARTLINE;
2005     }
2006     }
2007    
2008     /* Print out the compiled data for debugging */
2009    
2010     #ifdef DEBUG
2011    
2012     printf("Length = %d top_bracket = %d top_backref=%d\n",
2013     length, re->top_bracket, re->top_backref);
2014    
2015     if (re->options != 0)
2016     {
2017 nigel 19 printf("%s%s%s%s%s%s%s%s\n",
2018 nigel 3 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2019     ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2020     ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2021     ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2022     ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
2023     ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
2024 nigel 19 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
2025     ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
2026 nigel 3 }
2027    
2028     if ((re->options & PCRE_FIRSTSET) != 0)
2029     {
2030     if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
2031     else printf("First char = \\x%02x\n", re->first_char);
2032     }
2033    
2034     code_end = code;
2035     code_base = code = re->code;
2036    
2037     while (code < code_end)
2038     {
2039     int charlength;
2040    
2041     printf("%3d ", code - code_base);
2042    
2043     if (*code >= OP_BRA)
2044     {
2045     printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
2046     code += 2;
2047     }
2048    
2049     else switch(*code)
2050     {
2051     case OP_CHARS:
2052     charlength = *(++code);
2053     printf("%3d ", charlength);
2054     while (charlength-- > 0)
2055     if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2056     break;
2057    
2058     case OP_KETRMAX:
2059     case OP_KETRMIN:
2060     case OP_ALT:
2061     case OP_KET:
2062     case OP_ASSERT:
2063     case OP_ASSERT_NOT:
2064     case OP_ONCE:
2065     printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2066     code += 2;
2067     break;
2068    
2069     case OP_STAR:
2070     case OP_MINSTAR:
2071     case OP_PLUS:
2072     case OP_MINPLUS:
2073     case OP_QUERY:
2074     case OP_MINQUERY:
2075     case OP_TYPESTAR:
2076     case OP_TYPEMINSTAR:
2077     case OP_TYPEPLUS:
2078     case OP_TYPEMINPLUS:
2079     case OP_TYPEQUERY:
2080     case OP_TYPEMINQUERY:
2081     if (*code >= OP_TYPESTAR)
2082     printf(" %s", OP_names[code[1]]);
2083     else if (isprint(c = code[1])) printf(" %c", c);
2084     else printf(" \\x%02x", c);
2085     printf("%s", OP_names[*code++]);
2086     break;
2087    
2088     case OP_EXACT:
2089     case OP_UPTO:
2090     case OP_MINUPTO:
2091     if (isprint(c = code[3])) printf(" %c{", c);
2092     else printf(" \\x%02x{", c);
2093 nigel 11 if (*code != OP_EXACT) printf("0,");
2094 nigel 3 printf("%d}", (code[1] << 8) + code[2]);
2095     if (*code == OP_MINUPTO) printf("?");
2096     code += 3;
2097     break;
2098    
2099     case OP_TYPEEXACT:
2100     case OP_TYPEUPTO:
2101     case OP_TYPEMINUPTO:
2102     printf(" %s{", OP_names[code[3]]);
2103     if (*code != OP_TYPEEXACT) printf(",");
2104     printf("%d}", (code[1] << 8) + code[2]);
2105     if (*code == OP_TYPEMINUPTO) printf("?");
2106     code += 3;
2107     break;
2108    
2109     case OP_NOT:
2110     if (isprint(c = *(++code))) printf(" [^%c]", c);
2111     else printf(" [^\\x%02x]", c);
2112     break;
2113    
2114     case OP_NOTSTAR:
2115     case OP_NOTMINSTAR:
2116     case OP_NOTPLUS:
2117     case OP_NOTMINPLUS:
2118     case OP_NOTQUERY:
2119     case OP_NOTMINQUERY:
2120     if (isprint(c = code[1])) printf(" [^%c]", c);
2121     else printf(" [^\\x%02x]", c);
2122     printf("%s", OP_names[*code++]);
2123     break;
2124    
2125     case OP_NOTEXACT:
2126     case OP_NOTUPTO:
2127     case OP_NOTMINUPTO:
2128     if (isprint(c = code[3])) printf(" [^%c]{", c);
2129     else printf(" [^\\x%02x]{", c);
2130     if (*code != OP_NOTEXACT) printf(",");
2131     printf("%d}", (code[1] << 8) + code[2]);
2132     if (*code == OP_NOTMINUPTO) printf("?");
2133     code += 3;
2134     break;
2135    
2136     case OP_REF:
2137     printf(" \\%d", *(++code));
2138 nigel 9 code ++;
2139     goto CLASS_REF_REPEAT;
2140 nigel 3
2141     case OP_CLASS:
2142 nigel 13 case OP_NEGCLASS:
2143 nigel 3 {
2144     int i, min, max;
2145    
2146 nigel 13 if (*code++ == OP_CLASS) printf(" [");
2147     else printf(" ^[");
2148 nigel 3
2149     for (i = 0; i < 256; i++)
2150     {
2151     if ((code[i/8] & (1 << (i&7))) != 0)
2152     {
2153     int j;
2154     for (j = i+1; j < 256; j++)
2155     if ((code[j/8] & (1 << (j&7))) == 0) break;
2156     if (i == '-' || i == ']') printf("\\");
2157     if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2158     if (--j > i)
2159     {
2160     printf("-");
2161     if (j == '-' || j == ']') printf("\\");
2162     if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2163     }
2164     i = j;
2165     }
2166     }
2167     printf("]");
2168     code += 32;
2169    
2170 nigel 9 CLASS_REF_REPEAT:
2171    
2172 nigel 3 switch(*code)
2173     {
2174     case OP_CRSTAR:
2175     case OP_CRMINSTAR:
2176     case OP_CRPLUS:
2177     case OP_CRMINPLUS:
2178     case OP_CRQUERY:
2179     case OP_CRMINQUERY:
2180     printf("%s", OP_names[*code]);
2181     break;
2182    
2183     case OP_CRRANGE:
2184     case OP_CRMINRANGE:
2185     min = (code[1] << 8) + code[2];
2186     max = (code[3] << 8) + code[4];
2187     if (max == 0) printf("{%d,}", min);
2188     else printf("{%d,%d}", min, max);
2189     if (*code == OP_CRMINRANGE) printf("?");
2190     code += 4;
2191     break;
2192    
2193     default:
2194     code--;
2195     }
2196     }
2197     break;
2198    
2199     /* Anything else is just a one-node item */
2200    
2201     default:
2202     printf(" %s", OP_names[*code]);
2203     break;
2204     }
2205    
2206     code++;
2207     printf("\n");
2208     }
2209     printf("------------------------------------------------------------------\n");
2210    
2211     /* This check is done here in the debugging case so that the code that
2212     was compiled can be seen. */
2213    
2214     if (code - re->code > length)
2215     {
2216     *errorptr = ERR23;
2217     (pcre_free)(re);
2218     *erroroffset = ptr - (uschar *)pattern;
2219     return NULL;
2220     }
2221     #endif
2222    
2223     return (pcre *)re;
2224     }
2225    
2226    
2227    
2228     /*************************************************
2229     * Match a character type *
2230     *************************************************/
2231    
2232     /* Not used in all the places it might be as it's sometimes faster
2233     to put the code inline.
2234    
2235     Arguments:
2236     type the character type
2237     c the character
2238     dotall the dotall flag
2239    
2240     Returns: TRUE if character is of the type
2241     */
2242    
2243     static BOOL
2244     match_type(int type, int c, BOOL dotall)
2245     {
2246    
2247     #ifdef DEBUG
2248     if (isprint(c)) printf("matching subject %c against ", c);
2249     else printf("matching subject \\x%02x against ", c);
2250     printf("%s\n", OP_names[type]);
2251     #endif
2252    
2253     switch(type)
2254     {
2255     case OP_ANY: return dotall || c != '\n';
2256     case OP_NOT_DIGIT: return (pcre_ctypes[c] & ctype_digit) == 0;
2257     case OP_DIGIT: return (pcre_ctypes[c] & ctype_digit) != 0;
2258     case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;
2259     case OP_WHITESPACE: return (pcre_ctypes[c] & ctype_space) != 0;
2260     case OP_NOT_WORDCHAR: return (pcre_ctypes[c] & ctype_word) == 0;
2261     case OP_WORDCHAR: return (pcre_ctypes[c] & ctype_word) != 0;
2262     }
2263     return FALSE;
2264     }
2265    
2266    
2267    
2268     /*************************************************
2269     * Match a back-reference *
2270     *************************************************/
2271    
2272     /* If a back reference hasn't been set, the match fails.
2273    
2274     Arguments:
2275     number reference number
2276     eptr points into the subject
2277     length length to be matched
2278     md points to match data block
2279    
2280     Returns: TRUE if matched
2281     */
2282    
2283     static BOOL
2284 nigel 7 match_ref(int number, register const uschar *eptr, int length, match_data *md)
2285 nigel 3 {
2286 nigel 7 const uschar *p = md->start_subject + md->offset_vector[number];
2287 nigel 3
2288     #ifdef DEBUG
2289     if (eptr >= md->end_subject)
2290     printf("matching subject <null>");
2291     else
2292     {
2293     printf("matching subject ");
2294     pchars(eptr, length, TRUE, md);
2295     }
2296     printf(" against backref ");
2297     pchars(p, length, FALSE, md);
2298     printf("\n");
2299     #endif
2300    
2301     /* Always fail if not enough characters left */
2302    
2303     if (length > md->end_subject - p) return FALSE;
2304    
2305     /* Separate the caselesss case for speed */
2306    
2307     if (md->caseless)
2308     { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; }
2309     else
2310     { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
2311    
2312     return TRUE;
2313     }
2314    
2315    
2316    
2317     /*************************************************
2318     * Match from current position *
2319     *************************************************/
2320    
2321     /* On entry ecode points to the first opcode, and eptr to the first character.
2322    
2323     Arguments:
2324     eptr pointer in subject
2325     ecode position in code
2326     offset_top current top pointer
2327     md pointer to "static" info for the match
2328    
2329     Returns: TRUE if matched
2330     */
2331    
2332     static BOOL
2333 nigel 7 match(register const uschar *eptr, register const uschar *ecode, int offset_top,
2334 nigel 3 match_data *md)
2335     {
2336     for (;;)
2337     {
2338     int min, max, ctype;
2339     register int i;
2340     register int c;
2341 nigel 7 BOOL minimize = FALSE;
2342 nigel 3
2343     /* Opening bracket. Check the alternative branches in turn, failing if none
2344     match. We have to set the start offset if required and there is space
2345     in the offset vector so that it is available for subsequent back references
2346     if the bracket matches. However, if the bracket fails, we must put back the
2347     previous value of both offsets in case they were set by a previous copy of
2348     the same bracket. Don't worry about setting the flag for the error case here;
2349     that is handled in the code for KET. */
2350    
2351     if ((int)*ecode >= OP_BRA)
2352     {
2353     int number = (*ecode - OP_BRA) << 1;
2354 nigel 7 int save_offset1 = 0, save_offset2 = 0;
2355 nigel 3
2356 nigel 9 DPRINTF(("start bracket %d\n", number/2));
2357 nigel 3
2358     if (number > 0 && number < md->offset_end)
2359     {
2360     save_offset1 = md->offset_vector[number];
2361     save_offset2 = md->offset_vector[number+1];
2362     md->offset_vector[number] = eptr - md->start_subject;
2363    
2364 nigel 9 DPRINTF(("saving %d %d\n", save_offset1, save_offset2));
2365 nigel 3 }
2366    
2367     /* Recurse for all the alternatives. */
2368    
2369     do
2370     {
2371     if (match(eptr, ecode+3, offset_top, md)) return TRUE;
2372     ecode += (ecode[1] << 8) + ecode[2];
2373     }
2374     while (*ecode == OP_ALT);
2375    
2376 nigel 9 DPRINTF(("bracket %d failed\n", number/2));
2377 nigel 3
2378     if (number > 0 && number < md->offset_end)
2379     {
2380     md->offset_vector[number] = save_offset1;
2381     md->offset_vector[number+1] = save_offset2;
2382     }
2383    
2384     return FALSE;
2385     }
2386    
2387     /* Other types of node can be handled by a switch */
2388    
2389     switch(*ecode)
2390     {
2391     case OP_END:
2392     md->end_match_ptr = eptr; /* Record where we ended */
2393     md->end_offset_top = offset_top; /* and how many extracts were taken */
2394     return TRUE;
2395    
2396     /* The equivalent of Prolog's "cut" - if the rest doesn't match, the
2397     whole thing doesn't match, so we have to get out via a longjmp(). */
2398    
2399     case OP_CUT:
2400     if (match(eptr, ecode+1, offset_top, md)) return TRUE;
2401     longjmp(md->fail_env, 1);
2402    
2403     /* Assertion brackets. Check the alternative branches in turn - the
2404     matching won't pass the KET for an assertion. If any one branch matches,
2405     the assertion is true. */
2406    
2407     case OP_ASSERT:
2408     do
2409     {
2410     if (match(eptr, ecode+3, offset_top, md)) break;
2411     ecode += (ecode[1] << 8) + ecode[2];
2412     }
2413     while (*ecode == OP_ALT);
2414     if (*ecode == OP_KET) return FALSE;
2415    
2416     /* Continue from after the assertion, updating the offsets high water
2417     mark, since extracts may have been taken during the assertion. */
2418    
2419     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2420     ecode += 3;
2421     offset_top = md->end_offset_top;
2422     continue;
2423    
2424     /* Negative assertion: all branches must fail to match */
2425    
2426     case OP_ASSERT_NOT:
2427     do
2428     {
2429     if (match(eptr, ecode+3, offset_top, md)) return FALSE;
2430     ecode += (ecode[1] << 8) + ecode[2];
2431     }
2432     while (*ecode == OP_ALT);
2433     ecode += 3;
2434     continue;
2435    
2436     /* "Once" brackets are like assertion brackets except that after a match,
2437     the point in the subject string is not moved back. Thus there can never be
2438 nigel 5 a move back into the brackets. Check the alternative branches in turn - the
2439 nigel 3 matching won't pass the KET for this kind of subpattern. If any one branch
2440     matches, we carry on, leaving the subject pointer. */
2441    
2442     case OP_ONCE:
2443     do
2444     {
2445     if (match(eptr, ecode+3, offset_top, md)) break;
2446     ecode += (ecode[1] << 8) + ecode[2];
2447     }
2448     while (*ecode == OP_ALT);
2449     if (*ecode == OP_KET) return FALSE;
2450    
2451     /* Continue as from after the assertion, updating the offsets high water
2452     mark, since extracts may have been taken. */
2453    
2454     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2455     ecode += 3;
2456     offset_top = md->end_offset_top;
2457     eptr = md->end_match_ptr;
2458     continue;
2459    
2460     /* An alternation is the end of a branch; scan along to find the end of the
2461     bracketed group and go to there. */
2462    
2463     case OP_ALT:
2464     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2465     break;
2466    
2467     /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
2468     that it may occur zero times. It may repeat infinitely, or not at all -
2469     i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
2470     repeat limits are compiled as a number of copies, with the optional ones
2471     preceded by BRAZERO or BRAMINZERO. */
2472    
2473     case OP_BRAZERO:
2474     {
2475 nigel 7 const uschar *next = ecode+1;
2476 nigel 3 if (match(eptr, next, offset_top, md)) return TRUE;
2477     do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
2478     ecode = next + 3;
2479     }
2480     break;
2481    
2482     case OP_BRAMINZERO:
2483     {
2484 nigel 7 const uschar *next = ecode+1;
2485 nigel 3 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
2486     if (match(eptr, next+3, offset_top, md)) return TRUE;
2487     ecode++;
2488     }
2489     break;;
2490    
2491     /* End of a group, repeated or non-repeating. If we are at the end of
2492     an assertion "group", stop matching and return TRUE, but record the
2493     current high water mark for use by positive assertions. */
2494    
2495     case OP_KET:
2496     case OP_KETRMIN:
2497     case OP_KETRMAX:
2498     {
2499     int number;
2500 nigel 7 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
2501 nigel 3
2502     if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || *prev == OP_ONCE)
2503     {
2504     md->end_match_ptr = eptr; /* For ONCE */
2505     md->end_offset_top = offset_top;
2506     return TRUE;
2507     }
2508    
2509     /* In all other cases we have to check the group number back at the
2510     start and if necessary complete handling an extraction by setting the
2511     final offset and bumping the high water mark. */
2512    
2513     number = (*prev - OP_BRA) << 1;
2514    
2515 nigel 9 DPRINTF(("end bracket %d\n", number/2));
2516 nigel 3
2517     if (number > 0)
2518     {
2519     if (number >= md->offset_end) md->offset_overflow = TRUE; else
2520     {
2521     md->offset_vector[number+1] = eptr - md->start_subject;
2522     if (offset_top <= number) offset_top = number + 2;
2523     }
2524     }
2525    
2526     /* For a non-repeating ket, just advance to the next node and continue at
2527     this level. */
2528    
2529     if (*ecode == OP_KET)
2530     {
2531     ecode += 3;
2532     break;
2533     }
2534    
2535     /* The repeating kets try the rest of the pattern or restart from the
2536     preceding bracket, in the appropriate order. */
2537    
2538     if (*ecode == OP_KETRMIN)
2539     {
2540     if (match(eptr, ecode+3, offset_top, md) ||
2541     match(eptr, prev, offset_top, md)) return TRUE;
2542     }
2543     else /* OP_KETRMAX */
2544     {
2545     if (match(eptr, prev, offset_top, md) ||
2546     match(eptr, ecode+3, offset_top, md)) return TRUE;
2547     }
2548     }
2549     return FALSE;
2550    
2551     /* Start of subject unless notbol, or after internal newline if multiline */
2552    
2553     case OP_CIRC:
2554     if (md->notbol && eptr == md->start_subject) return FALSE;
2555     if (md->multiline)
2556     {
2557     if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;
2558     ecode++;
2559     break;
2560     }
2561     /* ... else fall through */
2562    
2563     /* Start of subject assertion */
2564    
2565     case OP_SOD:
2566     if (eptr != md->start_subject) return FALSE;
2567     ecode++;
2568     break;
2569    
2570     /* Assert before internal newline if multiline, or before
2571     a terminating newline unless endonly is set, else end of subject unless
2572     noteol is set. */
2573    
2574     case OP_DOLL:
2575     if (md->noteol && eptr >= md->end_subject) return FALSE;
2576     if (md->multiline)
2577     {
2578     if (eptr < md->end_subject && *eptr != '\n') return FALSE;
2579     ecode++;
2580     break;
2581     }
2582     else if (!md->endonly)
2583     {
2584     if (eptr < md->end_subject - 1 ||
2585     (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
2586     ecode++;
2587     break;
2588     }
2589     /* ... else fall through */
2590    
2591     /* End of subject assertion */
2592    
2593     case OP_EOD:
2594     if (eptr < md->end_subject) return FALSE;
2595     ecode++;
2596     break;
2597    
2598     /* Word boundary assertions */
2599    
2600     case OP_NOT_WORD_BOUNDARY:
2601     case OP_WORD_BOUNDARY:
2602     {
2603     BOOL prev_is_word = (eptr != md->start_subject) &&
2604     ((pcre_ctypes[eptr[-1]] & ctype_word) != 0);
2605     BOOL cur_is_word = (eptr < md->end_subject) &&
2606     ((pcre_ctypes[*eptr] & ctype_word) != 0);
2607     if ((*ecode++ == OP_WORD_BOUNDARY)?
2608     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2609     return FALSE;
2610     }
2611     break;
2612    
2613     /* Match a single character type; inline for speed */
2614    
2615     case OP_ANY:
2616     if (!md->dotall && eptr < md->end_subject && *eptr == '\n') return FALSE;
2617     if (eptr++ >= md->end_subject) return FALSE;
2618     ecode++;
2619     break;
2620    
2621     case OP_NOT_DIGIT:
2622     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) != 0)
2623     return FALSE;
2624     ecode++;
2625     break;
2626    
2627     case OP_DIGIT:
2628     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) == 0)
2629     return FALSE;
2630     ecode++;
2631     break;
2632    
2633     case OP_NOT_WHITESPACE:
2634     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) != 0)
2635     return FALSE;
2636     ecode++;
2637     break;
2638    
2639     case OP_WHITESPACE:
2640     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) == 0)
2641     return FALSE;
2642     ecode++;
2643     break;
2644    
2645     case OP_NOT_WORDCHAR:
2646     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) != 0)
2647     return FALSE;
2648     ecode++;
2649     break;
2650    
2651     case OP_WORDCHAR:
2652     if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) == 0)
2653     return FALSE;
2654     ecode++;
2655     break;
2656    
2657     /* Match a back reference, possibly repeatedly. Look past the end of the
2658     item to see if there is repeat information following. The code is similar
2659     to that for character classes, but repeated for efficiency. Then obey
2660     similar code to character type repeats - written out again for speed.
2661     However, if the referenced string is the empty string, always treat
2662     it as matched, any number of times (otherwise there could be infinite
2663     loops). */
2664    
2665     case OP_REF:
2666     {
2667     int length;
2668     int number = ecode[1] << 1; /* Doubled reference number */
2669     ecode += 2; /* Advance past the item */
2670    
2671     if (number >= offset_top || md->offset_vector[number] < 0)
2672     {
2673     md->errorcode = PCRE_ERROR_BADREF;
2674     return FALSE;
2675     }
2676    
2677     length = md->offset_vector[number+1] - md->offset_vector[number];
2678    
2679     switch (*ecode)
2680     {
2681     case OP_CRSTAR:
2682     case OP_CRMINSTAR:
2683     case OP_CRPLUS:
2684     case OP_CRMINPLUS:
2685     case OP_CRQUERY:
2686     case OP_CRMINQUERY:
2687     c = *ecode++ - OP_CRSTAR;
2688     minimize = (c & 1) != 0;
2689     min = rep_min[c]; /* Pick up values from tables; */
2690     max = rep_max[c]; /* zero for max => infinity */
2691     if (max == 0) max = INT_MAX;
2692     break;
2693    
2694     case OP_CRRANGE:
2695     case OP_CRMINRANGE:
2696     minimize = (*ecode == OP_CRMINRANGE);
2697     min = (ecode[1] << 8) + ecode[2];
2698     max = (ecode[3] << 8) + ecode[4];
2699     if (max == 0) max = INT_MAX;
2700     ecode += 5;
2701     break;
2702    
2703     default: /* No repeat follows */
2704     if (!match_ref(number, eptr, length, md)) return FALSE;
2705     eptr += length;
2706     continue; /* With the main loop */
2707     }
2708    
2709     /* If the length of the reference is zero, just continue with the
2710     main loop. */
2711    
2712     if (length == 0) continue;
2713    
2714     /* First, ensure the minimum number of matches are present. We get back
2715     the length of the reference string explicitly rather than passing the
2716     address of eptr, so that eptr can be a register variable. */
2717    
2718     for (i = 1; i <= min; i++)
2719     {
2720     if (!match_ref(number, eptr, length, md)) return FALSE;
2721     eptr += length;
2722     }
2723    
2724     /* If min = max, continue at the same level without recursion.
2725     They are not both allowed to be zero. */
2726    
2727     if (min == max) continue;
2728    
2729     /* If minimizing, keep trying and advancing the pointer */
2730    
2731     if (minimize)
2732     {
2733     for (i = min;; i++)
2734     {
2735     if (match(eptr, ecode, offset_top, md)) return TRUE;
2736     if (i >= max || !match_ref(number, eptr, length, md))
2737     return FALSE;
2738     eptr += length;
2739     }
2740     /* Control never gets here */
2741     }
2742    
2743     /* If maximizing, find the longest string and work backwards */
2744    
2745     else
2746     {
2747 nigel 7 const uschar *pp = eptr;
2748 nigel 3 for (i = min; i < max; i++)
2749     {
2750     if (!match_ref(number, eptr, length, md)) break;
2751     eptr += length;
2752     }
2753     while (eptr >= pp)
2754     {
2755     if (match(eptr, ecode, offset_top, md)) return TRUE;
2756     eptr -= length;
2757     }
2758     return FALSE;
2759     }
2760     }
2761     /* Control never gets here */
2762    
2763     /* Match a character class, possibly repeatedly. Look past the end of the
2764     item to see if there is repeat information following. Then obey similar
2765     code to character type repeats - written out again for speed. If caseless
2766     matching was set at runtime but not at compile time, we have to check both
2767 nigel 13 versions of a character, and we have to behave differently for positive and
2768     negative classes. This is the only time where OP_CLASS and OP_NEGCLASS are
2769     treated differently. */
2770 nigel 3
2771     case OP_CLASS:
2772 nigel 13 case OP_NEGCLASS:
2773 nigel 3 {
2774 nigel 13 BOOL nasty_case = *ecode == OP_NEGCLASS && md->runtime_caseless;
2775 nigel 7 const uschar *data = ecode + 1; /* Save for matching */
2776     ecode += 33; /* Advance past the item */
2777 nigel 3
2778     switch (*ecode)
2779     {
2780     case OP_CRSTAR:
2781     case OP_CRMINSTAR:
2782     case OP_CRPLUS:
2783     case OP_CRMINPLUS:
2784     case OP_CRQUERY:
2785     case OP_CRMINQUERY:
2786     c = *ecode++ - OP_CRSTAR;
2787     minimize = (c & 1) != 0;
2788     min = rep_min[c]; /* Pick up values from tables; */
2789     max = rep_max[c]; /* zero for max => infinity */
2790     if (max == 0) max = INT_MAX;
2791     break;
2792    
2793     case OP_CRRANGE:
2794     case OP_CRMINRANGE:
2795     minimize = (*ecode == OP_CRMINRANGE);
2796     min = (ecode[1] << 8) + ecode[2];
2797     max = (ecode[3] << 8) + ecode[4];
2798     if (max == 0) max = INT_MAX;
2799     ecode += 5;
2800     break;
2801    
2802     default: /* No repeat follows */
2803 nigel 13 min = max = 1;
2804     break;
2805 nigel 3 }
2806    
2807     /* First, ensure the minimum number of matches are present. */
2808    
2809     for (i = 1; i <= min; i++)
2810     {
2811     if (eptr >= md->end_subject) return FALSE;
2812     c = *eptr++;
2813 nigel 13
2814     /* Either not runtime caseless, or it was a positive class. For
2815     runtime caseless, continue if either case is in the map. */
2816    
2817     if (!nasty_case)
2818 nigel 3 {
2819 nigel 13 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2820     if (md->runtime_caseless)
2821     {
2822     c = pcre_fcc[c];
2823     if ((data[c/8] & (1 << (c&7))) != 0) continue;
2824     }
2825     }
2826    
2827     /* Runtime caseless and it was a negative class. Continue only if
2828     both cases are in the map. */
2829    
2830     else
2831     {
2832     if ((data[c/8] & (1 << (c&7))) == 0) return FALSE;
2833 nigel 3 c = pcre_fcc[c];
2834     if ((data[c/8] & (1 << (c&7))) != 0) continue;
2835     }
2836 nigel 13
2837 nigel 3 return FALSE;
2838     }
2839    
2840     /* If max == min we can continue with the main loop without the
2841     need to recurse. */
2842    
2843     if (min == max) continue;
2844    
2845     /* If minimizing, keep testing the rest of the expression and advancing
2846     the pointer while it matches the class. */
2847    
2848     if (minimize)
2849     {
2850     for (i = min;; i++)
2851     {
2852     if (match(eptr, ecode, offset_top, md)) return TRUE;
2853     if (i >= max || eptr >= md->end_subject) return FALSE;
2854     c = *eptr++;
2855 nigel 13
2856     /* Either not runtime caseless, or it was a positive class. For
2857     runtime caseless, continue if either case is in the map. */
2858    
2859     if (!nasty_case)
2860 nigel 3 {
2861 nigel 13 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2862     if (md->runtime_caseless)
2863     {
2864     c = pcre_fcc[c];
2865     if ((data[c/8] & (1 << (c&7))) != 0) continue;
2866     }
2867     }
2868    
2869     /* Runtime caseless and it was a negative class. Continue only if
2870     both cases are in the map. */
2871    
2872     else
2873     {
2874     if ((data[c/8] & (1 << (c&7))) == 0) return FALSE;
2875 nigel 3 c = pcre_fcc[c];
2876     if ((data[c/8] & (1 << (c&7))) != 0) continue;
2877     }
2878 nigel 13
2879 nigel 3 return FALSE;
2880     }
2881     /* Control never gets here */
2882     }
2883    
2884     /* If maximizing, find the longest possible run, then work backwards. */
2885    
2886     else
2887     {
2888 nigel 7 const uschar *pp = eptr;
2889 nigel 3 for (i = min; i < max; eptr++, i++)
2890     {
2891     if (eptr >= md->end_subject) break;
2892     c = *eptr;
2893 nigel 13
2894     /* Either not runtime caseless, or it was a positive class. For
2895     runtime caseless, continue if either case is in the map. */
2896    
2897     if (!nasty_case)
2898 nigel 3 {
2899 nigel 13 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2900     if (md->runtime_caseless)
2901     {
2902     c = pcre_fcc[c];
2903     if ((data[c/8] & (1 << (c&7))) != 0) continue;
2904     }
2905     }
2906    
2907     /* Runtime caseless and it was a negative class. Continue only if
2908     both cases are in the map. */
2909    
2910     else
2911     {
2912     if ((data[c/8] & (1 << (c&7))) == 0) break;
2913 nigel 3 c = pcre_fcc[c];
2914     if ((data[c/8] & (1 << (c&7))) != 0) continue;
2915     }
2916 nigel 13
2917 nigel 3 break;
2918     }
2919    
2920     while (eptr >= pp)
2921     if (match(eptr--, ecode, offset_top, md)) return TRUE;
2922     return FALSE;
2923     }
2924     }
2925     /* Control never gets here */
2926    
2927     /* Match a run of characters */
2928    
2929     case OP_CHARS:
2930     {
2931     register int length = ecode[1];
2932     ecode += 2;
2933    
2934 nigel 9 #ifdef DEBUG /* Sigh. Some compilers never learn. */
2935 nigel 3 if (eptr >= md->end_subject)
2936     printf("matching subject <null> against pattern ");
2937     else
2938     {
2939     printf("matching subject ");
2940     pchars(eptr, length, TRUE, md);
2941     printf(" against pattern ");
2942     }
2943     pchars(ecode, length, FALSE, md);
2944     printf("\n");
2945 nigel 9 #endif
2946 nigel 3
2947     if (length > md->end_subject - eptr) return FALSE;
2948     if (md->caseless)
2949     {
2950     while (length-- > 0) if (pcre_lcc[*ecode++] != pcre_lcc[*eptr++]) return FALSE;
2951     }
2952     else
2953     {
2954     while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
2955     }
2956     }
2957     break;
2958    
2959     /* Match a single character repeatedly; different opcodes share code. */
2960    
2961     case OP_EXACT:
2962     min = max = (ecode[1] << 8) + ecode[2];
2963     ecode += 3;
2964     goto REPEATCHAR;
2965    
2966     case OP_UPTO:
2967     case OP_MINUPTO:
2968     min = 0;
2969     max = (ecode[1] << 8) + ecode[2];
2970     minimize = *ecode == OP_MINUPTO;
2971     ecode += 3;
2972     goto REPEATCHAR;
2973    
2974     case OP_STAR:
2975     case OP_MINSTAR:
2976     case OP_PLUS:
2977     case OP_MINPLUS:
2978     case OP_QUERY:
2979     case OP_MINQUERY:
2980     c = *ecode++ - OP_STAR;
2981     minimize = (c & 1) != 0;
2982     min = rep_min[c]; /* Pick up values from tables; */
2983     max = rep_max[c]; /* zero for max => infinity */
2984     if (max == 0) max = INT_MAX;
2985    
2986     /* Common code for all repeated single-character matches. We can give
2987     up quickly if there are fewer than the minimum number of characters left in
2988     the subject. */
2989    
2990     REPEATCHAR:
2991     if (min > md->end_subject - eptr) return FALSE;
2992     c = *ecode++;
2993    
2994     /* The code is duplicated for the caseless and caseful cases, for speed,
2995     since matching characters is likely to be quite common. First, ensure the
2996     minimum number of matches are present. If min = max, continue at the same
2997     level without recursing. Otherwise, if minimizing, keep trying the rest of
2998     the expression and advancing one matching character if failing, up to the
2999     maximum. Alternatively, if maximizing, find the maximum number of
3000     characters and work backwards. */
3001    
3002 nigel 9 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
3003     max, eptr));
3004 nigel 3
3005     if (md->caseless)
3006     {
3007     c = pcre_lcc[c];
3008     for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) return FALSE;
3009     if (min == max) continue;
3010     if (minimize)
3011     {
3012     for (i = min;; i++)
3013     {
3014     if (match(eptr, ecode, offset_top, md)) return TRUE;
3015     if (i >= max || eptr >= md->end_subject || c != pcre_lcc[*eptr++])
3016     return FALSE;
3017     }
3018     /* Control never gets here */
3019     }
3020     else
3021     {
3022 nigel 7 const uschar *pp = eptr;
3023 nigel 3 for (i = min; i < max; i++)
3024     {
3025     if (eptr >= md->end_subject || c != pcre_lcc[*eptr]) break;
3026     eptr++;
3027     }
3028     while (eptr >= pp)
3029     if (match(eptr--, ecode, offset_top, md)) return TRUE;
3030     return FALSE;
3031     }
3032     /* Control never gets here */
3033     }
3034    
3035     /* Caseful comparisons */
3036    
3037     else
3038     {
3039     for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
3040     if (min == max) continue;
3041     if (minimize)
3042     {
3043     for (i = min;; i++)
3044     {
3045     if (match(eptr, ecode, offset_top, md)) return TRUE;
3046     if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
3047     }
3048     /* Control never gets here */
3049     }
3050     else
3051     {
3052 nigel 7 const uschar *pp = eptr;
3053 nigel 3 for (i = min; i < max; i++)
3054     {
3055     if (eptr >= md->end_subject || c != *eptr) break;
3056     eptr++;
3057     }
3058     while (eptr >= pp)
3059     if (match(eptr--, ecode, offset_top, md)) return TRUE;
3060     return FALSE;
3061     }
3062     }
3063     /* Control never gets here */
3064    
3065     /* Match a negated single character */
3066    
3067     case OP_NOT:
3068 nigel 9 if (eptr >= md->end_subject) return FALSE;
3069 nigel 3 ecode++;
3070     if (md->caseless)
3071     {
3072     if (pcre_lcc[*ecode++] == pcre_lcc[*eptr++]) return FALSE;
3073     }
3074     else
3075     {
3076     if (*ecode++ == *eptr++) return FALSE;
3077     }
3078     break;
3079    
3080     /* Match a negated single character repeatedly. This is almost a repeat of
3081     the code for a repeated single character, but I haven't found a nice way of
3082     commoning these up that doesn't require a test of the positive/negative
3083     option for each character match. Maybe that wouldn't add very much to the
3084     time taken, but character matching *is* what this is all about... */
3085    
3086     case OP_NOTEXACT:
3087     min = max = (ecode[1] << 8) + ecode[2];
3088     ecode += 3;
3089     goto REPEATNOTCHAR;
3090    
3091     case OP_NOTUPTO:
3092     case OP_NOTMINUPTO:
3093     min = 0;
3094     max = (ecode[1] << 8) + ecode[2];
3095     minimize = *ecode == OP_NOTMINUPTO;
3096     ecode += 3;
3097     goto REPEATNOTCHAR;
3098    
3099     case OP_NOTSTAR:
3100     case OP_NOTMINSTAR:
3101     case OP_NOTPLUS:
3102     case OP_NOTMINPLUS:
3103     case OP_NOTQUERY:
3104     case OP_NOTMINQUERY:
3105     c = *ecode++ - OP_NOTSTAR;
3106     minimize = (c & 1) != 0;
3107     min = rep_min[c]; /* Pick up values from tables; */
3108     max = rep_max[c]; /* zero for max => infinity */
3109     if (max == 0) max = INT_MAX;
3110    
3111     /* Common code for all repeated single-character matches. We can give
3112     up quickly if there are fewer than the minimum number of characters left in
3113     the subject. */
3114    
3115     REPEATNOTCHAR:
3116     if (min > md->end_subject - eptr) return FALSE;
3117     c = *ecode++;
3118    
3119     /* The code is duplicated for the caseless and caseful cases, for speed,
3120     since matching characters is likely to be quite common. First, ensure the
3121     minimum number of matches are present. If min = max, continue at the same
3122     level without recursing. Otherwise, if minimizing, keep trying the rest of
3123     the expression and advancing one matching character if failing, up to the
3124     maximum. Alternatively, if maximizing, find the maximum number of
3125     characters and work backwards. */
3126    
3127 nigel 9 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
3128     max, eptr));
3129 nigel 3
3130     if (md->caseless)
3131     {
3132     c = pcre_lcc[c];
3133     for (i = 1; i <= min; i++) if (c == pcre_lcc[*eptr++]) return FALSE;
3134     if (min == max) continue;
3135     if (minimize)
3136     {
3137     for (i = min;; i++)
3138     {
3139     if (match(eptr, ecode, offset_top, md)) return TRUE;
3140     if (i >= max || eptr >= md->end_subject || c == pcre_lcc[*eptr++])
3141     return FALSE;
3142     }
3143     /* Control never gets here */
3144     }
3145     else
3146     {
3147 nigel 7 const uschar *pp = eptr;
3148 nigel 3 for (i = min; i < max; i++)
3149     {
3150     if (eptr >= md->end_subject || c == pcre_lcc[*eptr]) break;
3151     eptr++;
3152     }
3153     while (eptr >= pp)
3154     if (match(eptr--, ecode, offset_top, md)) return TRUE;
3155     return FALSE;
3156     }
3157     /* Control never gets here */
3158     }
3159    
3160     /* Caseful comparisons */
3161    
3162     else
3163     {
3164     for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
3165     if (min == max) continue;
3166     if (minimize)
3167     {
3168     for (i = min;; i++)
3169     {
3170     if (match(eptr, ecode, offset_top, md)) return TRUE;
3171     if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
3172     }
3173     /* Control never gets here */
3174     }
3175     else
3176     {
3177 nigel 7 const uschar *pp = eptr;
3178 nigel 3 for (i = min; i < max; i++)
3179     {
3180     if (eptr >= md->end_subject || c == *eptr) break;
3181     eptr++;
3182     }
3183     while (eptr >= pp)
3184     if (match(eptr--, ecode, offset_top, md)) return TRUE;
3185     return FALSE;
3186     }
3187     }
3188     /* Control never gets here */
3189    
3190     /* Match a single character type repeatedly; several different opcodes
3191     share code. This is very similar to the code for single characters, but we
3192     repeat it in the interests of efficiency. */
3193    
3194     case OP_TYPEEXACT:
3195     min = max = (ecode[1] << 8) + ecode[2];
3196     minimize = TRUE;
3197     ecode += 3;
3198     goto REPEATTYPE;
3199    
3200     case OP_TYPEUPTO:
3201     case OP_TYPEMINUPTO:
3202     min = 0;
3203     max = (ecode[1] << 8) + ecode[2];
3204     minimize = *ecode == OP_TYPEMINUPTO;
3205     ecode += 3;
3206     goto REPEATTYPE;
3207    
3208     case OP_TYPESTAR:
3209     case OP_TYPEMINSTAR:
3210     case OP_TYPEPLUS:
3211     case OP_TYPEMINPLUS:
3212     case OP_TYPEQUERY:
3213     case OP_TYPEMINQUERY:
3214     c = *ecode++ - OP_TYPESTAR;
3215     minimize = (c & 1) != 0;
3216     min = rep_min[c]; /* Pick up values from tables; */
3217     max = rep_max[c]; /* zero for max => infinity */
3218     if (max == 0) max = INT_MAX;
3219    
3220     /* Common code for all repeated single character type matches */
3221    
3222     REPEATTYPE:
3223     ctype = *ecode++; /* Code for the character type */
3224    
3225     /* First, ensure the minimum number of matches are present. Use inline
3226     code for maximizing the speed, and do the type test once at the start
3227     (i.e. keep it out of the loop). Also test that there are at least the
3228     minimum number of characters before we start. */
3229    
3230     if (min > md->end_subject - eptr) return FALSE;
3231     if (min > 0) switch(ctype)
3232     {
3233     case OP_ANY:
3234     if (!md->dotall)
3235     { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }
3236     else eptr += min;
3237     break;
3238    
3239     case OP_NOT_DIGIT:
3240     for (i = 1; i <= min; i++)
3241     if ((pcre_ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
3242     break;
3243    
3244     case OP_DIGIT:
3245     for (i = 1; i <= min; i++)
3246     if ((pcre_ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
3247     break;
3248    
3249     case OP_NOT_WHITESPACE:
3250     for (i = 1; i <= min; i++)
3251     if ((pcre_ctypes[*eptr++] & ctype_space) != 0) return FALSE;
3252     break;
3253    
3254     case OP_WHITESPACE:
3255     for (i = 1; i <= min; i++)
3256     if ((pcre_ctypes[*eptr++] & ctype_space) == 0) return FALSE;
3257     break;
3258    
3259     case OP_NOT_WORDCHAR:
3260     for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) != 0)
3261     return FALSE;
3262     break;
3263    
3264     case OP_WORDCHAR:
3265     for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) == 0)
3266     return FALSE;
3267     break;
3268     }
3269    
3270     /* If min = max, continue at the same level without recursing */
3271    
3272     if (min == max) continue;
3273    
3274     /* If minimizing, we have to test the rest of the pattern before each
3275     subsequent match, so inlining isn't much help; just use the function. */
3276    
3277     if (minimize)
3278     {
3279     for (i = min;; i++)
3280     {
3281     if (match(eptr, ecode, offset_top, md)) return TRUE;
3282     if (i >= max || eptr >= md->end_subject ||
3283     !match_type(ctype, *eptr++, md->dotall))
3284     return FALSE;
3285     }
3286     /* Control never gets here */
3287     }
3288    
3289     /* If maximizing it is worth using inline code for speed, doing the type
3290     test once at the start (i.e. keep it out of the loop). */
3291    
3292     else
3293     {
3294 nigel 7 const uschar *pp = eptr;
3295 nigel 3 switch(ctype)
3296     {
3297     case OP_ANY:
3298     if (!md->dotall)
3299     {
3300     for (i = min; i < max; i++)
3301     {
3302     if (eptr >= md->end_subject || *eptr == '\n') break;
3303     eptr++;
3304     }
3305     }
3306     else
3307     {
3308     c = max - min;
3309     if (c > md->end_subject - eptr) c = md->end_subject - eptr;
3310     eptr += c;
3311     }
3312     break;
3313    
3314     case OP_NOT_DIGIT:
3315     for (i = min; i < max; i++)
3316     {
3317     if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) != 0)
3318     break;
3319     eptr++;
3320     }
3321     break;
3322    
3323     case OP_DIGIT:
3324     for (i = min; i < max; i++)
3325     {
3326     if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) == 0)
3327     break;
3328     eptr++;
3329     }
3330     break;
3331    
3332     case OP_NOT_WHITESPACE:
3333     for (i = min; i < max; i++)
3334     {
3335     if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) != 0)
3336     break;
3337     eptr++;
3338     }
3339     break;
3340    
3341     case OP_WHITESPACE:
3342     for (i = min; i < max; i++)
3343     {
3344     if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) == 0)
3345     break;
3346     eptr++;
3347     }
3348     break;
3349    
3350     case OP_NOT_WORDCHAR:
3351     for (i = min; i < max; i++)
3352     {
3353     if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) != 0)
3354     break;
3355     eptr++;
3356     }
3357     break;
3358    
3359     case OP_WORDCHAR:
3360     for (i = min; i < max; i++)
3361     {
3362     if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) == 0)
3363     break;
3364     eptr++;
3365     }
3366     break;
3367     }
3368    
3369     while (eptr >= pp)
3370     if (match(eptr--, ecode, offset_top, md)) return TRUE;
3371     return FALSE;
3372     }
3373     /* Control never gets here */
3374    
3375     /* There's been some horrible disaster. */
3376    
3377     default:
3378 nigel 9 DPRINTF(("Unknown opcode %d\n", *ecode));
3379 nigel 3 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
3380     return FALSE;
3381     }
3382    
3383     /* Do not stick any code in here without much thought; it is assumed
3384     that "continue" in the code above comes out to here to repeat the main
3385     loop. */
3386    
3387     } /* End of main loop */
3388     /* Control never reaches here */
3389     }
3390    
3391    
3392    
3393     /*************************************************
3394 nigel 9 * Segregate setjmp() *
3395     *************************************************/
3396    
3397     /* The -Wall option of gcc gives warnings for all local variables when setjmp()
3398     is used, even if the coding conforms to the rules of ANSI C. To avoid this, we
3399     hide it in a separate function. This is called only when PCRE_EXTRA is set,
3400     since it's needed only for the extension \X option, and with any luck, a good
3401     compiler will spot the tail recursion and compile it efficiently.
3402    
3403 nigel 11 Arguments:
3404     eptr pointer in subject
3405     ecode position in code
3406     offset_top current top pointer
3407     md pointer to "static" info for the match
3408    
3409     Returns: TRUE if matched
3410 nigel 9 */
3411    
3412 nigel 11 static BOOL
3413     match_with_setjmp(const uschar *eptr, const uschar *ecode, int offset_top,
3414     match_data *match_block)
3415 nigel 9 {
3416 nigel 11 return setjmp(match_block->fail_env) == 0 &&
3417     match(eptr, ecode, offset_top, match_block);
3418 nigel 9 }
3419    
3420    
3421    
3422     /*************************************************
3423 nigel 3 * Execute a Regular Expression *
3424     *************************************************/
3425    
3426     /* This function applies a compiled re to a subject string and picks out
3427     portions of the string if it matches. Two elements in the vector are set for
3428     each substring: the offsets to the start and end of the substring.
3429    
3430     Arguments:
3431     external_re points to the compiled expression
3432     external_extra points to "hints" from pcre_study() or is NULL
3433     subject points to the subject string
3434     length length of subject string (may contain binary zeros)
3435     options option bits
3436     offsets points to a vector of ints to be filled in with offsets
3437     offsetcount the number of elements in the vector
3438    
3439     Returns: > 0 => success; value is the number of elements filled in
3440     = 0 => success, but offsets is not big enough
3441     -1 => failed to match
3442     < -1 => some kind of unexpected problem
3443     */
3444    
3445     int
3446     pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
3447     const char *subject, int length, int options, int *offsets, int offsetcount)
3448     {
3449 nigel 11 int resetcount, ocount;
3450 nigel 3 int first_char = -1;
3451     match_data match_block;
3452 nigel 7 const uschar *start_bits = NULL;
3453 nigel 9 const uschar *start_match = (const uschar *)subject;
3454 nigel 7 const uschar *end_subject;
3455     const real_pcre *re = (const real_pcre *)external_re;
3456     const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
3457 nigel 11 BOOL using_temporary_offsets = FALSE;
3458 nigel 3 BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
3459     BOOL startline = (re->options & PCRE_STARTLINE) != 0;
3460    
3461     if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3462    
3463     if (re == NULL || subject == NULL ||
3464     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3465     if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
3466    
3467 nigel 7 match_block.start_subject = (const uschar *)subject;
3468 nigel 3 match_block.end_subject = match_block.start_subject + length;
3469     end_subject = match_block.end_subject;
3470    
3471     match_block.caseless = ((re->options | options) & PCRE_CASELESS) != 0;
3472     match_block.runtime_caseless = match_block.caseless &&
3473     (re->options & PCRE_CASELESS) == 0;
3474    
3475     match_block.multiline = ((re->options | options) & PCRE_MULTILINE) != 0;
3476     match_block.dotall = ((re->options | options) & PCRE_DOTALL) != 0;
3477     match_block.endonly = ((re->options | options) & PCRE_DOLLAR_ENDONLY) != 0;
3478    
3479     match_block.notbol = (options & PCRE_NOTBOL) != 0;
3480     match_block.noteol = (options & PCRE_NOTEOL) != 0;
3481    
3482     match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
3483    
3484     /* If the expression has got more back references than the offsets supplied can
3485     hold, we get a temporary bit of working store to use during the matching.
3486 nigel 11 Otherwise, we can use the vector supplied, rounding down its size to a multiple
3487     of 2. */
3488 nigel 3
3489 nigel 11 ocount = offsetcount & (-2);
3490     if (re->top_backref > 0 && re->top_backref >= ocount/2)
3491 nigel 3 {
3492     ocount = re->top_backref * 2 + 2;
3493 nigel 15 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
3494 nigel 3 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
3495 nigel 11 using_temporary_offsets = TRUE;
3496 nigel 9 DPRINTF(("Got memory to hold back references\n"));
3497 nigel 3 }
3498     else match_block.offset_vector = offsets;
3499    
3500     match_block.offset_end = ocount;
3501     match_block.offset_overflow = FALSE;
3502    
3503     /* Compute the minimum number of offsets that we need to reset each time. Doing
3504     this makes a huge difference to execution time when there aren't many brackets
3505     in the pattern. */
3506    
3507     resetcount = 2 + re->top_bracket * 2;
3508     if (resetcount > offsetcount) resetcount = ocount;
3509    
3510     /* If MULTILINE is set at exec time but was not set at compile time, and the
3511     anchored flag is set, we must re-check because a setting provoked by ^ in the
3512     pattern is not right in multi-line mode. Calling is_anchored() again here does
3513     the right check, because multiline is now set. If it now yields FALSE, the
3514     expression must have had ^ starting some of its branches. Check to see if
3515     that is true for *all* branches, and if so, set the startline flag. */
3516    
3517     if (match_block. multiline && anchored && (re->options & PCRE_MULTILINE) == 0 &&
3518     !is_anchored(re->code, match_block.multiline))
3519     {
3520     anchored = FALSE;
3521     if (is_startline(re->code)) startline = TRUE;
3522     }
3523    
3524     /* Set up the first character to match, if available. The first_char value is
3525     never set for an anchored regular expression, but the anchoring may be forced
3526     at run time, so we have to test for anchoring. The first char may be unset for
3527     an unanchored pattern, of course. If there's no first char and the pattern was
3528     studied, the may be a bitmap of possible first characters. However, we can
3529     use this only if the caseless state of the studying was correct. */
3530    
3531     if (!anchored)
3532     {
3533     if ((re->options & PCRE_FIRSTSET) != 0)
3534     {
3535     first_char = re->first_char;
3536     if (match_block.caseless) first_char = pcre_lcc[first_char];
3537     }
3538     else
3539     if (!startline && extra != NULL &&
3540     (extra->options & PCRE_STUDY_MAPPED) != 0 &&
3541     ((extra->options & PCRE_STUDY_CASELESS) != 0) == match_block.caseless)
3542     start_bits = extra->start_bits;
3543     }
3544    
3545     /* Loop for unanchored matches; for anchored regexps the loop runs just once. */
3546    
3547     do
3548     {
3549 nigel 9 int rc;
3550 nigel 3 register int *iptr = match_block.offset_vector;
3551     register int *iend = iptr + resetcount;
3552    
3553     /* Reset the maximum number of extractions we might see. */
3554    
3555     while (iptr < iend) *iptr++ = -1;
3556    
3557     /* Advance to a unique first char if possible */
3558    
3559     if (first_char >= 0)
3560     {
3561     if (match_block.caseless)
3562     while (start_match < end_subject && pcre_lcc[*start_match] != first_char)
3563     start_match++;
3564     else
3565     while (start_match < end_subject && *start_match != first_char)
3566     start_match++;
3567     }
3568    
3569     /* Or to just after \n for a multiline match if possible */
3570    
3571     else if (startline)
3572     {
3573     if (start_match > match_block.start_subject)
3574     {
3575     while (start_match < end_subject && start_match[-1] != '\n')
3576     start_match++;
3577     }
3578     }
3579    
3580     /* Or to a non-unique first char */
3581    
3582     else if (start_bits != NULL)
3583     {
3584     while (start_match < end_subject)
3585     {
3586     register int c = *start_match;
3587     if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
3588     }
3589     }
3590    
3591 nigel 9 #ifdef DEBUG /* Sigh. Some compilers never learn. */
3592 nigel 3 printf(">>>> Match against: ");
3593     pchars(start_match, end_subject - start_match, TRUE, &match_block);
3594     printf("\n");
3595 nigel 9 #endif
3596 nigel 3
3597     /* When a match occurs, substrings will be set for all internal extractions;
3598     we just need to set up the whole thing as substring 0 before returning. If
3599     there were too many extractions, set the return code to zero. In the case
3600     where we had to get some local store to hold offsets for backreferences, copy
3601     those back references that we can. In this case there need not be overflow
3602     if certain parts of the pattern were not used.
3603    
3604     Before starting the match, we have to set up a longjmp() target to enable
3605 nigel 9 the "cut" operation to fail a match completely without backtracking. This
3606     is done in a separate function to avoid compiler warnings. We need not do
3607     it unless PCRE_EXTRA is set, since only in that case is the "cut" operation
3608     enabled. */
3609 nigel 3
3610 nigel 11 if ((re->options & PCRE_EXTRA) != 0)
3611     {
3612     if (!match_with_setjmp(start_match, re->code, 2, &match_block))
3613     continue;
3614     }
3615     else if (!match(start_match, re->code, 2, &match_block)) continue;
3616 nigel 9
3617     /* Copy the offset information from temporary store if necessary */
3618    
3619 nigel 11 if (using_temporary_offsets)
3620 nigel 3 {
3621 nigel 9 if (offsetcount >= 4)
3622 nigel 3 {
3623 nigel 9 memcpy(offsets + 2, match_block.offset_vector + 2,
3624     (offsetcount - 2) * sizeof(int));
3625 nigel 11 DPRINTF(("Copied offsets from temporary memory\n"));
3626 nigel 3 }
3627 nigel 9 if (match_block.end_offset_top > offsetcount)
3628     match_block.offset_overflow = TRUE;
3629 nigel 3
3630 nigel 9 DPRINTF(("Freeing temporary memory\n"));
3631     (pcre_free)(match_block.offset_vector);
3632     }
3633 nigel 3
3634 nigel 9 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
3635 nigel 3
3636 nigel 9 if (match_block.offset_end < 2) rc = 0; else
3637     {
3638     offsets[0] = start_match - match_block.start_subject;
3639     offsets[1] = match_block.end_match_ptr - match_block.start_subject;
3640 nigel 3 }
3641 nigel 9
3642     DPRINTF((">>>> returning %d\n", rc));
3643     return rc;
3644 nigel 3 }
3645     while (!anchored &&
3646     match_block.errorcode == PCRE_ERROR_NOMATCH &&
3647     start_match++ < end_subject);
3648    
3649 nigel 11 if (using_temporary_offsets)
3650     {
3651     DPRINTF(("Freeing temporary memory\n"));
3652     (pcre_free)(match_block.offset_vector);
3653     }
3654    
3655 nigel 9 DPRINTF((">>>> returning %d\n", match_block.errorcode));
3656 nigel 3
3657     return match_block.errorcode;
3658     }
3659    
3660     /* End of pcre.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12