/[pcre]/code/trunk/pcre.c
ViewVC logotype

Contents of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 43 - (hide annotations) (download)
Sat Feb 24 21:39:21 2007 UTC (6 years, 2 months ago) by nigel
File MIME type: text/plain
File size: 140886 byte(s)
Load pcre-3.0 into code/trunk.

1 nigel 3 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /*
6     This is a library of functions to support regular expressions whose syntax
7     and semantics are as close as possible to those of the Perl 5 language. See
8     the file Tech.Notes for some information on the internals.
9    
10     Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12 nigel 43 Copyright (c) 1997-2000 University of Cambridge
13 nigel 3
14     -----------------------------------------------------------------------------
15     Permission is granted to anyone to use this software for any purpose on any
16     computer system, and to redistribute it freely, subject to the following
17     restrictions:
18    
19     1. This software is distributed in the hope that it will be useful,
20     but WITHOUT ANY WARRANTY; without even the implied warranty of
21     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22    
23     2. The origin of this software must not be misrepresented, either by
24     explicit claim or by omission.
25    
26     3. Altered versions must be plainly marked as such, and must not be
27     misrepresented as being the original software.
28 nigel 29
29     4. If PCRE is embedded in any software that is released under the GNU
30     General Purpose Licence (GPL), then the terms of that licence shall
31     supersede any condition above with which it is incompatible.
32 nigel 3 -----------------------------------------------------------------------------
33     */
34    
35    
36     /* Define DEBUG to get debugging output on stdout. */
37    
38     /* #define DEBUG */
39    
40 nigel 23 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
41     inline, and there are *still* stupid compilers about that don't like indented
42     pre-processor statements. I suppose it's only been 10 years... */
43 nigel 3
44 nigel 9 #ifdef DEBUG
45     #define DPRINTF(p) printf p
46     #else
47     #define DPRINTF(p) /*nothing*/
48     #endif
49    
50 nigel 3 /* Include the internals header, which itself includes Standard C headers plus
51     the external pcre header. */
52    
53     #include "internal.h"
54    
55    
56 nigel 15 /* Allow compilation as C++ source code, should anybody want to do that. */
57    
58     #ifdef __cplusplus
59     #define class pcre_class
60     #endif
61    
62    
63 nigel 23 /* Number of items on the nested bracket stacks at compile time. This should
64     not be set greater than 200. */
65    
66     #define BRASTACK_SIZE 200
67    
68    
69 nigel 3 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
70    
71 nigel 15 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
72     static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
73 nigel 3
74 nigel 13 /* Text forms of OP_ values and things, for debugging (not all used) */
75 nigel 3
76     #ifdef DEBUG
77 nigel 7 static const char *OP_names[] = {
78     "End", "\\A", "\\B", "\\b", "\\D", "\\d",
79 nigel 23 "\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",
80     "Opt", "^", "$", "Any", "chars", "not",
81 nigel 3 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
82     "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
83     "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
84     "*", "*?", "+", "+?", "?", "??", "{", "{",
85 nigel 43 "class", "Ref", "Recurse",
86 nigel 23 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
87     "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
88 nigel 3 "Brazero", "Braminzero", "Bra"
89     };
90     #endif
91    
92     /* Table for handling escaped characters in the range '0'-'z'. Positive returns
93     are simple data values; negative values are for special things like \d and so
94     on. Zero means further processing is needed (for things like \x), or the escape
95     is invalid. */
96    
97 nigel 15 static const short int escapes[] = {
98 nigel 3 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
99     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
100     '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
101     0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
102     0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
103     0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
104     '`', 7, -ESC_b, 0, -ESC_d, 27, '\f', 0, /* ` - g */
105     0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
106     0, 0, '\r', -ESC_s, '\t', 0, 0, -ESC_w, /* p - w */
107 nigel 23 0, 0, -ESC_z /* x - z */
108 nigel 3 };
109    
110 nigel 43 /* Tables of names of POSIX character classes and their lengths. The list is
111     terminated by a zero length entry. The first three must be alpha, upper, lower,
112     as this is assumed for handling case independence. */
113    
114     static const char *posix_names[] = {
115     "alpha", "lower", "upper",
116     "alnum", "ascii", "cntrl", "digit", "graph",
117     "print", "punct", "space", "word", "xdigit" };
118    
119     static const uschar posix_name_lengths[] = {
120     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
121    
122     /* Table of class bit maps for each POSIX class; up to three may be combined
123     to form the class. */
124    
125     static const int posix_class_maps[] = {
126     cbit_lower, cbit_upper, -1, /* alpha */
127     cbit_lower, -1, -1, /* lower */
128     cbit_upper, -1, -1, /* upper */
129     cbit_digit, cbit_lower, cbit_upper, /* alnum */
130     cbit_print, cbit_cntrl, -1, /* ascii */
131     cbit_cntrl, -1, -1, /* cntrl */
132     cbit_digit, -1, -1, /* digit */
133     cbit_graph, -1, -1, /* graph */
134     cbit_print, -1, -1, /* print */
135     cbit_punct, -1, -1, /* punct */
136     cbit_space, -1, -1, /* space */
137     cbit_word, -1, -1, /* word */
138     cbit_xdigit,-1, -1 /* xdigit */
139     };
140    
141    
142 nigel 3 /* Definition to allow mutual recursion */
143    
144 nigel 13 static BOOL
145 nigel 23 compile_regex(int, int, int *, uschar **, const uschar **, const char **,
146 nigel 37 BOOL, int, int *, int *, compile_data *);
147 nigel 3
148    
149    
150     /*************************************************
151     * Global variables *
152     *************************************************/
153    
154     /* PCRE is thread-clean and doesn't use any global variables in the normal
155     sense. However, it calls memory allocation and free functions via the two
156     indirections below, which are can be changed by the caller, but are shared
157     between all threads. */
158    
159     void *(*pcre_malloc)(size_t) = malloc;
160     void (*pcre_free)(void *) = free;
161    
162    
163    
164    
165     /*************************************************
166 nigel 25 * Default character tables *
167     *************************************************/
168    
169     /* A default set of character tables is included in the PCRE binary. Its source
170     is built by the maketables auxiliary program, which uses the default C ctypes
171     functions, and put in the file chartables.c. These tables are used by PCRE
172     whenever the caller of pcre_compile() does not provide an alternate set of
173     tables. */
174    
175     #include "chartables.c"
176    
177    
178    
179     /*************************************************
180 nigel 3 * Return version string *
181     *************************************************/
182    
183 nigel 39 #define STRING(a) # a
184     #define XSTRING(s) STRING(s)
185    
186 nigel 7 const char *
187 nigel 3 pcre_version(void)
188     {
189 nigel 39 return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
190 nigel 3 }
191    
192    
193    
194    
195     /*************************************************
196 nigel 43 * (Obsolete) Return info about compiled pattern *
197 nigel 3 *************************************************/
198    
199 nigel 43 /* This is the original "info" function. It picks potentially useful data out
200     of the private structure, but its interface was too rigid. It remains for
201     backwards compatibility. The public options are passed back in an int - though
202     the re->options field has been expanded to a long int, all the public options
203 nigel 37 at the low end of it, and so even on 16-bit systems this will still be OK.
204     Therefore, I haven't changed the API for pcre_info().
205 nigel 3
206     Arguments:
207     external_re points to compiled code
208     optptr where to pass back the options
209     first_char where to pass back the first character,
210     or -1 if multiline and all branches start ^,
211     or -2 otherwise
212    
213 nigel 43 Returns: number of capturing subpatterns
214 nigel 3 or negative values on error
215     */
216    
217     int
218     pcre_info(const pcre *external_re, int *optptr, int *first_char)
219     {
220 nigel 7 const real_pcre *re = (const real_pcre *)external_re;
221 nigel 3 if (re == NULL) return PCRE_ERROR_NULL;
222     if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
223 nigel 37 if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
224 nigel 3 if (first_char != NULL)
225     *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
226     ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
227     return re->top_bracket;
228     }
229    
230    
231    
232 nigel 43 /*************************************************
233     * Return info about compiled pattern *
234     *************************************************/
235 nigel 3
236 nigel 43 /* This is a newer "info" function which has an extensible interface so
237     that additional items can be added compatibly.
238    
239     Arguments:
240     external_re points to compiled code
241     external_study points to study data, or NULL
242     what what information is required
243     where where to put the information
244    
245     Returns: 0 if data returned, negative on error
246     */
247    
248     int
249     pcre_fullinfo(const pcre *external_re, const pcre_extra *study_data, int what,
250     void *where)
251     {
252     const real_pcre *re = (const real_pcre *)external_re;
253     const real_pcre_extra *study = (const real_pcre_extra *)study_data;
254    
255     if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
256     if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
257    
258     switch (what)
259     {
260     case PCRE_INFO_OPTIONS:
261     *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
262     break;
263    
264     case PCRE_INFO_SIZE:
265     *((size_t *)where) = re->size;
266     break;
267    
268     case PCRE_INFO_CAPTURECOUNT:
269     *((int *)where) = re->top_bracket;
270     break;
271    
272     case PCRE_INFO_BACKREFMAX:
273     *((int *)where) = re->top_backref;
274     break;
275    
276     case PCRE_INFO_FIRSTCHAR:
277     *((int *)where) =
278     ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
279     ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
280     break;
281    
282     case PCRE_INFO_FIRSTTABLE:
283     *((const uschar **)where) =
284     (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
285     study->start_bits : NULL;
286     break;
287    
288     case PCRE_INFO_LASTLITERAL:
289     *((int *)where) =
290     ((re->options & PCRE_REQCHSET) != 0)? re->req_char : -1;
291     break;
292    
293     default: return PCRE_ERROR_BADOPTION;
294     }
295    
296     return 0;
297     }
298    
299    
300    
301 nigel 3 #ifdef DEBUG
302     /*************************************************
303     * Debugging function to print chars *
304     *************************************************/
305    
306     /* Print a sequence of chars in printable format, stopping at the end of the
307     subject if the requested.
308    
309     Arguments:
310     p points to characters
311     length number to print
312     is_subject TRUE if printing from within md->start_subject
313     md pointer to matching data block, if is_subject is TRUE
314    
315     Returns: nothing
316     */
317    
318 nigel 9 static void
319     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
320 nigel 3 {
321     int c;
322     if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
323     while (length-- > 0)
324     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
325     }
326     #endif
327    
328    
329    
330    
331     /*************************************************
332     * Handle escapes *
333     *************************************************/
334    
335     /* This function is called when a \ has been encountered. It either returns a
336     positive value for a simple escape such as \n, or a negative value which
337     encodes one of the more complicated things such as \d. On entry, ptr is
338     pointing at the \. On exit, it is on the final character of the escape
339     sequence.
340    
341     Arguments:
342     ptrptr points to the pattern position pointer
343     errorptr points to the pointer to the error message
344     bracount number of previous extracting brackets
345     options the options bits
346     isclass TRUE if inside a character class
347 nigel 25 cd pointer to char tables block
348 nigel 3
349     Returns: zero or positive => a data character
350     negative => a special escape sequence
351     on error, errorptr is set
352     */
353    
354     static int
355 nigel 7 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
356 nigel 25 int options, BOOL isclass, compile_data *cd)
357 nigel 3 {
358 nigel 7 const uschar *ptr = *ptrptr;
359 nigel 43 int c, i;
360 nigel 3
361 nigel 43 c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
362 nigel 3 if (c == 0) *errorptr = ERR1;
363    
364     /* Digits or letters may have special meaning; all others are literals. */
365    
366     else if (c < '0' || c > 'z') {}
367    
368     /* Do an initial lookup in a table. A non-zero result is something that can be
369     returned immediately. Otherwise further processing may be required. */
370    
371     else if ((i = escapes[c - '0']) != 0) c = i;
372    
373     /* Escapes that need further processing, or are illegal. */
374    
375     else
376     {
377 nigel 7 const uschar *oldptr;
378 nigel 3 switch (c)
379     {
380     /* The handling of escape sequences consisting of a string of digits
381     starting with one that is not zero is not straightforward. By experiment,
382     the way Perl works seems to be as follows:
383    
384     Outside a character class, the digits are read as a decimal number. If the
385     number is less than 10, or if there are that many previous extracting
386     left brackets, then it is a back reference. Otherwise, up to three octal
387     digits are read to form an escaped byte. Thus \123 is likely to be octal
388     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
389     value is greater than 377, the least significant 8 bits are taken. Inside a
390     character class, \ followed by a digit is always an octal number. */
391    
392     case '1': case '2': case '3': case '4': case '5':
393     case '6': case '7': case '8': case '9':
394    
395     if (!isclass)
396     {
397     oldptr = ptr;
398     c -= '0';
399 nigel 25 while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
400 nigel 3 c = c * 10 + *(++ptr) - '0';
401     if (c < 10 || c <= bracount)
402     {
403     c = -(ESC_REF + c);
404     break;
405     }
406     ptr = oldptr; /* Put the pointer back and fall through */
407     }
408    
409     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
410     generates a binary zero byte and treats the digit as a following literal.
411     Thus we have to pull back the pointer by one. */
412    
413     if ((c = *ptr) >= '8')
414     {
415     ptr--;
416     c = 0;
417     break;
418     }
419    
420     /* \0 always starts an octal number, but we may drop through to here with a
421     larger first octal digit */
422    
423     case '0':
424     c -= '0';
425 nigel 25 while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
426 nigel 3 ptr[1] != '8' && ptr[1] != '9')
427     c = c * 8 + *(++ptr) - '0';
428     break;
429    
430     /* Special escapes not starting with a digit are straightforward */
431    
432     case 'x':
433     c = 0;
434 nigel 25 while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
435 nigel 3 {
436     ptr++;
437 nigel 25 c = c * 16 + cd->lcc[*ptr] -
438     (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
439 nigel 3 }
440     break;
441    
442     case 'c':
443     c = *(++ptr);
444     if (c == 0)
445     {
446     *errorptr = ERR2;
447     return 0;
448     }
449    
450     /* A letter is upper-cased; then the 0x40 bit is flipped */
451    
452 nigel 25 if (c >= 'a' && c <= 'z') c = cd->fcc[c];
453 nigel 3 c ^= 0x40;
454     break;
455    
456     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
457     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
458 nigel 25 for Perl compatibility, it is a literal. This code looks a bit odd, but
459     there used to be some cases other than the default, and there may be again
460     in future, so I haven't "optimized" it. */
461 nigel 3
462     default:
463     if ((options & PCRE_EXTRA) != 0) switch(c)
464     {
465     default:
466     *errorptr = ERR3;
467     break;
468     }
469     break;
470     }
471     }
472    
473     *ptrptr = ptr;
474     return c;
475     }
476    
477    
478    
479     /*************************************************
480     * Check for counted repeat *
481     *************************************************/
482    
483     /* This function is called when a '{' is encountered in a place where it might
484     start a quantifier. It looks ahead to see if it really is a quantifier or not.
485     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
486     where the ddds are digits.
487    
488     Arguments:
489     p pointer to the first char after '{'
490 nigel 25 cd pointer to char tables block
491 nigel 3
492     Returns: TRUE or FALSE
493     */
494    
495     static BOOL
496 nigel 25 is_counted_repeat(const uschar *p, compile_data *cd)
497 nigel 3 {
498 nigel 25 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
499     while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
500 nigel 3 if (*p == '}') return TRUE;
501    
502     if (*p++ != ',') return FALSE;
503     if (*p == '}') return TRUE;
504    
505 nigel 25 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
506     while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
507 nigel 3 return (*p == '}');
508     }
509    
510    
511    
512     /*************************************************
513     * Read repeat counts *
514     *************************************************/
515    
516     /* Read an item of the form {n,m} and return the values. This is called only
517     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
518     so the syntax is guaranteed to be correct, but we need to check the values.
519    
520     Arguments:
521     p pointer to first char after '{'
522     minp pointer to int for min
523     maxp pointer to int for max
524     returned as -1 if no max
525     errorptr points to pointer to error message
526 nigel 25 cd pointer to character tables clock
527 nigel 3
528     Returns: pointer to '}' on success;
529     current ptr on error, with errorptr set
530     */
531    
532 nigel 7 static const uschar *
533 nigel 25 read_repeat_counts(const uschar *p, int *minp, int *maxp,
534     const char **errorptr, compile_data *cd)
535 nigel 3 {
536     int min = 0;
537     int max = -1;
538    
539 nigel 25 while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
540 nigel 3
541     if (*p == '}') max = min; else
542     {
543     if (*(++p) != '}')
544     {
545     max = 0;
546 nigel 25 while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
547 nigel 3 if (max < min)
548     {
549     *errorptr = ERR4;
550     return p;
551     }
552     }
553     }
554    
555     /* Do paranoid checks, then fill in the required variables, and pass back the
556     pointer to the terminating '}'. */
557    
558     if (min > 65535 || max > 65535)
559     *errorptr = ERR5;
560     else
561     {
562     *minp = min;
563     *maxp = max;
564     }
565     return p;
566     }
567    
568    
569    
570     /*************************************************
571 nigel 23 * Find the fixed length of a pattern *
572     *************************************************/
573    
574     /* Scan a pattern and compute the fixed length of subject that will match it,
575     if the length is fixed. This is needed for dealing with backward assertions.
576    
577     Arguments:
578     code points to the start of the pattern (the bracket)
579    
580     Returns: the fixed length, or -1 if there is no fixed length
581     */
582    
583     static int
584     find_fixedlength(uschar *code)
585     {
586     int length = -1;
587    
588     register int branchlength = 0;
589     register uschar *cc = code + 3;
590    
591     /* Scan along the opcodes for this branch. If we get to the end of the
592     branch, check the length against that of the other branches. */
593    
594     for (;;)
595     {
596     int d;
597     register int op = *cc;
598     if (op >= OP_BRA) op = OP_BRA;
599    
600     switch (op)
601     {
602     case OP_BRA:
603     case OP_ONCE:
604     case OP_COND:
605     d = find_fixedlength(cc);
606     if (d < 0) return -1;
607     branchlength += d;
608     do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
609     cc += 3;
610     break;
611    
612     /* Reached end of a branch; if it's a ket it is the end of a nested
613     call. If it's ALT it is an alternation in a nested call. If it is
614     END it's the end of the outer call. All can be handled by the same code. */
615    
616     case OP_ALT:
617     case OP_KET:
618     case OP_KETRMAX:
619     case OP_KETRMIN:
620     case OP_END:
621     if (length < 0) length = branchlength;
622     else if (length != branchlength) return -1;
623     if (*cc != OP_ALT) return length;
624     cc += 3;
625     branchlength = 0;
626     break;
627    
628     /* Skip over assertive subpatterns */
629    
630     case OP_ASSERT:
631     case OP_ASSERT_NOT:
632     case OP_ASSERTBACK:
633     case OP_ASSERTBACK_NOT:
634     do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
635     cc += 3;
636     break;
637    
638     /* Skip over things that don't match chars */
639    
640     case OP_REVERSE:
641     cc++;
642 nigel 37 /* Fall through */
643 nigel 23
644     case OP_CREF:
645     case OP_OPT:
646     cc++;
647     /* Fall through */
648    
649     case OP_SOD:
650     case OP_EOD:
651     case OP_EODN:
652     case OP_CIRC:
653     case OP_DOLL:
654     case OP_NOT_WORD_BOUNDARY:
655     case OP_WORD_BOUNDARY:
656     cc++;
657     break;
658    
659     /* Handle char strings */
660    
661     case OP_CHARS:
662     branchlength += *(++cc);
663     cc += *cc + 1;
664     break;
665    
666     /* Handle exact repetitions */
667    
668     case OP_EXACT:
669     case OP_TYPEEXACT:
670     branchlength += (cc[1] << 8) + cc[2];
671     cc += 4;
672     break;
673    
674     /* Handle single-char matchers */
675    
676     case OP_NOT_DIGIT:
677     case OP_DIGIT:
678     case OP_NOT_WHITESPACE:
679     case OP_WHITESPACE:
680     case OP_NOT_WORDCHAR:
681     case OP_WORDCHAR:
682     case OP_ANY:
683     branchlength++;
684     cc++;
685     break;
686    
687    
688     /* Check a class for variable quantification */
689    
690     case OP_CLASS:
691     cc += (*cc == OP_REF)? 2 : 33;
692    
693     switch (*cc)
694     {
695     case OP_CRSTAR:
696     case OP_CRMINSTAR:
697     case OP_CRQUERY:
698     case OP_CRMINQUERY:
699     return -1;
700    
701     case OP_CRRANGE:
702     case OP_CRMINRANGE:
703     if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;
704     branchlength += (cc[1] << 8) + cc[2];
705     cc += 5;
706     break;
707    
708     default:
709     branchlength++;
710     }
711     break;
712    
713     /* Anything else is variable length */
714    
715     default:
716     return -1;
717     }
718     }
719     /* Control never gets here */
720     }
721    
722    
723    
724    
725     /*************************************************
726 nigel 43 * Check for POSIX class syntax *
727     *************************************************/
728    
729     /* This function is called when the sequence "[:" or "[." or "[=" is
730     encountered in a character class. It checks whether this is followed by an
731     optional ^ and then a sequence of letters, terminated by a matching ":]" or
732     ".]" or "=]".
733    
734     Argument:
735     ptr pointer to the initial [
736     endptr where to return the end pointer
737     cd pointer to compile data
738    
739     Returns: TRUE or FALSE
740     */
741    
742     static BOOL
743     check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
744     {
745     int terminator; /* Don't combine these lines; the Solaris cc */
746     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
747     if (*(++ptr) == '^') ptr++;
748     while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
749     if (*ptr == terminator && ptr[1] == ']')
750     {
751     *endptr = ptr;
752     return TRUE;
753     }
754     return FALSE;
755     }
756    
757    
758    
759    
760     /*************************************************
761     * Check POSIX class name *
762     *************************************************/
763    
764     /* This function is called to check the name given in a POSIX-style class entry
765     such as [:alnum:].
766    
767     Arguments:
768     ptr points to the first letter
769     len the length of the name
770    
771     Returns: a value representing the name, or -1 if unknown
772     */
773    
774     static int
775     check_posix_name(const uschar *ptr, int len)
776     {
777     register int yield = 0;
778     while (posix_name_lengths[yield] != 0)
779     {
780     if (len == posix_name_lengths[yield] &&
781     strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
782     yield++;
783     }
784     return -1;
785     }
786    
787    
788    
789    
790     /*************************************************
791 nigel 3 * Compile one branch *
792     *************************************************/
793    
794     /* Scan the pattern, compiling it into the code vector.
795    
796     Arguments:
797 nigel 25 options the option bits
798     brackets points to number of brackets used
799     code points to the pointer to the current code point
800     ptrptr points to the current pattern pointer
801     errorptr points to pointer to error message
802     optchanged set to the value of the last OP_OPT item compiled
803 nigel 37 reqchar set to the last literal character required, else -1
804     countlits set to count of mandatory literal characters
805 nigel 25 cd contains pointers to tables
806 nigel 3
807 nigel 25 Returns: TRUE on success
808     FALSE, with *errorptr set on error
809 nigel 3 */
810    
811     static BOOL
812 nigel 7 compile_branch(int options, int *brackets, uschar **codeptr,
813 nigel 25 const uschar **ptrptr, const char **errorptr, int *optchanged,
814 nigel 37 int *reqchar, int *countlits, compile_data *cd)
815 nigel 3 {
816     int repeat_type, op_type;
817     int repeat_min, repeat_max;
818     int bravalue, length;
819 nigel 19 int greedy_default, greedy_non_default;
820 nigel 37 int prevreqchar;
821     int condcount = 0;
822     int subcountlits = 0;
823 nigel 3 register int c;
824     register uschar *code = *codeptr;
825 nigel 23 uschar *tempcode;
826 nigel 7 const uschar *ptr = *ptrptr;
827 nigel 23 const uschar *tempptr;
828 nigel 3 uschar *previous = NULL;
829     uschar class[32];
830    
831 nigel 19 /* Set up the default and non-default settings for greediness */
832    
833     greedy_default = ((options & PCRE_UNGREEDY) != 0);
834     greedy_non_default = greedy_default ^ 1;
835    
836 nigel 37 /* Initialize no required char, and count of literals */
837    
838     *reqchar = prevreqchar = -1;
839     *countlits = 0;
840    
841 nigel 3 /* Switch on next character until the end of the branch */
842    
843     for (;; ptr++)
844     {
845     BOOL negate_class;
846 nigel 23 int class_charcount;
847     int class_lastchar;
848     int newoptions;
849     int condref;
850 nigel 37 int subreqchar;
851 nigel 3
852     c = *ptr;
853     if ((options & PCRE_EXTENDED) != 0)
854     {
855 nigel 25 if ((cd->ctypes[c] & ctype_space) != 0) continue;
856 nigel 3 if (c == '#')
857     {
858     while ((c = *(++ptr)) != 0 && c != '\n');
859     continue;
860     }
861     }
862    
863     switch(c)
864     {
865     /* The branch terminates at end of string, |, or ). */
866    
867     case 0:
868     case '|':
869     case ')':
870     *codeptr = code;
871     *ptrptr = ptr;
872     return TRUE;
873    
874     /* Handle single-character metacharacters */
875    
876     case '^':
877     previous = NULL;
878     *code++ = OP_CIRC;
879     break;
880    
881     case '$':
882     previous = NULL;
883     *code++ = OP_DOLL;
884     break;
885    
886     case '.':
887     previous = code;
888     *code++ = OP_ANY;
889     break;
890    
891     /* Character classes. These always build a 32-byte bitmap of the permitted
892     characters, except in the special case where there is only one character.
893     For negated classes, we build the map as usual, then invert it at the end.
894     */
895    
896     case '[':
897     previous = code;
898 nigel 23 *code++ = OP_CLASS;
899 nigel 3
900 nigel 23 /* If the first character is '^', set the negation flag and skip it. */
901 nigel 3
902     if ((c = *(++ptr)) == '^')
903     {
904     negate_class = TRUE;
905     c = *(++ptr);
906     }
907 nigel 23 else negate_class = FALSE;
908 nigel 3
909     /* Keep a count of chars so that we can optimize the case of just a single
910     character. */
911    
912     class_charcount = 0;
913     class_lastchar = -1;
914    
915     /* Initialize the 32-char bit map to all zeros. We have to build the
916     map in a temporary bit of store, in case the class contains only 1
917     character, because in that case the compiled code doesn't use the
918     bit map. */
919    
920     memset(class, 0, 32 * sizeof(uschar));
921    
922     /* Process characters until ] is reached. By writing this as a "do" it
923     means that an initial ] is taken as a data character. */
924    
925     do
926     {
927     if (c == 0)
928     {
929     *errorptr = ERR6;
930     goto FAILED;
931     }
932    
933 nigel 43 /* Handle POSIX class names. Perl allows a negation extension of the
934     form [:^name]. A square bracket that doesn't match the syntax is
935     treated as a literal. We also recognize the POSIX constructions
936     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
937     5.6 does. */
938    
939     if (c == '[' &&
940     (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
941     check_posix_syntax(ptr, &tempptr, cd))
942     {
943     BOOL local_negate = FALSE;
944     int posix_class, i;
945     register const uschar *cbits = cd->cbits;
946    
947     if (ptr[1] != ':')
948     {
949     *errorptr = ERR31;
950     goto FAILED;
951     }
952    
953     ptr += 2;
954     if (*ptr == '^')
955     {
956     local_negate = TRUE;
957     ptr++;
958     }
959    
960     posix_class = check_posix_name(ptr, tempptr - ptr);
961     if (posix_class < 0)
962     {
963     *errorptr = ERR30;
964     goto FAILED;
965     }
966    
967     /* If matching is caseless, upper and lower are converted to
968     alpha. This relies on the fact that the class table starts with
969     alpha, lower, upper as the first 3 entries. */
970    
971     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
972     posix_class = 0;
973    
974     /* Or into the map we are building up to 3 of the static class
975     tables, or their negations. */
976    
977     posix_class *= 3;
978     for (i = 0; i < 3; i++)
979     {
980     int taboffset = posix_class_maps[posix_class + i];
981     if (taboffset < 0) break;
982     if (local_negate)
983     for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
984     else
985     for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
986     }
987    
988     ptr = tempptr + 1;
989     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
990     continue;
991     }
992    
993 nigel 3 /* Backslash may introduce a single character, or it may introduce one
994     of the specials, which just set a flag. Escaped items are checked for
995     validity in the pre-compiling pass. The sequence \b is a special case.
996 nigel 7 Inside a class (and only there) it is treated as backspace. Elsewhere
997 nigel 3 it marks a word boundary. Other escapes have preset maps ready to
998     or into the one we are building. We assume they have more than one
999     character in them, so set class_count bigger than one. */
1000    
1001     if (c == '\\')
1002     {
1003 nigel 25 c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1004 nigel 3 if (-c == ESC_b) c = '\b';
1005     else if (c < 0)
1006     {
1007 nigel 25 register const uschar *cbits = cd->cbits;
1008 nigel 3 class_charcount = 10;
1009     switch (-c)
1010     {
1011     case ESC_d:
1012 nigel 25 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
1013 nigel 3 continue;
1014    
1015     case ESC_D:
1016 nigel 25 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
1017 nigel 3 continue;
1018    
1019     case ESC_w:
1020 nigel 43 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
1021 nigel 3 continue;
1022    
1023     case ESC_W:
1024 nigel 43 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
1025 nigel 3 continue;
1026    
1027     case ESC_s:
1028 nigel 25 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
1029 nigel 3 continue;
1030    
1031     case ESC_S:
1032 nigel 25 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
1033 nigel 3 continue;
1034    
1035     default:
1036     *errorptr = ERR7;
1037     goto FAILED;
1038     }
1039     }
1040     /* Fall through if single character */
1041     }
1042    
1043     /* A single character may be followed by '-' to form a range. However,
1044     Perl does not permit ']' to be the end of the range. A '-' character
1045     here is treated as a literal. */
1046    
1047     if (ptr[1] == '-' && ptr[2] != ']')
1048     {
1049     int d;
1050     ptr += 2;
1051     d = *ptr;
1052    
1053     if (d == 0)
1054     {
1055     *errorptr = ERR6;
1056     goto FAILED;
1057     }
1058    
1059     /* The second part of a range can be a single-character escape, but
1060     not any of the other escapes. */
1061    
1062     if (d == '\\')
1063     {
1064 nigel 25 d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1065 nigel 3 if (d < 0)
1066     {
1067     if (d == -ESC_b) d = '\b'; else
1068     {
1069     *errorptr = ERR7;
1070     goto FAILED;
1071     }
1072     }
1073     }
1074    
1075     if (d < c)
1076     {
1077     *errorptr = ERR8;
1078     goto FAILED;
1079     }
1080    
1081     for (; c <= d; c++)
1082     {
1083     class[c/8] |= (1 << (c&7));
1084     if ((options & PCRE_CASELESS) != 0)
1085     {
1086 nigel 25 int uc = cd->fcc[c]; /* flip case */
1087 nigel 3 class[uc/8] |= (1 << (uc&7));
1088     }
1089     class_charcount++; /* in case a one-char range */
1090     class_lastchar = c;
1091     }
1092     continue; /* Go get the next char in the class */
1093     }
1094    
1095     /* Handle a lone single character - we can get here for a normal
1096     non-escape char, or after \ that introduces a single character. */
1097    
1098     class [c/8] |= (1 << (c&7));
1099     if ((options & PCRE_CASELESS) != 0)
1100     {
1101 nigel 25 c = cd->fcc[c]; /* flip case */
1102 nigel 3 class[c/8] |= (1 << (c&7));
1103     }
1104     class_charcount++;
1105     class_lastchar = c;
1106     }
1107    
1108     /* Loop until ']' reached; the check for end of string happens inside the
1109     loop. This "while" is the end of the "do" above. */
1110    
1111     while ((c = *(++ptr)) != ']');
1112    
1113     /* If class_charcount is 1 and class_lastchar is not negative, we saw
1114     precisely one character. This doesn't need the whole 32-byte bit map.
1115     We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
1116     it's negative. */
1117    
1118     if (class_charcount == 1 && class_lastchar >= 0)
1119     {
1120     if (negate_class)
1121     {
1122     code[-1] = OP_NOT;
1123     }
1124     else
1125     {
1126     code[-1] = OP_CHARS;
1127     *code++ = 1;
1128     }
1129     *code++ = class_lastchar;
1130     }
1131    
1132     /* Otherwise, negate the 32-byte map if necessary, and copy it into
1133     the code vector. */
1134    
1135     else
1136     {
1137     if (negate_class)
1138     for (c = 0; c < 32; c++) code[c] = ~class[c];
1139     else
1140     memcpy(code, class, 32);
1141     code += 32;
1142     }
1143     break;
1144    
1145     /* Various kinds of repeat */
1146    
1147     case '{':
1148 nigel 25 if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
1149     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
1150 nigel 3 if (*errorptr != NULL) goto FAILED;
1151     goto REPEAT;
1152    
1153     case '*':
1154     repeat_min = 0;
1155     repeat_max = -1;
1156     goto REPEAT;
1157    
1158     case '+':
1159     repeat_min = 1;
1160     repeat_max = -1;
1161     goto REPEAT;
1162    
1163     case '?':
1164     repeat_min = 0;
1165     repeat_max = 1;
1166    
1167     REPEAT:
1168     if (previous == NULL)
1169     {
1170     *errorptr = ERR9;
1171     goto FAILED;
1172     }
1173    
1174 nigel 19 /* If the next character is '?' this is a minimizing repeat, by default,
1175     but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
1176 nigel 3 next character. */
1177    
1178 nigel 19 if (ptr[1] == '?')
1179     { repeat_type = greedy_non_default; ptr++; }
1180     else repeat_type = greedy_default;
1181 nigel 3
1182     /* If previous was a string of characters, chop off the last one and use it
1183     as the subject of the repeat. If there was only one character, we can
1184 nigel 37 abolish the previous item altogether. A repeat with a zero minimum wipes
1185     out any reqchar setting, backing up to the previous value. We must also
1186     adjust the countlits value. */
1187 nigel 3
1188 nigel 37 if (*previous == OP_CHARS)
1189 nigel 3 {
1190     int len = previous[1];
1191 nigel 37
1192     if (repeat_min == 0) *reqchar = prevreqchar;
1193     *countlits += repeat_min - 1;
1194    
1195 nigel 3 if (len == 1)
1196     {
1197     c = previous[2];
1198     code = previous;
1199     }
1200     else
1201     {
1202     c = previous[len+1];
1203     previous[1]--;
1204     code--;
1205     }
1206     op_type = 0; /* Use single-char op codes */
1207     goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
1208     }
1209    
1210     /* If previous was a single negated character ([^a] or similar), we use
1211     one of the special opcodes, replacing it. The code is shared with single-
1212     character repeats by adding a suitable offset into repeat_type. */
1213    
1214     else if ((int)*previous == OP_NOT)
1215     {
1216     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
1217     c = previous[1];
1218     code = previous;
1219     goto OUTPUT_SINGLE_REPEAT;
1220     }
1221    
1222     /* If previous was a character type match (\d or similar), abolish it and
1223     create a suitable repeat item. The code is shared with single-character
1224     repeats by adding a suitable offset into repeat_type. */
1225    
1226 nigel 23 else if ((int)*previous < OP_EODN || *previous == OP_ANY)
1227 nigel 3 {
1228     op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
1229     c = *previous;
1230     code = previous;
1231    
1232     OUTPUT_SINGLE_REPEAT:
1233    
1234 nigel 37 /* If the maximum is zero then the minimum must also be zero; Perl allows
1235     this case, so we do too - by simply omitting the item altogether. */
1236    
1237     if (repeat_max == 0) goto END_REPEAT;
1238    
1239     /* Combine the op_type with the repeat_type */
1240    
1241     repeat_type += op_type;
1242    
1243 nigel 3 /* A minimum of zero is handled either as the special case * or ?, or as
1244     an UPTO, with the maximum given. */
1245    
1246     if (repeat_min == 0)
1247     {
1248     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1249     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1250     else
1251     {
1252     *code++ = OP_UPTO + repeat_type;
1253     *code++ = repeat_max >> 8;
1254     *code++ = (repeat_max & 255);
1255     }
1256     }
1257    
1258     /* The case {1,} is handled as the special case + */
1259    
1260     else if (repeat_min == 1 && repeat_max == -1)
1261     *code++ = OP_PLUS + repeat_type;
1262    
1263     /* The case {n,n} is just an EXACT, while the general case {n,m} is
1264     handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1265    
1266     else
1267     {
1268     if (repeat_min != 1)
1269     {
1270     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1271     *code++ = repeat_min >> 8;
1272     *code++ = (repeat_min & 255);
1273     }
1274    
1275     /* If the mininum is 1 and the previous item was a character string,
1276     we either have to put back the item that got cancelled if the string
1277     length was 1, or add the character back onto the end of a longer
1278 nigel 21 string. For a character type nothing need be done; it will just get
1279     put back naturally. Note that the final character is always going to
1280     get added below. */
1281 nigel 3
1282     else if (*previous == OP_CHARS)
1283     {
1284     if (code == previous) code += 2; else previous[1]++;
1285     }
1286    
1287 nigel 21 /* For a single negated character we also have to put back the
1288     item that got cancelled. */
1289    
1290     else if (*previous == OP_NOT) code++;
1291    
1292 nigel 9 /* If the maximum is unlimited, insert an OP_STAR. */
1293 nigel 3
1294 nigel 9 if (repeat_max < 0)
1295 nigel 3 {
1296     *code++ = c;
1297 nigel 9 *code++ = OP_STAR + repeat_type;
1298     }
1299    
1300     /* Else insert an UPTO if the max is greater than the min. */
1301    
1302     else if (repeat_max != repeat_min)
1303     {
1304     *code++ = c;
1305 nigel 3 repeat_max -= repeat_min;
1306     *code++ = OP_UPTO + repeat_type;
1307     *code++ = repeat_max >> 8;
1308     *code++ = (repeat_max & 255);
1309     }
1310     }
1311    
1312     /* The character or character type itself comes last in all cases. */
1313    
1314     *code++ = c;
1315     }
1316    
1317     /* If previous was a character class or a back reference, we put the repeat
1318 nigel 37 stuff after it, but just skip the item if the repeat was {0,0}. */
1319 nigel 3
1320 nigel 23 else if (*previous == OP_CLASS || *previous == OP_REF)
1321 nigel 3 {
1322 nigel 37 if (repeat_max == 0)
1323     {
1324     code = previous;
1325     goto END_REPEAT;
1326     }
1327 nigel 3 if (repeat_min == 0 && repeat_max == -1)
1328     *code++ = OP_CRSTAR + repeat_type;
1329     else if (repeat_min == 1 && repeat_max == -1)
1330     *code++ = OP_CRPLUS + repeat_type;
1331     else if (repeat_min == 0 && repeat_max == 1)
1332     *code++ = OP_CRQUERY + repeat_type;
1333     else
1334     {
1335     *code++ = OP_CRRANGE + repeat_type;
1336     *code++ = repeat_min >> 8;
1337     *code++ = repeat_min & 255;
1338     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1339     *code++ = repeat_max >> 8;
1340     *code++ = repeat_max & 255;
1341     }
1342     }
1343    
1344     /* If previous was a bracket group, we may have to replicate it in certain
1345 nigel 23 cases. */
1346 nigel 3
1347 nigel 23 else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
1348     (int)*previous == OP_COND)
1349 nigel 3 {
1350 nigel 31 register int i;
1351     int ketoffset = 0;
1352 nigel 9 int len = code - previous;
1353 nigel 31 uschar *bralink = NULL;
1354 nigel 3
1355 nigel 23 /* If the maximum repeat count is unlimited, find the end of the bracket
1356     by scanning through from the start, and compute the offset back to it
1357     from the current code pointer. There may be an OP_OPT setting following
1358     the final KET, so we can't find the end just by going back from the code
1359     pointer. */
1360    
1361     if (repeat_max == -1)
1362 nigel 3 {
1363 nigel 23 register uschar *ket = previous;
1364     do ket += (ket[1] << 8) + ket[2]; while (*ket != OP_KET);
1365     ketoffset = code - ket;
1366 nigel 3 }
1367    
1368 nigel 31 /* The case of a zero minimum is special because of the need to stick
1369     OP_BRAZERO in front of it, and because the group appears once in the
1370     data, whereas in other cases it appears the minimum number of times. For
1371     this reason, it is simplest to treat this case separately, as otherwise
1372     the code gets far too mess. There are several special subcases when the
1373     minimum is zero. */
1374    
1375     if (repeat_min == 0)
1376     {
1377 nigel 37 /* If we set up a required char from the bracket, we must back off
1378     to the previous value and reset the countlits value too. */
1379    
1380     if (subcountlits > 0)
1381     {
1382     *reqchar = prevreqchar;
1383     *countlits -= subcountlits;
1384     }
1385    
1386 nigel 31 /* If the maximum is also zero, we just omit the group from the output
1387     altogether. */
1388    
1389     if (repeat_max == 0)
1390     {
1391     code = previous;
1392 nigel 37 goto END_REPEAT;
1393 nigel 31 }
1394    
1395     /* If the maximum is 1 or unlimited, we just have to stick in the
1396     BRAZERO and do no more at this point. */
1397    
1398     if (repeat_max <= 1)
1399     {
1400     memmove(previous+1, previous, len);
1401     code++;
1402     *previous++ = OP_BRAZERO + repeat_type;
1403     }
1404    
1405     /* If the maximum is greater than 1 and limited, we have to replicate
1406     in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1407     The first one has to be handled carefully because it's the original
1408     copy, which has to be moved up. The remainder can be handled by code
1409     that is common with the non-zero minimum case below. We just have to
1410     adjust the value or repeat_max, since one less copy is required. */
1411    
1412     else
1413     {
1414     int offset;
1415     memmove(previous+4, previous, len);
1416     code += 4;
1417     *previous++ = OP_BRAZERO + repeat_type;
1418     *previous++ = OP_BRA;
1419    
1420     /* We chain together the bracket offset fields that have to be
1421     filled in later when the ends of the brackets are reached. */
1422    
1423     offset = (bralink == NULL)? 0 : previous - bralink;
1424     bralink = previous;
1425     *previous++ = offset >> 8;
1426     *previous++ = offset & 255;
1427     }
1428    
1429     repeat_max--;
1430     }
1431    
1432     /* If the minimum is greater than zero, replicate the group as many
1433     times as necessary, and adjust the maximum to the number of subsequent
1434     copies that we need. */
1435    
1436     else
1437     {
1438     for (i = 1; i < repeat_min; i++)
1439     {
1440     memcpy(code, previous, len);
1441     code += len;
1442     }
1443     if (repeat_max > 0) repeat_max -= repeat_min;
1444     }
1445    
1446     /* This code is common to both the zero and non-zero minimum cases. If
1447     the maximum is limited, it replicates the group in a nested fashion,
1448     remembering the bracket starts on a stack. In the case of a zero minimum,
1449     the first one was set up above. In all cases the repeat_max now specifies
1450     the number of additional copies needed. */
1451    
1452     if (repeat_max >= 0)
1453     {
1454     for (i = repeat_max - 1; i >= 0; i--)
1455     {
1456     *code++ = OP_BRAZERO + repeat_type;
1457    
1458     /* All but the final copy start a new nesting, maintaining the
1459     chain of brackets outstanding. */
1460    
1461     if (i != 0)
1462     {
1463     int offset;
1464     *code++ = OP_BRA;
1465     offset = (bralink == NULL)? 0 : code - bralink;
1466     bralink = code;
1467     *code++ = offset >> 8;
1468     *code++ = offset & 255;
1469     }
1470    
1471     memcpy(code, previous, len);
1472     code += len;
1473     }
1474    
1475     /* Now chain through the pending brackets, and fill in their length
1476     fields (which are holding the chain links pro tem). */
1477    
1478     while (bralink != NULL)
1479     {
1480     int oldlinkoffset;
1481     int offset = code - bralink + 1;
1482     uschar *bra = code - offset;
1483     oldlinkoffset = (bra[1] << 8) + bra[2];
1484     bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
1485     *code++ = OP_KET;
1486     *code++ = bra[1] = offset >> 8;
1487     *code++ = bra[2] = (offset & 255);
1488     }
1489     }
1490    
1491     /* If the maximum is unlimited, set a repeater in the final copy. We
1492     can't just offset backwards from the current code point, because we
1493     don't know if there's been an options resetting after the ket. The
1494     correct offset was computed above. */
1495    
1496     else code[-ketoffset] = OP_KETRMAX + repeat_type;
1497 nigel 3 }
1498    
1499     /* Else there's some kind of shambles */
1500    
1501     else
1502     {
1503     *errorptr = ERR11;
1504     goto FAILED;
1505     }
1506    
1507     /* In all case we no longer have a previous item. */
1508    
1509 nigel 37 END_REPEAT:
1510 nigel 3 previous = NULL;
1511     break;
1512    
1513    
1514 nigel 23 /* Start of nested bracket sub-expression, or comment or lookahead or
1515     lookbehind or option setting or condition. First deal with special things
1516     that can come after a bracket; all are introduced by ?, and the appearance
1517     of any of them means that this is not a referencing group. They were
1518     checked for validity in the first pass over the string, so we don't have to
1519     check for syntax errors here. */
1520 nigel 3
1521     case '(':
1522 nigel 23 newoptions = options;
1523     condref = -1;
1524    
1525 nigel 3 if (*(++ptr) == '?')
1526     {
1527 nigel 23 int set, unset;
1528     int *optset;
1529 nigel 3
1530     switch (*(++ptr))
1531     {
1532 nigel 23 case '#': /* Comment; skip to ket */
1533 nigel 3 ptr++;
1534     while (*ptr != ')') ptr++;
1535     continue;
1536    
1537     case ':': /* Non-extracting bracket */
1538 nigel 23 bravalue = OP_BRA;
1539 nigel 3 ptr++;
1540     break;
1541    
1542 nigel 23 case '(':
1543     bravalue = OP_COND; /* Conditional group */
1544 nigel 25 if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
1545 nigel 23 {
1546     condref = *ptr - '0';
1547     while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
1548     ptr++;
1549     }
1550     else ptr--;
1551     break;
1552    
1553     case '=': /* Positive lookahead */
1554 nigel 3 bravalue = OP_ASSERT;
1555     ptr++;
1556     break;
1557    
1558 nigel 23 case '!': /* Negative lookahead */
1559 nigel 3 bravalue = OP_ASSERT_NOT;
1560     ptr++;
1561     break;
1562    
1563 nigel 23 case '<': /* Lookbehinds */
1564     switch (*(++ptr))
1565 nigel 3 {
1566 nigel 23 case '=': /* Positive lookbehind */
1567     bravalue = OP_ASSERTBACK;
1568 nigel 3 ptr++;
1569     break;
1570 nigel 23
1571     case '!': /* Negative lookbehind */
1572     bravalue = OP_ASSERTBACK_NOT;
1573     ptr++;
1574     break;
1575    
1576     default: /* Syntax error */
1577     *errorptr = ERR24;
1578     goto FAILED;
1579 nigel 3 }
1580 nigel 23 break;
1581 nigel 3
1582 nigel 23 case '>': /* One-time brackets */
1583     bravalue = OP_ONCE;
1584     ptr++;
1585     break;
1586    
1587 nigel 43 case 'R': /* Pattern recursion */
1588     *code++ = OP_RECURSE;
1589     ptr++;
1590     continue;
1591    
1592 nigel 23 default: /* Option setting */
1593     set = unset = 0;
1594     optset = &set;
1595    
1596     while (*ptr != ')' && *ptr != ':')
1597     {
1598     switch (*ptr++)
1599     {
1600     case '-': optset = &unset; break;
1601    
1602     case 'i': *optset |= PCRE_CASELESS; break;
1603     case 'm': *optset |= PCRE_MULTILINE; break;
1604     case 's': *optset |= PCRE_DOTALL; break;
1605     case 'x': *optset |= PCRE_EXTENDED; break;
1606     case 'U': *optset |= PCRE_UNGREEDY; break;
1607     case 'X': *optset |= PCRE_EXTRA; break;
1608    
1609     default:
1610     *errorptr = ERR12;
1611     goto FAILED;
1612     }
1613     }
1614    
1615     /* Set up the changed option bits, but don't change anything yet. */
1616    
1617     newoptions = (options | set) & (~unset);
1618    
1619     /* If the options ended with ')' this is not the start of a nested
1620     group with option changes, so the options change at this level. At top
1621     level there is nothing else to be done (the options will in fact have
1622     been set from the start of compiling as a result of the first pass) but
1623     at an inner level we must compile code to change the ims options if
1624     necessary, and pass the new setting back so that it can be put at the
1625     start of any following branches, and when this group ends, a resetting
1626     item can be compiled. */
1627    
1628     if (*ptr == ')')
1629     {
1630     if ((options & PCRE_INGROUP) != 0 &&
1631     (options & PCRE_IMS) != (newoptions & PCRE_IMS))
1632     {
1633     *code++ = OP_OPT;
1634     *code++ = *optchanged = newoptions & PCRE_IMS;
1635     }
1636     options = newoptions; /* Change options at this level */
1637     previous = NULL; /* This item can't be repeated */
1638     continue; /* It is complete */
1639     }
1640    
1641     /* If the options ended with ':' we are heading into a nested group
1642     with possible change of options. Such groups are non-capturing and are
1643     not assertions of any kind. All we need to do is skip over the ':';
1644     the newoptions value is handled below. */
1645    
1646     bravalue = OP_BRA;
1647     ptr++;
1648 nigel 3 }
1649     }
1650    
1651 nigel 23 /* Else we have a referencing group; adjust the opcode. */
1652 nigel 3
1653     else
1654     {
1655     if (++(*brackets) > EXTRACT_MAX)
1656     {
1657     *errorptr = ERR13;
1658     goto FAILED;
1659     }
1660     bravalue = OP_BRA + *brackets;
1661     }
1662    
1663 nigel 23 /* Process nested bracketed re. Assertions may not be repeated, but other
1664     kinds can be. We copy code into a non-register variable in order to be able
1665     to pass its address because some compilers complain otherwise. Pass in a
1666     new setting for the ims options if they have changed. */
1667 nigel 3
1668 nigel 23 previous = (bravalue >= OP_ONCE)? code : NULL;
1669 nigel 3 *code = bravalue;
1670 nigel 23 tempcode = code;
1671    
1672     if (!compile_regex(
1673     options | PCRE_INGROUP, /* Set for all nested groups */
1674     ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
1675     newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
1676     brackets, /* Bracket level */
1677     &tempcode, /* Where to put code (updated) */
1678     &ptr, /* Input pointer (updated) */
1679     errorptr, /* Where to put an error message */
1680     (bravalue == OP_ASSERTBACK ||
1681     bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1682 nigel 25 condref, /* Condition reference number */
1683 nigel 37 &subreqchar, /* For possible last char */
1684     &subcountlits, /* For literal count */
1685 nigel 25 cd)) /* Tables block */
1686 nigel 23 goto FAILED;
1687    
1688     /* At the end of compiling, code is still pointing to the start of the
1689     group, while tempcode has been updated to point past the end of the group
1690     and any option resetting that may follow it. The pattern pointer (ptr)
1691     is on the bracket. */
1692    
1693     /* If this is a conditional bracket, check that there are no more than
1694     two branches in the group. */
1695    
1696     if (bravalue == OP_COND)
1697 nigel 3 {
1698 nigel 23 uschar *tc = code;
1699 nigel 37 condcount = 0;
1700 nigel 23
1701     do {
1702 nigel 37 condcount++;
1703 nigel 23 tc += (tc[1] << 8) | tc[2];
1704     }
1705     while (*tc != OP_KET);
1706    
1707 nigel 37 if (condcount > 2)
1708 nigel 23 {
1709     *errorptr = ERR27;
1710 nigel 3 goto FAILED;
1711 nigel 23 }
1712 nigel 3 }
1713    
1714 nigel 37 /* Handle updating of the required character. If the subpattern didn't
1715     set one, leave it as it was. Otherwise, update it for normal brackets of
1716     all kinds, forward assertions, and conditions with two branches. Don't
1717     update the literal count for forward assertions, however. If the bracket
1718     is followed by a quantifier with zero repeat, we have to back off. Hence
1719     the definition of prevreqchar and subcountlits outside the main loop so
1720     that they can be accessed for the back off. */
1721    
1722     if (subreqchar > 0 &&
1723     (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT ||
1724     (bravalue == OP_COND && condcount == 2)))
1725     {
1726     prevreqchar = *reqchar;
1727     *reqchar = subreqchar;
1728     if (bravalue != OP_ASSERT) *countlits += subcountlits;
1729     }
1730    
1731 nigel 23 /* Now update the main code pointer to the end of the group. */
1732    
1733     code = tempcode;
1734    
1735     /* Error if hit end of pattern */
1736    
1737 nigel 3 if (*ptr != ')')
1738     {
1739     *errorptr = ERR14;
1740     goto FAILED;
1741     }
1742     break;
1743    
1744     /* Check \ for being a real metacharacter; if not, fall through and handle
1745     it as a data character at the start of a string. Escape items are checked
1746     for validity in the pre-compiling pass. */
1747    
1748     case '\\':
1749 nigel 23 tempptr = ptr;
1750 nigel 25 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1751 nigel 3
1752     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1753     are arranged to be the negation of the corresponding OP_values. For the
1754     back references, the values are ESC_REF plus the reference number. Only
1755     back references and those types that consume a character may be repeated.
1756     We can test for values between ESC_b and ESC_Z for the latter; this may
1757     have to change if any new ones are ever created. */
1758    
1759     if (c < 0)
1760     {
1761     if (-c >= ESC_REF)
1762     {
1763     previous = code;
1764     *code++ = OP_REF;
1765 nigel 23 *code++ = -c - ESC_REF;
1766 nigel 3 }
1767     else
1768     {
1769 nigel 23 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
1770 nigel 3 *code++ = -c;
1771     }
1772     continue;
1773     }
1774    
1775 nigel 7 /* Data character: reset and fall through */
1776 nigel 3
1777 nigel 23 ptr = tempptr;
1778 nigel 3 c = '\\';
1779    
1780     /* Handle a run of data characters until a metacharacter is encountered.
1781     The first character is guaranteed not to be whitespace or # when the
1782     extended flag is set. */
1783    
1784     NORMAL_CHAR:
1785     default:
1786     previous = code;
1787     *code = OP_CHARS;
1788     code += 2;
1789     length = 0;
1790    
1791     do
1792     {
1793     if ((options & PCRE_EXTENDED) != 0)
1794     {
1795 nigel 25 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1796 nigel 3 if (c == '#')
1797     {
1798     while ((c = *(++ptr)) != 0 && c != '\n');
1799     if (c == 0) break;
1800     continue;
1801     }
1802     }
1803    
1804     /* Backslash may introduce a data char or a metacharacter. Escaped items
1805     are checked for validity in the pre-compiling pass. Stop the string
1806     before a metaitem. */
1807    
1808     if (c == '\\')
1809     {
1810 nigel 23 tempptr = ptr;
1811 nigel 25 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1812 nigel 23 if (c < 0) { ptr = tempptr; break; }
1813 nigel 3 }
1814    
1815     /* Ordinary character or single-char escape */
1816    
1817     *code++ = c;
1818     length++;
1819     }
1820    
1821     /* This "while" is the end of the "do" above. */
1822    
1823 nigel 25 while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
1824 nigel 3
1825 nigel 37 /* Update the last character and the count of literals */
1826    
1827     prevreqchar = (length > 1)? code[-2] : *reqchar;
1828     *reqchar = code[-1];
1829     *countlits += length;
1830    
1831 nigel 3 /* Compute the length and set it in the data vector, and advance to
1832     the next state. */
1833    
1834     previous[1] = length;
1835 nigel 15 if (length < 255) ptr--;
1836 nigel 3 break;
1837     }
1838     } /* end of big loop */
1839    
1840     /* Control never reaches here by falling through, only by a goto for all the
1841     error states. Pass back the position in the pattern so that it can be displayed
1842     to the user for diagnosing the error. */
1843    
1844     FAILED:
1845     *ptrptr = ptr;
1846     return FALSE;
1847     }
1848    
1849    
1850    
1851    
1852     /*************************************************
1853     * Compile sequence of alternatives *
1854     *************************************************/
1855    
1856     /* On entry, ptr is pointing past the bracket character, but on return
1857     it points to the closing bracket, or vertical bar, or end of string.
1858     The code variable is pointing at the byte into which the BRA operator has been
1859 nigel 23 stored. If the ims options are changed at the start (for a (?ims: group) or
1860     during any branch, we need to insert an OP_OPT item at the start of every
1861     following branch to ensure they get set correctly at run time, and also pass
1862     the new options into every subsequent branch compile.
1863 nigel 3
1864     Argument:
1865 nigel 23 options the option bits
1866     optchanged new ims options to set as if (?ims) were at the start, or -1
1867     for no change
1868     brackets -> int containing the number of extracting brackets used
1869     codeptr -> the address of the current code pointer
1870     ptrptr -> the address of the current pattern pointer
1871     errorptr -> pointer to error message
1872     lookbehind TRUE if this is a lookbehind assertion
1873     condref > 0 for OPT_CREF setting at start of conditional group
1874 nigel 37 reqchar -> place to put the last required character, or a negative number
1875     countlits -> place to put the shortest literal count of any branch
1876 nigel 25 cd points to the data block with tables pointers
1877 nigel 3
1878 nigel 23 Returns: TRUE on success
1879 nigel 3 */
1880    
1881     static BOOL
1882 nigel 23 compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
1883 nigel 25 const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,
1884 nigel 37 int *reqchar, int *countlits, compile_data *cd)
1885 nigel 3 {
1886 nigel 7 const uschar *ptr = *ptrptr;
1887 nigel 3 uschar *code = *codeptr;
1888 nigel 23 uschar *last_branch = code;
1889 nigel 3 uschar *start_bracket = code;
1890 nigel 23 uschar *reverse_count = NULL;
1891     int oldoptions = options & PCRE_IMS;
1892 nigel 37 int branchreqchar, branchcountlits;
1893 nigel 3
1894 nigel 37 *reqchar = -1;
1895     *countlits = INT_MAX;
1896 nigel 23 code += 3;
1897    
1898     /* At the start of a reference-based conditional group, insert the reference
1899     number as an OP_CREF item. */
1900    
1901     if (condref > 0)
1902     {
1903     *code++ = OP_CREF;
1904     *code++ = condref;
1905     }
1906    
1907     /* Loop for each alternative branch */
1908    
1909 nigel 3 for (;;)
1910     {
1911     int length;
1912    
1913 nigel 23 /* Handle change of options */
1914    
1915     if (optchanged >= 0)
1916 nigel 3 {
1917 nigel 23 *code++ = OP_OPT;
1918     *code++ = optchanged;
1919     options = (options & ~PCRE_IMS) | optchanged;
1920     }
1921    
1922     /* Set up dummy OP_REVERSE if lookbehind assertion */
1923    
1924     if (lookbehind)
1925     {
1926     *code++ = OP_REVERSE;
1927     reverse_count = code;
1928     *code++ = 0;
1929     *code++ = 0;
1930     }
1931    
1932     /* Now compile the branch */
1933    
1934 nigel 37 if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged,
1935     &branchreqchar, &branchcountlits, cd))
1936 nigel 23 {
1937 nigel 3 *ptrptr = ptr;
1938     return FALSE;
1939     }
1940    
1941     /* Fill in the length of the last branch */
1942    
1943     length = code - last_branch;
1944     last_branch[1] = length >> 8;
1945     last_branch[2] = length & 255;
1946    
1947 nigel 37 /* Save the last required character if all branches have the same; a current
1948     value of -1 means unset, while -2 means "previous branch had no last required
1949     char". */
1950    
1951     if (*reqchar != -2)
1952     {
1953     if (branchreqchar >= 0)
1954     {
1955     if (*reqchar == -1) *reqchar = branchreqchar;
1956     else if (*reqchar != branchreqchar) *reqchar = -2;
1957     }
1958     else *reqchar = -2;
1959     }
1960    
1961     /* Keep the shortest literal count */
1962    
1963     if (branchcountlits < *countlits) *countlits = branchcountlits;
1964     DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits));
1965    
1966 nigel 23 /* If lookbehind, check that this branch matches a fixed-length string,
1967     and put the length into the OP_REVERSE item. Temporarily mark the end of
1968     the branch with OP_END. */
1969    
1970     if (lookbehind)
1971     {
1972     *code = OP_END;
1973     length = find_fixedlength(last_branch);
1974     DPRINTF(("fixed length = %d\n", length));
1975     if (length < 0)
1976     {
1977     *errorptr = ERR25;
1978     *ptrptr = ptr;
1979     return FALSE;
1980     }
1981     reverse_count[0] = (length >> 8);
1982     reverse_count[1] = length & 255;
1983     }
1984    
1985 nigel 3 /* Reached end of expression, either ')' or end of pattern. Insert a
1986     terminating ket and the length of the whole bracketed item, and return,
1987 nigel 23 leaving the pointer at the terminating char. If any of the ims options
1988     were changed inside the group, compile a resetting op-code following. */
1989 nigel 3
1990     if (*ptr != '|')
1991     {
1992     length = code - start_bracket;
1993     *code++ = OP_KET;
1994     *code++ = length >> 8;
1995     *code++ = length & 255;
1996 nigel 23 if (optchanged >= 0)
1997     {
1998     *code++ = OP_OPT;
1999     *code++ = oldoptions;
2000     }
2001 nigel 3 *codeptr = code;
2002     *ptrptr = ptr;
2003     return TRUE;
2004     }
2005    
2006     /* Another branch follows; insert an "or" node and advance the pointer. */
2007    
2008     *code = OP_ALT;
2009 nigel 23 last_branch = code;
2010     code += 3;
2011 nigel 3 ptr++;
2012     }
2013     /* Control never reaches here */
2014     }
2015    
2016    
2017    
2018 nigel 23
2019 nigel 3 /*************************************************
2020 nigel 23 * Find first significant op code *
2021     *************************************************/
2022    
2023     /* This is called by several functions that scan a compiled expression looking
2024     for a fixed first character, or an anchoring op code etc. It skips over things
2025     that do not influence this. For one application, a change of caseless option is
2026     important.
2027    
2028     Arguments:
2029     code pointer to the start of the group
2030     options pointer to external options
2031     optbit the option bit whose changing is significant, or
2032     zero if none are
2033     optstop TRUE to return on option change, otherwise change the options
2034     value and continue
2035    
2036     Returns: pointer to the first significant opcode
2037     */
2038    
2039     static const uschar*
2040     first_significant_code(const uschar *code, int *options, int optbit,
2041     BOOL optstop)
2042     {
2043     for (;;)
2044     {
2045     switch ((int)*code)
2046     {
2047     case OP_OPT:
2048     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
2049     {
2050     if (optstop) return code;
2051     *options = (int)code[1];
2052     }
2053     code += 2;
2054     break;
2055    
2056     case OP_CREF:
2057     code += 2;
2058     break;
2059    
2060 nigel 35 case OP_WORD_BOUNDARY:
2061     case OP_NOT_WORD_BOUNDARY:
2062     code++;
2063     break;
2064    
2065 nigel 23 case OP_ASSERT_NOT:
2066     case OP_ASSERTBACK:
2067     case OP_ASSERTBACK_NOT:
2068     do code += (code[1] << 8) + code[2]; while (*code == OP_ALT);
2069     code += 3;
2070     break;
2071    
2072     default:
2073     return code;
2074     }
2075     }
2076     /* Control never reaches here */
2077     }
2078    
2079    
2080    
2081    
2082     /*************************************************
2083 nigel 3 * Check for anchored expression *
2084     *************************************************/
2085    
2086     /* Try to find out if this is an anchored regular expression. Consider each
2087     alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2088     all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2089     it's anchored. However, if this is a multiline pattern, then only OP_SOD
2090     counts, since OP_CIRC can match in the middle.
2091    
2092 nigel 33 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
2093     because that will try the rest of the pattern at all possible matching points,
2094     so there is no point trying them again.
2095 nigel 3
2096 nigel 23 Arguments:
2097     code points to start of expression (the bracket)
2098     options points to the options setting
2099    
2100     Returns: TRUE or FALSE
2101 nigel 3 */
2102    
2103     static BOOL
2104 nigel 23 is_anchored(register const uschar *code, int *options)
2105 nigel 3 {
2106     do {
2107 nigel 23 const uschar *scode = first_significant_code(code + 3, options,
2108     PCRE_MULTILINE, FALSE);
2109     register int op = *scode;
2110     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2111     { if (!is_anchored(scode, options)) return FALSE; }
2112 nigel 33 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
2113     (*options & PCRE_DOTALL) != 0)
2114 nigel 23 { if (scode[1] != OP_ANY) return FALSE; }
2115     else if (op != OP_SOD &&
2116     ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
2117     return FALSE;
2118 nigel 3 code += (code[1] << 8) + code[2];
2119     }
2120     while (*code == OP_ALT);
2121     return TRUE;
2122     }
2123    
2124    
2125    
2126     /*************************************************
2127 nigel 33 * Check for starting with ^ or .* *
2128 nigel 3 *************************************************/
2129    
2130 nigel 33 /* This is called to find out if every branch starts with ^ or .* so that
2131     "first char" processing can be done to speed things up in multiline
2132     matching and for non-DOTALL patterns that start with .* (which must start at
2133     the beginning or after \n).
2134 nigel 3
2135     Argument: points to start of expression (the bracket)
2136     Returns: TRUE or FALSE
2137     */
2138    
2139     static BOOL
2140 nigel 7 is_startline(const uschar *code)
2141 nigel 3 {
2142     do {
2143 nigel 23 const uschar *scode = first_significant_code(code + 3, NULL, 0, FALSE);
2144     register int op = *scode;
2145     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2146     { if (!is_startline(scode)) return FALSE; }
2147 nigel 33 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2148     { if (scode[1] != OP_ANY) return FALSE; }
2149 nigel 23 else if (op != OP_CIRC) return FALSE;
2150 nigel 3 code += (code[1] << 8) + code[2];
2151     }
2152     while (*code == OP_ALT);
2153     return TRUE;
2154     }
2155    
2156    
2157    
2158     /*************************************************
2159     * Check for fixed first char *
2160     *************************************************/
2161    
2162     /* Try to find out if there is a fixed first character. This is called for
2163     unanchored expressions, as it speeds up their processing quite considerably.
2164     Consider each alternative branch. If they all start with the same char, or with
2165     a bracket all of whose alternatives start with the same char (recurse ad lib),
2166     then we return that char, otherwise -1.
2167    
2168 nigel 23 Arguments:
2169     code points to start of expression (the bracket)
2170     options pointer to the options (used to check casing changes)
2171    
2172     Returns: -1 or the fixed first char
2173 nigel 3 */
2174    
2175     static int
2176 nigel 23 find_firstchar(const uschar *code, int *options)
2177 nigel 3 {
2178     register int c = -1;
2179 nigel 23 do {
2180     int d;
2181     const uschar *scode = first_significant_code(code + 3, options,
2182     PCRE_CASELESS, TRUE);
2183     register int op = *scode;
2184 nigel 3
2185 nigel 23 if (op >= OP_BRA) op = OP_BRA;
2186 nigel 3
2187 nigel 23 switch(op)
2188     {
2189     default:
2190     return -1;
2191 nigel 3
2192 nigel 23 case OP_BRA:
2193     case OP_ASSERT:
2194     case OP_ONCE:
2195     case OP_COND:
2196     if ((d = find_firstchar(scode, options)) < 0) return -1;
2197     if (c < 0) c = d; else if (c != d) return -1;
2198     break;
2199 nigel 3
2200 nigel 23 case OP_EXACT: /* Fall through */
2201     scode++;
2202 nigel 3
2203 nigel 23 case OP_CHARS: /* Fall through */
2204     scode++;
2205    
2206     case OP_PLUS:
2207     case OP_MINPLUS:
2208     if (c < 0) c = scode[1]; else if (c != scode[1]) return -1;
2209     break;
2210     }
2211    
2212     code += (code[1] << 8) + code[2];
2213     }
2214 nigel 3 while (*code == OP_ALT);
2215     return c;
2216     }
2217    
2218    
2219    
2220 nigel 23
2221    
2222 nigel 3 /*************************************************
2223     * Compile a Regular Expression *
2224     *************************************************/
2225    
2226     /* This function takes a string and returns a pointer to a block of store
2227     holding a compiled version of the expression.
2228    
2229     Arguments:
2230     pattern the regular expression
2231     options various option bits
2232     errorptr pointer to pointer to error text
2233     erroroffset ptr offset in pattern where error was detected
2234 nigel 25 tables pointer to character tables or NULL
2235 nigel 3
2236     Returns: pointer to compiled data block, or NULL on error,
2237     with errorptr and erroroffset set
2238     */
2239    
2240     pcre *
2241 nigel 7 pcre_compile(const char *pattern, int options, const char **errorptr,
2242 nigel 25 int *erroroffset, const unsigned char *tables)
2243 nigel 3 {
2244     real_pcre *re;
2245     int length = 3; /* For initial BRA plus length */
2246     int runlength;
2247 nigel 43 int c, reqchar, countlits;
2248 nigel 3 int bracount = 0;
2249     int top_backref = 0;
2250 nigel 23 int branch_extra = 0;
2251     int branch_newextra;
2252 nigel 7 unsigned int brastackptr = 0;
2253 nigel 43 size_t size;
2254 nigel 7 uschar *code;
2255     const uschar *ptr;
2256 nigel 25 compile_data compile_block;
2257 nigel 23 int brastack[BRASTACK_SIZE];
2258     uschar bralenstack[BRASTACK_SIZE];
2259 nigel 3
2260     #ifdef DEBUG
2261     uschar *code_base, *code_end;
2262     #endif
2263    
2264     /* We can't pass back an error message if errorptr is NULL; I guess the best we
2265     can do is just return NULL. */
2266    
2267     if (errorptr == NULL) return NULL;
2268     *errorptr = NULL;
2269    
2270     /* However, we can give a message for this error */
2271    
2272     if (erroroffset == NULL)
2273     {
2274     *errorptr = ERR16;
2275     return NULL;
2276     }
2277     *erroroffset = 0;
2278    
2279     if ((options & ~PUBLIC_OPTIONS) != 0)
2280     {
2281     *errorptr = ERR17;
2282     return NULL;
2283     }
2284    
2285 nigel 25 /* Set up pointers to the individual character tables */
2286    
2287     if (tables == NULL) tables = pcre_default_tables;
2288     compile_block.lcc = tables + lcc_offset;
2289     compile_block.fcc = tables + fcc_offset;
2290     compile_block.cbits = tables + cbits_offset;
2291     compile_block.ctypes = tables + ctypes_offset;
2292    
2293     /* Reflect pattern for debugging output */
2294    
2295 nigel 9 DPRINTF(("------------------------------------------------------------------\n"));
2296     DPRINTF(("%s\n", pattern));
2297 nigel 3
2298     /* The first thing to do is to make a pass over the pattern to compute the
2299     amount of store required to hold the compiled code. This does not have to be
2300     perfect as long as errors are overestimates. At the same time we can detect any
2301     internal flag settings. Make an attempt to correct for any counted white space
2302     if an "extended" flag setting appears late in the pattern. We can't be so
2303     clever for #-comments. */
2304    
2305 nigel 7 ptr = (const uschar *)(pattern - 1);
2306 nigel 3 while ((c = *(++ptr)) != 0)
2307     {
2308     int min, max;
2309     int class_charcount;
2310    
2311 nigel 23 if ((options & PCRE_EXTENDED) != 0)
2312 nigel 3 {
2313 nigel 25 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2314 nigel 23 if (c == '#')
2315     {
2316     while ((c = *(++ptr)) != 0 && c != '\n');
2317     continue;
2318     }
2319 nigel 3 }
2320    
2321     switch(c)
2322     {
2323     /* A backslashed item may be an escaped "normal" character or a
2324     character type. For a "normal" character, put the pointers and
2325     character back so that tests for whitespace etc. in the input
2326     are done correctly. */
2327    
2328     case '\\':
2329     {
2330 nigel 7 const uschar *save_ptr = ptr;
2331 nigel 25 c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
2332 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2333     if (c >= 0)
2334     {
2335     ptr = save_ptr;
2336     c = '\\';
2337     goto NORMAL_CHAR;
2338     }
2339     }
2340     length++;
2341    
2342     /* A back reference needs an additional char, plus either one or 5
2343     bytes for a repeat. We also need to keep the value of the highest
2344     back reference. */
2345    
2346     if (c <= -ESC_REF)
2347     {
2348     int refnum = -c - ESC_REF;
2349     if (refnum > top_backref) top_backref = refnum;
2350     length++; /* For single back reference */
2351 nigel 25 if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2352 nigel 3 {
2353 nigel 25 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2354 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2355     if ((min == 0 && (max == 1 || max == -1)) ||
2356     (min == 1 && max == -1))
2357     length++;
2358     else length += 5;
2359     if (ptr[1] == '?') ptr++;
2360     }
2361     }
2362     continue;
2363    
2364     case '^':
2365     case '.':
2366     case '$':
2367     case '*': /* These repeats won't be after brackets; */
2368     case '+': /* those are handled separately */
2369     case '?':
2370     length++;
2371     continue;
2372    
2373     /* This covers the cases of repeats after a single char, metachar, class,
2374     or back reference. */
2375    
2376     case '{':
2377 nigel 25 if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
2378     ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
2379 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2380     if ((min == 0 && (max == 1 || max == -1)) ||
2381     (min == 1 && max == -1))
2382     length++;
2383     else
2384     {
2385     length--; /* Uncount the original char or metachar */
2386     if (min == 1) length++; else if (min > 0) length += 4;
2387     if (max > 0) length += 4; else length += 2;
2388     }
2389     if (ptr[1] == '?') ptr++;
2390     continue;
2391    
2392 nigel 23 /* An alternation contains an offset to the next branch or ket. If any ims
2393     options changed in the previous branch(es), and/or if we are in a
2394     lookbehind assertion, extra space will be needed at the start of the
2395     branch. This is handled by branch_extra. */
2396    
2397 nigel 3 case '|':
2398 nigel 23 length += 3 + branch_extra;
2399 nigel 3 continue;
2400    
2401     /* A character class uses 33 characters. Don't worry about character types
2402     that aren't allowed in classes - they'll get picked up during the compile.
2403     A character class that contains only one character uses 2 or 3 bytes,
2404     depending on whether it is negated or not. Notice this where we can. */
2405    
2406     case '[':
2407     class_charcount = 0;
2408     if (*(++ptr) == '^') ptr++;
2409     do
2410     {
2411     if (*ptr == '\\')
2412     {
2413 nigel 25 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
2414     &compile_block);
2415 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2416 nigel 9 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2417 nigel 3 }
2418     else class_charcount++;
2419     ptr++;
2420     }
2421     while (*ptr != 0 && *ptr != ']');
2422    
2423     /* Repeats for negated single chars are handled by the general code */
2424    
2425     if (class_charcount == 1) length += 3; else
2426     {
2427     length += 33;
2428    
2429     /* A repeat needs either 1 or 5 bytes. */
2430    
2431 nigel 25 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2432 nigel 3 {
2433 nigel 25 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2434 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2435     if ((min == 0 && (max == 1 || max == -1)) ||
2436     (min == 1 && max == -1))
2437     length++;
2438     else length += 5;
2439     if (ptr[1] == '?') ptr++;
2440     }
2441     }
2442     continue;
2443    
2444     /* Brackets may be genuine groups or special things */
2445    
2446     case '(':
2447 nigel 23 branch_newextra = 0;
2448 nigel 3
2449     /* Handle special forms of bracket, which all start (? */
2450    
2451 nigel 23 if (ptr[1] == '?')
2452 nigel 3 {
2453 nigel 23 int set, unset;
2454     int *optset;
2455    
2456     switch (c = ptr[2])
2457 nigel 3 {
2458 nigel 23 /* Skip over comments entirely */
2459     case '#':
2460     ptr += 3;
2461     while (*ptr != 0 && *ptr != ')') ptr++;
2462     if (*ptr == 0)
2463     {
2464     *errorptr = ERR18;
2465     goto PCRE_ERROR_RETURN;
2466     }
2467     continue;
2468 nigel 3
2469 nigel 23 /* Non-referencing groups and lookaheads just move the pointer on, and
2470     then behave like a non-special bracket, except that they don't increment
2471     the count of extracting brackets. Ditto for the "once only" bracket,
2472     which is in Perl from version 5.005. */
2473 nigel 3
2474 nigel 23 case ':':
2475     case '=':
2476     case '!':
2477     case '>':
2478 nigel 3 ptr += 2;
2479     break;
2480    
2481 nigel 43 /* A recursive call to the regex is an extension, to provide the
2482     facility which can be obtained by $(?p{perl-code}) in Perl 5.6. */
2483    
2484     case 'R':
2485     if (ptr[3] != ')')
2486     {
2487     *errorptr = ERR29;
2488     goto PCRE_ERROR_RETURN;
2489     }
2490     ptr += 3;
2491     length += 1;
2492     break;
2493    
2494 nigel 23 /* Lookbehinds are in Perl from version 5.005 */
2495 nigel 3
2496 nigel 23 case '<':
2497     if (ptr[3] == '=' || ptr[3] == '!')
2498 nigel 3 {
2499 nigel 23 ptr += 3;
2500     branch_newextra = 3;
2501     length += 3; /* For the first branch */
2502     break;
2503 nigel 3 }
2504 nigel 23 *errorptr = ERR24;
2505     goto PCRE_ERROR_RETURN;
2506    
2507     /* Conditionals are in Perl from version 5.005. The bracket must either
2508     be followed by a number (for bracket reference) or by an assertion
2509     group. */
2510    
2511     case '(':
2512 nigel 25 if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
2513 nigel 3 {
2514 nigel 23 ptr += 4;
2515     length += 2;
2516 nigel 25 while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
2517 nigel 23 if (*ptr != ')')
2518     {
2519     *errorptr = ERR26;
2520     goto PCRE_ERROR_RETURN;
2521     }
2522 nigel 3 }
2523 nigel 23 else /* An assertion must follow */
2524 nigel 3 {
2525 nigel 23 ptr++; /* Can treat like ':' as far as spacing is concerned */
2526    
2527     if (ptr[2] != '?' || strchr("=!<", ptr[3]) == NULL)
2528     {
2529     ptr += 2; /* To get right offset in message */
2530     *errorptr = ERR28;
2531     goto PCRE_ERROR_RETURN;
2532     }
2533 nigel 3 }
2534 nigel 23 break;
2535    
2536     /* Else loop checking valid options until ) is met. Anything else is an
2537     error. If we are without any brackets, i.e. at top level, the settings
2538     act as if specified in the options, so massage the options immediately.
2539     This is for backward compatibility with Perl 5.004. */
2540    
2541     default:
2542     set = unset = 0;
2543     optset = &set;
2544     ptr += 2;
2545    
2546     for (;; ptr++)
2547 nigel 3 {
2548 nigel 23 c = *ptr;
2549     switch (c)
2550     {
2551     case 'i':
2552     *optset |= PCRE_CASELESS;
2553     continue;
2554    
2555     case 'm':
2556     *optset |= PCRE_MULTILINE;
2557     continue;
2558    
2559     case 's':
2560     *optset |= PCRE_DOTALL;
2561     continue;
2562    
2563     case 'x':
2564     *optset |= PCRE_EXTENDED;
2565     continue;
2566    
2567     case 'X':
2568     *optset |= PCRE_EXTRA;
2569     continue;
2570    
2571     case 'U':
2572     *optset |= PCRE_UNGREEDY;
2573     continue;
2574    
2575     case '-':
2576     optset = &unset;
2577     continue;
2578    
2579     /* A termination by ')' indicates an options-setting-only item;
2580     this is global at top level; otherwise nothing is done here and
2581     it is handled during the compiling process on a per-bracket-group
2582     basis. */
2583    
2584     case ')':
2585     if (brastackptr == 0)
2586     {
2587     options = (options | set) & (~unset);
2588     set = unset = 0; /* To save length */
2589     }
2590     /* Fall through */
2591    
2592     /* A termination by ':' indicates the start of a nested group with
2593     the given options set. This is again handled at compile time, but
2594     we must allow for compiled space if any of the ims options are
2595     set. We also have to allow for resetting space at the end of
2596     the group, which is why 4 is added to the length and not just 2.
2597     If there are several changes of options within the same group, this
2598     will lead to an over-estimate on the length, but this shouldn't
2599     matter very much. We also have to allow for resetting options at
2600     the start of any alternations, which we do by setting
2601 nigel 37 branch_newextra to 2. Finally, we record whether the case-dependent
2602     flag ever changes within the regex. This is used by the "required
2603     character" code. */
2604 nigel 23
2605     case ':':
2606     if (((set|unset) & PCRE_IMS) != 0)
2607     {
2608     length += 4;
2609     branch_newextra = 2;
2610 nigel 37 if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
2611 nigel 23 }
2612     goto END_OPTIONS;
2613    
2614     /* Unrecognized option character */
2615    
2616     default:
2617     *errorptr = ERR12;
2618     goto PCRE_ERROR_RETURN;
2619     }
2620 nigel 3 }
2621 nigel 23
2622     /* If we hit a closing bracket, that's it - this is a freestanding
2623     option-setting. We need to ensure that branch_extra is updated if
2624     necessary. The only values branch_newextra can have here are 0 or 2.
2625     If the value is 2, then branch_extra must either be 2 or 5, depending
2626     on whether this is a lookbehind group or not. */
2627    
2628     END_OPTIONS:
2629     if (c == ')')
2630 nigel 19 {
2631 nigel 23 if (branch_newextra == 2 && (branch_extra == 0 || branch_extra == 3))
2632     branch_extra += branch_newextra;
2633 nigel 19 continue;
2634     }
2635 nigel 3
2636 nigel 23 /* If options were terminated by ':' control comes here. Fall through
2637     to handle the group below. */
2638 nigel 3 }
2639     }
2640    
2641     /* Extracting brackets must be counted so we can process escapes in a
2642     Perlish way. */
2643    
2644     else bracount++;
2645    
2646     /* Non-special forms of bracket. Save length for computing whole length
2647 nigel 23 at end if there's a repeat that requires duplication of the group. Also
2648     save the current value of branch_extra, and start the new group with
2649     the new value. If non-zero, this will either be 2 for a (?imsx: group, or 3
2650     for a lookbehind assertion. */
2651 nigel 3
2652     if (brastackptr >= sizeof(brastack)/sizeof(int))
2653     {
2654     *errorptr = ERR19;
2655     goto PCRE_ERROR_RETURN;
2656     }
2657    
2658 nigel 23 bralenstack[brastackptr] = branch_extra;
2659     branch_extra = branch_newextra;
2660    
2661 nigel 3 brastack[brastackptr++] = length;
2662     length += 3;
2663     continue;
2664    
2665     /* Handle ket. Look for subsequent max/min; for certain sets of values we
2666 nigel 9 have to replicate this bracket up to that many times. If brastackptr is
2667     0 this is an unmatched bracket which will generate an error, but take care
2668 nigel 23 not to try to access brastack[-1] when computing the length and restoring
2669     the branch_extra value. */
2670 nigel 3
2671     case ')':
2672     length += 3;
2673     {
2674 nigel 9 int minval = 1;
2675     int maxval = 1;
2676 nigel 23 int duplength;
2677 nigel 3
2678 nigel 23 if (brastackptr > 0)
2679     {
2680     duplength = length - brastack[--brastackptr];
2681     branch_extra = bralenstack[brastackptr];
2682     }
2683     else duplength = 0;
2684    
2685 nigel 3 /* Leave ptr at the final char; for read_repeat_counts this happens
2686     automatically; for the others we need an increment. */
2687    
2688 nigel 25 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
2689 nigel 3 {
2690 nigel 25 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr,
2691     &compile_block);
2692 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2693     }
2694 nigel 9 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2695     else if (c == '+') { maxval = -1; ptr++; }
2696     else if (c == '?') { minval = 0; ptr++; }
2697 nigel 3
2698 nigel 31 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2699     group, and if the maximum is greater than zero, we have to replicate
2700     maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2701     bracket set - hence the 7. */
2702 nigel 3
2703 nigel 31 if (minval == 0)
2704     {
2705     length++;
2706     if (maxval > 0) length += (maxval - 1) * (duplength + 7);
2707     }
2708    
2709     /* When the minimum is greater than zero, 1 we have to replicate up to
2710     minval-1 times, with no additions required in the copies. Then, if
2711     there is a limited maximum we have to replicate up to maxval-1 times
2712     allowing for a BRAZERO item before each optional copy and nesting
2713     brackets for all but one of the optional copies. */
2714    
2715     else
2716     {
2717     length += (minval - 1) * duplength;
2718     if (maxval > minval) /* Need this test as maxval=-1 means no limit */
2719     length += (maxval - minval) * (duplength + 7) - 6;
2720     }
2721 nigel 3 }
2722     continue;
2723    
2724     /* Non-special character. For a run of such characters the length required
2725     is the number of characters + 2, except that the maximum run length is 255.
2726     We won't get a skipped space or a non-data escape or the start of a #
2727     comment as the first character, so the length can't be zero. */
2728    
2729     NORMAL_CHAR:
2730     default:
2731     length += 2;
2732     runlength = 0;
2733     do
2734     {
2735 nigel 23 if ((options & PCRE_EXTENDED) != 0)
2736 nigel 3 {
2737 nigel 25 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2738 nigel 23 if (c == '#')
2739     {
2740     while ((c = *(++ptr)) != 0 && c != '\n');
2741     continue;
2742     }
2743 nigel 3 }
2744    
2745     /* Backslash may introduce a data char or a metacharacter; stop the
2746     string before the latter. */
2747    
2748     if (c == '\\')
2749     {
2750 nigel 7 const uschar *saveptr = ptr;
2751 nigel 25 c = check_escape(&ptr, errorptr, bracount, options, FALSE,
2752     &compile_block);
2753 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2754     if (c < 0) { ptr = saveptr; break; }
2755     }
2756    
2757     /* Ordinary character or single-char escape */
2758    
2759     runlength++;
2760     }
2761    
2762     /* This "while" is the end of the "do" above. */
2763    
2764 nigel 25 while (runlength < 255 &&
2765     (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
2766 nigel 3
2767     ptr--;
2768     length += runlength;
2769     continue;
2770     }
2771     }
2772    
2773     length += 4; /* For final KET and END */
2774    
2775     if (length > 65539)
2776     {
2777     *errorptr = ERR20;
2778     return NULL;
2779     }
2780    
2781     /* Compute the size of data block needed and get it, either from malloc or
2782 nigel 9 externally provided function. We specify "code[0]" in the offsetof() expression
2783     rather than just "code", because it has been reported that one broken compiler
2784     fails on "code" because it is also an independent variable. It should make no
2785     difference to the value of the offsetof(). */
2786 nigel 3
2787 nigel 9 size = length + offsetof(real_pcre, code[0]);
2788 nigel 3 re = (real_pcre *)(pcre_malloc)(size);
2789    
2790     if (re == NULL)
2791     {
2792     *errorptr = ERR21;
2793     return NULL;
2794     }
2795    
2796 nigel 43 /* Put in the magic number, and save the size, options, and table pointer */
2797 nigel 9
2798 nigel 3 re->magic_number = MAGIC_NUMBER;
2799 nigel 43 re->size = size;
2800 nigel 3 re->options = options;
2801 nigel 25 re->tables = tables;
2802 nigel 3
2803     /* Set up a starting, non-extracting bracket, then compile the expression. On
2804     error, *errorptr will be set non-NULL, so we don't need to look at the result
2805     of the function here. */
2806    
2807 nigel 7 ptr = (const uschar *)pattern;
2808 nigel 3 code = re->code;
2809     *code = OP_BRA;
2810     bracount = 0;
2811 nigel 25 (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,
2812 nigel 37 &reqchar, &countlits, &compile_block);
2813 nigel 3 re->top_bracket = bracount;
2814     re->top_backref = top_backref;
2815    
2816     /* If not reached end of pattern on success, there's an excess bracket. */
2817    
2818     if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
2819    
2820     /* Fill in the terminating state and check for disastrous overflow, but
2821     if debugging, leave the test till after things are printed out. */
2822    
2823     *code++ = OP_END;
2824    
2825     #ifndef DEBUG
2826     if (code - re->code > length) *errorptr = ERR23;
2827     #endif
2828    
2829 nigel 23 /* Give an error if there's back reference to a non-existent capturing
2830     subpattern. */
2831    
2832     if (top_backref > re->top_bracket) *errorptr = ERR15;
2833    
2834 nigel 3 /* Failed to compile */
2835    
2836     if (*errorptr != NULL)
2837     {
2838     (pcre_free)(re);
2839     PCRE_ERROR_RETURN:
2840 nigel 7 *erroroffset = ptr - (const uschar *)pattern;
2841 nigel 3 return NULL;
2842     }
2843    
2844 nigel 33 /* If the anchored option was not passed, set flag if we can determine that the
2845     pattern is anchored by virtue of ^ characters or \A or anything else (such as
2846     starting with .* when DOTALL is set).
2847 nigel 3
2848 nigel 33 Otherwise, see if we can determine what the first character has to be, because
2849     that speeds up unanchored matches no end. If not, see if we can set the
2850     PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
2851     start with ^. and also when all branches start with .* for non-DOTALL matches.
2852     */
2853    
2854 nigel 3 if ((options & PCRE_ANCHORED) == 0)
2855     {
2856 nigel 23 int temp_options = options;
2857     if (is_anchored(re->code, &temp_options))
2858 nigel 3 re->options |= PCRE_ANCHORED;
2859     else
2860     {
2861 nigel 23 int ch = find_firstchar(re->code, &temp_options);
2862 nigel 9 if (ch >= 0)
2863 nigel 3 {
2864 nigel 9 re->first_char = ch;
2865 nigel 3 re->options |= PCRE_FIRSTSET;
2866     }
2867     else if (is_startline(re->code))
2868     re->options |= PCRE_STARTLINE;
2869     }
2870     }
2871    
2872 nigel 37 /* Save the last required character if there are at least two literal
2873     characters on all paths, or if there is no first character setting. */
2874    
2875     if (reqchar >= 0 && (countlits > 1 || (re->options & PCRE_FIRSTSET) == 0))
2876     {
2877     re->req_char = reqchar;
2878     re->options |= PCRE_REQCHSET;
2879     }
2880    
2881 nigel 3 /* Print out the compiled data for debugging */
2882    
2883     #ifdef DEBUG
2884    
2885 nigel 23 printf("Length = %d top_bracket = %d top_backref = %d\n",
2886 nigel 3 length, re->top_bracket, re->top_backref);
2887    
2888     if (re->options != 0)
2889     {
2890 nigel 37 printf("%s%s%s%s%s%s%s%s%s\n",
2891 nigel 3 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2892     ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2893 nigel 37 ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
2894 nigel 3 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2895     ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2896     ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
2897     ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
2898 nigel 19 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
2899     ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
2900 nigel 3 }
2901    
2902     if ((re->options & PCRE_FIRSTSET) != 0)
2903     {
2904     if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
2905     else printf("First char = \\x%02x\n", re->first_char);
2906     }
2907    
2908 nigel 37 if ((re->options & PCRE_REQCHSET) != 0)
2909     {
2910     if (isprint(re->req_char)) printf("Req char = %c\n", re->req_char);
2911     else printf("Req char = \\x%02x\n", re->req_char);
2912     }
2913    
2914 nigel 3 code_end = code;
2915     code_base = code = re->code;
2916    
2917     while (code < code_end)
2918     {
2919     int charlength;
2920    
2921     printf("%3d ", code - code_base);
2922    
2923     if (*code >= OP_BRA)
2924     {
2925     printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
2926     code += 2;
2927     }
2928    
2929     else switch(*code)
2930     {
2931 nigel 23 case OP_OPT:
2932     printf(" %.2x %s", code[1], OP_names[*code]);
2933     code++;
2934     break;
2935    
2936     case OP_COND:
2937     printf("%3d Cond", (code[1] << 8) + code[2]);
2938     code += 2;
2939     break;
2940    
2941     case OP_CREF:
2942     printf(" %.2d %s", code[1], OP_names[*code]);
2943     code++;
2944     break;
2945    
2946 nigel 3 case OP_CHARS:
2947     charlength = *(++code);
2948     printf("%3d ", charlength);
2949     while (charlength-- > 0)
2950     if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2951     break;
2952    
2953     case OP_KETRMAX:
2954     case OP_KETRMIN:
2955     case OP_ALT:
2956     case OP_KET:
2957     case OP_ASSERT:
2958     case OP_ASSERT_NOT:
2959 nigel 23 case OP_ASSERTBACK:
2960     case OP_ASSERTBACK_NOT:
2961 nigel 3 case OP_ONCE:
2962     printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2963     code += 2;
2964     break;
2965    
2966 nigel 23 case OP_REVERSE:
2967     printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2968     code += 2;
2969     break;
2970    
2971 nigel 3 case OP_STAR:
2972     case OP_MINSTAR:
2973     case OP_PLUS:
2974     case OP_MINPLUS:
2975     case OP_QUERY:
2976     case OP_MINQUERY:
2977     case OP_TYPESTAR:
2978     case OP_TYPEMINSTAR:
2979     case OP_TYPEPLUS:
2980     case OP_TYPEMINPLUS:
2981     case OP_TYPEQUERY:
2982     case OP_TYPEMINQUERY:
2983     if (*code >= OP_TYPESTAR)
2984     printf(" %s", OP_names[code[1]]);
2985     else if (isprint(c = code[1])) printf(" %c", c);
2986     else printf(" \\x%02x", c);
2987     printf("%s", OP_names[*code++]);
2988     break;
2989    
2990     case OP_EXACT:
2991     case OP_UPTO:
2992     case OP_MINUPTO:
2993     if (isprint(c = code[3])) printf(" %c{", c);
2994     else printf(" \\x%02x{", c);
2995 nigel 11 if (*code != OP_EXACT) printf("0,");
2996 nigel 3 printf("%d}", (code[1] << 8) + code[2]);
2997     if (*code == OP_MINUPTO) printf("?");
2998     code += 3;
2999     break;
3000    
3001     case OP_TYPEEXACT:
3002     case OP_TYPEUPTO:
3003     case OP_TYPEMINUPTO:
3004     printf(" %s{", OP_names[code[3]]);
3005     if (*code != OP_TYPEEXACT) printf(",");
3006     printf("%d}", (code[1] << 8) + code[2]);
3007     if (*code == OP_TYPEMINUPTO) printf("?");
3008     code += 3;
3009     break;
3010    
3011     case OP_NOT:
3012     if (isprint(c = *(++code))) printf(" [^%c]", c);
3013     else printf(" [^\\x%02x]", c);
3014     break;
3015    
3016     case OP_NOTSTAR:
3017     case OP_NOTMINSTAR:
3018     case OP_NOTPLUS:
3019     case OP_NOTMINPLUS:
3020     case OP_NOTQUERY:
3021     case OP_NOTMINQUERY:
3022     if (isprint(c = code[1])) printf(" [^%c]", c);
3023     else printf(" [^\\x%02x]", c);
3024     printf("%s", OP_names[*code++]);
3025     break;
3026    
3027     case OP_NOTEXACT:
3028     case OP_NOTUPTO:
3029     case OP_NOTMINUPTO:
3030     if (isprint(c = code[3])) printf(" [^%c]{", c);
3031     else printf(" [^\\x%02x]{", c);
3032     if (*code != OP_NOTEXACT) printf(",");
3033     printf("%d}", (code[1] << 8) + code[2]);
3034     if (*code == OP_NOTMINUPTO) printf("?");
3035     code += 3;
3036     break;
3037    
3038     case OP_REF:
3039     printf(" \\%d", *(++code));
3040 nigel 9 code ++;
3041     goto CLASS_REF_REPEAT;
3042 nigel 3
3043     case OP_CLASS:
3044     {
3045     int i, min, max;
3046 nigel 23 code++;
3047     printf(" [");
3048 nigel 3
3049     for (i = 0; i < 256; i++)
3050     {
3051     if ((code[i/8] & (1 << (i&7))) != 0)
3052     {
3053     int j;
3054     for (j = i+1; j < 256; j++)
3055     if ((code[j/8] & (1 << (j&7))) == 0) break;
3056     if (i == '-' || i == ']') printf("\\");
3057     if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
3058     if (--j > i)
3059     {
3060     printf("-");
3061     if (j == '-' || j == ']') printf("\\");
3062     if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
3063     }
3064     i = j;
3065     }
3066     }
3067     printf("]");
3068     code += 32;
3069    
3070 nigel 9 CLASS_REF_REPEAT:
3071    
3072 nigel 3 switch(*code)
3073     {
3074     case OP_CRSTAR:
3075     case OP_CRMINSTAR:
3076     case OP_CRPLUS:
3077     case OP_CRMINPLUS:
3078     case OP_CRQUERY:
3079     case OP_CRMINQUERY:
3080     printf("%s", OP_names[*code]);
3081     break;
3082    
3083     case OP_CRRANGE:
3084     case OP_CRMINRANGE:
3085     min = (code[1] << 8) + code[2];
3086     max = (code[3] << 8) + code[4];
3087     if (max == 0) printf("{%d,}", min);
3088     else printf("{%d,%d}", min, max);
3089     if (*code == OP_CRMINRANGE) printf("?");
3090     code += 4;
3091     break;
3092    
3093     default:
3094     code--;
3095     }
3096     }
3097     break;
3098    
3099     /* Anything else is just a one-node item */
3100    
3101     default:
3102     printf(" %s", OP_names[*code]);
3103     break;
3104     }
3105    
3106     code++;
3107     printf("\n");
3108     }
3109     printf("------------------------------------------------------------------\n");
3110    
3111     /* This check is done here in the debugging case so that the code that
3112     was compiled can be seen. */
3113    
3114     if (code - re->code > length)
3115     {
3116     *errorptr = ERR23;
3117     (pcre_free)(re);
3118     *erroroffset = ptr - (uschar *)pattern;
3119     return NULL;
3120     }
3121     #endif
3122    
3123     return (pcre *)re;
3124     }
3125    
3126    
3127    
3128     /*************************************************
3129     * Match a back-reference *
3130     *************************************************/
3131    
3132 nigel 23 /* If a back reference hasn't been set, the length that is passed is greater
3133     than the number of characters left in the string, so the match fails.
3134 nigel 3
3135     Arguments:
3136 nigel 23 offset index into the offset vector
3137 nigel 3 eptr points into the subject
3138     length length to be matched
3139     md points to match data block
3140 nigel 23 ims the ims flags
3141 nigel 3
3142     Returns: TRUE if matched
3143     */
3144    
3145     static BOOL
3146 nigel 23 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
3147 nigel 37 unsigned long int ims)
3148 nigel 3 {
3149 nigel 23 const uschar *p = md->start_subject + md->offset_vector[offset];
3150 nigel 3
3151     #ifdef DEBUG
3152     if (eptr >= md->end_subject)
3153     printf("matching subject <null>");
3154     else
3155     {
3156     printf("matching subject ");
3157     pchars(eptr, length, TRUE, md);
3158     }
3159     printf(" against backref ");
3160     pchars(p, length, FALSE, md);
3161     printf("\n");
3162     #endif
3163    
3164     /* Always fail if not enough characters left */
3165    
3166 nigel 23 if (length > md->end_subject - eptr) return FALSE;
3167 nigel 3
3168     /* Separate the caselesss case for speed */
3169    
3170 nigel 23 if ((ims & PCRE_CASELESS) != 0)
3171 nigel 25 {
3172     while (length-- > 0)
3173     if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
3174     }
3175 nigel 3 else
3176     { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
3177    
3178     return TRUE;
3179     }
3180    
3181    
3182    
3183     /*************************************************
3184     * Match from current position *
3185     *************************************************/
3186    
3187 nigel 23 /* On entry ecode points to the first opcode, and eptr to the first character
3188     in the subject string, while eptrb holds the value of eptr at the start of the
3189     last bracketed group - used for breaking infinite loops matching zero-length
3190     strings.
3191 nigel 3
3192     Arguments:
3193     eptr pointer in subject
3194     ecode position in code
3195     offset_top current top pointer
3196     md pointer to "static" info for the match
3197 nigel 23 ims current /i, /m, and /s options
3198     condassert TRUE if called to check a condition assertion
3199     eptrb eptr at start of last bracket
3200 nigel 3
3201     Returns: TRUE if matched
3202     */
3203    
3204     static BOOL
3205 nigel 23 match(register const uschar *eptr, register const uschar *ecode,
3206 nigel 37 int offset_top, match_data *md, unsigned long int ims, BOOL condassert,
3207     const uschar *eptrb)
3208 nigel 3 {
3209 nigel 37 unsigned long int original_ims = ims; /* Save for resetting on ')' */
3210 nigel 23
3211 nigel 3 for (;;)
3212     {
3213 nigel 23 int op = (int)*ecode;
3214 nigel 3 int min, max, ctype;
3215     register int i;
3216     register int c;
3217 nigel 7 BOOL minimize = FALSE;
3218 nigel 3
3219 nigel 23 /* Opening capturing bracket. If there is space in the offset vector, save
3220     the current subject position in the working slot at the top of the vector. We
3221     mustn't change the current values of the data slot, because they may be set
3222     from a previous iteration of this group, and be referred to by a reference
3223     inside the group.
3224 nigel 3
3225 nigel 23 If the bracket fails to match, we need to restore this value and also the
3226     values of the final offsets, in case they were set by a previous iteration of
3227     the same bracket.
3228    
3229     If there isn't enough space in the offset vector, treat this as if it were a
3230     non-capturing bracket. Don't worry about setting the flag for the error case
3231     here; that is handled in the code for KET. */
3232    
3233     if (op > OP_BRA)
3234 nigel 3 {
3235 nigel 23 int number = op - OP_BRA;
3236     int offset = number << 1;
3237 nigel 3
3238 nigel 31 #ifdef DEBUG
3239     printf("start bracket %d subject=", number);
3240     pchars(eptr, 16, TRUE, md);
3241     printf("\n");
3242     #endif
3243 nigel 3
3244 nigel 23 if (offset < md->offset_max)
3245 nigel 3 {
3246 nigel 23 int save_offset1 = md->offset_vector[offset];
3247     int save_offset2 = md->offset_vector[offset+1];
3248     int save_offset3 = md->offset_vector[md->offset_end - number];
3249 nigel 3
3250 nigel 23 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
3251     md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
3252    
3253     do
3254     {
3255     if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3256     ecode += (ecode[1] << 8) + ecode[2];
3257     }
3258     while (*ecode == OP_ALT);
3259    
3260     DPRINTF(("bracket %d failed\n", number));
3261    
3262     md->offset_vector[offset] = save_offset1;
3263     md->offset_vector[offset+1] = save_offset2;
3264     md->offset_vector[md->offset_end - number] = save_offset3;
3265     return FALSE;
3266 nigel 3 }
3267    
3268 nigel 23 /* Insufficient room for saving captured contents */
3269 nigel 3
3270 nigel 23 else op = OP_BRA;
3271     }
3272    
3273     /* Other types of node can be handled by a switch */
3274    
3275     switch(op)
3276     {
3277     case OP_BRA: /* Non-capturing bracket: optimized */
3278     DPRINTF(("start bracket 0\n"));
3279 nigel 3 do
3280     {
3281 nigel 23 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3282 nigel 3 ecode += (ecode[1] << 8) + ecode[2];
3283     }
3284     while (*ecode == OP_ALT);
3285 nigel 23 DPRINTF(("bracket 0 failed\n"));
3286     return FALSE;
3287 nigel 3
3288 nigel 23 /* Conditional group: compilation checked that there are no more than
3289     two branches. If the condition is false, skipping the first branch takes us
3290     past the end if there is only one branch, but that's OK because that is
3291     exactly what going to the ket would do. */
3292 nigel 3
3293 nigel 23 case OP_COND:
3294     if (ecode[3] == OP_CREF) /* Condition is extraction test */
3295 nigel 3 {
3296 nigel 23 int offset = ecode[4] << 1; /* Doubled reference number */
3297     return match(eptr,
3298     ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
3299     5 : 3 + (ecode[1] << 8) + ecode[2]),
3300     offset_top, md, ims, FALSE, eptr);
3301 nigel 3 }
3302    
3303 nigel 23 /* The condition is an assertion. Call match() to evaluate it - setting
3304     the final argument TRUE causes it to stop at the end of an assertion. */
3305 nigel 3
3306 nigel 23 else
3307     {
3308     if (match(eptr, ecode+3, offset_top, md, ims, TRUE, NULL))
3309     {
3310     ecode += 3 + (ecode[4] << 8) + ecode[5];
3311     while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
3312     }
3313     else ecode += (ecode[1] << 8) + ecode[2];
3314     return match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr);
3315     }
3316     /* Control never reaches here */
3317 nigel 3
3318 nigel 23 /* Skip over conditional reference data if encountered (should not be) */
3319    
3320     case OP_CREF:
3321     ecode += 2;
3322     break;
3323    
3324 nigel 37 /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
3325     an empty string - recursion will then try other alternatives, if any. */
3326 nigel 23
3327 nigel 3 case OP_END:
3328 nigel 37 if (md->notempty && eptr == md->start_match) return FALSE;
3329 nigel 3 md->end_match_ptr = eptr; /* Record where we ended */
3330     md->end_offset_top = offset_top; /* and how many extracts were taken */
3331     return TRUE;
3332    
3333 nigel 23 /* Change option settings */
3334 nigel 3
3335 nigel 23 case OP_OPT:
3336     ims = ecode[1];
3337     ecode += 2;
3338 nigel 39 DPRINTF(("ims set to %02lx\n", ims));
3339 nigel 23 break;
3340 nigel 3
3341     /* Assertion brackets. Check the alternative branches in turn - the
3342     matching won't pass the KET for an assertion. If any one branch matches,
3343 nigel 23 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
3344     start of each branch to move the current point backwards, so the code at
3345     this level is identical to the lookahead case. */
3346 nigel 3
3347     case OP_ASSERT:
3348 nigel 23 case OP_ASSERTBACK:
3349 nigel 3 do
3350     {
3351 nigel 23 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) break;
3352 nigel 3 ecode += (ecode[1] << 8) + ecode[2];
3353     }
3354     while (*ecode == OP_ALT);
3355     if (*ecode == OP_KET) return FALSE;
3356    
3357 nigel 23 /* If checking an assertion for a condition, return TRUE. */
3358    
3359     if (condassert) return TRUE;
3360    
3361 nigel 3 /* Continue from after the assertion, updating the offsets high water
3362     mark, since extracts may have been taken during the assertion. */
3363    
3364     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3365     ecode += 3;
3366     offset_top = md->end_offset_top;
3367     continue;
3368    
3369     /* Negative assertion: all branches must fail to match */
3370    
3371     case OP_ASSERT_NOT:
3372 nigel 23 case OP_ASSERTBACK_NOT:
3373 nigel 3 do
3374     {
3375 nigel 23 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) return FALSE;
3376 nigel 3 ecode += (ecode[1] << 8) + ecode[2];
3377     }
3378     while (*ecode == OP_ALT);
3379 nigel 23
3380     if (condassert) return TRUE;
3381 nigel 3 ecode += 3;
3382     continue;
3383    
3384 nigel 23 /* Move the subject pointer back. This occurs only at the start of
3385     each branch of a lookbehind assertion. If we are too close to the start to
3386     move back, this match function fails. */
3387    
3388     case OP_REVERSE:
3389     eptr -= (ecode[1] << 8) + ecode[2];
3390     if (eptr < md->start_subject) return FALSE;
3391     ecode += 3;
3392     break;
3393    
3394 nigel 43 /* Recursion matches the current regex, nested. If there are any capturing
3395     brackets started but not finished, we have to save their starting points
3396     and reinstate them after the recursion. However, we don't know how many
3397     such there are (offset_top records the completed total) so we just have
3398     to save all the potential data. There may be up to 99 such values, which
3399     is a bit large to put on the stack, but using malloc for small numbers
3400     seems expensive. As a compromise, the stack is used when there are fewer
3401     than 16 values to store; otherwise malloc is used. A problem is what to do
3402     if the malloc fails ... there is no way of returning to the top level with
3403     an error. Save the top 15 values on the stack, and accept that the rest
3404     may be wrong. */
3405 nigel 23
3406 nigel 43 case OP_RECURSE:
3407     {
3408     BOOL rc;
3409     int *save;
3410     int stacksave[15];
3411    
3412     c = md->offset_max;
3413    
3414     if (c < 16) save = stacksave; else
3415     {
3416     save = (int *)(pcre_malloc)((c+1) * sizeof(int));
3417     if (save == NULL)
3418     {
3419     save = stacksave;
3420     c = 15;
3421     }
3422     }
3423    
3424     for (i = 1; i <= c; i++)
3425     save[i] = md->offset_vector[md->offset_end - i];
3426     rc = match(eptr, md->start_pattern, offset_top, md, ims, FALSE, eptrb);
3427     for (i = 1; i <= c; i++)
3428     md->offset_vector[md->offset_end - i] = save[i];
3429     if (save != stacksave) (pcre_free)(save);
3430     if (!rc) return FALSE;
3431    
3432     /* In case the recursion has set more capturing values, save the final
3433     number, then move along the subject till after the recursive match,
3434     and advance one byte in the pattern code. */
3435    
3436     offset_top = md->end_offset_top;
3437     eptr = md->end_match_ptr;
3438     ecode++;
3439     }
3440     break;
3441    
3442 nigel 3 /* "Once" brackets are like assertion brackets except that after a match,
3443     the point in the subject string is not moved back. Thus there can never be
3444 nigel 5 a move back into the brackets. Check the alternative branches in turn - the
3445 nigel 3 matching won't pass the KET for this kind of subpattern. If any one branch
3446 nigel 23 matches, we carry on as at the end of a normal bracket, leaving the subject
3447     pointer. */
3448 nigel 3
3449     case OP_ONCE:
3450     {
3451 nigel 23 const uschar *prev = ecode;
3452 nigel 3
3453 nigel 23 do
3454     {
3455     if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) break;
3456     ecode += (ecode[1] << 8) + ecode[2];
3457     }
3458     while (*ecode == OP_ALT);
3459 nigel 3
3460 nigel 23 /* If hit the end of the group (which could be repeated), fail */
3461 nigel 3
3462 nigel 23 if (*ecode != OP_ONCE && *ecode != OP_ALT) return FALSE;
3463    
3464     /* Continue as from after the assertion, updating the offsets high water
3465     mark, since extracts may have been taken. */
3466    
3467     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3468    
3469     offset_top = md->end_offset_top;
3470     eptr = md->end_match_ptr;
3471    
3472     /* For a non-repeating ket, just continue at this level. This also
3473     happens for a repeating ket if no characters were matched in the group.
3474     This is the forcible breaking of infinite loops as implemented in Perl
3475     5.005. If there is an options reset, it will get obeyed in the normal
3476     course of events. */
3477    
3478     if (*ecode == OP_KET || eptr == eptrb)
3479     {
3480     ecode += 3;
3481     break;
3482     }
3483    
3484     /* The repeating kets try the rest of the pattern or restart from the
3485     preceding bracket, in the appropriate order. We need to reset any options
3486     that changed within the bracket before re-running it, so check the next
3487     opcode. */
3488    
3489     if (ecode[3] == OP_OPT)
3490     {
3491     ims = (ims & ~PCRE_IMS) | ecode[4];
3492 nigel 39 DPRINTF(("ims set to %02lx at group repeat\n", ims));
3493 nigel 23 }
3494    
3495     if (*ecode == OP_KETRMIN)
3496     {
3497     if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||
3498     match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;
3499     }
3500     else /* OP_KETRMAX */
3501     {
3502     if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||
3503     match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3504     }
3505     }
3506     return FALSE;
3507    
3508 nigel 3 /* An alternation is the end of a branch; scan along to find the end of the
3509     bracketed group and go to there. */
3510    
3511     case OP_ALT:
3512     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3513     break;
3514    
3515     /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3516     that it may occur zero times. It may repeat infinitely, or not at all -
3517     i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3518     repeat limits are compiled as a number of copies, with the optional ones
3519     preceded by BRAZERO or BRAMINZERO. */
3520    
3521     case OP_BRAZERO:
3522     {
3523 nigel 7 const uschar *next = ecode+1;
3524 nigel 23 if (match(eptr, next, offset_top, md, ims, FALSE, eptr)) return TRUE;
3525 nigel 3 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3526     ecode = next + 3;
3527     }
3528     break;
3529    
3530     case OP_BRAMINZERO:
3531     {
3532 nigel 7 const uschar *next = ecode+1;
3533 nigel 3 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3534 nigel 23 if (match(eptr, next+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3535 nigel 3 ecode++;
3536     }
3537 nigel 23 break;
3538 nigel 3
3539     /* End of a group, repeated or non-repeating. If we are at the end of
3540     an assertion "group", stop matching and return TRUE, but record the
3541 nigel 23 current high water mark for use by positive assertions. Do this also
3542     for the "once" (not-backup up) groups. */
3543 nigel 3
3544     case OP_KET:
3545     case OP_KETRMIN:
3546     case OP_KETRMAX:
3547     {
3548 nigel 7 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3549 nigel 3
3550 nigel 23 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
3551     *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
3552     *prev == OP_ONCE)
3553 nigel 3 {
3554     md->end_match_ptr = eptr; /* For ONCE */
3555     md->end_offset_top = offset_top;
3556     return TRUE;
3557     }
3558    
3559 nigel 23 /* In all other cases except a conditional group we have to check the
3560     group number back at the start and if necessary complete handling an
3561     extraction by setting the offsets and bumping the high water mark. */
3562 nigel 3
3563 nigel 23 if (*prev != OP_COND)
3564     {
3565     int number = *prev - OP_BRA;
3566     int offset = number << 1;
3567 nigel 3
3568 nigel 23 DPRINTF(("end bracket %d\n", number));
3569 nigel 3
3570 nigel 23 if (number > 0)
3571 nigel 3 {
3572 nigel 23 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
3573     {
3574     md->offset_vector[offset] =
3575     md->offset_vector[md->offset_end - number];
3576     md->offset_vector[offset+1] = eptr - md->start_subject;
3577     if (offset_top <= offset) offset_top = offset + 2;
3578     }
3579 nigel 3 }
3580     }
3581    
3582 nigel 23 /* Reset the value of the ims flags, in case they got changed during
3583     the group. */
3584 nigel 3
3585 nigel 23 ims = original_ims;
3586 nigel 39 DPRINTF(("ims reset to %02lx\n", ims));
3587 nigel 23
3588     /* For a non-repeating ket, just continue at this level. This also
3589     happens for a repeating ket if no characters were matched in the group.
3590     This is the forcible breaking of infinite loops as implemented in Perl
3591     5.005. If there is an options reset, it will get obeyed in the normal
3592     course of events. */
3593    
3594     if (*ecode == OP_KET || eptr == eptrb)
3595 nigel 3 {
3596     ecode += 3;
3597     break;
3598     }
3599    
3600     /* The repeating kets try the rest of the pattern or restart from the
3601     preceding bracket, in the appropriate order. */
3602    
3603     if (*ecode == OP_KETRMIN)
3604     {
3605 nigel 23 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||
3606     match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;
3607 nigel 3 }
3608     else /* OP_KETRMAX */
3609     {
3610 nigel 23 if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||
3611     match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3612 nigel 3 }
3613     }
3614     return FALSE;
3615    
3616     /* Start of subject unless notbol, or after internal newline if multiline */
3617    
3618     case OP_CIRC:
3619     if (md->notbol && eptr == md->start_subject) return FALSE;
3620 nigel 23 if ((ims & PCRE_MULTILINE) != 0)
3621 nigel 3 {
3622     if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;
3623     ecode++;
3624     break;
3625     }
3626     /* ... else fall through */
3627    
3628     /* Start of subject assertion */
3629    
3630     case OP_SOD:
3631     if (eptr != md->start_subject) return FALSE;
3632     ecode++;
3633     break;
3634    
3635 nigel 23 /* Assert before internal newline if multiline, or before a terminating
3636     newline unless endonly is set, else end of subject unless noteol is set. */
3637 nigel 3
3638     case OP_DOLL:
3639 nigel 23 if ((ims & PCRE_MULTILINE) != 0)
3640 nigel 3 {
3641 nigel 23 if (eptr < md->end_subject) { if (*eptr != '\n') return FALSE; }
3642     else { if (md->noteol) return FALSE; }
3643 nigel 3 ecode++;
3644     break;
3645     }
3646 nigel 23 else
3647 nigel 3 {
3648 nigel 23 if (md->noteol) return FALSE;
3649     if (!md->endonly)
3650     {
3651     if (eptr < md->end_subject - 1 ||
3652     (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3653    
3654     ecode++;
3655     break;
3656     }
3657 nigel 3 }
3658     /* ... else fall through */
3659    
3660 nigel 23 /* End of subject assertion (\z) */
3661 nigel 3
3662     case OP_EOD:
3663     if (eptr < md->end_subject) return FALSE;
3664     ecode++;
3665     break;
3666    
3667 nigel 23 /* End of subject or ending \n assertion (\Z) */
3668    
3669     case OP_EODN:
3670     if (eptr < md->end_subject - 1 ||
3671     (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3672     ecode++;
3673     break;
3674    
3675 nigel 3 /* Word boundary assertions */
3676    
3677     case OP_NOT_WORD_BOUNDARY:
3678     case OP_WORD_BOUNDARY:
3679     {
3680     BOOL prev_is_word = (eptr != md->start_subject) &&
3681 nigel 25 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
3682 nigel 3 BOOL cur_is_word = (eptr < md->end_subject) &&
3683 nigel 25 ((md->ctypes[*eptr] & ctype_word) != 0);
3684 nigel 3 if ((*ecode++ == OP_WORD_BOUNDARY)?
3685     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3686     return FALSE;
3687     }
3688     break;
3689    
3690     /* Match a single character type; inline for speed */
3691    
3692     case OP_ANY:
3693 nigel 23 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == '\n')
3694     return FALSE;
3695 nigel 3 if (eptr++ >= md->end_subject) return FALSE;
3696     ecode++;
3697     break;
3698    
3699     case OP_NOT_DIGIT:
3700 nigel 25 if (eptr >= md->end_subject ||
3701     (md->ctypes[*eptr++] & ctype_digit) != 0)
3702 nigel 3 return FALSE;
3703     ecode++;
3704     break;
3705    
3706     case OP_DIGIT:
3707 nigel 25 if (eptr >= md->end_subject ||
3708     (md->ctypes[*eptr++] & ctype_digit) == 0)
3709 nigel 3 return FALSE;
3710     ecode++;
3711     break;
3712    
3713     case OP_NOT_WHITESPACE:
3714 nigel 25 if (eptr >= md->end_subject ||
3715     (md->ctypes[*eptr++] & ctype_space) != 0)
3716 nigel 3 return FALSE;
3717     ecode++;
3718     break;
3719    
3720     case OP_WHITESPACE:
3721 nigel 25 if (eptr >= md->end_subject ||
3722     (md->ctypes[*eptr++] & ctype_space) == 0)
3723 nigel 3 return FALSE;
3724     ecode++;
3725     break;
3726    
3727     case OP_NOT_WORDCHAR:
3728 nigel 25 if (eptr >= md->end_subject ||
3729     (md->ctypes[*eptr++] & ctype_word) != 0)
3730 nigel 3 return FALSE;
3731     ecode++;
3732     break;
3733    
3734     case OP_WORDCHAR:
3735 nigel 25 if (eptr >= md->end_subject ||
3736     (md->ctypes[*eptr++] & ctype_word) == 0)
3737 nigel 3 return FALSE;
3738     ecode++;
3739     break;
3740    
3741     /* Match a back reference, possibly repeatedly. Look past the end of the
3742     item to see if there is repeat information following. The code is similar
3743     to that for character classes, but repeated for efficiency. Then obey
3744     similar code to character type repeats - written out again for speed.
3745     However, if the referenced string is the empty string, always treat
3746     it as matched, any number of times (otherwise there could be infinite
3747     loops). */
3748    
3749     case OP_REF:
3750     {
3751     int length;
3752 nigel 23 int offset = ecode[1] << 1; /* Doubled reference number */
3753 nigel 3 ecode += 2; /* Advance past the item */
3754    
3755 nigel 23 /* If the reference is unset, set the length to be longer than the amount
3756     of subject left; this ensures that every attempt at a match fails. We
3757     can't just fail here, because of the possibility of quantifiers with zero
3758     minima. */
3759 nigel 3
3760 nigel 23 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
3761     md->end_subject - eptr + 1 :
3762     md->offset_vector[offset+1] - md->offset_vector[offset];
3763 nigel 3
3764 nigel 23 /* Set up for repetition, or handle the non-repeated case */
3765    
3766 nigel 3 switch (*ecode)
3767     {
3768     case OP_CRSTAR:
3769     case OP_CRMINSTAR:
3770     case OP_CRPLUS:
3771     case OP_CRMINPLUS:
3772     case OP_CRQUERY:
3773     case OP_CRMINQUERY:
3774     c = *ecode++ - OP_CRSTAR;
3775     minimize = (c & 1) != 0;
3776     min = rep_min[c]; /* Pick up values from tables; */
3777     max = rep_max[c]; /* zero for max => infinity */
3778     if (max == 0) max = INT_MAX;
3779     break;
3780    
3781     case OP_CRRANGE:
3782     case OP_CRMINRANGE:
3783     minimize = (*ecode == OP_CRMINRANGE);
3784     min = (ecode[1] << 8) + ecode[2];
3785     max = (ecode[3] << 8) + ecode[4];
3786     if (max == 0) max = INT_MAX;
3787     ecode += 5;
3788     break;
3789    
3790     default: /* No repeat follows */
3791 nigel 23 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
3792 nigel 3 eptr += length;
3793     continue; /* With the main loop */
3794     }
3795    
3796     /* If the length of the reference is zero, just continue with the
3797     main loop. */
3798    
3799     if (length == 0) continue;
3800    
3801     /* First, ensure the minimum number of matches are present. We get back
3802     the length of the reference string explicitly rather than passing the
3803     address of eptr, so that eptr can be a register variable. */
3804    
3805     for (i = 1; i <= min; i++)
3806     {
3807 nigel 23 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
3808 nigel 3 eptr += length;
3809     }
3810    
3811     /* If min = max, continue at the same level without recursion.
3812     They are not both allowed to be zero. */
3813    
3814     if (min == max) continue;
3815    
3816     /* If minimizing, keep trying and advancing the pointer */
3817    
3818     if (minimize)
3819     {
3820     for (i = min;; i++)
3821     {
3822 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3823     return TRUE;
3824     if (i >= max || !match_ref(offset, eptr, length, md, ims))
3825 nigel 3 return FALSE;
3826     eptr += length;
3827     }
3828     /* Control never gets here */
3829     }
3830    
3831     /* If maximizing, find the longest string and work backwards */
3832    
3833     else
3834     {
3835 nigel 7 const uschar *pp = eptr;
3836 nigel 3 for (i = min; i < max; i++)
3837     {
3838 nigel 23 if (!match_ref(offset, eptr, length, md, ims)) break;
3839 nigel 3 eptr += length;
3840     }
3841     while (eptr >= pp)
3842     {
3843 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3844     return TRUE;
3845 nigel 3 eptr -= length;
3846     }
3847     return FALSE;
3848     }
3849     }
3850     /* Control never gets here */
3851    
3852 nigel 23
3853    
3854 nigel 3 /* Match a character class, possibly repeatedly. Look past the end of the
3855     item to see if there is repeat information following. Then obey similar
3856 nigel 23 code to character type repeats - written out again for speed. */
3857 nigel 3
3858     case OP_CLASS:
3859     {
3860 nigel 7 const uschar *data = ecode + 1; /* Save for matching */
3861     ecode += 33; /* Advance past the item */
3862 nigel 3
3863     switch (*ecode)
3864     {
3865     case OP_CRSTAR:
3866     case OP_CRMINSTAR:
3867     case OP_CRPLUS:
3868     case OP_CRMINPLUS:
3869     case OP_CRQUERY:
3870     case OP_CRMINQUERY:
3871     c = *ecode++ - OP_CRSTAR;
3872     minimize = (c & 1) != 0;
3873     min = rep_min[c]; /* Pick up values from tables; */
3874     max = rep_max[c]; /* zero for max => infinity */
3875     if (max == 0) max = INT_MAX;
3876     break;
3877    
3878     case OP_CRRANGE:
3879     case OP_CRMINRANGE:
3880     minimize = (*ecode == OP_CRMINRANGE);
3881     min = (ecode[1] << 8) + ecode[2];
3882     max = (ecode[3] << 8) + ecode[4];
3883     if (max == 0) max = INT_MAX;
3884     ecode += 5;
3885     break;
3886    
3887     default: /* No repeat follows */
3888 nigel 13 min = max = 1;
3889     break;
3890 nigel 3 }
3891    
3892     /* First, ensure the minimum number of matches are present. */
3893    
3894     for (i = 1; i <= min; i++)
3895     {
3896     if (eptr >= md->end_subject) return FALSE;
3897     c = *eptr++;
3898 nigel 23 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3899 nigel 3 return FALSE;
3900     }
3901    
3902     /* If max == min we can continue with the main loop without the
3903     need to recurse. */
3904    
3905     if (min == max) continue;
3906    
3907     /* If minimizing, keep testing the rest of the expression and advancing
3908     the pointer while it matches the class. */
3909    
3910     if (minimize)
3911     {
3912     for (i = min;; i++)
3913     {
3914 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3915     return TRUE;
3916 nigel 3 if (i >= max || eptr >= md->end_subject) return FALSE;
3917     c = *eptr++;
3918 nigel 23 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3919 nigel 3 return FALSE;
3920     }
3921     /* Control never gets here */
3922     }
3923    
3924     /* If maximizing, find the longest possible run, then work backwards. */
3925    
3926     else
3927     {
3928 nigel 7 const uschar *pp = eptr;
3929 nigel 3 for (i = min; i < max; eptr++, i++)
3930     {
3931     if (eptr >= md->end_subject) break;
3932     c = *eptr;
3933 nigel 23 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3934 nigel 3 break;
3935     }
3936    
3937     while (eptr >= pp)
3938 nigel 23 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3939     return TRUE;
3940 nigel 3 return FALSE;
3941     }
3942     }
3943     /* Control never gets here */
3944    
3945     /* Match a run of characters */
3946    
3947     case OP_CHARS:
3948     {
3949     register int length = ecode[1];
3950     ecode += 2;
3951    
3952 nigel 9 #ifdef DEBUG /* Sigh. Some compilers never learn. */
3953 nigel 3 if (eptr >= md->end_subject)
3954     printf("matching subject <null> against pattern ");
3955     else
3956     {
3957     printf("matching subject ");
3958     pchars(eptr, length, TRUE, md);
3959     printf(" against pattern ");
3960     }
3961     pchars(ecode, length, FALSE, md);
3962     printf("\n");
3963 nigel 9 #endif
3964 nigel 3
3965     if (length > md->end_subject - eptr) return FALSE;
3966 nigel 23 if ((ims & PCRE_CASELESS) != 0)
3967 nigel 3 {
3968 nigel 25 while (length-- > 0)
3969     if (md->lcc[*ecode++] != md->lcc[*eptr++])
3970     return FALSE;
3971 nigel 3 }
3972     else
3973     {
3974     while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
3975     }
3976     }
3977     break;
3978    
3979     /* Match a single character repeatedly; different opcodes share code. */
3980    
3981     case OP_EXACT:
3982     min = max = (ecode[1] << 8) + ecode[2];
3983     ecode += 3;
3984     goto REPEATCHAR;
3985    
3986     case OP_UPTO:
3987     case OP_MINUPTO:
3988     min = 0;
3989     max = (ecode[1] << 8) + ecode[2];
3990     minimize = *ecode == OP_MINUPTO;
3991     ecode += 3;
3992     goto REPEATCHAR;
3993    
3994     case OP_STAR:
3995     case OP_MINSTAR:
3996     case OP_PLUS:
3997     case OP_MINPLUS:
3998     case OP_QUERY:
3999     case OP_MINQUERY:
4000     c = *ecode++ - OP_STAR;
4001     minimize = (c & 1) != 0;
4002     min = rep_min[c]; /* Pick up values from tables; */
4003     max = rep_max[c]; /* zero for max => infinity */
4004     if (max == 0) max = INT_MAX;
4005    
4006     /* Common code for all repeated single-character matches. We can give
4007     up quickly if there are fewer than the minimum number of characters left in
4008     the subject. */
4009    
4010     REPEATCHAR:
4011     if (min > md->end_subject - eptr) return FALSE;
4012     c = *ecode++;
4013    
4014     /* The code is duplicated for the caseless and caseful cases, for speed,
4015     since matching characters is likely to be quite common. First, ensure the
4016     minimum number of matches are present. If min = max, continue at the same
4017     level without recursing. Otherwise, if minimizing, keep trying the rest of
4018     the expression and advancing one matching character if failing, up to the
4019     maximum. Alternatively, if maximizing, find the maximum number of
4020     characters and work backwards. */
4021    
4022 nigel 9 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
4023     max, eptr));
4024 nigel 3
4025 nigel 23 if ((ims & PCRE_CASELESS) != 0)
4026 nigel 3 {
4027 nigel 25 c = md->lcc[c];
4028     for (i = 1; i <= min; i++)
4029     if (c != md->lcc[*eptr++]) return FALSE;
4030 nigel 3 if (min == max) continue;
4031     if (minimize)
4032     {
4033     for (i = min;; i++)
4034     {
4035 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
4036     return TRUE;
4037 nigel 25 if (i >= max || eptr >= md->end_subject ||
4038     c != md->lcc[*eptr++])
4039 nigel 3 return FALSE;
4040     }
4041     /* Control never gets here */
4042     }
4043     else
4044     {
4045 nigel 7 const uschar *pp = eptr;
4046 nigel 3 for (i = min; i < max; i++)
4047     {
4048 nigel 25 if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
4049 nigel 3 eptr++;
4050     }
4051     while (eptr >= pp)
4052 nigel 23 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
4053     return TRUE;
4054 nigel 3 return FALSE;
4055     }
4056     /* Control never gets here */
4057     }
4058    
4059     /* Caseful comparisons */
4060    
4061     else
4062     {
4063     for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
4064     if (min == max) continue;
4065     if (minimize)
4066     {
4067     for (i = min;; i++)
4068     {
4069 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
4070     return TRUE;
4071 nigel 3 if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
4072     }
4073     /* Control never gets here */
4074     }
4075     else
4076     {
4077 nigel 7 const uschar *pp = eptr;
4078 nigel 3 for (i = min; i < max; i++)
4079     {
4080     if (eptr >= md->end_subject || c != *eptr) break;
4081     eptr++;
4082     }
4083     while (eptr >= pp)
4084 nigel 23 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
4085     return TRUE;
4086 nigel 3 return FALSE;
4087     }
4088     }
4089     /* Control never gets here */
4090    
4091     /* Match a negated single character */
4092    
4093     case OP_NOT:
4094 nigel 9 if (eptr >= md->end_subject) return FALSE;
4095 nigel 3 ecode++;
4096 nigel 23 if ((ims & PCRE_CASELESS) != 0)
4097 nigel 3 {
4098 nigel 25 if (md->lcc[*ecode++] == md->lcc[*eptr++]) return FALSE;
4099 nigel 3 }
4100     else
4101     {
4102     if (*ecode++ == *eptr++) return FALSE;
4103     }
4104     break;
4105    
4106     /* Match a negated single character repeatedly. This is almost a repeat of
4107     the code for a repeated single character, but I haven't found a nice way of
4108     commoning these up that doesn't require a test of the positive/negative
4109     option for each character match. Maybe that wouldn't add very much to the
4110     time taken, but character matching *is* what this is all about... */
4111    
4112     case OP_NOTEXACT:
4113     min = max = (ecode[1] << 8) + ecode[2];
4114     ecode += 3;
4115     goto REPEATNOTCHAR;
4116    
4117     case OP_NOTUPTO:
4118     case OP_NOTMINUPTO:
4119     min = 0;
4120     max = (ecode[1] << 8) + ecode[2];
4121     minimize = *ecode == OP_NOTMINUPTO;
4122     ecode += 3;
4123     goto REPEATNOTCHAR;
4124    
4125     case OP_NOTSTAR:
4126     case OP_NOTMINSTAR:
4127     case OP_NOTPLUS:
4128     case OP_NOTMINPLUS:
4129     case OP_NOTQUERY:
4130     case OP_NOTMINQUERY:
4131     c = *ecode++ - OP_NOTSTAR;
4132     minimize = (c & 1) != 0;
4133     min = rep_min[c]; /* Pick up values from tables; */
4134     max = rep_max[c]; /* zero for max => infinity */
4135     if (max == 0) max = INT_MAX;
4136    
4137     /* Common code for all repeated single-character matches. We can give
4138     up quickly if there are fewer than the minimum number of characters left in
4139     the subject. */
4140    
4141     REPEATNOTCHAR:
4142     if (min > md->end_subject - eptr) return FALSE;
4143     c = *ecode++;
4144    
4145     /* The code is duplicated for the caseless and caseful cases, for speed,
4146     since matching characters is likely to be quite common. First, ensure the
4147     minimum number of matches are present. If min = max, continue at the same
4148     level without recursing. Otherwise, if minimizing, keep trying the rest of
4149     the expression and advancing one matching character if failing, up to the
4150     maximum. Alternatively, if maximizing, find the maximum number of
4151     characters and work backwards. */
4152    
4153 nigel 9 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
4154     max, eptr));
4155 nigel 3
4156 nigel 23 if ((ims & PCRE_CASELESS) != 0)
4157 nigel 3 {
4158 nigel 25 c = md->lcc[c];
4159     for (i = 1; i <= min; i++)
4160     if (c == md->lcc[*eptr++]) return FALSE;
4161 nigel 3 if (min == max) continue;
4162     if (minimize)
4163     {
4164     for (i = min;; i++)
4165     {
4166 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
4167     return TRUE;
4168 nigel 25 if (i >= max || eptr >= md->end_subject ||
4169     c == md->lcc[*eptr++])
4170 nigel 3 return FALSE;
4171     }
4172     /* Control never gets here */
4173     }
4174     else
4175     {
4176 nigel 7 const uschar *pp = eptr;
4177 nigel 3 for (i = min; i < max; i++)
4178     {
4179 nigel 25 if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
4180 nigel 3 eptr++;
4181     }
4182     while (eptr >= pp)
4183 nigel 23 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
4184     return TRUE;
4185 nigel 3 return FALSE;
4186     }
4187     /* Control never gets here */
4188     }
4189    
4190     /* Caseful comparisons */
4191    
4192