/[pcre]/code/tags/pcre-4.0/pcre.c
ViewVC logotype

Contents of /code/tags/pcre-4.0/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 39 - (hide annotations) (download)
Sat Feb 24 21:39:13 2007 UTC (6 years, 2 months ago) by nigel
Original Path: code/trunk/pcre.c
File MIME type: text/plain
File size: 131999 byte(s)
Load pcre-2.08 into code/trunk.

1 nigel 3 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /*
6     This is a library of functions to support regular expressions whose syntax
7     and semantics are as close as possible to those of the Perl 5 language. See
8     the file Tech.Notes for some information on the internals.
9    
10     Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12 nigel 27 Copyright (c) 1997-1999 University of Cambridge
13 nigel 3
14     -----------------------------------------------------------------------------
15     Permission is granted to anyone to use this software for any purpose on any
16     computer system, and to redistribute it freely, subject to the following
17     restrictions:
18    
19     1. This software is distributed in the hope that it will be useful,
20     but WITHOUT ANY WARRANTY; without even the implied warranty of
21     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22    
23     2. The origin of this software must not be misrepresented, either by
24     explicit claim or by omission.
25    
26     3. Altered versions must be plainly marked as such, and must not be
27     misrepresented as being the original software.
28 nigel 29
29     4. If PCRE is embedded in any software that is released under the GNU
30     General Purpose Licence (GPL), then the terms of that licence shall
31     supersede any condition above with which it is incompatible.
32 nigel 3 -----------------------------------------------------------------------------
33     */
34    
35    
36     /* Define DEBUG to get debugging output on stdout. */
37    
38     /* #define DEBUG */
39    
40 nigel 23 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
41     inline, and there are *still* stupid compilers about that don't like indented
42     pre-processor statements. I suppose it's only been 10 years... */
43 nigel 3
44 nigel 9 #ifdef DEBUG
45     #define DPRINTF(p) printf p
46     #else
47     #define DPRINTF(p) /*nothing*/
48     #endif
49    
50 nigel 3 /* Include the internals header, which itself includes Standard C headers plus
51     the external pcre header. */
52    
53     #include "internal.h"
54    
55    
56 nigel 15 /* Allow compilation as C++ source code, should anybody want to do that. */
57    
58     #ifdef __cplusplus
59     #define class pcre_class
60     #endif
61    
62    
63 nigel 23 /* Number of items on the nested bracket stacks at compile time. This should
64     not be set greater than 200. */
65    
66     #define BRASTACK_SIZE 200
67    
68    
69 nigel 3 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
70    
71 nigel 15 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
72     static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
73 nigel 3
74 nigel 13 /* Text forms of OP_ values and things, for debugging (not all used) */
75 nigel 3
76     #ifdef DEBUG
77 nigel 7 static const char *OP_names[] = {
78     "End", "\\A", "\\B", "\\b", "\\D", "\\d",
79 nigel 23 "\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",
80     "Opt", "^", "$", "Any", "chars", "not",
81 nigel 3 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
82     "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
83     "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
84     "*", "*?", "+", "+?", "?", "??", "{", "{",
85 nigel 23 "class", "Ref",
86     "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
87     "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
88 nigel 3 "Brazero", "Braminzero", "Bra"
89     };
90     #endif
91    
92     /* Table for handling escaped characters in the range '0'-'z'. Positive returns
93     are simple data values; negative values are for special things like \d and so
94     on. Zero means further processing is needed (for things like \x), or the escape
95     is invalid. */
96    
97 nigel 15 static const short int escapes[] = {
98 nigel 3 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
99     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
100     '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
101     0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
102     0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
103     0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
104     '`', 7, -ESC_b, 0, -ESC_d, 27, '\f', 0, /* ` - g */
105     0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
106     0, 0, '\r', -ESC_s, '\t', 0, 0, -ESC_w, /* p - w */
107 nigel 23 0, 0, -ESC_z /* x - z */
108 nigel 3 };
109    
110     /* Definition to allow mutual recursion */
111    
112 nigel 13 static BOOL
113 nigel 23 compile_regex(int, int, int *, uschar **, const uschar **, const char **,
114 nigel 37 BOOL, int, int *, int *, compile_data *);
115 nigel 3
116    
117    
118     /*************************************************
119     * Global variables *
120     *************************************************/
121    
122     /* PCRE is thread-clean and doesn't use any global variables in the normal
123     sense. However, it calls memory allocation and free functions via the two
124     indirections below, which can be changed by the caller, but are shared
125     between all threads. */
126    
127     void *(*pcre_malloc)(size_t) = malloc;
128     void (*pcre_free)(void *) = free;
129    
130    
131    
132    
133     /*************************************************
134 nigel 25 * Default character tables *
135     *************************************************/
136    
137     /* A default set of character tables is included in the PCRE binary. Its source
138     is built by the maketables auxiliary program, which uses the default C ctypes
139     functions, and put in the file chartables.c. These tables are used by PCRE
140     whenever the caller of pcre_compile() does not provide an alternate set of
141     tables. */
142    
143     #include "chartables.c"
144    
145    
146    
147     /*************************************************
148 nigel 3 * Return version string *
149     *************************************************/
150    
/* Two-level stringification: XSTRING() lets its argument be macro-expanded
before STRING() turns the result into a string literal. */

#define STRING(a)  #a
#define XSTRING(s) STRING(s)

/* Return the version string. It is assembled at compile time from the
PCRE_MAJOR, PCRE_MINOR and PCRE_DATE macros, in the form "major.minor date",
and lives in static storage, so the caller must not free or modify it. */

const char *
pcre_version(void)
{
static const char version_text[] =
  XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);

return version_text;
}
159    
160    
161    
162    
163     /*************************************************
164     * Return info about a compiled pattern *
165     *************************************************/
166    
167     /* This function picks potentially useful data out of the private
168 nigel 37 structure. The public options are passed back in an int - though the
169     re->options field has been expanded to a long int, all the public options
170     at the low end of it, and so even on 16-bit systems this will still be OK.
171     Therefore, I haven't changed the API for pcre_info().
172 nigel 3
173     Arguments:
174     external_re points to compiled code
175     optptr where to pass back the options
176     first_char where to pass back the first character,
177     or -1 if multiline and all branches start ^,
178     or -2 otherwise
179    
180     Returns: number of identifying extraction brackets
181     or negative values on error
182     */
183    
184     int
185     pcre_info(const pcre *external_re, int *optptr, int *first_char)
186     {
187 nigel 7 const real_pcre *re = (const real_pcre *)external_re;
188 nigel 3 if (re == NULL) return PCRE_ERROR_NULL;
189     if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
190 nigel 37 if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
191 nigel 3 if (first_char != NULL)
192     *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
193     ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
194     return re->top_bracket;
195     }
196    
197    
198    
199    
200     #ifdef DEBUG
201     /*************************************************
202     * Debugging function to print chars *
203     *************************************************/
204    
205     /* Print a sequence of chars in printable format, stopping at the end of the
206     subject if the requested.
207    
208     Arguments:
209     p points to characters
210     length number to print
211     is_subject TRUE if printing from within md->start_subject
212     md pointer to matching data block, if is_subject is TRUE
213    
214     Returns: nothing
215     */
216    
217 nigel 9 static void
218     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
219 nigel 3 {
220     int c;
221     if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
222     while (length-- > 0)
223     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
224     }
225     #endif
226    
227    
228    
229    
230     /*************************************************
231     * Handle escapes *
232     *************************************************/
233    
234     /* This function is called when a \ has been encountered. It either returns a
235     positive value for a simple escape such as \n, or a negative value which
236     encodes one of the more complicated things such as \d. On entry, ptr is
237     pointing at the \. On exit, it is on the final character of the escape
238     sequence.
239    
240     Arguments:
241     ptrptr points to the pattern position pointer
242     errorptr points to the pointer to the error message
243     bracount number of previous extracting brackets
244     options the options bits
245     isclass TRUE if inside a character class
246 nigel 25 cd pointer to char tables block
247 nigel 3
248     Returns: zero or positive => a data character
249     negative => a special escape sequence
250     on error, errorptr is set
251     */
252    
253     static int
254 nigel 7 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
255 nigel 25 int options, BOOL isclass, compile_data *cd)
256 nigel 3 {
257 nigel 7 const uschar *ptr = *ptrptr;
258 nigel 3 int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
259     int i;
260    
261     if (c == 0) *errorptr = ERR1;
262    
263     /* Digits or letters may have special meaning; all others are literals. */
264    
265     else if (c < '0' || c > 'z') {}
266    
267     /* Do an initial lookup in a table. A non-zero result is something that can be
268     returned immediately. Otherwise further processing may be required. */
269    
270     else if ((i = escapes[c - '0']) != 0) c = i;
271    
272     /* Escapes that need further processing, or are illegal. */
273    
274     else
275     {
276 nigel 7 const uschar *oldptr;
277 nigel 3 switch (c)
278     {
279     /* The handling of escape sequences consisting of a string of digits
280     starting with one that is not zero is not straightforward. By experiment,
281     the way Perl works seems to be as follows:
282    
283     Outside a character class, the digits are read as a decimal number. If the
284     number is less than 10, or if there are that many previous extracting
285     left brackets, then it is a back reference. Otherwise, up to three octal
286     digits are read to form an escaped byte. Thus \123 is likely to be octal
287     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
288     value is greater than 377, the least significant 8 bits are taken. Inside a
289     character class, \ followed by a digit is always an octal number. */
290    
291     case '1': case '2': case '3': case '4': case '5':
292     case '6': case '7': case '8': case '9':
293    
294     if (!isclass)
295     {
296     oldptr = ptr;
297     c -= '0';
298 nigel 25 while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
299 nigel 3 c = c * 10 + *(++ptr) - '0';
300     if (c < 10 || c <= bracount)
301     {
302     c = -(ESC_REF + c);
303     break;
304     }
305     ptr = oldptr; /* Put the pointer back and fall through */
306     }
307    
308     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
309     generates a binary zero byte and treats the digit as a following literal.
310     Thus we have to pull back the pointer by one. */
311    
312     if ((c = *ptr) >= '8')
313     {
314     ptr--;
315     c = 0;
316     break;
317     }
318    
319     /* \0 always starts an octal number, but we may drop through to here with a
320     larger first octal digit */
321    
322     case '0':
323     c -= '0';
324 nigel 25 while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
325 nigel 3 ptr[1] != '8' && ptr[1] != '9')
326     c = c * 8 + *(++ptr) - '0';
327     break;
328    
329     /* Special escapes not starting with a digit are straightforward */
330    
331     case 'x':
332     c = 0;
333 nigel 25 while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
334 nigel 3 {
335     ptr++;
336 nigel 25 c = c * 16 + cd->lcc[*ptr] -
337     (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
338 nigel 3 }
339     break;
340    
341     case 'c':
342     c = *(++ptr);
343     if (c == 0)
344     {
345     *errorptr = ERR2;
346     return 0;
347     }
348    
349     /* A letter is upper-cased; then the 0x40 bit is flipped */
350    
351 nigel 25 if (c >= 'a' && c <= 'z') c = cd->fcc[c];
352 nigel 3 c ^= 0x40;
353     break;
354    
355     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
356     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
357 nigel 25 for Perl compatibility, it is a literal. This code looks a bit odd, but
358     there used to be some cases other than the default, and there may be again
359     in future, so I haven't "optimized" it. */
360 nigel 3
361     default:
362     if ((options & PCRE_EXTRA) != 0) switch(c)
363     {
364     default:
365     *errorptr = ERR3;
366     break;
367     }
368     break;
369     }
370     }
371    
372     *ptrptr = ptr;
373     return c;
374     }
375    
376    
377    
378     /*************************************************
379     * Check for counted repeat *
380     *************************************************/
381    
382     /* This function is called when a '{' is encountered in a place where it might
383     start a quantifier. It looks ahead to see if it really is a quantifier or not.
384     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
385     where the ddds are digits.
386    
387     Arguments:
388     p pointer to the first char after '{'
389 nigel 25 cd pointer to char tables block
390 nigel 3
391     Returns: TRUE or FALSE
392     */
393    
394     static BOOL
395 nigel 25 is_counted_repeat(const uschar *p, compile_data *cd)
396 nigel 3 {
397 nigel 25 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
398     while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
399 nigel 3 if (*p == '}') return TRUE;
400    
401     if (*p++ != ',') return FALSE;
402     if (*p == '}') return TRUE;
403    
404 nigel 25 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
405     while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
406 nigel 3 return (*p == '}');
407     }
408    
409    
410    
411     /*************************************************
412     * Read repeat counts *
413     *************************************************/
414    
415     /* Read an item of the form {n,m} and return the values. This is called only
416     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
417     so the syntax is guaranteed to be correct, but we need to check the values.
418    
419     Arguments:
420     p pointer to first char after '{'
421     minp pointer to int for min
422     maxp pointer to int for max
423     returned as -1 if no max
424     errorptr points to pointer to error message
425 nigel 25 cd pointer to character tables clock
426 nigel 3
427     Returns: pointer to '}' on success;
428     current ptr on error, with errorptr set
429     */
430    
431 nigel 7 static const uschar *
432 nigel 25 read_repeat_counts(const uschar *p, int *minp, int *maxp,
433     const char **errorptr, compile_data *cd)
434 nigel 3 {
435     int min = 0;
436     int max = -1;
437    
438 nigel 25 while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
439 nigel 3
440     if (*p == '}') max = min; else
441     {
442     if (*(++p) != '}')
443     {
444     max = 0;
445 nigel 25 while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
446 nigel 3 if (max < min)
447     {
448     *errorptr = ERR4;
449     return p;
450     }
451     }
452     }
453    
454     /* Do paranoid checks, then fill in the required variables, and pass back the
455     pointer to the terminating '}'. */
456    
457     if (min > 65535 || max > 65535)
458     *errorptr = ERR5;
459     else
460     {
461     *minp = min;
462     *maxp = max;
463     }
464     return p;
465     }
466    
467    
468    
469     /*************************************************
470 nigel 23 * Find the fixed length of a pattern *
471     *************************************************/
472    
473     /* Scan a pattern and compute the fixed length of subject that will match it,
474     if the length is fixed. This is needed for dealing with backward assertions.
475    
476     Arguments:
477     code points to the start of the pattern (the bracket)
478    
479     Returns: the fixed length, or -1 if there is no fixed length
480     */
481    
482     static int
483     find_fixedlength(uschar *code)
484     {
485     int length = -1;
486    
487     register int branchlength = 0;
488     register uschar *cc = code + 3;
489    
490     /* Scan along the opcodes for this branch. If we get to the end of the
491     branch, check the length against that of the other branches. */
492    
493     for (;;)
494     {
495     int d;
496     register int op = *cc;
497     if (op >= OP_BRA) op = OP_BRA;
498    
499     switch (op)
500     {
501     case OP_BRA:
502     case OP_ONCE:
503     case OP_COND:
504     d = find_fixedlength(cc);
505     if (d < 0) return -1;
506     branchlength += d;
507     do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
508     cc += 3;
509     break;
510    
511     /* Reached end of a branch; if it's a ket it is the end of a nested
512     call. If it's ALT it is an alternation in a nested call. If it is
513     END it's the end of the outer call. All can be handled by the same code. */
514    
515     case OP_ALT:
516     case OP_KET:
517     case OP_KETRMAX:
518     case OP_KETRMIN:
519     case OP_END:
520     if (length < 0) length = branchlength;
521     else if (length != branchlength) return -1;
522     if (*cc != OP_ALT) return length;
523     cc += 3;
524     branchlength = 0;
525     break;
526    
527     /* Skip over assertive subpatterns */
528    
529     case OP_ASSERT:
530     case OP_ASSERT_NOT:
531     case OP_ASSERTBACK:
532     case OP_ASSERTBACK_NOT:
533     do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
534     cc += 3;
535     break;
536    
537     /* Skip over things that don't match chars */
538    
539     case OP_REVERSE:
540     cc++;
541 nigel 37 /* Fall through */
542 nigel 23
543     case OP_CREF:
544     case OP_OPT:
545     cc++;
546     /* Fall through */
547    
548     case OP_SOD:
549     case OP_EOD:
550     case OP_EODN:
551     case OP_CIRC:
552     case OP_DOLL:
553     case OP_NOT_WORD_BOUNDARY:
554     case OP_WORD_BOUNDARY:
555     cc++;
556     break;
557    
558     /* Handle char strings */
559    
560     case OP_CHARS:
561     branchlength += *(++cc);
562     cc += *cc + 1;
563     break;
564    
565     /* Handle exact repetitions */
566    
567     case OP_EXACT:
568     case OP_TYPEEXACT:
569     branchlength += (cc[1] << 8) + cc[2];
570     cc += 4;
571     break;
572    
573     /* Handle single-char matchers */
574    
575     case OP_NOT_DIGIT:
576     case OP_DIGIT:
577     case OP_NOT_WHITESPACE:
578     case OP_WHITESPACE:
579     case OP_NOT_WORDCHAR:
580     case OP_WORDCHAR:
581     case OP_ANY:
582     branchlength++;
583     cc++;
584     break;
585    
586    
587     /* Check a class for variable quantification */
588    
589     case OP_CLASS:
590     cc += (*cc == OP_REF)? 2 : 33;
591    
592     switch (*cc)
593     {
594     case OP_CRSTAR:
595     case OP_CRMINSTAR:
596     case OP_CRQUERY:
597     case OP_CRMINQUERY:
598     return -1;
599    
600     case OP_CRRANGE:
601     case OP_CRMINRANGE:
602     if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;
603     branchlength += (cc[1] << 8) + cc[2];
604     cc += 5;
605     break;
606    
607     default:
608     branchlength++;
609     }
610     break;
611    
612     /* Anything else is variable length */
613    
614     default:
615     return -1;
616     }
617     }
618     /* Control never gets here */
619     }
620    
621    
622    
623    
624     /*************************************************
625 nigel 3 * Compile one branch *
626     *************************************************/
627    
628     /* Scan the pattern, compiling it into the code vector.
629    
630     Arguments:
631 nigel 25 options the option bits
632     brackets points to number of brackets used
633     code points to the pointer to the current code point
634     ptrptr points to the current pattern pointer
635     errorptr points to pointer to error message
636     optchanged set to the value of the last OP_OPT item compiled
637 nigel 37 reqchar set to the last literal character required, else -1
638     countlits set to count of mandatory literal characters
639 nigel 25 cd contains pointers to tables
640 nigel 3
641 nigel 25 Returns: TRUE on success
642     FALSE, with *errorptr set on error
643 nigel 3 */
644    
645     static BOOL
646 nigel 7 compile_branch(int options, int *brackets, uschar **codeptr,
647 nigel 25 const uschar **ptrptr, const char **errorptr, int *optchanged,
648 nigel 37 int *reqchar, int *countlits, compile_data *cd)
649 nigel 3 {
650     int repeat_type, op_type;
651     int repeat_min, repeat_max;
652     int bravalue, length;
653 nigel 19 int greedy_default, greedy_non_default;
654 nigel 37 int prevreqchar;
655     int condcount = 0;
656     int subcountlits = 0;
657 nigel 3 register int c;
658     register uschar *code = *codeptr;
659 nigel 23 uschar *tempcode;
660 nigel 7 const uschar *ptr = *ptrptr;
661 nigel 23 const uschar *tempptr;
662 nigel 3 uschar *previous = NULL;
663     uschar class[32];
664    
665 nigel 19 /* Set up the default and non-default settings for greediness */
666    
667     greedy_default = ((options & PCRE_UNGREEDY) != 0);
668     greedy_non_default = greedy_default ^ 1;
669    
670 nigel 37 /* Initialize no required char, and count of literals */
671    
672     *reqchar = prevreqchar = -1;
673     *countlits = 0;
674    
675 nigel 3 /* Switch on next character until the end of the branch */
676    
677     for (;; ptr++)
678     {
679     BOOL negate_class;
680 nigel 23 int class_charcount;
681     int class_lastchar;
682     int newoptions;
683     int condref;
684 nigel 37 int subreqchar;
685 nigel 3
686     c = *ptr;
687     if ((options & PCRE_EXTENDED) != 0)
688     {
689 nigel 25 if ((cd->ctypes[c] & ctype_space) != 0) continue;
690 nigel 3 if (c == '#')
691     {
692     while ((c = *(++ptr)) != 0 && c != '\n');
693     continue;
694     }
695     }
696    
697     switch(c)
698     {
699     /* The branch terminates at end of string, |, or ). */
700    
701     case 0:
702     case '|':
703     case ')':
704     *codeptr = code;
705     *ptrptr = ptr;
706     return TRUE;
707    
708     /* Handle single-character metacharacters */
709    
710     case '^':
711     previous = NULL;
712     *code++ = OP_CIRC;
713     break;
714    
715     case '$':
716     previous = NULL;
717     *code++ = OP_DOLL;
718     break;
719    
720     case '.':
721     previous = code;
722     *code++ = OP_ANY;
723     break;
724    
725     /* Character classes. These always build a 32-byte bitmap of the permitted
726     characters, except in the special case where there is only one character.
727     For negated classes, we build the map as usual, then invert it at the end.
728     */
729    
730     case '[':
731     previous = code;
732 nigel 23 *code++ = OP_CLASS;
733 nigel 3
734 nigel 23 /* If the first character is '^', set the negation flag and skip it. */
735 nigel 3
736     if ((c = *(++ptr)) == '^')
737     {
738     negate_class = TRUE;
739     c = *(++ptr);
740     }
741 nigel 23 else negate_class = FALSE;
742 nigel 3
743     /* Keep a count of chars so that we can optimize the case of just a single
744     character. */
745    
746     class_charcount = 0;
747     class_lastchar = -1;
748    
749     /* Initialize the 32-char bit map to all zeros. We have to build the
750     map in a temporary bit of store, in case the class contains only 1
751     character, because in that case the compiled code doesn't use the
752     bit map. */
753    
754     memset(class, 0, 32 * sizeof(uschar));
755    
756     /* Process characters until ] is reached. By writing this as a "do" it
757     means that an initial ] is taken as a data character. */
758    
759     do
760     {
761     if (c == 0)
762     {
763     *errorptr = ERR6;
764     goto FAILED;
765     }
766    
767     /* Backslash may introduce a single character, or it may introduce one
768     of the specials, which just set a flag. Escaped items are checked for
769     validity in the pre-compiling pass. The sequence \b is a special case.
770 nigel 7 Inside a class (and only there) it is treated as backspace. Elsewhere
771 nigel 3 it marks a word boundary. Other escapes have preset maps ready to
772     or into the one we are building. We assume they have more than one
773     character in them, so set class_count bigger than one. */
774    
775     if (c == '\\')
776     {
777 nigel 25 c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
778 nigel 3 if (-c == ESC_b) c = '\b';
779     else if (c < 0)
780     {
781 nigel 25 register const uschar *cbits = cd->cbits;
782 nigel 3 class_charcount = 10;
783     switch (-c)
784     {
785     case ESC_d:
786 nigel 25 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
787 nigel 3 continue;
788    
789     case ESC_D:
790 nigel 25 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
791 nigel 3 continue;
792    
793     case ESC_w:
794     for (c = 0; c < 32; c++)
795 nigel 25 class[c] |= (cbits[c+cbit_digit] | cbits[c+cbit_word]);
796 nigel 3 continue;
797    
798     case ESC_W:
799     for (c = 0; c < 32; c++)
800 nigel 25 class[c] |= ~(cbits[c+cbit_digit] | cbits[c+cbit_word]);
801 nigel 3 continue;
802    
803     case ESC_s:
804 nigel 25 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
805 nigel 3 continue;
806    
807     case ESC_S:
808 nigel 25 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
809 nigel 3 continue;
810    
811     default:
812     *errorptr = ERR7;
813     goto FAILED;
814     }
815     }
816     /* Fall through if single character */
817     }
818    
819     /* A single character may be followed by '-' to form a range. However,
820     Perl does not permit ']' to be the end of the range. A '-' character
821     here is treated as a literal. */
822    
823     if (ptr[1] == '-' && ptr[2] != ']')
824     {
825     int d;
826     ptr += 2;
827     d = *ptr;
828    
829     if (d == 0)
830     {
831     *errorptr = ERR6;
832     goto FAILED;
833     }
834    
835     /* The second part of a range can be a single-character escape, but
836     not any of the other escapes. */
837    
838     if (d == '\\')
839     {
840 nigel 25 d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
841 nigel 3 if (d < 0)
842     {
843     if (d == -ESC_b) d = '\b'; else
844     {
845     *errorptr = ERR7;
846     goto FAILED;
847     }
848     }
849     }
850    
851     if (d < c)
852     {
853     *errorptr = ERR8;
854     goto FAILED;
855     }
856    
857     for (; c <= d; c++)
858     {
859     class[c/8] |= (1 << (c&7));
860     if ((options & PCRE_CASELESS) != 0)
861     {
862 nigel 25 int uc = cd->fcc[c]; /* flip case */
863 nigel 3 class[uc/8] |= (1 << (uc&7));
864     }
865     class_charcount++; /* in case a one-char range */
866     class_lastchar = c;
867     }
868     continue; /* Go get the next char in the class */
869     }
870    
871     /* Handle a lone single character - we can get here for a normal
872     non-escape char, or after \ that introduces a single character. */
873    
874     class [c/8] |= (1 << (c&7));
875     if ((options & PCRE_CASELESS) != 0)
876     {
877 nigel 25 c = cd->fcc[c]; /* flip case */
878 nigel 3 class[c/8] |= (1 << (c&7));
879     }
880     class_charcount++;
881     class_lastchar = c;
882     }
883    
884     /* Loop until ']' reached; the check for end of string happens inside the
885     loop. This "while" is the end of the "do" above. */
886    
887     while ((c = *(++ptr)) != ']');
888    
889     /* If class_charcount is 1 and class_lastchar is not negative, we saw
890     precisely one character. This doesn't need the whole 32-byte bit map.
891     We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
892     it's negative. */
893    
894     if (class_charcount == 1 && class_lastchar >= 0)
895     {
896     if (negate_class)
897     {
898     code[-1] = OP_NOT;
899     }
900     else
901     {
902     code[-1] = OP_CHARS;
903     *code++ = 1;
904     }
905     *code++ = class_lastchar;
906     }
907    
908     /* Otherwise, negate the 32-byte map if necessary, and copy it into
909     the code vector. */
910    
911     else
912     {
913     if (negate_class)
914     for (c = 0; c < 32; c++) code[c] = ~class[c];
915     else
916     memcpy(code, class, 32);
917     code += 32;
918     }
919     break;
920    
921     /* Various kinds of repeat */
922    
923     case '{':
924 nigel 25 if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
925     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
926 nigel 3 if (*errorptr != NULL) goto FAILED;
927     goto REPEAT;
928    
929     case '*':
930     repeat_min = 0;
931     repeat_max = -1;
932     goto REPEAT;
933    
934     case '+':
935     repeat_min = 1;
936     repeat_max = -1;
937     goto REPEAT;
938    
939     case '?':
940     repeat_min = 0;
941     repeat_max = 1;
942    
943     REPEAT:
944     if (previous == NULL)
945     {
946     *errorptr = ERR9;
947     goto FAILED;
948     }
949    
950 nigel 19 /* If the next character is '?' this is a minimizing repeat, by default,
951     but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
952 nigel 3 next character. */
953    
954 nigel 19 if (ptr[1] == '?')
955     { repeat_type = greedy_non_default; ptr++; }
956     else repeat_type = greedy_default;
957 nigel 3
958     /* If previous was a string of characters, chop off the last one and use it
959     as the subject of the repeat. If there was only one character, we can
960 nigel 37 abolish the previous item altogether. A repeat with a zero minimum wipes
961     out any reqchar setting, backing up to the previous value. We must also
962     adjust the countlits value. */
963 nigel 3
964 nigel 37 if (*previous == OP_CHARS)
965 nigel 3 {
966     int len = previous[1];
967 nigel 37
968     if (repeat_min == 0) *reqchar = prevreqchar;
969     *countlits += repeat_min - 1;
970    
971 nigel 3 if (len == 1)
972     {
973     c = previous[2];
974     code = previous;
975     }
976     else
977     {
978     c = previous[len+1];
979     previous[1]--;
980     code--;
981     }
982     op_type = 0; /* Use single-char op codes */
983     goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
984     }
985    
986     /* If previous was a single negated character ([^a] or similar), we use
987     one of the special opcodes, replacing it. The code is shared with single-
988     character repeats by adding a suitable offset into repeat_type. */
989    
990     else if ((int)*previous == OP_NOT)
991     {
992     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
993     c = previous[1];
994     code = previous;
995     goto OUTPUT_SINGLE_REPEAT;
996     }
997    
998     /* If previous was a character type match (\d or similar), abolish it and
999     create a suitable repeat item. The code is shared with single-character
1000     repeats by adding a suitable offset into repeat_type. */
1001    
1002 nigel 23 else if ((int)*previous < OP_EODN || *previous == OP_ANY)
1003 nigel 3 {
1004     op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
1005     c = *previous;
1006     code = previous;
1007    
1008     OUTPUT_SINGLE_REPEAT:
1009    
1010 nigel 37 /* If the maximum is zero then the minimum must also be zero; Perl allows
1011     this case, so we do too - by simply omitting the item altogether. */
1012    
1013     if (repeat_max == 0) goto END_REPEAT;
1014    
1015     /* Combine the op_type with the repeat_type */
1016    
1017     repeat_type += op_type;
1018    
1019 nigel 3 /* A minimum of zero is handled either as the special case * or ?, or as
1020     an UPTO, with the maximum given. */
1021    
1022     if (repeat_min == 0)
1023     {
1024     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1025     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1026     else
1027     {
1028     *code++ = OP_UPTO + repeat_type;
1029     *code++ = repeat_max >> 8;
1030     *code++ = (repeat_max & 255);
1031     }
1032     }
1033    
1034     /* The case {1,} is handled as the special case + */
1035    
1036     else if (repeat_min == 1 && repeat_max == -1)
1037     *code++ = OP_PLUS + repeat_type;
1038    
1039     /* The case {n,n} is just an EXACT, while the general case {n,m} is
1040     handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1041    
1042     else
1043     {
1044     if (repeat_min != 1)
1045     {
1046     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1047     *code++ = repeat_min >> 8;
1048     *code++ = (repeat_min & 255);
1049     }
1050    
1051     /* If the minimum is 1 and the previous item was a character string,
1052     we either have to put back the item that got cancelled if the string
1053     length was 1, or add the character back onto the end of a longer
1054 nigel 21 string. For a character type nothing need be done; it will just get
1055     put back naturally. Note that the final character is always going to
1056     get added below. */
1057 nigel 3
1058     else if (*previous == OP_CHARS)
1059     {
1060     if (code == previous) code += 2; else previous[1]++;
1061     }
1062    
1063 nigel 21 /* For a single negated character we also have to put back the
1064     item that got cancelled. */
1065    
1066     else if (*previous == OP_NOT) code++;
1067    
1068 nigel 9 /* If the maximum is unlimited, insert an OP_STAR. */
1069 nigel 3
1070 nigel 9 if (repeat_max < 0)
1071 nigel 3 {
1072     *code++ = c;
1073 nigel 9 *code++ = OP_STAR + repeat_type;
1074     }
1075    
1076     /* Else insert an UPTO if the max is greater than the min. */
1077    
1078     else if (repeat_max != repeat_min)
1079     {
1080     *code++ = c;
1081 nigel 3 repeat_max -= repeat_min;
1082     *code++ = OP_UPTO + repeat_type;
1083     *code++ = repeat_max >> 8;
1084     *code++ = (repeat_max & 255);
1085     }
1086     }
1087    
1088     /* The character or character type itself comes last in all cases. */
1089    
1090     *code++ = c;
1091     }
1092    
1093     /* If previous was a character class or a back reference, we put the repeat
1094 nigel 37 stuff after it, but just skip the item if the repeat was {0,0}. */
1095 nigel 3
1096 nigel 23 else if (*previous == OP_CLASS || *previous == OP_REF)
1097 nigel 3 {
1098 nigel 37 if (repeat_max == 0)
1099     {
1100     code = previous;
1101     goto END_REPEAT;
1102     }
1103 nigel 3 if (repeat_min == 0 && repeat_max == -1)
1104     *code++ = OP_CRSTAR + repeat_type;
1105     else if (repeat_min == 1 && repeat_max == -1)
1106     *code++ = OP_CRPLUS + repeat_type;
1107     else if (repeat_min == 0 && repeat_max == 1)
1108     *code++ = OP_CRQUERY + repeat_type;
1109     else
1110     {
1111     *code++ = OP_CRRANGE + repeat_type;
1112     *code++ = repeat_min >> 8;
1113     *code++ = repeat_min & 255;
1114     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1115     *code++ = repeat_max >> 8;
1116     *code++ = repeat_max & 255;
1117     }
1118     }
1119    
1120     /* If previous was a bracket group, we may have to replicate it in certain
1121 nigel 23 cases. */
1122 nigel 3
1123 nigel 23 else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
1124     (int)*previous == OP_COND)
1125 nigel 3 {
1126 nigel 31 register int i;
1127     int ketoffset = 0;
1128 nigel 9 int len = code - previous;
1129 nigel 31 uschar *bralink = NULL;
1130 nigel 3
1131 nigel 23 /* If the maximum repeat count is unlimited, find the end of the bracket
1132     by scanning through from the start, and compute the offset back to it
1133     from the current code pointer. There may be an OP_OPT setting following
1134     the final KET, so we can't find the end just by going back from the code
1135     pointer. */
1136    
1137     if (repeat_max == -1)
1138 nigel 3 {
1139 nigel 23 register uschar *ket = previous;
1140     do ket += (ket[1] << 8) + ket[2]; while (*ket != OP_KET);
1141     ketoffset = code - ket;
1142 nigel 3 }
1143    
1144 nigel 31 /* The case of a zero minimum is special because of the need to stick
1145     OP_BRAZERO in front of it, and because the group appears once in the
1146     data, whereas in other cases it appears the minimum number of times. For
1147     this reason, it is simplest to treat this case separately, as otherwise
1148     the code gets far too messy. There are several special subcases when the
1149     minimum is zero. */
1150    
1151     if (repeat_min == 0)
1152     {
1153 nigel 37 /* If we set up a required char from the bracket, we must back off
1154     to the previous value and reset the countlits value too. */
1155    
1156     if (subcountlits > 0)
1157     {
1158     *reqchar = prevreqchar;
1159     *countlits -= subcountlits;
1160     }
1161    
1162 nigel 31 /* If the maximum is also zero, we just omit the group from the output
1163     altogether. */
1164    
1165     if (repeat_max == 0)
1166     {
1167     code = previous;
1168 nigel 37 goto END_REPEAT;
1169 nigel 31 }
1170    
1171     /* If the maximum is 1 or unlimited, we just have to stick in the
1172     BRAZERO and do no more at this point. */
1173    
1174     if (repeat_max <= 1)
1175     {
1176     memmove(previous+1, previous, len);
1177     code++;
1178     *previous++ = OP_BRAZERO + repeat_type;
1179     }
1180    
1181     /* If the maximum is greater than 1 and limited, we have to replicate
1182     in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1183     The first one has to be handled carefully because it's the original
1184     copy, which has to be moved up. The remainder can be handled by code
1185     that is common with the non-zero minimum case below. We just have to
1186     adjust the value of repeat_max, since one less copy is required. */
1187    
1188     else
1189     {
1190     int offset;
1191     memmove(previous+4, previous, len);
1192     code += 4;
1193     *previous++ = OP_BRAZERO + repeat_type;
1194     *previous++ = OP_BRA;
1195    
1196     /* We chain together the bracket offset fields that have to be
1197     filled in later when the ends of the brackets are reached. */
1198    
1199     offset = (bralink == NULL)? 0 : previous - bralink;
1200     bralink = previous;
1201     *previous++ = offset >> 8;
1202     *previous++ = offset & 255;
1203     }
1204    
1205     repeat_max--;
1206     }
1207    
1208     /* If the minimum is greater than zero, replicate the group as many
1209     times as necessary, and adjust the maximum to the number of subsequent
1210     copies that we need. */
1211    
1212     else
1213     {
1214     for (i = 1; i < repeat_min; i++)
1215     {
1216     memcpy(code, previous, len);
1217     code += len;
1218     }
1219     if (repeat_max > 0) repeat_max -= repeat_min;
1220     }
1221    
1222     /* This code is common to both the zero and non-zero minimum cases. If
1223     the maximum is limited, it replicates the group in a nested fashion,
1224     remembering the bracket starts on a stack. In the case of a zero minimum,
1225     the first one was set up above. In all cases the repeat_max now specifies
1226     the number of additional copies needed. */
1227    
1228     if (repeat_max >= 0)
1229     {
1230     for (i = repeat_max - 1; i >= 0; i--)
1231     {
1232     *code++ = OP_BRAZERO + repeat_type;
1233    
1234     /* All but the final copy start a new nesting, maintaining the
1235     chain of brackets outstanding. */
1236    
1237     if (i != 0)
1238     {
1239     int offset;
1240     *code++ = OP_BRA;
1241     offset = (bralink == NULL)? 0 : code - bralink;
1242     bralink = code;
1243     *code++ = offset >> 8;
1244     *code++ = offset & 255;
1245     }
1246    
1247     memcpy(code, previous, len);
1248     code += len;
1249     }
1250    
1251     /* Now chain through the pending brackets, and fill in their length
1252     fields (which are holding the chain links pro tem). */
1253    
1254     while (bralink != NULL)
1255     {
1256     int oldlinkoffset;
1257     int offset = code - bralink + 1;
1258     uschar *bra = code - offset;
1259     oldlinkoffset = (bra[1] << 8) + bra[2];
1260     bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
1261     *code++ = OP_KET;
1262     *code++ = bra[1] = offset >> 8;
1263     *code++ = bra[2] = (offset & 255);
1264     }
1265     }
1266    
1267     /* If the maximum is unlimited, set a repeater in the final copy. We
1268     can't just offset backwards from the current code point, because we
1269     don't know if there's been an options resetting after the ket. The
1270     correct offset was computed above. */
1271    
1272     else code[-ketoffset] = OP_KETRMAX + repeat_type;
1273 nigel 3 }
1274    
1275     /* Else there's some kind of shambles */
1276    
1277     else
1278     {
1279     *errorptr = ERR11;
1280     goto FAILED;
1281     }
1282    
1283     /* In all cases we no longer have a previous item. */
1284    
1285 nigel 37 END_REPEAT:
1286 nigel 3 previous = NULL;
1287     break;
1288    
1289    
1290 nigel 23 /* Start of nested bracket sub-expression, or comment or lookahead or
1291     lookbehind or option setting or condition. First deal with special things
1292     that can come after a bracket; all are introduced by ?, and the appearance
1293     of any of them means that this is not a referencing group. They were
1294     checked for validity in the first pass over the string, so we don't have to
1295     check for syntax errors here. */
1296 nigel 3
1297     case '(':
1298 nigel 23 newoptions = options;
1299     condref = -1;
1300    
1301 nigel 3 if (*(++ptr) == '?')
1302     {
1303 nigel 23 int set, unset;
1304     int *optset;
1305 nigel 3
1306     switch (*(++ptr))
1307     {
1308 nigel 23 case '#': /* Comment; skip to ket */
1309 nigel 3 ptr++;
1310     while (*ptr != ')') ptr++;
1311     continue;
1312    
1313     case ':': /* Non-extracting bracket */
1314 nigel 23 bravalue = OP_BRA;
1315 nigel 3 ptr++;
1316     break;
1317    
1318 nigel 23 case '(':
1319     bravalue = OP_COND; /* Conditional group */
1320 nigel 25 if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
1321 nigel 23 {
1322     condref = *ptr - '0';
1323     while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
1324     ptr++;
1325     }
1326     else ptr--;
1327     break;
1328    
1329     case '=': /* Positive lookahead */
1330 nigel 3 bravalue = OP_ASSERT;
1331     ptr++;
1332     break;
1333    
1334 nigel 23 case '!': /* Negative lookahead */
1335 nigel 3 bravalue = OP_ASSERT_NOT;
1336     ptr++;
1337     break;
1338    
1339 nigel 23 case '<': /* Lookbehinds */
1340     switch (*(++ptr))
1341 nigel 3 {
1342 nigel 23 case '=': /* Positive lookbehind */
1343     bravalue = OP_ASSERTBACK;
1344 nigel 3 ptr++;
1345     break;
1346 nigel 23
1347     case '!': /* Negative lookbehind */
1348     bravalue = OP_ASSERTBACK_NOT;
1349     ptr++;
1350     break;
1351    
1352     default: /* Syntax error */
1353     *errorptr = ERR24;
1354     goto FAILED;
1355 nigel 3 }
1356 nigel 23 break;
1357 nigel 3
1358 nigel 23 case '>': /* One-time brackets */
1359     bravalue = OP_ONCE;
1360     ptr++;
1361     break;
1362    
1363     default: /* Option setting */
1364     set = unset = 0;
1365     optset = &set;
1366    
1367     while (*ptr != ')' && *ptr != ':')
1368     {
1369     switch (*ptr++)
1370     {
1371     case '-': optset = &unset; break;
1372    
1373     case 'i': *optset |= PCRE_CASELESS; break;
1374     case 'm': *optset |= PCRE_MULTILINE; break;
1375     case 's': *optset |= PCRE_DOTALL; break;
1376     case 'x': *optset |= PCRE_EXTENDED; break;
1377     case 'U': *optset |= PCRE_UNGREEDY; break;
1378     case 'X': *optset |= PCRE_EXTRA; break;
1379    
1380     default:
1381     *errorptr = ERR12;
1382     goto FAILED;
1383     }
1384     }
1385    
1386     /* Set up the changed option bits, but don't change anything yet. */
1387    
1388     newoptions = (options | set) & (~unset);
1389    
1390     /* If the options ended with ')' this is not the start of a nested
1391     group with option changes, so the options change at this level. At top
1392     level there is nothing else to be done (the options will in fact have
1393     been set from the start of compiling as a result of the first pass) but
1394     at an inner level we must compile code to change the ims options if
1395     necessary, and pass the new setting back so that it can be put at the
1396     start of any following branches, and when this group ends, a resetting
1397     item can be compiled. */
1398    
1399     if (*ptr == ')')
1400     {
1401     if ((options & PCRE_INGROUP) != 0 &&
1402     (options & PCRE_IMS) != (newoptions & PCRE_IMS))
1403     {
1404     *code++ = OP_OPT;
1405     *code++ = *optchanged = newoptions & PCRE_IMS;
1406     }
1407     options = newoptions; /* Change options at this level */
1408     previous = NULL; /* This item can't be repeated */
1409     continue; /* It is complete */
1410     }
1411    
1412     /* If the options ended with ':' we are heading into a nested group
1413     with possible change of options. Such groups are non-capturing and are
1414     not assertions of any kind. All we need to do is skip over the ':';
1415     the newoptions value is handled below. */
1416    
1417     bravalue = OP_BRA;
1418     ptr++;
1419 nigel 3 }
1420     }
1421    
1422 nigel 23 /* Else we have a referencing group; adjust the opcode. */
1423 nigel 3
1424     else
1425     {
1426     if (++(*brackets) > EXTRACT_MAX)
1427     {
1428     *errorptr = ERR13;
1429     goto FAILED;
1430     }
1431     bravalue = OP_BRA + *brackets;
1432     }
1433    
1434 nigel 23 /* Process nested bracketed re. Assertions may not be repeated, but other
1435     kinds can be. We copy code into a non-register variable in order to be able
1436     to pass its address because some compilers complain otherwise. Pass in a
1437     new setting for the ims options if they have changed. */
1438 nigel 3
1439 nigel 23 previous = (bravalue >= OP_ONCE)? code : NULL;
1440 nigel 3 *code = bravalue;
1441 nigel 23 tempcode = code;
1442    
1443     if (!compile_regex(
1444     options | PCRE_INGROUP, /* Set for all nested groups */
1445     ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
1446     newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
1447     brackets, /* Bracket level */
1448     &tempcode, /* Where to put code (updated) */
1449     &ptr, /* Input pointer (updated) */
1450     errorptr, /* Where to put an error message */
1451     (bravalue == OP_ASSERTBACK ||
1452     bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1453 nigel 25 condref, /* Condition reference number */
1454 nigel 37 &subreqchar, /* For possible last char */
1455     &subcountlits, /* For literal count */
1456 nigel 25 cd)) /* Tables block */
1457 nigel 23 goto FAILED;
1458    
1459     /* At the end of compiling, code is still pointing to the start of the
1460     group, while tempcode has been updated to point past the end of the group
1461     and any option resetting that may follow it. The pattern pointer (ptr)
1462     is on the bracket. */
1463    
1464     /* If this is a conditional bracket, check that there are no more than
1465     two branches in the group. */
1466    
1467     if (bravalue == OP_COND)
1468 nigel 3 {
1469 nigel 23 uschar *tc = code;
1470 nigel 37 condcount = 0;
1471 nigel 23
1472     do {
1473 nigel 37 condcount++;
1474 nigel 23 tc += (tc[1] << 8) | tc[2];
1475     }
1476     while (*tc != OP_KET);
1477    
1478 nigel 37 if (condcount > 2)
1479 nigel 23 {
1480     *errorptr = ERR27;
1481 nigel 3 goto FAILED;
1482 nigel 23 }
1483 nigel 3 }
1484    
1485 nigel 37 /* Handle updating of the required character. If the subpattern didn't
1486     set one, leave it as it was. Otherwise, update it for normal brackets of
1487     all kinds, forward assertions, and conditions with two branches. Don't
1488     update the literal count for forward assertions, however. If the bracket
1489     is followed by a quantifier with zero repeat, we have to back off. Hence
1490     the definition of prevreqchar and subcountlits outside the main loop so
1491     that they can be accessed for the back off. */
1492    
1493     if (subreqchar > 0 &&
1494     (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT ||
1495     (bravalue == OP_COND && condcount == 2)))
1496     {
1497     prevreqchar = *reqchar;
1498     *reqchar = subreqchar;
1499     if (bravalue != OP_ASSERT) *countlits += subcountlits;
1500     }
1501    
1502 nigel 23 /* Now update the main code pointer to the end of the group. */
1503    
1504     code = tempcode;
1505    
1506     /* Error if hit end of pattern */
1507    
1508 nigel 3 if (*ptr != ')')
1509     {
1510     *errorptr = ERR14;
1511     goto FAILED;
1512     }
1513     break;
1514    
1515     /* Check \ for being a real metacharacter; if not, fall through and handle
1516     it as a data character at the start of a string. Escape items are checked
1517     for validity in the pre-compiling pass. */
1518    
1519     case '\\':
1520 nigel 23 tempptr = ptr;
1521 nigel 25 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1522 nigel 3
1523     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1524     are arranged to be the negation of the corresponding OP_values. For the
1525     back references, the values are ESC_REF plus the reference number. Only
1526     back references and those types that consume a character may be repeated.
1527     We can test for values between ESC_b and ESC_Z for the latter; this may
1528     have to change if any new ones are ever created. */
1529    
1530     if (c < 0)
1531     {
1532     if (-c >= ESC_REF)
1533     {
1534     previous = code;
1535     *code++ = OP_REF;
1536 nigel 23 *code++ = -c - ESC_REF;
1537 nigel 3 }
1538     else
1539     {
1540 nigel 23 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
1541 nigel 3 *code++ = -c;
1542     }
1543     continue;
1544     }
1545    
1546 nigel 7 /* Data character: reset and fall through */
1547 nigel 3
1548 nigel 23 ptr = tempptr;
1549 nigel 3 c = '\\';
1550    
1551     /* Handle a run of data characters until a metacharacter is encountered.
1552     The first character is guaranteed not to be whitespace or # when the
1553     extended flag is set. */
1554    
1555     NORMAL_CHAR:
1556     default:
1557     previous = code;
1558     *code = OP_CHARS;
1559     code += 2;
1560     length = 0;
1561    
1562     do
1563     {
1564     if ((options & PCRE_EXTENDED) != 0)
1565     {
1566 nigel 25 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1567 nigel 3 if (c == '#')
1568     {
1569     while ((c = *(++ptr)) != 0 && c != '\n');
1570     if (c == 0) break;
1571     continue;
1572     }
1573     }
1574    
1575     /* Backslash may introduce a data char or a metacharacter. Escaped items
1576     are checked for validity in the pre-compiling pass. Stop the string
1577     before a metaitem. */
1578    
1579     if (c == '\\')
1580     {
1581 nigel 23 tempptr = ptr;
1582 nigel 25 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1583 nigel 23 if (c < 0) { ptr = tempptr; break; }
1584 nigel 3 }
1585    
1586     /* Ordinary character or single-char escape */
1587    
1588     *code++ = c;
1589     length++;
1590     }
1591    
1592     /* This "while" is the end of the "do" above. */
1593    
1594 nigel 25 while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
1595 nigel 3
1596 nigel 37 /* Update the last character and the count of literals */
1597    
1598     prevreqchar = (length > 1)? code[-2] : *reqchar;
1599     *reqchar = code[-1];
1600     *countlits += length;
1601    
1602 nigel 3 /* Compute the length and set it in the data vector, and advance to
1603     the next state. */
1604    
1605     previous[1] = length;
1606 nigel 15 if (length < 255) ptr--;
1607 nigel 3 break;
1608     }
1609     } /* end of big loop */
1610    
1611     /* Control never reaches here by falling through, only by a goto for all the
1612     error states. Pass back the position in the pattern so that it can be displayed
1613     to the user for diagnosing the error. */
1614    
1615     FAILED:
1616     *ptrptr = ptr;
1617     return FALSE;
1618     }
1619    
1620    
1621    
1622    
1623     /*************************************************
1624     * Compile sequence of alternatives *
1625     *************************************************/
1626    
1627     /* On entry, ptr is pointing past the bracket character, but on return
1628     it points to the closing bracket, or vertical bar, or end of string.
1629     The code variable is pointing at the byte into which the BRA operator has been
1630 nigel 23 stored. If the ims options are changed at the start (for a (?ims: group) or
1631     during any branch, we need to insert an OP_OPT item at the start of every
1632     following branch to ensure they get set correctly at run time, and also pass
1633     the new options into every subsequent branch compile.
1634 nigel 3
1635     Argument:
1636 nigel 23 options the option bits
1637     optchanged new ims options to set as if (?ims) were at the start, or -1
1638     for no change
1639     brackets -> int containing the number of extracting brackets used
1640     codeptr -> the address of the current code pointer
1641     ptrptr -> the address of the current pattern pointer
1642     errorptr -> pointer to error message
1643     lookbehind TRUE if this is a lookbehind assertion
1644     condref > 0 for OPT_CREF setting at start of conditional group
1645 nigel 37 reqchar -> place to put the last required character, or a negative number
1646     countlits -> place to put the shortest literal count of any branch
1647 nigel 25 cd points to the data block with tables pointers
1648 nigel 3
1649 nigel 23 Returns: TRUE on success
1650 nigel 3 */
1651    
1652     static BOOL
1653 nigel 23 compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
1654 nigel 25 const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,
1655 nigel 37 int *reqchar, int *countlits, compile_data *cd)
1656 nigel 3 {
1657 nigel 7 const uschar *ptr = *ptrptr;
1658 nigel 3 uschar *code = *codeptr;
1659 nigel 23 uschar *last_branch = code;
1660 nigel 3 uschar *start_bracket = code;
1661 nigel 23 uschar *reverse_count = NULL;
1662     int oldoptions = options & PCRE_IMS;
1663 nigel 37 int branchreqchar, branchcountlits;
1664 nigel 3
1665 nigel 37 *reqchar = -1;
1666     *countlits = INT_MAX;
1667 nigel 23 code += 3;
1668    
1669     /* At the start of a reference-based conditional group, insert the reference
1670     number as an OP_CREF item. */
1671    
1672     if (condref > 0)
1673     {
1674     *code++ = OP_CREF;
1675     *code++ = condref;
1676     }
1677    
1678     /* Loop for each alternative branch */
1679    
1680 nigel 3 for (;;)
1681     {
1682     int length;
1683    
1684 nigel 23 /* Handle change of options */
1685    
1686     if (optchanged >= 0)
1687 nigel 3 {
1688 nigel 23 *code++ = OP_OPT;
1689     *code++ = optchanged;
1690     options = (options & ~PCRE_IMS) | optchanged;
1691     }
1692    
1693     /* Set up dummy OP_REVERSE if lookbehind assertion */
1694    
1695     if (lookbehind)
1696     {
1697     *code++ = OP_REVERSE;
1698     reverse_count = code;
1699     *code++ = 0;
1700     *code++ = 0;
1701     }
1702    
1703     /* Now compile the branch */
1704    
1705 nigel 37 if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged,
1706     &branchreqchar, &branchcountlits, cd))
1707 nigel 23 {
1708 nigel 3 *ptrptr = ptr;
1709     return FALSE;
1710     }
1711    
1712     /* Fill in the length of the last branch */
1713    
1714     length = code - last_branch;
1715     last_branch[1] = length >> 8;
1716     last_branch[2] = length & 255;
1717    
1718 nigel 37 /* Save the last required character if all branches have the same; a current
1719     value of -1 means unset, while -2 means "previous branch had no last required
1720     char". */
1721    
1722     if (*reqchar != -2)
1723     {
1724     if (branchreqchar >= 0)
1725     {
1726     if (*reqchar == -1) *reqchar = branchreqchar;
1727     else if (*reqchar != branchreqchar) *reqchar = -2;
1728     }
1729     else *reqchar = -2;
1730     }
1731    
1732     /* Keep the shortest literal count */
1733    
1734     if (branchcountlits < *countlits) *countlits = branchcountlits;
1735     DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits));
1736    
1737 nigel 23 /* If lookbehind, check that this branch matches a fixed-length string,
1738     and put the length into the OP_REVERSE item. Temporarily mark the end of
1739     the branch with OP_END. */
1740    
1741     if (lookbehind)
1742     {
1743     *code = OP_END;
1744     length = find_fixedlength(last_branch);
1745     DPRINTF(("fixed length = %d\n", length));
1746     if (length < 0)
1747     {
1748     *errorptr = ERR25;
1749     *ptrptr = ptr;
1750     return FALSE;
1751     }
1752     reverse_count[0] = (length >> 8);
1753     reverse_count[1] = length & 255;
1754     }
1755    
1756 nigel 3 /* Reached end of expression, either ')' or end of pattern. Insert a
1757     terminating ket and the length of the whole bracketed item, and return,
1758 nigel 23 leaving the pointer at the terminating char. If any of the ims options
1759     were changed inside the group, compile a resetting op-code following. */
1760 nigel 3
1761     if (*ptr != '|')
1762     {
1763     length = code - start_bracket;
1764     *code++ = OP_KET;
1765     *code++ = length >> 8;
1766     *code++ = length & 255;
1767 nigel 23 if (optchanged >= 0)
1768     {
1769     *code++ = OP_OPT;
1770     *code++ = oldoptions;
1771     }
1772 nigel 3 *codeptr = code;
1773     *ptrptr = ptr;
1774     return TRUE;
1775     }
1776    
1777     /* Another branch follows; insert an "or" node and advance the pointer. */
1778    
1779     *code = OP_ALT;
1780 nigel 23 last_branch = code;
1781     code += 3;
1782 nigel 3 ptr++;
1783     }
1784     /* Control never reaches here */
1785     }
1786    
1787    
1788    
1789 nigel 23
1790 nigel 3 /*************************************************
1791 nigel 23 * Find first significant op code *
1792     *************************************************/
1793    
1794     /* This is called by several functions that scan a compiled expression looking
1795     for a fixed first character, or an anchoring op code etc. It skips over things
1796     that do not influence this. For one application, a change of caseless option is
1797     important.
1798    
1799     Arguments:
1800     code pointer to the start of the group
1801     options pointer to external options
1802     optbit the option bit whose changing is significant, or
1803     zero if none are
1804     optstop TRUE to return on option change, otherwise change the options
1805     value and continue
1806    
1807     Returns: pointer to the first significant opcode
1808     */
1809    
1810     static const uschar*
1811     first_significant_code(const uschar *code, int *options, int optbit,
1812     BOOL optstop)
1813     {
1814     for (;;)
1815     {
1816     switch ((int)*code)
1817     {
1818     case OP_OPT:
1819     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1820     {
1821     if (optstop) return code;
1822     *options = (int)code[1];
1823     }
1824     code += 2;
1825     break;
1826    
1827     case OP_CREF:
1828     code += 2;
1829     break;
1830    
1831 nigel 35 case OP_WORD_BOUNDARY:
1832     case OP_NOT_WORD_BOUNDARY:
1833     code++;
1834     break;
1835    
1836 nigel 23 case OP_ASSERT_NOT:
1837     case OP_ASSERTBACK:
1838     case OP_ASSERTBACK_NOT:
1839     do code += (code[1] << 8) + code[2]; while (*code == OP_ALT);
1840     code += 3;
1841     break;
1842    
1843     default:
1844     return code;
1845     }
1846     }
1847     /* Control never reaches here */
1848     }
1849    
1850    
1851    
1852    
1853     /*************************************************
1854 nigel 3 * Check for anchored expression *
1855     *************************************************/
1856    
1857     /* Try to find out if this is an anchored regular expression. Consider each
1858     alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
1859     all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
1860     it's anchored. However, if this is a multiline pattern, then only OP_SOD
1861     counts, since OP_CIRC can match in the middle.
1862    
1863 nigel 33 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
1864     because that will try the rest of the pattern at all possible matching points,
1865     so there is no point trying them again.
1866 nigel 3
1867 nigel 23 Arguments:
1868     code points to start of expression (the bracket)
1869     options points to the options setting
1870    
1871     Returns: TRUE or FALSE
1872 nigel 3 */
1873    
1874     static BOOL
1875 nigel 23 is_anchored(register const uschar *code, int *options)
1876 nigel 3 {
1877     do {
1878 nigel 23 const uschar *scode = first_significant_code(code + 3, options,
1879     PCRE_MULTILINE, FALSE);
1880     register int op = *scode;
1881     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
1882     { if (!is_anchored(scode, options)) return FALSE; }
1883 nigel 33 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
1884     (*options & PCRE_DOTALL) != 0)
1885 nigel 23 { if (scode[1] != OP_ANY) return FALSE; }
1886     else if (op != OP_SOD &&
1887     ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
1888     return FALSE;
1889 nigel 3 code += (code[1] << 8) + code[2];
1890     }
1891     while (*code == OP_ALT);
1892     return TRUE;
1893     }
1894    
1895    
1896    
1897     /*************************************************
1898 nigel 33 * Check for starting with ^ or .* *
1899 nigel 3 *************************************************/
1900    
1901 nigel 33 /* This is called to find out if every branch starts with ^ or .* so that
1902     "first char" processing can be done to speed things up in multiline
1903     matching and for non-DOTALL patterns that start with .* (which must start at
1904     the beginning or after \n).
1905 nigel 3
1906     Argument: points to start of expression (the bracket)
1907     Returns: TRUE or FALSE
1908     */
1909    
1910     static BOOL
1911 nigel 7 is_startline(const uschar *code)
1912 nigel 3 {
1913     do {
1914 nigel 23 const uschar *scode = first_significant_code(code + 3, NULL, 0, FALSE);
1915     register int op = *scode;
1916     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
1917     { if (!is_startline(scode)) return FALSE; }
1918 nigel 33 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
1919     { if (scode[1] != OP_ANY) return FALSE; }
1920 nigel 23 else if (op != OP_CIRC) return FALSE;
1921 nigel 3 code += (code[1] << 8) + code[2];
1922     }
1923     while (*code == OP_ALT);
1924     return TRUE;
1925     }
1926    
1927    
1928    
1929     /*************************************************
1930     * Check for fixed first char *
1931     *************************************************/
1932    
1933     /* Try to find out if there is a fixed first character. This is called for
1934     unanchored expressions, as it speeds up their processing quite considerably.
1935     Consider each alternative branch. If they all start with the same char, or with
1936     a bracket all of whose alternatives start with the same char (recurse ad lib),
1937     then we return that char, otherwise -1.
1938    
1939 nigel 23 Arguments:
1940     code points to start of expression (the bracket)
1941     options pointer to the options (used to check casing changes)
1942    
1943     Returns: -1 or the fixed first char
1944 nigel 3 */
1945    
1946     static int
1947 nigel 23 find_firstchar(const uschar *code, int *options)
1948 nigel 3 {
1949     register int c = -1;
1950 nigel 23 do {
1951     int d;
1952     const uschar *scode = first_significant_code(code + 3, options,
1953     PCRE_CASELESS, TRUE);
1954     register int op = *scode;
1955 nigel 3
1956 nigel 23 if (op >= OP_BRA) op = OP_BRA;
1957 nigel 3
1958 nigel 23 switch(op)
1959     {
1960     default:
1961     return -1;
1962 nigel 3
1963 nigel 23 case OP_BRA:
1964     case OP_ASSERT:
1965     case OP_ONCE:
1966     case OP_COND:
1967     if ((d = find_firstchar(scode, options)) < 0) return -1;
1968     if (c < 0) c = d; else if (c != d) return -1;
1969     break;
1970 nigel 3
1971 nigel 23 case OP_EXACT: /* Fall through */
1972     scode++;
1973 nigel 3
1974 nigel 23 case OP_CHARS: /* Fall through */
1975     scode++;
1976    
1977     case OP_PLUS:
1978     case OP_MINPLUS:
1979     if (c < 0) c = scode[1]; else if (c != scode[1]) return -1;
1980     break;
1981     }
1982    
1983     code += (code[1] << 8) + code[2];
1984     }
1985 nigel 3 while (*code == OP_ALT);
1986     return c;
1987     }
1988    
1989    
1990    
1991 nigel 23
1992    
1993 nigel 3 /*************************************************
1994     * Compile a Regular Expression *
1995     *************************************************/
1996    
1997     /* This function takes a string and returns a pointer to a block of store
1998     holding a compiled version of the expression.
1999    
2000     Arguments:
2001     pattern the regular expression
2002     options various option bits
2003     errorptr pointer to pointer to error text
2004     erroroffset ptr offset in pattern where error was detected
2005 nigel 25 tables pointer to character tables or NULL
2006 nigel 3
2007     Returns: pointer to compiled data block, or NULL on error,
2008     with errorptr and erroroffset set
2009     */
2010    
2011     pcre *
2012 nigel 7 pcre_compile(const char *pattern, int options, const char **errorptr,
2013 nigel 25 int *erroroffset, const unsigned char *tables)
2014 nigel 3 {
2015     real_pcre *re;
2016     int length = 3; /* For initial BRA plus length */
2017     int runlength;
2018 nigel 37 int c, size, reqchar, countlits;
2019 nigel 3 int bracount = 0;
2020     int top_backref = 0;
2021 nigel 23 int branch_extra = 0;
2022     int branch_newextra;
2023 nigel 7 unsigned int brastackptr = 0;
2024     uschar *code;
2025     const uschar *ptr;
2026 nigel 25 compile_data compile_block;
2027 nigel 23 int brastack[BRASTACK_SIZE];
2028     uschar bralenstack[BRASTACK_SIZE];
2029 nigel 3
2030     #ifdef DEBUG
2031     uschar *code_base, *code_end;
2032     #endif
2033    
2034     /* We can't pass back an error message if errorptr is NULL; I guess the best we
2035     can do is just return NULL. */
2036    
2037     if (errorptr == NULL) return NULL;
2038     *errorptr = NULL;
2039    
2040     /* However, we can give a message for this error */
2041    
2042     if (erroroffset == NULL)
2043     {
2044     *errorptr = ERR16;
2045     return NULL;
2046     }
2047     *erroroffset = 0;
2048    
2049     if ((options & ~PUBLIC_OPTIONS) != 0)
2050     {
2051     *errorptr = ERR17;
2052     return NULL;
2053     }
2054    
2055 nigel 25 /* Set up pointers to the individual character tables */
2056    
2057     if (tables == NULL) tables = pcre_default_tables;
2058     compile_block.lcc = tables + lcc_offset;
2059     compile_block.fcc = tables + fcc_offset;
2060     compile_block.cbits = tables + cbits_offset;
2061     compile_block.ctypes = tables + ctypes_offset;
2062    
2063     /* Reflect pattern for debugging output */
2064    
2065 nigel 9 DPRINTF(("------------------------------------------------------------------\n"));
2066     DPRINTF(("%s\n", pattern));
2067 nigel 3
2068     /* The first thing to do is to make a pass over the pattern to compute the
2069     amount of store required to hold the compiled code. This does not have to be
2070     perfect as long as errors are overestimates. At the same time we can detect any
2071     internal flag settings. Make an attempt to correct for any counted white space
2072     if an "extended" flag setting appears late in the pattern. We can't be so
2073     clever for #-comments. */
2074    
2075 nigel 7 ptr = (const uschar *)(pattern - 1);
2076 nigel 3 while ((c = *(++ptr)) != 0)
2077     {
2078     int min, max;
2079     int class_charcount;
2080    
2081 nigel 23 if ((options & PCRE_EXTENDED) != 0)
2082 nigel 3 {
2083 nigel 25 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2084 nigel 23 if (c == '#')
2085     {
2086     while ((c = *(++ptr)) != 0 && c != '\n');
2087     continue;
2088     }
2089 nigel 3 }
2090    
2091     switch(c)
2092     {
2093     /* A backslashed item may be an escaped "normal" character or a
2094     character type. For a "normal" character, put the pointers and
2095     character back so that tests for whitespace etc. in the input
2096     are done correctly. */
2097    
2098     case '\\':
2099     {
2100 nigel 7 const uschar *save_ptr = ptr;
2101 nigel 25 c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
2102 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2103     if (c >= 0)
2104     {
2105     ptr = save_ptr;
2106     c = '\\';
2107     goto NORMAL_CHAR;
2108     }
2109     }
2110     length++;
2111    
2112     /* A back reference needs an additional char, plus either one or 5
2113     bytes for a repeat. We also need to keep the value of the highest
2114     back reference. */
2115    
2116     if (c <= -ESC_REF)
2117     {
2118     int refnum = -c - ESC_REF;
2119     if (refnum > top_backref) top_backref = refnum;
2120     length++; /* For single back reference */
2121 nigel 25 if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2122 nigel 3 {
2123 nigel 25 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2124 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2125     if ((min == 0 && (max == 1 || max == -1)) ||
2126     (min == 1 && max == -1))
2127     length++;
2128     else length += 5;
2129     if (ptr[1] == '?') ptr++;
2130     }
2131     }
2132     continue;
2133    
2134     case '^':
2135     case '.':
2136     case '$':
2137     case '*': /* These repeats won't be after brackets; */
2138     case '+': /* those are handled separately */
2139     case '?':
2140     length++;
2141     continue;
2142    
2143     /* This covers the cases of repeats after a single char, metachar, class,
2144     or back reference. */
2145    
2146     case '{':
2147 nigel 25 if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
2148     ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
2149 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2150     if ((min == 0 && (max == 1 || max == -1)) ||
2151     (min == 1 && max == -1))
2152     length++;
2153     else
2154     {
2155     length--; /* Uncount the original char or metachar */
2156     if (min == 1) length++; else if (min > 0) length += 4;
2157     if (max > 0) length += 4; else length += 2;
2158     }
2159     if (ptr[1] == '?') ptr++;
2160     continue;
2161    
2162 nigel 23 /* An alternation contains an offset to the next branch or ket. If any ims
2163     options changed in the previous branch(es), and/or if we are in a
2164     lookbehind assertion, extra space will be needed at the start of the
2165     branch. This is handled by branch_extra. */
2166    
2167 nigel 3 case '|':
2168 nigel 23 length += 3 + branch_extra;
2169 nigel 3 continue;
2170    
2171     /* A character class uses 33 characters. Don't worry about character types
2172     that aren't allowed in classes - they'll get picked up during the compile.
2173     A character class that contains only one character uses 2 or 3 bytes,
2174     depending on whether it is negated or not. Notice this where we can. */
2175    
2176     case '[':
2177     class_charcount = 0;
2178     if (*(++ptr) == '^') ptr++;
2179     do
2180     {
2181     if (*ptr == '\\')
2182     {
2183 nigel 25 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
2184     &compile_block);
2185 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2186 nigel 9 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2187 nigel 3 }
2188     else class_charcount++;
2189     ptr++;
2190     }
2191     while (*ptr != 0 && *ptr != ']');
2192    
2193     /* Repeats for negated single chars are handled by the general code */
2194    
2195     if (class_charcount == 1) length += 3; else
2196     {
2197     length += 33;
2198    
2199     /* A repeat needs either 1 or 5 bytes. */
2200    
2201 nigel 25 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2202 nigel 3 {
2203 nigel 25 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2204 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2205     if ((min == 0 && (max == 1 || max == -1)) ||
2206     (min == 1 && max == -1))
2207     length++;
2208     else length += 5;
2209     if (ptr[1] == '?') ptr++;
2210     }
2211     }
2212     continue;
2213    
2214     /* Brackets may be genuine groups or special things */
2215    
2216     case '(':
2217 nigel 23 branch_newextra = 0;
2218 nigel 3
2219     /* Handle special forms of bracket, which all start (? */
2220    
2221 nigel 23 if (ptr[1] == '?')
2222 nigel 3 {
2223 nigel 23 int set, unset;
2224     int *optset;
2225    
2226     switch (c = ptr[2])
2227 nigel 3 {
2228 nigel 23 /* Skip over comments entirely */
2229     case '#':
2230     ptr += 3;
2231     while (*ptr != 0 && *ptr != ')') ptr++;
2232     if (*ptr == 0)
2233     {
2234     *errorptr = ERR18;
2235     goto PCRE_ERROR_RETURN;
2236     }
2237     continue;
2238 nigel 3
2239 nigel 23 /* Non-referencing groups and lookaheads just move the pointer on, and
2240     then behave like a non-special bracket, except that they don't increment
2241     the count of extracting brackets. Ditto for the "once only" bracket,
2242     which is in Perl from version 5.005. */
2243 nigel 3
2244 nigel 23 case ':':
2245     case '=':
2246     case '!':
2247     case '>':
2248 nigel 3 ptr += 2;
2249     break;
2250    
2251 nigel 23 /* Lookbehinds are in Perl from version 5.005 */
2252 nigel 3
2253 nigel 23 case '<':
2254     if (ptr[3] == '=' || ptr[3] == '!')
2255 nigel 3 {
2256 nigel 23 ptr += 3;
2257     branch_newextra = 3;
2258     length += 3; /* For the first branch */
2259     break;
2260 nigel 3 }
2261 nigel 23 *errorptr = ERR24;
2262     goto PCRE_ERROR_RETURN;
2263    
2264     /* Conditionals are in Perl from version 5.005. The bracket must either
2265     be followed by a number (for bracket reference) or by an assertion
2266     group. */
2267    
2268     case '(':
2269 nigel 25 if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
2270 nigel 3 {
2271 nigel 23 ptr += 4;
2272     length += 2;
2273 nigel 25 while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
2274 nigel 23 if (*ptr != ')')
2275     {
2276     *errorptr = ERR26;
2277     goto PCRE_ERROR_RETURN;
2278     }
2279 nigel 3 }
2280 nigel 23 else /* An assertion must follow */
2281 nigel 3 {
2282 nigel 23 ptr++; /* Can treat like ':' as far as spacing is concerned */
2283    
2284     if (ptr[2] != '?' || strchr("=!<", ptr[3]) == NULL)
2285     {
2286     ptr += 2; /* To get right offset in message */
2287     *errorptr = ERR28;
2288     goto PCRE_ERROR_RETURN;
2289     }
2290 nigel 3 }
2291 nigel 23 break;
2292    
2293     /* Else loop checking valid options until ) is met. Anything else is an
2294     error. If we are without any brackets, i.e. at top level, the settings
2295     act as if specified in the options, so massage the options immediately.
2296     This is for backward compatibility with Perl 5.004. */
2297    
2298     default:
2299     set = unset = 0;
2300     optset = &set;
2301     ptr += 2;
2302    
2303     for (;; ptr++)
2304 nigel 3 {
2305 nigel 23 c = *ptr;
2306     switch (c)
2307     {
2308     case 'i':
2309     *optset |= PCRE_CASELESS;
2310     continue;
2311    
2312     case 'm':
2313     *optset |= PCRE_MULTILINE;
2314     continue;
2315    
2316     case 's':
2317     *optset |= PCRE_DOTALL;
2318     continue;
2319    
2320     case 'x':
2321     *optset |= PCRE_EXTENDED;
2322     continue;
2323    
2324     case 'X':
2325     *optset |= PCRE_EXTRA;
2326     continue;
2327    
2328     case 'U':
2329     *optset |= PCRE_UNGREEDY;
2330     continue;
2331    
2332     case '-':
2333     optset = &unset;
2334     continue;
2335    
2336     /* A termination by ')' indicates an options-setting-only item;
2337     this is global at top level; otherwise nothing is done here and
2338     it is handled during the compiling process on a per-bracket-group
2339     basis. */
2340    
2341     case ')':
2342     if (brastackptr == 0)
2343     {
2344     options = (options | set) & (~unset);
2345     set = unset = 0; /* To save length */
2346     }
2347     /* Fall through */
2348    
2349     /* A termination by ':' indicates the start of a nested group with
2350     the given options set. This is again handled at compile time, but
2351     we must allow for compiled space if any of the ims options are
2352     set. We also have to allow for resetting space at the end of
2353     the group, which is why 4 is added to the length and not just 2.
2354     If there are several changes of options within the same group, this
2355     will lead to an over-estimate on the length, but this shouldn't
2356     matter very much. We also have to allow for resetting options at
2357     the start of any alternations, which we do by setting
2358 nigel 37 branch_newextra to 2. Finally, we record whether the case-dependent
2359     flag ever changes within the regex. This is used by the "required
2360     character" code. */
2361 nigel 23
2362     case ':':
2363     if (((set|unset) & PCRE_IMS) != 0)
2364     {
2365     length += 4;
2366     branch_newextra = 2;
2367 nigel 37 if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
2368 nigel 23 }
2369     goto END_OPTIONS;
2370    
2371     /* Unrecognized option character */
2372    
2373     default:
2374     *errorptr = ERR12;
2375     goto PCRE_ERROR_RETURN;
2376     }
2377 nigel 3 }
2378 nigel 23
2379     /* If we hit a closing bracket, that's it - this is a freestanding
2380     option-setting. We need to ensure that branch_extra is updated if
2381     necessary. The only values branch_newextra can have here are 0 or 2.
2382     If the value is 2, then branch_extra must either be 2 or 5, depending
2383     on whether this is a lookbehind group or not. */
2384    
2385     END_OPTIONS:
2386     if (c == ')')
2387 nigel 19 {
2388 nigel 23 if (branch_newextra == 2 && (branch_extra == 0 || branch_extra == 3))
2389     branch_extra += branch_newextra;
2390 nigel 19 continue;
2391     }
2392 nigel 3
2393 nigel 23 /* If options were terminated by ':' control comes here. Fall through
2394     to handle the group below. */
2395 nigel 3 }
2396     }
2397    
2398     /* Extracting brackets must be counted so we can process escapes in a
2399     Perlish way. */
2400    
2401     else bracount++;
2402    
2403     /* Non-special forms of bracket. Save length for computing whole length
2404 nigel 23 at end if there's a repeat that requires duplication of the group. Also
2405     save the current value of branch_extra, and start the new group with
2406     the new value. If non-zero, this will either be 2 for a (?imsx: group, or 3
2407     for a lookbehind assertion. */
2408 nigel 3
2409     if (brastackptr >= sizeof(brastack)/sizeof(int))
2410     {
2411     *errorptr = ERR19;
2412     goto PCRE_ERROR_RETURN;
2413     }
2414    
2415 nigel 23 bralenstack[brastackptr] = branch_extra;
2416     branch_extra = branch_newextra;
2417    
2418 nigel 3 brastack[brastackptr++] = length;
2419     length += 3;
2420     continue;
2421    
2422     /* Handle ket. Look for subsequent max/min; for certain sets of values we
2423 nigel 9 have to replicate this bracket up to that many times. If brastackptr is
2424     0 this is an unmatched bracket which will generate an error, but take care
2425 nigel 23 not to try to access brastack[-1] when computing the length and restoring
2426     the branch_extra value. */
2427 nigel 3
2428     case ')':
2429     length += 3;
2430     {
2431 nigel 9 int minval = 1;
2432     int maxval = 1;
2433 nigel 23 int duplength;
2434 nigel 3
2435 nigel 23 if (brastackptr > 0)
2436     {
2437     duplength = length - brastack[--brastackptr];
2438     branch_extra = bralenstack[brastackptr];
2439     }
2440     else duplength = 0;
2441    
2442 nigel 3 /* Leave ptr at the final char; for read_repeat_counts this happens
2443     automatically; for the others we need an increment. */
2444    
2445 nigel 25 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
2446 nigel 3 {
2447 nigel 25 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr,
2448     &compile_block);
2449 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2450     }
2451 nigel 9 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2452     else if (c == '+') { maxval = -1; ptr++; }
2453     else if (c == '?') { minval = 0; ptr++; }
2454 nigel 3
2455 nigel 31 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2456     group, and if the maximum is greater than zero, we have to replicate
2457     maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2458     bracket set - hence the 7. */
2459 nigel 3
2460 nigel 31 if (minval == 0)
2461     {
2462     length++;
2463     if (maxval > 0) length += (maxval - 1) * (duplength + 7);
2464     }
2465    
2466     /* When the minimum is greater than zero, 1 we have to replicate up to
2467     minval-1 times, with no additions required in the copies. Then, if
2468     there is a limited maximum we have to replicate up to maxval-1 times
2469     allowing for a BRAZERO item before each optional copy and nesting
2470     brackets for all but one of the optional copies. */
2471    
2472     else
2473     {
2474     length += (minval - 1) * duplength;
2475     if (maxval > minval) /* Need this test as maxval=-1 means no limit */
2476     length += (maxval - minval) * (duplength + 7) - 6;
2477     }
2478 nigel 3 }
2479     continue;
2480    
2481     /* Non-special character. For a run of such characters the length required
2482     is the number of characters + 2, except that the maximum run length is 255.
2483     We won't get a skipped space or a non-data escape or the start of a #
2484     comment as the first character, so the length can't be zero. */
2485    
2486     NORMAL_CHAR:
2487     default:
2488     length += 2;
2489     runlength = 0;
2490     do
2491     {
2492 nigel 23 if ((options & PCRE_EXTENDED) != 0)
2493 nigel 3 {
2494 nigel 25 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2495 nigel 23 if (c == '#')
2496     {
2497     while ((c = *(++ptr)) != 0 && c != '\n');
2498     continue;
2499     }
2500 nigel 3 }
2501    
2502     /* Backslash may introduce a data char or a metacharacter; stop the
2503     string before the latter. */
2504    
2505     if (c == '\\')
2506     {
2507 nigel 7 const uschar *saveptr = ptr;
2508 nigel 25 c = check_escape(&ptr, errorptr, bracount, options, FALSE,
2509     &compile_block);
2510 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2511     if (c < 0) { ptr = saveptr; break; }
2512     }
2513    
2514     /* Ordinary character or single-char escape */
2515    
2516     runlength++;
2517     }
2518    
2519     /* This "while" is the end of the "do" above. */
2520    
2521 nigel 25 while (runlength < 255 &&
2522     (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
2523 nigel 3
2524     ptr--;
2525     length += runlength;
2526     continue;
2527     }
2528     }
2529    
2530     length += 4; /* For final KET and END */
2531    
2532     if (length > 65539)
2533     {
2534     *errorptr = ERR20;
2535     return NULL;
2536     }
2537    
2538     /* Compute the size of data block needed and get it, either from malloc or
2539 nigel 9 externally provided function. We specify "code[0]" in the offsetof() expression
2540     rather than just "code", because it has been reported that one broken compiler
2541     fails on "code" because it is also an independent variable. It should make no
2542     difference to the value of the offsetof(). */
2543 nigel 3
2544 nigel 9 size = length + offsetof(real_pcre, code[0]);
2545 nigel 3 re = (real_pcre *)(pcre_malloc)(size);
2546    
2547     if (re == NULL)
2548     {
2549     *errorptr = ERR21;
2550     return NULL;
2551     }
2552    
2553 nigel 9 /* Put in the magic number and the options. */
2554    
2555 nigel 3 re->magic_number = MAGIC_NUMBER;
2556     re->options = options;
2557 nigel 25 re->tables = tables;
2558 nigel 3
2559     /* Set up a starting, non-extracting bracket, then compile the expression. On
2560     error, *errorptr will be set non-NULL, so we don't need to look at the result
2561     of the function here. */
2562    
2563 nigel 7 ptr = (const uschar *)pattern;
2564 nigel 3 code = re->code;
2565     *code = OP_BRA;
2566     bracount = 0;
2567 nigel 25 (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,
2568 nigel 37 &reqchar, &countlits, &compile_block);
2569 nigel 3 re->top_bracket = bracount;
2570     re->top_backref = top_backref;
2571    
2572     /* If not reached end of pattern on success, there's an excess bracket. */
2573    
2574     if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
2575    
2576     /* Fill in the terminating state and check for disastrous overflow, but
2577     if debugging, leave the test till after things are printed out. */
2578    
2579     *code++ = OP_END;
2580    
2581     #ifndef DEBUG
2582     if (code - re->code > length) *errorptr = ERR23;
2583     #endif
2584    
2585 nigel 23 /* Give an error if there's back reference to a non-existent capturing
2586     subpattern. */
2587    
2588     if (top_backref > re->top_bracket) *errorptr = ERR15;
2589    
2590 nigel 3 /* Failed to compile */
2591    
2592     if (*errorptr != NULL)
2593     {
2594     (pcre_free)(re);
2595     PCRE_ERROR_RETURN:
2596 nigel 7 *erroroffset = ptr - (const uschar *)pattern;
2597 nigel 3 return NULL;
2598     }
2599    
2600 nigel 33 /* If the anchored option was not passed, set flag if we can determine that the
2601     pattern is anchored by virtue of ^ characters or \A or anything else (such as
2602     starting with .* when DOTALL is set).
2603 nigel 3
2604 nigel 33 Otherwise, see if we can determine what the first character has to be, because
2605     that speeds up unanchored matches no end. If not, see if we can set the
2606     PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
2607     start with ^. and also when all branches start with .* for non-DOTALL matches.
2608     */
2609    
2610 nigel 3 if ((options & PCRE_ANCHORED) == 0)
2611     {
2612 nigel 23 int temp_options = options;
2613     if (is_anchored(re->code, &temp_options))
2614 nigel 3 re->options |= PCRE_ANCHORED;
2615     else
2616     {
2617 nigel 23 int ch = find_firstchar(re->code, &temp_options);
2618 nigel 9 if (ch >= 0)
2619 nigel 3 {
2620 nigel 9 re->first_char = ch;
2621 nigel 3 re->options |= PCRE_FIRSTSET;
2622     }
2623     else if (is_startline(re->code))
2624     re->options |= PCRE_STARTLINE;
2625     }
2626     }
2627    
2628 nigel 37 /* Save the last required character if there are at least two literal
2629     characters on all paths, or if there is no first character setting. */
2630    
2631     if (reqchar >= 0 && (countlits > 1 || (re->options & PCRE_FIRSTSET) == 0))
2632     {
2633     re->req_char = reqchar;
2634     re->options |= PCRE_REQCHSET;
2635     }
2636    
2637 nigel 3 /* Print out the compiled data for debugging */
2638    
2639     #ifdef DEBUG
2640    
2641 nigel 23 printf("Length = %d top_bracket = %d top_backref = %d\n",
2642 nigel 3 length, re->top_bracket, re->top_backref);
2643    
2644     if (re->options != 0)
2645     {
2646 nigel 37 printf("%s%s%s%s%s%s%s%s%s\n",
2647 nigel 3 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2648     ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2649 nigel 37 ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
2650 nigel 3 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2651     ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2652     ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
2653     ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
2654 nigel 19 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
2655     ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
2656 nigel 3 }
2657    
2658     if ((re->options & PCRE_FIRSTSET) != 0)
2659     {
2660     if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
2661     else printf("First char = \\x%02x\n", re->first_char);
2662     }
2663    
2664 nigel 37 if ((re->options & PCRE_REQCHSET) != 0)
2665     {
2666     if (isprint(re->req_char)) printf("Req char = %c\n", re->req_char);
2667     else printf("Req char = \\x%02x\n", re->req_char);
2668     }
2669    
2670 nigel 3 code_end = code;
2671     code_base = code = re->code;
2672    
2673     while (code < code_end)
2674     {
2675     int charlength;
2676    
2677     printf("%3d ", code - code_base);
2678    
2679     if (*code >= OP_BRA)
2680     {
2681     printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
2682     code += 2;
2683     }
2684    
2685     else switch(*code)
2686     {
2687 nigel 23 case OP_OPT:
2688     printf(" %.2x %s", code[1], OP_names[*code]);
2689     code++;
2690     break;
2691    
2692     case OP_COND:
2693     printf("%3d Cond", (code[1] << 8) + code[2]);
2694     code += 2;
2695     break;
2696    
2697     case OP_CREF:
2698     printf(" %.2d %s", code[1], OP_names[*code]);
2699     code++;
2700     break;
2701    
2702 nigel 3 case OP_CHARS:
2703     charlength = *(++code);
2704     printf("%3d ", charlength);
2705     while (charlength-- > 0)
2706     if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2707     break;
2708    
2709     case OP_KETRMAX:
2710     case OP_KETRMIN:
2711     case OP_ALT:
2712     case OP_KET:
2713     case OP_ASSERT:
2714     case OP_ASSERT_NOT:
2715 nigel 23 case OP_ASSERTBACK:
2716     case OP_ASSERTBACK_NOT:
2717 nigel 3 case OP_ONCE:
2718     printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2719     code += 2;
2720     break;
2721    
2722 nigel 23 case OP_REVERSE:
2723     printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2724     code += 2;
2725     break;
2726    
2727 nigel 3 case OP_STAR:
2728     case OP_MINSTAR:
2729     case OP_PLUS:
2730     case OP_MINPLUS:
2731     case OP_QUERY:
2732     case OP_MINQUERY:
2733     case OP_TYPESTAR:
2734     case OP_TYPEMINSTAR:
2735     case OP_TYPEPLUS:
2736     case OP_TYPEMINPLUS:
2737     case OP_TYPEQUERY:
2738     case OP_TYPEMINQUERY:
2739     if (*code >= OP_TYPESTAR)
2740     printf(" %s", OP_names[code[1]]);
2741     else if (isprint(c = code[1])) printf(" %c", c);
2742     else printf(" \\x%02x", c);
2743     printf("%s", OP_names[*code++]);
2744     break;
2745    
2746     case OP_EXACT:
2747     case OP_UPTO:
2748     case OP_MINUPTO:
2749     if (isprint(c = code[3])) printf(" %c{", c);
2750     else printf(" \\x%02x{", c);
2751 nigel 11 if (*code != OP_EXACT) printf("0,");
2752 nigel 3 printf("%d}", (code[1] << 8) + code[2]);
2753     if (*code == OP_MINUPTO) printf("?");
2754     code += 3;
2755     break;
2756    
2757     case OP_TYPEEXACT:
2758     case OP_TYPEUPTO:
2759     case OP_TYPEMINUPTO:
2760     printf(" %s{", OP_names[code[3]]);
2761     if (*code != OP_TYPEEXACT) printf(",");
2762     printf("%d}", (code[1] << 8) + code[2]);
2763     if (*code == OP_TYPEMINUPTO) printf("?");
2764     code += 3;
2765     break;
2766    
2767     case OP_NOT:
2768     if (isprint(c = *(++code))) printf(" [^%c]", c);
2769     else printf(" [^\\x%02x]", c);
2770     break;
2771    
2772     case OP_NOTSTAR:
2773     case OP_NOTMINSTAR:
2774     case OP_NOTPLUS:
2775     case OP_NOTMINPLUS:
2776     case OP_NOTQUERY:
2777     case OP_NOTMINQUERY:
2778     if (isprint(c = code[1])) printf(" [^%c]", c);
2779     else printf(" [^\\x%02x]", c);
2780     printf("%s", OP_names[*code++]);
2781     break;
2782    
2783     case OP_NOTEXACT:
2784     case OP_NOTUPTO:
2785     case OP_NOTMINUPTO:
2786     if (isprint(c = code[3])) printf(" [^%c]{", c);
2787     else printf(" [^\\x%02x]{", c);
2788     if (*code != OP_NOTEXACT) printf(",");
2789     printf("%d}", (code[1] << 8) + code[2]);
2790     if (*code == OP_NOTMINUPTO) printf("?");
2791     code += 3;
2792     break;
2793    
2794     case OP_REF:
2795     printf(" \\%d", *(++code));
2796 nigel 9 code ++;
2797     goto CLASS_REF_REPEAT;
2798 nigel 3
2799     case OP_CLASS:
2800     {
2801     int i, min, max;
2802 nigel 23 code++;
2803     printf(" [");
2804 nigel 3
2805     for (i = 0; i < 256; i++)
2806     {
2807     if ((code[i/8] & (1 << (i&7))) != 0)
2808     {
2809     int j;
2810     for (j = i+1; j < 256; j++)
2811     if ((code[j/8] & (1 << (j&7))) == 0) break;
2812     if (i == '-' || i == ']') printf("\\");
2813     if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2814     if (--j > i)
2815     {
2816     printf("-");
2817     if (j == '-' || j == ']') printf("\\");
2818     if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2819     }
2820     i = j;
2821     }
2822     }
2823     printf("]");
2824     code += 32;
2825    
2826 nigel 9 CLASS_REF_REPEAT:
2827    
2828 nigel 3 switch(*code)
2829     {
2830     case OP_CRSTAR:
2831     case OP_CRMINSTAR:
2832     case OP_CRPLUS:
2833     case OP_CRMINPLUS:
2834     case OP_CRQUERY:
2835     case OP_CRMINQUERY:
2836     printf("%s", OP_names[*code]);
2837     break;
2838    
2839     case OP_CRRANGE:
2840     case OP_CRMINRANGE:
2841     min = (code[1] << 8) + code[2];
2842     max = (code[3] << 8) + code[4];
2843     if (max == 0) printf("{%d,}", min);
2844     else printf("{%d,%d}", min, max);
2845     if (*code == OP_CRMINRANGE) printf("?");
2846     code += 4;
2847     break;
2848    
2849     default:
2850     code--;
2851     }
2852     }
2853     break;
2854    
2855     /* Anything else is just a one-node item */
2856    
2857     default:
2858     printf(" %s", OP_names[*code]);
2859     break;
2860     }
2861    
2862     code++;
2863     printf("\n");
2864     }
2865     printf("------------------------------------------------------------------\n");
2866    
2867     /* This check is done here in the debugging case so that the code that
2868     was compiled can be seen. */
2869    
2870     if (code - re->code > length)
2871     {
2872     *errorptr = ERR23;
2873     (pcre_free)(re);
2874     *erroroffset = ptr - (uschar *)pattern;
2875     return NULL;
2876     }
2877     #endif
2878    
2879     return (pcre *)re;
2880     }
2881    
2882    
2883    
2884     /*************************************************
2885     * Match a back-reference *
2886     *************************************************/
2887    
2888 nigel 23 /* If a back reference hasn't been set, the length that is passed is greater
2889     than the number of characters left in the string, so the match fails.
2890 nigel 3
2891     Arguments:
2892 nigel 23 offset index into the offset vector
2893 nigel 3 eptr points into the subject
2894     length length to be matched
2895     md points to match data block
2896 nigel 23 ims the ims flags
2897 nigel 3
2898     Returns: TRUE if matched
2899     */
2900    
2901     static BOOL
2902 nigel 23 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
2903 nigel 37 unsigned long int ims)
2904 nigel 3 {
2905 nigel 23 const uschar *p = md->start_subject + md->offset_vector[offset];
2906 nigel 3
2907     #ifdef DEBUG
2908     if (eptr >= md->end_subject)
2909     printf("matching subject <null>");
2910     else
2911     {
2912     printf("matching subject ");
2913     pchars(eptr, length, TRUE, md);
2914     }
2915     printf(" against backref ");
2916     pchars(p, length, FALSE, md);
2917     printf("\n");
2918     #endif
2919    
2920     /* Always fail if not enough characters left */
2921    
2922 nigel 23 if (length > md->end_subject - eptr) return FALSE;
2923 nigel 3
2924     /* Separate the caselesss case for speed */
2925    
2926 nigel 23 if ((ims & PCRE_CASELESS) != 0)
2927 nigel 25 {
2928     while (length-- > 0)
2929     if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
2930     }
2931 nigel 3 else
2932     { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
2933    
2934     return TRUE;
2935     }
2936    
2937    
2938    
2939     /*************************************************
2940     * Match from current position *
2941     *************************************************/
2942    
2943 nigel 23 /* On entry ecode points to the first opcode, and eptr to the first character
2944     in the subject string, while eptrb holds the value of eptr at the start of the
2945     last bracketed group - used for breaking infinite loops matching zero-length
2946     strings.
2947 nigel 3
2948     Arguments:
2949     eptr pointer in subject
2950     ecode position in code
2951     offset_top current top pointer
2952     md pointer to "static" info for the match
2953 nigel 23 ims current /i, /m, and /s options
2954     condassert TRUE if called to check a condition assertion
2955     eptrb eptr at start of last bracket
2956 nigel 3
2957     Returns: TRUE if matched
2958     */
2959    
2960     static BOOL
2961 nigel 23 match(register const uschar *eptr, register const uschar *ecode,
2962 nigel 37 int offset_top, match_data *md, unsigned long int ims, BOOL condassert,
2963     const uschar *eptrb)
2964 nigel 3 {
2965 nigel 37 unsigned long int original_ims = ims; /* Save for resetting on ')' */
2966 nigel 23
2967 nigel 3 for (;;)
2968     {
2969 nigel 23 int op = (int)*ecode;
2970 nigel 3 int min, max, ctype;
2971     register int i;
2972     register int c;
2973 nigel 7 BOOL minimize = FALSE;
2974 nigel 3
2975 nigel 23 /* Opening capturing bracket. If there is space in the offset vector, save
2976     the current subject position in the working slot at the top of the vector. We
2977     mustn't change the current values of the data slot, because they may be set
2978     from a previous iteration of this group, and be referred to by a reference
2979     inside the group.
2980 nigel 3
2981 nigel 23 If the bracket fails to match, we need to restore this value and also the
2982     values of the final offsets, in case they were set by a previous iteration of
2983     the same bracket.
2984    
2985     If there isn't enough space in the offset vector, treat this as if it were a
2986     non-capturing bracket. Don't worry about setting the flag for the error case
2987     here; that is handled in the code for KET. */
2988    
2989     if (op > OP_BRA)
2990 nigel 3 {
2991 nigel 23 int number = op - OP_BRA;
2992     int offset = number << 1;
2993 nigel 3
2994 nigel 31 #ifdef DEBUG
2995     printf("start bracket %d subject=", number);
2996     pchars(eptr, 16, TRUE, md);
2997     printf("\n");
2998     #endif
2999 nigel 3
3000 nigel 23 if (offset < md->offset_max)
3001 nigel 3 {
3002 nigel 23 int save_offset1 = md->offset_vector[offset];
3003     int save_offset2 = md->offset_vector[offset+1];
3004     int save_offset3 = md->offset_vector[md->offset_end - number];
3005 nigel 3
3006 nigel 23 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
3007     md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
3008    
3009     do
3010     {
3011     if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3012     ecode += (ecode[1] << 8) + ecode[2];
3013     }
3014     while (*ecode == OP_ALT);
3015    
3016     DPRINTF(("bracket %d failed\n", number));
3017    
3018     md->offset_vector[offset] = save_offset1;
3019     md->offset_vector[offset+1] = save_offset2;
3020     md->offset_vector[md->offset_end - number] = save_offset3;
3021     return FALSE;
3022 nigel 3 }
3023    
3024 nigel 23 /* Insufficient room for saving captured contents */
3025 nigel 3
3026 nigel 23 else op = OP_BRA;
3027     }
3028    
3029     /* Other types of node can be handled by a switch */
3030    
3031     switch(op)
3032     {
3033     case OP_BRA: /* Non-capturing bracket: optimized */
3034     DPRINTF(("start bracket 0\n"));
3035 nigel 3 do
3036     {
3037 nigel 23 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3038 nigel 3 ecode += (ecode[1] << 8) + ecode[2];
3039     }
3040     while (*ecode == OP_ALT);
3041 nigel 23 DPRINTF(("bracket 0 failed\n"));
3042     return FALSE;
3043 nigel 3
3044 nigel 23 /* Conditional group: compilation checked that there are no more than
3045     two branches. If the condition is false, skipping the first branch takes us
3046     past the end if there is only one branch, but that's OK because that is
3047     exactly what going to the ket would do. */
3048 nigel 3
3049 nigel 23 case OP_COND:
3050     if (ecode[3] == OP_CREF) /* Condition is extraction test */
3051 nigel 3 {
3052 nigel 23 int offset = ecode[4] << 1; /* Doubled reference number */
3053     return match(eptr,
3054     ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
3055     5 : 3 + (ecode[1] << 8) + ecode[2]),
3056     offset_top, md, ims, FALSE, eptr);
3057 nigel 3 }
3058    
3059 nigel 23 /* The condition is an assertion. Call match() to evaluate it - setting
3060     the final argument TRUE causes it to stop at the end of an assertion. */
3061 nigel 3
3062 nigel 23 else
3063     {
3064     if (match(eptr, ecode+3, offset_top, md, ims, TRUE, NULL))
3065     {
3066     ecode += 3 + (ecode[4] << 8) + ecode[5];
3067     while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
3068     }
3069     else ecode += (ecode[1] << 8) + ecode[2];
3070     return match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr);
3071     }
3072     /* Control never reaches here */
3073 nigel 3
3074 nigel 23 /* Skip over conditional reference data if encountered (should not be) */
3075    
3076     case OP_CREF:
3077     ecode += 2;
3078     break;
3079    
3080 nigel 37 /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
3081     an empty string - recursion will then try other alternatives, if any. */
3082 nigel 23
3083 nigel 3 case OP_END:
3084 nigel 37 if (md->notempty && eptr == md->start_match) return FALSE;
3085 nigel 3 md->end_match_ptr = eptr; /* Record where we ended */
3086     md->end_offset_top = offset_top; /* and how many extracts were taken */
3087     return TRUE;
3088    
3089 nigel 23 /* Change option settings */
3090 nigel 3
3091 nigel 23 case OP_OPT:
3092     ims = ecode[1];
3093     ecode += 2;
3094 nigel 39 DPRINTF(("ims set to %02lx\n", ims));
3095 nigel 23 break;
3096 nigel 3
3097     /* Assertion brackets. Check the alternative branches in turn - the
3098     matching won't pass the KET for an assertion. If any one branch matches,
3099 nigel 23 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
3100     start of each branch to move the current point backwards, so the code at
3101     this level is identical to the lookahead case. */
3102 nigel 3
3103     case OP_ASSERT:
3104 nigel 23 case OP_ASSERTBACK:
3105 nigel 3 do
3106     {
3107 nigel 23 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) break;
3108 nigel 3 ecode += (ecode[1] << 8) + ecode[2];
3109     }
3110     while (*ecode == OP_ALT);
3111     if (*ecode == OP_KET) return FALSE;
3112    
3113 nigel 23 /* If checking an assertion for a condition, return TRUE. */
3114    
3115     if (condassert) return TRUE;
3116    
3117 nigel 3 /* Continue from after the assertion, updating the offsets high water
3118     mark, since extracts may have been taken during the assertion. */
3119    
3120     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3121     ecode += 3;
3122     offset_top = md->end_offset_top;
3123     continue;
3124    
3125     /* Negative assertion: all branches must fail to match */
3126    
3127     case OP_ASSERT_NOT:
3128 nigel 23 case OP_ASSERTBACK_NOT:
3129 nigel 3 do
3130     {
3131 nigel 23 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) return FALSE;
3132 nigel 3 ecode += (ecode[1] << 8) + ecode[2];
3133     }
3134     while (*ecode == OP_ALT);
3135 nigel 23
3136     if (condassert) return TRUE;
3137 nigel 3 ecode += 3;
3138     continue;
3139    
3140 nigel 23 /* Move the subject pointer back. This occurs only at the start of
3141     each branch of a lookbehind assertion. If we are too close to the start to
3142     move back, this match function fails. */
3143    
3144     case OP_REVERSE:
3145     eptr -= (ecode[1] << 8) + ecode[2];
3146     if (eptr < md->start_subject) return FALSE;
3147     ecode += 3;
3148     break;
3149    
3150    
3151 nigel 3 /* "Once" brackets are like assertion brackets except that after a match,
3152     the point in the subject string is not moved back. Thus there can never be
3153 nigel 5 a move back into the brackets. Check the alternative branches in turn - the
3154 nigel 3 matching won't pass the KET for this kind of subpattern. If any one branch
3155 nigel 23 matches, we carry on as at the end of a normal bracket, leaving the subject
3156     pointer. */
3157 nigel 3
3158     case OP_ONCE:
3159     {
3160 nigel 23 const uschar *prev = ecode;
3161 nigel 3
3162 nigel 23 do
3163     {
3164     if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) break;
3165     ecode += (ecode[1] << 8) + ecode[2];
3166     }
3167     while (*ecode == OP_ALT);
3168 nigel 3
3169 nigel 23 /* If hit the end of the group (which could be repeated), fail */
3170 nigel 3
3171 nigel 23 if (*ecode != OP_ONCE && *ecode != OP_ALT) return FALSE;
3172    
3173     /* Continue as from after the assertion, updating the offsets high water
3174     mark, since extracts may have been taken. */
3175    
3176     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3177    
3178     offset_top = md->end_offset_top;
3179     eptr = md->end_match_ptr;
3180    
3181     /* For a non-repeating ket, just continue at this level. This also
3182     happens for a repeating ket if no characters were matched in the group.
3183     This is the forcible breaking of infinite loops as implemented in Perl
3184     5.005. If there is an options reset, it will get obeyed in the normal
3185     course of events. */
3186    
3187     if (*ecode == OP_KET || eptr == eptrb)
3188     {
3189     ecode += 3;
3190     break;
3191     }
3192    
3193     /* The repeating kets try the rest of the pattern or restart from the
3194     preceding bracket, in the appropriate order. We need to reset any options
3195     that changed within the bracket before re-running it, so check the next
3196     opcode. */
3197    
3198     if (ecode[3] == OP_OPT)
3199     {
3200     ims = (ims & ~PCRE_IMS) | ecode[4];
3201 nigel 39 DPRINTF(("ims set to %02lx at group repeat\n", ims));
3202 nigel 23 }
3203    
3204     if (*ecode == OP_KETRMIN)
3205     {
3206     if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||
3207     match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;
3208     }
3209     else /* OP_KETRMAX */
3210     {
3211     if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||
3212     match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3213     }
3214     }
3215     return FALSE;
3216    
3217 nigel 3 /* An alternation is the end of a branch; scan along to find the end of the
3218     bracketed group and go to there. */
3219    
3220     case OP_ALT:
3221     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3222     break;
3223    
3224     /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3225     that it may occur zero times. It may repeat infinitely, or not at all -
3226     i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3227     repeat limits are compiled as a number of copies, with the optional ones
3228     preceded by BRAZERO or BRAMINZERO. */
3229    
3230     case OP_BRAZERO:
3231     {
3232 nigel 7 const uschar *next = ecode+1;
3233 nigel 23 if (match(eptr, next, offset_top, md, ims, FALSE, eptr)) return TRUE;
3234 nigel 3 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3235     ecode = next + 3;
3236     }
3237     break;
3238    
3239     case OP_BRAMINZERO:
3240     {
3241 nigel 7 const uschar *next = ecode+1;
3242 nigel 3 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3243 nigel 23 if (match(eptr, next+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3244 nigel 3 ecode++;
3245     }
3246 nigel 23 break;
3247 nigel 3
3248     /* End of a group, repeated or non-repeating. If we are at the end of
3249     an assertion "group", stop matching and return TRUE, but record the
3250 nigel 23 current high water mark for use by positive assertions. Do this also
3251     for the "once" (not-backing-up) groups. */
3252 nigel 3
3253     case OP_KET:
3254     case OP_KETRMIN:
3255     case OP_KETRMAX:
3256     {
3257 nigel 7 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3258 nigel 3
3259 nigel 23 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
3260     *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
3261     *prev == OP_ONCE)
3262 nigel 3 {
3263     md->end_match_ptr = eptr; /* For ONCE */
3264     md->end_offset_top = offset_top;
3265     return TRUE;
3266     }
3267    
3268 nigel 23 /* In all other cases except a conditional group we have to check the
3269     group number back at the start and if necessary complete handling an
3270     extraction by setting the offsets and bumping the high water mark. */
3271 nigel 3
3272 nigel 23 if (*prev != OP_COND)
3273     {
3274     int number = *prev - OP_BRA;
3275     int offset = number << 1;
3276 nigel 3
3277 nigel 23 DPRINTF(("end bracket %d\n", number));
3278 nigel 3
3279 nigel 23 if (number > 0)
3280 nigel 3 {
3281 nigel 23 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
3282     {
3283     md->offset_vector[offset] =
3284     md->offset_vector[md->offset_end - number];
3285     md->offset_vector[offset+1] = eptr - md->start_subject;
3286     if (offset_top <= offset) offset_top = offset + 2;
3287     }
3288 nigel 3 }
3289     }
3290    
3291 nigel 23 /* Reset the value of the ims flags, in case they got changed during
3292     the group. */
3293 nigel 3
3294 nigel 23 ims = original_ims;
3295 nigel 39 DPRINTF(("ims reset to %02lx\n", ims));
3296 nigel 23
3297     /* For a non-repeating ket, just continue at this level. This also
3298     happens for a repeating ket if no characters were matched in the group.
3299     This is the forcible breaking of infinite loops as implemented in Perl
3300     5.005. If there is an options reset, it will get obeyed in the normal
3301     course of events. */
3302    
3303     if (*ecode == OP_KET || eptr == eptrb)
3304 nigel 3 {
3305     ecode += 3;
3306     break;
3307     }
3308    
3309     /* The repeating kets try the rest of the pattern or restart from the
3310     preceding bracket, in the appropriate order. */
3311    
3312     if (*ecode == OP_KETRMIN)
3313     {
3314 nigel 23 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||
3315     match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;
3316 nigel 3 }
3317     else /* OP_KETRMAX */
3318     {
3319 nigel 23 if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||
3320     match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3321 nigel 3 }
3322     }
3323     return FALSE;
3324    
3325     /* Start of subject unless notbol, or after internal newline if multiline */
3326    
3327     case OP_CIRC:
3328     if (md->notbol && eptr == md->start_subject) return FALSE;
3329 nigel 23 if ((ims & PCRE_MULTILINE) != 0)
3330 nigel 3 {
3331     if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;
3332     ecode++;
3333     break;
3334     }
3335     /* ... else fall through */
3336    
3337     /* Start of subject assertion */
3338    
3339     case OP_SOD:
3340     if (eptr != md->start_subject) return FALSE;
3341     ecode++;
3342     break;
3343    
3344 nigel 23 /* Assert before internal newline if multiline, or before a terminating
3345     newline unless endonly is set, else end of subject unless noteol is set. */
3346 nigel 3
3347     case OP_DOLL:
3348 nigel 23 if ((ims & PCRE_MULTILINE) != 0)
3349 nigel 3 {
3350 nigel 23 if (eptr < md->end_subject) { if (*eptr != '\n') return FALSE; }
3351     else { if (md->noteol) return FALSE; }
3352 nigel 3 ecode++;
3353     break;
3354     }
3355 nigel 23 else
3356 nigel 3 {
3357 nigel 23 if (md->noteol) return FALSE;
3358     if (!md->endonly)
3359     {
3360     if (eptr < md->end_subject - 1 ||
3361     (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3362    
3363     ecode++;
3364     break;
3365     }
3366 nigel 3 }
3367     /* ... else fall through */
3368    
3369 nigel 23 /* End of subject assertion (\z) */
3370 nigel 3
3371     case OP_EOD:
3372     if (eptr < md->end_subject) return FALSE;
3373     ecode++;
3374     break;
3375    
3376 nigel 23 /* End of subject or ending \n assertion (\Z) */
3377    
3378     case OP_EODN:
3379     if (eptr < md->end_subject - 1 ||
3380     (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3381     ecode++;
3382     break;
3383    
3384 nigel 3 /* Word boundary assertions */
3385    
3386     case OP_NOT_WORD_BOUNDARY:
3387     case OP_WORD_BOUNDARY:
3388     {
3389     BOOL prev_is_word = (eptr != md->start_subject) &&
3390 nigel 25 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
3391 nigel 3 BOOL cur_is_word = (eptr < md->end_subject) &&
3392 nigel 25 ((md->ctypes[*eptr] & ctype_word) != 0);
3393 nigel 3 if ((*ecode++ == OP_WORD_BOUNDARY)?
3394     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3395     return FALSE;
3396     }
3397     break;
3398    
3399     /* Match a single character type; inline for speed */
3400    
3401     case OP_ANY:
3402 nigel 23 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == '\n')
3403     return FALSE;
3404 nigel 3 if (eptr++ >= md->end_subject) return FALSE;
3405     ecode++;
3406     break;
3407    
3408     case OP_NOT_DIGIT:
3409 nigel 25 if (eptr >= md->end_subject ||
3410     (md->ctypes[*eptr++] & ctype_digit) != 0)
3411 nigel 3 return FALSE;
3412     ecode++;
3413     break;
3414    
3415     case OP_DIGIT:
3416 nigel 25 if (eptr >= md->end_subject ||
3417     (md->ctypes[*eptr++] & ctype_digit) == 0)
3418 nigel 3 return FALSE;
3419     ecode++;
3420     break;
3421    
3422     case OP_NOT_WHITESPACE:
3423 nigel 25 if (eptr >= md->end_subject ||
3424     (md->ctypes[*eptr++] & ctype_space) != 0)
3425 nigel 3 return FALSE;
3426     ecode++;
3427     break;
3428    
3429     case OP_WHITESPACE:
3430 nigel 25 if (eptr >= md->end_subject ||
3431     (md->ctypes[*eptr++] & ctype_space) == 0)
3432 nigel 3 return FALSE;
3433     ecode++;
3434     break;
3435    
3436     case OP_NOT_WORDCHAR:
3437 nigel 25 if (eptr >= md->end_subject ||
3438     (md->ctypes[*eptr++] & ctype_word) != 0)
3439 nigel 3 return FALSE;
3440     ecode++;
3441     break;
3442    
3443     case OP_WORDCHAR:
3444 nigel 25 if (eptr >= md->end_subject ||
3445     (md->ctypes[*eptr++] & ctype_word) == 0)
3446 nigel 3 return FALSE;
3447     ecode++;
3448     break;
3449    
3450     /* Match a back reference, possibly repeatedly. Look past the end of the
3451     item to see if there is repeat information following. The code is similar
3452     to that for character classes, but repeated for efficiency. Then obey
3453     similar code to character type repeats - written out again for speed.
3454     However, if the referenced string is the empty string, always treat
3455     it as matched, any number of times (otherwise there could be infinite
3456     loops). */
3457    
3458     case OP_REF:
3459     {
3460     int length;
3461 nigel 23 int offset = ecode[1] << 1; /* Doubled reference number */
3462 nigel 3 ecode += 2; /* Advance past the item */
3463    
3464 nigel 23 /* If the reference is unset, set the length to be longer than the amount
3465     of subject left; this ensures that every attempt at a match fails. We
3466     can't just fail here, because of the possibility of quantifiers with zero
3467     minima. */
3468 nigel 3
3469 nigel 23 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
3470     md->end_subject - eptr + 1 :
3471     md->offset_vector[offset+1] - md->offset_vector[offset];
3472 nigel 3
3473 nigel 23 /* Set up for repetition, or handle the non-repeated case */
3474    
3475 nigel 3 switch (*ecode)
3476     {
3477     case OP_CRSTAR:
3478     case OP_CRMINSTAR:
3479     case OP_CRPLUS:
3480     case OP_CRMINPLUS:
3481     case OP_CRQUERY:
3482     case OP_CRMINQUERY:
3483     c = *ecode++ - OP_CRSTAR;
3484     minimize = (c & 1) != 0;
3485     min = rep_min[c]; /* Pick up values from tables; */
3486     max = rep_max[c]; /* zero for max => infinity */
3487     if (max == 0) max = INT_MAX;
3488     break;
3489    
3490     case OP_CRRANGE:
3491     case OP_CRMINRANGE:
3492     minimize = (*ecode == OP_CRMINRANGE);
3493     min = (ecode[1] << 8) + ecode[2];
3494     max = (ecode[3] << 8) + ecode[4];
3495     if (max == 0) max = INT_MAX;
3496     ecode += 5;
3497     break;
3498    
3499     default: /* No repeat follows */
3500 nigel 23 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
3501 nigel 3 eptr += length;
3502     continue; /* With the main loop */
3503     }
3504    
3505     /* If the length of the reference is zero, just continue with the
3506     main loop. */
3507    
3508     if (length == 0) continue;
3509    
3510     /* First, ensure the minimum number of matches are present. We get back
3511     the length of the reference string explicitly rather than passing the
3512     address of eptr, so that eptr can be a register variable. */
3513    
3514     for (i = 1; i <= min; i++)
3515     {
3516 nigel 23 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
3517 nigel 3 eptr += length;
3518     }
3519    
3520     /* If min = max, continue at the same level without recursion.
3521     They are not both allowed to be zero. */
3522    
3523     if (min == max) continue;
3524    
3525     /* If minimizing, keep trying and advancing the pointer */
3526    
3527     if (minimize)
3528     {
3529     for (i = min;; i++)
3530     {
3531 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3532     return TRUE;
3533     if (i >= max || !match_ref(offset, eptr, length, md, ims))
3534 nigel 3 return FALSE;
3535     eptr += length;
3536     }
3537     /* Control never gets here */
3538     }
3539    
3540     /* If maximizing, find the longest string and work backwards */
3541    
3542     else
3543     {
3544 nigel 7 const uschar *pp = eptr;
3545 nigel 3 for (i = min; i < max; i++)
3546     {
3547 nigel 23 if (!match_ref(offset, eptr, length, md, ims)) break;
3548 nigel 3 eptr += length;
3549     }
3550     while (eptr >= pp)
3551     {
3552 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3553     return TRUE;
3554 nigel 3 eptr -= length;
3555     }
3556     return FALSE;
3557     }
3558     }
3559     /* Control never gets here */
3560    
3561 nigel 23
3562    
3563 nigel 3 /* Match a character class, possibly repeatedly. Look past the end of the
3564     item to see if there is repeat information following. Then obey similar
3565 nigel 23 code to character type repeats - written out again for speed. */
3566 nigel 3
3567     case OP_CLASS:
3568     {
3569 nigel 7 const uschar *data = ecode + 1; /* Save for matching */
3570     ecode += 33; /* Advance past the item */
3571 nigel 3
3572     switch (*ecode)
3573     {
3574     case OP_CRSTAR:
3575     case OP_CRMINSTAR:
3576     case OP_CRPLUS:
3577     case OP_CRMINPLUS:
3578     case OP_CRQUERY:
3579     case OP_CRMINQUERY:
3580     c = *ecode++ - OP_CRSTAR;
3581     minimize = (c & 1) != 0;
3582     min = rep_min[c]; /* Pick up values from tables; */
3583     max = rep_max[c]; /* zero for max => infinity */
3584     if (max == 0) max = INT_MAX;
3585     break;
3586    
3587     case OP_CRRANGE:
3588     case OP_CRMINRANGE:
3589     minimize = (*ecode == OP_CRMINRANGE);
3590     min = (ecode[1] << 8) + ecode[2];
3591     max = (ecode[3] << 8) + ecode[4];
3592     if (max == 0) max = INT_MAX;
3593     ecode += 5;
3594     break;
3595    
3596     default: /* No repeat follows */
3597 nigel 13 min = max = 1;
3598     break;
3599 nigel 3 }
3600    
3601     /* First, ensure the minimum number of matches are present. */
3602    
3603     for (i = 1; i <= min; i++)
3604     {
3605     if (eptr >= md->end_subject) return FALSE;
3606     c = *eptr++;
3607 nigel 23 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3608 nigel 3 return FALSE;
3609     }
3610    
3611     /* If max == min we can continue with the main loop without the
3612     need to recurse. */
3613    
3614     if (min == max) continue;
3615    
3616     /* If minimizing, keep testing the rest of the expression and advancing
3617     the pointer while it matches the class. */
3618    
3619     if (minimize)
3620     {
3621     for (i = min;; i++)
3622     {
3623 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3624     return TRUE;
3625 nigel 3 if (i >= max || eptr >= md->end_subject) return FALSE;
3626     c = *eptr++;
3627 nigel 23 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3628 nigel 3 return FALSE;
3629     }
3630     /* Control never gets here */
3631     }
3632    
3633     /* If maximizing, find the longest possible run, then work backwards. */
3634    
3635     else
3636     {
3637 nigel 7 const uschar *pp = eptr;
3638 nigel 3 for (i = min; i < max; eptr++, i++)
3639     {
3640     if (eptr >= md->end_subject) break;
3641     c = *eptr;
3642 nigel 23 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3643 nigel 3 break;
3644     }
3645    
3646     while (eptr >= pp)
3647 nigel 23 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3648     return TRUE;
3649 nigel 3 return FALSE;
3650     }
3651     }
3652     /* Control never gets here */
3653    
3654     /* Match a run of characters */
3655    
3656     case OP_CHARS:
3657     {
3658     register int length = ecode[1];
3659     ecode += 2;
3660    
3661 nigel 9 #ifdef DEBUG /* Sigh. Some compilers never learn. */
3662 nigel 3 if (eptr >= md->end_subject)
3663     printf("matching subject <null> against pattern ");
3664     else
3665     {
3666     printf("matching subject ");
3667     pchars(eptr, length, TRUE, md);
3668     printf(" against pattern ");
3669     }
3670     pchars(ecode, length, FALSE, md);
3671     printf("\n");
3672 nigel 9 #endif
3673 nigel 3
3674     if (length > md->end_subject - eptr) return FALSE;
3675 nigel 23 if ((ims & PCRE_CASELESS) != 0)
3676 nigel 3 {
3677 nigel 25 while (length-- > 0)
3678     if (md->lcc[*ecode++] != md->lcc[*eptr++])
3679     return FALSE;
3680 nigel 3 }
3681     else
3682     {
3683     while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
3684     }
3685     }
3686     break;
3687    
3688     /* Match a single character repeatedly; different opcodes share code. */
3689    
3690     case OP_EXACT:
3691     min = max = (ecode[1] << 8) + ecode[2];
3692     ecode += 3;
3693     goto REPEATCHAR;
3694    
3695     case OP_UPTO:
3696     case OP_MINUPTO:
3697     min = 0;
3698     max = (ecode[1] << 8) + ecode[2];
3699     minimize = *ecode == OP_MINUPTO;
3700     ecode += 3;
3701     goto REPEATCHAR;
3702    
3703     case OP_STAR:
3704     case OP_MINSTAR:
3705     case OP_PLUS:
3706     case OP_MINPLUS:
3707     case OP_QUERY:
3708     case OP_MINQUERY:
3709     c = *ecode++ - OP_STAR;
3710     minimize = (c & 1) != 0;
3711     min = rep_min[c]; /* Pick up values from tables; */
3712     max = rep_max[c]; /* zero for max => infinity */
3713     if (max == 0) max = INT_MAX;
3714    
3715     /* Common code for all repeated single-character matches. We can give
3716     up quickly if there are fewer than the minimum number of characters left in
3717     the subject. */
3718    
3719     REPEATCHAR:
3720     if (min > md->end_subject - eptr) return FALSE;
3721     c = *ecode++;
3722    
3723     /* The code is duplicated for the caseless and caseful cases, for speed,
3724     since matching characters is likely to be quite common. First, ensure the
3725     minimum number of matches are present. If min = max, continue at the same
3726     level without recursing. Otherwise, if minimizing, keep trying the rest of
3727     the expression and advancing one matching character if failing, up to the
3728     maximum. Alternatively, if maximizing, find the maximum number of
3729     characters and work backwards. */
3730    
3731 nigel 9 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
3732     max, eptr));
3733 nigel 3
3734 nigel 23 if ((ims & PCRE_CASELESS) != 0)
3735 nigel 3 {
3736 nigel 25 c = md->lcc[c];
3737     for (i = 1; i <= min; i++)
3738     if (c != md->lcc[*eptr++]) return FALSE;
3739 nigel 3 if (min == max) continue;
3740     if (minimize)
3741     {
3742     for (i = min;; i++)
3743     {
3744 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3745     return TRUE;
3746 nigel 25 if (i >= max || eptr >= md->end_subject ||
3747     c != md->lcc[*eptr++])
3748 nigel 3 return FALSE;
3749     }
3750     /* Control never gets here */
3751     }
3752     else
3753     {
3754 nigel 7 const uschar *pp = eptr;
3755 nigel 3 for (i = min; i < max; i++)
3756     {
3757 nigel 25 if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
3758 nigel 3 eptr++;
3759     }
3760     while (eptr >= pp)
3761 nigel 23 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3762     return TRUE;
3763 nigel 3 return FALSE;
3764     }
3765     /* Control never gets here */
3766     }
3767    
3768     /* Caseful comparisons */
3769    
3770     else
3771     {
3772     for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
3773     if (min == max) continue;
3774     if (minimize)
3775     {
3776     for (i = min;; i++)
3777     {
3778 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3779     return TRUE;
3780 nigel 3 if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
3781     }
3782     /* Control never gets here */
3783     }
3784     else
3785     {
3786 nigel 7 const uschar *pp = eptr;
3787 nigel 3 for (i = min; i < max; i++)
3788     {
3789     if (eptr >= md->end_subject || c != *eptr) break;
3790     eptr++;
3791     }
3792     while (eptr >= pp)
3793 nigel 23 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3794     return TRUE;
3795 nigel 3 return FALSE;
3796     }
3797     }
3798     /* Control never gets here */
3799    
3800     /* Match a negated single character */
3801    
3802     case OP_NOT:
3803 nigel 9 if (eptr >= md->end_subject) return FALSE;
3804 nigel 3 ecode++;
3805 nigel 23 if ((ims & PCRE_CASELESS) != 0)
3806 nigel 3 {
3807 nigel 25 if (md->lcc[*ecode++] == md->lcc[*eptr++]) return FALSE;
3808 nigel 3 }
3809     else
3810     {
3811     if (*ecode++ == *eptr++) return FALSE;
3812     }
3813     break;
3814    
3815     /* Match a negated single character repeatedly. This is almost a repeat of
3816     the code for a repeated single character, but I haven't found a nice way of
3817     commoning these up that doesn't require a test of the positive/negative
3818     option for each character match. Maybe that wouldn't add very much to the
3819     time taken, but character matching *is* what this is all about... */
3820    
3821     case OP_NOTEXACT:
3822     min = max = (ecode[1] << 8) + ecode[2];
3823     ecode += 3;
3824     goto REPEATNOTCHAR;
3825    
3826     case OP_NOTUPTO:
3827     case OP_NOTMINUPTO:
3828     min = 0;
3829     max = (ecode[1] << 8) + ecode[2];
3830     minimize = *ecode == OP_NOTMINUPTO;
3831     ecode += 3;
3832     goto REPEATNOTCHAR;
3833    
3834     case OP_NOTSTAR:
3835     case OP_NOTMINSTAR:
3836     case OP_NOTPLUS:
3837     case OP_NOTMINPLUS:
3838     case OP_NOTQUERY:
3839     case OP_NOTMINQUERY:
3840     c = *ecode++ - OP_NOTSTAR;
3841     minimize = (c & 1) != 0;
3842     min = rep_min[c]; /* Pick up values from tables; */
3843     max = rep_max[c]; /* zero for max => infinity */
3844     if (max == 0) max = INT_MAX;
3845    
3846     /* Common code for all repeated negated single-character matches. We can
3847     give up quickly if there are fewer than the minimum number of characters
3848     left in the subject. */
3849    
3850     REPEATNOTCHAR:
3851     if (min > md->end_subject - eptr) return FALSE;
3852     c = *ecode++;
3853    
3854     /* The code is duplicated for the caseless and caseful cases, for speed,
3855     since matching characters is likely to be quite common. First, ensure the
3856     minimum number of matches are present. If min = max, continue at the same
3857     level without recursing. Otherwise, if minimizing, keep trying the rest of
3858     the expression and advancing one matching character if failing, up to the
3859     maximum. Alternatively, if maximizing, find the maximum number of
3860     characters and work backwards. */
3861    
3862 nigel 9 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
3863     max, eptr));
3864 nigel 3
3865 nigel 23 if ((ims & PCRE_CASELESS) != 0)
3866 nigel 3 {
3867 nigel 25 c = md->lcc[c];
3868     for (i = 1; i <= min; i++)
3869     if (c == md->lcc[*eptr++]) return FALSE;
3870 nigel 3 if (min == max) continue;
3871     if (minimize)
3872     {
3873     for (i = min;; i++)
3874     {
3875 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3876     return TRUE;
3877 nigel 25 if (i >= max || eptr >= md->end_subject ||
3878     c == md->lcc[*eptr++])
3879 nigel 3 return FALSE;
3880     }
3881     /* Control never gets here */
3882     }
3883     else
3884     {
3885 nigel 7 const uschar *pp = eptr;
3886 nigel 3 for (i = min; i < max; i++)
3887     {
3888 nigel 25 if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
3889 nigel 3 eptr++;
3890     }
3891     while (eptr >= pp)
3892 nigel 23 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3893     return TRUE;
3894 nigel 3 return FALSE;
3895     }
3896     /* Control never gets here */
3897     }
3898    
3899     /* Caseful comparisons */
3900    
3901     else
3902     {
3903     for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
3904     if (min == max) continue;
3905     if (minimize)
3906     {
3907     for (i = min;; i++)
3908     {
3909 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3910     return TRUE;
3911 nigel 3 if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
3912     }
3913     /* Control never gets here */
3914     }
3915     else
3916     {
3917 nigel 7 const uschar *pp = eptr;
3918 nigel 3 for (i = min; i < max; i++)
3919     {
3920     if (eptr >= md->end_subject || c == *eptr) break;
3921     eptr++;
3922     }
3923     while (eptr >= pp)
3924 nigel 23 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3925     return TRUE;
3926 nigel 3 return FALSE;
3927     }
3928     }
3929     /* Control never gets here */
3930    
3931     /* Match a single character type repeatedly; several different opcodes
3932     share code. This is very similar to the code for single characters, but we
3933     repeat it in the interests of efficiency. */
3934    
3935     case OP_TYPEEXACT:
3936     min = max = (ecode[1] << 8) + ecode[2];
3937     minimize = TRUE;
3938     ecode += 3;
3939     goto REPEATTYPE;
3940    
3941     case OP_TYPEUPTO:
3942     case OP_TYPEMINUPTO:
3943     min = 0;
3944     max = (ecode[1] << 8) + ecode[2];
3945     minimize = *ecode == OP_TYPEMINUPTO;
3946     ecode += 3;
3947     goto REPEATTYPE;
3948    
3949     case OP_TYPESTAR:
3950     case OP_TYPEMINSTAR:
3951     case OP_TYPEPLUS:
3952     case OP_TYPEMINPLUS:
3953     case OP_TYPEQUERY:
3954     case OP_TYPEMINQUERY:
3955     c = *ecode++ - OP_TYPESTAR;
3956     minimize = (c & 1) != 0;
3957     min = rep_min[c]; /* Pick up values from tables; */
3958     max = rep_max[c]; /* zero for max => infinity */
3959     if (max == 0) max = INT_MAX;
3960    
3961     /* Common code for all repeated single character type matches */
3962    
3963     REPEATTYPE:
3964     ctype = *ecode++; /* Code for the character type */
3965    
3966     /* First, ensure the minimum number of matches are present. Use inline
3967     code for maximizing the speed, and do the type test once at the start
3968     (i.e. keep it out of the loop). Also test that there are at least the
3969     minimum number of characters before we start. */
3970    
3971     if (min > md->end_subject - eptr) return FALSE;
3972     if (min > 0) switch(ctype)
3973     {
3974     case OP_ANY:
3975 nigel 23 if ((ims & PCRE_DOTALL) == 0)
3976 nigel 3 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }
3977     else eptr += min;
3978     break;
3979    
3980     case OP_NOT_DIGIT:
3981     for (i = 1; i <= min; i++)
3982 nigel 25 if ((md->ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
3983 nigel 3 break;
3984    
3985     case OP_DIGIT:
3986     for (i = 1; i <= min; i++)
3987 nigel 25 if ((md->ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
3988 nigel 3 break;
3989    
3990     case OP_NOT_WHITESPACE:
3991     for (i = 1; i <= min; i++)
3992 nigel 25 if ((md->ctypes[*eptr++] & ctype_space) != 0) return FALSE;
3993 nigel 3 break;
3994    
3995     case OP_WHITESPACE:
3996     for (i = 1; i <= min; i++)
3997 nigel 25 if ((md->ctypes[*eptr++] & ctype_space) == 0) return FALSE;
3998 nigel 3 break;
3999    
4000     case OP_NOT_WORDCHAR:
4001 nigel 25 for (i = 1; i <= min; i++)
4002     if ((md->ctypes[*eptr++] & ctype_word) != 0)
4003     return FALSE;
4004 nigel 3 break;
4005    
4006     case OP_WORDCHAR:
4007 nigel 25 for (i = 1; i <= min; i++)
4008     if ((md->ctypes[*eptr++] & ctype_word) == 0)
4009     return FALSE;
4010 nigel 3 break;
4011     }
4012    
4013     /* If min = max, continue at the same level without recursing */
4014    
4015     if (min == max) continue;
4016    
4017     /* If minimizing, we have to test the rest of the pattern before each
4018 nigel 25 subsequent match. */
4019 nigel 3
4020     if (minimize)
4021     {
4022     for (i = min;; i++)
4023     {
4024 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb)) return TRUE;
4025 nigel 25 if (i >= max || eptr >= md->end_subject) return FALSE;
4026    
4027     c = *eptr++;
4028     switch(ctype)
4029     {
4030     case OP_ANY:
4031     if ((ims & PCRE_DOTALL) == 0 && c == '\n') return FALSE;
4032     break;
4033    
4034     case OP_NOT_DIGIT:
4035     if ((md->ctypes[c] & ctype_digit) != 0) return FALSE;
4036     break;
4037    
4038     case OP_DIGIT:
4039     if ((md->ctypes[c] & ctype_digit) == 0) return FALSE;
4040     break;
4041    
4042     case OP_NOT_WHITESPACE:
4043     if ((md->ctypes[c] & ctype_space) != 0) return FALSE;
4044     break;
4045    
4046     case OP_WHITESPACE:
4047     if ((md->ctypes[c] & ctype_space) == 0) return FALSE;
4048     break;
4049    
4050     case OP_NOT_WORDCHAR:
4051     if ((md->ctypes[c] & ctype_word) != 0) return FALSE;
4052     break;
4053    
4054     case OP_WORDCHAR:
4055     if ((md->ctypes[c] & ctype_word) == 0) return FALSE;
4056     break;
4057     }
4058 nigel 3 }
4059     /* Control never gets here */
4060     }
4061    
4062     /* If maximizing it is worth using inline code for speed, doing the type
4063     test once at the start (i.e. keep it out of the loop). */
4064    
4065     else
4066     {
4067 nigel 7 const uschar *pp = eptr;
4068 nigel 3 switch(ctype)
4069     {
4070     case OP_ANY:
4071 nigel 23 if ((ims & PCRE_DOTALL) == 0)
4072 nigel 3 {
4073     for (i = min; i < max; i++)
4074     {
4075     if (eptr >= md->end_subject || *eptr == '\n') break;
4076     eptr++;
4077     }
4078     }
4079     else
4080     {
4081     c = max - min;
4082     if (c > md->end_subject - eptr) c = md->end_subject - eptr;
4083     eptr += c;
4084     }
4085     break;
4086    
4087     case OP_NOT_DIGIT:
4088     for (i = min; i < max; i++)
4089     {
4090 nigel 25 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4091 nigel 3 break;
4092     eptr++;
4093     }
4094     break;
4095    
4096     case OP_DIGIT:
4097     for (i = min; i < max; i++)
4098     {
4099 nigel 25 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4100 nigel 3 break;
4101     eptr++;
4102     }
4103     break;
4104    
4105     case OP_NOT_WHITESPACE:
4106     for (i = min; i < max; i++)
4107     {
4108 nigel 25 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4109 nigel 3 break;
4110     eptr++;
4111     }
4112     break;
4113    
4114     case OP_WHITESPACE:
4115     for (i = min; i < max; i++)
4116     {
4117 nigel 25 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4118 nigel 3 break;
4119     eptr++;
4120     }
4121     break;
4122    
4123     case OP_NOT_WORDCHAR:
4124     for (i = min; i < max; i++)
4125     {
4126 nigel 25 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4127 nigel 3 break;
4128     eptr++;
4129     }
4130     break;
4131    
4132     case OP_WORDCHAR:
4133     for (i = min; i < max; i++)
4134     {
4135 nigel 25 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4136 nigel 3 break;
4137     eptr++;
4138     }
4139     break;
4140     }
4141    
4142     while (eptr >= pp)
4143 nigel 23 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
4144     return TRUE;
4145 nigel 3 return FALSE;
4146     }
4147     /* Control never gets here */
4148    
4149     /* There's been some horrible disaster. */
4150    
4151     default:
4152 nigel 9 DPRINTF(("Unknown opcode %d\n", *ecode));
4153 nigel 3 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
4154     return FALSE;
4155     }
4156    
4157     /* Do not stick any code in here without much thought; it is assumed
4158     that "continue" in the code above comes out to here to repeat the main
4159     loop. */
4160    
4161     } /* End of main loop */
4162     /* Control never reaches here */
4163     }
4164    
4165    
4166    
4167 nigel 9
4168     /*************************************************
4169 nigel 3 * Execute a Regular Expression *
4170     *************************************************/
4171<