/[pcre]/code/trunk/pcre.c
ViewVC logotype

Contents of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 37 - (hide annotations) (download)
Sat Feb 24 21:39:09 2007 UTC (7 years, 6 months ago) by nigel
File MIME type: text/plain
File size: 131874 byte(s)
Load pcre-2.07 into code/trunk.

1 nigel 3 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /*
6     This is a library of functions to support regular expressions whose syntax
7     and semantics are as close as possible to those of the Perl 5 language. See
8     the file Tech.Notes for some information on the internals.
9    
10     Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12 nigel 27 Copyright (c) 1997-1999 University of Cambridge
13 nigel 3
14     -----------------------------------------------------------------------------
15     Permission is granted to anyone to use this software for any purpose on any
16     computer system, and to redistribute it freely, subject to the following
17     restrictions:
18    
19     1. This software is distributed in the hope that it will be useful,
20     but WITHOUT ANY WARRANTY; without even the implied warranty of
21     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22    
23     2. The origin of this software must not be misrepresented, either by
24     explicit claim or by omission.
25    
26     3. Altered versions must be plainly marked as such, and must not be
27     misrepresented as being the original software.
28 nigel 29
29     4. If PCRE is embedded in any software that is released under the GNU
30     General Purpose Licence (GPL), then the terms of that licence shall
31     supersede any condition above with which it is incompatible.
32 nigel 3 -----------------------------------------------------------------------------
33     */
34    
35    
36     /* Define DEBUG to get debugging output on stdout. */
37    
38     /* #define DEBUG */
39    
40 nigel 23 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
41     inline, and there are *still* stupid compilers about that don't like indented
42     pre-processor statements. I suppose it's only been 10 years... */
43 nigel 3
44 nigel 9 #ifdef DEBUG
/* The macro pastes its single argument straight after "printf", so callers
must pass the whole printf argument list wrapped in its own parentheses, e.g.
DPRINTF(("count=%d\n", n)); */
45     #define DPRINTF(p) printf p
46     #else
/* Without DEBUG the call expands to nothing, so its arguments are not
evaluated. */
47     #define DPRINTF(p) /*nothing*/
48     #endif
49    
50 nigel 3 /* Include the internals header, which itself includes Standard C headers plus
51     the external pcre header. */
52    
53     #include "internal.h"
54    
55    
56 nigel 15 /* Allow compilation as C++ source code, should anybody want to do that. */
57    
58     #ifdef __cplusplus
/* This file uses "class" as an ordinary variable name (the bit map built
while compiling a character class). That is a reserved word in C++, so it is
renamed for C++ builds. */
59     #define class pcre_class
60     #endif
61    
62    
63 nigel 23 /* Number of items on the nested bracket stacks at compile time. This should
64     not be set greater than 200. */
65    
66     #define BRASTACK_SIZE 200
67    
68    
69 nigel 3 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
70    
/* Six entries each; presumably one per simple repeat in the order
*, *?, +, +?, ?, ?? (the same order as the repeat names in OP_names) - TODO
confirm against the OP_ definitions. In rep_max a value of 0 stands for "no
upper limit", as stated above. */
71 nigel 15 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
72     static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
73 nigel 3
74 nigel 13 /* Text forms of OP_ values and things, for debugging (not all used) */
75 nigel 3
76     #ifdef DEBUG
/* Printable names for opcodes, compiled only for debugging builds.
NOTE(review): presumably indexed by opcode value, in which case the order of
the strings must track the OP_ definitions exactly - confirm against
internal.h before adding or reordering entries. */
77 nigel 7 static const char *OP_names[] = {
78     "End", "\\A", "\\B", "\\b", "\\D", "\\d",
79 nigel 23 "\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",
80     "Opt", "^", "$", "Any", "chars", "not",
81 nigel 3 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
82     "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
83     "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
84     "*", "*?", "+", "+?", "?", "??", "{", "{",
85 nigel 23 "class", "Ref",
86     "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
87     "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
88 nigel 3 "Brazero", "Braminzero", "Bra"
89     };
90     #endif
91    
92     /* Table for handling escaped characters in the range '0'-'z'. Positive returns
93     are simple data values; negative values are for special things like \d and so
94     on. Zero means further processing is needed (for things like \x), or the escape
95     is invalid. */
96    
/* The table is indexed by (character - '0'); check_escape() only consults it
for characters in the range '0' to 'z'. The bare numbers in the "` - g" row
are character codes: 7 for \a and 27 for \e. */
97 nigel 15 static const short int escapes[] = {
98 nigel 3 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
99     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
100     '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
101     0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
102     0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
103     0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
104     '`', 7, -ESC_b, 0, -ESC_d, 27, '\f', 0, /* ` - g */
105     0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
106     0, 0, '\r', -ESC_s, '\t', 0, 0, -ESC_w, /* p - w */
107 nigel 23 0, 0, -ESC_z /* x - z */
108 nigel 3 };
109    
110     /* Definition to allow mutual recursion */
111    
112 nigel 13 static BOOL
113 nigel 23 compile_regex(int, int, int *, uschar **, const uschar **, const char **,
114 nigel 37 BOOL, int, int *, int *, compile_data *);
115 nigel 3
116    
117    
118     /*************************************************
119     * Global variables *
120     *************************************************/
121    
122     /* PCRE is thread-clean and doesn't use any global variables in the normal
123     sense. However, it calls memory allocation and free functions via the two
124     indirections below, which can be changed by the caller, but are shared
125     between all threads. */
126    
/* Both pointers start out as the C library's malloc() and free(). */
127     void *(*pcre_malloc)(size_t) = malloc;
128     void (*pcre_free)(void *) = free;
129    
130    
131    
132    
133     /*************************************************
134 nigel 25 * Default character tables *
135     *************************************************/
136    
137     /* A default set of character tables is included in the PCRE binary. Its source
138     is built by the maketables auxiliary program, which uses the default C ctypes
139     functions, and put in the file chartables.c. These tables are used by PCRE
140     whenever the caller of pcre_compile() does not provide an alternate set of
141     tables. */
142    
143     #include "chartables.c"
144    
145    
146    
147     /*************************************************
148 nigel 3 * Return version string *
149     *************************************************/
150    
151 nigel 7 const char *
152 nigel 3 pcre_version(void)
153     {
154     return PCRE_VERSION;
155     }
156    
157    
158    
159    
160     /*************************************************
161     * Return info about a compiled pattern *
162     *************************************************/
163    
164     /* This function picks potentially useful data out of the private
165 nigel 37 structure. The public options are passed back in an int - though the
166     re->options field has been expanded to a long int, all the public options
167     are at the low end of it, and so even on 16-bit systems this will still be OK.
168     Therefore, I haven't changed the API for pcre_info().
169 nigel 3
170     Arguments:
171     external_re points to compiled code
172     optptr where to pass back the options
173     first_char where to pass back the first character,
174     or -1 if multiline and all branches start ^,
175     or -2 otherwise
176    
177     Returns: number of identifying extraction brackets
178     or negative values on error
179     */
180    
181     int
182     pcre_info(const pcre *external_re, int *optptr, int *first_char)
183     {
184 nigel 7 const real_pcre *re = (const real_pcre *)external_re;
185 nigel 3 if (re == NULL) return PCRE_ERROR_NULL;
186     if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
187 nigel 37 if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
188 nigel 3 if (first_char != NULL)
189     *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
190     ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
191     return re->top_bracket;
192     }
193    
194    
195    
196    
197     #ifdef DEBUG
198     /*************************************************
199     * Debugging function to print chars *
200     *************************************************/
201    
202     /* Print a sequence of chars in printable format, stopping at the end of the
203     subject if requested.
204    
205     Arguments:
206     p points to characters
207     length number to print
208     is_subject TRUE if printing from within md->start_subject
209     md pointer to matching data block, if is_subject is TRUE
210    
211     Returns: nothing
212     */
213    
214 nigel 9 static void
215     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
216 nigel 3 {
217     int c;
218     if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
219     while (length-- > 0)
220     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
221     }
222     #endif
223    
224    
225    
226    
227     /*************************************************
228     * Handle escapes *
229     *************************************************/
230    
231     /* This function is called when a \ has been encountered. It either returns a
232     positive value for a simple escape such as \n, or a negative value which
233     encodes one of the more complicated things such as \d. On entry, ptr is
234     pointing at the \. On exit, it is on the final character of the escape
235     sequence.
236    
237     Arguments:
238     ptrptr points to the pattern position pointer
239     errorptr points to the pointer to the error message
240     bracount number of previous extracting brackets
241     options the options bits
242     isclass TRUE if inside a character class
243 nigel 25 cd pointer to char tables block
244 nigel 3
245     Returns: zero or positive => a data character
246     negative => a special escape sequence
247     on error, errorptr is set
248     */
249    
250     static int
251 nigel 7 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
252 nigel 25 int options, BOOL isclass, compile_data *cd)
253 nigel 3 {
254 nigel 7 const uschar *ptr = *ptrptr;
255 nigel 3 int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
256     int i;
257    
258     if (c == 0) *errorptr = ERR1;
259    
260     /* Digits or letters may have special meaning; all others are literals. */
261    
262     else if (c < '0' || c > 'z') {}
263    
264     /* Do an initial lookup in a table. A non-zero result is something that can be
265     returned immediately. Otherwise further processing may be required. */
266    
267     else if ((i = escapes[c - '0']) != 0) c = i;
268    
269     /* Escapes that need further processing, or are illegal. */
270    
271     else
272     {
273 nigel 7 const uschar *oldptr;
274 nigel 3 switch (c)
275     {
276     /* The handling of escape sequences consisting of a string of digits
277     starting with one that is not zero is not straightforward. By experiment,
278     the way Perl works seems to be as follows:
279    
280     Outside a character class, the digits are read as a decimal number. If the
281     number is less than 10, or if there are that many previous extracting
282     left brackets, then it is a back reference. Otherwise, up to three octal
283     digits are read to form an escaped byte. Thus \123 is likely to be octal
284     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
285     value is greater than 377, the least significant 8 bits are taken. Inside a
286     character class, \ followed by a digit is always an octal number. */
287    
288     case '1': case '2': case '3': case '4': case '5':
289     case '6': case '7': case '8': case '9':
290    
291     if (!isclass)
292     {
293     oldptr = ptr;
294     c -= '0';
295 nigel 25 while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
296 nigel 3 c = c * 10 + *(++ptr) - '0';
297     if (c < 10 || c <= bracount)
298     {
299     c = -(ESC_REF + c);
300     break;
301     }
302     ptr = oldptr; /* Put the pointer back and fall through */
303     }
304    
305     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
306     generates a binary zero byte and treats the digit as a following literal.
307     Thus we have to pull back the pointer by one. */
308    
309     if ((c = *ptr) >= '8')
310     {
311     ptr--;
312     c = 0;
313     break;
314     }
315    
316     /* \0 always starts an octal number, but we may drop through to here with a
317     larger first octal digit */
318    
319     case '0':
320     c -= '0';
321 nigel 25 while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
322 nigel 3 ptr[1] != '8' && ptr[1] != '9')
323     c = c * 8 + *(++ptr) - '0';
324     break;
325    
326     /* Special escapes not starting with a digit are straightforward */
327    
328     case 'x':
329     c = 0;
330 nigel 25 while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
331 nigel 3 {
332     ptr++;
333 nigel 25 c = c * 16 + cd->lcc[*ptr] -
334     (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
335 nigel 3 }
336     break;
337    
338     case 'c':
339     c = *(++ptr);
340     if (c == 0)
341     {
342     *errorptr = ERR2;
343     return 0;
344     }
345    
346     /* A letter is upper-cased; then the 0x40 bit is flipped */
347    
348 nigel 25 if (c >= 'a' && c <= 'z') c = cd->fcc[c];
349 nigel 3 c ^= 0x40;
350     break;
351    
352     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
353     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
354 nigel 25 for Perl compatibility, it is a literal. This code looks a bit odd, but
355     there used to be some cases other than the default, and there may be again
356     in future, so I haven't "optimized" it. */
357 nigel 3
358     default:
359     if ((options & PCRE_EXTRA) != 0) switch(c)
360     {
361     default:
362     *errorptr = ERR3;
363     break;
364     }
365     break;
366     }
367     }
368    
369     *ptrptr = ptr;
370     return c;
371     }
372    
373    
374    
375     /*************************************************
376     * Check for counted repeat *
377     *************************************************/
378    
379     /* This function is called when a '{' is encountered in a place where it might
380     start a quantifier. It looks ahead to see if it really is a quantifier or not.
381     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
382     where the ddds are digits.
383    
384     Arguments:
385     p pointer to the first char after '{'
386 nigel 25 cd pointer to char tables block
387 nigel 3
388     Returns: TRUE or FALSE
389     */
390    
391     static BOOL
392 nigel 25 is_counted_repeat(const uschar *p, compile_data *cd)
393 nigel 3 {
394 nigel 25 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
395     while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
396 nigel 3 if (*p == '}') return TRUE;
397    
398     if (*p++ != ',') return FALSE;
399     if (*p == '}') return TRUE;
400    
401 nigel 25 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
402     while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
403 nigel 3 return (*p == '}');
404     }
405    
406    
407    
408     /*************************************************
409     * Read repeat counts *
410     *************************************************/
411    
412     /* Read an item of the form {n,m} and return the values. This is called only
413     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
414     so the syntax is guaranteed to be correct, but we need to check the values.
415    
416     Arguments:
417     p pointer to first char after '{'
418     minp pointer to int for min
419     maxp pointer to int for max
420     returned as -1 if no max
421     errorptr points to pointer to error message
422 nigel 25 cd pointer to character tables block
423 nigel 3
424     Returns: pointer to '}' on success;
425     current ptr on error, with errorptr set
426     */
427    
428 nigel 7 static const uschar *
429 nigel 25 read_repeat_counts(const uschar *p, int *minp, int *maxp,
430     const char **errorptr, compile_data *cd)
431 nigel 3 {
432     int min = 0;
433     int max = -1;
434    
435 nigel 25 while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
436 nigel 3
437     if (*p == '}') max = min; else
438     {
439     if (*(++p) != '}')
440     {
441     max = 0;
442 nigel 25 while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
443 nigel 3 if (max < min)
444     {
445     *errorptr = ERR4;
446     return p;
447     }
448     }
449     }
450    
451     /* Do paranoid checks, then fill in the required variables, and pass back the
452     pointer to the terminating '}'. */
453    
454     if (min > 65535 || max > 65535)
455     *errorptr = ERR5;
456     else
457     {
458     *minp = min;
459     *maxp = max;
460     }
461     return p;
462     }
463    
464    
465    
466     /*************************************************
467 nigel 23 * Find the fixed length of a pattern *
468     *************************************************/
469    
470     /* Scan a pattern and compute the fixed length of subject that will match it,
471     if the length is fixed. This is needed for dealing with backward assertions.
472    
473     Arguments:
474     code points to the start of the pattern (the bracket)
475    
476     Returns: the fixed length, or -1 if there is no fixed length
477     */
478    
479     static int
480     find_fixedlength(uschar *code)
481     {
/* length is the fixed length agreed by the branches finished so far; it
stays at -1 until the first branch has been measured. */
482     int length = -1;
483    
/* branchlength accumulates the length of the branch being scanned. The scan
starts 3 bytes past the opening item, which the code below treats as an
opcode byte followed by a 2-byte, most-significant-first link. */
484     register int branchlength = 0;
485     register uschar *cc = code + 3;
486    
487     /* Scan along the opcodes for this branch. If we get to the end of the
488     branch, check the length against that of the other branches. */
489    
490     for (;;)
491     {
492     int d;
493     register int op = *cc;
/* Every opcode numerically at or above OP_BRA is handled as OP_BRA.
NOTE(review): presumably extracting brackets are encoded as OP_BRA plus the
bracket number - confirm against internal.h. */
494     if (op >= OP_BRA) op = OP_BRA;
495    
496     switch (op)
497     {
/* A nested group: measure it recursively and give up if it has no fixed
length. Then follow the links through any alternatives to the item that ends
the group, and step over that item's 3 bytes. */
498     case OP_BRA:
499     case OP_ONCE:
500     case OP_COND:
501     d = find_fixedlength(cc);
502     if (d < 0) return -1;
503     branchlength += d;
504     do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
505     cc += 3;
506     break;
507    
508     /* Reached end of a branch; if it's a ket it is the end of a nested
509     call. If it's ALT it is an alternation in a nested call. If it is
510     END it's the end of the outer call. All can be handled by the same code. */
511    
512     case OP_ALT:
513     case OP_KET:
514     case OP_KETRMAX:
515     case OP_KETRMIN:
516     case OP_END:
/* The first branch sets the length; every later branch must agree with it.
Only OP_ALT starts another branch - anything else ends the scan. */
517     if (length < 0) length = branchlength;
518     else if (length != branchlength) return -1;
519     if (*cc != OP_ALT) return length;
520     cc += 3;
521     branchlength = 0;
522     break;
523    
524     /* Skip over assertive subpatterns */
525    
/* The whole assertion is stepped over without adding anything to the
length. */
526     case OP_ASSERT:
527     case OP_ASSERT_NOT:
528     case OP_ASSERTBACK:
529     case OP_ASSERTBACK_NOT:
530     do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
531     cc += 3;
532     break;
533    
534     /* Skip over things that don't match chars */
535    
/* The three groups below cascade: OP_REVERSE is skipped as 3 bytes, OP_CREF
and OP_OPT as 2 bytes, and the remaining items as 1 byte. */
536     case OP_REVERSE:
537     cc++;
538 nigel 37 /* Fall through */
539 nigel 23
540     case OP_CREF:
541     case OP_OPT:
542     cc++;
543     /* Fall through */
544    
545     case OP_SOD:
546     case OP_EOD:
547     case OP_EODN:
548     case OP_CIRC:
549     case OP_DOLL:
550     case OP_NOT_WORD_BOUNDARY:
551     case OP_WORD_BOUNDARY:
552     cc++;
553     break;
554    
555     /* Handle char strings */
556    
/* The byte after the opcode is the character count: add it to the length,
then step over the count byte and the characters themselves. */
557     case OP_CHARS:
558     branchlength += *(++cc);
559     cc += *cc + 1;
560     break;
561    
562     /* Handle exact repetitions */
563    
/* A 2-byte count follows the opcode; the item occupies 4 bytes in all. */
564     case OP_EXACT:
565     case OP_TYPEEXACT:
566     branchlength += (cc[1] << 8) + cc[2];
567     cc += 4;
568     break;
569    
570     /* Handle single-char matchers */
571    
572     case OP_NOT_DIGIT:
573     case OP_DIGIT:
574     case OP_NOT_WHITESPACE:
575     case OP_WHITESPACE:
576     case OP_NOT_WORDCHAR:
577     case OP_WORDCHAR:
578     case OP_ANY:
579     branchlength++;
580     cc++;
581     break;
582    
583    
584     /* Check a class for variable quantification */
585    
586     case OP_CLASS:
/* NOTE(review): in this case *cc is OP_CLASS, so the OP_REF test below cannot
succeed and the step is always 33 (the opcode plus the 32-byte bit map).
OP_REF itself has no case in the outer switch and so ends up in the final
default. */
587     cc += (*cc == OP_REF)? 2 : 33;
588    
589     switch (*cc)
590     {
591     case OP_CRSTAR:
592     case OP_CRMINSTAR:
593     case OP_CRQUERY:
594     case OP_CRMINQUERY:
595     return -1;
596    
/* A counted class repeat has a fixed length only when its two 2-byte counts
are equal. */
597     case OP_CRRANGE:
598     case OP_CRMINRANGE:
599     if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;
600     branchlength += (cc[1] << 8) + cc[2];
601     cc += 5;
602     break;
603    
/* Whatever else follows the class is not consumed here: the class counts as
one character and the following opcode is examined by the next turn of the
main loop. A repeat opcode not listed above therefore reaches the final
default and gives -1. */
604     default:
605     branchlength++;
606     }
607     break;
608    
609     /* Anything else is variable length */
610    
611     default:
612     return -1;
613     }
614     }
615     /* Control never gets here */
616     }
617    
618    
619    
620    
621     /*************************************************
622 nigel 3 * Compile one branch *
623     *************************************************/
624    
625     /* Scan the pattern, compiling it into the code vector.
626    
627     Arguments:
628 nigel 25 options the option bits
629     brackets points to number of brackets used
630     code points to the pointer to the current code point
631     ptrptr points to the current pattern pointer
632     errorptr points to pointer to error message
633     optchanged set to the value of the last OP_OPT item compiled
634 nigel 37 reqchar set to the last literal character required, else -1
635     countlits set to count of mandatory literal characters
636 nigel 25 cd contains pointers to tables
637 nigel 3
638 nigel 25 Returns: TRUE on success
639     FALSE, with *errorptr set on error
640 nigel 3 */
641    
642     static BOOL
643 nigel 7 compile_branch(int options, int *brackets, uschar **codeptr,
644 nigel 25 const uschar **ptrptr, const char **errorptr, int *optchanged,
645 nigel 37 int *reqchar, int *countlits, compile_data *cd)
646 nigel 3 {
647     int repeat_type, op_type;
648     int repeat_min, repeat_max;
649     int bravalue, length;
650 nigel 19 int greedy_default, greedy_non_default;
651 nigel 37 int prevreqchar;
652     int condcount = 0;
653     int subcountlits = 0;
654 nigel 3 register int c;
655     register uschar *code = *codeptr;
656 nigel 23 uschar *tempcode;
657 nigel 7 const uschar *ptr = *ptrptr;
658 nigel 23 const uschar *tempptr;
659 nigel 3 uschar *previous = NULL;
660     uschar class[32];
661    
662 nigel 19 /* Set up the default and non-default settings for greediness */
663    
664     greedy_default = ((options & PCRE_UNGREEDY) != 0);
665     greedy_non_default = greedy_default ^ 1;
666    
667 nigel 37 /* Initialize no required char, and count of literals */
668    
669     *reqchar = prevreqchar = -1;
670     *countlits = 0;
671    
672 nigel 3 /* Switch on next character until the end of the branch */
673    
674     for (;; ptr++)
675     {
676     BOOL negate_class;
677 nigel 23 int class_charcount;
678     int class_lastchar;
679     int newoptions;
680     int condref;
681 nigel 37 int subreqchar;
682 nigel 3
683     c = *ptr;
684     if ((options & PCRE_EXTENDED) != 0)
685     {
686 nigel 25 if ((cd->ctypes[c] & ctype_space) != 0) continue;
687 nigel 3 if (c == '#')
688     {
689     while ((c = *(++ptr)) != 0 && c != '\n');
690     continue;
691     }
692     }
693    
694     switch(c)
695     {
696     /* The branch terminates at end of string, |, or ). */
697    
698     case 0:
699     case '|':
700     case ')':
701     *codeptr = code;
702     *ptrptr = ptr;
703     return TRUE;
704    
705     /* Handle single-character metacharacters */
706    
707     case '^':
708     previous = NULL;
709     *code++ = OP_CIRC;
710     break;
711    
712     case '$':
713     previous = NULL;
714     *code++ = OP_DOLL;
715     break;
716    
717     case '.':
718     previous = code;
719     *code++ = OP_ANY;
720     break;
721    
722     /* Character classes. These always build a 32-byte bitmap of the permitted
723     characters, except in the special case where there is only one character.
724     For negated classes, we build the map as usual, then invert it at the end.
725     */
726    
727     case '[':
728     previous = code;
729 nigel 23 *code++ = OP_CLASS;
730 nigel 3
731 nigel 23 /* If the first character is '^', set the negation flag and skip it. */
732 nigel 3
733     if ((c = *(++ptr)) == '^')
734     {
735     negate_class = TRUE;
736     c = *(++ptr);
737     }
738 nigel 23 else negate_class = FALSE;
739 nigel 3
740     /* Keep a count of chars so that we can optimize the case of just a single
741     character. */
742    
743     class_charcount = 0;
744     class_lastchar = -1;
745    
746     /* Initialize the 32-char bit map to all zeros. We have to build the
747     map in a temporary bit of store, in case the class contains only 1
748     character, because in that case the compiled code doesn't use the
749     bit map. */
750    
751     memset(class, 0, 32 * sizeof(uschar));
752    
753     /* Process characters until ] is reached. By writing this as a "do" it
754     means that an initial ] is taken as a data character. */
755    
756     do
757     {
758     if (c == 0)
759     {
760     *errorptr = ERR6;
761     goto FAILED;
762     }
763    
764     /* Backslash may introduce a single character, or it may introduce one
765     of the specials, which just set a flag. Escaped items are checked for
766     validity in the pre-compiling pass. The sequence \b is a special case.
767 nigel 7 Inside a class (and only there) it is treated as backspace. Elsewhere
768 nigel 3 it marks a word boundary. Other escapes have preset maps ready to
769     or into the one we are building. We assume they have more than one
770     character in them, so set class_count bigger than one. */
771    
772     if (c == '\\')
773     {
774 nigel 25 c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
775 nigel 3 if (-c == ESC_b) c = '\b';
776     else if (c < 0)
777     {
778 nigel 25 register const uschar *cbits = cd->cbits;
779 nigel 3 class_charcount = 10;
780     switch (-c)
781     {
782     case ESC_d:
783 nigel 25 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
784 nigel 3 continue;
785    
786     case ESC_D:
787 nigel 25 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
788 nigel 3 continue;
789    
790     case ESC_w:
791     for (c = 0; c < 32; c++)
792 nigel 25 class[c] |= (cbits[c+cbit_digit] | cbits[c+cbit_word]);
793 nigel 3 continue;
794    
795     case ESC_W:
796     for (c = 0; c < 32; c++)
797 nigel 25 class[c] |= ~(cbits[c+cbit_digit] | cbits[c+cbit_word]);
798 nigel 3 continue;
799    
800     case ESC_s:
801 nigel 25 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
802 nigel 3 continue;
803    
804     case ESC_S:
805 nigel 25 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
806 nigel 3 continue;
807    
808     default:
809     *errorptr = ERR7;
810     goto FAILED;
811     }
812     }
813     /* Fall through if single character */
814     }
815    
816     /* A single character may be followed by '-' to form a range. However,
817     Perl does not permit ']' to be the end of the range. A '-' character
818     here is treated as a literal. */
819    
820     if (ptr[1] == '-' && ptr[2] != ']')
821     {
822     int d;
823     ptr += 2;
824     d = *ptr;
825    
826     if (d == 0)
827     {
828     *errorptr = ERR6;
829     goto FAILED;
830     }
831    
832     /* The second part of a range can be a single-character escape, but
833     not any of the other escapes. */
834    
835     if (d == '\\')
836     {
837 nigel 25 d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
838 nigel 3 if (d < 0)
839     {
840     if (d == -ESC_b) d = '\b'; else
841     {
842     *errorptr = ERR7;
843     goto FAILED;
844     }
845     }
846     }
847    
848     if (d < c)
849     {
850     *errorptr = ERR8;
851     goto FAILED;
852     }
853    
854     for (; c <= d; c++)
855     {
856     class[c/8] |= (1 << (c&7));
857     if ((options & PCRE_CASELESS) != 0)
858     {
859 nigel 25 int uc = cd->fcc[c]; /* flip case */
860 nigel 3 class[uc/8] |= (1 << (uc&7));
861     }
862     class_charcount++; /* in case a one-char range */
863     class_lastchar = c;
864     }
865     continue; /* Go get the next char in the class */
866     }
867    
868     /* Handle a lone single character - we can get here for a normal
869     non-escape char, or after \ that introduces a single character. */
870    
871     class [c/8] |= (1 << (c&7));
872     if ((options & PCRE_CASELESS) != 0)
873     {
874 nigel 25 c = cd->fcc[c]; /* flip case */
875 nigel 3 class[c/8] |= (1 << (c&7));
876     }
877     class_charcount++;
878     class_lastchar = c;
879     }
880    
881     /* Loop until ']' reached; the check for end of string happens inside the
882     loop. This "while" is the end of the "do" above. */
883    
884     while ((c = *(++ptr)) != ']');
885    
886     /* If class_charcount is 1 and class_lastchar is not negative, we saw
887     precisely one character. This doesn't need the whole 32-byte bit map.
888     We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
889     it's negative. */
890    
891     if (class_charcount == 1 && class_lastchar >= 0)
892     {
893     if (negate_class)
894     {
895     code[-1] = OP_NOT;
896     }
897     else
898     {
899     code[-1] = OP_CHARS;
900     *code++ = 1;
901     }
902     *code++ = class_lastchar;
903     }
904    
905     /* Otherwise, negate the 32-byte map if necessary, and copy it into
906     the code vector. */
907    
908     else
909     {
910     if (negate_class)
911     for (c = 0; c < 32; c++) code[c] = ~class[c];
912     else
913     memcpy(code, class, 32);
914     code += 32;
915     }
916     break;
917    
918     /* Various kinds of repeat */
919    
920     case '{':
921 nigel 25 if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
922     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
923 nigel 3 if (*errorptr != NULL) goto FAILED;
924     goto REPEAT;
925    
926     case '*':
927     repeat_min = 0;
928     repeat_max = -1;
929     goto REPEAT;
930    
931     case '+':
932     repeat_min = 1;
933     repeat_max = -1;
934     goto REPEAT;
935    
936     case '?':
937     repeat_min = 0;
938     repeat_max = 1;
939    
940     REPEAT:
941     if (previous == NULL)
942     {
943     *errorptr = ERR9;
944     goto FAILED;
945     }
946    
947 nigel 19 /* If the next character is '?' this is a minimizing repeat, by default,
948     but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
949 nigel 3 next character. */
950    
951 nigel 19 if (ptr[1] == '?')
952     { repeat_type = greedy_non_default; ptr++; }
953     else repeat_type = greedy_default;
954 nigel 3
955     /* If previous was a string of characters, chop off the last one and use it
956     as the subject of the repeat. If there was only one character, we can
957 nigel 37 abolish the previous item altogether. A repeat with a zero minimum wipes
958     out any reqchar setting, backing up to the previous value. We must also
959     adjust the countlits value. */
960 nigel 3
961 nigel 37 if (*previous == OP_CHARS)
962 nigel 3 {
963     int len = previous[1];
964 nigel 37
965     if (repeat_min == 0) *reqchar = prevreqchar;
966     *countlits += repeat_min - 1;
967    
968 nigel 3 if (len == 1)
969     {
970     c = previous[2];
971     code = previous;
972     }
973     else
974     {
975     c = previous[len+1];
976     previous[1]--;
977     code--;
978     }
979     op_type = 0; /* Use single-char op codes */
980     goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
981     }
982    
983     /* If previous was a single negated character ([^a] or similar), we use
984     one of the special opcodes, replacing it. The code is shared with single-
985     character repeats by adding a suitable offset into repeat_type. */
986    
987     else if ((int)*previous == OP_NOT)
988     {
989     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
990     c = previous[1];
991     code = previous;
992     goto OUTPUT_SINGLE_REPEAT;
993     }
994    
995     /* If previous was a character type match (\d or similar), abolish it and
996     create a suitable repeat item. The code is shared with single-character
997     repeats by adding a suitable offset into repeat_type. */
998    
999 nigel 23 else if ((int)*previous < OP_EODN || *previous == OP_ANY)
1000 nigel 3 {
1001     op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
1002     c = *previous;
1003     code = previous;
1004    
1005     OUTPUT_SINGLE_REPEAT:
1006    
1007 nigel 37 /* If the maximum is zero then the minimum must also be zero; Perl allows
1008     this case, so we do too - by simply omitting the item altogether. */
1009    
1010     if (repeat_max == 0) goto END_REPEAT;
1011    
1012     /* Combine the op_type with the repeat_type */
1013    
1014     repeat_type += op_type;
1015    
1016 nigel 3 /* A minimum of zero is handled either as the special case * or ?, or as
1017     an UPTO, with the maximum given. */
1018    
1019     if (repeat_min == 0)
1020     {
1021     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1022     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1023     else
1024     {
1025     *code++ = OP_UPTO + repeat_type;
1026     *code++ = repeat_max >> 8;
1027     *code++ = (repeat_max & 255);
1028     }
1029     }
1030    
1031     /* The case {1,} is handled as the special case + */
1032    
1033     else if (repeat_min == 1 && repeat_max == -1)
1034     *code++ = OP_PLUS + repeat_type;
1035    
1036     /* The case {n,n} is just an EXACT, while the general case {n,m} is
1037     handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1038    
1039     else
1040     {
1041     if (repeat_min != 1)
1042     {
1043     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1044     *code++ = repeat_min >> 8;
1045     *code++ = (repeat_min & 255);
1046     }
1047    
1048     /* If the minimum is 1 and the previous item was a character string,
1049     we either have to put back the item that got cancelled if the string
1050     length was 1, or add the character back onto the end of a longer
1051 nigel 21 string. For a character type nothing need be done; it will just get
1052     put back naturally. Note that the final character is always going to
1053     get added below. */
1054 nigel 3
1055     else if (*previous == OP_CHARS)
1056     {
1057     if (code == previous) code += 2; else previous[1]++;
1058     }
1059    
1060 nigel 21 /* For a single negated character we also have to put back the
1061     item that got cancelled. */
1062    
1063     else if (*previous == OP_NOT) code++;
1064    
1065 nigel 9 /* If the maximum is unlimited, insert an OP_STAR. */
1066 nigel 3
1067 nigel 9 if (repeat_max < 0)
1068 nigel 3 {
1069     *code++ = c;
1070 nigel 9 *code++ = OP_STAR + repeat_type;
1071     }
1072    
1073     /* Else insert an UPTO if the max is greater than the min. */
1074    
1075     else if (repeat_max != repeat_min)
1076     {
1077     *code++ = c;
1078 nigel 3 repeat_max -= repeat_min;
1079     *code++ = OP_UPTO + repeat_type;
1080     *code++ = repeat_max >> 8;
1081     *code++ = (repeat_max & 255);
1082     }
1083     }
1084    
1085     /* The character or character type itself comes last in all cases. */
1086    
1087     *code++ = c;
1088     }
1089    
1090     /* If previous was a character class or a back reference, we put the repeat
1091 nigel 37 stuff after it, but just skip the item if the repeat was {0,0}. */
1092 nigel 3
1093 nigel 23 else if (*previous == OP_CLASS || *previous == OP_REF)
1094 nigel 3 {
1095 nigel 37 if (repeat_max == 0)
1096     {
1097     code = previous;
1098     goto END_REPEAT;
1099     }
1100 nigel 3 if (repeat_min == 0 && repeat_max == -1)
1101     *code++ = OP_CRSTAR + repeat_type;
1102     else if (repeat_min == 1 && repeat_max == -1)
1103     *code++ = OP_CRPLUS + repeat_type;
1104     else if (repeat_min == 0 && repeat_max == 1)
1105     *code++ = OP_CRQUERY + repeat_type;
1106     else
1107     {
1108     *code++ = OP_CRRANGE + repeat_type;
1109     *code++ = repeat_min >> 8;
1110     *code++ = repeat_min & 255;
1111     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1112     *code++ = repeat_max >> 8;
1113     *code++ = repeat_max & 255;
1114     }
1115     }
1116    
1117     /* If previous was a bracket group, we may have to replicate it in certain
1118 nigel 23 cases. */
1119 nigel 3
1120 nigel 23 else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
1121     (int)*previous == OP_COND)
1122 nigel 3 {
1123 nigel 31 register int i;
1124     int ketoffset = 0;
1125 nigel 9 int len = code - previous;
1126 nigel 31 uschar *bralink = NULL;
1127 nigel 3
1128 nigel 23 /* If the maximum repeat count is unlimited, find the end of the bracket
1129     by scanning through from the start, and compute the offset back to it
1130     from the current code pointer. There may be an OP_OPT setting following
1131     the final KET, so we can't find the end just by going back from the code
1132     pointer. */
1133    
1134     if (repeat_max == -1)
1135 nigel 3 {
1136 nigel 23 register uschar *ket = previous;
1137     do ket += (ket[1] << 8) + ket[2]; while (*ket != OP_KET);
1138     ketoffset = code - ket;
1139 nigel 3 }
1140    
1141 nigel 31 /* The case of a zero minimum is special because of the need to stick
1142     OP_BRAZERO in front of it, and because the group appears once in the
1143     data, whereas in other cases it appears the minimum number of times. For
1144     this reason, it is simplest to treat this case separately, as otherwise
1145     the code gets far too messy. There are several special subcases when the
1146     minimum is zero. */
1147    
1148     if (repeat_min == 0)
1149     {
1150 nigel 37 /* If we set up a required char from the bracket, we must back off
1151     to the previous value and reset the countlits value too. */
1152    
1153     if (subcountlits > 0)
1154     {
1155     *reqchar = prevreqchar;
1156     *countlits -= subcountlits;
1157     }
1158    
1159 nigel 31 /* If the maximum is also zero, we just omit the group from the output
1160     altogether. */
1161    
1162     if (repeat_max == 0)
1163     {
1164     code = previous;
1165 nigel 37 goto END_REPEAT;
1166 nigel 31 }
1167    
1168     /* If the maximum is 1 or unlimited, we just have to stick in the
1169     BRAZERO and do no more at this point. */
1170    
1171     if (repeat_max <= 1)
1172     {
1173     memmove(previous+1, previous, len);
1174     code++;
1175     *previous++ = OP_BRAZERO + repeat_type;
1176     }
1177    
1178     /* If the maximum is greater than 1 and limited, we have to replicate
1179     in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1180     The first one has to be handled carefully because it's the original
1181     copy, which has to be moved up. The remainder can be handled by code
1182     that is common with the non-zero minimum case below. We just have to
1183     adjust the value of repeat_max, since one less copy is required. */
1184    
1185     else
1186     {
1187     int offset;
1188     memmove(previous+4, previous, len);
1189     code += 4;
1190     *previous++ = OP_BRAZERO + repeat_type;
1191     *previous++ = OP_BRA;
1192    
1193     /* We chain together the bracket offset fields that have to be
1194     filled in later when the ends of the brackets are reached. */
1195    
1196     offset = (bralink == NULL)? 0 : previous - bralink;
1197     bralink = previous;
1198     *previous++ = offset >> 8;
1199     *previous++ = offset & 255;
1200     }
1201    
1202     repeat_max--;
1203     }
1204    
1205     /* If the minimum is greater than zero, replicate the group as many
1206     times as necessary, and adjust the maximum to the number of subsequent
1207     copies that we need. */
1208    
1209     else
1210     {
1211     for (i = 1; i < repeat_min; i++)
1212     {
1213     memcpy(code, previous, len);
1214     code += len;
1215     }
1216     if (repeat_max > 0) repeat_max -= repeat_min;
1217     }
1218    
1219     /* This code is common to both the zero and non-zero minimum cases. If
1220     the maximum is limited, it replicates the group in a nested fashion,
1221     remembering the bracket starts on a stack. In the case of a zero minimum,
1222     the first one was set up above. In all cases the repeat_max now specifies
1223     the number of additional copies needed. */
1224    
1225     if (repeat_max >= 0)
1226     {
1227     for (i = repeat_max - 1; i >= 0; i--)
1228     {
1229     *code++ = OP_BRAZERO + repeat_type;
1230    
1231     /* All but the final copy start a new nesting, maintaining the
1232     chain of brackets outstanding. */
1233    
1234     if (i != 0)
1235     {
1236     int offset;
1237     *code++ = OP_BRA;
1238     offset = (bralink == NULL)? 0 : code - bralink;
1239     bralink = code;
1240     *code++ = offset >> 8;
1241     *code++ = offset & 255;
1242     }
1243    
1244     memcpy(code, previous, len);
1245     code += len;
1246     }
1247    
1248     /* Now chain through the pending brackets, and fill in their length
1249     fields (which are holding the chain links pro tem). */
1250    
1251     while (bralink != NULL)
1252     {
1253     int oldlinkoffset;
1254     int offset = code - bralink + 1;
1255     uschar *bra = code - offset;
1256     oldlinkoffset = (bra[1] << 8) + bra[2];
1257     bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
1258     *code++ = OP_KET;
1259     *code++ = bra[1] = offset >> 8;
1260     *code++ = bra[2] = (offset & 255);
1261     }
1262     }
1263    
1264     /* If the maximum is unlimited, set a repeater in the final copy. We
1265     can't just offset backwards from the current code point, because we
1266     don't know if there's been an options resetting after the ket. The
1267     correct offset was computed above. */
1268    
1269     else code[-ketoffset] = OP_KETRMAX + repeat_type;
1270 nigel 3 }
1271    
1272     /* Else there's some kind of shambles */
1273    
1274     else
1275     {
1276     *errorptr = ERR11;
1277     goto FAILED;
1278     }
1279    
1280     /* In all cases we no longer have a previous item. */
1281    
1282 nigel 37 END_REPEAT:
1283 nigel 3 previous = NULL;
1284     break;
1285    
1286    
1287 nigel 23 /* Start of nested bracket sub-expression, or comment or lookahead or
1288     lookbehind or option setting or condition. First deal with special things
1289     that can come after a bracket; all are introduced by ?, and the appearance
1290     of any of them means that this is not a referencing group. They were
1291     checked for validity in the first pass over the string, so we don't have to
1292     check for syntax errors here. */
1293 nigel 3
1294     case '(':
1295 nigel 23 newoptions = options;
1296     condref = -1;
1297    
1298 nigel 3 if (*(++ptr) == '?')
1299     {
1300 nigel 23 int set, unset;
1301     int *optset;
1302 nigel 3
1303     switch (*(++ptr))
1304     {
1305 nigel 23 case '#': /* Comment; skip to ket */
1306 nigel 3 ptr++;
1307     while (*ptr != ')') ptr++;
1308     continue;
1309    
1310     case ':': /* Non-extracting bracket */
1311 nigel 23 bravalue = OP_BRA;
1312 nigel 3 ptr++;
1313     break;
1314    
1315 nigel 23 case '(':
1316     bravalue = OP_COND; /* Conditional group */
1317 nigel 25 if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
1318 nigel 23 {
1319     condref = *ptr - '0';
1320     while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
1321     ptr++;
1322     }
1323     else ptr--;
1324     break;
1325    
1326     case '=': /* Positive lookahead */
1327 nigel 3 bravalue = OP_ASSERT;
1328     ptr++;
1329     break;
1330    
1331 nigel 23 case '!': /* Negative lookahead */
1332 nigel 3 bravalue = OP_ASSERT_NOT;
1333     ptr++;
1334     break;
1335    
1336 nigel 23 case '<': /* Lookbehinds */
1337     switch (*(++ptr))
1338 nigel 3 {
1339 nigel 23 case '=': /* Positive lookbehind */
1340     bravalue = OP_ASSERTBACK;
1341 nigel 3 ptr++;
1342     break;
1343 nigel 23
1344     case '!': /* Negative lookbehind */
1345     bravalue = OP_ASSERTBACK_NOT;
1346     ptr++;
1347     break;
1348    
1349     default: /* Syntax error */
1350     *errorptr = ERR24;
1351     goto FAILED;
1352 nigel 3 }
1353 nigel 23 break;
1354 nigel 3
1355 nigel 23 case '>': /* One-time brackets */
1356     bravalue = OP_ONCE;
1357     ptr++;
1358     break;
1359    
1360     default: /* Option setting */
1361     set = unset = 0;
1362     optset = &set;
1363    
1364     while (*ptr != ')' && *ptr != ':')
1365     {
1366     switch (*ptr++)
1367     {
1368     case '-': optset = &unset; break;
1369    
1370     case 'i': *optset |= PCRE_CASELESS; break;
1371     case 'm': *optset |= PCRE_MULTILINE; break;
1372     case 's': *optset |= PCRE_DOTALL; break;
1373     case 'x': *optset |= PCRE_EXTENDED; break;
1374     case 'U': *optset |= PCRE_UNGREEDY; break;
1375     case 'X': *optset |= PCRE_EXTRA; break;
1376    
1377     default:
1378     *errorptr = ERR12;
1379     goto FAILED;
1380     }
1381     }
1382    
1383     /* Set up the changed option bits, but don't change anything yet. */
1384    
1385     newoptions = (options | set) & (~unset);
1386    
1387     /* If the options ended with ')' this is not the start of a nested
1388     group with option changes, so the options change at this level. At top
1389     level there is nothing else to be done (the options will in fact have
1390     been set from the start of compiling as a result of the first pass) but
1391     at an inner level we must compile code to change the ims options if
1392     necessary, and pass the new setting back so that it can be put at the
1393     start of any following branches, and when this group ends, a resetting
1394     item can be compiled. */
1395    
1396     if (*ptr == ')')
1397     {
1398     if ((options & PCRE_INGROUP) != 0 &&
1399     (options & PCRE_IMS) != (newoptions & PCRE_IMS))
1400     {
1401     *code++ = OP_OPT;
1402     *code++ = *optchanged = newoptions & PCRE_IMS;
1403     }
1404     options = newoptions; /* Change options at this level */
1405     previous = NULL; /* This item can't be repeated */
1406     continue; /* It is complete */
1407     }
1408    
1409     /* If the options ended with ':' we are heading into a nested group
1410     with possible change of options. Such groups are non-capturing and are
1411     not assertions of any kind. All we need to do is skip over the ':';
1412     the newoptions value is handled below. */
1413    
1414     bravalue = OP_BRA;
1415     ptr++;
1416 nigel 3 }
1417     }
1418    
1419 nigel 23 /* Else we have a referencing group; adjust the opcode. */
1420 nigel 3
1421     else
1422     {
1423     if (++(*brackets) > EXTRACT_MAX)
1424     {
1425     *errorptr = ERR13;
1426     goto FAILED;
1427     }
1428     bravalue = OP_BRA + *brackets;
1429     }
1430    
1431 nigel 23 /* Process nested bracketed re. Assertions may not be repeated, but other
1432     kinds can be. We copy code into a non-register variable in order to be able
1433     to pass its address because some compilers complain otherwise. Pass in a
1434     new setting for the ims options if they have changed. */
1435 nigel 3
1436 nigel 23 previous = (bravalue >= OP_ONCE)? code : NULL;
1437 nigel 3 *code = bravalue;
1438 nigel 23 tempcode = code;
1439    
1440     if (!compile_regex(
1441     options | PCRE_INGROUP, /* Set for all nested groups */
1442     ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
1443     newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
1444     brackets, /* Bracket level */
1445     &tempcode, /* Where to put code (updated) */
1446     &ptr, /* Input pointer (updated) */
1447     errorptr, /* Where to put an error message */
1448     (bravalue == OP_ASSERTBACK ||
1449     bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1450 nigel 25 condref, /* Condition reference number */
1451 nigel 37 &subreqchar, /* For possible last char */
1452     &subcountlits, /* For literal count */
1453 nigel 25 cd)) /* Tables block */
1454 nigel 23 goto FAILED;
1455    
1456     /* At the end of compiling, code is still pointing to the start of the
1457     group, while tempcode has been updated to point past the end of the group
1458     and any option resetting that may follow it. The pattern pointer (ptr)
1459     is on the bracket. */
1460    
1461     /* If this is a conditional bracket, check that there are no more than
1462     two branches in the group. */
1463    
1464     if (bravalue == OP_COND)
1465 nigel 3 {
1466 nigel 23 uschar *tc = code;
1467 nigel 37 condcount = 0;
1468 nigel 23
1469     do {
1470 nigel 37 condcount++;
1471 nigel 23 tc += (tc[1] << 8) | tc[2];
1472     }
1473     while (*tc != OP_KET);
1474    
1475 nigel 37 if (condcount > 2)
1476 nigel 23 {
1477     *errorptr = ERR27;
1478 nigel 3 goto FAILED;
1479 nigel 23 }
1480 nigel 3 }
1481    
1482 nigel 37 /* Handle updating of the required character. If the subpattern didn't
1483     set one, leave it as it was. Otherwise, update it for normal brackets of
1484     all kinds, forward assertions, and conditions with two branches. Don't
1485     update the literal count for forward assertions, however. If the bracket
1486     is followed by a quantifier with zero repeat, we have to back off. Hence
1487     the definition of prevreqchar and subcountlits outside the main loop so
1488     that they can be accessed for the back off. */
1489    
1490     if (subreqchar > 0 &&
1491     (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT ||
1492     (bravalue == OP_COND && condcount == 2)))
1493     {
1494     prevreqchar = *reqchar;
1495     *reqchar = subreqchar;
1496     if (bravalue != OP_ASSERT) *countlits += subcountlits;
1497     }
1498    
1499 nigel 23 /* Now update the main code pointer to the end of the group. */
1500    
1501     code = tempcode;
1502    
1503     /* Error if hit end of pattern */
1504    
1505 nigel 3 if (*ptr != ')')
1506     {
1507     *errorptr = ERR14;
1508     goto FAILED;
1509     }
1510     break;
1511    
1512     /* Check \ for being a real metacharacter; if not, fall through and handle
1513     it as a data character at the start of a string. Escape items are checked
1514     for validity in the pre-compiling pass. */
1515    
1516     case '\\':
1517 nigel 23 tempptr = ptr;
1518 nigel 25 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1519 nigel 3
1520     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1521     are arranged to be the negation of the corresponding OP_values. For the
1522     back references, the values are ESC_REF plus the reference number. Only
1523     back references and those types that consume a character may be repeated.
1524     We can test for values between ESC_b and ESC_Z for the latter; this may
1525     have to change if any new ones are ever created. */
1526    
1527     if (c < 0)
1528     {
1529     if (-c >= ESC_REF)
1530     {
1531     previous = code;
1532     *code++ = OP_REF;
1533 nigel 23 *code++ = -c - ESC_REF;
1534 nigel 3 }
1535     else
1536     {
1537 nigel 23 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
1538 nigel 3 *code++ = -c;
1539     }
1540     continue;
1541     }
1542    
1543 nigel 7 /* Data character: reset and fall through */
1544 nigel 3
1545 nigel 23 ptr = tempptr;
1546 nigel 3 c = '\\';
1547    
1548     /* Handle a run of data characters until a metacharacter is encountered.
1549     The first character is guaranteed not to be whitespace or # when the
1550     extended flag is set. */
1551    
1552     NORMAL_CHAR:
1553     default:
1554     previous = code;
1555     *code = OP_CHARS;
1556     code += 2;
1557     length = 0;
1558    
1559     do
1560     {
1561     if ((options & PCRE_EXTENDED) != 0)
1562     {
1563 nigel 25 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1564 nigel 3 if (c == '#')
1565     {
1566     while ((c = *(++ptr)) != 0 && c != '\n');
1567     if (c == 0) break;
1568     continue;
1569     }
1570     }
1571    
1572     /* Backslash may introduce a data char or a metacharacter. Escaped items
1573     are checked for validity in the pre-compiling pass. Stop the string
1574     before a metaitem. */
1575    
1576     if (c == '\\')
1577     {
1578 nigel 23 tempptr = ptr;
1579 nigel 25 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1580 nigel 23 if (c < 0) { ptr = tempptr; break; }
1581 nigel 3 }
1582    
1583     /* Ordinary character or single-char escape */
1584    
1585     *code++ = c;
1586     length++;
1587     }
1588    
1589     /* This "while" is the end of the "do" above. */
1590    
1591 nigel 25 while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
1592 nigel 3
1593 nigel 37 /* Update the last character and the count of literals */
1594    
1595     prevreqchar = (length > 1)? code[-2] : *reqchar;
1596     *reqchar = code[-1];
1597     *countlits += length;
1598    
1599 nigel 3 /* Compute the length and set it in the data vector, and advance to
1600     the next state. */
1601    
1602     previous[1] = length;
1603 nigel 15 if (length < 255) ptr--;
1604 nigel 3 break;
1605     }
1606     } /* end of big loop */
1607    
1608     /* Control never reaches here by falling through, only by a goto for all the
1609     error states. Pass back the position in the pattern so that it can be displayed
1610     to the user for diagnosing the error. */
1611    
1612     FAILED:
1613     *ptrptr = ptr;
1614     return FALSE;
1615     }
1616    
1617    
1618    
1619    
1620     /*************************************************
1621     * Compile sequence of alternatives *
1622     *************************************************/
1623    
1624     /* On entry, ptr is pointing past the bracket character, but on return
1625     it points to the closing bracket, or vertical bar, or end of string.
1626     The code variable is pointing at the byte into which the BRA operator has been
1627 nigel 23 stored. If the ims options are changed at the start (for a (?ims: group) or
1628     during any branch, we need to insert an OP_OPT item at the start of every
1629     following branch to ensure they get set correctly at run time, and also pass
1630     the new options into every subsequent branch compile.
1631 nigel 3
1632     Argument:
1633 nigel 23 options the option bits
1634     optchanged new ims options to set as if (?ims) were at the start, or -1
1635     for no change
1636     brackets -> int containing the number of extracting brackets used
1637     codeptr -> the address of the current code pointer
1638     ptrptr -> the address of the current pattern pointer
1639     errorptr -> pointer to error message
1640     lookbehind TRUE if this is a lookbehind assertion
1641     condref > 0 for OPT_CREF setting at start of conditional group
1642 nigel 37 reqchar -> place to put the last required character, or a negative number
1643     countlits -> place to put the shortest literal count of any branch
1644 nigel 25 cd points to the data block with tables pointers
1645 nigel 3
1646 nigel 23 Returns: TRUE on success
1647 nigel 3 */
1648    
1649     static BOOL
1650 nigel 23 compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
1651 nigel 25 const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,
1652 nigel 37 int *reqchar, int *countlits, compile_data *cd)
1653 nigel 3 {
1654 nigel 7 const uschar *ptr = *ptrptr;
1655 nigel 3 uschar *code = *codeptr;
1656 nigel 23 uschar *last_branch = code;
1657 nigel 3 uschar *start_bracket = code;
1658 nigel 23 uschar *reverse_count = NULL;
1659     int oldoptions = options & PCRE_IMS;
1660 nigel 37 int branchreqchar, branchcountlits;
1661 nigel 3
1662 nigel 37 *reqchar = -1;
1663     *countlits = INT_MAX;
1664 nigel 23 code += 3;
1665    
1666     /* At the start of a reference-based conditional group, insert the reference
1667     number as an OP_CREF item. */
1668    
1669     if (condref > 0)
1670     {
1671     *code++ = OP_CREF;
1672     *code++ = condref;
1673     }
1674    
1675     /* Loop for each alternative branch */
1676    
1677 nigel 3 for (;;)
1678     {
1679     int length;
1680    
1681 nigel 23 /* Handle change of options */
1682    
1683     if (optchanged >= 0)
1684 nigel 3 {
1685 nigel 23 *code++ = OP_OPT;
1686     *code++ = optchanged;
1687     options = (options & ~PCRE_IMS) | optchanged;
1688     }
1689    
1690     /* Set up dummy OP_REVERSE if lookbehind assertion */
1691    
1692     if (lookbehind)
1693     {
1694     *code++ = OP_REVERSE;
1695     reverse_count = code;
1696     *code++ = 0;
1697     *code++ = 0;
1698     }
1699    
1700     /* Now compile the branch */
1701    
1702 nigel 37 if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged,
1703     &branchreqchar, &branchcountlits, cd))
1704 nigel 23 {
1705 nigel 3 *ptrptr = ptr;
1706     return FALSE;
1707     }
1708    
1709     /* Fill in the length of the last branch */
1710    
1711     length = code - last_branch;
1712     last_branch[1] = length >> 8;
1713     last_branch[2] = length & 255;
1714    
1715 nigel 37 /* Save the last required character if all branches have the same; a current
1716     value of -1 means unset, while -2 means "previous branch had no last required
1717     char". */
1718    
1719     if (*reqchar != -2)
1720     {
1721     if (branchreqchar >= 0)
1722     {
1723     if (*reqchar == -1) *reqchar = branchreqchar;
1724     else if (*reqchar != branchreqchar) *reqchar = -2;
1725     }
1726     else *reqchar = -2;
1727     }
1728    
1729     /* Keep the shortest literal count */
1730    
1731     if (branchcountlits < *countlits) *countlits = branchcountlits;
1732     DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits));
1733    
1734 nigel 23 /* If lookbehind, check that this branch matches a fixed-length string,
1735     and put the length into the OP_REVERSE item. Temporarily mark the end of
1736     the branch with OP_END. */
1737    
1738     if (lookbehind)
1739     {
1740     *code = OP_END;
1741     length = find_fixedlength(last_branch);
1742     DPRINTF(("fixed length = %d\n", length));
1743     if (length < 0)
1744     {
1745     *errorptr = ERR25;
1746     *ptrptr = ptr;
1747     return FALSE;
1748     }
1749     reverse_count[0] = (length >> 8);
1750     reverse_count[1] = length & 255;
1751     }
1752    
1753 nigel 3 /* Reached end of expression, either ')' or end of pattern. Insert a
1754     terminating ket and the length of the whole bracketed item, and return,
1755 nigel 23 leaving the pointer at the terminating char. If any of the ims options
1756     were changed inside the group, compile a resetting op-code following. */
1757 nigel 3
1758     if (*ptr != '|')
1759     {
1760     length = code - start_bracket;
1761     *code++ = OP_KET;
1762     *code++ = length >> 8;
1763     *code++ = length & 255;
1764 nigel 23 if (optchanged >= 0)
1765     {
1766     *code++ = OP_OPT;
1767     *code++ = oldoptions;
1768     }
1769 nigel 3 *codeptr = code;
1770     *ptrptr = ptr;
1771     return TRUE;
1772     }
1773    
1774     /* Another branch follows; insert an "or" node and advance the pointer. */
1775    
1776     *code = OP_ALT;
1777 nigel 23 last_branch = code;
1778     code += 3;
1779 nigel 3 ptr++;
1780     }
1781     /* Control never reaches here */
1782     }
1783    
1784    
1785    
1786 nigel 23
1787 nigel 3 /*************************************************
1788 nigel 23 * Find first significant op code *
1789     *************************************************/
1790    
1791     /* This is called by several functions that scan a compiled expression looking
1792     for a fixed first character, or an anchoring op code etc. It skips over things
1793     that do not influence this. For one application, a change of caseless option is
1794     important.
1795    
1796     Arguments:
1797     code pointer to the start of the group
1798     options pointer to external options
1799     optbit the option bit whose changing is significant, or
1800     zero if none are
1801     optstop TRUE to return on option change, otherwise change the options
1802     value and continue
1803    
1804     Returns: pointer to the first significant opcode
1805     */
1806    
1807     static const uschar*
1808     first_significant_code(const uschar *code, int *options, int optbit,
1809     BOOL optstop)
1810     {
1811     for (;;)
1812     {
1813     switch ((int)*code)
1814     {
1815     case OP_OPT:
1816     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1817     {
1818     if (optstop) return code;
1819     *options = (int)code[1];
1820     }
1821     code += 2;
1822     break;
1823    
1824     case OP_CREF:
1825     code += 2;
1826     break;
1827    
1828 nigel 35 case OP_WORD_BOUNDARY:
1829     case OP_NOT_WORD_BOUNDARY:
1830     code++;
1831     break;
1832    
1833 nigel 23 case OP_ASSERT_NOT:
1834     case OP_ASSERTBACK:
1835     case OP_ASSERTBACK_NOT:
1836     do code += (code[1] << 8) + code[2]; while (*code == OP_ALT);
1837     code += 3;
1838     break;
1839    
1840     default:
1841     return code;
1842     }
1843     }
1844     /* Control never reaches here */
1845     }
1846    
1847    
1848    
1849    
1850     /*************************************************
1851 nigel 3 * Check for anchored expression *
1852     *************************************************/
1853    
1854     /* Try to find out if this is an anchored regular expression. Consider each
1855     alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
1856     all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
1857     it's anchored. However, if this is a multiline pattern, then only OP_SOD
1858     counts, since OP_CIRC can match in the middle.
1859    
1860 nigel 33 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
1861     because that will try the rest of the pattern at all possible matching points,
1862     so there is no point trying them again.
1863 nigel 3
1864 nigel 23 Arguments:
1865     code points to start of expression (the bracket)
1866     options points to the options setting
1867    
1868     Returns: TRUE or FALSE
1869 nigel 3 */
1870    
1871     static BOOL
1872 nigel 23 is_anchored(register const uschar *code, int *options)
1873 nigel 3 {
1874     do {
1875 nigel 23 const uschar *scode = first_significant_code(code + 3, options,
1876     PCRE_MULTILINE, FALSE);
1877     register int op = *scode;
1878     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
1879     { if (!is_anchored(scode, options)) return FALSE; }
1880 nigel 33 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
1881     (*options & PCRE_DOTALL) != 0)
1882 nigel 23 { if (scode[1] != OP_ANY) return FALSE; }
1883     else if (op != OP_SOD &&
1884     ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
1885     return FALSE;
1886 nigel 3 code += (code[1] << 8) + code[2];
1887     }
1888     while (*code == OP_ALT);
1889     return TRUE;
1890     }
1891    
1892    
1893    
1894     /*************************************************
1895 nigel 33 * Check for starting with ^ or .* *
1896 nigel 3 *************************************************/
1897    
1898 nigel 33 /* This is called to find out if every branch starts with ^ or .* so that
1899     "first char" processing can be done to speed things up in multiline
1900     matching and for non-DOTALL patterns that start with .* (which must start at
1901     the beginning or after \n).
1902 nigel 3
1903     Argument: points to start of expression (the bracket)
1904     Returns: TRUE or FALSE
1905     */
1906    
1907     static BOOL
1908 nigel 7 is_startline(const uschar *code)
1909 nigel 3 {
1910     do {
1911 nigel 23 const uschar *scode = first_significant_code(code + 3, NULL, 0, FALSE);
1912     register int op = *scode;
1913     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
1914     { if (!is_startline(scode)) return FALSE; }
1915 nigel 33 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
1916     { if (scode[1] != OP_ANY) return FALSE; }
1917 nigel 23 else if (op != OP_CIRC) return FALSE;
1918 nigel 3 code += (code[1] << 8) + code[2];
1919     }
1920     while (*code == OP_ALT);
1921     return TRUE;
1922     }
1923    
1924    
1925    
1926     /*************************************************
1927     * Check for fixed first char *
1928     *************************************************/
1929    
1930     /* Try to find out if there is a fixed first character. This is called for
1931     unanchored expressions, as it speeds up their processing quite considerably.
1932     Consider each alternative branch. If they all start with the same char, or with
1933     a bracket all of whose alternatives start with the same char (recurse ad lib),
1934     then we return that char, otherwise -1.
1935    
1936 nigel 23 Arguments:
1937     code points to start of expression (the bracket)
1938     options pointer to the options (used to check casing changes)
1939    
1940     Returns: -1 or the fixed first char
1941 nigel 3 */
1942    
1943     static int
1944 nigel 23 find_firstchar(const uschar *code, int *options)
1945 nigel 3 {
1946     register int c = -1;
1947 nigel 23 do {
1948     int d;
1949     const uschar *scode = first_significant_code(code + 3, options,
1950     PCRE_CASELESS, TRUE);
1951     register int op = *scode;
1952 nigel 3
1953 nigel 23 if (op >= OP_BRA) op = OP_BRA;
1954 nigel 3
1955 nigel 23 switch(op)
1956     {
1957     default:
1958     return -1;
1959 nigel 3
1960 nigel 23 case OP_BRA:
1961     case OP_ASSERT:
1962     case OP_ONCE:
1963     case OP_COND:
1964     if ((d = find_firstchar(scode, options)) < 0) return -1;
1965     if (c < 0) c = d; else if (c != d) return -1;
1966     break;
1967 nigel 3
1968 nigel 23 case OP_EXACT: /* Fall through */
1969     scode++;
1970 nigel 3
1971 nigel 23 case OP_CHARS: /* Fall through */
1972     scode++;
1973    
1974     case OP_PLUS:
1975     case OP_MINPLUS:
1976     if (c < 0) c = scode[1]; else if (c != scode[1]) return -1;
1977     break;
1978     }
1979    
1980     code += (code[1] << 8) + code[2];
1981     }
1982 nigel 3 while (*code == OP_ALT);
1983     return c;
1984     }
1985    
1986    
1987    
1988 nigel 23
1989    
1990 nigel 3 /*************************************************
1991     * Compile a Regular Expression *
1992     *************************************************/
1993    
1994     /* This function takes a string and returns a pointer to a block of store
1995     holding a compiled version of the expression.
1996    
1997     Arguments:
1998     pattern the regular expression
1999     options various option bits
2000     errorptr pointer to pointer to error text
2001     erroroffset ptr offset in pattern where error was detected
2002 nigel 25 tables pointer to character tables or NULL
2003 nigel 3
2004     Returns: pointer to compiled data block, or NULL on error,
2005     with errorptr and erroroffset set
2006     */
2007    
2008     pcre *
2009 nigel 7 pcre_compile(const char *pattern, int options, const char **errorptr,
2010 nigel 25 int *erroroffset, const unsigned char *tables)
2011 nigel 3 {
2012     real_pcre *re;
2013     int length = 3; /* For initial BRA plus length */
2014     int runlength;
2015 nigel 37 int c, size, reqchar, countlits;
2016 nigel 3 int bracount = 0;
2017     int top_backref = 0;
2018 nigel 23 int branch_extra = 0;
2019     int branch_newextra;
2020 nigel 7 unsigned int brastackptr = 0;
2021     uschar *code;
2022     const uschar *ptr;
2023 nigel 25 compile_data compile_block;
2024 nigel 23 int brastack[BRASTACK_SIZE];
2025     uschar bralenstack[BRASTACK_SIZE];
2026 nigel 3
2027     #ifdef DEBUG
2028     uschar *code_base, *code_end;
2029     #endif
2030    
2031     /* We can't pass back an error message if errorptr is NULL; I guess the best we
2032     can do is just return NULL. */
2033    
2034     if (errorptr == NULL) return NULL;
2035     *errorptr = NULL;
2036    
2037     /* However, we can give a message for this error */
2038    
2039     if (erroroffset == NULL)
2040     {
2041     *errorptr = ERR16;
2042     return NULL;
2043     }
2044     *erroroffset = 0;
2045    
2046     if ((options & ~PUBLIC_OPTIONS) != 0)
2047     {
2048     *errorptr = ERR17;
2049     return NULL;
2050     }
2051    
2052 nigel 25 /* Set up pointers to the individual character tables */
2053    
2054     if (tables == NULL) tables = pcre_default_tables;
2055     compile_block.lcc = tables + lcc_offset;
2056     compile_block.fcc = tables + fcc_offset;
2057     compile_block.cbits = tables + cbits_offset;
2058     compile_block.ctypes = tables + ctypes_offset;
2059    
2060     /* Reflect pattern for debugging output */
2061    
2062 nigel 9 DPRINTF(("------------------------------------------------------------------\n"));
2063     DPRINTF(("%s\n", pattern));
2064 nigel 3
2065     /* The first thing to do is to make a pass over the pattern to compute the
2066     amount of store required to hold the compiled code. This does not have to be
2067     perfect as long as errors are overestimates. At the same time we can detect any
2068     internal flag settings. Make an attempt to correct for any counted white space
2069     if an "extended" flag setting appears late in the pattern. We can't be so
2070     clever for #-comments. */
2071    
2072 nigel 7 ptr = (const uschar *)(pattern - 1);
2073 nigel 3 while ((c = *(++ptr)) != 0)
2074     {
2075     int min, max;
2076     int class_charcount;
2077    
2078 nigel 23 if ((options & PCRE_EXTENDED) != 0)
2079 nigel 3 {
2080 nigel 25 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2081 nigel 23 if (c == '#')
2082     {
2083     while ((c = *(++ptr)) != 0 && c != '\n');
2084     continue;
2085     }
2086 nigel 3 }
2087    
2088     switch(c)
2089     {
2090     /* A backslashed item may be an escaped "normal" character or a
2091     character type. For a "normal" character, put the pointers and
2092     character back so that tests for whitespace etc. in the input
2093     are done correctly. */
2094    
2095     case '\\':
2096     {
2097 nigel 7 const uschar *save_ptr = ptr;
2098 nigel 25 c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
2099 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2100     if (c >= 0)
2101     {
2102     ptr = save_ptr;
2103     c = '\\';
2104     goto NORMAL_CHAR;
2105     }
2106     }
2107     length++;
2108    
2109     /* A back reference needs an additional char, plus either one or 5
2110     bytes for a repeat. We also need to keep the value of the highest
2111     back reference. */
2112    
2113     if (c <= -ESC_REF)
2114     {
2115     int refnum = -c - ESC_REF;
2116     if (refnum > top_backref) top_backref = refnum;
2117     length++; /* For single back reference */
2118 nigel 25 if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2119 nigel 3 {
2120 nigel 25 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2121 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2122     if ((min == 0 && (max == 1 || max == -1)) ||
2123     (min == 1 && max == -1))
2124     length++;
2125     else length += 5;
2126     if (ptr[1] == '?') ptr++;
2127     }
2128     }
2129     continue;
2130    
2131     case '^':
2132     case '.':
2133     case '$':
2134     case '*': /* These repeats won't be after brackets; */
2135     case '+': /* those are handled separately */
2136     case '?':
2137     length++;
2138     continue;
2139    
2140     /* This covers the cases of repeats after a single char, metachar, class,
2141     or back reference. */
2142    
2143     case '{':
2144 nigel 25 if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
2145     ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
2146 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2147     if ((min == 0 && (max == 1 || max == -1)) ||
2148     (min == 1 && max == -1))
2149     length++;
2150     else
2151     {
2152     length--; /* Uncount the original char or metachar */
2153     if (min == 1) length++; else if (min > 0) length += 4;
2154     if (max > 0) length += 4; else length += 2;
2155     }
2156     if (ptr[1] == '?') ptr++;
2157     continue;
2158    
2159 nigel 23 /* An alternation contains an offset to the next branch or ket. If any ims
2160     options changed in the previous branch(es), and/or if we are in a
2161     lookbehind assertion, extra space will be needed at the start of the
2162     branch. This is handled by branch_extra. */
2163    
2164 nigel 3 case '|':
2165 nigel 23 length += 3 + branch_extra;
2166 nigel 3 continue;
2167    
2168     /* A character class uses 33 characters. Don't worry about character types
2169     that aren't allowed in classes - they'll get picked up during the compile.
2170     A character class that contains only one character uses 2 or 3 bytes,
2171     depending on whether it is negated or not. Notice this where we can. */
2172    
2173     case '[':
2174     class_charcount = 0;
2175     if (*(++ptr) == '^') ptr++;
2176     do
2177     {
2178     if (*ptr == '\\')
2179     {
2180 nigel 25 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
2181     &compile_block);
2182 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2183 nigel 9 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2184 nigel 3 }
2185     else class_charcount++;
2186     ptr++;
2187     }
2188     while (*ptr != 0 && *ptr != ']');
2189    
2190     /* Repeats for negated single chars are handled by the general code */
2191    
2192     if (class_charcount == 1) length += 3; else
2193     {
2194     length += 33;
2195    
2196     /* A repeat needs either 1 or 5 bytes. */
2197    
2198 nigel 25 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2199 nigel 3 {
2200 nigel 25 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2201 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2202     if ((min == 0 && (max == 1 || max == -1)) ||
2203     (min == 1 && max == -1))
2204     length++;
2205     else length += 5;
2206     if (ptr[1] == '?') ptr++;
2207     }
2208     }
2209     continue;
2210    
2211     /* Brackets may be genuine groups or special things */
2212    
2213     case '(':
2214 nigel 23 branch_newextra = 0;
2215 nigel 3
2216     /* Handle special forms of bracket, which all start (? */
2217    
2218 nigel 23 if (ptr[1] == '?')
2219 nigel 3 {
2220 nigel 23 int set, unset;
2221     int *optset;
2222    
2223     switch (c = ptr[2])
2224 nigel 3 {
2225 nigel 23 /* Skip over comments entirely */
2226     case '#':
2227     ptr += 3;
2228     while (*ptr != 0 && *ptr != ')') ptr++;
2229     if (*ptr == 0)
2230     {
2231     *errorptr = ERR18;
2232     goto PCRE_ERROR_RETURN;
2233     }
2234     continue;
2235 nigel 3
2236 nigel 23 /* Non-referencing groups and lookaheads just move the pointer on, and
2237     then behave like a non-special bracket, except that they don't increment
2238     the count of extracting brackets. Ditto for the "once only" bracket,
2239     which is in Perl from version 5.005. */
2240 nigel 3
2241 nigel 23 case ':':
2242     case '=':
2243     case '!':
2244     case '>':
2245 nigel 3 ptr += 2;
2246     break;
2247    
2248 nigel 23 /* Lookbehinds are in Perl from version 5.005 */
2249 nigel 3
2250 nigel 23 case '<':
2251     if (ptr[3] == '=' || ptr[3] == '!')
2252 nigel 3 {
2253 nigel 23 ptr += 3;
2254     branch_newextra = 3;
2255     length += 3; /* For the first branch */
2256     break;
2257 nigel 3 }
2258 nigel 23 *errorptr = ERR24;
2259     goto PCRE_ERROR_RETURN;
2260    
2261     /* Conditionals are in Perl from version 5.005. The bracket must either
2262     be followed by a number (for bracket reference) or by an assertion
2263     group. */
2264    
2265     case '(':
2266 nigel 25 if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
2267 nigel 3 {
2268 nigel 23 ptr += 4;
2269     length += 2;
2270 nigel 25 while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
2271 nigel 23 if (*ptr != ')')
2272     {
2273     *errorptr = ERR26;
2274     goto PCRE_ERROR_RETURN;
2275     }
2276 nigel 3 }
2277 nigel 23 else /* An assertion must follow */
2278 nigel 3 {
2279 nigel 23 ptr++; /* Can treat like ':' as far as spacing is concerned */
2280    
2281     if (ptr[2] != '?' || strchr("=!<", ptr[3]) == NULL)
2282     {
2283     ptr += 2; /* To get right offset in message */
2284     *errorptr = ERR28;
2285     goto PCRE_ERROR_RETURN;
2286     }
2287 nigel 3 }
2288 nigel 23 break;
2289    
2290     /* Else loop checking valid options until ) is met. Anything else is an
2291     error. If we are without any brackets, i.e. at top level, the settings
2292     act as if specified in the options, so massage the options immediately.
2293     This is for backward compatibility with Perl 5.004. */
2294    
2295     default:
2296     set = unset = 0;
2297     optset = &set;
2298     ptr += 2;
2299    
2300     for (;; ptr++)
2301 nigel 3 {
2302 nigel 23 c = *ptr;
2303     switch (c)
2304     {
2305     case 'i':
2306     *optset |= PCRE_CASELESS;
2307     continue;
2308    
2309     case 'm':
2310     *optset |= PCRE_MULTILINE;
2311     continue;
2312    
2313     case 's':
2314     *optset |= PCRE_DOTALL;
2315     continue;
2316    
2317     case 'x':
2318     *optset |= PCRE_EXTENDED;
2319     continue;
2320    
2321     case 'X':
2322     *optset |= PCRE_EXTRA;
2323     continue;
2324    
2325     case 'U':
2326     *optset |= PCRE_UNGREEDY;
2327     continue;
2328    
2329     case '-':
2330     optset = &unset;
2331     continue;
2332    
2333     /* A termination by ')' indicates an options-setting-only item;
2334     this is global at top level; otherwise nothing is done here and
2335     it is handled during the compiling process on a per-bracket-group
2336     basis. */
2337    
2338     case ')':
2339     if (brastackptr == 0)
2340     {
2341     options = (options | set) & (~unset);
2342     set = unset = 0; /* To save length */
2343     }
2344     /* Fall through */
2345    
2346     /* A termination by ':' indicates the start of a nested group with
2347     the given options set. This is again handled at compile time, but
2348     we must allow for compiled space if any of the ims options are
2349     set. We also have to allow for resetting space at the end of
2350     the group, which is why 4 is added to the length and not just 2.
2351     If there are several changes of options within the same group, this
2352     will lead to an over-estimate on the length, but this shouldn't
2353     matter very much. We also have to allow for resetting options at
2354     the start of any alternations, which we do by setting
2355 nigel 37 branch_newextra to 2. Finally, we record whether the case-dependent
2356     flag ever changes within the regex. This is used by the "required
2357     character" code. */
2358 nigel 23
2359     case ':':
2360     if (((set|unset) & PCRE_IMS) != 0)
2361     {
2362     length += 4;
2363     branch_newextra = 2;
2364 nigel 37 if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
2365 nigel 23 }
2366     goto END_OPTIONS;
2367    
2368     /* Unrecognized option character */
2369    
2370     default:
2371     *errorptr = ERR12;
2372     goto PCRE_ERROR_RETURN;
2373     }
2374 nigel 3 }
2375 nigel 23
2376     /* If we hit a closing bracket, that's it - this is a freestanding
2377     option-setting. We need to ensure that branch_extra is updated if
2378     necessary. The only values branch_newextra can have here are 0 or 2.
2379     If the value is 2, then branch_extra must either be 2 or 5, depending
2380     on whether this is a lookbehind group or not. */
2381    
2382     END_OPTIONS:
2383     if (c == ')')
2384 nigel 19 {
2385 nigel 23 if (branch_newextra == 2 && (branch_extra == 0 || branch_extra == 3))
2386     branch_extra += branch_newextra;
2387 nigel 19 continue;
2388     }
2389 nigel 3
2390 nigel 23 /* If options were terminated by ':' control comes here. Fall through
2391     to handle the group below. */
2392 nigel 3 }
2393     }
2394    
2395     /* Extracting brackets must be counted so we can process escapes in a
2396     Perlish way. */
2397    
2398     else bracount++;
2399    
2400     /* Non-special forms of bracket. Save length for computing whole length
2401 nigel 23 at end if there's a repeat that requires duplication of the group. Also
2402     save the current value of branch_extra, and start the new group with
2403     the new value. If non-zero, this will either be 2 for a (?imsx: group, or 3
2404     for a lookbehind assertion. */
2405 nigel 3
2406     if (brastackptr >= sizeof(brastack)/sizeof(int))
2407     {
2408     *errorptr = ERR19;
2409     goto PCRE_ERROR_RETURN;
2410     }
2411    
2412 nigel 23 bralenstack[brastackptr] = branch_extra;
2413     branch_extra = branch_newextra;
2414    
2415 nigel 3 brastack[brastackptr++] = length;
2416     length += 3;
2417     continue;
2418    
2419     /* Handle ket. Look for subsequent max/min; for certain sets of values we
2420 nigel 9 have to replicate this bracket up to that many times. If brastackptr is
2421     0 this is an unmatched bracket which will generate an error, but take care
2422 nigel 23 not to try to access brastack[-1] when computing the length and restoring
2423     the branch_extra value. */
2424 nigel 3
2425     case ')':
2426     length += 3;
2427     {
2428 nigel 9 int minval = 1;
2429     int maxval = 1;
2430 nigel 23 int duplength;
2431 nigel 3
2432 nigel 23 if (brastackptr > 0)
2433     {
2434     duplength = length - brastack[--brastackptr];
2435     branch_extra = bralenstack[brastackptr];
2436     }
2437     else duplength = 0;
2438    
2439 nigel 3 /* Leave ptr at the final char; for read_repeat_counts this happens
2440     automatically; for the others we need an increment. */
2441    
2442 nigel 25 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
2443 nigel 3 {
2444 nigel 25 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr,
2445     &compile_block);
2446 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2447     }
2448 nigel 9 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2449     else if (c == '+') { maxval = -1; ptr++; }
2450     else if (c == '?') { minval = 0; ptr++; }
2451 nigel 3
2452 nigel 31 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2453     group, and if the maximum is greater than zero, we have to replicate
2454     maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2455     bracket set - hence the 7. */
2456 nigel 3
2457 nigel 31 if (minval == 0)
2458     {
2459     length++;
2460     if (maxval > 0) length += (maxval - 1) * (duplength + 7);
2461     }
2462    
2463     /* When the minimum is greater than zero, 1 we have to replicate up to
2464     minval-1 times, with no additions required in the copies. Then, if
2465     there is a limited maximum we have to replicate up to maxval-1 times
2466     allowing for a BRAZERO item before each optional copy and nesting
2467     brackets for all but one of the optional copies. */
2468    
2469     else
2470     {
2471     length += (minval - 1) * duplength;
2472     if (maxval > minval) /* Need this test as maxval=-1 means no limit */
2473     length += (maxval - minval) * (duplength + 7) - 6;
2474     }
2475 nigel 3 }
2476     continue;
2477    
2478     /* Non-special character. For a run of such characters the length required
2479     is the number of characters + 2, except that the maximum run length is 255.
2480     We won't get a skipped space or a non-data escape or the start of a #
2481     comment as the first character, so the length can't be zero. */
2482    
2483     NORMAL_CHAR:
2484     default:
2485     length += 2;
2486     runlength = 0;
2487     do
2488     {
2489 nigel 23 if ((options & PCRE_EXTENDED) != 0)
2490 nigel 3 {
2491 nigel 25 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2492 nigel 23 if (c == '#')
2493     {
2494     while ((c = *(++ptr)) != 0 && c != '\n');
2495     continue;
2496     }
2497 nigel 3 }
2498    
2499     /* Backslash may introduce a data char or a metacharacter; stop the
2500     string before the latter. */
2501    
2502     if (c == '\\')
2503     {
2504 nigel 7 const uschar *saveptr = ptr;
2505 nigel 25 c = check_escape(&ptr, errorptr, bracount, options, FALSE,
2506     &compile_block);
2507 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2508     if (c < 0) { ptr = saveptr; break; }
2509     }
2510    
2511     /* Ordinary character or single-char escape */
2512    
2513     runlength++;
2514     }
2515    
2516     /* This "while" is the end of the "do" above. */
2517    
2518 nigel 25 while (runlength < 255 &&
2519     (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
2520 nigel 3
2521     ptr--;
2522     length += runlength;
2523     continue;
2524     }
2525     }
2526    
2527     length += 4; /* For final KET and END */
2528    
2529     if (length > 65539)
2530     {
2531     *errorptr = ERR20;
2532     return NULL;
2533     }
2534    
2535     /* Compute the size of data block needed and get it, either from malloc or
2536 nigel 9 externally provided function. We specify "code[0]" in the offsetof() expression
2537     rather than just "code", because it has been reported that one broken compiler
2538     fails on "code" because it is also an independent variable. It should make no
2539     difference to the value of the offsetof(). */
2540 nigel 3
2541 nigel 9 size = length + offsetof(real_pcre, code[0]);
2542 nigel 3 re = (real_pcre *)(pcre_malloc)(size);
2543    
2544     if (re == NULL)
2545     {
2546     *errorptr = ERR21;
2547     return NULL;
2548     }
2549    
2550 nigel 9 /* Put in the magic number and the options. */
2551    
2552 nigel 3 re->magic_number = MAGIC_NUMBER;
2553     re->options = options;
2554 nigel 25 re->tables = tables;
2555 nigel 3
2556     /* Set up a starting, non-extracting bracket, then compile the expression. On
2557     error, *errorptr will be set non-NULL, so we don't need to look at the result
2558     of the function here. */
2559    
2560 nigel 7 ptr = (const uschar *)pattern;
2561 nigel 3 code = re->code;
2562     *code = OP_BRA;
2563     bracount = 0;
2564 nigel 25 (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,
2565 nigel 37 &reqchar, &countlits, &compile_block);
2566 nigel 3 re->top_bracket = bracount;
2567     re->top_backref = top_backref;
2568    
2569     /* If not reached end of pattern on success, there's an excess bracket. */
2570    
2571     if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
2572    
2573     /* Fill in the terminating state and check for disastrous overflow, but
2574     if debugging, leave the test till after things are printed out. */
2575    
2576     *code++ = OP_END;
2577    
2578     #ifndef DEBUG
2579     if (code - re->code > length) *errorptr = ERR23;
2580     #endif
2581    
2582 nigel 23 /* Give an error if there's back reference to a non-existent capturing
2583     subpattern. */
2584    
2585     if (top_backref > re->top_bracket) *errorptr = ERR15;
2586    
2587 nigel 3 /* Failed to compile */
2588    
2589     if (*errorptr != NULL)
2590     {
2591     (pcre_free)(re);
2592     PCRE_ERROR_RETURN:
2593 nigel 7 *erroroffset = ptr - (const uschar *)pattern;
2594 nigel 3 return NULL;
2595     }
2596    
2597 nigel 33 /* If the anchored option was not passed, set flag if we can determine that the
2598     pattern is anchored by virtue of ^ characters or \A or anything else (such as
2599     starting with .* when DOTALL is set).
2600 nigel 3
2601 nigel 33 Otherwise, see if we can determine what the first character has to be, because
2602     that speeds up unanchored matches no end. If not, see if we can set the
2603     PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
2604     start with ^. and also when all branches start with .* for non-DOTALL matches.
2605     */
2606    
2607 nigel 3 if ((options & PCRE_ANCHORED) == 0)
2608     {
2609 nigel 23 int temp_options = options;
2610     if (is_anchored(re->code, &temp_options))
2611 nigel 3 re->options |= PCRE_ANCHORED;
2612     else
2613     {
2614 nigel 23 int ch = find_firstchar(re->code, &temp_options);
2615 nigel 9 if (ch >= 0)
2616 nigel 3 {
2617 nigel 9 re->first_char = ch;
2618 nigel 3 re->options |= PCRE_FIRSTSET;
2619     }
2620     else if (is_startline(re->code))
2621     re->options |= PCRE_STARTLINE;
2622     }
2623     }
2624    
2625 nigel 37 /* Save the last required character if there are at least two literal
2626     characters on all paths, or if there is no first character setting. */
2627    
2628     if (reqchar >= 0 && (countlits > 1 || (re->options & PCRE_FIRSTSET) == 0))
2629     {
2630     re->req_char = reqchar;
2631     re->options |= PCRE_REQCHSET;
2632     }
2633    
2634 nigel 3 /* Print out the compiled data for debugging */
2635    
2636     #ifdef DEBUG
2637    
2638 nigel 23 printf("Length = %d top_bracket = %d top_backref = %d\n",
2639 nigel 3 length, re->top_bracket, re->top_backref);
2640    
2641     if (re->options != 0)
2642     {
2643 nigel 37 printf("%s%s%s%s%s%s%s%s%s\n",
2644 nigel 3 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2645     ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2646 nigel 37 ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
2647 nigel 3 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2648     ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2649     ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
2650     ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
2651 nigel 19 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
2652     ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
2653 nigel 3 }
2654    
2655     if ((re->options & PCRE_FIRSTSET) != 0)
2656     {
2657     if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
2658     else printf("First char = \\x%02x\n", re->first_char);
2659     }
2660    
2661 nigel 37 if ((re->options & PCRE_REQCHSET) != 0)
2662     {
2663     if (isprint(re->req_char)) printf("Req char = %c\n", re->req_char);
2664     else printf("Req char = \\x%02x\n", re->req_char);
2665     }
2666    
2667 nigel 3 code_end = code;
2668     code_base = code = re->code;
2669    
2670     while (code < code_end)
2671     {
2672     int charlength;
2673    
2674     printf("%3d ", code - code_base);
2675    
2676     if (*code >= OP_BRA)
2677     {
2678     printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
2679     code += 2;
2680     }
2681    
2682     else switch(*code)
2683     {
2684 nigel 23 case OP_OPT:
2685     printf(" %.2x %s", code[1], OP_names[*code]);
2686     code++;
2687     break;
2688    
2689     case OP_COND:
2690     printf("%3d Cond", (code[1] << 8) + code[2]);
2691     code += 2;
2692     break;
2693    
2694     case OP_CREF:
2695     printf(" %.2d %s", code[1], OP_names[*code]);
2696     code++;
2697     break;
2698    
2699 nigel 3 case OP_CHARS:
2700     charlength = *(++code);
2701     printf("%3d ", charlength);
2702     while (charlength-- > 0)
2703     if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2704     break;
2705    
2706     case OP_KETRMAX:
2707     case OP_KETRMIN:
2708     case OP_ALT:
2709     case OP_KET:
2710     case OP_ASSERT:
2711     case OP_ASSERT_NOT:
2712 nigel 23 case OP_ASSERTBACK:
2713     case OP_ASSERTBACK_NOT:
2714 nigel 3 case OP_ONCE:
2715     printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2716     code += 2;
2717     break;
2718    
2719 nigel 23 case OP_REVERSE:
2720     printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2721     code += 2;
2722     break;
2723    
2724 nigel 3 case OP_STAR:
2725     case OP_MINSTAR:
2726     case OP_PLUS:
2727     case OP_MINPLUS:
2728     case OP_QUERY:
2729     case OP_MINQUERY:
2730     case OP_TYPESTAR:
2731     case OP_TYPEMINSTAR:
2732     case OP_TYPEPLUS:
2733     case OP_TYPEMINPLUS:
2734     case OP_TYPEQUERY:
2735     case OP_TYPEMINQUERY:
2736     if (*code >= OP_TYPESTAR)
2737     printf(" %s", OP_names[code[1]]);
2738     else if (isprint(c = code[1])) printf(" %c", c);
2739     else printf(" \\x%02x", c);
2740     printf("%s", OP_names[*code++]);
2741     break;
2742    
2743     case OP_EXACT:
2744     case OP_UPTO:
2745     case OP_MINUPTO:
2746     if (isprint(c = code[3])) printf(" %c{", c);
2747     else printf(" \\x%02x{", c);
2748 nigel 11 if (*code != OP_EXACT) printf("0,");
2749 nigel 3 printf("%d}", (code[1] << 8) + code[2]);
2750     if (*code == OP_MINUPTO) printf("?");
2751     code += 3;
2752     break;
2753    
2754     case OP_TYPEEXACT:
2755     case OP_TYPEUPTO:
2756     case OP_TYPEMINUPTO:
2757     printf(" %s{", OP_names[code[3]]);
2758     if (*code != OP_TYPEEXACT) printf(",");
2759     printf("%d}", (code[1] << 8) + code[2]);
2760     if (*code == OP_TYPEMINUPTO) printf("?");
2761     code += 3;
2762     break;
2763    
2764     case OP_NOT:
2765     if (isprint(c = *(++code))) printf(" [^%c]", c);
2766     else printf(" [^\\x%02x]", c);
2767     break;
2768    
2769     case OP_NOTSTAR:
2770     case OP_NOTMINSTAR:
2771     case OP_NOTPLUS:
2772     case OP_NOTMINPLUS:
2773     case OP_NOTQUERY:
2774     case OP_NOTMINQUERY:
2775     if (isprint(c = code[1])) printf(" [^%c]", c);
2776     else printf(" [^\\x%02x]", c);
2777     printf("%s", OP_names[*code++]);
2778     break;
2779    
2780     case OP_NOTEXACT:
2781     case OP_NOTUPTO:
2782     case OP_NOTMINUPTO:
2783     if (isprint(c = code[3])) printf(" [^%c]{", c);
2784     else printf(" [^\\x%02x]{", c);
2785     if (*code != OP_NOTEXACT) printf(",");
2786     printf("%d}", (code[1] << 8) + code[2]);
2787     if (*code == OP_NOTMINUPTO) printf("?");
2788     code += 3;
2789     break;
2790    
2791     case OP_REF:
2792     printf(" \\%d", *(++code));
2793 nigel 9 code ++;
2794     goto CLASS_REF_REPEAT;
2795 nigel 3
2796     case OP_CLASS:
2797     {
2798     int i, min, max;
2799 nigel 23 code++;
2800     printf(" [");
2801 nigel 3
2802     for (i = 0; i < 256; i++)
2803     {
2804     if ((code[i/8] & (1 << (i&7))) != 0)
2805     {
2806     int j;
2807     for (j = i+1; j < 256; j++)
2808     if ((code[j/8] & (1 << (j&7))) == 0) break;
2809     if (i == '-' || i == ']') printf("\\");
2810     if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2811     if (--j > i)
2812     {
2813     printf("-");
2814     if (j == '-' || j == ']') printf("\\");
2815     if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2816     }
2817     i = j;
2818     }
2819     }
2820     printf("]");
2821     code += 32;
2822    
2823 nigel 9 CLASS_REF_REPEAT:
2824    
2825 nigel 3 switch(*code)
2826     {
2827     case OP_CRSTAR:
2828     case OP_CRMINSTAR:
2829     case OP_CRPLUS:
2830     case OP_CRMINPLUS:
2831     case OP_CRQUERY:
2832     case OP_CRMINQUERY:
2833     printf("%s", OP_names[*code]);
2834     break;
2835    
2836     case OP_CRRANGE:
2837     case OP_CRMINRANGE:
2838     min = (code[1] << 8) + code[2];
2839     max = (code[3] << 8) + code[4];
2840     if (max == 0) printf("{%d,}", min);
2841     else printf("{%d,%d}", min, max);
2842     if (*code == OP_CRMINRANGE) printf("?");
2843     code += 4;
2844     break;
2845    
2846     default:
2847     code--;
2848     }
2849     }
2850     break;
2851    
2852     /* Anything else is just a one-node item */
2853    
2854     default:
2855     printf(" %s", OP_names[*code]);
2856     break;
2857     }
2858    
2859     code++;
2860     printf("\n");
2861     }
2862     printf("------------------------------------------------------------------\n");
2863    
2864     /* This check is done here in the debugging case so that the code that
2865     was compiled can be seen. */
2866    
2867     if (code - re->code > length)
2868     {
2869     *errorptr = ERR23;
2870     (pcre_free)(re);
2871     *erroroffset = ptr - (uschar *)pattern;
2872     return NULL;
2873     }
2874     #endif
2875    
2876     return (pcre *)re;
2877     }
2878    
2879    
2880    
2881     /*************************************************
2882     * Match a back-reference *
2883     *************************************************/
2884    
2885 nigel 23 /* If a back reference hasn't been set, the length that is passed is greater
2886     than the number of characters left in the string, so the match fails.
2887 nigel 3
2888     Arguments:
2889 nigel 23 offset index into the offset vector
2890 nigel 3 eptr points into the subject
2891     length length to be matched
2892     md points to match data block
2893 nigel 23 ims the ims flags
2894 nigel 3
2895     Returns: TRUE if matched
2896     */
2897    
2898     static BOOL
2899 nigel 23 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
2900 nigel 37 unsigned long int ims)
2901 nigel 3 {
2902 nigel 23 const uschar *p = md->start_subject + md->offset_vector[offset];
2903 nigel 3
2904     #ifdef DEBUG
2905     if (eptr >= md->end_subject)
2906     printf("matching subject <null>");
2907     else
2908     {
2909     printf("matching subject ");
2910     pchars(eptr, length, TRUE, md);
2911     }
2912     printf(" against backref ");
2913     pchars(p, length, FALSE, md);
2914     printf("\n");
2915     #endif
2916    
2917     /* Always fail if not enough characters left */
2918    
2919 nigel 23 if (length > md->end_subject - eptr) return FALSE;
2920 nigel 3
2921     /* Separate the caselesss case for speed */
2922    
2923 nigel 23 if ((ims & PCRE_CASELESS) != 0)
2924 nigel 25 {
2925     while (length-- > 0)
2926     if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
2927     }
2928 nigel 3 else
2929     { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
2930    
2931     return TRUE;
2932     }
2933    
2934    
2935    
2936     /*************************************************
2937     * Match from current position *
2938     *************************************************/
2939    
2940 nigel 23 /* On entry ecode points to the first opcode, and eptr to the first character
2941     in the subject string, while eptrb holds the value of eptr at the start of the
2942     last bracketed group - used for breaking infinite loops matching zero-length
2943     strings.
2944 nigel 3
2945     Arguments:
2946     eptr pointer in subject
2947     ecode position in code
2948     offset_top current top pointer
2949     md pointer to "static" info for the match
2950 nigel 23 ims current /i, /m, and /s options
2951     condassert TRUE if called to check a condition assertion
2952     eptrb eptr at start of last bracket
2953 nigel 3
2954     Returns: TRUE if matched
2955     */
2956    
2957     static BOOL
2958 nigel 23 match(register const uschar *eptr, register const uschar *ecode,
2959 nigel 37 int offset_top, match_data *md, unsigned long int ims, BOOL condassert,
2960     const uschar *eptrb)
2961 nigel 3 {
2962 nigel 37 unsigned long int original_ims = ims; /* Save for resetting on ')' */
2963 nigel 23
2964 nigel 3 for (;;)
2965     {
2966 nigel 23 int op = (int)*ecode;
2967 nigel 3 int min, max, ctype;
2968     register int i;
2969     register int c;
2970 nigel 7 BOOL minimize = FALSE;
2971 nigel 3
2972 nigel 23 /* Opening capturing bracket. If there is space in the offset vector, save
2973     the current subject position in the working slot at the top of the vector. We
2974     mustn't change the current values of the data slot, because they may be set
2975     from a previous iteration of this group, and be referred to by a reference
2976     inside the group.
2977 nigel 3
2978 nigel 23 If the bracket fails to match, we need to restore this value and also the
2979     values of the final offsets, in case they were set by a previous iteration of
2980     the same bracket.
2981    
2982     If there isn't enough space in the offset vector, treat this as if it were a
2983     non-capturing bracket. Don't worry about setting the flag for the error case
2984     here; that is handled in the code for KET. */
2985    
2986     if (op > OP_BRA)
2987 nigel 3 {
2988 nigel 23 int number = op - OP_BRA;
2989     int offset = number << 1;
2990 nigel 3
2991 nigel 31 #ifdef DEBUG
2992     printf("start bracket %d subject=", number);
2993     pchars(eptr, 16, TRUE, md);
2994     printf("\n");
2995     #endif
2996 nigel 3
2997 nigel 23 if (offset < md->offset_max)
2998 nigel 3 {
2999 nigel 23 int save_offset1 = md->offset_vector[offset];
3000     int save_offset2 = md->offset_vector[offset+1];
3001     int save_offset3 = md->offset_vector[md->offset_end - number];
3002 nigel 3
3003 nigel 23 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
3004     md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
3005    
3006     do
3007     {
3008     if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3009     ecode += (ecode[1] << 8) + ecode[2];
3010     }
3011     while (*ecode == OP_ALT);
3012    
3013     DPRINTF(("bracket %d failed\n", number));
3014    
3015     md->offset_vector[offset] = save_offset1;
3016     md->offset_vector[offset+1] = save_offset2;
3017     md->offset_vector[md->offset_end - number] = save_offset3;
3018     return FALSE;
3019 nigel 3 }
3020    
3021 nigel 23 /* Insufficient room for saving captured contents */
3022 nigel 3
3023 nigel 23 else op = OP_BRA;
3024     }
3025    
3026     /* Other types of node can be handled by a switch */
3027    
3028     switch(op)
3029     {
3030     case OP_BRA: /* Non-capturing bracket: optimized */
3031     DPRINTF(("start bracket 0\n"));
3032 nigel 3 do
3033     {
3034 nigel 23 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3035 nigel 3 ecode += (ecode[1] << 8) + ecode[2];
3036     }
3037     while (*ecode == OP_ALT);
3038 nigel 23 DPRINTF(("bracket 0 failed\n"));
3039     return FALSE;
3040 nigel 3
3041 nigel 23 /* Conditional group: compilation checked that there are no more than
3042     two branches. If the condition is false, skipping the first branch takes us
3043     past the end if there is only one branch, but that's OK because that is
3044     exactly what going to the ket would do. */
3045 nigel 3
3046 nigel 23 case OP_COND:
3047     if (ecode[3] == OP_CREF) /* Condition is extraction test */
3048 nigel 3 {
3049 nigel 23 int offset = ecode[4] << 1; /* Doubled reference number */
3050     return match(eptr,
3051     ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
3052     5 : 3 + (ecode[1] << 8) + ecode[2]),
3053     offset_top, md, ims, FALSE, eptr);
3054 nigel 3 }
3055    
3056 nigel 23 /* The condition is an assertion. Call match() to evaluate it - setting
3057     the final argument TRUE causes it to stop at the end of an assertion. */
3058 nigel 3
3059 nigel 23 else
3060     {
3061     if (match(eptr, ecode+3, offset_top, md, ims, TRUE, NULL))
3062     {
3063     ecode += 3 + (ecode[4] << 8) + ecode[5];
3064     while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
3065     }
3066     else ecode += (ecode[1] << 8) + ecode[2];
3067     return match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr);
3068     }
3069     /* Control never reaches here */
3070 nigel 3
3071 nigel 23 /* Skip over conditional reference data if encountered (should not be) */
3072    
3073     case OP_CREF:
3074     ecode += 2;
3075     break;
3076    
3077 nigel 37 /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
3078     an empty string - recursion will then try other alternatives, if any. */
3079 nigel 23
3080 nigel 3 case OP_END:
3081 nigel 37 if (md->notempty && eptr == md->start_match) return FALSE;
3082 nigel 3 md->end_match_ptr = eptr; /* Record where we ended */
3083     md->end_offset_top = offset_top; /* and how many extracts were taken */
3084     return TRUE;
3085    
3086 nigel 23 /* Change option settings */
3087 nigel 3
3088 nigel 23 case OP_OPT:
3089     ims = ecode[1];
3090     ecode += 2;
3091     DPRINTF(("ims set to %02x\n", ims));
3092     break;
3093 nigel 3
3094     /* Assertion brackets. Check the alternative branches in turn - the
3095     matching won't pass the KET for an assertion. If any one branch matches,
3096 nigel 23 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
3097     start of each branch to move the current point backwards, so the code at
3098     this level is identical to the lookahead case. */
3099 nigel 3
3100     case OP_ASSERT:
3101 nigel 23 case OP_ASSERTBACK:
3102 nigel 3 do
3103     {
3104 nigel 23 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) break;
3105 nigel 3 ecode += (ecode[1] << 8) + ecode[2];
3106     }
3107     while (*ecode == OP_ALT);
3108     if (*ecode == OP_KET) return FALSE;
3109    
3110 nigel 23 /* If checking an assertion for a condition, return TRUE. */
3111    
3112     if (condassert) return TRUE;
3113    
3114 nigel 3 /* Continue from after the assertion, updating the offsets high water
3115     mark, since extracts may have been taken during the assertion. */
3116    
3117     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3118     ecode += 3;
3119     offset_top = md->end_offset_top;
3120     continue;
3121    
3122     /* Negative assertion: all branches must fail to match */
3123    
3124     case OP_ASSERT_NOT:
3125 nigel 23 case OP_ASSERTBACK_NOT:
3126 nigel 3 do
3127     {
3128 nigel 23 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) return FALSE;
3129 nigel 3 ecode += (ecode[1] << 8) + ecode[2];
3130     }
3131     while (*ecode == OP_ALT);
3132 nigel 23
3133     if (condassert) return TRUE;
3134 nigel 3 ecode += 3;
3135     continue;
3136    
3137 nigel 23 /* Move the subject pointer back. This occurs only at the start of
3138     each branch of a lookbehind assertion. If we are too close to the start to
3139     move back, this match function fails. */
3140    
3141     case OP_REVERSE:
3142     eptr -= (ecode[1] << 8) + ecode[2];
3143     if (eptr < md->start_subject) return FALSE;
3144     ecode += 3;
3145     break;
3146    
3147    
3148 nigel 3 /* "Once" brackets are like assertion brackets except that after a match,
3149     the point in the subject string is not moved back. Thus there can never be
3150 nigel 5 a move back into the brackets. Check the alternative branches in turn - the
3151 nigel 3 matching won't pass the KET for this kind of subpattern. If any one branch
3152 nigel 23 matches, we carry on as at the end of a normal bracket, leaving the subject
3153     pointer. */
3154 nigel 3
3155     case OP_ONCE:
3156     {
3157 nigel 23 const uschar *prev = ecode;
3158 nigel 3
3159 nigel 23 do
3160     {
3161     if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) break;
3162     ecode += (ecode[1] << 8) + ecode[2];
3163     }
3164     while (*ecode == OP_ALT);
3165 nigel 3
3166 nigel 23 /* If hit the end of the group (which could be repeated), fail */
3167 nigel 3
3168 nigel 23 if (*ecode != OP_ONCE && *ecode != OP_ALT) return FALSE;
3169    
3170     /* Continue as from after the assertion, updating the offsets high water
3171     mark, since extracts may have been taken. */
3172    
3173     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3174    
3175     offset_top = md->end_offset_top;
3176     eptr = md->end_match_ptr;
3177    
3178     /* For a non-repeating ket, just continue at this level. This also
3179     happens for a repeating ket if no characters were matched in the group.
3180     This is the forcible breaking of infinite loops as implemented in Perl
3181     5.005. If there is an options reset, it will get obeyed in the normal
3182     course of events. */
3183    
3184     if (*ecode == OP_KET || eptr == eptrb)
3185     {
3186     ecode += 3;
3187     break;
3188     }
3189    
3190     /* The repeating kets try the rest of the pattern or restart from the
3191     preceding bracket, in the appropriate order. We need to reset any options
3192     that changed within the bracket before re-running it, so check the next
3193     opcode. */
3194    
3195     if (ecode[3] == OP_OPT)
3196     {
3197     ims = (ims & ~PCRE_IMS) | ecode[4];
3198     DPRINTF(("ims set to %02x at group repeat\n", ims));
3199     }
3200    
3201     if (*ecode == OP_KETRMIN)
3202     {
3203     if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||
3204     match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;
3205     }
3206     else /* OP_KETRMAX */
3207     {
3208     if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||
3209     match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3210     }
3211     }
3212     return FALSE;
3213    
3214 nigel 3 /* An alternation is the end of a branch; scan along to find the end of the
3215     bracketed group and go to there. */
3216    
3217     case OP_ALT:
3218     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3219     break;
3220    
3221     /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3222     that it may occur zero times. It may repeat infinitely, or not at all -
3223     i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3224     repeat limits are compiled as a number of copies, with the optional ones
3225     preceded by BRAZERO or BRAMINZERO. */
3226    
3227     case OP_BRAZERO:
3228     {
3229 nigel 7 const uschar *next = ecode+1;
3230 nigel 23 if (match(eptr, next, offset_top, md, ims, FALSE, eptr)) return TRUE;
3231 nigel 3 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3232     ecode = next + 3;
3233     }
3234     break;
3235    
3236     case OP_BRAMINZERO:
3237     {
3238 nigel 7 const uschar *next = ecode+1;
3239 nigel 3 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3240 nigel 23 if (match(eptr, next+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3241 nigel 3 ecode++;
3242     }
3243 nigel 23 break;
3244 nigel 3
3245     /* End of a group, repeated or non-repeating. If we are at the end of
3246     an assertion "group", stop matching and return TRUE, but record the
3247 nigel 23 current high water mark for use by positive assertions. Do this also
3248     for the "once" (not-backup up) groups. */
3249 nigel 3
3250     case OP_KET:
3251     case OP_KETRMIN:
3252     case OP_KETRMAX:
3253     {
3254 nigel 7 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3255 nigel 3
3256 nigel 23 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
3257     *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
3258     *prev == OP_ONCE)
3259 nigel 3 {
3260     md->end_match_ptr = eptr; /* For ONCE */
3261     md->end_offset_top = offset_top;
3262     return TRUE;
3263     }
3264    
3265 nigel 23 /* In all other cases except a conditional group we have to check the
3266     group number back at the start and if necessary complete handling an
3267     extraction by setting the offsets and bumping the high water mark. */
3268 nigel 3
3269 nigel 23 if (*prev != OP_COND)
3270     {
3271     int number = *prev - OP_BRA;
3272     int offset = number << 1;
3273 nigel 3
3274 nigel 23 DPRINTF(("end bracket %d\n", number));
3275 nigel 3
3276 nigel 23 if (number > 0)
3277 nigel 3 {
3278 nigel 23 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
3279     {
3280     md->offset_vector[offset] =
3281     md->offset_vector[md->offset_end - number];
3282     md->offset_vector[offset+1] = eptr - md->start_subject;
3283     if (offset_top <= offset) offset_top = offset + 2;
3284     }
3285 nigel 3 }
3286     }
3287    
3288 nigel 23 /* Reset the value of the ims flags, in case they got changed during
3289     the group. */
3290 nigel 3
3291 nigel 23 ims = original_ims;
3292     DPRINTF(("ims reset to %02x\n", ims));
3293    
3294     /* For a non-repeating ket, just continue at this level. This also
3295     happens for a repeating ket if no characters were matched in the group.
3296     This is the forcible breaking of infinite loops as implemented in Perl
3297     5.005. If there is an options reset, it will get obeyed in the normal
3298     course of events. */
3299    
3300     if (*ecode == OP_KET || eptr == eptrb)
3301 nigel 3 {
3302     ecode += 3;
3303     break;
3304     }
3305    
3306     /* The repeating kets try the rest of the pattern or restart from the
3307     preceding bracket, in the appropriate order. */
3308    
3309     if (*ecode == OP_KETRMIN)
3310     {
3311 nigel 23 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||
3312     match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;
3313 nigel 3 }
3314     else /* OP_KETRMAX */
3315     {
3316 nigel 23 if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||
3317     match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3318 nigel 3 }
3319     }
3320     return FALSE;
3321    
3322     /* Start of subject unless notbol, or after internal newline if multiline */
3323    
3324     case OP_CIRC:
3325     if (md->notbol && eptr == md->start_subject) return FALSE;
3326 nigel 23 if ((ims & PCRE_MULTILINE) != 0)
3327 nigel 3 {
3328     if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;
3329     ecode++;
3330     break;
3331     }
3332     /* ... else fall through */
3333    
3334     /* Start of subject assertion */
3335    
3336     case OP_SOD:
3337     if (eptr != md->start_subject) return FALSE;
3338     ecode++;
3339     break;
3340    
3341 nigel 23 /* Assert before internal newline if multiline, or before a terminating
3342     newline unless endonly is set, else end of subject unless noteol is set. */
3343 nigel 3
3344     case OP_DOLL:
3345 nigel 23 if ((ims & PCRE_MULTILINE) != 0)
3346 nigel 3 {
3347 nigel 23 if (eptr < md->end_subject) { if (*eptr != '\n') return FALSE; }
3348     else { if (md->noteol) return FALSE; }
3349 nigel 3 ecode++;
3350     break;
3351     }
3352 nigel 23 else
3353 nigel 3 {
3354 nigel 23 if (md->noteol) return FALSE;
3355     if (!md->endonly)
3356     {
3357     if (eptr < md->end_subject - 1 ||
3358     (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3359    
3360     ecode++;
3361     break;
3362     }
3363 nigel 3 }
3364     /* ... else fall through */
3365    
3366 nigel 23 /* End of subject assertion (\z) */
3367 nigel 3
3368     case OP_EOD:
3369     if (eptr < md->end_subject) return FALSE;
3370     ecode++;
3371     break;
3372    
3373 nigel 23 /* End of subject or ending \n assertion (\Z) */
3374    
3375     case OP_EODN:
3376     if (eptr < md->end_subject - 1 ||
3377     (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3378     ecode++;
3379     break;
3380    
3381 nigel 3 /* Word boundary assertions */
3382    
3383     case OP_NOT_WORD_BOUNDARY:
3384     case OP_WORD_BOUNDARY:
3385     {
3386     BOOL prev_is_word = (eptr != md->start_subject) &&
3387 nigel 25 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
3388 nigel 3 BOOL cur_is_word = (eptr < md->end_subject) &&
3389 nigel 25 ((md->ctypes[*eptr] & ctype_word) != 0);
3390 nigel 3 if ((*ecode++ == OP_WORD_BOUNDARY)?
3391     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3392     return FALSE;
3393     }
3394     break;
3395    
3396     /* Match a single character type; inline for speed */
3397    
3398     case OP_ANY:
3399 nigel 23 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == '\n')
3400     return FALSE;
3401 nigel 3 if (eptr++ >= md->end_subject) return FALSE;
3402     ecode++;
3403     break;
3404    
3405     case OP_NOT_DIGIT:
3406 nigel 25 if (eptr >= md->end_subject ||
3407     (md->ctypes[*eptr++] & ctype_digit) != 0)
3408 nigel 3 return FALSE;
3409     ecode++;
3410     break;
3411    
3412     case OP_DIGIT:
3413 nigel 25 if (eptr >= md->end_subject ||
3414     (md->ctypes[*eptr++] & ctype_digit) == 0)
3415 nigel 3 return FALSE;
3416     ecode++;
3417     break;
3418    
3419     case OP_NOT_WHITESPACE:
3420 nigel 25 if (eptr >= md->end_subject ||
3421     (md->ctypes[*eptr++] & ctype_space) != 0)
3422 nigel 3 return FALSE;
3423     ecode++;
3424     break;
3425    
3426     case OP_WHITESPACE:
3427 nigel 25 if (eptr >= md->end_subject ||
3428     (md->ctypes[*eptr++] & ctype_space) == 0)
3429 nigel 3 return FALSE;
3430     ecode++;
3431     break;
3432    
3433     case OP_NOT_WORDCHAR:
3434 nigel 25 if (eptr >= md->end_subject ||
3435     (md->ctypes[*eptr++] & ctype_word) != 0)
3436 nigel 3 return FALSE;
3437     ecode++;
3438     break;
3439    
3440     case OP_WORDCHAR:
3441 nigel 25 if (eptr >= md->end_subject ||
3442     (md->ctypes[*eptr++] & ctype_word) == 0)
3443 nigel 3 return FALSE;
3444     ecode++;
3445     break;
3446    
3447     /* Match a back reference, possibly repeatedly. Look past the end of the
3448     item to see if there is repeat information following. The code is similar
3449     to that for character classes, but repeated for efficiency. Then obey
3450     similar code to character type repeats - written out again for speed.
3451     However, if the referenced string is the empty string, always treat
3452     it as matched, any number of times (otherwise there could be infinite
3453     loops). */
3454    
3455     case OP_REF:
3456     {
3457     int length;
3458 nigel 23 int offset = ecode[1] << 1; /* Doubled reference number */
3459 nigel 3 ecode += 2; /* Advance past the item */
3460    
3461 nigel 23 /* If the reference is unset, set the length to be longer than the amount
3462     of subject left; this ensures that every attempt at a match fails. We
3463     can't just fail here, because of the possibility of quantifiers with zero
3464     minima. */
3465 nigel 3
3466 nigel 23 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
3467     md->end_subject - eptr + 1 :
3468     md->offset_vector[offset+1] - md->offset_vector[offset];
3469 nigel 3
3470 nigel 23 /* Set up for repetition, or handle the non-repeated case */
3471    
3472 nigel 3 switch (*ecode)
3473     {
3474     case OP_CRSTAR:
3475     case OP_CRMINSTAR:
3476     case OP_CRPLUS:
3477     case OP_CRMINPLUS:
3478     case OP_CRQUERY:
3479     case OP_CRMINQUERY:
3480     c = *ecode++ - OP_CRSTAR;
3481     minimize = (c & 1) != 0;
3482     min = rep_min[c]; /* Pick up values from tables; */
3483     max = rep_max[c]; /* zero for max => infinity */
3484     if (max == 0) max = INT_MAX;
3485     break;
3486    
3487     case OP_CRRANGE:
3488     case OP_CRMINRANGE:
3489     minimize = (*ecode == OP_CRMINRANGE);
3490     min = (ecode[1] << 8) + ecode[2];
3491     max = (ecode[3] << 8) + ecode[4];
3492     if (max == 0) max = INT_MAX;
3493     ecode += 5;
3494     break;
3495    
3496     default: /* No repeat follows */
3497 nigel 23 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
3498 nigel 3 eptr += length;
3499     continue; /* With the main loop */
3500     }
3501    
3502     /* If the length of the reference is zero, just continue with the
3503     main loop. */
3504    
3505     if (length == 0) continue;
3506    
3507     /* First, ensure the minimum number of matches are present. We get back
3508     the length of the reference string explicitly rather than passing the
3509     address of eptr, so that eptr can be a register variable. */
3510    
3511     for (i = 1; i <= min; i++)
3512     {
3513 nigel 23 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
3514 nigel 3 eptr += length;
3515     }
3516    
3517     /* If min = max, continue at the same level without recursion.
3518     They are not both allowed to be zero. */
3519    
3520     if (min == max) continue;
3521    
3522     /* If minimizing, keep trying and advancing the pointer */
3523    
3524     if (minimize)
3525     {
3526     for (i = min;; i++)
3527     {
3528 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3529     return TRUE;
3530     if (i >= max || !match_ref(offset, eptr, length, md, ims))
3531 nigel 3 return FALSE;
3532     eptr += length;
3533     }
3534     /* Control never gets here */
3535     }
3536    
3537     /* If maximizing, find the longest string and work backwards */
3538    
3539     else
3540     {
3541 nigel 7 const uschar *pp = eptr;
3542 nigel 3 for (i = min; i < max; i++)
3543     {
3544 nigel 23 if (!match_ref(offset, eptr, length, md, ims)) break;
3545 nigel 3 eptr += length;
3546     }
3547     while (eptr >= pp)
3548     {
3549 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3550     return TRUE;
3551 nigel 3 eptr -= length;
3552     }
3553     return FALSE;
3554     }
3555     }
3556     /* Control never gets here */
3557    
3558 nigel 23
3559    
3560 nigel 3 /* Match a character class, possibly repeatedly. Look past the end of the
3561     item to see if there is repeat information following. Then obey similar
3562 nigel 23 code to character type repeats - written out again for speed. */
3563 nigel 3
3564     case OP_CLASS:
3565     {
3566 nigel 7 const uschar *data = ecode + 1; /* Save for matching */
3567     ecode += 33; /* Advance past the item */
3568 nigel 3
3569     switch (*ecode)
3570     {
3571     case OP_CRSTAR:
3572     case OP_CRMINSTAR:
3573     case OP_CRPLUS:
3574     case OP_CRMINPLUS:
3575     case OP_CRQUERY:
3576     case OP_CRMINQUERY:
3577     c = *ecode++ - OP_CRSTAR;
3578     minimize = (c & 1) != 0;
3579     min = rep_min[c]; /* Pick up values from tables; */
3580     max = rep_max[c]; /* zero for max => infinity */
3581     if (max == 0) max = INT_MAX;
3582     break;
3583    
3584     case OP_CRRANGE:
3585     case OP_CRMINRANGE:
3586     minimize = (*ecode == OP_CRMINRANGE);
3587     min = (ecode[1] << 8) + ecode[2];
3588     max = (ecode[3] << 8) + ecode[4];
3589     if (max == 0) max = INT_MAX;
3590     ecode += 5;
3591     break;
3592    
3593     default: /* No repeat follows */
3594 nigel 13 min = max = 1;
3595     break;
3596 nigel 3 }
3597    
3598     /* First, ensure the minimum number of matches are present. */
3599    
3600     for (i = 1; i <= min; i++)
3601     {
3602     if (eptr >= md->end_subject) return FALSE;
3603     c = *eptr++;
3604 nigel 23 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3605 nigel 3 return FALSE;
3606     }
3607    
3608     /* If max == min we can continue with the main loop without the
3609     need to recurse. */
3610    
3611     if (min == max) continue;
3612    
3613     /* If minimizing, keep testing the rest of the expression and advancing
3614     the pointer while it matches the class. */
3615    
3616     if (minimize)
3617     {
3618     for (i = min;; i++)
3619     {
3620 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3621     return TRUE;
3622 nigel 3 if (i >= max || eptr >= md->end_subject) return FALSE;
3623     c = *eptr++;
3624 nigel 23 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3625 nigel 3 return FALSE;
3626     }
3627     /* Control never gets here */
3628     }
3629    
3630     /* If maximizing, find the longest possible run, then work backwards. */
3631    
3632     else
3633     {
3634 nigel 7 const uschar *pp = eptr;
3635 nigel 3 for (i = min; i < max; eptr++, i++)
3636     {
3637     if (eptr >= md->end_subject) break;
3638     c = *eptr;
3639 nigel 23 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3640 nigel 3 break;
3641     }
3642    
3643     while (eptr >= pp)
3644 nigel 23 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3645     return TRUE;
3646 nigel 3 return FALSE;
3647     }
3648     }
3649     /* Control never gets here */
3650    
3651     /* Match a run of characters */
3652    
3653     case OP_CHARS:
3654     {
3655     register int length = ecode[1];
3656     ecode += 2;
3657    
3658 nigel 9 #ifdef DEBUG /* Sigh. Some compilers never learn. */
3659 nigel 3 if (eptr >= md->end_subject)
3660     printf("matching subject <null> against pattern ");
3661     else
3662     {
3663     printf("matching subject ");
3664     pchars(eptr, length, TRUE, md);
3665     printf(" against pattern ");
3666     }
3667     pchars(ecode, length, FALSE, md);
3668     printf("\n");
3669 nigel 9 #endif
3670 nigel 3
3671     if (length > md->end_subject - eptr) return FALSE;
3672 nigel 23 if ((ims & PCRE_CASELESS) != 0)
3673 nigel 3 {
3674 nigel 25 while (length-- > 0)
3675     if (md->lcc[*ecode++] != md->lcc[*eptr++])
3676     return FALSE;
3677 nigel 3 }
3678     else
3679     {
3680     while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
3681     }
3682     }
3683     break;
3684    
3685     /* Match a single character repeatedly; different opcodes share code. */
3686    
3687     case OP_EXACT:
3688     min = max = (ecode[1] << 8) + ecode[2];
3689     ecode += 3;
3690     goto REPEATCHAR;
3691    
3692     case OP_UPTO:
3693     case OP_MINUPTO:
3694     min = 0;
3695     max = (ecode[1] << 8) + ecode[2];
3696     minimize = *ecode == OP_MINUPTO;
3697     ecode += 3;
3698     goto REPEATCHAR;
3699    
3700     case OP_STAR:
3701     case OP_MINSTAR:
3702     case OP_PLUS:
3703     case OP_MINPLUS:
3704     case OP_QUERY:
3705     case OP_MINQUERY:
3706     c = *ecode++ - OP_STAR;
3707     minimize = (c & 1) != 0;
3708     min = rep_min[c]; /* Pick up values from tables; */
3709     max = rep_max[c]; /* zero for max => infinity */
3710     if (max == 0) max = INT_MAX;
3711    
3712     /* Common code for all repeated single-character matches. We can give
3713     up quickly if there are fewer than the minimum number of characters left in
3714     the subject. */
3715    
3716     REPEATCHAR:
3717     if (min > md->end_subject - eptr) return FALSE;
3718     c = *ecode++;
3719    
3720     /* The code is duplicated for the caseless and caseful cases, for speed,
3721     since matching characters is likely to be quite common. First, ensure the
3722     minimum number of matches are present. If min = max, continue at the same
3723     level without recursing. Otherwise, if minimizing, keep trying the rest of
3724     the expression and advancing one matching character if failing, up to the
3725     maximum. Alternatively, if maximizing, find the maximum number of
3726     characters and work backwards. */
3727    
3728 nigel 9 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
3729     max, eptr));
3730 nigel 3
3731 nigel 23 if ((ims & PCRE_CASELESS) != 0)
3732 nigel 3 {
3733 nigel 25 c = md->lcc[c];
3734     for (i = 1; i <= min; i++)
3735     if (c != md->lcc[*eptr++]) return FALSE;
3736 nigel 3 if (min == max) continue;
3737     if (minimize)
3738     {
3739     for (i = min;; i++)
3740     {
3741 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3742     return TRUE;
3743 nigel 25 if (i >= max || eptr >= md->end_subject ||
3744     c != md->lcc[*eptr++])
3745 nigel 3 return FALSE;
3746     }
3747     /* Control never gets here */
3748     }
3749     else
3750     {
3751 nigel 7 const uschar *pp = eptr;
3752 nigel 3 for (i = min; i < max; i++)
3753     {
3754 nigel 25 if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
3755 nigel 3 eptr++;
3756     }
3757     while (eptr >= pp)
3758 nigel 23 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3759     return TRUE;
3760 nigel 3 return FALSE;
3761     }
3762     /* Control never gets here */
3763     }
3764    
3765     /* Caseful comparisons */
3766    
3767     else
3768     {
3769     for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
3770     if (min == max) continue;
3771     if (minimize)
3772     {
3773     for (i = min;; i++)
3774     {
3775 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3776     return TRUE;
3777 nigel 3 if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
3778     }
3779     /* Control never gets here */
3780     }
3781     else
3782     {
3783 nigel 7 const uschar *pp = eptr;
3784 nigel 3 for (i = min; i < max; i++)
3785     {
3786     if (eptr >= md->end_subject || c != *eptr) break;
3787     eptr++;
3788     }
3789     while (eptr >= pp)
3790 nigel 23 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3791     return TRUE;
3792 nigel 3 return FALSE;
3793     }
3794     }
3795     /* Control never gets here */
3796    
3797     /* Match a negated single character */
3798    
3799     case OP_NOT:
3800 nigel 9 if (eptr >= md->end_subject) return FALSE;
3801 nigel 3 ecode++;
3802 nigel 23 if ((ims & PCRE_CASELESS) != 0)
3803 nigel 3 {
3804 nigel 25 if (md->lcc[*ecode++] == md->lcc[*eptr++]) return FALSE;
3805 nigel 3 }
3806     else
3807     {
3808     if (*ecode++ == *eptr++) return FALSE;
3809     }
3810     break;
3811    
3812     /* Match a negated single character repeatedly. This is almost a repeat of
3813     the code for a repeated single character, but I haven't found a nice way of
3814     commoning these up that doesn't require a test of the positive/negative
3815     option for each character match. Maybe that wouldn't add very much to the
3816     time taken, but character matching *is* what this is all about... */
3817    
3818     case OP_NOTEXACT:
3819     min = max = (ecode[1] << 8) + ecode[2];
3820     ecode += 3;
3821     goto REPEATNOTCHAR;
3822    
3823     case OP_NOTUPTO:
3824     case OP_NOTMINUPTO:
3825     min = 0;
3826     max = (ecode[1] << 8) + ecode[2];
3827     minimize = *ecode == OP_NOTMINUPTO;
3828     ecode += 3;
3829     goto REPEATNOTCHAR;
3830    
3831     case OP_NOTSTAR:
3832     case OP_NOTMINSTAR:
3833     case OP_NOTPLUS:
3834     case OP_NOTMINPLUS:
3835     case OP_NOTQUERY:
3836     case OP_NOTMINQUERY:
3837     c = *ecode++ - OP_NOTSTAR;
3838     minimize = (c & 1) != 0;
3839     min = rep_min[c]; /* Pick up values from tables; */
3840     max = rep_max[c]; /* zero for max => infinity */
3841     if (max == 0) max = INT_MAX;
3842    
3843     /* Common code for all repeated negated single-character matches. We can
3844     give up quickly if there are fewer than the minimum number of characters
3845     left in the subject. */
3846    
3847     REPEATNOTCHAR:
3848     if (min > md->end_subject - eptr) return FALSE;
3849     c = *ecode++;
3850    
3851     /* The code is duplicated for the caseless and caseful cases, for speed,
3852     since matching characters is likely to be quite common. First, ensure the
3853     minimum number of matches are present. If min = max, continue at the same
3854     level without recursing. Otherwise, if minimizing, keep trying the rest of
3855     the expression and advancing one matching character if failing, up to the
3856     maximum. Alternatively, if maximizing, find the maximum number of
3857     characters and work backwards. */
3858    
3859 nigel 9 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
3860     max, eptr));
3861 nigel 3
3862 nigel 23 if ((ims & PCRE_CASELESS) != 0)
3863 nigel 3 {
3864 nigel 25 c = md->lcc[c];
3865     for (i = 1; i <= min; i++)
3866     if (c == md->lcc[*eptr++]) return FALSE;
3867 nigel 3 if (min == max) continue;
3868     if (minimize)
3869     {
3870     for (i = min;; i++)
3871     {
3872 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3873     return TRUE;
3874 nigel 25 if (i >= max || eptr >= md->end_subject ||
3875     c == md->lcc[*eptr++])
3876 nigel 3 return FALSE;
3877     }
3878     /* Control never gets here */
3879     }
3880     else
3881     {
3882 nigel 7 const uschar *pp = eptr;
3883 nigel 3 for (i = min; i < max; i++)
3884     {
3885 nigel 25 if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
3886 nigel 3 eptr++;
3887     }
3888     while (eptr >= pp)
3889 nigel 23 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3890     return TRUE;
3891 nigel 3 return FALSE;
3892     }
3893     /* Control never gets here */
3894     }
3895    
3896     /* Caseful comparisons */
3897    
3898     else
3899     {
3900     for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
3901     if (min == max) continue;
3902     if (minimize)
3903     {
3904     for (i = min;; i++)
3905     {
3906 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3907     return TRUE;
3908 nigel 3 if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
3909     }
3910     /* Control never gets here */
3911     }
3912     else
3913     {
3914 nigel 7 const uschar *pp = eptr;
3915 nigel 3 for (i = min; i < max; i++)
3916     {
3917     if (eptr >= md->end_subject || c == *eptr) break;
3918     eptr++;
3919     }
3920     while (eptr >= pp)
3921 nigel 23 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3922     return TRUE;
3923 nigel 3 return FALSE;
3924     }
3925     }
3926     /* Control never gets here */
3927    
3928     /* Match a single character type repeatedly; several different opcodes
3929     share code. This is very similar to the code for single characters, but we
3930     repeat it in the interests of efficiency. */
3931    
3932     case OP_TYPEEXACT:
3933     min = max = (ecode[1] << 8) + ecode[2];
3934     minimize = TRUE;
3935     ecode += 3;
3936     goto REPEATTYPE;
3937    
3938     case OP_TYPEUPTO:
3939     case OP_TYPEMINUPTO:
3940     min = 0;
3941     max = (ecode[1] << 8) + ecode[2];
3942     minimize = *ecode == OP_TYPEMINUPTO;
3943     ecode += 3;
3944     goto REPEATTYPE;
3945    
3946     case OP_TYPESTAR:
3947     case OP_TYPEMINSTAR:
3948     case OP_TYPEPLUS:
3949     case OP_TYPEMINPLUS:
3950     case OP_TYPEQUERY:
3951     case OP_TYPEMINQUERY:
3952     c = *ecode++ - OP_TYPESTAR;
3953     minimize = (c & 1) != 0;
3954     min = rep_min[c]; /* Pick up values from tables; */
3955     max = rep_max[c]; /* zero for max => infinity */
3956     if (max == 0) max = INT_MAX;
3957    
3958     /* Common code for all repeated single character type matches */
3959    
3960     REPEATTYPE:
3961     ctype = *ecode++; /* Code for the character type */
3962    
3963     /* First, ensure the minimum number of matches are present. Use inline
3964     code for maximizing the speed, and do the type test once at the start
3965     (i.e. keep it out of the loop). Also test that there are at least the
3966     minimum number of characters before we start. */
3967    
3968     if (min > md->end_subject - eptr) return FALSE;
3969     if (min > 0) switch(ctype)
3970     {
3971     case OP_ANY:
3972 nigel 23 if ((ims & PCRE_DOTALL) == 0)
3973 nigel 3 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }
3974     else eptr += min;
3975     break;
3976    
3977     case OP_NOT_DIGIT:
3978     for (i = 1; i <= min; i++)
3979 nigel 25 if ((md->ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
3980 nigel 3 break;
3981    
3982     case OP_DIGIT:
3983     for (i = 1; i <= min; i++)
3984 nigel 25 if ((md->ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
3985 nigel 3 break;
3986    
3987     case OP_NOT_WHITESPACE:
3988     for (i = 1; i <= min; i++)
3989 nigel 25 if ((md->ctypes[*eptr++] & ctype_space) != 0) return FALSE;
3990 nigel 3 break;
3991    
3992     case OP_WHITESPACE:
3993     for (i = 1; i <= min; i++)
3994 nigel 25 if ((md->ctypes[*eptr++] & ctype_space) == 0) return FALSE;
3995 nigel 3 break;
3996    
3997     case OP_NOT_WORDCHAR:
3998 nigel 25 for (i = 1; i <= min; i++)
3999     if ((md->ctypes[*eptr++] & ctype_word) != 0)
4000     return FALSE;
4001 nigel 3 break;
4002    
4003     case OP_WORDCHAR:
4004 nigel 25 for (i = 1; i <= min; i++)
4005     if ((md->ctypes[*eptr++] & ctype_word) == 0)
4006     return FALSE;
4007 nigel 3 break;
4008     }
4009    
4010     /* If min = max, continue at the same level without recursing */
4011    
4012     if (min == max) continue;
4013    
4014     /* If minimizing, we have to test the rest of the pattern before each
4015 nigel 25 subsequent match. */
4016 nigel 3
4017     if (minimize)
4018     {
4019     for (i = min;; i++)
4020     {
4021 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb)) return TRUE;
4022 nigel 25 if (i >= max || eptr >= md->end_subject) return FALSE;
4023    
4024     c = *eptr++;
4025     switch(ctype)
4026     {
4027     case OP_ANY:
4028     if ((ims & PCRE_DOTALL) == 0 && c == '\n') return FALSE;
4029     break;
4030    
4031     case OP_NOT_DIGIT:
4032     if ((md->ctypes[c] & ctype_digit) != 0) return FALSE;
4033     break;
4034    
4035     case OP_DIGIT:
4036     if ((md->ctypes[c] & ctype_digit) == 0) return FALSE;
4037     break;
4038    
4039     case OP_NOT_WHITESPACE:
4040     if ((md->ctypes[c] & ctype_space) != 0) return FALSE;
4041     break;
4042    
4043     case OP_WHITESPACE:
4044     if ((md->ctypes[c] & ctype_space) == 0) return FALSE;
4045     break;
4046    
4047     case OP_NOT_WORDCHAR:
4048     if ((md->ctypes[c] & ctype_word) != 0) return FALSE;
4049     break;
4050    
4051     case OP_WORDCHAR:
4052     if ((md->ctypes[c] & ctype_word) == 0) return FALSE;
4053     break;
4054     }
4055 nigel 3 }
4056     /* Control never gets here */
4057     }
4058    
4059     /* If maximizing it is worth using inline code for speed, doing the type
4060     test once at the start (i.e. keep it out of the loop). */
4061    
4062     else
4063     {
4064 nigel 7 const uschar *pp = eptr;
4065 nigel 3 switch(ctype)
4066     {
4067     case OP_ANY:
4068 nigel 23 if ((ims & PCRE_DOTALL) == 0)
4069 nigel 3 {
4070     for (i = min; i < max; i++)
4071     {
4072     if (eptr >= md->end_subject || *eptr == '\n') break;
4073     eptr++;
4074     }
4075     }
4076     else
4077     {
4078     c = max - min;
4079     if (c > md->end_subject - eptr) c = md->end_subject - eptr;
4080     eptr += c;
4081     }
4082     break;
4083    
4084     case OP_NOT_DIGIT:
4085     for (i = min; i < max; i++)
4086     {
4087 nigel 25 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4088 nigel 3 break;
4089     eptr++;
4090     }
4091     break;
4092    
4093     case OP_DIGIT:
4094     for (i = min; i < max; i++)
4095     {
4096 nigel 25 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4097 nigel 3 break;
4098     eptr++;
4099     }
4100     break;
4101    
4102     case OP_NOT_WHITESPACE:
4103     for (i = min; i < max; i++)
4104     {
4105 nigel 25 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4106 nigel 3 break;
4107     eptr++;
4108     }
4109     break;
4110    
4111     case OP_WHITESPACE:
4112     for (i = min; i < max; i++)
4113     {
4114 nigel 25 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4115 nigel 3 break;
4116     eptr++;
4117     }
4118     break;
4119    
4120     case OP_NOT_WORDCHAR:
4121     for (i = min; i < max; i++)
4122     {
4123 nigel 25 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4124 nigel 3 break;
4125     eptr++;
4126     }
4127     break;
4128    
4129     case OP_WORDCHAR:
4130     for (i = min; i < max; i++)
4131     {
4132 nigel 25 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4133 nigel 3 break;
4134     eptr++;
4135     }
4136     break;
4137     }
4138    
4139     while (eptr >= pp)
4140 nigel 23 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
4141     return TRUE;
4142 nigel 3 return FALSE;
4143     }
4144     /* Control never gets here */
4145    
4146     /* There's been some horrible disaster. */
4147    
4148     default:
4149 nigel 9 DPRINTF(("Unknown opcode %d\n", *ecode));
4150 nigel 3 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
4151     return FALSE;
4152     }
4153    
4154     /* Do not stick any code in here without much thought; it is assumed
4155     that "continue" in the code above comes out to here to repeat the main
4156     loop. */
4157    
4158     } /* End of main loop */
4159     /* Control never reaches here */
4160     }
4161    
4162    
4163    
4164 nigel 9
4165     /*************************************************
4166 nigel 3 * Execute a Regular Expression *
4167     *************************************************/
4168    
4169     /* This function applies a compiled re to a subject string and picks out
4170     portions of the string if it matches. Two elements in the vector are set for
4171     each substring: the offsets to the start and end of the substring.
4172    
4173     Arguments:
4174     external_re points to the compiled expression
4175     external_extra points to "hints" from pcre_study() or is NULL
4176     subject points to the subject string
4177     length length of subject string (may contain binary zeros)
4178 nigel 35 start_offset where to start in the subject string
4179 nigel 3 options option bits
4180     offsets points to a vector of ints to be filled in with offsets
4181     offsetcount the number of elements in the vector
4182    
4183     Returns: > 0 => success; value is the number of elements filled in
4184     = 0 => success, but offsets is not big enough
4185     -1 => failed to match
4186     < -1 => some kind of unexpected problem
4187     */
4188    
4189     int
4190     pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
4191 nigel 35 const char *subject, int length, int start_offset, int options, int *offsets,
4192     int offsetcount)
4193 nigel 3 {
4194 nigel 11 int resetcount, ocount;
4195 nigel 3 int first_char = -1;
4196 nigel 37 int req_char = -1;
4197     int req_char2 = -1;
4198     unsigned long int ims = 0;
4199 nigel 3 match_data match_block;
4200 nigel 7 const uschar *start_bits = NULL;
4201 nigel 35 const uschar *start_match = (const uschar *)subject + start_offset;
4202 nigel 7 const uschar *end_subject;
4203 nigel 37 const uschar *req_char_ptr = start_match - 1;
4204 nigel 7 const real_pcre *re = (const real_pcre *)external_re;
4205     const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
4206 nigel 11 BOOL using_temporary_offsets = FALSE;
4207 nigel 3 BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4208     BOOL startline = (re->options & PCRE_STARTLINE) != 0;
4209    
4210     if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4211    
4212     if (re == NULL || subject == NULL ||
4213     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4214