/[pcre]/code/trunk/pcre.c
ViewVC logotype

Contents of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 35 - (hide annotations) (download)
Sat Feb 24 21:39:05 2007 UTC (7 years, 9 months ago) by nigel
File MIME type: text/plain
File size: 126481 byte(s)
Load pcre-2.06 into code/trunk.

1 nigel 3 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /*
6     This is a library of functions to support regular expressions whose syntax
7     and semantics are as close as possible to those of the Perl 5 language. See
8     the file Tech.Notes for some information on the internals.
9    
10     Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12 nigel 27 Copyright (c) 1997-1999 University of Cambridge
13 nigel 3
14     -----------------------------------------------------------------------------
15     Permission is granted to anyone to use this software for any purpose on any
16     computer system, and to redistribute it freely, subject to the following
17     restrictions:
18    
19     1. This software is distributed in the hope that it will be useful,
20     but WITHOUT ANY WARRANTY; without even the implied warranty of
21     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22    
23     2. The origin of this software must not be misrepresented, either by
24     explicit claim or by omission.
25    
26     3. Altered versions must be plainly marked as such, and must not be
27     misrepresented as being the original software.
28 nigel 29
29     4. If PCRE is embedded in any software that is released under the GNU
30     General Purpose Licence (GPL), then the terms of that licence shall
31     supersede any condition above with which it is incompatible.
32 nigel 3 -----------------------------------------------------------------------------
33     */
34    
35    
36     /* Define DEBUG to get debugging output on stdout. */
37    
38     /* #define DEBUG */
39    
40 nigel 23 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
41     inline, and there are *still* stupid compilers about that don't like indented
42     pre-processor statements. I suppose it's only been 10 years... */
43 nigel 3
44 nigel 9 #ifdef DEBUG
45     #define DPRINTF(p) printf p
46     #else
47     #define DPRINTF(p) /*nothing*/
48     #endif
49    
50 nigel 3 /* Include the internals header, which itself includes Standard C headers plus
51     the external pcre header. */
52    
53     #include "internal.h"
54    
55    
56 nigel 15 /* Allow compilation as C++ source code, should anybody want to do that. */
57    
58     #ifdef __cplusplus
59     #define class pcre_class
60     #endif
61    
62    
63 nigel 23 /* Number of items on the nested bracket stacks at compile time. This should
64     not be set greater than 200. */
65    
66     #define BRASTACK_SIZE 200
67    
68    
69 nigel 3 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
70    
71 nigel 15 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
72     static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
73 nigel 3
74 nigel 13 /* Text forms of OP_ values and things, for debugging (not all used) */
75 nigel 3
76     #ifdef DEBUG
77 nigel 7 static const char *OP_names[] = {
78     "End", "\\A", "\\B", "\\b", "\\D", "\\d",
79 nigel 23 "\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",
80     "Opt", "^", "$", "Any", "chars", "not",
81 nigel 3 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
82     "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
83     "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
84     "*", "*?", "+", "+?", "?", "??", "{", "{",
85 nigel 23 "class", "Ref",
86     "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
87     "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
88 nigel 3 "Brazero", "Braminzero", "Bra"
89     };
90     #endif
91    
92     /* Table for handling escaped characters in the range '0'-'z'. Positive returns
93     are simple data values; negative values are for special things like \d and so
94     on. Zero means further processing is needed (for things like \x), or the escape
95     is invalid. */
96    
97 nigel 15 static const short int escapes[] = {
98 nigel 3 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
99     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
100     '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
101     0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
102     0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
103     0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
104     '`', 7, -ESC_b, 0, -ESC_d, 27, '\f', 0, /* ` - g */
105     0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
106     0, 0, '\r', -ESC_s, '\t', 0, 0, -ESC_w, /* p - w */
107 nigel 23 0, 0, -ESC_z /* x - z */
108 nigel 3 };
109    
110     /* Definition to allow mutual recursion */
111    
112 nigel 13 static BOOL
113 nigel 23 compile_regex(int, int, int *, uschar **, const uschar **, const char **,
114 nigel 25 BOOL, int, compile_data *);
115 nigel 3
116    
117    
118     /*************************************************
119     * Global variables *
120     *************************************************/
121    
122     /* PCRE is thread-clean and doesn't use any global variables in the normal
123     sense. However, it calls memory allocation and free functions via the two
124     indirections below, which are can be changed by the caller, but are shared
125     between all threads. */
126    
127     void *(*pcre_malloc)(size_t) = malloc;
128     void (*pcre_free)(void *) = free;
129    
130    
131    
132    
133     /*************************************************
134 nigel 25 * Default character tables *
135     *************************************************/
136    
137     /* A default set of character tables is included in the PCRE binary. Its source
138     is built by the maketables auxiliary program, which uses the default C ctypes
139     functions, and put in the file chartables.c. These tables are used by PCRE
140     whenever the caller of pcre_compile() does not provide an alternate set of
141     tables. */
142    
143     #include "chartables.c"
144    
145    
146    
147     /*************************************************
148 nigel 3 * Return version string *
149     *************************************************/
150    
151 nigel 7 const char *
152 nigel 3 pcre_version(void)
153     {
154     return PCRE_VERSION;
155     }
156    
157    
158    
159    
160     /*************************************************
161     * Return info about a compiled pattern *
162     *************************************************/
163    
164     /* This function picks potentially useful data out of the private
165     structure.
166    
167     Arguments:
168     external_re points to compiled code
169     optptr where to pass back the options
170     first_char where to pass back the first character,
171     or -1 if multiline and all branches start ^,
172     or -2 otherwise
173    
174     Returns: number of identifying extraction brackets
175     or negative values on error
176     */
177    
178     int
179     pcre_info(const pcre *external_re, int *optptr, int *first_char)
180     {
181 nigel 7 const real_pcre *re = (const real_pcre *)external_re;
182 nigel 3 if (re == NULL) return PCRE_ERROR_NULL;
183     if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
184     if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
185     if (first_char != NULL)
186     *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
187     ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
188     return re->top_bracket;
189     }
190    
191    
192    
193    
194     #ifdef DEBUG
195     /*************************************************
196     * Debugging function to print chars *
197     *************************************************/
198    
199     /* Print a sequence of chars in printable format, stopping at the end of the
200     subject if the requested.
201    
202     Arguments:
203     p points to characters
204     length number to print
205     is_subject TRUE if printing from within md->start_subject
206     md pointer to matching data block, if is_subject is TRUE
207    
208     Returns: nothing
209     */
210    
211 nigel 9 static void
212     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
213 nigel 3 {
214     int c;
215     if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
216     while (length-- > 0)
217     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
218     }
219     #endif
220    
221    
222    
223    
224     /*************************************************
225     * Handle escapes *
226     *************************************************/
227    
228     /* This function is called when a \ has been encountered. It either returns a
229     positive value for a simple escape such as \n, or a negative value which
230     encodes one of the more complicated things such as \d. On entry, ptr is
231     pointing at the \. On exit, it is on the final character of the escape
232     sequence.
233    
234     Arguments:
235     ptrptr points to the pattern position pointer
236     errorptr points to the pointer to the error message
237     bracount number of previous extracting brackets
238     options the options bits
239     isclass TRUE if inside a character class
240 nigel 25 cd pointer to char tables block
241 nigel 3
242     Returns: zero or positive => a data character
243     negative => a special escape sequence
244     on error, errorptr is set
245     */
246    
247     static int
248 nigel 7 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
249 nigel 25 int options, BOOL isclass, compile_data *cd)
250 nigel 3 {
251 nigel 7 const uschar *ptr = *ptrptr;
252 nigel 3 int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
253     int i;
254    
255     if (c == 0) *errorptr = ERR1;
256    
257     /* Digits or letters may have special meaning; all others are literals. */
258    
259     else if (c < '0' || c > 'z') {}
260    
261     /* Do an initial lookup in a table. A non-zero result is something that can be
262     returned immediately. Otherwise further processing may be required. */
263    
264     else if ((i = escapes[c - '0']) != 0) c = i;
265    
266     /* Escapes that need further processing, or are illegal. */
267    
268     else
269     {
270 nigel 7 const uschar *oldptr;
271 nigel 3 switch (c)
272     {
273     /* The handling of escape sequences consisting of a string of digits
274     starting with one that is not zero is not straightforward. By experiment,
275     the way Perl works seems to be as follows:
276    
277     Outside a character class, the digits are read as a decimal number. If the
278     number is less than 10, or if there are that many previous extracting
279     left brackets, then it is a back reference. Otherwise, up to three octal
280     digits are read to form an escaped byte. Thus \123 is likely to be octal
281     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
282     value is greater than 377, the least significant 8 bits are taken. Inside a
283     character class, \ followed by a digit is always an octal number. */
284    
285     case '1': case '2': case '3': case '4': case '5':
286     case '6': case '7': case '8': case '9':
287    
288     if (!isclass)
289     {
290     oldptr = ptr;
291     c -= '0';
292 nigel 25 while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
293 nigel 3 c = c * 10 + *(++ptr) - '0';
294     if (c < 10 || c <= bracount)
295     {
296     c = -(ESC_REF + c);
297     break;
298     }
299     ptr = oldptr; /* Put the pointer back and fall through */
300     }
301    
302     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
303     generates a binary zero byte and treats the digit as a following literal.
304     Thus we have to pull back the pointer by one. */
305    
306     if ((c = *ptr) >= '8')
307     {
308     ptr--;
309     c = 0;
310     break;
311     }
312    
313     /* \0 always starts an octal number, but we may drop through to here with a
314     larger first octal digit */
315    
316     case '0':
317     c -= '0';
318 nigel 25 while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
319 nigel 3 ptr[1] != '8' && ptr[1] != '9')
320     c = c * 8 + *(++ptr) - '0';
321     break;
322    
323     /* Special escapes not starting with a digit are straightforward */
324    
325     case 'x':
326     c = 0;
327 nigel 25 while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
328 nigel 3 {
329     ptr++;
330 nigel 25 c = c * 16 + cd->lcc[*ptr] -
331     (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
332 nigel 3 }
333     break;
334    
335     case 'c':
336     c = *(++ptr);
337     if (c == 0)
338     {
339     *errorptr = ERR2;
340     return 0;
341     }
342    
343     /* A letter is upper-cased; then the 0x40 bit is flipped */
344    
345 nigel 25 if (c >= 'a' && c <= 'z') c = cd->fcc[c];
346 nigel 3 c ^= 0x40;
347     break;
348    
349     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
350     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
351 nigel 25 for Perl compatibility, it is a literal. This code looks a bit odd, but
352     there used to be some cases other than the default, and there may be again
353     in future, so I haven't "optimized" it. */
354 nigel 3
355     default:
356     if ((options & PCRE_EXTRA) != 0) switch(c)
357     {
358     default:
359     *errorptr = ERR3;
360     break;
361     }
362     break;
363     }
364     }
365    
366     *ptrptr = ptr;
367     return c;
368     }
369    
370    
371    
372     /*************************************************
373     * Check for counted repeat *
374     *************************************************/
375    
376     /* This function is called when a '{' is encountered in a place where it might
377     start a quantifier. It looks ahead to see if it really is a quantifier or not.
378     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
379     where the ddds are digits.
380    
381     Arguments:
382     p pointer to the first char after '{'
383 nigel 25 cd pointer to char tables block
384 nigel 3
385     Returns: TRUE or FALSE
386     */
387    
388     static BOOL
389 nigel 25 is_counted_repeat(const uschar *p, compile_data *cd)
390 nigel 3 {
391 nigel 25 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
392     while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
393 nigel 3 if (*p == '}') return TRUE;
394    
395     if (*p++ != ',') return FALSE;
396     if (*p == '}') return TRUE;
397    
398 nigel 25 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
399     while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
400 nigel 3 return (*p == '}');
401     }
402    
403    
404    
405     /*************************************************
406     * Read repeat counts *
407     *************************************************/
408    
409     /* Read an item of the form {n,m} and return the values. This is called only
410     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
411     so the syntax is guaranteed to be correct, but we need to check the values.
412    
413     Arguments:
414     p pointer to first char after '{'
415     minp pointer to int for min
416     maxp pointer to int for max
417     returned as -1 if no max
418     errorptr points to pointer to error message
419 nigel 25 cd pointer to character tables clock
420 nigel 3
421     Returns: pointer to '}' on success;
422     current ptr on error, with errorptr set
423     */
424    
425 nigel 7 static const uschar *
426 nigel 25 read_repeat_counts(const uschar *p, int *minp, int *maxp,
427     const char **errorptr, compile_data *cd)
428 nigel 3 {
429     int min = 0;
430     int max = -1;
431    
432 nigel 25 while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
433 nigel 3
434     if (*p == '}') max = min; else
435     {
436     if (*(++p) != '}')
437     {
438     max = 0;
439 nigel 25 while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
440 nigel 3 if (max < min)
441     {
442     *errorptr = ERR4;
443     return p;
444     }
445     }
446     }
447    
448     /* Do paranoid checks, then fill in the required variables, and pass back the
449     pointer to the terminating '}'. */
450    
451     if (min > 65535 || max > 65535)
452     *errorptr = ERR5;
453     else
454     {
455     *minp = min;
456     *maxp = max;
457     }
458     return p;
459     }
460    
461    
462    
463     /*************************************************
464 nigel 23 * Find the fixed length of a pattern *
465     *************************************************/
466    
467     /* Scan a pattern and compute the fixed length of subject that will match it,
468     if the length is fixed. This is needed for dealing with backward assertions.
469    
470     Arguments:
471     code points to the start of the pattern (the bracket)
472    
473     Returns: the fixed length, or -1 if there is no fixed length
474     */
475    
476     static int
477     find_fixedlength(uschar *code)
478     {
479     int length = -1;
480    
481     register int branchlength = 0;
482     register uschar *cc = code + 3;
483    
484     /* Scan along the opcodes for this branch. If we get to the end of the
485     branch, check the length against that of the other branches. */
486    
487     for (;;)
488     {
489     int d;
490     register int op = *cc;
491     if (op >= OP_BRA) op = OP_BRA;
492    
493     switch (op)
494     {
495     case OP_BRA:
496     case OP_ONCE:
497     case OP_COND:
498     d = find_fixedlength(cc);
499     if (d < 0) return -1;
500     branchlength += d;
501     do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
502     cc += 3;
503     break;
504    
505     /* Reached end of a branch; if it's a ket it is the end of a nested
506     call. If it's ALT it is an alternation in a nested call. If it is
507     END it's the end of the outer call. All can be handled by the same code. */
508    
509     case OP_ALT:
510     case OP_KET:
511     case OP_KETRMAX:
512     case OP_KETRMIN:
513     case OP_END:
514     if (length < 0) length = branchlength;
515     else if (length != branchlength) return -1;
516     if (*cc != OP_ALT) return length;
517     cc += 3;
518     branchlength = 0;
519     break;
520    
521     /* Skip over assertive subpatterns */
522    
523     case OP_ASSERT:
524     case OP_ASSERT_NOT:
525     case OP_ASSERTBACK:
526     case OP_ASSERTBACK_NOT:
527     do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
528     cc += 3;
529     break;
530    
531     /* Skip over things that don't match chars */
532    
533     case OP_REVERSE:
534     cc++;
535    
536     case OP_CREF:
537     case OP_OPT:
538     cc++;
539     /* Fall through */
540    
541     case OP_SOD:
542     case OP_EOD:
543     case OP_EODN:
544     case OP_CIRC:
545     case OP_DOLL:
546     case OP_NOT_WORD_BOUNDARY:
547     case OP_WORD_BOUNDARY:
548     cc++;
549     break;
550    
551     /* Handle char strings */
552    
553     case OP_CHARS:
554     branchlength += *(++cc);
555     cc += *cc + 1;
556     break;
557    
558     /* Handle exact repetitions */
559    
560     case OP_EXACT:
561     case OP_TYPEEXACT:
562     branchlength += (cc[1] << 8) + cc[2];
563     cc += 4;
564     break;
565    
566     /* Handle single-char matchers */
567    
568     case OP_NOT_DIGIT:
569     case OP_DIGIT:
570     case OP_NOT_WHITESPACE:
571     case OP_WHITESPACE:
572     case OP_NOT_WORDCHAR:
573     case OP_WORDCHAR:
574     case OP_ANY:
575     branchlength++;
576     cc++;
577     break;
578    
579    
580     /* Check a class for variable quantification */
581    
582     case OP_CLASS:
583     cc += (*cc == OP_REF)? 2 : 33;
584    
585     switch (*cc)
586     {
587     case OP_CRSTAR:
588     case OP_CRMINSTAR:
589     case OP_CRQUERY:
590     case OP_CRMINQUERY:
591     return -1;
592    
593     case OP_CRRANGE:
594     case OP_CRMINRANGE:
595     if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;
596     branchlength += (cc[1] << 8) + cc[2];
597     cc += 5;
598     break;
599    
600     default:
601     branchlength++;
602     }
603     break;
604    
605     /* Anything else is variable length */
606    
607     default:
608     return -1;
609     }
610     }
611     /* Control never gets here */
612     }
613    
614    
615    
616    
617     /*************************************************
618 nigel 3 * Compile one branch *
619     *************************************************/
620    
621     /* Scan the pattern, compiling it into the code vector.
622    
623     Arguments:
624 nigel 25 options the option bits
625     brackets points to number of brackets used
626     code points to the pointer to the current code point
627     ptrptr points to the current pattern pointer
628     errorptr points to pointer to error message
629     optchanged set to the value of the last OP_OPT item compiled
630     cd contains pointers to tables
631 nigel 3
632 nigel 25 Returns: TRUE on success
633     FALSE, with *errorptr set on error
634 nigel 3 */
635    
636     static BOOL
637 nigel 7 compile_branch(int options, int *brackets, uschar **codeptr,
638 nigel 25 const uschar **ptrptr, const char **errorptr, int *optchanged,
639     compile_data *cd)
640 nigel 3 {
641     int repeat_type, op_type;
642     int repeat_min, repeat_max;
643     int bravalue, length;
644 nigel 19 int greedy_default, greedy_non_default;
645 nigel 3 register int c;
646     register uschar *code = *codeptr;
647 nigel 23 uschar *tempcode;
648 nigel 7 const uschar *ptr = *ptrptr;
649 nigel 23 const uschar *tempptr;
650 nigel 3 uschar *previous = NULL;
651     uschar class[32];
652    
653 nigel 19 /* Set up the default and non-default settings for greediness */
654    
655     greedy_default = ((options & PCRE_UNGREEDY) != 0);
656     greedy_non_default = greedy_default ^ 1;
657    
658 nigel 3 /* Switch on next character until the end of the branch */
659    
660     for (;; ptr++)
661     {
662     BOOL negate_class;
663 nigel 23 int class_charcount;
664     int class_lastchar;
665     int newoptions;
666     int condref;
667 nigel 3
668     c = *ptr;
669     if ((options & PCRE_EXTENDED) != 0)
670     {
671 nigel 25 if ((cd->ctypes[c] & ctype_space) != 0) continue;
672 nigel 3 if (c == '#')
673     {
674     while ((c = *(++ptr)) != 0 && c != '\n');
675     continue;
676     }
677     }
678    
679     switch(c)
680     {
681     /* The branch terminates at end of string, |, or ). */
682    
683     case 0:
684     case '|':
685     case ')':
686     *codeptr = code;
687     *ptrptr = ptr;
688     return TRUE;
689    
690     /* Handle single-character metacharacters */
691    
692     case '^':
693     previous = NULL;
694     *code++ = OP_CIRC;
695     break;
696    
697     case '$':
698     previous = NULL;
699     *code++ = OP_DOLL;
700     break;
701    
702     case '.':
703     previous = code;
704     *code++ = OP_ANY;
705     break;
706    
707     /* Character classes. These always build a 32-byte bitmap of the permitted
708     characters, except in the special case where there is only one character.
709     For negated classes, we build the map as usual, then invert it at the end.
710     */
711    
712     case '[':
713     previous = code;
714 nigel 23 *code++ = OP_CLASS;
715 nigel 3
716 nigel 23 /* If the first character is '^', set the negation flag and skip it. */
717 nigel 3
718     if ((c = *(++ptr)) == '^')
719     {
720     negate_class = TRUE;
721     c = *(++ptr);
722     }
723 nigel 23 else negate_class = FALSE;
724 nigel 3
725     /* Keep a count of chars so that we can optimize the case of just a single
726     character. */
727    
728     class_charcount = 0;
729     class_lastchar = -1;
730    
731     /* Initialize the 32-char bit map to all zeros. We have to build the
732     map in a temporary bit of store, in case the class contains only 1
733     character, because in that case the compiled code doesn't use the
734     bit map. */
735    
736     memset(class, 0, 32 * sizeof(uschar));
737    
738     /* Process characters until ] is reached. By writing this as a "do" it
739     means that an initial ] is taken as a data character. */
740    
741     do
742     {
743     if (c == 0)
744     {
745     *errorptr = ERR6;
746     goto FAILED;
747     }
748    
749     /* Backslash may introduce a single character, or it may introduce one
750     of the specials, which just set a flag. Escaped items are checked for
751     validity in the pre-compiling pass. The sequence \b is a special case.
752 nigel 7 Inside a class (and only there) it is treated as backspace. Elsewhere
753 nigel 3 it marks a word boundary. Other escapes have preset maps ready to
754     or into the one we are building. We assume they have more than one
755     character in them, so set class_count bigger than one. */
756    
757     if (c == '\\')
758     {
759 nigel 25 c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
760 nigel 3 if (-c == ESC_b) c = '\b';
761     else if (c < 0)
762     {
763 nigel 25 register const uschar *cbits = cd->cbits;
764 nigel 3 class_charcount = 10;
765     switch (-c)
766     {
767     case ESC_d:
768 nigel 25 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
769 nigel 3 continue;
770    
771     case ESC_D:
772 nigel 25 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
773 nigel 3 continue;
774    
775     case ESC_w:
776     for (c = 0; c < 32; c++)
777 nigel 25 class[c] |= (cbits[c+cbit_digit] | cbits[c+cbit_word]);
778 nigel 3 continue;
779    
780     case ESC_W:
781     for (c = 0; c < 32; c++)
782 nigel 25 class[c] |= ~(cbits[c+cbit_digit] | cbits[c+cbit_word]);
783 nigel 3 continue;
784    
785     case ESC_s:
786 nigel 25 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
787 nigel 3 continue;
788    
789     case ESC_S:
790 nigel 25 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
791 nigel 3 continue;
792    
793     default:
794     *errorptr = ERR7;
795     goto FAILED;
796     }
797     }
798     /* Fall through if single character */
799     }
800    
801     /* A single character may be followed by '-' to form a range. However,
802     Perl does not permit ']' to be the end of the range. A '-' character
803     here is treated as a literal. */
804    
805     if (ptr[1] == '-' && ptr[2] != ']')
806     {
807     int d;
808     ptr += 2;
809     d = *ptr;
810    
811     if (d == 0)
812     {
813     *errorptr = ERR6;
814     goto FAILED;
815     }
816    
817     /* The second part of a range can be a single-character escape, but
818     not any of the other escapes. */
819    
820     if (d == '\\')
821     {
822 nigel 25 d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
823 nigel 3 if (d < 0)
824     {
825     if (d == -ESC_b) d = '\b'; else
826     {
827     *errorptr = ERR7;
828     goto FAILED;
829     }
830     }
831     }
832    
833     if (d < c)
834     {
835     *errorptr = ERR8;
836     goto FAILED;
837     }
838    
839     for (; c <= d; c++)
840     {
841     class[c/8] |= (1 << (c&7));
842     if ((options & PCRE_CASELESS) != 0)
843     {
844 nigel 25 int uc = cd->fcc[c]; /* flip case */
845 nigel 3 class[uc/8] |= (1 << (uc&7));
846     }
847     class_charcount++; /* in case a one-char range */
848     class_lastchar = c;
849     }
850     continue; /* Go get the next char in the class */
851     }
852    
853     /* Handle a lone single character - we can get here for a normal
854     non-escape char, or after \ that introduces a single character. */
855    
856     class [c/8] |= (1 << (c&7));
857     if ((options & PCRE_CASELESS) != 0)
858     {
859 nigel 25 c = cd->fcc[c]; /* flip case */
860 nigel 3 class[c/8] |= (1 << (c&7));
861     }
862     class_charcount++;
863     class_lastchar = c;
864     }
865    
866     /* Loop until ']' reached; the check for end of string happens inside the
867     loop. This "while" is the end of the "do" above. */
868    
869     while ((c = *(++ptr)) != ']');
870    
871     /* If class_charcount is 1 and class_lastchar is not negative, we saw
872     precisely one character. This doesn't need the whole 32-byte bit map.
873     We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
874     it's negative. */
875    
876     if (class_charcount == 1 && class_lastchar >= 0)
877     {
878     if (negate_class)
879     {
880     code[-1] = OP_NOT;
881     }
882     else
883     {
884     code[-1] = OP_CHARS;
885     *code++ = 1;
886     }
887     *code++ = class_lastchar;
888     }
889    
890     /* Otherwise, negate the 32-byte map if necessary, and copy it into
891     the code vector. */
892    
893     else
894     {
895     if (negate_class)
896     for (c = 0; c < 32; c++) code[c] = ~class[c];
897     else
898     memcpy(code, class, 32);
899     code += 32;
900     }
901     break;
902    
903     /* Various kinds of repeat */
904    
905     case '{':
906 nigel 25 if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
907     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
908 nigel 3 if (*errorptr != NULL) goto FAILED;
909     goto REPEAT;
910    
911     case '*':
912     repeat_min = 0;
913     repeat_max = -1;
914     goto REPEAT;
915    
916     case '+':
917     repeat_min = 1;
918     repeat_max = -1;
919     goto REPEAT;
920    
921     case '?':
922     repeat_min = 0;
923     repeat_max = 1;
924    
925     REPEAT:
926     if (previous == NULL)
927     {
928     *errorptr = ERR9;
929     goto FAILED;
930     }
931    
932 nigel 19 /* If the next character is '?' this is a minimizing repeat, by default,
933     but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
934 nigel 3 next character. */
935    
936 nigel 19 if (ptr[1] == '?')
937     { repeat_type = greedy_non_default; ptr++; }
938     else repeat_type = greedy_default;
939 nigel 3
940     /* If the maximum is zero then the minimum must also be zero; Perl allows
941     this case, so we do too - by simply omitting the item altogether. */
942    
943     if (repeat_max == 0) code = previous;
944    
945     /* If previous was a string of characters, chop off the last one and use it
946     as the subject of the repeat. If there was only one character, we can
947     abolish the previous item altogether. */
948    
949     else if (*previous == OP_CHARS)
950     {
951     int len = previous[1];
952     if (len == 1)
953     {
954     c = previous[2];
955     code = previous;
956     }
957     else
958     {
959     c = previous[len+1];
960     previous[1]--;
961     code--;
962     }
963     op_type = 0; /* Use single-char op codes */
964     goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
965     }
966    
967     /* If previous was a single negated character ([^a] or similar), we use
968     one of the special opcodes, replacing it. The code is shared with single-
969     character repeats by adding a suitable offset into repeat_type. */
970    
971     else if ((int)*previous == OP_NOT)
972     {
973     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
974     c = previous[1];
975     code = previous;
976     goto OUTPUT_SINGLE_REPEAT;
977     }
978    
979     /* If previous was a character type match (\d or similar), abolish it and
980     create a suitable repeat item. The code is shared with single-character
981     repeats by adding a suitable offset into repeat_type. */
982    
983 nigel 23 else if ((int)*previous < OP_EODN || *previous == OP_ANY)
984 nigel 3 {
985     op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
986     c = *previous;
987     code = previous;
988    
989     OUTPUT_SINGLE_REPEAT:
990     repeat_type += op_type; /* Combine both values for many cases */
991    
992     /* A minimum of zero is handled either as the special case * or ?, or as
993     an UPTO, with the maximum given. */
994    
995     if (repeat_min == 0)
996     {
997     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
998     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
999     else
1000     {
1001     *code++ = OP_UPTO + repeat_type;
1002     *code++ = repeat_max >> 8;
1003     *code++ = (repeat_max & 255);
1004     }
1005     }
1006    
1007     /* The case {1,} is handled as the special case + */
1008    
1009     else if (repeat_min == 1 && repeat_max == -1)
1010     *code++ = OP_PLUS + repeat_type;
1011    
1012     /* The case {n,n} is just an EXACT, while the general case {n,m} is
1013     handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1014    
1015     else
1016     {
1017     if (repeat_min != 1)
1018     {
1019     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1020     *code++ = repeat_min >> 8;
1021     *code++ = (repeat_min & 255);
1022     }
1023    
1024     /* If the mininum is 1 and the previous item was a character string,
1025     we either have to put back the item that got cancelled if the string
1026     length was 1, or add the character back onto the end of a longer
1027 nigel 21 string. For a character type nothing need be done; it will just get
1028     put back naturally. Note that the final character is always going to
1029     get added below. */
1030 nigel 3
1031     else if (*previous == OP_CHARS)
1032     {
1033     if (code == previous) code += 2; else previous[1]++;
1034     }
1035    
1036 nigel 21 /* For a single negated character we also have to put back the
1037     item that got cancelled. */
1038    
1039     else if (*previous == OP_NOT) code++;
1040    
1041 nigel 9 /* If the maximum is unlimited, insert an OP_STAR. */
1042 nigel 3
1043 nigel 9 if (repeat_max < 0)
1044 nigel 3 {
1045     *code++ = c;
1046 nigel 9 *code++ = OP_STAR + repeat_type;
1047     }
1048    
1049     /* Else insert an UPTO if the max is greater than the min. */
1050    
1051     else if (repeat_max != repeat_min)
1052     {
1053     *code++ = c;
1054 nigel 3 repeat_max -= repeat_min;
1055     *code++ = OP_UPTO + repeat_type;
1056     *code++ = repeat_max >> 8;
1057     *code++ = (repeat_max & 255);
1058     }
1059     }
1060    
1061     /* The character or character type itself comes last in all cases. */
1062    
1063     *code++ = c;
1064     }
1065    
1066     /* If previous was a character class or a back reference, we put the repeat
1067     stuff after it. */
1068    
1069 nigel 23 else if (*previous == OP_CLASS || *previous == OP_REF)
1070 nigel 3 {
1071     if (repeat_min == 0 && repeat_max == -1)
1072     *code++ = OP_CRSTAR + repeat_type;
1073     else if (repeat_min == 1 && repeat_max == -1)
1074     *code++ = OP_CRPLUS + repeat_type;
1075     else if (repeat_min == 0 && repeat_max == 1)
1076     *code++ = OP_CRQUERY + repeat_type;
1077     else
1078     {
1079     *code++ = OP_CRRANGE + repeat_type;
1080     *code++ = repeat_min >> 8;
1081     *code++ = repeat_min & 255;
1082     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1083     *code++ = repeat_max >> 8;
1084     *code++ = repeat_max & 255;
1085     }
1086     }
1087    
1088     /* If previous was a bracket group, we may have to replicate it in certain
1089 nigel 23 cases. */
1090 nigel 3
1091 nigel 23 else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
1092     (int)*previous == OP_COND)
1093 nigel 3 {
1094 nigel 31 register int i;
1095     int ketoffset = 0;
1096 nigel 9 int len = code - previous;
1097 nigel 31 uschar *bralink = NULL;
1098 nigel 3
1099 nigel 23 /* If the maximum repeat count is unlimited, find the end of the bracket
1100     by scanning through from the start, and compute the offset back to it
1101     from the current code pointer. There may be an OP_OPT setting following
1102     the final KET, so we can't find the end just by going back from the code
1103     pointer. */
1104    
1105     if (repeat_max == -1)
1106 nigel 3 {
1107 nigel 23 register uschar *ket = previous;
1108     do ket += (ket[1] << 8) + ket[2]; while (*ket != OP_KET);
1109     ketoffset = code - ket;
1110 nigel 3 }
1111    
1112 nigel 31 /* The case of a zero minimum is special because of the need to stick
1113     OP_BRAZERO in front of it, and because the group appears once in the
1114     data, whereas in other cases it appears the minimum number of times. For
1115     this reason, it is simplest to treat this case separately, as otherwise
1116     the code gets far too mess. There are several special subcases when the
1117     minimum is zero. */
1118    
1119     if (repeat_min == 0)
1120     {
1121     /* If the maximum is also zero, we just omit the group from the output
1122     altogether. */
1123    
1124     if (repeat_max == 0)
1125     {
1126     code = previous;
1127     previous = NULL;
1128     break;
1129     }
1130    
1131     /* If the maximum is 1 or unlimited, we just have to stick in the
1132     BRAZERO and do no more at this point. */
1133    
1134     if (repeat_max <= 1)
1135     {
1136     memmove(previous+1, previous, len);
1137     code++;
1138     *previous++ = OP_BRAZERO + repeat_type;
1139     }
1140    
1141     /* If the maximum is greater than 1 and limited, we have to replicate
1142     in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1143     The first one has to be handled carefully because it's the original
1144     copy, which has to be moved up. The remainder can be handled by code
1145     that is common with the non-zero minimum case below. We just have to
1146     adjust the value or repeat_max, since one less copy is required. */
1147    
1148     else
1149     {
1150     int offset;
1151     memmove(previous+4, previous, len);
1152     code += 4;
1153     *previous++ = OP_BRAZERO + repeat_type;
1154     *previous++ = OP_BRA;
1155    
1156     /* We chain together the bracket offset fields that have to be
1157     filled in later when the ends of the brackets are reached. */
1158    
1159     offset = (bralink == NULL)? 0 : previous - bralink;
1160     bralink = previous;
1161     *previous++ = offset >> 8;
1162     *previous++ = offset & 255;
1163     }
1164    
1165     repeat_max--;
1166     }
1167    
1168     /* If the minimum is greater than zero, replicate the group as many
1169     times as necessary, and adjust the maximum to the number of subsequent
1170     copies that we need. */
1171    
1172     else
1173     {
1174     for (i = 1; i < repeat_min; i++)
1175     {
1176     memcpy(code, previous, len);
1177     code += len;
1178     }
1179     if (repeat_max > 0) repeat_max -= repeat_min;
1180     }
1181    
1182     /* This code is common to both the zero and non-zero minimum cases. If
1183     the maximum is limited, it replicates the group in a nested fashion,
1184     remembering the bracket starts on a stack. In the case of a zero minimum,
1185     the first one was set up above. In all cases the repeat_max now specifies
1186     the number of additional copies needed. */
1187    
1188     if (repeat_max >= 0)
1189     {
1190     for (i = repeat_max - 1; i >= 0; i--)
1191     {
1192     *code++ = OP_BRAZERO + repeat_type;
1193    
1194     /* All but the final copy start a new nesting, maintaining the
1195     chain of brackets outstanding. */
1196    
1197     if (i != 0)
1198     {
1199     int offset;
1200     *code++ = OP_BRA;
1201     offset = (bralink == NULL)? 0 : code - bralink;
1202     bralink = code;
1203     *code++ = offset >> 8;
1204     *code++ = offset & 255;
1205     }
1206    
1207     memcpy(code, previous, len);
1208     code += len;
1209     }
1210    
1211     /* Now chain through the pending brackets, and fill in their length
1212     fields (which are holding the chain links pro tem). */
1213    
1214     while (bralink != NULL)
1215     {
1216     int oldlinkoffset;
1217     int offset = code - bralink + 1;
1218     uschar *bra = code - offset;
1219     oldlinkoffset = (bra[1] << 8) + bra[2];
1220     bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
1221     *code++ = OP_KET;
1222     *code++ = bra[1] = offset >> 8;
1223     *code++ = bra[2] = (offset & 255);
1224     }
1225     }
1226    
1227     /* If the maximum is unlimited, set a repeater in the final copy. We
1228     can't just offset backwards from the current code point, because we
1229     don't know if there's been an options resetting after the ket. The
1230     correct offset was computed above. */
1231    
1232     else code[-ketoffset] = OP_KETRMAX + repeat_type;
1233    
1234    
1235     #ifdef NEVER
1236 nigel 3 /* If the minimum is greater than zero, and the maximum is unlimited or
1237     equal to the minimum, the first copy remains where it is, and is
1238     replicated up to the minimum number of times. This case includes the +
1239     repeat, but of course no replication is needed in that case. */
1240    
1241     if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1242     {
1243     for (i = 1; i < repeat_min; i++)
1244     {
1245 nigel 9 memcpy(code, previous, len);
1246     code += len;
1247 nigel 3 }
1248     }
1249    
1250     /* If the minimum is zero, stick BRAZERO in front of the first copy.
1251     Then, if there is a fixed upper limit, replicated up to that many times,
1252     sticking BRAZERO in front of all the optional ones. */
1253    
1254     else
1255     {
1256     if (repeat_min == 0)
1257     {
1258 nigel 9 memmove(previous+1, previous, len);
1259 nigel 3 code++;
1260     *previous++ = OP_BRAZERO + repeat_type;
1261     }
1262    
1263     for (i = 1; i < repeat_min; i++)
1264     {
1265 nigel 9 memcpy(code, previous, len);
1266     code += len;
1267 nigel 3 }
1268    
1269     for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1270     {
1271     *code++ = OP_BRAZERO + repeat_type;
1272 nigel 9 memcpy(code, previous, len);
1273     code += len;
1274 nigel 3 }
1275     }
1276    
1277 nigel 23 /* If the maximum is unlimited, set a repeater in the final copy. We
1278     can't just offset backwards from the current code point, because we
1279     don't know if there's been an options resetting after the ket. The
1280     correct offset was computed above. */
1281 nigel 3
1282 nigel 23 if (repeat_max == -1) code[-ketoffset] = OP_KETRMAX + repeat_type;
1283 nigel 31 #endif
1284    
1285    
1286 nigel 3 }
1287    
1288     /* Else there's some kind of shambles */
1289    
1290     else
1291     {
1292     *errorptr = ERR11;
1293     goto FAILED;
1294     }
1295    
1296     /* In all case we no longer have a previous item. */
1297    
1298     previous = NULL;
1299     break;
1300    
1301    
1302 nigel 23 /* Start of nested bracket sub-expression, or comment or lookahead or
1303     lookbehind or option setting or condition. First deal with special things
1304     that can come after a bracket; all are introduced by ?, and the appearance
1305     of any of them means that this is not a referencing group. They were
1306     checked for validity in the first pass over the string, so we don't have to
1307     check for syntax errors here. */
1308 nigel 3
1309     case '(':
1310 nigel 23 newoptions = options;
1311     condref = -1;
1312    
1313 nigel 3 if (*(++ptr) == '?')
1314     {
1315 nigel 23 int set, unset;
1316     int *optset;
1317 nigel 3
1318     switch (*(++ptr))
1319     {
1320 nigel 23 case '#': /* Comment; skip to ket */
1321 nigel 3 ptr++;
1322     while (*ptr != ')') ptr++;
1323     continue;
1324    
1325     case ':': /* Non-extracting bracket */
1326 nigel 23 bravalue = OP_BRA;
1327 nigel 3 ptr++;
1328     break;
1329    
1330 nigel 23 case '(':
1331     bravalue = OP_COND; /* Conditional group */
1332 nigel 25 if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
1333 nigel 23 {
1334     condref = *ptr - '0';
1335     while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
1336     ptr++;
1337     }
1338     else ptr--;
1339     break;
1340    
1341     case '=': /* Positive lookahead */
1342 nigel 3 bravalue = OP_ASSERT;
1343     ptr++;
1344     break;
1345    
1346 nigel 23 case '!': /* Negative lookahead */
1347 nigel 3 bravalue = OP_ASSERT_NOT;
1348     ptr++;
1349     break;
1350    
1351 nigel 23 case '<': /* Lookbehinds */
1352     switch (*(++ptr))
1353 nigel 3 {
1354 nigel 23 case '=': /* Positive lookbehind */
1355     bravalue = OP_ASSERTBACK;
1356 nigel 3 ptr++;
1357     break;
1358 nigel 23
1359     case '!': /* Negative lookbehind */
1360     bravalue = OP_ASSERTBACK_NOT;
1361     ptr++;
1362     break;
1363    
1364     default: /* Syntax error */
1365     *errorptr = ERR24;
1366     goto FAILED;
1367 nigel 3 }
1368 nigel 23 break;
1369 nigel 3
1370 nigel 23 case '>': /* One-time brackets */
1371     bravalue = OP_ONCE;
1372     ptr++;
1373     break;
1374    
1375     default: /* Option setting */
1376     set = unset = 0;
1377     optset = &set;
1378    
1379     while (*ptr != ')' && *ptr != ':')
1380     {
1381     switch (*ptr++)
1382     {
1383     case '-': optset = &unset; break;
1384    
1385     case 'i': *optset |= PCRE_CASELESS; break;
1386     case 'm': *optset |= PCRE_MULTILINE; break;
1387     case 's': *optset |= PCRE_DOTALL; break;
1388     case 'x': *optset |= PCRE_EXTENDED; break;
1389     case 'U': *optset |= PCRE_UNGREEDY; break;
1390     case 'X': *optset |= PCRE_EXTRA; break;
1391    
1392     default:
1393     *errorptr = ERR12;
1394     goto FAILED;
1395     }
1396     }
1397    
1398     /* Set up the changed option bits, but don't change anything yet. */
1399    
1400     newoptions = (options | set) & (~unset);
1401    
1402     /* If the options ended with ')' this is not the start of a nested
1403     group with option changes, so the options change at this level. At top
1404     level there is nothing else to be done (the options will in fact have
1405     been set from the start of compiling as a result of the first pass) but
1406     at an inner level we must compile code to change the ims options if
1407     necessary, and pass the new setting back so that it can be put at the
1408     start of any following branches, and when this group ends, a resetting
1409     item can be compiled. */
1410    
1411     if (*ptr == ')')
1412     {
1413     if ((options & PCRE_INGROUP) != 0 &&
1414     (options & PCRE_IMS) != (newoptions & PCRE_IMS))
1415     {
1416     *code++ = OP_OPT;
1417     *code++ = *optchanged = newoptions & PCRE_IMS;
1418     }
1419     options = newoptions; /* Change options at this level */
1420     previous = NULL; /* This item can't be repeated */
1421     continue; /* It is complete */
1422     }
1423    
1424     /* If the options ended with ':' we are heading into a nested group
1425     with possible change of options. Such groups are non-capturing and are
1426     not assertions of any kind. All we need to do is skip over the ':';
1427     the newoptions value is handled below. */
1428    
1429     bravalue = OP_BRA;
1430     ptr++;
1431 nigel 3 }
1432     }
1433    
1434 nigel 23 /* Else we have a referencing group; adjust the opcode. */
1435 nigel 3
1436     else
1437     {
1438     if (++(*brackets) > EXTRACT_MAX)
1439     {
1440     *errorptr = ERR13;
1441     goto FAILED;
1442     }
1443     bravalue = OP_BRA + *brackets;
1444     }
1445    
1446 nigel 23 /* Process nested bracketed re. Assertions may not be repeated, but other
1447     kinds can be. We copy code into a non-register variable in order to be able
1448     to pass its address because some compilers complain otherwise. Pass in a
1449     new setting for the ims options if they have changed. */
1450 nigel 3
1451 nigel 23 previous = (bravalue >= OP_ONCE)? code : NULL;
1452 nigel 3 *code = bravalue;
1453 nigel 23 tempcode = code;
1454    
1455     if (!compile_regex(
1456     options | PCRE_INGROUP, /* Set for all nested groups */
1457     ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
1458     newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
1459     brackets, /* Bracket level */
1460     &tempcode, /* Where to put code (updated) */
1461     &ptr, /* Input pointer (updated) */
1462     errorptr, /* Where to put an error message */
1463     (bravalue == OP_ASSERTBACK ||
1464     bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1465 nigel 25 condref, /* Condition reference number */
1466     cd)) /* Tables block */
1467 nigel 23 goto FAILED;
1468    
1469     /* At the end of compiling, code is still pointing to the start of the
1470     group, while tempcode has been updated to point past the end of the group
1471     and any option resetting that may follow it. The pattern pointer (ptr)
1472     is on the bracket. */
1473    
1474     /* If this is a conditional bracket, check that there are no more than
1475     two branches in the group. */
1476    
1477     if (bravalue == OP_COND)
1478 nigel 3 {
1479 nigel 23 int branchcount = 0;
1480     uschar *tc = code;
1481    
1482     do {
1483     branchcount++;
1484     tc += (tc[1] << 8) | tc[2];
1485     }
1486     while (*tc != OP_KET);
1487    
1488     if (branchcount > 2)
1489     {
1490     *errorptr = ERR27;
1491 nigel 3 goto FAILED;
1492 nigel 23 }
1493 nigel 3 }
1494    
1495 nigel 23 /* Now update the main code pointer to the end of the group. */
1496    
1497     code = tempcode;
1498    
1499     /* Error if hit end of pattern */
1500    
1501 nigel 3 if (*ptr != ')')
1502     {
1503     *errorptr = ERR14;
1504     goto FAILED;
1505     }
1506     break;
1507    
1508     /* Check \ for being a real metacharacter; if not, fall through and handle
1509     it as a data character at the start of a string. Escape items are checked
1510     for validity in the pre-compiling pass. */
1511    
1512     case '\\':
1513 nigel 23 tempptr = ptr;
1514 nigel 25 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1515 nigel 3
1516     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1517     are arranged to be the negation of the corresponding OP_values. For the
1518     back references, the values are ESC_REF plus the reference number. Only
1519     back references and those types that consume a character may be repeated.
1520     We can test for values between ESC_b and ESC_Z for the latter; this may
1521     have to change if any new ones are ever created. */
1522    
1523     if (c < 0)
1524     {
1525     if (-c >= ESC_REF)
1526     {
1527     previous = code;
1528     *code++ = OP_REF;
1529 nigel 23 *code++ = -c - ESC_REF;
1530 nigel 3 }
1531     else
1532     {
1533 nigel 23 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
1534 nigel 3 *code++ = -c;
1535     }
1536     continue;
1537     }
1538    
1539 nigel 7 /* Data character: reset and fall through */
1540 nigel 3
1541 nigel 23 ptr = tempptr;
1542 nigel 3 c = '\\';
1543    
1544     /* Handle a run of data characters until a metacharacter is encountered.
1545     The first character is guaranteed not to be whitespace or # when the
1546     extended flag is set. */
1547    
1548     NORMAL_CHAR:
1549     default:
1550     previous = code;
1551     *code = OP_CHARS;
1552     code += 2;
1553     length = 0;
1554    
1555     do
1556     {
1557     if ((options & PCRE_EXTENDED) != 0)
1558     {
1559 nigel 25 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1560 nigel 3 if (c == '#')
1561     {
1562     while ((c = *(++ptr)) != 0 && c != '\n');
1563     if (c == 0) break;
1564     continue;
1565     }
1566     }
1567    
1568     /* Backslash may introduce a data char or a metacharacter. Escaped items
1569     are checked for validity in the pre-compiling pass. Stop the string
1570     before a metaitem. */
1571    
1572     if (c == '\\')
1573     {
1574 nigel 23 tempptr = ptr;
1575 nigel 25 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1576 nigel 23 if (c < 0) { ptr = tempptr; break; }
1577 nigel 3 }
1578    
1579     /* Ordinary character or single-char escape */
1580    
1581     *code++ = c;
1582     length++;
1583     }
1584    
1585     /* This "while" is the end of the "do" above. */
1586    
1587 nigel 25 while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
1588 nigel 3
1589     /* Compute the length and set it in the data vector, and advance to
1590     the next state. */
1591    
1592     previous[1] = length;
1593 nigel 15 if (length < 255) ptr--;
1594 nigel 3 break;
1595     }
1596     } /* end of big loop */
1597    
1598     /* Control never reaches here by falling through, only by a goto for all the
1599     error states. Pass back the position in the pattern so that it can be displayed
1600     to the user for diagnosing the error. */
1601    
1602     FAILED:
1603     *ptrptr = ptr;
1604     return FALSE;
1605     }
1606    
1607    
1608    
1609    
1610     /*************************************************
1611     * Compile sequence of alternatives *
1612     *************************************************/
1613    
1614     /* On entry, ptr is pointing past the bracket character, but on return
1615     it points to the closing bracket, or vertical bar, or end of string.
1616     The code variable is pointing at the byte into which the BRA operator has been
1617 nigel 23 stored. If the ims options are changed at the start (for a (?ims: group) or
1618     during any branch, we need to insert an OP_OPT item at the start of every
1619     following branch to ensure they get set correctly at run time, and also pass
1620     the new options into every subsequent branch compile.
1621 nigel 3
1622     Argument:
1623 nigel 23 options the option bits
1624     optchanged new ims options to set as if (?ims) were at the start, or -1
1625     for no change
1626     brackets -> int containing the number of extracting brackets used
1627     codeptr -> the address of the current code pointer
1628     ptrptr -> the address of the current pattern pointer
1629     errorptr -> pointer to error message
1630     lookbehind TRUE if this is a lookbehind assertion
1631     condref > 0 for OPT_CREF setting at start of conditional group
1632 nigel 25 cd points to the data block with tables pointers
1633 nigel 3
1634 nigel 23 Returns: TRUE on success
1635 nigel 3 */
1636    
1637     static BOOL
1638 nigel 23 compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
1639 nigel 25 const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,
1640     compile_data *cd)
1641 nigel 3 {
1642 nigel 7 const uschar *ptr = *ptrptr;
1643 nigel 3 uschar *code = *codeptr;
1644 nigel 23 uschar *last_branch = code;
1645 nigel 3 uschar *start_bracket = code;
1646 nigel 23 uschar *reverse_count = NULL;
1647     int oldoptions = options & PCRE_IMS;
1648 nigel 3
1649 nigel 23 code += 3;
1650    
1651     /* At the start of a reference-based conditional group, insert the reference
1652     number as an OP_CREF item. */
1653    
1654     if (condref > 0)
1655     {
1656     *code++ = OP_CREF;
1657     *code++ = condref;
1658     }
1659    
1660     /* Loop for each alternative branch */
1661    
1662 nigel 3 for (;;)
1663     {
1664     int length;
1665    
1666 nigel 23 /* Handle change of options */
1667    
1668     if (optchanged >= 0)
1669 nigel 3 {
1670 nigel 23 *code++ = OP_OPT;
1671     *code++ = optchanged;
1672     options = (options & ~PCRE_IMS) | optchanged;
1673     }
1674    
1675     /* Set up dummy OP_REVERSE if lookbehind assertion */
1676    
1677     if (lookbehind)
1678     {
1679     *code++ = OP_REVERSE;
1680     reverse_count = code;
1681     *code++ = 0;
1682     *code++ = 0;
1683     }
1684    
1685     /* Now compile the branch */
1686    
1687 nigel 25 if (!compile_branch(options,brackets,&code,&ptr,errorptr,&optchanged,cd))
1688 nigel 23 {
1689 nigel 3 *ptrptr = ptr;
1690     return FALSE;
1691     }
1692    
1693     /* Fill in the length of the last branch */
1694    
1695     length = code - last_branch;
1696     last_branch[1] = length >> 8;
1697     last_branch[2] = length & 255;
1698    
1699 nigel 23 /* If lookbehind, check that this branch matches a fixed-length string,
1700     and put the length into the OP_REVERSE item. Temporarily mark the end of
1701     the branch with OP_END. */
1702    
1703     if (lookbehind)
1704     {
1705     *code = OP_END;
1706     length = find_fixedlength(last_branch);
1707     DPRINTF(("fixed length = %d\n", length));
1708     if (length < 0)
1709     {
1710     *errorptr = ERR25;
1711     *ptrptr = ptr;
1712     return FALSE;
1713     }
1714     reverse_count[0] = (length >> 8);
1715     reverse_count[1] = length & 255;
1716     }
1717    
1718 nigel 3 /* Reached end of expression, either ')' or end of pattern. Insert a
1719     terminating ket and the length of the whole bracketed item, and return,
1720 nigel 23 leaving the pointer at the terminating char. If any of the ims options
1721     were changed inside the group, compile a resetting op-code following. */
1722 nigel 3
1723     if (*ptr != '|')
1724     {
1725     length = code - start_bracket;
1726     *code++ = OP_KET;
1727     *code++ = length >> 8;
1728     *code++ = length & 255;
1729 nigel 23 if (optchanged >= 0)
1730     {
1731     *code++ = OP_OPT;
1732     *code++ = oldoptions;
1733     }
1734 nigel 3 *codeptr = code;
1735     *ptrptr = ptr;
1736     return TRUE;
1737     }
1738    
1739     /* Another branch follows; insert an "or" node and advance the pointer. */
1740    
1741     *code = OP_ALT;
1742 nigel 23 last_branch = code;
1743     code += 3;
1744 nigel 3 ptr++;
1745     }
1746     /* Control never reaches here */
1747     }
1748    
1749    
1750    
1751 nigel 23
1752 nigel 3 /*************************************************
1753 nigel 23 * Find first significant op code *
1754     *************************************************/
1755    
1756     /* This is called by several functions that scan a compiled expression looking
1757     for a fixed first character, or an anchoring op code etc. It skips over things
1758     that do not influence this. For one application, a change of caseless option is
1759     important.
1760    
1761     Arguments:
1762     code pointer to the start of the group
1763     options pointer to external options
1764     optbit the option bit whose changing is significant, or
1765     zero if none are
1766     optstop TRUE to return on option change, otherwise change the options
1767     value and continue
1768    
1769     Returns: pointer to the first significant opcode
1770     */
1771    
1772     static const uschar*
1773     first_significant_code(const uschar *code, int *options, int optbit,
1774     BOOL optstop)
1775     {
1776     for (;;)
1777     {
1778     switch ((int)*code)
1779     {
1780     case OP_OPT:
1781     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1782     {
1783     if (optstop) return code;
1784     *options = (int)code[1];
1785     }
1786     code += 2;
1787     break;
1788    
1789     case OP_CREF:
1790     code += 2;
1791     break;
1792    
1793 nigel 35 case OP_WORD_BOUNDARY:
1794     case OP_NOT_WORD_BOUNDARY:
1795     code++;
1796     break;
1797    
1798 nigel 23 case OP_ASSERT_NOT:
1799     case OP_ASSERTBACK:
1800     case OP_ASSERTBACK_NOT:
1801     do code += (code[1] << 8) + code[2]; while (*code == OP_ALT);
1802     code += 3;
1803     break;
1804    
1805     default:
1806     return code;
1807     }
1808     }
1809     /* Control never reaches here */
1810     }
1811    
1812    
1813    
1814    
1815     /*************************************************
1816 nigel 3 * Check for anchored expression *
1817     *************************************************/
1818    
1819     /* Try to find out if this is an anchored regular expression. Consider each
1820     alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
1821     all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
1822     it's anchored. However, if this is a multiline pattern, then only OP_SOD
1823     counts, since OP_CIRC can match in the middle.
1824    
1825 nigel 33 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
1826     because that will try the rest of the pattern at all possible matching points,
1827     so there is no point trying them again.
1828 nigel 3
1829 nigel 23 Arguments:
1830     code points to start of expression (the bracket)
1831     options points to the options setting
1832    
1833     Returns: TRUE or FALSE
1834 nigel 3 */
1835    
1836     static BOOL
1837 nigel 23 is_anchored(register const uschar *code, int *options)
1838 nigel 3 {
1839     do {
1840 nigel 23 const uschar *scode = first_significant_code(code + 3, options,
1841     PCRE_MULTILINE, FALSE);
1842     register int op = *scode;
1843     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
1844     { if (!is_anchored(scode, options)) return FALSE; }
1845 nigel 33 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
1846     (*options & PCRE_DOTALL) != 0)
1847 nigel 23 { if (scode[1] != OP_ANY) return FALSE; }
1848     else if (op != OP_SOD &&
1849     ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
1850     return FALSE;
1851 nigel 3 code += (code[1] << 8) + code[2];
1852     }
1853     while (*code == OP_ALT);
1854     return TRUE;
1855     }
1856    
1857    
1858    
1859     /*************************************************
1860 nigel 33 * Check for starting with ^ or .* *
1861 nigel 3 *************************************************/
1862    
1863 nigel 33 /* This is called to find out if every branch starts with ^ or .* so that
1864     "first char" processing can be done to speed things up in multiline
1865     matching and for non-DOTALL patterns that start with .* (which must start at
1866     the beginning or after \n).
1867 nigel 3
1868     Argument: points to start of expression (the bracket)
1869     Returns: TRUE or FALSE
1870     */
1871    
1872     static BOOL
1873 nigel 7 is_startline(const uschar *code)
1874 nigel 3 {
1875     do {
1876 nigel 23 const uschar *scode = first_significant_code(code + 3, NULL, 0, FALSE);
1877     register int op = *scode;
1878     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
1879     { if (!is_startline(scode)) return FALSE; }
1880 nigel 33 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
1881     { if (scode[1] != OP_ANY) return FALSE; }
1882 nigel 23 else if (op != OP_CIRC) return FALSE;
1883 nigel 3 code += (code[1] << 8) + code[2];
1884     }
1885     while (*code == OP_ALT);
1886     return TRUE;
1887     }
1888    
1889    
1890    
1891     /*************************************************
1892     * Check for fixed first char *
1893     *************************************************/
1894    
1895     /* Try to find out if there is a fixed first character. This is called for
1896     unanchored expressions, as it speeds up their processing quite considerably.
1897     Consider each alternative branch. If they all start with the same char, or with
1898     a bracket all of whose alternatives start with the same char (recurse ad lib),
1899     then we return that char, otherwise -1.
1900    
1901 nigel 23 Arguments:
1902     code points to start of expression (the bracket)
1903     options pointer to the options (used to check casing changes)
1904    
1905     Returns: -1 or the fixed first char
1906 nigel 3 */
1907    
1908     static int
1909 nigel 23 find_firstchar(const uschar *code, int *options)
1910 nigel 3 {
1911     register int c = -1;
1912 nigel 23 do {
1913     int d;
1914     const uschar *scode = first_significant_code(code + 3, options,
1915     PCRE_CASELESS, TRUE);
1916     register int op = *scode;
1917 nigel 3
1918 nigel 23 if (op >= OP_BRA) op = OP_BRA;
1919 nigel 3
1920 nigel 23 switch(op)
1921     {
1922     default:
1923     return -1;
1924 nigel 3
1925 nigel 23 case OP_BRA:
1926     case OP_ASSERT:
1927     case OP_ONCE:
1928     case OP_COND:
1929     if ((d = find_firstchar(scode, options)) < 0) return -1;
1930     if (c < 0) c = d; else if (c != d) return -1;
1931     break;
1932 nigel 3
1933 nigel 23 case OP_EXACT: /* Fall through */
1934     scode++;
1935 nigel 3
1936 nigel 23 case OP_CHARS: /* Fall through */
1937     scode++;
1938    
1939     case OP_PLUS:
1940     case OP_MINPLUS:
1941     if (c < 0) c = scode[1]; else if (c != scode[1]) return -1;
1942     break;
1943     }
1944    
1945     code += (code[1] << 8) + code[2];
1946     }
1947 nigel 3 while (*code == OP_ALT);
1948     return c;
1949     }
1950    
1951    
1952    
1953 nigel 23
1954    
1955 nigel 3 /*************************************************
1956     * Compile a Regular Expression *
1957     *************************************************/
1958    
1959     /* This function takes a string and returns a pointer to a block of store
1960     holding a compiled version of the expression.
1961    
1962     Arguments:
1963     pattern the regular expression
1964     options various option bits
1965     errorptr pointer to pointer to error text
1966     erroroffset ptr offset in pattern where error was detected
1967 nigel 25 tables pointer to character tables or NULL
1968 nigel 3
1969     Returns: pointer to compiled data block, or NULL on error,
1970     with errorptr and erroroffset set
1971     */
1972    
1973     pcre *
1974 nigel 7 pcre_compile(const char *pattern, int options, const char **errorptr,
1975 nigel 25 int *erroroffset, const unsigned char *tables)
1976 nigel 3 {
1977     real_pcre *re;
1978     int length = 3; /* For initial BRA plus length */
1979     int runlength;
1980     int c, size;
1981     int bracount = 0;
1982     int top_backref = 0;
1983 nigel 23 int branch_extra = 0;
1984     int branch_newextra;
1985 nigel 7 unsigned int brastackptr = 0;
1986     uschar *code;
1987     const uschar *ptr;
1988 nigel 25 compile_data compile_block;
1989 nigel 23 int brastack[BRASTACK_SIZE];
1990     uschar bralenstack[BRASTACK_SIZE];
1991 nigel 3
1992     #ifdef DEBUG
1993     uschar *code_base, *code_end;
1994     #endif
1995    
1996     /* We can't pass back an error message if errorptr is NULL; I guess the best we
1997     can do is just return NULL. */
1998    
1999     if (errorptr == NULL) return NULL;
2000     *errorptr = NULL;
2001    
2002     /* However, we can give a message for this error */
2003    
2004     if (erroroffset == NULL)
2005     {
2006     *errorptr = ERR16;
2007     return NULL;
2008     }
2009     *erroroffset = 0;
2010    
2011     if ((options & ~PUBLIC_OPTIONS) != 0)
2012     {
2013     *errorptr = ERR17;
2014     return NULL;
2015     }
2016    
2017 nigel 25 /* Set up pointers to the individual character tables */
2018    
2019     if (tables == NULL) tables = pcre_default_tables;
2020     compile_block.lcc = tables + lcc_offset;
2021     compile_block.fcc = tables + fcc_offset;
2022     compile_block.cbits = tables + cbits_offset;
2023     compile_block.ctypes = tables + ctypes_offset;
2024    
2025     /* Reflect pattern for debugging output */
2026    
2027 nigel 9 DPRINTF(("------------------------------------------------------------------\n"));
2028     DPRINTF(("%s\n", pattern));
2029 nigel 3
2030     /* The first thing to do is to make a pass over the pattern to compute the
2031     amount of store required to hold the compiled code. This does not have to be
2032     perfect as long as errors are overestimates. At the same time we can detect any
2033     internal flag settings. Make an attempt to correct for any counted white space
2034     if an "extended" flag setting appears late in the pattern. We can't be so
2035     clever for #-comments. */
2036    
2037 nigel 7 ptr = (const uschar *)(pattern - 1);
2038 nigel 3 while ((c = *(++ptr)) != 0)
2039     {
2040     int min, max;
2041     int class_charcount;
2042    
2043 nigel 23 if ((options & PCRE_EXTENDED) != 0)
2044 nigel 3 {
2045 nigel 25 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2046 nigel 23 if (c == '#')
2047     {
2048     while ((c = *(++ptr)) != 0 && c != '\n');
2049     continue;
2050     }
2051 nigel 3 }
2052    
2053     switch(c)
2054     {
2055     /* A backslashed item may be an escaped "normal" character or a
2056     character type. For a "normal" character, put the pointers and
2057     character back so that tests for whitespace etc. in the input
2058     are done correctly. */
2059    
2060     case '\\':
2061     {
2062 nigel 7 const uschar *save_ptr = ptr;
2063 nigel 25 c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
2064 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2065     if (c >= 0)
2066     {
2067     ptr = save_ptr;
2068     c = '\\';
2069     goto NORMAL_CHAR;
2070     }
2071     }
2072     length++;
2073    
2074     /* A back reference needs an additional char, plus either one or 5
2075     bytes for a repeat. We also need to keep the value of the highest
2076     back reference. */
2077    
2078     if (c <= -ESC_REF)
2079     {
2080     int refnum = -c - ESC_REF;
2081     if (refnum > top_backref) top_backref = refnum;
2082     length++; /* For single back reference */
2083 nigel 25 if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2084 nigel 3 {
2085 nigel 25 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2086 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2087     if ((min == 0 && (max == 1 || max == -1)) ||
2088     (min == 1 && max == -1))
2089     length++;
2090     else length += 5;
2091     if (ptr[1] == '?') ptr++;
2092     }
2093     }
2094     continue;
2095    
2096     case '^':
2097     case '.':
2098     case '$':
2099     case '*': /* These repeats won't be after brackets; */
2100     case '+': /* those are handled separately */
2101     case '?':
2102     length++;
2103     continue;
2104    
2105     /* This covers the cases of repeats after a single char, metachar, class,
2106     or back reference. */
2107    
2108     case '{':
2109 nigel 25 if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
2110     ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
2111 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2112     if ((min == 0 && (max == 1 || max == -1)) ||
2113     (min == 1 && max == -1))
2114     length++;
2115     else
2116     {
2117     length--; /* Uncount the original char or metachar */
2118     if (min == 1) length++; else if (min > 0) length += 4;
2119     if (max > 0) length += 4; else length += 2;
2120     }
2121     if (ptr[1] == '?') ptr++;
2122     continue;
2123    
2124 nigel 23 /* An alternation contains an offset to the next branch or ket. If any ims
2125     options changed in the previous branch(es), and/or if we are in a
2126     lookbehind assertion, extra space will be needed at the start of the
2127     branch. This is handled by branch_extra. */
2128    
2129 nigel 3 case '|':
2130 nigel 23 length += 3 + branch_extra;
2131 nigel 3 continue;
2132    
2133     /* A character class uses 33 characters. Don't worry about character types
2134     that aren't allowed in classes - they'll get picked up during the compile.
2135     A character class that contains only one character uses 2 or 3 bytes,
2136     depending on whether it is negated or not. Notice this where we can. */
2137    
2138     case '[':
2139     class_charcount = 0;
2140     if (*(++ptr) == '^') ptr++;
2141     do
2142     {
2143     if (*ptr == '\\')
2144     {
2145 nigel 25 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
2146     &compile_block);
2147 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2148 nigel 9 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2149 nigel 3 }
2150     else class_charcount++;
2151     ptr++;
2152     }
2153     while (*ptr != 0 && *ptr != ']');
2154    
2155     /* Repeats for negated single chars are handled by the general code */
2156    
2157     if (class_charcount == 1) length += 3; else
2158     {
2159     length += 33;
2160    
2161     /* A repeat needs either 1 or 5 bytes. */
2162    
2163 nigel 25 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2164 nigel 3 {
2165 nigel 25 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2166 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2167     if ((min == 0 && (max == 1 || max == -1)) ||
2168     (min == 1 && max == -1))
2169     length++;
2170     else length += 5;
2171     if (ptr[1] == '?') ptr++;
2172     }
2173     }
2174     continue;
2175    
2176     /* Brackets may be genuine groups or special things */
2177    
2178     case '(':
2179 nigel 23 branch_newextra = 0;
2180 nigel 3
2181     /* Handle special forms of bracket, which all start (? */
2182    
2183 nigel 23 if (ptr[1] == '?')
2184 nigel 3 {
2185 nigel 23 int set, unset;
2186     int *optset;
2187    
2188     switch (c = ptr[2])
2189 nigel 3 {
2190 nigel 23 /* Skip over comments entirely */
2191     case '#':
2192     ptr += 3;
2193     while (*ptr != 0 && *ptr != ')') ptr++;
2194     if (*ptr == 0)
2195     {
2196     *errorptr = ERR18;
2197     goto PCRE_ERROR_RETURN;
2198     }
2199     continue;
2200 nigel 3
2201 nigel 23 /* Non-referencing groups and lookaheads just move the pointer on, and
2202     then behave like a non-special bracket, except that they don't increment
2203     the count of extracting brackets. Ditto for the "once only" bracket,
2204     which is in Perl from version 5.005. */
2205 nigel 3
2206 nigel 23 case ':':
2207     case '=':
2208     case '!':
2209     case '>':
2210 nigel 3 ptr += 2;
2211     break;
2212    
2213 nigel 23 /* Lookbehinds are in Perl from version 5.005 */
2214 nigel 3
2215 nigel 23 case '<':
2216     if (ptr[3] == '=' || ptr[3] == '!')
2217 nigel 3 {
2218 nigel 23 ptr += 3;
2219     branch_newextra = 3;
2220     length += 3; /* For the first branch */
2221     break;
2222 nigel 3 }
2223 nigel 23 *errorptr = ERR24;
2224     goto PCRE_ERROR_RETURN;
2225    
2226     /* Conditionals are in Perl from version 5.005. The bracket must either
2227     be followed by a number (for bracket reference) or by an assertion
2228     group. */
2229    
2230     case '(':
2231 nigel 25 if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
2232 nigel 3 {
2233 nigel 23 ptr += 4;
2234     length += 2;
2235 nigel 25 while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
2236 nigel 23 if (*ptr != ')')
2237     {
2238     *errorptr = ERR26;
2239     goto PCRE_ERROR_RETURN;
2240     }
2241 nigel 3 }
2242 nigel 23 else /* An assertion must follow */
2243 nigel 3 {
2244 nigel 23 ptr++; /* Can treat like ':' as far as spacing is concerned */
2245    
2246     if (ptr[2] != '?' || strchr("=!<", ptr[3]) == NULL)
2247     {
2248     ptr += 2; /* To get right offset in message */
2249     *errorptr = ERR28;
2250     goto PCRE_ERROR_RETURN;
2251     }
2252 nigel 3 }
2253 nigel 23 break;
2254    
2255     /* Else loop checking valid options until ) is met. Anything else is an
2256     error. If we are without any brackets, i.e. at top level, the settings
2257     act as if specified in the options, so massage the options immediately.
2258     This is for backward compatibility with Perl 5.004. */
2259    
2260     default:
2261     set = unset = 0;
2262     optset = &set;
2263     ptr += 2;
2264    
2265     for (;; ptr++)
2266 nigel 3 {
2267 nigel 23 c = *ptr;
2268     switch (c)
2269     {
2270     case 'i':
2271     *optset |= PCRE_CASELESS;
2272     continue;
2273    
2274     case 'm':
2275     *optset |= PCRE_MULTILINE;
2276     continue;
2277    
2278     case 's':
2279     *optset |= PCRE_DOTALL;
2280     continue;
2281    
2282     case 'x':
2283     *optset |= PCRE_EXTENDED;
2284     continue;
2285    
2286     case 'X':
2287     *optset |= PCRE_EXTRA;
2288     continue;
2289    
2290     case 'U':
2291     *optset |= PCRE_UNGREEDY;
2292     continue;
2293    
2294     case '-':
2295     optset = &unset;
2296     continue;
2297    
2298     /* A termination by ')' indicates an options-setting-only item;
2299     this is global at top level; otherwise nothing is done here and
2300     it is handled during the compiling process on a per-bracket-group
2301     basis. */
2302    
2303     case ')':
2304     if (brastackptr == 0)
2305     {
2306     options = (options | set) & (~unset);
2307     set = unset = 0; /* To save length */
2308     }
2309     /* Fall through */
2310    
2311     /* A termination by ':' indicates the start of a nested group with
2312     the given options set. This is again handled at compile time, but
2313     we must allow for compiled space if any of the ims options are
2314     set. We also have to allow for resetting space at the end of
2315     the group, which is why 4 is added to the length and not just 2.
2316     If there are several changes of options within the same group, this
2317     will lead to an over-estimate on the length, but this shouldn't
2318     matter very much. We also have to allow for resetting options at
2319     the start of any alternations, which we do by setting
2320     branch_newextra to 2. */
2321    
2322     case ':':
2323     if (((set|unset) & PCRE_IMS) != 0)
2324     {
2325     length += 4;
2326     branch_newextra = 2;
2327     }
2328     goto END_OPTIONS;
2329    
2330     /* Unrecognized option character */
2331    
2332     default:
2333     *errorptr = ERR12;
2334     goto PCRE_ERROR_RETURN;
2335     }
2336 nigel 3 }
2337 nigel 23
2338     /* If we hit a closing bracket, that's it - this is a freestanding
2339     option-setting. We need to ensure that branch_extra is updated if
2340     necessary. The only values branch_newextra can have here are 0 or 2.
2341     If the value is 2, then branch_extra must either be 2 or 5, depending
2342     on whether this is a lookbehind group or not. */
2343    
2344     END_OPTIONS:
2345     if (c == ')')
2346 nigel 19 {
2347 nigel 23 if (branch_newextra == 2 && (branch_extra == 0 || branch_extra == 3))
2348     branch_extra += branch_newextra;
2349 nigel 19 continue;
2350     }
2351 nigel 3
2352 nigel 23 /* If options were terminated by ':' control comes here. Fall through
2353     to handle the group below. */
2354 nigel 3 }
2355     }
2356    
2357     /* Extracting brackets must be counted so we can process escapes in a
2358     Perlish way. */
2359    
2360     else bracount++;
2361    
2362     /* Non-special forms of bracket. Save length for computing whole length
2363 nigel 23 at end if there's a repeat that requires duplication of the group. Also
2364     save the current value of branch_extra, and start the new group with
2365     the new value. If non-zero, this will either be 2 for a (?imsx: group, or 3
2366     for a lookbehind assertion. */
2367 nigel 3
2368     if (brastackptr >= sizeof(brastack)/sizeof(int))
2369     {
2370     *errorptr = ERR19;
2371     goto PCRE_ERROR_RETURN;
2372     }
2373    
2374 nigel 23 bralenstack[brastackptr] = branch_extra;
2375     branch_extra = branch_newextra;
2376    
2377 nigel 3 brastack[brastackptr++] = length;
2378     length += 3;
2379     continue;
2380    
2381     /* Handle ket. Look for subsequent max/min; for certain sets of values we
2382 nigel 9 have to replicate this bracket up to that many times. If brastackptr is
2383     0 this is an unmatched bracket which will generate an error, but take care
2384 nigel 23 not to try to access brastack[-1] when computing the length and restoring
2385     the branch_extra value. */
2386 nigel 3
2387     case ')':
2388     length += 3;
2389     {
2390 nigel 9 int minval = 1;
2391     int maxval = 1;
2392 nigel 23 int duplength;
2393 nigel 3
2394 nigel 23 if (brastackptr > 0)
2395     {
2396     duplength = length - brastack[--brastackptr];
2397     branch_extra = bralenstack[brastackptr];
2398     }
2399     else duplength = 0;
2400    
2401 nigel 3 /* Leave ptr at the final char; for read_repeat_counts this happens
2402     automatically; for the others we need an increment. */
2403    
2404 nigel 25 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
2405 nigel 3 {
2406 nigel 25 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr,
2407     &compile_block);
2408 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2409     }
2410 nigel 9 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2411     else if (c == '+') { maxval = -1; ptr++; }
2412     else if (c == '?') { minval = 0; ptr++; }
2413 nigel 3
2414 nigel 31 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2415     group, and if the maximum is greater than zero, we have to replicate
2416     maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2417     bracket set - hence the 7. */
2418 nigel 3
2419 nigel 31 if (minval == 0)
2420     {
2421     length++;
2422     if (maxval > 0) length += (maxval - 1) * (duplength + 7);
2423     }
2424    
2425     /* When the minimum is greater than zero, 1 we have to replicate up to
2426     minval-1 times, with no additions required in the copies. Then, if
2427     there is a limited maximum we have to replicate up to maxval-1 times
2428     allowing for a BRAZERO item before each optional copy and nesting
2429     brackets for all but one of the optional copies. */
2430    
2431     else
2432     {
2433     length += (minval - 1) * duplength;
2434     if (maxval > minval) /* Need this test as maxval=-1 means no limit */
2435     length += (maxval - minval) * (duplength + 7) - 6;
2436     }
2437 nigel 3 }
2438     continue;
2439    
2440     /* Non-special character. For a run of such characters the length required
2441     is the number of characters + 2, except that the maximum run length is 255.
2442     We won't get a skipped space or a non-data escape or the start of a #
2443     comment as the first character, so the length can't be zero. */
2444    
2445     NORMAL_CHAR:
2446     default:
2447     length += 2;
2448     runlength = 0;
2449     do
2450     {
2451 nigel 23 if ((options & PCRE_EXTENDED) != 0)
2452 nigel 3 {
2453 nigel 25 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2454 nigel 23 if (c == '#')
2455     {
2456     while ((c = *(++ptr)) != 0 && c != '\n');
2457     continue;
2458     }
2459 nigel 3 }
2460    
2461     /* Backslash may introduce a data char or a metacharacter; stop the
2462     string before the latter. */
2463    
2464     if (c == '\\')
2465     {
2466 nigel 7 const uschar *saveptr = ptr;
2467 nigel 25 c = check_escape(&ptr, errorptr, bracount, options, FALSE,
2468     &compile_block);
2469 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2470     if (c < 0) { ptr = saveptr; break; }
2471     }
2472    
2473     /* Ordinary character or single-char escape */
2474    
2475     runlength++;
2476     }
2477    
2478     /* This "while" is the end of the "do" above. */
2479    
2480 nigel 25 while (runlength < 255 &&
2481     (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
2482 nigel 3
2483     ptr--;
2484     length += runlength;
2485     continue;
2486     }
2487     }
2488    
2489     length += 4; /* For final KET and END */
2490    
2491     if (length > 65539)
2492     {
2493     *errorptr = ERR20;
2494     return NULL;
2495     }
2496    
2497     /* Compute the size of data block needed and get it, either from malloc or
2498 nigel 9 externally provided function. We specify "code[0]" in the offsetof() expression
2499     rather than just "code", because it has been reported that one broken compiler
2500     fails on "code" because it is also an independent variable. It should make no
2501     difference to the value of the offsetof(). */
2502 nigel 3
2503 nigel 9 size = length + offsetof(real_pcre, code[0]);
2504 nigel 3 re = (real_pcre *)(pcre_malloc)(size);
2505    
2506     if (re == NULL)
2507     {
2508     *errorptr = ERR21;
2509     return NULL;
2510     }
2511    
2512 nigel 9 /* Put in the magic number and the options. */
2513    
2514 nigel 3 re->magic_number = MAGIC_NUMBER;
2515     re->options = options;
2516 nigel 25 re->tables = tables;
2517 nigel 3
2518     /* Set up a starting, non-extracting bracket, then compile the expression. On
2519     error, *errorptr will be set non-NULL, so we don't need to look at the result
2520     of the function here. */
2521    
2522 nigel 7 ptr = (const uschar *)pattern;
2523 nigel 3 code = re->code;
2524     *code = OP_BRA;
2525     bracount = 0;
2526 nigel 25 (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,
2527     &compile_block);
2528 nigel 3 re->top_bracket = bracount;
2529     re->top_backref = top_backref;
2530    
2531     /* If not reached end of pattern on success, there's an excess bracket. */
2532    
2533     if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
2534    
2535     /* Fill in the terminating state and check for disastrous overflow, but
2536     if debugging, leave the test till after things are printed out. */
2537    
2538     *code++ = OP_END;
2539    
2540     #ifndef DEBUG
2541     if (code - re->code > length) *errorptr = ERR23;
2542     #endif
2543    
2544 nigel 23 /* Give an error if there's back reference to a non-existent capturing
2545     subpattern. */
2546    
2547     if (top_backref > re->top_bracket) *errorptr = ERR15;
2548    
2549 nigel 3 /* Failed to compile */
2550    
2551     if (*errorptr != NULL)
2552     {
2553     (pcre_free)(re);
2554     PCRE_ERROR_RETURN:
2555 nigel 7 *erroroffset = ptr - (const uschar *)pattern;
2556 nigel 3 return NULL;
2557     }
2558    
2559 nigel 33 /* If the anchored option was not passed, set flag if we can determine that the
2560     pattern is anchored by virtue of ^ characters or \A or anything else (such as
2561     starting with .* when DOTALL is set).
2562 nigel 3
2563 nigel 33 Otherwise, see if we can determine what the first character has to be, because
2564     that speeds up unanchored matches no end. If not, see if we can set the
2565     PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
2566     start with ^. and also when all branches start with .* for non-DOTALL matches.
2567     */
2568    
2569 nigel 3 if ((options & PCRE_ANCHORED) == 0)
2570     {
2571 nigel 23 int temp_options = options;
2572     if (is_anchored(re->code, &temp_options))
2573 nigel 3 re->options |= PCRE_ANCHORED;
2574     else
2575     {
2576 nigel 23 int ch = find_firstchar(re->code, &temp_options);
2577 nigel 9 if (ch >= 0)
2578 nigel 3 {
2579 nigel 9 re->first_char = ch;
2580 nigel 3 re->options |= PCRE_FIRSTSET;
2581     }
2582     else if (is_startline(re->code))
2583     re->options |= PCRE_STARTLINE;
2584     }
2585     }
2586    
2587     /* Print out the compiled data for debugging */
2588    
2589     #ifdef DEBUG
2590    
2591 nigel 23 printf("Length = %d top_bracket = %d top_backref = %d\n",
2592 nigel 3 length, re->top_bracket, re->top_backref);
2593    
2594     if (re->options != 0)
2595     {
2596 nigel 19 printf("%s%s%s%s%s%s%s%s\n",
2597 nigel 3 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2598     ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2599     ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2600     ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2601     ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
2602     ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
2603 nigel 19 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
2604     ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
2605 nigel 3 }
2606    
2607     if ((re->options & PCRE_FIRSTSET) != 0)
2608     {
2609     if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
2610     else printf("First char = \\x%02x\n", re->first_char);
2611     }
2612    
2613     code_end = code;
2614     code_base = code = re->code;
2615    
2616     while (code < code_end)
2617     {
2618     int charlength;
2619    
2620     printf("%3d ", code - code_base);
2621    
2622     if (*code >= OP_BRA)
2623     {
2624     printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
2625     code += 2;
2626     }
2627    
2628     else switch(*code)
2629     {
2630 nigel 23 case OP_OPT:
2631     printf(" %.2x %s", code[1], OP_names[*code]);
2632     code++;
2633     break;
2634    
2635     case OP_COND:
2636     printf("%3d Cond", (code[1] << 8) + code[2]);
2637     code += 2;
2638     break;
2639    
2640     case OP_CREF:
2641     printf(" %.2d %s", code[1], OP_names[*code]);
2642     code++;
2643     break;
2644    
2645 nigel 3 case OP_CHARS:
2646     charlength = *(++code);
2647     printf("%3d ", charlength);
2648     while (charlength-- > 0)
2649     if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2650     break;
2651    
2652     case OP_KETRMAX:
2653     case OP_KETRMIN:
2654     case OP_ALT:
2655     case OP_KET:
2656     case OP_ASSERT:
2657     case OP_ASSERT_NOT:
2658 nigel 23 case OP_ASSERTBACK:
2659     case OP_ASSERTBACK_NOT:
2660 nigel 3 case OP_ONCE:
2661     printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2662     code += 2;
2663     break;
2664    
2665 nigel 23 case OP_REVERSE:
2666     printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2667     code += 2;
2668     break;
2669    
2670 nigel 3 case OP_STAR:
2671     case OP_MINSTAR:
2672     case OP_PLUS:
2673     case OP_MINPLUS:
2674     case OP_QUERY:
2675     case OP_MINQUERY:
2676     case OP_TYPESTAR:
2677     case OP_TYPEMINSTAR:
2678     case OP_TYPEPLUS:
2679     case OP_TYPEMINPLUS:
2680     case OP_TYPEQUERY:
2681     case OP_TYPEMINQUERY:
2682     if (*code >= OP_TYPESTAR)
2683     printf(" %s", OP_names[code[1]]);
2684     else if (isprint(c = code[1])) printf(" %c", c);
2685     else printf(" \\x%02x", c);
2686     printf("%s", OP_names[*code++]);
2687     break;
2688    
2689     case OP_EXACT:
2690     case OP_UPTO:
2691     case OP_MINUPTO:
2692     if (isprint(c = code[3])) printf(" %c{", c);
2693     else printf(" \\x%02x{", c);
2694 nigel 11 if (*code != OP_EXACT) printf("0,");
2695 nigel 3 printf("%d}", (code[1] << 8) + code[2]);
2696     if (*code == OP_MINUPTO) printf("?");
2697     code += 3;
2698     break;
2699    
2700     case OP_TYPEEXACT:
2701     case OP_TYPEUPTO:
2702     case OP_TYPEMINUPTO:
2703     printf(" %s{", OP_names[code[3]]);
2704     if (*code != OP_TYPEEXACT) printf(",");
2705     printf("%d}", (code[1] << 8) + code[2]);
2706     if (*code == OP_TYPEMINUPTO) printf("?");
2707     code += 3;
2708     break;
2709    
2710     case OP_NOT:
2711     if (isprint(c = *(++code))) printf(" [^%c]", c);
2712     else printf(" [^\\x%02x]", c);
2713     break;
2714    
2715     case OP_NOTSTAR:
2716     case OP_NOTMINSTAR:
2717     case OP_NOTPLUS:
2718     case OP_NOTMINPLUS:
2719     case OP_NOTQUERY:
2720     case OP_NOTMINQUERY:
2721     if (isprint(c = code[1])) printf(" [^%c]", c);
2722     else printf(" [^\\x%02x]", c);
2723     printf("%s", OP_names[*code++]);
2724     break;
2725    
2726     case OP_NOTEXACT:
2727     case OP_NOTUPTO:
2728     case OP_NOTMINUPTO:
2729     if (isprint(c = code[3])) printf(" [^%c]{", c);
2730     else printf(" [^\\x%02x]{", c);
2731     if (*code != OP_NOTEXACT) printf(",");
2732     printf("%d}", (code[1] << 8) + code[2]);
2733     if (*code == OP_NOTMINUPTO) printf("?");
2734     code += 3;
2735     break;
2736    
2737     case OP_REF:
2738     printf(" \\%d", *(++code));
2739 nigel 9 code ++;
2740     goto CLASS_REF_REPEAT;
2741 nigel 3
2742     case OP_CLASS:
2743     {
2744     int i, min, max;
2745 nigel 23 code++;
2746     printf(" [");
2747 nigel 3
2748     for (i = 0; i < 256; i++)
2749     {
2750     if ((code[i/8] & (1 << (i&7))) != 0)
2751     {
2752     int j;
2753     for (j = i+1; j < 256; j++)
2754     if ((code[j/8] & (1 << (j&7))) == 0) break;
2755     if (i == '-' || i == ']') printf("\\");
2756     if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2757     if (--j > i)
2758     {
2759     printf("-");
2760     if (j == '-' || j == ']') printf("\\");
2761     if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2762     }
2763     i = j;
2764     }
2765     }
2766     printf("]");
2767     code += 32;
2768    
2769 nigel 9 CLASS_REF_REPEAT:
2770    
2771 nigel 3 switch(*code)
2772     {
2773     case OP_CRSTAR:
2774     case OP_CRMINSTAR:
2775     case OP_CRPLUS:
2776     case OP_CRMINPLUS:
2777     case OP_CRQUERY:
2778     case OP_CRMINQUERY:
2779     printf("%s", OP_names[*code]);
2780     break;
2781    
2782     case OP_CRRANGE:
2783     case OP_CRMINRANGE:
2784     min = (code[1] << 8) + code[2];
2785     max = (code[3] << 8) + code[4];
2786     if (max == 0) printf("{%d,}", min);
2787     else printf("{%d,%d}", min, max);
2788     if (*code == OP_CRMINRANGE) printf("?");
2789     code += 4;
2790     break;
2791    
2792     default:
2793     code--;
2794     }
2795     }
2796     break;
2797    
2798     /* Anything else is just a one-node item */
2799    
2800     default:
2801     printf(" %s", OP_names[*code]);
2802     break;
2803     }
2804    
2805     code++;
2806     printf("\n");
2807     }
2808     printf("------------------------------------------------------------------\n");
2809    
2810     /* This check is done here in the debugging case so that the code that
2811     was compiled can be seen. */
2812    
2813     if (code - re->code > length)
2814     {
2815     *errorptr = ERR23;
2816     (pcre_free)(re);
2817     *erroroffset = ptr - (uschar *)pattern;
2818     return NULL;
2819     }
2820     #endif
2821    
2822     return (pcre *)re;
2823     }
2824    
2825    
2826    
2827     /*************************************************
2828     * Match a back-reference *
2829     *************************************************/
2830    
2831 nigel 23 /* If a back reference hasn't been set, the length that is passed is greater
2832     than the number of characters left in the string, so the match fails.
2833 nigel 3
2834     Arguments:
2835 nigel 23 offset index into the offset vector
2836 nigel 3 eptr points into the subject
2837     length length to be matched
2838     md points to match data block
2839 nigel 23 ims the ims flags
2840 nigel 3
2841     Returns: TRUE if matched
2842     */
2843    
2844     static BOOL
2845 nigel 23 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
2846     int ims)
2847 nigel 3 {
2848 nigel 23 const uschar *p = md->start_subject + md->offset_vector[offset];
2849 nigel 3
2850     #ifdef DEBUG
2851     if (eptr >= md->end_subject)
2852     printf("matching subject <null>");
2853     else
2854     {
2855     printf("matching subject ");
2856     pchars(eptr, length, TRUE, md);
2857     }
2858     printf(" against backref ");
2859     pchars(p, length, FALSE, md);
2860     printf("\n");
2861     #endif
2862    
2863     /* Always fail if not enough characters left */
2864    
2865 nigel 23 if (length > md->end_subject - eptr) return FALSE;
2866 nigel 3
2867     /* Separate the caselesss case for speed */
2868    
2869 nigel 23 if ((ims & PCRE_CASELESS) != 0)
2870 nigel 25 {
2871     while (length-- > 0)
2872     if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
2873     }
2874 nigel 3 else
2875     { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
2876    
2877     return TRUE;
2878     }
2879    
2880    
2881    
2882     /*************************************************
2883     * Match from current position *
2884     *************************************************/
2885    
2886 nigel 23 /* On entry ecode points to the first opcode, and eptr to the first character
2887     in the subject string, while eptrb holds the value of eptr at the start of the
2888     last bracketed group - used for breaking infinite loops matching zero-length
2889     strings.
2890 nigel 3
2891     Arguments:
2892     eptr pointer in subject
2893     ecode position in code
2894     offset_top current top pointer
2895     md pointer to "static" info for the match
2896 nigel 23 ims current /i, /m, and /s options
2897     condassert TRUE if called to check a condition assertion
2898     eptrb eptr at start of last bracket
2899 nigel 3
2900     Returns: TRUE if matched
2901     */
2902    
2903     static BOOL
2904 nigel 23 match(register const uschar *eptr, register const uschar *ecode,
2905     int offset_top, match_data *md, int ims, BOOL condassert, const uschar *eptrb)
2906 nigel 3 {
2907 nigel 23 int original_ims = ims; /* Save for resetting on ')' */
2908    
2909 nigel 3 for (;;)
2910     {
2911 nigel 23 int op = (int)*ecode;
2912 nigel 3 int min, max, ctype;
2913     register int i;
2914     register int c;
2915 nigel 7 BOOL minimize = FALSE;
2916 nigel 3
2917 nigel 23 /* Opening capturing bracket. If there is space in the offset vector, save
2918     the current subject position in the working slot at the top of the vector. We
2919     mustn't change the current values of the data slot, because they may be set
2920     from a previous iteration of this group, and be referred to by a reference
2921     inside the group.
2922 nigel 3
2923 nigel 23 If the bracket fails to match, we need to restore this value and also the
2924     values of the final offsets, in case they were set by a previous iteration of
2925     the same bracket.
2926    
2927     If there isn't enough space in the offset vector, treat this as if it were a
2928     non-capturing bracket. Don't worry about setting the flag for the error case
2929     here; that is handled in the code for KET. */
2930    
2931     if (op > OP_BRA)
2932 nigel 3 {
2933 nigel 23 int number = op - OP_BRA;
2934     int offset = number << 1;
2935 nigel 3
2936 nigel 31 #ifdef DEBUG
2937     printf("start bracket %d subject=", number);
2938     pchars(eptr, 16, TRUE, md);
2939     printf("\n");
2940     #endif
2941 nigel 3
2942 nigel 23 if (offset < md->offset_max)
2943 nigel 3 {
2944 nigel 23 int save_offset1 = md->offset_vector[offset];
2945     int save_offset2 = md->offset_vector[offset+1];
2946     int save_offset3 = md->offset_vector[md->offset_end - number];
2947 nigel 3
2948 nigel 23 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
2949     md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
2950    
2951     do
2952     {
2953     if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
2954     ecode += (ecode[1] << 8) + ecode[2];
2955     }
2956     while (*ecode == OP_ALT);
2957    
2958     DPRINTF(("bracket %d failed\n", number));
2959    
2960     md->offset_vector[offset] = save_offset1;
2961     md->offset_vector[offset+1] = save_offset2;
2962     md->offset_vector[md->offset_end - number] = save_offset3;
2963     return FALSE;
2964 nigel 3 }
2965    
2966 nigel 23 /* Insufficient room for saving captured contents */
2967 nigel 3
2968 nigel 23 else op = OP_BRA;
2969     }
2970    
2971     /* Other types of node can be handled by a switch */
2972    
2973     switch(op)
2974     {
2975     case OP_BRA: /* Non-capturing bracket: optimized */
2976     DPRINTF(("start bracket 0\n"));
2977 nigel 3 do
2978     {
2979 nigel 23 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
2980 nigel 3 ecode += (ecode[1] << 8) + ecode[2];
2981     }
2982     while (*ecode == OP_ALT);
2983 nigel 23 DPRINTF(("bracket 0 failed\n"));
2984     return FALSE;
2985 nigel 3
2986 nigel 23 /* Conditional group: compilation checked that there are no more than
2987     two branches. If the condition is false, skipping the first branch takes us
2988     past the end if there is only one branch, but that's OK because that is
2989     exactly what going to the ket would do. */
2990 nigel 3
2991 nigel 23 case OP_COND:
2992     if (ecode[3] == OP_CREF) /* Condition is extraction test */
2993 nigel 3 {
2994 nigel 23 int offset = ecode[4] << 1; /* Doubled reference number */
2995     return match(eptr,
2996     ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
2997     5 : 3 + (ecode[1] << 8) + ecode[2]),
2998     offset_top, md, ims, FALSE, eptr);
2999 nigel 3 }
3000    
3001 nigel 23 /* The condition is an assertion. Call match() to evaluate it - setting
3002     the final argument TRUE causes it to stop at the end of an assertion. */
3003 nigel 3
3004 nigel 23 else
3005     {
3006     if (match(eptr, ecode+3, offset_top, md, ims, TRUE, NULL))
3007     {
3008     ecode += 3 + (ecode[4] << 8) + ecode[5];
3009     while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
3010     }
3011     else ecode += (ecode[1] << 8) + ecode[2];
3012     return match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr);
3013     }
3014     /* Control never reaches here */
3015 nigel 3
3016 nigel 23 /* Skip over conditional reference data if encountered (should not be) */
3017    
3018     case OP_CREF:
3019     ecode += 2;
3020     break;
3021    
3022     /* End of the pattern */
3023    
3024 nigel 3 case OP_END:
3025     md->end_match_ptr = eptr; /* Record where we ended */
3026     md->end_offset_top = offset_top; /* and how many extracts were taken */
3027     return TRUE;
3028    
3029 nigel 23 /* Change option settings */
3030 nigel 3
3031 nigel 23 case OP_OPT:
3032     ims = ecode[1];
3033     ecode += 2;
3034     DPRINTF(("ims set to %02x\n", ims));
3035     break;
3036 nigel 3
3037     /* Assertion brackets. Check the alternative branches in turn - the
3038     matching won't pass the KET for an assertion. If any one branch matches,
3039 nigel 23 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
3040     start of each branch to move the current point backwards, so the code at
3041     this level is identical to the lookahead case. */
3042 nigel 3
3043     case OP_ASSERT:
3044 nigel 23 case OP_ASSERTBACK:
3045 nigel 3 do
3046     {
3047 nigel 23 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) break;
3048 nigel 3 ecode += (ecode[1] << 8) + ecode[2];
3049     }
3050     while (*ecode == OP_ALT);
3051     if (*ecode == OP_KET) return FALSE;
3052    
3053 nigel 23 /* If checking an assertion for a condition, return TRUE. */
3054    
3055     if (condassert) return TRUE;
3056    
3057 nigel 3 /* Continue from after the assertion, updating the offsets high water
3058     mark, since extracts may have been taken during the assertion. */
3059    
3060     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3061     ecode += 3;
3062     offset_top = md->end_offset_top;
3063     continue;
3064    
3065     /* Negative assertion: all branches must fail to match */
3066    
3067     case OP_ASSERT_NOT:
3068 nigel 23 case OP_ASSERTBACK_NOT:
3069 nigel 3 do
3070     {
3071 nigel 23 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) return FALSE;
3072 nigel 3 ecode += (ecode[1] << 8) + ecode[2];
3073     }
3074     while (*ecode == OP_ALT);
3075 nigel 23
3076     if (condassert) return TRUE;
3077 nigel 3 ecode += 3;
3078     continue;
3079    
3080 nigel 23 /* Move the subject pointer back. This occurs only at the start of
3081     each branch of a lookbehind assertion. If we are too close to the start to
3082     move back, this match function fails. */
3083    
3084     case OP_REVERSE:
3085     eptr -= (ecode[1] << 8) + ecode[2];
3086     if (eptr < md->start_subject) return FALSE;
3087     ecode += 3;
3088     break;
3089    
3090    
3091 nigel 3 /* "Once" brackets are like assertion brackets except that after a match,
3092     the point in the subject string is not moved back. Thus there can never be
3093 nigel 5 a move back into the brackets. Check the alternative branches in turn - the
3094 nigel 3 matching won't pass the KET for this kind of subpattern. If any one branch
3095 nigel 23 matches, we carry on as at the end of a normal bracket, leaving the subject
3096     pointer. */
3097 nigel 3
3098     case OP_ONCE:
3099     {
3100 nigel 23 const uschar *prev = ecode;
3101 nigel 3
3102 nigel 23 do
3103     {
3104     if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) break;
3105     ecode += (ecode[1] << 8) + ecode[2];
3106     }
3107     while (*ecode == OP_ALT);
3108 nigel 3
3109 nigel 23 /* If hit the end of the group (which could be repeated), fail */
3110 nigel 3
3111 nigel 23 if (*ecode != OP_ONCE && *ecode != OP_ALT) return FALSE;
3112    
3113     /* Continue as from after the assertion, updating the offsets high water
3114     mark, since extracts may have been taken. */
3115    
3116     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3117    
3118     offset_top = md->end_offset_top;
3119     eptr = md->end_match_ptr;
3120    
3121     /* For a non-repeating ket, just continue at this level. This also
3122     happens for a repeating ket if no characters were matched in the group.
3123     This is the forcible breaking of infinite loops as implemented in Perl
3124     5.005. If there is an options reset, it will get obeyed in the normal
3125     course of events. */
3126    
3127     if (*ecode == OP_KET || eptr == eptrb)
3128     {
3129     ecode += 3;
3130     break;
3131     }
3132    
3133     /* The repeating kets try the rest of the pattern or restart from the
3134     preceding bracket, in the appropriate order. We need to reset any options
3135     that changed within the bracket before re-running it, so check the next
3136     opcode. */
3137    
3138     if (ecode[3] == OP_OPT)
3139     {
3140     ims = (ims & ~PCRE_IMS) | ecode[4];
3141     DPRINTF(("ims set to %02x at group repeat\n", ims));
3142     }
3143    
3144     if (*ecode == OP_KETRMIN)
3145     {
3146     if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||
3147     match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;
3148     }
3149     else /* OP_KETRMAX */
3150     {
3151     if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||
3152     match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3153     }
3154     }
3155     return FALSE;
3156    
3157 nigel 3 /* An alternation is the end of a branch; scan along to find the end of the
3158     bracketed group and go to there. */
3159    
3160     case OP_ALT:
3161     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3162     break;
3163    
3164     /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3165     that it may occur zero times. It may repeat infinitely, or not at all -
3166     i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3167     repeat limits are compiled as a number of copies, with the optional ones
3168     preceded by BRAZERO or BRAMINZERO. */
3169    
3170     case OP_BRAZERO:
3171     {
3172 nigel 7 const uschar *next = ecode+1;
3173 nigel 23 if (match(eptr, next, offset_top, md, ims, FALSE, eptr)) return TRUE;
3174 nigel 3 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3175     ecode = next + 3;
3176     }
3177     break;
3178    
3179     case OP_BRAMINZERO:
3180     {
3181 nigel 7 const uschar *next = ecode+1;
3182 nigel 3 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3183 nigel 23 if (match(eptr, next+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3184 nigel 3 ecode++;
3185     }
3186 nigel 23 break;
3187 nigel 3
3188     /* End of a group, repeated or non-repeating. If we are at the end of
3189     an assertion "group", stop matching and return TRUE, but record the
3190 nigel 23 current high water mark for use by positive assertions. Do this also
3191     for the "once" (not-backup up) groups. */
3192 nigel 3
3193     case OP_KET:
3194     case OP_KETRMIN:
3195     case OP_KETRMAX:
3196     {
3197 nigel 7 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3198 nigel 3
3199 nigel 23 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
3200     *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
3201     *prev == OP_ONCE)
3202 nigel 3 {
3203     md->end_match_ptr = eptr; /* For ONCE */
3204     md->end_offset_top = offset_top;
3205     return TRUE;
3206     }
3207    
3208 nigel 23 /* In all other cases except a conditional group we have to check the
3209     group number back at the start and if necessary complete handling an
3210     extraction by setting the offsets and bumping the high water mark. */
3211 nigel 3
3212 nigel 23 if (*prev != OP_COND)
3213     {
3214     int number = *prev - OP_BRA;
3215     int offset = number << 1;
3216 nigel 3
3217 nigel 23 DPRINTF(("end bracket %d\n", number));
3218 nigel 3
3219 nigel 23 if (number > 0)
3220 nigel 3 {
3221 nigel 23 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
3222     {
3223     md->offset_vector[offset] =
3224     md->offset_vector[md->offset_end - number];
3225     md->offset_vector[offset+1] = eptr - md->start_subject;
3226     if (offset_top <= offset) offset_top = offset + 2;
3227     }
3228 nigel 3 }
3229     }
3230    
3231 nigel 23 /* Reset the value of the ims flags, in case they got changed during
3232     the group. */
3233 nigel 3
3234 nigel 23 ims = original_ims;
3235     DPRINTF(("ims reset to %02x\n", ims));
3236    
3237     /* For a non-repeating ket, just continue at this level. This also
3238     happens for a repeating ket if no characters were matched in the group.
3239     This is the forcible breaking of infinite loops as implemented in Perl
3240     5.005. If there is an options reset, it will get obeyed in the normal
3241     course of events. */
3242    
3243     if (*ecode == OP_KET || eptr == eptrb)
3244 nigel 3 {
3245     ecode += 3;
3246     break;
3247     }
3248    
3249     /* The repeating kets try the rest of the pattern or restart from the
3250     preceding bracket, in the appropriate order. */
3251    
3252     if (*ecode == OP_KETRMIN)
3253     {
3254 nigel 23 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||
3255     match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;
3256 nigel 3 }
3257     else /* OP_KETRMAX */
3258     {
3259 nigel 23 if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||
3260     match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3261 nigel 3 }
3262     }
3263     return FALSE;
3264    
3265     /* Start of subject unless notbol, or after internal newline if multiline */
3266    
3267     case OP_CIRC:
3268     if (md->notbol && eptr == md->start_subject) return FALSE;
3269 nigel 23 if ((ims & PCRE_MULTILINE) != 0)
3270 nigel 3 {
3271     if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;
3272     ecode++;
3273     break;
3274     }
3275     /* ... else fall through */
3276    
3277     /* Start of subject assertion */
3278    
3279     case OP_SOD:
3280     if (eptr != md->start_subject) return FALSE;
3281     ecode++;
3282     break;
3283    
3284 nigel 23 /* Assert before internal newline if multiline, or before a terminating
3285     newline unless endonly is set, else end of subject unless noteol is set. */
3286 nigel 3
3287     case OP_DOLL:
3288 nigel 23 if ((ims & PCRE_MULTILINE) != 0)
3289 nigel 3 {
3290 nigel 23 if (eptr < md->end_subject) { if (*eptr != '\n') return FALSE; }
3291     else { if (md->noteol) return FALSE; }
3292 nigel 3 ecode++;
3293     break;
3294     }
3295 nigel 23 else
3296 nigel 3 {
3297 nigel 23 if (md->noteol) return FALSE;
3298     if (!md->endonly)
3299     {
3300     if (eptr < md->end_subject - 1 ||
3301     (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3302    
3303     ecode++;
3304     break;
3305     }
3306 nigel 3 }
3307     /* ... else fall through */
3308    
3309 nigel 23 /* End of subject assertion (\z) */
3310 nigel 3
3311     case OP_EOD:
3312     if (eptr < md->end_subject) return FALSE;
3313     ecode++;
3314     break;
3315    
3316 nigel 23 /* End of subject or ending \n assertion (\Z) */
3317    
3318     case OP_EODN:
3319     if (eptr < md->end_subject - 1 ||
3320     (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3321     ecode++;
3322     break;
3323    
3324 nigel 3 /* Word boundary assertions */
3325    
3326     case OP_NOT_WORD_BOUNDARY:
3327     case OP_WORD_BOUNDARY:
3328     {
3329     BOOL prev_is_word = (eptr != md->start_subject) &&
3330 nigel 25 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
3331 nigel 3 BOOL cur_is_word = (eptr < md->end_subject) &&
3332 nigel 25 ((md->ctypes[*eptr] & ctype_word) != 0);
3333 nigel 3 if ((*ecode++ == OP_WORD_BOUNDARY)?
3334     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3335     return FALSE;
3336     }
3337     break;
3338    
3339     /* Match a single character type; inline for speed */
3340    
3341     case OP_ANY:
3342 nigel 23 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == '\n')
3343     return FALSE;
3344 nigel 3 if (eptr++ >= md->end_subject) return FALSE;
3345     ecode++;
3346     break;
3347    
3348     case OP_NOT_DIGIT:
3349 nigel 25 if (eptr >= md->end_subject ||
3350     (md->ctypes[*eptr++] & ctype_digit) != 0)
3351 nigel 3 return FALSE;
3352     ecode++;
3353     break;
3354    
3355     case OP_DIGIT:
3356 nigel 25 if (eptr >= md->end_subject ||
3357     (md->ctypes[*eptr++] & ctype_digit) == 0)
3358 nigel 3 return FALSE;
3359     ecode++;
3360     break;
3361    
3362     case OP_NOT_WHITESPACE:
3363 nigel 25 if (eptr >= md->end_subject ||
3364     (md->ctypes[*eptr++] & ctype_space) != 0)
3365 nigel 3 return FALSE;
3366     ecode++;
3367     break;
3368    
3369     case OP_WHITESPACE:
3370 nigel 25 if (eptr >= md->end_subject ||
3371     (md->ctypes[*eptr++] & ctype_space) == 0)
3372 nigel 3 return FALSE;
3373     ecode++;
3374     break;
3375    
3376     case OP_NOT_WORDCHAR:
3377 nigel 25 if (eptr >= md->end_subject ||
3378     (md->ctypes[*eptr++] & ctype_word) != 0)
3379 nigel 3 return FALSE;
3380     ecode++;
3381     break;
3382    
3383     case OP_WORDCHAR:
3384 nigel 25 if (eptr >= md->end_subject ||
3385     (md->ctypes[*eptr++] & ctype_word) == 0)
3386 nigel 3 return FALSE;
3387     ecode++;
3388     break;
3389    
3390     /* Match a back reference, possibly repeatedly. Look past the end of the
3391     item to see if there is repeat information following. The code is similar
3392     to that for character classes, but repeated for efficiency. Then obey
3393     similar code to character type repeats - written out again for speed.
3394     However, if the referenced string is the empty string, always treat
3395     it as matched, any number of times (otherwise there could be infinite
3396     loops). */
3397    
3398     case OP_REF:
3399     {
3400     int length;
3401 nigel 23 int offset = ecode[1] << 1; /* Doubled reference number */
3402 nigel 3 ecode += 2; /* Advance past the item */
3403    
3404 nigel 23 /* If the reference is unset, set the length to be longer than the amount
3405     of subject left; this ensures that every attempt at a match fails. We
3406     can't just fail here, because of the possibility of quantifiers with zero
3407     minima. */
3408 nigel 3
3409 nigel 23 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
3410     md->end_subject - eptr + 1 :
3411     md->offset_vector[offset+1] - md->offset_vector[offset];
3412 nigel 3
3413 nigel 23 /* Set up for repetition, or handle the non-repeated case */
3414    
3415 nigel 3 switch (*ecode)
3416     {
3417     case OP_CRSTAR:
3418     case OP_CRMINSTAR:
3419     case OP_CRPLUS:
3420     case OP_CRMINPLUS:
3421     case OP_CRQUERY:
3422     case OP_CRMINQUERY:
3423     c = *ecode++ - OP_CRSTAR;
3424     minimize = (c & 1) != 0;
3425     min = rep_min[c]; /* Pick up values from tables; */
3426     max = rep_max[c]; /* zero for max => infinity */
3427     if (max == 0) max = INT_MAX;
3428     break;
3429    
3430     case OP_CRRANGE:
3431     case OP_CRMINRANGE:
3432     minimize = (*ecode == OP_CRMINRANGE);
3433     min = (ecode[1] << 8) + ecode[2];
3434     max = (ecode[3] << 8) + ecode[4];
3435     if (max == 0) max = INT_MAX;
3436     ecode += 5;
3437     break;
3438    
3439     default: /* No repeat follows */
3440 nigel 23 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
3441 nigel 3 eptr += length;
3442     continue; /* With the main loop */
3443     }
3444    
3445     /* If the length of the reference is zero, just continue with the
3446     main loop. */
3447    
3448     if (length == 0) continue;
3449    
3450     /* First, ensure the minimum number of matches are present. We get back
3451     the length of the reference string explicitly rather than passing the
3452     address of eptr, so that eptr can be a register variable. */
3453    
3454     for (i = 1; i <= min; i++)
3455     {
3456 nigel 23 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
3457 nigel 3 eptr += length;
3458     }
3459    
3460     /* If min = max, continue at the same level without recursion.
3461     They are not both allowed to be zero. */
3462    
3463     if (min == max) continue;
3464    
3465     /* If minimizing, keep trying and advancing the pointer */
3466    
3467     if (minimize)
3468     {
3469     for (i = min;; i++)
3470     {
3471 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3472     return TRUE;
3473     if (i >= max || !match_ref(offset, eptr, length, md, ims))
3474 nigel 3 return FALSE;
3475     eptr += length;
3476     }
3477     /* Control never gets here */
3478     }
3479    
3480     /* If maximizing, find the longest string and work backwards */
3481    
3482     else
3483     {
3484 nigel 7 const uschar *pp = eptr;
3485 nigel 3 for (i = min; i < max; i++)
3486     {
3487 nigel 23 if (!match_ref(offset, eptr, length, md, ims)) break;
3488 nigel 3 eptr += length;
3489     }
3490     while (eptr >= pp)
3491     {
3492 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3493     return TRUE;
3494 nigel 3 eptr -= length;
3495     }
3496     return FALSE;
3497     }
3498     }
3499     /* Control never gets here */
3500    
3501 nigel 23
3502    
3503 nigel 3 /* Match a character class, possibly repeatedly. Look past the end of the
3504     item to see if there is repeat information following. Then obey similar
3505 nigel 23 code to character type repeats - written out again for speed. */
3506 nigel 3
3507     case OP_CLASS:
3508     {
3509 nigel 7 const uschar *data = ecode + 1; /* Save for matching */
3510     ecode += 33; /* Advance past the item */
3511 nigel 3
3512     switch (*ecode)
3513     {
3514     case OP_CRSTAR:
3515     case OP_CRMINSTAR:
3516     case OP_CRPLUS:
3517     case OP_CRMINPLUS:
3518     case OP_CRQUERY:
3519     case OP_CRMINQUERY:
3520     c = *ecode++ - OP_CRSTAR;
3521     minimize = (c & 1) != 0;
3522     min = rep_min[c]; /* Pick up values from tables; */
3523     max = rep_max[c]; /* zero for max => infinity */
3524     if (max == 0) max = INT_MAX;
3525     break;
3526    
3527     case OP_CRRANGE:
3528     case OP_CRMINRANGE:
3529     minimize = (*ecode == OP_CRMINRANGE);
3530     min = (ecode[1] << 8) + ecode[2];
3531     max = (ecode[3] << 8) + ecode[4];
3532     if (max == 0) max = INT_MAX;
3533     ecode += 5;
3534     break;
3535    
3536     default: /* No repeat follows */
3537 nigel 13 min = max = 1;
3538     break;
3539 nigel 3 }
3540    
3541     /* First, ensure the minimum number of matches are present. */
3542    
3543     for (i = 1; i <= min; i++)
3544     {
3545     if (eptr >= md->end_subject) return FALSE;
3546     c = *eptr++;
3547 nigel 23 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3548 nigel 3 return FALSE;
3549     }
3550    
3551     /* If max == min we can continue with the main loop without the
3552     need to recurse. */
3553    
3554     if (min == max) continue;
3555    
3556     /* If minimizing, keep testing the rest of the expression and advancing
3557     the pointer while it matches the class. */
3558    
3559     if (minimize)
3560     {
3561     for (i = min;; i++)
3562     {
3563 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3564     return TRUE;
3565 nigel 3 if (i >= max || eptr >= md->end_subject) return FALSE;
3566     c = *eptr++;
3567 nigel 23 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3568 nigel 3 return FALSE;
3569     }
3570     /* Control never gets here */
3571     }
3572    
3573     /* If maximizing, find the longest possible run, then work backwards. */
3574    
3575     else
3576     {
3577 nigel 7 const uschar *pp = eptr;
3578 nigel 3 for (i = min; i < max; eptr++, i++)
3579     {
3580     if (eptr >= md->end_subject) break;
3581     c = *eptr;
3582 nigel 23 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3583 nigel 3 break;
3584     }
3585    
3586     while (eptr >= pp)
3587 nigel 23 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3588     return TRUE;
3589 nigel 3 return FALSE;
3590     }
3591     }
3592     /* Control never gets here */
3593    
3594     /* Match a run of characters */
3595    
3596     case OP_CHARS:
3597     {
3598     register int length = ecode[1];
3599     ecode += 2;
3600    
3601 nigel 9 #ifdef DEBUG /* Sigh. Some compilers never learn. */
3602 nigel 3 if (eptr >= md->end_subject)
3603     printf("matching subject <null> against pattern ");
3604     else
3605     {
3606     printf("matching subject ");
3607     pchars(eptr, length, TRUE, md);
3608     printf(" against pattern ");
3609     }
3610     pchars(ecode, length, FALSE, md);
3611     printf("\n");
3612 nigel 9 #endif
3613 nigel 3
3614     if (length > md->end_subject - eptr) return FALSE;
3615 nigel 23 if ((ims & PCRE_CASELESS) != 0)
3616 nigel 3 {
3617 nigel 25 while (length-- > 0)
3618     if (md->lcc[*ecode++] != md->lcc[*eptr++])
3619     return FALSE;
3620 nigel 3 }
3621     else
3622     {
3623     while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
3624     }
3625     }
3626     break;
3627    
3628     /* Match a single character repeatedly; different opcodes share code. */
3629    
3630     case OP_EXACT:
3631     min = max = (ecode[1] << 8) + ecode[2];
3632     ecode += 3;
3633     goto REPEATCHAR;
3634    
3635     case OP_UPTO:
3636     case OP_MINUPTO:
3637     min = 0;
3638     max = (ecode[1] << 8) + ecode[2];
3639     minimize = *ecode == OP_MINUPTO;
3640     ecode += 3;
3641     goto REPEATCHAR;
3642    
3643     case OP_STAR:
3644     case OP_MINSTAR:
3645     case OP_PLUS:
3646     case OP_MINPLUS:
3647     case OP_QUERY:
3648     case OP_MINQUERY:
3649     c = *ecode++ - OP_STAR;
3650     minimize = (c & 1) != 0;
3651     min = rep_min[c]; /* Pick up values from tables; */
3652     max = rep_max[c]; /* zero for max => infinity */
3653     if (max == 0) max = INT_MAX;
3654    
3655     /* Common code for all repeated single-character matches. We can give
3656     up quickly if there are fewer than the minimum number of characters left in
3657     the subject. */
3658    
3659     REPEATCHAR:
3660     if (min > md->end_subject - eptr) return FALSE;
3661     c = *ecode++;
3662    
3663     /* The code is duplicated for the caseless and caseful cases, for speed,
3664     since matching characters is likely to be quite common. First, ensure the
3665     minimum number of matches are present. If min = max, continue at the same
3666     level without recursing. Otherwise, if minimizing, keep trying the rest of
3667     the expression and advancing one matching character if failing, up to the
3668     maximum. Alternatively, if maximizing, find the maximum number of
3669     characters and work backwards. */
3670    
3671 nigel 9 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
3672     max, eptr));
3673 nigel 3
3674 nigel 23 if ((ims & PCRE_CASELESS) != 0)
3675 nigel 3 {
3676 nigel 25 c = md->lcc[c];
3677     for (i = 1; i <= min; i++)
3678     if (c != md->lcc[*eptr++]) return FALSE;
3679 nigel 3 if (min == max) continue;
3680     if (minimize)
3681     {
3682     for (i = min;; i++)
3683     {
3684 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3685     return TRUE;
3686 nigel 25 if (i >= max || eptr >= md->end_subject ||
3687     c != md->lcc[*eptr++])
3688 nigel 3 return FALSE;
3689     }
3690     /* Control never gets here */
3691     }
3692     else
3693     {
3694 nigel 7 const uschar *pp = eptr;
3695 nigel 3 for (i = min; i < max; i++)
3696     {
3697 nigel 25 if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
3698 nigel 3 eptr++;
3699     }
3700     while (eptr >= pp)
3701 nigel 23 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3702     return TRUE;
3703 nigel 3 return FALSE;
3704     }
3705     /* Control never gets here */
3706     }
3707    
3708     /* Caseful comparisons */
3709    
3710     else
3711     {
3712     for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
3713     if (min == max) continue;
3714     if (minimize)
3715     {
3716     for (i = min;; i++)
3717     {
3718 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3719     return TRUE;
3720 nigel 3 if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
3721     }
3722     /* Control never gets here */
3723     }
3724     else
3725     {
3726 nigel 7 const uschar *pp = eptr;
3727 nigel 3 for (i = min; i < max; i++)
3728     {
3729     if (eptr >= md->end_subject || c != *eptr) break;
3730     eptr++;
3731     }
3732     while (eptr >= pp)
3733 nigel 23 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3734     return TRUE;
3735 nigel 3 return FALSE;
3736     }
3737     }
3738     /* Control never gets here */
3739    
3740     /* Match a negated single character */
3741    
3742     case OP_NOT:
3743 nigel 9 if (eptr >= md->end_subject) return FALSE;
3744 nigel 3 ecode++;
3745 nigel 23 if ((ims & PCRE_CASELESS) != 0)
3746 nigel 3 {
3747 nigel 25 if (md->lcc[*ecode++] == md->lcc[*eptr++]) return FALSE;
3748 nigel 3 }
3749     else
3750     {
3751     if (*ecode++ == *eptr++) return FALSE;
3752     }
3753     break;
3754    
3755     /* Match a negated single character repeatedly. This is almost a repeat of
3756     the code for a repeated single character, but I haven't found a nice way of
3757     commoning these up that doesn't require a test of the positive/negative
3758     option for each character match. Maybe that wouldn't add very much to the
3759     time taken, but character matching *is* what this is all about... */
3760    
3761     case OP_NOTEXACT:
3762     min = max = (ecode[1] << 8) + ecode[2];
3763     ecode += 3;
3764     goto REPEATNOTCHAR;
3765    
3766     case OP_NOTUPTO:
3767     case OP_NOTMINUPTO:
3768     min = 0;
3769     max = (ecode[1] << 8) + ecode[2];
3770     minimize = *ecode == OP_NOTMINUPTO;
3771     ecode += 3;
3772     goto REPEATNOTCHAR;
3773    
3774     case OP_NOTSTAR:
3775     case OP_NOTMINSTAR:
3776     case OP_NOTPLUS:
3777     case OP_NOTMINPLUS:
3778     case OP_NOTQUERY:
3779     case OP_NOTMINQUERY:
3780     c = *ecode++ - OP_NOTSTAR;
3781     minimize = (c & 1) != 0;
3782     min = rep_min[c]; /* Pick up values from tables; */
3783     max = rep_max[c]; /* zero for max => infinity */
3784     if (max == 0) max = INT_MAX;
3785    
3786     /* Common code for all repeated single-character matches. We can give
3787     up quickly if there are fewer than the minimum number of characters left in
3788     the subject. */
3789    
3790     REPEATNOTCHAR:
3791     if (min > md->end_subject - eptr) return FALSE;
3792     c = *ecode++;
3793    
3794     /* The code is duplicated for the caseless and caseful cases, for speed,
3795     since matching characters is likely to be quite common. First, ensure the
3796     minimum number of matches are present. If min = max, continue at the same
3797     level without recursing. Otherwise, if minimizing, keep trying the rest of
3798     the expression and advancing one matching character if failing, up to the
3799     maximum. Alternatively, if maximizing, find the maximum number of
3800     characters and work backwards. */
3801    
3802 nigel 9 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
3803     max, eptr));
3804 nigel 3
3805 nigel 23 if ((ims & PCRE_CASELESS) != 0)
3806 nigel 3 {
3807 nigel 25 c = md->lcc[c];
3808     for (i = 1; i <= min; i++)
3809     if (c == md->lcc[*eptr++]) return FALSE;
3810 nigel 3 if (min == max) continue;
3811     if (minimize)
3812     {
3813     for (i = min;; i++)
3814     {
3815 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3816     return TRUE;
3817 nigel 25 if (i >= max || eptr >= md->end_subject ||
3818     c == md->lcc[*eptr++])
3819 nigel 3 return FALSE;
3820     }
3821     /* Control never gets here */
3822     }
3823     else
3824     {
3825 nigel 7 const uschar *pp = eptr;
3826 nigel 3 for (i = min; i < max; i++)
3827     {
3828 nigel 25 if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
3829 nigel 3 eptr++;
3830     }
3831     while (eptr >= pp)
3832 nigel 23 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3833     return TRUE;
3834 nigel 3 return FALSE;
3835     }
3836     /* Control never gets here */
3837     }
3838    
3839     /* Caseful comparisons */
3840    
3841     else
3842     {
3843     for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
3844     if (min == max) continue;
3845     if (minimize)
3846     {
3847     for (i = min;; i++)
3848     {
3849 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3850     return TRUE;
3851 nigel 3 if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
3852     }
3853     /* Control never gets here */
3854     }
3855     else
3856     {
3857 nigel 7 const uschar *pp = eptr;
3858 nigel 3 for (i = min; i < max; i++)
3859     {
3860     if (eptr >= md->end_subject || c == *eptr) break;
3861     eptr++;
3862     }
3863     while (eptr >= pp)
3864 nigel 23 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3865     return TRUE;
3866 nigel 3 return FALSE;
3867     }
3868     }
3869     /* Control never gets here */
3870    
3871     /* Match a single character type repeatedly; several different opcodes
3872     share code. This is very similar to the code for single characters, but we
3873     repeat it in the interests of efficiency. */
3874    
3875     case OP_TYPEEXACT:
3876     min = max = (ecode[1] << 8) + ecode[2];
3877     minimize = TRUE;
3878     ecode += 3;
3879     goto REPEATTYPE;
3880    
3881     case OP_TYPEUPTO:
3882     case OP_TYPEMINUPTO:
3883     min = 0;
3884     max = (ecode[1] << 8) + ecode[2];
3885     minimize = *ecode == OP_TYPEMINUPTO;
3886     ecode += 3;
3887     goto REPEATTYPE;
3888    
3889     case OP_TYPESTAR:
3890     case OP_TYPEMINSTAR:
3891     case OP_TYPEPLUS:
3892     case OP_TYPEMINPLUS:
3893     case OP_TYPEQUERY:
3894     case OP_TYPEMINQUERY:
3895     c = *ecode++ - OP_TYPESTAR;
3896     minimize = (c & 1) != 0;
3897     min = rep_min[c]; /* Pick up values from tables; */
3898     max = rep_max[c]; /* zero for max => infinity */
3899     if (max == 0) max = INT_MAX;
3900    
3901     /* Common code for all repeated single character type matches */
3902    
3903     REPEATTYPE:
3904     ctype = *ecode++; /* Code for the character type */
3905    
3906     /* First, ensure the minimum number of matches are present. Use inline
3907     code for maximizing the speed, and do the type test once at the start
3908     (i.e. keep it out of the loop). Also test that there are at least the
3909     minimum number of characters before we start. */
3910    
3911     if (min > md->end_subject - eptr) return FALSE;
3912     if (min > 0) switch(ctype)
3913     {
3914     case OP_ANY:
3915 nigel 23 if ((ims & PCRE_DOTALL) == 0)
3916 nigel 3 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }
3917     else eptr += min;
3918     break;
3919    
3920     case OP_NOT_DIGIT:
3921     for (i = 1; i <= min; i++)
3922 nigel 25 if ((md->ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
3923 nigel 3 break;
3924    
3925     case OP_DIGIT:
3926     for (i = 1; i <= min; i++)
3927 nigel 25 if ((md->ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
3928 nigel 3 break;
3929    
3930     case OP_NOT_WHITESPACE:
3931     for (i = 1; i <= min; i++)
3932 nigel 25 if ((md->ctypes[*eptr++] & ctype_space) != 0) return FALSE;
3933 nigel 3 break;
3934    
3935     case OP_WHITESPACE:
3936     for (i = 1; i <= min; i++)
3937 nigel 25 if ((md->ctypes[*eptr++] & ctype_space) == 0) return FALSE;
3938 nigel 3 break;
3939    
3940     case OP_NOT_WORDCHAR:
3941 nigel 25 for (i = 1; i <= min; i++)
3942     if ((md->ctypes[*eptr++] & ctype_word) != 0)
3943     return FALSE;
3944 nigel 3 break;
3945    
3946     case OP_WORDCHAR:
3947 nigel 25 for (i = 1; i <= min; i++)
3948     if ((md->ctypes[*eptr++] & ctype_word) == 0)
3949     return FALSE;
3950 nigel 3 break;
3951     }
3952    
3953     /* If min = max, continue at the same level without recursing */
3954    
3955     if (min == max) continue;
3956    
3957     /* If minimizing, we have to test the rest of the pattern before each
3958 nigel 25 subsequent match. */
3959 nigel 3
3960     if (minimize)
3961     {
3962     for (i = min;; i++)
3963     {
3964 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb)) return TRUE;
3965 nigel 25 if (i >= max || eptr >= md->end_subject) return FALSE;
3966    
3967     c = *eptr++;
3968     switch(ctype)
3969     {
3970     case OP_ANY:
3971     if ((ims & PCRE_DOTALL) == 0 && c == '\n') return FALSE;
3972     break;
3973    
3974     case OP_NOT_DIGIT:
3975     if ((md->ctypes[c] & ctype_digit) != 0) return FALSE;
3976     break;
3977    
3978     case OP_DIGIT:
3979     if ((md->ctypes[c] & ctype_digit) == 0) return FALSE;
3980     break;
3981    
3982     case OP_NOT_WHITESPACE:
3983     if ((md->ctypes[c] & ctype_space) != 0) return FALSE;
3984     break;
3985    
3986     case OP_WHITESPACE:
3987     if ((md->ctypes[c] & ctype_space) == 0) return FALSE;
3988     break;
3989    
3990     case OP_NOT_WORDCHAR:
3991     if ((md->ctypes[c] & ctype_word) != 0) return FALSE;
3992     break;
3993    
3994     case OP_WORDCHAR:
3995     if ((md->ctypes[c] & ctype_word) == 0) return FALSE;
3996     break;
3997     }
3998 nigel 3 }
3999     /* Control never gets here */
4000     }
4001    
4002     /* If maximizing it is worth using inline code for speed, doing the type
4003     test once at the start (i.e. keep it out of the loop). */
4004    
4005     else
4006     {
4007 nigel 7 const uschar *pp = eptr;
4008 nigel 3 switch(ctype)
4009     {
4010     case OP_ANY:
4011 nigel 23 if ((ims & PCRE_DOTALL) == 0)
4012 nigel 3 {
4013     for (i = min; i < max; i++)
4014     {
4015     if (eptr >= md->end_subject || *eptr == '\n') break;
4016     eptr++;
4017     }
4018     }
4019     else
4020     {
4021     c = max - min;
4022     if (c > md->end_subject - eptr) c = md->end_subject - eptr;
4023     eptr += c;
4024     }
4025     break;
4026    
4027     case OP_NOT_DIGIT:
4028     for (i = min; i < max; i++)
4029     {
4030 nigel 25 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4031 nigel 3 break;
4032     eptr++;
4033     }
4034     break;
4035    
4036     case OP_DIGIT:
4037     for (i = min; i < max; i++)
4038     {
4039 nigel 25 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4040 nigel 3 break;
4041     eptr++;
4042     }
4043     break;
4044    
4045     case OP_NOT_WHITESPACE:
4046     for (i = min; i < max; i++)
4047     {
4048 nigel 25 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4049 nigel 3 break;
4050     eptr++;
4051     }
4052     break;
4053    
4054     case OP_WHITESPACE:
4055     for (i = min; i < max; i++)
4056     {
4057 nigel 25 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4058 nigel 3 break;
4059     eptr++;
4060     }
4061     break;
4062    
4063     case OP_NOT_WORDCHAR:
4064     for (i = min; i < max; i++)
4065     {
4066 nigel 25 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4067 nigel 3 break;
4068     eptr++;
4069     }
4070     break;
4071    
4072     case OP_WORDCHAR:
4073     for (i = min; i < max; i++)
4074     {
4075 nigel 25 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4076 nigel 3 break;
4077     eptr++;
4078     }
4079     break;
4080     }
4081    
4082     while (eptr >= pp)
4083 nigel 23 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
4084     return TRUE;
4085 nigel 3 return FALSE;
4086     }
4087     /* Control never gets here */
4088    
4089     /* There's been some horrible disaster. */
4090    
4091     default:
4092 nigel 9 DPRINTF(("Unknown opcode %d\n", *ecode));
4093 nigel 3 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
4094     return FALSE;
4095     }
4096    
4097     /* Do not stick any code in here without much thought; it is assumed
4098     that "continue" in the code above comes out to here to repeat the main
4099     loop. */
4100    
4101     } /* End of main loop */
4102     /* Control never reaches here */
4103     }
4104    
4105    
4106    
4107 nigel 9
4108     /*************************************************
4109 nigel 3 * Execute a Regular Expression *
4110     *************************************************/
4111    
4112     /* This function applies a compiled re to a subject string and picks out
4113     portions of the string if it matches. Two elements in the vector are set for
4114     each substring: the offsets to the start and end of the substring.
4115    
4116     Arguments:
4117     external_re points to the compiled expression
4118     external_extra points to "hints" from pcre_study() or is NULL
4119     subject points to the subject string
4120     length length of subject string (may contain binary zeros)
4121 nigel 35 start_offset where to start in the subject string
4122 nigel 3 options option bits
4123     offsets points to a vector of ints to be filled in with offsets
4124     offsetcount the number of elements in the vector
4125    
4126     Returns: > 0 => success; value is the number of elements filled in
4127     = 0 => success, but offsets is not big enough
4128     -1 => failed to match
4129     < -1 => some kind of unexpected problem
4130     */
4131    
4132     int
4133     pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
4134 nigel 35 const char *subject, int length, int start_offset, int options, int *offsets,
4135     int offsetcount)
4136 nigel 3 {
4137 nigel 11 int resetcount, ocount;
4138 nigel 3 int first_char = -1;
4139 nigel 23 int ims = 0;
4140 nigel 3 match_data match_block;
4141 nigel 7 const uschar *start_bits = NULL;
4142 nigel 35 const uschar *start_match = (const uschar *)subject + start_offset;
4143 nigel 7 const uschar *end_subject;
4144     const real_pcre *re = (const real_pcre *)external_re;
4145     const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
4146 nigel 11 BOOL using_temporary_offsets = FALSE;
4147 nigel 3 BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4148     BOOL startline = (re->options & PCRE_STARTLINE) != 0;
4149    
4150     if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4151    
4152     if (re == NULL || subject == NULL ||
4153     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4154     if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4155    
4156 nigel 7 match_block.start_subject = (const uschar *)subject;
4157 nigel 3 match_block.end_subject = match_block.start_subject + length;
4158     end_subject = match_block.end_subject;
4159    
4160 nigel 23 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4161 nigel 3
4162     match_block.notbol = (options & PCRE_NOTBOL) != 0;
4163     match_block.noteol = (options & PCRE_NOTEOL) != 0;
4164    
4165     match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
4166    
4167 nigel 25 match_block.lcc = re->tables + lcc_offset;
4168     match_block.ctypes = re->tables + ctypes_offset;
4169    
4170 nigel 23 /* The ims options can vary during the matching as a result of the presence
4171     of (?ims) items in the pattern. They are kept in a local variable so that
4172     restoring at the exit of a group is easy. */
4173    
4174     ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4175    
4176 nigel 3 /* If the expression has got more back references than the offsets supplied can
4177     hold, we get a temporary bit of working store to use during the matching.
4178 nigel 11 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4179 nigel 23 of 3. */
4180 nigel 3
4181 nigel 23 ocount = offsetcount - (offsetcount % 3);
4182    
4183     if (re->top_backref > 0 && re->top_backref >= ocount/3)
4184 nigel 3 {
4185 nigel 23 ocount = re->top_backref * 3 + 3;
4186 nigel 15 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4187 nigel 3 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4188 nigel 11 using_temporary_offsets = TRUE;
4189 nigel 9 DPRINTF(("Got memory to hold back references\n"));
4190 nigel 3 }
4191     else match_block.offset_vector = offsets;
4192    
4193     match_block.offset_end = ocount;
4194 nigel 23 match_block.offset_max = (2*ocount)/3;
4195 nigel 3 match_block.offset_overflow = FALSE;
4196    
4197     /* Compute the minimum number of offsets that we need to reset each time. Doing
4198     this makes a huge difference to execution time when there aren't many brackets
4199     in the pattern. */
4200    
4201     resetcount = 2 + re->top_bracket * 2;
4202     if (resetcount > offsetcount) resetcount = ocount;
4203    
4204 nigel 27 /* Reset the working variable associated with each extraction. These should
4205     never be used unless previously set, but they get saved and restored, and so we
4206     initialize them to avoid reading uninitialized locations. */
4207    
4208     if (match_block.offset_vector != NULL)
4209     {
4210     register int *iptr = match_block.offset_vector + ocount;
4211     register int *iend = iptr - resetcount/2 + 1;
4212