/[pcre]/code/trunk/pcre.c
ViewVC logotype

Contents of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 29 - (hide annotations) (download)
Sat Feb 24 21:38:53 2007 UTC (7 years, 5 months ago) by nigel
File MIME type: text/plain
File size: 121051 byte(s)
Load pcre-2.03 into code/trunk.

1 nigel 3 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /*
6     This is a library of functions to support regular expressions whose syntax
7     and semantics are as close as possible to those of the Perl 5 language. See
8     the file Tech.Notes for some information on the internals.
9    
10     Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12 nigel 27 Copyright (c) 1997-1999 University of Cambridge
13 nigel 3
14     -----------------------------------------------------------------------------
15     Permission is granted to anyone to use this software for any purpose on any
16     computer system, and to redistribute it freely, subject to the following
17     restrictions:
18    
19     1. This software is distributed in the hope that it will be useful,
20     but WITHOUT ANY WARRANTY; without even the implied warranty of
21     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22    
23     2. The origin of this software must not be misrepresented, either by
24     explicit claim or by omission.
25    
26     3. Altered versions must be plainly marked as such, and must not be
27     misrepresented as being the original software.
28 nigel 29
29     4. If PCRE is embedded in any software that is released under the GNU
30     General Purpose Licence (GPL), then the terms of that licence shall
31     supersede any condition above with which it is incompatible.
32 nigel 3 -----------------------------------------------------------------------------
33     */
34    
35    
36     /* Define DEBUG to get debugging output on stdout. */
37    
38     /* #define DEBUG */
39    
40 nigel 23 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
41     inline, and there are *still* stupid compilers about that don't like indented
42     pre-processor statements. I suppose it's only been 10 years... */
43 nigel 3
44 nigel 9 #ifdef DEBUG
45     #define DPRINTF(p) printf p
46     #else
47     #define DPRINTF(p) /*nothing*/
48     #endif
49    
50 nigel 3 /* Include the internals header, which itself includes Standard C headers plus
51     the external pcre header. */
52    
53     #include "internal.h"
54    
55    
56 nigel 15 /* Allow compilation as C++ source code, should anybody want to do that. */
57    
58     #ifdef __cplusplus
59     #define class pcre_class
60     #endif
61    
62    
63 nigel 23 /* Number of items on the nested bracket stacks at compile time. This should
64     not be set greater than 200. */
65    
66     #define BRASTACK_SIZE 200
67    
68    
69 nigel 3 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
70    
71 nigel 15 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
72     static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
73 nigel 3
74 nigel 13 /* Text forms of OP_ values and things, for debugging (not all used) */
75 nigel 3
76     #ifdef DEBUG
77 nigel 7 static const char *OP_names[] = {
78     "End", "\\A", "\\B", "\\b", "\\D", "\\d",
79 nigel 23 "\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",
80     "Opt", "^", "$", "Any", "chars", "not",
81 nigel 3 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
82     "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
83     "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
84     "*", "*?", "+", "+?", "?", "??", "{", "{",
85 nigel 23 "class", "Ref",
86     "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
87     "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
88 nigel 3 "Brazero", "Braminzero", "Bra"
89     };
90     #endif
91    
92     /* Table for handling escaped characters in the range '0'-'z'. Positive returns
93     are simple data values; negative values are for special things like \d and so
94     on. Zero means further processing is needed (for things like \x), or the escape
95     is invalid. */
96    
97 nigel 15 static const short int escapes[] = {
98 nigel 3 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
99     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
100     '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
101     0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
102     0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
103     0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
104     '`', 7, -ESC_b, 0, -ESC_d, 27, '\f', 0, /* ` - g */
105     0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
106     0, 0, '\r', -ESC_s, '\t', 0, 0, -ESC_w, /* p - w */
107 nigel 23 0, 0, -ESC_z /* x - z */
108 nigel 3 };
109    
110     /* Definition to allow mutual recursion */
111    
112 nigel 13 static BOOL
113 nigel 23 compile_regex(int, int, int *, uschar **, const uschar **, const char **,
114 nigel 25 BOOL, int, compile_data *);
115 nigel 3
116    
117    
118     /*************************************************
119     * Global variables *
120     *************************************************/
121    
122     /* PCRE is thread-clean and doesn't use any global variables in the normal
123     sense. However, it calls memory allocation and free functions via the two
124     indirections below, which are can be changed by the caller, but are shared
125     between all threads. */
126    
127     void *(*pcre_malloc)(size_t) = malloc;
128     void (*pcre_free)(void *) = free;
129    
130    
131    
132    
133     /*************************************************
134 nigel 25 * Default character tables *
135     *************************************************/
136    
137     /* A default set of character tables is included in the PCRE binary. Its source
138     is built by the maketables auxiliary program, which uses the default C ctypes
139     functions, and put in the file chartables.c. These tables are used by PCRE
140     whenever the caller of pcre_compile() does not provide an alternate set of
141     tables. */
142    
143     #include "chartables.c"
144    
145    
146    
147     /*************************************************
148 nigel 3 * Return version string *
149     *************************************************/
150    
151 nigel 7 const char *
152 nigel 3 pcre_version(void)
153     {
154     return PCRE_VERSION;
155     }
156    
157    
158    
159    
160     /*************************************************
161     * Return info about a compiled pattern *
162     *************************************************/
163    
164     /* This function picks potentially useful data out of the private
165     structure.
166    
167     Arguments:
168     external_re points to compiled code
169     optptr where to pass back the options
170     first_char where to pass back the first character,
171     or -1 if multiline and all branches start ^,
172     or -2 otherwise
173    
174     Returns: number of identifying extraction brackets
175     or negative values on error
176     */
177    
178     int
179     pcre_info(const pcre *external_re, int *optptr, int *first_char)
180     {
181 nigel 7 const real_pcre *re = (const real_pcre *)external_re;
182 nigel 3 if (re == NULL) return PCRE_ERROR_NULL;
183     if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
184     if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
185     if (first_char != NULL)
186     *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
187     ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
188     return re->top_bracket;
189     }
190    
191    
192    
193    
194     #ifdef DEBUG
195     /*************************************************
196     * Debugging function to print chars *
197     *************************************************/
198    
199     /* Print a sequence of chars in printable format, stopping at the end of the
200     subject if the requested.
201    
202     Arguments:
203     p points to characters
204     length number to print
205     is_subject TRUE if printing from within md->start_subject
206     md pointer to matching data block, if is_subject is TRUE
207    
208     Returns: nothing
209     */
210    
211 nigel 9 static void
212     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
213 nigel 3 {
214     int c;
215     if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
216     while (length-- > 0)
217     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
218     }
219     #endif
220    
221    
222    
223    
224     /*************************************************
225     * Handle escapes *
226     *************************************************/
227    
228     /* This function is called when a \ has been encountered. It either returns a
229     positive value for a simple escape such as \n, or a negative value which
230     encodes one of the more complicated things such as \d. On entry, ptr is
231     pointing at the \. On exit, it is on the final character of the escape
232     sequence.
233    
234     Arguments:
235     ptrptr points to the pattern position pointer
236     errorptr points to the pointer to the error message
237     bracount number of previous extracting brackets
238     options the options bits
239     isclass TRUE if inside a character class
240 nigel 25 cd pointer to char tables block
241 nigel 3
242     Returns: zero or positive => a data character
243     negative => a special escape sequence
244     on error, errorptr is set
245     */
246    
247     static int
248 nigel 7 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
249 nigel 25 int options, BOOL isclass, compile_data *cd)
250 nigel 3 {
251 nigel 7 const uschar *ptr = *ptrptr;
252 nigel 3 int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
253     int i;
254    
255     if (c == 0) *errorptr = ERR1;
256    
257     /* Digits or letters may have special meaning; all others are literals. */
258    
259     else if (c < '0' || c > 'z') {}
260    
261     /* Do an initial lookup in a table. A non-zero result is something that can be
262     returned immediately. Otherwise further processing may be required. */
263    
264     else if ((i = escapes[c - '0']) != 0) c = i;
265    
266     /* Escapes that need further processing, or are illegal. */
267    
268     else
269     {
270 nigel 7 const uschar *oldptr;
271 nigel 3 switch (c)
272     {
273     /* The handling of escape sequences consisting of a string of digits
274     starting with one that is not zero is not straightforward. By experiment,
275     the way Perl works seems to be as follows:
276    
277     Outside a character class, the digits are read as a decimal number. If the
278     number is less than 10, or if there are that many previous extracting
279     left brackets, then it is a back reference. Otherwise, up to three octal
280     digits are read to form an escaped byte. Thus \123 is likely to be octal
281     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
282     value is greater than 377, the least significant 8 bits are taken. Inside a
283     character class, \ followed by a digit is always an octal number. */
284    
285     case '1': case '2': case '3': case '4': case '5':
286     case '6': case '7': case '8': case '9':
287    
288     if (!isclass)
289     {
290     oldptr = ptr;
291     c -= '0';
292 nigel 25 while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
293 nigel 3 c = c * 10 + *(++ptr) - '0';
294     if (c < 10 || c <= bracount)
295     {
296     c = -(ESC_REF + c);
297     break;
298     }
299     ptr = oldptr; /* Put the pointer back and fall through */
300     }
301    
302     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
303     generates a binary zero byte and treats the digit as a following literal.
304     Thus we have to pull back the pointer by one. */
305    
306     if ((c = *ptr) >= '8')
307     {
308     ptr--;
309     c = 0;
310     break;
311     }
312    
313     /* \0 always starts an octal number, but we may drop through to here with a
314     larger first octal digit */
315    
316     case '0':
317     c -= '0';
318 nigel 25 while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
319 nigel 3 ptr[1] != '8' && ptr[1] != '9')
320     c = c * 8 + *(++ptr) - '0';
321     break;
322    
323     /* Special escapes not starting with a digit are straightforward */
324    
325     case 'x':
326     c = 0;
327 nigel 25 while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
328 nigel 3 {
329     ptr++;
330 nigel 25 c = c * 16 + cd->lcc[*ptr] -
331     (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
332 nigel 3 }
333     break;
334    
335     case 'c':
336     c = *(++ptr);
337     if (c == 0)
338     {
339     *errorptr = ERR2;
340     return 0;
341     }
342    
343     /* A letter is upper-cased; then the 0x40 bit is flipped */
344    
345 nigel 25 if (c >= 'a' && c <= 'z') c = cd->fcc[c];
346 nigel 3 c ^= 0x40;
347     break;
348    
349     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
350     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
351 nigel 25 for Perl compatibility, it is a literal. This code looks a bit odd, but
352     there used to be some cases other than the default, and there may be again
353     in future, so I haven't "optimized" it. */
354 nigel 3
355     default:
356     if ((options & PCRE_EXTRA) != 0) switch(c)
357     {
358     default:
359     *errorptr = ERR3;
360     break;
361     }
362     break;
363     }
364     }
365    
366     *ptrptr = ptr;
367     return c;
368     }
369    
370    
371    
372     /*************************************************
373     * Check for counted repeat *
374     *************************************************/
375    
376     /* This function is called when a '{' is encountered in a place where it might
377     start a quantifier. It looks ahead to see if it really is a quantifier or not.
378     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
379     where the ddds are digits.
380    
381     Arguments:
382     p pointer to the first char after '{'
383 nigel 25 cd pointer to char tables block
384 nigel 3
385     Returns: TRUE or FALSE
386     */
387    
388     static BOOL
389 nigel 25 is_counted_repeat(const uschar *p, compile_data *cd)
390 nigel 3 {
391 nigel 25 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
392     while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
393 nigel 3 if (*p == '}') return TRUE;
394    
395     if (*p++ != ',') return FALSE;
396     if (*p == '}') return TRUE;
397    
398 nigel 25 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
399     while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
400 nigel 3 return (*p == '}');
401     }
402    
403    
404    
405     /*************************************************
406     * Read repeat counts *
407     *************************************************/
408    
409     /* Read an item of the form {n,m} and return the values. This is called only
410     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
411     so the syntax is guaranteed to be correct, but we need to check the values.
412    
413     Arguments:
414     p pointer to first char after '{'
415     minp pointer to int for min
416     maxp pointer to int for max
417     returned as -1 if no max
418     errorptr points to pointer to error message
419 nigel 25 cd pointer to character tables clock
420 nigel 3
421     Returns: pointer to '}' on success;
422     current ptr on error, with errorptr set
423     */
424    
425 nigel 7 static const uschar *
426 nigel 25 read_repeat_counts(const uschar *p, int *minp, int *maxp,
427     const char **errorptr, compile_data *cd)
428 nigel 3 {
429     int min = 0;
430     int max = -1;
431    
432 nigel 25 while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
433 nigel 3
434     if (*p == '}') max = min; else
435     {
436     if (*(++p) != '}')
437     {
438     max = 0;
439 nigel 25 while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
440 nigel 3 if (max < min)
441     {
442     *errorptr = ERR4;
443     return p;
444     }
445     }
446     }
447    
448     /* Do paranoid checks, then fill in the required variables, and pass back the
449     pointer to the terminating '}'. */
450    
451     if (min > 65535 || max > 65535)
452     *errorptr = ERR5;
453     else
454     {
455     *minp = min;
456     *maxp = max;
457     }
458     return p;
459     }
460    
461    
462    
463     /*************************************************
464 nigel 23 * Find the fixed length of a pattern *
465     *************************************************/
466    
467     /* Scan a pattern and compute the fixed length of subject that will match it,
468     if the length is fixed. This is needed for dealing with backward assertions.
469    
470     Arguments:
471     code points to the start of the pattern (the bracket)
472    
473     Returns: the fixed length, or -1 if there is no fixed length
474     */
475    
476     static int
477     find_fixedlength(uschar *code)
478     {
479     int length = -1;
480    
481     register int branchlength = 0;
482     register uschar *cc = code + 3;
483    
484     /* Scan along the opcodes for this branch. If we get to the end of the
485     branch, check the length against that of the other branches. */
486    
487     for (;;)
488     {
489     int d;
490     register int op = *cc;
491     if (op >= OP_BRA) op = OP_BRA;
492    
493     switch (op)
494     {
495     case OP_BRA:
496     case OP_ONCE:
497     case OP_COND:
498     d = find_fixedlength(cc);
499     if (d < 0) return -1;
500     branchlength += d;
501     do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
502     cc += 3;
503     break;
504    
505     /* Reached end of a branch; if it's a ket it is the end of a nested
506     call. If it's ALT it is an alternation in a nested call. If it is
507     END it's the end of the outer call. All can be handled by the same code. */
508    
509     case OP_ALT:
510     case OP_KET:
511     case OP_KETRMAX:
512     case OP_KETRMIN:
513     case OP_END:
514     if (length < 0) length = branchlength;
515     else if (length != branchlength) return -1;
516     if (*cc != OP_ALT) return length;
517     cc += 3;
518     branchlength = 0;
519     break;
520    
521     /* Skip over assertive subpatterns */
522    
523     case OP_ASSERT:
524     case OP_ASSERT_NOT:
525     case OP_ASSERTBACK:
526     case OP_ASSERTBACK_NOT:
527     do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
528     cc += 3;
529     break;
530    
531     /* Skip over things that don't match chars */
532    
533     case OP_REVERSE:
534     cc++;
535    
536     case OP_CREF:
537     case OP_OPT:
538     cc++;
539     /* Fall through */
540    
541     case OP_SOD:
542     case OP_EOD:
543     case OP_EODN:
544     case OP_CIRC:
545     case OP_DOLL:
546     case OP_NOT_WORD_BOUNDARY:
547     case OP_WORD_BOUNDARY:
548     cc++;
549     break;
550    
551     /* Handle char strings */
552    
553     case OP_CHARS:
554     branchlength += *(++cc);
555     cc += *cc + 1;
556     break;
557    
558     /* Handle exact repetitions */
559    
560     case OP_EXACT:
561     case OP_TYPEEXACT:
562     branchlength += (cc[1] << 8) + cc[2];
563     cc += 4;
564     break;
565    
566     /* Handle single-char matchers */
567    
568     case OP_NOT_DIGIT:
569     case OP_DIGIT:
570     case OP_NOT_WHITESPACE:
571     case OP_WHITESPACE:
572     case OP_NOT_WORDCHAR:
573     case OP_WORDCHAR:
574     case OP_ANY:
575     branchlength++;
576     cc++;
577     break;
578    
579    
580     /* Check a class for variable quantification */
581    
582     case OP_CLASS:
583     cc += (*cc == OP_REF)? 2 : 33;
584    
585     switch (*cc)
586     {
587     case OP_CRSTAR:
588     case OP_CRMINSTAR:
589     case OP_CRQUERY:
590     case OP_CRMINQUERY:
591     return -1;
592    
593     case OP_CRRANGE:
594     case OP_CRMINRANGE:
595     if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;
596     branchlength += (cc[1] << 8) + cc[2];
597     cc += 5;
598     break;
599    
600     default:
601     branchlength++;
602     }
603     break;
604    
605     /* Anything else is variable length */
606    
607     default:
608     return -1;
609     }
610     }
611     /* Control never gets here */
612     }
613    
614    
615    
616    
617     /*************************************************
618 nigel 3 * Compile one branch *
619     *************************************************/
620    
621     /* Scan the pattern, compiling it into the code vector.
622    
623     Arguments:
624 nigel 25 options the option bits
625     brackets points to number of brackets used
626     code points to the pointer to the current code point
627     ptrptr points to the current pattern pointer
628     errorptr points to pointer to error message
629     optchanged set to the value of the last OP_OPT item compiled
630     cd contains pointers to tables
631 nigel 3
632 nigel 25 Returns: TRUE on success
633     FALSE, with *errorptr set on error
634 nigel 3 */
635    
636     static BOOL
637 nigel 7 compile_branch(int options, int *brackets, uschar **codeptr,
638 nigel 25 const uschar **ptrptr, const char **errorptr, int *optchanged,
639     compile_data *cd)
640 nigel 3 {
641     int repeat_type, op_type;
642     int repeat_min, repeat_max;
643     int bravalue, length;
644 nigel 19 int greedy_default, greedy_non_default;
645 nigel 3 register int c;
646     register uschar *code = *codeptr;
647 nigel 23 uschar *tempcode;
648 nigel 7 const uschar *ptr = *ptrptr;
649 nigel 23 const uschar *tempptr;
650 nigel 3 uschar *previous = NULL;
651     uschar class[32];
652    
653 nigel 19 /* Set up the default and non-default settings for greediness */
654    
655     greedy_default = ((options & PCRE_UNGREEDY) != 0);
656     greedy_non_default = greedy_default ^ 1;
657    
658 nigel 3 /* Switch on next character until the end of the branch */
659    
660     for (;; ptr++)
661     {
662     BOOL negate_class;
663 nigel 23 int class_charcount;
664     int class_lastchar;
665     int newoptions;
666     int condref;
667 nigel 3
668     c = *ptr;
669     if ((options & PCRE_EXTENDED) != 0)
670     {
671 nigel 25 if ((cd->ctypes[c] & ctype_space) != 0) continue;
672 nigel 3 if (c == '#')
673     {
674     while ((c = *(++ptr)) != 0 && c != '\n');
675     continue;
676     }
677     }
678    
679     switch(c)
680     {
681     /* The branch terminates at end of string, |, or ). */
682    
683     case 0:
684     case '|':
685     case ')':
686     *codeptr = code;
687     *ptrptr = ptr;
688     return TRUE;
689    
690     /* Handle single-character metacharacters */
691    
692     case '^':
693     previous = NULL;
694     *code++ = OP_CIRC;
695     break;
696    
697     case '$':
698     previous = NULL;
699     *code++ = OP_DOLL;
700     break;
701    
702     case '.':
703     previous = code;
704     *code++ = OP_ANY;
705     break;
706    
707     /* Character classes. These always build a 32-byte bitmap of the permitted
708     characters, except in the special case where there is only one character.
709     For negated classes, we build the map as usual, then invert it at the end.
710     */
711    
712     case '[':
713     previous = code;
714 nigel 23 *code++ = OP_CLASS;
715 nigel 3
716 nigel 23 /* If the first character is '^', set the negation flag and skip it. */
717 nigel 3
718     if ((c = *(++ptr)) == '^')
719     {
720     negate_class = TRUE;
721     c = *(++ptr);
722     }
723 nigel 23 else negate_class = FALSE;
724 nigel 3
725     /* Keep a count of chars so that we can optimize the case of just a single
726     character. */
727    
728     class_charcount = 0;
729     class_lastchar = -1;
730    
731     /* Initialize the 32-char bit map to all zeros. We have to build the
732     map in a temporary bit of store, in case the class contains only 1
733     character, because in that case the compiled code doesn't use the
734     bit map. */
735    
736     memset(class, 0, 32 * sizeof(uschar));
737    
738     /* Process characters until ] is reached. By writing this as a "do" it
739     means that an initial ] is taken as a data character. */
740    
741     do
742     {
743     if (c == 0)
744     {
745     *errorptr = ERR6;
746     goto FAILED;
747     }
748    
749     /* Backslash may introduce a single character, or it may introduce one
750     of the specials, which just set a flag. Escaped items are checked for
751     validity in the pre-compiling pass. The sequence \b is a special case.
752 nigel 7 Inside a class (and only there) it is treated as backspace. Elsewhere
753 nigel 3 it marks a word boundary. Other escapes have preset maps ready to
754     or into the one we are building. We assume they have more than one
755     character in them, so set class_count bigger than one. */
756    
757     if (c == '\\')
758     {
759 nigel 25 c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
760 nigel 3 if (-c == ESC_b) c = '\b';
761     else if (c < 0)
762     {
763 nigel 25 register const uschar *cbits = cd->cbits;
764 nigel 3 class_charcount = 10;
765     switch (-c)
766     {
767     case ESC_d:
768 nigel 25 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
769 nigel 3 continue;
770    
771     case ESC_D:
772 nigel 25 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
773 nigel 3 continue;
774    
775     case ESC_w:
776     for (c = 0; c < 32; c++)
777 nigel 25 class[c] |= (cbits[c+cbit_digit] | cbits[c+cbit_word]);
778 nigel 3 continue;
779    
780     case ESC_W:
781     for (c = 0; c < 32; c++)
782 nigel 25 class[c] |= ~(cbits[c+cbit_digit] | cbits[c+cbit_word]);
783 nigel 3 continue;
784    
785     case ESC_s:
786 nigel 25 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
787 nigel 3 continue;
788    
789     case ESC_S:
790 nigel 25 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
791 nigel 3 continue;
792    
793     default:
794     *errorptr = ERR7;
795     goto FAILED;
796     }
797     }
798     /* Fall through if single character */
799     }
800    
801     /* A single character may be followed by '-' to form a range. However,
802     Perl does not permit ']' to be the end of the range. A '-' character
803     here is treated as a literal. */
804    
805     if (ptr[1] == '-' && ptr[2] != ']')
806     {
807     int d;
808     ptr += 2;
809     d = *ptr;
810    
811     if (d == 0)
812     {
813     *errorptr = ERR6;
814     goto FAILED;
815     }
816    
817     /* The second part of a range can be a single-character escape, but
818     not any of the other escapes. */
819    
820     if (d == '\\')
821     {
822 nigel 25 d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
823 nigel 3 if (d < 0)
824     {
825     if (d == -ESC_b) d = '\b'; else
826     {
827     *errorptr = ERR7;
828     goto FAILED;
829     }
830     }
831     }
832    
833     if (d < c)
834     {
835     *errorptr = ERR8;
836     goto FAILED;
837     }
838    
839     for (; c <= d; c++)
840     {
841     class[c/8] |= (1 << (c&7));
842     if ((options & PCRE_CASELESS) != 0)
843     {
844 nigel 25 int uc = cd->fcc[c]; /* flip case */
845 nigel 3 class[uc/8] |= (1 << (uc&7));
846     }
847     class_charcount++; /* in case a one-char range */
848     class_lastchar = c;
849     }
850     continue; /* Go get the next char in the class */
851     }
852    
853     /* Handle a lone single character - we can get here for a normal
854     non-escape char, or after \ that introduces a single character. */
855    
856     class [c/8] |= (1 << (c&7));
857     if ((options & PCRE_CASELESS) != 0)
858     {
859 nigel 25 c = cd->fcc[c]; /* flip case */
860 nigel 3 class[c/8] |= (1 << (c&7));
861     }
862     class_charcount++;
863     class_lastchar = c;
864     }
865    
866     /* Loop until ']' reached; the check for end of string happens inside the
867     loop. This "while" is the end of the "do" above. */
868    
869     while ((c = *(++ptr)) != ']');
870    
871     /* If class_charcount is 1 and class_lastchar is not negative, we saw
872     precisely one character. This doesn't need the whole 32-byte bit map.
873     We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
874     it's negative. */
875    
876     if (class_charcount == 1 && class_lastchar >= 0)
877     {
878     if (negate_class)
879     {
880     code[-1] = OP_NOT;
881     }
882     else
883     {
884     code[-1] = OP_CHARS;
885     *code++ = 1;
886     }
887     *code++ = class_lastchar;
888     }
889    
890     /* Otherwise, negate the 32-byte map if necessary, and copy it into
891     the code vector. */
892    
893     else
894     {
895     if (negate_class)
896     for (c = 0; c < 32; c++) code[c] = ~class[c];
897     else
898     memcpy(code, class, 32);
899     code += 32;
900     }
901     break;
902    
903     /* Various kinds of repeat */
904    
905     case '{':
906 nigel 25 if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
907     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
908 nigel 3 if (*errorptr != NULL) goto FAILED;
909     goto REPEAT;
910    
911     case '*':
912     repeat_min = 0;
913     repeat_max = -1;
914     goto REPEAT;
915    
916     case '+':
917     repeat_min = 1;
918     repeat_max = -1;
919     goto REPEAT;
920    
921     case '?':
922     repeat_min = 0;
923     repeat_max = 1;
924    
925     REPEAT:
926     if (previous == NULL)
927     {
928     *errorptr = ERR9;
929     goto FAILED;
930     }
931    
932 nigel 19 /* If the next character is '?' this is a minimizing repeat, by default,
933     but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
934 nigel 3 next character. */
935    
936 nigel 19 if (ptr[1] == '?')
937     { repeat_type = greedy_non_default; ptr++; }
938     else repeat_type = greedy_default;
939 nigel 3
940     /* If the maximum is zero then the minimum must also be zero; Perl allows
941     this case, so we do too - by simply omitting the item altogether. */
942    
943     if (repeat_max == 0) code = previous;
944    
945     /* If previous was a string of characters, chop off the last one and use it
946     as the subject of the repeat. If there was only one character, we can
947     abolish the previous item altogether. */
948    
949     else if (*previous == OP_CHARS)
950     {
951     int len = previous[1];
952     if (len == 1)
953     {
954     c = previous[2];
955     code = previous;
956     }
957     else
958     {
959     c = previous[len+1];
960     previous[1]--;
961     code--;
962     }
963     op_type = 0; /* Use single-char op codes */
964     goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
965     }
966    
967     /* If previous was a single negated character ([^a] or similar), we use
968     one of the special opcodes, replacing it. The code is shared with single-
969     character repeats by adding a suitable offset into repeat_type. */
970    
971     else if ((int)*previous == OP_NOT)
972     {
973     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
974     c = previous[1];
975     code = previous;
976     goto OUTPUT_SINGLE_REPEAT;
977     }
978    
979     /* If previous was a character type match (\d or similar), abolish it and
980     create a suitable repeat item. The code is shared with single-character
981     repeats by adding a suitable offset into repeat_type. */
982    
983 nigel 23 else if ((int)*previous < OP_EODN || *previous == OP_ANY)
984 nigel 3 {
985     op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
986     c = *previous;
987     code = previous;
988    
989     OUTPUT_SINGLE_REPEAT:
990     repeat_type += op_type; /* Combine both values for many cases */
991    
992     /* A minimum of zero is handled either as the special case * or ?, or as
993     an UPTO, with the maximum given. */
994    
995     if (repeat_min == 0)
996     {
997     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
998     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
999     else
1000     {
1001     *code++ = OP_UPTO + repeat_type;
1002     *code++ = repeat_max >> 8;
1003     *code++ = (repeat_max & 255);
1004     }
1005     }
1006    
1007     /* The case {1,} is handled as the special case + */
1008    
1009     else if (repeat_min == 1 && repeat_max == -1)
1010     *code++ = OP_PLUS + repeat_type;
1011    
1012     /* The case {n,n} is just an EXACT, while the general case {n,m} is
1013     handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1014    
1015     else
1016     {
1017     if (repeat_min != 1)
1018     {
1019     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1020     *code++ = repeat_min >> 8;
1021     *code++ = (repeat_min & 255);
1022     }
1023    
1024     /* If the mininum is 1 and the previous item was a character string,
1025     we either have to put back the item that got cancelled if the string
1026     length was 1, or add the character back onto the end of a longer
1027 nigel 21 string. For a character type nothing need be done; it will just get
1028     put back naturally. Note that the final character is always going to
1029     get added below. */
1030 nigel 3
1031     else if (*previous == OP_CHARS)
1032     {
1033     if (code == previous) code += 2; else previous[1]++;
1034     }
1035    
1036 nigel 21 /* For a single negated character we also have to put back the
1037     item that got cancelled. */
1038    
1039     else if (*previous == OP_NOT) code++;
1040    
1041 nigel 9 /* If the maximum is unlimited, insert an OP_STAR. */
1042 nigel 3
1043 nigel 9 if (repeat_max < 0)
1044 nigel 3 {
1045     *code++ = c;
1046 nigel 9 *code++ = OP_STAR + repeat_type;
1047     }
1048    
1049     /* Else insert an UPTO if the max is greater than the min. */
1050    
1051     else if (repeat_max != repeat_min)
1052     {
1053     *code++ = c;
1054 nigel 3 repeat_max -= repeat_min;
1055     *code++ = OP_UPTO + repeat_type;
1056     *code++ = repeat_max >> 8;
1057     *code++ = (repeat_max & 255);
1058     }
1059     }
1060    
1061     /* The character or character type itself comes last in all cases. */
1062    
1063     *code++ = c;
1064     }
1065    
1066     /* If previous was a character class or a back reference, we put the repeat
1067     stuff after it. */
1068    
1069 nigel 23 else if (*previous == OP_CLASS || *previous == OP_REF)
1070 nigel 3 {
1071     if (repeat_min == 0 && repeat_max == -1)
1072     *code++ = OP_CRSTAR + repeat_type;
1073     else if (repeat_min == 1 && repeat_max == -1)
1074     *code++ = OP_CRPLUS + repeat_type;
1075     else if (repeat_min == 0 && repeat_max == 1)
1076     *code++ = OP_CRQUERY + repeat_type;
1077     else
1078     {
1079     *code++ = OP_CRRANGE + repeat_type;
1080     *code++ = repeat_min >> 8;
1081     *code++ = repeat_min & 255;
1082     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1083     *code++ = repeat_max >> 8;
1084     *code++ = repeat_max & 255;
1085     }
1086     }
1087    
1088     /* If previous was a bracket group, we may have to replicate it in certain
1089 nigel 23 cases. */
1090 nigel 3
1091 nigel 23 else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
1092     (int)*previous == OP_COND)
1093 nigel 3 {
1094 nigel 23 int i, ketoffset = 0;
1095 nigel 9 int len = code - previous;
1096 nigel 3
1097 nigel 23 /* If the maximum repeat count is unlimited, find the end of the bracket
1098     by scanning through from the start, and compute the offset back to it
1099     from the current code pointer. There may be an OP_OPT setting following
1100     the final KET, so we can't find the end just by going back from the code
1101     pointer. */
1102    
1103     if (repeat_max == -1)
1104 nigel 3 {
1105 nigel 23 register uschar *ket = previous;
1106     do ket += (ket[1] << 8) + ket[2]; while (*ket != OP_KET);
1107     ketoffset = code - ket;
1108 nigel 3 }
1109    
1110     /* If the minimum is greater than zero, and the maximum is unlimited or
1111     equal to the minimum, the first copy remains where it is, and is
1112     replicated up to the minimum number of times. This case includes the +
1113     repeat, but of course no replication is needed in that case. */
1114    
1115     if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1116     {
1117     for (i = 1; i < repeat_min; i++)
1118     {
1119 nigel 9 memcpy(code, previous, len);
1120     code += len;
1121 nigel 3 }
1122     }
1123    
1124     /* If the minimum is zero, stick BRAZERO in front of the first copy.
1125     Then, if there is a fixed upper limit, replicated up to that many times,
1126     sticking BRAZERO in front of all the optional ones. */
1127    
1128     else
1129     {
1130     if (repeat_min == 0)
1131     {
1132 nigel 9 memmove(previous+1, previous, len);
1133 nigel 3 code++;
1134     *previous++ = OP_BRAZERO + repeat_type;
1135     }
1136    
1137     for (i = 1; i < repeat_min; i++)
1138     {
1139 nigel 9 memcpy(code, previous, len);
1140     code += len;
1141 nigel 3 }
1142    
1143     for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1144     {
1145     *code++ = OP_BRAZERO + repeat_type;
1146 nigel 9 memcpy(code, previous, len);
1147     code += len;
1148 nigel 3 }
1149     }
1150    
1151 nigel 23 /* If the maximum is unlimited, set a repeater in the final copy. We
1152     can't just offset backwards from the current code point, because we
1153     don't know if there's been an options resetting after the ket. The
1154     correct offset was computed above. */
1155 nigel 3
1156 nigel 23 if (repeat_max == -1) code[-ketoffset] = OP_KETRMAX + repeat_type;
1157 nigel 3 }
1158    
1159     /* Else there's some kind of shambles */
1160    
1161     else
1162     {
1163     *errorptr = ERR11;
1164     goto FAILED;
1165     }
1166    
1167     /* In all case we no longer have a previous item. */
1168    
1169     previous = NULL;
1170     break;
1171    
1172    
1173 nigel 23 /* Start of nested bracket sub-expression, or comment or lookahead or
1174     lookbehind or option setting or condition. First deal with special things
1175     that can come after a bracket; all are introduced by ?, and the appearance
1176     of any of them means that this is not a referencing group. They were
1177     checked for validity in the first pass over the string, so we don't have to
1178     check for syntax errors here. */
1179 nigel 3
1180     case '(':
1181 nigel 23 newoptions = options;
1182     condref = -1;
1183    
1184 nigel 3 if (*(++ptr) == '?')
1185     {
1186 nigel 23 int set, unset;
1187     int *optset;
1188 nigel 3
1189     switch (*(++ptr))
1190     {
1191 nigel 23 case '#': /* Comment; skip to ket */
1192 nigel 3 ptr++;
1193     while (*ptr != ')') ptr++;
1194     continue;
1195    
1196     case ':': /* Non-extracting bracket */
1197 nigel 23 bravalue = OP_BRA;
1198 nigel 3 ptr++;
1199     break;
1200    
1201 nigel 23 case '(':
1202     bravalue = OP_COND; /* Conditional group */
1203 nigel 25 if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
1204 nigel 23 {
1205     condref = *ptr - '0';
1206     while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
1207     ptr++;
1208     }
1209     else ptr--;
1210     break;
1211    
1212     case '=': /* Positive lookahead */
1213 nigel 3 bravalue = OP_ASSERT;
1214     ptr++;
1215     break;
1216    
1217 nigel 23 case '!': /* Negative lookahead */
1218 nigel 3 bravalue = OP_ASSERT_NOT;
1219     ptr++;
1220     break;
1221    
1222 nigel 23 case '<': /* Lookbehinds */
1223     switch (*(++ptr))
1224 nigel 3 {
1225 nigel 23 case '=': /* Positive lookbehind */
1226     bravalue = OP_ASSERTBACK;
1227 nigel 3 ptr++;
1228     break;
1229 nigel 23
1230     case '!': /* Negative lookbehind */
1231     bravalue = OP_ASSERTBACK_NOT;
1232     ptr++;
1233     break;
1234    
1235     default: /* Syntax error */
1236     *errorptr = ERR24;
1237     goto FAILED;
1238 nigel 3 }
1239 nigel 23 break;
1240 nigel 3
1241 nigel 23 case '>': /* One-time brackets */
1242     bravalue = OP_ONCE;
1243     ptr++;
1244     break;
1245    
1246     default: /* Option setting */
1247     set = unset = 0;
1248     optset = &set;
1249    
1250     while (*ptr != ')' && *ptr != ':')
1251     {
1252     switch (*ptr++)
1253     {
1254     case '-': optset = &unset; break;
1255    
1256     case 'i': *optset |= PCRE_CASELESS; break;
1257     case 'm': *optset |= PCRE_MULTILINE; break;
1258     case 's': *optset |= PCRE_DOTALL; break;
1259     case 'x': *optset |= PCRE_EXTENDED; break;
1260     case 'U': *optset |= PCRE_UNGREEDY; break;
1261     case 'X': *optset |= PCRE_EXTRA; break;
1262    
1263     default:
1264     *errorptr = ERR12;
1265     goto FAILED;
1266     }
1267     }
1268    
1269     /* Set up the changed option bits, but don't change anything yet. */
1270    
1271     newoptions = (options | set) & (~unset);
1272    
1273     /* If the options ended with ')' this is not the start of a nested
1274     group with option changes, so the options change at this level. At top
1275     level there is nothing else to be done (the options will in fact have
1276     been set from the start of compiling as a result of the first pass) but
1277     at an inner level we must compile code to change the ims options if
1278     necessary, and pass the new setting back so that it can be put at the
1279     start of any following branches, and when this group ends, a resetting
1280     item can be compiled. */
1281    
1282     if (*ptr == ')')
1283     {
1284     if ((options & PCRE_INGROUP) != 0 &&
1285     (options & PCRE_IMS) != (newoptions & PCRE_IMS))
1286     {
1287     *code++ = OP_OPT;
1288     *code++ = *optchanged = newoptions & PCRE_IMS;
1289     }
1290     options = newoptions; /* Change options at this level */
1291     previous = NULL; /* This item can't be repeated */
1292     continue; /* It is complete */
1293     }
1294    
1295     /* If the options ended with ':' we are heading into a nested group
1296     with possible change of options. Such groups are non-capturing and are
1297     not assertions of any kind. All we need to do is skip over the ':';
1298     the newoptions value is handled below. */
1299    
1300     bravalue = OP_BRA;
1301     ptr++;
1302 nigel 3 }
1303     }
1304    
1305 nigel 23 /* Else we have a referencing group; adjust the opcode. */
1306 nigel 3
1307     else
1308     {
1309     if (++(*brackets) > EXTRACT_MAX)
1310     {
1311     *errorptr = ERR13;
1312     goto FAILED;
1313     }
1314     bravalue = OP_BRA + *brackets;
1315     }
1316    
1317 nigel 23 /* Process nested bracketed re. Assertions may not be repeated, but other
1318     kinds can be. We copy code into a non-register variable in order to be able
1319     to pass its address because some compilers complain otherwise. Pass in a
1320     new setting for the ims options if they have changed. */
1321 nigel 3
1322 nigel 23 previous = (bravalue >= OP_ONCE)? code : NULL;
1323 nigel 3 *code = bravalue;
1324 nigel 23 tempcode = code;
1325    
1326     if (!compile_regex(
1327     options | PCRE_INGROUP, /* Set for all nested groups */
1328     ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
1329     newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
1330     brackets, /* Bracket level */
1331     &tempcode, /* Where to put code (updated) */
1332     &ptr, /* Input pointer (updated) */
1333     errorptr, /* Where to put an error message */
1334     (bravalue == OP_ASSERTBACK ||
1335     bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1336 nigel 25 condref, /* Condition reference number */
1337     cd)) /* Tables block */
1338 nigel 23 goto FAILED;
1339    
1340     /* At the end of compiling, code is still pointing to the start of the
1341     group, while tempcode has been updated to point past the end of the group
1342     and any option resetting that may follow it. The pattern pointer (ptr)
1343     is on the bracket. */
1344    
1345     /* If this is a conditional bracket, check that there are no more than
1346     two branches in the group. */
1347    
1348     if (bravalue == OP_COND)
1349 nigel 3 {
1350 nigel 23 int branchcount = 0;
1351     uschar *tc = code;
1352    
1353     do {
1354     branchcount++;
1355     tc += (tc[1] << 8) | tc[2];
1356     }
1357     while (*tc != OP_KET);
1358    
1359     if (branchcount > 2)
1360     {
1361     *errorptr = ERR27;
1362 nigel 3 goto FAILED;
1363 nigel 23 }
1364 nigel 3 }
1365    
1366 nigel 23 /* Now update the main code pointer to the end of the group. */
1367    
1368     code = tempcode;
1369    
1370     /* Error if hit end of pattern */
1371    
1372 nigel 3 if (*ptr != ')')
1373     {
1374     *errorptr = ERR14;
1375     goto FAILED;
1376     }
1377     break;
1378    
1379     /* Check \ for being a real metacharacter; if not, fall through and handle
1380     it as a data character at the start of a string. Escape items are checked
1381     for validity in the pre-compiling pass. */
1382    
1383     case '\\':
1384 nigel 23 tempptr = ptr;
1385 nigel 25 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1386 nigel 3
1387     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1388     are arranged to be the negation of the corresponding OP_values. For the
1389     back references, the values are ESC_REF plus the reference number. Only
1390     back references and those types that consume a character may be repeated.
1391     We can test for values between ESC_b and ESC_Z for the latter; this may
1392     have to change if any new ones are ever created. */
1393    
1394     if (c < 0)
1395     {
1396     if (-c >= ESC_REF)
1397     {
1398     previous = code;
1399     *code++ = OP_REF;
1400 nigel 23 *code++ = -c - ESC_REF;
1401 nigel 3 }
1402     else
1403     {
1404 nigel 23 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
1405 nigel 3 *code++ = -c;
1406     }
1407     continue;
1408     }
1409    
1410 nigel 7 /* Data character: reset and fall through */
1411 nigel 3
1412 nigel 23 ptr = tempptr;
1413 nigel 3 c = '\\';
1414    
1415     /* Handle a run of data characters until a metacharacter is encountered.
1416     The first character is guaranteed not to be whitespace or # when the
1417     extended flag is set. */
1418    
1419     NORMAL_CHAR:
1420     default:
1421     previous = code;
1422     *code = OP_CHARS;
1423     code += 2;
1424     length = 0;
1425    
1426     do
1427     {
1428     if ((options & PCRE_EXTENDED) != 0)
1429     {
1430 nigel 25 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1431 nigel 3 if (c == '#')
1432     {
1433     while ((c = *(++ptr)) != 0 && c != '\n');
1434     if (c == 0) break;
1435     continue;
1436     }
1437     }
1438    
1439     /* Backslash may introduce a data char or a metacharacter. Escaped items
1440     are checked for validity in the pre-compiling pass. Stop the string
1441     before a metaitem. */
1442    
1443     if (c == '\\')
1444     {
1445 nigel 23 tempptr = ptr;
1446 nigel 25 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1447 nigel 23 if (c < 0) { ptr = tempptr; break; }
1448 nigel 3 }
1449    
1450     /* Ordinary character or single-char escape */
1451    
1452     *code++ = c;
1453     length++;
1454     }
1455    
1456     /* This "while" is the end of the "do" above. */
1457    
1458 nigel 25 while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
1459 nigel 3
1460     /* Compute the length and set it in the data vector, and advance to
1461     the next state. */
1462    
1463     previous[1] = length;
1464 nigel 15 if (length < 255) ptr--;
1465 nigel 3 break;
1466     }
1467     } /* end of big loop */
1468    
1469     /* Control never reaches here by falling through, only by a goto for all the
1470     error states. Pass back the position in the pattern so that it can be displayed
1471     to the user for diagnosing the error. */
1472    
1473     FAILED:
1474     *ptrptr = ptr;
1475     return FALSE;
1476     }
1477    
1478    
1479    
1480    
1481     /*************************************************
1482     * Compile sequence of alternatives *
1483     *************************************************/
1484    
1485     /* On entry, ptr is pointing past the bracket character, but on return
1486     it points to the closing bracket, or vertical bar, or end of string.
1487     The code variable is pointing at the byte into which the BRA operator has been
1488 nigel 23 stored. If the ims options are changed at the start (for a (?ims: group) or
1489     during any branch, we need to insert an OP_OPT item at the start of every
1490     following branch to ensure they get set correctly at run time, and also pass
1491     the new options into every subsequent branch compile.
1492 nigel 3
1493     Argument:
1494 nigel 23 options the option bits
1495     optchanged new ims options to set as if (?ims) were at the start, or -1
1496     for no change
1497     brackets -> int containing the number of extracting brackets used
1498     codeptr -> the address of the current code pointer
1499     ptrptr -> the address of the current pattern pointer
1500     errorptr -> pointer to error message
1501     lookbehind TRUE if this is a lookbehind assertion
1502     condref > 0 for OPT_CREF setting at start of conditional group
1503 nigel 25 cd points to the data block with tables pointers
1504 nigel 3
1505 nigel 23 Returns: TRUE on success
1506 nigel 3 */
1507    
1508     static BOOL
1509 nigel 23 compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
1510 nigel 25 const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,
1511     compile_data *cd)
1512 nigel 3 {
1513 nigel 7 const uschar *ptr = *ptrptr;
1514 nigel 3 uschar *code = *codeptr;
1515 nigel 23 uschar *last_branch = code;
1516 nigel 3 uschar *start_bracket = code;
1517 nigel 23 uschar *reverse_count = NULL;
1518     int oldoptions = options & PCRE_IMS;
1519 nigel 3
1520 nigel 23 code += 3;
1521    
1522     /* At the start of a reference-based conditional group, insert the reference
1523     number as an OP_CREF item. */
1524    
1525     if (condref > 0)
1526     {
1527     *code++ = OP_CREF;
1528     *code++ = condref;
1529     }
1530    
1531     /* Loop for each alternative branch */
1532    
1533 nigel 3 for (;;)
1534     {
1535     int length;
1536    
1537 nigel 23 /* Handle change of options */
1538    
1539     if (optchanged >= 0)
1540 nigel 3 {
1541 nigel 23 *code++ = OP_OPT;
1542     *code++ = optchanged;
1543     options = (options & ~PCRE_IMS) | optchanged;
1544     }
1545    
1546     /* Set up dummy OP_REVERSE if lookbehind assertion */
1547    
1548     if (lookbehind)
1549     {
1550     *code++ = OP_REVERSE;
1551     reverse_count = code;
1552     *code++ = 0;
1553     *code++ = 0;
1554     }
1555    
1556     /* Now compile the branch */
1557    
1558 nigel 25 if (!compile_branch(options,brackets,&code,&ptr,errorptr,&optchanged,cd))
1559 nigel 23 {
1560 nigel 3 *ptrptr = ptr;
1561     return FALSE;
1562     }
1563    
1564     /* Fill in the length of the last branch */
1565    
1566     length = code - last_branch;
1567     last_branch[1] = length >> 8;
1568     last_branch[2] = length & 255;
1569    
1570 nigel 23 /* If lookbehind, check that this branch matches a fixed-length string,
1571     and put the length into the OP_REVERSE item. Temporarily mark the end of
1572     the branch with OP_END. */
1573    
1574     if (lookbehind)
1575     {
1576     *code = OP_END;
1577     length = find_fixedlength(last_branch);
1578     DPRINTF(("fixed length = %d\n", length));
1579     if (length < 0)
1580     {
1581     *errorptr = ERR25;
1582     *ptrptr = ptr;
1583     return FALSE;
1584     }
1585     reverse_count[0] = (length >> 8);
1586     reverse_count[1] = length & 255;
1587     }
1588    
1589 nigel 3 /* Reached end of expression, either ')' or end of pattern. Insert a
1590     terminating ket and the length of the whole bracketed item, and return,
1591 nigel 23 leaving the pointer at the terminating char. If any of the ims options
1592     were changed inside the group, compile a resetting op-code following. */
1593 nigel 3
1594     if (*ptr != '|')
1595     {
1596     length = code - start_bracket;
1597     *code++ = OP_KET;
1598     *code++ = length >> 8;
1599     *code++ = length & 255;
1600 nigel 23 if (optchanged >= 0)
1601     {
1602     *code++ = OP_OPT;
1603     *code++ = oldoptions;
1604     }
1605 nigel 3 *codeptr = code;
1606     *ptrptr = ptr;
1607     return TRUE;
1608     }
1609    
1610     /* Another branch follows; insert an "or" node and advance the pointer. */
1611    
1612     *code = OP_ALT;
1613 nigel 23 last_branch = code;
1614     code += 3;
1615 nigel 3 ptr++;
1616     }
1617     /* Control never reaches here */
1618     }
1619    
1620    
1621    
1622 nigel 23
1623 nigel 3 /*************************************************
1624 nigel 23 * Find first significant op code *
1625     *************************************************/
1626    
1627     /* This is called by several functions that scan a compiled expression looking
1628     for a fixed first character, or an anchoring op code etc. It skips over things
1629     that do not influence this. For one application, a change of caseless option is
1630     important.
1631    
1632     Arguments:
1633     code pointer to the start of the group
1634     options pointer to external options
1635     optbit the option bit whose changing is significant, or
1636     zero if none are
1637     optstop TRUE to return on option change, otherwise change the options
1638     value and continue
1639    
1640     Returns: pointer to the first significant opcode
1641     */
1642    
1643     static const uschar*
1644     first_significant_code(const uschar *code, int *options, int optbit,
1645     BOOL optstop)
1646     {
1647     for (;;)
1648     {
1649     switch ((int)*code)
1650     {
1651     case OP_OPT:
1652     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1653     {
1654     if (optstop) return code;
1655     *options = (int)code[1];
1656     }
1657     code += 2;
1658     break;
1659    
1660     case OP_CREF:
1661     code += 2;
1662     break;
1663    
1664     case OP_ASSERT_NOT:
1665     case OP_ASSERTBACK:
1666     case OP_ASSERTBACK_NOT:
1667     do code += (code[1] << 8) + code[2]; while (*code == OP_ALT);
1668     code += 3;
1669     break;
1670    
1671     default:
1672     return code;
1673     }
1674     }
1675     /* Control never reaches here */
1676     }
1677    
1678    
1679    
1680    
1681     /*************************************************
1682 nigel 3 * Check for anchored expression *
1683     *************************************************/
1684    
1685     /* Try to find out if this is an anchored regular expression. Consider each
1686     alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
1687     all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
1688     it's anchored. However, if this is a multiline pattern, then only OP_SOD
1689     counts, since OP_CIRC can match in the middle.
1690    
1691     A branch is also implicitly anchored if it starts with .* because that will try
1692     the rest of the pattern at all possible matching points, so there is no point
1693     trying them again.
1694    
1695 nigel 23 Arguments:
1696     code points to start of expression (the bracket)
1697     options points to the options setting
1698    
1699     Returns: TRUE or FALSE
1700 nigel 3 */
1701    
1702     static BOOL
1703 nigel 23 is_anchored(register const uschar *code, int *options)
1704 nigel 3 {
1705     do {
1706 nigel 23 const uschar *scode = first_significant_code(code + 3, options,
1707     PCRE_MULTILINE, FALSE);
1708     register int op = *scode;
1709     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
1710     { if (!is_anchored(scode, options)) return FALSE; }
1711 nigel 3 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
1712 nigel 23 { if (scode[1] != OP_ANY) return FALSE; }
1713     else if (op != OP_SOD &&
1714     ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
1715     return FALSE;
1716 nigel 3 code += (code[1] << 8) + code[2];
1717     }
1718     while (*code == OP_ALT);
1719     return TRUE;
1720     }
1721    
1722    
1723    
1724     /*************************************************
1725     * Check for start with \n line expression *
1726     *************************************************/
1727    
1728     /* This is called for multiline expressions to try to find out if every branch
1729     starts with ^ so that "first char" processing can be done to speed things up.
1730    
1731     Argument: points to start of expression (the bracket)
1732     Returns: TRUE or FALSE
1733     */
1734    
1735     static BOOL
1736 nigel 7 is_startline(const uschar *code)
1737 nigel 3 {
1738     do {
1739 nigel 23 const uschar *scode = first_significant_code(code + 3, NULL, 0, FALSE);
1740     register int op = *scode;
1741     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
1742     { if (!is_startline(scode)) return FALSE; }
1743     else if (op != OP_CIRC) return FALSE;
1744 nigel 3 code += (code[1] << 8) + code[2];
1745     }
1746     while (*code == OP_ALT);
1747     return TRUE;
1748     }
1749    
1750    
1751    
1752     /*************************************************
1753     * Check for fixed first char *
1754     *************************************************/
1755    
1756     /* Try to find out if there is a fixed first character. This is called for
1757     unanchored expressions, as it speeds up their processing quite considerably.
1758     Consider each alternative branch. If they all start with the same char, or with
1759     a bracket all of whose alternatives start with the same char (recurse ad lib),
1760     then we return that char, otherwise -1.
1761    
1762 nigel 23 Arguments:
1763     code points to start of expression (the bracket)
1764     options pointer to the options (used to check casing changes)
1765    
1766     Returns: -1 or the fixed first char
1767 nigel 3 */
1768    
1769     static int
1770 nigel 23 find_firstchar(const uschar *code, int *options)
1771 nigel 3 {
1772     register int c = -1;
1773 nigel 23 do {
1774     int d;
1775     const uschar *scode = first_significant_code(code + 3, options,
1776     PCRE_CASELESS, TRUE);
1777     register int op = *scode;
1778 nigel 3
1779 nigel 23 if (op >= OP_BRA) op = OP_BRA;
1780 nigel 3
1781 nigel 23 switch(op)
1782     {
1783     default:
1784     return -1;
1785 nigel 3
1786 nigel 23 case OP_BRA:
1787     case OP_ASSERT:
1788     case OP_ONCE:
1789     case OP_COND:
1790     if ((d = find_firstchar(scode, options)) < 0) return -1;
1791     if (c < 0) c = d; else if (c != d) return -1;
1792     break;
1793 nigel 3
1794 nigel 23 case OP_EXACT: /* Fall through */
1795     scode++;
1796 nigel 3
1797 nigel 23 case OP_CHARS: /* Fall through */
1798     scode++;
1799    
1800     case OP_PLUS:
1801     case OP_MINPLUS:
1802     if (c < 0) c = scode[1]; else if (c != scode[1]) return -1;
1803     break;
1804     }
1805    
1806     code += (code[1] << 8) + code[2];
1807     }
1808 nigel 3 while (*code == OP_ALT);
1809     return c;
1810     }
1811    
1812    
1813    
1814 nigel 23
1815    
1816 nigel 3 /*************************************************
1817     * Compile a Regular Expression *
1818     *************************************************/
1819    
1820     /* This function takes a string and returns a pointer to a block of store
1821     holding a compiled version of the expression.
1822    
1823     Arguments:
1824     pattern the regular expression
1825     options various option bits
1826     errorptr pointer to pointer to error text
1827     erroroffset ptr offset in pattern where error was detected
1828 nigel 25 tables pointer to character tables or NULL
1829 nigel 3
1830     Returns: pointer to compiled data block, or NULL on error,
1831     with errorptr and erroroffset set
1832     */
1833    
1834     pcre *
1835 nigel 7 pcre_compile(const char *pattern, int options, const char **errorptr,
1836 nigel 25 int *erroroffset, const unsigned char *tables)
1837 nigel 3 {
1838     real_pcre *re;
1839     int length = 3; /* For initial BRA plus length */
1840     int runlength;
1841     int c, size;
1842     int bracount = 0;
1843     int top_backref = 0;
1844 nigel 23 int branch_extra = 0;
1845     int branch_newextra;
1846 nigel 7 unsigned int brastackptr = 0;
1847     uschar *code;
1848     const uschar *ptr;
1849 nigel 25 compile_data compile_block;
1850 nigel 23 int brastack[BRASTACK_SIZE];
1851     uschar bralenstack[BRASTACK_SIZE];
1852 nigel 3
1853     #ifdef DEBUG
1854     uschar *code_base, *code_end;
1855     #endif
1856    
1857     /* We can't pass back an error message if errorptr is NULL; I guess the best we
1858     can do is just return NULL. */
1859    
1860     if (errorptr == NULL) return NULL;
1861     *errorptr = NULL;
1862    
1863     /* However, we can give a message for this error */
1864    
1865     if (erroroffset == NULL)
1866     {
1867     *errorptr = ERR16;
1868     return NULL;
1869     }
1870     *erroroffset = 0;
1871    
1872     if ((options & ~PUBLIC_OPTIONS) != 0)
1873     {
1874     *errorptr = ERR17;
1875     return NULL;
1876     }
1877    
1878 nigel 25 /* Set up pointers to the individual character tables */
1879    
1880     if (tables == NULL) tables = pcre_default_tables;
1881     compile_block.lcc = tables + lcc_offset;
1882     compile_block.fcc = tables + fcc_offset;
1883     compile_block.cbits = tables + cbits_offset;
1884     compile_block.ctypes = tables + ctypes_offset;
1885    
1886     /* Reflect pattern for debugging output */
1887    
1888 nigel 9 DPRINTF(("------------------------------------------------------------------\n"));
1889     DPRINTF(("%s\n", pattern));
1890 nigel 3
1891     /* The first thing to do is to make a pass over the pattern to compute the
1892     amount of store required to hold the compiled code. This does not have to be
1893     perfect as long as errors are overestimates. At the same time we can detect any
1894     internal flag settings. Make an attempt to correct for any counted white space
1895     if an "extended" flag setting appears late in the pattern. We can't be so
1896     clever for #-comments. */
1897    
1898 nigel 7 ptr = (const uschar *)(pattern - 1);
1899 nigel 3 while ((c = *(++ptr)) != 0)
1900     {
1901     int min, max;
1902     int class_charcount;
1903    
1904 nigel 23 if ((options & PCRE_EXTENDED) != 0)
1905 nigel 3 {
1906 nigel 25 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
1907 nigel 23 if (c == '#')
1908     {
1909     while ((c = *(++ptr)) != 0 && c != '\n');
1910     continue;
1911     }
1912 nigel 3 }
1913    
1914     switch(c)
1915     {
1916     /* A backslashed item may be an escaped "normal" character or a
1917     character type. For a "normal" character, put the pointers and
1918     character back so that tests for whitespace etc. in the input
1919     are done correctly. */
1920    
1921     case '\\':
1922     {
1923 nigel 7 const uschar *save_ptr = ptr;
1924 nigel 25 c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
1925 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1926     if (c >= 0)
1927     {
1928     ptr = save_ptr;
1929     c = '\\';
1930     goto NORMAL_CHAR;
1931     }
1932     }
1933     length++;
1934    
1935     /* A back reference needs an additional char, plus either one or 5
1936     bytes for a repeat. We also need to keep the value of the highest
1937     back reference. */
1938    
1939     if (c <= -ESC_REF)
1940     {
1941     int refnum = -c - ESC_REF;
1942     if (refnum > top_backref) top_backref = refnum;
1943     length++; /* For single back reference */
1944 nigel 25 if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
1945 nigel 3 {
1946 nigel 25 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
1947 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1948     if ((min == 0 && (max == 1 || max == -1)) ||
1949     (min == 1 && max == -1))
1950     length++;
1951     else length += 5;
1952     if (ptr[1] == '?') ptr++;
1953     }
1954     }
1955     continue;
1956    
1957     case '^':
1958     case '.':
1959     case '$':
1960     case '*': /* These repeats won't be after brackets; */
1961     case '+': /* those are handled separately */
1962     case '?':
1963     length++;
1964     continue;
1965    
1966     /* This covers the cases of repeats after a single char, metachar, class,
1967     or back reference. */
1968    
1969     case '{':
1970 nigel 25 if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
1971     ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
1972 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1973     if ((min == 0 && (max == 1 || max == -1)) ||
1974     (min == 1 && max == -1))
1975     length++;
1976     else
1977     {
1978     length--; /* Uncount the original char or metachar */
1979     if (min == 1) length++; else if (min > 0) length += 4;
1980     if (max > 0) length += 4; else length += 2;
1981     }
1982     if (ptr[1] == '?') ptr++;
1983     continue;
1984    
1985 nigel 23 /* An alternation contains an offset to the next branch or ket. If any ims
1986     options changed in the previous branch(es), and/or if we are in a
1987     lookbehind assertion, extra space will be needed at the start of the
1988     branch. This is handled by branch_extra. */
1989    
1990 nigel 3 case '|':
1991 nigel 23 length += 3 + branch_extra;
1992 nigel 3 continue;
1993    
1994     /* A character class uses 33 characters. Don't worry about character types
1995     that aren't allowed in classes - they'll get picked up during the compile.
1996     A character class that contains only one character uses 2 or 3 bytes,
1997     depending on whether it is negated or not. Notice this where we can. */
1998    
1999     case '[':
2000     class_charcount = 0;
2001     if (*(++ptr) == '^') ptr++;
2002     do
2003     {
2004     if (*ptr == '\\')
2005     {
2006 nigel 25 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
2007     &compile_block);
2008 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2009 nigel 9 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2010 nigel 3 }
2011     else class_charcount++;
2012     ptr++;
2013     }
2014     while (*ptr != 0 && *ptr != ']');
2015    
2016     /* Repeats for negated single chars are handled by the general code */
2017    
2018     if (class_charcount == 1) length += 3; else
2019     {
2020     length += 33;
2021    
2022     /* A repeat needs either 1 or 5 bytes. */
2023    
2024 nigel 25 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2025 nigel 3 {
2026 nigel 25 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2027 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2028     if ((min == 0 && (max == 1 || max == -1)) ||
2029     (min == 1 && max == -1))
2030     length++;
2031     else length += 5;
2032     if (ptr[1] == '?') ptr++;
2033     }
2034     }
2035     continue;
2036    
2037     /* Brackets may be genuine groups or special things */
2038    
2039     case '(':
2040 nigel 23 branch_newextra = 0;
2041 nigel 3
2042     /* Handle special forms of bracket, which all start (? */
2043    
2044 nigel 23 if (ptr[1] == '?')
2045 nigel 3 {
2046 nigel 23 int set, unset;
2047     int *optset;
2048    
2049     switch (c = ptr[2])
2050 nigel 3 {
2051 nigel 23 /* Skip over comments entirely */
2052     case '#':
2053     ptr += 3;
2054     while (*ptr != 0 && *ptr != ')') ptr++;
2055     if (*ptr == 0)
2056     {
2057     *errorptr = ERR18;
2058     goto PCRE_ERROR_RETURN;
2059     }
2060     continue;
2061 nigel 3
2062 nigel 23 /* Non-referencing groups and lookaheads just move the pointer on, and
2063     then behave like a non-special bracket, except that they don't increment
2064     the count of extracting brackets. Ditto for the "once only" bracket,
2065     which is in Perl from version 5.005. */
2066 nigel 3
2067 nigel 23 case ':':
2068     case '=':
2069     case '!':
2070     case '>':
2071 nigel 3 ptr += 2;
2072     break;
2073    
2074 nigel 23 /* Lookbehinds are in Perl from version 5.005 */
2075 nigel 3
2076 nigel 23 case '<':
2077     if (ptr[3] == '=' || ptr[3] == '!')
2078 nigel 3 {
2079 nigel 23 ptr += 3;
2080     branch_newextra = 3;
2081     length += 3; /* For the first branch */
2082     break;
2083 nigel 3 }
2084 nigel 23 *errorptr = ERR24;
2085     goto PCRE_ERROR_RETURN;
2086    
2087     /* Conditionals are in Perl from version 5.005. The bracket must either
2088     be followed by a number (for bracket reference) or by an assertion
2089     group. */
2090    
2091     case '(':
2092 nigel 25 if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
2093 nigel 3 {
2094 nigel 23 ptr += 4;
2095     length += 2;
2096 nigel 25 while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
2097 nigel 23 if (*ptr != ')')
2098     {
2099     *errorptr = ERR26;
2100     goto PCRE_ERROR_RETURN;
2101     }
2102 nigel 3 }
2103 nigel 23 else /* An assertion must follow */
2104 nigel 3 {
2105 nigel 23 ptr++; /* Can treat like ':' as far as spacing is concerned */
2106    
2107     if (ptr[2] != '?' || strchr("=!<", ptr[3]) == NULL)
2108     {
2109     ptr += 2; /* To get right offset in message */
2110     *errorptr = ERR28;
2111     goto PCRE_ERROR_RETURN;
2112     }
2113 nigel 3 }
2114 nigel 23 break;
2115    
2116     /* Else loop checking valid options until ) is met. Anything else is an
2117     error. If we are without any brackets, i.e. at top level, the settings
2118     act as if specified in the options, so massage the options immediately.
2119     This is for backward compatibility with Perl 5.004. */
2120    
2121     default:
2122     set = unset = 0;
2123     optset = &set;
2124     ptr += 2;
2125    
2126     for (;; ptr++)
2127 nigel 3 {
2128 nigel 23 c = *ptr;
2129     switch (c)
2130     {
2131     case 'i':
2132     *optset |= PCRE_CASELESS;
2133     continue;
2134    
2135     case 'm':
2136     *optset |= PCRE_MULTILINE;
2137     continue;
2138    
2139     case 's':
2140     *optset |= PCRE_DOTALL;
2141     continue;
2142    
2143     case 'x':
2144     *optset |= PCRE_EXTENDED;
2145     continue;
2146    
2147     case 'X':
2148     *optset |= PCRE_EXTRA;
2149     continue;
2150    
2151     case 'U':
2152     *optset |= PCRE_UNGREEDY;
2153     continue;
2154    
2155     case '-':
2156     optset = &unset;
2157     continue;
2158    
2159     /* A termination by ')' indicates an options-setting-only item;
2160     this is global at top level; otherwise nothing is done here and
2161     it is handled during the compiling process on a per-bracket-group
2162     basis. */
2163    
2164     case ')':
2165     if (brastackptr == 0)
2166     {
2167     options = (options | set) & (~unset);
2168     set = unset = 0; /* To save length */
2169     }
2170     /* Fall through */
2171    
2172     /* A termination by ':' indicates the start of a nested group with
2173     the given options set. This is again handled at compile time, but
2174     we must allow for compiled space if any of the ims options are
2175     set. We also have to allow for resetting space at the end of
2176     the group, which is why 4 is added to the length and not just 2.
2177     If there are several changes of options within the same group, this
2178     will lead to an over-estimate on the length, but this shouldn't
2179     matter very much. We also have to allow for resetting options at
2180     the start of any alternations, which we do by setting
2181     branch_newextra to 2. */
2182    
2183     case ':':
2184     if (((set|unset) & PCRE_IMS) != 0)
2185     {
2186     length += 4;
2187     branch_newextra = 2;
2188     }
2189     goto END_OPTIONS;
2190    
2191     /* Unrecognized option character */
2192    
2193     default:
2194     *errorptr = ERR12;
2195     goto PCRE_ERROR_RETURN;
2196     }
2197 nigel 3 }
2198 nigel 23
2199     /* If we hit a closing bracket, that's it - this is a freestanding
2200     option-setting. We need to ensure that branch_extra is updated if
2201     necessary. The only values branch_newextra can have here are 0 or 2.
2202     If the value is 2, then branch_extra must either be 2 or 5, depending
2203     on whether this is a lookbehind group or not. */
2204    
2205     END_OPTIONS:
2206     if (c == ')')
2207 nigel 19 {
2208 nigel 23 if (branch_newextra == 2 && (branch_extra == 0 || branch_extra == 3))
2209     branch_extra += branch_newextra;
2210 nigel 19 continue;
2211     }
2212 nigel 3
2213 nigel 23 /* If options were terminated by ':' control comes here. Fall through
2214     to handle the group below. */
2215 nigel 3 }
2216     }
2217    
2218     /* Extracting brackets must be counted so we can process escapes in a
2219     Perlish way. */
2220    
2221     else bracount++;
2222    
2223     /* Non-special forms of bracket. Save length for computing whole length
2224 nigel 23 at end if there's a repeat that requires duplication of the group. Also
2225     save the current value of branch_extra, and start the new group with
2226     the new value. If non-zero, this will either be 2 for a (?imsx: group, or 3
2227     for a lookbehind assertion. */
2228 nigel 3
2229     if (brastackptr >= sizeof(brastack)/sizeof(int))
2230     {
2231     *errorptr = ERR19;
2232     goto PCRE_ERROR_RETURN;
2233     }
2234    
2235 nigel 23 bralenstack[brastackptr] = branch_extra;
2236     branch_extra = branch_newextra;
2237    
2238 nigel 3 brastack[brastackptr++] = length;
2239     length += 3;
2240     continue;
2241    
2242     /* Handle ket. Look for subsequent max/min; for certain sets of values we
2243 nigel 9 have to replicate this bracket up to that many times. If brastackptr is
2244     0 this is an unmatched bracket which will generate an error, but take care
2245 nigel 23 not to try to access brastack[-1] when computing the length and restoring
2246     the branch_extra value. */
2247 nigel 3
2248     case ')':
2249     length += 3;
2250     {
2251 nigel 9 int minval = 1;
2252     int maxval = 1;
2253 nigel 23 int duplength;
2254 nigel 3
2255 nigel 23 if (brastackptr > 0)
2256     {
2257     duplength = length - brastack[--brastackptr];
2258     branch_extra = bralenstack[brastackptr];
2259     }
2260     else duplength = 0;
2261    
2262 nigel 3 /* Leave ptr at the final char; for read_repeat_counts this happens
2263     automatically; for the others we need an increment. */
2264    
2265 nigel 25 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
2266 nigel 3 {
2267 nigel 25 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr,
2268     &compile_block);
2269 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2270     }
2271 nigel 9 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2272     else if (c == '+') { maxval = -1; ptr++; }
2273     else if (c == '?') { minval = 0; ptr++; }
2274 nigel 3
2275 nigel 9 /* If there is a minimum > 1 we have to replicate up to minval-1 times;
2276     if there is a limited maximum we have to replicate up to maxval-1 times
2277     and allow for a BRAZERO item before each optional copy, as we also have
2278     to do before the first copy if the minimum is zero. */
2279 nigel 3
2280 nigel 9 if (minval == 0) length++;
2281     else if (minval > 1) length += (minval - 1) * duplength;
2282     if (maxval > minval) length += (maxval - minval) * (duplength + 1);
2283 nigel 3 }
2284     continue;
2285    
2286     /* Non-special character. For a run of such characters the length required
2287     is the number of characters + 2, except that the maximum run length is 255.
2288     We won't get a skipped space or a non-data escape or the start of a #
2289     comment as the first character, so the length can't be zero. */
2290    
2291     NORMAL_CHAR:
2292     default:
2293     length += 2;
2294     runlength = 0;
2295     do
2296     {
2297 nigel 23 if ((options & PCRE_EXTENDED) != 0)
2298 nigel 3 {
2299 nigel 25 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2300 nigel 23 if (c == '#')
2301     {
2302     while ((c = *(++ptr)) != 0 && c != '\n');
2303     continue;
2304     }
2305 nigel 3 }
2306    
2307     /* Backslash may introduce a data char or a metacharacter; stop the
2308     string before the latter. */
2309    
2310     if (c == '\\')
2311     {
2312 nigel 7 const uschar *saveptr = ptr;
2313 nigel 25 c = check_escape(&ptr, errorptr, bracount, options, FALSE,
2314     &compile_block);
2315 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2316     if (c < 0) { ptr = saveptr; break; }
2317     }
2318    
2319     /* Ordinary character or single-char escape */
2320    
2321     runlength++;
2322     }
2323    
2324     /* This "while" is the end of the "do" above. */
2325    
2326 nigel 25 while (runlength < 255 &&
2327     (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
2328 nigel 3
2329     ptr--;
2330     length += runlength;
2331     continue;
2332     }
2333     }
2334    
2335     length += 4; /* For final KET and END */
2336    
2337     if (length > 65539)
2338     {
2339     *errorptr = ERR20;
2340     return NULL;
2341     }
2342    
2343     /* Compute the size of data block needed and get it, either from malloc or
2344 nigel 9 externally provided function. We specify "code[0]" in the offsetof() expression
2345     rather than just "code", because it has been reported that one broken compiler
2346     fails on "code" because it is also an independent variable. It should make no
2347     difference to the value of the offsetof(). */
2348 nigel 3
2349 nigel 9 size = length + offsetof(real_pcre, code[0]);
2350 nigel 3 re = (real_pcre *)(pcre_malloc)(size);
2351    
2352     if (re == NULL)
2353     {
2354     *errorptr = ERR21;
2355     return NULL;
2356     }
2357    
2358 nigel 9 /* Put in the magic number and the options. */
2359    
2360 nigel 3 re->magic_number = MAGIC_NUMBER;
2361     re->options = options;
2362 nigel 25 re->tables = tables;
2363 nigel 3
2364     /* Set up a starting, non-extracting bracket, then compile the expression. On
2365     error, *errorptr will be set non-NULL, so we don't need to look at the result
2366     of the function here. */
2367    
2368 nigel 7 ptr = (const uschar *)pattern;
2369 nigel 3 code = re->code;
2370     *code = OP_BRA;
2371     bracount = 0;
2372 nigel 25 (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,
2373     &compile_block);
2374 nigel 3 re->top_bracket = bracount;
2375     re->top_backref = top_backref;
2376    
2377     /* If not reached end of pattern on success, there's an excess bracket. */
2378    
2379     if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
2380    
2381     /* Fill in the terminating state and check for disastrous overflow, but
2382     if debugging, leave the test till after things are printed out. */
2383    
2384     *code++ = OP_END;
2385    
2386     #ifndef DEBUG
2387     if (code - re->code > length) *errorptr = ERR23;
2388     #endif
2389    
2390 nigel 23 /* Give an error if there's back reference to a non-existent capturing
2391     subpattern. */
2392    
2393     if (top_backref > re->top_bracket) *errorptr = ERR15;
2394    
2395 nigel 3 /* Failed to compile */
2396    
2397     if (*errorptr != NULL)
2398     {
2399     (pcre_free)(re);
2400     PCRE_ERROR_RETURN:
2401 nigel 7 *erroroffset = ptr - (const uschar *)pattern;
2402 nigel 3 return NULL;
2403     }
2404    
2405     /* If the anchored option was not passed, set flag if we can determine that it
2406     is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
2407     we can determine what the first character has to be, because that speeds up
2408     unanchored matches no end. In the case of multiline matches, an alternative is
2409     to set the PCRE_STARTLINE flag if all branches start with ^. */
2410    
2411     if ((options & PCRE_ANCHORED) == 0)
2412     {
2413 nigel 23 int temp_options = options;
2414     if (is_anchored(re->code, &temp_options))
2415 nigel 3 re->options |= PCRE_ANCHORED;
2416     else
2417     {
2418 nigel 23 int ch = find_firstchar(re->code, &temp_options);
2419 nigel 9 if (ch >= 0)
2420 nigel 3 {
2421 nigel 9 re->first_char = ch;
2422 nigel 3 re->options |= PCRE_FIRSTSET;
2423     }
2424     else if (is_startline(re->code))
2425     re->options |= PCRE_STARTLINE;
2426     }
2427     }
2428    
2429     /* Print out the compiled data for debugging */
2430    
2431     #ifdef DEBUG
2432    
2433 nigel 23 printf("Length = %d top_bracket = %d top_backref = %d\n",
2434 nigel 3 length, re->top_bracket, re->top_backref);
2435    
2436     if (re->options != 0)
2437     {
2438 nigel 19 printf("%s%s%s%s%s%s%s%s\n",
2439 nigel 3 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2440     ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2441     ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2442     ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2443     ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
2444     ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
2445 nigel 19 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
2446     ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
2447 nigel 3 }
2448    
2449     if ((re->options & PCRE_FIRSTSET) != 0)
2450     {
2451     if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
2452     else printf("First char = \\x%02x\n", re->first_char);
2453     }
2454    
2455     code_end = code;
2456     code_base = code = re->code;
2457    
2458     while (code < code_end)
2459     {
2460     int charlength;
2461    
2462     printf("%3d ", code - code_base);
2463    
2464     if (*code >= OP_BRA)
2465     {
2466     printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
2467     code += 2;
2468     }
2469    
2470     else switch(*code)
2471     {
2472 nigel 23 case OP_OPT:
2473     printf(" %.2x %s", code[1], OP_names[*code]);
2474     code++;
2475     break;
2476    
2477     case OP_COND:
2478     printf("%3d Cond", (code[1] << 8) + code[2]);
2479     code += 2;
2480     break;
2481    
2482     case OP_CREF:
2483     printf(" %.2d %s", code[1], OP_names[*code]);
2484     code++;
2485     break;
2486    
2487 nigel 3 case OP_CHARS:
2488     charlength = *(++code);
2489     printf("%3d ", charlength);
2490     while (charlength-- > 0)
2491     if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2492     break;
2493    
2494     case OP_KETRMAX:
2495     case OP_KETRMIN:
2496     case OP_ALT:
2497     case OP_KET:
2498     case OP_ASSERT:
2499     case OP_ASSERT_NOT:
2500 nigel 23 case OP_ASSERTBACK:
2501     case OP_ASSERTBACK_NOT:
2502 nigel 3 case OP_ONCE:
2503     printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2504     code += 2;
2505     break;
2506    
2507 nigel 23 case OP_REVERSE:
2508     printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2509     code += 2;
2510     break;
2511    
2512 nigel 3 case OP_STAR:
2513     case OP_MINSTAR:
2514     case OP_PLUS:
2515     case OP_MINPLUS:
2516     case OP_QUERY:
2517     case OP_MINQUERY:
2518     case OP_TYPESTAR:
2519     case OP_TYPEMINSTAR:
2520     case OP_TYPEPLUS:
2521     case OP_TYPEMINPLUS:
2522     case OP_TYPEQUERY:
2523     case OP_TYPEMINQUERY:
2524     if (*code >= OP_TYPESTAR)
2525     printf(" %s", OP_names[code[1]]);
2526     else if (isprint(c = code[1])) printf(" %c", c);
2527     else printf(" \\x%02x", c);
2528     printf("%s", OP_names[*code++]);
2529     break;
2530    
2531     case OP_EXACT:
2532     case OP_UPTO:
2533     case OP_MINUPTO:
2534     if (isprint(c = code[3])) printf(" %c{", c);
2535     else printf(" \\x%02x{", c);
2536 nigel 11 if (*code != OP_EXACT) printf("0,");
2537 nigel 3 printf("%d}", (code[1] << 8) + code[2]);
2538     if (*code == OP_MINUPTO) printf("?");
2539     code += 3;
2540     break;
2541    
2542     case OP_TYPEEXACT:
2543     case OP_TYPEUPTO:
2544     case OP_TYPEMINUPTO:
2545     printf(" %s{", OP_names[code[3]]);
2546     if (*code != OP_TYPEEXACT) printf(",");
2547     printf("%d}", (code[1] << 8) + code[2]);
2548     if (*code == OP_TYPEMINUPTO) printf("?");
2549     code += 3;
2550     break;
2551    
2552     case OP_NOT:
2553     if (isprint(c = *(++code))) printf(" [^%c]", c);
2554     else printf(" [^\\x%02x]", c);
2555     break;
2556    
2557     case OP_NOTSTAR:
2558     case OP_NOTMINSTAR:
2559     case OP_NOTPLUS:
2560     case OP_NOTMINPLUS:
2561     case OP_NOTQUERY:
2562     case OP_NOTMINQUERY:
2563     if (isprint(c = code[1])) printf(" [^%c]", c);
2564     else printf(" [^\\x%02x]", c);
2565     printf("%s", OP_names[*code++]);
2566     break;
2567    
2568     case OP_NOTEXACT:
2569     case OP_NOTUPTO:
2570     case OP_NOTMINUPTO:
2571     if (isprint(c = code[3])) printf(" [^%c]{", c);
2572     else printf(" [^\\x%02x]{", c);
2573     if (*code != OP_NOTEXACT) printf(",");
2574     printf("%d}", (code[1] << 8) + code[2]);
2575     if (*code == OP_NOTMINUPTO) printf("?");
2576     code += 3;
2577     break;
2578    
2579     case OP_REF:
2580     printf(" \\%d", *(++code));
2581 nigel 9 code ++;
2582     goto CLASS_REF_REPEAT;
2583 nigel 3
2584     case OP_CLASS:
2585     {
2586     int i, min, max;
2587 nigel 23 code++;
2588     printf(" [");
2589 nigel 3
2590     for (i = 0; i < 256; i++)
2591     {
2592     if ((code[i/8] & (1 << (i&7))) != 0)
2593     {
2594     int j;
2595     for (j = i+1; j < 256; j++)
2596     if ((code[j/8] & (1 << (j&7))) == 0) break;
2597     if (i == '-' || i == ']') printf("\\");
2598     if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2599     if (--j > i)
2600     {
2601     printf("-");
2602     if (j == '-' || j == ']') printf("\\");
2603     if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2604     }
2605     i = j;
2606     }
2607     }
2608     printf("]");
2609     code += 32;
2610    
2611 nigel 9 CLASS_REF_REPEAT:
2612    
2613 nigel 3 switch(*code)
2614     {
2615     case OP_CRSTAR:
2616     case OP_CRMINSTAR:
2617     case OP_CRPLUS:
2618     case OP_CRMINPLUS:
2619     case OP_CRQUERY:
2620     case OP_CRMINQUERY:
2621     printf("%s", OP_names[*code]);
2622     break;
2623    
2624     case OP_CRRANGE:
2625     case OP_CRMINRANGE:
2626     min = (code[1] << 8) + code[2];
2627     max = (code[3] << 8) + code[4];
2628     if (max == 0) printf("{%d,}", min);
2629     else printf("{%d,%d}", min, max);
2630     if (*code == OP_CRMINRANGE) printf("?");
2631     code += 4;
2632     break;
2633    
2634     default:
2635     code--;
2636     }
2637     }
2638     break;
2639    
2640     /* Anything else is just a one-node item */
2641    
2642     default:
2643     printf(" %s", OP_names[*code]);
2644     break;
2645     }
2646    
2647     code++;
2648     printf("\n");
2649     }
2650     printf("------------------------------------------------------------------\n");
2651    
2652     /* This check is done here in the debugging case so that the code that
2653     was compiled can be seen. */
2654    
2655     if (code - re->code > length)
2656     {
2657     *errorptr = ERR23;
2658     (pcre_free)(re);
2659     *erroroffset = ptr - (uschar *)pattern;
2660     return NULL;
2661     }
2662     #endif
2663    
2664     return (pcre *)re;
2665     }
2666    
2667    
2668    
2669     /*************************************************
2670     * Match a back-reference *
2671     *************************************************/
2672    
2673 nigel 23 /* If a back reference hasn't been set, the length that is passed is greater
2674     than the number of characters left in the string, so the match fails.
2675 nigel 3
2676     Arguments:
2677 nigel 23 offset index into the offset vector
2678 nigel 3 eptr points into the subject
2679     length length to be matched
2680     md points to match data block
2681 nigel 23 ims the ims flags
2682 nigel 3
2683     Returns: TRUE if matched
2684     */
2685    
2686     static BOOL
2687 nigel 23 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
2688     int ims)
2689 nigel 3 {
2690 nigel 23 const uschar *p = md->start_subject + md->offset_vector[offset];
2691 nigel 3
2692     #ifdef DEBUG
2693     if (eptr >= md->end_subject)
2694     printf("matching subject <null>");
2695     else
2696     {
2697     printf("matching subject ");
2698     pchars(eptr, length, TRUE, md);
2699     }
2700     printf(" against backref ");
2701     pchars(p, length, FALSE, md);
2702     printf("\n");
2703     #endif
2704    
2705     /* Always fail if not enough characters left */
2706    
2707 nigel 23 if (length > md->end_subject - eptr) return FALSE;
2708 nigel 3
2709     /* Separate the caselesss case for speed */
2710    
2711 nigel 23 if ((ims & PCRE_CASELESS) != 0)
2712 nigel 25 {
2713     while (length-- > 0)
2714     if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
2715     }
2716 nigel 3 else
2717     { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
2718    
2719     return TRUE;
2720     }
2721    
2722    
2723    
2724     /*************************************************
2725     * Match from current position *
2726     *************************************************/
2727    
2728 nigel 23 /* On entry ecode points to the first opcode, and eptr to the first character
2729     in the subject string, while eptrb holds the value of eptr at the start of the
2730     last bracketed group - used for breaking infinite loops matching zero-length
2731     strings.
2732 nigel 3
2733     Arguments:
2734     eptr pointer in subject
2735     ecode position in code
2736     offset_top current top pointer
2737     md pointer to "static" info for the match
2738 nigel 23 ims current /i, /m, and /s options
2739     condassert TRUE if called to check a condition assertion
2740     eptrb eptr at start of last bracket
2741 nigel 3
2742     Returns: TRUE if matched
2743     */
2744    
2745     static BOOL
2746 nigel 23 match(register const uschar *eptr, register const uschar *ecode,
2747     int offset_top, match_data *md, int ims, BOOL condassert, const uschar *eptrb)
2748 nigel 3 {
2749 nigel 23 int original_ims = ims; /* Save for resetting on ')' */
2750    
2751 nigel 3 for (;;)
2752     {
2753 nigel 23 int op = (int)*ecode;
2754 nigel 3 int min, max, ctype;
2755     register int i;
2756     register int c;
2757 nigel 7 BOOL minimize = FALSE;
2758 nigel 3
2759 nigel 23 /* Opening capturing bracket. If there is space in the offset vector, save
2760     the current subject position in the working slot at the top of the vector. We
2761     mustn't change the current values of the data slot, because they may be set
2762     from a previous iteration of this group, and be referred to by a reference
2763     inside the group.
2764 nigel 3
2765 nigel 23 If the bracket fails to match, we need to restore this value and also the
2766     values of the final offsets, in case they were set by a previous iteration of
2767     the same bracket.
2768    
2769     If there isn't enough space in the offset vector, treat this as if it were a
2770     non-capturing bracket. Don't worry about setting the flag for the error case
2771     here; that is handled in the code for KET. */
2772    
2773     if (op > OP_BRA)
2774 nigel 3 {
2775 nigel 23 int number = op - OP_BRA;
2776     int offset = number << 1;
2777 nigel 3
2778 nigel 23 DPRINTF(("start bracket %d\n", number));
2779 nigel 3
2780 nigel 23 if (offset < md->offset_max)
2781 nigel 3 {
2782 nigel 23 int save_offset1 = md->offset_vector[offset];
2783     int save_offset2 = md->offset_vector[offset+1];
2784     int save_offset3 = md->offset_vector[md->offset_end - number];
2785 nigel 3
2786 nigel 23 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
2787     md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
2788    
2789     do
2790     {
2791     if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
2792     ecode += (ecode[1] << 8) + ecode[2];
2793     }
2794     while (*ecode == OP_ALT);
2795    
2796     DPRINTF(("bracket %d failed\n", number));
2797    
2798     md->offset_vector[offset] = save_offset1;
2799     md->offset_vector[offset+1] = save_offset2;
2800     md->offset_vector[md->offset_end - number] = save_offset3;
2801     return FALSE;
2802 nigel 3 }
2803    
2804 nigel 23 /* Insufficient room for saving captured contents */
2805 nigel 3
2806 nigel 23 else op = OP_BRA;
2807     }
2808    
2809     /* Other types of node can be handled by a switch */
2810    
2811     switch(op)
2812     {
2813     case OP_BRA: /* Non-capturing bracket: optimized */
2814     DPRINTF(("start bracket 0\n"));
2815 nigel 3 do
2816     {
2817 nigel 23 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
2818 nigel 3 ecode += (ecode[1] << 8) + ecode[2];
2819     }
2820     while (*ecode == OP_ALT);
2821 nigel 23 DPRINTF(("bracket 0 failed\n"));
2822     return FALSE;
2823 nigel 3
2824 nigel 23 /* Conditional group: compilation checked that there are no more than
2825     two branches. If the condition is false, skipping the first branch takes us
2826     past the end if there is only one branch, but that's OK because that is
2827     exactly what going to the ket would do. */
2828 nigel 3
2829 nigel 23 case OP_COND:
2830     if (ecode[3] == OP_CREF) /* Condition is extraction test */
2831 nigel 3 {
2832 nigel 23 int offset = ecode[4] << 1; /* Doubled reference number */
2833     return match(eptr,
2834     ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
2835     5 : 3 + (ecode[1] << 8) + ecode[2]),
2836     offset_top, md, ims, FALSE, eptr);
2837 nigel 3 }
2838    
2839 nigel 23 /* The condition is an assertion. Call match() to evaluate it - setting
2840     the final argument TRUE causes it to stop at the end of an assertion. */
2841 nigel 3
2842 nigel 23 else
2843     {
2844     if (match(eptr, ecode+3, offset_top, md, ims, TRUE, NULL))
2845     {
2846     ecode += 3 + (ecode[4] << 8) + ecode[5];
2847     while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
2848     }
2849     else ecode += (ecode[1] << 8) + ecode[2];
2850     return match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr);
2851     }
2852     /* Control never reaches here */
2853 nigel 3
2854 nigel 23 /* Skip over conditional reference data if encountered (should not be) */
2855    
2856     case OP_CREF:
2857     ecode += 2;
2858     break;
2859    
2860     /* End of the pattern */
2861    
2862 nigel 3 case OP_END:
2863     md->end_match_ptr = eptr; /* Record where we ended */
2864     md->end_offset_top = offset_top; /* and how many extracts were taken */
2865     return TRUE;
2866    
2867 nigel 23 /* Change option settings */
2868 nigel 3
2869 nigel 23 case OP_OPT:
2870     ims = ecode[1];
2871     ecode += 2;
2872     DPRINTF(("ims set to %02x\n", ims));
2873     break;
2874 nigel 3
2875     /* Assertion brackets. Check the alternative branches in turn - the
2876     matching won't pass the KET for an assertion. If any one branch matches,
2877 nigel 23 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
2878     start of each branch to move the current point backwards, so the code at
2879     this level is identical to the lookahead case. */
2880 nigel 3
2881     case OP_ASSERT:
2882 nigel 23 case OP_ASSERTBACK:
2883 nigel 3 do
2884     {
2885 nigel 23 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) break;
2886 nigel 3 ecode += (ecode[1] << 8) + ecode[2];
2887     }
2888     while (*ecode == OP_ALT);
2889     if (*ecode == OP_KET) return FALSE;
2890    
2891 nigel 23 /* If checking an assertion for a condition, return TRUE. */
2892    
2893     if (condassert) return TRUE;
2894    
2895 nigel 3 /* Continue from after the assertion, updating the offsets high water
2896     mark, since extracts may have been taken during the assertion. */
2897    
2898     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2899     ecode += 3;
2900     offset_top = md->end_offset_top;
2901     continue;
2902    
2903     /* Negative assertion: all branches must fail to match */
2904    
2905     case OP_ASSERT_NOT:
2906 nigel 23 case OP_ASSERTBACK_NOT:
2907 nigel 3 do
2908     {
2909 nigel 23 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) return FALSE;
2910 nigel 3 ecode += (ecode[1] << 8) + ecode[2];
2911     }
2912     while (*ecode == OP_ALT);
2913 nigel 23
2914     if (condassert) return TRUE;
2915 nigel 3 ecode += 3;
2916     continue;
2917    
2918 nigel 23 /* Move the subject pointer back. This occurs only at the start of
2919     each branch of a lookbehind assertion. If we are too close to the start to
2920     move back, this match function fails. */
2921    
2922     case OP_REVERSE:
2923     eptr -= (ecode[1] << 8) + ecode[2];
2924     if (eptr < md->start_subject) return FALSE;
2925     ecode += 3;
2926     break;
2927    
2928    
2929 nigel 3 /* "Once" brackets are like assertion brackets except that after a match,
2930     the point in the subject string is not moved back. Thus there can never be
2931 nigel 5 a move back into the brackets. Check the alternative branches in turn - the
2932 nigel 3 matching won't pass the KET for this kind of subpattern. If any one branch
2933 nigel 23 matches, we carry on as at the end of a normal bracket, leaving the subject
2934     pointer. */
2935 nigel 3
2936     case OP_ONCE:
2937     {
2938 nigel 23 const uschar *prev = ecode;
2939 nigel 3
2940 nigel 23 do
2941     {
2942     if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) break;
2943     ecode += (ecode[1] << 8) + ecode[2];
2944     }
2945     while (*ecode == OP_ALT);
2946 nigel 3
2947 nigel 23 /* If hit the end of the group (which could be repeated), fail */
2948 nigel 3
2949 nigel 23 if (*ecode != OP_ONCE && *ecode != OP_ALT) return FALSE;
2950    
2951     /* Continue as from after the assertion, updating the offsets high water
2952     mark, since extracts may have been taken. */
2953    
2954     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2955    
2956     offset_top = md->end_offset_top;
2957     eptr = md->end_match_ptr;
2958    
2959     /* For a non-repeating ket, just continue at this level. This also
2960     happens for a repeating ket if no characters were matched in the group.
2961     This is the forcible breaking of infinite loops as implemented in Perl
2962     5.005. If there is an options reset, it will get obeyed in the normal
2963     course of events. */
2964    
2965     if (*ecode == OP_KET || eptr == eptrb)
2966     {
2967     ecode += 3;
2968     break;
2969     }
2970    
2971     /* The repeating kets try the rest of the pattern or restart from the
2972     preceding bracket, in the appropriate order. We need to reset any options
2973     that changed within the bracket before re-running it, so check the next
2974     opcode. */
2975    
2976     if (ecode[3] == OP_OPT)
2977     {
2978     ims = (ims & ~PCRE_IMS) | ecode[4];
2979     DPRINTF(("ims set to %02x at group repeat\n", ims));
2980     }
2981    
2982     if (*ecode == OP_KETRMIN)
2983     {
2984     if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||
2985     match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;
2986     }
2987     else /* OP_KETRMAX */
2988     {
2989     if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||
2990     match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
2991     }
2992     }
2993     return FALSE;
2994    
2995 nigel 3 /* An alternation is the end of a branch; scan along to find the end of the
2996     bracketed group and go to there. */
2997    
2998     case OP_ALT:
2999     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3000     break;
3001    
3002     /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3003     that it may occur zero times. It may repeat infinitely, or not at all -
3004     i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3005     repeat limits are compiled as a number of copies, with the optional ones
3006     preceded by BRAZERO or BRAMINZERO. */
3007    
3008     case OP_BRAZERO:
3009     {
3010 nigel 7 const uschar *next = ecode+1;
3011 nigel 23 if (match(eptr, next, offset_top, md, ims, FALSE, eptr)) return TRUE;
3012 nigel 3 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3013     ecode = next + 3;
3014     }
3015     break;
3016    
3017     case OP_BRAMINZERO:
3018     {
3019 nigel 7 const uschar *next = ecode+1;
3020 nigel 3 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3021 nigel 23 if (match(eptr, next+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3022 nigel 3 ecode++;
3023     }
3024 nigel 23 break;
3025 nigel 3
3026     /* End of a group, repeated or non-repeating. If we are at the end of
3027     an assertion "group", stop matching and return TRUE, but record the
3028 nigel 23 current high water mark for use by positive assertions. Do this also
3029     for the "once" (not-backup up) groups. */
3030 nigel 3
3031     case OP_KET:
3032     case OP_KETRMIN:
3033     case OP_KETRMAX:
3034     {
3035 nigel 7 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3036 nigel 3
3037 nigel 23 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
3038     *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
3039     *prev == OP_ONCE)
3040 nigel 3 {
3041     md->end_match_ptr = eptr; /* For ONCE */
3042     md->end_offset_top = offset_top;
3043     return TRUE;
3044     }
3045    
3046 nigel 23 /* In all other cases except a conditional group we have to check the
3047     group number back at the start and if necessary complete handling an
3048     extraction by setting the offsets and bumping the high water mark. */
3049 nigel 3
3050 nigel 23 if (*prev != OP_COND)
3051     {
3052     int number = *prev - OP_BRA;
3053     int offset = number << 1;
3054 nigel 3
3055 nigel 23 DPRINTF(("end bracket %d\n", number));
3056 nigel 3
3057 nigel 23 if (number > 0)
3058 nigel 3 {
3059 nigel 23 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
3060     {
3061     md->offset_vector[offset] =
3062     md->offset_vector[md->offset_end - number];
3063     md->offset_vector[offset+1] = eptr - md->start_subject;
3064     if (offset_top <= offset) offset_top = offset + 2;
3065     }
3066 nigel 3 }
3067     }
3068    
3069 nigel 23 /* Reset the value of the ims flags, in case they got changed during
3070     the group. */
3071 nigel 3
3072 nigel 23 ims = original_ims;
3073     DPRINTF(("ims reset to %02x\n", ims));
3074    
3075     /* For a non-repeating ket, just continue at this level. This also
3076     happens for a repeating ket if no characters were matched in the group.
3077     This is the forcible breaking of infinite loops as implemented in Perl
3078     5.005. If there is an options reset, it will get obeyed in the normal
3079     course of events. */
3080    
3081     if (*ecode == OP_KET || eptr == eptrb)
3082 nigel 3 {
3083     ecode += 3;
3084     break;
3085     }
3086    
3087     /* The repeating kets try the rest of the pattern or restart from the
3088     preceding bracket, in the appropriate order. */
3089    
3090     if (*ecode == OP_KETRMIN)
3091     {
3092 nigel 23 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||
3093     match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;
3094 nigel 3 }
3095     else /* OP_KETRMAX */
3096     {
3097 nigel 23 if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||
3098     match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3099 nigel 3 }
3100     }
3101     return FALSE;
3102    
3103     /* Start of subject unless notbol, or after internal newline if multiline */
3104    
3105     case OP_CIRC:
3106     if (md->notbol && eptr == md->start_subject) return FALSE;
3107 nigel 23 if ((ims & PCRE_MULTILINE) != 0)
3108 nigel 3 {
3109     if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;
3110     ecode++;
3111     break;
3112     }
3113     /* ... else fall through */
3114    
3115     /* Start of subject assertion */
3116    
3117     case OP_SOD:
3118     if (eptr != md->start_subject) return FALSE;
3119     ecode++;
3120     break;
3121    
3122 nigel 23 /* Assert before internal newline if multiline, or before a terminating
3123     newline unless endonly is set, else end of subject unless noteol is set. */
3124 nigel 3
3125     case OP_DOLL:
3126 nigel 23 if ((ims & PCRE_MULTILINE) != 0)
3127 nigel 3 {
3128 nigel 23 if (eptr < md->end_subject) { if (*eptr != '\n') return FALSE; }
3129     else { if (md->noteol) return FALSE; }
3130 nigel 3 ecode++;
3131     break;
3132     }
3133 nigel 23 else
3134 nigel 3 {
3135 nigel 23 if (md->noteol) return FALSE;
3136     if (!md->endonly)
3137     {
3138     if (eptr < md->end_subject - 1 ||
3139     (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3140    
3141     ecode++;
3142     break;
3143     }
3144 nigel 3 }
3145     /* ... else fall through */
3146    
3147 nigel 23 /* End of subject assertion (\z) */
3148 nigel 3
3149     case OP_EOD:
3150     if (eptr < md->end_subject) return FALSE;
3151     ecode++;
3152     break;
3153    
3154 nigel 23 /* End of subject or ending \n assertion (\Z) */
3155    
3156     case OP_EODN:
3157     if (eptr < md->end_subject - 1 ||
3158     (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3159     ecode++;
3160     break;
3161    
3162 nigel 3 /* Word boundary assertions */
3163    
3164     case OP_NOT_WORD_BOUNDARY:
3165     case OP_WORD_BOUNDARY:
3166     {
3167     BOOL prev_is_word = (eptr != md->start_subject) &&
3168 nigel 25 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
3169 nigel 3 BOOL cur_is_word = (eptr < md->end_subject) &&
3170 nigel 25 ((md->ctypes[*eptr] & ctype_word) != 0);
3171 nigel 3 if ((*ecode++ == OP_WORD_BOUNDARY)?
3172     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3173     return FALSE;
3174     }
3175     break;
3176    
3177     /* Match a single character type; inline for speed */
3178    
3179     case OP_ANY:
3180 nigel 23 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == '\n')
3181     return FALSE;
3182 nigel 3 if (eptr++ >= md->end_subject) return FALSE;
3183     ecode++;
3184     break;
3185    
3186     case OP_NOT_DIGIT:
3187 nigel 25 if (eptr >= md->end_subject ||
3188     (md->ctypes[*eptr++] & ctype_digit) != 0)
3189 nigel 3 return FALSE;
3190     ecode++;
3191     break;
3192    
3193     case OP_DIGIT:
3194 nigel 25 if (eptr >= md->end_subject ||
3195     (md->ctypes[*eptr++] & ctype_digit) == 0)
3196 nigel 3 return FALSE;
3197     ecode++;
3198     break;
3199    
3200     case OP_NOT_WHITESPACE:
3201 nigel 25 if (eptr >= md->end_subject ||
3202     (md->ctypes[*eptr++] & ctype_space) != 0)
3203 nigel 3 return FALSE;
3204     ecode++;
3205     break;
3206    
3207     case OP_WHITESPACE:
3208 nigel 25 if (eptr >= md->end_subject ||
3209     (md->ctypes[*eptr++] & ctype_space) == 0)
3210 nigel 3 return FALSE;
3211     ecode++;
3212     break;
3213    
3214     case OP_NOT_WORDCHAR:
3215 nigel 25 if (eptr >= md->end_subject ||
3216     (md->ctypes[*eptr++] & ctype_word) != 0)
3217 nigel 3 return FALSE;
3218     ecode++;
3219     break;
3220    
3221     case OP_WORDCHAR:
3222 nigel 25 if (eptr >= md->end_subject ||
3223     (md->ctypes[*eptr++] & ctype_word) == 0)
3224 nigel 3 return FALSE;
3225     ecode++;
3226     break;
3227    
3228     /* Match a back reference, possibly repeatedly. Look past the end of the
3229     item to see if there is repeat information following. The code is similar
3230     to that for character classes, but repeated for efficiency. Then obey
3231     similar code to character type repeats - written out again for speed.
3232     However, if the referenced string is the empty string, always treat
3233     it as matched, any number of times (otherwise there could be infinite
3234     loops). */
3235    
3236     case OP_REF:
3237     {
3238     int length;
3239 nigel 23 int offset = ecode[1] << 1; /* Doubled reference number */
3240 nigel 3 ecode += 2; /* Advance past the item */
3241    
3242 nigel 23 /* If the reference is unset, set the length to be longer than the amount
3243     of subject left; this ensures that every attempt at a match fails. We
3244     can't just fail here, because of the possibility of quantifiers with zero
3245     minima. */
3246 nigel 3
3247 nigel 23 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
3248     md->end_subject - eptr + 1 :
3249     md->offset_vector[offset+1] - md->offset_vector[offset];
3250 nigel 3
3251 nigel 23 /* Set up for repetition, or handle the non-repeated case */
3252    
3253 nigel 3 switch (*ecode)
3254     {
3255     case OP_CRSTAR:
3256     case OP_CRMINSTAR:
3257     case OP_CRPLUS:
3258     case OP_CRMINPLUS:
3259     case OP_CRQUERY:
3260     case OP_CRMINQUERY:
3261     c = *ecode++ - OP_CRSTAR;
3262     minimize = (c & 1) != 0;
3263     min = rep_min[c]; /* Pick up values from tables; */
3264     max = rep_max[c]; /* zero for max => infinity */
3265     if (max == 0) max = INT_MAX;
3266     break;
3267    
3268     case OP_CRRANGE:
3269     case OP_CRMINRANGE:
3270     minimize = (*ecode == OP_CRMINRANGE);
3271     min = (ecode[1] << 8) + ecode[2];
3272     max = (ecode[3] << 8) + ecode[4];
3273     if (max == 0) max = INT_MAX;
3274     ecode += 5;
3275     break;
3276    
3277     default: /* No repeat follows */
3278 nigel 23 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
3279 nigel 3 eptr += length;
3280     continue; /* With the main loop */
3281     }
3282    
3283     /* If the length of the reference is zero, just continue with the
3284     main loop. */
3285    
3286     if (length == 0) continue;
3287    
3288     /* First, ensure the minimum number of matches are present. We get back
3289     the length of the reference string explicitly rather than passing the
3290     address of eptr, so that eptr can be a register variable. */
3291    
3292     for (i = 1; i <= min; i++)
3293     {
3294 nigel 23 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
3295 nigel 3 eptr += length;
3296     }
3297    
3298     /* If min = max, continue at the same level without recursion.
3299     They are not both allowed to be zero. */
3300    
3301     if (min == max) continue;
3302    
3303     /* If minimizing, keep trying and advancing the pointer */
3304    
3305     if (minimize)
3306     {
3307     for (i = min;; i++)
3308     {
3309 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3310     return TRUE;
3311     if (i >= max || !match_ref(offset, eptr, length, md, ims))
3312 nigel 3 return FALSE;
3313     eptr += length;
3314     }
3315     /* Control never gets here */
3316     }
3317    
3318     /* If maximizing, find the longest string and work backwards */
3319    
3320     else
3321     {
3322 nigel 7 const uschar *pp = eptr;
3323 nigel 3 for (i = min; i < max; i++)
3324     {
3325 nigel 23 if (!match_ref(offset, eptr, length, md, ims)) break;
3326 nigel 3 eptr += length;
3327     }
3328     while (eptr >= pp)
3329     {
3330 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3331     return TRUE;
3332 nigel 3 eptr -= length;
3333     }
3334     return FALSE;
3335     }
3336     }
3337     /* Control never gets here */
3338    
3339 nigel 23
3340    
3341 nigel 3 /* Match a character class, possibly repeatedly. Look past the end of the
3342     item to see if there is repeat information following. Then obey similar
3343 nigel 23 code to character type repeats - written out again for speed. */
3344 nigel 3
3345     case OP_CLASS:
3346     {
3347 nigel 7 const uschar *data = ecode + 1; /* Save for matching */
3348     ecode += 33; /* Advance past the item */
3349 nigel 3
3350     switch (*ecode)
3351     {
3352     case OP_CRSTAR:
3353     case OP_CRMINSTAR:
3354     case OP_CRPLUS:
3355     case OP_CRMINPLUS:
3356     case OP_CRQUERY:
3357     case OP_CRMINQUERY:
3358     c = *ecode++ - OP_CRSTAR;
3359     minimize = (c & 1) != 0;
3360     min = rep_min[c]; /* Pick up values from tables; */
3361     max = rep_max[c]; /* zero for max => infinity */
3362     if (max == 0) max = INT_MAX;
3363     break;
3364    
3365     case OP_CRRANGE:
3366     case OP_CRMINRANGE:
3367     minimize = (*ecode == OP_CRMINRANGE);
3368     min = (ecode[1] << 8) + ecode[2];
3369     max = (ecode[3] << 8) + ecode[4];
3370     if (max == 0) max = INT_MAX;
3371     ecode += 5;
3372     break;
3373    
3374     default: /* No repeat follows */
3375 nigel 13 min = max = 1;
3376     break;
3377 nigel 3 }
3378    
3379     /* First, ensure the minimum number of matches are present. */
3380    
3381     for (i = 1; i <= min; i++)
3382     {
3383     if (eptr >= md->end_subject) return FALSE;
3384     c = *eptr++;
3385 nigel 23 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3386 nigel 3 return FALSE;
3387     }
3388    
3389     /* If max == min we can continue with the main loop without the
3390     need to recurse. */
3391    
3392     if (min == max) continue;
3393    
3394     /* If minimizing, keep testing the rest of the expression and advancing
3395     the pointer while it matches the class. */
3396    
3397     if (minimize)
3398     {
3399     for (i = min;; i++)
3400     {
3401 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3402     return TRUE;
3403 nigel 3 if (i >= max || eptr >= md->end_subject) return FALSE;
3404     c = *eptr++;
3405 nigel 23 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3406 nigel 3 return FALSE;
3407     }
3408     /* Control never gets here */
3409     }
3410    
3411     /* If maximizing, find the longest possible run, then work backwards. */
3412    
3413     else
3414     {
3415 nigel 7 const uschar *pp = eptr;
3416 nigel 3 for (i = min; i < max; eptr++, i++)
3417     {
3418     if (eptr >= md->end_subject) break;
3419     c = *eptr;
3420 nigel 23 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3421 nigel 3 break;
3422     }
3423    
3424     while (eptr >= pp)
3425 nigel 23 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3426     return TRUE;
3427 nigel 3 return FALSE;
3428     }
3429     }
3430     /* Control never gets here */
3431    
3432     /* Match a run of characters */
3433    
3434     case OP_CHARS:
3435     {
3436     register int length = ecode[1];
3437     ecode += 2;
3438    
3439 nigel 9 #ifdef DEBUG /* Sigh. Some compilers never learn. */
3440 nigel 3 if (eptr >= md->end_subject)
3441     printf("matching subject <null> against pattern ");
3442     else
3443     {
3444     printf("matching subject ");
3445     pchars(eptr, length, TRUE, md);
3446     printf(" against pattern ");
3447     }
3448     pchars(ecode, length, FALSE, md);
3449     printf("\n");
3450 nigel 9 #endif
3451 nigel 3
3452     if (length > md->end_subject - eptr) return FALSE;
3453 nigel 23 if ((ims & PCRE_CASELESS) != 0)
3454 nigel 3 {
3455 nigel 25 while (length-- > 0)
3456     if (md->lcc[*ecode++] != md->lcc[*eptr++])
3457     return FALSE;
3458 nigel 3 }
3459     else
3460     {
3461     while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
3462     }
3463     }
3464     break;
3465    
3466     /* Match a single character repeatedly; different opcodes share code. */
3467    
3468     case OP_EXACT:
3469     min = max = (ecode[1] << 8) + ecode[2];
3470     ecode += 3;
3471     goto REPEATCHAR;
3472    
3473     case OP_UPTO:
3474     case OP_MINUPTO:
3475     min = 0;
3476     max = (ecode[1] << 8) + ecode[2];
3477     minimize = *ecode == OP_MINUPTO;
3478     ecode += 3;
3479     goto REPEATCHAR;
3480    
3481     case OP_STAR:
3482     case OP_MINSTAR:
3483     case OP_PLUS:
3484     case OP_MINPLUS:
3485     case OP_QUERY:
3486     case OP_MINQUERY:
3487     c = *ecode++ - OP_STAR;
3488     minimize = (c & 1) != 0;
3489     min = rep_min[c]; /* Pick up values from tables; */
3490     max = rep_max[c]; /* zero for max => infinity */
3491     if (max == 0) max = INT_MAX;
3492    
3493     /* Common code for all repeated single-character matches. We can give
3494     up quickly if there are fewer than the minimum number of characters left in
3495     the subject. */
3496    
3497     REPEATCHAR:
3498     if (min > md->end_subject - eptr) return FALSE;
3499     c = *ecode++;
3500    
3501     /* The code is duplicated for the caseless and caseful cases, for speed,
3502     since matching characters is likely to be quite common. First, ensure the
3503     minimum number of matches are present. If min = max, continue at the same
3504     level without recursing. Otherwise, if minimizing, keep trying the rest of
3505     the expression and advancing one matching character if failing, up to the
3506     maximum. Alternatively, if maximizing, find the maximum number of
3507     characters and work backwards. */
3508    
3509 nigel 9 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
3510     max, eptr));
3511 nigel 3
3512 nigel 23 if ((ims & PCRE_CASELESS) != 0)
3513 nigel 3 {
3514 nigel 25 c = md->lcc[c];
3515     for (i = 1; i <= min; i++)
3516     if (c != md->lcc[*eptr++]) return FALSE;
3517 nigel 3 if (min == max) continue;
3518     if (minimize)
3519     {
3520     for (i = min;; i++)
3521     {
3522 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3523     return TRUE;
3524 nigel 25 if (i >= max || eptr >= md->end_subject ||
3525     c != md->lcc[*eptr++])
3526 nigel 3 return FALSE;
3527     }
3528     /* Control never gets here */
3529     }
3530     else
3531     {
3532 nigel 7 const uschar *pp = eptr;
3533 nigel 3 for (i = min; i < max; i++)
3534     {
3535 nigel 25 if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
3536 nigel 3 eptr++;
3537     }
3538     while (eptr >= pp)
3539 nigel 23 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3540     return TRUE;
3541 nigel 3 return FALSE;
3542     }
3543     /* Control never gets here */
3544     }
3545    
3546     /* Caseful comparisons */
3547    
3548     else
3549     {
3550     for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
3551     if (min == max) continue;
3552     if (minimize)
3553     {
3554     for (i = min;; i++)
3555     {
3556 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3557     return TRUE;
3558 nigel 3 if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
3559     }
3560     /* Control never gets here */
3561     }
3562     else
3563     {
3564 nigel 7 const uschar *pp = eptr;
3565 nigel 3 for (i = min; i < max; i++)
3566     {
3567     if (eptr >= md->end_subject || c != *eptr) break;
3568     eptr++;
3569     }
3570     while (eptr >= pp)
3571 nigel 23 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3572     return TRUE;
3573 nigel 3 return FALSE;
3574     }
3575     }
3576     /* Control never gets here */
3577    
3578     /* Match a negated single character */
3579    
3580     case OP_NOT:
3581 nigel 9 if (eptr >= md->end_subject) return FALSE;
3582 nigel 3 ecode++;
3583 nigel 23 if ((ims & PCRE_CASELESS) != 0)
3584 nigel 3 {
3585 nigel 25 if (md->lcc[*ecode++] == md->lcc[*eptr++]) return FALSE;
3586 nigel 3 }
3587     else
3588     {
3589     if (*ecode++ == *eptr++) return FALSE;
3590     }
3591     break;
3592    
3593     /* Match a negated single character repeatedly. This is almost a repeat of
3594     the code for a repeated single character, but I haven't found a nice way of
3595     commoning these up that doesn't require a test of the positive/negative
3596     option for each character match. Maybe that wouldn't add very much to the
3597     time taken, but character matching *is* what this is all about... */
3598    
3599     case OP_NOTEXACT:
3600     min = max = (ecode[1] << 8) + ecode[2];
3601     ecode += 3;
3602     goto REPEATNOTCHAR;
3603    
3604     case OP_NOTUPTO:
3605     case OP_NOTMINUPTO:
3606     min = 0;
3607     max = (ecode[1] << 8) + ecode[2];
3608     minimize = *ecode == OP_NOTMINUPTO;
3609     ecode += 3;
3610     goto REPEATNOTCHAR;
3611    
3612     case OP_NOTSTAR:
3613     case OP_NOTMINSTAR:
3614     case OP_NOTPLUS:
3615     case OP_NOTMINPLUS:
3616     case OP_NOTQUERY:
3617     case OP_NOTMINQUERY:
3618     c = *ecode++ - OP_NOTSTAR;
3619     minimize = (c & 1) != 0;
3620     min = rep_min[c]; /* Pick up values from tables; */
3621     max = rep_max[c]; /* zero for max => infinity */
3622     if (max == 0) max = INT_MAX;
3623    
3624     /* Common code for all repeated single-character matches. We can give
3625     up quickly if there are fewer than the minimum number of characters left in
3626     the subject. */
3627    
3628     REPEATNOTCHAR:
3629     if (min > md->end_subject - eptr) return FALSE;
3630     c = *ecode++;
3631    
3632     /* The code is duplicated for the caseless and caseful cases, for speed,
3633     since matching characters is likely to be quite common. First, ensure the
3634     minimum number of matches are present. If min = max, continue at the same
3635     level without recursing. Otherwise, if minimizing, keep trying the rest of
3636     the expression and advancing one matching character if failing, up to the
3637     maximum. Alternatively, if maximizing, find the maximum number of
3638     characters and work backwards. */
3639    
3640 nigel 9 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
3641     max, eptr));
3642 nigel 3
3643 nigel 23 if ((ims & PCRE_CASELESS) != 0)
3644 nigel 3 {
3645 nigel 25 c = md->lcc[c];
3646     for (i = 1; i <= min; i++)
3647     if (c == md->lcc[*eptr++]) return FALSE;
3648 nigel 3 if (min == max) continue;
3649     if (minimize)
3650     {
3651     for (i = min;; i++)
3652     {
3653 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3654     return TRUE;
3655 nigel 25 if (i >= max || eptr >= md->end_subject ||
3656     c == md->lcc[*eptr++])
3657 nigel 3 return FALSE;
3658     }
3659     /* Control never gets here */
3660     }
3661     else
3662     {
3663 nigel 7 const uschar *pp = eptr;
3664 nigel 3 for (i = min; i < max; i++)
3665     {
3666 nigel 25 if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
3667 nigel 3 eptr++;
3668     }
3669     while (eptr >= pp)
3670 nigel 23 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3671     return TRUE;
3672 nigel 3 return FALSE;
3673     }
3674     /* Control never gets here */
3675     }
3676    
3677     /* Caseful comparisons */
3678    
3679     else
3680     {
3681     for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
3682     if (min == max) continue;
3683     if (minimize)
3684     {
3685     for (i = min;; i++)
3686     {
3687 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3688     return TRUE;
3689 nigel 3 if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
3690     }
3691     /* Control never gets here */
3692     }
3693     else
3694     {
3695 nigel 7 const uschar *pp = eptr;
3696 nigel 3 for (i = min; i < max; i++)
3697     {
3698     if (eptr >= md->end_subject || c == *eptr) break;
3699     eptr++;
3700     }
3701     while (eptr >= pp)
3702 nigel 23 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3703     return TRUE;
3704 nigel 3 return FALSE;
3705     }
3706     }
3707     /* Control never gets here */
3708    
3709     /* Match a single character type repeatedly; several different opcodes
3710     share code. This is very similar to the code for single characters, but we
3711     repeat it in the interests of efficiency. */
3712    
3713     case OP_TYPEEXACT:
3714     min = max = (ecode[1] << 8) + ecode[2];
3715     minimize = TRUE;
3716     ecode += 3;
3717     goto REPEATTYPE;
3718    
3719     case OP_TYPEUPTO:
3720     case OP_TYPEMINUPTO:
3721     min = 0;
3722     max = (ecode[1] << 8) + ecode[2];
3723     minimize = *ecode == OP_TYPEMINUPTO;
3724     ecode += 3;
3725     goto REPEATTYPE;
3726    
3727     case OP_TYPESTAR:
3728     case OP_TYPEMINSTAR:
3729     case OP_TYPEPLUS:
3730     case OP_TYPEMINPLUS:
3731     case OP_TYPEQUERY:
3732     case OP_TYPEMINQUERY:
3733     c = *ecode++ - OP_TYPESTAR;
3734     minimize = (c & 1) != 0;
3735     min = rep_min[c]; /* Pick up values from tables; */
3736     max = rep_max[c]; /* zero for max => infinity */
3737     if (max == 0) max = INT_MAX;
3738    
3739     /* Common code for all repeated single character type matches */
3740    
3741     REPEATTYPE:
3742     ctype = *ecode++; /* Code for the character type */
3743    
3744     /* First, ensure the minimum number of matches are present. Use inline
3745     code for maximizing the speed, and do the type test once at the start
3746     (i.e. keep it out of the loop). Also test that there are at least the
3747     minimum number of characters before we start. */
3748    
3749     if (min > md->end_subject - eptr) return FALSE;
3750     if (min > 0) switch(ctype)
3751     {
3752     case OP_ANY:
3753 nigel 23 if ((ims & PCRE_DOTALL) == 0)
3754 nigel 3 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }
3755     else eptr += min;
3756     break;
3757    
3758     case OP_NOT_DIGIT:
3759     for (i = 1; i <= min; i++)
3760 nigel 25 if ((md->ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
3761 nigel 3 break;
3762    
3763     case OP_DIGIT:
3764     for (i = 1; i <= min; i++)
3765 nigel 25 if ((md->ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
3766 nigel 3 break;
3767    
3768     case OP_NOT_WHITESPACE:
3769     for (i = 1; i <= min; i++)
3770 nigel 25 if ((md->ctypes[*eptr++] & ctype_space) != 0) return FALSE;
3771 nigel 3 break;
3772    
3773     case OP_WHITESPACE:
3774     for (i = 1; i <= min; i++)
3775 nigel 25 if ((md->ctypes[*eptr++] & ctype_space) == 0) return FALSE;
3776 nigel 3 break;
3777    
3778     case OP_NOT_WORDCHAR:
3779 nigel 25 for (i = 1; i <= min; i++)
3780     if ((md->ctypes[*eptr++] & ctype_word) != 0)
3781     return FALSE;
3782 nigel 3 break;
3783    
3784     case OP_WORDCHAR:
3785 nigel 25 for (i = 1; i <= min; i++)
3786     if ((md->ctypes[*eptr++] & ctype_word) == 0)
3787     return FALSE;
3788 nigel 3 break;
3789     }
3790    
3791     /* If min = max, continue at the same level without recursing */
3792    
3793     if (min == max) continue;
3794    
3795     /* If minimizing, we have to test the rest of the pattern before each
3796 nigel 25 subsequent match. */
3797 nigel 3
3798     if (minimize)
3799     {
3800     for (i = min;; i++)
3801     {
3802 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb)) return TRUE;
3803 nigel 25 if (i >= max || eptr >= md->end_subject) return FALSE;
3804    
3805     c = *eptr++;
3806     switch(ctype)
3807     {
3808     case OP_ANY:
3809     if ((ims & PCRE_DOTALL) == 0 && c == '\n') return FALSE;
3810     break;
3811    
3812     case OP_NOT_DIGIT:
3813     if ((md->ctypes[c] & ctype_digit) != 0) return FALSE;
3814     break;
3815    
3816     case OP_DIGIT:
3817     if ((md->ctypes[c] & ctype_digit) == 0) return FALSE;
3818     break;
3819    
3820     case OP_NOT_WHITESPACE:
3821     if ((md->ctypes[c] & ctype_space) != 0) return FALSE;
3822     break;
3823    
3824     case OP_WHITESPACE:
3825     if ((md->ctypes[c] & ctype_space) == 0) return FALSE;
3826     break;
3827    
3828     case OP_NOT_WORDCHAR:
3829     if ((md->ctypes[c] & ctype_word) != 0) return FALSE;
3830     break;
3831    
3832     case OP_WORDCHAR:
3833     if ((md->ctypes[c] & ctype_word) == 0) return FALSE;
3834     break;
3835     }
3836 nigel 3 }
3837     /* Control never gets here */
3838     }
3839    
3840     /* If maximizing it is worth using inline code for speed, doing the type
3841     test once at the start (i.e. keep it out of the loop). */
3842    
3843     else
3844     {
3845 nigel 7 const uschar *pp = eptr;
3846 nigel 3 switch(ctype)
3847     {
3848     case OP_ANY:
3849 nigel 23 if ((ims & PCRE_DOTALL) == 0)
3850 nigel 3 {
3851     for (i = min; i < max; i++)
3852     {
3853     if (eptr >= md->end_subject || *eptr == '\n') break;
3854     eptr++;
3855     }
3856     }
3857     else
3858     {
3859     c = max - min;
3860     if (c > md->end_subject - eptr) c = md->end_subject - eptr;
3861     eptr += c;
3862     }
3863     break;
3864    
3865     case OP_NOT_DIGIT:
3866     for (i = min; i < max; i++)
3867     {
3868 nigel 25 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
3869 nigel 3 break;
3870     eptr++;
3871     }
3872     break;
3873    
3874     case OP_DIGIT:
3875     for (i = min; i < max; i++)
3876     {
3877 nigel 25 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
3878 nigel 3 break;
3879     eptr++;
3880     }
3881     break;
3882    
3883     case OP_NOT_WHITESPACE:
3884     for (i = min; i < max; i++)
3885     {
3886 nigel 25 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
3887 nigel 3 break;
3888     eptr++;
3889     }
3890     break;
3891    
3892     case OP_WHITESPACE:
3893     for (i = min; i < max; i++)
3894     {
3895 nigel 25 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
3896 nigel 3 break;
3897     eptr++;
3898     }
3899     break;
3900    
3901     case OP_NOT_WORDCHAR:
3902     for (i = min; i < max; i++)
3903     {
3904 nigel 25 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
3905 nigel 3 break;
3906     eptr++;
3907     }
3908     break;
3909    
3910     case OP_WORDCHAR:
3911     for (i = min; i < max; i++)
3912     {
3913 nigel 25 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
3914 nigel 3 break;
3915     eptr++;
3916     }
3917     break;
3918     }
3919    
3920     while (eptr >= pp)
3921 nigel 23 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3922     return TRUE;
3923 nigel 3 return FALSE;
3924     }
3925     /* Control never gets here */
3926    
3927     /* There's been some horrible disaster. */
3928    
3929     default:
3930 nigel 9 DPRINTF(("Unknown opcode %d\n", *ecode));
3931 nigel 3 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
3932     return FALSE;
3933     }
3934    
3935     /* Do not stick any code in here without much thought; it is assumed
3936     that "continue" in the code above comes out to here to repeat the main
3937     loop. */
3938    
3939     } /* End of main loop */
3940     /* Control never reaches here */
3941     }
3942    
3943    
3944    
3945 nigel 9
3946     /*************************************************
3947 nigel 3 * Execute a Regular Expression *
3948     *************************************************/
3949    
3950     /* This function applies a compiled re to a subject string and picks out
3951     portions of the string if it matches. Two elements in the vector are set for
3952     each substring: the offsets to the start and end of the substring.
3953    
3954     Arguments:
3955     external_re points to the compiled expression
3956     external_extra points to "hints" from pcre_study() or is NULL
3957     subject points to the subject string
3958     length length of subject string (may contain binary zeros)
3959     options option bits
3960     offsets points to a vector of ints to be filled in with offsets
3961     offsetcount the number of elements in the vector
3962    
3963     Returns: > 0 => success; value is the number of elements filled in
3964     = 0 => success, but offsets is not big enough
3965     -1 => failed to match
3966     < -1 => some kind of unexpected problem
3967     */
3968    
3969     int
3970     pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
3971     const char *subject, int length, int options, int *offsets, int offsetcount)
3972     {
3973 nigel 11 int resetcount, ocount;
3974 nigel 3 int first_char = -1;
3975 nigel 23 int ims = 0;
3976 nigel 3 match_data match_block;
3977 nigel 7 const uschar *start_bits = NULL;
3978 nigel 9 const uschar *start_match = (const uschar *)subject;
3979 nigel 7 const uschar *end_subject;
3980     const real_pcre *re = (const real_pcre *)external_re;
3981     const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
3982 nigel 11 BOOL using_temporary_offsets = FALSE;
3983 nigel 3 BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
3984     BOOL startline = (re->options & PCRE_STARTLINE) != 0;
3985    
3986     if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3987    
3988     if (re == NULL || subject == NULL ||
3989     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3990     if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
3991    
3992 nigel 7 match_block.start_subject = (const uschar *)subject;
3993 nigel 3 match_block.end_subject = match_block.start_subject + length;
3994     end_subject = match_block.end_subject;
3995    
3996 nigel 23 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
3997 nigel 3
3998     match_block.notbol = (options & PCRE_NOTBOL) != 0;
3999     match_block.noteol = (options & PCRE_NOTEOL) != 0;
4000    
4001     match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
4002    
4003 nigel 25 match_block.lcc = re->tables + lcc_offset;
4004     match_block.ctypes = re->tables + ctypes_offset;
4005    
4006 nigel 23 /* The ims options can vary during the matching as a result of the presence
4007     of (?ims) items in the pattern. They are kept in a local variable so that
4008     restoring at the exit of a group is easy. */
4009    
4010     ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4011    
4012 nigel 3 /* If the expression has got more back references than the offsets supplied can
4013     hold, we get a temporary bit of working store to use during the matching.
4014 nigel 11 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4015 nigel 23 of 3. */
4016 nigel 3
4017 nigel 23 ocount = offsetcount - (offsetcount % 3);
4018    
4019     if (re->top_backref > 0 && re->top_backref >= ocount/3)
4020 nigel 3 {
4021 nigel 23 ocount = re->top_backref * 3 + 3;
4022 nigel 15 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4023 nigel 3 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4024 nigel 11 using_temporary_offsets = TRUE;
4025 nigel 9 DPRINTF(("Got memory to hold back references\n"));
4026 nigel 3 }
4027     else match_block.offset_vector = offsets;
4028    
4029     match_block.offset_end = ocount;
4030 nigel 23 match_block.offset_max = (2*ocount)/3;
4031 nigel 3 match_block.offset_overflow = FALSE;
4032    
4033     /* Compute the minimum number of offsets that we need to reset each time. Doing
4034     this makes a huge difference to execution time when there aren't many brackets
4035     in the pattern. */
4036    
4037     resetcount = 2 + re->top_bracket * 2;
4038     if (resetcount > offsetcount) resetcount = ocount;
4039    
4040 nigel 27 /* Reset the working variable associated with each extraction. These should
4041     never be used unless previously set, but they get saved and restored, and so we
4042     initialize them to avoid reading uninitialized locations. */
4043    
4044     if (match_block.offset_vector != NULL)
4045     {
4046     register int *iptr = match_block.offset_vector + ocount;
4047     register int *iend = iptr - resetcount/2 + 1;
4048     while (--iptr >= iend) *iptr = -1;
4049     }
4050    
4051 nigel 3 /* Set up the first character to match, if available. The first_char value is
4052     never set for an anchored regular expression, but the anchoring may be forced
4053     at run time, so we have to test for anchoring. The first char may be unset for
4054     an unanchored pattern, of course. If there's no first char and the pattern was
4055 nigel 23 studied, there may be a bitmap of possible first characters. */
4056 nigel 3
4057     if (!anchored)
4058     {
4059     if ((re->options & PCRE_FIRSTSET) != 0)
4060     {
4061     first_char = re->first_char;
4062 nigel 25 if ((ims & PCRE_CASELESS) != 0) first_char = match_block.lcc[first_char];
4063 nigel 3 }
4064     else
4065     if (!startline && extra != NULL &&
4066 nigel 23 (extra->options & PCRE_STUDY_MAPPED) != 0)
4067 nigel 3 start_bits = extra->start_bits;
4068     }
4069    
4070     /* Loop for unanchored matches; for anchored regexps the loop runs just once. */
4071    
4072     do
4073     {
4074 nigel 9 int rc;
4075 nigel 3 register int *iptr = match_block.offset_vector;
4076     register int *iend = iptr + resetcount;
4077    
4078     /* Reset the maximum number of extractions we might see. */
4079    
4080     while (iptr < iend) *iptr++ = -1;
4081    
4082     /* Advance to a unique first char if possible */
4083    
4084     if (first_char >= 0)
4085     {
4086 nigel 23 if ((ims & PCRE_CASELESS) != 0)
4087 nigel 25 while (start_match < end_subject &&
4088     match_block.lcc[*start_match] != first_char)
4089 nigel 3 start_match++;
4090     else
4091     while (start_match < end_subject && *start_match != first_char)
4092     start_match++;
4093     }
4094    
4095     /* Or to just after \n for a multiline match if possible */
4096    
4097     else if (startline)
4098     {
4099     if (start_match > match_block.start_subject)
4100     {
4101     while (start_match < end_subject && start_match[-1] != '\n')
4102     start_match++;
4103     }
4104     }
4105    
4106     /* Or to a non-unique first char */
4107    
4108     else if (start_bits != NULL)
4109     {
4110     while (start_match < end_subject)
4111     {
4112     register int c = *start_match;
4113     if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
4114     }
4115     }
4116    
4117 nigel 9 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4118 nigel 3 printf(">>>> Match against: ");
4119     pchars(start_match, end_subject - start_match, TRUE, &match_block);
4120     printf("\n");
4121 nigel 9 #endif
4122 nigel 3
4123     /* When a match occurs, substrings will be set for all internal extractions;
4124     we just need to set up the whole thing as substring 0 before returning. If
4125     there were too many extractions, set the return code to zero. In the case
4126     where we had to get some local store to hold offsets for backreferences, copy
4127     those back references that we can. In this case there need not be overflow
4128 nigel 23 if certain parts of the pattern were not used. */
4129 nigel 3
4130 nigel 23 if (!match(start_match, re->code, 2, &match_block, ims, FALSE, start_match))
4131     continue;
4132 nigel 3
4133 nigel 9 /* Copy the offset information from temporary store if necessary */
4134    
4135 nigel 11 if (using_temporary_offsets)
4136 nigel 3 {
4137 nigel 9 if (offsetcount >= 4)
4138 nigel 3 {
4139 nigel 9 memcpy(offsets + 2, match_block.offset_vector + 2,
4140     (offsetcount - 2) * sizeof(int));
4141 nigel 11 DPRINTF(("Copied offsets from temporary memory\n"));
4142 nigel 3 }
4143 nigel 9 if (match_block.end_offset_top > offsetcount)
4144     match_block.offset_overflow = TRUE;
4145 nigel 3
4146 nigel 9 DPRINTF(("Freeing temporary memory\n"));
4147     (pcre_free)(match_block.offset_vector);
4148     }
4149 nigel 3
4150 nigel 9 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
4151 nigel 3
4152 nigel 9 if (match_block.offset_end < 2) rc = 0; else
4153     {
4154     offsets[0] = start_match - match_block.start_subject;
4155     offsets[1] = match_block.end_match_ptr - match_block.start_subject;
4156 nigel 3 }
4157 nigel 9
4158     DPRINTF((">>>> returning %d\n", rc));
4159     return rc;
4160 nigel 3 }
4161 nigel 25
4162     /* This "while" is the end of the "do" above */
4163    
4164 nigel 3 while (!anchored &&
4165     match_block.errorcode == PCRE_ERROR_NOMATCH &&
4166     start_match++ < end_subject);
4167    
4168 nigel 11 if (using_temporary_offsets)
4169     {
4170     DPRINTF(("Freeing temporary memory\n"));
4171     (pcre_free)(match_block.offset_vector);
4172     }
4173    
4174 nigel 9 DPRINTF((">>>> returning %d\n", match_block.errorcode));
4175 nigel 3
4176     return match_block.errorcode;
4177     }
4178    
4179     /* End of pcre.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12