/[pcre]/code/trunk/pcre.c
ViewVC logotype

Contents of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 25 - (hide annotations) (download)
Sat Feb 24 21:38:45 2007 UTC (7 years, 9 months ago) by nigel
File MIME type: text/plain
File size: 120430 byte(s)
Load pcre-2.01 into code/trunk.

1 nigel 3 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /*
6     This is a library of functions to support regular expressions whose syntax
7     and semantics are as close as possible to those of the Perl 5 language. See
8     the file Tech.Notes for some information on the internals.
9    
10     Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12 nigel 15 Copyright (c) 1998 University of Cambridge
13 nigel 3
14     -----------------------------------------------------------------------------
15     Permission is granted to anyone to use this software for any purpose on any
16     computer system, and to redistribute it freely, subject to the following
17     restrictions:
18    
19     1. This software is distributed in the hope that it will be useful,
20     but WITHOUT ANY WARRANTY; without even the implied warranty of
21     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22    
23     2. The origin of this software must not be misrepresented, either by
24     explicit claim or by omission.
25    
26     3. Altered versions must be plainly marked as such, and must not be
27     misrepresented as being the original software.
28     -----------------------------------------------------------------------------
29     */
30    
31    
32     /* Define DEBUG to get debugging output on stdout. */
33    
34     /* #define DEBUG */
35    
36 nigel 23 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
37     inline, and there are *still* stupid compilers about that don't like indented
38     pre-processor statements. I suppose it's only been 10 years... */
39 nigel 3
40 nigel 9 #ifdef DEBUG
41     #define DPRINTF(p) printf p
42     #else
43     #define DPRINTF(p) /*nothing*/
44     #endif
45    
46 nigel 3 /* Include the internals header, which itself includes Standard C headers plus
47     the external pcre header. */
48    
49     #include "internal.h"
50    
51    
52 nigel 15 /* Allow compilation as C++ source code, should anybody want to do that. */
53    
54     #ifdef __cplusplus
55     #define class pcre_class
56     #endif
57    
58    
59 nigel 23 /* Number of items on the nested bracket stacks at compile time. This should
60     not be set greater than 200. */
61    
62     #define BRASTACK_SIZE 200
63    
64    
65 nigel 3 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
66    
67 nigel 15 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
68     static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
69 nigel 3
70 nigel 13 /* Text forms of OP_ values and things, for debugging (not all used) */
71 nigel 3
72     #ifdef DEBUG
73 nigel 7 static const char *OP_names[] = {
74     "End", "\\A", "\\B", "\\b", "\\D", "\\d",
75 nigel 23 "\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",
76     "Opt", "^", "$", "Any", "chars", "not",
77 nigel 3 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
78     "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
79     "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
80     "*", "*?", "+", "+?", "?", "??", "{", "{",
81 nigel 23 "class", "Ref",
82     "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
83     "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
84 nigel 3 "Brazero", "Braminzero", "Bra"
85     };
86     #endif
87    
88     /* Table for handling escaped characters in the range '0'-'z'. Positive returns
89     are simple data values; negative values are for special things like \d and so
90     on. Zero means further processing is needed (for things like \x), or the escape
91     is invalid. */
92    
93 nigel 15 static const short int escapes[] = {
94 nigel 3 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
95     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
96     '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
97     0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
98     0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
99     0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
100     '`', 7, -ESC_b, 0, -ESC_d, 27, '\f', 0, /* ` - g */
101     0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
102     0, 0, '\r', -ESC_s, '\t', 0, 0, -ESC_w, /* p - w */
103 nigel 23 0, 0, -ESC_z /* x - z */
104 nigel 3 };
105    
106     /* Definition to allow mutual recursion */
107    
108 nigel 13 static BOOL
109 nigel 23 compile_regex(int, int, int *, uschar **, const uschar **, const char **,
110 nigel 25 BOOL, int, compile_data *);
111 nigel 3
112    
113    
114     /*************************************************
115     * Global variables *
116     *************************************************/
117    
118     /* PCRE is thread-clean and doesn't use any global variables in the normal
119     sense. However, it calls memory allocation and free functions via the two
120     indirections below, which are can be changed by the caller, but are shared
121     between all threads. */
122    
123     void *(*pcre_malloc)(size_t) = malloc;
124     void (*pcre_free)(void *) = free;
125    
126    
127    
128    
129     /*************************************************
130 nigel 25 * Default character tables *
131     *************************************************/
132    
133     /* A default set of character tables is included in the PCRE binary. Its source
134     is built by the maketables auxiliary program, which uses the default C ctypes
135     functions, and put in the file chartables.c. These tables are used by PCRE
136     whenever the caller of pcre_compile() does not provide an alternate set of
137     tables. */
138    
139     #include "chartables.c"
140    
141    
142    
143     /*************************************************
144 nigel 3 * Return version string *
145     *************************************************/
146    
147 nigel 7 const char *
148 nigel 3 pcre_version(void)
149     {
150     return PCRE_VERSION;
151     }
152    
153    
154    
155    
156     /*************************************************
157     * Return info about a compiled pattern *
158     *************************************************/
159    
160     /* This function picks potentially useful data out of the private
161     structure.
162    
163     Arguments:
164     external_re points to compiled code
165     optptr where to pass back the options
166     first_char where to pass back the first character,
167     or -1 if multiline and all branches start ^,
168     or -2 otherwise
169    
170     Returns: number of identifying extraction brackets
171     or negative values on error
172     */
173    
174     int
175     pcre_info(const pcre *external_re, int *optptr, int *first_char)
176     {
177 nigel 7 const real_pcre *re = (const real_pcre *)external_re;
178 nigel 3 if (re == NULL) return PCRE_ERROR_NULL;
179     if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
180     if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
181     if (first_char != NULL)
182     *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
183     ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
184     return re->top_bracket;
185     }
186    
187    
188    
189    
190     #ifdef DEBUG
191     /*************************************************
192     * Debugging function to print chars *
193     *************************************************/
194    
195     /* Print a sequence of chars in printable format, stopping at the end of the
196     subject if the requested.
197    
198     Arguments:
199     p points to characters
200     length number to print
201     is_subject TRUE if printing from within md->start_subject
202     md pointer to matching data block, if is_subject is TRUE
203    
204     Returns: nothing
205     */
206    
207 nigel 9 static void
208     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
209 nigel 3 {
210     int c;
211     if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
212     while (length-- > 0)
213     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
214     }
215     #endif
216    
217    
218    
219    
220     /*************************************************
221     * Handle escapes *
222     *************************************************/
223    
224     /* This function is called when a \ has been encountered. It either returns a
225     positive value for a simple escape such as \n, or a negative value which
226     encodes one of the more complicated things such as \d. On entry, ptr is
227     pointing at the \. On exit, it is on the final character of the escape
228     sequence.
229    
230     Arguments:
231     ptrptr points to the pattern position pointer
232     errorptr points to the pointer to the error message
233     bracount number of previous extracting brackets
234     options the options bits
235     isclass TRUE if inside a character class
236 nigel 25 cd pointer to char tables block
237 nigel 3
238     Returns: zero or positive => a data character
239     negative => a special escape sequence
240     on error, errorptr is set
241     */
242    
243     static int
244 nigel 7 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
245 nigel 25 int options, BOOL isclass, compile_data *cd)
246 nigel 3 {
247 nigel 7 const uschar *ptr = *ptrptr;
248 nigel 3 int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
249     int i;
250    
251     if (c == 0) *errorptr = ERR1;
252    
253     /* Digits or letters may have special meaning; all others are literals. */
254    
255     else if (c < '0' || c > 'z') {}
256    
257     /* Do an initial lookup in a table. A non-zero result is something that can be
258     returned immediately. Otherwise further processing may be required. */
259    
260     else if ((i = escapes[c - '0']) != 0) c = i;
261    
262     /* Escapes that need further processing, or are illegal. */
263    
264     else
265     {
266 nigel 7 const uschar *oldptr;
267 nigel 3 switch (c)
268     {
269     /* The handling of escape sequences consisting of a string of digits
270     starting with one that is not zero is not straightforward. By experiment,
271     the way Perl works seems to be as follows:
272    
273     Outside a character class, the digits are read as a decimal number. If the
274     number is less than 10, or if there are that many previous extracting
275     left brackets, then it is a back reference. Otherwise, up to three octal
276     digits are read to form an escaped byte. Thus \123 is likely to be octal
277     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
278     value is greater than 377, the least significant 8 bits are taken. Inside a
279     character class, \ followed by a digit is always an octal number. */
280    
281     case '1': case '2': case '3': case '4': case '5':
282     case '6': case '7': case '8': case '9':
283    
284     if (!isclass)
285     {
286     oldptr = ptr;
287     c -= '0';
288 nigel 25 while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
289 nigel 3 c = c * 10 + *(++ptr) - '0';
290     if (c < 10 || c <= bracount)
291     {
292     c = -(ESC_REF + c);
293     break;
294     }
295     ptr = oldptr; /* Put the pointer back and fall through */
296     }
297    
298     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
299     generates a binary zero byte and treats the digit as a following literal.
300     Thus we have to pull back the pointer by one. */
301    
302     if ((c = *ptr) >= '8')
303     {
304     ptr--;
305     c = 0;
306     break;
307     }
308    
309     /* \0 always starts an octal number, but we may drop through to here with a
310     larger first octal digit */
311    
312     case '0':
313     c -= '0';
314 nigel 25 while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
315 nigel 3 ptr[1] != '8' && ptr[1] != '9')
316     c = c * 8 + *(++ptr) - '0';
317     break;
318    
319     /* Special escapes not starting with a digit are straightforward */
320    
321     case 'x':
322     c = 0;
323 nigel 25 while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
324 nigel 3 {
325     ptr++;
326 nigel 25 c = c * 16 + cd->lcc[*ptr] -
327     (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
328 nigel 3 }
329     break;
330    
331     case 'c':
332     c = *(++ptr);
333     if (c == 0)
334     {
335     *errorptr = ERR2;
336     return 0;
337     }
338    
339     /* A letter is upper-cased; then the 0x40 bit is flipped */
340    
341 nigel 25 if (c >= 'a' && c <= 'z') c = cd->fcc[c];
342 nigel 3 c ^= 0x40;
343     break;
344    
345     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
346     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
347 nigel 25 for Perl compatibility, it is a literal. This code looks a bit odd, but
348     there used to be some cases other than the default, and there may be again
349     in future, so I haven't "optimized" it. */
350 nigel 3
351     default:
352     if ((options & PCRE_EXTRA) != 0) switch(c)
353     {
354     default:
355     *errorptr = ERR3;
356     break;
357     }
358     break;
359     }
360     }
361    
362     *ptrptr = ptr;
363     return c;
364     }
365    
366    
367    
368     /*************************************************
369     * Check for counted repeat *
370     *************************************************/
371    
372     /* This function is called when a '{' is encountered in a place where it might
373     start a quantifier. It looks ahead to see if it really is a quantifier or not.
374     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
375     where the ddds are digits.
376    
377     Arguments:
378     p pointer to the first char after '{'
379 nigel 25 cd pointer to char tables block
380 nigel 3
381     Returns: TRUE or FALSE
382     */
383    
384     static BOOL
385 nigel 25 is_counted_repeat(const uschar *p, compile_data *cd)
386 nigel 3 {
387 nigel 25 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
388     while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
389 nigel 3 if (*p == '}') return TRUE;
390    
391     if (*p++ != ',') return FALSE;
392     if (*p == '}') return TRUE;
393    
394 nigel 25 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
395     while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
396 nigel 3 return (*p == '}');
397     }
398    
399    
400    
401     /*************************************************
402     * Read repeat counts *
403     *************************************************/
404    
405     /* Read an item of the form {n,m} and return the values. This is called only
406     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
407     so the syntax is guaranteed to be correct, but we need to check the values.
408    
409     Arguments:
410     p pointer to first char after '{'
411     minp pointer to int for min
412     maxp pointer to int for max
413     returned as -1 if no max
414     errorptr points to pointer to error message
415 nigel 25 cd pointer to character tables clock
416 nigel 3
417     Returns: pointer to '}' on success;
418     current ptr on error, with errorptr set
419     */
420    
421 nigel 7 static const uschar *
422 nigel 25 read_repeat_counts(const uschar *p, int *minp, int *maxp,
423     const char **errorptr, compile_data *cd)
424 nigel 3 {
425     int min = 0;
426     int max = -1;
427    
428 nigel 25 while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
429 nigel 3
430     if (*p == '}') max = min; else
431     {
432     if (*(++p) != '}')
433     {
434     max = 0;
435 nigel 25 while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
436 nigel 3 if (max < min)
437     {
438     *errorptr = ERR4;
439     return p;
440     }
441     }
442     }
443    
444     /* Do paranoid checks, then fill in the required variables, and pass back the
445     pointer to the terminating '}'. */
446    
447     if (min > 65535 || max > 65535)
448     *errorptr = ERR5;
449     else
450     {
451     *minp = min;
452     *maxp = max;
453     }
454     return p;
455     }
456    
457    
458    
459     /*************************************************
460 nigel 23 * Find the fixed length of a pattern *
461     *************************************************/
462    
463     /* Scan a pattern and compute the fixed length of subject that will match it,
464     if the length is fixed. This is needed for dealing with backward assertions.
465    
466     Arguments:
467     code points to the start of the pattern (the bracket)
468    
469     Returns: the fixed length, or -1 if there is no fixed length
470     */
471    
472     static int
473     find_fixedlength(uschar *code)
474     {
475     int length = -1;
476    
477     register int branchlength = 0;
478     register uschar *cc = code + 3;
479    
480     /* Scan along the opcodes for this branch. If we get to the end of the
481     branch, check the length against that of the other branches. */
482    
483     for (;;)
484     {
485     int d;
486     register int op = *cc;
487     if (op >= OP_BRA) op = OP_BRA;
488    
489     switch (op)
490     {
491     case OP_BRA:
492     case OP_ONCE:
493     case OP_COND:
494     d = find_fixedlength(cc);
495     if (d < 0) return -1;
496     branchlength += d;
497     do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
498     cc += 3;
499     break;
500    
501     /* Reached end of a branch; if it's a ket it is the end of a nested
502     call. If it's ALT it is an alternation in a nested call. If it is
503     END it's the end of the outer call. All can be handled by the same code. */
504    
505     case OP_ALT:
506     case OP_KET:
507     case OP_KETRMAX:
508     case OP_KETRMIN:
509     case OP_END:
510     if (length < 0) length = branchlength;
511     else if (length != branchlength) return -1;
512     if (*cc != OP_ALT) return length;
513     cc += 3;
514     branchlength = 0;
515     break;
516    
517     /* Skip over assertive subpatterns */
518    
519     case OP_ASSERT:
520     case OP_ASSERT_NOT:
521     case OP_ASSERTBACK:
522     case OP_ASSERTBACK_NOT:
523     do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
524     cc += 3;
525     break;
526    
527     /* Skip over things that don't match chars */
528    
529     case OP_REVERSE:
530     cc++;
531    
532     case OP_CREF:
533     case OP_OPT:
534     cc++;
535     /* Fall through */
536    
537     case OP_SOD:
538     case OP_EOD:
539     case OP_EODN:
540     case OP_CIRC:
541     case OP_DOLL:
542     case OP_NOT_WORD_BOUNDARY:
543     case OP_WORD_BOUNDARY:
544     cc++;
545     break;
546    
547     /* Handle char strings */
548    
549     case OP_CHARS:
550     branchlength += *(++cc);
551     cc += *cc + 1;
552     break;
553    
554     /* Handle exact repetitions */
555    
556     case OP_EXACT:
557     case OP_TYPEEXACT:
558     branchlength += (cc[1] << 8) + cc[2];
559     cc += 4;
560     break;
561    
562     /* Handle single-char matchers */
563    
564     case OP_NOT_DIGIT:
565     case OP_DIGIT:
566     case OP_NOT_WHITESPACE:
567     case OP_WHITESPACE:
568     case OP_NOT_WORDCHAR:
569     case OP_WORDCHAR:
570     case OP_ANY:
571     branchlength++;
572     cc++;
573     break;
574    
575    
576     /* Check a class for variable quantification */
577    
578     case OP_CLASS:
579     cc += (*cc == OP_REF)? 2 : 33;
580    
581     switch (*cc)
582     {
583     case OP_CRSTAR:
584     case OP_CRMINSTAR:
585     case OP_CRQUERY:
586     case OP_CRMINQUERY:
587     return -1;
588    
589     case OP_CRRANGE:
590     case OP_CRMINRANGE:
591     if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;
592     branchlength += (cc[1] << 8) + cc[2];
593     cc += 5;
594     break;
595    
596     default:
597     branchlength++;
598     }
599     break;
600    
601     /* Anything else is variable length */
602    
603     default:
604     return -1;
605     }
606     }
607     /* Control never gets here */
608     }
609    
610    
611    
612    
613     /*************************************************
614 nigel 3 * Compile one branch *
615     *************************************************/
616    
617     /* Scan the pattern, compiling it into the code vector.
618    
619     Arguments:
620 nigel 25 options the option bits
621     brackets points to number of brackets used
622     code points to the pointer to the current code point
623     ptrptr points to the current pattern pointer
624     errorptr points to pointer to error message
625     optchanged set to the value of the last OP_OPT item compiled
626     cd contains pointers to tables
627 nigel 3
628 nigel 25 Returns: TRUE on success
629     FALSE, with *errorptr set on error
630 nigel 3 */
631    
632     static BOOL
633 nigel 7 compile_branch(int options, int *brackets, uschar **codeptr,
634 nigel 25 const uschar **ptrptr, const char **errorptr, int *optchanged,
635     compile_data *cd)
636 nigel 3 {
637     int repeat_type, op_type;
638     int repeat_min, repeat_max;
639     int bravalue, length;
640 nigel 19 int greedy_default, greedy_non_default;
641 nigel 3 register int c;
642     register uschar *code = *codeptr;
643 nigel 23 uschar *tempcode;
644 nigel 7 const uschar *ptr = *ptrptr;
645 nigel 23 const uschar *tempptr;
646 nigel 3 uschar *previous = NULL;
647     uschar class[32];
648    
649 nigel 19 /* Set up the default and non-default settings for greediness */
650    
651     greedy_default = ((options & PCRE_UNGREEDY) != 0);
652     greedy_non_default = greedy_default ^ 1;
653    
654 nigel 3 /* Switch on next character until the end of the branch */
655    
656     for (;; ptr++)
657     {
658     BOOL negate_class;
659 nigel 23 int class_charcount;
660     int class_lastchar;
661     int newoptions;
662     int condref;
663 nigel 3
664     c = *ptr;
665     if ((options & PCRE_EXTENDED) != 0)
666     {
667 nigel 25 if ((cd->ctypes[c] & ctype_space) != 0) continue;
668 nigel 3 if (c == '#')
669     {
670     while ((c = *(++ptr)) != 0 && c != '\n');
671     continue;
672     }
673     }
674    
675     switch(c)
676     {
677     /* The branch terminates at end of string, |, or ). */
678    
679     case 0:
680     case '|':
681     case ')':
682     *codeptr = code;
683     *ptrptr = ptr;
684     return TRUE;
685    
686     /* Handle single-character metacharacters */
687    
688     case '^':
689     previous = NULL;
690     *code++ = OP_CIRC;
691     break;
692    
693     case '$':
694     previous = NULL;
695     *code++ = OP_DOLL;
696     break;
697    
698     case '.':
699     previous = code;
700     *code++ = OP_ANY;
701     break;
702    
703     /* Character classes. These always build a 32-byte bitmap of the permitted
704     characters, except in the special case where there is only one character.
705     For negated classes, we build the map as usual, then invert it at the end.
706     */
707    
708     case '[':
709     previous = code;
710 nigel 23 *code++ = OP_CLASS;
711 nigel 3
712 nigel 23 /* If the first character is '^', set the negation flag and skip it. */
713 nigel 3
714     if ((c = *(++ptr)) == '^')
715     {
716     negate_class = TRUE;
717     c = *(++ptr);
718     }
719 nigel 23 else negate_class = FALSE;
720 nigel 3
721     /* Keep a count of chars so that we can optimize the case of just a single
722     character. */
723    
724     class_charcount = 0;
725     class_lastchar = -1;
726    
727     /* Initialize the 32-char bit map to all zeros. We have to build the
728     map in a temporary bit of store, in case the class contains only 1
729     character, because in that case the compiled code doesn't use the
730     bit map. */
731    
732     memset(class, 0, 32 * sizeof(uschar));
733    
734     /* Process characters until ] is reached. By writing this as a "do" it
735     means that an initial ] is taken as a data character. */
736    
737     do
738     {
739     if (c == 0)
740     {
741     *errorptr = ERR6;
742     goto FAILED;
743     }
744    
745     /* Backslash may introduce a single character, or it may introduce one
746     of the specials, which just set a flag. Escaped items are checked for
747     validity in the pre-compiling pass. The sequence \b is a special case.
748 nigel 7 Inside a class (and only there) it is treated as backspace. Elsewhere
749 nigel 3 it marks a word boundary. Other escapes have preset maps ready to
750     or into the one we are building. We assume they have more than one
751     character in them, so set class_count bigger than one. */
752    
753     if (c == '\\')
754     {
755 nigel 25 c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
756 nigel 3 if (-c == ESC_b) c = '\b';
757     else if (c < 0)
758     {
759 nigel 25 register const uschar *cbits = cd->cbits;
760 nigel 3 class_charcount = 10;
761     switch (-c)
762     {
763     case ESC_d:
764 nigel 25 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
765 nigel 3 continue;
766    
767     case ESC_D:
768 nigel 25 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
769 nigel 3 continue;
770    
771     case ESC_w:
772     for (c = 0; c < 32; c++)
773 nigel 25 class[c] |= (cbits[c+cbit_digit] | cbits[c+cbit_word]);
774 nigel 3 continue;
775    
776     case ESC_W:
777     for (c = 0; c < 32; c++)
778 nigel 25 class[c] |= ~(cbits[c+cbit_digit] | cbits[c+cbit_word]);
779 nigel 3 continue;
780    
781     case ESC_s:
782 nigel 25 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
783 nigel 3 continue;
784    
785     case ESC_S:
786 nigel 25 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
787 nigel 3 continue;
788    
789     default:
790     *errorptr = ERR7;
791     goto FAILED;
792     }
793     }
794     /* Fall through if single character */
795     }
796    
797     /* A single character may be followed by '-' to form a range. However,
798     Perl does not permit ']' to be the end of the range. A '-' character
799     here is treated as a literal. */
800    
801     if (ptr[1] == '-' && ptr[2] != ']')
802     {
803     int d;
804     ptr += 2;
805     d = *ptr;
806    
807     if (d == 0)
808     {
809     *errorptr = ERR6;
810     goto FAILED;
811     }
812    
813     /* The second part of a range can be a single-character escape, but
814     not any of the other escapes. */
815    
816     if (d == '\\')
817     {
818 nigel 25 d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
819 nigel 3 if (d < 0)
820     {
821     if (d == -ESC_b) d = '\b'; else
822     {
823     *errorptr = ERR7;
824     goto FAILED;
825     }
826     }
827     }
828    
829     if (d < c)
830     {
831     *errorptr = ERR8;
832     goto FAILED;
833     }
834    
835     for (; c <= d; c++)
836     {
837     class[c/8] |= (1 << (c&7));
838     if ((options & PCRE_CASELESS) != 0)
839     {
840 nigel 25 int uc = cd->fcc[c]; /* flip case */
841 nigel 3 class[uc/8] |= (1 << (uc&7));
842     }
843     class_charcount++; /* in case a one-char range */
844     class_lastchar = c;
845     }
846     continue; /* Go get the next char in the class */
847     }
848    
849     /* Handle a lone single character - we can get here for a normal
850     non-escape char, or after \ that introduces a single character. */
851    
852     class [c/8] |= (1 << (c&7));
853     if ((options & PCRE_CASELESS) != 0)
854     {
855 nigel 25 c = cd->fcc[c]; /* flip case */
856 nigel 3 class[c/8] |= (1 << (c&7));
857     }
858     class_charcount++;
859     class_lastchar = c;
860     }
861    
862     /* Loop until ']' reached; the check for end of string happens inside the
863     loop. This "while" is the end of the "do" above. */
864    
865     while ((c = *(++ptr)) != ']');
866    
867     /* If class_charcount is 1 and class_lastchar is not negative, we saw
868     precisely one character. This doesn't need the whole 32-byte bit map.
869     We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
870     it's negative. */
871    
872     if (class_charcount == 1 && class_lastchar >= 0)
873     {
874     if (negate_class)
875     {
876     code[-1] = OP_NOT;
877     }
878     else
879     {
880     code[-1] = OP_CHARS;
881     *code++ = 1;
882     }
883     *code++ = class_lastchar;
884     }
885    
886     /* Otherwise, negate the 32-byte map if necessary, and copy it into
887     the code vector. */
888    
889     else
890     {
891     if (negate_class)
892     for (c = 0; c < 32; c++) code[c] = ~class[c];
893     else
894     memcpy(code, class, 32);
895     code += 32;
896     }
897     break;
898    
899     /* Various kinds of repeat */
900    
901     case '{':
902 nigel 25 if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
903     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
904 nigel 3 if (*errorptr != NULL) goto FAILED;
905     goto REPEAT;
906    
907     case '*':
908     repeat_min = 0;
909     repeat_max = -1;
910     goto REPEAT;
911    
912     case '+':
913     repeat_min = 1;
914     repeat_max = -1;
915     goto REPEAT;
916    
917     case '?':
918     repeat_min = 0;
919     repeat_max = 1;
920    
921     REPEAT:
922     if (previous == NULL)
923     {
924     *errorptr = ERR9;
925     goto FAILED;
926     }
927    
928 nigel 19 /* If the next character is '?' this is a minimizing repeat, by default,
929     but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
930 nigel 3 next character. */
931    
932 nigel 19 if (ptr[1] == '?')
933     { repeat_type = greedy_non_default; ptr++; }
934     else repeat_type = greedy_default;
935 nigel 3
936     /* If the maximum is zero then the minimum must also be zero; Perl allows
937     this case, so we do too - by simply omitting the item altogether. */
938    
939     if (repeat_max == 0) code = previous;
940    
941     /* If previous was a string of characters, chop off the last one and use it
942     as the subject of the repeat. If there was only one character, we can
943     abolish the previous item altogether. */
944    
945     else if (*previous == OP_CHARS)
946     {
947     int len = previous[1];
948     if (len == 1)
949     {
950     c = previous[2];
951     code = previous;
952     }
953     else
954     {
955     c = previous[len+1];
956     previous[1]--;
957     code--;
958     }
959     op_type = 0; /* Use single-char op codes */
960     goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
961     }
962    
963     /* If previous was a single negated character ([^a] or similar), we use
964     one of the special opcodes, replacing it. The code is shared with single-
965     character repeats by adding a suitable offset into repeat_type. */
966    
967     else if ((int)*previous == OP_NOT)
968     {
969     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
970     c = previous[1];
971     code = previous;
972     goto OUTPUT_SINGLE_REPEAT;
973     }
974    
975     /* If previous was a character type match (\d or similar), abolish it and
976     create a suitable repeat item. The code is shared with single-character
977     repeats by adding a suitable offset into repeat_type. */
978    
979 nigel 23 else if ((int)*previous < OP_EODN || *previous == OP_ANY)
980 nigel 3 {
981     op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
982     c = *previous;
983     code = previous;
984    
985     OUTPUT_SINGLE_REPEAT:
986     repeat_type += op_type; /* Combine both values for many cases */
987    
988     /* A minimum of zero is handled either as the special case * or ?, or as
989     an UPTO, with the maximum given. */
990    
991     if (repeat_min == 0)
992     {
993     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
994     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
995     else
996     {
997     *code++ = OP_UPTO + repeat_type;
998     *code++ = repeat_max >> 8;
999     *code++ = (repeat_max & 255);
1000     }
1001     }
1002    
1003     /* The case {1,} is handled as the special case + */
1004    
1005     else if (repeat_min == 1 && repeat_max == -1)
1006     *code++ = OP_PLUS + repeat_type;
1007    
1008     /* The case {n,n} is just an EXACT, while the general case {n,m} is
1009     handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1010    
1011     else
1012     {
1013     if (repeat_min != 1)
1014     {
1015     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1016     *code++ = repeat_min >> 8;
1017     *code++ = (repeat_min & 255);
1018     }
1019    
1020     /* If the mininum is 1 and the previous item was a character string,
1021     we either have to put back the item that got cancelled if the string
1022     length was 1, or add the character back onto the end of a longer
1023 nigel 21 string. For a character type nothing need be done; it will just get
1024     put back naturally. Note that the final character is always going to
1025     get added below. */
1026 nigel 3
1027     else if (*previous == OP_CHARS)
1028     {
1029     if (code == previous) code += 2; else previous[1]++;
1030     }
1031    
1032 nigel 21 /* For a single negated character we also have to put back the
1033     item that got cancelled. */
1034    
1035     else if (*previous == OP_NOT) code++;
1036    
1037 nigel 9 /* If the maximum is unlimited, insert an OP_STAR. */
1038 nigel 3
1039 nigel 9 if (repeat_max < 0)
1040 nigel 3 {
1041     *code++ = c;
1042 nigel 9 *code++ = OP_STAR + repeat_type;
1043     }
1044    
1045     /* Else insert an UPTO if the max is greater than the min. */
1046    
1047     else if (repeat_max != repeat_min)
1048     {
1049     *code++ = c;
1050 nigel 3 repeat_max -= repeat_min;
1051     *code++ = OP_UPTO + repeat_type;
1052     *code++ = repeat_max >> 8;
1053     *code++ = (repeat_max & 255);
1054     }
1055     }
1056    
1057     /* The character or character type itself comes last in all cases. */
1058    
1059     *code++ = c;
1060     }
1061    
1062     /* If previous was a character class or a back reference, we put the repeat
1063     stuff after it. */
1064    
1065 nigel 23 else if (*previous == OP_CLASS || *previous == OP_REF)
1066 nigel 3 {
1067     if (repeat_min == 0 && repeat_max == -1)
1068     *code++ = OP_CRSTAR + repeat_type;
1069     else if (repeat_min == 1 && repeat_max == -1)
1070     *code++ = OP_CRPLUS + repeat_type;
1071     else if (repeat_min == 0 && repeat_max == 1)
1072     *code++ = OP_CRQUERY + repeat_type;
1073     else
1074     {
1075     *code++ = OP_CRRANGE + repeat_type;
1076     *code++ = repeat_min >> 8;
1077     *code++ = repeat_min & 255;
1078     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1079     *code++ = repeat_max >> 8;
1080     *code++ = repeat_max & 255;
1081     }
1082     }
1083    
1084     /* If previous was a bracket group, we may have to replicate it in certain
1085 nigel 23 cases. */
1086 nigel 3
1087 nigel 23 else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
1088     (int)*previous == OP_COND)
1089 nigel 3 {
1090 nigel 23 int i, ketoffset = 0;
1091 nigel 9 int len = code - previous;
1092 nigel 3
1093 nigel 23 /* If the maximum repeat count is unlimited, find the end of the bracket
1094     by scanning through from the start, and compute the offset back to it
1095     from the current code pointer. There may be an OP_OPT setting following
1096     the final KET, so we can't find the end just by going back from the code
1097     pointer. */
1098    
1099     if (repeat_max == -1)
1100 nigel 3 {
1101 nigel 23 register uschar *ket = previous;
1102     do ket += (ket[1] << 8) + ket[2]; while (*ket != OP_KET);
1103     ketoffset = code - ket;
1104 nigel 3 }
1105    
1106     /* If the minimum is greater than zero, and the maximum is unlimited or
1107     equal to the minimum, the first copy remains where it is, and is
1108     replicated up to the minimum number of times. This case includes the +
1109     repeat, but of course no replication is needed in that case. */
1110    
1111     if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1112     {
1113     for (i = 1; i < repeat_min; i++)
1114     {
1115 nigel 9 memcpy(code, previous, len);
1116     code += len;
1117 nigel 3 }
1118     }
1119    
1120     /* If the minimum is zero, stick BRAZERO in front of the first copy.
1121     Then, if there is a fixed upper limit, replicated up to that many times,
1122     sticking BRAZERO in front of all the optional ones. */
1123    
1124     else
1125     {
1126     if (repeat_min == 0)
1127     {
1128 nigel 9 memmove(previous+1, previous, len);
1129 nigel 3 code++;
1130     *previous++ = OP_BRAZERO + repeat_type;
1131     }
1132    
1133     for (i = 1; i < repeat_min; i++)
1134     {
1135 nigel 9 memcpy(code, previous, len);
1136     code += len;
1137 nigel 3 }
1138    
1139     for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1140     {
1141     *code++ = OP_BRAZERO + repeat_type;
1142 nigel 9 memcpy(code, previous, len);
1143     code += len;
1144 nigel 3 }
1145     }
1146    
1147 nigel 23 /* If the maximum is unlimited, set a repeater in the final copy. We
1148     can't just offset backwards from the current code point, because we
1149     don't know if there's been an options resetting after the ket. The
1150     correct offset was computed above. */
1151 nigel 3
1152 nigel 23 if (repeat_max == -1) code[-ketoffset] = OP_KETRMAX + repeat_type;
1153 nigel 3 }
1154    
1155     /* Else there's some kind of shambles */
1156    
1157     else
1158     {
1159     *errorptr = ERR11;
1160     goto FAILED;
1161     }
1162    
1163     /* In all case we no longer have a previous item. */
1164    
1165     previous = NULL;
1166     break;
1167    
1168    
1169 nigel 23 /* Start of nested bracket sub-expression, or comment or lookahead or
1170     lookbehind or option setting or condition. First deal with special things
1171     that can come after a bracket; all are introduced by ?, and the appearance
1172     of any of them means that this is not a referencing group. They were
1173     checked for validity in the first pass over the string, so we don't have to
1174     check for syntax errors here. */
1175 nigel 3
1176     case '(':
1177 nigel 23 newoptions = options;
1178     condref = -1;
1179    
1180 nigel 3 if (*(++ptr) == '?')
1181     {
1182 nigel 23 int set, unset;
1183     int *optset;
1184 nigel 3
1185     switch (*(++ptr))
1186     {
1187 nigel 23 case '#': /* Comment; skip to ket */
1188 nigel 3 ptr++;
1189     while (*ptr != ')') ptr++;
1190     continue;
1191    
1192     case ':': /* Non-extracting bracket */
1193 nigel 23 bravalue = OP_BRA;
1194 nigel 3 ptr++;
1195     break;
1196    
1197 nigel 23 case '(':
1198     bravalue = OP_COND; /* Conditional group */
1199 nigel 25 if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
1200 nigel 23 {
1201     condref = *ptr - '0';
1202     while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
1203     ptr++;
1204     }
1205     else ptr--;
1206     break;
1207    
1208     case '=': /* Positive lookahead */
1209 nigel 3 bravalue = OP_ASSERT;
1210     ptr++;
1211     break;
1212    
1213 nigel 23 case '!': /* Negative lookahead */
1214 nigel 3 bravalue = OP_ASSERT_NOT;
1215     ptr++;
1216     break;
1217    
1218 nigel 23 case '<': /* Lookbehinds */
1219     switch (*(++ptr))
1220 nigel 3 {
1221 nigel 23 case '=': /* Positive lookbehind */
1222     bravalue = OP_ASSERTBACK;
1223 nigel 3 ptr++;
1224     break;
1225 nigel 23
1226     case '!': /* Negative lookbehind */
1227     bravalue = OP_ASSERTBACK_NOT;
1228     ptr++;
1229     break;
1230    
1231     default: /* Syntax error */
1232     *errorptr = ERR24;
1233     goto FAILED;
1234 nigel 3 }
1235 nigel 23 break;
1236 nigel 3
1237 nigel 23 case '>': /* One-time brackets */
1238     bravalue = OP_ONCE;
1239     ptr++;
1240     break;
1241    
1242     default: /* Option setting */
1243     set = unset = 0;
1244     optset = &set;
1245    
1246     while (*ptr != ')' && *ptr != ':')
1247     {
1248     switch (*ptr++)
1249     {
1250     case '-': optset = &unset; break;
1251    
1252     case 'i': *optset |= PCRE_CASELESS; break;
1253     case 'm': *optset |= PCRE_MULTILINE; break;
1254     case 's': *optset |= PCRE_DOTALL; break;
1255     case 'x': *optset |= PCRE_EXTENDED; break;
1256     case 'U': *optset |= PCRE_UNGREEDY; break;
1257     case 'X': *optset |= PCRE_EXTRA; break;
1258    
1259     default:
1260     *errorptr = ERR12;
1261     goto FAILED;
1262     }
1263     }
1264    
1265     /* Set up the changed option bits, but don't change anything yet. */
1266    
1267     newoptions = (options | set) & (~unset);
1268    
1269     /* If the options ended with ')' this is not the start of a nested
1270     group with option changes, so the options change at this level. At top
1271     level there is nothing else to be done (the options will in fact have
1272     been set from the start of compiling as a result of the first pass) but
1273     at an inner level we must compile code to change the ims options if
1274     necessary, and pass the new setting back so that it can be put at the
1275     start of any following branches, and when this group ends, a resetting
1276     item can be compiled. */
1277    
1278     if (*ptr == ')')
1279     {
1280     if ((options & PCRE_INGROUP) != 0 &&
1281     (options & PCRE_IMS) != (newoptions & PCRE_IMS))
1282     {
1283     *code++ = OP_OPT;
1284     *code++ = *optchanged = newoptions & PCRE_IMS;
1285     }
1286     options = newoptions; /* Change options at this level */
1287     previous = NULL; /* This item can't be repeated */
1288     continue; /* It is complete */
1289     }
1290    
1291     /* If the options ended with ':' we are heading into a nested group
1292     with possible change of options. Such groups are non-capturing and are
1293     not assertions of any kind. All we need to do is skip over the ':';
1294     the newoptions value is handled below. */
1295    
1296     bravalue = OP_BRA;
1297     ptr++;
1298 nigel 3 }
1299     }
1300    
1301 nigel 23 /* Else we have a referencing group; adjust the opcode. */
1302 nigel 3
1303     else
1304     {
1305     if (++(*brackets) > EXTRACT_MAX)
1306     {
1307     *errorptr = ERR13;
1308     goto FAILED;
1309     }
1310     bravalue = OP_BRA + *brackets;
1311     }
1312    
1313 nigel 23 /* Process nested bracketed re. Assertions may not be repeated, but other
1314     kinds can be. We copy code into a non-register variable in order to be able
1315     to pass its address because some compilers complain otherwise. Pass in a
1316     new setting for the ims options if they have changed. */
1317 nigel 3
1318 nigel 23 previous = (bravalue >= OP_ONCE)? code : NULL;
1319 nigel 3 *code = bravalue;
1320 nigel 23 tempcode = code;
1321    
1322     if (!compile_regex(
1323     options | PCRE_INGROUP, /* Set for all nested groups */
1324     ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
1325     newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
1326     brackets, /* Bracket level */
1327     &tempcode, /* Where to put code (updated) */
1328     &ptr, /* Input pointer (updated) */
1329     errorptr, /* Where to put an error message */
1330     (bravalue == OP_ASSERTBACK ||
1331     bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1332 nigel 25 condref, /* Condition reference number */
1333     cd)) /* Tables block */
1334 nigel 23 goto FAILED;
1335    
1336     /* At the end of compiling, code is still pointing to the start of the
1337     group, while tempcode has been updated to point past the end of the group
1338     and any option resetting that may follow it. The pattern pointer (ptr)
1339     is on the bracket. */
1340    
1341     /* If this is a conditional bracket, check that there are no more than
1342     two branches in the group. */
1343    
1344     if (bravalue == OP_COND)
1345 nigel 3 {
1346 nigel 23 int branchcount = 0;
1347     uschar *tc = code;
1348    
1349     do {
1350     branchcount++;
1351     tc += (tc[1] << 8) | tc[2];
1352     }
1353     while (*tc != OP_KET);
1354    
1355     if (branchcount > 2)
1356     {
1357     *errorptr = ERR27;
1358 nigel 3 goto FAILED;
1359 nigel 23 }
1360 nigel 3 }
1361    
1362 nigel 23 /* Now update the main code pointer to the end of the group. */
1363    
1364     code = tempcode;
1365    
1366     /* Error if hit end of pattern */
1367    
1368 nigel 3 if (*ptr != ')')
1369     {
1370     *errorptr = ERR14;
1371     goto FAILED;
1372     }
1373     break;
1374    
1375     /* Check \ for being a real metacharacter; if not, fall through and handle
1376     it as a data character at the start of a string. Escape items are checked
1377     for validity in the pre-compiling pass. */
1378    
1379     case '\\':
1380 nigel 23 tempptr = ptr;
1381 nigel 25 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1382 nigel 3
1383     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1384     are arranged to be the negation of the corresponding OP_values. For the
1385     back references, the values are ESC_REF plus the reference number. Only
1386     back references and those types that consume a character may be repeated.
1387     We can test for values between ESC_b and ESC_Z for the latter; this may
1388     have to change if any new ones are ever created. */
1389    
1390     if (c < 0)
1391     {
1392     if (-c >= ESC_REF)
1393     {
1394     previous = code;
1395     *code++ = OP_REF;
1396 nigel 23 *code++ = -c - ESC_REF;
1397 nigel 3 }
1398     else
1399     {
1400 nigel 23 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
1401 nigel 3 *code++ = -c;
1402     }
1403     continue;
1404     }
1405    
1406 nigel 7 /* Data character: reset and fall through */
1407 nigel 3
1408 nigel 23 ptr = tempptr;
1409 nigel 3 c = '\\';
1410    
1411     /* Handle a run of data characters until a metacharacter is encountered.
1412     The first character is guaranteed not to be whitespace or # when the
1413     extended flag is set. */
1414    
1415     NORMAL_CHAR:
1416     default:
1417     previous = code;
1418     *code = OP_CHARS;
1419     code += 2;
1420     length = 0;
1421    
1422     do
1423     {
1424     if ((options & PCRE_EXTENDED) != 0)
1425     {
1426 nigel 25 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1427 nigel 3 if (c == '#')
1428     {
1429     while ((c = *(++ptr)) != 0 && c != '\n');
1430     if (c == 0) break;
1431     continue;
1432     }
1433     }
1434    
1435     /* Backslash may introduce a data char or a metacharacter. Escaped items
1436     are checked for validity in the pre-compiling pass. Stop the string
1437     before a metaitem. */
1438    
1439     if (c == '\\')
1440     {
1441 nigel 23 tempptr = ptr;
1442 nigel 25 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1443 nigel 23 if (c < 0) { ptr = tempptr; break; }
1444 nigel 3 }
1445    
1446     /* Ordinary character or single-char escape */
1447    
1448     *code++ = c;
1449     length++;
1450     }
1451    
1452     /* This "while" is the end of the "do" above. */
1453    
1454 nigel 25 while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
1455 nigel 3
1456     /* Compute the length and set it in the data vector, and advance to
1457     the next state. */
1458    
1459     previous[1] = length;
1460 nigel 15 if (length < 255) ptr--;
1461 nigel 3 break;
1462     }
1463     } /* end of big loop */
1464    
1465     /* Control never reaches here by falling through, only by a goto for all the
1466     error states. Pass back the position in the pattern so that it can be displayed
1467     to the user for diagnosing the error. */
1468    
1469     FAILED:
1470     *ptrptr = ptr;
1471     return FALSE;
1472     }
1473    
1474    
1475    
1476    
1477     /*************************************************
1478     * Compile sequence of alternatives *
1479     *************************************************/
1480    
1481     /* On entry, ptr is pointing past the bracket character, but on return
1482     it points to the closing bracket, or vertical bar, or end of string.
1483     The code variable is pointing at the byte into which the BRA operator has been
1484 nigel 23 stored. If the ims options are changed at the start (for a (?ims: group) or
1485     during any branch, we need to insert an OP_OPT item at the start of every
1486     following branch to ensure they get set correctly at run time, and also pass
1487     the new options into every subsequent branch compile.
1488 nigel 3
1489     Argument:
1490 nigel 23 options the option bits
1491     optchanged new ims options to set as if (?ims) were at the start, or -1
1492     for no change
1493     brackets -> int containing the number of extracting brackets used
1494     codeptr -> the address of the current code pointer
1495     ptrptr -> the address of the current pattern pointer
1496     errorptr -> pointer to error message
1497     lookbehind TRUE if this is a lookbehind assertion
1498     condref > 0 for OPT_CREF setting at start of conditional group
1499 nigel 25 cd points to the data block with tables pointers
1500 nigel 3
1501 nigel 23 Returns: TRUE on success
1502 nigel 3 */
1503    
1504     static BOOL
1505 nigel 23 compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
1506 nigel 25 const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,
1507     compile_data *cd)
1508 nigel 3 {
1509 nigel 7 const uschar *ptr = *ptrptr;
1510 nigel 3 uschar *code = *codeptr;
1511 nigel 23 uschar *last_branch = code;
1512 nigel 3 uschar *start_bracket = code;
1513 nigel 23 uschar *reverse_count = NULL;
1514     int oldoptions = options & PCRE_IMS;
1515 nigel 3
1516 nigel 23 code += 3;
1517    
1518     /* At the start of a reference-based conditional group, insert the reference
1519     number as an OP_CREF item. */
1520    
1521     if (condref > 0)
1522     {
1523     *code++ = OP_CREF;
1524     *code++ = condref;
1525     }
1526    
1527     /* Loop for each alternative branch */
1528    
1529 nigel 3 for (;;)
1530     {
1531     int length;
1532    
1533 nigel 23 /* Handle change of options */
1534    
1535     if (optchanged >= 0)
1536 nigel 3 {
1537 nigel 23 *code++ = OP_OPT;
1538     *code++ = optchanged;
1539     options = (options & ~PCRE_IMS) | optchanged;
1540     }
1541    
1542     /* Set up dummy OP_REVERSE if lookbehind assertion */
1543    
1544     if (lookbehind)
1545     {
1546     *code++ = OP_REVERSE;
1547     reverse_count = code;
1548     *code++ = 0;
1549     *code++ = 0;
1550     }
1551    
1552     /* Now compile the branch */
1553    
1554 nigel 25 if (!compile_branch(options,brackets,&code,&ptr,errorptr,&optchanged,cd))
1555 nigel 23 {
1556 nigel 3 *ptrptr = ptr;
1557     return FALSE;
1558     }
1559    
1560     /* Fill in the length of the last branch */
1561    
1562     length = code - last_branch;
1563     last_branch[1] = length >> 8;
1564     last_branch[2] = length & 255;
1565    
1566 nigel 23 /* If lookbehind, check that this branch matches a fixed-length string,
1567     and put the length into the OP_REVERSE item. Temporarily mark the end of
1568     the branch with OP_END. */
1569    
1570     if (lookbehind)
1571     {
1572     *code = OP_END;
1573     length = find_fixedlength(last_branch);
1574     DPRINTF(("fixed length = %d\n", length));
1575     if (length < 0)
1576     {
1577     *errorptr = ERR25;
1578     *ptrptr = ptr;
1579     return FALSE;
1580     }
1581     reverse_count[0] = (length >> 8);
1582     reverse_count[1] = length & 255;
1583     }
1584    
1585 nigel 3 /* Reached end of expression, either ')' or end of pattern. Insert a
1586     terminating ket and the length of the whole bracketed item, and return,
1587 nigel 23 leaving the pointer at the terminating char. If any of the ims options
1588     were changed inside the group, compile a resetting op-code following. */
1589 nigel 3
1590     if (*ptr != '|')
1591     {
1592     length = code - start_bracket;
1593     *code++ = OP_KET;
1594     *code++ = length >> 8;
1595     *code++ = length & 255;
1596 nigel 23 if (optchanged >= 0)
1597     {
1598     *code++ = OP_OPT;
1599     *code++ = oldoptions;
1600     }
1601 nigel 3 *codeptr = code;
1602     *ptrptr = ptr;
1603     return TRUE;
1604     }
1605    
1606     /* Another branch follows; insert an "or" node and advance the pointer. */
1607    
1608     *code = OP_ALT;
1609 nigel 23 last_branch = code;
1610     code += 3;
1611 nigel 3 ptr++;
1612     }
1613     /* Control never reaches here */
1614     }
1615    
1616    
1617    
1618 nigel 23
1619 nigel 3 /*************************************************
1620 nigel 23 * Find first significant op code *
1621     *************************************************/
1622    
1623     /* This is called by several functions that scan a compiled expression looking
1624     for a fixed first character, or an anchoring op code etc. It skips over things
1625     that do not influence this. For one application, a change of caseless option is
1626     important.
1627    
1628     Arguments:
1629     code pointer to the start of the group
1630     options pointer to external options
1631     optbit the option bit whose changing is significant, or
1632     zero if none are
1633     optstop TRUE to return on option change, otherwise change the options
1634     value and continue
1635    
1636     Returns: pointer to the first significant opcode
1637     */
1638    
1639     static const uschar*
1640     first_significant_code(const uschar *code, int *options, int optbit,
1641     BOOL optstop)
1642     {
1643     for (;;)
1644     {
1645     switch ((int)*code)
1646     {
1647     case OP_OPT:
1648     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1649     {
1650     if (optstop) return code;
1651     *options = (int)code[1];
1652     }
1653     code += 2;
1654     break;
1655    
1656     case OP_CREF:
1657     code += 2;
1658     break;
1659    
1660     case OP_ASSERT_NOT:
1661     case OP_ASSERTBACK:
1662     case OP_ASSERTBACK_NOT:
1663     do code += (code[1] << 8) + code[2]; while (*code == OP_ALT);
1664     code += 3;
1665     break;
1666    
1667     default:
1668     return code;
1669     }
1670     }
1671     /* Control never reaches here */
1672     }
1673    
1674    
1675    
1676    
1677     /*************************************************
1678 nigel 3 * Check for anchored expression *
1679     *************************************************/
1680    
1681     /* Try to find out if this is an anchored regular expression. Consider each
1682     alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
1683     all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
1684     it's anchored. However, if this is a multiline pattern, then only OP_SOD
1685     counts, since OP_CIRC can match in the middle.
1686    
1687     A branch is also implicitly anchored if it starts with .* because that will try
1688     the rest of the pattern at all possible matching points, so there is no point
1689     trying them again.
1690    
1691 nigel 23 Arguments:
1692     code points to start of expression (the bracket)
1693     options points to the options setting
1694    
1695     Returns: TRUE or FALSE
1696 nigel 3 */
1697    
1698     static BOOL
1699 nigel 23 is_anchored(register const uschar *code, int *options)
1700 nigel 3 {
1701     do {
1702 nigel 23 const uschar *scode = first_significant_code(code + 3, options,
1703     PCRE_MULTILINE, FALSE);
1704     register int op = *scode;
1705     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
1706     { if (!is_anchored(scode, options)) return FALSE; }
1707 nigel 3 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
1708 nigel 23 { if (scode[1] != OP_ANY) return FALSE; }
1709     else if (op != OP_SOD &&
1710     ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
1711     return FALSE;
1712 nigel 3 code += (code[1] << 8) + code[2];
1713     }
1714     while (*code == OP_ALT);
1715     return TRUE;
1716     }
1717    
1718    
1719    
1720     /*************************************************
1721     * Check for start with \n line expression *
1722     *************************************************/
1723    
1724     /* This is called for multiline expressions to try to find out if every branch
1725     starts with ^ so that "first char" processing can be done to speed things up.
1726    
1727     Argument: points to start of expression (the bracket)
1728     Returns: TRUE or FALSE
1729     */
1730    
1731     static BOOL
1732 nigel 7 is_startline(const uschar *code)
1733 nigel 3 {
1734     do {
1735 nigel 23 const uschar *scode = first_significant_code(code + 3, NULL, 0, FALSE);
1736     register int op = *scode;
1737     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
1738     { if (!is_startline(scode)) return FALSE; }
1739     else if (op != OP_CIRC) return FALSE;
1740 nigel 3 code += (code[1] << 8) + code[2];
1741     }
1742     while (*code == OP_ALT);
1743     return TRUE;
1744     }
1745    
1746    
1747    
1748     /*************************************************
1749     * Check for fixed first char *
1750     *************************************************/
1751    
1752     /* Try to find out if there is a fixed first character. This is called for
1753     unanchored expressions, as it speeds up their processing quite considerably.
1754     Consider each alternative branch. If they all start with the same char, or with
1755     a bracket all of whose alternatives start with the same char (recurse ad lib),
1756     then we return that char, otherwise -1.
1757    
1758 nigel 23 Arguments:
1759     code points to start of expression (the bracket)
1760     options pointer to the options (used to check casing changes)
1761    
1762     Returns: -1 or the fixed first char
1763 nigel 3 */
1764    
1765     static int
1766 nigel 23 find_firstchar(const uschar *code, int *options)
1767 nigel 3 {
1768     register int c = -1;
1769 nigel 23 do {
1770     int d;
1771     const uschar *scode = first_significant_code(code + 3, options,
1772     PCRE_CASELESS, TRUE);
1773     register int op = *scode;
1774 nigel 3
1775 nigel 23 if (op >= OP_BRA) op = OP_BRA;
1776 nigel 3
1777 nigel 23 switch(op)
1778     {
1779     default:
1780     return -1;
1781 nigel 3
1782 nigel 23 case OP_BRA:
1783     case OP_ASSERT:
1784     case OP_ONCE:
1785     case OP_COND:
1786     if ((d = find_firstchar(scode, options)) < 0) return -1;
1787     if (c < 0) c = d; else if (c != d) return -1;
1788     break;
1789 nigel 3
1790 nigel 23 case OP_EXACT: /* Fall through */
1791     scode++;
1792 nigel 3
1793 nigel 23 case OP_CHARS: /* Fall through */
1794     scode++;
1795    
1796     case OP_PLUS:
1797     case OP_MINPLUS:
1798     if (c < 0) c = scode[1]; else if (c != scode[1]) return -1;
1799     break;
1800     }
1801    
1802     code += (code[1] << 8) + code[2];
1803     }
1804 nigel 3 while (*code == OP_ALT);
1805     return c;
1806     }
1807    
1808    
1809    
1810 nigel 23
1811    
1812 nigel 3 /*************************************************
1813     * Compile a Regular Expression *
1814     *************************************************/
1815    
1816     /* This function takes a string and returns a pointer to a block of store
1817     holding a compiled version of the expression.
1818    
1819     Arguments:
1820     pattern the regular expression
1821     options various option bits
1822     errorptr pointer to pointer to error text
1823     erroroffset ptr offset in pattern where error was detected
1824 nigel 25 tables pointer to character tables or NULL
1825 nigel 3
1826     Returns: pointer to compiled data block, or NULL on error,
1827     with errorptr and erroroffset set
1828     */
1829    
1830     pcre *
1831 nigel 7 pcre_compile(const char *pattern, int options, const char **errorptr,
1832 nigel 25 int *erroroffset, const unsigned char *tables)
1833 nigel 3 {
1834     real_pcre *re;
1835     int length = 3; /* For initial BRA plus length */
1836     int runlength;
1837     int c, size;
1838     int bracount = 0;
1839     int top_backref = 0;
1840 nigel 23 int branch_extra = 0;
1841     int branch_newextra;
1842 nigel 7 unsigned int brastackptr = 0;
1843     uschar *code;
1844     const uschar *ptr;
1845 nigel 25 compile_data compile_block;
1846 nigel 23 int brastack[BRASTACK_SIZE];
1847     uschar bralenstack[BRASTACK_SIZE];
1848 nigel 3
1849     #ifdef DEBUG
1850     uschar *code_base, *code_end;
1851     #endif
1852    
1853     /* We can't pass back an error message if errorptr is NULL; I guess the best we
1854     can do is just return NULL. */
1855    
1856     if (errorptr == NULL) return NULL;
1857     *errorptr = NULL;
1858    
1859     /* However, we can give a message for this error */
1860    
1861     if (erroroffset == NULL)
1862     {
1863     *errorptr = ERR16;
1864     return NULL;
1865     }
1866     *erroroffset = 0;
1867    
1868     if ((options & ~PUBLIC_OPTIONS) != 0)
1869     {
1870     *errorptr = ERR17;
1871     return NULL;
1872     }
1873    
1874 nigel 25 /* Set up pointers to the individual character tables */
1875    
1876     if (tables == NULL) tables = pcre_default_tables;
1877     compile_block.lcc = tables + lcc_offset;
1878     compile_block.fcc = tables + fcc_offset;
1879     compile_block.cbits = tables + cbits_offset;
1880     compile_block.ctypes = tables + ctypes_offset;
1881    
1882     /* Reflect pattern for debugging output */
1883    
1884 nigel 9 DPRINTF(("------------------------------------------------------------------\n"));
1885     DPRINTF(("%s\n", pattern));
1886 nigel 3
1887     /* The first thing to do is to make a pass over the pattern to compute the
1888     amount of store required to hold the compiled code. This does not have to be
1889     perfect as long as errors are overestimates. At the same time we can detect any
1890     internal flag settings. Make an attempt to correct for any counted white space
1891     if an "extended" flag setting appears late in the pattern. We can't be so
1892     clever for #-comments. */
1893    
1894 nigel 7 ptr = (const uschar *)(pattern - 1);
1895 nigel 3 while ((c = *(++ptr)) != 0)
1896     {
1897     int min, max;
1898     int class_charcount;
1899    
1900 nigel 23 if ((options & PCRE_EXTENDED) != 0)
1901 nigel 3 {
1902 nigel 25 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
1903 nigel 23 if (c == '#')
1904     {
1905     while ((c = *(++ptr)) != 0 && c != '\n');
1906     continue;
1907     }
1908 nigel 3 }
1909    
1910     switch(c)
1911     {
1912     /* A backslashed item may be an escaped "normal" character or a
1913     character type. For a "normal" character, put the pointers and
1914     character back so that tests for whitespace etc. in the input
1915     are done correctly. */
1916    
1917     case '\\':
1918     {
1919 nigel 7 const uschar *save_ptr = ptr;
1920 nigel 25 c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
1921 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1922     if (c >= 0)
1923     {
1924     ptr = save_ptr;
1925     c = '\\';
1926     goto NORMAL_CHAR;
1927     }
1928     }
1929     length++;
1930    
1931     /* A back reference needs an additional char, plus either one or 5
1932     bytes for a repeat. We also need to keep the value of the highest
1933     back reference. */
1934    
1935     if (c <= -ESC_REF)
1936     {
1937     int refnum = -c - ESC_REF;
1938     if (refnum > top_backref) top_backref = refnum;
1939     length++; /* For single back reference */
1940 nigel 25 if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
1941 nigel 3 {
1942 nigel 25 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
1943 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1944     if ((min == 0 && (max == 1 || max == -1)) ||
1945     (min == 1 && max == -1))
1946     length++;
1947     else length += 5;
1948     if (ptr[1] == '?') ptr++;
1949     }
1950     }
1951     continue;
1952    
1953     case '^':
1954     case '.':
1955     case '$':
1956     case '*': /* These repeats won't be after brackets; */
1957     case '+': /* those are handled separately */
1958     case '?':
1959     length++;
1960     continue;
1961    
1962     /* This covers the cases of repeats after a single char, metachar, class,
1963     or back reference. */
1964    
1965     case '{':
1966 nigel 25 if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
1967     ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
1968 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1969     if ((min == 0 && (max == 1 || max == -1)) ||
1970     (min == 1 && max == -1))
1971     length++;
1972     else
1973     {
1974     length--; /* Uncount the original char or metachar */
1975     if (min == 1) length++; else if (min > 0) length += 4;
1976     if (max > 0) length += 4; else length += 2;
1977     }
1978     if (ptr[1] == '?') ptr++;
1979     continue;
1980    
1981 nigel 23 /* An alternation contains an offset to the next branch or ket. If any ims
1982     options changed in the previous branch(es), and/or if we are in a
1983     lookbehind assertion, extra space will be needed at the start of the
1984     branch. This is handled by branch_extra. */
1985    
1986 nigel 3 case '|':
1987 nigel 23 length += 3 + branch_extra;
1988 nigel 3 continue;
1989    
1990     /* A character class uses 33 characters. Don't worry about character types
1991     that aren't allowed in classes - they'll get picked up during the compile.
1992     A character class that contains only one character uses 2 or 3 bytes,
1993     depending on whether it is negated or not. Notice this where we can. */
1994    
1995     case '[':
1996     class_charcount = 0;
1997     if (*(++ptr) == '^') ptr++;
1998     do
1999     {
2000     if (*ptr == '\\')
2001     {
2002 nigel 25 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
2003     &compile_block);
2004 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2005 nigel 9 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2006 nigel 3 }
2007     else class_charcount++;
2008     ptr++;
2009     }
2010     while (*ptr != 0 && *ptr != ']');
2011    
2012     /* Repeats for negated single chars are handled by the general code */
2013    
2014     if (class_charcount == 1) length += 3; else
2015     {
2016     length += 33;
2017    
2018     /* A repeat needs either 1 or 5 bytes. */
2019    
2020 nigel 25 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2021 nigel 3 {
2022 nigel 25 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2023 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2024     if ((min == 0 && (max == 1 || max == -1)) ||
2025     (min == 1 && max == -1))
2026     length++;
2027     else length += 5;
2028     if (ptr[1] == '?') ptr++;
2029     }
2030     }
2031     continue;
2032    
2033     /* Brackets may be genuine groups or special things */
2034    
2035     case '(':
2036 nigel 23 branch_newextra = 0;
2037 nigel 3
2038     /* Handle special forms of bracket, which all start (? */
2039    
2040 nigel 23 if (ptr[1] == '?')
2041 nigel 3 {
2042 nigel 23 int set, unset;
2043     int *optset;
2044    
2045     switch (c = ptr[2])
2046 nigel 3 {
2047 nigel 23 /* Skip over comments entirely */
2048     case '#':
2049     ptr += 3;
2050     while (*ptr != 0 && *ptr != ')') ptr++;
2051     if (*ptr == 0)
2052     {
2053     *errorptr = ERR18;
2054     goto PCRE_ERROR_RETURN;
2055     }
2056     continue;
2057 nigel 3
2058 nigel 23 /* Non-referencing groups and lookaheads just move the pointer on, and
2059     then behave like a non-special bracket, except that they don't increment
2060     the count of extracting brackets. Ditto for the "once only" bracket,
2061     which is in Perl from version 5.005. */
2062 nigel 3
2063 nigel 23 case ':':
2064     case '=':
2065     case '!':
2066     case '>':
2067 nigel 3 ptr += 2;
2068     break;
2069    
2070 nigel 23 /* Lookbehinds are in Perl from version 5.005 */
2071 nigel 3
2072 nigel 23 case '<':
2073     if (ptr[3] == '=' || ptr[3] == '!')
2074 nigel 3 {
2075 nigel 23 ptr += 3;
2076     branch_newextra = 3;
2077     length += 3; /* For the first branch */
2078     break;
2079 nigel 3 }
2080 nigel 23 *errorptr = ERR24;
2081     goto PCRE_ERROR_RETURN;
2082    
2083     /* Conditionals are in Perl from version 5.005. The bracket must either
2084     be followed by a number (for bracket reference) or by an assertion
2085     group. */
2086    
2087     case '(':
2088 nigel 25 if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
2089 nigel 3 {
2090 nigel 23 ptr += 4;
2091     length += 2;
2092 nigel 25 while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
2093 nigel 23 if (*ptr != ')')
2094     {
2095     *errorptr = ERR26;
2096     goto PCRE_ERROR_RETURN;
2097     }
2098 nigel 3 }
2099 nigel 23 else /* An assertion must follow */
2100 nigel 3 {
2101 nigel 23 ptr++; /* Can treat like ':' as far as spacing is concerned */
2102    
2103     if (ptr[2] != '?' || strchr("=!<", ptr[3]) == NULL)
2104     {
2105     ptr += 2; /* To get right offset in message */
2106     *errorptr = ERR28;
2107     goto PCRE_ERROR_RETURN;
2108     }
2109 nigel 3 }
2110 nigel 23 break;
2111    
2112     /* Else loop checking valid options until ) is met. Anything else is an
2113     error. If we are without any brackets, i.e. at top level, the settings
2114     act as if specified in the options, so massage the options immediately.
2115     This is for backward compatibility with Perl 5.004. */
2116    
2117     default:
2118     set = unset = 0;
2119     optset = &set;
2120     ptr += 2;
2121    
2122     for (;; ptr++)
2123 nigel 3 {
2124 nigel 23 c = *ptr;
2125     switch (c)
2126     {
2127     case 'i':
2128     *optset |= PCRE_CASELESS;
2129     continue;
2130    
2131     case 'm':
2132     *optset |= PCRE_MULTILINE;
2133     continue;
2134    
2135     case 's':
2136     *optset |= PCRE_DOTALL;
2137     continue;
2138    
2139     case 'x':
2140     *optset |= PCRE_EXTENDED;
2141     continue;
2142    
2143     case 'X':
2144     *optset |= PCRE_EXTRA;
2145     continue;
2146    
2147     case 'U':
2148     *optset |= PCRE_UNGREEDY;
2149     continue;
2150    
2151     case '-':
2152     optset = &unset;
2153     continue;
2154    
2155     /* A termination by ')' indicates an options-setting-only item;
2156     this is global at top level; otherwise nothing is done here and
2157     it is handled during the compiling process on a per-bracket-group
2158     basis. */
2159    
2160     case ')':
2161     if (brastackptr == 0)
2162     {
2163     options = (options | set) & (~unset);
2164     set = unset = 0; /* To save length */
2165     }
2166     /* Fall through */
2167    
2168     /* A termination by ':' indicates the start of a nested group with
2169     the given options set. This is again handled at compile time, but
2170     we must allow for compiled space if any of the ims options are
2171     set. We also have to allow for resetting space at the end of
2172     the group, which is why 4 is added to the length and not just 2.
2173     If there are several changes of options within the same group, this
2174     will lead to an over-estimate on the length, but this shouldn't
2175     matter very much. We also have to allow for resetting options at
2176     the start of any alternations, which we do by setting
2177     branch_newextra to 2. */
2178    
2179     case ':':
2180     if (((set|unset) & PCRE_IMS) != 0)
2181     {
2182     length += 4;
2183     branch_newextra = 2;
2184     }
2185     goto END_OPTIONS;
2186    
2187     /* Unrecognized option character */
2188    
2189     default:
2190     *errorptr = ERR12;
2191     goto PCRE_ERROR_RETURN;
2192     }
2193 nigel 3 }
2194 nigel 23
2195     /* If we hit a closing bracket, that's it - this is a freestanding
2196     option-setting. We need to ensure that branch_extra is updated if
2197     necessary. The only values branch_newextra can have here are 0 or 2.
2198     If the value is 2, then branch_extra must either be 2 or 5, depending
2199     on whether this is a lookbehind group or not. */
2200    
2201     END_OPTIONS:
2202     if (c == ')')
2203 nigel 19 {
2204 nigel 23 if (branch_newextra == 2 && (branch_extra == 0 || branch_extra == 3))
2205     branch_extra += branch_newextra;
2206 nigel 19 continue;
2207     }
2208 nigel 3
2209 nigel 23 /* If options were terminated by ':' control comes here. Fall through
2210     to handle the group below. */
2211 nigel 3 }
2212     }
2213    
2214     /* Extracting brackets must be counted so we can process escapes in a
2215     Perlish way. */
2216    
2217     else bracount++;
2218    
2219     /* Non-special forms of bracket. Save length for computing whole length
2220 nigel 23 at end if there's a repeat that requires duplication of the group. Also
2221     save the current value of branch_extra, and start the new group with
2222     the new value. If non-zero, this will either be 2 for a (?imsx: group, or 3
2223     for a lookbehind assertion. */
2224 nigel 3
2225     if (brastackptr >= sizeof(brastack)/sizeof(int))
2226     {
2227     *errorptr = ERR19;
2228     goto PCRE_ERROR_RETURN;
2229     }
2230    
2231 nigel 23 bralenstack[brastackptr] = branch_extra;
2232     branch_extra = branch_newextra;
2233    
2234 nigel 3 brastack[brastackptr++] = length;
2235     length += 3;
2236     continue;
2237    
2238     /* Handle ket. Look for subsequent max/min; for certain sets of values we
2239 nigel 9 have to replicate this bracket up to that many times. If brastackptr is
2240     0 this is an unmatched bracket which will generate an error, but take care
2241 nigel 23 not to try to access brastack[-1] when computing the length and restoring
2242     the branch_extra value. */
2243 nigel 3
2244     case ')':
2245     length += 3;
2246     {
2247 nigel 9 int minval = 1;
2248     int maxval = 1;
2249 nigel 23 int duplength;
2250 nigel 3
2251 nigel 23 if (brastackptr > 0)
2252     {
2253     duplength = length - brastack[--brastackptr];
2254     branch_extra = bralenstack[brastackptr];
2255     }
2256     else duplength = 0;
2257    
2258 nigel 3 /* Leave ptr at the final char; for read_repeat_counts this happens
2259     automatically; for the others we need an increment. */
2260    
2261 nigel 25 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
2262 nigel 3 {
2263 nigel 25 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr,
2264     &compile_block);
2265 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2266     }
2267 nigel 9 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2268     else if (c == '+') { maxval = -1; ptr++; }
2269     else if (c == '?') { minval = 0; ptr++; }
2270 nigel 3
2271 nigel 9 /* If there is a minimum > 1 we have to replicate up to minval-1 times;
2272     if there is a limited maximum we have to replicate up to maxval-1 times
2273     and allow for a BRAZERO item before each optional copy, as we also have
2274     to do before the first copy if the minimum is zero. */
2275 nigel 3
2276 nigel 9 if (minval == 0) length++;
2277     else if (minval > 1) length += (minval - 1) * duplength;
2278     if (maxval > minval) length += (maxval - minval) * (duplength + 1);
2279 nigel 3 }
2280     continue;
2281    
2282     /* Non-special character. For a run of such characters the length required
2283     is the number of characters + 2, except that the maximum run length is 255.
2284     We won't get a skipped space or a non-data escape or the start of a #
2285     comment as the first character, so the length can't be zero. */
2286    
2287     NORMAL_CHAR:
2288     default:
2289     length += 2;
2290     runlength = 0;
2291     do
2292     {
2293 nigel 23 if ((options & PCRE_EXTENDED) != 0)
2294 nigel 3 {
2295 nigel 25 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2296 nigel 23 if (c == '#')
2297     {
2298     while ((c = *(++ptr)) != 0 && c != '\n');
2299     continue;
2300     }
2301 nigel 3 }
2302    
2303     /* Backslash may introduce a data char or a metacharacter; stop the
2304     string before the latter. */
2305    
2306     if (c == '\\')
2307     {
2308 nigel 7 const uschar *saveptr = ptr;
2309 nigel 25 c = check_escape(&ptr, errorptr, bracount, options, FALSE,
2310     &compile_block);
2311 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2312     if (c < 0) { ptr = saveptr; break; }
2313     }
2314    
2315     /* Ordinary character or single-char escape */
2316    
2317     runlength++;
2318     }
2319    
2320     /* This "while" is the end of the "do" above. */
2321    
2322 nigel 25 while (runlength < 255 &&
2323     (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
2324 nigel 3
2325     ptr--;
2326     length += runlength;
2327     continue;
2328     }
2329     }
2330    
2331     length += 4; /* For final KET and END */
2332    
2333     if (length > 65539)
2334     {
2335     *errorptr = ERR20;
2336     return NULL;
2337     }
2338    
2339     /* Compute the size of data block needed and get it, either from malloc or
2340 nigel 9 externally provided function. We specify "code[0]" in the offsetof() expression
2341     rather than just "code", because it has been reported that one broken compiler
2342     fails on "code" because it is also an independent variable. It should make no
2343     difference to the value of the offsetof(). */
2344 nigel 3
2345 nigel 9 size = length + offsetof(real_pcre, code[0]);
2346 nigel 3 re = (real_pcre *)(pcre_malloc)(size);
2347    
2348     if (re == NULL)
2349     {
2350     *errorptr = ERR21;
2351     return NULL;
2352     }
2353    
2354 nigel 9 /* Put in the magic number and the options. */
2355    
2356 nigel 3 re->magic_number = MAGIC_NUMBER;
2357     re->options = options;
2358 nigel 25 re->tables = tables;
2359 nigel 3
2360     /* Set up a starting, non-extracting bracket, then compile the expression. On
2361     error, *errorptr will be set non-NULL, so we don't need to look at the result
2362     of the function here. */
2363    
2364 nigel 7 ptr = (const uschar *)pattern;
2365 nigel 3 code = re->code;
2366     *code = OP_BRA;
2367     bracount = 0;
2368 nigel 25 (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,
2369     &compile_block);
2370 nigel 3 re->top_bracket = bracount;
2371     re->top_backref = top_backref;
2372    
2373     /* If not reached end of pattern on success, there's an excess bracket. */
2374    
2375     if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
2376    
2377     /* Fill in the terminating state and check for disastrous overflow, but
2378     if debugging, leave the test till after things are printed out. */
2379    
2380     *code++ = OP_END;
2381    
2382     #ifndef DEBUG
2383     if (code - re->code > length) *errorptr = ERR23;
2384     #endif
2385    
2386 nigel 23 /* Give an error if there's back reference to a non-existent capturing
2387     subpattern. */
2388    
2389     if (top_backref > re->top_bracket) *errorptr = ERR15;
2390    
2391 nigel 3 /* Failed to compile */
2392    
2393     if (*errorptr != NULL)
2394     {
2395     (pcre_free)(re);
2396     PCRE_ERROR_RETURN:
2397 nigel 7 *erroroffset = ptr - (const uschar *)pattern;
2398 nigel 3 return NULL;
2399     }
2400    
2401     /* If the anchored option was not passed, set flag if we can determine that it
2402     is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
2403     we can determine what the first character has to be, because that speeds up
2404     unanchored matches no end. In the case of multiline matches, an alternative is
2405     to set the PCRE_STARTLINE flag if all branches start with ^. */
2406    
2407     if ((options & PCRE_ANCHORED) == 0)
2408     {
2409 nigel 23 int temp_options = options;
2410     if (is_anchored(re->code, &temp_options))
2411 nigel 3 re->options |= PCRE_ANCHORED;
2412     else
2413     {
2414 nigel 23 int ch = find_firstchar(re->code, &temp_options);
2415 nigel 9 if (ch >= 0)
2416 nigel 3 {
2417 nigel 9 re->first_char = ch;
2418 nigel 3 re->options |= PCRE_FIRSTSET;
2419     }
2420     else if (is_startline(re->code))
2421     re->options |= PCRE_STARTLINE;
2422     }
2423     }
2424    
2425     /* Print out the compiled data for debugging */
2426    
2427     #ifdef DEBUG
2428    
2429 nigel 23 printf("Length = %d top_bracket = %d top_backref = %d\n",
2430 nigel 3 length, re->top_bracket, re->top_backref);
2431    
2432     if (re->options != 0)
2433     {
2434 nigel 19 printf("%s%s%s%s%s%s%s%s\n",
2435 nigel 3 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2436     ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2437     ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2438     ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2439     ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
2440     ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
2441 nigel 19 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
2442     ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
2443 nigel 3 }
2444    
2445     if ((re->options & PCRE_FIRSTSET) != 0)
2446     {
2447     if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
2448     else printf("First char = \\x%02x\n", re->first_char);
2449     }
2450    
2451     code_end = code;
2452     code_base = code = re->code;
2453    
2454     while (code < code_end)
2455     {
2456     int charlength;
2457    
2458     printf("%3d ", code - code_base);
2459    
2460     if (*code >= OP_BRA)
2461     {
2462     printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
2463     code += 2;
2464     }
2465    
2466     else switch(*code)
2467     {
2468 nigel 23 case OP_OPT:
2469     printf(" %.2x %s", code[1], OP_names[*code]);
2470     code++;
2471     break;
2472    
2473     case OP_COND:
2474     printf("%3d Cond", (code[1] << 8) + code[2]);
2475     code += 2;
2476     break;
2477    
2478     case OP_CREF:
2479     printf(" %.2d %s", code[1], OP_names[*code]);
2480     code++;
2481     break;
2482    
2483 nigel 3 case OP_CHARS:
2484     charlength = *(++code);
2485     printf("%3d ", charlength);
2486     while (charlength-- > 0)
2487     if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2488     break;
2489    
2490     case OP_KETRMAX:
2491     case OP_KETRMIN:
2492     case OP_ALT:
2493     case OP_KET:
2494     case OP_ASSERT:
2495     case OP_ASSERT_NOT:
2496 nigel 23 case OP_ASSERTBACK:
2497     case OP_ASSERTBACK_NOT:
2498 nigel 3 case OP_ONCE:
2499     printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2500     code += 2;
2501     break;
2502    
2503 nigel 23 case OP_REVERSE:
2504     printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2505     code += 2;
2506     break;
2507    
2508 nigel 3 case OP_STAR:
2509     case OP_MINSTAR:
2510     case OP_PLUS:
2511     case OP_MINPLUS:
2512     case OP_QUERY:
2513     case OP_MINQUERY:
2514     case OP_TYPESTAR:
2515     case OP_TYPEMINSTAR:
2516     case OP_TYPEPLUS:
2517     case OP_TYPEMINPLUS:
2518     case OP_TYPEQUERY:
2519     case OP_TYPEMINQUERY:
2520     if (*code >= OP_TYPESTAR)
2521     printf(" %s", OP_names[code[1]]);
2522     else if (isprint(c = code[1])) printf(" %c", c);
2523     else printf(" \\x%02x", c);
2524     printf("%s", OP_names[*code++]);
2525     break;
2526    
2527     case OP_EXACT:
2528     case OP_UPTO:
2529     case OP_MINUPTO:
2530     if (isprint(c = code[3])) printf(" %c{", c);
2531     else printf(" \\x%02x{", c);
2532 nigel 11 if (*code != OP_EXACT) printf("0,");
2533 nigel 3 printf("%d}", (code[1] << 8) + code[2]);
2534     if (*code == OP_MINUPTO) printf("?");
2535     code += 3;
2536     break;
2537    
2538     case OP_TYPEEXACT:
2539     case OP_TYPEUPTO:
2540     case OP_TYPEMINUPTO:
2541     printf(" %s{", OP_names[code[3]]);
2542     if (*code != OP_TYPEEXACT) printf(",");
2543     printf("%d}", (code[1] << 8) + code[2]);
2544     if (*code == OP_TYPEMINUPTO) printf("?");
2545     code += 3;
2546     break;
2547    
2548     case OP_NOT:
2549     if (isprint(c = *(++code))) printf(" [^%c]", c);
2550     else printf(" [^\\x%02x]", c);
2551     break;
2552    
2553     case OP_NOTSTAR:
2554     case OP_NOTMINSTAR:
2555     case OP_NOTPLUS:
2556     case OP_NOTMINPLUS:
2557     case OP_NOTQUERY:
2558     case OP_NOTMINQUERY:
2559     if (isprint(c = code[1])) printf(" [^%c]", c);
2560     else printf(" [^\\x%02x]", c);
2561     printf("%s", OP_names[*code++]);
2562     break;
2563    
2564     case OP_NOTEXACT:
2565     case OP_NOTUPTO:
2566     case OP_NOTMINUPTO:
2567     if (isprint(c = code[3])) printf(" [^%c]{", c);
2568     else printf(" [^\\x%02x]{", c);
2569     if (*code != OP_NOTEXACT) printf(",");
2570     printf("%d}", (code[1] << 8) + code[2]);
2571     if (*code == OP_NOTMINUPTO) printf("?");
2572     code += 3;
2573     break;
2574    
2575     case OP_REF:
2576     printf(" \\%d", *(++code));
2577 nigel 9 code ++;
2578     goto CLASS_REF_REPEAT;
2579 nigel 3
2580     case OP_CLASS:
2581     {
2582     int i, min, max;
2583 nigel 23 code++;
2584     printf(" [");
2585 nigel 3
2586     for (i = 0; i < 256; i++)
2587     {
2588     if ((code[i/8] & (1 << (i&7))) != 0)
2589     {
2590     int j;
2591     for (j = i+1; j < 256; j++)
2592     if ((code[j/8] & (1 << (j&7))) == 0) break;
2593     if (i == '-' || i == ']') printf("\\");
2594     if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2595     if (--j > i)
2596     {
2597     printf("-");
2598     if (j == '-' || j == ']') printf("\\");
2599     if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2600     }
2601     i = j;
2602     }
2603     }
2604     printf("]");
2605     code += 32;
2606    
2607 nigel 9 CLASS_REF_REPEAT:
2608    
2609 nigel 3 switch(*code)
2610     {
2611     case OP_CRSTAR:
2612     case OP_CRMINSTAR:
2613     case OP_CRPLUS:
2614     case OP_CRMINPLUS:
2615     case OP_CRQUERY:
2616     case OP_CRMINQUERY:
2617     printf("%s", OP_names[*code]);
2618     break;
2619    
2620     case OP_CRRANGE:
2621     case OP_CRMINRANGE:
2622     min = (code[1] << 8) + code[2];
2623     max = (code[3] << 8) + code[4];
2624     if (max == 0) printf("{%d,}", min);
2625     else printf("{%d,%d}", min, max);
2626     if (*code == OP_CRMINRANGE) printf("?");
2627     code += 4;
2628     break;
2629    
2630     default:
2631     code--;
2632     }
2633     }
2634     break;
2635    
2636     /* Anything else is just a one-node item */
2637    
2638     default:
2639     printf(" %s", OP_names[*code]);
2640     break;
2641     }
2642    
2643     code++;
2644     printf("\n");
2645     }
2646     printf("------------------------------------------------------------------\n");
2647    
2648     /* This check is done here in the debugging case so that the code that
2649     was compiled can be seen. */
2650    
2651     if (code - re->code > length)
2652     {
2653     *errorptr = ERR23;
2654     (pcre_free)(re);
2655     *erroroffset = ptr - (uschar *)pattern;
2656     return NULL;
2657     }
2658     #endif
2659    
2660     return (pcre *)re;
2661     }
2662    
2663    
2664    
2665     /*************************************************
2666     * Match a back-reference *
2667     *************************************************/
2668    
2669 nigel 23 /* If a back reference hasn't been set, the length that is passed is greater
2670     than the number of characters left in the string, so the match fails.
2671 nigel 3
2672     Arguments:
2673 nigel 23 offset index into the offset vector
2674 nigel 3 eptr points into the subject
2675     length length to be matched
2676     md points to match data block
2677 nigel 23 ims the ims flags
2678 nigel 3
2679     Returns: TRUE if matched
2680     */
2681    
2682     static BOOL
2683 nigel 23 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
2684     int ims)
2685 nigel 3 {
2686 nigel 23 const uschar *p = md->start_subject + md->offset_vector[offset];
2687 nigel 3
2688     #ifdef DEBUG
2689     if (eptr >= md->end_subject)
2690     printf("matching subject <null>");
2691     else
2692     {
2693     printf("matching subject ");
2694     pchars(eptr, length, TRUE, md);
2695     }
2696     printf(" against backref ");
2697     pchars(p, length, FALSE, md);
2698     printf("\n");
2699     #endif
2700    
2701     /* Always fail if not enough characters left */
2702    
2703 nigel 23 if (length > md->end_subject - eptr) return FALSE;
2704 nigel 3
2705     /* Separate the caselesss case for speed */
2706    
2707 nigel 23 if ((ims & PCRE_CASELESS) != 0)
2708 nigel 25 {
2709     while (length-- > 0)
2710     if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
2711     }
2712 nigel 3 else
2713     { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
2714    
2715     return TRUE;
2716     }
2717    
2718    
2719    
2720     /*************************************************
2721     * Match from current position *
2722     *************************************************/
2723    
2724 nigel 23 /* On entry ecode points to the first opcode, and eptr to the first character
2725     in the subject string, while eptrb holds the value of eptr at the start of the
2726     last bracketed group - used for breaking infinite loops matching zero-length
2727     strings.
2728 nigel 3
2729     Arguments:
2730     eptr pointer in subject
2731     ecode position in code
2732     offset_top current top pointer
2733     md pointer to "static" info for the match
2734 nigel 23 ims current /i, /m, and /s options
2735     condassert TRUE if called to check a condition assertion
2736     eptrb eptr at start of last bracket
2737 nigel 3
2738     Returns: TRUE if matched
2739     */
2740    
2741     static BOOL
2742 nigel 23 match(register const uschar *eptr, register const uschar *ecode,
2743     int offset_top, match_data *md, int ims, BOOL condassert, const uschar *eptrb)
2744 nigel 3 {
2745 nigel 23 int original_ims = ims; /* Save for resetting on ')' */
2746    
2747 nigel 3 for (;;)
2748     {
2749 nigel 23 int op = (int)*ecode;
2750 nigel 3 int min, max, ctype;
2751     register int i;
2752     register int c;
2753 nigel 7 BOOL minimize = FALSE;
2754 nigel 3
2755 nigel 23 /* Opening capturing bracket. If there is space in the offset vector, save
2756     the current subject position in the working slot at the top of the vector. We
2757     mustn't change the current values of the data slot, because they may be set
2758     from a previous iteration of this group, and be referred to by a reference
2759     inside the group.
2760 nigel 3
2761 nigel 23 If the bracket fails to match, we need to restore this value and also the
2762     values of the final offsets, in case they were set by a previous iteration of
2763     the same bracket.
2764    
2765     If there isn't enough space in the offset vector, treat this as if it were a
2766     non-capturing bracket. Don't worry about setting the flag for the error case
2767     here; that is handled in the code for KET. */
2768    
2769     if (op > OP_BRA)
2770 nigel 3 {
2771 nigel 23 int number = op - OP_BRA;
2772     int offset = number << 1;
2773 nigel 3
2774 nigel 23 DPRINTF(("start bracket %d\n", number));
2775 nigel 3
2776 nigel 23 if (offset < md->offset_max)
2777 nigel 3 {
2778 nigel 23 int save_offset1 = md->offset_vector[offset];
2779     int save_offset2 = md->offset_vector[offset+1];
2780     int save_offset3 = md->offset_vector[md->offset_end - number];
2781 nigel 3
2782 nigel 23 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
2783     md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
2784    
2785     do
2786     {
2787     if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
2788     ecode += (ecode[1] << 8) + ecode[2];
2789     }
2790     while (*ecode == OP_ALT);
2791    
2792     DPRINTF(("bracket %d failed\n", number));
2793    
2794     md->offset_vector[offset] = save_offset1;
2795     md->offset_vector[offset+1] = save_offset2;
2796     md->offset_vector[md->offset_end - number] = save_offset3;
2797     return FALSE;
2798 nigel 3 }
2799    
2800 nigel 23 /* Insufficient room for saving captured contents */
2801 nigel 3
2802 nigel 23 else op = OP_BRA;
2803     }
2804    
2805     /* Other types of node can be handled by a switch */
2806    
2807     switch(op)
2808     {
2809     case OP_BRA: /* Non-capturing bracket: optimized */
2810     DPRINTF(("start bracket 0\n"));
2811 nigel 3 do
2812     {
2813 nigel 23 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
2814 nigel 3 ecode += (ecode[1] << 8) + ecode[2];
2815     }
2816     while (*ecode == OP_ALT);
2817 nigel 23 DPRINTF(("bracket 0 failed\n"));
2818     return FALSE;
2819 nigel 3
2820 nigel 23 /* Conditional group: compilation checked that there are no more than
2821     two branches. If the condition is false, skipping the first branch takes us
2822     past the end if there is only one branch, but that's OK because that is
2823     exactly what going to the ket would do. */
2824 nigel 3
2825 nigel 23 case OP_COND:
2826     if (ecode[3] == OP_CREF) /* Condition is extraction test */
2827 nigel 3 {
2828 nigel 23 int offset = ecode[4] << 1; /* Doubled reference number */
2829     return match(eptr,
2830     ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
2831     5 : 3 + (ecode[1] << 8) + ecode[2]),
2832     offset_top, md, ims, FALSE, eptr);
2833 nigel 3 }
2834    
2835 nigel 23 /* The condition is an assertion. Call match() to evaluate it - setting
2836     the final argument TRUE causes it to stop at the end of an assertion. */
2837 nigel 3
2838 nigel 23 else
2839     {
2840     if (match(eptr, ecode+3, offset_top, md, ims, TRUE, NULL))
2841     {
2842     ecode += 3 + (ecode[4] << 8) + ecode[5];
2843     while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
2844     }
2845     else ecode += (ecode[1] << 8) + ecode[2];
2846     return match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr);
2847     }
2848     /* Control never reaches here */
2849 nigel 3
2850 nigel 23 /* Skip over conditional reference data if encountered (should not be) */
2851    
2852     case OP_CREF:
2853     ecode += 2;
2854     break;
2855    
2856     /* End of the pattern */
2857    
2858 nigel 3 case OP_END:
2859     md->end_match_ptr = eptr; /* Record where we ended */
2860     md->end_offset_top = offset_top; /* and how many extracts were taken */
2861     return TRUE;
2862    
2863 nigel 23 /* Change option settings */
2864 nigel 3
2865 nigel 23 case OP_OPT:
2866     ims = ecode[1];
2867     ecode += 2;
2868     DPRINTF(("ims set to %02x\n", ims));
2869     break;
2870 nigel 3
2871     /* Assertion brackets. Check the alternative branches in turn - the
2872     matching won't pass the KET for an assertion. If any one branch matches,
2873 nigel 23 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
2874     start of each branch to move the current point backwards, so the code at
2875     this level is identical to the lookahead case. */
2876 nigel 3
2877     case OP_ASSERT:
2878 nigel 23 case OP_ASSERTBACK:
2879 nigel 3 do
2880     {
2881 nigel 23 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) break;
2882 nigel 3 ecode += (ecode[1] << 8) + ecode[2];
2883     }
2884     while (*ecode == OP_ALT);
2885     if (*ecode == OP_KET) return FALSE;
2886    
2887 nigel 23 /* If checking an assertion for a condition, return TRUE. */
2888    
2889     if (condassert) return TRUE;
2890    
2891 nigel 3 /* Continue from after the assertion, updating the offsets high water
2892     mark, since extracts may have been taken during the assertion. */
2893    
2894     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2895     ecode += 3;
2896     offset_top = md->end_offset_top;
2897     continue;
2898    
2899     /* Negative assertion: all branches must fail to match */
2900    
2901     case OP_ASSERT_NOT:
2902 nigel 23 case OP_ASSERTBACK_NOT:
2903 nigel 3 do
2904     {
2905 nigel 23 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) return FALSE;
2906 nigel 3 ecode += (ecode[1] << 8) + ecode[2];
2907     }
2908     while (*ecode == OP_ALT);
2909 nigel 23
2910     if (condassert) return TRUE;
2911 nigel 3 ecode += 3;
2912     continue;
2913    
2914 nigel 23 /* Move the subject pointer back. This occurs only at the start of
2915     each branch of a lookbehind assertion. If we are too close to the start to
2916     move back, this match function fails. */
2917    
2918     case OP_REVERSE:
2919     eptr -= (ecode[1] << 8) + ecode[2];
2920     if (eptr < md->start_subject) return FALSE;
2921     ecode += 3;
2922     break;
2923    
2924    
2925 nigel 3 /* "Once" brackets are like assertion brackets except that after a match,
2926     the point in the subject string is not moved back. Thus there can never be
2927 nigel 5 a move back into the brackets. Check the alternative branches in turn - the
2928 nigel 3 matching won't pass the KET for this kind of subpattern. If any one branch
2929 nigel 23 matches, we carry on as at the end of a normal bracket, leaving the subject
2930     pointer. */
2931 nigel 3
2932     case OP_ONCE:
2933     {
2934 nigel 23 const uschar *prev = ecode;
2935 nigel 3
2936 nigel 23 do
2937     {
2938     if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) break;
2939     ecode += (ecode[1] << 8) + ecode[2];
2940     }
2941     while (*ecode == OP_ALT);
2942 nigel 3
2943 nigel 23 /* If hit the end of the group (which could be repeated), fail */
2944 nigel 3
2945 nigel 23 if (*ecode != OP_ONCE && *ecode != OP_ALT) return FALSE;
2946    
2947     /* Continue as from after the assertion, updating the offsets high water
2948     mark, since extracts may have been taken. */
2949    
2950     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2951    
2952     offset_top = md->end_offset_top;
2953     eptr = md->end_match_ptr;
2954    
2955     /* For a non-repeating ket, just continue at this level. This also
2956     happens for a repeating ket if no characters were matched in the group.
2957     This is the forcible breaking of infinite loops as implemented in Perl
2958     5.005. If there is an options reset, it will get obeyed in the normal
2959     course of events. */
2960    
2961     if (*ecode == OP_KET || eptr == eptrb)
2962     {
2963     ecode += 3;
2964     break;
2965     }
2966    
2967     /* The repeating kets try the rest of the pattern or restart from the
2968     preceding bracket, in the appropriate order. We need to reset any options
2969     that changed within the bracket before re-running it, so check the next
2970     opcode. */
2971    
2972     if (ecode[3] == OP_OPT)
2973     {
2974     ims = (ims & ~PCRE_IMS) | ecode[4];
2975     DPRINTF(("ims set to %02x at group repeat\n", ims));
2976     }
2977    
2978     if (*ecode == OP_KETRMIN)
2979     {
2980     if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||
2981     match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;
2982     }
2983     else /* OP_KETRMAX */
2984     {
2985     if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||
2986     match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
2987     }
2988     }
2989     return FALSE;
2990    
2991 nigel 3 /* An alternation is the end of a branch; scan along to find the end of the
2992     bracketed group and go to there. */
2993    
2994     case OP_ALT:
2995     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2996     break;
2997    
2998     /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
2999     that it may occur zero times. It may repeat infinitely, or not at all -
3000     i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3001     repeat limits are compiled as a number of copies, with the optional ones
3002     preceded by BRAZERO or BRAMINZERO. */
3003    
3004     case OP_BRAZERO:
3005     {
3006 nigel 7 const uschar *next = ecode+1;
3007 nigel 23 if (match(eptr, next, offset_top, md, ims, FALSE, eptr)) return TRUE;
3008 nigel 3 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3009     ecode = next + 3;
3010     }
3011     break;
3012    
3013     case OP_BRAMINZERO:
3014     {
3015 nigel 7 const uschar *next = ecode+1;
3016 nigel 3 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3017 nigel 23 if (match(eptr, next+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3018 nigel 3 ecode++;
3019     }
3020 nigel 23 break;
3021 nigel 3
3022     /* End of a group, repeated or non-repeating. If we are at the end of
3023     an assertion "group", stop matching and return TRUE, but record the
3024 nigel 23 current high water mark for use by positive assertions. Do this also
3025     for the "once" (not-backup up) groups. */
3026 nigel 3
3027     case OP_KET:
3028     case OP_KETRMIN:
3029     case OP_KETRMAX:
3030     {
3031 nigel 7 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3032 nigel 3
3033 nigel 23 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
3034     *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
3035     *prev == OP_ONCE)
3036 nigel 3 {
3037     md->end_match_ptr = eptr; /* For ONCE */
3038     md->end_offset_top = offset_top;
3039     return TRUE;
3040     }
3041    
3042 nigel 23 /* In all other cases except a conditional group we have to check the
3043     group number back at the start and if necessary complete handling an
3044     extraction by setting the offsets and bumping the high water mark. */
3045 nigel 3
3046 nigel 23 if (*prev != OP_COND)
3047     {
3048     int number = *prev - OP_BRA;
3049     int offset = number << 1;
3050 nigel 3
3051 nigel 23 DPRINTF(("end bracket %d\n", number));
3052 nigel 3
3053 nigel 23 if (number > 0)
3054 nigel 3 {
3055 nigel 23 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
3056     {
3057     md->offset_vector[offset] =
3058     md->offset_vector[md->offset_end - number];
3059     md->offset_vector[offset+1] = eptr - md->start_subject;
3060     if (offset_top <= offset) offset_top = offset + 2;
3061     }
3062 nigel 3 }
3063     }
3064    
3065 nigel 23 /* Reset the value of the ims flags, in case they got changed during
3066     the group. */
3067 nigel 3
3068 nigel 23 ims = original_ims;
3069     DPRINTF(("ims reset to %02x\n", ims));
3070    
3071     /* For a non-repeating ket, just continue at this level. This also
3072     happens for a repeating ket if no characters were matched in the group.
3073     This is the forcible breaking of infinite loops as implemented in Perl
3074     5.005. If there is an options reset, it will get obeyed in the normal
3075     course of events. */
3076    
3077     if (*ecode == OP_KET || eptr == eptrb)
3078 nigel 3 {
3079     ecode += 3;
3080     break;
3081     }
3082    
3083     /* The repeating kets try the rest of the pattern or restart from the
3084     preceding bracket, in the appropriate order. */
3085    
3086     if (*ecode == OP_KETRMIN)
3087     {
3088 nigel 23 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||
3089     match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;
3090 nigel 3 }
3091     else /* OP_KETRMAX */
3092     {
3093 nigel 23 if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||
3094     match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3095 nigel 3 }
3096     }
3097     return FALSE;
3098    
3099     /* Start of subject unless notbol, or after internal newline if multiline */
3100    
3101     case OP_CIRC:
3102     if (md->notbol && eptr == md->start_subject) return FALSE;
3103 nigel 23 if ((ims & PCRE_MULTILINE) != 0)
3104 nigel 3 {
3105     if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;
3106     ecode++;
3107     break;
3108     }
3109     /* ... else fall through */
3110    
3111     /* Start of subject assertion */
3112    
3113     case OP_SOD:
3114     if (eptr != md->start_subject) return FALSE;
3115     ecode++;
3116     break;
3117    
3118 nigel 23 /* Assert before internal newline if multiline, or before a terminating
3119     newline unless endonly is set, else end of subject unless noteol is set. */
3120 nigel 3
3121     case OP_DOLL:
3122 nigel 23 if ((ims & PCRE_MULTILINE) != 0)
3123 nigel 3 {
3124 nigel 23 if (eptr < md->end_subject) { if (*eptr != '\n') return FALSE; }
3125     else { if (md->noteol) return FALSE; }
3126 nigel 3 ecode++;
3127     break;
3128     }
3129 nigel 23 else
3130 nigel 3 {
3131 nigel 23 if (md->noteol) return FALSE;
3132     if (!md->endonly)
3133     {
3134     if (eptr < md->end_subject - 1 ||
3135     (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3136    
3137     ecode++;
3138     break;
3139     }
3140 nigel 3 }
3141     /* ... else fall through */
3142    
3143 nigel 23 /* End of subject assertion (\z) */
3144 nigel 3
3145     case OP_EOD:
3146     if (eptr < md->end_subject) return FALSE;
3147     ecode++;
3148     break;
3149    
3150 nigel 23 /* End of subject or ending \n assertion (\Z) */
3151    
3152     case OP_EODN:
3153     if (eptr < md->end_subject - 1 ||
3154     (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3155     ecode++;
3156     break;
3157    
3158 nigel 3 /* Word boundary assertions */
3159    
3160     case OP_NOT_WORD_BOUNDARY:
3161     case OP_WORD_BOUNDARY:
3162     {
3163     BOOL prev_is_word = (eptr != md->start_subject) &&
3164 nigel 25 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
3165 nigel 3 BOOL cur_is_word = (eptr < md->end_subject) &&
3166 nigel 25 ((md->ctypes[*eptr] & ctype_word) != 0);
3167 nigel 3 if ((*ecode++ == OP_WORD_BOUNDARY)?
3168     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3169     return FALSE;
3170     }
3171     break;
3172    
3173     /* Match a single character type; inline for speed */
3174    
3175     case OP_ANY:
3176 nigel 23 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == '\n')
3177     return FALSE;
3178 nigel 3 if (eptr++ >= md->end_subject) return FALSE;
3179     ecode++;
3180     break;
3181    
3182     case OP_NOT_DIGIT:
3183 nigel 25 if (eptr >= md->end_subject ||
3184     (md->ctypes[*eptr++] & ctype_digit) != 0)
3185 nigel 3 return FALSE;
3186     ecode++;
3187     break;
3188    
3189     case OP_DIGIT:
3190 nigel 25 if (eptr >= md->end_subject ||
3191     (md->ctypes[*eptr++] & ctype_digit) == 0)
3192 nigel 3 return FALSE;
3193     ecode++;
3194     break;
3195    
3196     case OP_NOT_WHITESPACE:
3197 nigel 25 if (eptr >= md->end_subject ||
3198     (md->ctypes[*eptr++] & ctype_space) != 0)
3199 nigel 3 return FALSE;
3200     ecode++;
3201     break;
3202    
3203     case OP_WHITESPACE:
3204 nigel 25 if (eptr >= md->end_subject ||
3205     (md->ctypes[*eptr++] & ctype_space) == 0)
3206 nigel 3 return FALSE;
3207     ecode++;
3208     break;
3209    
3210     case OP_NOT_WORDCHAR:
3211 nigel 25 if (eptr >= md->end_subject ||
3212     (md->ctypes[*eptr++] & ctype_word) != 0)
3213 nigel 3 return FALSE;
3214     ecode++;
3215     break;
3216    
3217     case OP_WORDCHAR:
3218 nigel 25 if (eptr >= md->end_subject ||
3219     (md->ctypes[*eptr++] & ctype_word) == 0)
3220 nigel 3 return FALSE;
3221     ecode++;
3222     break;
3223    
3224     /* Match a back reference, possibly repeatedly. Look past the end of the
3225     item to see if there is repeat information following. The code is similar
3226     to that for character classes, but repeated for efficiency. Then obey
3227     similar code to character type repeats - written out again for speed.
3228     However, if the referenced string is the empty string, always treat
3229     it as matched, any number of times (otherwise there could be infinite
3230     loops). */
3231    
3232     case OP_REF:
3233     {
3234     int length;
3235 nigel 23 int offset = ecode[1] << 1; /* Doubled reference number */
3236 nigel 3 ecode += 2; /* Advance past the item */
3237    
3238 nigel 23 /* If the reference is unset, set the length to be longer than the amount
3239     of subject left; this ensures that every attempt at a match fails. We
3240     can't just fail here, because of the possibility of quantifiers with zero
3241     minima. */
3242 nigel 3
3243 nigel 23 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
3244     md->end_subject - eptr + 1 :
3245     md->offset_vector[offset+1] - md->offset_vector[offset];
3246 nigel 3
3247 nigel 23 /* Set up for repetition, or handle the non-repeated case */
3248    
3249 nigel 3 switch (*ecode)
3250     {
3251     case OP_CRSTAR:
3252     case OP_CRMINSTAR:
3253     case OP_CRPLUS:
3254     case OP_CRMINPLUS:
3255     case OP_CRQUERY:
3256     case OP_CRMINQUERY:
3257     c = *ecode++ - OP_CRSTAR;
3258     minimize = (c & 1) != 0;
3259     min = rep_min[c]; /* Pick up values from tables; */
3260     max = rep_max[c]; /* zero for max => infinity */
3261     if (max == 0) max = INT_MAX;
3262     break;
3263    
3264     case OP_CRRANGE:
3265     case OP_CRMINRANGE:
3266     minimize = (*ecode == OP_CRMINRANGE);
3267     min = (ecode[1] << 8) + ecode[2];
3268     max = (ecode[3] << 8) + ecode[4];
3269     if (max == 0) max = INT_MAX;
3270     ecode += 5;
3271     break;
3272    
3273     default: /* No repeat follows */
3274 nigel 23 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
3275 nigel 3 eptr += length;
3276     continue; /* With the main loop */
3277     }
3278    
3279     /* If the length of the reference is zero, just continue with the
3280     main loop. */
3281    
3282     if (length == 0) continue;
3283    
3284     /* First, ensure the minimum number of matches are present. We get back
3285     the length of the reference string explicitly rather than passing the
3286     address of eptr, so that eptr can be a register variable. */
3287    
3288     for (i = 1; i <= min; i++)
3289     {
3290 nigel 23 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
3291 nigel 3 eptr += length;
3292     }
3293    
3294     /* If min = max, continue at the same level without recursion.
3295     They are not both allowed to be zero. */
3296    
3297     if (min == max) continue;
3298    
3299     /* If minimizing, keep trying and advancing the pointer */
3300    
3301     if (minimize)
3302     {
3303     for (i = min;; i++)
3304     {
3305 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3306     return TRUE;
3307     if (i >= max || !match_ref(offset, eptr, length, md, ims))
3308 nigel 3 return FALSE;
3309     eptr += length;
3310     }
3311     /* Control never gets here */
3312     }
3313    
3314     /* If maximizing, find the longest string and work backwards */
3315    
3316     else
3317     {
3318 nigel 7 const uschar *pp = eptr;
3319 nigel 3 for (i = min; i < max; i++)
3320     {
3321 nigel 23 if (!match_ref(offset, eptr, length, md, ims)) break;
3322 nigel 3 eptr += length;
3323     }
3324     while (eptr >= pp)
3325     {
3326 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3327     return TRUE;
3328 nigel 3 eptr -= length;
3329     }
3330     return FALSE;
3331     }
3332     }
3333     /* Control never gets here */
3334    
3335 nigel 23
3336    
3337 nigel 3 /* Match a character class, possibly repeatedly. Look past the end of the
3338     item to see if there is repeat information following. Then obey similar
3339 nigel 23 code to character type repeats - written out again for speed. */
3340 nigel 3
3341     case OP_CLASS:
3342     {
3343 nigel 7 const uschar *data = ecode + 1; /* Save for matching */
3344     ecode += 33; /* Advance past the item */
3345 nigel 3
3346     switch (*ecode)
3347     {
3348     case OP_CRSTAR:
3349     case OP_CRMINSTAR:
3350     case OP_CRPLUS:
3351     case OP_CRMINPLUS:
3352     case OP_CRQUERY:
3353     case OP_CRMINQUERY:
3354     c = *ecode++ - OP_CRSTAR;
3355     minimize = (c & 1) != 0;
3356     min = rep_min[c]; /* Pick up values from tables; */
3357     max = rep_max[c]; /* zero for max => infinity */
3358     if (max == 0) max = INT_MAX;
3359     break;
3360    
3361     case OP_CRRANGE:
3362     case OP_CRMINRANGE:
3363     minimize = (*ecode == OP_CRMINRANGE);
3364     min = (ecode[1] << 8) + ecode[2];
3365     max = (ecode[3] << 8) + ecode[4];
3366     if (max == 0) max = INT_MAX;
3367     ecode += 5;
3368     break;
3369    
3370     default: /* No repeat follows */
3371 nigel 13 min = max = 1;
3372     break;
3373 nigel 3 }
3374    
3375     /* First, ensure the minimum number of matches are present. */
3376    
3377     for (i = 1; i <= min; i++)
3378     {
3379     if (eptr >= md->end_subject) return FALSE;
3380     c = *eptr++;
3381 nigel 23 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3382 nigel 3 return FALSE;
3383     }
3384    
3385     /* If max == min we can continue with the main loop without the
3386     need to recurse. */
3387    
3388     if (min == max) continue;
3389    
3390     /* If minimizing, keep testing the rest of the expression and advancing
3391     the pointer while it matches the class. */
3392    
3393     if (minimize)
3394     {
3395     for (i = min;; i++)
3396     {
3397 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3398     return TRUE;
3399 nigel 3 if (i >= max || eptr >= md->end_subject) return FALSE;
3400     c = *eptr++;
3401 nigel 23 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3402 nigel 3 return FALSE;
3403     }
3404     /* Control never gets here */
3405     }
3406    
3407     /* If maximizing, find the longest possible run, then work backwards. */
3408    
3409     else
3410     {
3411 nigel 7 const uschar *pp = eptr;
3412 nigel 3 for (i = min; i < max; eptr++, i++)
3413     {
3414     if (eptr >= md->end_subject) break;
3415     c = *eptr;
3416 nigel 23 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3417 nigel 3 break;
3418     }
3419    
3420     while (eptr >= pp)
3421 nigel 23 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3422     return TRUE;
3423 nigel 3 return FALSE;
3424     }
3425     }
3426     /* Control never gets here */
3427    
3428     /* Match a run of characters */
3429    
3430     case OP_CHARS:
3431     {
3432     register int length = ecode[1];
3433     ecode += 2;
3434    
3435 nigel 9 #ifdef DEBUG /* Sigh. Some compilers never learn. */
3436 nigel 3 if (eptr >= md->end_subject)
3437     printf("matching subject <null> against pattern ");
3438     else
3439     {
3440     printf("matching subject ");
3441     pchars(eptr, length, TRUE, md);
3442     printf(" against pattern ");
3443     }
3444     pchars(ecode, length, FALSE, md);
3445     printf("\n");
3446 nigel 9 #endif
3447 nigel 3
3448     if (length > md->end_subject - eptr) return FALSE;
3449 nigel 23 if ((ims & PCRE_CASELESS) != 0)
3450 nigel 3 {
3451 nigel 25 while (length-- > 0)
3452     if (md->lcc[*ecode++] != md->lcc[*eptr++])
3453     return FALSE;
3454 nigel 3 }
3455     else
3456     {
3457     while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
3458     }
3459     }
3460     break;
3461    
3462     /* Match a single character repeatedly; different opcodes share code. */
3463    
3464     case OP_EXACT:
3465     min = max = (ecode[1] << 8) + ecode[2];
3466     ecode += 3;
3467     goto REPEATCHAR;
3468    
3469     case OP_UPTO:
3470     case OP_MINUPTO:
3471     min = 0;
3472     max = (ecode[1] << 8) + ecode[2];
3473     minimize = *ecode == OP_MINUPTO;
3474     ecode += 3;
3475     goto REPEATCHAR;
3476    
3477     case OP_STAR:
3478     case OP_MINSTAR:
3479     case OP_PLUS:
3480     case OP_MINPLUS:
3481     case OP_QUERY:
3482     case OP_MINQUERY:
3483     c = *ecode++ - OP_STAR;
3484     minimize = (c & 1) != 0;
3485     min = rep_min[c]; /* Pick up values from tables; */
3486     max = rep_max[c]; /* zero for max => infinity */
3487     if (max == 0) max = INT_MAX;
3488    
3489     /* Common code for all repeated single-character matches. We can give
3490     up quickly if there are fewer than the minimum number of characters left in
3491     the subject. */
3492    
3493     REPEATCHAR:
3494     if (min > md->end_subject - eptr) return FALSE;
3495     c = *ecode++;
3496    
3497     /* The code is duplicated for the caseless and caseful cases, for speed,
3498     since matching characters is likely to be quite common. First, ensure the
3499     minimum number of matches are present. If min = max, continue at the same
3500     level without recursing. Otherwise, if minimizing, keep trying the rest of
3501     the expression and advancing one matching character if failing, up to the
3502     maximum. Alternatively, if maximizing, find the maximum number of
3503     characters and work backwards. */
3504    
3505 nigel 9 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
3506     max, eptr));
3507 nigel 3
3508 nigel 23 if ((ims & PCRE_CASELESS) != 0)
3509 nigel 3 {
3510 nigel 25 c = md->lcc[c];
3511     for (i = 1; i <= min; i++)
3512     if (c != md->lcc[*eptr++]) return FALSE;
3513 nigel 3 if (min == max) continue;
3514     if (minimize)
3515     {
3516     for (i = min;; i++)
3517     {
3518 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3519     return TRUE;
3520 nigel 25 if (i >= max || eptr >= md->end_subject ||
3521     c != md->lcc[*eptr++])
3522 nigel 3 return FALSE;
3523     }
3524     /* Control never gets here */
3525     }
3526     else
3527     {
3528 nigel 7 const uschar *pp = eptr;
3529 nigel 3 for (i = min; i < max; i++)
3530     {
3531 nigel 25 if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
3532 nigel 3 eptr++;
3533     }
3534     while (eptr >= pp)
3535 nigel 23 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3536     return TRUE;
3537 nigel 3 return FALSE;
3538     }
3539     /* Control never gets here */
3540     }
3541    
3542     /* Caseful comparisons */
3543    
3544     else
3545     {
3546     for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
3547     if (min == max) continue;
3548     if (minimize)
3549     {
3550     for (i = min;; i++)
3551     {
3552 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3553     return TRUE;
3554 nigel 3 if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
3555     }
3556     /* Control never gets here */
3557     }
3558     else
3559     {
3560 nigel 7 const uschar *pp = eptr;
3561 nigel 3 for (i = min; i < max; i++)
3562     {
3563     if (eptr >= md->end_subject || c != *eptr) break;
3564     eptr++;
3565     }
3566     while (eptr >= pp)
3567 nigel 23 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3568     return TRUE;
3569 nigel 3 return FALSE;
3570     }
3571     }
3572     /* Control never gets here */
3573    
3574     /* Match a negated single character */
3575    
3576     case OP_NOT:
3577 nigel 9 if (eptr >= md->end_subject) return FALSE;
3578 nigel 3 ecode++;
3579 nigel 23 if ((ims & PCRE_CASELESS) != 0)
3580 nigel 3 {
3581 nigel 25 if (md->lcc[*ecode++] == md->lcc[*eptr++]) return FALSE;
3582 nigel 3 }
3583     else
3584     {
3585     if (*ecode++ == *eptr++) return FALSE;
3586     }
3587     break;
3588    
3589     /* Match a negated single character repeatedly. This is almost a repeat of
3590     the code for a repeated single character, but I haven't found a nice way of
3591     commoning these up that doesn't require a test of the positive/negative
3592     option for each character match. Maybe that wouldn't add very much to the
3593     time taken, but character matching *is* what this is all about... */
3594    
3595     case OP_NOTEXACT:
3596     min = max = (ecode[1] << 8) + ecode[2];
3597     ecode += 3;
3598     goto REPEATNOTCHAR;
3599    
3600     case OP_NOTUPTO:
3601     case OP_NOTMINUPTO:
3602     min = 0;
3603     max = (ecode[1] << 8) + ecode[2];
3604     minimize = *ecode == OP_NOTMINUPTO;
3605     ecode += 3;
3606     goto REPEATNOTCHAR;
3607    
3608     case OP_NOTSTAR:
3609     case OP_NOTMINSTAR:
3610     case OP_NOTPLUS:
3611     case OP_NOTMINPLUS:
3612     case OP_NOTQUERY:
3613     case OP_NOTMINQUERY:
3614     c = *ecode++ - OP_NOTSTAR;
3615     minimize = (c & 1) != 0;
3616     min = rep_min[c]; /* Pick up values from tables; */
3617     max = rep_max[c]; /* zero for max => infinity */
3618     if (max == 0) max = INT_MAX;
3619    
3620     /* Common code for all repeated single-character matches. We can give
3621     up quickly if there are fewer than the minimum number of characters left in
3622     the subject. */
3623    
3624     REPEATNOTCHAR:
3625     if (min > md->end_subject - eptr) return FALSE;
3626     c = *ecode++;
3627    
3628     /* The code is duplicated for the caseless and caseful cases, for speed,
3629     since matching characters is likely to be quite common. First, ensure the
3630     minimum number of matches are present. If min = max, continue at the same
3631     level without recursing. Otherwise, if minimizing, keep trying the rest of
3632     the expression and advancing one matching character if failing, up to the
3633     maximum. Alternatively, if maximizing, find the maximum number of
3634     characters and work backwards. */
3635    
3636 nigel 9 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
3637     max, eptr));
3638 nigel 3
3639 nigel 23 if ((ims & PCRE_CASELESS) != 0)
3640 nigel 3 {
3641 nigel 25 c = md->lcc[c];
3642     for (i = 1; i <= min; i++)
3643     if (c == md->lcc[*eptr++]) return FALSE;
3644 nigel 3 if (min == max) continue;
3645     if (minimize)
3646     {
3647     for (i = min;; i++)
3648     {
3649 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3650     return TRUE;
3651 nigel 25 if (i >= max || eptr >= md->end_subject ||
3652     c == md->lcc[*eptr++])
3653 nigel 3 return FALSE;
3654     }
3655     /* Control never gets here */
3656     }
3657     else
3658     {
3659 nigel 7 const uschar *pp = eptr;
3660 nigel 3 for (i = min; i < max; i++)
3661     {
3662 nigel 25 if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
3663 nigel 3 eptr++;
3664     }
3665     while (eptr >= pp)
3666 nigel 23 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3667     return TRUE;
3668 nigel 3 return FALSE;
3669     }
3670     /* Control never gets here */
3671     }
3672    
3673     /* Caseful comparisons */
3674    
3675     else
3676     {
3677     for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
3678     if (min == max) continue;
3679     if (minimize)
3680     {
3681     for (i = min;; i++)
3682     {
3683 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3684     return TRUE;
3685 nigel 3 if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
3686     }
3687     /* Control never gets here */
3688     }
3689     else
3690     {
3691 nigel 7 const uschar *pp = eptr;
3692 nigel 3 for (i = min; i < max; i++)
3693     {
3694     if (eptr >= md->end_subject || c == *eptr) break;
3695     eptr++;
3696     }
3697     while (eptr >= pp)
3698 nigel 23 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3699     return TRUE;
3700 nigel 3 return FALSE;
3701     }
3702     }
3703     /* Control never gets here */
3704    
3705     /* Match a single character type repeatedly; several different opcodes
3706     share code. This is very similar to the code for single characters, but we
3707     repeat it in the interests of efficiency. */
3708    
3709     case OP_TYPEEXACT:
3710     min = max = (ecode[1] << 8) + ecode[2];
3711     minimize = TRUE;
3712     ecode += 3;
3713     goto REPEATTYPE;
3714    
3715     case OP_TYPEUPTO:
3716     case OP_TYPEMINUPTO:
3717     min = 0;
3718     max = (ecode[1] << 8) + ecode[2];
3719     minimize = *ecode == OP_TYPEMINUPTO;
3720     ecode += 3;
3721     goto REPEATTYPE;
3722    
3723     case OP_TYPESTAR:
3724     case OP_TYPEMINSTAR:
3725     case OP_TYPEPLUS:
3726     case OP_TYPEMINPLUS:
3727     case OP_TYPEQUERY:
3728     case OP_TYPEMINQUERY:
3729     c = *ecode++ - OP_TYPESTAR;
3730     minimize = (c & 1) != 0;
3731     min = rep_min[c]; /* Pick up values from tables; */
3732     max = rep_max[c]; /* zero for max => infinity */
3733     if (max == 0) max = INT_MAX;
3734    
3735     /* Common code for all repeated single character type matches */
3736    
3737     REPEATTYPE:
3738     ctype = *ecode++; /* Code for the character type */
3739    
3740     /* First, ensure the minimum number of matches are present. Use inline
3741     code for maximizing the speed, and do the type test once at the start
3742     (i.e. keep it out of the loop). Also test that there are at least the
3743     minimum number of characters before we start. */
3744    
3745     if (min > md->end_subject - eptr) return FALSE;
3746     if (min > 0) switch(ctype)
3747     {
3748     case OP_ANY:
3749 nigel 23 if ((ims & PCRE_DOTALL) == 0)
3750 nigel 3 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }
3751     else eptr += min;
3752     break;
3753    
3754     case OP_NOT_DIGIT:
3755     for (i = 1; i <= min; i++)
3756 nigel 25 if ((md->ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
3757 nigel 3 break;
3758    
3759     case OP_DIGIT:
3760     for (i = 1; i <= min; i++)
3761 nigel 25 if ((md->ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
3762 nigel 3 break;
3763    
3764     case OP_NOT_WHITESPACE:
3765     for (i = 1; i <= min; i++)
3766 nigel 25 if ((md->ctypes[*eptr++] & ctype_space) != 0) return FALSE;
3767 nigel 3 break;
3768    
3769     case OP_WHITESPACE:
3770     for (i = 1; i <= min; i++)
3771 nigel 25 if ((md->ctypes[*eptr++] & ctype_space) == 0) return FALSE;
3772 nigel 3 break;
3773    
3774     case OP_NOT_WORDCHAR:
3775 nigel 25 for (i = 1; i <= min; i++)
3776     if ((md->ctypes[*eptr++] & ctype_word) != 0)
3777     return FALSE;
3778 nigel 3 break;
3779    
3780     case OP_WORDCHAR:
3781 nigel 25 for (i = 1; i <= min; i++)
3782     if ((md->ctypes[*eptr++] & ctype_word) == 0)
3783     return FALSE;
3784 nigel 3 break;
3785     }
3786    
3787     /* If min = max, continue at the same level without recursing */
3788    
3789     if (min == max) continue;
3790    
3791     /* If minimizing, we have to test the rest of the pattern before each
3792 nigel 25 subsequent match. */
3793 nigel 3
3794     if (minimize)
3795     {
3796     for (i = min;; i++)
3797     {
3798 nigel 23 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb)) return TRUE;
3799 nigel 25 if (i >= max || eptr >= md->end_subject) return FALSE;
3800    
3801     c = *eptr++;
3802     switch(ctype)
3803     {
3804     case OP_ANY:
3805     if ((ims & PCRE_DOTALL) == 0 && c == '\n') return FALSE;
3806     break;
3807    
3808     case OP_NOT_DIGIT:
3809     if ((md->ctypes[c] & ctype_digit) != 0) return FALSE;
3810     break;
3811    
3812     case OP_DIGIT:
3813     if ((md->ctypes[c] & ctype_digit) == 0) return FALSE;
3814     break;
3815    
3816     case OP_NOT_WHITESPACE:
3817     if ((md->ctypes[c] & ctype_space) != 0) return FALSE;
3818     break;
3819    
3820     case OP_WHITESPACE:
3821     if ((md->ctypes[c] & ctype_space) == 0) return FALSE;
3822     break;
3823    
3824     case OP_NOT_WORDCHAR:
3825     if ((md->ctypes[c] & ctype_word) != 0) return FALSE;
3826     break;
3827    
3828     case OP_WORDCHAR:
3829     if ((md->ctypes[c] & ctype_word) == 0) return FALSE;
3830     break;
3831     }
3832 nigel 3 }
3833     /* Control never gets here */
3834     }
3835    
3836     /* If maximizing it is worth using inline code for speed, doing the type
3837     test once at the start (i.e. keep it out of the loop). */
3838    
3839     else
3840     {
3841 nigel 7 const uschar *pp = eptr;
3842 nigel 3 switch(ctype)
3843     {
3844     case OP_ANY:
3845 nigel 23 if ((ims & PCRE_DOTALL) == 0)
3846 nigel 3 {
3847     for (i = min; i < max; i++)
3848     {
3849     if (eptr >= md->end_subject || *eptr == '\n') break;
3850     eptr++;
3851     }
3852     }
3853     else
3854     {
3855     c = max - min;
3856     if (c > md->end_subject - eptr) c = md->end_subject - eptr;
3857     eptr += c;
3858     }
3859     break;
3860    
3861     case OP_NOT_DIGIT:
3862     for (i = min; i < max; i++)
3863     {
3864 nigel 25 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
3865 nigel 3 break;
3866     eptr++;
3867     }
3868     break;
3869    
3870     case OP_DIGIT:
3871     for (i = min; i < max; i++)
3872     {
3873 nigel 25 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
3874 nigel 3 break;
3875     eptr++;
3876     }
3877     break;
3878    
3879     case OP_NOT_WHITESPACE:
3880     for (i = min; i < max; i++)
3881     {
3882 nigel 25 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
3883 nigel 3 break;
3884     eptr++;
3885     }
3886     break;
3887    
3888     case OP_WHITESPACE:
3889     for (i = min; i < max; i++)
3890     {
3891 nigel 25 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
3892 nigel 3 break;
3893     eptr++;
3894     }
3895     break;
3896    
3897     case OP_NOT_WORDCHAR:
3898     for (i = min; i < max; i++)
3899     {
3900 nigel 25 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
3901 nigel 3 break;
3902     eptr++;
3903     }
3904     break;
3905    
3906     case OP_WORDCHAR:
3907     for (i = min; i < max; i++)
3908     {
3909 nigel 25 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
3910 nigel 3 break;
3911     eptr++;
3912     }
3913     break;
3914     }
3915    
3916     while (eptr >= pp)
3917 nigel 23 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3918     return TRUE;
3919 nigel 3 return FALSE;
3920     }
3921     /* Control never gets here */
3922    
3923     /* There's been some horrible disaster. */
3924    
3925     default:
3926 nigel 9 DPRINTF(("Unknown opcode %d\n", *ecode));
3927 nigel 3 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
3928     return FALSE;
3929     }
3930    
3931     /* Do not stick any code in here without much thought; it is assumed
3932     that "continue" in the code above comes out to here to repeat the main
3933     loop. */
3934    
3935     } /* End of main loop */
3936     /* Control never reaches here */
3937     }
3938    
3939    
3940    
3941 nigel 9
3942     /*************************************************
3943 nigel 3 * Execute a Regular Expression *
3944     *************************************************/
3945    
3946     /* This function applies a compiled re to a subject string and picks out
3947     portions of the string if it matches. Two elements in the vector are set for
3948     each substring: the offsets to the start and end of the substring.
3949    
3950     Arguments:
3951     external_re points to the compiled expression
3952     external_extra points to "hints" from pcre_study() or is NULL
3953     subject points to the subject string
3954     length length of subject string (may contain binary zeros)
3955     options option bits
3956     offsets points to a vector of ints to be filled in with offsets
3957     offsetcount the number of elements in the vector
3958    
3959     Returns: > 0 => success; value is the number of elements filled in
3960     = 0 => success, but offsets is not big enough
3961     -1 => failed to match
3962     < -1 => some kind of unexpected problem
3963     */
3964    
3965     int
3966     pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
3967     const char *subject, int length, int options, int *offsets, int offsetcount)
3968     {
3969 nigel 11 int resetcount, ocount;
3970 nigel 3 int first_char = -1;
3971 nigel 23 int ims = 0;
3972 nigel 3 match_data match_block;
3973 nigel 7 const uschar *start_bits = NULL;
3974 nigel 9 const uschar *start_match = (const uschar *)subject;
3975 nigel 7 const uschar *end_subject;
3976     const real_pcre *re = (const real_pcre *)external_re;
3977     const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
3978 nigel 11 BOOL using_temporary_offsets = FALSE;
3979 nigel 3 BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
3980     BOOL startline = (re->options & PCRE_STARTLINE) != 0;
3981    
3982     if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3983    
3984     if (re == NULL || subject == NULL ||
3985     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3986     if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
3987    
3988 nigel 7 match_block.start_subject = (const uschar *)subject;
3989 nigel 3 match_block.end_subject = match_block.start_subject + length;
3990     end_subject = match_block.end_subject;
3991    
3992 nigel 23 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
3993 nigel 3
3994     match_block.notbol = (options & PCRE_NOTBOL) != 0;
3995     match_block.noteol = (options & PCRE_NOTEOL) != 0;
3996    
3997     match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
3998    
3999 nigel 25 match_block.lcc = re->tables + lcc_offset;
4000     match_block.ctypes = re->tables + ctypes_offset;
4001    
4002 nigel 23 /* The ims options can vary during the matching as a result of the presence
4003     of (?ims) items in the pattern. They are kept in a local variable so that
4004     restoring at the exit of a group is easy. */
4005    
4006     ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4007    
4008 nigel 3 /* If the expression has got more back references than the offsets supplied can
4009     hold, we get a temporary bit of working store to use during the matching.
4010 nigel 11 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4011 nigel 23 of 3. */
4012 nigel 3
4013 nigel 23 ocount = offsetcount - (offsetcount % 3);
4014    
4015     if (re->top_backref > 0 && re->top_backref >= ocount/3)
4016 nigel 3 {
4017 nigel 23 ocount = re->top_backref * 3 + 3;
4018 nigel 15 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4019 nigel 3 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4020 nigel 11 using_temporary_offsets = TRUE;
4021 nigel 9 DPRINTF(("Got memory to hold back references\n"));
4022 nigel 3 }
4023     else match_block.offset_vector = offsets;
4024    
4025     match_block.offset_end = ocount;
4026 nigel 23 match_block.offset_max = (2*ocount)/3;
4027 nigel 3 match_block.offset_overflow = FALSE;
4028    
4029     /* Compute the minimum number of offsets that we need to reset each time. Doing
4030     this makes a huge difference to execution time when there aren't many brackets
4031     in the pattern. */
4032    
4033     resetcount = 2 + re->top_bracket * 2;
4034     if (resetcount > offsetcount) resetcount = ocount;
4035    
4036     /* Set up the first character to match, if available. The first_char value is
4037     never set for an anchored regular expression, but the anchoring may be forced
4038     at run time, so we have to test for anchoring. The first char may be unset for
4039     an unanchored pattern, of course. If there's no first char and the pattern was
4040 nigel 23 studied, there may be a bitmap of possible first characters. */
4041 nigel 3
4042     if (!anchored)
4043     {
4044     if ((re->options & PCRE_FIRSTSET) != 0)
4045     {
4046     first_char = re->first_char;
4047 nigel 25 if ((ims & PCRE_CASELESS) != 0) first_char = match_block.lcc[first_char];
4048 nigel 3 }
4049     else
4050     if (!startline && extra != NULL &&
4051 nigel 23 (extra->options & PCRE_STUDY_MAPPED) != 0)
4052 nigel 3 start_bits = extra->start_bits;
4053     }
4054    
4055     /* Loop for unanchored matches; for anchored regexps the loop runs just once. */
4056    
4057     do
4058     {
4059 nigel 9 int rc;
4060 nigel 3 register int *iptr = match_block.offset_vector;
4061     register int *iend = iptr + resetcount;
4062    
4063     /* Reset the maximum number of extractions we might see. */
4064    
4065     while (iptr < iend) *iptr++ = -1;
4066    
4067     /* Advance to a unique first char if possible */
4068    
4069     if (first_char >= 0)
4070     {
4071 nigel 23 if ((ims & PCRE_CASELESS) != 0)
4072 nigel 25 while (start_match < end_subject &&
4073     match_block.lcc[*start_match] != first_char)
4074 nigel 3 start_match++;
4075     else
4076     while (start_match < end_subject && *start_match != first_char)
4077     start_match++;
4078     }
4079    
4080     /* Or to just after \n for a multiline match if possible */
4081    
4082     else if (startline)
4083     {
4084     if (start_match > match_block.start_subject)
4085     {
4086     while (start_match < end_subject && start_match[-1] != '\n')
4087     start_match++;
4088     }
4089     }
4090    
4091     /* Or to a non-unique first char */
4092    
4093     else if (start_bits != NULL)
4094     {
4095     while (start_match < end_subject)
4096     {
4097     register int c = *start_match;
4098     if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
4099     }
4100     }
4101    
4102 nigel 9 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4103 nigel 3 printf(">>>> Match against: ");
4104     pchars(start_match, end_subject - start_match, TRUE, &match_block);
4105     printf("\n");
4106 nigel 9 #endif
4107 nigel 3
4108     /* When a match occurs, substrings will be set for all internal extractions;
4109     we just need to set up the whole thing as substring 0 before returning. If
4110     there were too many extractions, set the return code to zero. In the case
4111     where we had to get some local store to hold offsets for backreferences, copy
4112     those back references that we can. In this case there need not be overflow
4113 nigel 23 if certain parts of the pattern were not used. */
4114 nigel 3
4115 nigel 23 if (!match(start_match, re->code, 2, &match_block, ims, FALSE, start_match))
4116     continue;
4117 nigel 3
4118 nigel 9 /* Copy the offset information from temporary store if necessary */
4119    
4120 nigel 11 if (using_temporary_offsets)
4121 nigel 3 {
4122 nigel 9 if (offsetcount >= 4)
4123 nigel 3 {
4124 nigel 9 memcpy(offsets + 2, match_block.offset_vector + 2,
4125     (offsetcount - 2) * sizeof(int));
4126 nigel 11 DPRINTF(("Copied offsets from temporary memory\n"));
4127 nigel 3 }
4128 nigel 9 if (match_block.end_offset_top > offsetcount)
4129     match_block.offset_overflow = TRUE;
4130 nigel 3
4131 nigel 9 DPRINTF(("Freeing temporary memory\n"));
4132     (pcre_free)(match_block.offset_vector);
4133     }
4134 nigel 3
4135 nigel 9 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
4136 nigel 3
4137 nigel 9 if (match_block.offset_end < 2) rc = 0; else
4138     {
4139     offsets[0] = start_match - match_block.start_subject;
4140     offsets[1] = match_block.end_match_ptr - match_block.start_subject;
4141 nigel 3 }
4142 nigel 9
4143     DPRINTF((">>>> returning %d\n", rc));
4144     return rc;
4145 nigel 3 }
4146 nigel 25
4147     /* This "while" is the end of the "do" above */
4148    
4149 nigel 3 while (!anchored &&
4150     match_block.errorcode == PCRE_ERROR_NOMATCH &&