/[pcre]/code/trunk/pcre.c
ViewVC logotype

Contents of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 35 - (show annotations) (download)
Sat Feb 24 21:39:05 2007 UTC (7 years, 7 months ago) by nigel
File MIME type: text/plain
File size: 126481 byte(s)
Load pcre-2.06 into code/trunk.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /*
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
9
10 Written by: Philip Hazel <ph10@cam.ac.uk>
11
12 Copyright (c) 1997-1999 University of Cambridge
13
14 -----------------------------------------------------------------------------
15 Permission is granted to anyone to use this software for any purpose on any
16 computer system, and to redistribute it freely, subject to the following
17 restrictions:
18
19 1. This software is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22
23 2. The origin of this software must not be misrepresented, either by
24 explicit claim or by omission.
25
26 3. Altered versions must be plainly marked as such, and must not be
27 misrepresented as being the original software.
28
29 4. If PCRE is embedded in any software that is released under the GNU
30 General Purpose Licence (GPL), then the terms of that licence shall
31 supersede any condition above with which it is incompatible.
32 -----------------------------------------------------------------------------
33 */
34
35
36 /* Define DEBUG to get debugging output on stdout. */
37
38 /* #define DEBUG */
39
40 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
41 inline, and there are *still* stupid compilers about that don't like indented
42 pre-processor statements. I suppose it's only been 10 years... */
43
44 #ifdef DEBUG
45 #define DPRINTF(p) printf p
46 #else
47 #define DPRINTF(p) /*nothing*/
48 #endif
49
50 /* Include the internals header, which itself includes Standard C headers plus
51 the external pcre header. */
52
53 #include "internal.h"
54
55
56 /* Allow compilation as C++ source code, should anybody want to do that. */
57
58 #ifdef __cplusplus
59 #define class pcre_class
60 #endif
61
62
63 /* Number of items on the nested bracket stacks at compile time. This should
64 not be set greater than 200. */
65
66 #define BRASTACK_SIZE 200
67
68
69 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
70
71 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
72 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
73
74 /* Text forms of OP_ values and things, for debugging (not all used) */
75
76 #ifdef DEBUG
77 static const char *OP_names[] = {
78 "End", "\\A", "\\B", "\\b", "\\D", "\\d",
79 "\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",
80 "Opt", "^", "$", "Any", "chars", "not",
81 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
82 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
83 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
84 "*", "*?", "+", "+?", "?", "??", "{", "{",
85 "class", "Ref",
86 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
87 "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
88 "Brazero", "Braminzero", "Bra"
89 };
90 #endif
91
92 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
93 are simple data values; negative values are for special things like \d and so
94 on. Zero means further processing is needed (for things like \x), or the escape
95 is invalid. */
96
97 static const short int escapes[] = {
98 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
99 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
100 '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
101 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
102 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
103 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
104 '`', 7, -ESC_b, 0, -ESC_d, 27, '\f', 0, /* ` - g */
105 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
106 0, 0, '\r', -ESC_s, '\t', 0, 0, -ESC_w, /* p - w */
107 0, 0, -ESC_z /* x - z */
108 };
109
110 /* Definition to allow mutual recursion */
111
112 static BOOL
113 compile_regex(int, int, int *, uschar **, const uschar **, const char **,
114 BOOL, int, compile_data *);
115
116
117
118 /*************************************************
119 * Global variables *
120 *************************************************/
121
122 /* PCRE is thread-clean and doesn't use any global variables in the normal
123 sense. However, it calls memory allocation and free functions via the two
124 indirections below, which are can be changed by the caller, but are shared
125 between all threads. */
126
127 void *(*pcre_malloc)(size_t) = malloc;
128 void (*pcre_free)(void *) = free;
129
130
131
132
133 /*************************************************
134 * Default character tables *
135 *************************************************/
136
137 /* A default set of character tables is included in the PCRE binary. Its source
138 is built by the maketables auxiliary program, which uses the default C ctypes
139 functions, and put in the file chartables.c. These tables are used by PCRE
140 whenever the caller of pcre_compile() does not provide an alternate set of
141 tables. */
142
143 #include "chartables.c"
144
145
146
147 /*************************************************
148 * Return version string *
149 *************************************************/
150
151 const char *
152 pcre_version(void)
153 {
154 return PCRE_VERSION;
155 }
156
157
158
159
160 /*************************************************
161 * Return info about a compiled pattern *
162 *************************************************/
163
164 /* This function picks potentially useful data out of the private
165 structure.
166
167 Arguments:
168 external_re points to compiled code
169 optptr where to pass back the options
170 first_char where to pass back the first character,
171 or -1 if multiline and all branches start ^,
172 or -2 otherwise
173
174 Returns: number of identifying extraction brackets
175 or negative values on error
176 */
177
178 int
179 pcre_info(const pcre *external_re, int *optptr, int *first_char)
180 {
181 const real_pcre *re = (const real_pcre *)external_re;
182 if (re == NULL) return PCRE_ERROR_NULL;
183 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
184 if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
185 if (first_char != NULL)
186 *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
187 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
188 return re->top_bracket;
189 }
190
191
192
193
194 #ifdef DEBUG
195 /*************************************************
196 * Debugging function to print chars *
197 *************************************************/
198
199 /* Print a sequence of chars in printable format, stopping at the end of the
200 subject if the requested.
201
202 Arguments:
203 p points to characters
204 length number to print
205 is_subject TRUE if printing from within md->start_subject
206 md pointer to matching data block, if is_subject is TRUE
207
208 Returns: nothing
209 */
210
211 static void
212 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
213 {
214 int c;
215 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
216 while (length-- > 0)
217 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
218 }
219 #endif
220
221
222
223
224 /*************************************************
225 * Handle escapes *
226 *************************************************/
227
228 /* This function is called when a \ has been encountered. It either returns a
229 positive value for a simple escape such as \n, or a negative value which
230 encodes one of the more complicated things such as \d. On entry, ptr is
231 pointing at the \. On exit, it is on the final character of the escape
232 sequence.
233
234 Arguments:
235 ptrptr points to the pattern position pointer
236 errorptr points to the pointer to the error message
237 bracount number of previous extracting brackets
238 options the options bits
239 isclass TRUE if inside a character class
240 cd pointer to char tables block
241
242 Returns: zero or positive => a data character
243 negative => a special escape sequence
244 on error, errorptr is set
245 */
246
247 static int
248 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
249 int options, BOOL isclass, compile_data *cd)
250 {
251 const uschar *ptr = *ptrptr;
252 int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
253 int i;
254
255 if (c == 0) *errorptr = ERR1;
256
257 /* Digits or letters may have special meaning; all others are literals. */
258
259 else if (c < '0' || c > 'z') {}
260
261 /* Do an initial lookup in a table. A non-zero result is something that can be
262 returned immediately. Otherwise further processing may be required. */
263
264 else if ((i = escapes[c - '0']) != 0) c = i;
265
266 /* Escapes that need further processing, or are illegal. */
267
268 else
269 {
270 const uschar *oldptr;
271 switch (c)
272 {
273 /* The handling of escape sequences consisting of a string of digits
274 starting with one that is not zero is not straightforward. By experiment,
275 the way Perl works seems to be as follows:
276
277 Outside a character class, the digits are read as a decimal number. If the
278 number is less than 10, or if there are that many previous extracting
279 left brackets, then it is a back reference. Otherwise, up to three octal
280 digits are read to form an escaped byte. Thus \123 is likely to be octal
281 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
282 value is greater than 377, the least significant 8 bits are taken. Inside a
283 character class, \ followed by a digit is always an octal number. */
284
285 case '1': case '2': case '3': case '4': case '5':
286 case '6': case '7': case '8': case '9':
287
288 if (!isclass)
289 {
290 oldptr = ptr;
291 c -= '0';
292 while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
293 c = c * 10 + *(++ptr) - '0';
294 if (c < 10 || c <= bracount)
295 {
296 c = -(ESC_REF + c);
297 break;
298 }
299 ptr = oldptr; /* Put the pointer back and fall through */
300 }
301
302 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
303 generates a binary zero byte and treats the digit as a following literal.
304 Thus we have to pull back the pointer by one. */
305
306 if ((c = *ptr) >= '8')
307 {
308 ptr--;
309 c = 0;
310 break;
311 }
312
313 /* \0 always starts an octal number, but we may drop through to here with a
314 larger first octal digit */
315
316 case '0':
317 c -= '0';
318 while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
319 ptr[1] != '8' && ptr[1] != '9')
320 c = c * 8 + *(++ptr) - '0';
321 break;
322
323 /* Special escapes not starting with a digit are straightforward */
324
325 case 'x':
326 c = 0;
327 while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
328 {
329 ptr++;
330 c = c * 16 + cd->lcc[*ptr] -
331 (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
332 }
333 break;
334
335 case 'c':
336 c = *(++ptr);
337 if (c == 0)
338 {
339 *errorptr = ERR2;
340 return 0;
341 }
342
343 /* A letter is upper-cased; then the 0x40 bit is flipped */
344
345 if (c >= 'a' && c <= 'z') c = cd->fcc[c];
346 c ^= 0x40;
347 break;
348
349 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
350 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
351 for Perl compatibility, it is a literal. This code looks a bit odd, but
352 there used to be some cases other than the default, and there may be again
353 in future, so I haven't "optimized" it. */
354
355 default:
356 if ((options & PCRE_EXTRA) != 0) switch(c)
357 {
358 default:
359 *errorptr = ERR3;
360 break;
361 }
362 break;
363 }
364 }
365
366 *ptrptr = ptr;
367 return c;
368 }
369
370
371
372 /*************************************************
373 * Check for counted repeat *
374 *************************************************/
375
376 /* This function is called when a '{' is encountered in a place where it might
377 start a quantifier. It looks ahead to see if it really is a quantifier or not.
378 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
379 where the ddds are digits.
380
381 Arguments:
382 p pointer to the first char after '{'
383 cd pointer to char tables block
384
385 Returns: TRUE or FALSE
386 */
387
388 static BOOL
389 is_counted_repeat(const uschar *p, compile_data *cd)
390 {
391 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
392 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
393 if (*p == '}') return TRUE;
394
395 if (*p++ != ',') return FALSE;
396 if (*p == '}') return TRUE;
397
398 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
399 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
400 return (*p == '}');
401 }
402
403
404
405 /*************************************************
406 * Read repeat counts *
407 *************************************************/
408
409 /* Read an item of the form {n,m} and return the values. This is called only
410 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
411 so the syntax is guaranteed to be correct, but we need to check the values.
412
413 Arguments:
414 p pointer to first char after '{'
415 minp pointer to int for min
416 maxp pointer to int for max
417 returned as -1 if no max
418 errorptr points to pointer to error message
419 cd pointer to character tables clock
420
421 Returns: pointer to '}' on success;
422 current ptr on error, with errorptr set
423 */
424
425 static const uschar *
426 read_repeat_counts(const uschar *p, int *minp, int *maxp,
427 const char **errorptr, compile_data *cd)
428 {
429 int min = 0;
430 int max = -1;
431
432 while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
433
434 if (*p == '}') max = min; else
435 {
436 if (*(++p) != '}')
437 {
438 max = 0;
439 while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
440 if (max < min)
441 {
442 *errorptr = ERR4;
443 return p;
444 }
445 }
446 }
447
448 /* Do paranoid checks, then fill in the required variables, and pass back the
449 pointer to the terminating '}'. */
450
451 if (min > 65535 || max > 65535)
452 *errorptr = ERR5;
453 else
454 {
455 *minp = min;
456 *maxp = max;
457 }
458 return p;
459 }
460
461
462
463 /*************************************************
464 * Find the fixed length of a pattern *
465 *************************************************/
466
467 /* Scan a pattern and compute the fixed length of subject that will match it,
468 if the length is fixed. This is needed for dealing with backward assertions.
469
470 Arguments:
471 code points to the start of the pattern (the bracket)
472
473 Returns: the fixed length, or -1 if there is no fixed length
474 */
475
476 static int
477 find_fixedlength(uschar *code)
478 {
479 int length = -1;
480
481 register int branchlength = 0;
482 register uschar *cc = code + 3;
483
484 /* Scan along the opcodes for this branch. If we get to the end of the
485 branch, check the length against that of the other branches. */
486
487 for (;;)
488 {
489 int d;
490 register int op = *cc;
491 if (op >= OP_BRA) op = OP_BRA;
492
493 switch (op)
494 {
495 case OP_BRA:
496 case OP_ONCE:
497 case OP_COND:
498 d = find_fixedlength(cc);
499 if (d < 0) return -1;
500 branchlength += d;
501 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
502 cc += 3;
503 break;
504
505 /* Reached end of a branch; if it's a ket it is the end of a nested
506 call. If it's ALT it is an alternation in a nested call. If it is
507 END it's the end of the outer call. All can be handled by the same code. */
508
509 case OP_ALT:
510 case OP_KET:
511 case OP_KETRMAX:
512 case OP_KETRMIN:
513 case OP_END:
514 if (length < 0) length = branchlength;
515 else if (length != branchlength) return -1;
516 if (*cc != OP_ALT) return length;
517 cc += 3;
518 branchlength = 0;
519 break;
520
521 /* Skip over assertive subpatterns */
522
523 case OP_ASSERT:
524 case OP_ASSERT_NOT:
525 case OP_ASSERTBACK:
526 case OP_ASSERTBACK_NOT:
527 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
528 cc += 3;
529 break;
530
531 /* Skip over things that don't match chars */
532
533 case OP_REVERSE:
534 cc++;
535
536 case OP_CREF:
537 case OP_OPT:
538 cc++;
539 /* Fall through */
540
541 case OP_SOD:
542 case OP_EOD:
543 case OP_EODN:
544 case OP_CIRC:
545 case OP_DOLL:
546 case OP_NOT_WORD_BOUNDARY:
547 case OP_WORD_BOUNDARY:
548 cc++;
549 break;
550
551 /* Handle char strings */
552
553 case OP_CHARS:
554 branchlength += *(++cc);
555 cc += *cc + 1;
556 break;
557
558 /* Handle exact repetitions */
559
560 case OP_EXACT:
561 case OP_TYPEEXACT:
562 branchlength += (cc[1] << 8) + cc[2];
563 cc += 4;
564 break;
565
566 /* Handle single-char matchers */
567
568 case OP_NOT_DIGIT:
569 case OP_DIGIT:
570 case OP_NOT_WHITESPACE:
571 case OP_WHITESPACE:
572 case OP_NOT_WORDCHAR:
573 case OP_WORDCHAR:
574 case OP_ANY:
575 branchlength++;
576 cc++;
577 break;
578
579
580 /* Check a class for variable quantification */
581
582 case OP_CLASS:
583 cc += (*cc == OP_REF)? 2 : 33;
584
585 switch (*cc)
586 {
587 case OP_CRSTAR:
588 case OP_CRMINSTAR:
589 case OP_CRQUERY:
590 case OP_CRMINQUERY:
591 return -1;
592
593 case OP_CRRANGE:
594 case OP_CRMINRANGE:
595 if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;
596 branchlength += (cc[1] << 8) + cc[2];
597 cc += 5;
598 break;
599
600 default:
601 branchlength++;
602 }
603 break;
604
605 /* Anything else is variable length */
606
607 default:
608 return -1;
609 }
610 }
611 /* Control never gets here */
612 }
613
614
615
616
617 /*************************************************
618 * Compile one branch *
619 *************************************************/
620
621 /* Scan the pattern, compiling it into the code vector.
622
623 Arguments:
624 options the option bits
625 brackets points to number of brackets used
626 code points to the pointer to the current code point
627 ptrptr points to the current pattern pointer
628 errorptr points to pointer to error message
629 optchanged set to the value of the last OP_OPT item compiled
630 cd contains pointers to tables
631
632 Returns: TRUE on success
633 FALSE, with *errorptr set on error
634 */
635
636 static BOOL
637 compile_branch(int options, int *brackets, uschar **codeptr,
638 const uschar **ptrptr, const char **errorptr, int *optchanged,
639 compile_data *cd)
640 {
641 int repeat_type, op_type;
642 int repeat_min, repeat_max;
643 int bravalue, length;
644 int greedy_default, greedy_non_default;
645 register int c;
646 register uschar *code = *codeptr;
647 uschar *tempcode;
648 const uschar *ptr = *ptrptr;
649 const uschar *tempptr;
650 uschar *previous = NULL;
651 uschar class[32];
652
653 /* Set up the default and non-default settings for greediness */
654
655 greedy_default = ((options & PCRE_UNGREEDY) != 0);
656 greedy_non_default = greedy_default ^ 1;
657
658 /* Switch on next character until the end of the branch */
659
660 for (;; ptr++)
661 {
662 BOOL negate_class;
663 int class_charcount;
664 int class_lastchar;
665 int newoptions;
666 int condref;
667
668 c = *ptr;
669 if ((options & PCRE_EXTENDED) != 0)
670 {
671 if ((cd->ctypes[c] & ctype_space) != 0) continue;
672 if (c == '#')
673 {
674 while ((c = *(++ptr)) != 0 && c != '\n');
675 continue;
676 }
677 }
678
679 switch(c)
680 {
681 /* The branch terminates at end of string, |, or ). */
682
683 case 0:
684 case '|':
685 case ')':
686 *codeptr = code;
687 *ptrptr = ptr;
688 return TRUE;
689
690 /* Handle single-character metacharacters */
691
692 case '^':
693 previous = NULL;
694 *code++ = OP_CIRC;
695 break;
696
697 case '$':
698 previous = NULL;
699 *code++ = OP_DOLL;
700 break;
701
702 case '.':
703 previous = code;
704 *code++ = OP_ANY;
705 break;
706
707 /* Character classes. These always build a 32-byte bitmap of the permitted
708 characters, except in the special case where there is only one character.
709 For negated classes, we build the map as usual, then invert it at the end.
710 */
711
712 case '[':
713 previous = code;
714 *code++ = OP_CLASS;
715
716 /* If the first character is '^', set the negation flag and skip it. */
717
718 if ((c = *(++ptr)) == '^')
719 {
720 negate_class = TRUE;
721 c = *(++ptr);
722 }
723 else negate_class = FALSE;
724
725 /* Keep a count of chars so that we can optimize the case of just a single
726 character. */
727
728 class_charcount = 0;
729 class_lastchar = -1;
730
731 /* Initialize the 32-char bit map to all zeros. We have to build the
732 map in a temporary bit of store, in case the class contains only 1
733 character, because in that case the compiled code doesn't use the
734 bit map. */
735
736 memset(class, 0, 32 * sizeof(uschar));
737
738 /* Process characters until ] is reached. By writing this as a "do" it
739 means that an initial ] is taken as a data character. */
740
741 do
742 {
743 if (c == 0)
744 {
745 *errorptr = ERR6;
746 goto FAILED;
747 }
748
749 /* Backslash may introduce a single character, or it may introduce one
750 of the specials, which just set a flag. Escaped items are checked for
751 validity in the pre-compiling pass. The sequence \b is a special case.
752 Inside a class (and only there) it is treated as backspace. Elsewhere
753 it marks a word boundary. Other escapes have preset maps ready to
754 or into the one we are building. We assume they have more than one
755 character in them, so set class_count bigger than one. */
756
757 if (c == '\\')
758 {
759 c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
760 if (-c == ESC_b) c = '\b';
761 else if (c < 0)
762 {
763 register const uschar *cbits = cd->cbits;
764 class_charcount = 10;
765 switch (-c)
766 {
767 case ESC_d:
768 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
769 continue;
770
771 case ESC_D:
772 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
773 continue;
774
775 case ESC_w:
776 for (c = 0; c < 32; c++)
777 class[c] |= (cbits[c+cbit_digit] | cbits[c+cbit_word]);
778 continue;
779
780 case ESC_W:
781 for (c = 0; c < 32; c++)
782 class[c] |= ~(cbits[c+cbit_digit] | cbits[c+cbit_word]);
783 continue;
784
785 case ESC_s:
786 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
787 continue;
788
789 case ESC_S:
790 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
791 continue;
792
793 default:
794 *errorptr = ERR7;
795 goto FAILED;
796 }
797 }
798 /* Fall through if single character */
799 }
800
801 /* A single character may be followed by '-' to form a range. However,
802 Perl does not permit ']' to be the end of the range. A '-' character
803 here is treated as a literal. */
804
805 if (ptr[1] == '-' && ptr[2] != ']')
806 {
807 int d;
808 ptr += 2;
809 d = *ptr;
810
811 if (d == 0)
812 {
813 *errorptr = ERR6;
814 goto FAILED;
815 }
816
817 /* The second part of a range can be a single-character escape, but
818 not any of the other escapes. */
819
820 if (d == '\\')
821 {
822 d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
823 if (d < 0)
824 {
825 if (d == -ESC_b) d = '\b'; else
826 {
827 *errorptr = ERR7;
828 goto FAILED;
829 }
830 }
831 }
832
833 if (d < c)
834 {
835 *errorptr = ERR8;
836 goto FAILED;
837 }
838
839 for (; c <= d; c++)
840 {
841 class[c/8] |= (1 << (c&7));
842 if ((options & PCRE_CASELESS) != 0)
843 {
844 int uc = cd->fcc[c]; /* flip case */
845 class[uc/8] |= (1 << (uc&7));
846 }
847 class_charcount++; /* in case a one-char range */
848 class_lastchar = c;
849 }
850 continue; /* Go get the next char in the class */
851 }
852
853 /* Handle a lone single character - we can get here for a normal
854 non-escape char, or after \ that introduces a single character. */
855
856 class [c/8] |= (1 << (c&7));
857 if ((options & PCRE_CASELESS) != 0)
858 {
859 c = cd->fcc[c]; /* flip case */
860 class[c/8] |= (1 << (c&7));
861 }
862 class_charcount++;
863 class_lastchar = c;
864 }
865
866 /* Loop until ']' reached; the check for end of string happens inside the
867 loop. This "while" is the end of the "do" above. */
868
869 while ((c = *(++ptr)) != ']');
870
871 /* If class_charcount is 1 and class_lastchar is not negative, we saw
872 precisely one character. This doesn't need the whole 32-byte bit map.
873 We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
874 it's negative. */
875
876 if (class_charcount == 1 && class_lastchar >= 0)
877 {
878 if (negate_class)
879 {
880 code[-1] = OP_NOT;
881 }
882 else
883 {
884 code[-1] = OP_CHARS;
885 *code++ = 1;
886 }
887 *code++ = class_lastchar;
888 }
889
890 /* Otherwise, negate the 32-byte map if necessary, and copy it into
891 the code vector. */
892
893 else
894 {
895 if (negate_class)
896 for (c = 0; c < 32; c++) code[c] = ~class[c];
897 else
898 memcpy(code, class, 32);
899 code += 32;
900 }
901 break;
902
903 /* Various kinds of repeat */
904
905 case '{':
906 if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
907 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
908 if (*errorptr != NULL) goto FAILED;
909 goto REPEAT;
910
911 case '*':
912 repeat_min = 0;
913 repeat_max = -1;
914 goto REPEAT;
915
916 case '+':
917 repeat_min = 1;
918 repeat_max = -1;
919 goto REPEAT;
920
921 case '?':
922 repeat_min = 0;
923 repeat_max = 1;
924
925 REPEAT:
926 if (previous == NULL)
927 {
928 *errorptr = ERR9;
929 goto FAILED;
930 }
931
932 /* If the next character is '?' this is a minimizing repeat, by default,
933 but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
934 next character. */
935
936 if (ptr[1] == '?')
937 { repeat_type = greedy_non_default; ptr++; }
938 else repeat_type = greedy_default;
939
940 /* If the maximum is zero then the minimum must also be zero; Perl allows
941 this case, so we do too - by simply omitting the item altogether. */
942
943 if (repeat_max == 0) code = previous;
944
945 /* If previous was a string of characters, chop off the last one and use it
946 as the subject of the repeat. If there was only one character, we can
947 abolish the previous item altogether. */
948
949 else if (*previous == OP_CHARS)
950 {
951 int len = previous[1];
952 if (len == 1)
953 {
954 c = previous[2];
955 code = previous;
956 }
957 else
958 {
959 c = previous[len+1];
960 previous[1]--;
961 code--;
962 }
963 op_type = 0; /* Use single-char op codes */
964 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
965 }
966
967 /* If previous was a single negated character ([^a] or similar), we use
968 one of the special opcodes, replacing it. The code is shared with single-
969 character repeats by adding a suitable offset into repeat_type. */
970
971 else if ((int)*previous == OP_NOT)
972 {
973 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
974 c = previous[1];
975 code = previous;
976 goto OUTPUT_SINGLE_REPEAT;
977 }
978
979 /* If previous was a character type match (\d or similar), abolish it and
980 create a suitable repeat item. The code is shared with single-character
981 repeats by adding a suitable offset into repeat_type. */
982
983 else if ((int)*previous < OP_EODN || *previous == OP_ANY)
984 {
985 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
986 c = *previous;
987 code = previous;
988
989 OUTPUT_SINGLE_REPEAT:
990 repeat_type += op_type; /* Combine both values for many cases */
991
992 /* A minimum of zero is handled either as the special case * or ?, or as
993 an UPTO, with the maximum given. */
994
995 if (repeat_min == 0)
996 {
997 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
998 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
999 else
1000 {
1001 *code++ = OP_UPTO + repeat_type;
1002 *code++ = repeat_max >> 8;
1003 *code++ = (repeat_max & 255);
1004 }
1005 }
1006
1007 /* The case {1,} is handled as the special case + */
1008
1009 else if (repeat_min == 1 && repeat_max == -1)
1010 *code++ = OP_PLUS + repeat_type;
1011
1012 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1013 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1014
1015 else
1016 {
1017 if (repeat_min != 1)
1018 {
1019 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1020 *code++ = repeat_min >> 8;
1021 *code++ = (repeat_min & 255);
1022 }
1023
1024 /* If the mininum is 1 and the previous item was a character string,
1025 we either have to put back the item that got cancelled if the string
1026 length was 1, or add the character back onto the end of a longer
1027 string. For a character type nothing need be done; it will just get
1028 put back naturally. Note that the final character is always going to
1029 get added below. */
1030
1031 else if (*previous == OP_CHARS)
1032 {
1033 if (code == previous) code += 2; else previous[1]++;
1034 }
1035
1036 /* For a single negated character we also have to put back the
1037 item that got cancelled. */
1038
1039 else if (*previous == OP_NOT) code++;
1040
1041 /* If the maximum is unlimited, insert an OP_STAR. */
1042
1043 if (repeat_max < 0)
1044 {
1045 *code++ = c;
1046 *code++ = OP_STAR + repeat_type;
1047 }
1048
1049 /* Else insert an UPTO if the max is greater than the min. */
1050
1051 else if (repeat_max != repeat_min)
1052 {
1053 *code++ = c;
1054 repeat_max -= repeat_min;
1055 *code++ = OP_UPTO + repeat_type;
1056 *code++ = repeat_max >> 8;
1057 *code++ = (repeat_max & 255);
1058 }
1059 }
1060
1061 /* The character or character type itself comes last in all cases. */
1062
1063 *code++ = c;
1064 }
1065
1066 /* If previous was a character class or a back reference, we put the repeat
1067 stuff after it. */
1068
1069 else if (*previous == OP_CLASS || *previous == OP_REF)
1070 {
1071 if (repeat_min == 0 && repeat_max == -1)
1072 *code++ = OP_CRSTAR + repeat_type;
1073 else if (repeat_min == 1 && repeat_max == -1)
1074 *code++ = OP_CRPLUS + repeat_type;
1075 else if (repeat_min == 0 && repeat_max == 1)
1076 *code++ = OP_CRQUERY + repeat_type;
1077 else
1078 {
1079 *code++ = OP_CRRANGE + repeat_type;
1080 *code++ = repeat_min >> 8;
1081 *code++ = repeat_min & 255;
1082 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1083 *code++ = repeat_max >> 8;
1084 *code++ = repeat_max & 255;
1085 }
1086 }
1087
1088 /* If previous was a bracket group, we may have to replicate it in certain
1089 cases. */
1090
1091 else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
1092 (int)*previous == OP_COND)
1093 {
1094 register int i;
1095 int ketoffset = 0;
1096 int len = code - previous;
1097 uschar *bralink = NULL;
1098
1099 /* If the maximum repeat count is unlimited, find the end of the bracket
1100 by scanning through from the start, and compute the offset back to it
1101 from the current code pointer. There may be an OP_OPT setting following
1102 the final KET, so we can't find the end just by going back from the code
1103 pointer. */
1104
1105 if (repeat_max == -1)
1106 {
1107 register uschar *ket = previous;
1108 do ket += (ket[1] << 8) + ket[2]; while (*ket != OP_KET);
1109 ketoffset = code - ket;
1110 }
1111
1112 /* The case of a zero minimum is special because of the need to stick
1113 OP_BRAZERO in front of it, and because the group appears once in the
1114 data, whereas in other cases it appears the minimum number of times. For
1115 this reason, it is simplest to treat this case separately, as otherwise
1116 the code gets far too mess. There are several special subcases when the
1117 minimum is zero. */
1118
1119 if (repeat_min == 0)
1120 {
1121 /* If the maximum is also zero, we just omit the group from the output
1122 altogether. */
1123
1124 if (repeat_max == 0)
1125 {
1126 code = previous;
1127 previous = NULL;
1128 break;
1129 }
1130
1131 /* If the maximum is 1 or unlimited, we just have to stick in the
1132 BRAZERO and do no more at this point. */
1133
1134 if (repeat_max <= 1)
1135 {
1136 memmove(previous+1, previous, len);
1137 code++;
1138 *previous++ = OP_BRAZERO + repeat_type;
1139 }
1140
1141 /* If the maximum is greater than 1 and limited, we have to replicate
1142 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1143 The first one has to be handled carefully because it's the original
1144 copy, which has to be moved up. The remainder can be handled by code
1145 that is common with the non-zero minimum case below. We just have to
1146 adjust the value or repeat_max, since one less copy is required. */
1147
1148 else
1149 {
1150 int offset;
1151 memmove(previous+4, previous, len);
1152 code += 4;
1153 *previous++ = OP_BRAZERO + repeat_type;
1154 *previous++ = OP_BRA;
1155
1156 /* We chain together the bracket offset fields that have to be
1157 filled in later when the ends of the brackets are reached. */
1158
1159 offset = (bralink == NULL)? 0 : previous - bralink;
1160 bralink = previous;
1161 *previous++ = offset >> 8;
1162 *previous++ = offset & 255;
1163 }
1164
1165 repeat_max--;
1166 }
1167
1168 /* If the minimum is greater than zero, replicate the group as many
1169 times as necessary, and adjust the maximum to the number of subsequent
1170 copies that we need. */
1171
1172 else
1173 {
1174 for (i = 1; i < repeat_min; i++)
1175 {
1176 memcpy(code, previous, len);
1177 code += len;
1178 }
1179 if (repeat_max > 0) repeat_max -= repeat_min;
1180 }
1181
1182 /* This code is common to both the zero and non-zero minimum cases. If
1183 the maximum is limited, it replicates the group in a nested fashion,
1184 remembering the bracket starts on a stack. In the case of a zero minimum,
1185 the first one was set up above. In all cases the repeat_max now specifies
1186 the number of additional copies needed. */
1187
1188 if (repeat_max >= 0)
1189 {
1190 for (i = repeat_max - 1; i >= 0; i--)
1191 {
1192 *code++ = OP_BRAZERO + repeat_type;
1193
1194 /* All but the final copy start a new nesting, maintaining the
1195 chain of brackets outstanding. */
1196
1197 if (i != 0)
1198 {
1199 int offset;
1200 *code++ = OP_BRA;
1201 offset = (bralink == NULL)? 0 : code - bralink;
1202 bralink = code;
1203 *code++ = offset >> 8;
1204 *code++ = offset & 255;
1205 }
1206
1207 memcpy(code, previous, len);
1208 code += len;
1209 }
1210
1211 /* Now chain through the pending brackets, and fill in their length
1212 fields (which are holding the chain links pro tem). */
1213
1214 while (bralink != NULL)
1215 {
1216 int oldlinkoffset;
1217 int offset = code - bralink + 1;
1218 uschar *bra = code - offset;
1219 oldlinkoffset = (bra[1] << 8) + bra[2];
1220 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
1221 *code++ = OP_KET;
1222 *code++ = bra[1] = offset >> 8;
1223 *code++ = bra[2] = (offset & 255);
1224 }
1225 }
1226
1227 /* If the maximum is unlimited, set a repeater in the final copy. We
1228 can't just offset backwards from the current code point, because we
1229 don't know if there's been an options resetting after the ket. The
1230 correct offset was computed above. */
1231
1232 else code[-ketoffset] = OP_KETRMAX + repeat_type;
1233
1234
1235 #ifdef NEVER
1236 /* If the minimum is greater than zero, and the maximum is unlimited or
1237 equal to the minimum, the first copy remains where it is, and is
1238 replicated up to the minimum number of times. This case includes the +
1239 repeat, but of course no replication is needed in that case. */
1240
1241 if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1242 {
1243 for (i = 1; i < repeat_min; i++)
1244 {
1245 memcpy(code, previous, len);
1246 code += len;
1247 }
1248 }
1249
1250 /* If the minimum is zero, stick BRAZERO in front of the first copy.
1251 Then, if there is a fixed upper limit, replicated up to that many times,
1252 sticking BRAZERO in front of all the optional ones. */
1253
1254 else
1255 {
1256 if (repeat_min == 0)
1257 {
1258 memmove(previous+1, previous, len);
1259 code++;
1260 *previous++ = OP_BRAZERO + repeat_type;
1261 }
1262
1263 for (i = 1; i < repeat_min; i++)
1264 {
1265 memcpy(code, previous, len);
1266 code += len;
1267 }
1268
1269 for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1270 {
1271 *code++ = OP_BRAZERO + repeat_type;
1272 memcpy(code, previous, len);
1273 code += len;
1274 }
1275 }
1276
1277 /* If the maximum is unlimited, set a repeater in the final copy. We
1278 can't just offset backwards from the current code point, because we
1279 don't know if there's been an options resetting after the ket. The
1280 correct offset was computed above. */
1281
1282 if (repeat_max == -1) code[-ketoffset] = OP_KETRMAX + repeat_type;
1283 #endif
1284
1285
1286 }
1287
1288 /* Else there's some kind of shambles */
1289
1290 else
1291 {
1292 *errorptr = ERR11;
1293 goto FAILED;
1294 }
1295
1296 /* In all case we no longer have a previous item. */
1297
1298 previous = NULL;
1299 break;
1300
1301
1302 /* Start of nested bracket sub-expression, or comment or lookahead or
1303 lookbehind or option setting or condition. First deal with special things
1304 that can come after a bracket; all are introduced by ?, and the appearance
1305 of any of them means that this is not a referencing group. They were
1306 checked for validity in the first pass over the string, so we don't have to
1307 check for syntax errors here. */
1308
1309 case '(':
1310 newoptions = options;
1311 condref = -1;
1312
1313 if (*(++ptr) == '?')
1314 {
1315 int set, unset;
1316 int *optset;
1317
1318 switch (*(++ptr))
1319 {
1320 case '#': /* Comment; skip to ket */
1321 ptr++;
1322 while (*ptr != ')') ptr++;
1323 continue;
1324
1325 case ':': /* Non-extracting bracket */
1326 bravalue = OP_BRA;
1327 ptr++;
1328 break;
1329
1330 case '(':
1331 bravalue = OP_COND; /* Conditional group */
1332 if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
1333 {
1334 condref = *ptr - '0';
1335 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
1336 ptr++;
1337 }
1338 else ptr--;
1339 break;
1340
1341 case '=': /* Positive lookahead */
1342 bravalue = OP_ASSERT;
1343 ptr++;
1344 break;
1345
1346 case '!': /* Negative lookahead */
1347 bravalue = OP_ASSERT_NOT;
1348 ptr++;
1349 break;
1350
1351 case '<': /* Lookbehinds */
1352 switch (*(++ptr))
1353 {
1354 case '=': /* Positive lookbehind */
1355 bravalue = OP_ASSERTBACK;
1356 ptr++;
1357 break;
1358
1359 case '!': /* Negative lookbehind */
1360 bravalue = OP_ASSERTBACK_NOT;
1361 ptr++;
1362 break;
1363
1364 default: /* Syntax error */
1365 *errorptr = ERR24;
1366 goto FAILED;
1367 }
1368 break;
1369
1370 case '>': /* One-time brackets */
1371 bravalue = OP_ONCE;
1372 ptr++;
1373 break;
1374
1375 default: /* Option setting */
1376 set = unset = 0;
1377 optset = &set;
1378
1379 while (*ptr != ')' && *ptr != ':')
1380 {
1381 switch (*ptr++)
1382 {
1383 case '-': optset = &unset; break;
1384
1385 case 'i': *optset |= PCRE_CASELESS; break;
1386 case 'm': *optset |= PCRE_MULTILINE; break;
1387 case 's': *optset |= PCRE_DOTALL; break;
1388 case 'x': *optset |= PCRE_EXTENDED; break;
1389 case 'U': *optset |= PCRE_UNGREEDY; break;
1390 case 'X': *optset |= PCRE_EXTRA; break;
1391
1392 default:
1393 *errorptr = ERR12;
1394 goto FAILED;
1395 }
1396 }
1397
1398 /* Set up the changed option bits, but don't change anything yet. */
1399
1400 newoptions = (options | set) & (~unset);
1401
1402 /* If the options ended with ')' this is not the start of a nested
1403 group with option changes, so the options change at this level. At top
1404 level there is nothing else to be done (the options will in fact have
1405 been set from the start of compiling as a result of the first pass) but
1406 at an inner level we must compile code to change the ims options if
1407 necessary, and pass the new setting back so that it can be put at the
1408 start of any following branches, and when this group ends, a resetting
1409 item can be compiled. */
1410
1411 if (*ptr == ')')
1412 {
1413 if ((options & PCRE_INGROUP) != 0 &&
1414 (options & PCRE_IMS) != (newoptions & PCRE_IMS))
1415 {
1416 *code++ = OP_OPT;
1417 *code++ = *optchanged = newoptions & PCRE_IMS;
1418 }
1419 options = newoptions; /* Change options at this level */
1420 previous = NULL; /* This item can't be repeated */
1421 continue; /* It is complete */
1422 }
1423
1424 /* If the options ended with ':' we are heading into a nested group
1425 with possible change of options. Such groups are non-capturing and are
1426 not assertions of any kind. All we need to do is skip over the ':';
1427 the newoptions value is handled below. */
1428
1429 bravalue = OP_BRA;
1430 ptr++;
1431 }
1432 }
1433
1434 /* Else we have a referencing group; adjust the opcode. */
1435
1436 else
1437 {
1438 if (++(*brackets) > EXTRACT_MAX)
1439 {
1440 *errorptr = ERR13;
1441 goto FAILED;
1442 }
1443 bravalue = OP_BRA + *brackets;
1444 }
1445
1446 /* Process nested bracketed re. Assertions may not be repeated, but other
1447 kinds can be. We copy code into a non-register variable in order to be able
1448 to pass its address because some compilers complain otherwise. Pass in a
1449 new setting for the ims options if they have changed. */
1450
1451 previous = (bravalue >= OP_ONCE)? code : NULL;
1452 *code = bravalue;
1453 tempcode = code;
1454
1455 if (!compile_regex(
1456 options | PCRE_INGROUP, /* Set for all nested groups */
1457 ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
1458 newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
1459 brackets, /* Bracket level */
1460 &tempcode, /* Where to put code (updated) */
1461 &ptr, /* Input pointer (updated) */
1462 errorptr, /* Where to put an error message */
1463 (bravalue == OP_ASSERTBACK ||
1464 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1465 condref, /* Condition reference number */
1466 cd)) /* Tables block */
1467 goto FAILED;
1468
1469 /* At the end of compiling, code is still pointing to the start of the
1470 group, while tempcode has been updated to point past the end of the group
1471 and any option resetting that may follow it. The pattern pointer (ptr)
1472 is on the bracket. */
1473
1474 /* If this is a conditional bracket, check that there are no more than
1475 two branches in the group. */
1476
1477 if (bravalue == OP_COND)
1478 {
1479 int branchcount = 0;
1480 uschar *tc = code;
1481
1482 do {
1483 branchcount++;
1484 tc += (tc[1] << 8) | tc[2];
1485 }
1486 while (*tc != OP_KET);
1487
1488 if (branchcount > 2)
1489 {
1490 *errorptr = ERR27;
1491 goto FAILED;
1492 }
1493 }
1494
1495 /* Now update the main code pointer to the end of the group. */
1496
1497 code = tempcode;
1498
1499 /* Error if hit end of pattern */
1500
1501 if (*ptr != ')')
1502 {
1503 *errorptr = ERR14;
1504 goto FAILED;
1505 }
1506 break;
1507
1508 /* Check \ for being a real metacharacter; if not, fall through and handle
1509 it as a data character at the start of a string. Escape items are checked
1510 for validity in the pre-compiling pass. */
1511
1512 case '\\':
1513 tempptr = ptr;
1514 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1515
1516 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1517 are arranged to be the negation of the corresponding OP_values. For the
1518 back references, the values are ESC_REF plus the reference number. Only
1519 back references and those types that consume a character may be repeated.
1520 We can test for values between ESC_b and ESC_Z for the latter; this may
1521 have to change if any new ones are ever created. */
1522
1523 if (c < 0)
1524 {
1525 if (-c >= ESC_REF)
1526 {
1527 previous = code;
1528 *code++ = OP_REF;
1529 *code++ = -c - ESC_REF;
1530 }
1531 else
1532 {
1533 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
1534 *code++ = -c;
1535 }
1536 continue;
1537 }
1538
1539 /* Data character: reset and fall through */
1540
1541 ptr = tempptr;
1542 c = '\\';
1543
1544 /* Handle a run of data characters until a metacharacter is encountered.
1545 The first character is guaranteed not to be whitespace or # when the
1546 extended flag is set. */
1547
1548 NORMAL_CHAR:
1549 default:
1550 previous = code;
1551 *code = OP_CHARS;
1552 code += 2;
1553 length = 0;
1554
1555 do
1556 {
1557 if ((options & PCRE_EXTENDED) != 0)
1558 {
1559 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1560 if (c == '#')
1561 {
1562 while ((c = *(++ptr)) != 0 && c != '\n');
1563 if (c == 0) break;
1564 continue;
1565 }
1566 }
1567
1568 /* Backslash may introduce a data char or a metacharacter. Escaped items
1569 are checked for validity in the pre-compiling pass. Stop the string
1570 before a metaitem. */
1571
1572 if (c == '\\')
1573 {
1574 tempptr = ptr;
1575 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1576 if (c < 0) { ptr = tempptr; break; }
1577 }
1578
1579 /* Ordinary character or single-char escape */
1580
1581 *code++ = c;
1582 length++;
1583 }
1584
1585 /* This "while" is the end of the "do" above. */
1586
1587 while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
1588
1589 /* Compute the length and set it in the data vector, and advance to
1590 the next state. */
1591
1592 previous[1] = length;
1593 if (length < 255) ptr--;
1594 break;
1595 }
1596 } /* end of big loop */
1597
1598 /* Control never reaches here by falling through, only by a goto for all the
1599 error states. Pass back the position in the pattern so that it can be displayed
1600 to the user for diagnosing the error. */
1601
1602 FAILED:
1603 *ptrptr = ptr;
1604 return FALSE;
1605 }
1606
1607
1608
1609
1610 /*************************************************
1611 * Compile sequence of alternatives *
1612 *************************************************/
1613
1614 /* On entry, ptr is pointing past the bracket character, but on return
1615 it points to the closing bracket, or vertical bar, or end of string.
1616 The code variable is pointing at the byte into which the BRA operator has been
1617 stored. If the ims options are changed at the start (for a (?ims: group) or
1618 during any branch, we need to insert an OP_OPT item at the start of every
1619 following branch to ensure they get set correctly at run time, and also pass
1620 the new options into every subsequent branch compile.
1621
1622 Argument:
1623 options the option bits
1624 optchanged new ims options to set as if (?ims) were at the start, or -1
1625 for no change
1626 brackets -> int containing the number of extracting brackets used
1627 codeptr -> the address of the current code pointer
1628 ptrptr -> the address of the current pattern pointer
1629 errorptr -> pointer to error message
1630 lookbehind TRUE if this is a lookbehind assertion
1631 condref > 0 for OPT_CREF setting at start of conditional group
1632 cd points to the data block with tables pointers
1633
1634 Returns: TRUE on success
1635 */
1636
1637 static BOOL
1638 compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
1639 const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,
1640 compile_data *cd)
1641 {
1642 const uschar *ptr = *ptrptr;
1643 uschar *code = *codeptr;
1644 uschar *last_branch = code;
1645 uschar *start_bracket = code;
1646 uschar *reverse_count = NULL;
1647 int oldoptions = options & PCRE_IMS;
1648
1649 code += 3;
1650
1651 /* At the start of a reference-based conditional group, insert the reference
1652 number as an OP_CREF item. */
1653
1654 if (condref > 0)
1655 {
1656 *code++ = OP_CREF;
1657 *code++ = condref;
1658 }
1659
1660 /* Loop for each alternative branch */
1661
1662 for (;;)
1663 {
1664 int length;
1665
1666 /* Handle change of options */
1667
1668 if (optchanged >= 0)
1669 {
1670 *code++ = OP_OPT;
1671 *code++ = optchanged;
1672 options = (options & ~PCRE_IMS) | optchanged;
1673 }
1674
1675 /* Set up dummy OP_REVERSE if lookbehind assertion */
1676
1677 if (lookbehind)
1678 {
1679 *code++ = OP_REVERSE;
1680 reverse_count = code;
1681 *code++ = 0;
1682 *code++ = 0;
1683 }
1684
1685 /* Now compile the branch */
1686
1687 if (!compile_branch(options,brackets,&code,&ptr,errorptr,&optchanged,cd))
1688 {
1689 *ptrptr = ptr;
1690 return FALSE;
1691 }
1692
1693 /* Fill in the length of the last branch */
1694
1695 length = code - last_branch;
1696 last_branch[1] = length >> 8;
1697 last_branch[2] = length & 255;
1698
1699 /* If lookbehind, check that this branch matches a fixed-length string,
1700 and put the length into the OP_REVERSE item. Temporarily mark the end of
1701 the branch with OP_END. */
1702
1703 if (lookbehind)
1704 {
1705 *code = OP_END;
1706 length = find_fixedlength(last_branch);
1707 DPRINTF(("fixed length = %d\n", length));
1708 if (length < 0)
1709 {
1710 *errorptr = ERR25;
1711 *ptrptr = ptr;
1712 return FALSE;
1713 }
1714 reverse_count[0] = (length >> 8);
1715 reverse_count[1] = length & 255;
1716 }
1717
1718 /* Reached end of expression, either ')' or end of pattern. Insert a
1719 terminating ket and the length of the whole bracketed item, and return,
1720 leaving the pointer at the terminating char. If any of the ims options
1721 were changed inside the group, compile a resetting op-code following. */
1722
1723 if (*ptr != '|')
1724 {
1725 length = code - start_bracket;
1726 *code++ = OP_KET;
1727 *code++ = length >> 8;
1728 *code++ = length & 255;
1729 if (optchanged >= 0)
1730 {
1731 *code++ = OP_OPT;
1732 *code++ = oldoptions;
1733 }
1734 *codeptr = code;
1735 *ptrptr = ptr;
1736 return TRUE;
1737 }
1738
1739 /* Another branch follows; insert an "or" node and advance the pointer. */
1740
1741 *code = OP_ALT;
1742 last_branch = code;
1743 code += 3;
1744 ptr++;
1745 }
1746 /* Control never reaches here */
1747 }
1748
1749
1750
1751
1752 /*************************************************
1753 * Find first significant op code *
1754 *************************************************/
1755
1756 /* This is called by several functions that scan a compiled expression looking
1757 for a fixed first character, or an anchoring op code etc. It skips over things
1758 that do not influence this. For one application, a change of caseless option is
1759 important.
1760
1761 Arguments:
1762 code pointer to the start of the group
1763 options pointer to external options
1764 optbit the option bit whose changing is significant, or
1765 zero if none are
1766 optstop TRUE to return on option change, otherwise change the options
1767 value and continue
1768
1769 Returns: pointer to the first significant opcode
1770 */
1771
1772 static const uschar*
1773 first_significant_code(const uschar *code, int *options, int optbit,
1774 BOOL optstop)
1775 {
1776 for (;;)
1777 {
1778 switch ((int)*code)
1779 {
1780 case OP_OPT:
1781 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1782 {
1783 if (optstop) return code;
1784 *options = (int)code[1];
1785 }
1786 code += 2;
1787 break;
1788
1789 case OP_CREF:
1790 code += 2;
1791 break;
1792
1793 case OP_WORD_BOUNDARY:
1794 case OP_NOT_WORD_BOUNDARY:
1795 code++;
1796 break;
1797
1798 case OP_ASSERT_NOT:
1799 case OP_ASSERTBACK:
1800 case OP_ASSERTBACK_NOT:
1801 do code += (code[1] << 8) + code[2]; while (*code == OP_ALT);
1802 code += 3;
1803 break;
1804
1805 default:
1806 return code;
1807 }
1808 }
1809 /* Control never reaches here */
1810 }
1811
1812
1813
1814
1815 /*************************************************
1816 * Check for anchored expression *
1817 *************************************************/
1818
1819 /* Try to find out if this is an anchored regular expression. Consider each
1820 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
1821 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
1822 it's anchored. However, if this is a multiline pattern, then only OP_SOD
1823 counts, since OP_CIRC can match in the middle.
1824
1825 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
1826 because that will try the rest of the pattern at all possible matching points,
1827 so there is no point trying them again.
1828
1829 Arguments:
1830 code points to start of expression (the bracket)
1831 options points to the options setting
1832
1833 Returns: TRUE or FALSE
1834 */
1835
1836 static BOOL
1837 is_anchored(register const uschar *code, int *options)
1838 {
1839 do {
1840 const uschar *scode = first_significant_code(code + 3, options,
1841 PCRE_MULTILINE, FALSE);
1842 register int op = *scode;
1843 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
1844 { if (!is_anchored(scode, options)) return FALSE; }
1845 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
1846 (*options & PCRE_DOTALL) != 0)
1847 { if (scode[1] != OP_ANY) return FALSE; }
1848 else if (op != OP_SOD &&
1849 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
1850 return FALSE;
1851 code += (code[1] << 8) + code[2];
1852 }
1853 while (*code == OP_ALT);
1854 return TRUE;
1855 }
1856
1857
1858
1859 /*************************************************
1860 * Check for starting with ^ or .* *
1861 *************************************************/
1862
1863 /* This is called to find out if every branch starts with ^ or .* so that
1864 "first char" processing can be done to speed things up in multiline
1865 matching and for non-DOTALL patterns that start with .* (which must start at
1866 the beginning or after \n).
1867
1868 Argument: points to start of expression (the bracket)
1869 Returns: TRUE or FALSE
1870 */
1871
1872 static BOOL
1873 is_startline(const uschar *code)
1874 {
1875 do {
1876 const uschar *scode = first_significant_code(code + 3, NULL, 0, FALSE);
1877 register int op = *scode;
1878 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
1879 { if (!is_startline(scode)) return FALSE; }
1880 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
1881 { if (scode[1] != OP_ANY) return FALSE; }
1882 else if (op != OP_CIRC) return FALSE;
1883 code += (code[1] << 8) + code[2];
1884 }
1885 while (*code == OP_ALT);
1886 return TRUE;
1887 }
1888
1889
1890
1891 /*************************************************
1892 * Check for fixed first char *
1893 *************************************************/
1894
1895 /* Try to find out if there is a fixed first character. This is called for
1896 unanchored expressions, as it speeds up their processing quite considerably.
1897 Consider each alternative branch. If they all start with the same char, or with
1898 a bracket all of whose alternatives start with the same char (recurse ad lib),
1899 then we return that char, otherwise -1.
1900
1901 Arguments:
1902 code points to start of expression (the bracket)
1903 options pointer to the options (used to check casing changes)
1904
1905 Returns: -1 or the fixed first char
1906 */
1907
1908 static int
1909 find_firstchar(const uschar *code, int *options)
1910 {
1911 register int c = -1;
1912 do {
1913 int d;
1914 const uschar *scode = first_significant_code(code + 3, options,
1915 PCRE_CASELESS, TRUE);
1916 register int op = *scode;
1917
1918 if (op >= OP_BRA) op = OP_BRA;
1919
1920 switch(op)
1921 {
1922 default:
1923 return -1;
1924
1925 case OP_BRA:
1926 case OP_ASSERT:
1927 case OP_ONCE:
1928 case OP_COND:
1929 if ((d = find_firstchar(scode, options)) < 0) return -1;
1930 if (c < 0) c = d; else if (c != d) return -1;
1931 break;
1932
1933 case OP_EXACT: /* Fall through */
1934 scode++;
1935
1936 case OP_CHARS: /* Fall through */
1937 scode++;
1938
1939 case OP_PLUS:
1940 case OP_MINPLUS:
1941 if (c < 0) c = scode[1]; else if (c != scode[1]) return -1;
1942 break;
1943 }
1944
1945 code += (code[1] << 8) + code[2];
1946 }
1947 while (*code == OP_ALT);
1948 return c;
1949 }
1950
1951
1952
1953
1954
1955 /*************************************************
1956 * Compile a Regular Expression *
1957 *************************************************/
1958
1959 /* This function takes a string and returns a pointer to a block of store
1960 holding a compiled version of the expression.
1961
1962 Arguments:
1963 pattern the regular expression
1964 options various option bits
1965 errorptr pointer to pointer to error text
1966 erroroffset ptr offset in pattern where error was detected
1967 tables pointer to character tables or NULL
1968
1969 Returns: pointer to compiled data block, or NULL on error,
1970 with errorptr and erroroffset set
1971 */
1972
1973 pcre *
1974 pcre_compile(const char *pattern, int options, const char **errorptr,
1975 int *erroroffset, const unsigned char *tables)
1976 {
1977 real_pcre *re;
1978 int length = 3; /* For initial BRA plus length */
1979 int runlength;
1980 int c, size;
1981 int bracount = 0;
1982 int top_backref = 0;
1983 int branch_extra = 0;
1984 int branch_newextra;
1985 unsigned int brastackptr = 0;
1986 uschar *code;
1987 const uschar *ptr;
1988 compile_data compile_block;
1989 int brastack[BRASTACK_SIZE];
1990 uschar bralenstack[BRASTACK_SIZE];
1991
1992 #ifdef DEBUG
1993 uschar *code_base, *code_end;
1994 #endif
1995
1996 /* We can't pass back an error message if errorptr is NULL; I guess the best we
1997 can do is just return NULL. */
1998
1999 if (errorptr == NULL) return NULL;
2000 *errorptr = NULL;
2001
2002 /* However, we can give a message for this error */
2003
2004 if (erroroffset == NULL)
2005 {
2006 *errorptr = ERR16;
2007 return NULL;
2008 }
2009 *erroroffset = 0;
2010
2011 if ((options & ~PUBLIC_OPTIONS) != 0)
2012 {
2013 *errorptr = ERR17;
2014 return NULL;
2015 }
2016
2017 /* Set up pointers to the individual character tables */
2018
2019 if (tables == NULL) tables = pcre_default_tables;
2020 compile_block.lcc = tables + lcc_offset;
2021 compile_block.fcc = tables + fcc_offset;
2022 compile_block.cbits = tables + cbits_offset;
2023 compile_block.ctypes = tables + ctypes_offset;
2024
2025 /* Reflect pattern for debugging output */
2026
2027 DPRINTF(("------------------------------------------------------------------\n"));
2028 DPRINTF(("%s\n", pattern));
2029
2030 /* The first thing to do is to make a pass over the pattern to compute the
2031 amount of store required to hold the compiled code. This does not have to be
2032 perfect as long as errors are overestimates. At the same time we can detect any
2033 internal flag settings. Make an attempt to correct for any counted white space
2034 if an "extended" flag setting appears late in the pattern. We can't be so
2035 clever for #-comments. */
2036
2037 ptr = (const uschar *)(pattern - 1);
2038 while ((c = *(++ptr)) != 0)
2039 {
2040 int min, max;
2041 int class_charcount;
2042
2043 if ((options & PCRE_EXTENDED) != 0)
2044 {
2045 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2046 if (c == '#')
2047 {
2048 while ((c = *(++ptr)) != 0 && c != '\n');
2049 continue;
2050 }
2051 }
2052
2053 switch(c)
2054 {
2055 /* A backslashed item may be an escaped "normal" character or a
2056 character type. For a "normal" character, put the pointers and
2057 character back so that tests for whitespace etc. in the input
2058 are done correctly. */
2059
2060 case '\\':
2061 {
2062 const uschar *save_ptr = ptr;
2063 c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
2064 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2065 if (c >= 0)
2066 {
2067 ptr = save_ptr;
2068 c = '\\';
2069 goto NORMAL_CHAR;
2070 }
2071 }
2072 length++;
2073
2074 /* A back reference needs an additional char, plus either one or 5
2075 bytes for a repeat. We also need to keep the value of the highest
2076 back reference. */
2077
2078 if (c <= -ESC_REF)
2079 {
2080 int refnum = -c - ESC_REF;
2081 if (refnum > top_backref) top_backref = refnum;
2082 length++; /* For single back reference */
2083 if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2084 {
2085 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2086 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2087 if ((min == 0 && (max == 1 || max == -1)) ||
2088 (min == 1 && max == -1))
2089 length++;
2090 else length += 5;
2091 if (ptr[1] == '?') ptr++;
2092 }
2093 }
2094 continue;
2095
2096 case '^':
2097 case '.':
2098 case '$':
2099 case '*': /* These repeats won't be after brackets; */
2100 case '+': /* those are handled separately */
2101 case '?':
2102 length++;
2103 continue;
2104
2105 /* This covers the cases of repeats after a single char, metachar, class,
2106 or back reference. */
2107
2108 case '{':
2109 if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
2110 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
2111 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2112 if ((min == 0 && (max == 1 || max == -1)) ||
2113 (min == 1 && max == -1))
2114 length++;
2115 else
2116 {
2117 length--; /* Uncount the original char or metachar */
2118 if (min == 1) length++; else if (min > 0) length += 4;
2119 if (max > 0) length += 4; else length += 2;
2120 }
2121 if (ptr[1] == '?') ptr++;
2122 continue;
2123
2124 /* An alternation contains an offset to the next branch or ket. If any ims
2125 options changed in the previous branch(es), and/or if we are in a
2126 lookbehind assertion, extra space will be needed at the start of the
2127 branch. This is handled by branch_extra. */
2128
2129 case '|':
2130 length += 3 + branch_extra;
2131 continue;
2132
2133 /* A character class uses 33 characters. Don't worry about character types
2134 that aren't allowed in classes - they'll get picked up during the compile.
2135 A character class that contains only one character uses 2 or 3 bytes,
2136 depending on whether it is negated or not. Notice this where we can. */
2137
2138 case '[':
2139 class_charcount = 0;
2140 if (*(++ptr) == '^') ptr++;
2141 do
2142 {
2143 if (*ptr == '\\')
2144 {
2145 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
2146 &compile_block);
2147 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2148 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2149 }
2150 else class_charcount++;
2151 ptr++;
2152 }
2153 while (*ptr != 0 && *ptr != ']');
2154
2155 /* Repeats for negated single chars are handled by the general code */
2156
2157 if (class_charcount == 1) length += 3; else
2158 {
2159 length += 33;
2160
2161 /* A repeat needs either 1 or 5 bytes. */
2162
2163 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2164 {
2165 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2166 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2167 if ((min == 0 && (max == 1 || max == -1)) ||
2168 (min == 1 && max == -1))
2169 length++;
2170 else length += 5;
2171 if (ptr[1] == '?') ptr++;
2172 }
2173 }
2174 continue;
2175
2176 /* Brackets may be genuine groups or special things */
2177
2178 case '(':
2179 branch_newextra = 0;
2180
2181 /* Handle special forms of bracket, which all start (? */
2182
2183 if (ptr[1] == '?')
2184 {
2185 int set, unset;
2186 int *optset;
2187
2188 switch (c = ptr[2])
2189 {
2190 /* Skip over comments entirely */
2191 case '#':
2192 ptr += 3;
2193 while (*ptr != 0 && *ptr != ')') ptr++;
2194 if (*ptr == 0)
2195 {
2196 *errorptr = ERR18;
2197 goto PCRE_ERROR_RETURN;
2198 }
2199 continue;
2200
2201 /* Non-referencing groups and lookaheads just move the pointer on, and
2202 then behave like a non-special bracket, except that they don't increment
2203 the count of extracting brackets. Ditto for the "once only" bracket,
2204 which is in Perl from version 5.005. */
2205
2206 case ':':
2207 case '=':
2208 case '!':
2209 case '>':
2210 ptr += 2;
2211 break;
2212
2213 /* Lookbehinds are in Perl from version 5.005 */
2214
2215 case '<':
2216 if (ptr[3] == '=' || ptr[3] == '!')
2217 {
2218 ptr += 3;
2219 branch_newextra = 3;
2220 length += 3; /* For the first branch */
2221 break;
2222 }
2223 *errorptr = ERR24;
2224 goto PCRE_ERROR_RETURN;
2225
2226 /* Conditionals are in Perl from version 5.005. The bracket must either
2227 be followed by a number (for bracket reference) or by an assertion
2228 group. */
2229
2230 case '(':
2231 if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
2232 {
2233 ptr += 4;
2234 length += 2;
2235 while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
2236 if (*ptr != ')')
2237 {
2238 *errorptr = ERR26;
2239 goto PCRE_ERROR_RETURN;
2240 }
2241 }
2242 else /* An assertion must follow */
2243 {
2244 ptr++; /* Can treat like ':' as far as spacing is concerned */
2245
2246 if (ptr[2] != '?' || strchr("=!<", ptr[3]) == NULL)
2247 {
2248 ptr += 2; /* To get right offset in message */
2249 *errorptr = ERR28;
2250 goto PCRE_ERROR_RETURN;
2251 }
2252 }
2253 break;
2254
2255 /* Else loop checking valid options until ) is met. Anything else is an
2256 error. If we are without any brackets, i.e. at top level, the settings
2257 act as if specified in the options, so massage the options immediately.
2258 This is for backward compatibility with Perl 5.004. */
2259
2260 default:
2261 set = unset = 0;
2262 optset = &set;
2263 ptr += 2;
2264
2265 for (;; ptr++)
2266 {
2267 c = *ptr;
2268 switch (c)
2269 {
2270 case 'i':
2271 *optset |= PCRE_CASELESS;
2272 continue;
2273
2274 case 'm':
2275 *optset |= PCRE_MULTILINE;
2276 continue;
2277
2278 case 's':
2279 *optset |= PCRE_DOTALL;
2280 continue;
2281
2282 case 'x':
2283 *optset |= PCRE_EXTENDED;
2284 continue;
2285
2286 case 'X':
2287 *optset |= PCRE_EXTRA;
2288 continue;
2289
2290 case 'U':
2291 *optset |= PCRE_UNGREEDY;
2292 continue;
2293
2294 case '-':
2295 optset = &unset;
2296 continue;
2297
2298 /* A termination by ')' indicates an options-setting-only item;
2299 this is global at top level; otherwise nothing is done here and
2300 it is handled during the compiling process on a per-bracket-group
2301 basis. */
2302
2303 case ')':
2304 if (brastackptr == 0)
2305 {
2306 options = (options | set) & (~unset);
2307 set = unset = 0; /* To save length */
2308 }
2309 /* Fall through */
2310
2311 /* A termination by ':' indicates the start of a nested group with
2312 the given options set. This is again handled at compile time, but
2313 we must allow for compiled space if any of the ims options are
2314 set. We also have to allow for resetting space at the end of
2315 the group, which is why 4 is added to the length and not just 2.
2316 If there are several changes of options within the same group, this
2317 will lead to an over-estimate on the length, but this shouldn't
2318 matter very much. We also have to allow for resetting options at
2319 the start of any alternations, which we do by setting
2320 branch_newextra to 2. */
2321
2322 case ':':
2323 if (((set|unset) & PCRE_IMS) != 0)
2324 {
2325 length += 4;
2326 branch_newextra = 2;
2327 }
2328 goto END_OPTIONS;
2329
2330 /* Unrecognized option character */
2331
2332 default:
2333 *errorptr = ERR12;
2334 goto PCRE_ERROR_RETURN;
2335 }
2336 }
2337
2338 /* If we hit a closing bracket, that's it - this is a freestanding
2339 option-setting. We need to ensure that branch_extra is updated if
2340 necessary. The only values branch_newextra can have here are 0 or 2.
2341 If the value is 2, then branch_extra must either be 2 or 5, depending
2342 on whether this is a lookbehind group or not. */
2343
2344 END_OPTIONS:
2345 if (c == ')')
2346 {
2347 if (branch_newextra == 2 && (branch_extra == 0 || branch_extra == 3))
2348 branch_extra += branch_newextra;
2349 continue;
2350 }
2351
2352 /* If options were terminated by ':' control comes here. Fall through
2353 to handle the group below. */
2354 }
2355 }
2356
2357 /* Extracting brackets must be counted so we can process escapes in a
2358 Perlish way. */
2359
2360 else bracount++;
2361
2362 /* Non-special forms of bracket. Save length for computing whole length
2363 at end if there's a repeat that requires duplication of the group. Also
2364 save the current value of branch_extra, and start the new group with
2365 the new value. If non-zero, this will either be 2 for a (?imsx: group, or 3
2366 for a lookbehind assertion. */
2367
2368 if (brastackptr >= sizeof(brastack)/sizeof(int))
2369 {
2370 *errorptr = ERR19;
2371 goto PCRE_ERROR_RETURN;
2372 }
2373
2374 bralenstack[brastackptr] = branch_extra;
2375 branch_extra = branch_newextra;
2376
2377 brastack[brastackptr++] = length;
2378 length += 3;
2379 continue;
2380
2381 /* Handle ket. Look for subsequent max/min; for certain sets of values we
2382 have to replicate this bracket up to that many times. If brastackptr is
2383 0 this is an unmatched bracket which will generate an error, but take care
2384 not to try to access brastack[-1] when computing the length and restoring
2385 the branch_extra value. */
2386
2387 case ')':
2388 length += 3;
2389 {
2390 int minval = 1;
2391 int maxval = 1;
2392 int duplength;
2393
2394 if (brastackptr > 0)
2395 {
2396 duplength = length - brastack[--brastackptr];
2397 branch_extra = bralenstack[brastackptr];
2398 }
2399 else duplength = 0;
2400
2401 /* Leave ptr at the final char; for read_repeat_counts this happens
2402 automatically; for the others we need an increment. */
2403
2404 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
2405 {
2406 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr,
2407 &compile_block);
2408 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2409 }
2410 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2411 else if (c == '+') { maxval = -1; ptr++; }
2412 else if (c == '?') { minval = 0; ptr++; }
2413
2414 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2415 group, and if the maximum is greater than zero, we have to replicate
2416 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2417 bracket set - hence the 7. */
2418
2419 if (minval == 0)
2420 {
2421 length++;
2422 if (maxval > 0) length += (maxval - 1) * (duplength + 7);
2423 }
2424
2425 /* When the minimum is greater than zero, 1 we have to replicate up to
2426 minval-1 times, with no additions required in the copies. Then, if
2427 there is a limited maximum we have to replicate up to maxval-1 times
2428 allowing for a BRAZERO item before each optional copy and nesting
2429 brackets for all but one of the optional copies. */
2430
2431 else
2432 {
2433 length += (minval - 1) * duplength;
2434 if (maxval > minval) /* Need this test as maxval=-1 means no limit */
2435 length += (maxval - minval) * (duplength + 7) - 6;
2436 }
2437 }
2438 continue;
2439
2440 /* Non-special character. For a run of such characters the length required
2441 is the number of characters + 2, except that the maximum run length is 255.
2442 We won't get a skipped space or a non-data escape or the start of a #
2443 comment as the first character, so the length can't be zero. */
2444
2445 NORMAL_CHAR:
2446 default:
2447 length += 2;
2448 runlength = 0;
2449 do
2450 {
2451 if ((options & PCRE_EXTENDED) != 0)
2452 {
2453 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2454 if (c == '#')
2455 {
2456 while ((c = *(++ptr)) != 0 && c != '\n');
2457 continue;
2458 }
2459 }
2460
2461 /* Backslash may introduce a data char or a metacharacter; stop the
2462 string before the latter. */
2463
2464 if (c == '\\')
2465 {
2466 const uschar *saveptr = ptr;
2467 c = check_escape(&ptr, errorptr, bracount, options, FALSE,
2468 &compile_block);
2469 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2470 if (c < 0) { ptr = saveptr; break; }
2471 }
2472
2473 /* Ordinary character or single-char escape */
2474
2475 runlength++;
2476 }
2477
2478 /* This "while" is the end of the "do" above. */
2479
2480 while (runlength < 255 &&
2481 (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
2482
2483 ptr--;
2484 length += runlength;
2485 continue;
2486 }
2487 }
2488
2489 length += 4; /* For final KET and END */
2490
2491 if (length > 65539)
2492 {
2493 *errorptr = ERR20;
2494 return NULL;
2495 }
2496
2497 /* Compute the size of data block needed and get it, either from malloc or
2498 externally provided function. We specify "code[0]" in the offsetof() expression
2499 rather than just "code", because it has been reported that one broken compiler
2500 fails on "code" because it is also an independent variable. It should make no
2501 difference to the value of the offsetof(). */
2502
2503 size = length + offsetof(real_pcre, code[0]);
2504 re = (real_pcre *)(pcre_malloc)(size);
2505
2506 if (re == NULL)
2507 {
2508 *errorptr = ERR21;
2509 return NULL;
2510 }
2511
2512 /* Put in the magic number and the options. */
2513
2514 re->magic_number = MAGIC_NUMBER;
2515 re->options = options;
2516 re->tables = tables;
2517
2518 /* Set up a starting, non-extracting bracket, then compile the expression. On
2519 error, *errorptr will be set non-NULL, so we don't need to look at the result
2520 of the function here. */
2521
2522 ptr = (const uschar *)pattern;
2523 code = re->code;
2524 *code = OP_BRA;
2525 bracount = 0;
2526 (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,
2527 &compile_block);
2528 re->top_bracket = bracount;
2529 re->top_backref = top_backref;
2530
2531 /* If not reached end of pattern on success, there's an excess bracket. */
2532
2533 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
2534
2535 /* Fill in the terminating state and check for disastrous overflow, but
2536 if debugging, leave the test till after things are printed out. */
2537
2538 *code++ = OP_END;
2539
2540 #ifndef DEBUG
2541 if (code - re->code > length) *errorptr = ERR23;
2542 #endif
2543
2544 /* Give an error if there's back reference to a non-existent capturing
2545 subpattern. */
2546
2547 if (top_backref > re->top_bracket) *errorptr = ERR15;
2548
2549 /* Failed to compile */
2550
2551 if (*errorptr != NULL)
2552 {
2553 (pcre_free)(re);
2554 PCRE_ERROR_RETURN:
2555 *erroroffset = ptr - (const uschar *)pattern;
2556 return NULL;
2557 }
2558
2559 /* If the anchored option was not passed, set flag if we can determine that the
2560 pattern is anchored by virtue of ^ characters or \A or anything else (such as
2561 starting with .* when DOTALL is set).
2562
2563 Otherwise, see if we can determine what the first character has to be, because
2564 that speeds up unanchored matches no end. If not, see if we can set the
2565 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
2566 start with ^. and also when all branches start with .* for non-DOTALL matches.
2567 */
2568
2569 if ((options & PCRE_ANCHORED) == 0)
2570 {
2571 int temp_options = options;
2572 if (is_anchored(re->code, &temp_options))
2573 re->options |= PCRE_ANCHORED;
2574 else
2575 {
2576 int ch = find_firstchar(re->code, &temp_options);
2577 if (ch >= 0)
2578 {
2579 re->first_char = ch;
2580 re->options |= PCRE_FIRSTSET;
2581 }
2582 else if (is_startline(re->code))
2583 re->options |= PCRE_STARTLINE;
2584 }
2585 }
2586
2587 /* Print out the compiled data for debugging */
2588
2589 #ifdef DEBUG
2590
2591 printf("Length = %d top_bracket = %d top_backref = %d\n",
2592 length, re->top_bracket, re->top_backref);
2593
2594 if (re->options != 0)
2595 {
2596 printf("%s%s%s%s%s%s%s%s\n",
2597 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2598 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2599 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2600 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2601 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
2602 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
2603 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
2604 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
2605 }
2606
2607 if ((re->options & PCRE_FIRSTSET) != 0)
2608 {
2609 if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
2610 else printf("First char = \\x%02x\n", re->first_char);
2611 }
2612
2613 code_end = code;
2614 code_base = code = re->code;
2615
2616 while (code < code_end)
2617 {
2618 int charlength;
2619
2620 printf("%3d ", code - code_base);
2621
2622 if (*code >= OP_BRA)
2623 {
2624 printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
2625 code += 2;
2626 }
2627
2628 else switch(*code)
2629 {
2630 case OP_OPT:
2631 printf(" %.2x %s", code[1], OP_names[*code]);
2632 code++;
2633 break;
2634
2635 case OP_COND:
2636 printf("%3d Cond", (code[1] << 8) + code[2]);
2637 code += 2;
2638 break;
2639
2640 case OP_CREF:
2641 printf(" %.2d %s", code[1], OP_names[*code]);
2642 code++;
2643 break;
2644
2645 case OP_CHARS:
2646 charlength = *(++code);
2647 printf("%3d ", charlength);
2648 while (charlength-- > 0)
2649 if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2650 break;
2651
2652 case OP_KETRMAX:
2653 case OP_KETRMIN:
2654 case OP_ALT:
2655 case OP_KET:
2656 case OP_ASSERT:
2657 case OP_ASSERT_NOT:
2658 case OP_ASSERTBACK:
2659 case OP_ASSERTBACK_NOT:
2660 case OP_ONCE:
2661 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2662 code += 2;
2663 break;
2664
2665 case OP_REVERSE:
2666 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2667 code += 2;
2668 break;
2669
2670 case OP_STAR:
2671 case OP_MINSTAR:
2672 case OP_PLUS:
2673 case OP_MINPLUS:
2674 case OP_QUERY:
2675 case OP_MINQUERY:
2676 case OP_TYPESTAR:
2677 case OP_TYPEMINSTAR:
2678 case OP_TYPEPLUS:
2679 case OP_TYPEMINPLUS:
2680 case OP_TYPEQUERY:
2681 case OP_TYPEMINQUERY:
2682 if (*code >= OP_TYPESTAR)
2683 printf(" %s", OP_names[code[1]]);
2684 else if (isprint(c = code[1])) printf(" %c", c);
2685 else printf(" \\x%02x", c);
2686 printf("%s", OP_names[*code++]);
2687 break;
2688
2689 case OP_EXACT:
2690 case OP_UPTO:
2691 case OP_MINUPTO:
2692 if (isprint(c = code[3])) printf(" %c{", c);
2693 else printf(" \\x%02x{", c);
2694 if (*code != OP_EXACT) printf("0,");
2695 printf("%d}", (code[1] << 8) + code[2]);
2696 if (*code == OP_MINUPTO) printf("?");
2697 code += 3;
2698 break;
2699
2700 case OP_TYPEEXACT:
2701 case OP_TYPEUPTO:
2702 case OP_TYPEMINUPTO:
2703 printf(" %s{", OP_names[code[3]]);
2704 if (*code != OP_TYPEEXACT) printf(",");
2705 printf("%d}", (code[1] << 8) + code[2]);
2706 if (*code == OP_TYPEMINUPTO) printf("?");
2707 code += 3;
2708 break;
2709
2710 case OP_NOT:
2711 if (isprint(c = *(++code))) printf(" [^%c]", c);
2712 else printf(" [^\\x%02x]", c);
2713 break;
2714
2715 case OP_NOTSTAR:
2716 case OP_NOTMINSTAR:
2717 case OP_NOTPLUS:
2718 case OP_NOTMINPLUS:
2719 case OP_NOTQUERY:
2720 case OP_NOTMINQUERY:
2721 if (isprint(c = code[1])) printf(" [^%c]", c);
2722 else printf(" [^\\x%02x]", c);
2723 printf("%s", OP_names[*code++]);
2724 break;
2725
2726 case OP_NOTEXACT:
2727 case OP_NOTUPTO:
2728 case OP_NOTMINUPTO:
2729 if (isprint(c = code[3])) printf(" [^%c]{", c);
2730 else printf(" [^\\x%02x]{", c);
2731 if (*code != OP_NOTEXACT) printf(",");
2732 printf("%d}", (code[1] << 8) + code[2]);
2733 if (*code == OP_NOTMINUPTO) printf("?");
2734 code += 3;
2735 break;
2736
2737 case OP_REF:
2738 printf(" \\%d", *(++code));
2739 code ++;
2740 goto CLASS_REF_REPEAT;
2741
2742 case OP_CLASS:
2743 {
2744 int i, min, max;
2745 code++;
2746 printf(" [");
2747
2748 for (i = 0; i < 256; i++)
2749 {
2750 if ((code[i/8] & (1 << (i&7))) != 0)
2751 {
2752 int j;
2753 for (j = i+1; j < 256; j++)
2754 if ((code[j/8] & (1 << (j&7))) == 0) break;
2755 if (i == '-' || i == ']') printf("\\");
2756 if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2757 if (--j > i)
2758 {
2759 printf("-");
2760 if (j == '-' || j == ']') printf("\\");
2761 if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2762 }
2763 i = j;
2764 }
2765 }
2766 printf("]");
2767 code += 32;
2768
2769 CLASS_REF_REPEAT:
2770
2771 switch(*code)
2772 {
2773 case OP_CRSTAR:
2774 case OP_CRMINSTAR:
2775 case OP_CRPLUS:
2776 case OP_CRMINPLUS:
2777 case OP_CRQUERY:
2778 case OP_CRMINQUERY:
2779 printf("%s", OP_names[*code]);
2780 break;
2781
2782 case OP_CRRANGE:
2783 case OP_CRMINRANGE:
2784 min = (code[1] << 8) + code[2];
2785 max = (code[3] << 8) + code[4];
2786 if (max == 0) printf("{%d,}", min);
2787 else printf("{%d,%d}", min, max);
2788 if (*code == OP_CRMINRANGE) printf("?");
2789 code += 4;
2790 break;
2791
2792 default:
2793 code--;
2794 }
2795 }
2796 break;
2797
2798 /* Anything else is just a one-node item */
2799
2800 default:
2801 printf(" %s", OP_names[*code]);
2802 break;
2803 }
2804
2805 code++;
2806 printf("\n");
2807 }
2808 printf("------------------------------------------------------------------\n");
2809
2810 /* This check is done here in the debugging case so that the code that
2811 was compiled can be seen. */
2812
2813 if (code - re->code > length)
2814 {
2815 *errorptr = ERR23;
2816 (pcre_free)(re);
2817 *erroroffset = ptr - (uschar *)pattern;
2818 return NULL;
2819 }
2820 #endif
2821
2822 return (pcre *)re;
2823 }
2824
2825
2826
2827 /*************************************************
2828 * Match a back-reference *
2829 *************************************************/
2830
2831 /* If a back reference hasn't been set, the length that is passed is greater
2832 than the number of characters left in the string, so the match fails.
2833
2834 Arguments:
2835 offset index into the offset vector
2836 eptr points into the subject
2837 length length to be matched
2838 md points to match data block
2839 ims the ims flags
2840
2841 Returns: TRUE if matched
2842 */
2843
2844 static BOOL
2845 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
2846 int ims)
2847 {
2848 const uschar *p = md->start_subject + md->offset_vector[offset];
2849
2850 #ifdef DEBUG
2851 if (eptr >= md->end_subject)
2852 printf("matching subject <null>");
2853 else
2854 {
2855 printf("matching subject ");
2856 pchars(eptr, length, TRUE, md);
2857 }
2858 printf(" against backref ");
2859 pchars(p, length, FALSE, md);
2860 printf("\n");
2861 #endif
2862
2863 /* Always fail if not enough characters left */
2864
2865 if (length > md->end_subject - eptr) return FALSE;
2866
2867 /* Separate the caselesss case for speed */
2868
2869 if ((ims & PCRE_CASELESS) != 0)
2870 {
2871 while (length-- > 0)
2872 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
2873 }
2874 else
2875 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
2876
2877 return TRUE;
2878 }
2879
2880
2881
2882 /*************************************************
2883 * Match from current position *
2884 *************************************************/
2885
2886 /* On entry ecode points to the first opcode, and eptr to the first character
2887 in the subject string, while eptrb holds the value of eptr at the start of the
2888 last bracketed group - used for breaking infinite loops matching zero-length
2889 strings.
2890
2891 Arguments:
2892 eptr pointer in subject
2893 ecode position in code
2894 offset_top current top pointer
2895 md pointer to "static" info for the match
2896 ims current /i, /m, and /s options
2897 condassert TRUE if called to check a condition assertion
2898 eptrb eptr at start of last bracket
2899
2900 Returns: TRUE if matched
2901 */
2902
2903 static BOOL
2904 match(register const uschar *eptr, register const uschar *ecode,
2905 int offset_top, match_data *md, int ims, BOOL condassert, const uschar *eptrb)
2906 {
2907 int original_ims = ims; /* Save for resetting on ')' */
2908
2909 for (;;)
2910 {
2911 int op = (int)*ecode;
2912 int min, max, ctype;
2913 register int i;
2914 register int c;
2915 BOOL minimize = FALSE;
2916
2917 /* Opening capturing bracket. If there is space in the offset vector, save
2918 the current subject position in the working slot at the top of the vector. We
2919 mustn't change the current values of the data slot, because they may be set
2920 from a previous iteration of this group, and be referred to by a reference
2921 inside the group.
2922
2923 If the bracket fails to match, we need to restore this value and also the
2924 values of the final offsets, in case they were set by a previous iteration of
2925 the same bracket.
2926
2927 If there isn't enough space in the offset vector, treat this as if it were a
2928 non-capturing bracket. Don't worry about setting the flag for the error case
2929 here; that is handled in the code for KET. */
2930
2931 if (op > OP_BRA)
2932 {
2933 int number = op - OP_BRA;
2934 int offset = number << 1;
2935
2936 #ifdef DEBUG
2937 printf("start bracket %d subject=", number);
2938 pchars(eptr, 16, TRUE, md);
2939 printf("\n");
2940 #endif
2941
2942 if (offset < md->offset_max)
2943 {
2944 int save_offset1 = md->offset_vector[offset];
2945 int save_offset2 = md->offset_vector[offset+1];
2946 int save_offset3 = md->offset_vector[md->offset_end - number];
2947
2948 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
2949 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
2950
2951 do
2952 {
2953 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
2954 ecode += (ecode[1] << 8) + ecode[2];
2955 }
2956 while (*ecode == OP_ALT);
2957
2958 DPRINTF(("bracket %d failed\n", number));
2959
2960 md->offset_vector[offset] = save_offset1;
2961 md->offset_vector[offset+1] = save_offset2;
2962 md->offset_vector[md->offset_end - number] = save_offset3;
2963 return FALSE;
2964 }
2965
2966 /* Insufficient room for saving captured contents */
2967
2968 else op = OP_BRA;
2969 }
2970
2971 /* Other types of node can be handled by a switch */
2972
2973 switch(op)
2974 {
2975 case OP_BRA: /* Non-capturing bracket: optimized */
2976 DPRINTF(("start bracket 0\n"));
2977 do
2978 {
2979 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
2980 ecode += (ecode[1] << 8) + ecode[2];
2981 }
2982 while (*ecode == OP_ALT);
2983 DPRINTF(("bracket 0 failed\n"));
2984 return FALSE;
2985
2986 /* Conditional group: compilation checked that there are no more than
2987 two branches. If the condition is false, skipping the first branch takes us
2988 past the end if there is only one branch, but that's OK because that is
2989 exactly what going to the ket would do. */
2990
2991 case OP_COND:
2992 if (ecode[3] == OP_CREF) /* Condition is extraction test */
2993 {
2994 int offset = ecode[4] << 1; /* Doubled reference number */
2995 return match(eptr,
2996 ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
2997 5 : 3 + (ecode[1] << 8) + ecode[2]),
2998 offset_top, md, ims, FALSE, eptr);
2999 }
3000
3001 /* The condition is an assertion. Call match() to evaluate it - setting
3002 the final argument TRUE causes it to stop at the end of an assertion. */
3003
3004 else
3005 {
3006 if (match(eptr, ecode+3, offset_top, md, ims, TRUE, NULL))
3007 {
3008 ecode += 3 + (ecode[4] << 8) + ecode[5];
3009 while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
3010 }
3011 else ecode += (ecode[1] << 8) + ecode[2];
3012 return match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr);
3013 }
3014 /* Control never reaches here */
3015
3016 /* Skip over conditional reference data if encountered (should not be) */
3017
3018 case OP_CREF:
3019 ecode += 2;
3020 break;
3021
3022 /* End of the pattern */
3023
3024 case OP_END:
3025 md->end_match_ptr = eptr; /* Record where we ended */
3026 md->end_offset_top = offset_top; /* and how many extracts were taken */
3027 return TRUE;
3028
3029 /* Change option settings */
3030
3031 case OP_OPT:
3032 ims = ecode[1];
3033 ecode += 2;
3034 DPRINTF(("ims set to %02x\n", ims));
3035 break;
3036
3037 /* Assertion brackets. Check the alternative branches in turn - the
3038 matching won't pass the KET for an assertion. If any one branch matches,
3039 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
3040 start of each branch to move the current point backwards, so the code at
3041 this level is identical to the lookahead case. */
3042
3043 case OP_ASSERT:
3044 case OP_ASSERTBACK:
3045 do
3046 {
3047 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) break;
3048 ecode += (ecode[1] << 8) + ecode[2];
3049 }
3050 while (*ecode == OP_ALT);
3051 if (*ecode == OP_KET) return FALSE;
3052
3053 /* If checking an assertion for a condition, return TRUE. */
3054
3055 if (condassert) return TRUE;
3056
3057 /* Continue from after the assertion, updating the offsets high water
3058 mark, since extracts may have been taken during the assertion. */
3059
3060 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3061 ecode += 3;
3062 offset_top = md->end_offset_top;
3063 continue;
3064
3065 /* Negative assertion: all branches must fail to match */
3066
3067 case OP_ASSERT_NOT:
3068 case OP_ASSERTBACK_NOT:
3069 do
3070 {
3071 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) return FALSE;
3072 ecode += (ecode[1] << 8) + ecode[2];
3073 }
3074 while (*ecode == OP_ALT);
3075
3076 if (condassert) return TRUE;
3077 ecode += 3;
3078 continue;
3079
3080 /* Move the subject pointer back. This occurs only at the start of
3081 each branch of a lookbehind assertion. If we are too close to the start to
3082 move back, this match function fails. */
3083
3084 case OP_REVERSE:
3085 eptr -= (ecode[1] << 8) + ecode[2];
3086 if (eptr < md->start_subject) return FALSE;
3087 ecode += 3;
3088 break;
3089
3090
3091 /* "Once" brackets are like assertion brackets except that after a match,
3092 the point in the subject string is not moved back. Thus there can never be
3093 a move back into the brackets. Check the alternative branches in turn - the
3094 matching won't pass the KET for this kind of subpattern. If any one branch
3095 matches, we carry on as at the end of a normal bracket, leaving the subject
3096 pointer. */
3097
3098 case OP_ONCE:
3099 {
3100 const uschar *prev = ecode;
3101
3102 do
3103 {
3104 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) break;
3105 ecode += (ecode[1] << 8) + ecode[2];
3106 }
3107 while (*ecode == OP_ALT);
3108
3109 /* If hit the end of the group (which could be repeated), fail */
3110
3111 if (*ecode != OP_ONCE && *ecode != OP_ALT) return FALSE;
3112
3113 /* Continue as from after the assertion, updating the offsets high water
3114 mark, since extracts may have been taken. */
3115
3116 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3117
3118 offset_top = md->end_offset_top;
3119 eptr = md->end_match_ptr;
3120
3121 /* For a non-repeating ket, just continue at this level. This also
3122 happens for a repeating ket if no characters were matched in the group.
3123 This is the forcible breaking of infinite loops as implemented in Perl
3124 5.005. If there is an options reset, it will get obeyed in the normal
3125 course of events. */
3126
3127 if (*ecode == OP_KET || eptr == eptrb)
3128 {
3129 ecode += 3;
3130 break;
3131 }
3132
3133 /* The repeating kets try the rest of the pattern or restart from the
3134 preceding bracket, in the appropriate order. We need to reset any options
3135 that changed within the bracket before re-running it, so check the next
3136 opcode. */
3137
3138 if (ecode[3] == OP_OPT)
3139 {
3140 ims = (ims & ~PCRE_IMS) | ecode[4];
3141 DPRINTF(("ims set to %02x at group repeat\n", ims));
3142 }
3143
3144 if (*ecode == OP_KETRMIN)
3145 {
3146 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||
3147 match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;
3148 }
3149 else /* OP_KETRMAX */
3150 {
3151 if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||
3152 match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3153 }
3154 }
3155 return FALSE;
3156
3157 /* An alternation is the end of a branch; scan along to find the end of the
3158 bracketed group and go to there. */
3159
3160 case OP_ALT:
3161 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3162 break;
3163
3164 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3165 that it may occur zero times. It may repeat infinitely, or not at all -
3166 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3167 repeat limits are compiled as a number of copies, with the optional ones
3168 preceded by BRAZERO or BRAMINZERO. */
3169
3170 case OP_BRAZERO:
3171 {
3172 const uschar *next = ecode+1;
3173 if (match(eptr, next, offset_top, md, ims, FALSE, eptr)) return TRUE;
3174 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3175 ecode = next + 3;
3176 }
3177 break;
3178
3179 case OP_BRAMINZERO:
3180 {
3181 const uschar *next = ecode+1;
3182 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3183 if (match(eptr, next+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3184 ecode++;
3185 }
3186 break;
3187
3188 /* End of a group, repeated or non-repeating. If we are at the end of
3189 an assertion "group", stop matching and return TRUE, but record the
3190 current high water mark for use by positive assertions. Do this also
3191 for the "once" (not-backup up) groups. */
3192
3193 case OP_KET:
3194 case OP_KETRMIN:
3195 case OP_KETRMAX:
3196 {
3197 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3198
3199 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
3200 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
3201 *prev == OP_ONCE)
3202 {
3203 md->end_match_ptr = eptr; /* For ONCE */
3204 md->end_offset_top = offset_top;
3205 return TRUE;
3206 }
3207
3208 /* In all other cases except a conditional group we have to check the
3209 group number back at the start and if necessary complete handling an
3210 extraction by setting the offsets and bumping the high water mark. */
3211
3212 if (*prev != OP_COND)
3213 {
3214 int number = *prev - OP_BRA;
3215 int offset = number << 1;
3216
3217 DPRINTF(("end bracket %d\n", number));
3218
3219 if (number > 0)
3220 {
3221 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
3222 {
3223 md->offset_vector[offset] =
3224 md->offset_vector[md->offset_end - number];
3225 md->offset_vector[offset+1] = eptr - md->start_subject;
3226 if (offset_top <= offset) offset_top = offset + 2;
3227 }
3228 }
3229 }
3230
3231 /* Reset the value of the ims flags, in case they got changed during
3232 the group. */
3233
3234 ims = original_ims;
3235 DPRINTF(("ims reset to %02x\n", ims));
3236
3237 /* For a non-repeating ket, just continue at this level. This also
3238 happens for a repeating ket if no characters were matched in the group.
3239 This is the forcible breaking of infinite loops as implemented in Perl
3240 5.005. If there is an options reset, it will get obeyed in the normal
3241 course of events. */
3242
3243 if (*ecode == OP_KET || eptr == eptrb)
3244 {
3245 ecode += 3;
3246 break;
3247 }
3248
3249 /* The repeating kets try the rest of the pattern or restart from the
3250 preceding bracket, in the appropriate order. */
3251
3252 if (*ecode == OP_KETRMIN)
3253 {
3254 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||
3255 match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;
3256 }
3257 else /* OP_KETRMAX */
3258 {
3259 if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||
3260 match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3261 }
3262 }
3263 return FALSE;
3264
3265 /* Start of subject unless notbol, or after internal newline if multiline */
3266
3267 case OP_CIRC:
3268 if (md->notbol && eptr == md->start_subject) return FALSE;
3269 if ((ims & PCRE_MULTILINE) != 0)
3270 {
3271 if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;
3272 ecode++;
3273 break;
3274 }
3275 /* ... else fall through */
3276
3277 /* Start of subject assertion */
3278
3279 case OP_SOD:
3280 if (eptr != md->start_subject) return FALSE;
3281 ecode++;
3282 break;
3283
3284 /* Assert before internal newline if multiline, or before a terminating
3285 newline unless endonly is set, else end of subject unless noteol is set. */
3286
3287 case OP_DOLL:
3288 if ((ims & PCRE_MULTILINE) != 0)
3289 {
3290 if (eptr < md->end_subject) { if (*eptr != '\n') return FALSE; }
3291 else { if (md->noteol) return FALSE; }
3292 ecode++;
3293 break;
3294 }
3295 else
3296 {
3297 if (md->noteol) return FALSE;
3298 if (!md->endonly)
3299 {
3300 if (eptr < md->end_subject - 1 ||
3301 (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3302
3303 ecode++;
3304 break;
3305 }
3306 }
3307 /* ... else fall through */
3308
3309 /* End of subject assertion (\z) */
3310
3311 case OP_EOD:
3312 if (eptr < md->end_subject) return FALSE;
3313 ecode++;
3314 break;
3315
3316 /* End of subject or ending \n assertion (\Z) */
3317
3318 case OP_EODN:
3319 if (eptr < md->end_subject - 1 ||
3320 (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3321 ecode++;
3322 break;
3323
3324 /* Word boundary assertions */
3325
3326 case OP_NOT_WORD_BOUNDARY:
3327 case OP_WORD_BOUNDARY:
3328 {
3329 BOOL prev_is_word = (eptr != md->start_subject) &&
3330 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
3331 BOOL cur_is_word = (eptr < md->end_subject) &&
3332 ((md->ctypes[*eptr] & ctype_word) != 0);
3333 if ((*ecode++ == OP_WORD_BOUNDARY)?
3334 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3335 return FALSE;
3336 }
3337 break;
3338
3339 /* Match a single character type; inline for speed */
3340
3341 case OP_ANY:
3342 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == '\n')
3343 return FALSE;
3344 if (eptr++ >= md->end_subject) return FALSE;
3345 ecode++;
3346 break;
3347
3348 case OP_NOT_DIGIT:
3349 if (eptr >= md->end_subject ||
3350 (md->ctypes[*eptr++] & ctype_digit) != 0)
3351 return FALSE;
3352 ecode++;
3353 break;
3354
3355 case OP_DIGIT:
3356 if (eptr >= md->end_subject ||
3357 (md->ctypes[*eptr++] & ctype_digit) == 0)
3358 return FALSE;
3359 ecode++;
3360 break;
3361
3362 case OP_NOT_WHITESPACE:
3363 if (eptr >= md->end_subject ||
3364 (md->ctypes[*eptr++] & ctype_space) != 0)
3365 return FALSE;
3366 ecode++;
3367 break;
3368
3369 case OP_WHITESPACE:
3370 if (eptr >= md->end_subject ||
3371 (md->ctypes[*eptr++] & ctype_space) == 0)
3372 return FALSE;
3373 ecode++;
3374 break;
3375
3376 case OP_NOT_WORDCHAR:
3377 if (eptr >= md->end_subject ||
3378 (md->ctypes[*eptr++] & ctype_word) != 0)
3379 return FALSE;
3380 ecode++;
3381 break;
3382
3383 case OP_WORDCHAR:
3384 if (eptr >= md->end_subject ||
3385 (md->ctypes[*eptr++] & ctype_word) == 0)
3386 return FALSE;
3387 ecode++;
3388 break;
3389
3390 /* Match a back reference, possibly repeatedly. Look past the end of the
3391 item to see if there is repeat information following. The code is similar
3392 to that for character classes, but repeated for efficiency. Then obey
3393 similar code to character type repeats - written out again for speed.
3394 However, if the referenced string is the empty string, always treat
3395 it as matched, any number of times (otherwise there could be infinite
3396 loops). */
3397
3398 case OP_REF:
3399 {
3400 int length;
3401 int offset = ecode[1] << 1; /* Doubled reference number */
3402 ecode += 2; /* Advance past the item */
3403
3404 /* If the reference is unset, set the length to be longer than the amount
3405 of subject left; this ensures that every attempt at a match fails. We
3406 can't just fail here, because of the possibility of quantifiers with zero
3407 minima. */
3408
3409 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
3410 md->end_subject - eptr + 1 :
3411 md->offset_vector[offset+1] - md->offset_vector[offset];
3412
3413 /* Set up for repetition, or handle the non-repeated case */
3414
3415 switch (*ecode)
3416 {
3417 case OP_CRSTAR:
3418 case OP_CRMINSTAR:
3419 case OP_CRPLUS:
3420 case OP_CRMINPLUS:
3421 case OP_CRQUERY:
3422 case OP_CRMINQUERY:
3423 c = *ecode++ - OP_CRSTAR;
3424 minimize = (c & 1) != 0;
3425 min = rep_min[c]; /* Pick up values from tables; */
3426 max = rep_max[c]; /* zero for max => infinity */
3427 if (max == 0) max = INT_MAX;
3428 break;
3429
3430 case OP_CRRANGE:
3431 case OP_CRMINRANGE:
3432 minimize = (*ecode == OP_CRMINRANGE);
3433 min = (ecode[1] << 8) + ecode[2];
3434 max = (ecode[3] << 8) + ecode[4];
3435 if (max == 0) max = INT_MAX;
3436 ecode += 5;
3437 break;
3438
3439 default: /* No repeat follows */
3440 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
3441 eptr += length;
3442 continue; /* With the main loop */
3443 }
3444
3445 /* If the length of the reference is zero, just continue with the
3446 main loop. */
3447
3448 if (length == 0) continue;
3449
3450 /* First, ensure the minimum number of matches are present. We get back
3451 the length of the reference string explicitly rather than passing the
3452 address of eptr, so that eptr can be a register variable. */
3453
3454 for (i = 1; i <= min; i++)
3455 {
3456 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
3457 eptr += length;
3458 }
3459
3460 /* If min = max, continue at the same level without recursion.
3461 They are not both allowed to be zero. */
3462
3463 if (min == max) continue;
3464
3465 /* If minimizing, keep trying and advancing the pointer */
3466
3467 if (minimize)
3468 {
3469 for (i = min;; i++)
3470 {
3471 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3472 return TRUE;
3473 if (i >= max || !match_ref(offset, eptr, length, md, ims))
3474 return FALSE;
3475 eptr += length;
3476 }
3477 /* Control never gets here */
3478 }
3479
3480 /* If maximizing, find the longest string and work backwards */
3481
3482 else
3483 {
3484 const uschar *pp = eptr;
3485 for (i = min; i < max; i++)
3486 {
3487 if (!match_ref(offset, eptr, length, md, ims)) break;
3488 eptr += length;
3489 }
3490 while (eptr >= pp)
3491 {
3492 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3493 return TRUE;
3494 eptr -= length;
3495 }
3496 return FALSE;
3497 }
3498 }
3499 /* Control never gets here */
3500
3501
3502
3503 /* Match a character class, possibly repeatedly. Look past the end of the
3504 item to see if there is repeat information following. Then obey similar
3505 code to character type repeats - written out again for speed. */
3506
3507 case OP_CLASS:
3508 {
3509 const uschar *data = ecode + 1; /* Save for matching */
3510 ecode += 33; /* Advance past the item */
3511
3512 switch (*ecode)
3513 {
3514 case OP_CRSTAR:
3515 case OP_CRMINSTAR:
3516 case OP_CRPLUS:
3517 case OP_CRMINPLUS:
3518 case OP_CRQUERY:
3519 case OP_CRMINQUERY:
3520 c = *ecode++ - OP_CRSTAR;
3521 minimize = (c & 1) != 0;
3522 min = rep_min[c]; /* Pick up values from tables; */
3523 max = rep_max[c]; /* zero for max => infinity */
3524 if (max == 0) max = INT_MAX;
3525 break;
3526
3527 case OP_CRRANGE:
3528 case OP_CRMINRANGE:
3529 minimize = (*ecode == OP_CRMINRANGE);
3530 min = (ecode[1] << 8) + ecode[2];
3531 max = (ecode[3] << 8) + ecode[4];
3532 if (max == 0) max = INT_MAX;
3533 ecode += 5;
3534 break;
3535
3536 default: /* No repeat follows */
3537 min = max = 1;
3538 break;
3539 }
3540
3541 /* First, ensure the minimum number of matches are present. */
3542
3543 for (i = 1; i <= min; i++)
3544 {
3545 if (eptr >= md->end_subject) return FALSE;
3546 c = *eptr++;
3547 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3548 return FALSE;
3549 }
3550
3551 /* If max == min we can continue with the main loop without the
3552 need to recurse. */
3553
3554 if (min == max) continue;
3555
3556 /* If minimizing, keep testing the rest of the expression and advancing
3557 the pointer while it matches the class. */
3558
3559 if (minimize)
3560 {
3561 for (i = min;; i++)
3562 {
3563 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3564 return TRUE;
3565 if (i >= max || eptr >= md->end_subject) return FALSE;
3566 c = *eptr++;
3567 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3568 return FALSE;
3569 }
3570 /* Control never gets here */
3571 }
3572
3573 /* If maximizing, find the longest possible run, then work backwards. */
3574
3575 else
3576 {
3577 const uschar *pp = eptr;
3578 for (i = min; i < max; eptr++, i++)
3579 {
3580 if (eptr >= md->end_subject) break;
3581 c = *eptr;
3582 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3583 break;
3584 }
3585
3586 while (eptr >= pp)
3587 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3588 return TRUE;
3589 return FALSE;
3590 }
3591 }
3592 /* Control never gets here */
3593
3594 /* Match a run of characters */
3595
3596 case OP_CHARS:
3597 {
3598 register int length = ecode[1];
3599 ecode += 2;
3600
3601 #ifdef DEBUG /* Sigh. Some compilers never learn. */
3602 if (eptr >= md->end_subject)
3603 printf("matching subject <null> against pattern ");
3604 else
3605 {
3606 printf("matching subject ");
3607 pchars(eptr, length, TRUE, md);
3608 printf(" against pattern ");
3609 }
3610 pchars(ecode, length, FALSE, md);
3611 printf("\n");
3612 #endif
3613
3614 if (length > md->end_subject - eptr) return FALSE;
3615 if ((ims & PCRE_CASELESS) != 0)
3616 {
3617 while (length-- > 0)
3618 if (md->lcc[*ecode++] != md->lcc[*eptr++])
3619 return FALSE;
3620 }
3621 else
3622 {
3623 while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
3624 }
3625 }
3626 break;
3627
3628 /* Match a single character repeatedly; different opcodes share code. */
3629
3630 case OP_EXACT:
3631 min = max = (ecode[1] << 8) + ecode[2];
3632 ecode += 3;
3633 goto REPEATCHAR;
3634
3635 case OP_UPTO:
3636 case OP_MINUPTO:
3637 min = 0;
3638 max = (ecode[1] << 8) + ecode[2];
3639 minimize = *ecode == OP_MINUPTO;
3640 ecode += 3;
3641 goto REPEATCHAR;
3642
3643 case OP_STAR:
3644 case OP_MINSTAR:
3645 case OP_PLUS:
3646 case OP_MINPLUS:
3647 case OP_QUERY:
3648 case OP_MINQUERY:
3649 c = *ecode++ - OP_STAR;
3650 minimize = (c & 1) != 0;
3651 min = rep_min[c]; /* Pick up values from tables; */
3652 max = rep_max[c]; /* zero for max => infinity */
3653 if (max == 0) max = INT_MAX;
3654
3655 /* Common code for all repeated single-character matches. We can give
3656 up quickly if there are fewer than the minimum number of characters left in
3657 the subject. */
3658
3659 REPEATCHAR:
3660 if (min > md->end_subject - eptr) return FALSE;
3661 c = *ecode++;
3662
3663 /* The code is duplicated for the caseless and caseful cases, for speed,
3664 since matching characters is likely to be quite common. First, ensure the
3665 minimum number of matches are present. If min = max, continue at the same
3666 level without recursing. Otherwise, if minimizing, keep trying the rest of
3667 the expression and advancing one matching character if failing, up to the
3668 maximum. Alternatively, if maximizing, find the maximum number of
3669 characters and work backwards. */
3670
3671 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
3672 max, eptr));
3673
3674 if ((ims & PCRE_CASELESS) != 0)
3675 {
3676 c = md->lcc[c];
3677 for (i = 1; i <= min; i++)
3678 if (c != md->lcc[*eptr++]) return FALSE;
3679 if (min == max) continue;
3680 if (minimize)
3681 {
3682 for (i = min;; i++)
3683 {
3684 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3685 return TRUE;
3686 if (i >= max || eptr >= md->end_subject ||
3687 c != md->lcc[*eptr++])
3688 return FALSE;
3689 }
3690 /* Control never gets here */
3691 }
3692 else
3693 {
3694 const uschar *pp = eptr;
3695 for (i = min; i < max; i++)
3696 {
3697 if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
3698 eptr++;
3699 }
3700 while (eptr >= pp)
3701 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3702 return TRUE;
3703 return FALSE;
3704 }
3705 /* Control never gets here */
3706 }
3707
3708 /* Caseful comparisons */
3709
3710 else
3711 {
3712 for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
3713 if (min == max) continue;
3714 if (minimize)
3715 {
3716 for (i = min;; i++)
3717 {
3718 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3719 return TRUE;
3720 if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
3721 }
3722 /* Control never gets here */
3723 }
3724 else
3725 {
3726 const uschar *pp = eptr;
3727 for (i = min; i < max; i++)
3728 {
3729 if (eptr >= md->end_subject || c != *eptr) break;
3730 eptr++;
3731 }
3732 while (eptr >= pp)
3733 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3734 return TRUE;
3735 return FALSE;
3736 }
3737 }
3738 /* Control never gets here */
3739
3740 /* Match a negated single character */
3741
3742 case OP_NOT:
3743 if (eptr >= md->end_subject) return FALSE;
3744 ecode++;
3745 if ((ims & PCRE_CASELESS) != 0)
3746 {
3747 if (md->lcc[*ecode++] == md->lcc[*eptr++]) return FALSE;
3748 }
3749 else
3750 {
3751 if (*ecode++ == *eptr++) return FALSE;
3752 }
3753 break;
3754
3755 /* Match a negated single character repeatedly. This is almost a repeat of
3756 the code for a repeated single character, but I haven't found a nice way of
3757 commoning these up that doesn't require a test of the positive/negative
3758 option for each character match. Maybe that wouldn't add very much to the
3759 time taken, but character matching *is* what this is all about... */
3760
3761 case OP_NOTEXACT:
3762 min = max = (ecode[1] << 8) + ecode[2];
3763 ecode += 3;
3764 goto REPEATNOTCHAR;
3765
3766 case OP_NOTUPTO:
3767 case OP_NOTMINUPTO:
3768 min = 0;
3769 max = (ecode[1] << 8) + ecode[2];
3770 minimize = *ecode == OP_NOTMINUPTO;
3771 ecode += 3;
3772 goto REPEATNOTCHAR;
3773
3774 case OP_NOTSTAR:
3775 case OP_NOTMINSTAR:
3776 case OP_NOTPLUS:
3777 case OP_NOTMINPLUS:
3778 case OP_NOTQUERY:
3779 case OP_NOTMINQUERY:
3780 c = *ecode++ - OP_NOTSTAR;
3781 minimize = (c & 1) != 0;
3782 min = rep_min[c]; /* Pick up values from tables; */
3783 max = rep_max[c]; /* zero for max => infinity */
3784 if (max == 0) max = INT_MAX;
3785
3786 /* Common code for all repeated single-character matches. We can give
3787 up quickly if there are fewer than the minimum number of characters left in
3788 the subject. */
3789
3790 REPEATNOTCHAR:
3791 if (min > md->end_subject - eptr) return FALSE;
3792 c = *ecode++;
3793
3794 /* The code is duplicated for the caseless and caseful cases, for speed,
3795 since matching characters is likely to be quite common. First, ensure the
3796 minimum number of matches are present. If min = max, continue at the same
3797 level without recursing. Otherwise, if minimizing, keep trying the rest of
3798 the expression and advancing one matching character if failing, up to the
3799 maximum. Alternatively, if maximizing, find the maximum number of
3800 characters and work backwards. */
3801
3802 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
3803 max, eptr));
3804
3805 if ((ims & PCRE_CASELESS) != 0)
3806 {
3807 c = md->lcc[c];
3808 for (i = 1; i <= min; i++)
3809 if (c == md->lcc[*eptr++]) return FALSE;
3810 if (min == max) continue;
3811 if (minimize)
3812 {
3813 for (i = min;; i++)
3814 {
3815 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3816 return TRUE;
3817 if (i >= max || eptr >= md->end_subject ||
3818 c == md->lcc[*eptr++])
3819 return FALSE;
3820 }
3821 /* Control never gets here */
3822 }
3823 else
3824 {
3825 const uschar *pp = eptr;
3826 for (i = min; i < max; i++)
3827 {
3828 if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
3829 eptr++;
3830 }
3831 while (eptr >= pp)
3832 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3833 return TRUE;
3834 return FALSE;
3835 }
3836 /* Control never gets here */
3837 }
3838
3839 /* Caseful comparisons */
3840
3841 else
3842 {
3843 for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
3844 if (min == max) continue;
3845 if (minimize)
3846 {
3847 for (i = min;; i++)
3848 {
3849 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3850 return TRUE;
3851 if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
3852 }
3853 /* Control never gets here */
3854 }
3855 else
3856 {
3857 const uschar *pp = eptr;
3858 for (i = min; i < max; i++)
3859 {
3860 if (eptr >= md->end_subject || c == *eptr) break;
3861 eptr++;
3862 }
3863 while (eptr >= pp)
3864 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3865 return TRUE;
3866 return FALSE;
3867 }
3868 }
3869 /* Control never gets here */
3870
3871 /* Match a single character type repeatedly; several different opcodes
3872 share code. This is very similar to the code for single characters, but we
3873 repeat it in the interests of efficiency. */
3874
3875 case OP_TYPEEXACT:
3876 min = max = (ecode[1] << 8) + ecode[2];
3877 minimize = TRUE;
3878 ecode += 3;
3879 goto REPEATTYPE;
3880
3881 case OP_TYPEUPTO:
3882 case OP_TYPEMINUPTO:
3883 min = 0;
3884 max = (ecode[1] << 8) + ecode[2];
3885 minimize = *ecode == OP_TYPEMINUPTO;
3886 ecode += 3;
3887 goto REPEATTYPE;
3888
3889 case OP_TYPESTAR:
3890 case OP_TYPEMINSTAR:
3891 case OP_TYPEPLUS:
3892 case OP_TYPEMINPLUS:
3893 case OP_TYPEQUERY:
3894 case OP_TYPEMINQUERY:
3895 c = *ecode++ - OP_TYPESTAR;
3896 minimize = (c & 1) != 0;
3897 min = rep_min[c]; /* Pick up values from tables; */
3898 max = rep_max[c]; /* zero for max => infinity */
3899 if (max == 0) max = INT_MAX;
3900
3901 /* Common code for all repeated single character type matches */
3902
3903 REPEATTYPE:
3904 ctype = *ecode++; /* Code for the character type */
3905
3906 /* First, ensure the minimum number of matches are present. Use inline
3907 code for maximizing the speed, and do the type test once at the start
3908 (i.e. keep it out of the loop). Also test that there are at least the
3909 minimum number of characters before we start. */
3910
3911 if (min > md->end_subject - eptr) return FALSE;
3912 if (min > 0) switch(ctype)
3913 {
3914 case OP_ANY:
3915 if ((ims & PCRE_DOTALL) == 0)
3916 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }
3917 else eptr += min;
3918 break;
3919
3920 case OP_NOT_DIGIT:
3921 for (i = 1; i <= min; i++)
3922 if ((md->ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
3923 break;
3924
3925 case OP_DIGIT:
3926 for (i = 1; i <= min; i++)
3927 if ((md->ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
3928 break;
3929
3930 case OP_NOT_WHITESPACE:
3931 for (i = 1; i <= min; i++)
3932 if ((md->ctypes[*eptr++] & ctype_space) != 0) return FALSE;
3933 break;
3934
3935 case OP_WHITESPACE:
3936 for (i = 1; i <= min; i++)
3937 if ((md->ctypes[*eptr++] & ctype_space) == 0) return FALSE;
3938 break;
3939
3940 case OP_NOT_WORDCHAR:
3941 for (i = 1; i <= min; i++)
3942 if ((md->ctypes[*eptr++] & ctype_word) != 0)
3943 return FALSE;
3944 break;
3945
3946 case OP_WORDCHAR:
3947 for (i = 1; i <= min; i++)
3948 if ((md->ctypes[*eptr++] & ctype_word) == 0)
3949 return FALSE;
3950 break;
3951 }
3952
3953 /* If min = max, continue at the same level without recursing */
3954
3955 if (min == max) continue;
3956
3957 /* If minimizing, we have to test the rest of the pattern before each
3958 subsequent match. */
3959
3960 if (minimize)
3961 {
3962 for (i = min;; i++)
3963 {
3964 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb)) return TRUE;
3965 if (i >= max || eptr >= md->end_subject) return FALSE;
3966
3967 c = *eptr++;
3968 switch(ctype)
3969 {
3970 case OP_ANY:
3971 if ((ims & PCRE_DOTALL) == 0 && c == '\n') return FALSE;
3972 break;
3973
3974 case OP_NOT_DIGIT:
3975 if ((md->ctypes[c] & ctype_digit) != 0) return FALSE;
3976 break;
3977
3978 case OP_DIGIT:
3979 if ((md->ctypes[c] & ctype_digit) == 0) return FALSE;
3980 break;
3981
3982 case OP_NOT_WHITESPACE:
3983 if ((md->ctypes[c] & ctype_space) != 0) return FALSE;
3984 break;
3985
3986 case OP_WHITESPACE:
3987 if ((md->ctypes[c] & ctype_space) == 0) return FALSE;
3988 break;
3989
3990 case OP_NOT_WORDCHAR:
3991 if ((md->ctypes[c] & ctype_word) != 0) return FALSE;
3992 break;
3993
3994 case OP_WORDCHAR:
3995 if ((md->ctypes[c] & ctype_word) == 0) return FALSE;
3996 break;
3997 }
3998 }
3999 /* Control never gets here */
4000 }
4001
4002 /* If maximizing it is worth using inline code for speed, doing the type
4003 test once at the start (i.e. keep it out of the loop). */
4004
4005 else
4006 {
4007 const uschar *pp = eptr;
4008 switch(ctype)
4009 {
4010 case OP_ANY:
4011 if ((ims & PCRE_DOTALL) == 0)
4012 {
4013 for (i = min; i < max; i++)
4014 {
4015 if (eptr >= md->end_subject || *eptr == '\n') break;
4016 eptr++;
4017 }
4018 }
4019 else
4020 {
4021 c = max - min;
4022 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
4023 eptr += c;
4024 }
4025 break;
4026
4027 case OP_NOT_DIGIT:
4028 for (i = min; i < max; i++)
4029 {
4030 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4031 break;
4032 eptr++;
4033 }
4034 break;
4035
4036 case OP_DIGIT:
4037 for (i = min; i < max; i++)
4038 {
4039 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4040 break;
4041 eptr++;
4042 }
4043 break;
4044
4045 case OP_NOT_WHITESPACE:
4046 for (i = min; i < max; i++)
4047 {
4048 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4049 break;
4050 eptr++;
4051 }
4052 break;
4053
4054 case OP_WHITESPACE:
4055 for (i = min; i < max; i++)
4056 {
4057 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4058 break;
4059 eptr++;
4060 }
4061 break;
4062
4063 case OP_NOT_WORDCHAR:
4064 for (i = min; i < max; i++)
4065 {
4066 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4067 break;
4068 eptr++;
4069 }
4070 break;
4071
4072 case OP_WORDCHAR:
4073 for (i = min; i < max; i++)
4074 {
4075 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4076 break;
4077 eptr++;
4078 }
4079 break;
4080 }
4081
4082 while (eptr >= pp)
4083 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
4084 return TRUE;
4085 return FALSE;
4086 }
4087 /* Control never gets here */
4088
4089 /* There's been some horrible disaster. */
4090
4091 default:
4092 DPRINTF(("Unknown opcode %d\n", *ecode));
4093 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
4094 return FALSE;
4095 }
4096
4097 /* Do not stick any code in here without much thought; it is assumed
4098 that "continue" in the code above comes out to here to repeat the main
4099 loop. */
4100
4101 } /* End of main loop */
4102 /* Control never reaches here */
4103 }
4104
4105
4106
4107
4108 /*************************************************
4109 * Execute a Regular Expression *
4110 *************************************************/
4111
4112 /* This function applies a compiled re to a subject string and picks out
4113 portions of the string if it matches. Two elements in the vector are set for
4114 each substring: the offsets to the start and end of the substring.
4115
4116 Arguments:
4117 external_re points to the compiled expression
4118 external_extra points to "hints" from pcre_study() or is NULL
4119 subject points to the subject string
4120 length length of subject string (may contain binary zeros)
4121 start_offset where to start in the subject string
4122 options option bits
4123 offsets points to a vector of ints to be filled in with offsets
4124 offsetcount the number of elements in the vector
4125
4126 Returns: > 0 => success; value is the number of elements filled in
4127 = 0 => success, but offsets is not big enough
4128 -1 => failed to match
4129 < -1 => some kind of unexpected problem
4130 */
4131
4132 int
4133 pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
4134 const char *subject, int length, int start_offset, int options, int *offsets,
4135 int offsetcount)
4136 {
4137 int resetcount, ocount;
4138 int first_char = -1;
4139 int ims = 0;
4140 match_data match_block;
4141 const uschar *start_bits = NULL;
4142 const uschar *start_match = (const uschar *)subject + start_offset;
4143 const uschar *end_subject;
4144 const real_pcre *re = (const real_pcre *)external_re;
4145 const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
4146 BOOL using_temporary_offsets = FALSE;
4147 BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4148 BOOL startline = (re->options & PCRE_STARTLINE) != 0;
4149
4150 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4151
4152 if (re == NULL || subject == NULL ||
4153 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4154 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4155
4156 match_block.start_subject = (const uschar *)subject;
4157 match_block.end_subject = match_block.start_subject + length;
4158 end_subject = match_block.end_subject;
4159
4160 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4161
4162 match_block.notbol = (options & PCRE_NOTBOL) != 0;
4163 match_block.noteol = (options & PCRE_NOTEOL) != 0;
4164
4165 match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
4166
4167 match_block.lcc = re->tables + lcc_offset;
4168 match_block.ctypes = re->tables + ctypes_offset;
4169
4170 /* The ims options can vary during the matching as a result of the presence
4171 of (?ims) items in the pattern. They are kept in a local variable so that
4172 restoring at the exit of a group is easy. */
4173
4174 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4175
4176 /* If the expression has got more back references than the offsets supplied can
4177 hold, we get a temporary bit of working store to use during the matching.
4178 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4179 of 3. */
4180
4181 ocount = offsetcount - (offsetcount % 3);
4182
4183 if (re->top_backref > 0 && re->top_backref >= ocount/3)
4184 {
4185 ocount = re->top_backref * 3 + 3;
4186 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4187 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4188 using_temporary_offsets = TRUE;
4189 DPRINTF(("Got memory to hold back references\n"));
4190 }
4191 else match_block.offset_vector = offsets;
4192
4193 match_block.offset_end = ocount;
4194 match_block.offset_max = (2*ocount)/3;
4195 match_block.offset_overflow = FALSE;
4196
4197 /* Compute the minimum number of offsets that we need to reset each time. Doing
4198 this makes a huge difference to execution time when there aren't many brackets
4199 in the pattern. */
4200
4201 resetcount = 2 + re->top_bracket * 2;
4202 if (resetcount > offsetcount) resetcount = ocount;
4203
4204 /* Reset the working variable associated with each extraction. These should
4205 never be used unless previously set, but they get saved and restored, and so we
4206 initialize them to avoid reading uninitialized locations. */
4207
4208 if (match_block.offset_vector != NULL)
4209 {
4210 register int *iptr = match_block.offset_vector + ocount;
4211 register int *iend = iptr - resetcount/2 + 1;
4212 while (--iptr >= iend) *iptr = -1;
4213 }
4214
4215 /* Set up the first character to match, if available. The first_char value is
4216 never set for an anchored regular expression, but the anchoring may be forced
4217 at run time, so we have to test for anchoring. The first char may be unset for
4218 an unanchored pattern, of course. If there's no first char and the pattern was
4219 studied, there may be a bitmap of possible first characters. */
4220
4221 if (!anchored)
4222 {
4223 if ((re->options & PCRE_FIRSTSET) != 0)
4224 {
4225 first_char = re->first_char;
4226 if ((ims & PCRE_CASELESS) != 0) first_char = match_block.lcc[first_char];
4227 }
4228 else
4229 if (!startline && extra != NULL &&
4230 (extra->options & PCRE_STUDY_MAPPED) != 0)
4231 start_bits = extra->start_bits;
4232 }
4233
4234 /* Loop for unanchored matches; for anchored regexs the loop runs just once. */
4235
4236 do
4237 {
4238 int rc;
4239 register int *iptr = match_block.offset_vector;
4240 register int *iend = iptr + resetcount;
4241
4242 /* Reset the maximum number of extractions we might see. */
4243
4244 while (iptr < iend) *iptr++ = -1;
4245
4246 /* Advance to a unique first char if possible */
4247
4248 if (first_char >= 0)
4249 {
4250 if ((ims & PCRE_CASELESS) != 0)
4251 while (start_match < end_subject &&
4252 match_block.lcc[*start_match] != first_char)
4253 start_match++;
4254 else
4255 while (start_match < end_subject && *start_match != first_char)
4256 start_match++;
4257 }
4258
4259 /* Or to just after \n for a multiline match if possible */
4260
4261 else if (startline)
4262 {
4263 if (start_match > match_block.start_subject)
4264 {
4265 while (start_match < end_subject && start_match[-1] != '\n')
4266 start_match++;
4267 }
4268 }
4269
4270 /* Or to a non-unique first char */
4271
4272 else if (start_bits != NULL)
4273 {
4274 while (start_match < end_subject)
4275 {
4276 register int c = *start_match;
4277 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
4278 }
4279 }
4280
4281 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4282 printf(">>>> Match against: ");
4283 pchars(start_match, end_subject - start_match, TRUE, &match_block);
4284 printf("\n");
4285 #endif
4286
4287 /* When a match occurs, substrings will be set for all internal extractions;
4288 we just need to set up the whole thing as substring 0 before returning. If
4289 there were too many extractions, set the return code to zero. In the case
4290 where we had to get some local store to hold offsets for backreferences, copy
4291 those back references that we can. In this case there need not be overflow
4292 if certain parts of the pattern were not used. */
4293
4294 if (!match(start_match, re->code, 2, &match_block, ims, FALSE, start_match))
4295 continue;
4296
4297 /* Copy the offset information from temporary store if necessary */
4298
4299 if (using_temporary_offsets)
4300 {
4301 if (offsetcount >= 4)
4302 {
4303 memcpy(offsets + 2, match_block.offset_vector + 2,
4304 (offsetcount - 2) * sizeof(int));
4305 DPRINTF(("Copied offsets from temporary memory\n"));
4306 }
4307 if (match_block.end_offset_top > offsetcount)
4308 match_block.offset_overflow = TRUE;
4309
4310 DPRINTF(("Freeing temporary memory\n"));
4311 (pcre_free)(match_block.offset_vector);
4312 }
4313
4314 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
4315
4316 if (match_block.offset_end < 2) rc = 0; else
4317 {
4318 offsets[0] = start_match - match_block.start_subject;
4319 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
4320 }
4321
4322 DPRINTF((">>>> returning %d\n", rc));
4323 return rc;
4324 }
4325
4326 /* This "while" is the end of the "do" above */
4327
4328 while (!anchored &&
4329 match_block.errorcode == PCRE_ERROR_NOMATCH &&
4330 start_match++ < end_subject);
4331
4332 if (using_temporary_offsets)
4333 {
4334 DPRINTF(("Freeing temporary memory\n"));
4335 (pcre_free)(match_block.offset_vector);
4336 }
4337
4338 DPRINTF((">>>> returning %d\n", match_block.errorcode));
4339
4340 return match_block.errorcode;
4341 }
4342
4343 /* End of pcre.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12