/[pcre]/code/tags/pcre-2.05/pcre.c
ViewVC logotype

Contents of /code/tags/pcre-2.05/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 34 - (show annotations) (download)
Sat Feb 24 21:39:03 2007 UTC (7 years, 1 month ago) by nigel
File MIME type: text/plain
File size: 126310 byte(s)
Tag code/trunk as code/tags/pcre-2.05.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /*
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
9
10 Written by: Philip Hazel <ph10@cam.ac.uk>
11
12 Copyright (c) 1997-1999 University of Cambridge
13
14 -----------------------------------------------------------------------------
15 Permission is granted to anyone to use this software for any purpose on any
16 computer system, and to redistribute it freely, subject to the following
17 restrictions:
18
19 1. This software is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22
23 2. The origin of this software must not be misrepresented, either by
24 explicit claim or by omission.
25
26 3. Altered versions must be plainly marked as such, and must not be
27 misrepresented as being the original software.
28
29 4. If PCRE is embedded in any software that is released under the GNU
30 General Purpose Licence (GPL), then the terms of that licence shall
31 supersede any condition above with which it is incompatible.
32 -----------------------------------------------------------------------------
33 */
34
35
36 /* Define DEBUG to get debugging output on stdout. */
37
38 /* #define DEBUG */
39
40 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
41 inline, and there are *still* stupid compilers about that don't like indented
42 pre-processor statements. I suppose it's only been 10 years... */
43
44 #ifdef DEBUG
45 #define DPRINTF(p) printf p
46 #else
47 #define DPRINTF(p) /*nothing*/
48 #endif
49
50 /* Include the internals header, which itself includes Standard C headers plus
51 the external pcre header. */
52
53 #include "internal.h"
54
55
56 /* Allow compilation as C++ source code, should anybody want to do that. */
57
58 #ifdef __cplusplus
59 #define class pcre_class
60 #endif
61
62
63 /* Number of items on the nested bracket stacks at compile time. This should
64 not be set greater than 200. */
65
66 #define BRASTACK_SIZE 200
67
68
69 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
70
71 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
72 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
73
74 /* Text forms of OP_ values and things, for debugging (not all used) */
75
76 #ifdef DEBUG
77 static const char *OP_names[] = {
78 "End", "\\A", "\\B", "\\b", "\\D", "\\d",
79 "\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",
80 "Opt", "^", "$", "Any", "chars", "not",
81 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
82 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
83 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
84 "*", "*?", "+", "+?", "?", "??", "{", "{",
85 "class", "Ref",
86 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
87 "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
88 "Brazero", "Braminzero", "Bra"
89 };
90 #endif
91
92 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
93 are simple data values; negative values are for special things like \d and so
94 on. Zero means further processing is needed (for things like \x), or the escape
95 is invalid. */
96
97 static const short int escapes[] = {
98 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
99 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
100 '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
101 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
102 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
103 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
104 '`', 7, -ESC_b, 0, -ESC_d, 27, '\f', 0, /* ` - g */
105 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
106 0, 0, '\r', -ESC_s, '\t', 0, 0, -ESC_w, /* p - w */
107 0, 0, -ESC_z /* x - z */
108 };
109
110 /* Definition to allow mutual recursion */
111
112 static BOOL
113 compile_regex(int, int, int *, uschar **, const uschar **, const char **,
114 BOOL, int, compile_data *);
115
116
117
118 /*************************************************
119 * Global variables *
120 *************************************************/
121
122 /* PCRE is thread-clean and doesn't use any global variables in the normal
123 sense. However, it calls memory allocation and free functions via the two
124 indirections below, which are can be changed by the caller, but are shared
125 between all threads. */
126
127 void *(*pcre_malloc)(size_t) = malloc;
128 void (*pcre_free)(void *) = free;
129
130
131
132
133 /*************************************************
134 * Default character tables *
135 *************************************************/
136
137 /* A default set of character tables is included in the PCRE binary. Its source
138 is built by the maketables auxiliary program, which uses the default C ctypes
139 functions, and put in the file chartables.c. These tables are used by PCRE
140 whenever the caller of pcre_compile() does not provide an alternate set of
141 tables. */
142
143 #include "chartables.c"
144
145
146
147 /*************************************************
148 * Return version string *
149 *************************************************/
150
151 const char *
152 pcre_version(void)
153 {
154 return PCRE_VERSION;
155 }
156
157
158
159
160 /*************************************************
161 * Return info about a compiled pattern *
162 *************************************************/
163
164 /* This function picks potentially useful data out of the private
165 structure.
166
167 Arguments:
168 external_re points to compiled code
169 optptr where to pass back the options
170 first_char where to pass back the first character,
171 or -1 if multiline and all branches start ^,
172 or -2 otherwise
173
174 Returns: number of identifying extraction brackets
175 or negative values on error
176 */
177
178 int
179 pcre_info(const pcre *external_re, int *optptr, int *first_char)
180 {
181 const real_pcre *re = (const real_pcre *)external_re;
182 if (re == NULL) return PCRE_ERROR_NULL;
183 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
184 if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
185 if (first_char != NULL)
186 *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
187 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
188 return re->top_bracket;
189 }
190
191
192
193
194 #ifdef DEBUG
195 /*************************************************
196 * Debugging function to print chars *
197 *************************************************/
198
199 /* Print a sequence of chars in printable format, stopping at the end of the
200 subject if the requested.
201
202 Arguments:
203 p points to characters
204 length number to print
205 is_subject TRUE if printing from within md->start_subject
206 md pointer to matching data block, if is_subject is TRUE
207
208 Returns: nothing
209 */
210
211 static void
212 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
213 {
214 int c;
215 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
216 while (length-- > 0)
217 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
218 }
219 #endif
220
221
222
223
224 /*************************************************
225 * Handle escapes *
226 *************************************************/
227
228 /* This function is called when a \ has been encountered. It either returns a
229 positive value for a simple escape such as \n, or a negative value which
230 encodes one of the more complicated things such as \d. On entry, ptr is
231 pointing at the \. On exit, it is on the final character of the escape
232 sequence.
233
234 Arguments:
235 ptrptr points to the pattern position pointer
236 errorptr points to the pointer to the error message
237 bracount number of previous extracting brackets
238 options the options bits
239 isclass TRUE if inside a character class
240 cd pointer to char tables block
241
242 Returns: zero or positive => a data character
243 negative => a special escape sequence
244 on error, errorptr is set
245 */
246
247 static int
248 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
249 int options, BOOL isclass, compile_data *cd)
250 {
251 const uschar *ptr = *ptrptr;
252 int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
253 int i;
254
255 if (c == 0) *errorptr = ERR1;
256
257 /* Digits or letters may have special meaning; all others are literals. */
258
259 else if (c < '0' || c > 'z') {}
260
261 /* Do an initial lookup in a table. A non-zero result is something that can be
262 returned immediately. Otherwise further processing may be required. */
263
264 else if ((i = escapes[c - '0']) != 0) c = i;
265
266 /* Escapes that need further processing, or are illegal. */
267
268 else
269 {
270 const uschar *oldptr;
271 switch (c)
272 {
273 /* The handling of escape sequences consisting of a string of digits
274 starting with one that is not zero is not straightforward. By experiment,
275 the way Perl works seems to be as follows:
276
277 Outside a character class, the digits are read as a decimal number. If the
278 number is less than 10, or if there are that many previous extracting
279 left brackets, then it is a back reference. Otherwise, up to three octal
280 digits are read to form an escaped byte. Thus \123 is likely to be octal
281 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
282 value is greater than 377, the least significant 8 bits are taken. Inside a
283 character class, \ followed by a digit is always an octal number. */
284
285 case '1': case '2': case '3': case '4': case '5':
286 case '6': case '7': case '8': case '9':
287
288 if (!isclass)
289 {
290 oldptr = ptr;
291 c -= '0';
292 while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
293 c = c * 10 + *(++ptr) - '0';
294 if (c < 10 || c <= bracount)
295 {
296 c = -(ESC_REF + c);
297 break;
298 }
299 ptr = oldptr; /* Put the pointer back and fall through */
300 }
301
302 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
303 generates a binary zero byte and treats the digit as a following literal.
304 Thus we have to pull back the pointer by one. */
305
306 if ((c = *ptr) >= '8')
307 {
308 ptr--;
309 c = 0;
310 break;
311 }
312
313 /* \0 always starts an octal number, but we may drop through to here with a
314 larger first octal digit */
315
316 case '0':
317 c -= '0';
318 while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
319 ptr[1] != '8' && ptr[1] != '9')
320 c = c * 8 + *(++ptr) - '0';
321 break;
322
323 /* Special escapes not starting with a digit are straightforward */
324
325 case 'x':
326 c = 0;
327 while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
328 {
329 ptr++;
330 c = c * 16 + cd->lcc[*ptr] -
331 (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
332 }
333 break;
334
335 case 'c':
336 c = *(++ptr);
337 if (c == 0)
338 {
339 *errorptr = ERR2;
340 return 0;
341 }
342
343 /* A letter is upper-cased; then the 0x40 bit is flipped */
344
345 if (c >= 'a' && c <= 'z') c = cd->fcc[c];
346 c ^= 0x40;
347 break;
348
349 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
350 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
351 for Perl compatibility, it is a literal. This code looks a bit odd, but
352 there used to be some cases other than the default, and there may be again
353 in future, so I haven't "optimized" it. */
354
355 default:
356 if ((options & PCRE_EXTRA) != 0) switch(c)
357 {
358 default:
359 *errorptr = ERR3;
360 break;
361 }
362 break;
363 }
364 }
365
366 *ptrptr = ptr;
367 return c;
368 }
369
370
371
372 /*************************************************
373 * Check for counted repeat *
374 *************************************************/
375
376 /* This function is called when a '{' is encountered in a place where it might
377 start a quantifier. It looks ahead to see if it really is a quantifier or not.
378 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
379 where the ddds are digits.
380
381 Arguments:
382 p pointer to the first char after '{'
383 cd pointer to char tables block
384
385 Returns: TRUE or FALSE
386 */
387
388 static BOOL
389 is_counted_repeat(const uschar *p, compile_data *cd)
390 {
391 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
392 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
393 if (*p == '}') return TRUE;
394
395 if (*p++ != ',') return FALSE;
396 if (*p == '}') return TRUE;
397
398 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
399 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
400 return (*p == '}');
401 }
402
403
404
405 /*************************************************
406 * Read repeat counts *
407 *************************************************/
408
409 /* Read an item of the form {n,m} and return the values. This is called only
410 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
411 so the syntax is guaranteed to be correct, but we need to check the values.
412
413 Arguments:
414 p pointer to first char after '{'
415 minp pointer to int for min
416 maxp pointer to int for max
417 returned as -1 if no max
418 errorptr points to pointer to error message
419 cd pointer to character tables clock
420
421 Returns: pointer to '}' on success;
422 current ptr on error, with errorptr set
423 */
424
425 static const uschar *
426 read_repeat_counts(const uschar *p, int *minp, int *maxp,
427 const char **errorptr, compile_data *cd)
428 {
429 int min = 0;
430 int max = -1;
431
432 while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
433
434 if (*p == '}') max = min; else
435 {
436 if (*(++p) != '}')
437 {
438 max = 0;
439 while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
440 if (max < min)
441 {
442 *errorptr = ERR4;
443 return p;
444 }
445 }
446 }
447
448 /* Do paranoid checks, then fill in the required variables, and pass back the
449 pointer to the terminating '}'. */
450
451 if (min > 65535 || max > 65535)
452 *errorptr = ERR5;
453 else
454 {
455 *minp = min;
456 *maxp = max;
457 }
458 return p;
459 }
460
461
462
463 /*************************************************
464 * Find the fixed length of a pattern *
465 *************************************************/
466
467 /* Scan a pattern and compute the fixed length of subject that will match it,
468 if the length is fixed. This is needed for dealing with backward assertions.
469
470 Arguments:
471 code points to the start of the pattern (the bracket)
472
473 Returns: the fixed length, or -1 if there is no fixed length
474 */
475
476 static int
477 find_fixedlength(uschar *code)
478 {
479 int length = -1;
480
481 register int branchlength = 0;
482 register uschar *cc = code + 3;
483
484 /* Scan along the opcodes for this branch. If we get to the end of the
485 branch, check the length against that of the other branches. */
486
487 for (;;)
488 {
489 int d;
490 register int op = *cc;
491 if (op >= OP_BRA) op = OP_BRA;
492
493 switch (op)
494 {
495 case OP_BRA:
496 case OP_ONCE:
497 case OP_COND:
498 d = find_fixedlength(cc);
499 if (d < 0) return -1;
500 branchlength += d;
501 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
502 cc += 3;
503 break;
504
505 /* Reached end of a branch; if it's a ket it is the end of a nested
506 call. If it's ALT it is an alternation in a nested call. If it is
507 END it's the end of the outer call. All can be handled by the same code. */
508
509 case OP_ALT:
510 case OP_KET:
511 case OP_KETRMAX:
512 case OP_KETRMIN:
513 case OP_END:
514 if (length < 0) length = branchlength;
515 else if (length != branchlength) return -1;
516 if (*cc != OP_ALT) return length;
517 cc += 3;
518 branchlength = 0;
519 break;
520
521 /* Skip over assertive subpatterns */
522
523 case OP_ASSERT:
524 case OP_ASSERT_NOT:
525 case OP_ASSERTBACK:
526 case OP_ASSERTBACK_NOT:
527 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
528 cc += 3;
529 break;
530
531 /* Skip over things that don't match chars */
532
533 case OP_REVERSE:
534 cc++;
535
536 case OP_CREF:
537 case OP_OPT:
538 cc++;
539 /* Fall through */
540
541 case OP_SOD:
542 case OP_EOD:
543 case OP_EODN:
544 case OP_CIRC:
545 case OP_DOLL:
546 case OP_NOT_WORD_BOUNDARY:
547 case OP_WORD_BOUNDARY:
548 cc++;
549 break;
550
551 /* Handle char strings */
552
553 case OP_CHARS:
554 branchlength += *(++cc);
555 cc += *cc + 1;
556 break;
557
558 /* Handle exact repetitions */
559
560 case OP_EXACT:
561 case OP_TYPEEXACT:
562 branchlength += (cc[1] << 8) + cc[2];
563 cc += 4;
564 break;
565
566 /* Handle single-char matchers */
567
568 case OP_NOT_DIGIT:
569 case OP_DIGIT:
570 case OP_NOT_WHITESPACE:
571 case OP_WHITESPACE:
572 case OP_NOT_WORDCHAR:
573 case OP_WORDCHAR:
574 case OP_ANY:
575 branchlength++;
576 cc++;
577 break;
578
579
580 /* Check a class for variable quantification */
581
582 case OP_CLASS:
583 cc += (*cc == OP_REF)? 2 : 33;
584
585 switch (*cc)
586 {
587 case OP_CRSTAR:
588 case OP_CRMINSTAR:
589 case OP_CRQUERY:
590 case OP_CRMINQUERY:
591 return -1;
592
593 case OP_CRRANGE:
594 case OP_CRMINRANGE:
595 if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;
596 branchlength += (cc[1] << 8) + cc[2];
597 cc += 5;
598 break;
599
600 default:
601 branchlength++;
602 }
603 break;
604
605 /* Anything else is variable length */
606
607 default:
608 return -1;
609 }
610 }
611 /* Control never gets here */
612 }
613
614
615
616
617 /*************************************************
618 * Compile one branch *
619 *************************************************/
620
621 /* Scan the pattern, compiling it into the code vector.
622
623 Arguments:
624 options the option bits
625 brackets points to number of brackets used
626 code points to the pointer to the current code point
627 ptrptr points to the current pattern pointer
628 errorptr points to pointer to error message
629 optchanged set to the value of the last OP_OPT item compiled
630 cd contains pointers to tables
631
632 Returns: TRUE on success
633 FALSE, with *errorptr set on error
634 */
635
636 static BOOL
637 compile_branch(int options, int *brackets, uschar **codeptr,
638 const uschar **ptrptr, const char **errorptr, int *optchanged,
639 compile_data *cd)
640 {
641 int repeat_type, op_type;
642 int repeat_min, repeat_max;
643 int bravalue, length;
644 int greedy_default, greedy_non_default;
645 register int c;
646 register uschar *code = *codeptr;
647 uschar *tempcode;
648 const uschar *ptr = *ptrptr;
649 const uschar *tempptr;
650 uschar *previous = NULL;
651 uschar class[32];
652
653 /* Set up the default and non-default settings for greediness */
654
655 greedy_default = ((options & PCRE_UNGREEDY) != 0);
656 greedy_non_default = greedy_default ^ 1;
657
658 /* Switch on next character until the end of the branch */
659
660 for (;; ptr++)
661 {
662 BOOL negate_class;
663 int class_charcount;
664 int class_lastchar;
665 int newoptions;
666 int condref;
667
668 c = *ptr;
669 if ((options & PCRE_EXTENDED) != 0)
670 {
671 if ((cd->ctypes[c] & ctype_space) != 0) continue;
672 if (c == '#')
673 {
674 while ((c = *(++ptr)) != 0 && c != '\n');
675 continue;
676 }
677 }
678
679 switch(c)
680 {
681 /* The branch terminates at end of string, |, or ). */
682
683 case 0:
684 case '|':
685 case ')':
686 *codeptr = code;
687 *ptrptr = ptr;
688 return TRUE;
689
690 /* Handle single-character metacharacters */
691
692 case '^':
693 previous = NULL;
694 *code++ = OP_CIRC;
695 break;
696
697 case '$':
698 previous = NULL;
699 *code++ = OP_DOLL;
700 break;
701
702 case '.':
703 previous = code;
704 *code++ = OP_ANY;
705 break;
706
707 /* Character classes. These always build a 32-byte bitmap of the permitted
708 characters, except in the special case where there is only one character.
709 For negated classes, we build the map as usual, then invert it at the end.
710 */
711
712 case '[':
713 previous = code;
714 *code++ = OP_CLASS;
715
716 /* If the first character is '^', set the negation flag and skip it. */
717
718 if ((c = *(++ptr)) == '^')
719 {
720 negate_class = TRUE;
721 c = *(++ptr);
722 }
723 else negate_class = FALSE;
724
725 /* Keep a count of chars so that we can optimize the case of just a single
726 character. */
727
728 class_charcount = 0;
729 class_lastchar = -1;
730
731 /* Initialize the 32-char bit map to all zeros. We have to build the
732 map in a temporary bit of store, in case the class contains only 1
733 character, because in that case the compiled code doesn't use the
734 bit map. */
735
736 memset(class, 0, 32 * sizeof(uschar));
737
738 /* Process characters until ] is reached. By writing this as a "do" it
739 means that an initial ] is taken as a data character. */
740
741 do
742 {
743 if (c == 0)
744 {
745 *errorptr = ERR6;
746 goto FAILED;
747 }
748
749 /* Backslash may introduce a single character, or it may introduce one
750 of the specials, which just set a flag. Escaped items are checked for
751 validity in the pre-compiling pass. The sequence \b is a special case.
752 Inside a class (and only there) it is treated as backspace. Elsewhere
753 it marks a word boundary. Other escapes have preset maps ready to
754 or into the one we are building. We assume they have more than one
755 character in them, so set class_count bigger than one. */
756
757 if (c == '\\')
758 {
759 c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
760 if (-c == ESC_b) c = '\b';
761 else if (c < 0)
762 {
763 register const uschar *cbits = cd->cbits;
764 class_charcount = 10;
765 switch (-c)
766 {
767 case ESC_d:
768 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
769 continue;
770
771 case ESC_D:
772 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
773 continue;
774
775 case ESC_w:
776 for (c = 0; c < 32; c++)
777 class[c] |= (cbits[c+cbit_digit] | cbits[c+cbit_word]);
778 continue;
779
780 case ESC_W:
781 for (c = 0; c < 32; c++)
782 class[c] |= ~(cbits[c+cbit_digit] | cbits[c+cbit_word]);
783 continue;
784
785 case ESC_s:
786 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
787 continue;
788
789 case ESC_S:
790 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
791 continue;
792
793 default:
794 *errorptr = ERR7;
795 goto FAILED;
796 }
797 }
798 /* Fall through if single character */
799 }
800
801 /* A single character may be followed by '-' to form a range. However,
802 Perl does not permit ']' to be the end of the range. A '-' character
803 here is treated as a literal. */
804
805 if (ptr[1] == '-' && ptr[2] != ']')
806 {
807 int d;
808 ptr += 2;
809 d = *ptr;
810
811 if (d == 0)
812 {
813 *errorptr = ERR6;
814 goto FAILED;
815 }
816
817 /* The second part of a range can be a single-character escape, but
818 not any of the other escapes. */
819
820 if (d == '\\')
821 {
822 d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
823 if (d < 0)
824 {
825 if (d == -ESC_b) d = '\b'; else
826 {
827 *errorptr = ERR7;
828 goto FAILED;
829 }
830 }
831 }
832
833 if (d < c)
834 {
835 *errorptr = ERR8;
836 goto FAILED;
837 }
838
839 for (; c <= d; c++)
840 {
841 class[c/8] |= (1 << (c&7));
842 if ((options & PCRE_CASELESS) != 0)
843 {
844 int uc = cd->fcc[c]; /* flip case */
845 class[uc/8] |= (1 << (uc&7));
846 }
847 class_charcount++; /* in case a one-char range */
848 class_lastchar = c;
849 }
850 continue; /* Go get the next char in the class */
851 }
852
853 /* Handle a lone single character - we can get here for a normal
854 non-escape char, or after \ that introduces a single character. */
855
856 class [c/8] |= (1 << (c&7));
857 if ((options & PCRE_CASELESS) != 0)
858 {
859 c = cd->fcc[c]; /* flip case */
860 class[c/8] |= (1 << (c&7));
861 }
862 class_charcount++;
863 class_lastchar = c;
864 }
865
866 /* Loop until ']' reached; the check for end of string happens inside the
867 loop. This "while" is the end of the "do" above. */
868
869 while ((c = *(++ptr)) != ']');
870
871 /* If class_charcount is 1 and class_lastchar is not negative, we saw
872 precisely one character. This doesn't need the whole 32-byte bit map.
873 We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
874 it's negative. */
875
876 if (class_charcount == 1 && class_lastchar >= 0)
877 {
878 if (negate_class)
879 {
880 code[-1] = OP_NOT;
881 }
882 else
883 {
884 code[-1] = OP_CHARS;
885 *code++ = 1;
886 }
887 *code++ = class_lastchar;
888 }
889
890 /* Otherwise, negate the 32-byte map if necessary, and copy it into
891 the code vector. */
892
893 else
894 {
895 if (negate_class)
896 for (c = 0; c < 32; c++) code[c] = ~class[c];
897 else
898 memcpy(code, class, 32);
899 code += 32;
900 }
901 break;
902
903 /* Various kinds of repeat */
904
905 case '{':
906 if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
907 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
908 if (*errorptr != NULL) goto FAILED;
909 goto REPEAT;
910
911 case '*':
912 repeat_min = 0;
913 repeat_max = -1;
914 goto REPEAT;
915
916 case '+':
917 repeat_min = 1;
918 repeat_max = -1;
919 goto REPEAT;
920
921 case '?':
922 repeat_min = 0;
923 repeat_max = 1;
924
925 REPEAT:
926 if (previous == NULL)
927 {
928 *errorptr = ERR9;
929 goto FAILED;
930 }
931
932 /* If the next character is '?' this is a minimizing repeat, by default,
933 but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
934 next character. */
935
936 if (ptr[1] == '?')
937 { repeat_type = greedy_non_default; ptr++; }
938 else repeat_type = greedy_default;
939
940 /* If the maximum is zero then the minimum must also be zero; Perl allows
941 this case, so we do too - by simply omitting the item altogether. */
942
943 if (repeat_max == 0) code = previous;
944
945 /* If previous was a string of characters, chop off the last one and use it
946 as the subject of the repeat. If there was only one character, we can
947 abolish the previous item altogether. */
948
949 else if (*previous == OP_CHARS)
950 {
951 int len = previous[1];
952 if (len == 1)
953 {
954 c = previous[2];
955 code = previous;
956 }
957 else
958 {
959 c = previous[len+1];
960 previous[1]--;
961 code--;
962 }
963 op_type = 0; /* Use single-char op codes */
964 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
965 }
966
967 /* If previous was a single negated character ([^a] or similar), we use
968 one of the special opcodes, replacing it. The code is shared with single-
969 character repeats by adding a suitable offset into repeat_type. */
970
971 else if ((int)*previous == OP_NOT)
972 {
973 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
974 c = previous[1];
975 code = previous;
976 goto OUTPUT_SINGLE_REPEAT;
977 }
978
979 /* If previous was a character type match (\d or similar), abolish it and
980 create a suitable repeat item. The code is shared with single-character
981 repeats by adding a suitable offset into repeat_type. */
982
983 else if ((int)*previous < OP_EODN || *previous == OP_ANY)
984 {
985 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
986 c = *previous;
987 code = previous;
988
989 OUTPUT_SINGLE_REPEAT:
990 repeat_type += op_type; /* Combine both values for many cases */
991
992 /* A minimum of zero is handled either as the special case * or ?, or as
993 an UPTO, with the maximum given. */
994
995 if (repeat_min == 0)
996 {
997 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
998 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
999 else
1000 {
1001 *code++ = OP_UPTO + repeat_type;
1002 *code++ = repeat_max >> 8;
1003 *code++ = (repeat_max & 255);
1004 }
1005 }
1006
1007 /* The case {1,} is handled as the special case + */
1008
1009 else if (repeat_min == 1 && repeat_max == -1)
1010 *code++ = OP_PLUS + repeat_type;
1011
1012 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1013 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1014
1015 else
1016 {
1017 if (repeat_min != 1)
1018 {
1019 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1020 *code++ = repeat_min >> 8;
1021 *code++ = (repeat_min & 255);
1022 }
1023
1024 /* If the mininum is 1 and the previous item was a character string,
1025 we either have to put back the item that got cancelled if the string
1026 length was 1, or add the character back onto the end of a longer
1027 string. For a character type nothing need be done; it will just get
1028 put back naturally. Note that the final character is always going to
1029 get added below. */
1030
1031 else if (*previous == OP_CHARS)
1032 {
1033 if (code == previous) code += 2; else previous[1]++;
1034 }
1035
1036 /* For a single negated character we also have to put back the
1037 item that got cancelled. */
1038
1039 else if (*previous == OP_NOT) code++;
1040
1041 /* If the maximum is unlimited, insert an OP_STAR. */
1042
1043 if (repeat_max < 0)
1044 {
1045 *code++ = c;
1046 *code++ = OP_STAR + repeat_type;
1047 }
1048
1049 /* Else insert an UPTO if the max is greater than the min. */
1050
1051 else if (repeat_max != repeat_min)
1052 {
1053 *code++ = c;
1054 repeat_max -= repeat_min;
1055 *code++ = OP_UPTO + repeat_type;
1056 *code++ = repeat_max >> 8;
1057 *code++ = (repeat_max & 255);
1058 }
1059 }
1060
1061 /* The character or character type itself comes last in all cases. */
1062
1063 *code++ = c;
1064 }
1065
1066 /* If previous was a character class or a back reference, we put the repeat
1067 stuff after it. */
1068
1069 else if (*previous == OP_CLASS || *previous == OP_REF)
1070 {
1071 if (repeat_min == 0 && repeat_max == -1)
1072 *code++ = OP_CRSTAR + repeat_type;
1073 else if (repeat_min == 1 && repeat_max == -1)
1074 *code++ = OP_CRPLUS + repeat_type;
1075 else if (repeat_min == 0 && repeat_max == 1)
1076 *code++ = OP_CRQUERY + repeat_type;
1077 else
1078 {
1079 *code++ = OP_CRRANGE + repeat_type;
1080 *code++ = repeat_min >> 8;
1081 *code++ = repeat_min & 255;
1082 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1083 *code++ = repeat_max >> 8;
1084 *code++ = repeat_max & 255;
1085 }
1086 }
1087
1088 /* If previous was a bracket group, we may have to replicate it in certain
1089 cases. */
1090
1091 else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
1092 (int)*previous == OP_COND)
1093 {
1094 register int i;
1095 int ketoffset = 0;
1096 int len = code - previous;
1097 uschar *bralink = NULL;
1098
1099 /* If the maximum repeat count is unlimited, find the end of the bracket
1100 by scanning through from the start, and compute the offset back to it
1101 from the current code pointer. There may be an OP_OPT setting following
1102 the final KET, so we can't find the end just by going back from the code
1103 pointer. */
1104
1105 if (repeat_max == -1)
1106 {
1107 register uschar *ket = previous;
1108 do ket += (ket[1] << 8) + ket[2]; while (*ket != OP_KET);
1109 ketoffset = code - ket;
1110 }
1111
1112 /* The case of a zero minimum is special because of the need to stick
1113 OP_BRAZERO in front of it, and because the group appears once in the
1114 data, whereas in other cases it appears the minimum number of times. For
1115 this reason, it is simplest to treat this case separately, as otherwise
1116 the code gets far too mess. There are several special subcases when the
1117 minimum is zero. */
1118
1119 if (repeat_min == 0)
1120 {
1121 /* If the maximum is also zero, we just omit the group from the output
1122 altogether. */
1123
1124 if (repeat_max == 0)
1125 {
1126 code = previous;
1127 previous = NULL;
1128 break;
1129 }
1130
1131 /* If the maximum is 1 or unlimited, we just have to stick in the
1132 BRAZERO and do no more at this point. */
1133
1134 if (repeat_max <= 1)
1135 {
1136 memmove(previous+1, previous, len);
1137 code++;
1138 *previous++ = OP_BRAZERO + repeat_type;
1139 }
1140
1141 /* If the maximum is greater than 1 and limited, we have to replicate
1142 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1143 The first one has to be handled carefully because it's the original
1144 copy, which has to be moved up. The remainder can be handled by code
1145 that is common with the non-zero minimum case below. We just have to
1146 adjust the value or repeat_max, since one less copy is required. */
1147
1148 else
1149 {
1150 int offset;
1151 memmove(previous+4, previous, len);
1152 code += 4;
1153 *previous++ = OP_BRAZERO + repeat_type;
1154 *previous++ = OP_BRA;
1155
1156 /* We chain together the bracket offset fields that have to be
1157 filled in later when the ends of the brackets are reached. */
1158
1159 offset = (bralink == NULL)? 0 : previous - bralink;
1160 bralink = previous;
1161 *previous++ = offset >> 8;
1162 *previous++ = offset & 255;
1163 }
1164
1165 repeat_max--;
1166 }
1167
1168 /* If the minimum is greater than zero, replicate the group as many
1169 times as necessary, and adjust the maximum to the number of subsequent
1170 copies that we need. */
1171
1172 else
1173 {
1174 for (i = 1; i < repeat_min; i++)
1175 {
1176 memcpy(code, previous, len);
1177 code += len;
1178 }
1179 if (repeat_max > 0) repeat_max -= repeat_min;
1180 }
1181
1182 /* This code is common to both the zero and non-zero minimum cases. If
1183 the maximum is limited, it replicates the group in a nested fashion,
1184 remembering the bracket starts on a stack. In the case of a zero minimum,
1185 the first one was set up above. In all cases the repeat_max now specifies
1186 the number of additional copies needed. */
1187
1188 if (repeat_max >= 0)
1189 {
1190 for (i = repeat_max - 1; i >= 0; i--)
1191 {
1192 *code++ = OP_BRAZERO + repeat_type;
1193
1194 /* All but the final copy start a new nesting, maintaining the
1195 chain of brackets outstanding. */
1196
1197 if (i != 0)
1198 {
1199 int offset;
1200 *code++ = OP_BRA;
1201 offset = (bralink == NULL)? 0 : code - bralink;
1202 bralink = code;
1203 *code++ = offset >> 8;
1204 *code++ = offset & 255;
1205 }
1206
1207 memcpy(code, previous, len);
1208 code += len;
1209 }
1210
1211 /* Now chain through the pending brackets, and fill in their length
1212 fields (which are holding the chain links pro tem). */
1213
1214 while (bralink != NULL)
1215 {
1216 int oldlinkoffset;
1217 int offset = code - bralink + 1;
1218 uschar *bra = code - offset;
1219 oldlinkoffset = (bra[1] << 8) + bra[2];
1220 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
1221 *code++ = OP_KET;
1222 *code++ = bra[1] = offset >> 8;
1223 *code++ = bra[2] = (offset & 255);
1224 }
1225 }
1226
1227 /* If the maximum is unlimited, set a repeater in the final copy. We
1228 can't just offset backwards from the current code point, because we
1229 don't know if there's been an options resetting after the ket. The
1230 correct offset was computed above. */
1231
1232 else code[-ketoffset] = OP_KETRMAX + repeat_type;
1233
1234
1235 #ifdef NEVER
1236 /* If the minimum is greater than zero, and the maximum is unlimited or
1237 equal to the minimum, the first copy remains where it is, and is
1238 replicated up to the minimum number of times. This case includes the +
1239 repeat, but of course no replication is needed in that case. */
1240
1241 if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1242 {
1243 for (i = 1; i < repeat_min; i++)
1244 {
1245 memcpy(code, previous, len);
1246 code += len;
1247 }
1248 }
1249
1250 /* If the minimum is zero, stick BRAZERO in front of the first copy.
1251 Then, if there is a fixed upper limit, replicated up to that many times,
1252 sticking BRAZERO in front of all the optional ones. */
1253
1254 else
1255 {
1256 if (repeat_min == 0)
1257 {
1258 memmove(previous+1, previous, len);
1259 code++;
1260 *previous++ = OP_BRAZERO + repeat_type;
1261 }
1262
1263 for (i = 1; i < repeat_min; i++)
1264 {
1265 memcpy(code, previous, len);
1266 code += len;
1267 }
1268
1269 for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1270 {
1271 *code++ = OP_BRAZERO + repeat_type;
1272 memcpy(code, previous, len);
1273 code += len;
1274 }
1275 }
1276
1277 /* If the maximum is unlimited, set a repeater in the final copy. We
1278 can't just offset backwards from the current code point, because we
1279 don't know if there's been an options resetting after the ket. The
1280 correct offset was computed above. */
1281
1282 if (repeat_max == -1) code[-ketoffset] = OP_KETRMAX + repeat_type;
1283 #endif
1284
1285
1286 }
1287
1288 /* Else there's some kind of shambles */
1289
1290 else
1291 {
1292 *errorptr = ERR11;
1293 goto FAILED;
1294 }
1295
1296 /* In all case we no longer have a previous item. */
1297
1298 previous = NULL;
1299 break;
1300
1301
1302 /* Start of nested bracket sub-expression, or comment or lookahead or
1303 lookbehind or option setting or condition. First deal with special things
1304 that can come after a bracket; all are introduced by ?, and the appearance
1305 of any of them means that this is not a referencing group. They were
1306 checked for validity in the first pass over the string, so we don't have to
1307 check for syntax errors here. */
1308
1309 case '(':
1310 newoptions = options;
1311 condref = -1;
1312
1313 if (*(++ptr) == '?')
1314 {
1315 int set, unset;
1316 int *optset;
1317
1318 switch (*(++ptr))
1319 {
1320 case '#': /* Comment; skip to ket */
1321 ptr++;
1322 while (*ptr != ')') ptr++;
1323 continue;
1324
1325 case ':': /* Non-extracting bracket */
1326 bravalue = OP_BRA;
1327 ptr++;
1328 break;
1329
1330 case '(':
1331 bravalue = OP_COND; /* Conditional group */
1332 if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
1333 {
1334 condref = *ptr - '0';
1335 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
1336 ptr++;
1337 }
1338 else ptr--;
1339 break;
1340
1341 case '=': /* Positive lookahead */
1342 bravalue = OP_ASSERT;
1343 ptr++;
1344 break;
1345
1346 case '!': /* Negative lookahead */
1347 bravalue = OP_ASSERT_NOT;
1348 ptr++;
1349 break;
1350
1351 case '<': /* Lookbehinds */
1352 switch (*(++ptr))
1353 {
1354 case '=': /* Positive lookbehind */
1355 bravalue = OP_ASSERTBACK;
1356 ptr++;
1357 break;
1358
1359 case '!': /* Negative lookbehind */
1360 bravalue = OP_ASSERTBACK_NOT;
1361 ptr++;
1362 break;
1363
1364 default: /* Syntax error */
1365 *errorptr = ERR24;
1366 goto FAILED;
1367 }
1368 break;
1369
1370 case '>': /* One-time brackets */
1371 bravalue = OP_ONCE;
1372 ptr++;
1373 break;
1374
1375 default: /* Option setting */
1376 set = unset = 0;
1377 optset = &set;
1378
1379 while (*ptr != ')' && *ptr != ':')
1380 {
1381 switch (*ptr++)
1382 {
1383 case '-': optset = &unset; break;
1384
1385 case 'i': *optset |= PCRE_CASELESS; break;
1386 case 'm': *optset |= PCRE_MULTILINE; break;
1387 case 's': *optset |= PCRE_DOTALL; break;
1388 case 'x': *optset |= PCRE_EXTENDED; break;
1389 case 'U': *optset |= PCRE_UNGREEDY; break;
1390 case 'X': *optset |= PCRE_EXTRA; break;
1391
1392 default:
1393 *errorptr = ERR12;
1394 goto FAILED;
1395 }
1396 }
1397
1398 /* Set up the changed option bits, but don't change anything yet. */
1399
1400 newoptions = (options | set) & (~unset);
1401
1402 /* If the options ended with ')' this is not the start of a nested
1403 group with option changes, so the options change at this level. At top
1404 level there is nothing else to be done (the options will in fact have
1405 been set from the start of compiling as a result of the first pass) but
1406 at an inner level we must compile code to change the ims options if
1407 necessary, and pass the new setting back so that it can be put at the
1408 start of any following branches, and when this group ends, a resetting
1409 item can be compiled. */
1410
1411 if (*ptr == ')')
1412 {
1413 if ((options & PCRE_INGROUP) != 0 &&
1414 (options & PCRE_IMS) != (newoptions & PCRE_IMS))
1415 {
1416 *code++ = OP_OPT;
1417 *code++ = *optchanged = newoptions & PCRE_IMS;
1418 }
1419 options = newoptions; /* Change options at this level */
1420 previous = NULL; /* This item can't be repeated */
1421 continue; /* It is complete */
1422 }
1423
1424 /* If the options ended with ':' we are heading into a nested group
1425 with possible change of options. Such groups are non-capturing and are
1426 not assertions of any kind. All we need to do is skip over the ':';
1427 the newoptions value is handled below. */
1428
1429 bravalue = OP_BRA;
1430 ptr++;
1431 }
1432 }
1433
1434 /* Else we have a referencing group; adjust the opcode. */
1435
1436 else
1437 {
1438 if (++(*brackets) > EXTRACT_MAX)
1439 {
1440 *errorptr = ERR13;
1441 goto FAILED;
1442 }
1443 bravalue = OP_BRA + *brackets;
1444 }
1445
1446 /* Process nested bracketed re. Assertions may not be repeated, but other
1447 kinds can be. We copy code into a non-register variable in order to be able
1448 to pass its address because some compilers complain otherwise. Pass in a
1449 new setting for the ims options if they have changed. */
1450
1451 previous = (bravalue >= OP_ONCE)? code : NULL;
1452 *code = bravalue;
1453 tempcode = code;
1454
1455 if (!compile_regex(
1456 options | PCRE_INGROUP, /* Set for all nested groups */
1457 ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
1458 newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
1459 brackets, /* Bracket level */
1460 &tempcode, /* Where to put code (updated) */
1461 &ptr, /* Input pointer (updated) */
1462 errorptr, /* Where to put an error message */
1463 (bravalue == OP_ASSERTBACK ||
1464 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1465 condref, /* Condition reference number */
1466 cd)) /* Tables block */
1467 goto FAILED;
1468
1469 /* At the end of compiling, code is still pointing to the start of the
1470 group, while tempcode has been updated to point past the end of the group
1471 and any option resetting that may follow it. The pattern pointer (ptr)
1472 is on the bracket. */
1473
1474 /* If this is a conditional bracket, check that there are no more than
1475 two branches in the group. */
1476
1477 if (bravalue == OP_COND)
1478 {
1479 int branchcount = 0;
1480 uschar *tc = code;
1481
1482 do {
1483 branchcount++;
1484 tc += (tc[1] << 8) | tc[2];
1485 }
1486 while (*tc != OP_KET);
1487
1488 if (branchcount > 2)
1489 {
1490 *errorptr = ERR27;
1491 goto FAILED;
1492 }
1493 }
1494
1495 /* Now update the main code pointer to the end of the group. */
1496
1497 code = tempcode;
1498
1499 /* Error if hit end of pattern */
1500
1501 if (*ptr != ')')
1502 {
1503 *errorptr = ERR14;
1504 goto FAILED;
1505 }
1506 break;
1507
1508 /* Check \ for being a real metacharacter; if not, fall through and handle
1509 it as a data character at the start of a string. Escape items are checked
1510 for validity in the pre-compiling pass. */
1511
1512 case '\\':
1513 tempptr = ptr;
1514 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1515
1516 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1517 are arranged to be the negation of the corresponding OP_values. For the
1518 back references, the values are ESC_REF plus the reference number. Only
1519 back references and those types that consume a character may be repeated.
1520 We can test for values between ESC_b and ESC_Z for the latter; this may
1521 have to change if any new ones are ever created. */
1522
1523 if (c < 0)
1524 {
1525 if (-c >= ESC_REF)
1526 {
1527 previous = code;
1528 *code++ = OP_REF;
1529 *code++ = -c - ESC_REF;
1530 }
1531 else
1532 {
1533 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
1534 *code++ = -c;
1535 }
1536 continue;
1537 }
1538
1539 /* Data character: reset and fall through */
1540
1541 ptr = tempptr;
1542 c = '\\';
1543
1544 /* Handle a run of data characters until a metacharacter is encountered.
1545 The first character is guaranteed not to be whitespace or # when the
1546 extended flag is set. */
1547
1548 NORMAL_CHAR:
1549 default:
1550 previous = code;
1551 *code = OP_CHARS;
1552 code += 2;
1553 length = 0;
1554
1555 do
1556 {
1557 if ((options & PCRE_EXTENDED) != 0)
1558 {
1559 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1560 if (c == '#')
1561 {
1562 while ((c = *(++ptr)) != 0 && c != '\n');
1563 if (c == 0) break;
1564 continue;
1565 }
1566 }
1567
1568 /* Backslash may introduce a data char or a metacharacter. Escaped items
1569 are checked for validity in the pre-compiling pass. Stop the string
1570 before a metaitem. */
1571
1572 if (c == '\\')
1573 {
1574 tempptr = ptr;
1575 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1576 if (c < 0) { ptr = tempptr; break; }
1577 }
1578
1579 /* Ordinary character or single-char escape */
1580
1581 *code++ = c;
1582 length++;
1583 }
1584
1585 /* This "while" is the end of the "do" above. */
1586
1587 while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
1588
1589 /* Compute the length and set it in the data vector, and advance to
1590 the next state. */
1591
1592 previous[1] = length;
1593 if (length < 255) ptr--;
1594 break;
1595 }
1596 } /* end of big loop */
1597
1598 /* Control never reaches here by falling through, only by a goto for all the
1599 error states. Pass back the position in the pattern so that it can be displayed
1600 to the user for diagnosing the error. */
1601
1602 FAILED:
1603 *ptrptr = ptr;
1604 return FALSE;
1605 }
1606
1607
1608
1609
1610 /*************************************************
1611 * Compile sequence of alternatives *
1612 *************************************************/
1613
1614 /* On entry, ptr is pointing past the bracket character, but on return
1615 it points to the closing bracket, or vertical bar, or end of string.
1616 The code variable is pointing at the byte into which the BRA operator has been
1617 stored. If the ims options are changed at the start (for a (?ims: group) or
1618 during any branch, we need to insert an OP_OPT item at the start of every
1619 following branch to ensure they get set correctly at run time, and also pass
1620 the new options into every subsequent branch compile.
1621
1622 Argument:
1623 options the option bits
1624 optchanged new ims options to set as if (?ims) were at the start, or -1
1625 for no change
1626 brackets -> int containing the number of extracting brackets used
1627 codeptr -> the address of the current code pointer
1628 ptrptr -> the address of the current pattern pointer
1629 errorptr -> pointer to error message
1630 lookbehind TRUE if this is a lookbehind assertion
1631 condref > 0 for OPT_CREF setting at start of conditional group
1632 cd points to the data block with tables pointers
1633
1634 Returns: TRUE on success
1635 */
1636
1637 static BOOL
1638 compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
1639 const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,
1640 compile_data *cd)
1641 {
1642 const uschar *ptr = *ptrptr;
1643 uschar *code = *codeptr;
1644 uschar *last_branch = code;
1645 uschar *start_bracket = code;
1646 uschar *reverse_count = NULL;
1647 int oldoptions = options & PCRE_IMS;
1648
1649 code += 3;
1650
1651 /* At the start of a reference-based conditional group, insert the reference
1652 number as an OP_CREF item. */
1653
1654 if (condref > 0)
1655 {
1656 *code++ = OP_CREF;
1657 *code++ = condref;
1658 }
1659
1660 /* Loop for each alternative branch */
1661
1662 for (;;)
1663 {
1664 int length;
1665
1666 /* Handle change of options */
1667
1668 if (optchanged >= 0)
1669 {
1670 *code++ = OP_OPT;
1671 *code++ = optchanged;
1672 options = (options & ~PCRE_IMS) | optchanged;
1673 }
1674
1675 /* Set up dummy OP_REVERSE if lookbehind assertion */
1676
1677 if (lookbehind)
1678 {
1679 *code++ = OP_REVERSE;
1680 reverse_count = code;
1681 *code++ = 0;
1682 *code++ = 0;
1683 }
1684
1685 /* Now compile the branch */
1686
1687 if (!compile_branch(options,brackets,&code,&ptr,errorptr,&optchanged,cd))
1688 {
1689 *ptrptr = ptr;
1690 return FALSE;
1691 }
1692
1693 /* Fill in the length of the last branch */
1694
1695 length = code - last_branch;
1696 last_branch[1] = length >> 8;
1697 last_branch[2] = length & 255;
1698
1699 /* If lookbehind, check that this branch matches a fixed-length string,
1700 and put the length into the OP_REVERSE item. Temporarily mark the end of
1701 the branch with OP_END. */
1702
1703 if (lookbehind)
1704 {
1705 *code = OP_END;
1706 length = find_fixedlength(last_branch);
1707 DPRINTF(("fixed length = %d\n", length));
1708 if (length < 0)
1709 {
1710 *errorptr = ERR25;
1711 *ptrptr = ptr;
1712 return FALSE;
1713 }
1714 reverse_count[0] = (length >> 8);
1715 reverse_count[1] = length & 255;
1716 }
1717
1718 /* Reached end of expression, either ')' or end of pattern. Insert a
1719 terminating ket and the length of the whole bracketed item, and return,
1720 leaving the pointer at the terminating char. If any of the ims options
1721 were changed inside the group, compile a resetting op-code following. */
1722
1723 if (*ptr != '|')
1724 {
1725 length = code - start_bracket;
1726 *code++ = OP_KET;
1727 *code++ = length >> 8;
1728 *code++ = length & 255;
1729 if (optchanged >= 0)
1730 {
1731 *code++ = OP_OPT;
1732 *code++ = oldoptions;
1733 }
1734 *codeptr = code;
1735 *ptrptr = ptr;
1736 return TRUE;
1737 }
1738
1739 /* Another branch follows; insert an "or" node and advance the pointer. */
1740
1741 *code = OP_ALT;
1742 last_branch = code;
1743 code += 3;
1744 ptr++;
1745 }
1746 /* Control never reaches here */
1747 }
1748
1749
1750
1751
1752 /*************************************************
1753 * Find first significant op code *
1754 *************************************************/
1755
1756 /* This is called by several functions that scan a compiled expression looking
1757 for a fixed first character, or an anchoring op code etc. It skips over things
1758 that do not influence this. For one application, a change of caseless option is
1759 important.
1760
1761 Arguments:
1762 code pointer to the start of the group
1763 options pointer to external options
1764 optbit the option bit whose changing is significant, or
1765 zero if none are
1766 optstop TRUE to return on option change, otherwise change the options
1767 value and continue
1768
1769 Returns: pointer to the first significant opcode
1770 */
1771
1772 static const uschar*
1773 first_significant_code(const uschar *code, int *options, int optbit,
1774 BOOL optstop)
1775 {
1776 for (;;)
1777 {
1778 switch ((int)*code)
1779 {
1780 case OP_OPT:
1781 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1782 {
1783 if (optstop) return code;
1784 *options = (int)code[1];
1785 }
1786 code += 2;
1787 break;
1788
1789 case OP_CREF:
1790 code += 2;
1791 break;
1792
1793 case OP_ASSERT_NOT:
1794 case OP_ASSERTBACK:
1795 case OP_ASSERTBACK_NOT:
1796 do code += (code[1] << 8) + code[2]; while (*code == OP_ALT);
1797 code += 3;
1798 break;
1799
1800 default:
1801 return code;
1802 }
1803 }
1804 /* Control never reaches here */
1805 }
1806
1807
1808
1809
1810 /*************************************************
1811 * Check for anchored expression *
1812 *************************************************/
1813
1814 /* Try to find out if this is an anchored regular expression. Consider each
1815 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
1816 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
1817 it's anchored. However, if this is a multiline pattern, then only OP_SOD
1818 counts, since OP_CIRC can match in the middle.
1819
1820 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
1821 because that will try the rest of the pattern at all possible matching points,
1822 so there is no point trying them again.
1823
1824 Arguments:
1825 code points to start of expression (the bracket)
1826 options points to the options setting
1827
1828 Returns: TRUE or FALSE
1829 */
1830
1831 static BOOL
1832 is_anchored(register const uschar *code, int *options)
1833 {
1834 do {
1835 const uschar *scode = first_significant_code(code + 3, options,
1836 PCRE_MULTILINE, FALSE);
1837 register int op = *scode;
1838 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
1839 { if (!is_anchored(scode, options)) return FALSE; }
1840 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
1841 (*options & PCRE_DOTALL) != 0)
1842 { if (scode[1] != OP_ANY) return FALSE; }
1843 else if (op != OP_SOD &&
1844 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
1845 return FALSE;
1846 code += (code[1] << 8) + code[2];
1847 }
1848 while (*code == OP_ALT);
1849 return TRUE;
1850 }
1851
1852
1853
1854 /*************************************************
1855 * Check for starting with ^ or .* *
1856 *************************************************/
1857
1858 /* This is called to find out if every branch starts with ^ or .* so that
1859 "first char" processing can be done to speed things up in multiline
1860 matching and for non-DOTALL patterns that start with .* (which must start at
1861 the beginning or after \n).
1862
1863 Argument: points to start of expression (the bracket)
1864 Returns: TRUE or FALSE
1865 */
1866
1867 static BOOL
1868 is_startline(const uschar *code)
1869 {
1870 do {
1871 const uschar *scode = first_significant_code(code + 3, NULL, 0, FALSE);
1872 register int op = *scode;
1873 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
1874 { if (!is_startline(scode)) return FALSE; }
1875 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
1876 { if (scode[1] != OP_ANY) return FALSE; }
1877 else if (op != OP_CIRC) return FALSE;
1878 code += (code[1] << 8) + code[2];
1879 }
1880 while (*code == OP_ALT);
1881 return TRUE;
1882 }
1883
1884
1885
1886 /*************************************************
1887 * Check for fixed first char *
1888 *************************************************/
1889
1890 /* Try to find out if there is a fixed first character. This is called for
1891 unanchored expressions, as it speeds up their processing quite considerably.
1892 Consider each alternative branch. If they all start with the same char, or with
1893 a bracket all of whose alternatives start with the same char (recurse ad lib),
1894 then we return that char, otherwise -1.
1895
1896 Arguments:
1897 code points to start of expression (the bracket)
1898 options pointer to the options (used to check casing changes)
1899
1900 Returns: -1 or the fixed first char
1901 */
1902
1903 static int
1904 find_firstchar(const uschar *code, int *options)
1905 {
1906 register int c = -1;
1907 do {
1908 int d;
1909 const uschar *scode = first_significant_code(code + 3, options,
1910 PCRE_CASELESS, TRUE);
1911 register int op = *scode;
1912
1913 if (op >= OP_BRA) op = OP_BRA;
1914
1915 switch(op)
1916 {
1917 default:
1918 return -1;
1919
1920 case OP_BRA:
1921 case OP_ASSERT:
1922 case OP_ONCE:
1923 case OP_COND:
1924 if ((d = find_firstchar(scode, options)) < 0) return -1;
1925 if (c < 0) c = d; else if (c != d) return -1;
1926 break;
1927
1928 case OP_EXACT: /* Fall through */
1929 scode++;
1930
1931 case OP_CHARS: /* Fall through */
1932 scode++;
1933
1934 case OP_PLUS:
1935 case OP_MINPLUS:
1936 if (c < 0) c = scode[1]; else if (c != scode[1]) return -1;
1937 break;
1938 }
1939
1940 code += (code[1] << 8) + code[2];
1941 }
1942 while (*code == OP_ALT);
1943 return c;
1944 }
1945
1946
1947
1948
1949
1950 /*************************************************
1951 * Compile a Regular Expression *
1952 *************************************************/
1953
1954 /* This function takes a string and returns a pointer to a block of store
1955 holding a compiled version of the expression.
1956
1957 Arguments:
1958 pattern the regular expression
1959 options various option bits
1960 errorptr pointer to pointer to error text
1961 erroroffset ptr offset in pattern where error was detected
1962 tables pointer to character tables or NULL
1963
1964 Returns: pointer to compiled data block, or NULL on error,
1965 with errorptr and erroroffset set
1966 */
1967
1968 pcre *
1969 pcre_compile(const char *pattern, int options, const char **errorptr,
1970 int *erroroffset, const unsigned char *tables)
1971 {
1972 real_pcre *re;
1973 int length = 3; /* For initial BRA plus length */
1974 int runlength;
1975 int c, size;
1976 int bracount = 0;
1977 int top_backref = 0;
1978 int branch_extra = 0;
1979 int branch_newextra;
1980 unsigned int brastackptr = 0;
1981 uschar *code;
1982 const uschar *ptr;
1983 compile_data compile_block;
1984 int brastack[BRASTACK_SIZE];
1985 uschar bralenstack[BRASTACK_SIZE];
1986
1987 #ifdef DEBUG
1988 uschar *code_base, *code_end;
1989 #endif
1990
1991 /* We can't pass back an error message if errorptr is NULL; I guess the best we
1992 can do is just return NULL. */
1993
1994 if (errorptr == NULL) return NULL;
1995 *errorptr = NULL;
1996
1997 /* However, we can give a message for this error */
1998
1999 if (erroroffset == NULL)
2000 {
2001 *errorptr = ERR16;
2002 return NULL;
2003 }
2004 *erroroffset = 0;
2005
2006 if ((options & ~PUBLIC_OPTIONS) != 0)
2007 {
2008 *errorptr = ERR17;
2009 return NULL;
2010 }
2011
2012 /* Set up pointers to the individual character tables */
2013
2014 if (tables == NULL) tables = pcre_default_tables;
2015 compile_block.lcc = tables + lcc_offset;
2016 compile_block.fcc = tables + fcc_offset;
2017 compile_block.cbits = tables + cbits_offset;
2018 compile_block.ctypes = tables + ctypes_offset;
2019
2020 /* Reflect pattern for debugging output */
2021
2022 DPRINTF(("------------------------------------------------------------------\n"));
2023 DPRINTF(("%s\n", pattern));
2024
2025 /* The first thing to do is to make a pass over the pattern to compute the
2026 amount of store required to hold the compiled code. This does not have to be
2027 perfect as long as errors are overestimates. At the same time we can detect any
2028 internal flag settings. Make an attempt to correct for any counted white space
2029 if an "extended" flag setting appears late in the pattern. We can't be so
2030 clever for #-comments. */
2031
2032 ptr = (const uschar *)(pattern - 1);
2033 while ((c = *(++ptr)) != 0)
2034 {
2035 int min, max;
2036 int class_charcount;
2037
2038 if ((options & PCRE_EXTENDED) != 0)
2039 {
2040 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2041 if (c == '#')
2042 {
2043 while ((c = *(++ptr)) != 0 && c != '\n');
2044 continue;
2045 }
2046 }
2047
2048 switch(c)
2049 {
2050 /* A backslashed item may be an escaped "normal" character or a
2051 character type. For a "normal" character, put the pointers and
2052 character back so that tests for whitespace etc. in the input
2053 are done correctly. */
2054
2055 case '\\':
2056 {
2057 const uschar *save_ptr = ptr;
2058 c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
2059 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2060 if (c >= 0)
2061 {
2062 ptr = save_ptr;
2063 c = '\\';
2064 goto NORMAL_CHAR;
2065 }
2066 }
2067 length++;
2068
2069 /* A back reference needs an additional char, plus either one or 5
2070 bytes for a repeat. We also need to keep the value of the highest
2071 back reference. */
2072
2073 if (c <= -ESC_REF)
2074 {
2075 int refnum = -c - ESC_REF;
2076 if (refnum > top_backref) top_backref = refnum;
2077 length++; /* For single back reference */
2078 if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2079 {
2080 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2081 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2082 if ((min == 0 && (max == 1 || max == -1)) ||
2083 (min == 1 && max == -1))
2084 length++;
2085 else length += 5;
2086 if (ptr[1] == '?') ptr++;
2087 }
2088 }
2089 continue;
2090
2091 case '^':
2092 case '.':
2093 case '$':
2094 case '*': /* These repeats won't be after brackets; */
2095 case '+': /* those are handled separately */
2096 case '?':
2097 length++;
2098 continue;
2099
2100 /* This covers the cases of repeats after a single char, metachar, class,
2101 or back reference. */
2102
2103 case '{':
2104 if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
2105 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
2106 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2107 if ((min == 0 && (max == 1 || max == -1)) ||
2108 (min == 1 && max == -1))
2109 length++;
2110 else
2111 {
2112 length--; /* Uncount the original char or metachar */
2113 if (min == 1) length++; else if (min > 0) length += 4;
2114 if (max > 0) length += 4; else length += 2;
2115 }
2116 if (ptr[1] == '?') ptr++;
2117 continue;
2118
2119 /* An alternation contains an offset to the next branch or ket. If any ims
2120 options changed in the previous branch(es), and/or if we are in a
2121 lookbehind assertion, extra space will be needed at the start of the
2122 branch. This is handled by branch_extra. */
2123
2124 case '|':
2125 length += 3 + branch_extra;
2126 continue;
2127
2128 /* A character class uses 33 characters. Don't worry about character types
2129 that aren't allowed in classes - they'll get picked up during the compile.
2130 A character class that contains only one character uses 2 or 3 bytes,
2131 depending on whether it is negated or not. Notice this where we can. */
2132
2133 case '[':
2134 class_charcount = 0;
2135 if (*(++ptr) == '^') ptr++;
2136 do
2137 {
2138 if (*ptr == '\\')
2139 {
2140 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
2141 &compile_block);
2142 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2143 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2144 }
2145 else class_charcount++;
2146 ptr++;
2147 }
2148 while (*ptr != 0 && *ptr != ']');
2149
2150 /* Repeats for negated single chars are handled by the general code */
2151
2152 if (class_charcount == 1) length += 3; else
2153 {
2154 length += 33;
2155
2156 /* A repeat needs either 1 or 5 bytes. */
2157
2158 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2159 {
2160 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2161 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2162 if ((min == 0 && (max == 1 || max == -1)) ||
2163 (min == 1 && max == -1))
2164 length++;
2165 else length += 5;
2166 if (ptr[1] == '?') ptr++;
2167 }
2168 }
2169 continue;
2170
2171 /* Brackets may be genuine groups or special things */
2172
2173 case '(':
2174 branch_newextra = 0;
2175
2176 /* Handle special forms of bracket, which all start (? */
2177
2178 if (ptr[1] == '?')
2179 {
2180 int set, unset;
2181 int *optset;
2182
2183 switch (c = ptr[2])
2184 {
2185 /* Skip over comments entirely */
2186 case '#':
2187 ptr += 3;
2188 while (*ptr != 0 && *ptr != ')') ptr++;
2189 if (*ptr == 0)
2190 {
2191 *errorptr = ERR18;
2192 goto PCRE_ERROR_RETURN;
2193 }
2194 continue;
2195
2196 /* Non-referencing groups and lookaheads just move the pointer on, and
2197 then behave like a non-special bracket, except that they don't increment
2198 the count of extracting brackets. Ditto for the "once only" bracket,
2199 which is in Perl from version 5.005. */
2200
2201 case ':':
2202 case '=':
2203 case '!':
2204 case '>':
2205 ptr += 2;
2206 break;
2207
2208 /* Lookbehinds are in Perl from version 5.005 */
2209
2210 case '<':
2211 if (ptr[3] == '=' || ptr[3] == '!')
2212 {
2213 ptr += 3;
2214 branch_newextra = 3;
2215 length += 3; /* For the first branch */
2216 break;
2217 }
2218 *errorptr = ERR24;
2219 goto PCRE_ERROR_RETURN;
2220
2221 /* Conditionals are in Perl from version 5.005. The bracket must either
2222 be followed by a number (for bracket reference) or by an assertion
2223 group. */
2224
2225 case '(':
2226 if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
2227 {
2228 ptr += 4;
2229 length += 2;
2230 while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
2231 if (*ptr != ')')
2232 {
2233 *errorptr = ERR26;
2234 goto PCRE_ERROR_RETURN;
2235 }
2236 }
2237 else /* An assertion must follow */
2238 {
2239 ptr++; /* Can treat like ':' as far as spacing is concerned */
2240
2241 if (ptr[2] != '?' || strchr("=!<", ptr[3]) == NULL)
2242 {
2243 ptr += 2; /* To get right offset in message */
2244 *errorptr = ERR28;
2245 goto PCRE_ERROR_RETURN;
2246 }
2247 }
2248 break;
2249
2250 /* Else loop checking valid options until ) is met. Anything else is an
2251 error. If we are without any brackets, i.e. at top level, the settings
2252 act as if specified in the options, so massage the options immediately.
2253 This is for backward compatibility with Perl 5.004. */
2254
2255 default:
2256 set = unset = 0;
2257 optset = &set;
2258 ptr += 2;
2259
2260 for (;; ptr++)
2261 {
2262 c = *ptr;
2263 switch (c)
2264 {
2265 case 'i':
2266 *optset |= PCRE_CASELESS;
2267 continue;
2268
2269 case 'm':
2270 *optset |= PCRE_MULTILINE;
2271 continue;
2272
2273 case 's':
2274 *optset |= PCRE_DOTALL;
2275 continue;
2276
2277 case 'x':
2278 *optset |= PCRE_EXTENDED;
2279 continue;
2280
2281 case 'X':
2282 *optset |= PCRE_EXTRA;
2283 continue;
2284
2285 case 'U':
2286 *optset |= PCRE_UNGREEDY;
2287 continue;
2288
2289 case '-':
2290 optset = &unset;
2291 continue;
2292
2293 /* A termination by ')' indicates an options-setting-only item;
2294 this is global at top level; otherwise nothing is done here and
2295 it is handled during the compiling process on a per-bracket-group
2296 basis. */
2297
2298 case ')':
2299 if (brastackptr == 0)
2300 {
2301 options = (options | set) & (~unset);
2302 set = unset = 0; /* To save length */
2303 }
2304 /* Fall through */
2305
2306 /* A termination by ':' indicates the start of a nested group with
2307 the given options set. This is again handled at compile time, but
2308 we must allow for compiled space if any of the ims options are
2309 set. We also have to allow for resetting space at the end of
2310 the group, which is why 4 is added to the length and not just 2.
2311 If there are several changes of options within the same group, this
2312 will lead to an over-estimate on the length, but this shouldn't
2313 matter very much. We also have to allow for resetting options at
2314 the start of any alternations, which we do by setting
2315 branch_newextra to 2. */
2316
2317 case ':':
2318 if (((set|unset) & PCRE_IMS) != 0)
2319 {
2320 length += 4;
2321 branch_newextra = 2;
2322 }
2323 goto END_OPTIONS;
2324
2325 /* Unrecognized option character */
2326
2327 default:
2328 *errorptr = ERR12;
2329 goto PCRE_ERROR_RETURN;
2330 }
2331 }
2332
2333 /* If we hit a closing bracket, that's it - this is a freestanding
2334 option-setting. We need to ensure that branch_extra is updated if
2335 necessary. The only values branch_newextra can have here are 0 or 2.
2336 If the value is 2, then branch_extra must either be 2 or 5, depending
2337 on whether this is a lookbehind group or not. */
2338
2339 END_OPTIONS:
2340 if (c == ')')
2341 {
2342 if (branch_newextra == 2 && (branch_extra == 0 || branch_extra == 3))
2343 branch_extra += branch_newextra;
2344 continue;
2345 }
2346
2347 /* If options were terminated by ':' control comes here. Fall through
2348 to handle the group below. */
2349 }
2350 }
2351
2352 /* Extracting brackets must be counted so we can process escapes in a
2353 Perlish way. */
2354
2355 else bracount++;
2356
2357 /* Non-special forms of bracket. Save length for computing whole length
2358 at end if there's a repeat that requires duplication of the group. Also
2359 save the current value of branch_extra, and start the new group with
2360 the new value. If non-zero, this will either be 2 for a (?imsx: group, or 3
2361 for a lookbehind assertion. */
2362
2363 if (brastackptr >= sizeof(brastack)/sizeof(int))
2364 {
2365 *errorptr = ERR19;
2366 goto PCRE_ERROR_RETURN;
2367 }
2368
2369 bralenstack[brastackptr] = branch_extra;
2370 branch_extra = branch_newextra;
2371
2372 brastack[brastackptr++] = length;
2373 length += 3;
2374 continue;
2375
2376 /* Handle ket. Look for subsequent max/min; for certain sets of values we
2377 have to replicate this bracket up to that many times. If brastackptr is
2378 0 this is an unmatched bracket which will generate an error, but take care
2379 not to try to access brastack[-1] when computing the length and restoring
2380 the branch_extra value. */
2381
2382 case ')':
2383 length += 3;
2384 {
2385 int minval = 1;
2386 int maxval = 1;
2387 int duplength;
2388
2389 if (brastackptr > 0)
2390 {
2391 duplength = length - brastack[--brastackptr];
2392 branch_extra = bralenstack[brastackptr];
2393 }
2394 else duplength = 0;
2395
2396 /* Leave ptr at the final char; for read_repeat_counts this happens
2397 automatically; for the others we need an increment. */
2398
2399 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
2400 {
2401 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr,
2402 &compile_block);
2403 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2404 }
2405 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2406 else if (c == '+') { maxval = -1; ptr++; }
2407 else if (c == '?') { minval = 0; ptr++; }
2408
2409 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2410 group, and if the maximum is greater than zero, we have to replicate
2411 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2412 bracket set - hence the 7. */
2413
2414 if (minval == 0)
2415 {
2416 length++;
2417 if (maxval > 0) length += (maxval - 1) * (duplength + 7);
2418 }
2419
2420 /* When the minimum is greater than zero, 1 we have to replicate up to
2421 minval-1 times, with no additions required in the copies. Then, if
2422 there is a limited maximum we have to replicate up to maxval-1 times
2423 allowing for a BRAZERO item before each optional copy and nesting
2424 brackets for all but one of the optional copies. */
2425
2426 else
2427 {
2428 length += (minval - 1) * duplength;
2429 if (maxval > minval) /* Need this test as maxval=-1 means no limit */
2430 length += (maxval - minval) * (duplength + 7) - 6;
2431 }
2432 }
2433 continue;
2434
2435 /* Non-special character. For a run of such characters the length required
2436 is the number of characters + 2, except that the maximum run length is 255.
2437 We won't get a skipped space or a non-data escape or the start of a #
2438 comment as the first character, so the length can't be zero. */
2439
2440 NORMAL_CHAR:
2441 default:
2442 length += 2;
2443 runlength = 0;
2444 do
2445 {
2446 if ((options & PCRE_EXTENDED) != 0)
2447 {
2448 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2449 if (c == '#')
2450 {
2451 while ((c = *(++ptr)) != 0 && c != '\n');
2452 continue;
2453 }
2454 }
2455
2456 /* Backslash may introduce a data char or a metacharacter; stop the
2457 string before the latter. */
2458
2459 if (c == '\\')
2460 {
2461 const uschar *saveptr = ptr;
2462 c = check_escape(&ptr, errorptr, bracount, options, FALSE,
2463 &compile_block);
2464 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2465 if (c < 0) { ptr = saveptr; break; }
2466 }
2467
2468 /* Ordinary character or single-char escape */
2469
2470 runlength++;
2471 }
2472
2473 /* This "while" is the end of the "do" above. */
2474
2475 while (runlength < 255 &&
2476 (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
2477
2478 ptr--;
2479 length += runlength;
2480 continue;
2481 }
2482 }
2483
2484 length += 4; /* For final KET and END */
2485
2486 if (length > 65539)
2487 {
2488 *errorptr = ERR20;
2489 return NULL;
2490 }
2491
2492 /* Compute the size of data block needed and get it, either from malloc or
2493 externally provided function. We specify "code[0]" in the offsetof() expression
2494 rather than just "code", because it has been reported that one broken compiler
2495 fails on "code" because it is also an independent variable. It should make no
2496 difference to the value of the offsetof(). */
2497
2498 size = length + offsetof(real_pcre, code[0]);
2499 re = (real_pcre *)(pcre_malloc)(size);
2500
2501 if (re == NULL)
2502 {
2503 *errorptr = ERR21;
2504 return NULL;
2505 }
2506
2507 /* Put in the magic number and the options. */
2508
2509 re->magic_number = MAGIC_NUMBER;
2510 re->options = options;
2511 re->tables = tables;
2512
2513 /* Set up a starting, non-extracting bracket, then compile the expression. On
2514 error, *errorptr will be set non-NULL, so we don't need to look at the result
2515 of the function here. */
2516
2517 ptr = (const uschar *)pattern;
2518 code = re->code;
2519 *code = OP_BRA;
2520 bracount = 0;
2521 (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,
2522 &compile_block);
2523 re->top_bracket = bracount;
2524 re->top_backref = top_backref;
2525
2526 /* If not reached end of pattern on success, there's an excess bracket. */
2527
2528 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
2529
2530 /* Fill in the terminating state and check for disastrous overflow, but
2531 if debugging, leave the test till after things are printed out. */
2532
2533 *code++ = OP_END;
2534
2535 #ifndef DEBUG
2536 if (code - re->code > length) *errorptr = ERR23;
2537 #endif
2538
2539 /* Give an error if there's back reference to a non-existent capturing
2540 subpattern. */
2541
2542 if (top_backref > re->top_bracket) *errorptr = ERR15;
2543
2544 /* Failed to compile */
2545
2546 if (*errorptr != NULL)
2547 {
2548 (pcre_free)(re);
2549 PCRE_ERROR_RETURN:
2550 *erroroffset = ptr - (const uschar *)pattern;
2551 return NULL;
2552 }
2553
2554 /* If the anchored option was not passed, set flag if we can determine that the
2555 pattern is anchored by virtue of ^ characters or \A or anything else (such as
2556 starting with .* when DOTALL is set).
2557
2558 Otherwise, see if we can determine what the first character has to be, because
2559 that speeds up unanchored matches no end. If not, see if we can set the
2560 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
2561 start with ^. and also when all branches start with .* for non-DOTALL matches.
2562 */
2563
2564 if ((options & PCRE_ANCHORED) == 0)
2565 {
2566 int temp_options = options;
2567 if (is_anchored(re->code, &temp_options))
2568 re->options |= PCRE_ANCHORED;
2569 else
2570 {
2571 int ch = find_firstchar(re->code, &temp_options);
2572 if (ch >= 0)
2573 {
2574 re->first_char = ch;
2575 re->options |= PCRE_FIRSTSET;
2576 }
2577 else if (is_startline(re->code))
2578 re->options |= PCRE_STARTLINE;
2579 }
2580 }
2581
2582 /* Print out the compiled data for debugging */
2583
2584 #ifdef DEBUG
2585
2586 printf("Length = %d top_bracket = %d top_backref = %d\n",
2587 length, re->top_bracket, re->top_backref);
2588
2589 if (re->options != 0)
2590 {
2591 printf("%s%s%s%s%s%s%s%s\n",
2592 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2593 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2594 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2595 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2596 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
2597 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
2598 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
2599 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
2600 }
2601
2602 if ((re->options & PCRE_FIRSTSET) != 0)
2603 {
2604 if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
2605 else printf("First char = \\x%02x\n", re->first_char);
2606 }
2607
2608 code_end = code;
2609 code_base = code = re->code;
2610
2611 while (code < code_end)
2612 {
2613 int charlength;
2614
2615 printf("%3d ", code - code_base);
2616
2617 if (*code >= OP_BRA)
2618 {
2619 printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
2620 code += 2;
2621 }
2622
2623 else switch(*code)
2624 {
2625 case OP_OPT:
2626 printf(" %.2x %s", code[1], OP_names[*code]);
2627 code++;
2628 break;
2629
2630 case OP_COND:
2631 printf("%3d Cond", (code[1] << 8) + code[2]);
2632 code += 2;
2633 break;
2634
2635 case OP_CREF:
2636 printf(" %.2d %s", code[1], OP_names[*code]);
2637 code++;
2638 break;
2639
2640 case OP_CHARS:
2641 charlength = *(++code);
2642 printf("%3d ", charlength);
2643 while (charlength-- > 0)
2644 if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2645 break;
2646
2647 case OP_KETRMAX:
2648 case OP_KETRMIN:
2649 case OP_ALT:
2650 case OP_KET:
2651 case OP_ASSERT:
2652 case OP_ASSERT_NOT:
2653 case OP_ASSERTBACK:
2654 case OP_ASSERTBACK_NOT:
2655 case OP_ONCE:
2656 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2657 code += 2;
2658 break;
2659
2660 case OP_REVERSE:
2661 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2662 code += 2;
2663 break;
2664
2665 case OP_STAR:
2666 case OP_MINSTAR:
2667 case OP_PLUS:
2668 case OP_MINPLUS:
2669 case OP_QUERY:
2670 case OP_MINQUERY:
2671 case OP_TYPESTAR:
2672 case OP_TYPEMINSTAR:
2673 case OP_TYPEPLUS:
2674 case OP_TYPEMINPLUS:
2675 case OP_TYPEQUERY:
2676 case OP_TYPEMINQUERY:
2677 if (*code >= OP_TYPESTAR)
2678 printf(" %s", OP_names[code[1]]);
2679 else if (isprint(c = code[1])) printf(" %c", c);
2680 else printf(" \\x%02x", c);
2681 printf("%s", OP_names[*code++]);
2682 break;
2683
2684 case OP_EXACT:
2685 case OP_UPTO:
2686 case OP_MINUPTO:
2687 if (isprint(c = code[3])) printf(" %c{", c);
2688 else printf(" \\x%02x{", c);
2689 if (*code != OP_EXACT) printf("0,");
2690 printf("%d}", (code[1] << 8) + code[2]);
2691 if (*code == OP_MINUPTO) printf("?");
2692 code += 3;
2693 break;
2694
2695 case OP_TYPEEXACT:
2696 case OP_TYPEUPTO:
2697 case OP_TYPEMINUPTO:
2698 printf(" %s{", OP_names[code[3]]);
2699 if (*code != OP_TYPEEXACT) printf(",");
2700 printf("%d}", (code[1] << 8) + code[2]);
2701 if (*code == OP_TYPEMINUPTO) printf("?");
2702 code += 3;
2703 break;
2704
2705 case OP_NOT:
2706 if (isprint(c = *(++code))) printf(" [^%c]", c);
2707 else printf(" [^\\x%02x]", c);
2708 break;
2709
2710 case OP_NOTSTAR:
2711 case OP_NOTMINSTAR:
2712 case OP_NOTPLUS:
2713 case OP_NOTMINPLUS:
2714 case OP_NOTQUERY:
2715 case OP_NOTMINQUERY:
2716 if (isprint(c = code[1])) printf(" [^%c]", c);
2717 else printf(" [^\\x%02x]", c);
2718 printf("%s", OP_names[*code++]);
2719 break;
2720
2721 case OP_NOTEXACT:
2722 case OP_NOTUPTO:
2723 case OP_NOTMINUPTO:
2724 if (isprint(c = code[3])) printf(" [^%c]{", c);
2725 else printf(" [^\\x%02x]{", c);
2726 if (*code != OP_NOTEXACT) printf(",");
2727 printf("%d}", (code[1] << 8) + code[2]);
2728 if (*code == OP_NOTMINUPTO) printf("?");
2729 code += 3;
2730 break;
2731
2732 case OP_REF:
2733 printf(" \\%d", *(++code));
2734 code ++;
2735 goto CLASS_REF_REPEAT;
2736
2737 case OP_CLASS:
2738 {
2739 int i, min, max;
2740 code++;
2741 printf(" [");
2742
2743 for (i = 0; i < 256; i++)
2744 {
2745 if ((code[i/8] & (1 << (i&7))) != 0)
2746 {
2747 int j;
2748 for (j = i+1; j < 256; j++)
2749 if ((code[j/8] & (1 << (j&7))) == 0) break;
2750 if (i == '-' || i == ']') printf("\\");
2751 if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2752 if (--j > i)
2753 {
2754 printf("-");
2755 if (j == '-' || j == ']') printf("\\");
2756 if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2757 }
2758 i = j;
2759 }
2760 }
2761 printf("]");
2762 code += 32;
2763
2764 CLASS_REF_REPEAT:
2765
2766 switch(*code)
2767 {
2768 case OP_CRSTAR:
2769 case OP_CRMINSTAR:
2770 case OP_CRPLUS:
2771 case OP_CRMINPLUS:
2772 case OP_CRQUERY:
2773 case OP_CRMINQUERY:
2774 printf("%s", OP_names[*code]);
2775 break;
2776
2777 case OP_CRRANGE:
2778 case OP_CRMINRANGE:
2779 min = (code[1] << 8) + code[2];
2780 max = (code[3] << 8) + code[4];
2781 if (max == 0) printf("{%d,}", min);
2782 else printf("{%d,%d}", min, max);
2783 if (*code == OP_CRMINRANGE) printf("?");
2784 code += 4;
2785 break;
2786
2787 default:
2788 code--;
2789 }
2790 }
2791 break;
2792
2793 /* Anything else is just a one-node item */
2794
2795 default:
2796 printf(" %s", OP_names[*code]);
2797 break;
2798 }
2799
2800 code++;
2801 printf("\n");
2802 }
2803 printf("------------------------------------------------------------------\n");
2804
2805 /* This check is done here in the debugging case so that the code that
2806 was compiled can be seen. */
2807
2808 if (code - re->code > length)
2809 {
2810 *errorptr = ERR23;
2811 (pcre_free)(re);
2812 *erroroffset = ptr - (uschar *)pattern;
2813 return NULL;
2814 }
2815 #endif
2816
2817 return (pcre *)re;
2818 }
2819
2820
2821
2822 /*************************************************
2823 * Match a back-reference *
2824 *************************************************/
2825
2826 /* If a back reference hasn't been set, the length that is passed is greater
2827 than the number of characters left in the string, so the match fails.
2828
2829 Arguments:
2830 offset index into the offset vector
2831 eptr points into the subject
2832 length length to be matched
2833 md points to match data block
2834 ims the ims flags
2835
2836 Returns: TRUE if matched
2837 */
2838
2839 static BOOL
2840 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
2841 int ims)
2842 {
2843 const uschar *p = md->start_subject + md->offset_vector[offset];
2844
2845 #ifdef DEBUG
2846 if (eptr >= md->end_subject)
2847 printf("matching subject <null>");
2848 else
2849 {
2850 printf("matching subject ");
2851 pchars(eptr, length, TRUE, md);
2852 }
2853 printf(" against backref ");
2854 pchars(p, length, FALSE, md);
2855 printf("\n");
2856 #endif
2857
2858 /* Always fail if not enough characters left */
2859
2860 if (length > md->end_subject - eptr) return FALSE;
2861
2862 /* Separate the caselesss case for speed */
2863
2864 if ((ims & PCRE_CASELESS) != 0)
2865 {
2866 while (length-- > 0)
2867 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
2868 }
2869 else
2870 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
2871
2872 return TRUE;
2873 }
2874
2875
2876
2877 /*************************************************
2878 * Match from current position *
2879 *************************************************/
2880
2881 /* On entry ecode points to the first opcode, and eptr to the first character
2882 in the subject string, while eptrb holds the value of eptr at the start of the
2883 last bracketed group - used for breaking infinite loops matching zero-length
2884 strings.
2885
2886 Arguments:
2887 eptr pointer in subject
2888 ecode position in code
2889 offset_top current top pointer
2890 md pointer to "static" info for the match
2891 ims current /i, /m, and /s options
2892 condassert TRUE if called to check a condition assertion
2893 eptrb eptr at start of last bracket
2894
2895 Returns: TRUE if matched
2896 */
2897
2898 static BOOL
2899 match(register const uschar *eptr, register const uschar *ecode,
2900 int offset_top, match_data *md, int ims, BOOL condassert, const uschar *eptrb)
2901 {
2902 int original_ims = ims; /* Save for resetting on ')' */
2903
2904 for (;;)
2905 {
2906 int op = (int)*ecode;
2907 int min, max, ctype;
2908 register int i;
2909 register int c;
2910 BOOL minimize = FALSE;
2911
2912 /* Opening capturing bracket. If there is space in the offset vector, save
2913 the current subject position in the working slot at the top of the vector. We
2914 mustn't change the current values of the data slot, because they may be set
2915 from a previous iteration of this group, and be referred to by a reference
2916 inside the group.
2917
2918 If the bracket fails to match, we need to restore this value and also the
2919 values of the final offsets, in case they were set by a previous iteration of
2920 the same bracket.
2921
2922 If there isn't enough space in the offset vector, treat this as if it were a
2923 non-capturing bracket. Don't worry about setting the flag for the error case
2924 here; that is handled in the code for KET. */
2925
2926 if (op > OP_BRA)
2927 {
2928 int number = op - OP_BRA;
2929 int offset = number << 1;
2930
2931 #ifdef DEBUG
2932 printf("start bracket %d subject=", number);
2933 pchars(eptr, 16, TRUE, md);
2934 printf("\n");
2935 #endif
2936
2937 if (offset < md->offset_max)
2938 {
2939 int save_offset1 = md->offset_vector[offset];
2940 int save_offset2 = md->offset_vector[offset+1];
2941 int save_offset3 = md->offset_vector[md->offset_end - number];
2942
2943 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
2944 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
2945
2946 do
2947 {
2948 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
2949 ecode += (ecode[1] << 8) + ecode[2];
2950 }
2951 while (*ecode == OP_ALT);
2952
2953 DPRINTF(("bracket %d failed\n", number));
2954
2955 md->offset_vector[offset] = save_offset1;
2956 md->offset_vector[offset+1] = save_offset2;
2957 md->offset_vector[md->offset_end - number] = save_offset3;
2958 return FALSE;
2959 }
2960
2961 /* Insufficient room for saving captured contents */
2962
2963 else op = OP_BRA;
2964 }
2965
2966 /* Other types of node can be handled by a switch */
2967
2968 switch(op)
2969 {
2970 case OP_BRA: /* Non-capturing bracket: optimized */
2971 DPRINTF(("start bracket 0\n"));
2972 do
2973 {
2974 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
2975 ecode += (ecode[1] << 8) + ecode[2];
2976 }
2977 while (*ecode == OP_ALT);
2978 DPRINTF(("bracket 0 failed\n"));
2979 return FALSE;
2980
2981 /* Conditional group: compilation checked that there are no more than
2982 two branches. If the condition is false, skipping the first branch takes us
2983 past the end if there is only one branch, but that's OK because that is
2984 exactly what going to the ket would do. */
2985
2986 case OP_COND:
2987 if (ecode[3] == OP_CREF) /* Condition is extraction test */
2988 {
2989 int offset = ecode[4] << 1; /* Doubled reference number */
2990 return match(eptr,
2991 ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
2992 5 : 3 + (ecode[1] << 8) + ecode[2]),
2993 offset_top, md, ims, FALSE, eptr);
2994 }
2995
2996 /* The condition is an assertion. Call match() to evaluate it - setting
2997 the final argument TRUE causes it to stop at the end of an assertion. */
2998
2999 else
3000 {
3001 if (match(eptr, ecode+3, offset_top, md, ims, TRUE, NULL))
3002 {
3003 ecode += 3 + (ecode[4] << 8) + ecode[5];
3004 while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
3005 }
3006 else ecode += (ecode[1] << 8) + ecode[2];
3007 return match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr);
3008 }
3009 /* Control never reaches here */
3010
3011 /* Skip over conditional reference data if encountered (should not be) */
3012
3013 case OP_CREF:
3014 ecode += 2;
3015 break;
3016
3017 /* End of the pattern */
3018
3019 case OP_END:
3020 md->end_match_ptr = eptr; /* Record where we ended */
3021 md->end_offset_top = offset_top; /* and how many extracts were taken */
3022 return TRUE;
3023
3024 /* Change option settings */
3025
3026 case OP_OPT:
3027 ims = ecode[1];
3028 ecode += 2;
3029 DPRINTF(("ims set to %02x\n", ims));
3030 break;
3031
3032 /* Assertion brackets. Check the alternative branches in turn - the
3033 matching won't pass the KET for an assertion. If any one branch matches,
3034 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
3035 start of each branch to move the current point backwards, so the code at
3036 this level is identical to the lookahead case. */
3037
3038 case OP_ASSERT:
3039 case OP_ASSERTBACK:
3040 do
3041 {
3042 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) break;
3043 ecode += (ecode[1] << 8) + ecode[2];
3044 }
3045 while (*ecode == OP_ALT);
3046 if (*ecode == OP_KET) return FALSE;
3047
3048 /* If checking an assertion for a condition, return TRUE. */
3049
3050 if (condassert) return TRUE;
3051
3052 /* Continue from after the assertion, updating the offsets high water
3053 mark, since extracts may have been taken during the assertion. */
3054
3055 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3056 ecode += 3;
3057 offset_top = md->end_offset_top;
3058 continue;
3059
3060 /* Negative assertion: all branches must fail to match */
3061
3062 case OP_ASSERT_NOT:
3063 case OP_ASSERTBACK_NOT:
3064 do
3065 {
3066 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) return FALSE;
3067 ecode += (ecode[1] << 8) + ecode[2];
3068 }
3069 while (*ecode == OP_ALT);
3070
3071 if (condassert) return TRUE;
3072 ecode += 3;
3073 continue;
3074
3075 /* Move the subject pointer back. This occurs only at the start of
3076 each branch of a lookbehind assertion. If we are too close to the start to
3077 move back, this match function fails. */
3078
3079 case OP_REVERSE:
3080 eptr -= (ecode[1] << 8) + ecode[2];
3081 if (eptr < md->start_subject) return FALSE;
3082 ecode += 3;
3083 break;
3084
3085
3086 /* "Once" brackets are like assertion brackets except that after a match,
3087 the point in the subject string is not moved back. Thus there can never be
3088 a move back into the brackets. Check the alternative branches in turn - the
3089 matching won't pass the KET for this kind of subpattern. If any one branch
3090 matches, we carry on as at the end of a normal bracket, leaving the subject
3091 pointer. */
3092
3093 case OP_ONCE:
3094 {
3095 const uschar *prev = ecode;
3096
3097 do
3098 {
3099 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) break;
3100 ecode += (ecode[1] << 8) + ecode[2];
3101 }
3102 while (*ecode == OP_ALT);
3103
3104 /* If hit the end of the group (which could be repeated), fail */
3105
3106 if (*ecode != OP_ONCE && *ecode != OP_ALT) return FALSE;
3107
3108 /* Continue as from after the assertion, updating the offsets high water
3109 mark, since extracts may have been taken. */
3110
3111 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3112
3113 offset_top = md->end_offset_top;
3114 eptr = md->end_match_ptr;
3115
3116 /* For a non-repeating ket, just continue at this level. This also
3117 happens for a repeating ket if no characters were matched in the group.
3118 This is the forcible breaking of infinite loops as implemented in Perl
3119 5.005. If there is an options reset, it will get obeyed in the normal
3120 course of events. */
3121
3122 if (*ecode == OP_KET || eptr == eptrb)
3123 {
3124 ecode += 3;
3125 break;
3126 }
3127
3128 /* The repeating kets try the rest of the pattern or restart from the
3129 preceding bracket, in the appropriate order. We need to reset any options
3130 that changed within the bracket before re-running it, so check the next
3131 opcode. */
3132
3133 if (ecode[3] == OP_OPT)
3134 {
3135 ims = (ims & ~PCRE_IMS) | ecode[4];
3136 DPRINTF(("ims set to %02x at group repeat\n", ims));
3137 }
3138
3139 if (*ecode == OP_KETRMIN)
3140 {
3141 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||
3142 match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;
3143 }
3144 else /* OP_KETRMAX */
3145 {
3146 if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||
3147 match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3148 }
3149 }
3150 return FALSE;
3151
3152 /* An alternation is the end of a branch; scan along to find the end of the
3153 bracketed group and go to there. */
3154
3155 case OP_ALT:
3156 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3157 break;
3158
3159 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3160 that it may occur zero times. It may repeat infinitely, or not at all -
3161 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3162 repeat limits are compiled as a number of copies, with the optional ones
3163 preceded by BRAZERO or BRAMINZERO. */
3164
3165 case OP_BRAZERO:
3166 {
3167 const uschar *next = ecode+1;
3168 if (match(eptr, next, offset_top, md, ims, FALSE, eptr)) return TRUE;
3169 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3170 ecode = next + 3;
3171 }
3172 break;
3173
3174 case OP_BRAMINZERO:
3175 {
3176 const uschar *next = ecode+1;
3177 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3178 if (match(eptr, next+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3179 ecode++;
3180 }
3181 break;
3182
3183 /* End of a group, repeated or non-repeating. If we are at the end of
3184 an assertion "group", stop matching and return TRUE, but record the
3185 current high water mark for use by positive assertions. Do this also
3186 for the "once" (not-backup up) groups. */
3187
3188 case OP_KET:
3189 case OP_KETRMIN:
3190 case OP_KETRMAX:
3191 {
3192 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3193
3194 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
3195 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
3196 *prev == OP_ONCE)
3197 {
3198 md->end_match_ptr = eptr; /* For ONCE */
3199 md->end_offset_top = offset_top;
3200 return TRUE;
3201 }
3202
3203 /* In all other cases except a conditional group we have to check the
3204 group number back at the start and if necessary complete handling an
3205 extraction by setting the offsets and bumping the high water mark. */
3206
3207 if (*prev != OP_COND)
3208 {
3209 int number = *prev - OP_BRA;
3210 int offset = number << 1;
3211
3212 DPRINTF(("end bracket %d\n", number));
3213
3214 if (number > 0)
3215 {
3216 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
3217 {
3218 md->offset_vector[offset] =
3219 md->offset_vector[md->offset_end - number];
3220 md->offset_vector[offset+1] = eptr - md->start_subject;
3221 if (offset_top <= offset) offset_top = offset + 2;
3222 }
3223 }
3224 }
3225
3226 /* Reset the value of the ims flags, in case they got changed during
3227 the group. */
3228
3229 ims = original_ims;
3230 DPRINTF(("ims reset to %02x\n", ims));
3231
3232 /* For a non-repeating ket, just continue at this level. This also
3233 happens for a repeating ket if no characters were matched in the group.
3234 This is the forcible breaking of infinite loops as implemented in Perl
3235 5.005. If there is an options reset, it will get obeyed in the normal
3236 course of events. */
3237
3238 if (*ecode == OP_KET || eptr == eptrb)
3239 {
3240 ecode += 3;
3241 break;
3242 }
3243
3244 /* The repeating kets try the rest of the pattern or restart from the
3245 preceding bracket, in the appropriate order. */
3246
3247 if (*ecode == OP_KETRMIN)
3248 {
3249 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||
3250 match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;
3251 }
3252 else /* OP_KETRMAX */
3253 {
3254 if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||
3255 match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3256 }
3257 }
3258 return FALSE;
3259
3260 /* Start of subject unless notbol, or after internal newline if multiline */
3261
3262 case OP_CIRC:
3263 if (md->notbol && eptr == md->start_subject) return FALSE;
3264 if ((ims & PCRE_MULTILINE) != 0)
3265 {
3266 if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;
3267 ecode++;
3268 break;
3269 }
3270 /* ... else fall through */
3271
3272 /* Start of subject assertion */
3273
3274 case OP_SOD:
3275 if (eptr != md->start_subject) return FALSE;
3276 ecode++;
3277 break;
3278
3279 /* Assert before internal newline if multiline, or before a terminating
3280 newline unless endonly is set, else end of subject unless noteol is set. */
3281
3282 case OP_DOLL:
3283 if ((ims & PCRE_MULTILINE) != 0)
3284 {
3285 if (eptr < md->end_subject) { if (*eptr != '\n') return FALSE; }
3286 else { if (md->noteol) return FALSE; }
3287 ecode++;
3288 break;
3289 }
3290 else
3291 {
3292 if (md->noteol) return FALSE;
3293 if (!md->endonly)
3294 {
3295 if (eptr < md->end_subject - 1 ||
3296 (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3297
3298 ecode++;
3299 break;
3300 }
3301 }
3302 /* ... else fall through */
3303
3304 /* End of subject assertion (\z) */
3305
3306 case OP_EOD:
3307 if (eptr < md->end_subject) return FALSE;
3308 ecode++;
3309 break;
3310
3311 /* End of subject or ending \n assertion (\Z) */
3312
3313 case OP_EODN:
3314 if (eptr < md->end_subject - 1 ||
3315 (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3316 ecode++;
3317 break;
3318
3319 /* Word boundary assertions */
3320
3321 case OP_NOT_WORD_BOUNDARY:
3322 case OP_WORD_BOUNDARY:
3323 {
3324 BOOL prev_is_word = (eptr != md->start_subject) &&
3325 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
3326 BOOL cur_is_word = (eptr < md->end_subject) &&
3327 ((md->ctypes[*eptr] & ctype_word) != 0);
3328 if ((*ecode++ == OP_WORD_BOUNDARY)?
3329 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3330 return FALSE;
3331 }
3332 break;
3333
3334 /* Match a single character type; inline for speed */
3335
3336 case OP_ANY:
3337 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == '\n')
3338 return FALSE;
3339 if (eptr++ >= md->end_subject) return FALSE;
3340 ecode++;
3341 break;
3342
3343 case OP_NOT_DIGIT:
3344 if (eptr >= md->end_subject ||
3345 (md->ctypes[*eptr++] & ctype_digit) != 0)
3346 return FALSE;
3347 ecode++;
3348 break;
3349
3350 case OP_DIGIT:
3351 if (eptr >= md->end_subject ||
3352 (md->ctypes[*eptr++] & ctype_digit) == 0)
3353 return FALSE;
3354 ecode++;
3355 break;
3356
3357 case OP_NOT_WHITESPACE:
3358 if (eptr >= md->end_subject ||
3359 (md->ctypes[*eptr++] & ctype_space) != 0)
3360 return FALSE;
3361 ecode++;
3362 break;
3363
3364 case OP_WHITESPACE:
3365 if (eptr >= md->end_subject ||
3366 (md->ctypes[*eptr++] & ctype_space) == 0)
3367 return FALSE;
3368 ecode++;
3369 break;
3370
3371 case OP_NOT_WORDCHAR:
3372 if (eptr >= md->end_subject ||
3373 (md->ctypes[*eptr++] & ctype_word) != 0)
3374 return FALSE;
3375 ecode++;
3376 break;
3377
3378 case OP_WORDCHAR:
3379 if (eptr >= md->end_subject ||
3380 (md->ctypes[*eptr++] & ctype_word) == 0)
3381 return FALSE;
3382 ecode++;
3383 break;
3384
3385 /* Match a back reference, possibly repeatedly. Look past the end of the
3386 item to see if there is repeat information following. The code is similar
3387 to that for character classes, but repeated for efficiency. Then obey
3388 similar code to character type repeats - written out again for speed.
3389 However, if the referenced string is the empty string, always treat
3390 it as matched, any number of times (otherwise there could be infinite
3391 loops). */
3392
3393 case OP_REF:
3394 {
3395 int length;
3396 int offset = ecode[1] << 1; /* Doubled reference number */
3397 ecode += 2; /* Advance past the item */
3398
3399 /* If the reference is unset, set the length to be longer than the amount
3400 of subject left; this ensures that every attempt at a match fails. We
3401 can't just fail here, because of the possibility of quantifiers with zero
3402 minima. */
3403
3404 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
3405 md->end_subject - eptr + 1 :
3406 md->offset_vector[offset+1] - md->offset_vector[offset];
3407
3408 /* Set up for repetition, or handle the non-repeated case */
3409
3410 switch (*ecode)
3411 {
3412 case OP_CRSTAR:
3413 case OP_CRMINSTAR:
3414 case OP_CRPLUS:
3415 case OP_CRMINPLUS:
3416 case OP_CRQUERY:
3417 case OP_CRMINQUERY:
3418 c = *ecode++ - OP_CRSTAR;
3419 minimize = (c & 1) != 0;
3420 min = rep_min[c]; /* Pick up values from tables; */
3421 max = rep_max[c]; /* zero for max => infinity */
3422 if (max == 0) max = INT_MAX;
3423 break;
3424
3425 case OP_CRRANGE:
3426 case OP_CRMINRANGE:
3427 minimize = (*ecode == OP_CRMINRANGE);
3428 min = (ecode[1] << 8) + ecode[2];
3429 max = (ecode[3] << 8) + ecode[4];
3430 if (max == 0) max = INT_MAX;
3431 ecode += 5;
3432 break;
3433
3434 default: /* No repeat follows */
3435 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
3436 eptr += length;
3437 continue; /* With the main loop */
3438 }
3439
3440 /* If the length of the reference is zero, just continue with the
3441 main loop. */
3442
3443 if (length == 0) continue;
3444
3445 /* First, ensure the minimum number of matches are present. We get back
3446 the length of the reference string explicitly rather than passing the
3447 address of eptr, so that eptr can be a register variable. */
3448
3449 for (i = 1; i <= min; i++)
3450 {
3451 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
3452 eptr += length;
3453 }
3454
3455 /* If min = max, continue at the same level without recursion.
3456 They are not both allowed to be zero. */
3457
3458 if (min == max) continue;
3459
3460 /* If minimizing, keep trying and advancing the pointer */
3461
3462 if (minimize)
3463 {
3464 for (i = min;; i++)
3465 {
3466 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3467 return TRUE;
3468 if (i >= max || !match_ref(offset, eptr, length, md, ims))
3469 return FALSE;
3470 eptr += length;
3471 }
3472 /* Control never gets here */
3473 }
3474
3475 /* If maximizing, find the longest string and work backwards */
3476
3477 else
3478 {
3479 const uschar *pp = eptr;
3480 for (i = min; i < max; i++)
3481 {
3482 if (!match_ref(offset, eptr, length, md, ims)) break;
3483 eptr += length;
3484 }
3485 while (eptr >= pp)
3486 {
3487 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3488 return TRUE;
3489 eptr -= length;
3490 }
3491 return FALSE;
3492 }
3493 }
3494 /* Control never gets here */
3495
3496
3497
3498 /* Match a character class, possibly repeatedly. Look past the end of the
3499 item to see if there is repeat information following. Then obey similar
3500 code to character type repeats - written out again for speed. */
3501
3502 case OP_CLASS:
3503 {
3504 const uschar *data = ecode + 1; /* Save for matching */
3505 ecode += 33; /* Advance past the item */
3506
3507 switch (*ecode)
3508 {
3509 case OP_CRSTAR:
3510 case OP_CRMINSTAR:
3511 case OP_CRPLUS:
3512 case OP_CRMINPLUS:
3513 case OP_CRQUERY:
3514 case OP_CRMINQUERY:
3515 c = *ecode++ - OP_CRSTAR;
3516 minimize = (c & 1) != 0;
3517 min = rep_min[c]; /* Pick up values from tables; */
3518 max = rep_max[c]; /* zero for max => infinity */
3519 if (max == 0) max = INT_MAX;
3520 break;
3521
3522 case OP_CRRANGE:
3523 case OP_CRMINRANGE:
3524 minimize = (*ecode == OP_CRMINRANGE);
3525 min = (ecode[1] << 8) + ecode[2];
3526 max = (ecode[3] << 8) + ecode[4];
3527 if (max == 0) max = INT_MAX;
3528 ecode += 5;
3529 break;
3530
3531 default: /* No repeat follows */
3532 min = max = 1;
3533 break;
3534 }
3535
3536 /* First, ensure the minimum number of matches are present. */
3537
3538 for (i = 1; i <= min; i++)
3539 {
3540 if (eptr >= md->end_subject) return FALSE;
3541 c = *eptr++;
3542 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3543 return FALSE;
3544 }
3545
3546 /* If max == min we can continue with the main loop without the
3547 need to recurse. */
3548
3549 if (min == max) continue;
3550
3551 /* If minimizing, keep testing the rest of the expression and advancing
3552 the pointer while it matches the class. */
3553
3554 if (minimize)
3555 {
3556 for (i = min;; i++)
3557 {
3558 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3559 return TRUE;
3560 if (i >= max || eptr >= md->end_subject) return FALSE;
3561 c = *eptr++;
3562 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3563 return FALSE;
3564 }
3565 /* Control never gets here */
3566 }
3567
3568 /* If maximizing, find the longest possible run, then work backwards. */
3569
3570 else
3571 {
3572 const uschar *pp = eptr;
3573 for (i = min; i < max; eptr++, i++)
3574 {
3575 if (eptr >= md->end_subject) break;
3576 c = *eptr;
3577 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3578 break;
3579 }
3580
3581 while (eptr >= pp)
3582 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3583 return TRUE;
3584 return FALSE;
3585 }
3586 }
3587 /* Control never gets here */
3588
3589 /* Match a run of characters */
3590
3591 case OP_CHARS:
3592 {
3593 register int length = ecode[1];
3594 ecode += 2;
3595
3596 #ifdef DEBUG /* Sigh. Some compilers never learn. */
3597 if (eptr >= md->end_subject)
3598 printf("matching subject <null> against pattern ");
3599 else
3600 {
3601 printf("matching subject ");
3602 pchars(eptr, length, TRUE, md);
3603 printf(" against pattern ");
3604 }
3605 pchars(ecode, length, FALSE, md);
3606 printf("\n");
3607 #endif
3608
3609 if (length > md->end_subject - eptr) return FALSE;
3610 if ((ims & PCRE_CASELESS) != 0)
3611 {
3612 while (length-- > 0)
3613 if (md->lcc[*ecode++] != md->lcc[*eptr++])
3614 return FALSE;
3615 }
3616 else
3617 {
3618 while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
3619 }
3620 }
3621 break;
3622
3623 /* Match a single character repeatedly; different opcodes share code. */
3624
3625 case OP_EXACT:
3626 min = max = (ecode[1] << 8) + ecode[2];
3627 ecode += 3;
3628 goto REPEATCHAR;
3629
3630 case OP_UPTO:
3631 case OP_MINUPTO:
3632 min = 0;
3633 max = (ecode[1] << 8) + ecode[2];
3634 minimize = *ecode == OP_MINUPTO;
3635 ecode += 3;
3636 goto REPEATCHAR;
3637
3638 case OP_STAR:
3639 case OP_MINSTAR:
3640 case OP_PLUS:
3641 case OP_MINPLUS:
3642 case OP_QUERY:
3643 case OP_MINQUERY:
3644 c = *ecode++ - OP_STAR;
3645 minimize = (c & 1) != 0;
3646 min = rep_min[c]; /* Pick up values from tables; */
3647 max = rep_max[c]; /* zero for max => infinity */
3648 if (max == 0) max = INT_MAX;
3649
3650 /* Common code for all repeated single-character matches. We can give
3651 up quickly if there are fewer than the minimum number of characters left in
3652 the subject. */
3653
3654 REPEATCHAR:
3655 if (min > md->end_subject - eptr) return FALSE;
3656 c = *ecode++;
3657
3658 /* The code is duplicated for the caseless and caseful cases, for speed,
3659 since matching characters is likely to be quite common. First, ensure the
3660 minimum number of matches are present. If min = max, continue at the same
3661 level without recursing. Otherwise, if minimizing, keep trying the rest of
3662 the expression and advancing one matching character if failing, up to the
3663 maximum. Alternatively, if maximizing, find the maximum number of
3664 characters and work backwards. */
3665
3666 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
3667 max, eptr));
3668
3669 if ((ims & PCRE_CASELESS) != 0)
3670 {
3671 c = md->lcc[c];
3672 for (i = 1; i <= min; i++)
3673 if (c != md->lcc[*eptr++]) return FALSE;
3674 if (min == max) continue;
3675 if (minimize)
3676 {
3677 for (i = min;; i++)
3678 {
3679 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3680 return TRUE;
3681 if (i >= max || eptr >= md->end_subject ||
3682 c != md->lcc[*eptr++])
3683 return FALSE;
3684 }
3685 /* Control never gets here */
3686 }
3687 else
3688 {
3689 const uschar *pp = eptr;
3690 for (i = min; i < max; i++)
3691 {
3692 if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
3693 eptr++;
3694 }
3695 while (eptr >= pp)
3696 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3697 return TRUE;
3698 return FALSE;
3699 }
3700 /* Control never gets here */
3701 }
3702
3703 /* Caseful comparisons */
3704
3705 else
3706 {
3707 for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
3708 if (min == max) continue;
3709 if (minimize)
3710 {
3711 for (i = min;; i++)
3712 {
3713 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3714 return TRUE;
3715 if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
3716 }
3717 /* Control never gets here */
3718 }
3719 else
3720 {
3721 const uschar *pp = eptr;
3722 for (i = min; i < max; i++)
3723 {
3724 if (eptr >= md->end_subject || c != *eptr) break;
3725 eptr++;
3726 }
3727 while (eptr >= pp)
3728 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3729 return TRUE;
3730 return FALSE;
3731 }
3732 }
3733 /* Control never gets here */
3734
3735 /* Match a negated single character */
3736
3737 case OP_NOT:
3738 if (eptr >= md->end_subject) return FALSE;
3739 ecode++;
3740 if ((ims & PCRE_CASELESS) != 0)
3741 {
3742 if (md->lcc[*ecode++] == md->lcc[*eptr++]) return FALSE;
3743 }
3744 else
3745 {
3746 if (*ecode++ == *eptr++) return FALSE;
3747 }
3748 break;
3749
3750 /* Match a negated single character repeatedly. This is almost a repeat of
3751 the code for a repeated single character, but I haven't found a nice way of
3752 commoning these up that doesn't require a test of the positive/negative
3753 option for each character match. Maybe that wouldn't add very much to the
3754 time taken, but character matching *is* what this is all about... */
3755
3756 case OP_NOTEXACT:
3757 min = max = (ecode[1] << 8) + ecode[2];
3758 ecode += 3;
3759 goto REPEATNOTCHAR;
3760
3761 case OP_NOTUPTO:
3762 case OP_NOTMINUPTO:
3763 min = 0;
3764 max = (ecode[1] << 8) + ecode[2];
3765 minimize = *ecode == OP_NOTMINUPTO;
3766 ecode += 3;
3767 goto REPEATNOTCHAR;
3768
3769 case OP_NOTSTAR:
3770 case OP_NOTMINSTAR:
3771 case OP_NOTPLUS:
3772 case OP_NOTMINPLUS:
3773 case OP_NOTQUERY:
3774 case OP_NOTMINQUERY:
3775 c = *ecode++ - OP_NOTSTAR;
3776 minimize = (c & 1) != 0;
3777 min = rep_min[c]; /* Pick up values from tables; */
3778 max = rep_max[c]; /* zero for max => infinity */
3779 if (max == 0) max = INT_MAX;
3780
3781 /* Common code for all repeated single-character matches. We can give
3782 up quickly if there are fewer than the minimum number of characters left in
3783 the subject. */
3784
3785 REPEATNOTCHAR:
3786 if (min > md->end_subject - eptr) return FALSE;
3787 c = *ecode++;
3788
3789 /* The code is duplicated for the caseless and caseful cases, for speed,
3790 since matching characters is likely to be quite common. First, ensure the
3791 minimum number of matches are present. If min = max, continue at the same
3792 level without recursing. Otherwise, if minimizing, keep trying the rest of
3793 the expression and advancing one matching character if failing, up to the
3794 maximum. Alternatively, if maximizing, find the maximum number of
3795 characters and work backwards. */
3796
3797 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
3798 max, eptr));
3799
3800 if ((ims & PCRE_CASELESS) != 0)
3801 {
3802 c = md->lcc[c];
3803 for (i = 1; i <= min; i++)
3804 if (c == md->lcc[*eptr++]) return FALSE;
3805 if (min == max) continue;
3806 if (minimize)
3807 {
3808 for (i = min;; i++)
3809 {
3810 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3811 return TRUE;
3812 if (i >= max || eptr >= md->end_subject ||
3813 c == md->lcc[*eptr++])
3814 return FALSE;
3815 }
3816 /* Control never gets here */
3817 }
3818 else
3819 {
3820 const uschar *pp = eptr;
3821 for (i = min; i < max; i++)
3822 {
3823 if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
3824 eptr++;
3825 }
3826 while (eptr >= pp)
3827 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3828 return TRUE;
3829 return FALSE;
3830 }
3831 /* Control never gets here */
3832 }
3833
3834 /* Caseful comparisons */
3835
3836 else
3837 {
3838 for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
3839 if (min == max) continue;
3840 if (minimize)
3841 {
3842 for (i = min;; i++)
3843 {
3844 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3845 return TRUE;
3846 if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
3847 }
3848 /* Control never gets here */
3849 }
3850 else
3851 {
3852 const uschar *pp = eptr;
3853 for (i = min; i < max; i++)
3854 {
3855 if (eptr >= md->end_subject || c == *eptr) break;
3856 eptr++;
3857 }
3858 while (eptr >= pp)
3859 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3860 return TRUE;
3861 return FALSE;
3862 }
3863 }
3864 /* Control never gets here */
3865
3866 /* Match a single character type repeatedly; several different opcodes
3867 share code. This is very similar to the code for single characters, but we
3868 repeat it in the interests of efficiency. */
3869
3870 case OP_TYPEEXACT:
3871 min = max = (ecode[1] << 8) + ecode[2];
3872 minimize = TRUE;
3873 ecode += 3;
3874 goto REPEATTYPE;
3875
3876 case OP_TYPEUPTO:
3877 case OP_TYPEMINUPTO:
3878 min = 0;
3879 max = (ecode[1] << 8) + ecode[2];
3880 minimize = *ecode == OP_TYPEMINUPTO;
3881 ecode += 3;
3882 goto REPEATTYPE;
3883
3884 case OP_TYPESTAR:
3885 case OP_TYPEMINSTAR:
3886 case OP_TYPEPLUS:
3887 case OP_TYPEMINPLUS:
3888 case OP_TYPEQUERY:
3889 case OP_TYPEMINQUERY:
3890 c = *ecode++ - OP_TYPESTAR;
3891 minimize = (c & 1) != 0;
3892 min = rep_min[c]; /* Pick up values from tables; */
3893 max = rep_max[c]; /* zero for max => infinity */
3894 if (max == 0) max = INT_MAX;
3895
3896 /* Common code for all repeated single character type matches */
3897
3898 REPEATTYPE:
3899 ctype = *ecode++; /* Code for the character type */
3900
3901 /* First, ensure the minimum number of matches are present. Use inline
3902 code for maximizing the speed, and do the type test once at the start
3903 (i.e. keep it out of the loop). Also test that there are at least the
3904 minimum number of characters before we start. */
3905
3906 if (min > md->end_subject - eptr) return FALSE;
3907 if (min > 0) switch(ctype)
3908 {
3909 case OP_ANY:
3910 if ((ims & PCRE_DOTALL) == 0)
3911 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }
3912 else eptr += min;
3913 break;
3914
3915 case OP_NOT_DIGIT:
3916 for (i = 1; i <= min; i++)
3917 if ((md->ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
3918 break;
3919
3920 case OP_DIGIT:
3921 for (i = 1; i <= min; i++)
3922 if ((md->ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
3923 break;
3924
3925 case OP_NOT_WHITESPACE:
3926 for (i = 1; i <= min; i++)
3927 if ((md->ctypes[*eptr++] & ctype_space) != 0) return FALSE;
3928 break;
3929
3930 case OP_WHITESPACE:
3931 for (i = 1; i <= min; i++)
3932 if ((md->ctypes[*eptr++] & ctype_space) == 0) return FALSE;
3933 break;
3934
3935 case OP_NOT_WORDCHAR:
3936 for (i = 1; i <= min; i++)
3937 if ((md->ctypes[*eptr++] & ctype_word) != 0)
3938 return FALSE;
3939 break;
3940
3941 case OP_WORDCHAR:
3942 for (i = 1; i <= min; i++)
3943 if ((md->ctypes[*eptr++] & ctype_word) == 0)
3944 return FALSE;
3945 break;
3946 }
3947
3948 /* If min = max, continue at the same level without recursing */
3949
3950 if (min == max) continue;
3951
3952 /* If minimizing, we have to test the rest of the pattern before each
3953 subsequent match. */
3954
3955 if (minimize)
3956 {
3957 for (i = min;; i++)
3958 {
3959 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb)) return TRUE;
3960 if (i >= max || eptr >= md->end_subject) return FALSE;
3961
3962 c = *eptr++;
3963 switch(ctype)
3964 {
3965 case OP_ANY:
3966 if ((ims & PCRE_DOTALL) == 0 && c == '\n') return FALSE;
3967 break;
3968
3969 case OP_NOT_DIGIT:
3970 if ((md->ctypes[c] & ctype_digit) != 0) return FALSE;
3971 break;
3972
3973 case OP_DIGIT:
3974 if ((md->ctypes[c] & ctype_digit) == 0) return FALSE;
3975 break;
3976
3977 case OP_NOT_WHITESPACE:
3978 if ((md->ctypes[c] & ctype_space) != 0) return FALSE;
3979 break;
3980
3981 case OP_WHITESPACE:
3982 if ((md->ctypes[c] & ctype_space) == 0) return FALSE;
3983 break;
3984
3985 case OP_NOT_WORDCHAR:
3986 if ((md->ctypes[c] & ctype_word) != 0) return FALSE;
3987 break;
3988
3989 case OP_WORDCHAR:
3990 if ((md->ctypes[c] & ctype_word) == 0) return FALSE;
3991 break;
3992 }
3993 }
3994 /* Control never gets here */
3995 }
3996
3997 /* If maximizing it is worth using inline code for speed, doing the type
3998 test once at the start (i.e. keep it out of the loop). */
3999
4000 else
4001 {
4002 const uschar *pp = eptr;
4003 switch(ctype)
4004 {
4005 case OP_ANY:
4006 if ((ims & PCRE_DOTALL) == 0)
4007 {
4008 for (i = min; i < max; i++)
4009 {
4010 if (eptr >= md->end_subject || *eptr == '\n') break;
4011 eptr++;
4012 }
4013 }
4014 else
4015 {
4016 c = max - min;
4017 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
4018 eptr += c;
4019 }
4020 break;
4021
4022 case OP_NOT_DIGIT:
4023 for (i = min; i < max; i++)
4024 {
4025 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4026 break;
4027 eptr++;
4028 }
4029 break;
4030
4031 case OP_DIGIT:
4032 for (i = min; i < max; i++)
4033 {
4034 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4035 break;
4036 eptr++;
4037 }
4038 break;
4039
4040 case OP_NOT_WHITESPACE:
4041 for (i = min; i < max; i++)
4042 {
4043 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4044 break;
4045 eptr++;
4046 }
4047 break;
4048
4049 case OP_WHITESPACE:
4050 for (i = min; i < max; i++)
4051 {
4052 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4053 break;
4054 eptr++;
4055 }
4056 break;
4057
4058 case OP_NOT_WORDCHAR:
4059 for (i = min; i < max; i++)
4060 {
4061 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4062 break;
4063 eptr++;
4064 }
4065 break;
4066
4067 case OP_WORDCHAR:
4068 for (i = min; i < max; i++)
4069 {
4070 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4071 break;
4072 eptr++;
4073 }
4074 break;
4075 }
4076
4077 while (eptr >= pp)
4078 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
4079 return TRUE;
4080 return FALSE;
4081 }
4082 /* Control never gets here */
4083
4084 /* There's been some horrible disaster. */
4085
4086 default:
4087 DPRINTF(("Unknown opcode %d\n", *ecode));
4088 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
4089 return FALSE;
4090 }
4091
4092 /* Do not stick any code in here without much thought; it is assumed
4093 that "continue" in the code above comes out to here to repeat the main
4094 loop. */
4095
4096 } /* End of main loop */
4097 /* Control never reaches here */
4098 }
4099
4100
4101
4102
4103 /*************************************************
4104 * Execute a Regular Expression *
4105 *************************************************/
4106
4107 /* This function applies a compiled re to a subject string and picks out
4108 portions of the string if it matches. Two elements in the vector are set for
4109 each substring: the offsets to the start and end of the substring.
4110
4111 Arguments:
4112 external_re points to the compiled expression
4113 external_extra points to "hints" from pcre_study() or is NULL
4114 subject points to the subject string
4115 length length of subject string (may contain binary zeros)
4116 options option bits
4117 offsets points to a vector of ints to be filled in with offsets
4118 offsetcount the number of elements in the vector
4119
4120 Returns: > 0 => success; value is the number of elements filled in
4121 = 0 => success, but offsets is not big enough
4122 -1 => failed to match
4123 < -1 => some kind of unexpected problem
4124 */
4125
4126 int
4127 pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
4128 const char *subject, int length, int options, int *offsets, int offsetcount)
4129 {
4130 int resetcount, ocount;
4131 int first_char = -1;
4132 int ims = 0;
4133 match_data match_block;
4134 const uschar *start_bits = NULL;
4135 const uschar *start_match = (const uschar *)subject;
4136 const uschar *end_subject;
4137 const real_pcre *re = (const real_pcre *)external_re;
4138 const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
4139 BOOL using_temporary_offsets = FALSE;
4140 BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4141 BOOL startline = (re->options & PCRE_STARTLINE) != 0;
4142
4143 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4144
4145 if (re == NULL || subject == NULL ||
4146 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4147 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4148
4149 match_block.start_subject = (const uschar *)subject;
4150 match_block.end_subject = match_block.start_subject + length;
4151 end_subject = match_block.end_subject;
4152
4153 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4154
4155 match_block.notbol = (options & PCRE_NOTBOL) != 0;
4156 match_block.noteol = (options & PCRE_NOTEOL) != 0;
4157
4158 match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
4159
4160 match_block.lcc = re->tables + lcc_offset;
4161 match_block.ctypes = re->tables + ctypes_offset;
4162
4163 /* The ims options can vary during the matching as a result of the presence
4164 of (?ims) items in the pattern. They are kept in a local variable so that
4165 restoring at the exit of a group is easy. */
4166
4167 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4168
4169 /* If the expression has got more back references than the offsets supplied can
4170 hold, we get a temporary bit of working store to use during the matching.
4171 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4172 of 3. */
4173
4174 ocount = offsetcount - (offsetcount % 3);
4175
4176 if (re->top_backref > 0 && re->top_backref >= ocount/3)
4177 {
4178 ocount = re->top_backref * 3 + 3;
4179 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4180 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4181 using_temporary_offsets = TRUE;
4182 DPRINTF(("Got memory to hold back references\n"));
4183 }
4184 else match_block.offset_vector = offsets;
4185
4186 match_block.offset_end = ocount;
4187 match_block.offset_max = (2*ocount)/3;
4188 match_block.offset_overflow = FALSE;
4189
4190 /* Compute the minimum number of offsets that we need to reset each time. Doing
4191 this makes a huge difference to execution time when there aren't many brackets
4192 in the pattern. */
4193
4194 resetcount = 2 + re->top_bracket * 2;
4195 if (resetcount > offsetcount) resetcount = ocount;
4196
4197 /* Reset the working variable associated with each extraction. These should
4198 never be used unless previously set, but they get saved and restored, and so we
4199 initialize them to avoid reading uninitialized locations. */
4200
4201 if (match_block.offset_vector != NULL)
4202 {
4203 register int *iptr = match_block.offset_vector + ocount;
4204 register int *iend = iptr - resetcount/2 + 1;
4205 while (--iptr >= iend) *iptr = -1;
4206 }
4207
4208 /* Set up the first character to match, if available. The first_char value is
4209 never set for an anchored regular expression, but the anchoring may be forced
4210 at run time, so we have to test for anchoring. The first char may be unset for
4211 an unanchored pattern, of course. If there's no first char and the pattern was
4212 studied, there may be a bitmap of possible first characters. */
4213
4214 if (!anchored)
4215 {
4216 if ((re->options & PCRE_FIRSTSET) != 0)
4217 {
4218 first_char = re->first_char;
4219 if ((ims & PCRE_CASELESS) != 0) first_char = match_block.lcc[first_char];
4220 }
4221 else
4222 if (!startline && extra != NULL &&
4223 (extra->options & PCRE_STUDY_MAPPED) != 0)
4224 start_bits = extra->start_bits;
4225 }
4226
4227 /* Loop for unanchored matches; for anchored regexps the loop runs just once. */
4228
4229 do
4230 {
4231 int rc;
4232 register int *iptr = match_block.offset_vector;
4233 register int *iend = iptr + resetcount;
4234
4235 /* Reset the maximum number of extractions we might see. */
4236
4237 while (iptr < iend) *iptr++ = -1;
4238
4239 /* Advance to a unique first char if possible */
4240
4241 if (first_char >= 0)
4242 {
4243 if ((ims & PCRE_CASELESS) != 0)
4244 while (start_match < end_subject &&
4245 match_block.lcc[*start_match] != first_char)
4246 start_match++;
4247 else
4248 while (start_match < end_subject && *start_match != first_char)
4249 start_match++;
4250 }
4251
4252 /* Or to just after \n for a multiline match if possible */
4253
4254 else if (startline)
4255 {
4256 if (start_match > match_block.start_subject)
4257 {
4258 while (start_match < end_subject && start_match[-1] != '\n')
4259 start_match++;
4260 }
4261 }
4262
4263 /* Or to a non-unique first char */
4264
4265 else if (start_bits != NULL)
4266 {
4267 while (start_match < end_subject)
4268 {
4269 register int c = *start_match;
4270 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
4271 }
4272 }
4273
4274 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4275 printf(">>>> Match against: ");
4276 pchars(start_match, end_subject - start_match, TRUE, &match_block);
4277 printf("\n");
4278 #endif
4279
4280 /* When a match occurs, substrings will be set for all internal extractions;
4281 we just need to set up the whole thing as substring 0 before returning. If
4282 there were too many extractions, set the return code to zero. In the case
4283 where we had to get some local store to hold offsets for backreferences, copy
4284 those back references that we can. In this case there need not be overflow
4285 if certain parts of the pattern were not used. */
4286
4287 if (!match(start_match, re->code, 2, &match_block, ims, FALSE, start_match))
4288 continue;
4289
4290 /* Copy the offset information from temporary store if necessary */
4291
4292 if (using_temporary_offsets)
4293 {
4294 if (offsetcount >= 4)
4295 {
4296 memcpy(offsets + 2, match_block.offset_vector + 2,
4297 (offsetcount - 2) * sizeof(int));
4298 DPRINTF(("Copied offsets from temporary memory\n"));
4299 }
4300 if (match_block.end_offset_top > offsetcount)
4301 match_block.offset_overflow = TRUE;
4302
4303 DPRINTF(("Freeing temporary memory\n"));
4304 (pcre_free)(match_block.offset_vector);
4305 }
4306
4307 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
4308
4309 if (match_block.offset_end < 2) rc = 0; else
4310 {
4311 offsets[0] = start_match - match_block.start_subject;
4312 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
4313 }
4314
4315 DPRINTF((">>>> returning %d\n", rc));
4316 return rc;
4317 }
4318
4319 /* This "while" is the end of the "do" above */
4320
4321 while (!anchored &&
4322 match_block.errorcode == PCRE_ERROR_NOMATCH &&
4323 start_match++ < end_subject);
4324
4325 if (using_temporary_offsets)
4326 {
4327 DPRINTF(("Freeing temporary memory\n"));
4328 (pcre_free)(match_block.offset_vector);
4329 }
4330
4331 DPRINTF((">>>> returning %d\n", match_block.errorcode));
4332
4333 return match_block.errorcode;
4334 }
4335
4336 /* End of pcre.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12