/[pcre]/code/trunk/pcre.c
ViewVC logotype

Contents of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 29 - (show annotations) (download)
Sat Feb 24 21:38:53 2007 UTC (7 years, 9 months ago) by nigel
File MIME type: text/plain
File size: 121051 byte(s)
Load pcre-2.03 into code/trunk.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /*
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
9
10 Written by: Philip Hazel <ph10@cam.ac.uk>
11
12 Copyright (c) 1997-1999 University of Cambridge
13
14 -----------------------------------------------------------------------------
15 Permission is granted to anyone to use this software for any purpose on any
16 computer system, and to redistribute it freely, subject to the following
17 restrictions:
18
19 1. This software is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22
23 2. The origin of this software must not be misrepresented, either by
24 explicit claim or by omission.
25
26 3. Altered versions must be plainly marked as such, and must not be
27 misrepresented as being the original software.
28
29 4. If PCRE is embedded in any software that is released under the GNU
30 General Purpose Licence (GPL), then the terms of that licence shall
31 supersede any condition above with which it is incompatible.
32 -----------------------------------------------------------------------------
33 */
34
35
36 /* Define DEBUG to get debugging output on stdout. */
37
38 /* #define DEBUG */
39
40 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
41 inline, and there are *still* stupid compilers about that don't like indented
42 pre-processor statements. I suppose it's only been 10 years... */
43
44 #ifdef DEBUG
45 #define DPRINTF(p) printf p
46 #else
47 #define DPRINTF(p) /*nothing*/
48 #endif
49
50 /* Include the internals header, which itself includes Standard C headers plus
51 the external pcre header. */
52
53 #include "internal.h"
54
55
56 /* Allow compilation as C++ source code, should anybody want to do that. */
57
58 #ifdef __cplusplus
59 #define class pcre_class
60 #endif
61
62
63 /* Number of items on the nested bracket stacks at compile time. This should
64 not be set greater than 200. */
65
66 #define BRASTACK_SIZE 200
67
68
69 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
70
71 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
72 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
73
74 /* Text forms of OP_ values and things, for debugging (not all used) */
75
76 #ifdef DEBUG
77 static const char *OP_names[] = {
78 "End", "\\A", "\\B", "\\b", "\\D", "\\d",
79 "\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",
80 "Opt", "^", "$", "Any", "chars", "not",
81 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
82 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
83 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
84 "*", "*?", "+", "+?", "?", "??", "{", "{",
85 "class", "Ref",
86 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
87 "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
88 "Brazero", "Braminzero", "Bra"
89 };
90 #endif
91
92 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
93 are simple data values; negative values are for special things like \d and so
94 on. Zero means further processing is needed (for things like \x), or the escape
95 is invalid. */
96
97 static const short int escapes[] = {
98 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
99 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
100 '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
101 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
102 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
103 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
104 '`', 7, -ESC_b, 0, -ESC_d, 27, '\f', 0, /* ` - g */
105 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
106 0, 0, '\r', -ESC_s, '\t', 0, 0, -ESC_w, /* p - w */
107 0, 0, -ESC_z /* x - z */
108 };
109
110 /* Definition to allow mutual recursion */
111
112 static BOOL
113 compile_regex(int, int, int *, uschar **, const uschar **, const char **,
114 BOOL, int, compile_data *);
115
116
117
118 /*************************************************
119 * Global variables *
120 *************************************************/
121
122 /* PCRE is thread-clean and doesn't use any global variables in the normal
123 sense. However, it calls memory allocation and free functions via the two
124 indirections below, which are can be changed by the caller, but are shared
125 between all threads. */
126
127 void *(*pcre_malloc)(size_t) = malloc;
128 void (*pcre_free)(void *) = free;
129
130
131
132
133 /*************************************************
134 * Default character tables *
135 *************************************************/
136
137 /* A default set of character tables is included in the PCRE binary. Its source
138 is built by the maketables auxiliary program, which uses the default C ctypes
139 functions, and put in the file chartables.c. These tables are used by PCRE
140 whenever the caller of pcre_compile() does not provide an alternate set of
141 tables. */
142
143 #include "chartables.c"
144
145
146
147 /*************************************************
148 * Return version string *
149 *************************************************/
150
151 const char *
152 pcre_version(void)
153 {
154 return PCRE_VERSION;
155 }
156
157
158
159
160 /*************************************************
161 * Return info about a compiled pattern *
162 *************************************************/
163
164 /* This function picks potentially useful data out of the private
165 structure.
166
167 Arguments:
168 external_re points to compiled code
169 optptr where to pass back the options
170 first_char where to pass back the first character,
171 or -1 if multiline and all branches start ^,
172 or -2 otherwise
173
174 Returns: number of identifying extraction brackets
175 or negative values on error
176 */
177
178 int
179 pcre_info(const pcre *external_re, int *optptr, int *first_char)
180 {
181 const real_pcre *re = (const real_pcre *)external_re;
182 if (re == NULL) return PCRE_ERROR_NULL;
183 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
184 if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
185 if (first_char != NULL)
186 *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
187 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
188 return re->top_bracket;
189 }
190
191
192
193
194 #ifdef DEBUG
195 /*************************************************
196 * Debugging function to print chars *
197 *************************************************/
198
199 /* Print a sequence of chars in printable format, stopping at the end of the
200 subject if the requested.
201
202 Arguments:
203 p points to characters
204 length number to print
205 is_subject TRUE if printing from within md->start_subject
206 md pointer to matching data block, if is_subject is TRUE
207
208 Returns: nothing
209 */
210
211 static void
212 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
213 {
214 int c;
215 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
216 while (length-- > 0)
217 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
218 }
219 #endif
220
221
222
223
224 /*************************************************
225 * Handle escapes *
226 *************************************************/
227
228 /* This function is called when a \ has been encountered. It either returns a
229 positive value for a simple escape such as \n, or a negative value which
230 encodes one of the more complicated things such as \d. On entry, ptr is
231 pointing at the \. On exit, it is on the final character of the escape
232 sequence.
233
234 Arguments:
235 ptrptr points to the pattern position pointer
236 errorptr points to the pointer to the error message
237 bracount number of previous extracting brackets
238 options the options bits
239 isclass TRUE if inside a character class
240 cd pointer to char tables block
241
242 Returns: zero or positive => a data character
243 negative => a special escape sequence
244 on error, errorptr is set
245 */
246
247 static int
248 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
249 int options, BOOL isclass, compile_data *cd)
250 {
251 const uschar *ptr = *ptrptr;
252 int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
253 int i;
254
255 if (c == 0) *errorptr = ERR1;
256
257 /* Digits or letters may have special meaning; all others are literals. */
258
259 else if (c < '0' || c > 'z') {}
260
261 /* Do an initial lookup in a table. A non-zero result is something that can be
262 returned immediately. Otherwise further processing may be required. */
263
264 else if ((i = escapes[c - '0']) != 0) c = i;
265
266 /* Escapes that need further processing, or are illegal. */
267
268 else
269 {
270 const uschar *oldptr;
271 switch (c)
272 {
273 /* The handling of escape sequences consisting of a string of digits
274 starting with one that is not zero is not straightforward. By experiment,
275 the way Perl works seems to be as follows:
276
277 Outside a character class, the digits are read as a decimal number. If the
278 number is less than 10, or if there are that many previous extracting
279 left brackets, then it is a back reference. Otherwise, up to three octal
280 digits are read to form an escaped byte. Thus \123 is likely to be octal
281 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
282 value is greater than 377, the least significant 8 bits are taken. Inside a
283 character class, \ followed by a digit is always an octal number. */
284
285 case '1': case '2': case '3': case '4': case '5':
286 case '6': case '7': case '8': case '9':
287
288 if (!isclass)
289 {
290 oldptr = ptr;
291 c -= '0';
292 while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
293 c = c * 10 + *(++ptr) - '0';
294 if (c < 10 || c <= bracount)
295 {
296 c = -(ESC_REF + c);
297 break;
298 }
299 ptr = oldptr; /* Put the pointer back and fall through */
300 }
301
302 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
303 generates a binary zero byte and treats the digit as a following literal.
304 Thus we have to pull back the pointer by one. */
305
306 if ((c = *ptr) >= '8')
307 {
308 ptr--;
309 c = 0;
310 break;
311 }
312
313 /* \0 always starts an octal number, but we may drop through to here with a
314 larger first octal digit */
315
316 case '0':
317 c -= '0';
318 while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
319 ptr[1] != '8' && ptr[1] != '9')
320 c = c * 8 + *(++ptr) - '0';
321 break;
322
323 /* Special escapes not starting with a digit are straightforward */
324
325 case 'x':
326 c = 0;
327 while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
328 {
329 ptr++;
330 c = c * 16 + cd->lcc[*ptr] -
331 (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
332 }
333 break;
334
335 case 'c':
336 c = *(++ptr);
337 if (c == 0)
338 {
339 *errorptr = ERR2;
340 return 0;
341 }
342
343 /* A letter is upper-cased; then the 0x40 bit is flipped */
344
345 if (c >= 'a' && c <= 'z') c = cd->fcc[c];
346 c ^= 0x40;
347 break;
348
349 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
350 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
351 for Perl compatibility, it is a literal. This code looks a bit odd, but
352 there used to be some cases other than the default, and there may be again
353 in future, so I haven't "optimized" it. */
354
355 default:
356 if ((options & PCRE_EXTRA) != 0) switch(c)
357 {
358 default:
359 *errorptr = ERR3;
360 break;
361 }
362 break;
363 }
364 }
365
366 *ptrptr = ptr;
367 return c;
368 }
369
370
371
372 /*************************************************
373 * Check for counted repeat *
374 *************************************************/
375
376 /* This function is called when a '{' is encountered in a place where it might
377 start a quantifier. It looks ahead to see if it really is a quantifier or not.
378 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
379 where the ddds are digits.
380
381 Arguments:
382 p pointer to the first char after '{'
383 cd pointer to char tables block
384
385 Returns: TRUE or FALSE
386 */
387
388 static BOOL
389 is_counted_repeat(const uschar *p, compile_data *cd)
390 {
391 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
392 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
393 if (*p == '}') return TRUE;
394
395 if (*p++ != ',') return FALSE;
396 if (*p == '}') return TRUE;
397
398 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
399 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
400 return (*p == '}');
401 }
402
403
404
405 /*************************************************
406 * Read repeat counts *
407 *************************************************/
408
409 /* Read an item of the form {n,m} and return the values. This is called only
410 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
411 so the syntax is guaranteed to be correct, but we need to check the values.
412
413 Arguments:
414 p pointer to first char after '{'
415 minp pointer to int for min
416 maxp pointer to int for max
417 returned as -1 if no max
418 errorptr points to pointer to error message
419 cd pointer to character tables clock
420
421 Returns: pointer to '}' on success;
422 current ptr on error, with errorptr set
423 */
424
425 static const uschar *
426 read_repeat_counts(const uschar *p, int *minp, int *maxp,
427 const char **errorptr, compile_data *cd)
428 {
429 int min = 0;
430 int max = -1;
431
432 while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
433
434 if (*p == '}') max = min; else
435 {
436 if (*(++p) != '}')
437 {
438 max = 0;
439 while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
440 if (max < min)
441 {
442 *errorptr = ERR4;
443 return p;
444 }
445 }
446 }
447
448 /* Do paranoid checks, then fill in the required variables, and pass back the
449 pointer to the terminating '}'. */
450
451 if (min > 65535 || max > 65535)
452 *errorptr = ERR5;
453 else
454 {
455 *minp = min;
456 *maxp = max;
457 }
458 return p;
459 }
460
461
462
463 /*************************************************
464 * Find the fixed length of a pattern *
465 *************************************************/
466
467 /* Scan a pattern and compute the fixed length of subject that will match it,
468 if the length is fixed. This is needed for dealing with backward assertions.
469
470 Arguments:
471 code points to the start of the pattern (the bracket)
472
473 Returns: the fixed length, or -1 if there is no fixed length
474 */
475
476 static int
477 find_fixedlength(uschar *code)
478 {
479 int length = -1;
480
481 register int branchlength = 0;
482 register uschar *cc = code + 3;
483
484 /* Scan along the opcodes for this branch. If we get to the end of the
485 branch, check the length against that of the other branches. */
486
487 for (;;)
488 {
489 int d;
490 register int op = *cc;
491 if (op >= OP_BRA) op = OP_BRA;
492
493 switch (op)
494 {
495 case OP_BRA:
496 case OP_ONCE:
497 case OP_COND:
498 d = find_fixedlength(cc);
499 if (d < 0) return -1;
500 branchlength += d;
501 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
502 cc += 3;
503 break;
504
505 /* Reached end of a branch; if it's a ket it is the end of a nested
506 call. If it's ALT it is an alternation in a nested call. If it is
507 END it's the end of the outer call. All can be handled by the same code. */
508
509 case OP_ALT:
510 case OP_KET:
511 case OP_KETRMAX:
512 case OP_KETRMIN:
513 case OP_END:
514 if (length < 0) length = branchlength;
515 else if (length != branchlength) return -1;
516 if (*cc != OP_ALT) return length;
517 cc += 3;
518 branchlength = 0;
519 break;
520
521 /* Skip over assertive subpatterns */
522
523 case OP_ASSERT:
524 case OP_ASSERT_NOT:
525 case OP_ASSERTBACK:
526 case OP_ASSERTBACK_NOT:
527 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
528 cc += 3;
529 break;
530
531 /* Skip over things that don't match chars */
532
533 case OP_REVERSE:
534 cc++;
535
536 case OP_CREF:
537 case OP_OPT:
538 cc++;
539 /* Fall through */
540
541 case OP_SOD:
542 case OP_EOD:
543 case OP_EODN:
544 case OP_CIRC:
545 case OP_DOLL:
546 case OP_NOT_WORD_BOUNDARY:
547 case OP_WORD_BOUNDARY:
548 cc++;
549 break;
550
551 /* Handle char strings */
552
553 case OP_CHARS:
554 branchlength += *(++cc);
555 cc += *cc + 1;
556 break;
557
558 /* Handle exact repetitions */
559
560 case OP_EXACT:
561 case OP_TYPEEXACT:
562 branchlength += (cc[1] << 8) + cc[2];
563 cc += 4;
564 break;
565
566 /* Handle single-char matchers */
567
568 case OP_NOT_DIGIT:
569 case OP_DIGIT:
570 case OP_NOT_WHITESPACE:
571 case OP_WHITESPACE:
572 case OP_NOT_WORDCHAR:
573 case OP_WORDCHAR:
574 case OP_ANY:
575 branchlength++;
576 cc++;
577 break;
578
579
580 /* Check a class for variable quantification */
581
582 case OP_CLASS:
583 cc += (*cc == OP_REF)? 2 : 33;
584
585 switch (*cc)
586 {
587 case OP_CRSTAR:
588 case OP_CRMINSTAR:
589 case OP_CRQUERY:
590 case OP_CRMINQUERY:
591 return -1;
592
593 case OP_CRRANGE:
594 case OP_CRMINRANGE:
595 if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;
596 branchlength += (cc[1] << 8) + cc[2];
597 cc += 5;
598 break;
599
600 default:
601 branchlength++;
602 }
603 break;
604
605 /* Anything else is variable length */
606
607 default:
608 return -1;
609 }
610 }
611 /* Control never gets here */
612 }
613
614
615
616
617 /*************************************************
618 * Compile one branch *
619 *************************************************/
620
621 /* Scan the pattern, compiling it into the code vector.
622
623 Arguments:
624 options the option bits
625 brackets points to number of brackets used
626 code points to the pointer to the current code point
627 ptrptr points to the current pattern pointer
628 errorptr points to pointer to error message
629 optchanged set to the value of the last OP_OPT item compiled
630 cd contains pointers to tables
631
632 Returns: TRUE on success
633 FALSE, with *errorptr set on error
634 */
635
636 static BOOL
637 compile_branch(int options, int *brackets, uschar **codeptr,
638 const uschar **ptrptr, const char **errorptr, int *optchanged,
639 compile_data *cd)
640 {
641 int repeat_type, op_type;
642 int repeat_min, repeat_max;
643 int bravalue, length;
644 int greedy_default, greedy_non_default;
645 register int c;
646 register uschar *code = *codeptr;
647 uschar *tempcode;
648 const uschar *ptr = *ptrptr;
649 const uschar *tempptr;
650 uschar *previous = NULL;
651 uschar class[32];
652
653 /* Set up the default and non-default settings for greediness */
654
655 greedy_default = ((options & PCRE_UNGREEDY) != 0);
656 greedy_non_default = greedy_default ^ 1;
657
658 /* Switch on next character until the end of the branch */
659
660 for (;; ptr++)
661 {
662 BOOL negate_class;
663 int class_charcount;
664 int class_lastchar;
665 int newoptions;
666 int condref;
667
668 c = *ptr;
669 if ((options & PCRE_EXTENDED) != 0)
670 {
671 if ((cd->ctypes[c] & ctype_space) != 0) continue;
672 if (c == '#')
673 {
674 while ((c = *(++ptr)) != 0 && c != '\n');
675 continue;
676 }
677 }
678
679 switch(c)
680 {
681 /* The branch terminates at end of string, |, or ). */
682
683 case 0:
684 case '|':
685 case ')':
686 *codeptr = code;
687 *ptrptr = ptr;
688 return TRUE;
689
690 /* Handle single-character metacharacters */
691
692 case '^':
693 previous = NULL;
694 *code++ = OP_CIRC;
695 break;
696
697 case '$':
698 previous = NULL;
699 *code++ = OP_DOLL;
700 break;
701
702 case '.':
703 previous = code;
704 *code++ = OP_ANY;
705 break;
706
707 /* Character classes. These always build a 32-byte bitmap of the permitted
708 characters, except in the special case where there is only one character.
709 For negated classes, we build the map as usual, then invert it at the end.
710 */
711
712 case '[':
713 previous = code;
714 *code++ = OP_CLASS;
715
716 /* If the first character is '^', set the negation flag and skip it. */
717
718 if ((c = *(++ptr)) == '^')
719 {
720 negate_class = TRUE;
721 c = *(++ptr);
722 }
723 else negate_class = FALSE;
724
725 /* Keep a count of chars so that we can optimize the case of just a single
726 character. */
727
728 class_charcount = 0;
729 class_lastchar = -1;
730
731 /* Initialize the 32-char bit map to all zeros. We have to build the
732 map in a temporary bit of store, in case the class contains only 1
733 character, because in that case the compiled code doesn't use the
734 bit map. */
735
736 memset(class, 0, 32 * sizeof(uschar));
737
738 /* Process characters until ] is reached. By writing this as a "do" it
739 means that an initial ] is taken as a data character. */
740
741 do
742 {
743 if (c == 0)
744 {
745 *errorptr = ERR6;
746 goto FAILED;
747 }
748
749 /* Backslash may introduce a single character, or it may introduce one
750 of the specials, which just set a flag. Escaped items are checked for
751 validity in the pre-compiling pass. The sequence \b is a special case.
752 Inside a class (and only there) it is treated as backspace. Elsewhere
753 it marks a word boundary. Other escapes have preset maps ready to
754 or into the one we are building. We assume they have more than one
755 character in them, so set class_count bigger than one. */
756
757 if (c == '\\')
758 {
759 c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
760 if (-c == ESC_b) c = '\b';
761 else if (c < 0)
762 {
763 register const uschar *cbits = cd->cbits;
764 class_charcount = 10;
765 switch (-c)
766 {
767 case ESC_d:
768 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
769 continue;
770
771 case ESC_D:
772 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
773 continue;
774
775 case ESC_w:
776 for (c = 0; c < 32; c++)
777 class[c] |= (cbits[c+cbit_digit] | cbits[c+cbit_word]);
778 continue;
779
780 case ESC_W:
781 for (c = 0; c < 32; c++)
782 class[c] |= ~(cbits[c+cbit_digit] | cbits[c+cbit_word]);
783 continue;
784
785 case ESC_s:
786 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
787 continue;
788
789 case ESC_S:
790 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
791 continue;
792
793 default:
794 *errorptr = ERR7;
795 goto FAILED;
796 }
797 }
798 /* Fall through if single character */
799 }
800
801 /* A single character may be followed by '-' to form a range. However,
802 Perl does not permit ']' to be the end of the range. A '-' character
803 here is treated as a literal. */
804
805 if (ptr[1] == '-' && ptr[2] != ']')
806 {
807 int d;
808 ptr += 2;
809 d = *ptr;
810
811 if (d == 0)
812 {
813 *errorptr = ERR6;
814 goto FAILED;
815 }
816
817 /* The second part of a range can be a single-character escape, but
818 not any of the other escapes. */
819
820 if (d == '\\')
821 {
822 d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
823 if (d < 0)
824 {
825 if (d == -ESC_b) d = '\b'; else
826 {
827 *errorptr = ERR7;
828 goto FAILED;
829 }
830 }
831 }
832
833 if (d < c)
834 {
835 *errorptr = ERR8;
836 goto FAILED;
837 }
838
839 for (; c <= d; c++)
840 {
841 class[c/8] |= (1 << (c&7));
842 if ((options & PCRE_CASELESS) != 0)
843 {
844 int uc = cd->fcc[c]; /* flip case */
845 class[uc/8] |= (1 << (uc&7));
846 }
847 class_charcount++; /* in case a one-char range */
848 class_lastchar = c;
849 }
850 continue; /* Go get the next char in the class */
851 }
852
853 /* Handle a lone single character - we can get here for a normal
854 non-escape char, or after \ that introduces a single character. */
855
856 class [c/8] |= (1 << (c&7));
857 if ((options & PCRE_CASELESS) != 0)
858 {
859 c = cd->fcc[c]; /* flip case */
860 class[c/8] |= (1 << (c&7));
861 }
862 class_charcount++;
863 class_lastchar = c;
864 }
865
866 /* Loop until ']' reached; the check for end of string happens inside the
867 loop. This "while" is the end of the "do" above. */
868
869 while ((c = *(++ptr)) != ']');
870
871 /* If class_charcount is 1 and class_lastchar is not negative, we saw
872 precisely one character. This doesn't need the whole 32-byte bit map.
873 We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
874 it's negative. */
875
876 if (class_charcount == 1 && class_lastchar >= 0)
877 {
878 if (negate_class)
879 {
880 code[-1] = OP_NOT;
881 }
882 else
883 {
884 code[-1] = OP_CHARS;
885 *code++ = 1;
886 }
887 *code++ = class_lastchar;
888 }
889
890 /* Otherwise, negate the 32-byte map if necessary, and copy it into
891 the code vector. */
892
893 else
894 {
895 if (negate_class)
896 for (c = 0; c < 32; c++) code[c] = ~class[c];
897 else
898 memcpy(code, class, 32);
899 code += 32;
900 }
901 break;
902
903 /* Various kinds of repeat */
904
905 case '{':
906 if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
907 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
908 if (*errorptr != NULL) goto FAILED;
909 goto REPEAT;
910
911 case '*':
912 repeat_min = 0;
913 repeat_max = -1;
914 goto REPEAT;
915
916 case '+':
917 repeat_min = 1;
918 repeat_max = -1;
919 goto REPEAT;
920
921 case '?':
922 repeat_min = 0;
923 repeat_max = 1;
924
925 REPEAT:
926 if (previous == NULL)
927 {
928 *errorptr = ERR9;
929 goto FAILED;
930 }
931
932 /* If the next character is '?' this is a minimizing repeat, by default,
933 but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
934 next character. */
935
936 if (ptr[1] == '?')
937 { repeat_type = greedy_non_default; ptr++; }
938 else repeat_type = greedy_default;
939
940 /* If the maximum is zero then the minimum must also be zero; Perl allows
941 this case, so we do too - by simply omitting the item altogether. */
942
943 if (repeat_max == 0) code = previous;
944
945 /* If previous was a string of characters, chop off the last one and use it
946 as the subject of the repeat. If there was only one character, we can
947 abolish the previous item altogether. */
948
949 else if (*previous == OP_CHARS)
950 {
951 int len = previous[1];
952 if (len == 1)
953 {
954 c = previous[2];
955 code = previous;
956 }
957 else
958 {
959 c = previous[len+1];
960 previous[1]--;
961 code--;
962 }
963 op_type = 0; /* Use single-char op codes */
964 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
965 }
966
967 /* If previous was a single negated character ([^a] or similar), we use
968 one of the special opcodes, replacing it. The code is shared with single-
969 character repeats by adding a suitable offset into repeat_type. */
970
971 else if ((int)*previous == OP_NOT)
972 {
973 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
974 c = previous[1];
975 code = previous;
976 goto OUTPUT_SINGLE_REPEAT;
977 }
978
979 /* If previous was a character type match (\d or similar), abolish it and
980 create a suitable repeat item. The code is shared with single-character
981 repeats by adding a suitable offset into repeat_type. */
982
983 else if ((int)*previous < OP_EODN || *previous == OP_ANY)
984 {
985 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
986 c = *previous;
987 code = previous;
988
989 OUTPUT_SINGLE_REPEAT:
990 repeat_type += op_type; /* Combine both values for many cases */
991
992 /* A minimum of zero is handled either as the special case * or ?, or as
993 an UPTO, with the maximum given. */
994
995 if (repeat_min == 0)
996 {
997 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
998 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
999 else
1000 {
1001 *code++ = OP_UPTO + repeat_type;
1002 *code++ = repeat_max >> 8;
1003 *code++ = (repeat_max & 255);
1004 }
1005 }
1006
1007 /* The case {1,} is handled as the special case + */
1008
1009 else if (repeat_min == 1 && repeat_max == -1)
1010 *code++ = OP_PLUS + repeat_type;
1011
1012 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1013 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1014
1015 else
1016 {
1017 if (repeat_min != 1)
1018 {
1019 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1020 *code++ = repeat_min >> 8;
1021 *code++ = (repeat_min & 255);
1022 }
1023
1024 /* If the mininum is 1 and the previous item was a character string,
1025 we either have to put back the item that got cancelled if the string
1026 length was 1, or add the character back onto the end of a longer
1027 string. For a character type nothing need be done; it will just get
1028 put back naturally. Note that the final character is always going to
1029 get added below. */
1030
1031 else if (*previous == OP_CHARS)
1032 {
1033 if (code == previous) code += 2; else previous[1]++;
1034 }
1035
1036 /* For a single negated character we also have to put back the
1037 item that got cancelled. */
1038
1039 else if (*previous == OP_NOT) code++;
1040
1041 /* If the maximum is unlimited, insert an OP_STAR. */
1042
1043 if (repeat_max < 0)
1044 {
1045 *code++ = c;
1046 *code++ = OP_STAR + repeat_type;
1047 }
1048
1049 /* Else insert an UPTO if the max is greater than the min. */
1050
1051 else if (repeat_max != repeat_min)
1052 {
1053 *code++ = c;
1054 repeat_max -= repeat_min;
1055 *code++ = OP_UPTO + repeat_type;
1056 *code++ = repeat_max >> 8;
1057 *code++ = (repeat_max & 255);
1058 }
1059 }
1060
1061 /* The character or character type itself comes last in all cases. */
1062
1063 *code++ = c;
1064 }
1065
1066 /* If previous was a character class or a back reference, we put the repeat
1067 stuff after it. */
1068
1069 else if (*previous == OP_CLASS || *previous == OP_REF)
1070 {
1071 if (repeat_min == 0 && repeat_max == -1)
1072 *code++ = OP_CRSTAR + repeat_type;
1073 else if (repeat_min == 1 && repeat_max == -1)
1074 *code++ = OP_CRPLUS + repeat_type;
1075 else if (repeat_min == 0 && repeat_max == 1)
1076 *code++ = OP_CRQUERY + repeat_type;
1077 else
1078 {
1079 *code++ = OP_CRRANGE + repeat_type;
1080 *code++ = repeat_min >> 8;
1081 *code++ = repeat_min & 255;
1082 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1083 *code++ = repeat_max >> 8;
1084 *code++ = repeat_max & 255;
1085 }
1086 }
1087
1088 /* If previous was a bracket group, we may have to replicate it in certain
1089 cases. */
1090
1091 else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
1092 (int)*previous == OP_COND)
1093 {
1094 int i, ketoffset = 0;
1095 int len = code - previous;
1096
1097 /* If the maximum repeat count is unlimited, find the end of the bracket
1098 by scanning through from the start, and compute the offset back to it
1099 from the current code pointer. There may be an OP_OPT setting following
1100 the final KET, so we can't find the end just by going back from the code
1101 pointer. */
1102
1103 if (repeat_max == -1)
1104 {
1105 register uschar *ket = previous;
1106 do ket += (ket[1] << 8) + ket[2]; while (*ket != OP_KET);
1107 ketoffset = code - ket;
1108 }
1109
1110 /* If the minimum is greater than zero, and the maximum is unlimited or
1111 equal to the minimum, the first copy remains where it is, and is
1112 replicated up to the minimum number of times. This case includes the +
1113 repeat, but of course no replication is needed in that case. */
1114
1115 if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1116 {
1117 for (i = 1; i < repeat_min; i++)
1118 {
1119 memcpy(code, previous, len);
1120 code += len;
1121 }
1122 }
1123
1124 /* If the minimum is zero, stick BRAZERO in front of the first copy.
1125 Then, if there is a fixed upper limit, replicated up to that many times,
1126 sticking BRAZERO in front of all the optional ones. */
1127
1128 else
1129 {
1130 if (repeat_min == 0)
1131 {
1132 memmove(previous+1, previous, len);
1133 code++;
1134 *previous++ = OP_BRAZERO + repeat_type;
1135 }
1136
1137 for (i = 1; i < repeat_min; i++)
1138 {
1139 memcpy(code, previous, len);
1140 code += len;
1141 }
1142
1143 for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1144 {
1145 *code++ = OP_BRAZERO + repeat_type;
1146 memcpy(code, previous, len);
1147 code += len;
1148 }
1149 }
1150
1151 /* If the maximum is unlimited, set a repeater in the final copy. We
1152 can't just offset backwards from the current code point, because we
1153 don't know if there's been an options resetting after the ket. The
1154 correct offset was computed above. */
1155
1156 if (repeat_max == -1) code[-ketoffset] = OP_KETRMAX + repeat_type;
1157 }
1158
1159 /* Else there's some kind of shambles */
1160
1161 else
1162 {
1163 *errorptr = ERR11;
1164 goto FAILED;
1165 }
1166
1167 /* In all case we no longer have a previous item. */
1168
1169 previous = NULL;
1170 break;
1171
1172
1173 /* Start of nested bracket sub-expression, or comment or lookahead or
1174 lookbehind or option setting or condition. First deal with special things
1175 that can come after a bracket; all are introduced by ?, and the appearance
1176 of any of them means that this is not a referencing group. They were
1177 checked for validity in the first pass over the string, so we don't have to
1178 check for syntax errors here. */
1179
1180 case '(':
1181 newoptions = options;
1182 condref = -1;
1183
1184 if (*(++ptr) == '?')
1185 {
1186 int set, unset;
1187 int *optset;
1188
1189 switch (*(++ptr))
1190 {
1191 case '#': /* Comment; skip to ket */
1192 ptr++;
1193 while (*ptr != ')') ptr++;
1194 continue;
1195
1196 case ':': /* Non-extracting bracket */
1197 bravalue = OP_BRA;
1198 ptr++;
1199 break;
1200
1201 case '(':
1202 bravalue = OP_COND; /* Conditional group */
1203 if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
1204 {
1205 condref = *ptr - '0';
1206 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
1207 ptr++;
1208 }
1209 else ptr--;
1210 break;
1211
1212 case '=': /* Positive lookahead */
1213 bravalue = OP_ASSERT;
1214 ptr++;
1215 break;
1216
1217 case '!': /* Negative lookahead */
1218 bravalue = OP_ASSERT_NOT;
1219 ptr++;
1220 break;
1221
1222 case '<': /* Lookbehinds */
1223 switch (*(++ptr))
1224 {
1225 case '=': /* Positive lookbehind */
1226 bravalue = OP_ASSERTBACK;
1227 ptr++;
1228 break;
1229
1230 case '!': /* Negative lookbehind */
1231 bravalue = OP_ASSERTBACK_NOT;
1232 ptr++;
1233 break;
1234
1235 default: /* Syntax error */
1236 *errorptr = ERR24;
1237 goto FAILED;
1238 }
1239 break;
1240
1241 case '>': /* One-time brackets */
1242 bravalue = OP_ONCE;
1243 ptr++;
1244 break;
1245
1246 default: /* Option setting */
1247 set = unset = 0;
1248 optset = &set;
1249
1250 while (*ptr != ')' && *ptr != ':')
1251 {
1252 switch (*ptr++)
1253 {
1254 case '-': optset = &unset; break;
1255
1256 case 'i': *optset |= PCRE_CASELESS; break;
1257 case 'm': *optset |= PCRE_MULTILINE; break;
1258 case 's': *optset |= PCRE_DOTALL; break;
1259 case 'x': *optset |= PCRE_EXTENDED; break;
1260 case 'U': *optset |= PCRE_UNGREEDY; break;
1261 case 'X': *optset |= PCRE_EXTRA; break;
1262
1263 default:
1264 *errorptr = ERR12;
1265 goto FAILED;
1266 }
1267 }
1268
1269 /* Set up the changed option bits, but don't change anything yet. */
1270
1271 newoptions = (options | set) & (~unset);
1272
1273 /* If the options ended with ')' this is not the start of a nested
1274 group with option changes, so the options change at this level. At top
1275 level there is nothing else to be done (the options will in fact have
1276 been set from the start of compiling as a result of the first pass) but
1277 at an inner level we must compile code to change the ims options if
1278 necessary, and pass the new setting back so that it can be put at the
1279 start of any following branches, and when this group ends, a resetting
1280 item can be compiled. */
1281
1282 if (*ptr == ')')
1283 {
1284 if ((options & PCRE_INGROUP) != 0 &&
1285 (options & PCRE_IMS) != (newoptions & PCRE_IMS))
1286 {
1287 *code++ = OP_OPT;
1288 *code++ = *optchanged = newoptions & PCRE_IMS;
1289 }
1290 options = newoptions; /* Change options at this level */
1291 previous = NULL; /* This item can't be repeated */
1292 continue; /* It is complete */
1293 }
1294
1295 /* If the options ended with ':' we are heading into a nested group
1296 with possible change of options. Such groups are non-capturing and are
1297 not assertions of any kind. All we need to do is skip over the ':';
1298 the newoptions value is handled below. */
1299
1300 bravalue = OP_BRA;
1301 ptr++;
1302 }
1303 }
1304
1305 /* Else we have a referencing group; adjust the opcode. */
1306
1307 else
1308 {
1309 if (++(*brackets) > EXTRACT_MAX)
1310 {
1311 *errorptr = ERR13;
1312 goto FAILED;
1313 }
1314 bravalue = OP_BRA + *brackets;
1315 }
1316
1317 /* Process nested bracketed re. Assertions may not be repeated, but other
1318 kinds can be. We copy code into a non-register variable in order to be able
1319 to pass its address because some compilers complain otherwise. Pass in a
1320 new setting for the ims options if they have changed. */
1321
1322 previous = (bravalue >= OP_ONCE)? code : NULL;
1323 *code = bravalue;
1324 tempcode = code;
1325
1326 if (!compile_regex(
1327 options | PCRE_INGROUP, /* Set for all nested groups */
1328 ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
1329 newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
1330 brackets, /* Bracket level */
1331 &tempcode, /* Where to put code (updated) */
1332 &ptr, /* Input pointer (updated) */
1333 errorptr, /* Where to put an error message */
1334 (bravalue == OP_ASSERTBACK ||
1335 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1336 condref, /* Condition reference number */
1337 cd)) /* Tables block */
1338 goto FAILED;
1339
1340 /* At the end of compiling, code is still pointing to the start of the
1341 group, while tempcode has been updated to point past the end of the group
1342 and any option resetting that may follow it. The pattern pointer (ptr)
1343 is on the bracket. */
1344
1345 /* If this is a conditional bracket, check that there are no more than
1346 two branches in the group. */
1347
1348 if (bravalue == OP_COND)
1349 {
1350 int branchcount = 0;
1351 uschar *tc = code;
1352
1353 do {
1354 branchcount++;
1355 tc += (tc[1] << 8) | tc[2];
1356 }
1357 while (*tc != OP_KET);
1358
1359 if (branchcount > 2)
1360 {
1361 *errorptr = ERR27;
1362 goto FAILED;
1363 }
1364 }
1365
1366 /* Now update the main code pointer to the end of the group. */
1367
1368 code = tempcode;
1369
1370 /* Error if hit end of pattern */
1371
1372 if (*ptr != ')')
1373 {
1374 *errorptr = ERR14;
1375 goto FAILED;
1376 }
1377 break;
1378
1379 /* Check \ for being a real metacharacter; if not, fall through and handle
1380 it as a data character at the start of a string. Escape items are checked
1381 for validity in the pre-compiling pass. */
1382
1383 case '\\':
1384 tempptr = ptr;
1385 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1386
1387 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1388 are arranged to be the negation of the corresponding OP_values. For the
1389 back references, the values are ESC_REF plus the reference number. Only
1390 back references and those types that consume a character may be repeated.
1391 We can test for values between ESC_b and ESC_Z for the latter; this may
1392 have to change if any new ones are ever created. */
1393
1394 if (c < 0)
1395 {
1396 if (-c >= ESC_REF)
1397 {
1398 previous = code;
1399 *code++ = OP_REF;
1400 *code++ = -c - ESC_REF;
1401 }
1402 else
1403 {
1404 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
1405 *code++ = -c;
1406 }
1407 continue;
1408 }
1409
1410 /* Data character: reset and fall through */
1411
1412 ptr = tempptr;
1413 c = '\\';
1414
1415 /* Handle a run of data characters until a metacharacter is encountered.
1416 The first character is guaranteed not to be whitespace or # when the
1417 extended flag is set. */
1418
1419 NORMAL_CHAR:
1420 default:
1421 previous = code;
1422 *code = OP_CHARS;
1423 code += 2;
1424 length = 0;
1425
1426 do
1427 {
1428 if ((options & PCRE_EXTENDED) != 0)
1429 {
1430 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1431 if (c == '#')
1432 {
1433 while ((c = *(++ptr)) != 0 && c != '\n');
1434 if (c == 0) break;
1435 continue;
1436 }
1437 }
1438
1439 /* Backslash may introduce a data char or a metacharacter. Escaped items
1440 are checked for validity in the pre-compiling pass. Stop the string
1441 before a metaitem. */
1442
1443 if (c == '\\')
1444 {
1445 tempptr = ptr;
1446 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1447 if (c < 0) { ptr = tempptr; break; }
1448 }
1449
1450 /* Ordinary character or single-char escape */
1451
1452 *code++ = c;
1453 length++;
1454 }
1455
1456 /* This "while" is the end of the "do" above. */
1457
1458 while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
1459
1460 /* Compute the length and set it in the data vector, and advance to
1461 the next state. */
1462
1463 previous[1] = length;
1464 if (length < 255) ptr--;
1465 break;
1466 }
1467 } /* end of big loop */
1468
1469 /* Control never reaches here by falling through, only by a goto for all the
1470 error states. Pass back the position in the pattern so that it can be displayed
1471 to the user for diagnosing the error. */
1472
1473 FAILED:
1474 *ptrptr = ptr;
1475 return FALSE;
1476 }
1477
1478
1479
1480
1481 /*************************************************
1482 * Compile sequence of alternatives *
1483 *************************************************/
1484
1485 /* On entry, ptr is pointing past the bracket character, but on return
1486 it points to the closing bracket, or vertical bar, or end of string.
1487 The code variable is pointing at the byte into which the BRA operator has been
1488 stored. If the ims options are changed at the start (for a (?ims: group) or
1489 during any branch, we need to insert an OP_OPT item at the start of every
1490 following branch to ensure they get set correctly at run time, and also pass
1491 the new options into every subsequent branch compile.
1492
1493 Argument:
1494 options the option bits
1495 optchanged new ims options to set as if (?ims) were at the start, or -1
1496 for no change
1497 brackets -> int containing the number of extracting brackets used
1498 codeptr -> the address of the current code pointer
1499 ptrptr -> the address of the current pattern pointer
1500 errorptr -> pointer to error message
1501 lookbehind TRUE if this is a lookbehind assertion
1502 condref > 0 for OPT_CREF setting at start of conditional group
1503 cd points to the data block with tables pointers
1504
1505 Returns: TRUE on success
1506 */
1507
1508 static BOOL
1509 compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
1510 const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,
1511 compile_data *cd)
1512 {
1513 const uschar *ptr = *ptrptr;
1514 uschar *code = *codeptr;
1515 uschar *last_branch = code;
1516 uschar *start_bracket = code;
1517 uschar *reverse_count = NULL;
1518 int oldoptions = options & PCRE_IMS;
1519
1520 code += 3;
1521
1522 /* At the start of a reference-based conditional group, insert the reference
1523 number as an OP_CREF item. */
1524
1525 if (condref > 0)
1526 {
1527 *code++ = OP_CREF;
1528 *code++ = condref;
1529 }
1530
1531 /* Loop for each alternative branch */
1532
1533 for (;;)
1534 {
1535 int length;
1536
1537 /* Handle change of options */
1538
1539 if (optchanged >= 0)
1540 {
1541 *code++ = OP_OPT;
1542 *code++ = optchanged;
1543 options = (options & ~PCRE_IMS) | optchanged;
1544 }
1545
1546 /* Set up dummy OP_REVERSE if lookbehind assertion */
1547
1548 if (lookbehind)
1549 {
1550 *code++ = OP_REVERSE;
1551 reverse_count = code;
1552 *code++ = 0;
1553 *code++ = 0;
1554 }
1555
1556 /* Now compile the branch */
1557
1558 if (!compile_branch(options,brackets,&code,&ptr,errorptr,&optchanged,cd))
1559 {
1560 *ptrptr = ptr;
1561 return FALSE;
1562 }
1563
1564 /* Fill in the length of the last branch */
1565
1566 length = code - last_branch;
1567 last_branch[1] = length >> 8;
1568 last_branch[2] = length & 255;
1569
1570 /* If lookbehind, check that this branch matches a fixed-length string,
1571 and put the length into the OP_REVERSE item. Temporarily mark the end of
1572 the branch with OP_END. */
1573
1574 if (lookbehind)
1575 {
1576 *code = OP_END;
1577 length = find_fixedlength(last_branch);
1578 DPRINTF(("fixed length = %d\n", length));
1579 if (length < 0)
1580 {
1581 *errorptr = ERR25;
1582 *ptrptr = ptr;
1583 return FALSE;
1584 }
1585 reverse_count[0] = (length >> 8);
1586 reverse_count[1] = length & 255;
1587 }
1588
1589 /* Reached end of expression, either ')' or end of pattern. Insert a
1590 terminating ket and the length of the whole bracketed item, and return,
1591 leaving the pointer at the terminating char. If any of the ims options
1592 were changed inside the group, compile a resetting op-code following. */
1593
1594 if (*ptr != '|')
1595 {
1596 length = code - start_bracket;
1597 *code++ = OP_KET;
1598 *code++ = length >> 8;
1599 *code++ = length & 255;
1600 if (optchanged >= 0)
1601 {
1602 *code++ = OP_OPT;
1603 *code++ = oldoptions;
1604 }
1605 *codeptr = code;
1606 *ptrptr = ptr;
1607 return TRUE;
1608 }
1609
1610 /* Another branch follows; insert an "or" node and advance the pointer. */
1611
1612 *code = OP_ALT;
1613 last_branch = code;
1614 code += 3;
1615 ptr++;
1616 }
1617 /* Control never reaches here */
1618 }
1619
1620
1621
1622
1623 /*************************************************
1624 * Find first significant op code *
1625 *************************************************/
1626
1627 /* This is called by several functions that scan a compiled expression looking
1628 for a fixed first character, or an anchoring op code etc. It skips over things
1629 that do not influence this. For one application, a change of caseless option is
1630 important.
1631
1632 Arguments:
1633 code pointer to the start of the group
1634 options pointer to external options
1635 optbit the option bit whose changing is significant, or
1636 zero if none are
1637 optstop TRUE to return on option change, otherwise change the options
1638 value and continue
1639
1640 Returns: pointer to the first significant opcode
1641 */
1642
1643 static const uschar*
1644 first_significant_code(const uschar *code, int *options, int optbit,
1645 BOOL optstop)
1646 {
1647 for (;;)
1648 {
1649 switch ((int)*code)
1650 {
1651 case OP_OPT:
1652 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1653 {
1654 if (optstop) return code;
1655 *options = (int)code[1];
1656 }
1657 code += 2;
1658 break;
1659
1660 case OP_CREF:
1661 code += 2;
1662 break;
1663
1664 case OP_ASSERT_NOT:
1665 case OP_ASSERTBACK:
1666 case OP_ASSERTBACK_NOT:
1667 do code += (code[1] << 8) + code[2]; while (*code == OP_ALT);
1668 code += 3;
1669 break;
1670
1671 default:
1672 return code;
1673 }
1674 }
1675 /* Control never reaches here */
1676 }
1677
1678
1679
1680
1681 /*************************************************
1682 * Check for anchored expression *
1683 *************************************************/
1684
1685 /* Try to find out if this is an anchored regular expression. Consider each
1686 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
1687 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
1688 it's anchored. However, if this is a multiline pattern, then only OP_SOD
1689 counts, since OP_CIRC can match in the middle.
1690
1691 A branch is also implicitly anchored if it starts with .* because that will try
1692 the rest of the pattern at all possible matching points, so there is no point
1693 trying them again.
1694
1695 Arguments:
1696 code points to start of expression (the bracket)
1697 options points to the options setting
1698
1699 Returns: TRUE or FALSE
1700 */
1701
1702 static BOOL
1703 is_anchored(register const uschar *code, int *options)
1704 {
1705 do {
1706 const uschar *scode = first_significant_code(code + 3, options,
1707 PCRE_MULTILINE, FALSE);
1708 register int op = *scode;
1709 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
1710 { if (!is_anchored(scode, options)) return FALSE; }
1711 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
1712 { if (scode[1] != OP_ANY) return FALSE; }
1713 else if (op != OP_SOD &&
1714 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
1715 return FALSE;
1716 code += (code[1] << 8) + code[2];
1717 }
1718 while (*code == OP_ALT);
1719 return TRUE;
1720 }
1721
1722
1723
1724 /*************************************************
1725 * Check for start with \n line expression *
1726 *************************************************/
1727
1728 /* This is called for multiline expressions to try to find out if every branch
1729 starts with ^ so that "first char" processing can be done to speed things up.
1730
1731 Argument: points to start of expression (the bracket)
1732 Returns: TRUE or FALSE
1733 */
1734
1735 static BOOL
1736 is_startline(const uschar *code)
1737 {
1738 do {
1739 const uschar *scode = first_significant_code(code + 3, NULL, 0, FALSE);
1740 register int op = *scode;
1741 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
1742 { if (!is_startline(scode)) return FALSE; }
1743 else if (op != OP_CIRC) return FALSE;
1744 code += (code[1] << 8) + code[2];
1745 }
1746 while (*code == OP_ALT);
1747 return TRUE;
1748 }
1749
1750
1751
1752 /*************************************************
1753 * Check for fixed first char *
1754 *************************************************/
1755
1756 /* Try to find out if there is a fixed first character. This is called for
1757 unanchored expressions, as it speeds up their processing quite considerably.
1758 Consider each alternative branch. If they all start with the same char, or with
1759 a bracket all of whose alternatives start with the same char (recurse ad lib),
1760 then we return that char, otherwise -1.
1761
1762 Arguments:
1763 code points to start of expression (the bracket)
1764 options pointer to the options (used to check casing changes)
1765
1766 Returns: -1 or the fixed first char
1767 */
1768
1769 static int
1770 find_firstchar(const uschar *code, int *options)
1771 {
1772 register int c = -1;
1773 do {
1774 int d;
1775 const uschar *scode = first_significant_code(code + 3, options,
1776 PCRE_CASELESS, TRUE);
1777 register int op = *scode;
1778
1779 if (op >= OP_BRA) op = OP_BRA;
1780
1781 switch(op)
1782 {
1783 default:
1784 return -1;
1785
1786 case OP_BRA:
1787 case OP_ASSERT:
1788 case OP_ONCE:
1789 case OP_COND:
1790 if ((d = find_firstchar(scode, options)) < 0) return -1;
1791 if (c < 0) c = d; else if (c != d) return -1;
1792 break;
1793
1794 case OP_EXACT: /* Fall through */
1795 scode++;
1796
1797 case OP_CHARS: /* Fall through */
1798 scode++;
1799
1800 case OP_PLUS:
1801 case OP_MINPLUS:
1802 if (c < 0) c = scode[1]; else if (c != scode[1]) return -1;
1803 break;
1804 }
1805
1806 code += (code[1] << 8) + code[2];
1807 }
1808 while (*code == OP_ALT);
1809 return c;
1810 }
1811
1812
1813
1814
1815
1816 /*************************************************
1817 * Compile a Regular Expression *
1818 *************************************************/
1819
1820 /* This function takes a string and returns a pointer to a block of store
1821 holding a compiled version of the expression.
1822
1823 Arguments:
1824 pattern the regular expression
1825 options various option bits
1826 errorptr pointer to pointer to error text
1827 erroroffset ptr offset in pattern where error was detected
1828 tables pointer to character tables or NULL
1829
1830 Returns: pointer to compiled data block, or NULL on error,
1831 with errorptr and erroroffset set
1832 */
1833
1834 pcre *
1835 pcre_compile(const char *pattern, int options, const char **errorptr,
1836 int *erroroffset, const unsigned char *tables)
1837 {
1838 real_pcre *re;
1839 int length = 3; /* For initial BRA plus length */
1840 int runlength;
1841 int c, size;
1842 int bracount = 0;
1843 int top_backref = 0;
1844 int branch_extra = 0;
1845 int branch_newextra;
1846 unsigned int brastackptr = 0;
1847 uschar *code;
1848 const uschar *ptr;
1849 compile_data compile_block;
1850 int brastack[BRASTACK_SIZE];
1851 uschar bralenstack[BRASTACK_SIZE];
1852
1853 #ifdef DEBUG
1854 uschar *code_base, *code_end;
1855 #endif
1856
1857 /* We can't pass back an error message if errorptr is NULL; I guess the best we
1858 can do is just return NULL. */
1859
1860 if (errorptr == NULL) return NULL;
1861 *errorptr = NULL;
1862
1863 /* However, we can give a message for this error */
1864
1865 if (erroroffset == NULL)
1866 {
1867 *errorptr = ERR16;
1868 return NULL;
1869 }
1870 *erroroffset = 0;
1871
1872 if ((options & ~PUBLIC_OPTIONS) != 0)
1873 {
1874 *errorptr = ERR17;
1875 return NULL;
1876 }
1877
1878 /* Set up pointers to the individual character tables */
1879
1880 if (tables == NULL) tables = pcre_default_tables;
1881 compile_block.lcc = tables + lcc_offset;
1882 compile_block.fcc = tables + fcc_offset;
1883 compile_block.cbits = tables + cbits_offset;
1884 compile_block.ctypes = tables + ctypes_offset;
1885
1886 /* Reflect pattern for debugging output */
1887
1888 DPRINTF(("------------------------------------------------------------------\n"));
1889 DPRINTF(("%s\n", pattern));
1890
1891 /* The first thing to do is to make a pass over the pattern to compute the
1892 amount of store required to hold the compiled code. This does not have to be
1893 perfect as long as errors are overestimates. At the same time we can detect any
1894 internal flag settings. Make an attempt to correct for any counted white space
1895 if an "extended" flag setting appears late in the pattern. We can't be so
1896 clever for #-comments. */
1897
1898 ptr = (const uschar *)(pattern - 1);
1899 while ((c = *(++ptr)) != 0)
1900 {
1901 int min, max;
1902 int class_charcount;
1903
1904 if ((options & PCRE_EXTENDED) != 0)
1905 {
1906 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
1907 if (c == '#')
1908 {
1909 while ((c = *(++ptr)) != 0 && c != '\n');
1910 continue;
1911 }
1912 }
1913
1914 switch(c)
1915 {
1916 /* A backslashed item may be an escaped "normal" character or a
1917 character type. For a "normal" character, put the pointers and
1918 character back so that tests for whitespace etc. in the input
1919 are done correctly. */
1920
1921 case '\\':
1922 {
1923 const uschar *save_ptr = ptr;
1924 c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
1925 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1926 if (c >= 0)
1927 {
1928 ptr = save_ptr;
1929 c = '\\';
1930 goto NORMAL_CHAR;
1931 }
1932 }
1933 length++;
1934
1935 /* A back reference needs an additional char, plus either one or 5
1936 bytes for a repeat. We also need to keep the value of the highest
1937 back reference. */
1938
1939 if (c <= -ESC_REF)
1940 {
1941 int refnum = -c - ESC_REF;
1942 if (refnum > top_backref) top_backref = refnum;
1943 length++; /* For single back reference */
1944 if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
1945 {
1946 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
1947 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1948 if ((min == 0 && (max == 1 || max == -1)) ||
1949 (min == 1 && max == -1))
1950 length++;
1951 else length += 5;
1952 if (ptr[1] == '?') ptr++;
1953 }
1954 }
1955 continue;
1956
1957 case '^':
1958 case '.':
1959 case '$':
1960 case '*': /* These repeats won't be after brackets; */
1961 case '+': /* those are handled separately */
1962 case '?':
1963 length++;
1964 continue;
1965
1966 /* This covers the cases of repeats after a single char, metachar, class,
1967 or back reference. */
1968
1969 case '{':
1970 if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
1971 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
1972 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1973 if ((min == 0 && (max == 1 || max == -1)) ||
1974 (min == 1 && max == -1))
1975 length++;
1976 else
1977 {
1978 length--; /* Uncount the original char or metachar */
1979 if (min == 1) length++; else if (min > 0) length += 4;
1980 if (max > 0) length += 4; else length += 2;
1981 }
1982 if (ptr[1] == '?') ptr++;
1983 continue;
1984
1985 /* An alternation contains an offset to the next branch or ket. If any ims
1986 options changed in the previous branch(es), and/or if we are in a
1987 lookbehind assertion, extra space will be needed at the start of the
1988 branch. This is handled by branch_extra. */
1989
1990 case '|':
1991 length += 3 + branch_extra;
1992 continue;
1993
1994 /* A character class uses 33 characters. Don't worry about character types
1995 that aren't allowed in classes - they'll get picked up during the compile.
1996 A character class that contains only one character uses 2 or 3 bytes,
1997 depending on whether it is negated or not. Notice this where we can. */
1998
1999 case '[':
2000 class_charcount = 0;
2001 if (*(++ptr) == '^') ptr++;
2002 do
2003 {
2004 if (*ptr == '\\')
2005 {
2006 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
2007 &compile_block);
2008 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2009 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2010 }
2011 else class_charcount++;
2012 ptr++;
2013 }
2014 while (*ptr != 0 && *ptr != ']');
2015
2016 /* Repeats for negated single chars are handled by the general code */
2017
2018 if (class_charcount == 1) length += 3; else
2019 {
2020 length += 33;
2021
2022 /* A repeat needs either 1 or 5 bytes. */
2023
2024 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2025 {
2026 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2027 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2028 if ((min == 0 && (max == 1 || max == -1)) ||
2029 (min == 1 && max == -1))
2030 length++;
2031 else length += 5;
2032 if (ptr[1] == '?') ptr++;
2033 }
2034 }
2035 continue;
2036
2037 /* Brackets may be genuine groups or special things */
2038
2039 case '(':
2040 branch_newextra = 0;
2041
2042 /* Handle special forms of bracket, which all start (? */
2043
2044 if (ptr[1] == '?')
2045 {
2046 int set, unset;
2047 int *optset;
2048
2049 switch (c = ptr[2])
2050 {
2051 /* Skip over comments entirely */
2052 case '#':
2053 ptr += 3;
2054 while (*ptr != 0 && *ptr != ')') ptr++;
2055 if (*ptr == 0)
2056 {
2057 *errorptr = ERR18;
2058 goto PCRE_ERROR_RETURN;
2059 }
2060 continue;
2061
2062 /* Non-referencing groups and lookaheads just move the pointer on, and
2063 then behave like a non-special bracket, except that they don't increment
2064 the count of extracting brackets. Ditto for the "once only" bracket,
2065 which is in Perl from version 5.005. */
2066
2067 case ':':
2068 case '=':
2069 case '!':
2070 case '>':
2071 ptr += 2;
2072 break;
2073
2074 /* Lookbehinds are in Perl from version 5.005 */
2075
2076 case '<':
2077 if (ptr[3] == '=' || ptr[3] == '!')
2078 {
2079 ptr += 3;
2080 branch_newextra = 3;
2081 length += 3; /* For the first branch */
2082 break;
2083 }
2084 *errorptr = ERR24;
2085 goto PCRE_ERROR_RETURN;
2086
2087 /* Conditionals are in Perl from version 5.005. The bracket must either
2088 be followed by a number (for bracket reference) or by an assertion
2089 group. */
2090
2091 case '(':
2092 if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
2093 {
2094 ptr += 4;
2095 length += 2;
2096 while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
2097 if (*ptr != ')')
2098 {
2099 *errorptr = ERR26;
2100 goto PCRE_ERROR_RETURN;
2101 }
2102 }
2103 else /* An assertion must follow */
2104 {
2105 ptr++; /* Can treat like ':' as far as spacing is concerned */
2106
2107 if (ptr[2] != '?' || strchr("=!<", ptr[3]) == NULL)
2108 {
2109 ptr += 2; /* To get right offset in message */
2110 *errorptr = ERR28;
2111 goto PCRE_ERROR_RETURN;
2112 }
2113 }
2114 break;
2115
2116 /* Else loop checking valid options until ) is met. Anything else is an
2117 error. If we are without any brackets, i.e. at top level, the settings
2118 act as if specified in the options, so massage the options immediately.
2119 This is for backward compatibility with Perl 5.004. */
2120
2121 default:
2122 set = unset = 0;
2123 optset = &set;
2124 ptr += 2;
2125
2126 for (;; ptr++)
2127 {
2128 c = *ptr;
2129 switch (c)
2130 {
2131 case 'i':
2132 *optset |= PCRE_CASELESS;
2133 continue;
2134
2135 case 'm':
2136 *optset |= PCRE_MULTILINE;
2137 continue;
2138
2139 case 's':
2140 *optset |= PCRE_DOTALL;
2141 continue;
2142
2143 case 'x':
2144 *optset |= PCRE_EXTENDED;
2145 continue;
2146
2147 case 'X':
2148 *optset |= PCRE_EXTRA;
2149 continue;
2150
2151 case 'U':
2152 *optset |= PCRE_UNGREEDY;
2153 continue;
2154
2155 case '-':
2156 optset = &unset;
2157 continue;
2158
2159 /* A termination by ')' indicates an options-setting-only item;
2160 this is global at top level; otherwise nothing is done here and
2161 it is handled during the compiling process on a per-bracket-group
2162 basis. */
2163
2164 case ')':
2165 if (brastackptr == 0)
2166 {
2167 options = (options | set) & (~unset);
2168 set = unset = 0; /* To save length */
2169 }
2170 /* Fall through */
2171
2172 /* A termination by ':' indicates the start of a nested group with
2173 the given options set. This is again handled at compile time, but
2174 we must allow for compiled space if any of the ims options are
2175 set. We also have to allow for resetting space at the end of
2176 the group, which is why 4 is added to the length and not just 2.
2177 If there are several changes of options within the same group, this
2178 will lead to an over-estimate on the length, but this shouldn't
2179 matter very much. We also have to allow for resetting options at
2180 the start of any alternations, which we do by setting
2181 branch_newextra to 2. */
2182
2183 case ':':
2184 if (((set|unset) & PCRE_IMS) != 0)
2185 {
2186 length += 4;
2187 branch_newextra = 2;
2188 }
2189 goto END_OPTIONS;
2190
2191 /* Unrecognized option character */
2192
2193 default:
2194 *errorptr = ERR12;
2195 goto PCRE_ERROR_RETURN;
2196 }
2197 }
2198
2199 /* If we hit a closing bracket, that's it - this is a freestanding
2200 option-setting. We need to ensure that branch_extra is updated if
2201 necessary. The only values branch_newextra can have here are 0 or 2.
2202 If the value is 2, then branch_extra must either be 2 or 5, depending
2203 on whether this is a lookbehind group or not. */
2204
2205 END_OPTIONS:
2206 if (c == ')')
2207 {
2208 if (branch_newextra == 2 && (branch_extra == 0 || branch_extra == 3))
2209 branch_extra += branch_newextra;
2210 continue;
2211 }
2212
2213 /* If options were terminated by ':' control comes here. Fall through
2214 to handle the group below. */
2215 }
2216 }
2217
2218 /* Extracting brackets must be counted so we can process escapes in a
2219 Perlish way. */
2220
2221 else bracount++;
2222
2223 /* Non-special forms of bracket. Save length for computing whole length
2224 at end if there's a repeat that requires duplication of the group. Also
2225 save the current value of branch_extra, and start the new group with
2226 the new value. If non-zero, this will either be 2 for a (?imsx: group, or 3
2227 for a lookbehind assertion. */
2228
2229 if (brastackptr >= sizeof(brastack)/sizeof(int))
2230 {
2231 *errorptr = ERR19;
2232 goto PCRE_ERROR_RETURN;
2233 }
2234
2235 bralenstack[brastackptr] = branch_extra;
2236 branch_extra = branch_newextra;
2237
2238 brastack[brastackptr++] = length;
2239 length += 3;
2240 continue;
2241
2242 /* Handle ket. Look for subsequent max/min; for certain sets of values we
2243 have to replicate this bracket up to that many times. If brastackptr is
2244 0 this is an unmatched bracket which will generate an error, but take care
2245 not to try to access brastack[-1] when computing the length and restoring
2246 the branch_extra value. */
2247
2248 case ')':
2249 length += 3;
2250 {
2251 int minval = 1;
2252 int maxval = 1;
2253 int duplength;
2254
2255 if (brastackptr > 0)
2256 {
2257 duplength = length - brastack[--brastackptr];
2258 branch_extra = bralenstack[brastackptr];
2259 }
2260 else duplength = 0;
2261
2262 /* Leave ptr at the final char; for read_repeat_counts this happens
2263 automatically; for the others we need an increment. */
2264
2265 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
2266 {
2267 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr,
2268 &compile_block);
2269 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2270 }
2271 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2272 else if (c == '+') { maxval = -1; ptr++; }
2273 else if (c == '?') { minval = 0; ptr++; }
2274
2275 /* If there is a minimum > 1 we have to replicate up to minval-1 times;
2276 if there is a limited maximum we have to replicate up to maxval-1 times
2277 and allow for a BRAZERO item before each optional copy, as we also have
2278 to do before the first copy if the minimum is zero. */
2279
2280 if (minval == 0) length++;
2281 else if (minval > 1) length += (minval - 1) * duplength;
2282 if (maxval > minval) length += (maxval - minval) * (duplength + 1);
2283 }
2284 continue;
2285
2286 /* Non-special character. For a run of such characters the length required
2287 is the number of characters + 2, except that the maximum run length is 255.
2288 We won't get a skipped space or a non-data escape or the start of a #
2289 comment as the first character, so the length can't be zero. */
2290
2291 NORMAL_CHAR:
2292 default:
2293 length += 2;
2294 runlength = 0;
2295 do
2296 {
2297 if ((options & PCRE_EXTENDED) != 0)
2298 {
2299 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2300 if (c == '#')
2301 {
2302 while ((c = *(++ptr)) != 0 && c != '\n');
2303 continue;
2304 }
2305 }
2306
2307 /* Backslash may introduce a data char or a metacharacter; stop the
2308 string before the latter. */
2309
2310 if (c == '\\')
2311 {
2312 const uschar *saveptr = ptr;
2313 c = check_escape(&ptr, errorptr, bracount, options, FALSE,
2314 &compile_block);
2315 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2316 if (c < 0) { ptr = saveptr; break; }
2317 }
2318
2319 /* Ordinary character or single-char escape */
2320
2321 runlength++;
2322 }
2323
2324 /* This "while" is the end of the "do" above. */
2325
2326 while (runlength < 255 &&
2327 (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
2328
2329 ptr--;
2330 length += runlength;
2331 continue;
2332 }
2333 }
2334
2335 length += 4; /* For final KET and END */
2336
2337 if (length > 65539)
2338 {
2339 *errorptr = ERR20;
2340 return NULL;
2341 }
2342
2343 /* Compute the size of data block needed and get it, either from malloc or
2344 externally provided function. We specify "code[0]" in the offsetof() expression
2345 rather than just "code", because it has been reported that one broken compiler
2346 fails on "code" because it is also an independent variable. It should make no
2347 difference to the value of the offsetof(). */
2348
2349 size = length + offsetof(real_pcre, code[0]);
2350 re = (real_pcre *)(pcre_malloc)(size);
2351
2352 if (re == NULL)
2353 {
2354 *errorptr = ERR21;
2355 return NULL;
2356 }
2357
2358 /* Put in the magic number and the options. */
2359
2360 re->magic_number = MAGIC_NUMBER;
2361 re->options = options;
2362 re->tables = tables;
2363
2364 /* Set up a starting, non-extracting bracket, then compile the expression. On
2365 error, *errorptr will be set non-NULL, so we don't need to look at the result
2366 of the function here. */
2367
2368 ptr = (const uschar *)pattern;
2369 code = re->code;
2370 *code = OP_BRA;
2371 bracount = 0;
2372 (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,
2373 &compile_block);
2374 re->top_bracket = bracount;
2375 re->top_backref = top_backref;
2376
2377 /* If not reached end of pattern on success, there's an excess bracket. */
2378
2379 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
2380
2381 /* Fill in the terminating state and check for disastrous overflow, but
2382 if debugging, leave the test till after things are printed out. */
2383
2384 *code++ = OP_END;
2385
2386 #ifndef DEBUG
2387 if (code - re->code > length) *errorptr = ERR23;
2388 #endif
2389
2390 /* Give an error if there's back reference to a non-existent capturing
2391 subpattern. */
2392
2393 if (top_backref > re->top_bracket) *errorptr = ERR15;
2394
2395 /* Failed to compile */
2396
2397 if (*errorptr != NULL)
2398 {
2399 (pcre_free)(re);
2400 PCRE_ERROR_RETURN:
2401 *erroroffset = ptr - (const uschar *)pattern;
2402 return NULL;
2403 }
2404
2405 /* If the anchored option was not passed, set flag if we can determine that it
2406 is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
2407 we can determine what the first character has to be, because that speeds up
2408 unanchored matches no end. In the case of multiline matches, an alternative is
2409 to set the PCRE_STARTLINE flag if all branches start with ^. */
2410
2411 if ((options & PCRE_ANCHORED) == 0)
2412 {
2413 int temp_options = options;
2414 if (is_anchored(re->code, &temp_options))
2415 re->options |= PCRE_ANCHORED;
2416 else
2417 {
2418 int ch = find_firstchar(re->code, &temp_options);
2419 if (ch >= 0)
2420 {
2421 re->first_char = ch;
2422 re->options |= PCRE_FIRSTSET;
2423 }
2424 else if (is_startline(re->code))
2425 re->options |= PCRE_STARTLINE;
2426 }
2427 }
2428
2429 /* Print out the compiled data for debugging */
2430
2431 #ifdef DEBUG
2432
2433 printf("Length = %d top_bracket = %d top_backref = %d\n",
2434 length, re->top_bracket, re->top_backref);
2435
2436 if (re->options != 0)
2437 {
2438 printf("%s%s%s%s%s%s%s%s\n",
2439 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2440 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2441 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2442 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2443 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
2444 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
2445 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
2446 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
2447 }
2448
2449 if ((re->options & PCRE_FIRSTSET) != 0)
2450 {
2451 if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
2452 else printf("First char = \\x%02x\n", re->first_char);
2453 }
2454
2455 code_end = code;
2456 code_base = code = re->code;
2457
2458 while (code < code_end)
2459 {
2460 int charlength;
2461
2462 printf("%3d ", code - code_base);
2463
2464 if (*code >= OP_BRA)
2465 {
2466 printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
2467 code += 2;
2468 }
2469
2470 else switch(*code)
2471 {
2472 case OP_OPT:
2473 printf(" %.2x %s", code[1], OP_names[*code]);
2474 code++;
2475 break;
2476
2477 case OP_COND:
2478 printf("%3d Cond", (code[1] << 8) + code[2]);
2479 code += 2;
2480 break;
2481
2482 case OP_CREF:
2483 printf(" %.2d %s", code[1], OP_names[*code]);
2484 code++;
2485 break;
2486
2487 case OP_CHARS:
2488 charlength = *(++code);
2489 printf("%3d ", charlength);
2490 while (charlength-- > 0)
2491 if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2492 break;
2493
2494 case OP_KETRMAX:
2495 case OP_KETRMIN:
2496 case OP_ALT:
2497 case OP_KET:
2498 case OP_ASSERT:
2499 case OP_ASSERT_NOT:
2500 case OP_ASSERTBACK:
2501 case OP_ASSERTBACK_NOT:
2502 case OP_ONCE:
2503 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2504 code += 2;
2505 break;
2506
2507 case OP_REVERSE:
2508 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2509 code += 2;
2510 break;
2511
2512 case OP_STAR:
2513 case OP_MINSTAR:
2514 case OP_PLUS:
2515 case OP_MINPLUS:
2516 case OP_QUERY:
2517 case OP_MINQUERY:
2518 case OP_TYPESTAR:
2519 case OP_TYPEMINSTAR:
2520 case OP_TYPEPLUS:
2521 case OP_TYPEMINPLUS:
2522 case OP_TYPEQUERY:
2523 case OP_TYPEMINQUERY:
2524 if (*code >= OP_TYPESTAR)
2525 printf(" %s", OP_names[code[1]]);
2526 else if (isprint(c = code[1])) printf(" %c", c);
2527 else printf(" \\x%02x", c);
2528 printf("%s", OP_names[*code++]);
2529 break;
2530
2531 case OP_EXACT:
2532 case OP_UPTO:
2533 case OP_MINUPTO:
2534 if (isprint(c = code[3])) printf(" %c{", c);
2535 else printf(" \\x%02x{", c);
2536 if (*code != OP_EXACT) printf("0,");
2537 printf("%d}", (code[1] << 8) + code[2]);
2538 if (*code == OP_MINUPTO) printf("?");
2539 code += 3;
2540 break;
2541
2542 case OP_TYPEEXACT:
2543 case OP_TYPEUPTO:
2544 case OP_TYPEMINUPTO:
2545 printf(" %s{", OP_names[code[3]]);
2546 if (*code != OP_TYPEEXACT) printf(",");
2547 printf("%d}", (code[1] << 8) + code[2]);
2548 if (*code == OP_TYPEMINUPTO) printf("?");
2549 code += 3;
2550 break;
2551
2552 case OP_NOT:
2553 if (isprint(c = *(++code))) printf(" [^%c]", c);
2554 else printf(" [^\\x%02x]", c);
2555 break;
2556
2557 case OP_NOTSTAR:
2558 case OP_NOTMINSTAR:
2559 case OP_NOTPLUS:
2560 case OP_NOTMINPLUS:
2561 case OP_NOTQUERY:
2562 case OP_NOTMINQUERY:
2563 if (isprint(c = code[1])) printf(" [^%c]", c);
2564 else printf(" [^\\x%02x]", c);
2565 printf("%s", OP_names[*code++]);
2566 break;
2567
2568 case OP_NOTEXACT:
2569 case OP_NOTUPTO:
2570 case OP_NOTMINUPTO:
2571 if (isprint(c = code[3])) printf(" [^%c]{", c);
2572 else printf(" [^\\x%02x]{", c);
2573 if (*code != OP_NOTEXACT) printf(",");
2574 printf("%d}", (code[1] << 8) + code[2]);
2575 if (*code == OP_NOTMINUPTO) printf("?");
2576 code += 3;
2577 break;
2578
2579 case OP_REF:
2580 printf(" \\%d", *(++code));
2581 code ++;
2582 goto CLASS_REF_REPEAT;
2583
2584 case OP_CLASS:
2585 {
2586 int i, min, max;
2587 code++;
2588 printf(" [");
2589
2590 for (i = 0; i < 256; i++)
2591 {
2592 if ((code[i/8] & (1 << (i&7))) != 0)
2593 {
2594 int j;
2595 for (j = i+1; j < 256; j++)
2596 if ((code[j/8] & (1 << (j&7))) == 0) break;
2597 if (i == '-' || i == ']') printf("\\");
2598 if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2599 if (--j > i)
2600 {
2601 printf("-");
2602 if (j == '-' || j == ']') printf("\\");
2603 if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2604 }
2605 i = j;
2606 }
2607 }
2608 printf("]");
2609 code += 32;
2610
2611 CLASS_REF_REPEAT:
2612
2613 switch(*code)
2614 {
2615 case OP_CRSTAR:
2616 case OP_CRMINSTAR:
2617 case OP_CRPLUS:
2618 case OP_CRMINPLUS:
2619 case OP_CRQUERY:
2620 case OP_CRMINQUERY:
2621 printf("%s", OP_names[*code]);
2622 break;
2623
2624 case OP_CRRANGE:
2625 case OP_CRMINRANGE:
2626 min = (code[1] << 8) + code[2];
2627 max = (code[3] << 8) + code[4];
2628 if (max == 0) printf("{%d,}", min);
2629 else printf("{%d,%d}", min, max);
2630 if (*code == OP_CRMINRANGE) printf("?");
2631 code += 4;
2632 break;
2633
2634 default:
2635 code--;
2636 }
2637 }
2638 break;
2639
2640 /* Anything else is just a one-node item */
2641
2642 default:
2643 printf(" %s", OP_names[*code]);
2644 break;
2645 }
2646
2647 code++;
2648 printf("\n");
2649 }
2650 printf("------------------------------------------------------------------\n");
2651
2652 /* This check is done here in the debugging case so that the code that
2653 was compiled can be seen. */
2654
2655 if (code - re->code > length)
2656 {
2657 *errorptr = ERR23;
2658 (pcre_free)(re);
2659 *erroroffset = ptr - (uschar *)pattern;
2660 return NULL;
2661 }
2662 #endif
2663
2664 return (pcre *)re;
2665 }
2666
2667
2668
2669 /*************************************************
2670 * Match a back-reference *
2671 *************************************************/
2672
2673 /* If a back reference hasn't been set, the length that is passed is greater
2674 than the number of characters left in the string, so the match fails.
2675
2676 Arguments:
2677 offset index into the offset vector
2678 eptr points into the subject
2679 length length to be matched
2680 md points to match data block
2681 ims the ims flags
2682
2683 Returns: TRUE if matched
2684 */
2685
2686 static BOOL
2687 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
2688 int ims)
2689 {
2690 const uschar *p = md->start_subject + md->offset_vector[offset];
2691
2692 #ifdef DEBUG
2693 if (eptr >= md->end_subject)
2694 printf("matching subject <null>");
2695 else
2696 {
2697 printf("matching subject ");
2698 pchars(eptr, length, TRUE, md);
2699 }
2700 printf(" against backref ");
2701 pchars(p, length, FALSE, md);
2702 printf("\n");
2703 #endif
2704
2705 /* Always fail if not enough characters left */
2706
2707 if (length > md->end_subject - eptr) return FALSE;
2708
2709 /* Separate the caselesss case for speed */
2710
2711 if ((ims & PCRE_CASELESS) != 0)
2712 {
2713 while (length-- > 0)
2714 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
2715 }
2716 else
2717 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
2718
2719 return TRUE;
2720 }
2721
2722
2723
2724 /*************************************************
2725 * Match from current position *
2726 *************************************************/
2727
2728 /* On entry ecode points to the first opcode, and eptr to the first character
2729 in the subject string, while eptrb holds the value of eptr at the start of the
2730 last bracketed group - used for breaking infinite loops matching zero-length
2731 strings.
2732
2733 Arguments:
2734 eptr pointer in subject
2735 ecode position in code
2736 offset_top current top pointer
2737 md pointer to "static" info for the match
2738 ims current /i, /m, and /s options
2739 condassert TRUE if called to check a condition assertion
2740 eptrb eptr at start of last bracket
2741
2742 Returns: TRUE if matched
2743 */
2744
2745 static BOOL
2746 match(register const uschar *eptr, register const uschar *ecode,
2747 int offset_top, match_data *md, int ims, BOOL condassert, const uschar *eptrb)
2748 {
2749 int original_ims = ims; /* Save for resetting on ')' */
2750
2751 for (;;)
2752 {
2753 int op = (int)*ecode;
2754 int min, max, ctype;
2755 register int i;
2756 register int c;
2757 BOOL minimize = FALSE;
2758
2759 /* Opening capturing bracket. If there is space in the offset vector, save
2760 the current subject position in the working slot at the top of the vector. We
2761 mustn't change the current values of the data slot, because they may be set
2762 from a previous iteration of this group, and be referred to by a reference
2763 inside the group.
2764
2765 If the bracket fails to match, we need to restore this value and also the
2766 values of the final offsets, in case they were set by a previous iteration of
2767 the same bracket.
2768
2769 If there isn't enough space in the offset vector, treat this as if it were a
2770 non-capturing bracket. Don't worry about setting the flag for the error case
2771 here; that is handled in the code for KET. */
2772
2773 if (op > OP_BRA)
2774 {
2775 int number = op - OP_BRA;
2776 int offset = number << 1;
2777
2778 DPRINTF(("start bracket %d\n", number));
2779
2780 if (offset < md->offset_max)
2781 {
2782 int save_offset1 = md->offset_vector[offset];
2783 int save_offset2 = md->offset_vector[offset+1];
2784 int save_offset3 = md->offset_vector[md->offset_end - number];
2785
2786 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
2787 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
2788
2789 do
2790 {
2791 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
2792 ecode += (ecode[1] << 8) + ecode[2];
2793 }
2794 while (*ecode == OP_ALT);
2795
2796 DPRINTF(("bracket %d failed\n", number));
2797
2798 md->offset_vector[offset] = save_offset1;
2799 md->offset_vector[offset+1] = save_offset2;
2800 md->offset_vector[md->offset_end - number] = save_offset3;
2801 return FALSE;
2802 }
2803
2804 /* Insufficient room for saving captured contents */
2805
2806 else op = OP_BRA;
2807 }
2808
2809 /* Other types of node can be handled by a switch */
2810
2811 switch(op)
2812 {
2813 case OP_BRA: /* Non-capturing bracket: optimized */
2814 DPRINTF(("start bracket 0\n"));
2815 do
2816 {
2817 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
2818 ecode += (ecode[1] << 8) + ecode[2];
2819 }
2820 while (*ecode == OP_ALT);
2821 DPRINTF(("bracket 0 failed\n"));
2822 return FALSE;
2823
2824 /* Conditional group: compilation checked that there are no more than
2825 two branches. If the condition is false, skipping the first branch takes us
2826 past the end if there is only one branch, but that's OK because that is
2827 exactly what going to the ket would do. */
2828
2829 case OP_COND:
2830 if (ecode[3] == OP_CREF) /* Condition is extraction test */
2831 {
2832 int offset = ecode[4] << 1; /* Doubled reference number */
2833 return match(eptr,
2834 ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
2835 5 : 3 + (ecode[1] << 8) + ecode[2]),
2836 offset_top, md, ims, FALSE, eptr);
2837 }
2838
2839 /* The condition is an assertion. Call match() to evaluate it - setting
2840 the final argument TRUE causes it to stop at the end of an assertion. */
2841
2842 else
2843 {
2844 if (match(eptr, ecode+3, offset_top, md, ims, TRUE, NULL))
2845 {
2846 ecode += 3 + (ecode[4] << 8) + ecode[5];
2847 while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
2848 }
2849 else ecode += (ecode[1] << 8) + ecode[2];
2850 return match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr);
2851 }
2852 /* Control never reaches here */
2853
2854 /* Skip over conditional reference data if encountered (should not be) */
2855
2856 case OP_CREF:
2857 ecode += 2;
2858 break;
2859
2860 /* End of the pattern */
2861
2862 case OP_END:
2863 md->end_match_ptr = eptr; /* Record where we ended */
2864 md->end_offset_top = offset_top; /* and how many extracts were taken */
2865 return TRUE;
2866
2867 /* Change option settings */
2868
2869 case OP_OPT:
2870 ims = ecode[1];
2871 ecode += 2;
2872 DPRINTF(("ims set to %02x\n", ims));
2873 break;
2874
2875 /* Assertion brackets. Check the alternative branches in turn - the
2876 matching won't pass the KET for an assertion. If any one branch matches,
2877 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
2878 start of each branch to move the current point backwards, so the code at
2879 this level is identical to the lookahead case. */
2880
2881 case OP_ASSERT:
2882 case OP_ASSERTBACK:
2883 do
2884 {
2885 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) break;
2886 ecode += (ecode[1] << 8) + ecode[2];
2887 }
2888 while (*ecode == OP_ALT);
2889 if (*ecode == OP_KET) return FALSE;
2890
2891 /* If checking an assertion for a condition, return TRUE. */
2892
2893 if (condassert) return TRUE;
2894
2895 /* Continue from after the assertion, updating the offsets high water
2896 mark, since extracts may have been taken during the assertion. */
2897
2898 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2899 ecode += 3;
2900 offset_top = md->end_offset_top;
2901 continue;
2902
2903 /* Negative assertion: all branches must fail to match */
2904
2905 case OP_ASSERT_NOT:
2906 case OP_ASSERTBACK_NOT:
2907 do
2908 {
2909 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) return FALSE;
2910 ecode += (ecode[1] << 8) + ecode[2];
2911 }
2912 while (*ecode == OP_ALT);
2913
2914 if (condassert) return TRUE;
2915 ecode += 3;
2916 continue;
2917
2918 /* Move the subject pointer back. This occurs only at the start of
2919 each branch of a lookbehind assertion. If we are too close to the start to
2920 move back, this match function fails. */
2921
2922 case OP_REVERSE:
2923 eptr -= (ecode[1] << 8) + ecode[2];
2924 if (eptr < md->start_subject) return FALSE;
2925 ecode += 3;
2926 break;
2927
2928
2929 /* "Once" brackets are like assertion brackets except that after a match,
2930 the point in the subject string is not moved back. Thus there can never be
2931 a move back into the brackets. Check the alternative branches in turn - the
2932 matching won't pass the KET for this kind of subpattern. If any one branch
2933 matches, we carry on as at the end of a normal bracket, leaving the subject
2934 pointer. */
2935
2936 case OP_ONCE:
2937 {
2938 const uschar *prev = ecode;
2939
2940 do
2941 {
2942 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) break;
2943 ecode += (ecode[1] << 8) + ecode[2];
2944 }
2945 while (*ecode == OP_ALT);
2946
2947 /* If hit the end of the group (which could be repeated), fail */
2948
2949 if (*ecode != OP_ONCE && *ecode != OP_ALT) return FALSE;
2950
2951 /* Continue as from after the assertion, updating the offsets high water
2952 mark, since extracts may have been taken. */
2953
2954 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2955
2956 offset_top = md->end_offset_top;
2957 eptr = md->end_match_ptr;
2958
2959 /* For a non-repeating ket, just continue at this level. This also
2960 happens for a repeating ket if no characters were matched in the group.
2961 This is the forcible breaking of infinite loops as implemented in Perl
2962 5.005. If there is an options reset, it will get obeyed in the normal
2963 course of events. */
2964
2965 if (*ecode == OP_KET || eptr == eptrb)
2966 {
2967 ecode += 3;
2968 break;
2969 }
2970
2971 /* The repeating kets try the rest of the pattern or restart from the
2972 preceding bracket, in the appropriate order. We need to reset any options
2973 that changed within the bracket before re-running it, so check the next
2974 opcode. */
2975
2976 if (ecode[3] == OP_OPT)
2977 {
2978 ims = (ims & ~PCRE_IMS) | ecode[4];
2979 DPRINTF(("ims set to %02x at group repeat\n", ims));
2980 }
2981
2982 if (*ecode == OP_KETRMIN)
2983 {
2984 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||
2985 match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;
2986 }
2987 else /* OP_KETRMAX */
2988 {
2989 if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||
2990 match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
2991 }
2992 }
2993 return FALSE;
2994
2995 /* An alternation is the end of a branch; scan along to find the end of the
2996 bracketed group and go to there. */
2997
2998 case OP_ALT:
2999 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3000 break;
3001
3002 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3003 that it may occur zero times. It may repeat infinitely, or not at all -
3004 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3005 repeat limits are compiled as a number of copies, with the optional ones
3006 preceded by BRAZERO or BRAMINZERO. */
3007
3008 case OP_BRAZERO:
3009 {
3010 const uschar *next = ecode+1;
3011 if (match(eptr, next, offset_top, md, ims, FALSE, eptr)) return TRUE;
3012 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3013 ecode = next + 3;
3014 }
3015 break;
3016
3017 case OP_BRAMINZERO:
3018 {
3019 const uschar *next = ecode+1;
3020 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3021 if (match(eptr, next+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3022 ecode++;
3023 }
3024 break;
3025
3026 /* End of a group, repeated or non-repeating. If we are at the end of
3027 an assertion "group", stop matching and return TRUE, but record the
3028 current high water mark for use by positive assertions. Do this also
3029 for the "once" (not-backup up) groups. */
3030
3031 case OP_KET:
3032 case OP_KETRMIN:
3033 case OP_KETRMAX:
3034 {
3035 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3036
3037 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
3038 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
3039 *prev == OP_ONCE)
3040 {
3041 md->end_match_ptr = eptr; /* For ONCE */
3042 md->end_offset_top = offset_top;
3043 return TRUE;
3044 }
3045
3046 /* In all other cases except a conditional group we have to check the
3047 group number back at the start and if necessary complete handling an
3048 extraction by setting the offsets and bumping the high water mark. */
3049
3050 if (*prev != OP_COND)
3051 {
3052 int number = *prev - OP_BRA;
3053 int offset = number << 1;
3054
3055 DPRINTF(("end bracket %d\n", number));
3056
3057 if (number > 0)
3058 {
3059 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
3060 {
3061 md->offset_vector[offset] =
3062 md->offset_vector[md->offset_end - number];
3063 md->offset_vector[offset+1] = eptr - md->start_subject;
3064 if (offset_top <= offset) offset_top = offset + 2;
3065 }
3066 }
3067 }
3068
3069 /* Reset the value of the ims flags, in case they got changed during
3070 the group. */
3071
3072 ims = original_ims;
3073 DPRINTF(("ims reset to %02x\n", ims));
3074
3075 /* For a non-repeating ket, just continue at this level. This also
3076 happens for a repeating ket if no characters were matched in the group.
3077 This is the forcible breaking of infinite loops as implemented in Perl
3078 5.005. If there is an options reset, it will get obeyed in the normal
3079 course of events. */
3080
3081 if (*ecode == OP_KET || eptr == eptrb)
3082 {
3083 ecode += 3;
3084 break;
3085 }
3086
3087 /* The repeating kets try the rest of the pattern or restart from the
3088 preceding bracket, in the appropriate order. */
3089
3090 if (*ecode == OP_KETRMIN)
3091 {
3092 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||
3093 match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;
3094 }
3095 else /* OP_KETRMAX */
3096 {
3097 if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||
3098 match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3099 }
3100 }
3101 return FALSE;
3102
3103 /* Start of subject unless notbol, or after internal newline if multiline */
3104
3105 case OP_CIRC:
3106 if (md->notbol && eptr == md->start_subject) return FALSE;
3107 if ((ims & PCRE_MULTILINE) != 0)
3108 {
3109 if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;
3110 ecode++;
3111 break;
3112 }
3113 /* ... else fall through */
3114
3115 /* Start of subject assertion */
3116
3117 case OP_SOD:
3118 if (eptr != md->start_subject) return FALSE;
3119 ecode++;
3120 break;
3121
3122 /* Assert before internal newline if multiline, or before a terminating
3123 newline unless endonly is set, else end of subject unless noteol is set. */
3124
3125 case OP_DOLL:
3126 if ((ims & PCRE_MULTILINE) != 0)
3127 {
3128 if (eptr < md->end_subject) { if (*eptr != '\n') return FALSE; }
3129 else { if (md->noteol) return FALSE; }
3130 ecode++;
3131 break;
3132 }
3133 else
3134 {
3135 if (md->noteol) return FALSE;
3136 if (!md->endonly)
3137 {
3138 if (eptr < md->end_subject - 1 ||
3139 (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3140
3141 ecode++;
3142 break;
3143 }
3144 }
3145 /* ... else fall through */
3146
3147 /* End of subject assertion (\z) */
3148
3149 case OP_EOD:
3150 if (eptr < md->end_subject) return FALSE;
3151 ecode++;
3152 break;
3153
3154 /* End of subject or ending \n assertion (\Z) */
3155
3156 case OP_EODN:
3157 if (eptr < md->end_subject - 1 ||
3158 (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3159 ecode++;
3160 break;
3161
3162 /* Word boundary assertions */
3163
3164 case OP_NOT_WORD_BOUNDARY:
3165 case OP_WORD_BOUNDARY:
3166 {
3167 BOOL prev_is_word = (eptr != md->start_subject) &&
3168 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
3169 BOOL cur_is_word = (eptr < md->end_subject) &&
3170 ((md->ctypes[*eptr] & ctype_word) != 0);
3171 if ((*ecode++ == OP_WORD_BOUNDARY)?
3172 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3173 return FALSE;
3174 }
3175 break;
3176
3177 /* Match a single character type; inline for speed */
3178
3179 case OP_ANY:
3180 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == '\n')
3181 return FALSE;
3182 if (eptr++ >= md->end_subject) return FALSE;
3183 ecode++;
3184 break;
3185
3186 case OP_NOT_DIGIT:
3187 if (eptr >= md->end_subject ||
3188 (md->ctypes[*eptr++] & ctype_digit) != 0)
3189 return FALSE;
3190 ecode++;
3191 break;
3192
3193 case OP_DIGIT:
3194 if (eptr >= md->end_subject ||
3195 (md->ctypes[*eptr++] & ctype_digit) == 0)
3196 return FALSE;
3197 ecode++;
3198 break;
3199
3200 case OP_NOT_WHITESPACE:
3201 if (eptr >= md->end_subject ||
3202 (md->ctypes[*eptr++] & ctype_space) != 0)
3203 return FALSE;
3204 ecode++;
3205 break;
3206
3207 case OP_WHITESPACE:
3208 if (eptr >= md->end_subject ||
3209 (md->ctypes[*eptr++] & ctype_space) == 0)
3210 return FALSE;
3211 ecode++;
3212 break;
3213
3214 case OP_NOT_WORDCHAR:
3215 if (eptr >= md->end_subject ||
3216 (md->ctypes[*eptr++] & ctype_word) != 0)
3217 return FALSE;
3218 ecode++;
3219 break;
3220
3221 case OP_WORDCHAR:
3222 if (eptr >= md->end_subject ||
3223 (md->ctypes[*eptr++] & ctype_word) == 0)
3224 return FALSE;
3225 ecode++;
3226 break;
3227
3228 /* Match a back reference, possibly repeatedly. Look past the end of the
3229 item to see if there is repeat information following. The code is similar
3230 to that for character classes, but repeated for efficiency. Then obey
3231 similar code to character type repeats - written out again for speed.
3232 However, if the referenced string is the empty string, always treat
3233 it as matched, any number of times (otherwise there could be infinite
3234 loops). */
3235
3236 case OP_REF:
3237 {
3238 int length;
3239 int offset = ecode[1] << 1; /* Doubled reference number */
3240 ecode += 2; /* Advance past the item */
3241
3242 /* If the reference is unset, set the length to be longer than the amount
3243 of subject left; this ensures that every attempt at a match fails. We
3244 can't just fail here, because of the possibility of quantifiers with zero
3245 minima. */
3246
3247 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
3248 md->end_subject - eptr + 1 :
3249 md->offset_vector[offset+1] - md->offset_vector[offset];
3250
3251 /* Set up for repetition, or handle the non-repeated case */
3252
3253 switch (*ecode)
3254 {
3255 case OP_CRSTAR:
3256 case OP_CRMINSTAR:
3257 case OP_CRPLUS:
3258 case OP_CRMINPLUS:
3259 case OP_CRQUERY:
3260 case OP_CRMINQUERY:
3261 c = *ecode++ - OP_CRSTAR;
3262 minimize = (c & 1) != 0;
3263 min = rep_min[c]; /* Pick up values from tables; */
3264 max = rep_max[c]; /* zero for max => infinity */
3265 if (max == 0) max = INT_MAX;
3266 break;
3267
3268 case OP_CRRANGE:
3269 case OP_CRMINRANGE:
3270 minimize = (*ecode == OP_CRMINRANGE);
3271 min = (ecode[1] << 8) + ecode[2];
3272 max = (ecode[3] << 8) + ecode[4];
3273 if (max == 0) max = INT_MAX;
3274 ecode += 5;
3275 break;
3276
3277 default: /* No repeat follows */
3278 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
3279 eptr += length;
3280 continue; /* With the main loop */
3281 }
3282
3283 /* If the length of the reference is zero, just continue with the
3284 main loop. */
3285
3286 if (length == 0) continue;
3287
3288 /* First, ensure the minimum number of matches are present. We get back
3289 the length of the reference string explicitly rather than passing the
3290 address of eptr, so that eptr can be a register variable. */
3291
3292 for (i = 1; i <= min; i++)
3293 {
3294 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
3295 eptr += length;
3296 }
3297
3298 /* If min = max, continue at the same level without recursion.
3299 They are not both allowed to be zero. */
3300
3301 if (min == max) continue;
3302
3303 /* If minimizing, keep trying and advancing the pointer */
3304
3305 if (minimize)
3306 {
3307 for (i = min;; i++)
3308 {
3309 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3310 return TRUE;
3311 if (i >= max || !match_ref(offset, eptr, length, md, ims))
3312 return FALSE;
3313 eptr += length;
3314 }
3315 /* Control never gets here */
3316 }
3317
3318 /* If maximizing, find the longest string and work backwards */
3319
3320 else
3321 {
3322 const uschar *pp = eptr;
3323 for (i = min; i < max; i++)
3324 {
3325 if (!match_ref(offset, eptr, length, md, ims)) break;
3326 eptr += length;
3327 }
3328 while (eptr >= pp)
3329 {
3330 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3331 return TRUE;
3332 eptr -= length;
3333 }
3334 return FALSE;
3335 }
3336 }
3337 /* Control never gets here */
3338
3339
3340
3341 /* Match a character class, possibly repeatedly. Look past the end of the
3342 item to see if there is repeat information following. Then obey similar
3343 code to character type repeats - written out again for speed. */
3344
3345 case OP_CLASS:
3346 {
3347 const uschar *data = ecode + 1; /* Save for matching */
3348 ecode += 33; /* Advance past the item */
3349
3350 switch (*ecode)
3351 {
3352 case OP_CRSTAR:
3353 case OP_CRMINSTAR:
3354 case OP_CRPLUS:
3355 case OP_CRMINPLUS:
3356 case OP_CRQUERY:
3357 case OP_CRMINQUERY:
3358 c = *ecode++ - OP_CRSTAR;
3359 minimize = (c & 1) != 0;
3360 min = rep_min[c]; /* Pick up values from tables; */
3361 max = rep_max[c]; /* zero for max => infinity */
3362 if (max == 0) max = INT_MAX;
3363 break;
3364
3365 case OP_CRRANGE:
3366 case OP_CRMINRANGE:
3367 minimize = (*ecode == OP_CRMINRANGE);
3368 min = (ecode[1] << 8) + ecode[2];
3369 max = (ecode[3] << 8) + ecode[4];
3370 if (max == 0) max = INT_MAX;
3371 ecode += 5;
3372 break;
3373
3374 default: /* No repeat follows */
3375 min = max = 1;
3376 break;
3377 }
3378
3379 /* First, ensure the minimum number of matches are present. */
3380
3381 for (i = 1; i <= min; i++)
3382 {
3383 if (eptr >= md->end_subject) return FALSE;
3384 c = *eptr++;
3385 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3386 return FALSE;
3387 }
3388
3389 /* If max == min we can continue with the main loop without the
3390 need to recurse. */
3391
3392 if (min == max) continue;
3393
3394 /* If minimizing, keep testing the rest of the expression and advancing
3395 the pointer while it matches the class. */
3396
3397 if (minimize)
3398 {
3399 for (i = min;; i++)
3400 {
3401 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3402 return TRUE;
3403 if (i >= max || eptr >= md->end_subject) return FALSE;
3404 c = *eptr++;
3405 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3406 return FALSE;
3407 }
3408 /* Control never gets here */
3409 }
3410
3411 /* If maximizing, find the longest possible run, then work backwards. */
3412
3413 else
3414 {
3415 const uschar *pp = eptr;
3416 for (i = min; i < max; eptr++, i++)
3417 {
3418 if (eptr >= md->end_subject) break;
3419 c = *eptr;
3420 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3421 break;
3422 }
3423
3424 while (eptr >= pp)
3425 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3426 return TRUE;
3427 return FALSE;
3428 }
3429 }
3430 /* Control never gets here */
3431
3432 /* Match a run of characters */
3433
3434 case OP_CHARS:
3435 {
3436 register int length = ecode[1];
3437 ecode += 2;
3438
3439 #ifdef DEBUG /* Sigh. Some compilers never learn. */
3440 if (eptr >= md->end_subject)
3441 printf("matching subject <null> against pattern ");
3442 else
3443 {
3444 printf("matching subject ");
3445 pchars(eptr, length, TRUE, md);
3446 printf(" against pattern ");
3447 }
3448 pchars(ecode, length, FALSE, md);
3449 printf("\n");
3450 #endif
3451
3452 if (length > md->end_subject - eptr) return FALSE;
3453 if ((ims & PCRE_CASELESS) != 0)
3454 {
3455 while (length-- > 0)
3456 if (md->lcc[*ecode++] != md->lcc[*eptr++])
3457 return FALSE;
3458 }
3459 else
3460 {
3461 while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
3462 }
3463 }
3464 break;
3465
3466 /* Match a single character repeatedly; different opcodes share code. */
3467
3468 case OP_EXACT:
3469 min = max = (ecode[1] << 8) + ecode[2];
3470 ecode += 3;
3471 goto REPEATCHAR;
3472
3473 case OP_UPTO:
3474 case OP_MINUPTO:
3475 min = 0;
3476 max = (ecode[1] << 8) + ecode[2];
3477 minimize = *ecode == OP_MINUPTO;
3478 ecode += 3;
3479 goto REPEATCHAR;
3480
3481 case OP_STAR:
3482 case OP_MINSTAR:
3483 case OP_PLUS:
3484 case OP_MINPLUS:
3485 case OP_QUERY:
3486 case OP_MINQUERY:
3487 c = *ecode++ - OP_STAR;
3488 minimize = (c & 1) != 0;
3489 min = rep_min[c]; /* Pick up values from tables; */
3490 max = rep_max[c]; /* zero for max => infinity */
3491 if (max == 0) max = INT_MAX;
3492
3493 /* Common code for all repeated single-character matches. We can give
3494 up quickly if there are fewer than the minimum number of characters left in
3495 the subject. */
3496
3497 REPEATCHAR:
3498 if (min > md->end_subject - eptr) return FALSE;
3499 c = *ecode++;
3500
3501 /* The code is duplicated for the caseless and caseful cases, for speed,
3502 since matching characters is likely to be quite common. First, ensure the
3503 minimum number of matches are present. If min = max, continue at the same
3504 level without recursing. Otherwise, if minimizing, keep trying the rest of
3505 the expression and advancing one matching character if failing, up to the
3506 maximum. Alternatively, if maximizing, find the maximum number of
3507 characters and work backwards. */
3508
3509 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
3510 max, eptr));
3511
3512 if ((ims & PCRE_CASELESS) != 0)
3513 {
3514 c = md->lcc[c];
3515 for (i = 1; i <= min; i++)
3516 if (c != md->lcc[*eptr++]) return FALSE;
3517 if (min == max) continue;
3518 if (minimize)
3519 {
3520 for (i = min;; i++)
3521 {
3522 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3523 return TRUE;
3524 if (i >= max || eptr >= md->end_subject ||
3525 c != md->lcc[*eptr++])
3526 return FALSE;
3527 }
3528 /* Control never gets here */
3529 }
3530 else
3531 {
3532 const uschar *pp = eptr;
3533 for (i = min; i < max; i++)
3534 {
3535 if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
3536 eptr++;
3537 }
3538 while (eptr >= pp)
3539 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3540 return TRUE;
3541 return FALSE;
3542 }
3543 /* Control never gets here */
3544 }
3545
3546 /* Caseful comparisons */
3547
3548 else
3549 {
3550 for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
3551 if (min == max) continue;
3552 if (minimize)
3553 {
3554 for (i = min;; i++)
3555 {
3556 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3557 return TRUE;
3558 if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
3559 }
3560 /* Control never gets here */
3561 }
3562 else
3563 {
3564 const uschar *pp = eptr;
3565 for (i = min; i < max; i++)
3566 {
3567 if (eptr >= md->end_subject || c != *eptr) break;
3568 eptr++;
3569 }
3570 while (eptr >= pp)
3571 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3572 return TRUE;
3573 return FALSE;
3574 }
3575 }
3576 /* Control never gets here */
3577
3578 /* Match a negated single character */
3579
3580 case OP_NOT:
3581 if (eptr >= md->end_subject) return FALSE;
3582 ecode++;
3583 if ((ims & PCRE_CASELESS) != 0)
3584 {
3585 if (md->lcc[*ecode++] == md->lcc[*eptr++]) return FALSE;
3586 }
3587 else
3588 {
3589 if (*ecode++ == *eptr++) return FALSE;
3590 }
3591 break;
3592
3593 /* Match a negated single character repeatedly. This is almost a repeat of
3594 the code for a repeated single character, but I haven't found a nice way of
3595 commoning these up that doesn't require a test of the positive/negative
3596 option for each character match. Maybe that wouldn't add very much to the
3597 time taken, but character matching *is* what this is all about... */
3598
3599 case OP_NOTEXACT:
3600 min = max = (ecode[1] << 8) + ecode[2];
3601 ecode += 3;
3602 goto REPEATNOTCHAR;
3603
3604 case OP_NOTUPTO:
3605 case OP_NOTMINUPTO:
3606 min = 0;
3607 max = (ecode[1] << 8) + ecode[2];
3608 minimize = *ecode == OP_NOTMINUPTO;
3609 ecode += 3;
3610 goto REPEATNOTCHAR;
3611
3612 case OP_NOTSTAR:
3613 case OP_NOTMINSTAR:
3614 case OP_NOTPLUS:
3615 case OP_NOTMINPLUS:
3616 case OP_NOTQUERY:
3617 case OP_NOTMINQUERY:
3618 c = *ecode++ - OP_NOTSTAR;
3619 minimize = (c & 1) != 0;
3620 min = rep_min[c]; /* Pick up values from tables; */
3621 max = rep_max[c]; /* zero for max => infinity */
3622 if (max == 0) max = INT_MAX;
3623
3624 /* Common code for all repeated single-character matches. We can give
3625 up quickly if there are fewer than the minimum number of characters left in
3626 the subject. */
3627
3628 REPEATNOTCHAR:
3629 if (min > md->end_subject - eptr) return FALSE;
3630 c = *ecode++;
3631
3632 /* The code is duplicated for the caseless and caseful cases, for speed,
3633 since matching characters is likely to be quite common. First, ensure the
3634 minimum number of matches are present. If min = max, continue at the same
3635 level without recursing. Otherwise, if minimizing, keep trying the rest of
3636 the expression and advancing one matching character if failing, up to the
3637 maximum. Alternatively, if maximizing, find the maximum number of
3638 characters and work backwards. */
3639
3640 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
3641 max, eptr));
3642
3643 if ((ims & PCRE_CASELESS) != 0)
3644 {
3645 c = md->lcc[c];
3646 for (i = 1; i <= min; i++)
3647 if (c == md->lcc[*eptr++]) return FALSE;
3648 if (min == max) continue;
3649 if (minimize)
3650 {
3651 for (i = min;; i++)
3652 {
3653 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3654 return TRUE;
3655 if (i >= max || eptr >= md->end_subject ||
3656 c == md->lcc[*eptr++])
3657 return FALSE;
3658 }
3659 /* Control never gets here */
3660 }
3661 else
3662 {
3663 const uschar *pp = eptr;
3664 for (i = min; i < max; i++)
3665 {
3666 if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
3667 eptr++;
3668 }
3669 while (eptr >= pp)
3670 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3671 return TRUE;
3672 return FALSE;
3673 }
3674 /* Control never gets here */
3675 }
3676
3677 /* Caseful comparisons */
3678
3679 else
3680 {
3681 for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
3682 if (min == max) continue;
3683 if (minimize)
3684 {
3685 for (i = min;; i++)
3686 {
3687 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3688 return TRUE;
3689 if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
3690 }
3691 /* Control never gets here */
3692 }
3693 else
3694 {
3695 const uschar *pp = eptr;
3696 for (i = min; i < max; i++)
3697 {
3698 if (eptr >= md->end_subject || c == *eptr) break;
3699 eptr++;
3700 }
3701 while (eptr >= pp)
3702 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3703 return TRUE;
3704 return FALSE;
3705 }
3706 }
3707 /* Control never gets here */
3708
3709 /* Match a single character type repeatedly; several different opcodes
3710 share code. This is very similar to the code for single characters, but we
3711 repeat it in the interests of efficiency. */
3712
3713 case OP_TYPEEXACT:
3714 min = max = (ecode[1] << 8) + ecode[2];
3715 minimize = TRUE;
3716 ecode += 3;
3717 goto REPEATTYPE;
3718
3719 case OP_TYPEUPTO:
3720 case OP_TYPEMINUPTO:
3721 min = 0;
3722 max = (ecode[1] << 8) + ecode[2];
3723 minimize = *ecode == OP_TYPEMINUPTO;
3724 ecode += 3;
3725 goto REPEATTYPE;
3726
3727 case OP_TYPESTAR:
3728 case OP_TYPEMINSTAR:
3729 case OP_TYPEPLUS:
3730 case OP_TYPEMINPLUS:
3731 case OP_TYPEQUERY:
3732 case OP_TYPEMINQUERY:
3733 c = *ecode++ - OP_TYPESTAR;
3734 minimize = (c & 1) != 0;
3735 min = rep_min[c]; /* Pick up values from tables; */
3736 max = rep_max[c]; /* zero for max => infinity */
3737 if (max == 0) max = INT_MAX;
3738
3739 /* Common code for all repeated single character type matches */
3740
3741 REPEATTYPE:
3742 ctype = *ecode++; /* Code for the character type */
3743
3744 /* First, ensure the minimum number of matches are present. Use inline
3745 code for maximizing the speed, and do the type test once at the start
3746 (i.e. keep it out of the loop). Also test that there are at least the
3747 minimum number of characters before we start. */
3748
3749 if (min > md->end_subject - eptr) return FALSE;
3750 if (min > 0) switch(ctype)
3751 {
3752 case OP_ANY:
3753 if ((ims & PCRE_DOTALL) == 0)
3754 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }
3755 else eptr += min;
3756 break;
3757
3758 case OP_NOT_DIGIT:
3759 for (i = 1; i <= min; i++)
3760 if ((md->ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
3761 break;
3762
3763 case OP_DIGIT:
3764 for (i = 1; i <= min; i++)
3765 if ((md->ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
3766 break;
3767
3768 case OP_NOT_WHITESPACE:
3769 for (i = 1; i <= min; i++)
3770 if ((md->ctypes[*eptr++] & ctype_space) != 0) return FALSE;
3771 break;
3772
3773 case OP_WHITESPACE:
3774 for (i = 1; i <= min; i++)
3775 if ((md->ctypes[*eptr++] & ctype_space) == 0) return FALSE;
3776 break;
3777
3778 case OP_NOT_WORDCHAR:
3779 for (i = 1; i <= min; i++)
3780 if ((md->ctypes[*eptr++] & ctype_word) != 0)
3781 return FALSE;
3782 break;
3783
3784 case OP_WORDCHAR:
3785 for (i = 1; i <= min; i++)
3786 if ((md->ctypes[*eptr++] & ctype_word) == 0)
3787 return FALSE;
3788 break;
3789 }
3790
3791 /* If min = max, continue at the same level without recursing */
3792
3793 if (min == max) continue;
3794
3795 /* If minimizing, we have to test the rest of the pattern before each
3796 subsequent match. */
3797
3798 if (minimize)
3799 {
3800 for (i = min;; i++)
3801 {
3802 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb)) return TRUE;
3803 if (i >= max || eptr >= md->end_subject) return FALSE;
3804
3805 c = *eptr++;
3806 switch(ctype)
3807 {
3808 case OP_ANY:
3809 if ((ims & PCRE_DOTALL) == 0 && c == '\n') return FALSE;
3810 break;
3811
3812 case OP_NOT_DIGIT:
3813 if ((md->ctypes[c] & ctype_digit) != 0) return FALSE;
3814 break;
3815
3816 case OP_DIGIT:
3817 if ((md->ctypes[c] & ctype_digit) == 0) return FALSE;
3818 break;
3819
3820 case OP_NOT_WHITESPACE:
3821 if ((md->ctypes[c] & ctype_space) != 0) return FALSE;
3822 break;
3823
3824 case OP_WHITESPACE:
3825 if ((md->ctypes[c] & ctype_space) == 0) return FALSE;
3826 break;
3827
3828 case OP_NOT_WORDCHAR:
3829 if ((md->ctypes[c] & ctype_word) != 0) return FALSE;
3830 break;
3831
3832 case OP_WORDCHAR:
3833 if ((md->ctypes[c] & ctype_word) == 0) return FALSE;
3834 break;
3835 }
3836 }
3837 /* Control never gets here */
3838 }
3839
3840 /* If maximizing it is worth using inline code for speed, doing the type
3841 test once at the start (i.e. keep it out of the loop). */
3842
3843 else
3844 {
3845 const uschar *pp = eptr;
3846 switch(ctype)
3847 {
3848 case OP_ANY:
3849 if ((ims & PCRE_DOTALL) == 0)
3850 {
3851 for (i = min; i < max; i++)
3852 {
3853 if (eptr >= md->end_subject || *eptr == '\n') break;
3854 eptr++;
3855 }
3856 }
3857 else
3858 {
3859 c = max - min;
3860 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
3861 eptr += c;
3862 }
3863 break;
3864
3865 case OP_NOT_DIGIT:
3866 for (i = min; i < max; i++)
3867 {
3868 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
3869 break;
3870 eptr++;
3871 }
3872 break;
3873
3874 case OP_DIGIT:
3875 for (i = min; i < max; i++)
3876 {
3877 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
3878 break;
3879 eptr++;
3880 }
3881 break;
3882
3883 case OP_NOT_WHITESPACE:
3884 for (i = min; i < max; i++)
3885 {
3886 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
3887 break;
3888 eptr++;
3889 }
3890 break;
3891
3892 case OP_WHITESPACE:
3893 for (i = min; i < max; i++)
3894 {
3895 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
3896 break;
3897 eptr++;
3898 }
3899 break;
3900
3901 case OP_NOT_WORDCHAR:
3902 for (i = min; i < max; i++)
3903 {
3904 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
3905 break;
3906 eptr++;
3907 }
3908 break;
3909
3910 case OP_WORDCHAR:
3911 for (i = min; i < max; i++)
3912 {
3913 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
3914 break;
3915 eptr++;
3916 }
3917 break;
3918 }
3919
3920 while (eptr >= pp)
3921 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3922 return TRUE;
3923 return FALSE;
3924 }
3925 /* Control never gets here */
3926
3927 /* There's been some horrible disaster. */
3928
3929 default:
3930 DPRINTF(("Unknown opcode %d\n", *ecode));
3931 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
3932 return FALSE;
3933 }
3934
3935 /* Do not stick any code in here without much thought; it is assumed
3936 that "continue" in the code above comes out to here to repeat the main
3937 loop. */
3938
3939 } /* End of main loop */
3940 /* Control never reaches here */
3941 }
3942
3943
3944
3945
3946 /*************************************************
3947 * Execute a Regular Expression *
3948 *************************************************/
3949
3950 /* This function applies a compiled re to a subject string and picks out
3951 portions of the string if it matches. Two elements in the vector are set for
3952 each substring: the offsets to the start and end of the substring.
3953
3954 Arguments:
3955 external_re points to the compiled expression
3956 external_extra points to "hints" from pcre_study() or is NULL
3957 subject points to the subject string
3958 length length of subject string (may contain binary zeros)
3959 options option bits
3960 offsets points to a vector of ints to be filled in with offsets
3961 offsetcount the number of elements in the vector
3962
3963 Returns: > 0 => success; value is the number of elements filled in
3964 = 0 => success, but offsets is not big enough
3965 -1 => failed to match
3966 < -1 => some kind of unexpected problem
3967 */
3968
3969 int
3970 pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
3971 const char *subject, int length, int options, int *offsets, int offsetcount)
3972 {
3973 int resetcount, ocount;
3974 int first_char = -1;
3975 int ims = 0;
3976 match_data match_block;
3977 const uschar *start_bits = NULL;
3978 const uschar *start_match = (const uschar *)subject;
3979 const uschar *end_subject;
3980 const real_pcre *re = (const real_pcre *)external_re;
3981 const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
3982 BOOL using_temporary_offsets = FALSE;
3983 BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
3984 BOOL startline = (re->options & PCRE_STARTLINE) != 0;
3985
3986 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3987
3988 if (re == NULL || subject == NULL ||
3989 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3990 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
3991
3992 match_block.start_subject = (const uschar *)subject;
3993 match_block.end_subject = match_block.start_subject + length;
3994 end_subject = match_block.end_subject;
3995
3996 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
3997
3998 match_block.notbol = (options & PCRE_NOTBOL) != 0;
3999 match_block.noteol = (options & PCRE_NOTEOL) != 0;
4000
4001 match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
4002
4003 match_block.lcc = re->tables + lcc_offset;
4004 match_block.ctypes = re->tables + ctypes_offset;
4005
4006 /* The ims options can vary during the matching as a result of the presence
4007 of (?ims) items in the pattern. They are kept in a local variable so that
4008 restoring at the exit of a group is easy. */
4009
4010 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4011
4012 /* If the expression has got more back references than the offsets supplied can
4013 hold, we get a temporary bit of working store to use during the matching.
4014 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4015 of 3. */
4016
4017 ocount = offsetcount - (offsetcount % 3);
4018
4019 if (re->top_backref > 0 && re->top_backref >= ocount/3)
4020 {
4021 ocount = re->top_backref * 3 + 3;
4022 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4023 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4024 using_temporary_offsets = TRUE;
4025 DPRINTF(("Got memory to hold back references\n"));
4026 }
4027 else match_block.offset_vector = offsets;
4028
4029 match_block.offset_end = ocount;
4030 match_block.offset_max = (2*ocount)/3;
4031 match_block.offset_overflow = FALSE;
4032
4033 /* Compute the minimum number of offsets that we need to reset each time. Doing
4034 this makes a huge difference to execution time when there aren't many brackets
4035 in the pattern. */
4036
4037 resetcount = 2 + re->top_bracket * 2;
4038 if (resetcount > offsetcount) resetcount = ocount;
4039
4040 /* Reset the working variable associated with each extraction. These should
4041 never be used unless previously set, but they get saved and restored, and so we
4042 initialize them to avoid reading uninitialized locations. */
4043
4044 if (match_block.offset_vector != NULL)
4045 {
4046 register int *iptr = match_block.offset_vector + ocount;
4047 register int *iend = iptr - resetcount/2 + 1;
4048 while (--iptr >= iend) *iptr = -1;
4049 }
4050
4051 /* Set up the first character to match, if available. The first_char value is
4052 never set for an anchored regular expression, but the anchoring may be forced
4053 at run time, so we have to test for anchoring. The first char may be unset for
4054 an unanchored pattern, of course. If there's no first char and the pattern was
4055 studied, there may be a bitmap of possible first characters. */
4056
4057 if (!anchored)
4058 {
4059 if ((re->options & PCRE_FIRSTSET) != 0)
4060 {
4061 first_char = re->first_char;
4062 if ((ims & PCRE_CASELESS) != 0) first_char = match_block.lcc[first_char];
4063 }
4064 else
4065 if (!startline && extra != NULL &&
4066 (extra->options & PCRE_STUDY_MAPPED) != 0)
4067 start_bits = extra->start_bits;
4068 }
4069
4070 /* Loop for unanchored matches; for anchored regexps the loop runs just once. */
4071
4072 do
4073 {
4074 int rc;
4075 register int *iptr = match_block.offset_vector;
4076 register int *iend = iptr + resetcount;
4077
4078 /* Reset the maximum number of extractions we might see. */
4079
4080 while (iptr < iend) *iptr++ = -1;
4081
4082 /* Advance to a unique first char if possible */
4083
4084 if (first_char >= 0)
4085 {
4086 if ((ims & PCRE_CASELESS) != 0)
4087 while (start_match < end_subject &&
4088 match_block.lcc[*start_match] != first_char)
4089 start_match++;
4090 else
4091 while (start_match < end_subject && *start_match != first_char)
4092 start_match++;
4093 }
4094
4095 /* Or to just after \n for a multiline match if possible */
4096
4097 else if (startline)
4098 {
4099 if (start_match > match_block.start_subject)
4100 {
4101 while (start_match < end_subject && start_match[-1] != '\n')
4102 start_match++;
4103 }
4104 }
4105
4106 /* Or to a non-unique first char */
4107
4108 else if (start_bits != NULL)
4109 {
4110 while (start_match < end_subject)
4111 {
4112 register int c = *start_match;
4113 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
4114 }
4115 }
4116
4117 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4118 printf(">>>> Match against: ");
4119 pchars(start_match, end_subject - start_match, TRUE, &match_block);
4120 printf("\n");
4121 #endif
4122
4123 /* When a match occurs, substrings will be set for all internal extractions;
4124 we just need to set up the whole thing as substring 0 before returning. If
4125 there were too many extractions, set the return code to zero. In the case
4126 where we had to get some local store to hold offsets for backreferences, copy
4127 those back references that we can. In this case there need not be overflow
4128 if certain parts of the pattern were not used. */
4129
4130 if (!match(start_match, re->code, 2, &match_block, ims, FALSE, start_match))
4131 continue;
4132
4133 /* Copy the offset information from temporary store if necessary */
4134
4135 if (using_temporary_offsets)
4136 {
4137 if (offsetcount >= 4)
4138 {
4139 memcpy(offsets + 2, match_block.offset_vector + 2,
4140 (offsetcount - 2) * sizeof(int));
4141 DPRINTF(("Copied offsets from temporary memory\n"));
4142 }
4143 if (match_block.end_offset_top > offsetcount)
4144 match_block.offset_overflow = TRUE;
4145
4146 DPRINTF(("Freeing temporary memory\n"));
4147 (pcre_free)(match_block.offset_vector);
4148 }
4149
4150 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
4151
4152 if (match_block.offset_end < 2) rc = 0; else
4153 {
4154 offsets[0] = start_match - match_block.start_subject;
4155 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
4156 }
4157
4158 DPRINTF((">>>> returning %d\n", rc));
4159 return rc;
4160 }
4161
4162 /* This "while" is the end of the "do" above */
4163
4164 while (!anchored &&
4165 match_block.errorcode == PCRE_ERROR_NOMATCH &&
4166 start_match++ < end_subject);
4167
4168 if (using_temporary_offsets)
4169 {
4170 DPRINTF(("Freeing temporary memory\n"));
4171 (pcre_free)(match_block.offset_vector);
4172 }
4173
4174 DPRINTF((">>>> returning %d\n", match_block.errorcode));
4175
4176 return match_block.errorcode;
4177 }
4178
4179 /* End of pcre.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12