/[pcre]/code/trunk/pcre.c
ViewVC logotype

Contents of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 27 - (show annotations) (download)
Sat Feb 24 21:38:49 2007 UTC (7 years, 7 months ago) by nigel
File MIME type: text/plain
File size: 120845 byte(s)
Load pcre-2.02 into code/trunk.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /*
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
9
10 Written by: Philip Hazel <ph10@cam.ac.uk>
11
12 Copyright (c) 1997-1999 University of Cambridge
13
14 -----------------------------------------------------------------------------
15 Permission is granted to anyone to use this software for any purpose on any
16 computer system, and to redistribute it freely, subject to the following
17 restrictions:
18
19 1. This software is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22
23 2. The origin of this software must not be misrepresented, either by
24 explicit claim or by omission.
25
26 3. Altered versions must be plainly marked as such, and must not be
27 misrepresented as being the original software.
28 -----------------------------------------------------------------------------
29 */
30
31
32 /* Define DEBUG to get debugging output on stdout. */
33
34 /* #define DEBUG */
35
36 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
37 inline, and there are *still* stupid compilers about that don't like indented
38 pre-processor statements. I suppose it's only been 10 years... */
39
40 #ifdef DEBUG
41 #define DPRINTF(p) printf p
42 #else
43 #define DPRINTF(p) /*nothing*/
44 #endif
45
46 /* Include the internals header, which itself includes Standard C headers plus
47 the external pcre header. */
48
49 #include "internal.h"
50
51
52 /* Allow compilation as C++ source code, should anybody want to do that. */
53
54 #ifdef __cplusplus
55 #define class pcre_class
56 #endif
57
58
59 /* Number of items on the nested bracket stacks at compile time. This should
60 not be set greater than 200. */
61
62 #define BRASTACK_SIZE 200
63
64
65 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
66
67 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
68 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
69
70 /* Text forms of OP_ values and things, for debugging (not all used) */
71
72 #ifdef DEBUG
73 static const char *OP_names[] = {
74 "End", "\\A", "\\B", "\\b", "\\D", "\\d",
75 "\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",
76 "Opt", "^", "$", "Any", "chars", "not",
77 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
78 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
79 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
80 "*", "*?", "+", "+?", "?", "??", "{", "{",
81 "class", "Ref",
82 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
83 "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
84 "Brazero", "Braminzero", "Bra"
85 };
86 #endif
87
88 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
89 are simple data values; negative values are for special things like \d and so
90 on. Zero means further processing is needed (for things like \x), or the escape
91 is invalid. */
92
93 static const short int escapes[] = {
94 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
95 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
96 '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
97 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
98 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
99 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
100 '`', 7, -ESC_b, 0, -ESC_d, 27, '\f', 0, /* ` - g */
101 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
102 0, 0, '\r', -ESC_s, '\t', 0, 0, -ESC_w, /* p - w */
103 0, 0, -ESC_z /* x - z */
104 };
105
106 /* Definition to allow mutual recursion */
107
108 static BOOL
109 compile_regex(int, int, int *, uschar **, const uschar **, const char **,
110 BOOL, int, compile_data *);
111
112
113
114 /*************************************************
115 * Global variables *
116 *************************************************/
117
118 /* PCRE is thread-clean and doesn't use any global variables in the normal
119 sense. However, it calls memory allocation and free functions via the two
120 indirections below, which are can be changed by the caller, but are shared
121 between all threads. */
122
123 void *(*pcre_malloc)(size_t) = malloc;
124 void (*pcre_free)(void *) = free;
125
126
127
128
129 /*************************************************
130 * Default character tables *
131 *************************************************/
132
133 /* A default set of character tables is included in the PCRE binary. Its source
134 is built by the maketables auxiliary program, which uses the default C ctypes
135 functions, and put in the file chartables.c. These tables are used by PCRE
136 whenever the caller of pcre_compile() does not provide an alternate set of
137 tables. */
138
139 #include "chartables.c"
140
141
142
143 /*************************************************
144 * Return version string *
145 *************************************************/
146
147 const char *
148 pcre_version(void)
149 {
150 return PCRE_VERSION;
151 }
152
153
154
155
156 /*************************************************
157 * Return info about a compiled pattern *
158 *************************************************/
159
160 /* This function picks potentially useful data out of the private
161 structure.
162
163 Arguments:
164 external_re points to compiled code
165 optptr where to pass back the options
166 first_char where to pass back the first character,
167 or -1 if multiline and all branches start ^,
168 or -2 otherwise
169
170 Returns: number of identifying extraction brackets
171 or negative values on error
172 */
173
174 int
175 pcre_info(const pcre *external_re, int *optptr, int *first_char)
176 {
177 const real_pcre *re = (const real_pcre *)external_re;
178 if (re == NULL) return PCRE_ERROR_NULL;
179 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
180 if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
181 if (first_char != NULL)
182 *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
183 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
184 return re->top_bracket;
185 }
186
187
188
189
190 #ifdef DEBUG
191 /*************************************************
192 * Debugging function to print chars *
193 *************************************************/
194
195 /* Print a sequence of chars in printable format, stopping at the end of the
196 subject if the requested.
197
198 Arguments:
199 p points to characters
200 length number to print
201 is_subject TRUE if printing from within md->start_subject
202 md pointer to matching data block, if is_subject is TRUE
203
204 Returns: nothing
205 */
206
207 static void
208 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
209 {
210 int c;
211 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
212 while (length-- > 0)
213 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
214 }
215 #endif
216
217
218
219
220 /*************************************************
221 * Handle escapes *
222 *************************************************/
223
224 /* This function is called when a \ has been encountered. It either returns a
225 positive value for a simple escape such as \n, or a negative value which
226 encodes one of the more complicated things such as \d. On entry, ptr is
227 pointing at the \. On exit, it is on the final character of the escape
228 sequence.
229
230 Arguments:
231 ptrptr points to the pattern position pointer
232 errorptr points to the pointer to the error message
233 bracount number of previous extracting brackets
234 options the options bits
235 isclass TRUE if inside a character class
236 cd pointer to char tables block
237
238 Returns: zero or positive => a data character
239 negative => a special escape sequence
240 on error, errorptr is set
241 */
242
243 static int
244 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
245 int options, BOOL isclass, compile_data *cd)
246 {
247 const uschar *ptr = *ptrptr;
248 int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
249 int i;
250
251 if (c == 0) *errorptr = ERR1;
252
253 /* Digits or letters may have special meaning; all others are literals. */
254
255 else if (c < '0' || c > 'z') {}
256
257 /* Do an initial lookup in a table. A non-zero result is something that can be
258 returned immediately. Otherwise further processing may be required. */
259
260 else if ((i = escapes[c - '0']) != 0) c = i;
261
262 /* Escapes that need further processing, or are illegal. */
263
264 else
265 {
266 const uschar *oldptr;
267 switch (c)
268 {
269 /* The handling of escape sequences consisting of a string of digits
270 starting with one that is not zero is not straightforward. By experiment,
271 the way Perl works seems to be as follows:
272
273 Outside a character class, the digits are read as a decimal number. If the
274 number is less than 10, or if there are that many previous extracting
275 left brackets, then it is a back reference. Otherwise, up to three octal
276 digits are read to form an escaped byte. Thus \123 is likely to be octal
277 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
278 value is greater than 377, the least significant 8 bits are taken. Inside a
279 character class, \ followed by a digit is always an octal number. */
280
281 case '1': case '2': case '3': case '4': case '5':
282 case '6': case '7': case '8': case '9':
283
284 if (!isclass)
285 {
286 oldptr = ptr;
287 c -= '0';
288 while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
289 c = c * 10 + *(++ptr) - '0';
290 if (c < 10 || c <= bracount)
291 {
292 c = -(ESC_REF + c);
293 break;
294 }
295 ptr = oldptr; /* Put the pointer back and fall through */
296 }
297
298 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
299 generates a binary zero byte and treats the digit as a following literal.
300 Thus we have to pull back the pointer by one. */
301
302 if ((c = *ptr) >= '8')
303 {
304 ptr--;
305 c = 0;
306 break;
307 }
308
309 /* \0 always starts an octal number, but we may drop through to here with a
310 larger first octal digit */
311
312 case '0':
313 c -= '0';
314 while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
315 ptr[1] != '8' && ptr[1] != '9')
316 c = c * 8 + *(++ptr) - '0';
317 break;
318
319 /* Special escapes not starting with a digit are straightforward */
320
321 case 'x':
322 c = 0;
323 while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
324 {
325 ptr++;
326 c = c * 16 + cd->lcc[*ptr] -
327 (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
328 }
329 break;
330
331 case 'c':
332 c = *(++ptr);
333 if (c == 0)
334 {
335 *errorptr = ERR2;
336 return 0;
337 }
338
339 /* A letter is upper-cased; then the 0x40 bit is flipped */
340
341 if (c >= 'a' && c <= 'z') c = cd->fcc[c];
342 c ^= 0x40;
343 break;
344
345 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
346 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
347 for Perl compatibility, it is a literal. This code looks a bit odd, but
348 there used to be some cases other than the default, and there may be again
349 in future, so I haven't "optimized" it. */
350
351 default:
352 if ((options & PCRE_EXTRA) != 0) switch(c)
353 {
354 default:
355 *errorptr = ERR3;
356 break;
357 }
358 break;
359 }
360 }
361
362 *ptrptr = ptr;
363 return c;
364 }
365
366
367
368 /*************************************************
369 * Check for counted repeat *
370 *************************************************/
371
372 /* This function is called when a '{' is encountered in a place where it might
373 start a quantifier. It looks ahead to see if it really is a quantifier or not.
374 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
375 where the ddds are digits.
376
377 Arguments:
378 p pointer to the first char after '{'
379 cd pointer to char tables block
380
381 Returns: TRUE or FALSE
382 */
383
384 static BOOL
385 is_counted_repeat(const uschar *p, compile_data *cd)
386 {
387 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
388 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
389 if (*p == '}') return TRUE;
390
391 if (*p++ != ',') return FALSE;
392 if (*p == '}') return TRUE;
393
394 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
395 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
396 return (*p == '}');
397 }
398
399
400
401 /*************************************************
402 * Read repeat counts *
403 *************************************************/
404
405 /* Read an item of the form {n,m} and return the values. This is called only
406 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
407 so the syntax is guaranteed to be correct, but we need to check the values.
408
409 Arguments:
410 p pointer to first char after '{'
411 minp pointer to int for min
412 maxp pointer to int for max
413 returned as -1 if no max
414 errorptr points to pointer to error message
415 cd pointer to character tables clock
416
417 Returns: pointer to '}' on success;
418 current ptr on error, with errorptr set
419 */
420
421 static const uschar *
422 read_repeat_counts(const uschar *p, int *minp, int *maxp,
423 const char **errorptr, compile_data *cd)
424 {
425 int min = 0;
426 int max = -1;
427
428 while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
429
430 if (*p == '}') max = min; else
431 {
432 if (*(++p) != '}')
433 {
434 max = 0;
435 while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
436 if (max < min)
437 {
438 *errorptr = ERR4;
439 return p;
440 }
441 }
442 }
443
444 /* Do paranoid checks, then fill in the required variables, and pass back the
445 pointer to the terminating '}'. */
446
447 if (min > 65535 || max > 65535)
448 *errorptr = ERR5;
449 else
450 {
451 *minp = min;
452 *maxp = max;
453 }
454 return p;
455 }
456
457
458
459 /*************************************************
460 * Find the fixed length of a pattern *
461 *************************************************/
462
463 /* Scan a pattern and compute the fixed length of subject that will match it,
464 if the length is fixed. This is needed for dealing with backward assertions.
465
466 Arguments:
467 code points to the start of the pattern (the bracket)
468
469 Returns: the fixed length, or -1 if there is no fixed length
470 */
471
472 static int
473 find_fixedlength(uschar *code)
474 {
475 int length = -1;
476
477 register int branchlength = 0;
478 register uschar *cc = code + 3;
479
480 /* Scan along the opcodes for this branch. If we get to the end of the
481 branch, check the length against that of the other branches. */
482
483 for (;;)
484 {
485 int d;
486 register int op = *cc;
487 if (op >= OP_BRA) op = OP_BRA;
488
489 switch (op)
490 {
491 case OP_BRA:
492 case OP_ONCE:
493 case OP_COND:
494 d = find_fixedlength(cc);
495 if (d < 0) return -1;
496 branchlength += d;
497 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
498 cc += 3;
499 break;
500
501 /* Reached end of a branch; if it's a ket it is the end of a nested
502 call. If it's ALT it is an alternation in a nested call. If it is
503 END it's the end of the outer call. All can be handled by the same code. */
504
505 case OP_ALT:
506 case OP_KET:
507 case OP_KETRMAX:
508 case OP_KETRMIN:
509 case OP_END:
510 if (length < 0) length = branchlength;
511 else if (length != branchlength) return -1;
512 if (*cc != OP_ALT) return length;
513 cc += 3;
514 branchlength = 0;
515 break;
516
517 /* Skip over assertive subpatterns */
518
519 case OP_ASSERT:
520 case OP_ASSERT_NOT:
521 case OP_ASSERTBACK:
522 case OP_ASSERTBACK_NOT:
523 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
524 cc += 3;
525 break;
526
527 /* Skip over things that don't match chars */
528
529 case OP_REVERSE:
530 cc++;
531
532 case OP_CREF:
533 case OP_OPT:
534 cc++;
535 /* Fall through */
536
537 case OP_SOD:
538 case OP_EOD:
539 case OP_EODN:
540 case OP_CIRC:
541 case OP_DOLL:
542 case OP_NOT_WORD_BOUNDARY:
543 case OP_WORD_BOUNDARY:
544 cc++;
545 break;
546
547 /* Handle char strings */
548
549 case OP_CHARS:
550 branchlength += *(++cc);
551 cc += *cc + 1;
552 break;
553
554 /* Handle exact repetitions */
555
556 case OP_EXACT:
557 case OP_TYPEEXACT:
558 branchlength += (cc[1] << 8) + cc[2];
559 cc += 4;
560 break;
561
562 /* Handle single-char matchers */
563
564 case OP_NOT_DIGIT:
565 case OP_DIGIT:
566 case OP_NOT_WHITESPACE:
567 case OP_WHITESPACE:
568 case OP_NOT_WORDCHAR:
569 case OP_WORDCHAR:
570 case OP_ANY:
571 branchlength++;
572 cc++;
573 break;
574
575
576 /* Check a class for variable quantification */
577
578 case OP_CLASS:
579 cc += (*cc == OP_REF)? 2 : 33;
580
581 switch (*cc)
582 {
583 case OP_CRSTAR:
584 case OP_CRMINSTAR:
585 case OP_CRQUERY:
586 case OP_CRMINQUERY:
587 return -1;
588
589 case OP_CRRANGE:
590 case OP_CRMINRANGE:
591 if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;
592 branchlength += (cc[1] << 8) + cc[2];
593 cc += 5;
594 break;
595
596 default:
597 branchlength++;
598 }
599 break;
600
601 /* Anything else is variable length */
602
603 default:
604 return -1;
605 }
606 }
607 /* Control never gets here */
608 }
609
610
611
612
613 /*************************************************
614 * Compile one branch *
615 *************************************************/
616
617 /* Scan the pattern, compiling it into the code vector.
618
619 Arguments:
620 options the option bits
621 brackets points to number of brackets used
622 code points to the pointer to the current code point
623 ptrptr points to the current pattern pointer
624 errorptr points to pointer to error message
625 optchanged set to the value of the last OP_OPT item compiled
626 cd contains pointers to tables
627
628 Returns: TRUE on success
629 FALSE, with *errorptr set on error
630 */
631
632 static BOOL
633 compile_branch(int options, int *brackets, uschar **codeptr,
634 const uschar **ptrptr, const char **errorptr, int *optchanged,
635 compile_data *cd)
636 {
637 int repeat_type, op_type;
638 int repeat_min, repeat_max;
639 int bravalue, length;
640 int greedy_default, greedy_non_default;
641 register int c;
642 register uschar *code = *codeptr;
643 uschar *tempcode;
644 const uschar *ptr = *ptrptr;
645 const uschar *tempptr;
646 uschar *previous = NULL;
647 uschar class[32];
648
649 /* Set up the default and non-default settings for greediness */
650
651 greedy_default = ((options & PCRE_UNGREEDY) != 0);
652 greedy_non_default = greedy_default ^ 1;
653
654 /* Switch on next character until the end of the branch */
655
656 for (;; ptr++)
657 {
658 BOOL negate_class;
659 int class_charcount;
660 int class_lastchar;
661 int newoptions;
662 int condref;
663
664 c = *ptr;
665 if ((options & PCRE_EXTENDED) != 0)
666 {
667 if ((cd->ctypes[c] & ctype_space) != 0) continue;
668 if (c == '#')
669 {
670 while ((c = *(++ptr)) != 0 && c != '\n');
671 continue;
672 }
673 }
674
675 switch(c)
676 {
677 /* The branch terminates at end of string, |, or ). */
678
679 case 0:
680 case '|':
681 case ')':
682 *codeptr = code;
683 *ptrptr = ptr;
684 return TRUE;
685
686 /* Handle single-character metacharacters */
687
688 case '^':
689 previous = NULL;
690 *code++ = OP_CIRC;
691 break;
692
693 case '$':
694 previous = NULL;
695 *code++ = OP_DOLL;
696 break;
697
698 case '.':
699 previous = code;
700 *code++ = OP_ANY;
701 break;
702
703 /* Character classes. These always build a 32-byte bitmap of the permitted
704 characters, except in the special case where there is only one character.
705 For negated classes, we build the map as usual, then invert it at the end.
706 */
707
708 case '[':
709 previous = code;
710 *code++ = OP_CLASS;
711
712 /* If the first character is '^', set the negation flag and skip it. */
713
714 if ((c = *(++ptr)) == '^')
715 {
716 negate_class = TRUE;
717 c = *(++ptr);
718 }
719 else negate_class = FALSE;
720
721 /* Keep a count of chars so that we can optimize the case of just a single
722 character. */
723
724 class_charcount = 0;
725 class_lastchar = -1;
726
727 /* Initialize the 32-char bit map to all zeros. We have to build the
728 map in a temporary bit of store, in case the class contains only 1
729 character, because in that case the compiled code doesn't use the
730 bit map. */
731
732 memset(class, 0, 32 * sizeof(uschar));
733
734 /* Process characters until ] is reached. By writing this as a "do" it
735 means that an initial ] is taken as a data character. */
736
737 do
738 {
739 if (c == 0)
740 {
741 *errorptr = ERR6;
742 goto FAILED;
743 }
744
745 /* Backslash may introduce a single character, or it may introduce one
746 of the specials, which just set a flag. Escaped items are checked for
747 validity in the pre-compiling pass. The sequence \b is a special case.
748 Inside a class (and only there) it is treated as backspace. Elsewhere
749 it marks a word boundary. Other escapes have preset maps ready to
750 or into the one we are building. We assume they have more than one
751 character in them, so set class_count bigger than one. */
752
753 if (c == '\\')
754 {
755 c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
756 if (-c == ESC_b) c = '\b';
757 else if (c < 0)
758 {
759 register const uschar *cbits = cd->cbits;
760 class_charcount = 10;
761 switch (-c)
762 {
763 case ESC_d:
764 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
765 continue;
766
767 case ESC_D:
768 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
769 continue;
770
771 case ESC_w:
772 for (c = 0; c < 32; c++)
773 class[c] |= (cbits[c+cbit_digit] | cbits[c+cbit_word]);
774 continue;
775
776 case ESC_W:
777 for (c = 0; c < 32; c++)
778 class[c] |= ~(cbits[c+cbit_digit] | cbits[c+cbit_word]);
779 continue;
780
781 case ESC_s:
782 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
783 continue;
784
785 case ESC_S:
786 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
787 continue;
788
789 default:
790 *errorptr = ERR7;
791 goto FAILED;
792 }
793 }
794 /* Fall through if single character */
795 }
796
797 /* A single character may be followed by '-' to form a range. However,
798 Perl does not permit ']' to be the end of the range. A '-' character
799 here is treated as a literal. */
800
801 if (ptr[1] == '-' && ptr[2] != ']')
802 {
803 int d;
804 ptr += 2;
805 d = *ptr;
806
807 if (d == 0)
808 {
809 *errorptr = ERR6;
810 goto FAILED;
811 }
812
813 /* The second part of a range can be a single-character escape, but
814 not any of the other escapes. */
815
816 if (d == '\\')
817 {
818 d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
819 if (d < 0)
820 {
821 if (d == -ESC_b) d = '\b'; else
822 {
823 *errorptr = ERR7;
824 goto FAILED;
825 }
826 }
827 }
828
829 if (d < c)
830 {
831 *errorptr = ERR8;
832 goto FAILED;
833 }
834
835 for (; c <= d; c++)
836 {
837 class[c/8] |= (1 << (c&7));
838 if ((options & PCRE_CASELESS) != 0)
839 {
840 int uc = cd->fcc[c]; /* flip case */
841 class[uc/8] |= (1 << (uc&7));
842 }
843 class_charcount++; /* in case a one-char range */
844 class_lastchar = c;
845 }
846 continue; /* Go get the next char in the class */
847 }
848
849 /* Handle a lone single character - we can get here for a normal
850 non-escape char, or after \ that introduces a single character. */
851
852 class [c/8] |= (1 << (c&7));
853 if ((options & PCRE_CASELESS) != 0)
854 {
855 c = cd->fcc[c]; /* flip case */
856 class[c/8] |= (1 << (c&7));
857 }
858 class_charcount++;
859 class_lastchar = c;
860 }
861
862 /* Loop until ']' reached; the check for end of string happens inside the
863 loop. This "while" is the end of the "do" above. */
864
865 while ((c = *(++ptr)) != ']');
866
867 /* If class_charcount is 1 and class_lastchar is not negative, we saw
868 precisely one character. This doesn't need the whole 32-byte bit map.
869 We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
870 it's negative. */
871
872 if (class_charcount == 1 && class_lastchar >= 0)
873 {
874 if (negate_class)
875 {
876 code[-1] = OP_NOT;
877 }
878 else
879 {
880 code[-1] = OP_CHARS;
881 *code++ = 1;
882 }
883 *code++ = class_lastchar;
884 }
885
886 /* Otherwise, negate the 32-byte map if necessary, and copy it into
887 the code vector. */
888
889 else
890 {
891 if (negate_class)
892 for (c = 0; c < 32; c++) code[c] = ~class[c];
893 else
894 memcpy(code, class, 32);
895 code += 32;
896 }
897 break;
898
899 /* Various kinds of repeat */
900
901 case '{':
902 if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
903 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
904 if (*errorptr != NULL) goto FAILED;
905 goto REPEAT;
906
907 case '*':
908 repeat_min = 0;
909 repeat_max = -1;
910 goto REPEAT;
911
912 case '+':
913 repeat_min = 1;
914 repeat_max = -1;
915 goto REPEAT;
916
917 case '?':
918 repeat_min = 0;
919 repeat_max = 1;
920
921 REPEAT:
922 if (previous == NULL)
923 {
924 *errorptr = ERR9;
925 goto FAILED;
926 }
927
928 /* If the next character is '?' this is a minimizing repeat, by default,
929 but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
930 next character. */
931
932 if (ptr[1] == '?')
933 { repeat_type = greedy_non_default; ptr++; }
934 else repeat_type = greedy_default;
935
936 /* If the maximum is zero then the minimum must also be zero; Perl allows
937 this case, so we do too - by simply omitting the item altogether. */
938
939 if (repeat_max == 0) code = previous;
940
941 /* If previous was a string of characters, chop off the last one and use it
942 as the subject of the repeat. If there was only one character, we can
943 abolish the previous item altogether. */
944
945 else if (*previous == OP_CHARS)
946 {
947 int len = previous[1];
948 if (len == 1)
949 {
950 c = previous[2];
951 code = previous;
952 }
953 else
954 {
955 c = previous[len+1];
956 previous[1]--;
957 code--;
958 }
959 op_type = 0; /* Use single-char op codes */
960 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
961 }
962
963 /* If previous was a single negated character ([^a] or similar), we use
964 one of the special opcodes, replacing it. The code is shared with single-
965 character repeats by adding a suitable offset into repeat_type. */
966
967 else if ((int)*previous == OP_NOT)
968 {
969 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
970 c = previous[1];
971 code = previous;
972 goto OUTPUT_SINGLE_REPEAT;
973 }
974
975 /* If previous was a character type match (\d or similar), abolish it and
976 create a suitable repeat item. The code is shared with single-character
977 repeats by adding a suitable offset into repeat_type. */
978
979 else if ((int)*previous < OP_EODN || *previous == OP_ANY)
980 {
981 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
982 c = *previous;
983 code = previous;
984
985 OUTPUT_SINGLE_REPEAT:
986 repeat_type += op_type; /* Combine both values for many cases */
987
988 /* A minimum of zero is handled either as the special case * or ?, or as
989 an UPTO, with the maximum given. */
990
991 if (repeat_min == 0)
992 {
993 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
994 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
995 else
996 {
997 *code++ = OP_UPTO + repeat_type;
998 *code++ = repeat_max >> 8;
999 *code++ = (repeat_max & 255);
1000 }
1001 }
1002
1003 /* The case {1,} is handled as the special case + */
1004
1005 else if (repeat_min == 1 && repeat_max == -1)
1006 *code++ = OP_PLUS + repeat_type;
1007
1008 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1009 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1010
1011 else
1012 {
1013 if (repeat_min != 1)
1014 {
1015 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1016 *code++ = repeat_min >> 8;
1017 *code++ = (repeat_min & 255);
1018 }
1019
1020 /* If the mininum is 1 and the previous item was a character string,
1021 we either have to put back the item that got cancelled if the string
1022 length was 1, or add the character back onto the end of a longer
1023 string. For a character type nothing need be done; it will just get
1024 put back naturally. Note that the final character is always going to
1025 get added below. */
1026
1027 else if (*previous == OP_CHARS)
1028 {
1029 if (code == previous) code += 2; else previous[1]++;
1030 }
1031
1032 /* For a single negated character we also have to put back the
1033 item that got cancelled. */
1034
1035 else if (*previous == OP_NOT) code++;
1036
1037 /* If the maximum is unlimited, insert an OP_STAR. */
1038
1039 if (repeat_max < 0)
1040 {
1041 *code++ = c;
1042 *code++ = OP_STAR + repeat_type;
1043 }
1044
1045 /* Else insert an UPTO if the max is greater than the min. */
1046
1047 else if (repeat_max != repeat_min)
1048 {
1049 *code++ = c;
1050 repeat_max -= repeat_min;
1051 *code++ = OP_UPTO + repeat_type;
1052 *code++ = repeat_max >> 8;
1053 *code++ = (repeat_max & 255);
1054 }
1055 }
1056
1057 /* The character or character type itself comes last in all cases. */
1058
1059 *code++ = c;
1060 }
1061
1062 /* If previous was a character class or a back reference, we put the repeat
1063 stuff after it. */
1064
1065 else if (*previous == OP_CLASS || *previous == OP_REF)
1066 {
1067 if (repeat_min == 0 && repeat_max == -1)
1068 *code++ = OP_CRSTAR + repeat_type;
1069 else if (repeat_min == 1 && repeat_max == -1)
1070 *code++ = OP_CRPLUS + repeat_type;
1071 else if (repeat_min == 0 && repeat_max == 1)
1072 *code++ = OP_CRQUERY + repeat_type;
1073 else
1074 {
1075 *code++ = OP_CRRANGE + repeat_type;
1076 *code++ = repeat_min >> 8;
1077 *code++ = repeat_min & 255;
1078 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1079 *code++ = repeat_max >> 8;
1080 *code++ = repeat_max & 255;
1081 }
1082 }
1083
1084 /* If previous was a bracket group, we may have to replicate it in certain
1085 cases. */
1086
1087 else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
1088 (int)*previous == OP_COND)
1089 {
1090 int i, ketoffset = 0;
1091 int len = code - previous;
1092
1093 /* If the maximum repeat count is unlimited, find the end of the bracket
1094 by scanning through from the start, and compute the offset back to it
1095 from the current code pointer. There may be an OP_OPT setting following
1096 the final KET, so we can't find the end just by going back from the code
1097 pointer. */
1098
1099 if (repeat_max == -1)
1100 {
1101 register uschar *ket = previous;
1102 do ket += (ket[1] << 8) + ket[2]; while (*ket != OP_KET);
1103 ketoffset = code - ket;
1104 }
1105
1106 /* If the minimum is greater than zero, and the maximum is unlimited or
1107 equal to the minimum, the first copy remains where it is, and is
1108 replicated up to the minimum number of times. This case includes the +
1109 repeat, but of course no replication is needed in that case. */
1110
1111 if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1112 {
1113 for (i = 1; i < repeat_min; i++)
1114 {
1115 memcpy(code, previous, len);
1116 code += len;
1117 }
1118 }
1119
1120 /* If the minimum is zero, stick BRAZERO in front of the first copy.
1121 Then, if there is a fixed upper limit, replicated up to that many times,
1122 sticking BRAZERO in front of all the optional ones. */
1123
1124 else
1125 {
1126 if (repeat_min == 0)
1127 {
1128 memmove(previous+1, previous, len);
1129 code++;
1130 *previous++ = OP_BRAZERO + repeat_type;
1131 }
1132
1133 for (i = 1; i < repeat_min; i++)
1134 {
1135 memcpy(code, previous, len);
1136 code += len;
1137 }
1138
1139 for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1140 {
1141 *code++ = OP_BRAZERO + repeat_type;
1142 memcpy(code, previous, len);
1143 code += len;
1144 }
1145 }
1146
1147 /* If the maximum is unlimited, set a repeater in the final copy. We
1148 can't just offset backwards from the current code point, because we
1149 don't know if there's been an options resetting after the ket. The
1150 correct offset was computed above. */
1151
1152 if (repeat_max == -1) code[-ketoffset] = OP_KETRMAX + repeat_type;
1153 }
1154
1155 /* Else there's some kind of shambles */
1156
1157 else
1158 {
1159 *errorptr = ERR11;
1160 goto FAILED;
1161 }
1162
1163 /* In all case we no longer have a previous item. */
1164
1165 previous = NULL;
1166 break;
1167
1168
1169 /* Start of nested bracket sub-expression, or comment or lookahead or
1170 lookbehind or option setting or condition. First deal with special things
1171 that can come after a bracket; all are introduced by ?, and the appearance
1172 of any of them means that this is not a referencing group. They were
1173 checked for validity in the first pass over the string, so we don't have to
1174 check for syntax errors here. */
1175
1176 case '(':
1177 newoptions = options;
1178 condref = -1;
1179
1180 if (*(++ptr) == '?')
1181 {
1182 int set, unset;
1183 int *optset;
1184
1185 switch (*(++ptr))
1186 {
1187 case '#': /* Comment; skip to ket */
1188 ptr++;
1189 while (*ptr != ')') ptr++;
1190 continue;
1191
1192 case ':': /* Non-extracting bracket */
1193 bravalue = OP_BRA;
1194 ptr++;
1195 break;
1196
1197 case '(':
1198 bravalue = OP_COND; /* Conditional group */
1199 if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
1200 {
1201 condref = *ptr - '0';
1202 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
1203 ptr++;
1204 }
1205 else ptr--;
1206 break;
1207
1208 case '=': /* Positive lookahead */
1209 bravalue = OP_ASSERT;
1210 ptr++;
1211 break;
1212
1213 case '!': /* Negative lookahead */
1214 bravalue = OP_ASSERT_NOT;
1215 ptr++;
1216 break;
1217
1218 case '<': /* Lookbehinds */
1219 switch (*(++ptr))
1220 {
1221 case '=': /* Positive lookbehind */
1222 bravalue = OP_ASSERTBACK;
1223 ptr++;
1224 break;
1225
1226 case '!': /* Negative lookbehind */
1227 bravalue = OP_ASSERTBACK_NOT;
1228 ptr++;
1229 break;
1230
1231 default: /* Syntax error */
1232 *errorptr = ERR24;
1233 goto FAILED;
1234 }
1235 break;
1236
1237 case '>': /* One-time brackets */
1238 bravalue = OP_ONCE;
1239 ptr++;
1240 break;
1241
1242 default: /* Option setting */
1243 set = unset = 0;
1244 optset = &set;
1245
1246 while (*ptr != ')' && *ptr != ':')
1247 {
1248 switch (*ptr++)
1249 {
1250 case '-': optset = &unset; break;
1251
1252 case 'i': *optset |= PCRE_CASELESS; break;
1253 case 'm': *optset |= PCRE_MULTILINE; break;
1254 case 's': *optset |= PCRE_DOTALL; break;
1255 case 'x': *optset |= PCRE_EXTENDED; break;
1256 case 'U': *optset |= PCRE_UNGREEDY; break;
1257 case 'X': *optset |= PCRE_EXTRA; break;
1258
1259 default:
1260 *errorptr = ERR12;
1261 goto FAILED;
1262 }
1263 }
1264
1265 /* Set up the changed option bits, but don't change anything yet. */
1266
1267 newoptions = (options | set) & (~unset);
1268
1269 /* If the options ended with ')' this is not the start of a nested
1270 group with option changes, so the options change at this level. At top
1271 level there is nothing else to be done (the options will in fact have
1272 been set from the start of compiling as a result of the first pass) but
1273 at an inner level we must compile code to change the ims options if
1274 necessary, and pass the new setting back so that it can be put at the
1275 start of any following branches, and when this group ends, a resetting
1276 item can be compiled. */
1277
1278 if (*ptr == ')')
1279 {
1280 if ((options & PCRE_INGROUP) != 0 &&
1281 (options & PCRE_IMS) != (newoptions & PCRE_IMS))
1282 {
1283 *code++ = OP_OPT;
1284 *code++ = *optchanged = newoptions & PCRE_IMS;
1285 }
1286 options = newoptions; /* Change options at this level */
1287 previous = NULL; /* This item can't be repeated */
1288 continue; /* It is complete */
1289 }
1290
1291 /* If the options ended with ':' we are heading into a nested group
1292 with possible change of options. Such groups are non-capturing and are
1293 not assertions of any kind. All we need to do is skip over the ':';
1294 the newoptions value is handled below. */
1295
1296 bravalue = OP_BRA;
1297 ptr++;
1298 }
1299 }
1300
1301 /* Else we have a referencing group; adjust the opcode. */
1302
1303 else
1304 {
1305 if (++(*brackets) > EXTRACT_MAX)
1306 {
1307 *errorptr = ERR13;
1308 goto FAILED;
1309 }
1310 bravalue = OP_BRA + *brackets;
1311 }
1312
1313 /* Process nested bracketed re. Assertions may not be repeated, but other
1314 kinds can be. We copy code into a non-register variable in order to be able
1315 to pass its address because some compilers complain otherwise. Pass in a
1316 new setting for the ims options if they have changed. */
1317
1318 previous = (bravalue >= OP_ONCE)? code : NULL;
1319 *code = bravalue;
1320 tempcode = code;
1321
1322 if (!compile_regex(
1323 options | PCRE_INGROUP, /* Set for all nested groups */
1324 ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
1325 newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
1326 brackets, /* Bracket level */
1327 &tempcode, /* Where to put code (updated) */
1328 &ptr, /* Input pointer (updated) */
1329 errorptr, /* Where to put an error message */
1330 (bravalue == OP_ASSERTBACK ||
1331 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1332 condref, /* Condition reference number */
1333 cd)) /* Tables block */
1334 goto FAILED;
1335
1336 /* At the end of compiling, code is still pointing to the start of the
1337 group, while tempcode has been updated to point past the end of the group
1338 and any option resetting that may follow it. The pattern pointer (ptr)
1339 is on the bracket. */
1340
1341 /* If this is a conditional bracket, check that there are no more than
1342 two branches in the group. */
1343
1344 if (bravalue == OP_COND)
1345 {
1346 int branchcount = 0;
1347 uschar *tc = code;
1348
1349 do {
1350 branchcount++;
1351 tc += (tc[1] << 8) | tc[2];
1352 }
1353 while (*tc != OP_KET);
1354
1355 if (branchcount > 2)
1356 {
1357 *errorptr = ERR27;
1358 goto FAILED;
1359 }
1360 }
1361
1362 /* Now update the main code pointer to the end of the group. */
1363
1364 code = tempcode;
1365
1366 /* Error if hit end of pattern */
1367
1368 if (*ptr != ')')
1369 {
1370 *errorptr = ERR14;
1371 goto FAILED;
1372 }
1373 break;
1374
1375 /* Check \ for being a real metacharacter; if not, fall through and handle
1376 it as a data character at the start of a string. Escape items are checked
1377 for validity in the pre-compiling pass. */
1378
1379 case '\\':
1380 tempptr = ptr;
1381 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1382
1383 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1384 are arranged to be the negation of the corresponding OP_values. For the
1385 back references, the values are ESC_REF plus the reference number. Only
1386 back references and those types that consume a character may be repeated.
1387 We can test for values between ESC_b and ESC_Z for the latter; this may
1388 have to change if any new ones are ever created. */
1389
1390 if (c < 0)
1391 {
1392 if (-c >= ESC_REF)
1393 {
1394 previous = code;
1395 *code++ = OP_REF;
1396 *code++ = -c - ESC_REF;
1397 }
1398 else
1399 {
1400 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
1401 *code++ = -c;
1402 }
1403 continue;
1404 }
1405
1406 /* Data character: reset and fall through */
1407
1408 ptr = tempptr;
1409 c = '\\';
1410
1411 /* Handle a run of data characters until a metacharacter is encountered.
1412 The first character is guaranteed not to be whitespace or # when the
1413 extended flag is set. */
1414
1415 NORMAL_CHAR:
1416 default:
1417 previous = code;
1418 *code = OP_CHARS;
1419 code += 2;
1420 length = 0;
1421
1422 do
1423 {
1424 if ((options & PCRE_EXTENDED) != 0)
1425 {
1426 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1427 if (c == '#')
1428 {
1429 while ((c = *(++ptr)) != 0 && c != '\n');
1430 if (c == 0) break;
1431 continue;
1432 }
1433 }
1434
1435 /* Backslash may introduce a data char or a metacharacter. Escaped items
1436 are checked for validity in the pre-compiling pass. Stop the string
1437 before a metaitem. */
1438
1439 if (c == '\\')
1440 {
1441 tempptr = ptr;
1442 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1443 if (c < 0) { ptr = tempptr; break; }
1444 }
1445
1446 /* Ordinary character or single-char escape */
1447
1448 *code++ = c;
1449 length++;
1450 }
1451
1452 /* This "while" is the end of the "do" above. */
1453
1454 while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
1455
1456 /* Compute the length and set it in the data vector, and advance to
1457 the next state. */
1458
1459 previous[1] = length;
1460 if (length < 255) ptr--;
1461 break;
1462 }
1463 } /* end of big loop */
1464
1465 /* Control never reaches here by falling through, only by a goto for all the
1466 error states. Pass back the position in the pattern so that it can be displayed
1467 to the user for diagnosing the error. */
1468
1469 FAILED:
1470 *ptrptr = ptr;
1471 return FALSE;
1472 }
1473
1474
1475
1476
1477 /*************************************************
1478 * Compile sequence of alternatives *
1479 *************************************************/
1480
1481 /* On entry, ptr is pointing past the bracket character, but on return
1482 it points to the closing bracket, or vertical bar, or end of string.
1483 The code variable is pointing at the byte into which the BRA operator has been
1484 stored. If the ims options are changed at the start (for a (?ims: group) or
1485 during any branch, we need to insert an OP_OPT item at the start of every
1486 following branch to ensure they get set correctly at run time, and also pass
1487 the new options into every subsequent branch compile.
1488
1489 Argument:
1490 options the option bits
1491 optchanged new ims options to set as if (?ims) were at the start, or -1
1492 for no change
1493 brackets -> int containing the number of extracting brackets used
1494 codeptr -> the address of the current code pointer
1495 ptrptr -> the address of the current pattern pointer
1496 errorptr -> pointer to error message
1497 lookbehind TRUE if this is a lookbehind assertion
1498 condref > 0 for OPT_CREF setting at start of conditional group
1499 cd points to the data block with tables pointers
1500
1501 Returns: TRUE on success
1502 */
1503
1504 static BOOL
1505 compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
1506 const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,
1507 compile_data *cd)
1508 {
1509 const uschar *ptr = *ptrptr;
1510 uschar *code = *codeptr;
1511 uschar *last_branch = code;
1512 uschar *start_bracket = code;
1513 uschar *reverse_count = NULL;
1514 int oldoptions = options & PCRE_IMS;
1515
1516 code += 3;
1517
1518 /* At the start of a reference-based conditional group, insert the reference
1519 number as an OP_CREF item. */
1520
1521 if (condref > 0)
1522 {
1523 *code++ = OP_CREF;
1524 *code++ = condref;
1525 }
1526
1527 /* Loop for each alternative branch */
1528
1529 for (;;)
1530 {
1531 int length;
1532
1533 /* Handle change of options */
1534
1535 if (optchanged >= 0)
1536 {
1537 *code++ = OP_OPT;
1538 *code++ = optchanged;
1539 options = (options & ~PCRE_IMS) | optchanged;
1540 }
1541
1542 /* Set up dummy OP_REVERSE if lookbehind assertion */
1543
1544 if (lookbehind)
1545 {
1546 *code++ = OP_REVERSE;
1547 reverse_count = code;
1548 *code++ = 0;
1549 *code++ = 0;
1550 }
1551
1552 /* Now compile the branch */
1553
1554 if (!compile_branch(options,brackets,&code,&ptr,errorptr,&optchanged,cd))
1555 {
1556 *ptrptr = ptr;
1557 return FALSE;
1558 }
1559
1560 /* Fill in the length of the last branch */
1561
1562 length = code - last_branch;
1563 last_branch[1] = length >> 8;
1564 last_branch[2] = length & 255;
1565
1566 /* If lookbehind, check that this branch matches a fixed-length string,
1567 and put the length into the OP_REVERSE item. Temporarily mark the end of
1568 the branch with OP_END. */
1569
1570 if (lookbehind)
1571 {
1572 *code = OP_END;
1573 length = find_fixedlength(last_branch);
1574 DPRINTF(("fixed length = %d\n", length));
1575 if (length < 0)
1576 {
1577 *errorptr = ERR25;
1578 *ptrptr = ptr;
1579 return FALSE;
1580 }
1581 reverse_count[0] = (length >> 8);
1582 reverse_count[1] = length & 255;
1583 }
1584
1585 /* Reached end of expression, either ')' or end of pattern. Insert a
1586 terminating ket and the length of the whole bracketed item, and return,
1587 leaving the pointer at the terminating char. If any of the ims options
1588 were changed inside the group, compile a resetting op-code following. */
1589
1590 if (*ptr != '|')
1591 {
1592 length = code - start_bracket;
1593 *code++ = OP_KET;
1594 *code++ = length >> 8;
1595 *code++ = length & 255;
1596 if (optchanged >= 0)
1597 {
1598 *code++ = OP_OPT;
1599 *code++ = oldoptions;
1600 }
1601 *codeptr = code;
1602 *ptrptr = ptr;
1603 return TRUE;
1604 }
1605
1606 /* Another branch follows; insert an "or" node and advance the pointer. */
1607
1608 *code = OP_ALT;
1609 last_branch = code;
1610 code += 3;
1611 ptr++;
1612 }
1613 /* Control never reaches here */
1614 }
1615
1616
1617
1618
1619 /*************************************************
1620 * Find first significant op code *
1621 *************************************************/
1622
1623 /* This is called by several functions that scan a compiled expression looking
1624 for a fixed first character, or an anchoring op code etc. It skips over things
1625 that do not influence this. For one application, a change of caseless option is
1626 important.
1627
1628 Arguments:
1629 code pointer to the start of the group
1630 options pointer to external options
1631 optbit the option bit whose changing is significant, or
1632 zero if none are
1633 optstop TRUE to return on option change, otherwise change the options
1634 value and continue
1635
1636 Returns: pointer to the first significant opcode
1637 */
1638
1639 static const uschar*
1640 first_significant_code(const uschar *code, int *options, int optbit,
1641 BOOL optstop)
1642 {
1643 for (;;)
1644 {
1645 switch ((int)*code)
1646 {
1647 case OP_OPT:
1648 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1649 {
1650 if (optstop) return code;
1651 *options = (int)code[1];
1652 }
1653 code += 2;
1654 break;
1655
1656 case OP_CREF:
1657 code += 2;
1658 break;
1659
1660 case OP_ASSERT_NOT:
1661 case OP_ASSERTBACK:
1662 case OP_ASSERTBACK_NOT:
1663 do code += (code[1] << 8) + code[2]; while (*code == OP_ALT);
1664 code += 3;
1665 break;
1666
1667 default:
1668 return code;
1669 }
1670 }
1671 /* Control never reaches here */
1672 }
1673
1674
1675
1676
1677 /*************************************************
1678 * Check for anchored expression *
1679 *************************************************/
1680
1681 /* Try to find out if this is an anchored regular expression. Consider each
1682 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
1683 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
1684 it's anchored. However, if this is a multiline pattern, then only OP_SOD
1685 counts, since OP_CIRC can match in the middle.
1686
1687 A branch is also implicitly anchored if it starts with .* because that will try
1688 the rest of the pattern at all possible matching points, so there is no point
1689 trying them again.
1690
1691 Arguments:
1692 code points to start of expression (the bracket)
1693 options points to the options setting
1694
1695 Returns: TRUE or FALSE
1696 */
1697
1698 static BOOL
1699 is_anchored(register const uschar *code, int *options)
1700 {
1701 do {
1702 const uschar *scode = first_significant_code(code + 3, options,
1703 PCRE_MULTILINE, FALSE);
1704 register int op = *scode;
1705 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
1706 { if (!is_anchored(scode, options)) return FALSE; }
1707 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
1708 { if (scode[1] != OP_ANY) return FALSE; }
1709 else if (op != OP_SOD &&
1710 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
1711 return FALSE;
1712 code += (code[1] << 8) + code[2];
1713 }
1714 while (*code == OP_ALT);
1715 return TRUE;
1716 }
1717
1718
1719
1720 /*************************************************
1721 * Check for start with \n line expression *
1722 *************************************************/
1723
1724 /* This is called for multiline expressions to try to find out if every branch
1725 starts with ^ so that "first char" processing can be done to speed things up.
1726
1727 Argument: points to start of expression (the bracket)
1728 Returns: TRUE or FALSE
1729 */
1730
1731 static BOOL
1732 is_startline(const uschar *code)
1733 {
1734 do {
1735 const uschar *scode = first_significant_code(code + 3, NULL, 0, FALSE);
1736 register int op = *scode;
1737 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
1738 { if (!is_startline(scode)) return FALSE; }
1739 else if (op != OP_CIRC) return FALSE;
1740 code += (code[1] << 8) + code[2];
1741 }
1742 while (*code == OP_ALT);
1743 return TRUE;
1744 }
1745
1746
1747
1748 /*************************************************
1749 * Check for fixed first char *
1750 *************************************************/
1751
1752 /* Try to find out if there is a fixed first character. This is called for
1753 unanchored expressions, as it speeds up their processing quite considerably.
1754 Consider each alternative branch. If they all start with the same char, or with
1755 a bracket all of whose alternatives start with the same char (recurse ad lib),
1756 then we return that char, otherwise -1.
1757
1758 Arguments:
1759 code points to start of expression (the bracket)
1760 options pointer to the options (used to check casing changes)
1761
1762 Returns: -1 or the fixed first char
1763 */
1764
1765 static int
1766 find_firstchar(const uschar *code, int *options)
1767 {
1768 register int c = -1;
1769 do {
1770 int d;
1771 const uschar *scode = first_significant_code(code + 3, options,
1772 PCRE_CASELESS, TRUE);
1773 register int op = *scode;
1774
1775 if (op >= OP_BRA) op = OP_BRA;
1776
1777 switch(op)
1778 {
1779 default:
1780 return -1;
1781
1782 case OP_BRA:
1783 case OP_ASSERT:
1784 case OP_ONCE:
1785 case OP_COND:
1786 if ((d = find_firstchar(scode, options)) < 0) return -1;
1787 if (c < 0) c = d; else if (c != d) return -1;
1788 break;
1789
1790 case OP_EXACT: /* Fall through */
1791 scode++;
1792
1793 case OP_CHARS: /* Fall through */
1794 scode++;
1795
1796 case OP_PLUS:
1797 case OP_MINPLUS:
1798 if (c < 0) c = scode[1]; else if (c != scode[1]) return -1;
1799 break;
1800 }
1801
1802 code += (code[1] << 8) + code[2];
1803 }
1804 while (*code == OP_ALT);
1805 return c;
1806 }
1807
1808
1809
1810
1811
1812 /*************************************************
1813 * Compile a Regular Expression *
1814 *************************************************/
1815
1816 /* This function takes a string and returns a pointer to a block of store
1817 holding a compiled version of the expression.
1818
1819 Arguments:
1820 pattern the regular expression
1821 options various option bits
1822 errorptr pointer to pointer to error text
1823 erroroffset ptr offset in pattern where error was detected
1824 tables pointer to character tables or NULL
1825
1826 Returns: pointer to compiled data block, or NULL on error,
1827 with errorptr and erroroffset set
1828 */
1829
1830 pcre *
1831 pcre_compile(const char *pattern, int options, const char **errorptr,
1832 int *erroroffset, const unsigned char *tables)
1833 {
1834 real_pcre *re;
1835 int length = 3; /* For initial BRA plus length */
1836 int runlength;
1837 int c, size;
1838 int bracount = 0;
1839 int top_backref = 0;
1840 int branch_extra = 0;
1841 int branch_newextra;
1842 unsigned int brastackptr = 0;
1843 uschar *code;
1844 const uschar *ptr;
1845 compile_data compile_block;
1846 int brastack[BRASTACK_SIZE];
1847 uschar bralenstack[BRASTACK_SIZE];
1848
1849 #ifdef DEBUG
1850 uschar *code_base, *code_end;
1851 #endif
1852
1853 /* We can't pass back an error message if errorptr is NULL; I guess the best we
1854 can do is just return NULL. */
1855
1856 if (errorptr == NULL) return NULL;
1857 *errorptr = NULL;
1858
1859 /* However, we can give a message for this error */
1860
1861 if (erroroffset == NULL)
1862 {
1863 *errorptr = ERR16;
1864 return NULL;
1865 }
1866 *erroroffset = 0;
1867
1868 if ((options & ~PUBLIC_OPTIONS) != 0)
1869 {
1870 *errorptr = ERR17;
1871 return NULL;
1872 }
1873
1874 /* Set up pointers to the individual character tables */
1875
1876 if (tables == NULL) tables = pcre_default_tables;
1877 compile_block.lcc = tables + lcc_offset;
1878 compile_block.fcc = tables + fcc_offset;
1879 compile_block.cbits = tables + cbits_offset;
1880 compile_block.ctypes = tables + ctypes_offset;
1881
1882 /* Reflect pattern for debugging output */
1883
1884 DPRINTF(("------------------------------------------------------------------\n"));
1885 DPRINTF(("%s\n", pattern));
1886
1887 /* The first thing to do is to make a pass over the pattern to compute the
1888 amount of store required to hold the compiled code. This does not have to be
1889 perfect as long as errors are overestimates. At the same time we can detect any
1890 internal flag settings. Make an attempt to correct for any counted white space
1891 if an "extended" flag setting appears late in the pattern. We can't be so
1892 clever for #-comments. */
1893
1894 ptr = (const uschar *)(pattern - 1);
1895 while ((c = *(++ptr)) != 0)
1896 {
1897 int min, max;
1898 int class_charcount;
1899
1900 if ((options & PCRE_EXTENDED) != 0)
1901 {
1902 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
1903 if (c == '#')
1904 {
1905 while ((c = *(++ptr)) != 0 && c != '\n');
1906 continue;
1907 }
1908 }
1909
1910 switch(c)
1911 {
1912 /* A backslashed item may be an escaped "normal" character or a
1913 character type. For a "normal" character, put the pointers and
1914 character back so that tests for whitespace etc. in the input
1915 are done correctly. */
1916
1917 case '\\':
1918 {
1919 const uschar *save_ptr = ptr;
1920 c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
1921 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1922 if (c >= 0)
1923 {
1924 ptr = save_ptr;
1925 c = '\\';
1926 goto NORMAL_CHAR;
1927 }
1928 }
1929 length++;
1930
1931 /* A back reference needs an additional char, plus either one or 5
1932 bytes for a repeat. We also need to keep the value of the highest
1933 back reference. */
1934
1935 if (c <= -ESC_REF)
1936 {
1937 int refnum = -c - ESC_REF;
1938 if (refnum > top_backref) top_backref = refnum;
1939 length++; /* For single back reference */
1940 if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
1941 {
1942 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
1943 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1944 if ((min == 0 && (max == 1 || max == -1)) ||
1945 (min == 1 && max == -1))
1946 length++;
1947 else length += 5;
1948 if (ptr[1] == '?') ptr++;
1949 }
1950 }
1951 continue;
1952
1953 case '^':
1954 case '.':
1955 case '$':
1956 case '*': /* These repeats won't be after brackets; */
1957 case '+': /* those are handled separately */
1958 case '?':
1959 length++;
1960 continue;
1961
1962 /* This covers the cases of repeats after a single char, metachar, class,
1963 or back reference. */
1964
1965 case '{':
1966 if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
1967 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
1968 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1969 if ((min == 0 && (max == 1 || max == -1)) ||
1970 (min == 1 && max == -1))
1971 length++;
1972 else
1973 {
1974 length--; /* Uncount the original char or metachar */
1975 if (min == 1) length++; else if (min > 0) length += 4;
1976 if (max > 0) length += 4; else length += 2;
1977 }
1978 if (ptr[1] == '?') ptr++;
1979 continue;
1980
1981 /* An alternation contains an offset to the next branch or ket. If any ims
1982 options changed in the previous branch(es), and/or if we are in a
1983 lookbehind assertion, extra space will be needed at the start of the
1984 branch. This is handled by branch_extra. */
1985
1986 case '|':
1987 length += 3 + branch_extra;
1988 continue;
1989
1990 /* A character class uses 33 characters. Don't worry about character types
1991 that aren't allowed in classes - they'll get picked up during the compile.
1992 A character class that contains only one character uses 2 or 3 bytes,
1993 depending on whether it is negated or not. Notice this where we can. */
1994
1995 case '[':
1996 class_charcount = 0;
1997 if (*(++ptr) == '^') ptr++;
1998 do
1999 {
2000 if (*ptr == '\\')
2001 {
2002 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
2003 &compile_block);
2004 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2005 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2006 }
2007 else class_charcount++;
2008 ptr++;
2009 }
2010 while (*ptr != 0 && *ptr != ']');
2011
2012 /* Repeats for negated single chars are handled by the general code */
2013
2014 if (class_charcount == 1) length += 3; else
2015 {
2016 length += 33;
2017
2018 /* A repeat needs either 1 or 5 bytes. */
2019
2020 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2021 {
2022 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2023 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2024 if ((min == 0 && (max == 1 || max == -1)) ||
2025 (min == 1 && max == -1))
2026 length++;
2027 else length += 5;
2028 if (ptr[1] == '?') ptr++;
2029 }
2030 }
2031 continue;
2032
2033 /* Brackets may be genuine groups or special things */
2034
2035 case '(':
2036 branch_newextra = 0;
2037
2038 /* Handle special forms of bracket, which all start (? */
2039
2040 if (ptr[1] == '?')
2041 {
2042 int set, unset;
2043 int *optset;
2044
2045 switch (c = ptr[2])
2046 {
2047 /* Skip over comments entirely */
2048 case '#':
2049 ptr += 3;
2050 while (*ptr != 0 && *ptr != ')') ptr++;
2051 if (*ptr == 0)
2052 {
2053 *errorptr = ERR18;
2054 goto PCRE_ERROR_RETURN;
2055 }
2056 continue;
2057
2058 /* Non-referencing groups and lookaheads just move the pointer on, and
2059 then behave like a non-special bracket, except that they don't increment
2060 the count of extracting brackets. Ditto for the "once only" bracket,
2061 which is in Perl from version 5.005. */
2062
2063 case ':':
2064 case '=':
2065 case '!':
2066 case '>':
2067 ptr += 2;
2068 break;
2069
2070 /* Lookbehinds are in Perl from version 5.005 */
2071
2072 case '<':
2073 if (ptr[3] == '=' || ptr[3] == '!')
2074 {
2075 ptr += 3;
2076 branch_newextra = 3;
2077 length += 3; /* For the first branch */
2078 break;
2079 }
2080 *errorptr = ERR24;
2081 goto PCRE_ERROR_RETURN;
2082
2083 /* Conditionals are in Perl from version 5.005. The bracket must either
2084 be followed by a number (for bracket reference) or by an assertion
2085 group. */
2086
2087 case '(':
2088 if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
2089 {
2090 ptr += 4;
2091 length += 2;
2092 while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
2093 if (*ptr != ')')
2094 {
2095 *errorptr = ERR26;
2096 goto PCRE_ERROR_RETURN;
2097 }
2098 }
2099 else /* An assertion must follow */
2100 {
2101 ptr++; /* Can treat like ':' as far as spacing is concerned */
2102
2103 if (ptr[2] != '?' || strchr("=!<", ptr[3]) == NULL)
2104 {
2105 ptr += 2; /* To get right offset in message */
2106 *errorptr = ERR28;
2107 goto PCRE_ERROR_RETURN;
2108 }
2109 }
2110 break;
2111
2112 /* Else loop checking valid options until ) is met. Anything else is an
2113 error. If we are without any brackets, i.e. at top level, the settings
2114 act as if specified in the options, so massage the options immediately.
2115 This is for backward compatibility with Perl 5.004. */
2116
2117 default:
2118 set = unset = 0;
2119 optset = &set;
2120 ptr += 2;
2121
2122 for (;; ptr++)
2123 {
2124 c = *ptr;
2125 switch (c)
2126 {
2127 case 'i':
2128 *optset |= PCRE_CASELESS;
2129 continue;
2130
2131 case 'm':
2132 *optset |= PCRE_MULTILINE;
2133 continue;
2134
2135 case 's':
2136 *optset |= PCRE_DOTALL;
2137 continue;
2138
2139 case 'x':
2140 *optset |= PCRE_EXTENDED;
2141 continue;
2142
2143 case 'X':
2144 *optset |= PCRE_EXTRA;
2145 continue;
2146
2147 case 'U':
2148 *optset |= PCRE_UNGREEDY;
2149 continue;
2150
2151 case '-':
2152 optset = &unset;
2153 continue;
2154
2155 /* A termination by ')' indicates an options-setting-only item;
2156 this is global at top level; otherwise nothing is done here and
2157 it is handled during the compiling process on a per-bracket-group
2158 basis. */
2159
2160 case ')':
2161 if (brastackptr == 0)
2162 {
2163 options = (options | set) & (~unset);
2164 set = unset = 0; /* To save length */
2165 }
2166 /* Fall through */
2167
2168 /* A termination by ':' indicates the start of a nested group with
2169 the given options set. This is again handled at compile time, but
2170 we must allow for compiled space if any of the ims options are
2171 set. We also have to allow for resetting space at the end of
2172 the group, which is why 4 is added to the length and not just 2.
2173 If there are several changes of options within the same group, this
2174 will lead to an over-estimate on the length, but this shouldn't
2175 matter very much. We also have to allow for resetting options at
2176 the start of any alternations, which we do by setting
2177 branch_newextra to 2. */
2178
2179 case ':':
2180 if (((set|unset) & PCRE_IMS) != 0)
2181 {
2182 length += 4;
2183 branch_newextra = 2;
2184 }
2185 goto END_OPTIONS;
2186
2187 /* Unrecognized option character */
2188
2189 default:
2190 *errorptr = ERR12;
2191 goto PCRE_ERROR_RETURN;
2192 }
2193 }
2194
2195 /* If we hit a closing bracket, that's it - this is a freestanding
2196 option-setting. We need to ensure that branch_extra is updated if
2197 necessary. The only values branch_newextra can have here are 0 or 2.
2198 If the value is 2, then branch_extra must either be 2 or 5, depending
2199 on whether this is a lookbehind group or not. */
2200
2201 END_OPTIONS:
2202 if (c == ')')
2203 {
2204 if (branch_newextra == 2 && (branch_extra == 0 || branch_extra == 3))
2205 branch_extra += branch_newextra;
2206 continue;
2207 }
2208
2209 /* If options were terminated by ':' control comes here. Fall through
2210 to handle the group below. */
2211 }
2212 }
2213
2214 /* Extracting brackets must be counted so we can process escapes in a
2215 Perlish way. */
2216
2217 else bracount++;
2218
2219 /* Non-special forms of bracket. Save length for computing whole length
2220 at end if there's a repeat that requires duplication of the group. Also
2221 save the current value of branch_extra, and start the new group with
2222 the new value. If non-zero, this will either be 2 for a (?imsx: group, or 3
2223 for a lookbehind assertion. */
2224
2225 if (brastackptr >= sizeof(brastack)/sizeof(int))
2226 {
2227 *errorptr = ERR19;
2228 goto PCRE_ERROR_RETURN;
2229 }
2230
2231 bralenstack[brastackptr] = branch_extra;
2232 branch_extra = branch_newextra;
2233
2234 brastack[brastackptr++] = length;
2235 length += 3;
2236 continue;
2237
2238 /* Handle ket. Look for subsequent max/min; for certain sets of values we
2239 have to replicate this bracket up to that many times. If brastackptr is
2240 0 this is an unmatched bracket which will generate an error, but take care
2241 not to try to access brastack[-1] when computing the length and restoring
2242 the branch_extra value. */
2243
2244 case ')':
2245 length += 3;
2246 {
2247 int minval = 1;
2248 int maxval = 1;
2249 int duplength;
2250
2251 if (brastackptr > 0)
2252 {
2253 duplength = length - brastack[--brastackptr];
2254 branch_extra = bralenstack[brastackptr];
2255 }
2256 else duplength = 0;
2257
2258 /* Leave ptr at the final char; for read_repeat_counts this happens
2259 automatically; for the others we need an increment. */
2260
2261 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
2262 {
2263 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr,
2264 &compile_block);
2265 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2266 }
2267 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2268 else if (c == '+') { maxval = -1; ptr++; }
2269 else if (c == '?') { minval = 0; ptr++; }
2270
2271 /* If there is a minimum > 1 we have to replicate up to minval-1 times;
2272 if there is a limited maximum we have to replicate up to maxval-1 times
2273 and allow for a BRAZERO item before each optional copy, as we also have
2274 to do before the first copy if the minimum is zero. */
2275
2276 if (minval == 0) length++;
2277 else if (minval > 1) length += (minval - 1) * duplength;
2278 if (maxval > minval) length += (maxval - minval) * (duplength + 1);
2279 }
2280 continue;
2281
2282 /* Non-special character. For a run of such characters the length required
2283 is the number of characters + 2, except that the maximum run length is 255.
2284 We won't get a skipped space or a non-data escape or the start of a #
2285 comment as the first character, so the length can't be zero. */
2286
2287 NORMAL_CHAR:
2288 default:
2289 length += 2;
2290 runlength = 0;
2291 do
2292 {
2293 if ((options & PCRE_EXTENDED) != 0)
2294 {
2295 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2296 if (c == '#')
2297 {
2298 while ((c = *(++ptr)) != 0 && c != '\n');
2299 continue;
2300 }
2301 }
2302
2303 /* Backslash may introduce a data char or a metacharacter; stop the
2304 string before the latter. */
2305
2306 if (c == '\\')
2307 {
2308 const uschar *saveptr = ptr;
2309 c = check_escape(&ptr, errorptr, bracount, options, FALSE,
2310 &compile_block);
2311 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2312 if (c < 0) { ptr = saveptr; break; }
2313 }
2314
2315 /* Ordinary character or single-char escape */
2316
2317 runlength++;
2318 }
2319
2320 /* This "while" is the end of the "do" above. */
2321
2322 while (runlength < 255 &&
2323 (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
2324
2325 ptr--;
2326 length += runlength;
2327 continue;
2328 }
2329 }
2330
2331 length += 4; /* For final KET and END */
2332
2333 if (length > 65539)
2334 {
2335 *errorptr = ERR20;
2336 return NULL;
2337 }
2338
2339 /* Compute the size of data block needed and get it, either from malloc or
2340 externally provided function. We specify "code[0]" in the offsetof() expression
2341 rather than just "code", because it has been reported that one broken compiler
2342 fails on "code" because it is also an independent variable. It should make no
2343 difference to the value of the offsetof(). */
2344
2345 size = length + offsetof(real_pcre, code[0]);
2346 re = (real_pcre *)(pcre_malloc)(size);
2347
2348 if (re == NULL)
2349 {
2350 *errorptr = ERR21;
2351 return NULL;
2352 }
2353
2354 /* Put in the magic number and the options. */
2355
2356 re->magic_number = MAGIC_NUMBER;
2357 re->options = options;
2358 re->tables = tables;
2359
2360 /* Set up a starting, non-extracting bracket, then compile the expression. On
2361 error, *errorptr will be set non-NULL, so we don't need to look at the result
2362 of the function here. */
2363
2364 ptr = (const uschar *)pattern;
2365 code = re->code;
2366 *code = OP_BRA;
2367 bracount = 0;
2368 (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,
2369 &compile_block);
2370 re->top_bracket = bracount;
2371 re->top_backref = top_backref;
2372
2373 /* If not reached end of pattern on success, there's an excess bracket. */
2374
2375 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
2376
2377 /* Fill in the terminating state and check for disastrous overflow, but
2378 if debugging, leave the test till after things are printed out. */
2379
2380 *code++ = OP_END;
2381
2382 #ifndef DEBUG
2383 if (code - re->code > length) *errorptr = ERR23;
2384 #endif
2385
2386 /* Give an error if there's back reference to a non-existent capturing
2387 subpattern. */
2388
2389 if (top_backref > re->top_bracket) *errorptr = ERR15;
2390
2391 /* Failed to compile */
2392
2393 if (*errorptr != NULL)
2394 {
2395 (pcre_free)(re);
2396 PCRE_ERROR_RETURN:
2397 *erroroffset = ptr - (const uschar *)pattern;
2398 return NULL;
2399 }
2400
2401 /* If the anchored option was not passed, set flag if we can determine that it
2402 is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
2403 we can determine what the first character has to be, because that speeds up
2404 unanchored matches no end. In the case of multiline matches, an alternative is
2405 to set the PCRE_STARTLINE flag if all branches start with ^. */
2406
2407 if ((options & PCRE_ANCHORED) == 0)
2408 {
2409 int temp_options = options;
2410 if (is_anchored(re->code, &temp_options))
2411 re->options |= PCRE_ANCHORED;
2412 else
2413 {
2414 int ch = find_firstchar(re->code, &temp_options);
2415 if (ch >= 0)
2416 {
2417 re->first_char = ch;
2418 re->options |= PCRE_FIRSTSET;
2419 }
2420 else if (is_startline(re->code))
2421 re->options |= PCRE_STARTLINE;
2422 }
2423 }
2424
2425 /* Print out the compiled data for debugging */
2426
2427 #ifdef DEBUG
2428
2429 printf("Length = %d top_bracket = %d top_backref = %d\n",
2430 length, re->top_bracket, re->top_backref);
2431
2432 if (re->options != 0)
2433 {
2434 printf("%s%s%s%s%s%s%s%s\n",
2435 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2436 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2437 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2438 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2439 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
2440 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
2441 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
2442 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
2443 }
2444
2445 if ((re->options & PCRE_FIRSTSET) != 0)
2446 {
2447 if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
2448 else printf("First char = \\x%02x\n", re->first_char);
2449 }
2450
2451 code_end = code;
2452 code_base = code = re->code;
2453
2454 while (code < code_end)
2455 {
2456 int charlength;
2457
2458 printf("%3d ", code - code_base);
2459
2460 if (*code >= OP_BRA)
2461 {
2462 printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
2463 code += 2;
2464 }
2465
2466 else switch(*code)
2467 {
2468 case OP_OPT:
2469 printf(" %.2x %s", code[1], OP_names[*code]);
2470 code++;
2471 break;
2472
2473 case OP_COND:
2474 printf("%3d Cond", (code[1] << 8) + code[2]);
2475 code += 2;
2476 break;
2477
2478 case OP_CREF:
2479 printf(" %.2d %s", code[1], OP_names[*code]);
2480 code++;
2481 break;
2482
2483 case OP_CHARS:
2484 charlength = *(++code);
2485 printf("%3d ", charlength);
2486 while (charlength-- > 0)
2487 if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2488 break;
2489
2490 case OP_KETRMAX:
2491 case OP_KETRMIN:
2492 case OP_ALT:
2493 case OP_KET:
2494 case OP_ASSERT:
2495 case OP_ASSERT_NOT:
2496 case OP_ASSERTBACK:
2497 case OP_ASSERTBACK_NOT:
2498 case OP_ONCE:
2499 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2500 code += 2;
2501 break;
2502
2503 case OP_REVERSE:
2504 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2505 code += 2;
2506 break;
2507
2508 case OP_STAR:
2509 case OP_MINSTAR:
2510 case OP_PLUS:
2511 case OP_MINPLUS:
2512 case OP_QUERY:
2513 case OP_MINQUERY:
2514 case OP_TYPESTAR:
2515 case OP_TYPEMINSTAR:
2516 case OP_TYPEPLUS:
2517 case OP_TYPEMINPLUS:
2518 case OP_TYPEQUERY:
2519 case OP_TYPEMINQUERY:
2520 if (*code >= OP_TYPESTAR)
2521 printf(" %s", OP_names[code[1]]);
2522 else if (isprint(c = code[1])) printf(" %c", c);
2523 else printf(" \\x%02x", c);
2524 printf("%s", OP_names[*code++]);
2525 break;
2526
2527 case OP_EXACT:
2528 case OP_UPTO:
2529 case OP_MINUPTO:
2530 if (isprint(c = code[3])) printf(" %c{", c);
2531 else printf(" \\x%02x{", c);
2532 if (*code != OP_EXACT) printf("0,");
2533 printf("%d}", (code[1] << 8) + code[2]);
2534 if (*code == OP_MINUPTO) printf("?");
2535 code += 3;
2536 break;
2537
2538 case OP_TYPEEXACT:
2539 case OP_TYPEUPTO:
2540 case OP_TYPEMINUPTO:
2541 printf(" %s{", OP_names[code[3]]);
2542 if (*code != OP_TYPEEXACT) printf(",");
2543 printf("%d}", (code[1] << 8) + code[2]);
2544 if (*code == OP_TYPEMINUPTO) printf("?");
2545 code += 3;
2546 break;
2547
2548 case OP_NOT:
2549 if (isprint(c = *(++code))) printf(" [^%c]", c);
2550 else printf(" [^\\x%02x]", c);
2551 break;
2552
2553 case OP_NOTSTAR:
2554 case OP_NOTMINSTAR:
2555 case OP_NOTPLUS:
2556 case OP_NOTMINPLUS:
2557 case OP_NOTQUERY:
2558 case OP_NOTMINQUERY:
2559 if (isprint(c = code[1])) printf(" [^%c]", c);
2560 else printf(" [^\\x%02x]", c);
2561 printf("%s", OP_names[*code++]);
2562 break;
2563
2564 case OP_NOTEXACT:
2565 case OP_NOTUPTO:
2566 case OP_NOTMINUPTO:
2567 if (isprint(c = code[3])) printf(" [^%c]{", c);
2568 else printf(" [^\\x%02x]{", c);
2569 if (*code != OP_NOTEXACT) printf(",");
2570 printf("%d}", (code[1] << 8) + code[2]);
2571 if (*code == OP_NOTMINUPTO) printf("?");
2572 code += 3;
2573 break;
2574
2575 case OP_REF:
2576 printf(" \\%d", *(++code));
2577 code ++;
2578 goto CLASS_REF_REPEAT;
2579
2580 case OP_CLASS:
2581 {
2582 int i, min, max;
2583 code++;
2584 printf(" [");
2585
2586 for (i = 0; i < 256; i++)
2587 {
2588 if ((code[i/8] & (1 << (i&7))) != 0)
2589 {
2590 int j;
2591 for (j = i+1; j < 256; j++)
2592 if ((code[j/8] & (1 << (j&7))) == 0) break;
2593 if (i == '-' || i == ']') printf("\\");
2594 if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2595 if (--j > i)
2596 {
2597 printf("-");
2598 if (j == '-' || j == ']') printf("\\");
2599 if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2600 }
2601 i = j;
2602 }
2603 }
2604 printf("]");
2605 code += 32;
2606
2607 CLASS_REF_REPEAT:
2608
2609 switch(*code)
2610 {
2611 case OP_CRSTAR:
2612 case OP_CRMINSTAR:
2613 case OP_CRPLUS:
2614 case OP_CRMINPLUS:
2615 case OP_CRQUERY:
2616 case OP_CRMINQUERY:
2617 printf("%s", OP_names[*code]);
2618 break;
2619
2620 case OP_CRRANGE:
2621 case OP_CRMINRANGE:
2622 min = (code[1] << 8) + code[2];
2623 max = (code[3] << 8) + code[4];
2624 if (max == 0) printf("{%d,}", min);
2625 else printf("{%d,%d}", min, max);
2626 if (*code == OP_CRMINRANGE) printf("?");
2627 code += 4;
2628 break;
2629
2630 default:
2631 code--;
2632 }
2633 }
2634 break;
2635
2636 /* Anything else is just a one-node item */
2637
2638 default:
2639 printf(" %s", OP_names[*code]);
2640 break;
2641 }
2642
2643 code++;
2644 printf("\n");
2645 }
2646 printf("------------------------------------------------------------------\n");
2647
2648 /* This check is done here in the debugging case so that the code that
2649 was compiled can be seen. */
2650
2651 if (code - re->code > length)
2652 {
2653 *errorptr = ERR23;
2654 (pcre_free)(re);
2655 *erroroffset = ptr - (uschar *)pattern;
2656 return NULL;
2657 }
2658 #endif
2659
2660 return (pcre *)re;
2661 }
2662
2663
2664
2665 /*************************************************
2666 * Match a back-reference *
2667 *************************************************/
2668
2669 /* If a back reference hasn't been set, the length that is passed is greater
2670 than the number of characters left in the string, so the match fails.
2671
2672 Arguments:
2673 offset index into the offset vector
2674 eptr points into the subject
2675 length length to be matched
2676 md points to match data block
2677 ims the ims flags
2678
2679 Returns: TRUE if matched
2680 */
2681
2682 static BOOL
2683 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
2684 int ims)
2685 {
2686 const uschar *p = md->start_subject + md->offset_vector[offset];
2687
2688 #ifdef DEBUG
2689 if (eptr >= md->end_subject)
2690 printf("matching subject <null>");
2691 else
2692 {
2693 printf("matching subject ");
2694 pchars(eptr, length, TRUE, md);
2695 }
2696 printf(" against backref ");
2697 pchars(p, length, FALSE, md);
2698 printf("\n");
2699 #endif
2700
2701 /* Always fail if not enough characters left */
2702
2703 if (length > md->end_subject - eptr) return FALSE;
2704
2705 /* Separate the caselesss case for speed */
2706
2707 if ((ims & PCRE_CASELESS) != 0)
2708 {
2709 while (length-- > 0)
2710 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
2711 }
2712 else
2713 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
2714
2715 return TRUE;
2716 }
2717
2718
2719
2720 /*************************************************
2721 * Match from current position *
2722 *************************************************/
2723
2724 /* On entry ecode points to the first opcode, and eptr to the first character
2725 in the subject string, while eptrb holds the value of eptr at the start of the
2726 last bracketed group - used for breaking infinite loops matching zero-length
2727 strings.
2728
2729 Arguments:
2730 eptr pointer in subject
2731 ecode position in code
2732 offset_top current top pointer
2733 md pointer to "static" info for the match
2734 ims current /i, /m, and /s options
2735 condassert TRUE if called to check a condition assertion
2736 eptrb eptr at start of last bracket
2737
2738 Returns: TRUE if matched
2739 */
2740
2741 static BOOL
2742 match(register const uschar *eptr, register const uschar *ecode,
2743 int offset_top, match_data *md, int ims, BOOL condassert, const uschar *eptrb)
2744 {
2745 int original_ims = ims; /* Save for resetting on ')' */
2746
2747 for (;;)
2748 {
2749 int op = (int)*ecode;
2750 int min, max, ctype;
2751 register int i;
2752 register int c;
2753 BOOL minimize = FALSE;
2754
2755 /* Opening capturing bracket. If there is space in the offset vector, save
2756 the current subject position in the working slot at the top of the vector. We
2757 mustn't change the current values of the data slot, because they may be set
2758 from a previous iteration of this group, and be referred to by a reference
2759 inside the group.
2760
2761 If the bracket fails to match, we need to restore this value and also the
2762 values of the final offsets, in case they were set by a previous iteration of
2763 the same bracket.
2764
2765 If there isn't enough space in the offset vector, treat this as if it were a
2766 non-capturing bracket. Don't worry about setting the flag for the error case
2767 here; that is handled in the code for KET. */
2768
2769 if (op > OP_BRA)
2770 {
2771 int number = op - OP_BRA;
2772 int offset = number << 1;
2773
2774 DPRINTF(("start bracket %d\n", number));
2775
2776 if (offset < md->offset_max)
2777 {
2778 int save_offset1 = md->offset_vector[offset];
2779 int save_offset2 = md->offset_vector[offset+1];
2780 int save_offset3 = md->offset_vector[md->offset_end - number];
2781
2782 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
2783 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
2784
2785 do
2786 {
2787 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
2788 ecode += (ecode[1] << 8) + ecode[2];
2789 }
2790 while (*ecode == OP_ALT);
2791
2792 DPRINTF(("bracket %d failed\n", number));
2793
2794 md->offset_vector[offset] = save_offset1;
2795 md->offset_vector[offset+1] = save_offset2;
2796 md->offset_vector[md->offset_end - number] = save_offset3;
2797 return FALSE;
2798 }
2799
2800 /* Insufficient room for saving captured contents */
2801
2802 else op = OP_BRA;
2803 }
2804
2805 /* Other types of node can be handled by a switch */
2806
2807 switch(op)
2808 {
2809 case OP_BRA: /* Non-capturing bracket: optimized */
2810 DPRINTF(("start bracket 0\n"));
2811 do
2812 {
2813 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
2814 ecode += (ecode[1] << 8) + ecode[2];
2815 }
2816 while (*ecode == OP_ALT);
2817 DPRINTF(("bracket 0 failed\n"));
2818 return FALSE;
2819
2820 /* Conditional group: compilation checked that there are no more than
2821 two branches. If the condition is false, skipping the first branch takes us
2822 past the end if there is only one branch, but that's OK because that is
2823 exactly what going to the ket would do. */
2824
2825 case OP_COND:
2826 if (ecode[3] == OP_CREF) /* Condition is extraction test */
2827 {
2828 int offset = ecode[4] << 1; /* Doubled reference number */
2829 return match(eptr,
2830 ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
2831 5 : 3 + (ecode[1] << 8) + ecode[2]),
2832 offset_top, md, ims, FALSE, eptr);
2833 }
2834
2835 /* The condition is an assertion. Call match() to evaluate it - setting
2836 the final argument TRUE causes it to stop at the end of an assertion. */
2837
2838 else
2839 {
2840 if (match(eptr, ecode+3, offset_top, md, ims, TRUE, NULL))
2841 {
2842 ecode += 3 + (ecode[4] << 8) + ecode[5];
2843 while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
2844 }
2845 else ecode += (ecode[1] << 8) + ecode[2];
2846 return match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr);
2847 }
2848 /* Control never reaches here */
2849
2850 /* Skip over conditional reference data if encountered (should not be) */
2851
2852 case OP_CREF:
2853 ecode += 2;
2854 break;
2855
2856 /* End of the pattern */
2857
2858 case OP_END:
2859 md->end_match_ptr = eptr; /* Record where we ended */
2860 md->end_offset_top = offset_top; /* and how many extracts were taken */
2861 return TRUE;
2862
2863 /* Change option settings */
2864
2865 case OP_OPT:
2866 ims = ecode[1];
2867 ecode += 2;
2868 DPRINTF(("ims set to %02x\n", ims));
2869 break;
2870
2871 /* Assertion brackets. Check the alternative branches in turn - the
2872 matching won't pass the KET for an assertion. If any one branch matches,
2873 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
2874 start of each branch to move the current point backwards, so the code at
2875 this level is identical to the lookahead case. */
2876
2877 case OP_ASSERT:
2878 case OP_ASSERTBACK:
2879 do
2880 {
2881 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) break;
2882 ecode += (ecode[1] << 8) + ecode[2];
2883 }
2884 while (*ecode == OP_ALT);
2885 if (*ecode == OP_KET) return FALSE;
2886
2887 /* If checking an assertion for a condition, return TRUE. */
2888
2889 if (condassert) return TRUE;
2890
2891 /* Continue from after the assertion, updating the offsets high water
2892 mark, since extracts may have been taken during the assertion. */
2893
2894 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2895 ecode += 3;
2896 offset_top = md->end_offset_top;
2897 continue;
2898
2899 /* Negative assertion: all branches must fail to match */
2900
2901 case OP_ASSERT_NOT:
2902 case OP_ASSERTBACK_NOT:
2903 do
2904 {
2905 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) return FALSE;
2906 ecode += (ecode[1] << 8) + ecode[2];
2907 }
2908 while (*ecode == OP_ALT);
2909
2910 if (condassert) return TRUE;
2911 ecode += 3;
2912 continue;
2913
2914 /* Move the subject pointer back. This occurs only at the start of
2915 each branch of a lookbehind assertion. If we are too close to the start to
2916 move back, this match function fails. */
2917
2918 case OP_REVERSE:
2919 eptr -= (ecode[1] << 8) + ecode[2];
2920 if (eptr < md->start_subject) return FALSE;
2921 ecode += 3;
2922 break;
2923
2924
2925 /* "Once" brackets are like assertion brackets except that after a match,
2926 the point in the subject string is not moved back. Thus there can never be
2927 a move back into the brackets. Check the alternative branches in turn - the
2928 matching won't pass the KET for this kind of subpattern. If any one branch
2929 matches, we carry on as at the end of a normal bracket, leaving the subject
2930 pointer. */
2931
2932 case OP_ONCE:
2933 {
2934 const uschar *prev = ecode;
2935
2936 do
2937 {
2938 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) break;
2939 ecode += (ecode[1] << 8) + ecode[2];
2940 }
2941 while (*ecode == OP_ALT);
2942
2943 /* If hit the end of the group (which could be repeated), fail */
2944
2945 if (*ecode != OP_ONCE && *ecode != OP_ALT) return FALSE;
2946
2947 /* Continue as from after the assertion, updating the offsets high water
2948 mark, since extracts may have been taken. */
2949
2950 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2951
2952 offset_top = md->end_offset_top;
2953 eptr = md->end_match_ptr;
2954
2955 /* For a non-repeating ket, just continue at this level. This also
2956 happens for a repeating ket if no characters were matched in the group.
2957 This is the forcible breaking of infinite loops as implemented in Perl
2958 5.005. If there is an options reset, it will get obeyed in the normal
2959 course of events. */
2960
2961 if (*ecode == OP_KET || eptr == eptrb)
2962 {
2963 ecode += 3;
2964 break;
2965 }
2966
2967 /* The repeating kets try the rest of the pattern or restart from the
2968 preceding bracket, in the appropriate order. We need to reset any options
2969 that changed within the bracket before re-running it, so check the next
2970 opcode. */
2971
2972 if (ecode[3] == OP_OPT)
2973 {
2974 ims = (ims & ~PCRE_IMS) | ecode[4];
2975 DPRINTF(("ims set to %02x at group repeat\n", ims));
2976 }
2977
2978 if (*ecode == OP_KETRMIN)
2979 {
2980 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||
2981 match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;
2982 }
2983 else /* OP_KETRMAX */
2984 {
2985 if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||
2986 match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
2987 }
2988 }
2989 return FALSE;
2990
2991 /* An alternation is the end of a branch; scan along to find the end of the
2992 bracketed group and go to there. */
2993
2994 case OP_ALT:
2995 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2996 break;
2997
2998 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
2999 that it may occur zero times. It may repeat infinitely, or not at all -
3000 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3001 repeat limits are compiled as a number of copies, with the optional ones
3002 preceded by BRAZERO or BRAMINZERO. */
3003
3004 case OP_BRAZERO:
3005 {
3006 const uschar *next = ecode+1;
3007 if (match(eptr, next, offset_top, md, ims, FALSE, eptr)) return TRUE;
3008 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3009 ecode = next + 3;
3010 }
3011 break;
3012
3013 case OP_BRAMINZERO:
3014 {
3015 const uschar *next = ecode+1;
3016 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3017 if (match(eptr, next+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3018 ecode++;
3019 }
3020 break;
3021
3022 /* End of a group, repeated or non-repeating. If we are at the end of
3023 an assertion "group", stop matching and return TRUE, but record the
3024 current high water mark for use by positive assertions. Do this also
3025 for the "once" (not-backup up) groups. */
3026
3027 case OP_KET:
3028 case OP_KETRMIN:
3029 case OP_KETRMAX:
3030 {
3031 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3032
3033 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
3034 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
3035 *prev == OP_ONCE)
3036 {
3037 md->end_match_ptr = eptr; /* For ONCE */
3038 md->end_offset_top = offset_top;
3039 return TRUE;
3040 }
3041
3042 /* In all other cases except a conditional group we have to check the
3043 group number back at the start and if necessary complete handling an
3044 extraction by setting the offsets and bumping the high water mark. */
3045
3046 if (*prev != OP_COND)
3047 {
3048 int number = *prev - OP_BRA;
3049 int offset = number << 1;
3050
3051 DPRINTF(("end bracket %d\n", number));
3052
3053 if (number > 0)
3054 {
3055 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
3056 {
3057 md->offset_vector[offset] =
3058 md->offset_vector[md->offset_end - number];
3059 md->offset_vector[offset+1] = eptr - md->start_subject;
3060 if (offset_top <= offset) offset_top = offset + 2;
3061 }
3062 }
3063 }
3064
3065 /* Reset the value of the ims flags, in case they got changed during
3066 the group. */
3067
3068 ims = original_ims;
3069 DPRINTF(("ims reset to %02x\n", ims));
3070
3071 /* For a non-repeating ket, just continue at this level. This also
3072 happens for a repeating ket if no characters were matched in the group.
3073 This is the forcible breaking of infinite loops as implemented in Perl
3074 5.005. If there is an options reset, it will get obeyed in the normal
3075 course of events. */
3076
3077 if (*ecode == OP_KET || eptr == eptrb)
3078 {
3079 ecode += 3;
3080 break;
3081 }
3082
3083 /* The repeating kets try the rest of the pattern or restart from the
3084 preceding bracket, in the appropriate order. */
3085
3086 if (*ecode == OP_KETRMIN)
3087 {
3088 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||
3089 match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;
3090 }
3091 else /* OP_KETRMAX */
3092 {
3093 if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||
3094 match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3095 }
3096 }
3097 return FALSE;
3098
3099 /* Start of subject unless notbol, or after internal newline if multiline */
3100
3101 case OP_CIRC:
3102 if (md->notbol && eptr == md->start_subject) return FALSE;
3103 if ((ims & PCRE_MULTILINE) != 0)
3104 {
3105 if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;
3106 ecode++;
3107 break;
3108 }
3109 /* ... else fall through */
3110
3111 /* Start of subject assertion */
3112
3113 case OP_SOD:
3114 if (eptr != md->start_subject) return FALSE;
3115 ecode++;
3116 break;
3117
3118 /* Assert before internal newline if multiline, or before a terminating
3119 newline unless endonly is set, else end of subject unless noteol is set. */
3120
3121 case OP_DOLL:
3122 if ((ims & PCRE_MULTILINE) != 0)
3123 {
3124 if (eptr < md->end_subject) { if (*eptr != '\n') return FALSE; }
3125 else { if (md->noteol) return FALSE; }
3126 ecode++;
3127 break;
3128 }
3129 else
3130 {
3131 if (md->noteol) return FALSE;
3132 if (!md->endonly)
3133 {
3134 if (eptr < md->end_subject - 1 ||
3135 (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3136
3137 ecode++;
3138 break;
3139 }
3140 }
3141 /* ... else fall through */
3142
3143 /* End of subject assertion (\z) */
3144
3145 case OP_EOD:
3146 if (eptr < md->end_subject) return FALSE;
3147 ecode++;
3148 break;
3149
3150 /* End of subject or ending \n assertion (\Z) */
3151
3152 case OP_EODN:
3153 if (eptr < md->end_subject - 1 ||
3154 (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3155 ecode++;
3156 break;
3157
3158 /* Word boundary assertions */
3159
3160 case OP_NOT_WORD_BOUNDARY:
3161 case OP_WORD_BOUNDARY:
3162 {
3163 BOOL prev_is_word = (eptr != md->start_subject) &&
3164 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
3165 BOOL cur_is_word = (eptr < md->end_subject) &&
3166 ((md->ctypes[*eptr] & ctype_word) != 0);
3167 if ((*ecode++ == OP_WORD_BOUNDARY)?
3168 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3169 return FALSE;
3170 }
3171 break;
3172
3173 /* Match a single character type; inline for speed */
3174
3175 case OP_ANY:
3176 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == '\n')
3177 return FALSE;
3178 if (eptr++ >= md->end_subject) return FALSE;
3179 ecode++;
3180 break;
3181
3182 case OP_NOT_DIGIT:
3183 if (eptr >= md->end_subject ||
3184 (md->ctypes[*eptr++] & ctype_digit) != 0)
3185 return FALSE;
3186 ecode++;
3187 break;
3188
3189 case OP_DIGIT:
3190 if (eptr >= md->end_subject ||
3191 (md->ctypes[*eptr++] & ctype_digit) == 0)
3192 return FALSE;
3193 ecode++;
3194 break;
3195
3196 case OP_NOT_WHITESPACE:
3197 if (eptr >= md->end_subject ||
3198 (md->ctypes[*eptr++] & ctype_space) != 0)
3199 return FALSE;
3200 ecode++;
3201 break;
3202
3203 case OP_WHITESPACE:
3204 if (eptr >= md->end_subject ||
3205 (md->ctypes[*eptr++] & ctype_space) == 0)
3206 return FALSE;
3207 ecode++;
3208 break;
3209
3210 case OP_NOT_WORDCHAR:
3211 if (eptr >= md->end_subject ||
3212 (md->ctypes[*eptr++] & ctype_word) != 0)
3213 return FALSE;
3214 ecode++;
3215 break;
3216
3217 case OP_WORDCHAR:
3218 if (eptr >= md->end_subject ||
3219 (md->ctypes[*eptr++] & ctype_word) == 0)
3220 return FALSE;
3221 ecode++;
3222 break;
3223
3224 /* Match a back reference, possibly repeatedly. Look past the end of the
3225 item to see if there is repeat information following. The code is similar
3226 to that for character classes, but repeated for efficiency. Then obey
3227 similar code to character type repeats - written out again for speed.
3228 However, if the referenced string is the empty string, always treat
3229 it as matched, any number of times (otherwise there could be infinite
3230 loops). */
3231
3232 case OP_REF:
3233 {
3234 int length;
3235 int offset = ecode[1] << 1; /* Doubled reference number */
3236 ecode += 2; /* Advance past the item */
3237
3238 /* If the reference is unset, set the length to be longer than the amount
3239 of subject left; this ensures that every attempt at a match fails. We
3240 can't just fail here, because of the possibility of quantifiers with zero
3241 minima. */
3242
3243 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
3244 md->end_subject - eptr + 1 :
3245 md->offset_vector[offset+1] - md->offset_vector[offset];
3246
3247 /* Set up for repetition, or handle the non-repeated case */
3248
3249 switch (*ecode)
3250 {
3251 case OP_CRSTAR:
3252 case OP_CRMINSTAR:
3253 case OP_CRPLUS:
3254 case OP_CRMINPLUS:
3255 case OP_CRQUERY:
3256 case OP_CRMINQUERY:
3257 c = *ecode++ - OP_CRSTAR;
3258 minimize = (c & 1) != 0;
3259 min = rep_min[c]; /* Pick up values from tables; */
3260 max = rep_max[c]; /* zero for max => infinity */
3261 if (max == 0) max = INT_MAX;
3262 break;
3263
3264 case OP_CRRANGE:
3265 case OP_CRMINRANGE:
3266 minimize = (*ecode == OP_CRMINRANGE);
3267 min = (ecode[1] << 8) + ecode[2];
3268 max = (ecode[3] << 8) + ecode[4];
3269 if (max == 0) max = INT_MAX;
3270 ecode += 5;
3271 break;
3272
3273 default: /* No repeat follows */
3274 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
3275 eptr += length;
3276 continue; /* With the main loop */
3277 }
3278
3279 /* If the length of the reference is zero, just continue with the
3280 main loop. */
3281
3282 if (length == 0) continue;
3283
3284 /* First, ensure the minimum number of matches are present. We get back
3285 the length of the reference string explicitly rather than passing the
3286 address of eptr, so that eptr can be a register variable. */
3287
3288 for (i = 1; i <= min; i++)
3289 {
3290 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
3291 eptr += length;
3292 }
3293
3294 /* If min = max, continue at the same level without recursion.
3295 They are not both allowed to be zero. */
3296
3297 if (min == max) continue;
3298
3299 /* If minimizing, keep trying and advancing the pointer */
3300
3301 if (minimize)
3302 {
3303 for (i = min;; i++)
3304 {
3305 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3306 return TRUE;
3307 if (i >= max || !match_ref(offset, eptr, length, md, ims))
3308 return FALSE;
3309 eptr += length;
3310 }
3311 /* Control never gets here */
3312 }
3313
3314 /* If maximizing, find the longest string and work backwards */
3315
3316 else
3317 {
3318 const uschar *pp = eptr;
3319 for (i = min; i < max; i++)
3320 {
3321 if (!match_ref(offset, eptr, length, md, ims)) break;
3322 eptr += length;
3323 }
3324 while (eptr >= pp)
3325 {
3326 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3327 return TRUE;
3328 eptr -= length;
3329 }
3330 return FALSE;
3331 }
3332 }
3333 /* Control never gets here */
3334
3335
3336
3337 /* Match a character class, possibly repeatedly. Look past the end of the
3338 item to see if there is repeat information following. Then obey similar
3339 code to character type repeats - written out again for speed. */
3340
3341 case OP_CLASS:
3342 {
3343 const uschar *data = ecode + 1; /* Save for matching */
3344 ecode += 33; /* Advance past the item */
3345
3346 switch (*ecode)
3347 {
3348 case OP_CRSTAR:
3349 case OP_CRMINSTAR:
3350 case OP_CRPLUS:
3351 case OP_CRMINPLUS:
3352 case OP_CRQUERY:
3353 case OP_CRMINQUERY:
3354 c = *ecode++ - OP_CRSTAR;
3355 minimize = (c & 1) != 0;
3356 min = rep_min[c]; /* Pick up values from tables; */
3357 max = rep_max[c]; /* zero for max => infinity */
3358 if (max == 0) max = INT_MAX;
3359 break;
3360
3361 case OP_CRRANGE:
3362 case OP_CRMINRANGE:
3363 minimize = (*ecode == OP_CRMINRANGE);
3364 min = (ecode[1] << 8) + ecode[2];
3365 max = (ecode[3] << 8) + ecode[4];
3366 if (max == 0) max = INT_MAX;
3367 ecode += 5;
3368 break;
3369
3370 default: /* No repeat follows */
3371 min = max = 1;
3372 break;
3373 }
3374
3375 /* First, ensure the minimum number of matches are present. */
3376
3377 for (i = 1; i <= min; i++)
3378 {
3379 if (eptr >= md->end_subject) return FALSE;
3380 c = *eptr++;
3381 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3382 return FALSE;
3383 }
3384
3385 /* If max == min we can continue with the main loop without the
3386 need to recurse. */
3387
3388 if (min == max) continue;
3389
3390 /* If minimizing, keep testing the rest of the expression and advancing
3391 the pointer while it matches the class. */
3392
3393 if (minimize)
3394 {
3395 for (i = min;; i++)
3396 {
3397 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3398 return TRUE;
3399 if (i >= max || eptr >= md->end_subject) return FALSE;
3400 c = *eptr++;
3401 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3402 return FALSE;
3403 }
3404 /* Control never gets here */
3405 }
3406
3407 /* If maximizing, find the longest possible run, then work backwards. */
3408
3409 else
3410 {
3411 const uschar *pp = eptr;
3412 for (i = min; i < max; eptr++, i++)
3413 {
3414 if (eptr >= md->end_subject) break;
3415 c = *eptr;
3416 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3417 break;
3418 }
3419
3420 while (eptr >= pp)
3421 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3422 return TRUE;
3423 return FALSE;
3424 }
3425 }
3426 /* Control never gets here */
3427
3428 /* Match a run of characters */
3429
3430 case OP_CHARS:
3431 {
3432 register int length = ecode[1];
3433 ecode += 2;
3434
3435 #ifdef DEBUG /* Sigh. Some compilers never learn. */
3436 if (eptr >= md->end_subject)
3437 printf("matching subject <null> against pattern ");
3438 else
3439 {
3440 printf("matching subject ");
3441 pchars(eptr, length, TRUE, md);
3442 printf(" against pattern ");
3443 }
3444 pchars(ecode, length, FALSE, md);
3445 printf("\n");
3446 #endif
3447
3448 if (length > md->end_subject - eptr) return FALSE;
3449 if ((ims & PCRE_CASELESS) != 0)
3450 {
3451 while (length-- > 0)
3452 if (md->lcc[*ecode++] != md->lcc[*eptr++])
3453 return FALSE;
3454 }
3455 else
3456 {
3457 while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
3458 }
3459 }
3460 break;
3461
3462 /* Match a single character repeatedly; different opcodes share code. */
3463
3464 case OP_EXACT:
3465 min = max = (ecode[1] << 8) + ecode[2];
3466 ecode += 3;
3467 goto REPEATCHAR;
3468
3469 case OP_UPTO:
3470 case OP_MINUPTO:
3471 min = 0;
3472 max = (ecode[1] << 8) + ecode[2];
3473 minimize = *ecode == OP_MINUPTO;
3474 ecode += 3;
3475 goto REPEATCHAR;
3476
3477 case OP_STAR:
3478 case OP_MINSTAR:
3479 case OP_PLUS:
3480 case OP_MINPLUS:
3481 case OP_QUERY:
3482 case OP_MINQUERY:
3483 c = *ecode++ - OP_STAR;
3484 minimize = (c & 1) != 0;
3485 min = rep_min[c]; /* Pick up values from tables; */
3486 max = rep_max[c]; /* zero for max => infinity */
3487 if (max == 0) max = INT_MAX;
3488
3489 /* Common code for all repeated single-character matches. We can give
3490 up quickly if there are fewer than the minimum number of characters left in
3491 the subject. */
3492
3493 REPEATCHAR:
3494 if (min > md->end_subject - eptr) return FALSE;
3495 c = *ecode++;
3496
3497 /* The code is duplicated for the caseless and caseful cases, for speed,
3498 since matching characters is likely to be quite common. First, ensure the
3499 minimum number of matches are present. If min = max, continue at the same
3500 level without recursing. Otherwise, if minimizing, keep trying the rest of
3501 the expression and advancing one matching character if failing, up to the
3502 maximum. Alternatively, if maximizing, find the maximum number of
3503 characters and work backwards. */
3504
3505 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
3506 max, eptr));
3507
3508 if ((ims & PCRE_CASELESS) != 0)
3509 {
3510 c = md->lcc[c];
3511 for (i = 1; i <= min; i++)
3512 if (c != md->lcc[*eptr++]) return FALSE;
3513 if (min == max) continue;
3514 if (minimize)
3515 {
3516 for (i = min;; i++)
3517 {
3518 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3519 return TRUE;
3520 if (i >= max || eptr >= md->end_subject ||
3521 c != md->lcc[*eptr++])
3522 return FALSE;
3523 }
3524 /* Control never gets here */
3525 }
3526 else
3527 {
3528 const uschar *pp = eptr;
3529 for (i = min; i < max; i++)
3530 {
3531 if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
3532 eptr++;
3533 }
3534 while (eptr >= pp)
3535 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3536 return TRUE;
3537 return FALSE;
3538 }
3539 /* Control never gets here */
3540 }
3541
3542 /* Caseful comparisons */
3543
3544 else
3545 {
3546 for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
3547 if (min == max) continue;
3548 if (minimize)
3549 {
3550 for (i = min;; i++)
3551 {
3552 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3553 return TRUE;
3554 if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
3555 }
3556 /* Control never gets here */
3557 }
3558 else
3559 {
3560 const uschar *pp = eptr;
3561 for (i = min; i < max; i++)
3562 {
3563 if (eptr >= md->end_subject || c != *eptr) break;
3564 eptr++;
3565 }
3566 while (eptr >= pp)
3567 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3568 return TRUE;
3569 return FALSE;
3570 }
3571 }
3572 /* Control never gets here */
3573
3574 /* Match a negated single character */
3575
3576 case OP_NOT:
3577 if (eptr >= md->end_subject) return FALSE;
3578 ecode++;
3579 if ((ims & PCRE_CASELESS) != 0)
3580 {
3581 if (md->lcc[*ecode++] == md->lcc[*eptr++]) return FALSE;
3582 }
3583 else
3584 {
3585 if (*ecode++ == *eptr++) return FALSE;
3586 }
3587 break;
3588
3589 /* Match a negated single character repeatedly. This is almost a repeat of
3590 the code for a repeated single character, but I haven't found a nice way of
3591 commoning these up that doesn't require a test of the positive/negative
3592 option for each character match. Maybe that wouldn't add very much to the
3593 time taken, but character matching *is* what this is all about... */
3594
3595 case OP_NOTEXACT:
3596 min = max = (ecode[1] << 8) + ecode[2];
3597 ecode += 3;
3598 goto REPEATNOTCHAR;
3599
3600 case OP_NOTUPTO:
3601 case OP_NOTMINUPTO:
3602 min = 0;
3603 max = (ecode[1] << 8) + ecode[2];
3604 minimize = *ecode == OP_NOTMINUPTO;
3605 ecode += 3;
3606 goto REPEATNOTCHAR;
3607
3608 case OP_NOTSTAR:
3609 case OP_NOTMINSTAR:
3610 case OP_NOTPLUS:
3611 case OP_NOTMINPLUS:
3612 case OP_NOTQUERY:
3613 case OP_NOTMINQUERY:
3614 c = *ecode++ - OP_NOTSTAR;
3615 minimize = (c & 1) != 0;
3616 min = rep_min[c]; /* Pick up values from tables; */
3617 max = rep_max[c]; /* zero for max => infinity */
3618 if (max == 0) max = INT_MAX;
3619
3620 /* Common code for all repeated single-character matches. We can give
3621 up quickly if there are fewer than the minimum number of characters left in
3622 the subject. */
3623
3624 REPEATNOTCHAR:
3625 if (min > md->end_subject - eptr) return FALSE;
3626 c = *ecode++;
3627
3628 /* The code is duplicated for the caseless and caseful cases, for speed,
3629 since matching characters is likely to be quite common. First, ensure the
3630 minimum number of matches are present. If min = max, continue at the same
3631 level without recursing. Otherwise, if minimizing, keep trying the rest of
3632 the expression and advancing one matching character if failing, up to the
3633 maximum. Alternatively, if maximizing, find the maximum number of
3634 characters and work backwards. */
3635
3636 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
3637 max, eptr));
3638
3639 if ((ims & PCRE_CASELESS) != 0)
3640 {
3641 c = md->lcc[c];
3642 for (i = 1; i <= min; i++)
3643 if (c == md->lcc[*eptr++]) return FALSE;
3644 if (min == max) continue;
3645 if (minimize)
3646 {
3647 for (i = min;; i++)
3648 {
3649 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3650 return TRUE;
3651 if (i >= max || eptr >= md->end_subject ||
3652 c == md->lcc[*eptr++])
3653 return FALSE;
3654 }
3655 /* Control never gets here */
3656 }
3657 else
3658 {
3659 const uschar *pp = eptr;
3660 for (i = min; i < max; i++)
3661 {
3662 if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
3663 eptr++;
3664 }
3665 while (eptr >= pp)
3666 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3667 return TRUE;
3668 return FALSE;
3669 }
3670 /* Control never gets here */
3671 }
3672
3673 /* Caseful comparisons */
3674
3675 else
3676 {
3677 for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
3678 if (min == max) continue;
3679 if (minimize)
3680 {
3681 for (i = min;; i++)
3682 {
3683 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3684 return TRUE;
3685 if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
3686 }
3687 /* Control never gets here */
3688 }
3689 else
3690 {
3691 const uschar *pp = eptr;
3692 for (i = min; i < max; i++)
3693 {
3694 if (eptr >= md->end_subject || c == *eptr) break;
3695 eptr++;
3696 }
3697 while (eptr >= pp)
3698 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3699 return TRUE;
3700 return FALSE;
3701 }
3702 }
3703 /* Control never gets here */
3704
3705 /* Match a single character type repeatedly; several different opcodes
3706 share code. This is very similar to the code for single characters, but we
3707 repeat it in the interests of efficiency. */
3708
3709 case OP_TYPEEXACT:
3710 min = max = (ecode[1] << 8) + ecode[2];
3711 minimize = TRUE;
3712 ecode += 3;
3713 goto REPEATTYPE;
3714
3715 case OP_TYPEUPTO:
3716 case OP_TYPEMINUPTO:
3717 min = 0;
3718 max = (ecode[1] << 8) + ecode[2];
3719 minimize = *ecode == OP_TYPEMINUPTO;
3720 ecode += 3;
3721 goto REPEATTYPE;
3722
3723 case OP_TYPESTAR:
3724 case OP_TYPEMINSTAR:
3725 case OP_TYPEPLUS:
3726 case OP_TYPEMINPLUS:
3727 case OP_TYPEQUERY:
3728 case OP_TYPEMINQUERY:
3729 c = *ecode++ - OP_TYPESTAR;
3730 minimize = (c & 1) != 0;
3731 min = rep_min[c]; /* Pick up values from tables; */
3732 max = rep_max[c]; /* zero for max => infinity */
3733 if (max == 0) max = INT_MAX;
3734
3735 /* Common code for all repeated single character type matches */
3736
3737 REPEATTYPE:
3738 ctype = *ecode++; /* Code for the character type */
3739
3740 /* First, ensure the minimum number of matches are present. Use inline
3741 code for maximizing the speed, and do the type test once at the start
3742 (i.e. keep it out of the loop). Also test that there are at least the
3743 minimum number of characters before we start. */
3744
3745 if (min > md->end_subject - eptr) return FALSE;
3746 if (min > 0) switch(ctype)
3747 {
3748 case OP_ANY:
3749 if ((ims & PCRE_DOTALL) == 0)
3750 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }
3751 else eptr += min;
3752 break;
3753
3754 case OP_NOT_DIGIT:
3755 for (i = 1; i <= min; i++)
3756 if ((md->ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
3757 break;
3758
3759 case OP_DIGIT:
3760 for (i = 1; i <= min; i++)
3761 if ((md->ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
3762 break;
3763
3764 case OP_NOT_WHITESPACE:
3765 for (i = 1; i <= min; i++)
3766 if ((md->ctypes[*eptr++] & ctype_space) != 0) return FALSE;
3767 break;
3768
3769 case OP_WHITESPACE:
3770 for (i = 1; i <= min; i++)
3771 if ((md->ctypes[*eptr++] & ctype_space) == 0) return FALSE;
3772 break;
3773
3774 case OP_NOT_WORDCHAR:
3775 for (i = 1; i <= min; i++)
3776 if ((md->ctypes[*eptr++] & ctype_word) != 0)
3777 return FALSE;
3778 break;
3779
3780 case OP_WORDCHAR:
3781 for (i = 1; i <= min; i++)
3782 if ((md->ctypes[*eptr++] & ctype_word) == 0)
3783 return FALSE;
3784 break;
3785 }
3786
3787 /* If min = max, continue at the same level without recursing */
3788
3789 if (min == max) continue;
3790
3791 /* If minimizing, we have to test the rest of the pattern before each
3792 subsequent match. */
3793
3794 if (minimize)
3795 {
3796 for (i = min;; i++)
3797 {
3798 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb)) return TRUE;
3799 if (i >= max || eptr >= md->end_subject) return FALSE;
3800
3801 c = *eptr++;
3802 switch(ctype)
3803 {
3804 case OP_ANY:
3805 if ((ims & PCRE_DOTALL) == 0 && c == '\n') return FALSE;
3806 break;
3807
3808 case OP_NOT_DIGIT:
3809 if ((md->ctypes[c] & ctype_digit) != 0) return FALSE;
3810 break;
3811
3812 case OP_DIGIT:
3813 if ((md->ctypes[c] & ctype_digit) == 0) return FALSE;
3814 break;
3815
3816 case OP_NOT_WHITESPACE:
3817 if ((md->ctypes[c] & ctype_space) != 0) return FALSE;
3818 break;
3819
3820 case OP_WHITESPACE:
3821 if ((md->ctypes[c] & ctype_space) == 0) return FALSE;
3822 break;
3823
3824 case OP_NOT_WORDCHAR:
3825 if ((md->ctypes[c] & ctype_word) != 0) return FALSE;
3826 break;
3827
3828 case OP_WORDCHAR:
3829 if ((md->ctypes[c] & ctype_word) == 0) return FALSE;
3830 break;
3831 }
3832 }
3833 /* Control never gets here */
3834 }
3835
3836 /* If maximizing it is worth using inline code for speed, doing the type
3837 test once at the start (i.e. keep it out of the loop). */
3838
3839 else
3840 {
3841 const uschar *pp = eptr;
3842 switch(ctype)
3843 {
3844 case OP_ANY:
3845 if ((ims & PCRE_DOTALL) == 0)
3846 {
3847 for (i = min; i < max; i++)
3848 {
3849 if (eptr >= md->end_subject || *eptr == '\n') break;
3850 eptr++;
3851 }
3852 }
3853 else
3854 {
3855 c = max - min;
3856 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
3857 eptr += c;
3858 }
3859 break;
3860
3861 case OP_NOT_DIGIT:
3862 for (i = min; i < max; i++)
3863 {
3864 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
3865 break;
3866 eptr++;
3867 }
3868 break;
3869
3870 case OP_DIGIT:
3871 for (i = min; i < max; i++)
3872 {
3873 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
3874 break;
3875 eptr++;
3876 }
3877 break;
3878
3879 case OP_NOT_WHITESPACE:
3880 for (i = min; i < max; i++)
3881 {
3882 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
3883 break;
3884 eptr++;
3885 }
3886 break;
3887
3888 case OP_WHITESPACE:
3889 for (i = min; i < max; i++)
3890 {
3891 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
3892 break;
3893 eptr++;
3894 }
3895 break;
3896
3897 case OP_NOT_WORDCHAR:
3898 for (i = min; i < max; i++)
3899 {
3900 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
3901 break;
3902 eptr++;
3903 }
3904 break;
3905
3906 case OP_WORDCHAR:
3907 for (i = min; i < max; i++)
3908 {
3909 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
3910 break;
3911 eptr++;
3912 }
3913 break;
3914 }
3915
3916 while (eptr >= pp)
3917 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3918 return TRUE;
3919 return FALSE;
3920 }
3921 /* Control never gets here */
3922
3923 /* There's been some horrible disaster. */
3924
3925 default:
3926 DPRINTF(("Unknown opcode %d\n", *ecode));
3927 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
3928 return FALSE;
3929 }
3930
3931 /* Do not stick any code in here without much thought; it is assumed
3932 that "continue" in the code above comes out to here to repeat the main
3933 loop. */
3934
3935 } /* End of main loop */
3936 /* Control never reaches here */
3937 }
3938
3939
3940
3941
3942 /*************************************************
3943 * Execute a Regular Expression *
3944 *************************************************/
3945
3946 /* This function applies a compiled re to a subject string and picks out
3947 portions of the string if it matches. Two elements in the vector are set for
3948 each substring: the offsets to the start and end of the substring.
3949
3950 Arguments:
3951 external_re points to the compiled expression
3952 external_extra points to "hints" from pcre_study() or is NULL
3953 subject points to the subject string
3954 length length of subject string (may contain binary zeros)
3955 options option bits
3956 offsets points to a vector of ints to be filled in with offsets
3957 offsetcount the number of elements in the vector
3958
3959 Returns: > 0 => success; value is the number of elements filled in
3960 = 0 => success, but offsets is not big enough
3961 -1 => failed to match
3962 < -1 => some kind of unexpected problem
3963 */
3964
3965 int
3966 pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
3967 const char *subject, int length, int options, int *offsets, int offsetcount)
3968 {
3969 int resetcount, ocount;
3970 int first_char = -1;
3971 int ims = 0;
3972 match_data match_block;
3973 const uschar *start_bits = NULL;
3974 const uschar *start_match = (const uschar *)subject;
3975 const uschar *end_subject;
3976 const real_pcre *re = (const real_pcre *)external_re;
3977 const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
3978 BOOL using_temporary_offsets = FALSE;
3979 BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
3980 BOOL startline = (re->options & PCRE_STARTLINE) != 0;
3981
3982 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3983
3984 if (re == NULL || subject == NULL ||
3985 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3986 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
3987
3988 match_block.start_subject = (const uschar *)subject;
3989 match_block.end_subject = match_block.start_subject + length;
3990 end_subject = match_block.end_subject;
3991
3992 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
3993
3994 match_block.notbol = (options & PCRE_NOTBOL) != 0;
3995 match_block.noteol = (options & PCRE_NOTEOL) != 0;
3996
3997 match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
3998
3999 match_block.lcc = re->tables + lcc_offset;
4000 match_block.ctypes = re->tables + ctypes_offset;
4001
4002 /* The ims options can vary during the matching as a result of the presence
4003 of (?ims) items in the pattern. They are kept in a local variable so that
4004 restoring at the exit of a group is easy. */
4005
4006 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4007
4008 /* If the expression has got more back references than the offsets supplied can
4009 hold, we get a temporary bit of working store to use during the matching.
4010 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4011 of 3. */
4012
4013 ocount = offsetcount - (offsetcount % 3);
4014
4015 if (re->top_backref > 0 && re->top_backref >= ocount/3)
4016 {
4017 ocount = re->top_backref * 3 + 3;
4018 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4019 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4020 using_temporary_offsets = TRUE;
4021 DPRINTF(("Got memory to hold back references\n"));
4022 }
4023 else match_block.offset_vector = offsets;
4024
4025 match_block.offset_end = ocount;
4026 match_block.offset_max = (2*ocount)/3;
4027 match_block.offset_overflow = FALSE;
4028
4029 /* Compute the minimum number of offsets that we need to reset each time. Doing
4030 this makes a huge difference to execution time when there aren't many brackets
4031 in the pattern. */
4032
4033 resetcount = 2 + re->top_bracket * 2;
4034 if (resetcount > offsetcount) resetcount = ocount;
4035
4036 /* Reset the working variable associated with each extraction. These should
4037 never be used unless previously set, but they get saved and restored, and so we
4038 initialize them to avoid reading uninitialized locations. */
4039
4040 if (match_block.offset_vector != NULL)
4041 {
4042 register int *iptr = match_block.offset_vector + ocount;
4043 register int *iend = iptr - resetcount/2 + 1;
4044 while (--iptr >= iend) *iptr = -1;
4045 }
4046
4047 /* Set up the first character to match, if available. The first_char value is
4048 never set for an anchored regular expression, but the anchoring may be forced
4049 at run time, so we have to test for anchoring. The first char may be unset for
4050 an unanchored pattern, of course. If there's no first char and the pattern was
4051 studied, there may be a bitmap of possible first characters. */
4052
4053 if (!anchored)
4054 {
4055 if ((re->options & PCRE_FIRSTSET) != 0)
4056 {
4057 first_char = re->first_char;
4058 if ((ims & PCRE_CASELESS) != 0) first_char = match_block.lcc[first_char];
4059 }
4060 else
4061 if (!startline && extra != NULL &&
4062 (extra->options & PCRE_STUDY_MAPPED) != 0)
4063 start_bits = extra->start_bits;
4064 }
4065
4066 /* Loop for unanchored matches; for anchored regexps the loop runs just once. */
4067
4068 do
4069 {
4070 int rc;
4071 register int *iptr = match_block.offset_vector;
4072 register int *iend = iptr + resetcount;
4073
4074 /* Reset the maximum number of extractions we might see. */
4075
4076 while (iptr < iend) *iptr++ = -1;
4077
4078 /* Advance to a unique first char if possible */
4079
4080 if (first_char >= 0)
4081 {
4082 if ((ims & PCRE_CASELESS) != 0)
4083 while (start_match < end_subject &&
4084 match_block.lcc[*start_match] != first_char)
4085 start_match++;
4086 else
4087 while (start_match < end_subject && *start_match != first_char)
4088 start_match++;
4089 }
4090
4091 /* Or to just after \n for a multiline match if possible */
4092
4093 else if (startline)
4094 {
4095 if (start_match > match_block.start_subject)
4096 {
4097 while (start_match < end_subject && start_match[-1] != '\n')
4098 start_match++;
4099 }
4100 }
4101
4102 /* Or to a non-unique first char */
4103
4104 else if (start_bits != NULL)
4105 {
4106 while (start_match < end_subject)
4107 {
4108 register int c = *start_match;
4109 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
4110 }
4111 }
4112
4113 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4114 printf(">>>> Match against: ");
4115 pchars(start_match, end_subject - start_match, TRUE, &match_block);
4116 printf("\n");
4117 #endif
4118
4119 /* When a match occurs, substrings will be set for all internal extractions;
4120 we just need to set up the whole thing as substring 0 before returning. If
4121 there were too many extractions, set the return code to zero. In the case
4122 where we had to get some local store to hold offsets for backreferences, copy
4123 those back references that we can. In this case there need not be overflow
4124 if certain parts of the pattern were not used. */
4125
4126 if (!match(start_match, re->code, 2, &match_block, ims, FALSE, start_match))
4127 continue;
4128
4129 /* Copy the offset information from temporary store if necessary */
4130
4131 if (using_temporary_offsets)
4132 {
4133 if (offsetcount >= 4)
4134 {
4135 memcpy(offsets + 2, match_block.offset_vector + 2,
4136 (offsetcount - 2) * sizeof(int));
4137 DPRINTF(("Copied offsets from temporary memory\n"));
4138 }
4139 if (match_block.end_offset_top > offsetcount)
4140 match_block.offset_overflow = TRUE;
4141
4142 DPRINTF(("Freeing temporary memory\n"));
4143 (pcre_free)(match_block.offset_vector);
4144 }
4145
4146 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
4147
4148 if (match_block.offset_end < 2) rc = 0; else
4149 {
4150 offsets[0] = start_match - match_block.start_subject;
4151 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
4152 }
4153
4154 DPRINTF((">>>> returning %d\n", rc));
4155 return rc;
4156 }
4157
4158 /* This "while" is the end of the "do" above */
4159
4160 while (!anchored &&
4161 match_block.errorcode == PCRE_ERROR_NOMATCH &&
4162 start_match++ < end_subject);
4163
4164 if (using_temporary_offsets)
4165 {
4166 DPRINTF(("Freeing temporary memory\n"));
4167 (pcre_free)(match_block.offset_vector);
4168 }
4169
4170 DPRINTF((">>>> returning %d\n", match_block.errorcode));
4171
4172 return match_block.errorcode;
4173 }
4174
4175 /* End of pcre.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12