/[pcre]/code/tags/pcre-2.00/pcre.c
ViewVC logotype

Contents of /code/tags/pcre-2.00/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 24 - (show annotations) (download)
Sat Feb 24 21:38:43 2007 UTC (7 years, 9 months ago) by nigel
File MIME type: text/plain
File size: 119644 byte(s)
Tag code/trunk as code/tags/pcre-2.00.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /*
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
9
10 Written by: Philip Hazel <ph10@cam.ac.uk>
11
12 Copyright (c) 1998 University of Cambridge
13
14 -----------------------------------------------------------------------------
15 Permission is granted to anyone to use this software for any purpose on any
16 computer system, and to redistribute it freely, subject to the following
17 restrictions:
18
19 1. This software is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22
23 2. The origin of this software must not be misrepresented, either by
24 explicit claim or by omission.
25
26 3. Altered versions must be plainly marked as such, and must not be
27 misrepresented as being the original software.
28 -----------------------------------------------------------------------------
29 */
30
31
32 /* Define DEBUG to get debugging output on stdout. */
33
34 /* #define DEBUG */
35
36 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
37 inline, and there are *still* stupid compilers about that don't like indented
38 pre-processor statements. I suppose it's only been 10 years... */
39
40 #ifdef DEBUG
41 #define DPRINTF(p) printf p
42 #else
43 #define DPRINTF(p) /*nothing*/
44 #endif
45
46 /* Include the internals header, which itself includes Standard C headers plus
47 the external pcre header. */
48
49 #include "internal.h"
50
51
52 /* Allow compilation as C++ source code, should anybody want to do that. */
53
54 #ifdef __cplusplus
55 #define class pcre_class
56 #endif
57
58
59 /* Number of items on the nested bracket stacks at compile time. This should
60 not be set greater than 200. */
61
62 #define BRASTACK_SIZE 200
63
64
65 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
66
67 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
68 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
69
70 /* Text forms of OP_ values and things, for debugging (not all used) */
71
72 #ifdef DEBUG
73 static const char *OP_names[] = {
74 "End", "\\A", "\\B", "\\b", "\\D", "\\d",
75 "\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",
76 "Opt", "^", "$", "Any", "chars", "not",
77 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
78 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
79 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
80 "*", "*?", "+", "+?", "?", "??", "{", "{",
81 "class", "Ref",
82 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
83 "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
84 "Brazero", "Braminzero", "Bra"
85 };
86 #endif
87
88 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
89 are simple data values; negative values are for special things like \d and so
90 on. Zero means further processing is needed (for things like \x), or the escape
91 is invalid. */
92
93 static const short int escapes[] = {
94 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
95 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
96 '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
97 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
98 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
99 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
100 '`', 7, -ESC_b, 0, -ESC_d, 27, '\f', 0, /* ` - g */
101 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
102 0, 0, '\r', -ESC_s, '\t', 0, 0, -ESC_w, /* p - w */
103 0, 0, -ESC_z /* x - z */
104 };
105
106 /* Definition to allow mutual recursion */
107
108 static BOOL
109 compile_regex(int, int, int *, uschar **, const uschar **, const char **,
110 BOOL, int);
111
112 /* Structure for passing "static" information around between the functions
113 doing the matching, so that they are thread-safe. */
114
115 typedef struct match_data {
116 int errorcode; /* As it says */
117 int *offset_vector; /* Offset vector */
118 int offset_end; /* One past the end */
119 int offset_max; /* The maximum usable for return data */
120 BOOL offset_overflow; /* Set if too many extractions */
121 BOOL notbol; /* NOTBOL flag */
122 BOOL noteol; /* NOTEOL flag */
123 BOOL endonly; /* Dollar not before final \n */
124 const uschar *start_subject; /* Start of the subject string */
125 const uschar *end_subject; /* End of the subject string */
126 const uschar *end_match_ptr; /* Subject position at end match */
127 int end_offset_top; /* Highwater mark at end of match */
128 } match_data;
129
130
131
132 /*************************************************
133 * Global variables *
134 *************************************************/
135
136 /* PCRE is thread-clean and doesn't use any global variables in the normal
137 sense. However, it calls memory allocation and free functions via the two
138 indirections below, which are can be changed by the caller, but are shared
139 between all threads. */
140
141 void *(*pcre_malloc)(size_t) = malloc;
142 void (*pcre_free)(void *) = free;
143
144
145
146
147 /*************************************************
148 * Return version string *
149 *************************************************/
150
151 const char *
152 pcre_version(void)
153 {
154 return PCRE_VERSION;
155 }
156
157
158
159
160 /*************************************************
161 * Return info about a compiled pattern *
162 *************************************************/
163
164 /* This function picks potentially useful data out of the private
165 structure.
166
167 Arguments:
168 external_re points to compiled code
169 optptr where to pass back the options
170 first_char where to pass back the first character,
171 or -1 if multiline and all branches start ^,
172 or -2 otherwise
173
174 Returns: number of identifying extraction brackets
175 or negative values on error
176 */
177
178 int
179 pcre_info(const pcre *external_re, int *optptr, int *first_char)
180 {
181 const real_pcre *re = (const real_pcre *)external_re;
182 if (re == NULL) return PCRE_ERROR_NULL;
183 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
184 if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
185 if (first_char != NULL)
186 *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
187 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
188 return re->top_bracket;
189 }
190
191
192
193
194 #ifdef DEBUG
195 /*************************************************
196 * Debugging function to print chars *
197 *************************************************/
198
199 /* Print a sequence of chars in printable format, stopping at the end of the
200 subject if the requested.
201
202 Arguments:
203 p points to characters
204 length number to print
205 is_subject TRUE if printing from within md->start_subject
206 md pointer to matching data block, if is_subject is TRUE
207
208 Returns: nothing
209 */
210
211 static void
212 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
213 {
214 int c;
215 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
216 while (length-- > 0)
217 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
218 }
219 #endif
220
221
222
223
224 /*************************************************
225 * Handle escapes *
226 *************************************************/
227
228 /* This function is called when a \ has been encountered. It either returns a
229 positive value for a simple escape such as \n, or a negative value which
230 encodes one of the more complicated things such as \d. On entry, ptr is
231 pointing at the \. On exit, it is on the final character of the escape
232 sequence.
233
234 Arguments:
235 ptrptr points to the pattern position pointer
236 errorptr points to the pointer to the error message
237 bracount number of previous extracting brackets
238 options the options bits
239 isclass TRUE if inside a character class
240
241 Returns: zero or positive => a data character
242 negative => a special escape sequence
243 on error, errorptr is set
244 */
245
246 static int
247 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
248 int options, BOOL isclass)
249 {
250 const uschar *ptr = *ptrptr;
251 int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
252 int i;
253
254 if (c == 0) *errorptr = ERR1;
255
256 /* Digits or letters may have special meaning; all others are literals. */
257
258 else if (c < '0' || c > 'z') {}
259
260 /* Do an initial lookup in a table. A non-zero result is something that can be
261 returned immediately. Otherwise further processing may be required. */
262
263 else if ((i = escapes[c - '0']) != 0) c = i;
264
265 /* Escapes that need further processing, or are illegal. */
266
267 else
268 {
269 const uschar *oldptr;
270 switch (c)
271 {
272 /* The handling of escape sequences consisting of a string of digits
273 starting with one that is not zero is not straightforward. By experiment,
274 the way Perl works seems to be as follows:
275
276 Outside a character class, the digits are read as a decimal number. If the
277 number is less than 10, or if there are that many previous extracting
278 left brackets, then it is a back reference. Otherwise, up to three octal
279 digits are read to form an escaped byte. Thus \123 is likely to be octal
280 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
281 value is greater than 377, the least significant 8 bits are taken. Inside a
282 character class, \ followed by a digit is always an octal number. */
283
284 case '1': case '2': case '3': case '4': case '5':
285 case '6': case '7': case '8': case '9':
286
287 if (!isclass)
288 {
289 oldptr = ptr;
290 c -= '0';
291 while ((pcre_ctypes[ptr[1]] & ctype_digit) != 0)
292 c = c * 10 + *(++ptr) - '0';
293 if (c < 10 || c <= bracount)
294 {
295 c = -(ESC_REF + c);
296 break;
297 }
298 ptr = oldptr; /* Put the pointer back and fall through */
299 }
300
301 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
302 generates a binary zero byte and treats the digit as a following literal.
303 Thus we have to pull back the pointer by one. */
304
305 if ((c = *ptr) >= '8')
306 {
307 ptr--;
308 c = 0;
309 break;
310 }
311
312 /* \0 always starts an octal number, but we may drop through to here with a
313 larger first octal digit */
314
315 case '0':
316 c -= '0';
317 while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 &&
318 ptr[1] != '8' && ptr[1] != '9')
319 c = c * 8 + *(++ptr) - '0';
320 break;
321
322 /* Special escapes not starting with a digit are straightforward */
323
324 case 'x':
325 c = 0;
326 while (i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0)
327 {
328 ptr++;
329 c = c * 16 + pcre_lcc[*ptr] -
330 (((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
331 }
332 break;
333
334 case 'c':
335 c = *(++ptr);
336 if (c == 0)
337 {
338 *errorptr = ERR2;
339 return 0;
340 }
341
342 /* A letter is upper-cased; then the 0x40 bit is flipped */
343
344 if (c >= 'a' && c <= 'z') c = pcre_fcc[c];
345 c ^= 0x40;
346 break;
347
348 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
349 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
350 for Perl compatibility, it is a literal. */
351
352 default:
353 if ((options & PCRE_EXTRA) != 0) switch(c)
354 {
355 default:
356 *errorptr = ERR3;
357 break;
358 }
359 break;
360 }
361 }
362
363 *ptrptr = ptr;
364 return c;
365 }
366
367
368
369 /*************************************************
370 * Check for counted repeat *
371 *************************************************/
372
373 /* This function is called when a '{' is encountered in a place where it might
374 start a quantifier. It looks ahead to see if it really is a quantifier or not.
375 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
376 where the ddds are digits.
377
378 Arguments:
379 p pointer to the first char after '{'
380
381 Returns: TRUE or FALSE
382 */
383
384 static BOOL
385 is_counted_repeat(const uschar *p)
386 {
387 if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
388 while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
389 if (*p == '}') return TRUE;
390
391 if (*p++ != ',') return FALSE;
392 if (*p == '}') return TRUE;
393
394 if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
395 while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
396 return (*p == '}');
397 }
398
399
400
401 /*************************************************
402 * Read repeat counts *
403 *************************************************/
404
405 /* Read an item of the form {n,m} and return the values. This is called only
406 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
407 so the syntax is guaranteed to be correct, but we need to check the values.
408
409 Arguments:
410 p pointer to first char after '{'
411 minp pointer to int for min
412 maxp pointer to int for max
413 returned as -1 if no max
414 errorptr points to pointer to error message
415
416 Returns: pointer to '}' on success;
417 current ptr on error, with errorptr set
418 */
419
420 static const uschar *
421 read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
422 {
423 int min = 0;
424 int max = -1;
425
426 while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
427
428 if (*p == '}') max = min; else
429 {
430 if (*(++p) != '}')
431 {
432 max = 0;
433 while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
434 if (max < min)
435 {
436 *errorptr = ERR4;
437 return p;
438 }
439 }
440 }
441
442 /* Do paranoid checks, then fill in the required variables, and pass back the
443 pointer to the terminating '}'. */
444
445 if (min > 65535 || max > 65535)
446 *errorptr = ERR5;
447 else
448 {
449 *minp = min;
450 *maxp = max;
451 }
452 return p;
453 }
454
455
456
457 /*************************************************
458 * Find the fixed length of a pattern *
459 *************************************************/
460
461 /* Scan a pattern and compute the fixed length of subject that will match it,
462 if the length is fixed. This is needed for dealing with backward assertions.
463
464 Arguments:
465 code points to the start of the pattern (the bracket)
466
467 Returns: the fixed length, or -1 if there is no fixed length
468 */
469
470 static int
471 find_fixedlength(uschar *code)
472 {
473 int length = -1;
474
475 register int branchlength = 0;
476 register uschar *cc = code + 3;
477
478 /* Scan along the opcodes for this branch. If we get to the end of the
479 branch, check the length against that of the other branches. */
480
481 for (;;)
482 {
483 int d;
484 register int op = *cc;
485 if (op >= OP_BRA) op = OP_BRA;
486
487 switch (op)
488 {
489 case OP_BRA:
490 case OP_ONCE:
491 case OP_COND:
492 d = find_fixedlength(cc);
493 if (d < 0) return -1;
494 branchlength += d;
495 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
496 cc += 3;
497 break;
498
499 /* Reached end of a branch; if it's a ket it is the end of a nested
500 call. If it's ALT it is an alternation in a nested call. If it is
501 END it's the end of the outer call. All can be handled by the same code. */
502
503 case OP_ALT:
504 case OP_KET:
505 case OP_KETRMAX:
506 case OP_KETRMIN:
507 case OP_END:
508 if (length < 0) length = branchlength;
509 else if (length != branchlength) return -1;
510 if (*cc != OP_ALT) return length;
511 cc += 3;
512 branchlength = 0;
513 break;
514
515 /* Skip over assertive subpatterns */
516
517 case OP_ASSERT:
518 case OP_ASSERT_NOT:
519 case OP_ASSERTBACK:
520 case OP_ASSERTBACK_NOT:
521 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
522 cc += 3;
523 break;
524
525 /* Skip over things that don't match chars */
526
527 case OP_REVERSE:
528 cc++;
529
530 case OP_CREF:
531 case OP_OPT:
532 cc++;
533 /* Fall through */
534
535 case OP_SOD:
536 case OP_EOD:
537 case OP_EODN:
538 case OP_CIRC:
539 case OP_DOLL:
540 case OP_NOT_WORD_BOUNDARY:
541 case OP_WORD_BOUNDARY:
542 cc++;
543 break;
544
545 /* Handle char strings */
546
547 case OP_CHARS:
548 branchlength += *(++cc);
549 cc += *cc + 1;
550 break;
551
552 /* Handle exact repetitions */
553
554 case OP_EXACT:
555 case OP_TYPEEXACT:
556 branchlength += (cc[1] << 8) + cc[2];
557 cc += 4;
558 break;
559
560 /* Handle single-char matchers */
561
562 case OP_NOT_DIGIT:
563 case OP_DIGIT:
564 case OP_NOT_WHITESPACE:
565 case OP_WHITESPACE:
566 case OP_NOT_WORDCHAR:
567 case OP_WORDCHAR:
568 case OP_ANY:
569 branchlength++;
570 cc++;
571 break;
572
573
574 /* Check a class for variable quantification */
575
576 case OP_CLASS:
577 cc += (*cc == OP_REF)? 2 : 33;
578
579 switch (*cc)
580 {
581 case OP_CRSTAR:
582 case OP_CRMINSTAR:
583 case OP_CRQUERY:
584 case OP_CRMINQUERY:
585 return -1;
586
587 case OP_CRRANGE:
588 case OP_CRMINRANGE:
589 if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;
590 branchlength += (cc[1] << 8) + cc[2];
591 cc += 5;
592 break;
593
594 default:
595 branchlength++;
596 }
597 break;
598
599 /* Anything else is variable length */
600
601 default:
602 return -1;
603 }
604 }
605 /* Control never gets here */
606 }
607
608
609
610
611 /*************************************************
612 * Compile one branch *
613 *************************************************/
614
615 /* Scan the pattern, compiling it into the code vector.
616
617 Arguments:
618 options the option bits
619 brackets points to number of brackets used
620 code points to the pointer to the current code point
621 ptrptr points to the current pattern pointer
622 errorptr points to pointer to error message
623 optchanged set to the value of the last OP_OPT item compiled
624
625 Returns: TRUE on success
626 FALSE, with *errorptr set on error
627 */
628
629 static BOOL
630 compile_branch(int options, int *brackets, uschar **codeptr,
631 const uschar **ptrptr, const char **errorptr, int *optchanged)
632 {
633 int repeat_type, op_type;
634 int repeat_min, repeat_max;
635 int bravalue, length;
636 int greedy_default, greedy_non_default;
637 register int c;
638 register uschar *code = *codeptr;
639 uschar *tempcode;
640 const uschar *ptr = *ptrptr;
641 const uschar *tempptr;
642 uschar *previous = NULL;
643 uschar class[32];
644
645 /* Set up the default and non-default settings for greediness */
646
647 greedy_default = ((options & PCRE_UNGREEDY) != 0);
648 greedy_non_default = greedy_default ^ 1;
649
650 /* Switch on next character until the end of the branch */
651
652 for (;; ptr++)
653 {
654 BOOL negate_class;
655 int class_charcount;
656 int class_lastchar;
657 int newoptions;
658 int condref;
659
660 c = *ptr;
661 if ((options & PCRE_EXTENDED) != 0)
662 {
663 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
664 if (c == '#')
665 {
666 while ((c = *(++ptr)) != 0 && c != '\n');
667 continue;
668 }
669 }
670
671 switch(c)
672 {
673 /* The branch terminates at end of string, |, or ). */
674
675 case 0:
676 case '|':
677 case ')':
678 *codeptr = code;
679 *ptrptr = ptr;
680 return TRUE;
681
682 /* Handle single-character metacharacters */
683
684 case '^':
685 previous = NULL;
686 *code++ = OP_CIRC;
687 break;
688
689 case '$':
690 previous = NULL;
691 *code++ = OP_DOLL;
692 break;
693
694 case '.':
695 previous = code;
696 *code++ = OP_ANY;
697 break;
698
699 /* Character classes. These always build a 32-byte bitmap of the permitted
700 characters, except in the special case where there is only one character.
701 For negated classes, we build the map as usual, then invert it at the end.
702 */
703
704 case '[':
705 previous = code;
706 *code++ = OP_CLASS;
707
708 /* If the first character is '^', set the negation flag and skip it. */
709
710 if ((c = *(++ptr)) == '^')
711 {
712 negate_class = TRUE;
713 c = *(++ptr);
714 }
715 else negate_class = FALSE;
716
717 /* Keep a count of chars so that we can optimize the case of just a single
718 character. */
719
720 class_charcount = 0;
721 class_lastchar = -1;
722
723 /* Initialize the 32-char bit map to all zeros. We have to build the
724 map in a temporary bit of store, in case the class contains only 1
725 character, because in that case the compiled code doesn't use the
726 bit map. */
727
728 memset(class, 0, 32 * sizeof(uschar));
729
730 /* Process characters until ] is reached. By writing this as a "do" it
731 means that an initial ] is taken as a data character. */
732
733 do
734 {
735 if (c == 0)
736 {
737 *errorptr = ERR6;
738 goto FAILED;
739 }
740
741 /* Backslash may introduce a single character, or it may introduce one
742 of the specials, which just set a flag. Escaped items are checked for
743 validity in the pre-compiling pass. The sequence \b is a special case.
744 Inside a class (and only there) it is treated as backspace. Elsewhere
745 it marks a word boundary. Other escapes have preset maps ready to
746 or into the one we are building. We assume they have more than one
747 character in them, so set class_count bigger than one. */
748
749 if (c == '\\')
750 {
751 c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
752 if (-c == ESC_b) c = '\b';
753 else if (c < 0)
754 {
755 class_charcount = 10;
756 switch (-c)
757 {
758 case ESC_d:
759 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit];
760 continue;
761
762 case ESC_D:
763 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit];
764 continue;
765
766 case ESC_w:
767 for (c = 0; c < 32; c++)
768 class[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
769 continue;
770
771 case ESC_W:
772 for (c = 0; c < 32; c++)
773 class[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
774 continue;
775
776 case ESC_s:
777 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space];
778 continue;
779
780 case ESC_S:
781 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space];
782 continue;
783
784 default:
785 *errorptr = ERR7;
786 goto FAILED;
787 }
788 }
789 /* Fall through if single character */
790 }
791
792 /* A single character may be followed by '-' to form a range. However,
793 Perl does not permit ']' to be the end of the range. A '-' character
794 here is treated as a literal. */
795
796 if (ptr[1] == '-' && ptr[2] != ']')
797 {
798 int d;
799 ptr += 2;
800 d = *ptr;
801
802 if (d == 0)
803 {
804 *errorptr = ERR6;
805 goto FAILED;
806 }
807
808 /* The second part of a range can be a single-character escape, but
809 not any of the other escapes. */
810
811 if (d == '\\')
812 {
813 d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
814 if (d < 0)
815 {
816 if (d == -ESC_b) d = '\b'; else
817 {
818 *errorptr = ERR7;
819 goto FAILED;
820 }
821 }
822 }
823
824 if (d < c)
825 {
826 *errorptr = ERR8;
827 goto FAILED;
828 }
829
830 for (; c <= d; c++)
831 {
832 class[c/8] |= (1 << (c&7));
833 if ((options & PCRE_CASELESS) != 0)
834 {
835 int uc = pcre_fcc[c]; /* flip case */
836 class[uc/8] |= (1 << (uc&7));
837 }
838 class_charcount++; /* in case a one-char range */
839 class_lastchar = c;
840 }
841 continue; /* Go get the next char in the class */
842 }
843
844 /* Handle a lone single character - we can get here for a normal
845 non-escape char, or after \ that introduces a single character. */
846
847 class [c/8] |= (1 << (c&7));
848 if ((options & PCRE_CASELESS) != 0)
849 {
850 c = pcre_fcc[c]; /* flip case */
851 class[c/8] |= (1 << (c&7));
852 }
853 class_charcount++;
854 class_lastchar = c;
855 }
856
857 /* Loop until ']' reached; the check for end of string happens inside the
858 loop. This "while" is the end of the "do" above. */
859
860 while ((c = *(++ptr)) != ']');
861
862 /* If class_charcount is 1 and class_lastchar is not negative, we saw
863 precisely one character. This doesn't need the whole 32-byte bit map.
864 We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
865 it's negative. */
866
867 if (class_charcount == 1 && class_lastchar >= 0)
868 {
869 if (negate_class)
870 {
871 code[-1] = OP_NOT;
872 }
873 else
874 {
875 code[-1] = OP_CHARS;
876 *code++ = 1;
877 }
878 *code++ = class_lastchar;
879 }
880
881 /* Otherwise, negate the 32-byte map if necessary, and copy it into
882 the code vector. */
883
884 else
885 {
886 if (negate_class)
887 for (c = 0; c < 32; c++) code[c] = ~class[c];
888 else
889 memcpy(code, class, 32);
890 code += 32;
891 }
892 break;
893
894 /* Various kinds of repeat */
895
896 case '{':
897 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
898 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
899 if (*errorptr != NULL) goto FAILED;
900 goto REPEAT;
901
902 case '*':
903 repeat_min = 0;
904 repeat_max = -1;
905 goto REPEAT;
906
907 case '+':
908 repeat_min = 1;
909 repeat_max = -1;
910 goto REPEAT;
911
912 case '?':
913 repeat_min = 0;
914 repeat_max = 1;
915
916 REPEAT:
917 if (previous == NULL)
918 {
919 *errorptr = ERR9;
920 goto FAILED;
921 }
922
923 /* If the next character is '?' this is a minimizing repeat, by default,
924 but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
925 next character. */
926
927 if (ptr[1] == '?')
928 { repeat_type = greedy_non_default; ptr++; }
929 else repeat_type = greedy_default;
930
931 /* If the maximum is zero then the minimum must also be zero; Perl allows
932 this case, so we do too - by simply omitting the item altogether. */
933
934 if (repeat_max == 0) code = previous;
935
936 /* If previous was a string of characters, chop off the last one and use it
937 as the subject of the repeat. If there was only one character, we can
938 abolish the previous item altogether. */
939
940 else if (*previous == OP_CHARS)
941 {
942 int len = previous[1];
943 if (len == 1)
944 {
945 c = previous[2];
946 code = previous;
947 }
948 else
949 {
950 c = previous[len+1];
951 previous[1]--;
952 code--;
953 }
954 op_type = 0; /* Use single-char op codes */
955 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
956 }
957
958 /* If previous was a single negated character ([^a] or similar), we use
959 one of the special opcodes, replacing it. The code is shared with single-
960 character repeats by adding a suitable offset into repeat_type. */
961
962 else if ((int)*previous == OP_NOT)
963 {
964 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
965 c = previous[1];
966 code = previous;
967 goto OUTPUT_SINGLE_REPEAT;
968 }
969
970 /* If previous was a character type match (\d or similar), abolish it and
971 create a suitable repeat item. The code is shared with single-character
972 repeats by adding a suitable offset into repeat_type. */
973
974 else if ((int)*previous < OP_EODN || *previous == OP_ANY)
975 {
976 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
977 c = *previous;
978 code = previous;
979
980 OUTPUT_SINGLE_REPEAT:
981 repeat_type += op_type; /* Combine both values for many cases */
982
983 /* A minimum of zero is handled either as the special case * or ?, or as
984 an UPTO, with the maximum given. */
985
986 if (repeat_min == 0)
987 {
988 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
989 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
990 else
991 {
992 *code++ = OP_UPTO + repeat_type;
993 *code++ = repeat_max >> 8;
994 *code++ = (repeat_max & 255);
995 }
996 }
997
998 /* The case {1,} is handled as the special case + */
999
1000 else if (repeat_min == 1 && repeat_max == -1)
1001 *code++ = OP_PLUS + repeat_type;
1002
1003 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1004 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1005
1006 else
1007 {
1008 if (repeat_min != 1)
1009 {
1010 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1011 *code++ = repeat_min >> 8;
1012 *code++ = (repeat_min & 255);
1013 }
1014
1015 /* If the mininum is 1 and the previous item was a character string,
1016 we either have to put back the item that got cancelled if the string
1017 length was 1, or add the character back onto the end of a longer
1018 string. For a character type nothing need be done; it will just get
1019 put back naturally. Note that the final character is always going to
1020 get added below. */
1021
1022 else if (*previous == OP_CHARS)
1023 {
1024 if (code == previous) code += 2; else previous[1]++;
1025 }
1026
1027 /* For a single negated character we also have to put back the
1028 item that got cancelled. */
1029
1030 else if (*previous == OP_NOT) code++;
1031
1032 /* If the maximum is unlimited, insert an OP_STAR. */
1033
1034 if (repeat_max < 0)
1035 {
1036 *code++ = c;
1037 *code++ = OP_STAR + repeat_type;
1038 }
1039
1040 /* Else insert an UPTO if the max is greater than the min. */
1041
1042 else if (repeat_max != repeat_min)
1043 {
1044 *code++ = c;
1045 repeat_max -= repeat_min;
1046 *code++ = OP_UPTO + repeat_type;
1047 *code++ = repeat_max >> 8;
1048 *code++ = (repeat_max & 255);
1049 }
1050 }
1051
1052 /* The character or character type itself comes last in all cases. */
1053
1054 *code++ = c;
1055 }
1056
1057 /* If previous was a character class or a back reference, we put the repeat
1058 stuff after it. */
1059
1060 else if (*previous == OP_CLASS || *previous == OP_REF)
1061 {
1062 if (repeat_min == 0 && repeat_max == -1)
1063 *code++ = OP_CRSTAR + repeat_type;
1064 else if (repeat_min == 1 && repeat_max == -1)
1065 *code++ = OP_CRPLUS + repeat_type;
1066 else if (repeat_min == 0 && repeat_max == 1)
1067 *code++ = OP_CRQUERY + repeat_type;
1068 else
1069 {
1070 *code++ = OP_CRRANGE + repeat_type;
1071 *code++ = repeat_min >> 8;
1072 *code++ = repeat_min & 255;
1073 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1074 *code++ = repeat_max >> 8;
1075 *code++ = repeat_max & 255;
1076 }
1077 }
1078
1079 /* If previous was a bracket group, we may have to replicate it in certain
1080 cases. */
1081
1082 else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
1083 (int)*previous == OP_COND)
1084 {
1085 int i, ketoffset = 0;
1086 int len = code - previous;
1087
1088 /* If the maximum repeat count is unlimited, find the end of the bracket
1089 by scanning through from the start, and compute the offset back to it
1090 from the current code pointer. There may be an OP_OPT setting following
1091 the final KET, so we can't find the end just by going back from the code
1092 pointer. */
1093
1094 if (repeat_max == -1)
1095 {
1096 register uschar *ket = previous;
1097 do ket += (ket[1] << 8) + ket[2]; while (*ket != OP_KET);
1098 ketoffset = code - ket;
1099 }
1100
1101 /* If the minimum is greater than zero, and the maximum is unlimited or
1102 equal to the minimum, the first copy remains where it is, and is
1103 replicated up to the minimum number of times. This case includes the +
1104 repeat, but of course no replication is needed in that case. */
1105
1106 if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1107 {
1108 for (i = 1; i < repeat_min; i++)
1109 {
1110 memcpy(code, previous, len);
1111 code += len;
1112 }
1113 }
1114
1115 /* If the minimum is zero, stick BRAZERO in front of the first copy.
1116 Then, if there is a fixed upper limit, replicated up to that many times,
1117 sticking BRAZERO in front of all the optional ones. */
1118
1119 else
1120 {
1121 if (repeat_min == 0)
1122 {
1123 memmove(previous+1, previous, len);
1124 code++;
1125 *previous++ = OP_BRAZERO + repeat_type;
1126 }
1127
1128 for (i = 1; i < repeat_min; i++)
1129 {
1130 memcpy(code, previous, len);
1131 code += len;
1132 }
1133
1134 for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1135 {
1136 *code++ = OP_BRAZERO + repeat_type;
1137 memcpy(code, previous, len);
1138 code += len;
1139 }
1140 }
1141
1142 /* If the maximum is unlimited, set a repeater in the final copy. We
1143 can't just offset backwards from the current code point, because we
1144 don't know if there's been an options resetting after the ket. The
1145 correct offset was computed above. */
1146
1147 if (repeat_max == -1) code[-ketoffset] = OP_KETRMAX + repeat_type;
1148 }
1149
1150 /* Else there's some kind of shambles */
1151
1152 else
1153 {
1154 *errorptr = ERR11;
1155 goto FAILED;
1156 }
1157
1158 /* In all case we no longer have a previous item. */
1159
1160 previous = NULL;
1161 break;
1162
1163
1164 /* Start of nested bracket sub-expression, or comment or lookahead or
1165 lookbehind or option setting or condition. First deal with special things
1166 that can come after a bracket; all are introduced by ?, and the appearance
1167 of any of them means that this is not a referencing group. They were
1168 checked for validity in the first pass over the string, so we don't have to
1169 check for syntax errors here. */
1170
1171 case '(':
1172 newoptions = options;
1173 condref = -1;
1174
1175 if (*(++ptr) == '?')
1176 {
1177 int set, unset;
1178 int *optset;
1179
1180 switch (*(++ptr))
1181 {
1182 case '#': /* Comment; skip to ket */
1183 ptr++;
1184 while (*ptr != ')') ptr++;
1185 continue;
1186
1187 case ':': /* Non-extracting bracket */
1188 bravalue = OP_BRA;
1189 ptr++;
1190 break;
1191
1192 case '(':
1193 bravalue = OP_COND; /* Conditional group */
1194 if ((pcre_ctypes[*(++ptr)] & ctype_digit) != 0)
1195 {
1196 condref = *ptr - '0';
1197 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
1198 ptr++;
1199 }
1200 else ptr--;
1201 break;
1202
1203 case '=': /* Positive lookahead */
1204 bravalue = OP_ASSERT;
1205 ptr++;
1206 break;
1207
1208 case '!': /* Negative lookahead */
1209 bravalue = OP_ASSERT_NOT;
1210 ptr++;
1211 break;
1212
1213 case '<': /* Lookbehinds */
1214 switch (*(++ptr))
1215 {
1216 case '=': /* Positive lookbehind */
1217 bravalue = OP_ASSERTBACK;
1218 ptr++;
1219 break;
1220
1221 case '!': /* Negative lookbehind */
1222 bravalue = OP_ASSERTBACK_NOT;
1223 ptr++;
1224 break;
1225
1226 default: /* Syntax error */
1227 *errorptr = ERR24;
1228 goto FAILED;
1229 }
1230 break;
1231
1232 case '>': /* One-time brackets */
1233 bravalue = OP_ONCE;
1234 ptr++;
1235 break;
1236
1237 default: /* Option setting */
1238 set = unset = 0;
1239 optset = &set;
1240
1241 while (*ptr != ')' && *ptr != ':')
1242 {
1243 switch (*ptr++)
1244 {
1245 case '-': optset = &unset; break;
1246
1247 case 'i': *optset |= PCRE_CASELESS; break;
1248 case 'm': *optset |= PCRE_MULTILINE; break;
1249 case 's': *optset |= PCRE_DOTALL; break;
1250 case 'x': *optset |= PCRE_EXTENDED; break;
1251 case 'U': *optset |= PCRE_UNGREEDY; break;
1252 case 'X': *optset |= PCRE_EXTRA; break;
1253
1254 default:
1255 *errorptr = ERR12;
1256 goto FAILED;
1257 }
1258 }
1259
1260 /* Set up the changed option bits, but don't change anything yet. */
1261
1262 newoptions = (options | set) & (~unset);
1263
1264 /* If the options ended with ')' this is not the start of a nested
1265 group with option changes, so the options change at this level. At top
1266 level there is nothing else to be done (the options will in fact have
1267 been set from the start of compiling as a result of the first pass) but
1268 at an inner level we must compile code to change the ims options if
1269 necessary, and pass the new setting back so that it can be put at the
1270 start of any following branches, and when this group ends, a resetting
1271 item can be compiled. */
1272
1273 if (*ptr == ')')
1274 {
1275 if ((options & PCRE_INGROUP) != 0 &&
1276 (options & PCRE_IMS) != (newoptions & PCRE_IMS))
1277 {
1278 *code++ = OP_OPT;
1279 *code++ = *optchanged = newoptions & PCRE_IMS;
1280 }
1281 options = newoptions; /* Change options at this level */
1282 previous = NULL; /* This item can't be repeated */
1283 continue; /* It is complete */
1284 }
1285
1286 /* If the options ended with ':' we are heading into a nested group
1287 with possible change of options. Such groups are non-capturing and are
1288 not assertions of any kind. All we need to do is skip over the ':';
1289 the newoptions value is handled below. */
1290
1291 bravalue = OP_BRA;
1292 ptr++;
1293 }
1294 }
1295
1296 /* Else we have a referencing group; adjust the opcode. */
1297
1298 else
1299 {
1300 if (++(*brackets) > EXTRACT_MAX)
1301 {
1302 *errorptr = ERR13;
1303 goto FAILED;
1304 }
1305 bravalue = OP_BRA + *brackets;
1306 }
1307
1308 /* Process nested bracketed re. Assertions may not be repeated, but other
1309 kinds can be. We copy code into a non-register variable in order to be able
1310 to pass its address because some compilers complain otherwise. Pass in a
1311 new setting for the ims options if they have changed. */
1312
1313 previous = (bravalue >= OP_ONCE)? code : NULL;
1314 *code = bravalue;
1315 tempcode = code;
1316
1317 if (!compile_regex(
1318 options | PCRE_INGROUP, /* Set for all nested groups */
1319 ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
1320 newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
1321 brackets, /* Bracket level */
1322 &tempcode, /* Where to put code (updated) */
1323 &ptr, /* Input pointer (updated) */
1324 errorptr, /* Where to put an error message */
1325 (bravalue == OP_ASSERTBACK ||
1326 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1327 condref)) /* Condition reference number */
1328 goto FAILED;
1329
1330 /* At the end of compiling, code is still pointing to the start of the
1331 group, while tempcode has been updated to point past the end of the group
1332 and any option resetting that may follow it. The pattern pointer (ptr)
1333 is on the bracket. */
1334
1335 /* If this is a conditional bracket, check that there are no more than
1336 two branches in the group. */
1337
1338 if (bravalue == OP_COND)
1339 {
1340 int branchcount = 0;
1341 uschar *tc = code;
1342
1343 do {
1344 branchcount++;
1345 tc += (tc[1] << 8) | tc[2];
1346 }
1347 while (*tc != OP_KET);
1348
1349 if (branchcount > 2)
1350 {
1351 *errorptr = ERR27;
1352 goto FAILED;
1353 }
1354 }
1355
1356 /* Now update the main code pointer to the end of the group. */
1357
1358 code = tempcode;
1359
1360 /* Error if hit end of pattern */
1361
1362 if (*ptr != ')')
1363 {
1364 *errorptr = ERR14;
1365 goto FAILED;
1366 }
1367 break;
1368
1369 /* Check \ for being a real metacharacter; if not, fall through and handle
1370 it as a data character at the start of a string. Escape items are checked
1371 for validity in the pre-compiling pass. */
1372
1373 case '\\':
1374 tempptr = ptr;
1375 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
1376
1377 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1378 are arranged to be the negation of the corresponding OP_values. For the
1379 back references, the values are ESC_REF plus the reference number. Only
1380 back references and those types that consume a character may be repeated.
1381 We can test for values between ESC_b and ESC_Z for the latter; this may
1382 have to change if any new ones are ever created. */
1383
1384 if (c < 0)
1385 {
1386 if (-c >= ESC_REF)
1387 {
1388 previous = code;
1389 *code++ = OP_REF;
1390 *code++ = -c - ESC_REF;
1391 }
1392 else
1393 {
1394 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
1395 *code++ = -c;
1396 }
1397 continue;
1398 }
1399
1400 /* Data character: reset and fall through */
1401
1402 ptr = tempptr;
1403 c = '\\';
1404
1405 /* Handle a run of data characters until a metacharacter is encountered.
1406 The first character is guaranteed not to be whitespace or # when the
1407 extended flag is set. */
1408
1409 NORMAL_CHAR:
1410 default:
1411 previous = code;
1412 *code = OP_CHARS;
1413 code += 2;
1414 length = 0;
1415
1416 do
1417 {
1418 if ((options & PCRE_EXTENDED) != 0)
1419 {
1420 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1421 if (c == '#')
1422 {
1423 while ((c = *(++ptr)) != 0 && c != '\n');
1424 if (c == 0) break;
1425 continue;
1426 }
1427 }
1428
1429 /* Backslash may introduce a data char or a metacharacter. Escaped items
1430 are checked for validity in the pre-compiling pass. Stop the string
1431 before a metaitem. */
1432
1433 if (c == '\\')
1434 {
1435 tempptr = ptr;
1436 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
1437 if (c < 0) { ptr = tempptr; break; }
1438 }
1439
1440 /* Ordinary character or single-char escape */
1441
1442 *code++ = c;
1443 length++;
1444 }
1445
1446 /* This "while" is the end of the "do" above. */
1447
1448 while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
1449
1450 /* Compute the length and set it in the data vector, and advance to
1451 the next state. */
1452
1453 previous[1] = length;
1454 if (length < 255) ptr--;
1455 break;
1456 }
1457 } /* end of big loop */
1458
1459 /* Control never reaches here by falling through, only by a goto for all the
1460 error states. Pass back the position in the pattern so that it can be displayed
1461 to the user for diagnosing the error. */
1462
1463 FAILED:
1464 *ptrptr = ptr;
1465 return FALSE;
1466 }
1467
1468
1469
1470
1471 /*************************************************
1472 * Compile sequence of alternatives *
1473 *************************************************/
1474
1475 /* On entry, ptr is pointing past the bracket character, but on return
1476 it points to the closing bracket, or vertical bar, or end of string.
1477 The code variable is pointing at the byte into which the BRA operator has been
1478 stored. If the ims options are changed at the start (for a (?ims: group) or
1479 during any branch, we need to insert an OP_OPT item at the start of every
1480 following branch to ensure they get set correctly at run time, and also pass
1481 the new options into every subsequent branch compile.
1482
1483 Argument:
1484 options the option bits
1485 optchanged new ims options to set as if (?ims) were at the start, or -1
1486 for no change
1487 brackets -> int containing the number of extracting brackets used
1488 codeptr -> the address of the current code pointer
1489 ptrptr -> the address of the current pattern pointer
1490 errorptr -> pointer to error message
1491 lookbehind TRUE if this is a lookbehind assertion
1492 condref > 0 for OPT_CREF setting at start of conditional group
1493
1494 Returns: TRUE on success
1495 */
1496
1497 static BOOL
1498 compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
1499 const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref)
1500 {
1501 const uschar *ptr = *ptrptr;
1502 uschar *code = *codeptr;
1503 uschar *last_branch = code;
1504 uschar *start_bracket = code;
1505 uschar *reverse_count = NULL;
1506 int oldoptions = options & PCRE_IMS;
1507
1508 code += 3;
1509
1510 /* At the start of a reference-based conditional group, insert the reference
1511 number as an OP_CREF item. */
1512
1513 if (condref > 0)
1514 {
1515 *code++ = OP_CREF;
1516 *code++ = condref;
1517 }
1518
1519 /* Loop for each alternative branch */
1520
1521 for (;;)
1522 {
1523 int length;
1524
1525 /* Handle change of options */
1526
1527 if (optchanged >= 0)
1528 {
1529 *code++ = OP_OPT;
1530 *code++ = optchanged;
1531 options = (options & ~PCRE_IMS) | optchanged;
1532 }
1533
1534 /* Set up dummy OP_REVERSE if lookbehind assertion */
1535
1536 if (lookbehind)
1537 {
1538 *code++ = OP_REVERSE;
1539 reverse_count = code;
1540 *code++ = 0;
1541 *code++ = 0;
1542 }
1543
1544 /* Now compile the branch */
1545
1546 if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged))
1547 {
1548 *ptrptr = ptr;
1549 return FALSE;
1550 }
1551
1552 /* Fill in the length of the last branch */
1553
1554 length = code - last_branch;
1555 last_branch[1] = length >> 8;
1556 last_branch[2] = length & 255;
1557
1558 /* If lookbehind, check that this branch matches a fixed-length string,
1559 and put the length into the OP_REVERSE item. Temporarily mark the end of
1560 the branch with OP_END. */
1561
1562 if (lookbehind)
1563 {
1564 *code = OP_END;
1565 length = find_fixedlength(last_branch);
1566 DPRINTF(("fixed length = %d\n", length));
1567 if (length < 0)
1568 {
1569 *errorptr = ERR25;
1570 *ptrptr = ptr;
1571 return FALSE;
1572 }
1573 reverse_count[0] = (length >> 8);
1574 reverse_count[1] = length & 255;
1575 }
1576
1577 /* Reached end of expression, either ')' or end of pattern. Insert a
1578 terminating ket and the length of the whole bracketed item, and return,
1579 leaving the pointer at the terminating char. If any of the ims options
1580 were changed inside the group, compile a resetting op-code following. */
1581
1582 if (*ptr != '|')
1583 {
1584 length = code - start_bracket;
1585 *code++ = OP_KET;
1586 *code++ = length >> 8;
1587 *code++ = length & 255;
1588 if (optchanged >= 0)
1589 {
1590 *code++ = OP_OPT;
1591 *code++ = oldoptions;
1592 }
1593 *codeptr = code;
1594 *ptrptr = ptr;
1595 return TRUE;
1596 }
1597
1598 /* Another branch follows; insert an "or" node and advance the pointer. */
1599
1600 *code = OP_ALT;
1601 last_branch = code;
1602 code += 3;
1603 ptr++;
1604 }
1605 /* Control never reaches here */
1606 }
1607
1608
1609
1610
1611 /*************************************************
1612 * Find first significant op code *
1613 *************************************************/
1614
1615 /* This is called by several functions that scan a compiled expression looking
1616 for a fixed first character, or an anchoring op code etc. It skips over things
1617 that do not influence this. For one application, a change of caseless option is
1618 important.
1619
1620 Arguments:
1621 code pointer to the start of the group
1622 options pointer to external options
1623 optbit the option bit whose changing is significant, or
1624 zero if none are
1625 optstop TRUE to return on option change, otherwise change the options
1626 value and continue
1627
1628 Returns: pointer to the first significant opcode
1629 */
1630
1631 static const uschar*
1632 first_significant_code(const uschar *code, int *options, int optbit,
1633 BOOL optstop)
1634 {
1635 for (;;)
1636 {
1637 switch ((int)*code)
1638 {
1639 case OP_OPT:
1640 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1641 {
1642 if (optstop) return code;
1643 *options = (int)code[1];
1644 }
1645 code += 2;
1646 break;
1647
1648 case OP_CREF:
1649 code += 2;
1650 break;
1651
1652 case OP_ASSERT_NOT:
1653 case OP_ASSERTBACK:
1654 case OP_ASSERTBACK_NOT:
1655 do code += (code[1] << 8) + code[2]; while (*code == OP_ALT);
1656 code += 3;
1657 break;
1658
1659 default:
1660 return code;
1661 }
1662 }
1663 /* Control never reaches here */
1664 }
1665
1666
1667
1668
1669 /*************************************************
1670 * Check for anchored expression *
1671 *************************************************/
1672
1673 /* Try to find out if this is an anchored regular expression. Consider each
1674 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
1675 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
1676 it's anchored. However, if this is a multiline pattern, then only OP_SOD
1677 counts, since OP_CIRC can match in the middle.
1678
1679 A branch is also implicitly anchored if it starts with .* because that will try
1680 the rest of the pattern at all possible matching points, so there is no point
1681 trying them again.
1682
1683 Arguments:
1684 code points to start of expression (the bracket)
1685 options points to the options setting
1686
1687 Returns: TRUE or FALSE
1688 */
1689
1690 static BOOL
1691 is_anchored(register const uschar *code, int *options)
1692 {
1693 do {
1694 const uschar *scode = first_significant_code(code + 3, options,
1695 PCRE_MULTILINE, FALSE);
1696 register int op = *scode;
1697 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
1698 { if (!is_anchored(scode, options)) return FALSE; }
1699 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
1700 { if (scode[1] != OP_ANY) return FALSE; }
1701 else if (op != OP_SOD &&
1702 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
1703 return FALSE;
1704 code += (code[1] << 8) + code[2];
1705 }
1706 while (*code == OP_ALT);
1707 return TRUE;
1708 }
1709
1710
1711
1712 /*************************************************
1713 * Check for start with \n line expression *
1714 *************************************************/
1715
1716 /* This is called for multiline expressions to try to find out if every branch
1717 starts with ^ so that "first char" processing can be done to speed things up.
1718
1719 Argument: points to start of expression (the bracket)
1720 Returns: TRUE or FALSE
1721 */
1722
1723 static BOOL
1724 is_startline(const uschar *code)
1725 {
1726 do {
1727 const uschar *scode = first_significant_code(code + 3, NULL, 0, FALSE);
1728 register int op = *scode;
1729 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
1730 { if (!is_startline(scode)) return FALSE; }
1731 else if (op != OP_CIRC) return FALSE;
1732 code += (code[1] << 8) + code[2];
1733 }
1734 while (*code == OP_ALT);
1735 return TRUE;
1736 }
1737
1738
1739
1740 /*************************************************
1741 * Check for fixed first char *
1742 *************************************************/
1743
1744 /* Try to find out if there is a fixed first character. This is called for
1745 unanchored expressions, as it speeds up their processing quite considerably.
1746 Consider each alternative branch. If they all start with the same char, or with
1747 a bracket all of whose alternatives start with the same char (recurse ad lib),
1748 then we return that char, otherwise -1.
1749
1750 Arguments:
1751 code points to start of expression (the bracket)
1752 options pointer to the options (used to check casing changes)
1753
1754 Returns: -1 or the fixed first char
1755 */
1756
1757 static int
1758 find_firstchar(const uschar *code, int *options)
1759 {
1760 register int c = -1;
1761 do {
1762 int d;
1763 const uschar *scode = first_significant_code(code + 3, options,
1764 PCRE_CASELESS, TRUE);
1765 register int op = *scode;
1766
1767 if (op >= OP_BRA) op = OP_BRA;
1768
1769 switch(op)
1770 {
1771 default:
1772 return -1;
1773
1774 case OP_BRA:
1775 case OP_ASSERT:
1776 case OP_ONCE:
1777 case OP_COND:
1778 if ((d = find_firstchar(scode, options)) < 0) return -1;
1779 if (c < 0) c = d; else if (c != d) return -1;
1780 break;
1781
1782 case OP_EXACT: /* Fall through */
1783 scode++;
1784
1785 case OP_CHARS: /* Fall through */
1786 scode++;
1787
1788 case OP_PLUS:
1789 case OP_MINPLUS:
1790 if (c < 0) c = scode[1]; else if (c != scode[1]) return -1;
1791 break;
1792 }
1793
1794 code += (code[1] << 8) + code[2];
1795 }
1796 while (*code == OP_ALT);
1797 return c;
1798 }
1799
1800
1801
1802
1803
1804 /*************************************************
1805 * Compile a Regular Expression *
1806 *************************************************/
1807
1808 /* This function takes a string and returns a pointer to a block of store
1809 holding a compiled version of the expression.
1810
1811 Arguments:
1812 pattern the regular expression
1813 options various option bits
1814 errorptr pointer to pointer to error text
1815 erroroffset ptr offset in pattern where error was detected
1816
1817 Returns: pointer to compiled data block, or NULL on error,
1818 with errorptr and erroroffset set
1819 */
1820
1821 pcre *
1822 pcre_compile(const char *pattern, int options, const char **errorptr,
1823 int *erroroffset)
1824 {
1825 real_pcre *re;
1826 int length = 3; /* For initial BRA plus length */
1827 int runlength;
1828 int c, size;
1829 int bracount = 0;
1830 int top_backref = 0;
1831 int branch_extra = 0;
1832 int branch_newextra;
1833 unsigned int brastackptr = 0;
1834 uschar *code;
1835 const uschar *ptr;
1836 int brastack[BRASTACK_SIZE];
1837 uschar bralenstack[BRASTACK_SIZE];
1838
1839 #ifdef DEBUG
1840 uschar *code_base, *code_end;
1841 #endif
1842
1843 /* We can't pass back an error message if errorptr is NULL; I guess the best we
1844 can do is just return NULL. */
1845
1846 if (errorptr == NULL) return NULL;
1847 *errorptr = NULL;
1848
1849 /* However, we can give a message for this error */
1850
1851 if (erroroffset == NULL)
1852 {
1853 *errorptr = ERR16;
1854 return NULL;
1855 }
1856 *erroroffset = 0;
1857
1858 if ((options & ~PUBLIC_OPTIONS) != 0)
1859 {
1860 *errorptr = ERR17;
1861 return NULL;
1862 }
1863
1864 DPRINTF(("------------------------------------------------------------------\n"));
1865 DPRINTF(("%s\n", pattern));
1866
1867 /* The first thing to do is to make a pass over the pattern to compute the
1868 amount of store required to hold the compiled code. This does not have to be
1869 perfect as long as errors are overestimates. At the same time we can detect any
1870 internal flag settings. Make an attempt to correct for any counted white space
1871 if an "extended" flag setting appears late in the pattern. We can't be so
1872 clever for #-comments. */
1873
1874 ptr = (const uschar *)(pattern - 1);
1875 while ((c = *(++ptr)) != 0)
1876 {
1877 int min, max;
1878 int class_charcount;
1879
1880 if ((options & PCRE_EXTENDED) != 0)
1881 {
1882 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1883 if (c == '#')
1884 {
1885 while ((c = *(++ptr)) != 0 && c != '\n');
1886 continue;
1887 }
1888 }
1889
1890 switch(c)
1891 {
1892 /* A backslashed item may be an escaped "normal" character or a
1893 character type. For a "normal" character, put the pointers and
1894 character back so that tests for whitespace etc. in the input
1895 are done correctly. */
1896
1897 case '\\':
1898 {
1899 const uschar *save_ptr = ptr;
1900 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
1901 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1902 if (c >= 0)
1903 {
1904 ptr = save_ptr;
1905 c = '\\';
1906 goto NORMAL_CHAR;
1907 }
1908 }
1909 length++;
1910
1911 /* A back reference needs an additional char, plus either one or 5
1912 bytes for a repeat. We also need to keep the value of the highest
1913 back reference. */
1914
1915 if (c <= -ESC_REF)
1916 {
1917 int refnum = -c - ESC_REF;
1918 if (refnum > top_backref) top_backref = refnum;
1919 length++; /* For single back reference */
1920 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
1921 {
1922 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
1923 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1924 if ((min == 0 && (max == 1 || max == -1)) ||
1925 (min == 1 && max == -1))
1926 length++;
1927 else length += 5;
1928 if (ptr[1] == '?') ptr++;
1929 }
1930 }
1931 continue;
1932
1933 case '^':
1934 case '.':
1935 case '$':
1936 case '*': /* These repeats won't be after brackets; */
1937 case '+': /* those are handled separately */
1938 case '?':
1939 length++;
1940 continue;
1941
1942 /* This covers the cases of repeats after a single char, metachar, class,
1943 or back reference. */
1944
1945 case '{':
1946 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
1947 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
1948 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1949 if ((min == 0 && (max == 1 || max == -1)) ||
1950 (min == 1 && max == -1))
1951 length++;
1952 else
1953 {
1954 length--; /* Uncount the original char or metachar */
1955 if (min == 1) length++; else if (min > 0) length += 4;
1956 if (max > 0) length += 4; else length += 2;
1957 }
1958 if (ptr[1] == '?') ptr++;
1959 continue;
1960
1961 /* An alternation contains an offset to the next branch or ket. If any ims
1962 options changed in the previous branch(es), and/or if we are in a
1963 lookbehind assertion, extra space will be needed at the start of the
1964 branch. This is handled by branch_extra. */
1965
1966 case '|':
1967 length += 3 + branch_extra;
1968 continue;
1969
1970 /* A character class uses 33 characters. Don't worry about character types
1971 that aren't allowed in classes - they'll get picked up during the compile.
1972 A character class that contains only one character uses 2 or 3 bytes,
1973 depending on whether it is negated or not. Notice this where we can. */
1974
1975 case '[':
1976 class_charcount = 0;
1977 if (*(++ptr) == '^') ptr++;
1978 do
1979 {
1980 if (*ptr == '\\')
1981 {
1982 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE);
1983 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1984 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
1985 }
1986 else class_charcount++;
1987 ptr++;
1988 }
1989 while (*ptr != 0 && *ptr != ']');
1990
1991 /* Repeats for negated single chars are handled by the general code */
1992
1993 if (class_charcount == 1) length += 3; else
1994 {
1995 length += 33;
1996
1997 /* A repeat needs either 1 or 5 bytes. */
1998
1999 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
2000 {
2001 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2002 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2003 if ((min == 0 && (max == 1 || max == -1)) ||
2004 (min == 1 && max == -1))
2005 length++;
2006 else length += 5;
2007 if (ptr[1] == '?') ptr++;
2008 }
2009 }
2010 continue;
2011
2012 /* Brackets may be genuine groups or special things */
2013
2014 case '(':
2015 branch_newextra = 0;
2016
2017 /* Handle special forms of bracket, which all start (? */
2018
2019 if (ptr[1] == '?')
2020 {
2021 int set, unset;
2022 int *optset;
2023
2024 switch (c = ptr[2])
2025 {
2026 /* Skip over comments entirely */
2027 case '#':
2028 ptr += 3;
2029 while (*ptr != 0 && *ptr != ')') ptr++;
2030 if (*ptr == 0)
2031 {
2032 *errorptr = ERR18;
2033 goto PCRE_ERROR_RETURN;
2034 }
2035 continue;
2036
2037 /* Non-referencing groups and lookaheads just move the pointer on, and
2038 then behave like a non-special bracket, except that they don't increment
2039 the count of extracting brackets. Ditto for the "once only" bracket,
2040 which is in Perl from version 5.005. */
2041
2042 case ':':
2043 case '=':
2044 case '!':
2045 case '>':
2046 ptr += 2;
2047 break;
2048
2049 /* Lookbehinds are in Perl from version 5.005 */
2050
2051 case '<':
2052 if (ptr[3] == '=' || ptr[3] == '!')
2053 {
2054 ptr += 3;
2055 branch_newextra = 3;
2056 length += 3; /* For the first branch */
2057 break;
2058 }
2059 *errorptr = ERR24;
2060 goto PCRE_ERROR_RETURN;
2061
2062 /* Conditionals are in Perl from version 5.005. The bracket must either
2063 be followed by a number (for bracket reference) or by an assertion
2064 group. */
2065
2066 case '(':
2067 if ((pcre_ctypes[ptr[3]] & ctype_digit) != 0)
2068 {
2069 ptr += 4;
2070 length += 2;
2071 while ((pcre_ctypes[*ptr] & ctype_digit) != 0) ptr++;
2072 if (*ptr != ')')
2073 {
2074 *errorptr = ERR26;
2075 goto PCRE_ERROR_RETURN;
2076 }
2077 }
2078 else /* An assertion must follow */
2079 {
2080 ptr++; /* Can treat like ':' as far as spacing is concerned */
2081
2082 if (ptr[2] != '?' || strchr("=!<", ptr[3]) == NULL)
2083 {
2084 ptr += 2; /* To get right offset in message */
2085 *errorptr = ERR28;
2086 goto PCRE_ERROR_RETURN;
2087 }
2088 }
2089 break;
2090
2091 /* Else loop checking valid options until ) is met. Anything else is an
2092 error. If we are without any brackets, i.e. at top level, the settings
2093 act as if specified in the options, so massage the options immediately.
2094 This is for backward compatibility with Perl 5.004. */
2095
2096 default:
2097 set = unset = 0;
2098 optset = &set;
2099 ptr += 2;
2100
2101 for (;; ptr++)
2102 {
2103 c = *ptr;
2104 switch (c)
2105 {
2106 case 'i':
2107 *optset |= PCRE_CASELESS;
2108 continue;
2109
2110 case 'm':
2111 *optset |= PCRE_MULTILINE;
2112 continue;
2113
2114 case 's':
2115 *optset |= PCRE_DOTALL;
2116 continue;
2117
2118 case 'x':
2119 *optset |= PCRE_EXTENDED;
2120 continue;
2121
2122 case 'X':
2123 *optset |= PCRE_EXTRA;
2124 continue;
2125
2126 case 'U':
2127 *optset |= PCRE_UNGREEDY;
2128 continue;
2129
2130 case '-':
2131 optset = &unset;
2132 continue;
2133
2134 /* A termination by ')' indicates an options-setting-only item;
2135 this is global at top level; otherwise nothing is done here and
2136 it is handled during the compiling process on a per-bracket-group
2137 basis. */
2138
2139 case ')':
2140 if (brastackptr == 0)
2141 {
2142 options = (options | set) & (~unset);
2143 set = unset = 0; /* To save length */
2144 }
2145 /* Fall through */
2146
2147 /* A termination by ':' indicates the start of a nested group with
2148 the given options set. This is again handled at compile time, but
2149 we must allow for compiled space if any of the ims options are
2150 set. We also have to allow for resetting space at the end of
2151 the group, which is why 4 is added to the length and not just 2.
2152 If there are several changes of options within the same group, this
2153 will lead to an over-estimate on the length, but this shouldn't
2154 matter very much. We also have to allow for resetting options at
2155 the start of any alternations, which we do by setting
2156 branch_newextra to 2. */
2157
2158 case ':':
2159 if (((set|unset) & PCRE_IMS) != 0)
2160 {
2161 length += 4;
2162 branch_newextra = 2;
2163 }
2164 goto END_OPTIONS;
2165
2166 /* Unrecognized option character */
2167
2168 default:
2169 *errorptr = ERR12;
2170 goto PCRE_ERROR_RETURN;
2171 }
2172 }
2173
2174 /* If we hit a closing bracket, that's it - this is a freestanding
2175 option-setting. We need to ensure that branch_extra is updated if
2176 necessary. The only values branch_newextra can have here are 0 or 2.
2177 If the value is 2, then branch_extra must either be 2 or 5, depending
2178 on whether this is a lookbehind group or not. */
2179
2180 END_OPTIONS:
2181 if (c == ')')
2182 {
2183 if (branch_newextra == 2 && (branch_extra == 0 || branch_extra == 3))
2184 branch_extra += branch_newextra;
2185 continue;
2186 }
2187
2188 /* If options were terminated by ':' control comes here. Fall through
2189 to handle the group below. */
2190 }
2191 }
2192
2193 /* Extracting brackets must be counted so we can process escapes in a
2194 Perlish way. */
2195
2196 else bracount++;
2197
2198 /* Non-special forms of bracket. Save length for computing whole length
2199 at end if there's a repeat that requires duplication of the group. Also
2200 save the current value of branch_extra, and start the new group with
2201 the new value. If non-zero, this will either be 2 for a (?imsx: group, or 3
2202 for a lookbehind assertion. */
2203
2204 if (brastackptr >= sizeof(brastack)/sizeof(int))
2205 {
2206 *errorptr = ERR19;
2207 goto PCRE_ERROR_RETURN;
2208 }
2209
2210 bralenstack[brastackptr] = branch_extra;
2211 branch_extra = branch_newextra;
2212
2213 brastack[brastackptr++] = length;
2214 length += 3;
2215 continue;
2216
2217 /* Handle ket. Look for subsequent max/min; for certain sets of values we
2218 have to replicate this bracket up to that many times. If brastackptr is
2219 0 this is an unmatched bracket which will generate an error, but take care
2220 not to try to access brastack[-1] when computing the length and restoring
2221 the branch_extra value. */
2222
2223 case ')':
2224 length += 3;
2225 {
2226 int minval = 1;
2227 int maxval = 1;
2228 int duplength;
2229
2230 if (brastackptr > 0)
2231 {
2232 duplength = length - brastack[--brastackptr];
2233 branch_extra = bralenstack[brastackptr];
2234 }
2235 else duplength = 0;
2236
2237 /* Leave ptr at the final char; for read_repeat_counts this happens
2238 automatically; for the others we need an increment. */
2239
2240 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
2241 {
2242 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr);
2243 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2244 }
2245 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2246 else if (c == '+') { maxval = -1; ptr++; }
2247 else if (c == '?') { minval = 0; ptr++; }
2248
2249 /* If there is a minimum > 1 we have to replicate up to minval-1 times;
2250 if there is a limited maximum we have to replicate up to maxval-1 times
2251 and allow for a BRAZERO item before each optional copy, as we also have
2252 to do before the first copy if the minimum is zero. */
2253
2254 if (minval == 0) length++;
2255 else if (minval > 1) length += (minval - 1) * duplength;
2256 if (maxval > minval) length += (maxval - minval) * (duplength + 1);
2257 }
2258 continue;
2259
2260 /* Non-special character. For a run of such characters the length required
2261 is the number of characters + 2, except that the maximum run length is 255.
2262 We won't get a skipped space or a non-data escape or the start of a #
2263 comment as the first character, so the length can't be zero. */
2264
2265 NORMAL_CHAR:
2266 default:
2267 length += 2;
2268 runlength = 0;
2269 do
2270 {
2271 if ((options & PCRE_EXTENDED) != 0)
2272 {
2273 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
2274 if (c == '#')
2275 {
2276 while ((c = *(++ptr)) != 0 && c != '\n');
2277 continue;
2278 }
2279 }
2280
2281 /* Backslash may introduce a data char or a metacharacter; stop the
2282 string before the latter. */
2283
2284 if (c == '\\')
2285 {
2286 const uschar *saveptr = ptr;
2287 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
2288 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2289 if (c < 0) { ptr = saveptr; break; }
2290 }
2291
2292 /* Ordinary character or single-char escape */
2293
2294 runlength++;
2295 }
2296
2297 /* This "while" is the end of the "do" above. */
2298
2299 while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
2300
2301 ptr--;
2302 length += runlength;
2303 continue;
2304 }
2305 }
2306
2307 length += 4; /* For final KET and END */
2308
2309 if (length > 65539)
2310 {
2311 *errorptr = ERR20;
2312 return NULL;
2313 }
2314
2315 /* Compute the size of data block needed and get it, either from malloc or
2316 externally provided function. We specify "code[0]" in the offsetof() expression
2317 rather than just "code", because it has been reported that one broken compiler
2318 fails on "code" because it is also an independent variable. It should make no
2319 difference to the value of the offsetof(). */
2320
2321 size = length + offsetof(real_pcre, code[0]);
2322 re = (real_pcre *)(pcre_malloc)(size);
2323
2324 if (re == NULL)
2325 {
2326 *errorptr = ERR21;
2327 return NULL;
2328 }
2329
2330 /* Put in the magic number and the options. */
2331
2332 re->magic_number = MAGIC_NUMBER;
2333 re->options = options;
2334
2335 /* Set up a starting, non-extracting bracket, then compile the expression. On
2336 error, *errorptr will be set non-NULL, so we don't need to look at the result
2337 of the function here. */
2338
2339 ptr = (const uschar *)pattern;
2340 code = re->code;
2341 *code = OP_BRA;
2342 bracount = 0;
2343 (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1);
2344 re->top_bracket = bracount;
2345 re->top_backref = top_backref;
2346
2347 /* If not reached end of pattern on success, there's an excess bracket. */
2348
2349 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
2350
2351 /* Fill in the terminating state and check for disastrous overflow, but
2352 if debugging, leave the test till after things are printed out. */
2353
2354 *code++ = OP_END;
2355
2356 #ifndef DEBUG
2357 if (code - re->code > length) *errorptr = ERR23;
2358 #endif
2359
2360 /* Give an error if there's back reference to a non-existent capturing
2361 subpattern. */
2362
2363 if (top_backref > re->top_bracket) *errorptr = ERR15;
2364
2365 /* Failed to compile */
2366
2367 if (*errorptr != NULL)
2368 {
2369 (pcre_free)(re);
2370 PCRE_ERROR_RETURN:
2371 *erroroffset = ptr - (const uschar *)pattern;
2372 return NULL;
2373 }
2374
2375 /* If the anchored option was not passed, set flag if we can determine that it
2376 is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
2377 we can determine what the first character has to be, because that speeds up
2378 unanchored matches no end. In the case of multiline matches, an alternative is
2379 to set the PCRE_STARTLINE flag if all branches start with ^. */
2380
2381 if ((options & PCRE_ANCHORED) == 0)
2382 {
2383 int temp_options = options;
2384 if (is_anchored(re->code, &temp_options))
2385 re->options |= PCRE_ANCHORED;
2386 else
2387 {
2388 int ch = find_firstchar(re->code, &temp_options);
2389 if (ch >= 0)
2390 {
2391 re->first_char = ch;
2392 re->options |= PCRE_FIRSTSET;
2393 }
2394 else if (is_startline(re->code))
2395 re->options |= PCRE_STARTLINE;
2396 }
2397 }
2398
2399 /* Print out the compiled data for debugging */
2400
2401 #ifdef DEBUG
2402
2403 printf("Length = %d top_bracket = %d top_backref = %d\n",
2404 length, re->top_bracket, re->top_backref);
2405
2406 if (re->options != 0)
2407 {
2408 printf("%s%s%s%s%s%s%s%s\n",
2409 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2410 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2411 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2412 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2413 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
2414 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
2415 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
2416 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
2417 }
2418
2419 if ((re->options & PCRE_FIRSTSET) != 0)
2420 {
2421 if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
2422 else printf("First char = \\x%02x\n", re->first_char);
2423 }
2424
2425 code_end = code;
2426 code_base = code = re->code;
2427
2428 while (code < code_end)
2429 {
2430 int charlength;
2431
2432 printf("%3d ", code - code_base);
2433
2434 if (*code >= OP_BRA)
2435 {
2436 printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
2437 code += 2;
2438 }
2439
2440 else switch(*code)
2441 {
2442 case OP_OPT:
2443 printf(" %.2x %s", code[1], OP_names[*code]);
2444 code++;
2445 break;
2446
2447 case OP_COND:
2448 printf("%3d Cond", (code[1] << 8) + code[2]);
2449 code += 2;
2450 break;
2451
2452 case OP_CREF:
2453 printf(" %.2d %s", code[1], OP_names[*code]);
2454 code++;
2455 break;
2456
2457 case OP_CHARS:
2458 charlength = *(++code);
2459 printf("%3d ", charlength);
2460 while (charlength-- > 0)
2461 if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2462 break;
2463
2464 case OP_KETRMAX:
2465 case OP_KETRMIN:
2466 case OP_ALT:
2467 case OP_KET:
2468 case OP_ASSERT:
2469 case OP_ASSERT_NOT:
2470 case OP_ASSERTBACK:
2471 case OP_ASSERTBACK_NOT:
2472 case OP_ONCE:
2473 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2474 code += 2;
2475 break;
2476
2477 case OP_REVERSE:
2478 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2479 code += 2;
2480 break;
2481
2482 case OP_STAR:
2483 case OP_MINSTAR:
2484 case OP_PLUS:
2485 case OP_MINPLUS:
2486 case OP_QUERY:
2487 case OP_MINQUERY:
2488 case OP_TYPESTAR:
2489 case OP_TYPEMINSTAR:
2490 case OP_TYPEPLUS:
2491 case OP_TYPEMINPLUS:
2492 case OP_TYPEQUERY:
2493 case OP_TYPEMINQUERY:
2494 if (*code >= OP_TYPESTAR)
2495 printf(" %s", OP_names[code[1]]);
2496 else if (isprint(c = code[1])) printf(" %c", c);
2497 else printf(" \\x%02x", c);
2498 printf("%s", OP_names[*code++]);
2499 break;
2500
2501 case OP_EXACT:
2502 case OP_UPTO:
2503 case OP_MINUPTO:
2504 if (isprint(c = code[3])) printf(" %c{", c);
2505 else printf(" \\x%02x{", c);
2506 if (*code != OP_EXACT) printf("0,");
2507 printf("%d}", (code[1] << 8) + code[2]);
2508 if (*code == OP_MINUPTO) printf("?");
2509 code += 3;
2510 break;
2511
2512 case OP_TYPEEXACT:
2513 case OP_TYPEUPTO:
2514 case OP_TYPEMINUPTO:
2515 printf(" %s{", OP_names[code[3]]);
2516 if (*code != OP_TYPEEXACT) printf(",");
2517 printf("%d}", (code[1] << 8) + code[2]);
2518 if (*code == OP_TYPEMINUPTO) printf("?");
2519 code += 3;
2520 break;
2521
2522 case OP_NOT:
2523 if (isprint(c = *(++code))) printf(" [^%c]", c);
2524 else printf(" [^\\x%02x]", c);
2525 break;
2526
2527 case OP_NOTSTAR:
2528 case OP_NOTMINSTAR:
2529 case OP_NOTPLUS:
2530 case OP_NOTMINPLUS:
2531 case OP_NOTQUERY:
2532 case OP_NOTMINQUERY:
2533 if (isprint(c = code[1])) printf(" [^%c]", c);
2534 else printf(" [^\\x%02x]", c);
2535 printf("%s", OP_names[*code++]);
2536 break;
2537
2538 case OP_NOTEXACT:
2539 case OP_NOTUPTO:
2540 case OP_NOTMINUPTO:
2541 if (isprint(c = code[3])) printf(" [^%c]{", c);
2542 else printf(" [^\\x%02x]{", c);
2543 if (*code != OP_NOTEXACT) printf(",");
2544 printf("%d}", (code[1] << 8) + code[2]);
2545 if (*code == OP_NOTMINUPTO) printf("?");
2546 code += 3;
2547 break;
2548
2549 case OP_REF:
2550 printf(" \\%d", *(++code));
2551 code ++;
2552 goto CLASS_REF_REPEAT;
2553
2554 case OP_CLASS:
2555 {
2556 int i, min, max;
2557 code++;
2558 printf(" [");
2559
2560 for (i = 0; i < 256; i++)
2561 {
2562 if ((code[i/8] & (1 << (i&7))) != 0)
2563 {
2564 int j;
2565 for (j = i+1; j < 256; j++)
2566 if ((code[j/8] & (1 << (j&7))) == 0) break;
2567 if (i == '-' || i == ']') printf("\\");
2568 if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2569 if (--j > i)
2570 {
2571 printf("-");
2572 if (j == '-' || j == ']') printf("\\");
2573 if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2574 }
2575 i = j;
2576 }
2577 }
2578 printf("]");
2579 code += 32;
2580
2581 CLASS_REF_REPEAT:
2582
2583 switch(*code)
2584 {
2585 case OP_CRSTAR:
2586 case OP_CRMINSTAR:
2587 case OP_CRPLUS:
2588 case OP_CRMINPLUS:
2589 case OP_CRQUERY:
2590 case OP_CRMINQUERY:
2591 printf("%s", OP_names[*code]);
2592 break;
2593
2594 case OP_CRRANGE:
2595 case OP_CRMINRANGE:
2596 min = (code[1] << 8) + code[2];
2597 max = (code[3] << 8) + code[4];
2598 if (max == 0) printf("{%d,}", min);
2599 else printf("{%d,%d}", min, max);
2600 if (*code == OP_CRMINRANGE) printf("?");
2601 code += 4;
2602 break;
2603
2604 default:
2605 code--;
2606 }
2607 }
2608 break;
2609
2610 /* Anything else is just a one-node item */
2611
2612 default:
2613 printf(" %s", OP_names[*code]);
2614 break;
2615 }
2616
2617 code++;
2618 printf("\n");
2619 }
2620 printf("------------------------------------------------------------------\n");
2621
2622 /* This check is done here in the debugging case so that the code that
2623 was compiled can be seen. */
2624
2625 if (code - re->code > length)
2626 {
2627 *errorptr = ERR23;
2628 (pcre_free)(re);
2629 *erroroffset = ptr - (uschar *)pattern;
2630 return NULL;
2631 }
2632 #endif
2633
2634 return (pcre *)re;
2635 }
2636
2637
2638
2639 /*************************************************
2640 * Match a character type *
2641 *************************************************/
2642
2643 /* Not used in all the places it might be as it's sometimes faster
2644 to put the code inline.
2645
2646 Arguments:
2647 type the character type
2648 c the character
2649 dotall the dotall flag
2650
2651 Returns: TRUE if character is of the type
2652 */
2653
2654 static BOOL
2655 match_type(int type, int c, BOOL dotall)
2656 {
2657
2658 #ifdef DEBUG
2659 if (isprint(c)) printf("matching subject %c against ", c);
2660 else printf("matching subject \\x%02x against ", c);
2661 printf("%s\n", OP_names[type]);
2662 #endif
2663
2664 switch(type)
2665 {
2666 case OP_ANY: return dotall || c != '\n';
2667 case OP_NOT_DIGIT: return (pcre_ctypes[c] & ctype_digit) == 0;
2668 case OP_DIGIT: return (pcre_ctypes[c] & ctype_digit) != 0;
2669 case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;
2670 case OP_WHITESPACE: return (pcre_ctypes[c] & ctype_space) != 0;
2671 case OP_NOT_WORDCHAR: return (pcre_ctypes[c] & ctype_word) == 0;
2672 case OP_WORDCHAR: return (pcre_ctypes[c] & ctype_word) != 0;
2673 }
2674 return FALSE;
2675 }
2676
2677
2678
2679 /*************************************************
2680 * Match a back-reference *
2681 *************************************************/
2682
2683 /* If a back reference hasn't been set, the length that is passed is greater
2684 than the number of characters left in the string, so the match fails.
2685
2686 Arguments:
2687 offset index into the offset vector
2688 eptr points into the subject
2689 length length to be matched
2690 md points to match data block
2691 ims the ims flags
2692
2693 Returns: TRUE if matched
2694 */
2695
2696 static BOOL
2697 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
2698 int ims)
2699 {
2700 const uschar *p = md->start_subject + md->offset_vector[offset];
2701
2702 #ifdef DEBUG
2703 if (eptr >= md->end_subject)
2704 printf("matching subject <null>");
2705 else
2706 {
2707 printf("matching subject ");
2708 pchars(eptr, length, TRUE, md);
2709 }
2710 printf(" against backref ");
2711 pchars(p, length, FALSE, md);
2712 printf("\n");
2713 #endif
2714
2715 /* Always fail if not enough characters left */
2716
2717 if (length > md->end_subject - eptr) return FALSE;
2718
2719 /* Separate the caselesss case for speed */
2720
2721 if ((ims & PCRE_CASELESS) != 0)
2722 { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; }
2723 else
2724 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
2725
2726 return TRUE;
2727 }
2728
2729
2730
2731 /*************************************************
2732 * Match from current position *
2733 *************************************************/
2734
2735 /* On entry ecode points to the first opcode, and eptr to the first character
2736 in the subject string, while eptrb holds the value of eptr at the start of the
2737 last bracketed group - used for breaking infinite loops matching zero-length
2738 strings.
2739
2740 Arguments:
2741 eptr pointer in subject
2742 ecode position in code
2743 offset_top current top pointer
2744 md pointer to "static" info for the match
2745 ims current /i, /m, and /s options
2746 condassert TRUE if called to check a condition assertion
2747 eptrb eptr at start of last bracket
2748
2749 Returns: TRUE if matched
2750 */
2751
2752 static BOOL
2753 match(register const uschar *eptr, register const uschar *ecode,
2754 int offset_top, match_data *md, int ims, BOOL condassert, const uschar *eptrb)
2755 {
2756 int original_ims = ims; /* Save for resetting on ')' */
2757
2758 for (;;)
2759 {
2760 int op = (int)*ecode;
2761 int min, max, ctype;
2762 register int i;
2763 register int c;
2764 BOOL minimize = FALSE;
2765
2766 /* Opening capturing bracket. If there is space in the offset vector, save
2767 the current subject position in the working slot at the top of the vector. We
2768 mustn't change the current values of the data slot, because they may be set
2769 from a previous iteration of this group, and be referred to by a reference
2770 inside the group.
2771
2772 If the bracket fails to match, we need to restore this value and also the
2773 values of the final offsets, in case they were set by a previous iteration of
2774 the same bracket.
2775
2776 If there isn't enough space in the offset vector, treat this as if it were a
2777 non-capturing bracket. Don't worry about setting the flag for the error case
2778 here; that is handled in the code for KET. */
2779
2780 if (op > OP_BRA)
2781 {
2782 int number = op - OP_BRA;
2783 int offset = number << 1;
2784
2785 DPRINTF(("start bracket %d\n", number));
2786
2787 if (offset < md->offset_max)
2788 {
2789 int save_offset1 = md->offset_vector[offset];
2790 int save_offset2 = md->offset_vector[offset+1];
2791 int save_offset3 = md->offset_vector[md->offset_end - number];
2792
2793 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
2794 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
2795
2796 do
2797 {
2798 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
2799 ecode += (ecode[1] << 8) + ecode[2];
2800 }
2801 while (*ecode == OP_ALT);
2802
2803 DPRINTF(("bracket %d failed\n", number));
2804
2805 md->offset_vector[offset] = save_offset1;
2806 md->offset_vector[offset+1] = save_offset2;
2807 md->offset_vector[md->offset_end - number] = save_offset3;
2808 return FALSE;
2809 }
2810
2811 /* Insufficient room for saving captured contents */
2812
2813 else op = OP_BRA;
2814 }
2815
2816 /* Other types of node can be handled by a switch */
2817
2818 switch(op)
2819 {
2820 case OP_BRA: /* Non-capturing bracket: optimized */
2821 DPRINTF(("start bracket 0\n"));
2822 do
2823 {
2824 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
2825 ecode += (ecode[1] << 8) + ecode[2];
2826 }
2827 while (*ecode == OP_ALT);
2828 DPRINTF(("bracket 0 failed\n"));
2829 return FALSE;
2830
2831 /* Conditional group: compilation checked that there are no more than
2832 two branches. If the condition is false, skipping the first branch takes us
2833 past the end if there is only one branch, but that's OK because that is
2834 exactly what going to the ket would do. */
2835
2836 case OP_COND:
2837 if (ecode[3] == OP_CREF) /* Condition is extraction test */
2838 {
2839 int offset = ecode[4] << 1; /* Doubled reference number */
2840 return match(eptr,
2841 ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
2842 5 : 3 + (ecode[1] << 8) + ecode[2]),
2843 offset_top, md, ims, FALSE, eptr);
2844 }
2845
2846 /* The condition is an assertion. Call match() to evaluate it - setting
2847 the final argument TRUE causes it to stop at the end of an assertion. */
2848
2849 else
2850 {
2851 if (match(eptr, ecode+3, offset_top, md, ims, TRUE, NULL))
2852 {
2853 ecode += 3 + (ecode[4] << 8) + ecode[5];
2854 while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
2855 }
2856 else ecode += (ecode[1] << 8) + ecode[2];
2857 return match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr);
2858 }
2859 /* Control never reaches here */
2860
2861 /* Skip over conditional reference data if encountered (should not be) */
2862
2863 case OP_CREF:
2864 ecode += 2;
2865 break;
2866
2867 /* End of the pattern */
2868
2869 case OP_END:
2870 md->end_match_ptr = eptr; /* Record where we ended */
2871 md->end_offset_top = offset_top; /* and how many extracts were taken */
2872 return TRUE;
2873
2874 /* Change option settings */
2875
2876 case OP_OPT:
2877 ims = ecode[1];
2878 ecode += 2;
2879 DPRINTF(("ims set to %02x\n", ims));
2880 break;
2881
2882 /* Assertion brackets. Check the alternative branches in turn - the
2883 matching won't pass the KET for an assertion. If any one branch matches,
2884 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
2885 start of each branch to move the current point backwards, so the code at
2886 this level is identical to the lookahead case. */
2887
2888 case OP_ASSERT:
2889 case OP_ASSERTBACK:
2890 do
2891 {
2892 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) break;
2893 ecode += (ecode[1] << 8) + ecode[2];
2894 }
2895 while (*ecode == OP_ALT);
2896 if (*ecode == OP_KET) return FALSE;
2897
2898 /* If checking an assertion for a condition, return TRUE. */
2899
2900 if (condassert) return TRUE;
2901
2902 /* Continue from after the assertion, updating the offsets high water
2903 mark, since extracts may have been taken during the assertion. */
2904
2905 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2906 ecode += 3;
2907 offset_top = md->end_offset_top;
2908 continue;
2909
2910 /* Negative assertion: all branches must fail to match */
2911
2912 case OP_ASSERT_NOT:
2913 case OP_ASSERTBACK_NOT:
2914 do
2915 {
2916 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) return FALSE;
2917 ecode += (ecode[1] << 8) + ecode[2];
2918 }
2919 while (*ecode == OP_ALT);
2920
2921 if (condassert) return TRUE;
2922 ecode += 3;
2923 continue;
2924
2925 /* Move the subject pointer back. This occurs only at the start of
2926 each branch of a lookbehind assertion. If we are too close to the start to
2927 move back, this match function fails. */
2928
2929 case OP_REVERSE:
2930 eptr -= (ecode[1] << 8) + ecode[2];
2931 if (eptr < md->start_subject) return FALSE;
2932 ecode += 3;
2933 break;
2934
2935
2936 /* "Once" brackets are like assertion brackets except that after a match,
2937 the point in the subject string is not moved back. Thus there can never be
2938 a move back into the brackets. Check the alternative branches in turn - the
2939 matching won't pass the KET for this kind of subpattern. If any one branch
2940 matches, we carry on as at the end of a normal bracket, leaving the subject
2941 pointer. */
2942
2943 case OP_ONCE:
2944 {
2945 const uschar *prev = ecode;
2946
2947 do
2948 {
2949 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) break;
2950 ecode += (ecode[1] << 8) + ecode[2];
2951 }
2952 while (*ecode == OP_ALT);
2953
2954 /* If hit the end of the group (which could be repeated), fail */
2955
2956 if (*ecode != OP_ONCE && *ecode != OP_ALT) return FALSE;
2957
2958 /* Continue as from after the assertion, updating the offsets high water
2959 mark, since extracts may have been taken. */
2960
2961 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2962
2963 offset_top = md->end_offset_top;
2964 eptr = md->end_match_ptr;
2965
2966 /* For a non-repeating ket, just continue at this level. This also
2967 happens for a repeating ket if no characters were matched in the group.
2968 This is the forcible breaking of infinite loops as implemented in Perl
2969 5.005. If there is an options reset, it will get obeyed in the normal
2970 course of events. */
2971
2972 if (*ecode == OP_KET || eptr == eptrb)
2973 {
2974 ecode += 3;
2975 break;
2976 }
2977
2978 /* The repeating kets try the rest of the pattern or restart from the
2979 preceding bracket, in the appropriate order. We need to reset any options
2980 that changed within the bracket before re-running it, so check the next
2981 opcode. */
2982
2983 if (ecode[3] == OP_OPT)
2984 {
2985 ims = (ims & ~PCRE_IMS) | ecode[4];
2986 DPRINTF(("ims set to %02x at group repeat\n", ims));
2987 }
2988
2989 if (*ecode == OP_KETRMIN)
2990 {
2991 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||
2992 match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;
2993 }
2994 else /* OP_KETRMAX */
2995 {
2996 if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||
2997 match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
2998 }
2999 }
3000 return FALSE;
3001
3002 /* An alternation is the end of a branch; scan along to find the end of the
3003 bracketed group and go to there. */
3004
3005 case OP_ALT:
3006 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3007 break;
3008
3009 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3010 that it may occur zero times. It may repeat infinitely, or not at all -
3011 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3012 repeat limits are compiled as a number of copies, with the optional ones
3013 preceded by BRAZERO or BRAMINZERO. */
3014
3015 case OP_BRAZERO:
3016 {
3017 const uschar *next = ecode+1;
3018 if (match(eptr, next, offset_top, md, ims, FALSE, eptr)) return TRUE;
3019 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3020 ecode = next + 3;
3021 }
3022 break;
3023
3024 case OP_BRAMINZERO:
3025 {
3026 const uschar *next = ecode+1;
3027 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3028 if (match(eptr, next+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3029 ecode++;
3030 }
3031 break;
3032
3033 /* End of a group, repeated or non-repeating. If we are at the end of
3034 an assertion "group", stop matching and return TRUE, but record the
3035 current high water mark for use by positive assertions. Do this also
3036 for the "once" (not-backup up) groups. */
3037
3038 case OP_KET:
3039 case OP_KETRMIN:
3040 case OP_KETRMAX:
3041 {
3042 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3043
3044 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
3045 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
3046 *prev == OP_ONCE)
3047 {
3048 md->end_match_ptr = eptr; /* For ONCE */
3049 md->end_offset_top = offset_top;
3050 return TRUE;
3051 }
3052
3053 /* In all other cases except a conditional group we have to check the
3054 group number back at the start and if necessary complete handling an
3055 extraction by setting the offsets and bumping the high water mark. */
3056
3057 if (*prev != OP_COND)
3058 {
3059 int number = *prev - OP_BRA;
3060 int offset = number << 1;
3061
3062 DPRINTF(("end bracket %d\n", number));
3063
3064 if (number > 0)
3065 {
3066 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
3067 {
3068 md->offset_vector[offset] =
3069 md->offset_vector[md->offset_end - number];
3070 md->offset_vector[offset+1] = eptr - md->start_subject;
3071 if (offset_top <= offset) offset_top = offset + 2;
3072 }
3073 }
3074 }
3075
3076 /* Reset the value of the ims flags, in case they got changed during
3077 the group. */
3078
3079 ims = original_ims;
3080 DPRINTF(("ims reset to %02x\n", ims));
3081
3082 /* For a non-repeating ket, just continue at this level. This also
3083 happens for a repeating ket if no characters were matched in the group.
3084 This is the forcible breaking of infinite loops as implemented in Perl
3085 5.005. If there is an options reset, it will get obeyed in the normal
3086 course of events. */
3087
3088 if (*ecode == OP_KET || eptr == eptrb)
3089 {
3090 ecode += 3;
3091 break;
3092 }
3093
3094 /* The repeating kets try the rest of the pattern or restart from the
3095 preceding bracket, in the appropriate order. */
3096
3097 if (*ecode == OP_KETRMIN)
3098 {
3099 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||
3100 match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;
3101 }
3102 else /* OP_KETRMAX */
3103 {
3104 if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||
3105 match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3106 }
3107 }
3108 return FALSE;
3109
3110 /* Start of subject unless notbol, or after internal newline if multiline */
3111
3112 case OP_CIRC:
3113 if (md->notbol && eptr == md->start_subject) return FALSE;
3114 if ((ims & PCRE_MULTILINE) != 0)
3115 {
3116 if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;
3117 ecode++;
3118 break;
3119 }
3120 /* ... else fall through */
3121
3122 /* Start of subject assertion */
3123
3124 case OP_SOD:
3125 if (eptr != md->start_subject) return FALSE;
3126 ecode++;
3127 break;
3128
3129 /* Assert before internal newline if multiline, or before a terminating
3130 newline unless endonly is set, else end of subject unless noteol is set. */
3131
3132 case OP_DOLL:
3133 if ((ims & PCRE_MULTILINE) != 0)
3134 {
3135 if (eptr < md->end_subject) { if (*eptr != '\n') return FALSE; }
3136 else { if (md->noteol) return FALSE; }
3137 ecode++;
3138 break;
3139 }
3140 else
3141 {
3142 if (md->noteol) return FALSE;
3143 if (!md->endonly)
3144 {
3145 if (eptr < md->end_subject - 1 ||
3146 (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3147
3148 ecode++;
3149 break;
3150 }
3151 }
3152 /* ... else fall through */
3153
3154 /* End of subject assertion (\z) */
3155
3156 case OP_EOD:
3157 if (eptr < md->end_subject) return FALSE;
3158 ecode++;
3159 break;
3160
3161 /* End of subject or ending \n assertion (\Z) */
3162
3163 case OP_EODN:
3164 if (eptr < md->end_subject - 1 ||
3165 (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3166 ecode++;
3167 break;
3168
3169 /* Word boundary assertions */
3170
3171 case OP_NOT_WORD_BOUNDARY:
3172 case OP_WORD_BOUNDARY:
3173 {
3174 BOOL prev_is_word = (eptr != md->start_subject) &&
3175 ((pcre_ctypes[eptr[-1]] & ctype_word) != 0);
3176 BOOL cur_is_word = (eptr < md->end_subject) &&
3177 ((pcre_ctypes[*eptr] & ctype_word) != 0);
3178 if ((*ecode++ == OP_WORD_BOUNDARY)?
3179 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3180 return FALSE;
3181 }
3182 break;
3183
3184 /* Match a single character type; inline for speed */
3185
3186 case OP_ANY:
3187 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == '\n')
3188 return FALSE;
3189 if (eptr++ >= md->end_subject) return FALSE;
3190 ecode++;
3191 break;
3192
3193 case OP_NOT_DIGIT:
3194 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) != 0)
3195 return FALSE;
3196 ecode++;
3197 break;
3198
3199 case OP_DIGIT:
3200 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) == 0)
3201 return FALSE;
3202 ecode++;
3203 break;
3204
3205 case OP_NOT_WHITESPACE:
3206 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) != 0)
3207 return FALSE;
3208 ecode++;
3209 break;
3210
3211 case OP_WHITESPACE:
3212 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) == 0)
3213 return FALSE;
3214 ecode++;
3215 break;
3216
3217 case OP_NOT_WORDCHAR:
3218 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) != 0)
3219 return FALSE;
3220 ecode++;
3221 break;
3222
3223 case OP_WORDCHAR:
3224 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) == 0)
3225 return FALSE;
3226 ecode++;
3227 break;
3228
3229 /* Match a back reference, possibly repeatedly. Look past the end of the
3230 item to see if there is repeat information following. The code is similar
3231 to that for character classes, but repeated for efficiency. Then obey
3232 similar code to character type repeats - written out again for speed.
3233 However, if the referenced string is the empty string, always treat
3234 it as matched, any number of times (otherwise there could be infinite
3235 loops). */
3236
3237 case OP_REF:
3238 {
3239 int length;
3240 int offset = ecode[1] << 1; /* Doubled reference number */
3241 ecode += 2; /* Advance past the item */
3242
3243 /* If the reference is unset, set the length to be longer than the amount
3244 of subject left; this ensures that every attempt at a match fails. We
3245 can't just fail here, because of the possibility of quantifiers with zero
3246 minima. */
3247
3248 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
3249 md->end_subject - eptr + 1 :
3250 md->offset_vector[offset+1] - md->offset_vector[offset];
3251
3252 /* Set up for repetition, or handle the non-repeated case */
3253
3254 switch (*ecode)
3255 {
3256 case OP_CRSTAR:
3257 case OP_CRMINSTAR:
3258 case OP_CRPLUS:
3259 case OP_CRMINPLUS:
3260 case OP_CRQUERY:
3261 case OP_CRMINQUERY:
3262 c = *ecode++ - OP_CRSTAR;
3263 minimize = (c & 1) != 0;
3264 min = rep_min[c]; /* Pick up values from tables; */
3265 max = rep_max[c]; /* zero for max => infinity */
3266 if (max == 0) max = INT_MAX;
3267 break;
3268
3269 case OP_CRRANGE:
3270 case OP_CRMINRANGE:
3271 minimize = (*ecode == OP_CRMINRANGE);
3272 min = (ecode[1] << 8) + ecode[2];
3273 max = (ecode[3] << 8) + ecode[4];
3274 if (max == 0) max = INT_MAX;
3275 ecode += 5;
3276 break;
3277
3278 default: /* No repeat follows */
3279 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
3280 eptr += length;
3281 continue; /* With the main loop */
3282 }
3283
3284 /* If the length of the reference is zero, just continue with the
3285 main loop. */
3286
3287 if (length == 0) continue;
3288
3289 /* First, ensure the minimum number of matches are present. We get back
3290 the length of the reference string explicitly rather than passing the
3291 address of eptr, so that eptr can be a register variable. */
3292
3293 for (i = 1; i <= min; i++)
3294 {
3295 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
3296 eptr += length;
3297 }
3298
3299 /* If min = max, continue at the same level without recursion.
3300 They are not both allowed to be zero. */
3301
3302 if (min == max) continue;
3303
3304 /* If minimizing, keep trying and advancing the pointer */
3305
3306 if (minimize)
3307 {
3308 for (i = min;; i++)
3309 {
3310 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3311 return TRUE;
3312 if (i >= max || !match_ref(offset, eptr, length, md, ims))
3313 return FALSE;
3314 eptr += length;
3315 }
3316 /* Control never gets here */
3317 }
3318
3319 /* If maximizing, find the longest string and work backwards */
3320
3321 else
3322 {
3323 const uschar *pp = eptr;
3324 for (i = min; i < max; i++)
3325 {
3326 if (!match_ref(offset, eptr, length, md, ims)) break;
3327 eptr += length;
3328 }
3329 while (eptr >= pp)
3330 {
3331 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3332 return TRUE;
3333 eptr -= length;
3334 }
3335 return FALSE;
3336 }
3337 }
3338 /* Control never gets here */
3339
3340
3341
3342 /* Match a character class, possibly repeatedly. Look past the end of the
3343 item to see if there is repeat information following. Then obey similar
3344 code to character type repeats - written out again for speed. */
3345
3346 case OP_CLASS:
3347 {
3348 const uschar *data = ecode + 1; /* Save for matching */
3349 ecode += 33; /* Advance past the item */
3350
3351 switch (*ecode)
3352 {
3353 case OP_CRSTAR:
3354 case OP_CRMINSTAR:
3355 case OP_CRPLUS:
3356 case OP_CRMINPLUS:
3357 case OP_CRQUERY:
3358 case OP_CRMINQUERY:
3359 c = *ecode++ - OP_CRSTAR;
3360 minimize = (c & 1) != 0;
3361 min = rep_min[c]; /* Pick up values from tables; */
3362 max = rep_max[c]; /* zero for max => infinity */
3363 if (max == 0) max = INT_MAX;
3364 break;
3365
3366 case OP_CRRANGE:
3367 case OP_CRMINRANGE:
3368 minimize = (*ecode == OP_CRMINRANGE);
3369 min = (ecode[1] << 8) + ecode[2];
3370 max = (ecode[3] << 8) + ecode[4];
3371 if (max == 0) max = INT_MAX;
3372 ecode += 5;
3373 break;
3374
3375 default: /* No repeat follows */
3376 min = max = 1;
3377 break;
3378 }
3379
3380 /* First, ensure the minimum number of matches are present. */
3381
3382 for (i = 1; i <= min; i++)
3383 {
3384 if (eptr >= md->end_subject) return FALSE;
3385 c = *eptr++;
3386 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3387 return FALSE;
3388 }
3389
3390 /* If max == min we can continue with the main loop without the
3391 need to recurse. */
3392
3393 if (min == max) continue;
3394
3395 /* If minimizing, keep testing the rest of the expression and advancing
3396 the pointer while it matches the class. */
3397
3398 if (minimize)
3399 {
3400 for (i = min;; i++)
3401 {
3402 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3403 return TRUE;
3404 if (i >= max || eptr >= md->end_subject) return FALSE;
3405 c = *eptr++;
3406 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3407 return FALSE;
3408 }
3409 /* Control never gets here */
3410 }
3411
3412 /* If maximizing, find the longest possible run, then work backwards. */
3413
3414 else
3415 {
3416 const uschar *pp = eptr;
3417 for (i = min; i < max; eptr++, i++)
3418 {
3419 if (eptr >= md->end_subject) break;
3420 c = *eptr;
3421 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3422 break;
3423 }
3424
3425 while (eptr >= pp)
3426 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3427 return TRUE;
3428 return FALSE;
3429 }
3430 }
3431 /* Control never gets here */
3432
3433 /* Match a run of characters */
3434
3435 case OP_CHARS:
3436 {
3437 register int length = ecode[1];
3438 ecode += 2;
3439
3440 #ifdef DEBUG /* Sigh. Some compilers never learn. */
3441 if (eptr >= md->end_subject)
3442 printf("matching subject <null> against pattern ");
3443 else
3444 {
3445 printf("matching subject ");
3446 pchars(eptr, length, TRUE, md);
3447 printf(" against pattern ");
3448 }
3449 pchars(ecode, length, FALSE, md);
3450 printf("\n");
3451 #endif
3452
3453 if (length > md->end_subject - eptr) return FALSE;
3454 if ((ims & PCRE_CASELESS) != 0)
3455 {
3456 while (length-- > 0) if (pcre_lcc[*ecode++] != pcre_lcc[*eptr++]) return FALSE;
3457 }
3458 else
3459 {
3460 while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
3461 }
3462 }
3463 break;
3464
3465 /* Match a single character repeatedly; different opcodes share code. */
3466
3467 case OP_EXACT:
3468 min = max = (ecode[1] << 8) + ecode[2];
3469 ecode += 3;
3470 goto REPEATCHAR;
3471
3472 case OP_UPTO:
3473 case OP_MINUPTO:
3474 min = 0;
3475 max = (ecode[1] << 8) + ecode[2];
3476 minimize = *ecode == OP_MINUPTO;
3477 ecode += 3;
3478 goto REPEATCHAR;
3479
3480 case OP_STAR:
3481 case OP_MINSTAR:
3482 case OP_PLUS:
3483 case OP_MINPLUS:
3484 case OP_QUERY:
3485 case OP_MINQUERY:
3486 c = *ecode++ - OP_STAR;
3487 minimize = (c & 1) != 0;
3488 min = rep_min[c]; /* Pick up values from tables; */
3489 max = rep_max[c]; /* zero for max => infinity */
3490 if (max == 0) max = INT_MAX;
3491
3492 /* Common code for all repeated single-character matches. We can give
3493 up quickly if there are fewer than the minimum number of characters left in
3494 the subject. */
3495
3496 REPEATCHAR:
3497 if (min > md->end_subject - eptr) return FALSE;
3498 c = *ecode++;
3499
3500 /* The code is duplicated for the caseless and caseful cases, for speed,
3501 since matching characters is likely to be quite common. First, ensure the
3502 minimum number of matches are present. If min = max, continue at the same
3503 level without recursing. Otherwise, if minimizing, keep trying the rest of
3504 the expression and advancing one matching character if failing, up to the
3505 maximum. Alternatively, if maximizing, find the maximum number of
3506 characters and work backwards. */
3507
3508 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
3509 max, eptr));
3510
3511 if ((ims & PCRE_CASELESS) != 0)
3512 {
3513 c = pcre_lcc[c];
3514 for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) return FALSE;
3515 if (min == max) continue;
3516 if (minimize)
3517 {
3518 for (i = min;; i++)
3519 {
3520 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3521 return TRUE;
3522 if (i >= max || eptr >= md->end_subject || c != pcre_lcc[*eptr++])
3523 return FALSE;
3524 }
3525 /* Control never gets here */
3526 }
3527 else
3528 {
3529 const uschar *pp = eptr;
3530 for (i = min; i < max; i++)
3531 {
3532 if (eptr >= md->end_subject || c != pcre_lcc[*eptr]) break;
3533 eptr++;
3534 }
3535 while (eptr >= pp)
3536 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3537 return TRUE;
3538 return FALSE;
3539 }
3540 /* Control never gets here */
3541 }
3542
3543 /* Caseful comparisons */
3544
3545 else
3546 {
3547 for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
3548 if (min == max) continue;
3549 if (minimize)
3550 {
3551 for (i = min;; i++)
3552 {
3553 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3554 return TRUE;
3555 if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
3556 }
3557 /* Control never gets here */
3558 }
3559 else
3560 {
3561 const uschar *pp = eptr;
3562 for (i = min; i < max; i++)
3563 {
3564 if (eptr >= md->end_subject || c != *eptr) break;
3565 eptr++;
3566 }
3567 while (eptr >= pp)
3568 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3569 return TRUE;
3570 return FALSE;
3571 }
3572 }
3573 /* Control never gets here */
3574
3575 /* Match a negated single character */
3576
3577 case OP_NOT:
3578 if (eptr >= md->end_subject) return FALSE;
3579 ecode++;
3580 if ((ims & PCRE_CASELESS) != 0)
3581 {
3582 if (pcre_lcc[*ecode++] == pcre_lcc[*eptr++]) return FALSE;
3583 }
3584 else
3585 {
3586 if (*ecode++ == *eptr++) return FALSE;
3587 }
3588 break;
3589
3590 /* Match a negated single character repeatedly. This is almost a repeat of
3591 the code for a repeated single character, but I haven't found a nice way of
3592 commoning these up that doesn't require a test of the positive/negative
3593 option for each character match. Maybe that wouldn't add very much to the
3594 time taken, but character matching *is* what this is all about... */
3595
3596 case OP_NOTEXACT:
3597 min = max = (ecode[1] << 8) + ecode[2];
3598 ecode += 3;
3599 goto REPEATNOTCHAR;
3600
3601 case OP_NOTUPTO:
3602 case OP_NOTMINUPTO:
3603 min = 0;
3604 max = (ecode[1] << 8) + ecode[2];
3605 minimize = *ecode == OP_NOTMINUPTO;
3606 ecode += 3;
3607 goto REPEATNOTCHAR;
3608
3609 case OP_NOTSTAR:
3610 case OP_NOTMINSTAR:
3611 case OP_NOTPLUS:
3612 case OP_NOTMINPLUS:
3613 case OP_NOTQUERY:
3614 case OP_NOTMINQUERY:
3615 c = *ecode++ - OP_NOTSTAR;
3616 minimize = (c & 1) != 0;
3617 min = rep_min[c]; /* Pick up values from tables; */
3618 max = rep_max[c]; /* zero for max => infinity */
3619 if (max == 0) max = INT_MAX;
3620
3621 /* Common code for all repeated single-character matches. We can give
3622 up quickly if there are fewer than the minimum number of characters left in
3623 the subject. */
3624
3625 REPEATNOTCHAR:
3626 if (min > md->end_subject - eptr) return FALSE;
3627 c = *ecode++;
3628
3629 /* The code is duplicated for the caseless and caseful cases, for speed,
3630 since matching characters is likely to be quite common. First, ensure the
3631 minimum number of matches are present. If min = max, continue at the same
3632 level without recursing. Otherwise, if minimizing, keep trying the rest of
3633 the expression and advancing one matching character if failing, up to the
3634 maximum. Alternatively, if maximizing, find the maximum number of
3635 characters and work backwards. */
3636
3637 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
3638 max, eptr));
3639
3640 if ((ims & PCRE_CASELESS) != 0)
3641 {
3642 c = pcre_lcc[c];
3643 for (i = 1; i <= min; i++) if (c == pcre_lcc[*eptr++]) return FALSE;
3644 if (min == max) continue;
3645 if (minimize)
3646 {
3647 for (i = min;; i++)
3648 {
3649 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3650 return TRUE;
3651 if (i >= max || eptr >= md->end_subject || c == pcre_lcc[*eptr++])
3652 return FALSE;
3653 }
3654 /* Control never gets here */
3655 }
3656 else
3657 {
3658 const uschar *pp = eptr;
3659 for (i = min; i < max; i++)
3660 {
3661 if (eptr >= md->end_subject || c == pcre_lcc[*eptr]) break;
3662 eptr++;
3663 }
3664 while (eptr >= pp)
3665 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3666 return TRUE;
3667 return FALSE;
3668 }
3669 /* Control never gets here */
3670 }
3671
3672 /* Caseful comparisons */
3673
3674 else
3675 {
3676 for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
3677 if (min == max) continue;
3678 if (minimize)
3679 {
3680 for (i = min;; i++)
3681 {
3682 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3683 return TRUE;
3684 if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
3685 }
3686 /* Control never gets here */
3687 }
3688 else
3689 {
3690 const uschar *pp = eptr;
3691 for (i = min; i < max; i++)
3692 {
3693 if (eptr >= md->end_subject || c == *eptr) break;
3694 eptr++;
3695 }
3696 while (eptr >= pp)
3697 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3698 return TRUE;
3699 return FALSE;
3700 }
3701 }
3702 /* Control never gets here */
3703
3704 /* Match a single character type repeatedly; several different opcodes
3705 share code. This is very similar to the code for single characters, but we
3706 repeat it in the interests of efficiency. */
3707
3708 case OP_TYPEEXACT:
3709 min = max = (ecode[1] << 8) + ecode[2];
3710 minimize = TRUE;
3711 ecode += 3;
3712 goto REPEATTYPE;
3713
3714 case OP_TYPEUPTO:
3715 case OP_TYPEMINUPTO:
3716 min = 0;
3717 max = (ecode[1] << 8) + ecode[2];
3718 minimize = *ecode == OP_TYPEMINUPTO;
3719 ecode += 3;
3720 goto REPEATTYPE;
3721
3722 case OP_TYPESTAR:
3723 case OP_TYPEMINSTAR:
3724 case OP_TYPEPLUS:
3725 case OP_TYPEMINPLUS:
3726 case OP_TYPEQUERY:
3727 case OP_TYPEMINQUERY:
3728 c = *ecode++ - OP_TYPESTAR;
3729 minimize = (c & 1) != 0;
3730 min = rep_min[c]; /* Pick up values from tables; */
3731 max = rep_max[c]; /* zero for max => infinity */
3732 if (max == 0) max = INT_MAX;
3733
3734 /* Common code for all repeated single character type matches */
3735
3736 REPEATTYPE:
3737 ctype = *ecode++; /* Code for the character type */
3738
3739 /* First, ensure the minimum number of matches are present. Use inline
3740 code for maximizing the speed, and do the type test once at the start
3741 (i.e. keep it out of the loop). Also test that there are at least the
3742 minimum number of characters before we start. */
3743
3744 if (min > md->end_subject - eptr) return FALSE;
3745 if (min > 0) switch(ctype)
3746 {
3747 case OP_ANY:
3748 if ((ims & PCRE_DOTALL) == 0)
3749 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }
3750 else eptr += min;
3751 break;
3752
3753 case OP_NOT_DIGIT:
3754 for (i = 1; i <= min; i++)
3755 if ((pcre_ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
3756 break;
3757
3758 case OP_DIGIT:
3759 for (i = 1; i <= min; i++)
3760 if ((pcre_ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
3761 break;
3762
3763 case OP_NOT_WHITESPACE:
3764 for (i = 1; i <= min; i++)
3765 if ((pcre_ctypes[*eptr++] & ctype_space) != 0) return FALSE;
3766 break;
3767
3768 case OP_WHITESPACE:
3769 for (i = 1; i <= min; i++)
3770 if ((pcre_ctypes[*eptr++] & ctype_space) == 0) return FALSE;
3771 break;
3772
3773 case OP_NOT_WORDCHAR:
3774 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) != 0)
3775 return FALSE;
3776 break;
3777
3778 case OP_WORDCHAR:
3779 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) == 0)
3780 return FALSE;
3781 break;
3782 }
3783
3784 /* If min = max, continue at the same level without recursing */
3785
3786 if (min == max) continue;
3787
3788 /* If minimizing, we have to test the rest of the pattern before each
3789 subsequent match, so inlining isn't much help; just use the function. */
3790
3791 if (minimize)
3792 {
3793 for (i = min;; i++)
3794 {
3795 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb)) return TRUE;
3796 if (i >= max || eptr >= md->end_subject ||
3797 !match_type(ctype, *eptr++, (ims & PCRE_DOTALL) != 0))
3798 return FALSE;
3799 }
3800 /* Control never gets here */
3801 }
3802
3803 /* If maximizing it is worth using inline code for speed, doing the type
3804 test once at the start (i.e. keep it out of the loop). */
3805
3806 else
3807 {
3808 const uschar *pp = eptr;
3809 switch(ctype)
3810 {
3811 case OP_ANY:
3812 if ((ims & PCRE_DOTALL) == 0)
3813 {
3814 for (i = min; i < max; i++)
3815 {
3816 if (eptr >= md->end_subject || *eptr == '\n') break;
3817 eptr++;
3818 }
3819 }
3820 else
3821 {
3822 c = max - min;
3823 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
3824 eptr += c;
3825 }
3826 break;
3827
3828 case OP_NOT_DIGIT:
3829 for (i = min; i < max; i++)
3830 {
3831 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) != 0)
3832 break;
3833 eptr++;
3834 }
3835 break;
3836
3837 case OP_DIGIT:
3838 for (i = min; i < max; i++)
3839 {
3840 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) == 0)
3841 break;
3842 eptr++;
3843 }
3844 break;
3845
3846 case OP_NOT_WHITESPACE:
3847 for (i = min; i < max; i++)
3848 {
3849 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) != 0)
3850 break;
3851 eptr++;
3852 }
3853 break;
3854
3855 case OP_WHITESPACE:
3856 for (i = min; i < max; i++)
3857 {
3858 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) == 0)
3859 break;
3860 eptr++;
3861 }
3862 break;
3863
3864 case OP_NOT_WORDCHAR:
3865 for (i = min; i < max; i++)
3866 {
3867 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) != 0)
3868 break;
3869 eptr++;
3870 }
3871 break;
3872
3873 case OP_WORDCHAR:
3874 for (i = min; i < max; i++)
3875 {
3876 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) == 0)
3877 break;
3878 eptr++;
3879 }
3880 break;
3881 }
3882
3883 while (eptr >= pp)
3884 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3885 return TRUE;
3886 return FALSE;
3887 }
3888 /* Control never gets here */
3889
3890 /* There's been some horrible disaster. */
3891
3892 default:
3893 DPRINTF(("Unknown opcode %d\n", *ecode));
3894 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
3895 return FALSE;
3896 }
3897
3898 /* Do not stick any code in here without much thought; it is assumed
3899 that "continue" in the code above comes out to here to repeat the main
3900 loop. */
3901
3902 } /* End of main loop */
3903 /* Control never reaches here */
3904 }
3905
3906
3907
3908
3909 /*************************************************
3910 * Execute a Regular Expression *
3911 *************************************************/
3912
3913 /* This function applies a compiled re to a subject string and picks out
3914 portions of the string if it matches. Two elements in the vector are set for
3915 each substring: the offsets to the start and end of the substring.
3916
3917 Arguments:
3918 external_re points to the compiled expression
3919 external_extra points to "hints" from pcre_study() or is NULL
3920 subject points to the subject string
3921 length length of subject string (may contain binary zeros)
3922 options option bits
3923 offsets points to a vector of ints to be filled in with offsets
3924 offsetcount the number of elements in the vector
3925
3926 Returns: > 0 => success; value is the number of elements filled in
3927 = 0 => success, but offsets is not big enough
3928 -1 => failed to match
3929 < -1 => some kind of unexpected problem
3930 */
3931
3932 int
3933 pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
3934 const char *subject, int length, int options, int *offsets, int offsetcount)
3935 {
3936 int resetcount, ocount;
3937 int first_char = -1;
3938 int ims = 0;
3939 match_data match_block;
3940 const uschar *start_bits = NULL;
3941 const uschar *start_match = (const uschar *)subject;
3942 const uschar *end_subject;
3943 const real_pcre *re = (const real_pcre *)external_re;
3944 const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
3945 BOOL using_temporary_offsets = FALSE;
3946 BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
3947 BOOL startline = (re->options & PCRE_STARTLINE) != 0;
3948
3949 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3950
3951 if (re == NULL || subject == NULL ||
3952 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3953 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
3954
3955 match_block.start_subject = (const uschar *)subject;
3956 match_block.end_subject = match_block.start_subject + length;
3957 end_subject = match_block.end_subject;
3958
3959 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
3960
3961 match_block.notbol = (options & PCRE_NOTBOL) != 0;
3962 match_block.noteol = (options & PCRE_NOTEOL) != 0;
3963
3964 match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
3965
3966 /* The ims options can vary during the matching as a result of the presence
3967 of (?ims) items in the pattern. They are kept in a local variable so that
3968 restoring at the exit of a group is easy. */
3969
3970 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
3971
3972 /* If the expression has got more back references than the offsets supplied can
3973 hold, we get a temporary bit of working store to use during the matching.
3974 Otherwise, we can use the vector supplied, rounding down its size to a multiple
3975 of 3. */
3976
3977 ocount = offsetcount - (offsetcount % 3);
3978
3979 if (re->top_backref > 0 && re->top_backref >= ocount/3)
3980 {
3981 ocount = re->top_backref * 3 + 3;
3982 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
3983 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
3984 using_temporary_offsets = TRUE;
3985 DPRINTF(("Got memory to hold back references\n"));
3986 }
3987 else match_block.offset_vector = offsets;
3988
3989 match_block.offset_end = ocount;
3990 match_block.offset_max = (2*ocount)/3;
3991 match_block.offset_overflow = FALSE;
3992
3993 /* Compute the minimum number of offsets that we need to reset each time. Doing
3994 this makes a huge difference to execution time when there aren't many brackets
3995 in the pattern. */
3996
3997 resetcount = 2 + re->top_bracket * 2;
3998 if (resetcount > offsetcount) resetcount = ocount;
3999
4000 /* Set up the first character to match, if available. The first_char value is
4001 never set for an anchored regular expression, but the anchoring may be forced
4002 at run time, so we have to test for anchoring. The first char may be unset for
4003 an unanchored pattern, of course. If there's no first char and the pattern was
4004 studied, there may be a bitmap of possible first characters. */
4005
4006 if (!anchored)
4007 {
4008 if ((re->options & PCRE_FIRSTSET) != 0)
4009 {
4010 first_char = re->first_char;
4011 if ((ims & PCRE_CASELESS) != 0) first_char = pcre_lcc[first_char];
4012 }
4013 else
4014 if (!startline && extra != NULL &&
4015 (extra->options & PCRE_STUDY_MAPPED) != 0)
4016 start_bits = extra->start_bits;
4017 }
4018
4019 /* Loop for unanchored matches; for anchored regexps the loop runs just once. */
4020
4021 do
4022 {
4023 int rc;
4024 register int *iptr = match_block.offset_vector;
4025 register int *iend = iptr + resetcount;
4026
4027 /* Reset the maximum number of extractions we might see. */
4028
4029 while (iptr < iend) *iptr++ = -1;
4030
4031 /* Advance to a unique first char if possible */
4032
4033 if (first_char >= 0)
4034 {
4035 if ((ims & PCRE_CASELESS) != 0)
4036 while (start_match < end_subject && pcre_lcc[*start_match] != first_char)
4037 start_match++;
4038 else
4039 while (start_match < end_subject && *start_match != first_char)
4040 start_match++;
4041 }
4042
4043 /* Or to just after \n for a multiline match if possible */
4044
4045 else if (startline)
4046 {
4047 if (start_match > match_block.start_subject)
4048 {
4049 while (start_match < end_subject && start_match[-1] != '\n')
4050 start_match++;
4051 }
4052 }
4053
4054 /* Or to a non-unique first char */
4055
4056 else if (start_bits != NULL)
4057 {
4058 while (start_match < end_subject)
4059 {
4060 register int c = *start_match;
4061 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
4062 }
4063 }
4064
4065 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4066 printf(">>>> Match against: ");
4067 pchars(start_match, end_subject - start_match, TRUE, &match_block);
4068 printf("\n");
4069 #endif
4070
4071 /* When a match occurs, substrings will be set for all internal extractions;
4072 we just need to set up the whole thing as substring 0 before returning. If
4073 there were too many extractions, set the return code to zero. In the case
4074 where we had to get some local store to hold offsets for backreferences, copy
4075 those back references that we can. In this case there need not be overflow
4076 if certain parts of the pattern were not used. */
4077
4078 if (!match(start_match, re->code, 2, &match_block, ims, FALSE, start_match))
4079 continue;
4080
4081 /* Copy the offset information from temporary store if necessary */
4082
4083 if (using_temporary_offsets)
4084 {
4085 if (offsetcount >= 4)
4086 {
4087 memcpy(offsets + 2, match_block.offset_vector + 2,
4088 (offsetcount - 2) * sizeof(int));
4089 DPRINTF(("Copied offsets from temporary memory\n"));
4090 }
4091 if (match_block.end_offset_top > offsetcount)
4092 match_block.offset_overflow = TRUE;
4093
4094 DPRINTF(("Freeing temporary memory\n"));
4095 (pcre_free)(match_block.offset_vector);
4096 }
4097
4098 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
4099
4100 if (match_block.offset_end < 2) rc = 0; else
4101 {
4102 offsets[0] = start_match - match_block.start_subject;
4103 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
4104 }
4105
4106 DPRINTF((">>>> returning %d\n", rc));
4107 return rc;
4108 }
4109 while (!anchored &&
4110 match_block.errorcode == PCRE_ERROR_NOMATCH &&
4111 start_match++ < end_subject);
4112
4113 if (using_temporary_offsets)
4114 {
4115 DPRINTF(("Freeing temporary memory\n"));
4116 (pcre_free)(match_block.offset_vector);
4117 }
4118
4119 DPRINTF((">>>> returning %d\n", match_block.errorcode));
4120
4121 return match_block.errorcode;
4122 }
4123
4124 /* End of pcre.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12