/[pcre]/code/tags/pcre-1.04/pcre.c
ViewVC logotype

Contents of /code/tags/pcre-1.04/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 12 - (show annotations) (download)
Sat Feb 24 21:38:19 2007 UTC (7 years, 8 months ago) by nigel
File MIME type: text/plain
File size: 101617 byte(s)
Tag code/trunk as code/tags/pcre-1.04.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /*
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
9
10 Written by: Philip Hazel <ph10@cam.ac.uk>
11
12 Copyright (c) 1997 University of Cambridge
13
14 -----------------------------------------------------------------------------
15 Permission is granted to anyone to use this software for any purpose on any
16 computer system, and to redistribute it freely, subject to the following
17 restrictions:
18
19 1. This software is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22
23 2. The origin of this software must not be misrepresented, either by
24 explicit claim or by omission.
25
26 3. Altered versions must be plainly marked as such, and must not be
27 misrepresented as being the original software.
28 -----------------------------------------------------------------------------
29 */
30
31
32 /* Define DEBUG to get debugging output on stdout. */
33
34 /* #define DEBUG */
35
36 /* Use a macro for debugging printing, 'cause that eliminates the the use
37 of #ifdef inline, and there are *still* stupid compilers about that don't like
38 indented pre-processor statements. I suppose it's only been 10 years... */
39
40 #ifdef DEBUG
41 #define DPRINTF(p) printf p
42 #else
43 #define DPRINTF(p) /*nothing*/
44 #endif
45
46 /* Include the internals header, which itself includes Standard C headers plus
47 the external pcre header. */
48
49 #include "internal.h"
50
51
52 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
53
54 static char rep_min[] = { 0, 0, 1, 1, 0, 0 };
55 static char rep_max[] = { 0, 0, 0, 0, 1, 1 };
56
57 /* Text forms of OP_ values and things, for debugging */
58
59 #ifdef DEBUG
60 static const char *OP_names[] = {
61 "End", "\\A", "\\B", "\\b", "\\D", "\\d",
62 "\\S", "\\s", "\\W", "\\w", "Cut", "\\Z", "^", "$", "Any", "chars",
63 "not",
64 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
65 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
66 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
67 "*", "*?", "+", "+?", "?", "??", "{", "{",
68 "class", "Ref",
69 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",
70 "Brazero", "Braminzero", "Bra"
71 };
72 #endif
73
74 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
75 are simple data values; negative values are for special things like \d and so
76 on. Zero means further processing is needed (for things like \x), or the escape
77 is invalid. */
78
79 static short int escapes[] = {
80 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
81 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
82 '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
83 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
84 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
85 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
86 '`', 7, -ESC_b, 0, -ESC_d, 27, '\f', 0, /* ` - g */
87 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
88 0, 0, '\r', -ESC_s, '\t', 0, 0, -ESC_w, /* p - w */
89 0, 0, 0 /* x - z */
90 };
91
92 /* Definition to allow mutual recursion */
93
94 static BOOL compile_regex(int, int *, uschar **, const uschar **, const char **);
95
96 /* Structure for passing "static" information around between the functions
97 doing the matching, so that they are thread-safe. */
98
99 typedef struct match_data {
100 int errorcode; /* As it says */
101 int *offset_vector; /* Offset vector */
102 int offset_end; /* One past the end */
103 BOOL offset_overflow; /* Set if too many extractions */
104 BOOL caseless; /* Case-independent flag */
105 BOOL runtime_caseless; /* Caseless forced at run time */
106 BOOL multiline; /* Multiline flag */
107 BOOL notbol; /* NOTBOL flag */
108 BOOL noteol; /* NOTEOL flag */
109 BOOL dotall; /* Dot matches any char */
110 BOOL endonly; /* Dollar not before final \n */
111 const uschar *start_subject; /* Start of the subject string */
112 const uschar *end_subject; /* End of the subject string */
113 jmp_buf fail_env; /* Environment for longjump() break out */
114 const uschar *end_match_ptr; /* Subject position at end match */
115 int end_offset_top; /* Highwater mark at end of match */
116 } match_data;
117
118
119
120 /*************************************************
121 * Global variables *
122 *************************************************/
123
124 /* PCRE is thread-clean and doesn't use any global variables in the normal
125 sense. However, it calls memory allocation and free functions via the two
126 indirections below, which are can be changed by the caller, but are shared
127 between all threads. */
128
129 void *(*pcre_malloc)(size_t) = malloc;
130 void (*pcre_free)(void *) = free;
131
132
133
134
135 /*************************************************
136 * Return version string *
137 *************************************************/
138
139 const char *
140 pcre_version(void)
141 {
142 return PCRE_VERSION;
143 }
144
145
146
147
148 /*************************************************
149 * Return info about a compiled pattern *
150 *************************************************/
151
152 /* This function picks potentially useful data out of the private
153 structure.
154
155 Arguments:
156 external_re points to compiled code
157 optptr where to pass back the options
158 first_char where to pass back the first character,
159 or -1 if multiline and all branches start ^,
160 or -2 otherwise
161
162 Returns: number of identifying extraction brackets
163 or negative values on error
164 */
165
166 int
167 pcre_info(const pcre *external_re, int *optptr, int *first_char)
168 {
169 const real_pcre *re = (const real_pcre *)external_re;
170 if (re == NULL) return PCRE_ERROR_NULL;
171 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
172 if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
173 if (first_char != NULL)
174 *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
175 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
176 return re->top_bracket;
177 }
178
179
180
181
182 #ifdef DEBUG
183 /*************************************************
184 * Debugging function to print chars *
185 *************************************************/
186
187 /* Print a sequence of chars in printable format, stopping at the end of the
188 subject if the requested.
189
190 Arguments:
191 p points to characters
192 length number to print
193 is_subject TRUE if printing from within md->start_subject
194 md pointer to matching data block, if is_subject is TRUE
195
196 Returns: nothing
197 */
198
199 static void
200 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
201 {
202 int c;
203 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
204 while (length-- > 0)
205 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
206 }
207 #endif
208
209
210
211
212 /*************************************************
213 * Check subpattern for empty operand *
214 *************************************************/
215
216 /* This function checks a bracketed subpattern to see if any of the paths
217 through it could match an empty string. This is used to diagnose an error if
218 such a subpattern is followed by a quantifier with an unlimited upper bound.
219
220 Argument:
221 code points to the opening bracket
222
223 Returns: TRUE or FALSE
224 */
225
226 static BOOL
227 could_be_empty(uschar *code)
228 {
229 do {
230 uschar *cc = code + 3;
231
232 /* Scan along the opcodes for this branch; as soon as we find something
233 that matches a non-empty string, break out and advance to test the next
234 branch. If we get to the end of the branch, return TRUE for the whole
235 sub-expression. */
236
237 for (;;)
238 {
239 /* Test an embedded subpattern; if it could not be empty, break the
240 loop. Otherwise carry on in the branch. */
241
242 if ((int)(*cc) >= OP_BRA || (int)(*cc) == OP_ONCE)
243 {
244 if (!could_be_empty(cc)) break;
245 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
246 cc += 3;
247 }
248
249 else switch (*cc)
250 {
251 /* Reached end of a branch: the subpattern may match the empty string */
252
253 case OP_ALT:
254 case OP_KET:
255 case OP_KETRMAX:
256 case OP_KETRMIN:
257 return TRUE;
258
259 /* Skip over assertive subpatterns */
260
261 case OP_ASSERT:
262 case OP_ASSERT_NOT:
263 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
264 cc += 3;
265 break;
266
267 /* Skip over things that don't match chars */
268
269 case OP_SOD:
270 case OP_EOD:
271 case OP_CIRC:
272 case OP_DOLL:
273 case OP_BRAZERO:
274 case OP_BRAMINZERO:
275 case OP_NOT_WORD_BOUNDARY:
276 case OP_WORD_BOUNDARY:
277 cc++;
278 break;
279
280 /* Skip over simple repeats with zero lower bound */
281
282 case OP_STAR:
283 case OP_MINSTAR:
284 case OP_QUERY:
285 case OP_MINQUERY:
286 case OP_NOTSTAR:
287 case OP_NOTMINSTAR:
288 case OP_NOTQUERY:
289 case OP_NOTMINQUERY:
290 case OP_TYPESTAR:
291 case OP_TYPEMINSTAR:
292 case OP_TYPEQUERY:
293 case OP_TYPEMINQUERY:
294 cc += 2;
295 break;
296
297 /* Skip over UPTOs (lower bound is zero) */
298
299 case OP_UPTO:
300 case OP_MINUPTO:
301 case OP_TYPEUPTO:
302 case OP_TYPEMINUPTO:
303 cc += 4;
304 break;
305
306 /* Check a class or a back reference for a zero minimum */
307
308 case OP_CLASS:
309 case OP_REF:
310 cc += (*cc == OP_REF)? 2 : 33;
311
312 switch (*cc)
313 {
314 case OP_CRSTAR:
315 case OP_CRMINSTAR:
316 case OP_CRQUERY:
317 case OP_CRMINQUERY:
318 cc++;
319 break;
320
321 case OP_CRRANGE:
322 case OP_CRMINRANGE:
323 if ((cc[1] << 8) + cc[2] != 0) goto NEXT_BRANCH;
324 cc += 3;
325 break;
326
327 default:
328 goto NEXT_BRANCH;
329 }
330 break;
331
332 /* Anything else matches at least one character */
333
334 default:
335 goto NEXT_BRANCH;
336 }
337 }
338
339 NEXT_BRANCH:
340 code += (code[1] << 8) + code[2];
341 }
342 while (*code == OP_ALT);
343
344 /* No branches match the empty string */
345
346 return FALSE;
347 }
348
349
350
351 /*************************************************
352 * Handle escapes *
353 *************************************************/
354
355 /* This function is called when a \ has been encountered. It either returns a
356 positive value for a simple escape such as \n, or a negative value which
357 encodes one of the more complicated things such as \d. On entry, ptr is
358 pointing at the \. On exit, it is on the final character of the escape
359 sequence.
360
361 Arguments:
362 ptrptr points to the pattern position pointer
363 errorptr points to the pointer to the error message
364 bracount number of previous extracting brackets
365 options the options bits
366 isclass TRUE if inside a character class
367
368 Returns: zero or positive => a data character
369 negative => a special escape sequence
370 on error, errorptr is set
371 */
372
373 static int
374 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
375 int options, BOOL isclass)
376 {
377 const uschar *ptr = *ptrptr;
378 int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
379 int i;
380
381 if (c == 0) *errorptr = ERR1;
382
383 /* Digits or letters may have special meaning; all others are literals. */
384
385 else if (c < '0' || c > 'z') {}
386
387 /* Do an initial lookup in a table. A non-zero result is something that can be
388 returned immediately. Otherwise further processing may be required. */
389
390 else if ((i = escapes[c - '0']) != 0) c = i;
391
392 /* Escapes that need further processing, or are illegal. */
393
394 else
395 {
396 const uschar *oldptr;
397 switch (c)
398 {
399 /* The handling of escape sequences consisting of a string of digits
400 starting with one that is not zero is not straightforward. By experiment,
401 the way Perl works seems to be as follows:
402
403 Outside a character class, the digits are read as a decimal number. If the
404 number is less than 10, or if there are that many previous extracting
405 left brackets, then it is a back reference. Otherwise, up to three octal
406 digits are read to form an escaped byte. Thus \123 is likely to be octal
407 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
408 value is greater than 377, the least significant 8 bits are taken. Inside a
409 character class, \ followed by a digit is always an octal number. */
410
411 case '1': case '2': case '3': case '4': case '5':
412 case '6': case '7': case '8': case '9':
413
414 if (!isclass)
415 {
416 oldptr = ptr;
417 c -= '0';
418 while ((pcre_ctypes[ptr[1]] & ctype_digit) != 0)
419 c = c * 10 + *(++ptr) - '0';
420 if (c < 10 || c <= bracount)
421 {
422 c = -(ESC_REF + c);
423 break;
424 }
425 ptr = oldptr; /* Put the pointer back and fall through */
426 }
427
428 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
429 generates a binary zero byte and treats the digit as a following literal.
430 Thus we have to pull back the pointer by one. */
431
432 if ((c = *ptr) >= '8')
433 {
434 ptr--;
435 c = 0;
436 break;
437 }
438
439 /* \0 always starts an octal number, but we may drop through to here with a
440 larger first octal digit */
441
442 case '0':
443 c -= '0';
444 while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 &&
445 ptr[1] != '8' && ptr[1] != '9')
446 c = c * 8 + *(++ptr) - '0';
447 break;
448
449 /* Special escapes not starting with a digit are straightforward */
450
451 case 'x':
452 c = 0;
453 while (i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0)
454 {
455 ptr++;
456 c = c * 16 + pcre_lcc[*ptr] -
457 (((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
458 }
459 break;
460
461 case 'c':
462 c = *(++ptr);
463 if (c == 0)
464 {
465 *errorptr = ERR2;
466 return 0;
467 }
468
469 /* A letter is upper-cased; then the 0x40 bit is flipped */
470
471 if (c >= 'a' && c <= 'z') c = pcre_fcc[c];
472 c ^= 0x40;
473 break;
474
475 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
476 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
477 for Perl compatibility, it is a literal. */
478
479 default:
480 if ((options & PCRE_EXTRA) != 0) switch(c)
481 {
482 case 'X':
483 c = -ESC_X; /* This could be a lookup if it ever got into Perl */
484 break;
485
486 default:
487 *errorptr = ERR3;
488 break;
489 }
490 break;
491 }
492 }
493
494 *ptrptr = ptr;
495 return c;
496 }
497
498
499
500 /*************************************************
501 * Check for counted repeat *
502 *************************************************/
503
504 /* This function is called when a '{' is encountered in a place where it might
505 start a quantifier. It looks ahead to see if it really is a quantifier or not.
506 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
507 where the ddds are digits.
508
509 Arguments:
510 p pointer to the first char after '{'
511
512 Returns: TRUE or FALSE
513 */
514
515 static BOOL
516 is_counted_repeat(const uschar *p)
517 {
518 if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
519 while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
520 if (*p == '}') return TRUE;
521
522 if (*p++ != ',') return FALSE;
523 if (*p == '}') return TRUE;
524
525 if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
526 while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
527 return (*p == '}');
528 }
529
530
531
532 /*************************************************
533 * Read repeat counts *
534 *************************************************/
535
536 /* Read an item of the form {n,m} and return the values. This is called only
537 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
538 so the syntax is guaranteed to be correct, but we need to check the values.
539
540 Arguments:
541 p pointer to first char after '{'
542 minp pointer to int for min
543 maxp pointer to int for max
544 returned as -1 if no max
545 errorptr points to pointer to error message
546
547 Returns: pointer to '}' on success;
548 current ptr on error, with errorptr set
549 */
550
551 static const uschar *
552 read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
553 {
554 int min = 0;
555 int max = -1;
556
557 while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
558
559 if (*p == '}') max = min; else
560 {
561 if (*(++p) != '}')
562 {
563 max = 0;
564 while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
565 if (max < min)
566 {
567 *errorptr = ERR4;
568 return p;
569 }
570 }
571 }
572
573 /* Do paranoid checks, then fill in the required variables, and pass back the
574 pointer to the terminating '}'. */
575
576 if (min > 65535 || max > 65535)
577 *errorptr = ERR5;
578 else
579 {
580 *minp = min;
581 *maxp = max;
582 }
583 return p;
584 }
585
586
587
588 /*************************************************
589 * Compile one branch *
590 *************************************************/
591
592 /* Scan the pattern, compiling it into the code vector.
593
594 Arguments:
595 options the option bits
596 bracket points to number of brackets used
597 code points to the pointer to the current code point
598 ptrptr points to the current pattern pointer
599 errorptr points to pointer to error message
600
601 Returns: TRUE on success
602 FALSE, with *errorptr set on error
603 */
604
605 static BOOL
606 compile_branch(int options, int *brackets, uschar **codeptr,
607 const uschar **ptrptr, const char **errorptr)
608 {
609 int repeat_type, op_type;
610 int repeat_min, repeat_max;
611 int bravalue, length;
612 register int c;
613 register uschar *code = *codeptr;
614 const uschar *ptr = *ptrptr;
615 const uschar *oldptr;
616 uschar *previous = NULL;
617 uschar class[32];
618
619 /* Switch on next character until the end of the branch */
620
621 for (;; ptr++)
622 {
623 BOOL negate_class;
624 int class_charcount;
625 int class_lastchar;
626
627 c = *ptr;
628 if ((options & PCRE_EXTENDED) != 0)
629 {
630 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
631 if (c == '#')
632 {
633 while ((c = *(++ptr)) != 0 && c != '\n');
634 continue;
635 }
636 }
637
638 switch(c)
639 {
640 /* The branch terminates at end of string, |, or ). */
641
642 case 0:
643 case '|':
644 case ')':
645 *codeptr = code;
646 *ptrptr = ptr;
647 return TRUE;
648
649 /* Handle single-character metacharacters */
650
651 case '^':
652 previous = NULL;
653 *code++ = OP_CIRC;
654 break;
655
656 case '$':
657 previous = NULL;
658 *code++ = OP_DOLL;
659 break;
660
661 case '.':
662 previous = code;
663 *code++ = OP_ANY;
664 break;
665
666 /* Character classes. These always build a 32-byte bitmap of the permitted
667 characters, except in the special case where there is only one character.
668 For negated classes, we build the map as usual, then invert it at the end.
669 */
670
671 case '[':
672 previous = code;
673 *code++ = OP_CLASS;
674
675 /* If the first character is '^', set the negation flag */
676
677 if ((c = *(++ptr)) == '^')
678 {
679 negate_class = TRUE;
680 c = *(++ptr);
681 }
682 else negate_class = FALSE;
683
684 /* Keep a count of chars so that we can optimize the case of just a single
685 character. */
686
687 class_charcount = 0;
688 class_lastchar = -1;
689
690 /* Initialize the 32-char bit map to all zeros. We have to build the
691 map in a temporary bit of store, in case the class contains only 1
692 character, because in that case the compiled code doesn't use the
693 bit map. */
694
695 memset(class, 0, 32 * sizeof(uschar));
696
697 /* Process characters until ] is reached. By writing this as a "do" it
698 means that an initial ] is taken as a data character. */
699
700 do
701 {
702 if (c == 0)
703 {
704 *errorptr = ERR6;
705 goto FAILED;
706 }
707
708 /* Backslash may introduce a single character, or it may introduce one
709 of the specials, which just set a flag. Escaped items are checked for
710 validity in the pre-compiling pass. The sequence \b is a special case.
711 Inside a class (and only there) it is treated as backspace. Elsewhere
712 it marks a word boundary. Other escapes have preset maps ready to
713 or into the one we are building. We assume they have more than one
714 character in them, so set class_count bigger than one. */
715
716 if (c == '\\')
717 {
718 c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
719 if (-c == ESC_b) c = '\b';
720 else if (c < 0)
721 {
722 class_charcount = 10;
723 switch (-c)
724 {
725 case ESC_d:
726 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit];
727 continue;
728
729 case ESC_D:
730 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit];
731 continue;
732
733 case ESC_w:
734 for (c = 0; c < 32; c++)
735 class[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
736 continue;
737
738 case ESC_W:
739 for (c = 0; c < 32; c++)
740 class[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
741 continue;
742
743 case ESC_s:
744 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space];
745 continue;
746
747 case ESC_S:
748 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space];
749 continue;
750
751 default:
752 *errorptr = ERR7;
753 goto FAILED;
754 }
755 }
756 /* Fall through if single character */
757 }
758
759 /* A single character may be followed by '-' to form a range. However,
760 Perl does not permit ']' to be the end of the range. A '-' character
761 here is treated as a literal. */
762
763 if (ptr[1] == '-' && ptr[2] != ']')
764 {
765 int d;
766 ptr += 2;
767 d = *ptr;
768
769 if (d == 0)
770 {
771 *errorptr = ERR6;
772 goto FAILED;
773 }
774
775 /* The second part of a range can be a single-character escape, but
776 not any of the other escapes. */
777
778 if (d == '\\')
779 {
780 d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
781 if (d < 0)
782 {
783 if (d == -ESC_b) d = '\b'; else
784 {
785 *errorptr = ERR7;
786 goto FAILED;
787 }
788 }
789 }
790
791 if (d < c)
792 {
793 *errorptr = ERR8;
794 goto FAILED;
795 }
796
797 for (; c <= d; c++)
798 {
799 class[c/8] |= (1 << (c&7));
800 if ((options & PCRE_CASELESS) != 0)
801 {
802 int uc = pcre_fcc[c]; /* flip case */
803 class[uc/8] |= (1 << (uc&7));
804 }
805 class_charcount++; /* in case a one-char range */
806 class_lastchar = c;
807 }
808 continue; /* Go get the next char in the class */
809 }
810
811 /* Handle a lone single character - we can get here for a normal
812 non-escape char, or after \ that introduces a single character. */
813
814 class [c/8] |= (1 << (c&7));
815 if ((options & PCRE_CASELESS) != 0)
816 {
817 c = pcre_fcc[c]; /* flip case */
818 class[c/8] |= (1 << (c&7));
819 }
820 class_charcount++;
821 class_lastchar = c;
822 }
823
824 /* Loop until ']' reached; the check for end of string happens inside the
825 loop. This "while" is the end of the "do" above. */
826
827 while ((c = *(++ptr)) != ']');
828
829 /* If class_charcount is 1 and class_lastchar is not negative, we saw
830 precisely one character. This doesn't need the whole 32-byte bit map.
831 We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
832 it's negative. */
833
834 if (class_charcount == 1 && class_lastchar >= 0)
835 {
836 if (negate_class)
837 {
838 code[-1] = OP_NOT;
839 }
840 else
841 {
842 code[-1] = OP_CHARS;
843 *code++ = 1;
844 }
845 *code++ = class_lastchar;
846 }
847
848 /* Otherwise, negate the 32-byte map if necessary, and copy it into
849 the code vector. */
850
851 else
852 {
853 if (negate_class)
854 for (c = 0; c < 32; c++) code[c] = ~class[c];
855 else
856 memcpy(code, class, 32);
857 code += 32;
858 }
859 break;
860
861 /* Various kinds of repeat */
862
863 case '{':
864 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
865 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
866 if (*errorptr != NULL) goto FAILED;
867 goto REPEAT;
868
869 case '*':
870 repeat_min = 0;
871 repeat_max = -1;
872 goto REPEAT;
873
874 case '+':
875 repeat_min = 1;
876 repeat_max = -1;
877 goto REPEAT;
878
879 case '?':
880 repeat_min = 0;
881 repeat_max = 1;
882
883 REPEAT:
884 if (previous == NULL)
885 {
886 *errorptr = ERR9;
887 goto FAILED;
888 }
889
890 /* If the next character is '?' this is a minimizing repeat. Advance to the
891 next character. */
892
893 if (ptr[1] == '?') { repeat_type = 1; ptr++; } else repeat_type = 0;
894
895 /* If the maximum is zero then the minimum must also be zero; Perl allows
896 this case, so we do too - by simply omitting the item altogether. */
897
898 if (repeat_max == 0) code = previous;
899
900 /* If previous was a string of characters, chop off the last one and use it
901 as the subject of the repeat. If there was only one character, we can
902 abolish the previous item altogether. */
903
904 else if (*previous == OP_CHARS)
905 {
906 int len = previous[1];
907 if (len == 1)
908 {
909 c = previous[2];
910 code = previous;
911 }
912 else
913 {
914 c = previous[len+1];
915 previous[1]--;
916 code--;
917 }
918 op_type = 0; /* Use single-char op codes */
919 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
920 }
921
922 /* If previous was a single negated character ([^a] or similar), we use
923 one of the special opcodes, replacing it. The code is shared with single-
924 character repeats by adding a suitable offset into repeat_type. */
925
926 else if ((int)*previous == OP_NOT)
927 {
928 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
929 c = previous[1];
930 code = previous;
931 goto OUTPUT_SINGLE_REPEAT;
932 }
933
934 /* If previous was a character type match (\d or similar), abolish it and
935 create a suitable repeat item. The code is shared with single-character
936 repeats by adding a suitable offset into repeat_type. */
937
938 else if ((int)*previous < OP_EOD || *previous == OP_ANY)
939 {
940 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
941 c = *previous;
942 code = previous;
943
944 OUTPUT_SINGLE_REPEAT:
945 repeat_type += op_type; /* Combine both values for many cases */
946
947 /* A minimum of zero is handled either as the special case * or ?, or as
948 an UPTO, with the maximum given. */
949
950 if (repeat_min == 0)
951 {
952 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
953 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
954 else
955 {
956 *code++ = OP_UPTO + repeat_type;
957 *code++ = repeat_max >> 8;
958 *code++ = (repeat_max & 255);
959 }
960 }
961
962 /* The case {1,} is handled as the special case + */
963
964 else if (repeat_min == 1 && repeat_max == -1)
965 *code++ = OP_PLUS + repeat_type;
966
967 /* The case {n,n} is just an EXACT, while the general case {n,m} is
968 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
969
970 else
971 {
972 if (repeat_min != 1)
973 {
974 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
975 *code++ = repeat_min >> 8;
976 *code++ = (repeat_min & 255);
977 }
978
979 /* If the mininum is 1 and the previous item was a character string,
980 we either have to put back the item that got cancelled if the string
981 length was 1, or add the character back onto the end of a longer
982 string. For a character type nothing need be done; it will just get put
983 back naturally. */
984
985 else if (*previous == OP_CHARS)
986 {
987 if (code == previous) code += 2; else previous[1]++;
988 }
989
990 /* If the maximum is unlimited, insert an OP_STAR. */
991
992 if (repeat_max < 0)
993 {
994 *code++ = c;
995 *code++ = OP_STAR + repeat_type;
996 }
997
998 /* Else insert an UPTO if the max is greater than the min. */
999
1000 else if (repeat_max != repeat_min)
1001 {
1002 *code++ = c;
1003 repeat_max -= repeat_min;
1004 *code++ = OP_UPTO + repeat_type;
1005 *code++ = repeat_max >> 8;
1006 *code++ = (repeat_max & 255);
1007 }
1008 }
1009
1010 /* The character or character type itself comes last in all cases. */
1011
1012 *code++ = c;
1013 }
1014
1015 /* If previous was a character class or a back reference, we put the repeat
1016 stuff after it. */
1017
1018 else if (*previous == OP_CLASS || *previous == OP_REF)
1019 {
1020 if (repeat_min == 0 && repeat_max == -1)
1021 *code++ = OP_CRSTAR + repeat_type;
1022 else if (repeat_min == 1 && repeat_max == -1)
1023 *code++ = OP_CRPLUS + repeat_type;
1024 else if (repeat_min == 0 && repeat_max == 1)
1025 *code++ = OP_CRQUERY + repeat_type;
1026 else
1027 {
1028 *code++ = OP_CRRANGE + repeat_type;
1029 *code++ = repeat_min >> 8;
1030 *code++ = repeat_min & 255;
1031 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1032 *code++ = repeat_max >> 8;
1033 *code++ = repeat_max & 255;
1034 }
1035 }
1036
1037 /* If previous was a bracket group, we may have to replicate it in certain
1038 cases. If the maximum repeat count is unlimited, check that the bracket
1039 group cannot match the empty string, and diagnose an error if it can. */
1040
1041 else if ((int)*previous >= OP_BRA)
1042 {
1043 int i;
1044 int len = code - previous;
1045
1046 if (repeat_max == -1 && could_be_empty(previous))
1047 {
1048 *errorptr = ERR10;
1049 goto FAILED;
1050 }
1051
1052 /* If the minimum is greater than zero, and the maximum is unlimited or
1053 equal to the minimum, the first copy remains where it is, and is
1054 replicated up to the minimum number of times. This case includes the +
1055 repeat, but of course no replication is needed in that case. */
1056
1057 if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1058 {
1059 for (i = 1; i < repeat_min; i++)
1060 {
1061 memcpy(code, previous, len);
1062 code += len;
1063 }
1064 }
1065
1066 /* If the minimum is zero, stick BRAZERO in front of the first copy.
1067 Then, if there is a fixed upper limit, replicated up to that many times,
1068 sticking BRAZERO in front of all the optional ones. */
1069
1070 else
1071 {
1072 if (repeat_min == 0)
1073 {
1074 memmove(previous+1, previous, len);
1075 code++;
1076 *previous++ = OP_BRAZERO + repeat_type;
1077 }
1078
1079 for (i = 1; i < repeat_min; i++)
1080 {
1081 memcpy(code, previous, len);
1082 code += len;
1083 }
1084
1085 for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1086 {
1087 *code++ = OP_BRAZERO + repeat_type;
1088 memcpy(code, previous, len);
1089 code += len;
1090 }
1091 }
1092
1093 /* If the maximum is unlimited, set a repeater in the final copy. */
1094
1095 if (repeat_max == -1) code[-3] = OP_KETRMAX + repeat_type;
1096 }
1097
1098 /* Else there's some kind of shambles */
1099
1100 else
1101 {
1102 *errorptr = ERR11;
1103 goto FAILED;
1104 }
1105
1106 /* In all case we no longer have a previous item. */
1107
1108 previous = NULL;
1109 break;
1110
1111
1112 /* Start of nested bracket sub-expression, or comment or lookahead.
1113 First deal with special things that can come after a bracket; all are
1114 introduced by ?, and the appearance of any of them means that this is not a
1115 referencing group. They were checked for validity in the first pass over
1116 the string, so we don't have to check for syntax errors here. */
1117
1118 case '(':
1119 previous = code; /* Only real brackets can be repeated */
1120 if (*(++ptr) == '?')
1121 {
1122 bravalue = OP_BRA;
1123
1124 switch (*(++ptr))
1125 {
1126 case '#':
1127 case 'i':
1128 case 'm':
1129 case 's':
1130 case 'x':
1131 ptr++;
1132 while (*ptr != ')') ptr++;
1133 previous = NULL;
1134 continue;
1135
1136 case ':': /* Non-extracting bracket */
1137 ptr++;
1138 break;
1139
1140 case '=': /* Assertions can't be repeated */
1141 bravalue = OP_ASSERT;
1142 ptr++;
1143 previous = NULL;
1144 break;
1145
1146 case '!':
1147 bravalue = OP_ASSERT_NOT;
1148 ptr++;
1149 previous = NULL;
1150 break;
1151
1152 case '>': /* "Match once" brackets */
1153 if ((options & PCRE_EXTRA) != 0) /* Not yet standard */
1154 {
1155 bravalue = OP_ONCE;
1156 ptr++;
1157 previous = NULL;
1158 break;
1159 }
1160 /* Else fall through */
1161
1162 default:
1163 *errorptr = ERR12;
1164 goto FAILED;
1165 }
1166 }
1167
1168 /* Else we have a referencing group */
1169
1170 else
1171 {
1172 if (++(*brackets) > EXTRACT_MAX)
1173 {
1174 *errorptr = ERR13;
1175 goto FAILED;
1176 }
1177 bravalue = OP_BRA + *brackets;
1178 }
1179
1180 /* Process nested bracketed re; at end pointer is on the bracket. We copy
1181 code into a non-register variable in order to be able to pass its address
1182 because some compilers complain otherwise. */
1183
1184 *code = bravalue;
1185 {
1186 uschar *mcode = code;
1187 if (!compile_regex(options, brackets, &mcode, &ptr, errorptr))
1188 goto FAILED;
1189 code = mcode;
1190 }
1191
1192 if (*ptr != ')')
1193 {
1194 *errorptr = ERR14;
1195 goto FAILED;
1196 }
1197 break;
1198
1199 /* Check \ for being a real metacharacter; if not, fall through and handle
1200 it as a data character at the start of a string. Escape items are checked
1201 for validity in the pre-compiling pass. */
1202
1203 case '\\':
1204 oldptr = ptr;
1205 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
1206
1207 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1208 are arranged to be the negation of the corresponding OP_values. For the
1209 back references, the values are ESC_REF plus the reference number. Only
1210 back references and those types that consume a character may be repeated.
1211 We can test for values between ESC_b and ESC_Z for the latter; this may
1212 have to change if any new ones are ever created. */
1213
1214 if (c < 0)
1215 {
1216 if (-c >= ESC_REF)
1217 {
1218 int refnum = -c - ESC_REF;
1219 if (*brackets < refnum)
1220 {
1221 *errorptr = ERR15;
1222 goto FAILED;
1223 }
1224 previous = code;
1225 *code++ = OP_REF;
1226 *code++ = refnum;
1227 }
1228 else
1229 {
1230 previous = (-c > ESC_b && -c < ESC_X)? code : NULL;
1231 *code++ = -c;
1232 }
1233 continue;
1234 }
1235
1236 /* Data character: reset and fall through */
1237
1238 ptr = oldptr;
1239 c = '\\';
1240
1241 /* Handle a run of data characters until a metacharacter is encountered.
1242 The first character is guaranteed not to be whitespace or # when the
1243 extended flag is set. */
1244
1245 NORMAL_CHAR:
1246 default:
1247 previous = code;
1248 *code = OP_CHARS;
1249 code += 2;
1250 length = 0;
1251
1252 do
1253 {
1254 if ((options & PCRE_EXTENDED) != 0)
1255 {
1256 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1257 if (c == '#')
1258 {
1259 while ((c = *(++ptr)) != 0 && c != '\n');
1260 if (c == 0) break;
1261 continue;
1262 }
1263 }
1264
1265 /* Backslash may introduce a data char or a metacharacter. Escaped items
1266 are checked for validity in the pre-compiling pass. Stop the string
1267 before a metaitem. */
1268
1269 if (c == '\\')
1270 {
1271 oldptr = ptr;
1272 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
1273 if (c < 0) { ptr = oldptr; break; }
1274 }
1275
1276 /* Ordinary character or single-char escape */
1277
1278 *code++ = c;
1279 length++;
1280 }
1281
1282 /* This "while" is the end of the "do" above. */
1283
1284 while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
1285
1286 /* Compute the length and set it in the data vector, and advance to
1287 the next state. */
1288
1289 previous[1] = length;
1290 ptr--;
1291 break;
1292 }
1293 } /* end of big loop */
1294
1295 /* Control never reaches here by falling through, only by a goto for all the
1296 error states. Pass back the position in the pattern so that it can be displayed
1297 to the user for diagnosing the error. */
1298
1299 FAILED:
1300 *ptrptr = ptr;
1301 return FALSE;
1302 }
1303
1304
1305
1306
1307 /*************************************************
1308 * Compile sequence of alternatives *
1309 *************************************************/
1310
1311 /* On entry, ptr is pointing past the bracket character, but on return
1312 it points to the closing bracket, or vertical bar, or end of string.
1313 The code variable is pointing at the byte into which the BRA operator has been
1314 stored.
1315
1316 Argument:
1317 options the option bits
1318 brackets -> int containing the number of extracting brackets used
1319 codeptr -> the address of the current code pointer
1320 ptrptr -> the address of the current pattern pointer
1321 errorptr -> pointer to error message
1322
1323 Returns: TRUE on success
1324 */
1325
1326 static BOOL
1327 compile_regex(int options, int *brackets, uschar **codeptr,
1328 const uschar **ptrptr, const char **errorptr)
1329 {
1330 const uschar *ptr = *ptrptr;
1331 uschar *code = *codeptr;
1332 uschar *start_bracket = code;
1333
1334 for (;;)
1335 {
1336 int length;
1337 uschar *last_branch = code;
1338
1339 code += 3;
1340 if (!compile_branch(options, brackets, &code, &ptr, errorptr))
1341 {
1342 *ptrptr = ptr;
1343 return FALSE;
1344 }
1345
1346 /* Fill in the length of the last branch */
1347
1348 length = code - last_branch;
1349 last_branch[1] = length >> 8;
1350 last_branch[2] = length & 255;
1351
1352 /* Reached end of expression, either ')' or end of pattern. Insert a
1353 terminating ket and the length of the whole bracketed item, and return,
1354 leaving the pointer at the terminating char. */
1355
1356 if (*ptr != '|')
1357 {
1358 length = code - start_bracket;
1359 *code++ = OP_KET;
1360 *code++ = length >> 8;
1361 *code++ = length & 255;
1362 *codeptr = code;
1363 *ptrptr = ptr;
1364 return TRUE;
1365 }
1366
1367 /* Another branch follows; insert an "or" node and advance the pointer. */
1368
1369 *code = OP_ALT;
1370 ptr++;
1371 }
1372 /* Control never reaches here */
1373 }
1374
1375
1376
1377 /*************************************************
1378 * Check for anchored expression *
1379 *************************************************/
1380
1381 /* Try to find out if this is an anchored regular expression. Consider each
1382 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
1383 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
1384 it's anchored. However, if this is a multiline pattern, then only OP_SOD
1385 counts, since OP_CIRC can match in the middle.
1386
1387 A branch is also implicitly anchored if it starts with .* because that will try
1388 the rest of the pattern at all possible matching points, so there is no point
1389 trying them again.
1390
1391 Argument: points to start of expression (the bracket)
1392 Returns: TRUE or FALSE
1393 */
1394
1395 static BOOL
1396 is_anchored(register const uschar *code, BOOL multiline)
1397 {
1398 do {
1399 int op = (int)code[3];
1400 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE)
1401 { if (!is_anchored(code+3, multiline)) return FALSE; }
1402 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
1403 { if (code[4] != OP_ANY) return FALSE; }
1404 else if (op != OP_SOD && (multiline || op != OP_CIRC)) return FALSE;
1405 code += (code[1] << 8) + code[2];
1406 }
1407 while (*code == OP_ALT);
1408 return TRUE;
1409 }
1410
1411
1412
1413 /*************************************************
1414 * Check for start with \n line expression *
1415 *************************************************/
1416
1417 /* This is called for multiline expressions to try to find out if every branch
1418 starts with ^ so that "first char" processing can be done to speed things up.
1419
1420 Argument: points to start of expression (the bracket)
1421 Returns: TRUE or FALSE
1422 */
1423
1424 static BOOL
1425 is_startline(const uschar *code)
1426 {
1427 do {
1428 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
1429 { if (!is_startline(code+3)) return FALSE; }
1430 else if (code[3] != OP_CIRC) return FALSE;
1431 code += (code[1] << 8) + code[2];
1432 }
1433 while (*code == OP_ALT);
1434 return TRUE;
1435 }
1436
1437
1438
1439 /*************************************************
1440 * Check for fixed first char *
1441 *************************************************/
1442
1443 /* Try to find out if there is a fixed first character. This is called for
1444 unanchored expressions, as it speeds up their processing quite considerably.
1445 Consider each alternative branch. If they all start with the same char, or with
1446 a bracket all of whose alternatives start with the same char (recurse ad lib),
1447 then we return that char, otherwise -1.
1448
1449 Argument: points to start of expression (the bracket)
1450 Returns: -1 or the fixed first char
1451 */
1452
1453 static int
1454 find_firstchar(uschar *code)
1455 {
1456 register int c = -1;
1457 do
1458 {
1459 register int charoffset = 4;
1460
1461 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
1462 {
1463 register int d;
1464 if ((d = find_firstchar(code+3)) < 0) return -1;
1465 if (c < 0) c = d; else if (c != d) return -1;
1466 }
1467
1468 else switch(code[3])
1469 {
1470 default:
1471 return -1;
1472
1473 case OP_EXACT: /* Fall through */
1474 charoffset++;
1475
1476 case OP_CHARS: /* Fall through */
1477 charoffset++;
1478
1479 case OP_PLUS:
1480 case OP_MINPLUS:
1481 if (c < 0) c = code[charoffset]; else if (c != code[charoffset]) return -1;
1482 break;
1483 }
1484 code += (code[1] << 8) + code[2];
1485 }
1486 while (*code == OP_ALT);
1487 return c;
1488 }
1489
1490
1491
1492 /*************************************************
1493 * Compile a Regular Expression *
1494 *************************************************/
1495
1496 /* This function takes a string and returns a pointer to a block of store
1497 holding a compiled version of the expression.
1498
1499 Arguments:
1500 pattern the regular expression
1501 options various option bits
1502 errorptr pointer to pointer to error text
1503 erroroffset ptr offset in pattern where error was detected
1504
1505 Returns: pointer to compiled data block, or NULL on error,
1506 with errorptr and erroroffset set
1507 */
1508
1509 pcre *
1510 pcre_compile(const char *pattern, int options, const char **errorptr,
1511 int *erroroffset)
1512 {
1513 real_pcre *re;
1514 int spaces = 0;
1515 int length = 3; /* For initial BRA plus length */
1516 int runlength;
1517 int c, size;
1518 int bracount = 0;
1519 int brastack[200];
1520 int top_backref = 0;
1521 unsigned int brastackptr = 0;
1522 uschar *code;
1523 const uschar *ptr;
1524
1525 #ifdef DEBUG
1526 uschar *code_base, *code_end;
1527 #endif
1528
1529 /* We can't pass back an error message if errorptr is NULL; I guess the best we
1530 can do is just return NULL. */
1531
1532 if (errorptr == NULL) return NULL;
1533 *errorptr = NULL;
1534
1535 /* However, we can give a message for this error */
1536
1537 if (erroroffset == NULL)
1538 {
1539 *errorptr = ERR16;
1540 return NULL;
1541 }
1542 *erroroffset = 0;
1543
1544 if ((options & ~PUBLIC_OPTIONS) != 0)
1545 {
1546 *errorptr = ERR17;
1547 return NULL;
1548 }
1549
1550 DPRINTF(("------------------------------------------------------------------\n"));
1551 DPRINTF(("%s\n", pattern));
1552
1553 /* The first thing to do is to make a pass over the pattern to compute the
1554 amount of store required to hold the compiled code. This does not have to be
1555 perfect as long as errors are overestimates. At the same time we can detect any
1556 internal flag settings. Make an attempt to correct for any counted white space
1557 if an "extended" flag setting appears late in the pattern. We can't be so
1558 clever for #-comments. */
1559
1560 ptr = (const uschar *)(pattern - 1);
1561 while ((c = *(++ptr)) != 0)
1562 {
1563 int min, max;
1564 int class_charcount;
1565
1566 if ((pcre_ctypes[c] & ctype_space) != 0)
1567 {
1568 if ((options & PCRE_EXTENDED) != 0) continue;
1569 spaces++;
1570 }
1571
1572 if (c == '#' && (options & PCRE_EXTENDED) != 0)
1573 {
1574 while ((c = *(++ptr)) != 0 && c != '\n');
1575 continue;
1576 }
1577
1578 switch(c)
1579 {
1580 /* A backslashed item may be an escaped "normal" character or a
1581 character type. For a "normal" character, put the pointers and
1582 character back so that tests for whitespace etc. in the input
1583 are done correctly. */
1584
1585 case '\\':
1586 {
1587 const uschar *save_ptr = ptr;
1588 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
1589 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1590 if (c >= 0)
1591 {
1592 ptr = save_ptr;
1593 c = '\\';
1594 goto NORMAL_CHAR;
1595 }
1596 }
1597 length++;
1598
1599 /* A back reference needs an additional char, plus either one or 5
1600 bytes for a repeat. We also need to keep the value of the highest
1601 back reference. */
1602
1603 if (c <= -ESC_REF)
1604 {
1605 int refnum = -c - ESC_REF;
1606 if (refnum > top_backref) top_backref = refnum;
1607 length++; /* For single back reference */
1608 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
1609 {
1610 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
1611 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1612 if ((min == 0 && (max == 1 || max == -1)) ||
1613 (min == 1 && max == -1))
1614 length++;
1615 else length += 5;
1616 if (ptr[1] == '?') ptr++;
1617 }
1618 }
1619 continue;
1620
1621 case '^':
1622 case '.':
1623 case '$':
1624 case '*': /* These repeats won't be after brackets; */
1625 case '+': /* those are handled separately */
1626 case '?':
1627 length++;
1628 continue;
1629
1630 /* This covers the cases of repeats after a single char, metachar, class,
1631 or back reference. */
1632
1633 case '{':
1634 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
1635 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
1636 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1637 if ((min == 0 && (max == 1 || max == -1)) ||
1638 (min == 1 && max == -1))
1639 length++;
1640 else
1641 {
1642 length--; /* Uncount the original char or metachar */
1643 if (min == 1) length++; else if (min > 0) length += 4;
1644 if (max > 0) length += 4; else length += 2;
1645 }
1646 if (ptr[1] == '?') ptr++;
1647 continue;
1648
1649 /* An alternation contains an offset to the next branch or ket. */
1650 case '|':
1651 length += 3;
1652 continue;
1653
1654 /* A character class uses 33 characters. Don't worry about character types
1655 that aren't allowed in classes - they'll get picked up during the compile.
1656 A character class that contains only one character uses 2 or 3 bytes,
1657 depending on whether it is negated or not. Notice this where we can. */
1658
1659 case '[':
1660 class_charcount = 0;
1661 if (*(++ptr) == '^') ptr++;
1662 do
1663 {
1664 if (*ptr == '\\')
1665 {
1666 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE);
1667 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1668 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
1669 }
1670 else class_charcount++;
1671 ptr++;
1672 }
1673 while (*ptr != 0 && *ptr != ']');
1674
1675 /* Repeats for negated single chars are handled by the general code */
1676
1677 if (class_charcount == 1) length += 3; else
1678 {
1679 length += 33;
1680
1681 /* A repeat needs either 1 or 5 bytes. */
1682
1683 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
1684 {
1685 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
1686 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1687 if ((min == 0 && (max == 1 || max == -1)) ||
1688 (min == 1 && max == -1))
1689 length++;
1690 else length += 5;
1691 if (ptr[1] == '?') ptr++;
1692 }
1693 }
1694 continue;
1695
1696 /* Brackets may be genuine groups or special things */
1697
1698 case '(':
1699
1700 /* Handle special forms of bracket, which all start (? */
1701
1702 if (ptr[1] == '?') switch (c = ptr[2])
1703 {
1704 /* Skip over comments entirely */
1705 case '#':
1706 ptr += 3;
1707 while (*ptr != 0 && *ptr != ')') ptr++;
1708 if (*ptr == 0)
1709 {
1710 *errorptr = ERR18;
1711 goto PCRE_ERROR_RETURN;
1712 }
1713 continue;
1714
1715 /* Non-referencing groups and lookaheads just move the pointer on, and
1716 then behave like a non-special bracket, except that they don't increment
1717 the count of extracting brackets. */
1718
1719 case ':':
1720 case '=':
1721 case '!':
1722 ptr += 2;
1723 break;
1724
1725 /* Ditto for the "once only" bracket, allowed only if the extra bit
1726 is set. */
1727
1728 case '>':
1729 if ((options & PCRE_EXTRA) != 0)
1730 {
1731 ptr += 2;
1732 break;
1733 }
1734 /* Else fall thourh */
1735
1736 /* Else loop setting valid options until ) is met. Anything else is an
1737 error. */
1738
1739 default:
1740 ptr += 2;
1741 for (;; ptr++)
1742 {
1743 if ((c = *ptr) == 'i')
1744 {
1745 options |= PCRE_CASELESS;
1746 continue;
1747 }
1748 else if ((c = *ptr) == 'm')
1749 {
1750 options |= PCRE_MULTILINE;
1751 continue;
1752 }
1753 else if (c == 's')
1754 {
1755 options |= PCRE_DOTALL;
1756 continue;
1757 }
1758 else if (c == 'x')
1759 {
1760 options |= PCRE_EXTENDED;
1761 length -= spaces; /* Already counted spaces */
1762 continue;
1763 }
1764 else if (c == ')') break;
1765
1766 *errorptr = ERR12;
1767 goto PCRE_ERROR_RETURN;
1768 }
1769 continue; /* End of this bracket handling */
1770 }
1771
1772 /* Extracting brackets must be counted so we can process escapes in a
1773 Perlish way. */
1774
1775 else bracount++;
1776
1777 /* Non-special forms of bracket. Save length for computing whole length
1778 at end if there's a repeat that requires duplication of the group. */
1779
1780 if (brastackptr >= sizeof(brastack)/sizeof(int))
1781 {
1782 *errorptr = ERR19;
1783 goto PCRE_ERROR_RETURN;
1784 }
1785
1786 brastack[brastackptr++] = length;
1787 length += 3;
1788 continue;
1789
1790 /* Handle ket. Look for subsequent max/min; for certain sets of values we
1791 have to replicate this bracket up to that many times. If brastackptr is
1792 0 this is an unmatched bracket which will generate an error, but take care
1793 not to try to access brastack[-1]. */
1794
1795 case ')':
1796 length += 3;
1797 {
1798 int minval = 1;
1799 int maxval = 1;
1800 int duplength = (brastackptr > 0)? length - brastack[--brastackptr] : 0;
1801
1802 /* Leave ptr at the final char; for read_repeat_counts this happens
1803 automatically; for the others we need an increment. */
1804
1805 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
1806 {
1807 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr);
1808 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1809 }
1810 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
1811 else if (c == '+') { maxval = -1; ptr++; }
1812 else if (c == '?') { minval = 0; ptr++; }
1813
1814 /* If there is a minimum > 1 we have to replicate up to minval-1 times;
1815 if there is a limited maximum we have to replicate up to maxval-1 times
1816 and allow for a BRAZERO item before each optional copy, as we also have
1817 to do before the first copy if the minimum is zero. */
1818
1819 if (minval == 0) length++;
1820 else if (minval > 1) length += (minval - 1) * duplength;
1821 if (maxval > minval) length += (maxval - minval) * (duplength + 1);
1822 }
1823 continue;
1824
1825 /* Non-special character. For a run of such characters the length required
1826 is the number of characters + 2, except that the maximum run length is 255.
1827 We won't get a skipped space or a non-data escape or the start of a #
1828 comment as the first character, so the length can't be zero. */
1829
1830 NORMAL_CHAR:
1831 default:
1832 length += 2;
1833 runlength = 0;
1834 do
1835 {
1836 if ((pcre_ctypes[c] & ctype_space) != 0)
1837 {
1838 if ((options & PCRE_EXTENDED) != 0) continue;
1839 spaces++;
1840 }
1841
1842 if (c == '#' && (options & PCRE_EXTENDED) != 0)
1843 {
1844 while ((c = *(++ptr)) != 0 && c != '\n');
1845 continue;
1846 }
1847
1848 /* Backslash may introduce a data char or a metacharacter; stop the
1849 string before the latter. */
1850
1851 if (c == '\\')
1852 {
1853 const uschar *saveptr = ptr;
1854 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
1855 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1856 if (c < 0) { ptr = saveptr; break; }
1857 }
1858
1859 /* Ordinary character or single-char escape */
1860
1861 runlength++;
1862 }
1863
1864 /* This "while" is the end of the "do" above. */
1865
1866 while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
1867
1868 ptr--;
1869 length += runlength;
1870 continue;
1871 }
1872 }
1873
1874 length += 4; /* For final KET and END */
1875
1876 if (length > 65539)
1877 {
1878 *errorptr = ERR20;
1879 return NULL;
1880 }
1881
1882 /* Compute the size of data block needed and get it, either from malloc or
1883 externally provided function. We specify "code[0]" in the offsetof() expression
1884 rather than just "code", because it has been reported that one broken compiler
1885 fails on "code" because it is also an independent variable. It should make no
1886 difference to the value of the offsetof(). */
1887
1888 size = length + offsetof(real_pcre, code[0]);
1889 re = (real_pcre *)(pcre_malloc)(size);
1890
1891 if (re == NULL)
1892 {
1893 *errorptr = ERR21;
1894 return NULL;
1895 }
1896
1897 /* Put in the magic number and the options. */
1898
1899 re->magic_number = MAGIC_NUMBER;
1900 re->options = options;
1901
1902 /* Set up a starting, non-extracting bracket, then compile the expression. On
1903 error, *errorptr will be set non-NULL, so we don't need to look at the result
1904 of the function here. */
1905
1906 ptr = (const uschar *)pattern;
1907 code = re->code;
1908 *code = OP_BRA;
1909 bracount = 0;
1910 (void)compile_regex(options, &bracount, &code, &ptr, errorptr);
1911 re->top_bracket = bracount;
1912 re->top_backref = top_backref;
1913
1914 /* If not reached end of pattern on success, there's an excess bracket. */
1915
1916 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
1917
1918 /* Fill in the terminating state and check for disastrous overflow, but
1919 if debugging, leave the test till after things are printed out. */
1920
1921 *code++ = OP_END;
1922
1923 #ifndef DEBUG
1924 if (code - re->code > length) *errorptr = ERR23;
1925 #endif
1926
1927 /* Failed to compile */
1928
1929 if (*errorptr != NULL)
1930 {
1931 (pcre_free)(re);
1932 PCRE_ERROR_RETURN:
1933 *erroroffset = ptr - (const uschar *)pattern;
1934 return NULL;
1935 }
1936
1937 /* If the anchored option was not passed, set flag if we can determine that it
1938 is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
1939 we can determine what the first character has to be, because that speeds up
1940 unanchored matches no end. In the case of multiline matches, an alternative is
1941 to set the PCRE_STARTLINE flag if all branches start with ^. */
1942
1943 if ((options & PCRE_ANCHORED) == 0)
1944 {
1945 if (is_anchored(re->code, (options & PCRE_MULTILINE) != 0))
1946 re->options |= PCRE_ANCHORED;
1947 else
1948 {
1949 int ch = find_firstchar(re->code);
1950 if (ch >= 0)
1951 {
1952 re->first_char = ch;
1953 re->options |= PCRE_FIRSTSET;
1954 }
1955 else if (is_startline(re->code))
1956 re->options |= PCRE_STARTLINE;
1957 }
1958 }
1959
1960 /* Print out the compiled data for debugging */
1961
1962 #ifdef DEBUG
1963
1964 printf("Length = %d top_bracket = %d top_backref=%d\n",
1965 length, re->top_bracket, re->top_backref);
1966
1967 if (re->options != 0)
1968 {
1969 printf("%s%s%s%s%s%s%s\n",
1970 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
1971 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
1972 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
1973 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
1974 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
1975 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
1976 ((re->options & PCRE_EXTRA) != 0)? "extra " : "");
1977 }
1978
1979 if ((re->options & PCRE_FIRSTSET) != 0)
1980 {
1981 if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
1982 else printf("First char = \\x%02x\n", re->first_char);
1983 }
1984
1985 code_end = code;
1986 code_base = code = re->code;
1987
1988 while (code < code_end)
1989 {
1990 int charlength;
1991
1992 printf("%3d ", code - code_base);
1993
1994 if (*code >= OP_BRA)
1995 {
1996 printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
1997 code += 2;
1998 }
1999
2000 else switch(*code)
2001 {
2002 case OP_CHARS:
2003 charlength = *(++code);
2004 printf("%3d ", charlength);
2005 while (charlength-- > 0)
2006 if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2007 break;
2008
2009 case OP_KETRMAX:
2010 case OP_KETRMIN:
2011 case OP_ALT:
2012 case OP_KET:
2013 case OP_ASSERT:
2014 case OP_ASSERT_NOT:
2015 case OP_ONCE:
2016 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2017 code += 2;
2018 break;
2019
2020 case OP_STAR:
2021 case OP_MINSTAR:
2022 case OP_PLUS:
2023 case OP_MINPLUS:
2024 case OP_QUERY:
2025 case OP_MINQUERY:
2026 case OP_TYPESTAR:
2027 case OP_TYPEMINSTAR:
2028 case OP_TYPEPLUS:
2029 case OP_TYPEMINPLUS:
2030 case OP_TYPEQUERY:
2031 case OP_TYPEMINQUERY:
2032 if (*code >= OP_TYPESTAR)
2033 printf(" %s", OP_names[code[1]]);
2034 else if (isprint(c = code[1])) printf(" %c", c);
2035 else printf(" \\x%02x", c);
2036 printf("%s", OP_names[*code++]);
2037 break;
2038
2039 case OP_EXACT:
2040 case OP_UPTO:
2041 case OP_MINUPTO:
2042 if (isprint(c = code[3])) printf(" %c{", c);
2043 else printf(" \\x%02x{", c);
2044 if (*code != OP_EXACT) printf("0,");
2045 printf("%d}", (code[1] << 8) + code[2]);
2046 if (*code == OP_MINUPTO) printf("?");
2047 code += 3;
2048 break;
2049
2050 case OP_TYPEEXACT:
2051 case OP_TYPEUPTO:
2052 case OP_TYPEMINUPTO:
2053 printf(" %s{", OP_names[code[3]]);
2054 if (*code != OP_TYPEEXACT) printf(",");
2055 printf("%d}", (code[1] << 8) + code[2]);
2056 if (*code == OP_TYPEMINUPTO) printf("?");
2057 code += 3;
2058 break;
2059
2060 case OP_NOT:
2061 if (isprint(c = *(++code))) printf(" [^%c]", c);
2062 else printf(" [^\\x%02x]", c);
2063 break;
2064
2065 case OP_NOTSTAR:
2066 case OP_NOTMINSTAR:
2067 case OP_NOTPLUS:
2068 case OP_NOTMINPLUS:
2069 case OP_NOTQUERY:
2070 case OP_NOTMINQUERY:
2071 if (isprint(c = code[1])) printf(" [^%c]", c);
2072 else printf(" [^\\x%02x]", c);
2073 printf("%s", OP_names[*code++]);
2074 break;
2075
2076 case OP_NOTEXACT:
2077 case OP_NOTUPTO:
2078 case OP_NOTMINUPTO:
2079 if (isprint(c = code[3])) printf(" [^%c]{", c);
2080 else printf(" [^\\x%02x]{", c);
2081 if (*code != OP_NOTEXACT) printf(",");
2082 printf("%d}", (code[1] << 8) + code[2]);
2083 if (*code == OP_NOTMINUPTO) printf("?");
2084 code += 3;
2085 break;
2086
2087 case OP_REF:
2088 printf(" \\%d", *(++code));
2089 code ++;
2090 goto CLASS_REF_REPEAT;
2091
2092 case OP_CLASS:
2093 {
2094 int i, min, max;
2095
2096 code++;
2097 printf(" [");
2098
2099 for (i = 0; i < 256; i++)
2100 {
2101 if ((code[i/8] & (1 << (i&7))) != 0)
2102 {
2103 int j;
2104 for (j = i+1; j < 256; j++)
2105 if ((code[j/8] & (1 << (j&7))) == 0) break;
2106 if (i == '-' || i == ']') printf("\\");
2107 if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2108 if (--j > i)
2109 {
2110 printf("-");
2111 if (j == '-' || j == ']') printf("\\");
2112 if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2113 }
2114 i = j;
2115 }
2116 }
2117 printf("]");
2118 code += 32;
2119
2120 CLASS_REF_REPEAT:
2121
2122 switch(*code)
2123 {
2124 case OP_CRSTAR:
2125 case OP_CRMINSTAR:
2126 case OP_CRPLUS:
2127 case OP_CRMINPLUS:
2128 case OP_CRQUERY:
2129 case OP_CRMINQUERY:
2130 printf("%s", OP_names[*code]);
2131 break;
2132
2133 case OP_CRRANGE:
2134 case OP_CRMINRANGE:
2135 min = (code[1] << 8) + code[2];
2136 max = (code[3] << 8) + code[4];
2137 if (max == 0) printf("{%d,}", min);
2138 else printf("{%d,%d}", min, max);
2139 if (*code == OP_CRMINRANGE) printf("?");
2140 code += 4;
2141 break;
2142
2143 default:
2144 code--;
2145 }
2146 }
2147 break;
2148
2149 /* Anything else is just a one-node item */
2150
2151 default:
2152 printf(" %s", OP_names[*code]);
2153 break;
2154 }
2155
2156 code++;
2157 printf("\n");
2158 }
2159 printf("------------------------------------------------------------------\n");
2160
2161 /* This check is done here in the debugging case so that the code that
2162 was compiled can be seen. */
2163
2164 if (code - re->code > length)
2165 {
2166 *errorptr = ERR23;
2167 (pcre_free)(re);
2168 *erroroffset = ptr - (uschar *)pattern;
2169 return NULL;
2170 }
2171 #endif
2172
2173 return (pcre *)re;
2174 }
2175
2176
2177
2178 /*************************************************
2179 * Match a character type *
2180 *************************************************/
2181
2182 /* Not used in all the places it might be as it's sometimes faster
2183 to put the code inline.
2184
2185 Arguments:
2186 type the character type
2187 c the character
2188 dotall the dotall flag
2189
2190 Returns: TRUE if character is of the type
2191 */
2192
2193 static BOOL
2194 match_type(int type, int c, BOOL dotall)
2195 {
2196
2197 #ifdef DEBUG
2198 if (isprint(c)) printf("matching subject %c against ", c);
2199 else printf("matching subject \\x%02x against ", c);
2200 printf("%s\n", OP_names[type]);
2201 #endif
2202
2203 switch(type)
2204 {
2205 case OP_ANY: return dotall || c != '\n';
2206 case OP_NOT_DIGIT: return (pcre_ctypes[c] & ctype_digit) == 0;
2207 case OP_DIGIT: return (pcre_ctypes[c] & ctype_digit) != 0;
2208 case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;
2209 case OP_WHITESPACE: return (pcre_ctypes[c] & ctype_space) != 0;
2210 case OP_NOT_WORDCHAR: return (pcre_ctypes[c] & ctype_word) == 0;
2211 case OP_WORDCHAR: return (pcre_ctypes[c] & ctype_word) != 0;
2212 }
2213 return FALSE;
2214 }
2215
2216
2217
2218 /*************************************************
2219 * Match a back-reference *
2220 *************************************************/
2221
2222 /* If a back reference hasn't been set, the match fails.
2223
2224 Arguments:
2225 number reference number
2226 eptr points into the subject
2227 length length to be matched
2228 md points to match data block
2229
2230 Returns: TRUE if matched
2231 */
2232
2233 static BOOL
2234 match_ref(int number, register const uschar *eptr, int length, match_data *md)
2235 {
2236 const uschar *p = md->start_subject + md->offset_vector[number];
2237
2238 #ifdef DEBUG
2239 if (eptr >= md->end_subject)
2240 printf("matching subject <null>");
2241 else
2242 {
2243 printf("matching subject ");
2244 pchars(eptr, length, TRUE, md);
2245 }
2246 printf(" against backref ");
2247 pchars(p, length, FALSE, md);
2248 printf("\n");
2249 #endif
2250
2251 /* Always fail if not enough characters left */
2252
2253 if (length > md->end_subject - p) return FALSE;
2254
2255 /* Separate the caselesss case for speed */
2256
2257 if (md->caseless)
2258 { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; }
2259 else
2260 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
2261
2262 return TRUE;
2263 }
2264
2265
2266
2267 /*************************************************
2268 * Match from current position *
2269 *************************************************/
2270
2271 /* On entry ecode points to the first opcode, and eptr to the first character.
2272
2273 Arguments:
2274 eptr pointer in subject
2275 ecode position in code
2276 offset_top current top pointer
2277 md pointer to "static" info for the match
2278
2279 Returns: TRUE if matched
2280 */
2281
2282 static BOOL
2283 match(register const uschar *eptr, register const uschar *ecode, int offset_top,
2284 match_data *md)
2285 {
2286 for (;;)
2287 {
2288 int min, max, ctype;
2289 register int i;
2290 register int c;
2291 BOOL minimize = FALSE;
2292
2293 /* Opening bracket. Check the alternative branches in turn, failing if none
2294 match. We have to set the start offset if required and there is space
2295 in the offset vector so that it is available for subsequent back references
2296 if the bracket matches. However, if the bracket fails, we must put back the
2297 previous value of both offsets in case they were set by a previous copy of
2298 the same bracket. Don't worry about setting the flag for the error case here;
2299 that is handled in the code for KET. */
2300
2301 if ((int)*ecode >= OP_BRA)
2302 {
2303 int number = (*ecode - OP_BRA) << 1;
2304 int save_offset1 = 0, save_offset2 = 0;
2305
2306 DPRINTF(("start bracket %d\n", number/2));
2307
2308 if (number > 0 && number < md->offset_end)
2309 {
2310 save_offset1 = md->offset_vector[number];
2311 save_offset2 = md->offset_vector[number+1];
2312 md->offset_vector[number] = eptr - md->start_subject;
2313
2314 DPRINTF(("saving %d %d\n", save_offset1, save_offset2));
2315 }
2316
2317 /* Recurse for all the alternatives. */
2318
2319 do
2320 {
2321 if (match(eptr, ecode+3, offset_top, md)) return TRUE;
2322 ecode += (ecode[1] << 8) + ecode[2];
2323 }
2324 while (*ecode == OP_ALT);
2325
2326 DPRINTF(("bracket %d failed\n", number/2));
2327
2328 if (number > 0 && number < md->offset_end)
2329 {
2330 md->offset_vector[number] = save_offset1;
2331 md->offset_vector[number+1] = save_offset2;
2332 }
2333
2334 return FALSE;
2335 }
2336
2337 /* Other types of node can be handled by a switch */
2338
2339 switch(*ecode)
2340 {
2341 case OP_END:
2342 md->end_match_ptr = eptr; /* Record where we ended */
2343 md->end_offset_top = offset_top; /* and how many extracts were taken */
2344 return TRUE;
2345
2346 /* The equivalent of Prolog's "cut" - if the rest doesn't match, the
2347 whole thing doesn't match, so we have to get out via a longjmp(). */
2348
2349 case OP_CUT:
2350 if (match(eptr, ecode+1, offset_top, md)) return TRUE;
2351 longjmp(md->fail_env, 1);
2352
2353 /* Assertion brackets. Check the alternative branches in turn - the
2354 matching won't pass the KET for an assertion. If any one branch matches,
2355 the assertion is true. */
2356
2357 case OP_ASSERT:
2358 do
2359 {
2360 if (match(eptr, ecode+3, offset_top, md)) break;
2361 ecode += (ecode[1] << 8) + ecode[2];
2362 }
2363 while (*ecode == OP_ALT);
2364 if (*ecode == OP_KET) return FALSE;
2365
2366 /* Continue from after the assertion, updating the offsets high water
2367 mark, since extracts may have been taken during the assertion. */
2368
2369 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2370 ecode += 3;
2371 offset_top = md->end_offset_top;
2372 continue;
2373
2374 /* Negative assertion: all branches must fail to match */
2375
2376 case OP_ASSERT_NOT:
2377 do
2378 {
2379 if (match(eptr, ecode+3, offset_top, md)) return FALSE;
2380 ecode += (ecode[1] << 8) + ecode[2];
2381 }
2382 while (*ecode == OP_ALT);
2383 ecode += 3;
2384 continue;
2385
2386 /* "Once" brackets are like assertion brackets except that after a match,
2387 the point in the subject string is not moved back. Thus there can never be
2388 a move back into the brackets. Check the alternative branches in turn - the
2389 matching won't pass the KET for this kind of subpattern. If any one branch
2390 matches, we carry on, leaving the subject pointer. */
2391
2392 case OP_ONCE:
2393 do
2394 {
2395 if (match(eptr, ecode+3, offset_top, md)) break;
2396 ecode += (ecode[1] << 8) + ecode[2];
2397 }
2398 while (*ecode == OP_ALT);
2399 if (*ecode == OP_KET) return FALSE;
2400
2401 /* Continue as from after the assertion, updating the offsets high water
2402 mark, since extracts may have been taken. */
2403
2404 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2405 ecode += 3;
2406 offset_top = md->end_offset_top;
2407 eptr = md->end_match_ptr;
2408 continue;
2409
2410 /* An alternation is the end of a branch; scan along to find the end of the
2411 bracketed group and go to there. */
2412
2413 case OP_ALT:
2414 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2415 break;
2416
2417 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
2418 that it may occur zero times. It may repeat infinitely, or not at all -
2419 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
2420 repeat limits are compiled as a number of copies, with the optional ones
2421 preceded by BRAZERO or BRAMINZERO. */
2422
2423 case OP_BRAZERO:
2424 {
2425 const uschar *next = ecode+1;
2426 if (match(eptr, next, offset_top, md)) return TRUE;
2427 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
2428 ecode = next + 3;
2429 }
2430 break;
2431
2432 case OP_BRAMINZERO:
2433 {
2434 const uschar *next = ecode+1;
2435 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
2436 if (match(eptr, next+3, offset_top, md)) return TRUE;
2437 ecode++;
2438 }
2439 break;;
2440
2441 /* End of a group, repeated or non-repeating. If we are at the end of
2442 an assertion "group", stop matching and return TRUE, but record the
2443 current high water mark for use by positive assertions. */
2444
2445 case OP_KET:
2446 case OP_KETRMIN:
2447 case OP_KETRMAX:
2448 {
2449 int number;
2450 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
2451
2452 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || *prev == OP_ONCE)
2453 {
2454 md->end_match_ptr = eptr; /* For ONCE */
2455 md->end_offset_top = offset_top;
2456 return TRUE;
2457 }
2458
2459 /* In all other cases we have to check the group number back at the
2460 start and if necessary complete handling an extraction by setting the
2461 final offset and bumping the high water mark. */
2462
2463 number = (*prev - OP_BRA) << 1;
2464
2465 DPRINTF(("end bracket %d\n", number/2));
2466
2467 if (number > 0)
2468 {
2469 if (number >= md->offset_end) md->offset_overflow = TRUE; else
2470 {
2471 md->offset_vector[number+1] = eptr - md->start_subject;
2472 if (offset_top <= number) offset_top = number + 2;
2473 }
2474 }
2475
2476 /* For a non-repeating ket, just advance to the next node and continue at
2477 this level. */
2478
2479 if (*ecode == OP_KET)
2480 {
2481 ecode += 3;
2482 break;
2483 }
2484
2485 /* The repeating kets try the rest of the pattern or restart from the
2486 preceding bracket, in the appropriate order. */
2487
2488 if (*ecode == OP_KETRMIN)
2489 {
2490 if (match(eptr, ecode+3, offset_top, md) ||
2491 match(eptr, prev, offset_top, md)) return TRUE;
2492 }
2493 else /* OP_KETRMAX */
2494 {
2495 if (match(eptr, prev, offset_top, md) ||
2496 match(eptr, ecode+3, offset_top, md)) return TRUE;
2497 }
2498 }
2499 return FALSE;
2500
2501 /* Start of subject unless notbol, or after internal newline if multiline */
2502
2503 case OP_CIRC:
2504 if (md->notbol && eptr == md->start_subject) return FALSE;
2505 if (md->multiline)
2506 {
2507 if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;
2508 ecode++;
2509 break;
2510 }
2511 /* ... else fall through */
2512
2513 /* Start of subject assertion */
2514
2515 case OP_SOD:
2516 if (eptr != md->start_subject) return FALSE;
2517 ecode++;
2518 break;
2519
2520 /* Assert before internal newline if multiline, or before
2521 a terminating newline unless endonly is set, else end of subject unless
2522 noteol is set. */
2523
2524 case OP_DOLL:
2525 if (md->noteol && eptr >= md->end_subject) return FALSE;
2526 if (md->multiline)
2527 {
2528 if (eptr < md->end_subject && *eptr != '\n') return FALSE;
2529 ecode++;
2530 break;
2531 }
2532 else if (!md->endonly)
2533 {
2534 if (eptr < md->end_subject - 1 ||
2535 (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
2536 ecode++;
2537 break;
2538 }
2539 /* ... else fall through */
2540
2541 /* End of subject assertion */
2542
2543 case OP_EOD:
2544 if (eptr < md->end_subject) return FALSE;
2545 ecode++;
2546 break;
2547
2548 /* Word boundary assertions */
2549
2550 case OP_NOT_WORD_BOUNDARY:
2551 case OP_WORD_BOUNDARY:
2552 {
2553 BOOL prev_is_word = (eptr != md->start_subject) &&
2554 ((pcre_ctypes[eptr[-1]] & ctype_word) != 0);
2555 BOOL cur_is_word = (eptr < md->end_subject) &&
2556 ((pcre_ctypes[*eptr] & ctype_word) != 0);
2557 if ((*ecode++ == OP_WORD_BOUNDARY)?
2558 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2559 return FALSE;
2560 }
2561 break;
2562
2563 /* Match a single character type; inline for speed */
2564
2565 case OP_ANY:
2566 if (!md->dotall && eptr < md->end_subject && *eptr == '\n') return FALSE;
2567 if (eptr++ >= md->end_subject) return FALSE;
2568 ecode++;
2569 break;
2570
2571 case OP_NOT_DIGIT:
2572 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) != 0)
2573 return FALSE;
2574 ecode++;
2575 break;
2576
2577 case OP_DIGIT:
2578 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) == 0)
2579 return FALSE;
2580 ecode++;
2581 break;
2582
2583 case OP_NOT_WHITESPACE:
2584 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) != 0)
2585 return FALSE;
2586 ecode++;
2587 break;
2588
2589 case OP_WHITESPACE:
2590 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) == 0)
2591 return FALSE;
2592 ecode++;
2593 break;
2594
2595 case OP_NOT_WORDCHAR:
2596 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) != 0)
2597 return FALSE;
2598 ecode++;
2599 break;
2600
2601 case OP_WORDCHAR:
2602 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) == 0)
2603 return FALSE;
2604 ecode++;
2605 break;
2606
2607 /* Match a back reference, possibly repeatedly. Look past the end of the
2608 item to see if there is repeat information following. The code is similar
2609 to that for character classes, but repeated for efficiency. Then obey
2610 similar code to character type repeats - written out again for speed.
2611 However, if the referenced string is the empty string, always treat
2612 it as matched, any number of times (otherwise there could be infinite
2613 loops). */
2614
2615 case OP_REF:
2616 {
2617 int length;
2618 int number = ecode[1] << 1; /* Doubled reference number */
2619 ecode += 2; /* Advance past the item */
2620
2621 if (number >= offset_top || md->offset_vector[number] < 0)
2622 {
2623 md->errorcode = PCRE_ERROR_BADREF;
2624 return FALSE;
2625 }
2626
2627 length = md->offset_vector[number+1] - md->offset_vector[number];
2628
2629 switch (*ecode)
2630 {
2631 case OP_CRSTAR:
2632 case OP_CRMINSTAR:
2633 case OP_CRPLUS:
2634 case OP_CRMINPLUS:
2635 case OP_CRQUERY:
2636 case OP_CRMINQUERY:
2637 c = *ecode++ - OP_CRSTAR;
2638 minimize = (c & 1) != 0;
2639 min = rep_min[c]; /* Pick up values from tables; */
2640 max = rep_max[c]; /* zero for max => infinity */
2641 if (max == 0) max = INT_MAX;
2642 break;
2643
2644 case OP_CRRANGE:
2645 case OP_CRMINRANGE:
2646 minimize = (*ecode == OP_CRMINRANGE);
2647 min = (ecode[1] << 8) + ecode[2];
2648 max = (ecode[3] << 8) + ecode[4];
2649 if (max == 0) max = INT_MAX;
2650 ecode += 5;
2651 break;
2652
2653 default: /* No repeat follows */
2654 if (!match_ref(number, eptr, length, md)) return FALSE;
2655 eptr += length;
2656 continue; /* With the main loop */
2657 }
2658
2659 /* If the length of the reference is zero, just continue with the
2660 main loop. */
2661
2662 if (length == 0) continue;
2663
2664 /* First, ensure the minimum number of matches are present. We get back
2665 the length of the reference string explicitly rather than passing the
2666 address of eptr, so that eptr can be a register variable. */
2667
2668 for (i = 1; i <= min; i++)
2669 {
2670 if (!match_ref(number, eptr, length, md)) return FALSE;
2671 eptr += length;
2672 }
2673
2674 /* If min = max, continue at the same level without recursion.
2675 They are not both allowed to be zero. */
2676
2677 if (min == max) continue;
2678
2679 /* If minimizing, keep trying and advancing the pointer */
2680
2681 if (minimize)
2682 {
2683 for (i = min;; i++)
2684 {
2685 if (match(eptr, ecode, offset_top, md)) return TRUE;
2686 if (i >= max || !match_ref(number, eptr, length, md))
2687 return FALSE;
2688 eptr += length;
2689 }
2690 /* Control never gets here */
2691 }
2692
2693 /* If maximizing, find the longest string and work backwards */
2694
2695 else
2696 {
2697 const uschar *pp = eptr;
2698 for (i = min; i < max; i++)
2699 {
2700 if (!match_ref(number, eptr, length, md)) break;
2701 eptr += length;
2702 }
2703 while (eptr >= pp)
2704 {
2705 if (match(eptr, ecode, offset_top, md)) return TRUE;
2706 eptr -= length;
2707 }
2708 return FALSE;
2709 }
2710 }
2711 /* Control never gets here */
2712
2713 /* Match a character class, possibly repeatedly. Look past the end of the
2714 item to see if there is repeat information following. Then obey similar
2715 code to character type repeats - written out again for speed. If caseless
2716 matching was set at runtime but not at compile time, we have to check both
2717 versions of a character. */
2718
2719 case OP_CLASS:
2720 {
2721 const uschar *data = ecode + 1; /* Save for matching */
2722 ecode += 33; /* Advance past the item */
2723
2724 switch (*ecode)
2725 {
2726 case OP_CRSTAR:
2727 case OP_CRMINSTAR:
2728 case OP_CRPLUS:
2729 case OP_CRMINPLUS:
2730 case OP_CRQUERY:
2731 case OP_CRMINQUERY:
2732 c = *ecode++ - OP_CRSTAR;
2733 minimize = (c & 1) != 0;
2734 min = rep_min[c]; /* Pick up values from tables; */
2735 max = rep_max[c]; /* zero for max => infinity */
2736 if (max == 0) max = INT_MAX;
2737 break;
2738
2739 case OP_CRRANGE:
2740 case OP_CRMINRANGE:
2741 minimize = (*ecode == OP_CRMINRANGE);
2742 min = (ecode[1] << 8) + ecode[2];
2743 max = (ecode[3] << 8) + ecode[4];
2744 if (max == 0) max = INT_MAX;
2745 ecode += 5;
2746 break;
2747
2748 default: /* No repeat follows */
2749 if (eptr >= md->end_subject) return FALSE;
2750 c = *eptr++;
2751 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
2752 if (md->runtime_caseless)
2753 {
2754 c = pcre_fcc[c];
2755 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
2756 }
2757 return FALSE;
2758 }
2759
2760 /* First, ensure the minimum number of matches are present. */
2761
2762 for (i = 1; i <= min; i++)
2763 {
2764 if (eptr >= md->end_subject) return FALSE;
2765 c = *eptr++;
2766 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2767 if (md->runtime_caseless)
2768 {
2769 c = pcre_fcc[c];
2770 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2771 }
2772 return FALSE;
2773 }
2774
2775 /* If max == min we can continue with the main loop without the
2776 need to recurse. */
2777
2778 if (min == max) continue;
2779
2780 /* If minimizing, keep testing the rest of the expression and advancing
2781 the pointer while it matches the class. */
2782
2783 if (minimize)
2784 {
2785 for (i = min;; i++)
2786 {
2787 if (match(eptr, ecode, offset_top, md)) return TRUE;
2788 if (i >= max || eptr >= md->end_subject) return FALSE;
2789 c = *eptr++;
2790 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2791 if (md->runtime_caseless)
2792 {
2793 c = pcre_fcc[c];
2794 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2795 }
2796 return FALSE;
2797 }
2798 /* Control never gets here */
2799 }
2800
2801 /* If maximizing, find the longest possible run, then work backwards. */
2802
2803 else
2804 {
2805 const uschar *pp = eptr;
2806 for (i = min; i < max; eptr++, i++)
2807 {
2808 if (eptr >= md->end_subject) break;
2809 c = *eptr;
2810 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2811 if (md->runtime_caseless)
2812 {
2813 c = pcre_fcc[c];
2814 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2815 }
2816 break;
2817 }
2818
2819 while (eptr >= pp)
2820 if (match(eptr--, ecode, offset_top, md)) return TRUE;
2821 return FALSE;
2822 }
2823 }
2824 /* Control never gets here */
2825
2826 /* Match a run of characters */
2827
2828 case OP_CHARS:
2829 {
2830 register int length = ecode[1];
2831 ecode += 2;
2832
2833 #ifdef DEBUG /* Sigh. Some compilers never learn. */
2834 if (eptr >= md->end_subject)
2835 printf("matching subject <null> against pattern ");
2836 else
2837 {
2838 printf("matching subject ");
2839 pchars(eptr, length, TRUE, md);
2840 printf(" against pattern ");
2841 }
2842 pchars(ecode, length, FALSE, md);
2843 printf("\n");
2844 #endif
2845
2846 if (length > md->end_subject - eptr) return FALSE;
2847 if (md->caseless)
2848 {
2849 while (length-- > 0) if (pcre_lcc[*ecode++] != pcre_lcc[*eptr++]) return FALSE;
2850 }
2851 else
2852 {
2853 while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
2854 }
2855 }
2856 break;
2857
2858 /* Match a single character repeatedly; different opcodes share code. */
2859
2860 case OP_EXACT:
2861 min = max = (ecode[1] << 8) + ecode[2];
2862 ecode += 3;
2863 goto REPEATCHAR;
2864
2865 case OP_UPTO:
2866 case OP_MINUPTO:
2867 min = 0;
2868 max = (ecode[1] << 8) + ecode[2];
2869 minimize = *ecode == OP_MINUPTO;
2870 ecode += 3;
2871 goto REPEATCHAR;
2872
2873 case OP_STAR:
2874 case OP_MINSTAR:
2875 case OP_PLUS:
2876 case OP_MINPLUS:
2877 case OP_QUERY:
2878 case OP_MINQUERY:
2879 c = *ecode++ - OP_STAR;
2880 minimize = (c & 1) != 0;
2881 min = rep_min[c]; /* Pick up values from tables; */
2882 max = rep_max[c]; /* zero for max => infinity */
2883 if (max == 0) max = INT_MAX;
2884
2885 /* Common code for all repeated single-character matches. We can give
2886 up quickly if there are fewer than the minimum number of characters left in
2887 the subject. */
2888
2889 REPEATCHAR:
2890 if (min > md->end_subject - eptr) return FALSE;
2891 c = *ecode++;
2892
2893 /* The code is duplicated for the caseless and caseful cases, for speed,
2894 since matching characters is likely to be quite common. First, ensure the
2895 minimum number of matches are present. If min = max, continue at the same
2896 level without recursing. Otherwise, if minimizing, keep trying the rest of
2897 the expression and advancing one matching character if failing, up to the
2898 maximum. Alternatively, if maximizing, find the maximum number of
2899 characters and work backwards. */
2900
2901 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
2902 max, eptr));
2903
2904 if (md->caseless)
2905 {
2906 c = pcre_lcc[c];
2907 for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) return FALSE;
2908 if (min == max) continue;
2909 if (minimize)
2910 {
2911 for (i = min;; i++)
2912 {
2913 if (match(eptr, ecode, offset_top, md)) return TRUE;
2914 if (i >= max || eptr >= md->end_subject || c != pcre_lcc[*eptr++])
2915 return FALSE;
2916 }
2917 /* Control never gets here */
2918 }
2919 else
2920 {
2921 const uschar *pp = eptr;
2922 for (i = min; i < max; i++)
2923 {
2924 if (eptr >= md->end_subject || c != pcre_lcc[*eptr]) break;
2925 eptr++;
2926 }
2927 while (eptr >= pp)
2928 if (match(eptr--, ecode, offset_top, md)) return TRUE;
2929 return FALSE;
2930 }
2931 /* Control never gets here */
2932 }
2933
2934 /* Caseful comparisons */
2935
2936 else
2937 {
2938 for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
2939 if (min == max) continue;
2940 if (minimize)
2941 {
2942 for (i = min;; i++)
2943 {
2944 if (match(eptr, ecode, offset_top, md)) return TRUE;
2945 if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
2946 }
2947 /* Control never gets here */
2948 }
2949 else
2950 {
2951 const uschar *pp = eptr;
2952 for (i = min; i < max; i++)
2953 {
2954 if (eptr >= md->end_subject || c != *eptr) break;
2955 eptr++;
2956 }
2957 while (eptr >= pp)
2958 if (match(eptr--, ecode, offset_top, md)) return TRUE;
2959 return FALSE;
2960 }
2961 }
2962 /* Control never gets here */
2963
2964 /* Match a negated single character */
2965
2966 case OP_NOT:
2967 if (eptr >= md->end_subject) return FALSE;
2968 ecode++;
2969 if (md->caseless)
2970 {
2971 if (pcre_lcc[*ecode++] == pcre_lcc[*eptr++]) return FALSE;
2972 }
2973 else
2974 {
2975 if (*ecode++ == *eptr++) return FALSE;
2976 }
2977 break;
2978
2979 /* Match a negated single character repeatedly. This is almost a repeat of
2980 the code for a repeated single character, but I haven't found a nice way of
2981 commoning these up that doesn't require a test of the positive/negative
2982 option for each character match. Maybe that wouldn't add very much to the
2983 time taken, but character matching *is* what this is all about... */
2984
2985 case OP_NOTEXACT:
2986 min = max = (ecode[1] << 8) + ecode[2];
2987 ecode += 3;
2988 goto REPEATNOTCHAR;
2989
2990 case OP_NOTUPTO:
2991 case OP_NOTMINUPTO:
2992 min = 0;
2993 max = (ecode[1] << 8) + ecode[2];
2994 minimize = *ecode == OP_NOTMINUPTO;
2995 ecode += 3;
2996 goto REPEATNOTCHAR;
2997
2998 case OP_NOTSTAR:
2999 case OP_NOTMINSTAR:
3000 case OP_NOTPLUS:
3001 case OP_NOTMINPLUS:
3002 case OP_NOTQUERY:
3003 case OP_NOTMINQUERY:
3004 c = *ecode++ - OP_NOTSTAR;
3005 minimize = (c & 1) != 0;
3006 min = rep_min[c]; /* Pick up values from tables; */
3007 max = rep_max[c]; /* zero for max => infinity */
3008 if (max == 0) max = INT_MAX;
3009
3010 /* Common code for all repeated single-character matches. We can give
3011 up quickly if there are fewer than the minimum number of characters left in
3012 the subject. */
3013
3014 REPEATNOTCHAR:
3015 if (min > md->end_subject - eptr) return FALSE;
3016 c = *ecode++;
3017
3018 /* The code is duplicated for the caseless and caseful cases, for speed,
3019 since matching characters is likely to be quite common. First, ensure the
3020 minimum number of matches are present. If min = max, continue at the same
3021 level without recursing. Otherwise, if minimizing, keep trying the rest of
3022 the expression and advancing one matching character if failing, up to the
3023 maximum. Alternatively, if maximizing, find the maximum number of
3024 characters and work backwards. */
3025
3026 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
3027 max, eptr));
3028
3029 if (md->caseless)
3030 {
3031 c = pcre_lcc[c];
3032 for (i = 1; i <= min; i++) if (c == pcre_lcc[*eptr++]) return FALSE;
3033 if (min == max) continue;
3034 if (minimize)
3035 {
3036 for (i = min;; i++)
3037 {
3038 if (match(eptr, ecode, offset_top, md)) return TRUE;
3039 if (i >= max || eptr >= md->end_subject || c == pcre_lcc[*eptr++])
3040 return FALSE;
3041 }
3042 /* Control never gets here */
3043 }
3044 else
3045 {
3046 const uschar *pp = eptr;
3047 for (i = min; i < max; i++)
3048 {
3049 if (eptr >= md->end_subject || c == pcre_lcc[*eptr]) break;
3050 eptr++;
3051 }
3052 while (eptr >= pp)
3053 if (match(eptr--, ecode, offset_top, md)) return TRUE;
3054 return FALSE;
3055 }
3056 /* Control never gets here */
3057 }
3058
3059 /* Caseful comparisons */
3060
3061 else
3062 {
3063 for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
3064 if (min == max) continue;
3065 if (minimize)
3066 {
3067 for (i = min;; i++)
3068 {
3069 if (match(eptr, ecode, offset_top, md)) return TRUE;
3070 if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
3071 }
3072 /* Control never gets here */
3073 }
3074 else
3075 {
3076 const uschar *pp = eptr;
3077 for (i = min; i < max; i++)
3078 {
3079 if (eptr >= md->end_subject || c == *eptr) break;
3080 eptr++;
3081 }
3082 while (eptr >= pp)
3083 if (match(eptr--, ecode, offset_top, md)) return TRUE;
3084 return FALSE;
3085 }
3086 }
3087 /* Control never gets here */
3088
3089 /* Match a single character type repeatedly; several different opcodes
3090 share code. This is very similar to the code for single characters, but we
3091 repeat it in the interests of efficiency. */
3092
3093 case OP_TYPEEXACT:
3094 min = max = (ecode[1] << 8) + ecode[2];
3095 minimize = TRUE;
3096 ecode += 3;
3097 goto REPEATTYPE;
3098
3099 case OP_TYPEUPTO:
3100 case OP_TYPEMINUPTO:
3101 min = 0;
3102 max = (ecode[1] << 8) + ecode[2];
3103 minimize = *ecode == OP_TYPEMINUPTO;
3104 ecode += 3;
3105 goto REPEATTYPE;
3106
3107 case OP_TYPESTAR:
3108 case OP_TYPEMINSTAR:
3109 case OP_TYPEPLUS:
3110 case OP_TYPEMINPLUS:
3111 case OP_TYPEQUERY:
3112 case OP_TYPEMINQUERY:
3113 c = *ecode++ - OP_TYPESTAR;
3114 minimize = (c & 1) != 0;
3115 min = rep_min[c]; /* Pick up values from tables; */
3116 max = rep_max[c]; /* zero for max => infinity */
3117 if (max == 0) max = INT_MAX;
3118
3119 /* Common code for all repeated single character type matches */
3120
3121 REPEATTYPE:
3122 ctype = *ecode++; /* Code for the character type */
3123
3124 /* First, ensure the minimum number of matches are present. Use inline
3125 code for maximizing the speed, and do the type test once at the start
3126 (i.e. keep it out of the loop). Also test that there are at least the
3127 minimum number of characters before we start. */
3128
3129 if (min > md->end_subject - eptr) return FALSE;
3130 if (min > 0) switch(ctype)
3131 {
3132 case OP_ANY:
3133 if (!md->dotall)
3134 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }
3135 else eptr += min;
3136 break;
3137
3138 case OP_NOT_DIGIT:
3139 for (i = 1; i <= min; i++)
3140 if ((pcre_ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
3141 break;
3142
3143 case OP_DIGIT:
3144 for (i = 1; i <= min; i++)
3145 if ((pcre_ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
3146 break;
3147
3148 case OP_NOT_WHITESPACE:
3149 for (i = 1; i <= min; i++)
3150 if ((pcre_ctypes[*eptr++] & ctype_space) != 0) return FALSE;
3151 break;
3152
3153 case OP_WHITESPACE:
3154 for (i = 1; i <= min; i++)
3155 if ((pcre_ctypes[*eptr++] & ctype_space) == 0) return FALSE;
3156 break;
3157
3158 case OP_NOT_WORDCHAR:
3159 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) != 0)
3160 return FALSE;
3161 break;
3162
3163 case OP_WORDCHAR:
3164 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) == 0)
3165 return FALSE;
3166 break;
3167 }
3168
3169 /* If min = max, continue at the same level without recursing */
3170
3171 if (min == max) continue;
3172
3173 /* If minimizing, we have to test the rest of the pattern before each
3174 subsequent match, so inlining isn't much help; just use the function. */
3175
3176 if (minimize)
3177 {
3178 for (i = min;; i++)
3179 {
3180 if (match(eptr, ecode, offset_top, md)) return TRUE;
3181 if (i >= max || eptr >= md->end_subject ||
3182 !match_type(ctype, *eptr++, md->dotall))
3183 return FALSE;
3184 }
3185 /* Control never gets here */
3186 }
3187
3188 /* If maximizing it is worth using inline code for speed, doing the type
3189 test once at the start (i.e. keep it out of the loop). */
3190
3191 else
3192 {
3193 const uschar *pp = eptr;
3194 switch(ctype)
3195 {
3196 case OP_ANY:
3197 if (!md->dotall)
3198 {
3199 for (i = min; i < max; i++)
3200 {
3201 if (eptr >= md->end_subject || *eptr == '\n') break;
3202 eptr++;
3203 }
3204 }
3205 else
3206 {
3207 c = max - min;
3208 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
3209 eptr += c;
3210 }
3211 break;
3212
3213 case OP_NOT_DIGIT:
3214 for (i = min; i < max; i++)
3215 {
3216 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) != 0)
3217 break;
3218 eptr++;
3219 }
3220 break;
3221
3222 case OP_DIGIT:
3223 for (i = min; i < max; i++)
3224 {
3225 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) == 0)
3226 break;
3227 eptr++;
3228 }
3229 break;
3230
3231 case OP_NOT_WHITESPACE:
3232 for (i = min; i < max; i++)
3233 {
3234 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) != 0)
3235 break;
3236 eptr++;
3237 }
3238 break;
3239
3240 case OP_WHITESPACE:
3241 for (i = min; i < max; i++)
3242 {
3243 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) == 0)
3244 break;
3245 eptr++;
3246 }
3247 break;
3248
3249 case OP_NOT_WORDCHAR:
3250 for (i = min; i < max; i++)
3251 {
3252 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) != 0)
3253 break;
3254 eptr++;
3255 }
3256 break;
3257
3258 case OP_WORDCHAR:
3259 for (i = min; i < max; i++)
3260 {
3261 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) == 0)
3262 break;
3263 eptr++;
3264 }
3265 break;
3266 }
3267
3268 while (eptr >= pp)
3269 if (match(eptr--, ecode, offset_top, md)) return TRUE;
3270 return FALSE;
3271 }
3272 /* Control never gets here */
3273
3274 /* There's been some horrible disaster. */
3275
3276 default:
3277 DPRINTF(("Unknown opcode %d\n", *ecode));
3278 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
3279 return FALSE;
3280 }
3281
3282 /* Do not stick any code in here without much thought; it is assumed
3283 that "continue" in the code above comes out to here to repeat the main
3284 loop. */
3285
3286 } /* End of main loop */
3287 /* Control never reaches here */
3288 }
3289
3290
3291
3292 /*************************************************
3293 * Segregate setjmp() *
3294 *************************************************/
3295
3296 /* The -Wall option of gcc gives warnings for all local variables when setjmp()
3297 is used, even if the coding conforms to the rules of ANSI C. To avoid this, we
3298 hide it in a separate function. This is called only when PCRE_EXTRA is set,
3299 since it's needed only for the extension \X option, and with any luck, a good
3300 compiler will spot the tail recursion and compile it efficiently.
3301
3302 Arguments:
3303 eptr pointer in subject
3304 ecode position in code
3305 offset_top current top pointer
3306 md pointer to "static" info for the match
3307
3308 Returns: TRUE if matched
3309 */
3310
3311 static BOOL
3312 match_with_setjmp(const uschar *eptr, const uschar *ecode, int offset_top,
3313 match_data *match_block)
3314 {
3315 return setjmp(match_block->fail_env) == 0 &&
3316 match(eptr, ecode, offset_top, match_block);
3317 }
3318
3319
3320
3321 /*************************************************
3322 * Execute a Regular Expression *
3323 *************************************************/
3324
3325 /* This function applies a compiled re to a subject string and picks out
3326 portions of the string if it matches. Two elements in the vector are set for
3327 each substring: the offsets to the start and end of the substring.
3328
3329 Arguments:
3330 external_re points to the compiled expression
3331 external_extra points to "hints" from pcre_study() or is NULL
3332 subject points to the subject string
3333 length length of subject string (may contain binary zeros)
3334 options option bits
3335 offsets points to a vector of ints to be filled in with offsets
3336 offsetcount the number of elements in the vector
3337
3338 Returns: > 0 => success; value is the number of elements filled in
3339 = 0 => success, but offsets is not big enough
3340 -1 => failed to match
3341 < -1 => some kind of unexpected problem
3342 */
3343
3344 int
3345 pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
3346 const char *subject, int length, int options, int *offsets, int offsetcount)
3347 {
3348 int resetcount, ocount;
3349 int first_char = -1;
3350 match_data match_block;
3351 const uschar *start_bits = NULL;
3352 const uschar *start_match = (const uschar *)subject;
3353 const uschar *end_subject;
3354 const real_pcre *re = (const real_pcre *)external_re;
3355 const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
3356 BOOL using_temporary_offsets = FALSE;
3357 BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
3358 BOOL startline = (re->options & PCRE_STARTLINE) != 0;
3359
3360 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3361
3362 if (re == NULL || subject == NULL ||
3363 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3364 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
3365
3366 match_block.start_subject = (const uschar *)subject;
3367 match_block.end_subject = match_block.start_subject + length;
3368 end_subject = match_block.end_subject;
3369
3370 match_block.caseless = ((re->options | options) & PCRE_CASELESS) != 0;
3371 match_block.runtime_caseless = match_block.caseless &&
3372 (re->options & PCRE_CASELESS) == 0;
3373
3374 match_block.multiline = ((re->options | options) & PCRE_MULTILINE) != 0;
3375 match_block.dotall = ((re->options | options) & PCRE_DOTALL) != 0;
3376 match_block.endonly = ((re->options | options) & PCRE_DOLLAR_ENDONLY) != 0;
3377
3378 match_block.notbol = (options & PCRE_NOTBOL) != 0;
3379 match_block.noteol = (options & PCRE_NOTEOL) != 0;
3380
3381 match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
3382
3383 /* If the expression has got more back references than the offsets supplied can
3384 hold, we get a temporary bit of working store to use during the matching.
3385 Otherwise, we can use the vector supplied, rounding down its size to a multiple
3386 of 2. */
3387
3388 ocount = offsetcount & (-2);
3389 if (re->top_backref > 0 && re->top_backref >= ocount/2)
3390 {
3391 ocount = re->top_backref * 2 + 2;
3392 match_block.offset_vector = (pcre_malloc)(ocount * sizeof(int));
3393 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
3394 using_temporary_offsets = TRUE;
3395 DPRINTF(("Got memory to hold back references\n"));
3396 }
3397 else match_block.offset_vector = offsets;
3398
3399 match_block.offset_end = ocount;
3400 match_block.offset_overflow = FALSE;
3401
3402 /* Compute the minimum number of offsets that we need to reset each time. Doing
3403 this makes a huge difference to execution time when there aren't many brackets
3404 in the pattern. */
3405
3406 resetcount = 2 + re->top_bracket * 2;
3407 if (resetcount > offsetcount) resetcount = ocount;
3408
3409 /* If MULTILINE is set at exec time but was not set at compile time, and the
3410 anchored flag is set, we must re-check because a setting provoked by ^ in the
3411 pattern is not right in multi-line mode. Calling is_anchored() again here does
3412 the right check, because multiline is now set. If it now yields FALSE, the
3413 expression must have had ^ starting some of its branches. Check to see if
3414 that is true for *all* branches, and if so, set the startline flag. */
3415
3416 if (match_block. multiline && anchored && (re->options & PCRE_MULTILINE) == 0 &&
3417 !is_anchored(re->code, match_block.multiline))
3418 {
3419 anchored = FALSE;
3420 if (is_startline(re->code)) startline = TRUE;
3421 }
3422
3423 /* Set up the first character to match, if available. The first_char value is
3424 never set for an anchored regular expression, but the anchoring may be forced
3425 at run time, so we have to test for anchoring. The first char may be unset for
3426 an unanchored pattern, of course. If there's no first char and the pattern was
3427 studied, the may be a bitmap of possible first characters. However, we can
3428 use this only if the caseless state of the studying was correct. */
3429
3430 if (!anchored)
3431 {
3432 if ((re->options & PCRE_FIRSTSET) != 0)
3433 {
3434 first_char = re->first_char;
3435 if (match_block.caseless) first_char = pcre_lcc[first_char];
3436 }
3437 else
3438 if (!startline && extra != NULL &&
3439 (extra->options & PCRE_STUDY_MAPPED) != 0 &&
3440 ((extra->options & PCRE_STUDY_CASELESS) != 0) == match_block.caseless)
3441 start_bits = extra->start_bits;
3442 }
3443
3444 /* Loop for unanchored matches; for anchored regexps the loop runs just once. */
3445
3446 do
3447 {
3448 int rc;
3449 register int *iptr = match_block.offset_vector;
3450 register int *iend = iptr + resetcount;
3451
3452 /* Reset the maximum number of extractions we might see. */
3453
3454 while (iptr < iend) *iptr++ = -1;
3455
3456 /* Advance to a unique first char if possible */
3457
3458 if (first_char >= 0)
3459 {
3460 if (match_block.caseless)
3461 while (start_match < end_subject && pcre_lcc[*start_match] != first_char)
3462 start_match++;
3463 else
3464 while (start_match < end_subject && *start_match != first_char)
3465 start_match++;
3466 }
3467
3468 /* Or to just after \n for a multiline match if possible */
3469
3470 else if (startline)
3471 {
3472 if (start_match > match_block.start_subject)
3473 {
3474 while (start_match < end_subject && start_match[-1] != '\n')
3475 start_match++;
3476 }
3477 }
3478
3479 /* Or to a non-unique first char */
3480
3481 else if (start_bits != NULL)
3482 {
3483 while (start_match < end_subject)
3484 {
3485 register int c = *start_match;
3486 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
3487 }
3488 }
3489
3490 #ifdef DEBUG /* Sigh. Some compilers never learn. */
3491 printf(">>>> Match against: ");
3492 pchars(start_match, end_subject - start_match, TRUE, &match_block);
3493 printf("\n");
3494 #endif
3495
3496 /* When a match occurs, substrings will be set for all internal extractions;
3497 we just need to set up the whole thing as substring 0 before returning. If
3498 there were too many extractions, set the return code to zero. In the case
3499 where we had to get some local store to hold offsets for backreferences, copy
3500 those back references that we can. In this case there need not be overflow
3501 if certain parts of the pattern were not used.
3502
3503 Before starting the match, we have to set up a longjmp() target to enable
3504 the "cut" operation to fail a match completely without backtracking. This
3505 is done in a separate function to avoid compiler warnings. We need not do
3506 it unless PCRE_EXTRA is set, since only in that case is the "cut" operation
3507 enabled. */
3508
3509 if ((re->options & PCRE_EXTRA) != 0)
3510 {
3511 if (!match_with_setjmp(start_match, re->code, 2, &match_block))
3512 continue;
3513 }
3514 else if (!match(start_match, re->code, 2, &match_block)) continue;
3515
3516 /* Copy the offset information from temporary store if necessary */
3517
3518 if (using_temporary_offsets)
3519 {
3520 if (offsetcount >= 4)
3521 {
3522 memcpy(offsets + 2, match_block.offset_vector + 2,
3523 (offsetcount - 2) * sizeof(int));
3524 DPRINTF(("Copied offsets from temporary memory\n"));
3525 }
3526 if (match_block.end_offset_top > offsetcount)
3527 match_block.offset_overflow = TRUE;
3528
3529 DPRINTF(("Freeing temporary memory\n"));
3530 (pcre_free)(match_block.offset_vector);
3531 }
3532
3533 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
3534
3535 if (match_block.offset_end < 2) rc = 0; else
3536 {
3537 offsets[0] = start_match - match_block.start_subject;
3538 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
3539 }
3540
3541 DPRINTF((">>>> returning %d\n", rc));
3542 return rc;
3543 }
3544 while (!anchored &&
3545 match_block.errorcode == PCRE_ERROR_NOMATCH &&
3546 start_match++ < end_subject);
3547
3548 if (using_temporary_offsets)
3549 {
3550 DPRINTF(("Freeing temporary memory\n"));
3551 (pcre_free)(match_block.offset_vector);
3552 }
3553
3554 DPRINTF((">>>> returning %d\n", match_block.errorcode));
3555
3556 return match_block.errorcode;
3557 }
3558
3559 /* End of pcre.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12