/[pcre]/code/trunk/pcre.c
ViewVC logotype

Contents of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 5 - (show annotations) (download)
Sat Feb 24 21:38:05 2007 UTC (7 years, 9 months ago) by nigel
File MIME type: text/plain
File size: 99029 byte(s)
Load pcre-1.01 into code/trunk.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /*
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
9
10 Written by: Philip Hazel <ph10@cam.ac.uk>
11
12 Copyright (c) 1997 University of Cambridge
13
14 -----------------------------------------------------------------------------
15 Permission is granted to anyone to use this software for any purpose on any
16 computer system, and to redistribute it freely, subject to the following
17 restrictions:
18
19 1. This software is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22
23 2. The origin of this software must not be misrepresented, either by
24 explicit claim or by omission.
25
26 3. Altered versions must be plainly marked as such, and must not be
27 misrepresented as being the original software.
28 -----------------------------------------------------------------------------
29 */
30
31
32 /* Define DEBUG to get debugging output on stdout. */
33
34 /* #define DEBUG */
35
36
37 /* Include the internals header, which itself includes Standard C headers plus
38 the external pcre header. */
39
40 #include "internal.h"
41
42
43 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
44
45 static char rep_min[] = { 0, 0, 1, 1, 0, 0 };
46 static char rep_max[] = { 0, 0, 0, 0, 1, 1 };
47
48 /* Text forms of OP_ values and things, for debugging */
49
50 #ifdef DEBUG
51 static char *OP_names[] = { "End", "\\A", "\\B", "\\b", "\\D", "\\d",
52 "\\S", "\\s", "\\W", "\\w", "Cut", "\\Z", "^", "$", "Any", "chars",
53 "not",
54 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
55 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
56 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
57 "*", "*?", "+", "+?", "?", "??", "{", "{",
58 "class", "Ref",
59 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",
60 "Brazero", "Braminzero", "Bra"
61 };
62 #endif
63
64 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
65 are simple data values; negative values are for special things like \d and so
66 on. Zero means further processing is needed (for things like \x), or the escape
67 is invalid. */
68
69 static short int escapes[] = {
70 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
71 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
72 '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
73 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
74 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
75 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
76 '`', 7, -ESC_b, 0, -ESC_d, 27, '\f', 0, /* ` - g */
77 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
78 0, 0, '\r', -ESC_s, '\t', 0, 0, -ESC_w, /* p - w */
79 0, 0, 0 /* x - z */
80 };
81
82 /* Definition to allow mutual recursion */
83
84 static BOOL compile_regex(int, int *,uschar **,uschar **,char **);
85
86 /* Structure for passing "static" information around between the functions
87 doing the matching, so that they are thread-safe. */
88
89 typedef struct match_data {
90 int errorcode; /* As it says */
91 int *offset_vector; /* Offset vector */
92 int offset_end; /* One past the end */
93 BOOL offset_overflow; /* Set if too many extractions */
94 BOOL caseless; /* Case-independent flag */
95 BOOL runtime_caseless; /* Caseless forced at run time */
96 BOOL multiline; /* Multiline flag */
97 BOOL notbol; /* NOTBOL flag */
98 BOOL noteol; /* NOTEOL flag */
99 BOOL dotall; /* Dot matches any char */
100 BOOL endonly; /* Dollar not before final \n */
101 uschar *start_subject; /* Start of the subject string */
102 uschar *end_subject; /* End of the subject string */
103 jmp_buf fail_env; /* Environment for longjump() break out */
104 uschar *end_match_ptr; /* Subject position at end match */
105 int end_offset_top; /* Highwater mark at end of match */
106 } match_data;
107
108
109
110 /*************************************************
111 * Global variables *
112 *************************************************/
113
114 /* PCRE is thread-clean and doesn't use any global variables in the normal
115 sense. However, it calls memory allocation and free functions via the two
116 indirections below, which are can be changed by the caller, but are shared
117 between all threads. */
118
119 void *(*pcre_malloc)(size_t) = malloc;
120 void (*pcre_free)(void *) = free;
121
122
123
124
125 /*************************************************
126 * Return version string *
127 *************************************************/
128
129 char *
130 pcre_version(void)
131 {
132 return PCRE_VERSION;
133 }
134
135
136
137
138 /*************************************************
139 * Return info about a compiled pattern *
140 *************************************************/
141
142 /* This function picks potentially useful data out of the private
143 structure.
144
145 Arguments:
146 external_re points to compiled code
147 optptr where to pass back the options
148 first_char where to pass back the first character,
149 or -1 if multiline and all branches start ^,
150 or -2 otherwise
151
152 Returns: number of identifying extraction brackets
153 or negative values on error
154 */
155
156 int
157 pcre_info(const pcre *external_re, int *optptr, int *first_char)
158 {
159 real_pcre *re = (real_pcre *)external_re;
160 if (re == NULL) return PCRE_ERROR_NULL;
161 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
162 if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
163 if (first_char != NULL)
164 *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
165 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
166 return re->top_bracket;
167 }
168
169
170
171
172 #ifdef DEBUG
173 /*************************************************
174 * Debugging function to print chars *
175 *************************************************/
176
177 /* Print a sequence of chars in printable format, stopping at the end of the
178 subject if the requested.
179
180 Arguments:
181 p points to characters
182 length number to print
183 is_subject TRUE if printing from within md->start_subject
184 md pointer to matching data block, if is_subject is TRUE
185
186 Returns: nothing
187 */
188
189 static pchars(uschar *p, int length, BOOL is_subject, match_data *md)
190 {
191 int c;
192 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
193 while (length-- > 0)
194 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
195 }
196 #endif
197
198
199
200
201 /*************************************************
202 * Check subpattern for empty operand *
203 *************************************************/
204
205 /* This function checks a bracketed subpattern to see if any of the paths
206 through it could match an empty string. This is used to diagnose an error if
207 such a subpattern is followed by a quantifier with an unlimited upper bound.
208
209 Argument:
210 code points to the opening bracket
211
212 Returns: TRUE or FALSE
213 */
214
215 static BOOL
216 could_be_empty(uschar *code)
217 {
218 do {
219 uschar *cc = code + 3;
220
221 /* Scan along the opcodes for this branch; as soon as we find something
222 that matches a non-empty string, break out and advance to test the next
223 branch. If we get to the end of the branch, return TRUE for the whole
224 sub-expression. */
225
226 for (;;)
227 {
228 /* Test an embedded subpattern; if it could not be empty, break the
229 loop. Otherwise carry on in the branch. */
230
231 if ((int)(*cc) >= OP_BRA || (int)(*cc) == OP_ONCE)
232 {
233 if (!could_be_empty(cc)) break;
234 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
235 cc += 3;
236 }
237
238 else switch (*cc)
239 {
240 /* Reached end of a branch: the subpattern may match the empty string */
241
242 case OP_ALT:
243 case OP_KET:
244 case OP_KETRMAX:
245 case OP_KETRMIN:
246 return TRUE;
247
248 /* Skip over assertive subpatterns */
249
250 case OP_ASSERT:
251 case OP_ASSERT_NOT:
252 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
253 cc += 3;
254 break;
255
256 /* Skip over things that don't match chars */
257
258 case OP_SOD:
259 case OP_EOD:
260 case OP_CIRC:
261 case OP_DOLL:
262 case OP_BRAZERO:
263 case OP_BRAMINZERO:
264 case OP_NOT_WORD_BOUNDARY:
265 case OP_WORD_BOUNDARY:
266 cc++;
267 break;
268
269 /* Skip over simple repeats with zero lower bound */
270
271 case OP_STAR:
272 case OP_MINSTAR:
273 case OP_QUERY:
274 case OP_MINQUERY:
275 case OP_NOTSTAR:
276 case OP_NOTMINSTAR:
277 case OP_NOTQUERY:
278 case OP_NOTMINQUERY:
279 case OP_TYPESTAR:
280 case OP_TYPEMINSTAR:
281 case OP_TYPEQUERY:
282 case OP_TYPEMINQUERY:
283 cc += 2;
284 break;
285
286 /* Skip over UPTOs (lower bound is zero) */
287
288 case OP_UPTO:
289 case OP_MINUPTO:
290 case OP_TYPEUPTO:
291 case OP_TYPEMINUPTO:
292 cc += 4;
293 break;
294
295 /* Check a class or a back reference for a zero minimum */
296
297 case OP_CLASS:
298 case OP_REF:
299 cc += (*cc == OP_REF)? 2 : 33;
300
301 switch (*cc)
302 {
303 case OP_CRSTAR:
304 case OP_CRMINSTAR:
305 case OP_CRQUERY:
306 case OP_CRMINQUERY:
307 cc++;
308 break;
309
310 case OP_CRRANGE:
311 case OP_CRMINRANGE:
312 if ((cc[1] << 8) + cc[2] != 0) goto NEXT_BRANCH;
313 cc += 3;
314 break;
315
316 default:
317 goto NEXT_BRANCH;
318 }
319 break;
320
321 /* Anything else matches at least one character */
322
323 default:
324 goto NEXT_BRANCH;
325 }
326 }
327
328 NEXT_BRANCH:
329 code += (code[1] << 8) + code[2];
330 }
331 while (*code == OP_ALT);
332
333 /* No branches match the empty string */
334
335 return FALSE;
336 }
337
338
339
340 /*************************************************
341 * Handle escapes *
342 *************************************************/
343
344 /* This function is called when a \ has been encountered. It either returns a
345 positive value for a simple escape such as \n, or a negative value which
346 encodes one of the more complicated things such as \d. On entry, ptr is
347 pointing at the \. On exit, it is on the final character of the escape
348 sequence.
349
350 Arguments:
351 ptrptr points to the pattern position pointer
352 errorptr points to the pointer to the error message
353 bracount number of previous extracting brackets
354 options the options bits
355 isclass TRUE if inside a character class
356
357 Returns: zero or positive => a data character
358 negative => a special escape sequence
359 on error, errorptr is set
360 */
361
362 static int
363 check_escape(uschar **ptrptr, char **errorptr, int bracount, int options,
364 BOOL isclass)
365 {
366 uschar *ptr = *ptrptr;
367 int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
368 int i;
369
370 if (c == 0) *errorptr = ERR1;
371
372 /* Digits or letters may have special meaning; all others are literals. */
373
374 else if (c < '0' || c > 'z') {}
375
376 /* Do an initial lookup in a table. A non-zero result is something that can be
377 returned immediately. Otherwise further processing may be required. */
378
379 else if ((i = escapes[c - '0']) != 0) c = i;
380
381 /* Escapes that need further processing, or are illegal. */
382
383 else
384 {
385 uschar *oldptr;
386 switch (c)
387 {
388 /* The handling of escape sequences consisting of a string of digits
389 starting with one that is not zero is not straightforward. By experiment,
390 the way Perl works seems to be as follows:
391
392 Outside a character class, the digits are read as a decimal number. If the
393 number is less than 10, or if there are that many previous extracting
394 left brackets, then it is a back reference. Otherwise, up to three octal
395 digits are read to form an escaped byte. Thus \123 is likely to be octal
396 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
397 value is greater than 377, the least significant 8 bits are taken. Inside a
398 character class, \ followed by a digit is always an octal number. */
399
400 case '1': case '2': case '3': case '4': case '5':
401 case '6': case '7': case '8': case '9':
402
403 if (!isclass)
404 {
405 oldptr = ptr;
406 c -= '0';
407 while ((pcre_ctypes[ptr[1]] & ctype_digit) != 0)
408 c = c * 10 + *(++ptr) - '0';
409 if (c < 10 || c <= bracount)
410 {
411 c = -(ESC_REF + c);
412 break;
413 }
414 ptr = oldptr; /* Put the pointer back and fall through */
415 }
416
417 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
418 generates a binary zero byte and treats the digit as a following literal.
419 Thus we have to pull back the pointer by one. */
420
421 if ((c = *ptr) >= '8')
422 {
423 ptr--;
424 c = 0;
425 break;
426 }
427
428 /* \0 always starts an octal number, but we may drop through to here with a
429 larger first octal digit */
430
431 case '0':
432 c -= '0';
433 while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 &&
434 ptr[1] != '8' && ptr[1] != '9')
435 c = c * 8 + *(++ptr) - '0';
436 break;
437
438 /* Special escapes not starting with a digit are straightforward */
439
440 case 'x':
441 c = 0;
442 while (i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0)
443 {
444 ptr++;
445 c = c * 16 + pcre_lcc[*ptr] -
446 (((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
447 }
448 break;
449
450 case 'c':
451 c = *(++ptr);
452 if (c == 0)
453 {
454 *errorptr = ERR2;
455 return 0;
456 }
457
458 /* A letter is upper-cased; then the 0x40 bit is flipped */
459
460 if (c >= 'a' && c <= 'z') c = pcre_fcc[c];
461 c ^= 0x40;
462 break;
463
464 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
465 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
466 for Perl compatibility, it is a literal. */
467
468 default:
469 if ((options & PCRE_EXTRA) != 0) switch(c)
470 {
471 case 'X':
472 c = -ESC_X; /* This could be a lookup if it ever got into Perl */
473 break;
474
475 default:
476 *errorptr = ERR3;
477 break;
478 }
479 break;
480 }
481 }
482
483 *ptrptr = ptr;
484 return c;
485 }
486
487
488
489 /*************************************************
490 * Check for counted repeat *
491 *************************************************/
492
493 /* This function is called when a '{' is encountered in a place where it might
494 start a quantifier. It looks ahead to see if it really is a quantifier or not.
495 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
496 where the ddds are digits.
497
498 Arguments:
499 p pointer to the first char after '{'
500
501 Returns: TRUE or FALSE
502 */
503
504 static BOOL
505 is_counted_repeat(uschar *p)
506 {
507 if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
508 while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
509 if (*p == '}') return TRUE;
510
511 if (*p++ != ',') return FALSE;
512 if (*p == '}') return TRUE;
513
514 if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
515 while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
516 return (*p == '}');
517 }
518
519
520
521 /*************************************************
522 * Read repeat counts *
523 *************************************************/
524
525 /* Read an item of the form {n,m} and return the values. This is called only
526 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
527 so the syntax is guaranteed to be correct, but we need to check the values.
528
529 Arguments:
530 p pointer to first char after '{'
531 minp pointer to int for min
532 maxp pointer to int for max
533 returned as -1 if no max
534 errorptr points to pointer to error message
535
536 Returns: pointer to '}' on success;
537 current ptr on error, with errorptr set
538 */
539
540 static uschar *
541 read_repeat_counts(uschar *p, int *minp, int *maxp, char **errorptr)
542 {
543 int min = 0;
544 int max = -1;
545
546 while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
547
548 if (*p == '}') max = min; else
549 {
550 if (*(++p) != '}')
551 {
552 max = 0;
553 while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
554 if (max < min)
555 {
556 *errorptr = ERR4;
557 return p;
558 }
559 }
560 }
561
562 /* Do paranoid checks, then fill in the required variables, and pass back the
563 pointer to the terminating '}'. */
564
565 if (min > 65535 || max > 65535)
566 *errorptr = ERR5;
567 else
568 {
569 *minp = min;
570 *maxp = max;
571 }
572 return p;
573 }
574
575
576
577 /*************************************************
578 * Compile one branch *
579 *************************************************/
580
581 /* Scan the pattern, compiling it into the code vector.
582
583 Arguments:
584 options the option bits
585 bracket points to number of brackets used
586 code points to the pointer to the current code point
587 ptrptr points to the current pattern pointer
588 errorptr points to pointer to error message
589
590 Returns: TRUE on success
591 FALSE, with *errorptr set on error
592 */
593
594 static BOOL
595 compile_branch(int options, int *brackets, uschar **codeptr, uschar **ptrptr,
596 char **errorptr)
597 {
598 int repeat_type, op_type;
599 int repeat_min, repeat_max;
600 int bravalue, length;
601 register int c;
602 register uschar *code = *codeptr;
603 uschar *ptr = *ptrptr;
604 uschar *previous = NULL;
605 uschar *oldptr;
606 uschar class[32];
607
608 /* Switch on next character until the end of the branch */
609
610 for (;; ptr++)
611 {
612 BOOL negate_class;
613 int class_charcount;
614 int class_lastchar;
615
616 c = *ptr;
617 if ((options & PCRE_EXTENDED) != 0)
618 {
619 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
620 if (c == '#')
621 {
622 while ((c = *(++ptr)) != 0 && c != '\n');
623 continue;
624 }
625 }
626
627 switch(c)
628 {
629 /* The branch terminates at end of string, |, or ). */
630
631 case 0:
632 case '|':
633 case ')':
634 *codeptr = code;
635 *ptrptr = ptr;
636 return TRUE;
637
638 /* Handle single-character metacharacters */
639
640 case '^':
641 previous = NULL;
642 *code++ = OP_CIRC;
643 break;
644
645 case '$':
646 previous = NULL;
647 *code++ = OP_DOLL;
648 break;
649
650 case '.':
651 previous = code;
652 *code++ = OP_ANY;
653 break;
654
655 /* Character classes. These always build a 32-byte bitmap of the permitted
656 characters, except in the special case where there is only one character.
657 For negated classes, we build the map as usual, then invert it at the end.
658 */
659
660 case '[':
661 previous = code;
662 *code++ = OP_CLASS;
663
664 /* If the first character is '^', set the negation flag */
665
666 if ((c = *(++ptr)) == '^')
667 {
668 negate_class = TRUE;
669 c = *(++ptr);
670 }
671 else negate_class = FALSE;
672
673 /* Keep a count of chars so that we can optimize the case of just a single
674 character. */
675
676 class_charcount = 0;
677 class_lastchar = -1;
678
679 /* Initialize the 32-char bit map to all zeros. We have to build the
680 map in a temporary bit of store, in case the class contains only 1
681 character, because in that case the compiled code doesn't use the
682 bit map. */
683
684 memset(class, 0, 32 * sizeof(uschar));
685
686 /* Process characters until ] is reached. By writing this as a "do" it
687 means that an initial ] is taken as a data character. */
688
689 do
690 {
691 if (c == 0)
692 {
693 *errorptr = ERR6;
694 goto FAILED;
695 }
696
697 /* Backslash may introduce a single character, or it may introduce one
698 of the specials, which just set a flag. Escaped items are checked for
699 validity in the pre-compiling pass. The sequence \b is a special case.
700 Inside a class (and only there) it is treated as backslash. Elsewhere
701 it marks a word boundary. Other escapes have preset maps ready to
702 or into the one we are building. We assume they have more than one
703 character in them, so set class_count bigger than one. */
704
705 if (c == '\\')
706 {
707 c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
708 if (-c == ESC_b) c = '\b';
709 else if (c < 0)
710 {
711 class_charcount = 10;
712 switch (-c)
713 {
714 case ESC_d:
715 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit];
716 continue;
717
718 case ESC_D:
719 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit];
720 continue;
721
722 case ESC_w:
723 for (c = 0; c < 32; c++)
724 class[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
725 continue;
726
727 case ESC_W:
728 for (c = 0; c < 32; c++)
729 class[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
730 continue;
731
732 case ESC_s:
733 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space];
734 continue;
735
736 case ESC_S:
737 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space];
738 continue;
739
740 default:
741 *errorptr = ERR7;
742 goto FAILED;
743 }
744 }
745 /* Fall through if single character */
746 }
747
748 /* A single character may be followed by '-' to form a range. However,
749 Perl does not permit ']' to be the end of the range. A '-' character
750 here is treated as a literal. */
751
752 if (ptr[1] == '-' && ptr[2] != ']')
753 {
754 int d;
755 ptr += 2;
756 d = *ptr;
757
758 if (d == 0)
759 {
760 *errorptr = ERR6;
761 goto FAILED;
762 }
763
764 /* The second part of a range can be a single-character escape, but
765 not any of the other escapes. */
766
767 if (d == '\\')
768 {
769 d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
770 if (d < 0)
771 {
772 if (d == -ESC_b) d = '\b'; else
773 {
774 *errorptr = ERR7;
775 goto FAILED;
776 }
777 }
778 }
779
780 if (d < c)
781 {
782 *errorptr = ERR8;
783 goto FAILED;
784 }
785
786 for (; c <= d; c++)
787 {
788 class[c/8] |= (1 << (c&7));
789 if ((options & PCRE_CASELESS) != 0)
790 {
791 int uc = pcre_fcc[c]; /* flip case */
792 class[uc/8] |= (1 << (uc&7));
793 }
794 class_charcount++; /* in case a one-char range */
795 class_lastchar = c;
796 }
797 continue; /* Go get the next char in the class */
798 }
799
800 /* Handle a lone single character - we can get here for a normal
801 non-escape char, or after \ that introduces a single character. */
802
803 class [c/8] |= (1 << (c&7));
804 if ((options & PCRE_CASELESS) != 0)
805 {
806 c = pcre_fcc[c]; /* flip case */
807 class[c/8] |= (1 << (c&7));
808 }
809 class_charcount++;
810 class_lastchar = c;
811 }
812
813 /* Loop until ']' reached; the check for end of string happens inside the
814 loop. This "while" is the end of the "do" above. */
815
816 while ((c = *(++ptr)) != ']');
817
818 /* If class_charcount is 1 and class_lastchar is not negative, we saw
819 precisely one character. This doesn't need the whole 32-byte bit map.
820 We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
821 it's negative. */
822
823 if (class_charcount == 1 && class_lastchar >= 0)
824 {
825 if (negate_class)
826 {
827 code[-1] = OP_NOT;
828 }
829 else
830 {
831 code[-1] = OP_CHARS;
832 *code++ = 1;
833 }
834 *code++ = class_lastchar;
835 }
836
837 /* Otherwise, negate the 32-byte map if necessary, and copy it into
838 the code vector. */
839
840 else
841 {
842 if (negate_class)
843 for (c = 0; c < 32; c++) code[c] = ~class[c];
844 else
845 memcpy(code, class, 32);
846 code += 32;
847 }
848 break;
849
850 /* Various kinds of repeat */
851
852 case '{':
853 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
854 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
855 if (*errorptr != NULL) goto FAILED;
856 goto REPEAT;
857
858 case '*':
859 repeat_min = 0;
860 repeat_max = -1;
861 goto REPEAT;
862
863 case '+':
864 repeat_min = 1;
865 repeat_max = -1;
866 goto REPEAT;
867
868 case '?':
869 repeat_min = 0;
870 repeat_max = 1;
871
872 REPEAT:
873 if (previous == NULL)
874 {
875 *errorptr = ERR9;
876 goto FAILED;
877 }
878
879 /* If the next character is '?' this is a minimizing repeat. Advance to the
880 next character. */
881
882 if (ptr[1] == '?') { repeat_type = 1; ptr++; } else repeat_type = 0;
883
884 /* If the maximum is zero then the minimum must also be zero; Perl allows
885 this case, so we do too - by simply omitting the item altogether. */
886
887 if (repeat_max == 0) code = previous;
888
889 /* If previous was a string of characters, chop off the last one and use it
890 as the subject of the repeat. If there was only one character, we can
891 abolish the previous item altogether. */
892
893 else if (*previous == OP_CHARS)
894 {
895 int len = previous[1];
896 if (len == 1)
897 {
898 c = previous[2];
899 code = previous;
900 }
901 else
902 {
903 c = previous[len+1];
904 previous[1]--;
905 code--;
906 }
907 op_type = 0; /* Use single-char op codes */
908 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
909 }
910
911 /* If previous was a single negated character ([^a] or similar), we use
912 one of the special opcodes, replacing it. The code is shared with single-
913 character repeats by adding a suitable offset into repeat_type. */
914
915 else if ((int)*previous == OP_NOT)
916 {
917 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
918 c = previous[1];
919 code = previous;
920 goto OUTPUT_SINGLE_REPEAT;
921 }
922
923 /* If previous was a character type match (\d or similar), abolish it and
924 create a suitable repeat item. The code is shared with single-character
925 repeats by adding a suitable offset into repeat_type. */
926
927 else if ((int)*previous < OP_EOD || *previous == OP_ANY)
928 {
929 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
930 c = *previous;
931 code = previous;
932
933 OUTPUT_SINGLE_REPEAT:
934 repeat_type += op_type; /* Combine both values for many cases */
935
936 /* A minimum of zero is handled either as the special case * or ?, or as
937 an UPTO, with the maximum given. */
938
939 if (repeat_min == 0)
940 {
941 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
942 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
943 else
944 {
945 *code++ = OP_UPTO + repeat_type;
946 *code++ = repeat_max >> 8;
947 *code++ = (repeat_max & 255);
948 }
949 }
950
951 /* The case {1,} is handled as the special case + */
952
953 else if (repeat_min == 1 && repeat_max == -1)
954 *code++ = OP_PLUS + repeat_type;
955
956 /* The case {n,n} is just an EXACT, while the general case {n,m} is
957 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
958
959 else
960 {
961 if (repeat_min != 1)
962 {
963 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
964 *code++ = repeat_min >> 8;
965 *code++ = (repeat_min & 255);
966 }
967
968 /* If the mininum is 1 and the previous item was a character string,
969 we either have to put back the item that got cancelled if the string
970 length was 1, or add the character back onto the end of a longer
971 string. For a character type nothing need be done; it will just get put
972 back naturally. */
973
974 else if (*previous == OP_CHARS)
975 {
976 if (code == previous) code += 2; else previous[1]++;
977 }
978
979 /* Insert an UPTO if the max is greater than the min. */
980
981 if (repeat_max != repeat_min)
982 {
983 *code++ = c;
984 repeat_max -= repeat_min;
985 *code++ = OP_UPTO + repeat_type;
986 *code++ = repeat_max >> 8;
987 *code++ = (repeat_max & 255);
988 }
989 }
990
991 /* The character or character type itself comes last in all cases. */
992
993 *code++ = c;
994 }
995
996 /* If previous was a character class or a back reference, we put the repeat
997 stuff after it. */
998
999 else if (*previous == OP_CLASS || *previous == OP_REF)
1000 {
1001 if (repeat_min == 0 && repeat_max == -1)
1002 *code++ = OP_CRSTAR + repeat_type;
1003 else if (repeat_min == 1 && repeat_max == -1)
1004 *code++ = OP_CRPLUS + repeat_type;
1005 else if (repeat_min == 0 && repeat_max == 1)
1006 *code++ = OP_CRQUERY + repeat_type;
1007 else
1008 {
1009 *code++ = OP_CRRANGE + repeat_type;
1010 *code++ = repeat_min >> 8;
1011 *code++ = repeat_min & 255;
1012 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1013 *code++ = repeat_max >> 8;
1014 *code++ = repeat_max & 255;
1015 }
1016 }
1017
1018 /* If previous was a bracket group, we may have to replicate it in certain
1019 cases. If the maximum repeat count is unlimited, check that the bracket
1020 group cannot match the empty string, and diagnose an error if it can. */
1021
1022 else if ((int)*previous >= OP_BRA)
1023 {
1024 int i;
1025 int length = code - previous;
1026
1027 if (repeat_max == -1 && could_be_empty(previous))
1028 {
1029 *errorptr = ERR10;
1030 goto FAILED;
1031 }
1032
1033 /* If the minimum is greater than zero, and the maximum is unlimited or
1034 equal to the minimum, the first copy remains where it is, and is
1035 replicated up to the minimum number of times. This case includes the +
1036 repeat, but of course no replication is needed in that case. */
1037
1038 if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1039 {
1040 for (i = 1; i < repeat_min; i++)
1041 {
1042 memcpy(code, previous, length);
1043 code += length;
1044 }
1045 }
1046
1047 /* If the minimum is zero, stick BRAZERO in front of the first copy.
1048 Then, if there is a fixed upper limit, replicated up to that many times,
1049 sticking BRAZERO in front of all the optional ones. */
1050
1051 else
1052 {
1053 if (repeat_min == 0)
1054 {
1055 memmove(previous+1, previous, length);
1056 code++;
1057 *previous++ = OP_BRAZERO + repeat_type;
1058 }
1059
1060 for (i = 1; i < repeat_min; i++)
1061 {
1062 memcpy(code, previous, length);
1063 code += length;
1064 }
1065
1066 for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1067 {
1068 *code++ = OP_BRAZERO + repeat_type;
1069 memcpy(code, previous, length);
1070 code += length;
1071 }
1072 }
1073
1074 /* If the maximum is unlimited, set a repeater in the final copy. */
1075
1076 if (repeat_max == -1) code[-3] = OP_KETRMAX + repeat_type;
1077 }
1078
1079 /* Else there's some kind of shambles */
1080
1081 else
1082 {
1083 *errorptr = ERR11;
1084 goto FAILED;
1085 }
1086
1087 /* In all case we no longer have a previous item. */
1088
1089 previous = NULL;
1090 break;
1091
1092
1093 /* Start of nested bracket sub-expression, or comment or lookahead.
1094 First deal with special things that can come after a bracket; all are
1095 introduced by ?, and the appearance of any of them means that this is not a
1096 referencing group. They were checked for validity in the first pass over
1097 the string, so we don't have to check for syntax errors here. */
1098
1099 case '(':
1100 previous = code; /* Only real brackets can be repeated */
1101 if (*(++ptr) == '?')
1102 {
1103 bravalue = OP_BRA;
1104
1105 switch (*(++ptr))
1106 {
1107 case '#':
1108 case 'i':
1109 case 'm':
1110 case 's':
1111 case 'x':
1112 ptr++;
1113 while (*ptr != ')') ptr++;
1114 previous = NULL;
1115 continue;
1116
1117 case ':': /* Non-extracting bracket */
1118 ptr++;
1119 break;
1120
1121 case '=': /* Assertions can't be repeated */
1122 bravalue = OP_ASSERT;
1123 ptr++;
1124 previous = NULL;
1125 break;
1126
1127 case '!':
1128 bravalue = OP_ASSERT_NOT;
1129 ptr++;
1130 previous = NULL;
1131 break;
1132
1133 case '>': /* "Match once" brackets */
1134 if ((options & PCRE_EXTRA) != 0) /* Not yet standard */
1135 {
1136 bravalue = OP_ONCE;
1137 ptr++;
1138 previous = NULL;
1139 break;
1140 }
1141 /* Else fall through */
1142
1143 default:
1144 *errorptr = ERR12;
1145 goto FAILED;
1146 }
1147 }
1148
1149 /* Else we have a referencing group */
1150
1151 else
1152 {
1153 if (++(*brackets) > EXTRACT_MAX)
1154 {
1155 *errorptr = ERR13;
1156 goto FAILED;
1157 }
1158 bravalue = OP_BRA + *brackets;
1159 }
1160
1161 /* Process nested bracketed re; at end pointer is on the bracket. We copy
1162 code into a non-register variable in order to be able to pass its address
1163 because some compilers complain otherwise. */
1164
1165 *code = bravalue;
1166 {
1167 uschar *mcode = code;
1168 if (!compile_regex(options, brackets, &mcode, &ptr, errorptr))
1169 goto FAILED;
1170 code = mcode;
1171 }
1172
1173 if (*ptr != ')')
1174 {
1175 *errorptr = ERR14;
1176 goto FAILED;
1177 }
1178 break;
1179
1180 /* Check \ for being a real metacharacter; if not, fall through and handle
1181 it as a data character at the start of a string. Escape items are checked
1182 for validity in the pre-compiling pass. */
1183
1184 case '\\':
1185 oldptr = ptr;
1186 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
1187
1188 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1189 are arranged to be the negation of the corresponding OP_values. For the
1190 back references, the values are ESC_REF plus the reference number. Only
1191 back references and those types that consume a character may be repeated.
1192 We can test for values between ESC_b and ESC_Z for the latter; this may
1193 have to change if any new ones are ever created. */
1194
1195 if (c < 0)
1196 {
1197 if (-c >= ESC_REF)
1198 {
1199 int refnum = -c - ESC_REF;
1200 if (*brackets < refnum)
1201 {
1202 *errorptr = ERR15;
1203 goto FAILED;
1204 }
1205 previous = code;
1206 *code++ = OP_REF;
1207 *code++ = refnum;
1208 }
1209 else
1210 {
1211 previous = (-c > ESC_b && -c < ESC_X)? code : NULL;
1212 *code++ = -c;
1213 }
1214 continue;
1215 }
1216
1217 /* Reset and fall through */
1218
1219 ptr = oldptr;
1220 c = '\\';
1221
1222 /* Handle a run of data characters until a metacharacter is encountered.
1223 The first character is guaranteed not to be whitespace or # when the
1224 extended flag is set. */
1225
1226 NORMAL_CHAR:
1227 default:
1228 previous = code;
1229 *code = OP_CHARS;
1230 code += 2;
1231 length = 0;
1232
1233 do
1234 {
1235 if ((options & PCRE_EXTENDED) != 0)
1236 {
1237 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1238 if (c == '#')
1239 {
1240 while ((c = *(++ptr)) != 0 && c != '\n');
1241 if (c == 0) break;
1242 continue;
1243 }
1244 }
1245
1246 /* Backslash may introduce a data char or a metacharacter. Escaped items
1247 are checked for validity in the pre-compiling pass. Stop the string
1248 before a metaitem. */
1249
1250 if (c == '\\')
1251 {
1252 oldptr = ptr;
1253 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
1254 if (c < 0) { ptr = oldptr; break; }
1255 }
1256
1257 /* Ordinary character or single-char escape */
1258
1259 *code++ = c;
1260 length++;
1261 }
1262
1263 /* This "while" is the end of the "do" above. */
1264
1265 while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
1266
1267 /* Compute the length and set it in the data vector, and advance to
1268 the next state. */
1269
1270 previous[1] = length;
1271 ptr--;
1272 break;
1273 }
1274 } /* end of big loop */
1275
1276 /* Control never reaches here by falling through, only by a goto for all the
1277 error states. Pass back the position in the pattern so that it can be displayed
1278 to the user for diagnosing the error. */
1279
1280 FAILED:
1281 *ptrptr = ptr;
1282 return FALSE;
1283 }
1284
1285
1286
1287
1288 /*************************************************
1289 * Compile sequence of alternatives *
1290 *************************************************/
1291
1292 /* On entry, ptr is pointing past the bracket character, but on return
1293 it points to the closing bracket, or vertical bar, or end of string.
1294 The code variable is pointing at the byte into which the BRA operator has been
1295 stored.
1296
1297 Argument:
1298 options the option bits
1299 brackets -> int containing the number of extracting brackets used
1300 codeptr -> the address of the current code pointer
1301 ptrptr -> the address of the current pattern pointer
1302 errorptr -> pointer to error message
1303
1304 Returns: TRUE on success
1305 */
1306
1307 static BOOL
1308 compile_regex(int options, int *brackets, uschar **codeptr, uschar **ptrptr,
1309 char **errorptr)
1310 {
1311 uschar *ptr = *ptrptr;
1312 uschar *code = *codeptr;
1313 uschar *start_bracket = code;
1314
1315 for (;;)
1316 {
1317 int length;
1318 uschar *last_branch = code;
1319
1320 code += 3;
1321 if (!compile_branch(options, brackets, &code, &ptr, errorptr))
1322 {
1323 *ptrptr = ptr;
1324 return FALSE;
1325 }
1326
1327 /* Fill in the length of the last branch */
1328
1329 length = code - last_branch;
1330 last_branch[1] = length >> 8;
1331 last_branch[2] = length & 255;
1332
1333 /* Reached end of expression, either ')' or end of pattern. Insert a
1334 terminating ket and the length of the whole bracketed item, and return,
1335 leaving the pointer at the terminating char. */
1336
1337 if (*ptr != '|')
1338 {
1339 length = code - start_bracket;
1340 *code++ = OP_KET;
1341 *code++ = length >> 8;
1342 *code++ = length & 255;
1343 *codeptr = code;
1344 *ptrptr = ptr;
1345 return TRUE;
1346 }
1347
1348 /* Another branch follows; insert an "or" node and advance the pointer. */
1349
1350 *code = OP_ALT;
1351 ptr++;
1352 }
1353 /* Control never reaches here */
1354 }
1355
1356
1357
1358 /*************************************************
1359 * Check for anchored expression *
1360 *************************************************/
1361
1362 /* Try to find out if this is an anchored regular expression. Consider each
1363 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
1364 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
1365 it's anchored. However, if this is a multiline pattern, then only OP_SOD
1366 counts, since OP_CIRC can match in the middle.
1367
1368 A branch is also implicitly anchored if it starts with .* because that will try
1369 the rest of the pattern at all possible matching points, so there is no point
1370 trying them again.
1371
1372 Argument: points to start of expression (the bracket)
1373 Returns: TRUE or FALSE
1374 */
1375
1376 static BOOL
1377 is_anchored(register uschar *code, BOOL multiline)
1378 {
1379 do {
1380 int op = (int)code[3];
1381 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE)
1382 { if (!is_anchored(code+3, multiline)) return FALSE; }
1383 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
1384 { if (code[4] != OP_ANY) return FALSE; }
1385 else if (op != OP_SOD && (multiline || op != OP_CIRC)) return FALSE;
1386 code += (code[1] << 8) + code[2];
1387 }
1388 while (*code == OP_ALT);
1389 return TRUE;
1390 }
1391
1392
1393
1394 /*************************************************
1395 * Check for start with \n line expression *
1396 *************************************************/
1397
1398 /* This is called for multiline expressions to try to find out if every branch
1399 starts with ^ so that "first char" processing can be done to speed things up.
1400
1401 Argument: points to start of expression (the bracket)
1402 Returns: TRUE or FALSE
1403 */
1404
1405 static BOOL
1406 is_startline(uschar *code)
1407 {
1408 do {
1409 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
1410 { if (!is_startline(code+3)) return FALSE; }
1411 else if (code[3] != OP_CIRC) return FALSE;
1412 code += (code[1] << 8) + code[2];
1413 }
1414 while (*code == OP_ALT);
1415 return TRUE;
1416 }
1417
1418
1419
1420 /*************************************************
1421 * Check for fixed first char *
1422 *************************************************/
1423
1424 /* Try to find out if there is a fixed first character. This is called for
1425 unanchored expressions, as it speeds up their processing quite considerably.
1426 Consider each alternative branch. If they all start with the same char, or with
1427 a bracket all of whose alternatives start with the same char (recurse ad lib),
1428 then we return that char, otherwise -1.
1429
1430 Argument: points to start of expression (the bracket)
1431 Returns: -1 or the fixed first char
1432 */
1433
1434 static int
1435 find_firstchar(uschar *code)
1436 {
1437 register int c = -1;
1438 do
1439 {
1440 register int charoffset = 4;
1441
1442 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
1443 {
1444 register int d;
1445 if ((d = find_firstchar(code+3)) < 0) return -1;
1446 if (c < 0) c = d; else if (c != d) return -1;
1447 }
1448
1449 else switch(code[3])
1450 {
1451 default:
1452 return -1;
1453
1454 case OP_EXACT: /* Fall through */
1455 charoffset++;
1456
1457 case OP_CHARS: /* Fall through */
1458 charoffset++;
1459
1460 case OP_PLUS:
1461 case OP_MINPLUS:
1462 if (c < 0) c = code[charoffset]; else if (c != code[charoffset]) return -1;
1463 break;
1464 }
1465 code += (code[1] << 8) + code[2];
1466 }
1467 while (*code == OP_ALT);
1468 return c;
1469 }
1470
1471
1472
1473 /*************************************************
1474 * Compile a Regular Expression *
1475 *************************************************/
1476
1477 /* This function takes a string and returns a pointer to a block of store
1478 holding a compiled version of the expression.
1479
1480 Arguments:
1481 pattern the regular expression
1482 options various option bits
1483 errorptr pointer to pointer to error text
1484 erroroffset ptr offset in pattern where error was detected
1485
1486 Returns: pointer to compiled data block, or NULL on error,
1487 with errorptr and erroroffset set
1488 */
1489
1490 pcre *
1491 pcre_compile(const char *pattern, int options, char **errorptr,
1492 int *erroroffset)
1493 {
1494 real_pcre *re;
1495 int spaces = 0;
1496 int length = 3; /* For initial BRA plus length */
1497 int runlength;
1498 int c, size;
1499 int bracount = 0;
1500 int brastack[200];
1501 int brastackptr = 0;
1502 int top_backref = 0;
1503 uschar *code, *ptr;
1504
1505 #ifdef DEBUG
1506 uschar *code_base, *code_end;
1507 #endif
1508
1509 /* We can't pass back an error message if errorptr is NULL; I guess the best we
1510 can do is just return NULL. */
1511
1512 if (errorptr == NULL) return NULL;
1513 *errorptr = NULL;
1514
1515 /* However, we can give a message for this error */
1516
1517 if (erroroffset == NULL)
1518 {
1519 *errorptr = ERR16;
1520 return NULL;
1521 }
1522 *erroroffset = 0;
1523
1524 if ((options & ~PUBLIC_OPTIONS) != 0)
1525 {
1526 *errorptr = ERR17;
1527 return NULL;
1528 }
1529
1530 #ifdef DEBUG
1531 printf("------------------------------------------------------------------\n");
1532 printf("%s\n", pattern);
1533 #endif
1534
1535 /* The first thing to do is to make a pass over the pattern to compute the
1536 amount of store required to hold the compiled code. This does not have to be
1537 perfect as long as errors are overestimates. At the same time we can detect any
1538 internal flag settings. Make an attempt to correct for any counted white space
1539 if an "extended" flag setting appears late in the pattern. We can't be so
1540 clever for #-comments. */
1541
1542 ptr = (uschar *)(pattern - 1);
1543 while ((c = *(++ptr)) != 0)
1544 {
1545 int min, max;
1546 int class_charcount;
1547
1548 if ((pcre_ctypes[c] & ctype_space) != 0)
1549 {
1550 if ((options & PCRE_EXTENDED) != 0) continue;
1551 spaces++;
1552 }
1553
1554 if (c == '#' && (options & PCRE_EXTENDED) != 0)
1555 {
1556 while ((c = *(++ptr)) != 0 && c != '\n');
1557 continue;
1558 }
1559
1560 switch(c)
1561 {
1562 /* A backslashed item may be an escaped "normal" character or a
1563 character type. For a "normal" character, put the pointers and
1564 character back so that tests for whitespace etc. in the input
1565 are done correctly. */
1566
1567 case '\\':
1568 {
1569 uschar *save_ptr = ptr;
1570 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
1571 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1572 if (c >= 0)
1573 {
1574 ptr = save_ptr;
1575 c = '\\';
1576 goto NORMAL_CHAR;
1577 }
1578 }
1579 length++;
1580
1581 /* A back reference needs an additional char, plus either one or 5
1582 bytes for a repeat. We also need to keep the value of the highest
1583 back reference. */
1584
1585 if (c <= -ESC_REF)
1586 {
1587 int refnum = -c - ESC_REF;
1588 if (refnum > top_backref) top_backref = refnum;
1589 length++; /* For single back reference */
1590 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
1591 {
1592 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
1593 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1594 if ((min == 0 && (max == 1 || max == -1)) ||
1595 (min == 1 && max == -1))
1596 length++;
1597 else length += 5;
1598 if (ptr[1] == '?') ptr++;
1599 }
1600 }
1601 continue;
1602
1603 case '^':
1604 case '.':
1605 case '$':
1606 case '*': /* These repeats won't be after brackets; */
1607 case '+': /* those are handled separately */
1608 case '?':
1609 length++;
1610 continue;
1611
1612 /* This covers the cases of repeats after a single char, metachar, class,
1613 or back reference. */
1614
1615 case '{':
1616 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
1617 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
1618 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1619 if ((min == 0 && (max == 1 || max == -1)) ||
1620 (min == 1 && max == -1))
1621 length++;
1622 else
1623 {
1624 length--; /* Uncount the original char or metachar */
1625 if (min == 1) length++; else if (min > 0) length += 4;
1626 if (max > 0) length += 4; else length += 2;
1627 }
1628 if (ptr[1] == '?') ptr++;
1629 continue;
1630
1631 /* An alternation contains an offset to the next branch or ket. */
1632 case '|':
1633 length += 3;
1634 continue;
1635
1636 /* A character class uses 33 characters. Don't worry about character types
1637 that aren't allowed in classes - they'll get picked up during the compile.
1638 A character class that contains only one character uses 2 or 3 bytes,
1639 depending on whether it is negated or not. Notice this where we can. */
1640
1641 case '[':
1642 class_charcount = 0;
1643 if (*(++ptr) == '^') ptr++;
1644 do
1645 {
1646 if (*ptr == '\\')
1647 {
1648 int c = check_escape(&ptr, errorptr, bracount, options, TRUE);
1649 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1650 if (-c == ESC_b) class_charcount++; else class_charcount = 10;
1651 }
1652 else class_charcount++;
1653 ptr++;
1654 }
1655 while (*ptr != 0 && *ptr != ']');
1656
1657 /* Repeats for negated single chars are handled by the general code */
1658
1659 if (class_charcount == 1) length += 3; else
1660 {
1661 length += 33;
1662
1663 /* A repeat needs either 1 or 5 bytes. */
1664
1665 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
1666 {
1667 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
1668 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1669 if ((min == 0 && (max == 1 || max == -1)) ||
1670 (min == 1 && max == -1))
1671 length++;
1672 else length += 5;
1673 if (ptr[1] == '?') ptr++;
1674 }
1675 }
1676 continue;
1677
1678 /* Brackets may be genuine groups or special things */
1679
1680 case '(':
1681
1682 /* Handle special forms of bracket, which all start (? */
1683
1684 if (ptr[1] == '?') switch (c = ptr[2])
1685 {
1686 /* Skip over comments entirely */
1687 case '#':
1688 ptr += 3;
1689 while (*ptr != 0 && *ptr != ')') ptr++;
1690 if (*ptr == 0)
1691 {
1692 *errorptr = ERR18;
1693 goto PCRE_ERROR_RETURN;
1694 }
1695 continue;
1696
1697 /* Non-referencing groups and lookaheads just move the pointer on, and
1698 then behave like a non-special bracket, except that they don't increment
1699 the count of extracting brackets. */
1700
1701 case ':':
1702 case '=':
1703 case '!':
1704 ptr += 2;
1705 break;
1706
1707 /* Ditto for the "once only" bracket, allowed only if the extra bit
1708 is set. */
1709
1710 case '>':
1711 if ((options & PCRE_EXTRA) != 0)
1712 {
1713 ptr += 2;
1714 break;
1715 }
1716 /* Else fall thourh */
1717
1718 /* Else loop setting valid options until ) is met. Anything else is an
1719 error. */
1720
1721 default:
1722 ptr += 2;
1723 for (;; ptr++)
1724 {
1725 if ((c = *ptr) == 'i')
1726 {
1727 options |= PCRE_CASELESS;
1728 continue;
1729 }
1730 else if ((c = *ptr) == 'm')
1731 {
1732 options |= PCRE_MULTILINE;
1733 continue;
1734 }
1735 else if (c == 's')
1736 {
1737 options |= PCRE_DOTALL;
1738 continue;
1739 }
1740 else if (c == 'x')
1741 {
1742 options |= PCRE_EXTENDED;
1743 length -= spaces; /* Already counted spaces */
1744 continue;
1745 }
1746 else if (c == ')') break;
1747
1748 *errorptr = ERR12;
1749 goto PCRE_ERROR_RETURN;
1750 }
1751 continue; /* End of this bracket handling */
1752 }
1753
1754 /* Extracting brackets must be counted so we can process escapes in a
1755 Perlish way. */
1756
1757 else bracount++;
1758
1759 /* Non-special forms of bracket. Save length for computing whole length
1760 at end if there's a repeat that requires duplication of the group. */
1761
1762 if (brastackptr >= sizeof(brastack)/sizeof(int))
1763 {
1764 *errorptr = ERR19;
1765 goto PCRE_ERROR_RETURN;
1766 }
1767
1768 brastack[brastackptr++] = length;
1769 length += 3;
1770 continue;
1771
1772 /* Handle ket. Look for subsequent max/min; for certain sets of values we
1773 have to replicate this bracket up to that many times. */
1774
1775 case ')':
1776 length += 3;
1777 {
1778 int min = 1;
1779 int max = 1;
1780 int duplength = length - brastack[--brastackptr];
1781
1782 /* Leave ptr at the final char; for read_repeat_counts this happens
1783 automatically; for the others we need an increment. */
1784
1785 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
1786 {
1787 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
1788 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1789 }
1790 else if (c == '*') { min = 0; max = -1; ptr++; }
1791 else if (c == '+') { max = -1; ptr++; }
1792 else if (c == '?') { min = 0; ptr++; }
1793
1794 /* If there is a minimum > 1 we have to replicate up to min-1 times; if
1795 there is a limited maximum we have to replicate up to max-1 times and
1796 allow for a BRAZERO item before each optional copy, as we also have to
1797 do before the first copy if the minimum is zero. */
1798
1799 if (min == 0) length++;
1800 else if (min > 1) length += (min - 1) * duplength;
1801 if (max > min) length += (max - min) * (duplength + 1);
1802 }
1803
1804 continue;
1805
1806 /* Non-special character. For a run of such characters the length required
1807 is the number of characters + 2, except that the maximum run length is 255.
1808 We won't get a skipped space or a non-data escape or the start of a #
1809 comment as the first character, so the length can't be zero. */
1810
1811 NORMAL_CHAR:
1812 default:
1813 length += 2;
1814 runlength = 0;
1815 do
1816 {
1817 if ((pcre_ctypes[c] & ctype_space) != 0)
1818 {
1819 if ((options & PCRE_EXTENDED) != 0) continue;
1820 spaces++;
1821 }
1822
1823 if (c == '#' && (options & PCRE_EXTENDED) != 0)
1824 {
1825 while ((c = *(++ptr)) != 0 && c != '\n');
1826 continue;
1827 }
1828
1829 /* Backslash may introduce a data char or a metacharacter; stop the
1830 string before the latter. */
1831
1832 if (c == '\\')
1833 {
1834 uschar *saveptr = ptr;
1835 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
1836 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1837 if (c < 0) { ptr = saveptr; break; }
1838 }
1839
1840 /* Ordinary character or single-char escape */
1841
1842 runlength++;
1843 }
1844
1845 /* This "while" is the end of the "do" above. */
1846
1847 while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
1848
1849 ptr--;
1850 length += runlength;
1851 continue;
1852 }
1853 }
1854
1855 length += 4; /* For final KET and END */
1856
1857 if (length > 65539)
1858 {
1859 *errorptr = ERR20;
1860 return NULL;
1861 }
1862
1863 /* Compute the size of data block needed and get it, either from malloc or
1864 externally provided function. Put in the magic number and the options. */
1865
1866 size = length + offsetof(real_pcre, code);
1867 re = (real_pcre *)(pcre_malloc)(size);
1868
1869 if (re == NULL)
1870 {
1871 *errorptr = ERR21;
1872 return NULL;
1873 }
1874
1875 re->magic_number = MAGIC_NUMBER;
1876 re->options = options;
1877
1878 /* Set up a starting, non-extracting bracket, then compile the expression. On
1879 error, *errorptr will be set non-NULL, so we don't need to look at the result
1880 of the function here. */
1881
1882 ptr = (uschar *)pattern;
1883 code = re->code;
1884 *code = OP_BRA;
1885 bracount = 0;
1886 (void)compile_regex(options, &bracount, &code, &ptr, errorptr);
1887 re->top_bracket = bracount;
1888 re->top_backref = top_backref;
1889
1890 /* If not reached end of pattern on success, there's an excess bracket. */
1891
1892 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
1893
1894 /* Fill in the terminating state and check for disastrous overflow, but
1895 if debugging, leave the test till after things are printed out. */
1896
1897 *code++ = OP_END;
1898
1899 #ifndef DEBUG
1900 if (code - re->code > length) *errorptr = ERR23;
1901 #endif
1902
1903 /* Failed to compile */
1904
1905 if (*errorptr != NULL)
1906 {
1907 (pcre_free)(re);
1908 PCRE_ERROR_RETURN:
1909 *erroroffset = ptr - (uschar *)pattern;
1910 return NULL;
1911 }
1912
1913 /* If the anchored option was not passed, set flag if we can determine that it
1914 is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
1915 we can determine what the first character has to be, because that speeds up
1916 unanchored matches no end. In the case of multiline matches, an alternative is
1917 to set the PCRE_STARTLINE flag if all branches start with ^. */
1918
1919 if ((options & PCRE_ANCHORED) == 0)
1920 {
1921 if (is_anchored(re->code, (options & PCRE_MULTILINE) != 0))
1922 re->options |= PCRE_ANCHORED;
1923 else
1924 {
1925 int c = find_firstchar(re->code);
1926 if (c >= 0)
1927 {
1928 re->first_char = c;
1929 re->options |= PCRE_FIRSTSET;
1930 }
1931 else if (is_startline(re->code))
1932 re->options |= PCRE_STARTLINE;
1933 }
1934 }
1935
1936 /* Print out the compiled data for debugging */
1937
1938 #ifdef DEBUG
1939
1940 printf("Length = %d top_bracket = %d top_backref=%d\n",
1941 length, re->top_bracket, re->top_backref);
1942
1943 if (re->options != 0)
1944 {
1945 printf("%s%s%s%s%s%s%s\n",
1946 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
1947 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
1948 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
1949 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
1950 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
1951 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
1952 ((re->options & PCRE_EXTRA) != 0)? "extra " : "");
1953 }
1954
1955 if ((re->options & PCRE_FIRSTSET) != 0)
1956 {
1957 if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
1958 else printf("First char = \\x%02x\n", re->first_char);
1959 }
1960
1961 code_end = code;
1962 code_base = code = re->code;
1963
1964 while (code < code_end)
1965 {
1966 int charlength;
1967
1968 printf("%3d ", code - code_base);
1969
1970 if (*code >= OP_BRA)
1971 {
1972 printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
1973 code += 2;
1974 }
1975
1976 else switch(*code)
1977 {
1978 case OP_CHARS:
1979 charlength = *(++code);
1980 printf("%3d ", charlength);
1981 while (charlength-- > 0)
1982 if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
1983 break;
1984
1985 case OP_KETRMAX:
1986 case OP_KETRMIN:
1987 case OP_ALT:
1988 case OP_KET:
1989 case OP_ASSERT:
1990 case OP_ASSERT_NOT:
1991 case OP_ONCE:
1992 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
1993 code += 2;
1994 break;
1995
1996 case OP_STAR:
1997 case OP_MINSTAR:
1998 case OP_PLUS:
1999 case OP_MINPLUS:
2000 case OP_QUERY:
2001 case OP_MINQUERY:
2002 case OP_TYPESTAR:
2003 case OP_TYPEMINSTAR:
2004 case OP_TYPEPLUS:
2005 case OP_TYPEMINPLUS:
2006 case OP_TYPEQUERY:
2007 case OP_TYPEMINQUERY:
2008 if (*code >= OP_TYPESTAR)
2009 printf(" %s", OP_names[code[1]]);
2010 else if (isprint(c = code[1])) printf(" %c", c);
2011 else printf(" \\x%02x", c);
2012 printf("%s", OP_names[*code++]);
2013 break;
2014
2015 case OP_EXACT:
2016 case OP_UPTO:
2017 case OP_MINUPTO:
2018 if (isprint(c = code[3])) printf(" %c{", c);
2019 else printf(" \\x%02x{", c);
2020 if (*code != OP_EXACT) printf(",");
2021 printf("%d}", (code[1] << 8) + code[2]);
2022 if (*code == OP_MINUPTO) printf("?");
2023 code += 3;
2024 break;
2025
2026 case OP_TYPEEXACT:
2027 case OP_TYPEUPTO:
2028 case OP_TYPEMINUPTO:
2029 printf(" %s{", OP_names[code[3]]);
2030 if (*code != OP_TYPEEXACT) printf(",");
2031 printf("%d}", (code[1] << 8) + code[2]);
2032 if (*code == OP_TYPEMINUPTO) printf("?");
2033 code += 3;
2034 break;
2035
2036 case OP_NOT:
2037 if (isprint(c = *(++code))) printf(" [^%c]", c);
2038 else printf(" [^\\x%02x]", c);
2039 break;
2040
2041 case OP_NOTSTAR:
2042 case OP_NOTMINSTAR:
2043 case OP_NOTPLUS:
2044 case OP_NOTMINPLUS:
2045 case OP_NOTQUERY:
2046 case OP_NOTMINQUERY:
2047 if (isprint(c = code[1])) printf(" [^%c]", c);
2048 else printf(" [^\\x%02x]", c);
2049 printf("%s", OP_names[*code++]);
2050 break;
2051
2052 case OP_NOTEXACT:
2053 case OP_NOTUPTO:
2054 case OP_NOTMINUPTO:
2055 if (isprint(c = code[3])) printf(" [^%c]{", c);
2056 else printf(" [^\\x%02x]{", c);
2057 if (*code != OP_NOTEXACT) printf(",");
2058 printf("%d}", (code[1] << 8) + code[2]);
2059 if (*code == OP_NOTMINUPTO) printf("?");
2060 code += 3;
2061 break;
2062
2063 case OP_REF:
2064 printf(" \\%d", *(++code));
2065 break;
2066
2067 case OP_CLASS:
2068 {
2069 int i, min, max;
2070
2071 code++;
2072 printf(" [");
2073
2074 for (i = 0; i < 256; i++)
2075 {
2076 if ((code[i/8] & (1 << (i&7))) != 0)
2077 {
2078 int j;
2079 for (j = i+1; j < 256; j++)
2080 if ((code[j/8] & (1 << (j&7))) == 0) break;
2081 if (i == '-' || i == ']') printf("\\");
2082 if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2083 if (--j > i)
2084 {
2085 printf("-");
2086 if (j == '-' || j == ']') printf("\\");
2087 if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2088 }
2089 i = j;
2090 }
2091 }
2092 printf("]");
2093 code += 32;
2094
2095 switch(*code)
2096 {
2097 case OP_CRSTAR:
2098 case OP_CRMINSTAR:
2099 case OP_CRPLUS:
2100 case OP_CRMINPLUS:
2101 case OP_CRQUERY:
2102 case OP_CRMINQUERY:
2103 printf("%s", OP_names[*code]);
2104 break;
2105
2106 case OP_CRRANGE:
2107 case OP_CRMINRANGE:
2108 min = (code[1] << 8) + code[2];
2109 max = (code[3] << 8) + code[4];
2110 if (max == 0) printf("{%d,}", min);
2111 else printf("{%d,%d}", min, max);
2112 if (*code == OP_CRMINRANGE) printf("?");
2113 code += 4;
2114 break;
2115
2116 default:
2117 code--;
2118 }
2119 }
2120 break;
2121
2122 /* Anything else is just a one-node item */
2123
2124 default:
2125 printf(" %s", OP_names[*code]);
2126 break;
2127 }
2128
2129 code++;
2130 printf("\n");
2131 }
2132 printf("------------------------------------------------------------------\n");
2133
2134 /* This check is done here in the debugging case so that the code that
2135 was compiled can be seen. */
2136
2137 if (code - re->code > length)
2138 {
2139 *errorptr = ERR23;
2140 (pcre_free)(re);
2141 *erroroffset = ptr - (uschar *)pattern;
2142 return NULL;
2143 }
2144 #endif
2145
2146 return (pcre *)re;
2147 }
2148
2149
2150
2151 /*************************************************
2152 * Match a character type *
2153 *************************************************/
2154
2155 /* Not used in all the places it might be as it's sometimes faster
2156 to put the code inline.
2157
2158 Arguments:
2159 type the character type
2160 c the character
2161 dotall the dotall flag
2162
2163 Returns: TRUE if character is of the type
2164 */
2165
2166 static BOOL
2167 match_type(int type, int c, BOOL dotall)
2168 {
2169
2170 #ifdef DEBUG
2171 if (isprint(c)) printf("matching subject %c against ", c);
2172 else printf("matching subject \\x%02x against ", c);
2173 printf("%s\n", OP_names[type]);
2174 #endif
2175
2176 switch(type)
2177 {
2178 case OP_ANY: return dotall || c != '\n';
2179 case OP_NOT_DIGIT: return (pcre_ctypes[c] & ctype_digit) == 0;
2180 case OP_DIGIT: return (pcre_ctypes[c] & ctype_digit) != 0;
2181 case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;
2182 case OP_WHITESPACE: return (pcre_ctypes[c] & ctype_space) != 0;
2183 case OP_NOT_WORDCHAR: return (pcre_ctypes[c] & ctype_word) == 0;
2184 case OP_WORDCHAR: return (pcre_ctypes[c] & ctype_word) != 0;
2185 }
2186 return FALSE;
2187 }
2188
2189
2190
2191 /*************************************************
2192 * Match a back-reference *
2193 *************************************************/
2194
2195 /* If a back reference hasn't been set, the match fails.
2196
2197 Arguments:
2198 number reference number
2199 eptr points into the subject
2200 length length to be matched
2201 md points to match data block
2202
2203 Returns: TRUE if matched
2204 */
2205
2206 static BOOL
2207 match_ref(int number, register uschar *eptr, int length, match_data *md)
2208 {
2209 uschar *p = md->start_subject + md->offset_vector[number];
2210
2211 #ifdef DEBUG
2212 if (eptr >= md->end_subject)
2213 printf("matching subject <null>");
2214 else
2215 {
2216 printf("matching subject ");
2217 pchars(eptr, length, TRUE, md);
2218 }
2219 printf(" against backref ");
2220 pchars(p, length, FALSE, md);
2221 printf("\n");
2222 #endif
2223
2224 /* Always fail if not enough characters left */
2225
2226 if (length > md->end_subject - p) return FALSE;
2227
2228 /* Separate the caselesss case for speed */
2229
2230 if (md->caseless)
2231 { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; }
2232 else
2233 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
2234
2235 return TRUE;
2236 }
2237
2238
2239
2240 /*************************************************
2241 * Match from current position *
2242 *************************************************/
2243
2244 /* On entry ecode points to the first opcode, and eptr to the first character.
2245
2246 Arguments:
2247 eptr pointer in subject
2248 ecode position in code
2249 offset_top current top pointer
2250 md pointer to "static" info for the match
2251
2252 Returns: TRUE if matched
2253 */
2254
2255 static BOOL
2256 match(register uschar *eptr, register uschar *ecode, int offset_top,
2257 match_data *md)
2258 {
2259 for (;;)
2260 {
2261 int min, max, ctype;
2262 register int i;
2263 register int c;
2264 BOOL minimize;
2265
2266 /* Opening bracket. Check the alternative branches in turn, failing if none
2267 match. We have to set the start offset if required and there is space
2268 in the offset vector so that it is available for subsequent back references
2269 if the bracket matches. However, if the bracket fails, we must put back the
2270 previous value of both offsets in case they were set by a previous copy of
2271 the same bracket. Don't worry about setting the flag for the error case here;
2272 that is handled in the code for KET. */
2273
2274 if ((int)*ecode >= OP_BRA)
2275 {
2276 int number = (*ecode - OP_BRA) << 1;
2277 int save_offset1, save_offset2;
2278
2279 #ifdef DEBUG
2280 printf("start bracket %d\n", number/2);
2281 #endif
2282
2283 if (number > 0 && number < md->offset_end)
2284 {
2285 save_offset1 = md->offset_vector[number];
2286 save_offset2 = md->offset_vector[number+1];
2287 md->offset_vector[number] = eptr - md->start_subject;
2288
2289 #ifdef DEBUG
2290 printf("saving %d %d\n", save_offset1, save_offset2);
2291 #endif
2292 }
2293
2294 /* Recurse for all the alternatives. */
2295
2296 do
2297 {
2298 if (match(eptr, ecode+3, offset_top, md)) return TRUE;
2299 ecode += (ecode[1] << 8) + ecode[2];
2300 }
2301 while (*ecode == OP_ALT);
2302
2303 #ifdef DEBUG
2304 printf("bracket %d failed\n", number/2);
2305 #endif
2306
2307 if (number > 0 && number < md->offset_end)
2308 {
2309 md->offset_vector[number] = save_offset1;
2310 md->offset_vector[number+1] = save_offset2;
2311 }
2312
2313 return FALSE;
2314 }
2315
2316 /* Other types of node can be handled by a switch */
2317
2318 switch(*ecode)
2319 {
2320 case OP_END:
2321 md->end_match_ptr = eptr; /* Record where we ended */
2322 md->end_offset_top = offset_top; /* and how many extracts were taken */
2323 return TRUE;
2324
2325 /* The equivalent of Prolog's "cut" - if the rest doesn't match, the
2326 whole thing doesn't match, so we have to get out via a longjmp(). */
2327
2328 case OP_CUT:
2329 if (match(eptr, ecode+1, offset_top, md)) return TRUE;
2330 longjmp(md->fail_env, 1);
2331
2332 /* Assertion brackets. Check the alternative branches in turn - the
2333 matching won't pass the KET for an assertion. If any one branch matches,
2334 the assertion is true. */
2335
2336 case OP_ASSERT:
2337 do
2338 {
2339 if (match(eptr, ecode+3, offset_top, md)) break;
2340 ecode += (ecode[1] << 8) + ecode[2];
2341 }
2342 while (*ecode == OP_ALT);
2343 if (*ecode == OP_KET) return FALSE;
2344
2345 /* Continue from after the assertion, updating the offsets high water
2346 mark, since extracts may have been taken during the assertion. */
2347
2348 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2349 ecode += 3;
2350 offset_top = md->end_offset_top;
2351 continue;
2352
2353 /* Negative assertion: all branches must fail to match */
2354
2355 case OP_ASSERT_NOT:
2356 do
2357 {
2358 if (match(eptr, ecode+3, offset_top, md)) return FALSE;
2359 ecode += (ecode[1] << 8) + ecode[2];
2360 }
2361 while (*ecode == OP_ALT);
2362 ecode += 3;
2363 continue;
2364
2365 /* "Once" brackets are like assertion brackets except that after a match,
2366 the point in the subject string is not moved back. Thus there can never be
2367 a move back into the brackets. Check the alternative branches in turn - the
2368 matching won't pass the KET for this kind of subpattern. If any one branch
2369 matches, we carry on, leaving the subject pointer. */
2370
2371 case OP_ONCE:
2372 do
2373 {
2374 if (match(eptr, ecode+3, offset_top, md)) break;
2375 ecode += (ecode[1] << 8) + ecode[2];
2376 }
2377 while (*ecode == OP_ALT);
2378 if (*ecode == OP_KET) return FALSE;
2379
2380 /* Continue as from after the assertion, updating the offsets high water
2381 mark, since extracts may have been taken. */
2382
2383 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2384 ecode += 3;
2385 offset_top = md->end_offset_top;
2386 eptr = md->end_match_ptr;
2387 continue;
2388
2389 /* An alternation is the end of a branch; scan along to find the end of the
2390 bracketed group and go to there. */
2391
2392 case OP_ALT:
2393 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2394 break;
2395
2396 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
2397 that it may occur zero times. It may repeat infinitely, or not at all -
2398 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
2399 repeat limits are compiled as a number of copies, with the optional ones
2400 preceded by BRAZERO or BRAMINZERO. */
2401
2402 case OP_BRAZERO:
2403 {
2404 uschar *next = ecode+1;
2405 if (match(eptr, next, offset_top, md)) return TRUE;
2406 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
2407 ecode = next + 3;
2408 }
2409 break;
2410
2411 case OP_BRAMINZERO:
2412 {
2413 uschar *next = ecode+1;
2414 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
2415 if (match(eptr, next+3, offset_top, md)) return TRUE;
2416 ecode++;
2417 }
2418 break;;
2419
2420 /* End of a group, repeated or non-repeating. If we are at the end of
2421 an assertion "group", stop matching and return TRUE, but record the
2422 current high water mark for use by positive assertions. */
2423
2424 case OP_KET:
2425 case OP_KETRMIN:
2426 case OP_KETRMAX:
2427 {
2428 int number;
2429 uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
2430
2431 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || *prev == OP_ONCE)
2432 {
2433 md->end_match_ptr = eptr; /* For ONCE */
2434 md->end_offset_top = offset_top;
2435 return TRUE;
2436 }
2437
2438 /* In all other cases we have to check the group number back at the
2439 start and if necessary complete handling an extraction by setting the
2440 final offset and bumping the high water mark. */
2441
2442 number = (*prev - OP_BRA) << 1;
2443
2444 #ifdef DEBUG
2445 printf("end bracket %d\n", number/2);
2446 #endif
2447
2448 if (number > 0)
2449 {
2450 if (number >= md->offset_end) md->offset_overflow = TRUE; else
2451 {
2452 md->offset_vector[number+1] = eptr - md->start_subject;
2453 if (offset_top <= number) offset_top = number + 2;
2454 }
2455 }
2456
2457 /* For a non-repeating ket, just advance to the next node and continue at
2458 this level. */
2459
2460 if (*ecode == OP_KET)
2461 {
2462 ecode += 3;
2463 break;
2464 }
2465
2466 /* The repeating kets try the rest of the pattern or restart from the
2467 preceding bracket, in the appropriate order. */
2468
2469 if (*ecode == OP_KETRMIN)
2470 {
2471 if (match(eptr, ecode+3, offset_top, md) ||
2472 match(eptr, prev, offset_top, md)) return TRUE;
2473 }
2474 else /* OP_KETRMAX */
2475 {
2476 if (match(eptr, prev, offset_top, md) ||
2477 match(eptr, ecode+3, offset_top, md)) return TRUE;
2478 }
2479 }
2480 return FALSE;
2481
2482 /* Start of subject unless notbol, or after internal newline if multiline */
2483
2484 case OP_CIRC:
2485 if (md->notbol && eptr == md->start_subject) return FALSE;
2486 if (md->multiline)
2487 {
2488 if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;
2489 ecode++;
2490 break;
2491 }
2492 /* ... else fall through */
2493
2494 /* Start of subject assertion */
2495
2496 case OP_SOD:
2497 if (eptr != md->start_subject) return FALSE;
2498 ecode++;
2499 break;
2500
2501 /* Assert before internal newline if multiline, or before
2502 a terminating newline unless endonly is set, else end of subject unless
2503 noteol is set. */
2504
2505 case OP_DOLL:
2506 if (md->noteol && eptr >= md->end_subject) return FALSE;
2507 if (md->multiline)
2508 {
2509 if (eptr < md->end_subject && *eptr != '\n') return FALSE;
2510 ecode++;
2511 break;
2512 }
2513 else if (!md->endonly)
2514 {
2515 if (eptr < md->end_subject - 1 ||
2516 (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
2517 ecode++;
2518 break;
2519 }
2520 /* ... else fall through */
2521
2522 /* End of subject assertion */
2523
2524 case OP_EOD:
2525 if (eptr < md->end_subject) return FALSE;
2526 ecode++;
2527 break;
2528
2529 /* Word boundary assertions */
2530
2531 case OP_NOT_WORD_BOUNDARY:
2532 case OP_WORD_BOUNDARY:
2533 {
2534 BOOL prev_is_word = (eptr != md->start_subject) &&
2535 ((pcre_ctypes[eptr[-1]] & ctype_word) != 0);
2536 BOOL cur_is_word = (eptr < md->end_subject) &&
2537 ((pcre_ctypes[*eptr] & ctype_word) != 0);
2538 if ((*ecode++ == OP_WORD_BOUNDARY)?
2539 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2540 return FALSE;
2541 }
2542 break;
2543
2544 /* Match a single character type; inline for speed */
2545
2546 case OP_ANY:
2547 if (!md->dotall && eptr < md->end_subject && *eptr == '\n') return FALSE;
2548 if (eptr++ >= md->end_subject) return FALSE;
2549 ecode++;
2550 break;
2551
2552 case OP_NOT_DIGIT:
2553 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) != 0)
2554 return FALSE;
2555 ecode++;
2556 break;
2557
2558 case OP_DIGIT:
2559 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) == 0)
2560 return FALSE;
2561 ecode++;
2562 break;
2563
2564 case OP_NOT_WHITESPACE:
2565 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) != 0)
2566 return FALSE;
2567 ecode++;
2568 break;
2569
2570 case OP_WHITESPACE:
2571 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) == 0)
2572 return FALSE;
2573 ecode++;
2574 break;
2575
2576 case OP_NOT_WORDCHAR:
2577 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) != 0)
2578 return FALSE;
2579 ecode++;
2580 break;
2581
2582 case OP_WORDCHAR:
2583 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) == 0)
2584 return FALSE;
2585 ecode++;
2586 break;
2587
2588 /* Match a back reference, possibly repeatedly. Look past the end of the
2589 item to see if there is repeat information following. The code is similar
2590 to that for character classes, but repeated for efficiency. Then obey
2591 similar code to character type repeats - written out again for speed.
2592 However, if the referenced string is the empty string, always treat
2593 it as matched, any number of times (otherwise there could be infinite
2594 loops). */
2595
2596 case OP_REF:
2597 {
2598 int length;
2599 int number = ecode[1] << 1; /* Doubled reference number */
2600 ecode += 2; /* Advance past the item */
2601
2602 if (number >= offset_top || md->offset_vector[number] < 0)
2603 {
2604 md->errorcode = PCRE_ERROR_BADREF;
2605 return FALSE;
2606 }
2607
2608 length = md->offset_vector[number+1] - md->offset_vector[number];
2609
2610 switch (*ecode)
2611 {
2612 case OP_CRSTAR:
2613 case OP_CRMINSTAR:
2614 case OP_CRPLUS:
2615 case OP_CRMINPLUS:
2616 case OP_CRQUERY:
2617 case OP_CRMINQUERY:
2618 c = *ecode++ - OP_CRSTAR;
2619 minimize = (c & 1) != 0;
2620 min = rep_min[c]; /* Pick up values from tables; */
2621 max = rep_max[c]; /* zero for max => infinity */
2622 if (max == 0) max = INT_MAX;
2623 break;
2624
2625 case OP_CRRANGE:
2626 case OP_CRMINRANGE:
2627 minimize = (*ecode == OP_CRMINRANGE);
2628 min = (ecode[1] << 8) + ecode[2];
2629 max = (ecode[3] << 8) + ecode[4];
2630 if (max == 0) max = INT_MAX;
2631 ecode += 5;
2632 break;
2633
2634 default: /* No repeat follows */
2635 if (!match_ref(number, eptr, length, md)) return FALSE;
2636 eptr += length;
2637 continue; /* With the main loop */
2638 }
2639
2640 /* If the length of the reference is zero, just continue with the
2641 main loop. */
2642
2643 if (length == 0) continue;
2644
2645 /* First, ensure the minimum number of matches are present. We get back
2646 the length of the reference string explicitly rather than passing the
2647 address of eptr, so that eptr can be a register variable. */
2648
2649 for (i = 1; i <= min; i++)
2650 {
2651 if (!match_ref(number, eptr, length, md)) return FALSE;
2652 eptr += length;
2653 }
2654
2655 /* If min = max, continue at the same level without recursion.
2656 They are not both allowed to be zero. */
2657
2658 if (min == max) continue;
2659
2660 /* If minimizing, keep trying and advancing the pointer */
2661
2662 if (minimize)
2663 {
2664 for (i = min;; i++)
2665 {
2666 if (match(eptr, ecode, offset_top, md)) return TRUE;
2667 if (i >= max || !match_ref(number, eptr, length, md))
2668 return FALSE;
2669 eptr += length;
2670 }
2671 /* Control never gets here */
2672 }
2673
2674 /* If maximizing, find the longest string and work backwards */
2675
2676 else
2677 {
2678 uschar *pp = eptr;
2679 for (i = min; i < max; i++)
2680 {
2681 if (!match_ref(number, eptr, length, md)) break;
2682 eptr += length;
2683 }
2684 while (eptr >= pp)
2685 {
2686 if (match(eptr, ecode, offset_top, md)) return TRUE;
2687 eptr -= length;
2688 }
2689 return FALSE;
2690 }
2691 }
2692 /* Control never gets here */
2693
2694 /* Match a character class, possibly repeatedly. Look past the end of the
2695 item to see if there is repeat information following. Then obey similar
2696 code to character type repeats - written out again for speed. If caseless
2697 matching was set at runtime but not at compile time, we have to check both
2698 versions of a character. */
2699
2700 case OP_CLASS:
2701 {
2702 uschar *data = ecode + 1; /* Save for matching */
2703 ecode += 33; /* Advance past the item */
2704
2705 switch (*ecode)
2706 {
2707 case OP_CRSTAR:
2708 case OP_CRMINSTAR:
2709 case OP_CRPLUS:
2710 case OP_CRMINPLUS:
2711 case OP_CRQUERY:
2712 case OP_CRMINQUERY:
2713 c = *ecode++ - OP_CRSTAR;
2714 minimize = (c & 1) != 0;
2715 min = rep_min[c]; /* Pick up values from tables; */
2716 max = rep_max[c]; /* zero for max => infinity */
2717 if (max == 0) max = INT_MAX;
2718 break;
2719
2720 case OP_CRRANGE:
2721 case OP_CRMINRANGE:
2722 minimize = (*ecode == OP_CRMINRANGE);
2723 min = (ecode[1] << 8) + ecode[2];
2724 max = (ecode[3] << 8) + ecode[4];
2725 if (max == 0) max = INT_MAX;
2726 ecode += 5;
2727 break;
2728
2729 default: /* No repeat follows */
2730 if (eptr >= md->end_subject) return FALSE;
2731 c = *eptr++;
2732 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
2733 if (md->runtime_caseless)
2734 {
2735 c = pcre_fcc[c];
2736 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
2737 }
2738 return FALSE;
2739 }
2740
2741 /* First, ensure the minimum number of matches are present. */
2742
2743 for (i = 1; i <= min; i++)
2744 {
2745 if (eptr >= md->end_subject) return FALSE;
2746 c = *eptr++;
2747 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2748 if (md->runtime_caseless)
2749 {
2750 c = pcre_fcc[c];
2751 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2752 }
2753 return FALSE;
2754 }
2755
2756 /* If max == min we can continue with the main loop without the
2757 need to recurse. */
2758
2759 if (min == max) continue;
2760
2761 /* If minimizing, keep testing the rest of the expression and advancing
2762 the pointer while it matches the class. */
2763
2764 if (minimize)
2765 {
2766 for (i = min;; i++)
2767 {
2768 if (match(eptr, ecode, offset_top, md)) return TRUE;
2769 if (i >= max || eptr >= md->end_subject) return FALSE;
2770 c = *eptr++;
2771 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2772 if (md->runtime_caseless)
2773 {
2774 c = pcre_fcc[c];
2775 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2776 }
2777 return FALSE;
2778 }
2779 /* Control never gets here */
2780 }
2781
2782 /* If maximizing, find the longest possible run, then work backwards. */
2783
2784 else
2785 {
2786 uschar *pp = eptr;
2787 for (i = min; i < max; eptr++, i++)
2788 {
2789 if (eptr >= md->end_subject) break;
2790 c = *eptr;
2791 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2792 if (md->runtime_caseless)
2793 {
2794 c = pcre_fcc[c];
2795 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2796 }
2797 break;
2798 }
2799
2800 while (eptr >= pp)
2801 if (match(eptr--, ecode, offset_top, md)) return TRUE;
2802 return FALSE;
2803 }
2804 }
2805 /* Control never gets here */
2806
2807 /* Match a run of characters */
2808
2809 case OP_CHARS:
2810 {
2811 register int length = ecode[1];
2812 ecode += 2;
2813
2814 #ifdef DEBUG
2815 if (eptr >= md->end_subject)
2816 printf("matching subject <null> against pattern ");
2817 else
2818 {
2819 printf("matching subject ");
2820 pchars(eptr, length, TRUE, md);
2821 printf(" against pattern ");
2822 }
2823 pchars(ecode, length, FALSE, md);
2824 printf("\n");
2825 #endif
2826
2827 if (length > md->end_subject - eptr) return FALSE;
2828 if (md->caseless)
2829 {
2830 while (length-- > 0) if (pcre_lcc[*ecode++] != pcre_lcc[*eptr++]) return FALSE;
2831 }
2832 else
2833 {
2834 while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
2835 }
2836 }
2837 break;
2838
2839 /* Match a single character repeatedly; different opcodes share code. */
2840
2841 case OP_EXACT:
2842 min = max = (ecode[1] << 8) + ecode[2];
2843 ecode += 3;
2844 goto REPEATCHAR;
2845
2846 case OP_UPTO:
2847 case OP_MINUPTO:
2848 min = 0;
2849 max = (ecode[1] << 8) + ecode[2];
2850 minimize = *ecode == OP_MINUPTO;
2851 ecode += 3;
2852 goto REPEATCHAR;
2853
2854 case OP_STAR:
2855 case OP_MINSTAR:
2856 case OP_PLUS:
2857 case OP_MINPLUS:
2858 case OP_QUERY:
2859 case OP_MINQUERY:
2860 c = *ecode++ - OP_STAR;
2861 minimize = (c & 1) != 0;
2862 min = rep_min[c]; /* Pick up values from tables; */
2863 max = rep_max[c]; /* zero for max => infinity */
2864 if (max == 0) max = INT_MAX;
2865
2866 /* Common code for all repeated single-character matches. We can give
2867 up quickly if there are fewer than the minimum number of characters left in
2868 the subject. */
2869
2870 REPEATCHAR:
2871 if (min > md->end_subject - eptr) return FALSE;
2872 c = *ecode++;
2873
2874 /* The code is duplicated for the caseless and caseful cases, for speed,
2875 since matching characters is likely to be quite common. First, ensure the
2876 minimum number of matches are present. If min = max, continue at the same
2877 level without recursing. Otherwise, if minimizing, keep trying the rest of
2878 the expression and advancing one matching character if failing, up to the
2879 maximum. Alternatively, if maximizing, find the maximum number of
2880 characters and work backwards. */
2881
2882 #ifdef DEBUG
2883 printf("matching %c{%d,%d} against subject %.*s\n", c, min, max,
2884 max, eptr);
2885 #endif
2886
2887 if (md->caseless)
2888 {
2889 c = pcre_lcc[c];
2890 for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) return FALSE;
2891 if (min == max) continue;
2892 if (minimize)
2893 {
2894 for (i = min;; i++)
2895 {
2896 if (match(eptr, ecode, offset_top, md)) return TRUE;
2897 if (i >= max || eptr >= md->end_subject || c != pcre_lcc[*eptr++])
2898 return FALSE;
2899 }
2900 /* Control never gets here */
2901 }
2902 else
2903 {
2904 uschar *pp = eptr;
2905 for (i = min; i < max; i++)
2906 {
2907 if (eptr >= md->end_subject || c != pcre_lcc[*eptr]) break;
2908 eptr++;
2909 }
2910 while (eptr >= pp)
2911 if (match(eptr--, ecode, offset_top, md)) return TRUE;
2912 return FALSE;
2913 }
2914 /* Control never gets here */
2915 }
2916
2917 /* Caseful comparisons */
2918
2919 else
2920 {
2921 for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
2922 if (min == max) continue;
2923 if (minimize)
2924 {
2925 for (i = min;; i++)
2926 {
2927 if (match(eptr, ecode, offset_top, md)) return TRUE;
2928 if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
2929 }
2930 /* Control never gets here */
2931 }
2932 else
2933 {
2934 uschar *pp = eptr;
2935 for (i = min; i < max; i++)
2936 {
2937 if (eptr >= md->end_subject || c != *eptr) break;
2938 eptr++;
2939 }
2940 while (eptr >= pp)
2941 if (match(eptr--, ecode, offset_top, md)) return TRUE;
2942 return FALSE;
2943 }
2944 }
2945 /* Control never gets here */
2946
2947 /* Match a negated single character */
2948
2949 case OP_NOT:
2950 if (eptr > md->end_subject) return FALSE;
2951 ecode++;
2952 if (md->caseless)
2953 {
2954 if (pcre_lcc[*ecode++] == pcre_lcc[*eptr++]) return FALSE;
2955 }
2956 else
2957 {
2958 if (*ecode++ == *eptr++) return FALSE;
2959 }
2960 break;
2961
2962 /* Match a negated single character repeatedly. This is almost a repeat of
2963 the code for a repeated single character, but I haven't found a nice way of
2964 commoning these up that doesn't require a test of the positive/negative
2965 option for each character match. Maybe that wouldn't add very much to the
2966 time taken, but character matching *is* what this is all about... */
2967
2968 case OP_NOTEXACT:
2969 min = max = (ecode[1] << 8) + ecode[2];
2970 ecode += 3;
2971 goto REPEATNOTCHAR;
2972
2973 case OP_NOTUPTO:
2974 case OP_NOTMINUPTO:
2975 min = 0;
2976 max = (ecode[1] << 8) + ecode[2];
2977 minimize = *ecode == OP_NOTMINUPTO;
2978 ecode += 3;
2979 goto REPEATNOTCHAR;
2980
2981 case OP_NOTSTAR:
2982 case OP_NOTMINSTAR:
2983 case OP_NOTPLUS:
2984 case OP_NOTMINPLUS:
2985 case OP_NOTQUERY:
2986 case OP_NOTMINQUERY:
2987 c = *ecode++ - OP_NOTSTAR;
2988 minimize = (c & 1) != 0;
2989 min = rep_min[c]; /* Pick up values from tables; */
2990 max = rep_max[c]; /* zero for max => infinity */
2991 if (max == 0) max = INT_MAX;
2992
2993 /* Common code for all repeated single-character matches. We can give
2994 up quickly if there are fewer than the minimum number of characters left in
2995 the subject. */
2996
2997 REPEATNOTCHAR:
2998 if (min > md->end_subject - eptr) return FALSE;
2999 c = *ecode++;
3000
3001 /* The code is duplicated for the caseless and caseful cases, for speed,
3002 since matching characters is likely to be quite common. First, ensure the
3003 minimum number of matches are present. If min = max, continue at the same
3004 level without recursing. Otherwise, if minimizing, keep trying the rest of
3005 the expression and advancing one matching character if failing, up to the
3006 maximum. Alternatively, if maximizing, find the maximum number of
3007 characters and work backwards. */
3008
3009 #ifdef DEBUG
3010 printf("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
3011 max, eptr);
3012 #endif
3013
3014 if (md->caseless)
3015 {
3016 c = pcre_lcc[c];
3017 for (i = 1; i <= min; i++) if (c == pcre_lcc[*eptr++]) return FALSE;
3018 if (min == max) continue;
3019 if (minimize)
3020 {
3021 for (i = min;; i++)
3022 {
3023 if (match(eptr, ecode, offset_top, md)) return TRUE;
3024 if (i >= max || eptr >= md->end_subject || c == pcre_lcc[*eptr++])
3025 return FALSE;
3026 }
3027 /* Control never gets here */
3028 }
3029 else
3030 {
3031 uschar *pp = eptr;
3032 for (i = min; i < max; i++)
3033 {
3034 if (eptr >= md->end_subject || c == pcre_lcc[*eptr]) break;
3035 eptr++;
3036 }
3037 while (eptr >= pp)
3038 if (match(eptr--, ecode, offset_top, md)) return TRUE;
3039 return FALSE;
3040 }
3041 /* Control never gets here */
3042 }
3043
3044 /* Caseful comparisons */
3045
3046 else
3047 {
3048 for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
3049 if (min == max) continue;
3050 if (minimize)
3051 {
3052 for (i = min;; i++)
3053 {
3054 if (match(eptr, ecode, offset_top, md)) return TRUE;
3055 if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
3056 }
3057 /* Control never gets here */
3058 }
3059 else
3060 {
3061 uschar *pp = eptr;
3062 for (i = min; i < max; i++)
3063 {
3064 if (eptr >= md->end_subject || c == *eptr) break;
3065 eptr++;
3066 }
3067 while (eptr >= pp)
3068 if (match(eptr--, ecode, offset_top, md)) return TRUE;
3069 return FALSE;
3070 }
3071 }
3072 /* Control never gets here */
3073
3074 /* Match a single character type repeatedly; several different opcodes
3075 share code. This is very similar to the code for single characters, but we
3076 repeat it in the interests of efficiency. */
3077
3078 case OP_TYPEEXACT:
3079 min = max = (ecode[1] << 8) + ecode[2];
3080 minimize = TRUE;
3081 ecode += 3;
3082 goto REPEATTYPE;
3083
3084 case OP_TYPEUPTO:
3085 case OP_TYPEMINUPTO:
3086 min = 0;
3087 max = (ecode[1] << 8) + ecode[2];
3088 minimize = *ecode == OP_TYPEMINUPTO;
3089 ecode += 3;
3090 goto REPEATTYPE;
3091
3092 case OP_TYPESTAR:
3093 case OP_TYPEMINSTAR:
3094 case OP_TYPEPLUS:
3095 case OP_TYPEMINPLUS:
3096 case OP_TYPEQUERY:
3097 case OP_TYPEMINQUERY:
3098 c = *ecode++ - OP_TYPESTAR;
3099 minimize = (c & 1) != 0;
3100 min = rep_min[c]; /* Pick up values from tables; */
3101 max = rep_max[c]; /* zero for max => infinity */
3102 if (max == 0) max = INT_MAX;
3103
3104 /* Common code for all repeated single character type matches */
3105
3106 REPEATTYPE:
3107 ctype = *ecode++; /* Code for the character type */
3108
3109 /* First, ensure the minimum number of matches are present. Use inline
3110 code for maximizing the speed, and do the type test once at the start
3111 (i.e. keep it out of the loop). Also test that there are at least the
3112 minimum number of characters before we start. */
3113
3114 if (min > md->end_subject - eptr) return FALSE;
3115 if (min > 0) switch(ctype)
3116 {
3117 case OP_ANY:
3118 if (!md->dotall)
3119 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }
3120 else eptr += min;
3121 break;
3122
3123 case OP_NOT_DIGIT:
3124 for (i = 1; i <= min; i++)
3125 if ((pcre_ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
3126 break;
3127
3128 case OP_DIGIT:
3129 for (i = 1; i <= min; i++)
3130 if ((pcre_ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
3131 break;
3132
3133 case OP_NOT_WHITESPACE:
3134 for (i = 1; i <= min; i++)
3135 if ((pcre_ctypes[*eptr++] & ctype_space) != 0) return FALSE;
3136 break;
3137
3138 case OP_WHITESPACE:
3139 for (i = 1; i <= min; i++)
3140 if ((pcre_ctypes[*eptr++] & ctype_space) == 0) return FALSE;
3141 break;
3142
3143 case OP_NOT_WORDCHAR:
3144 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) != 0)
3145 return FALSE;
3146 break;
3147
3148 case OP_WORDCHAR:
3149 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) == 0)
3150 return FALSE;
3151 break;
3152 }
3153
3154 /* If min = max, continue at the same level without recursing */
3155
3156 if (min == max) continue;
3157
3158 /* If minimizing, we have to test the rest of the pattern before each
3159 subsequent match, so inlining isn't much help; just use the function. */
3160
3161 if (minimize)
3162 {
3163 for (i = min;; i++)
3164 {
3165 if (match(eptr, ecode, offset_top, md)) return TRUE;
3166 if (i >= max || eptr >= md->end_subject ||
3167 !match_type(ctype, *eptr++, md->dotall))
3168 return FALSE;
3169 }
3170 /* Control never gets here */
3171 }
3172
3173 /* If maximizing it is worth using inline code for speed, doing the type
3174 test once at the start (i.e. keep it out of the loop). */
3175
3176 else
3177 {
3178 uschar *pp = eptr;
3179 switch(ctype)
3180 {
3181 case OP_ANY:
3182 if (!md->dotall)
3183 {
3184 for (i = min; i < max; i++)
3185 {
3186 if (eptr >= md->end_subject || *eptr == '\n') break;
3187 eptr++;
3188 }
3189 }
3190 else
3191 {
3192 c = max - min;
3193 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
3194 eptr += c;
3195 }
3196 break;
3197
3198 case OP_NOT_DIGIT:
3199 for (i = min; i < max; i++)
3200 {
3201 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) != 0)
3202 break;
3203 eptr++;
3204 }
3205 break;
3206
3207 case OP_DIGIT:
3208 for (i = min; i < max; i++)
3209 {
3210 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) == 0)
3211 break;
3212 eptr++;
3213 }
3214 break;
3215
3216 case OP_NOT_WHITESPACE:
3217 for (i = min; i < max; i++)
3218 {
3219 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) != 0)
3220 break;
3221 eptr++;
3222 }
3223 break;
3224
3225 case OP_WHITESPACE:
3226 for (i = min; i < max; i++)
3227 {
3228 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) == 0)
3229 break;
3230 eptr++;
3231 }
3232 break;
3233
3234 case OP_NOT_WORDCHAR:
3235 for (i = min; i < max; i++)
3236 {
3237 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) != 0)
3238 break;
3239 eptr++;
3240 }
3241 break;
3242
3243 case OP_WORDCHAR:
3244 for (i = min; i < max; i++)
3245 {
3246 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) == 0)
3247 break;
3248 eptr++;
3249 }
3250 break;
3251 }
3252
3253 while (eptr >= pp)
3254 if (match(eptr--, ecode, offset_top, md)) return TRUE;
3255 return FALSE;
3256 }
3257 /* Control never gets here */
3258
3259 /* There's been some horrible disaster. */
3260
3261 default:
3262 #ifdef DEBUG
3263 printf("Unknown opcode %d\n", *ecode);
3264 #endif
3265 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
3266 return FALSE;
3267 }
3268
3269 /* Do not stick any code in here without much thought; it is assumed
3270 that "continue" in the code above comes out to here to repeat the main
3271 loop. */
3272
3273 } /* End of main loop */
3274 /* Control never reaches here */
3275 }
3276
3277
3278
3279 /*************************************************
3280 * Execute a Regular Expression *
3281 *************************************************/
3282
3283 /* This function applies a compiled re to a subject string and picks out
3284 portions of the string if it matches. Two elements in the vector are set for
3285 each substring: the offsets to the start and end of the substring.
3286
3287 Arguments:
3288 external_re points to the compiled expression
3289 external_extra points to "hints" from pcre_study() or is NULL
3290 subject points to the subject string
3291 length length of subject string (may contain binary zeros)
3292 options option bits
3293 offsets points to a vector of ints to be filled in with offsets
3294 offsetcount the number of elements in the vector
3295
3296 Returns: > 0 => success; value is the number of elements filled in
3297 = 0 => success, but offsets is not big enough
3298 -1 => failed to match
3299 < -1 => some kind of unexpected problem
3300 */
3301
3302 int
3303 pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
3304 const char *subject, int length, int options, int *offsets, int offsetcount)
3305 {
3306 int resetcount;
3307 int ocount = offsetcount;
3308 int first_char = -1;
3309 match_data match_block;
3310 uschar *start_bits = NULL;
3311 uschar *start_match = (uschar *)subject;
3312 uschar *end_subject;
3313 real_pcre *re = (real_pcre *)external_re;
3314 real_pcre_extra *extra = (real_pcre_extra *)external_extra;
3315 BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
3316 BOOL startline = (re->options & PCRE_STARTLINE) != 0;
3317
3318 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3319
3320 if (re == NULL || subject == NULL ||
3321 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3322 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
3323
3324 match_block.start_subject = (uschar *)subject;
3325 match_block.end_subject = match_block.start_subject + length;
3326 end_subject = match_block.end_subject;
3327
3328 match_block.caseless = ((re->options | options) & PCRE_CASELESS) != 0;
3329 match_block.runtime_caseless = match_block.caseless &&
3330 (re->options & PCRE_CASELESS) == 0;
3331
3332 match_block.multiline = ((re->options | options) & PCRE_MULTILINE) != 0;
3333 match_block.dotall = ((re->options | options) & PCRE_DOTALL) != 0;
3334 match_block.endonly = ((re->options | options) & PCRE_DOLLAR_ENDONLY) != 0;
3335
3336 match_block.notbol = (options & PCRE_NOTBOL) != 0;
3337 match_block.noteol = (options & PCRE_NOTEOL) != 0;
3338
3339 match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
3340
3341 /* If the expression has got more back references than the offsets supplied can
3342 hold, we get a temporary bit of working store to use during the matching.
3343 Otherwise, we can use the vector supplied, rounding down the size of it to a
3344 multiple of 2. */
3345
3346 ocount &= (-2);
3347 if (re->top_backref > 0 && re->top_backref + 1 >= ocount/2)
3348 {
3349 ocount = re->top_backref * 2 + 2;
3350 match_block.offset_vector = (pcre_malloc)(ocount * sizeof(int));
3351 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
3352 #ifdef DEBUG
3353 printf("Got memory to hold back references\n");
3354 #endif
3355 }
3356 else match_block.offset_vector = offsets;
3357
3358 match_block.offset_end = ocount;
3359 match_block.offset_overflow = FALSE;
3360
3361 /* Compute the minimum number of offsets that we need to reset each time. Doing
3362 this makes a huge difference to execution time when there aren't many brackets
3363 in the pattern. */
3364
3365 resetcount = 2 + re->top_bracket * 2;
3366 if (resetcount > offsetcount) resetcount = ocount;
3367
3368 /* If MULTILINE is set at exec time but was not set at compile time, and the
3369 anchored flag is set, we must re-check because a setting provoked by ^ in the
3370 pattern is not right in multi-line mode. Calling is_anchored() again here does
3371 the right check, because multiline is now set. If it now yields FALSE, the
3372 expression must have had ^ starting some of its branches. Check to see if
3373 that is true for *all* branches, and if so, set the startline flag. */
3374
3375 if (match_block. multiline && anchored && (re->options & PCRE_MULTILINE) == 0 &&
3376 !is_anchored(re->code, match_block.multiline))
3377 {
3378 anchored = FALSE;
3379 if (is_startline(re->code)) startline = TRUE;
3380 }
3381
3382 /* Set up the first character to match, if available. The first_char value is
3383 never set for an anchored regular expression, but the anchoring may be forced
3384 at run time, so we have to test for anchoring. The first char may be unset for
3385 an unanchored pattern, of course. If there's no first char and the pattern was
3386 studied, the may be a bitmap of possible first characters. However, we can
3387 use this only if the caseless state of the studying was correct. */
3388
3389 if (!anchored)
3390 {
3391 if ((re->options & PCRE_FIRSTSET) != 0)
3392 {
3393 first_char = re->first_char;
3394 if (match_block.caseless) first_char = pcre_lcc[first_char];
3395 }
3396 else
3397 if (!startline && extra != NULL &&
3398 (extra->options & PCRE_STUDY_MAPPED) != 0 &&
3399 ((extra->options & PCRE_STUDY_CASELESS) != 0) == match_block.caseless)
3400 start_bits = extra->start_bits;
3401 }
3402
3403 /* Loop for unanchored matches; for anchored regexps the loop runs just once. */
3404
3405 do
3406 {
3407 register int *iptr = match_block.offset_vector;
3408 register int *iend = iptr + resetcount;
3409
3410 /* Reset the maximum number of extractions we might see. */
3411
3412 while (iptr < iend) *iptr++ = -1;
3413
3414 /* Advance to a unique first char if possible */
3415
3416 if (first_char >= 0)
3417 {
3418 if (match_block.caseless)
3419 while (start_match < end_subject && pcre_lcc[*start_match] != first_char)
3420 start_match++;
3421 else
3422 while (start_match < end_subject && *start_match != first_char)
3423 start_match++;
3424 }
3425
3426 /* Or to just after \n for a multiline match if possible */
3427
3428 else if (startline)
3429 {
3430 if (start_match > match_block.start_subject)
3431 {
3432 while (start_match < end_subject && start_match[-1] != '\n')
3433 start_match++;
3434 }
3435 }
3436
3437 /* Or to a non-unique first char */
3438
3439 else if (start_bits != NULL)
3440 {
3441 while (start_match < end_subject)
3442 {
3443 register int c = *start_match;
3444 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
3445 }
3446 }
3447
3448 #ifdef DEBUG
3449 printf(">>>> Match against: ");
3450 pchars(start_match, end_subject - start_match, TRUE, &match_block);
3451 printf("\n");
3452 #endif
3453
3454 /* When a match occurs, substrings will be set for all internal extractions;
3455 we just need to set up the whole thing as substring 0 before returning. If
3456 there were too many extractions, set the return code to zero. In the case
3457 where we had to get some local store to hold offsets for backreferences, copy
3458 those back references that we can. In this case there need not be overflow
3459 if certain parts of the pattern were not used.
3460
3461 Before starting the match, we have to set up a longjmp() target to enable
3462 the "cut" operation to fail a match completely without backtracking. */
3463
3464 if (setjmp(match_block.fail_env) == 0 &&
3465 match(start_match, re->code, 2, &match_block))
3466 {
3467 int rc;
3468
3469 if (ocount != offsetcount)
3470 {
3471 if (offsetcount >= 4)
3472 {
3473 memcpy(offsets + 2, match_block.offset_vector + 2,
3474 (offsetcount - 2) * sizeof(int));
3475 #ifdef DEBUG
3476 printf("Copied offsets; freeing temporary memory\n");
3477 #endif
3478 }
3479 if (match_block.end_offset_top > offsetcount)
3480 match_block.offset_overflow = TRUE;
3481
3482 #ifdef DEBUG
3483 printf("Freeing temporary memory\n");
3484 #endif
3485
3486 (pcre_free)(match_block.offset_vector);
3487 }
3488
3489 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
3490
3491 if (match_block.offset_end < 2) rc = 0; else
3492 {
3493 offsets[0] = start_match - match_block.start_subject;
3494 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
3495 }
3496
3497 #ifdef DEBUG
3498 printf(">>>> returning %d\n", rc);
3499 #endif
3500 return rc;
3501 }
3502 }
3503 while (!anchored &&
3504 match_block.errorcode == PCRE_ERROR_NOMATCH &&
3505 start_match++ < end_subject);
3506
3507 #ifdef DEBUG
3508 printf(">>>> returning %d\n", match_block.errorcode);
3509 #endif
3510
3511 return match_block.errorcode;
3512 }
3513
3514 /* End of pcre.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12