/[pcre]/code/trunk/pcre.c
ViewVC logotype

Contents of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 7 - (show annotations) (download)
Sat Feb 24 21:38:09 2007 UTC (7 years, 9 months ago) by nigel
File MIME type: text/plain
File size: 99406 byte(s)
Load pcre-1.02 into code/trunk.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /*
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
9
10 Written by: Philip Hazel <ph10@cam.ac.uk>
11
12 Copyright (c) 1997 University of Cambridge
13
14 -----------------------------------------------------------------------------
15 Permission is granted to anyone to use this software for any purpose on any
16 computer system, and to redistribute it freely, subject to the following
17 restrictions:
18
19 1. This software is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22
23 2. The origin of this software must not be misrepresented, either by
24 explicit claim or by omission.
25
26 3. Altered versions must be plainly marked as such, and must not be
27 misrepresented as being the original software.
28 -----------------------------------------------------------------------------
29 */
30
31
32 /* Define DEBUG to get debugging output on stdout. */
33
34 /* #define DEBUG */
35
36
37 /* Include the internals header, which itself includes Standard C headers plus
38 the external pcre header. */
39
40 #include "internal.h"
41
42
43 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
44
45 static char rep_min[] = { 0, 0, 1, 1, 0, 0 };
46 static char rep_max[] = { 0, 0, 0, 0, 1, 1 };
47
48 /* Text forms of OP_ values and things, for debugging */
49
50 #ifdef DEBUG
51 static const char *OP_names[] = {
52 "End", "\\A", "\\B", "\\b", "\\D", "\\d",
53 "\\S", "\\s", "\\W", "\\w", "Cut", "\\Z", "^", "$", "Any", "chars",
54 "not",
55 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
56 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
57 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
58 "*", "*?", "+", "+?", "?", "??", "{", "{",
59 "class", "Ref",
60 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",
61 "Brazero", "Braminzero", "Bra"
62 };
63 #endif
64
65 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
66 are simple data values; negative values are for special things like \d and so
67 on. Zero means further processing is needed (for things like \x), or the escape
68 is invalid. */
69
70 static short int escapes[] = {
71 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
72 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
73 '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
74 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
75 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
76 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
77 '`', 7, -ESC_b, 0, -ESC_d, 27, '\f', 0, /* ` - g */
78 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
79 0, 0, '\r', -ESC_s, '\t', 0, 0, -ESC_w, /* p - w */
80 0, 0, 0 /* x - z */
81 };
82
83 /* Definition to allow mutual recursion */
84
85 static BOOL compile_regex(int, int *, uschar **, const uschar **, const char **);
86
87 /* Structure for passing "static" information around between the functions
88 doing the matching, so that they are thread-safe. */
89
90 typedef struct match_data {
91 int errorcode; /* As it says */
92 int *offset_vector; /* Offset vector */
93 int offset_end; /* One past the end */
94 BOOL offset_overflow; /* Set if too many extractions */
95 BOOL caseless; /* Case-independent flag */
96 BOOL runtime_caseless; /* Caseless forced at run time */
97 BOOL multiline; /* Multiline flag */
98 BOOL notbol; /* NOTBOL flag */
99 BOOL noteol; /* NOTEOL flag */
100 BOOL dotall; /* Dot matches any char */
101 BOOL endonly; /* Dollar not before final \n */
102 const uschar *start_subject; /* Start of the subject string */
103 const uschar *end_subject; /* End of the subject string */
104 jmp_buf fail_env; /* Environment for longjump() break out */
105 const uschar *end_match_ptr; /* Subject position at end match */
106 int end_offset_top; /* Highwater mark at end of match */
107 } match_data;
108
109
110
111 /*************************************************
112 * Global variables *
113 *************************************************/
114
115 /* PCRE is thread-clean and doesn't use any global variables in the normal
116 sense. However, it calls memory allocation and free functions via the two
117 indirections below, which are can be changed by the caller, but are shared
118 between all threads. */
119
120 void *(*pcre_malloc)(size_t) = malloc;
121 void (*pcre_free)(void *) = free;
122
123
124
125
126 /*************************************************
127 * Return version string *
128 *************************************************/
129
130 const char *
131 pcre_version(void)
132 {
133 return PCRE_VERSION;
134 }
135
136
137
138
139 /*************************************************
140 * Return info about a compiled pattern *
141 *************************************************/
142
143 /* This function picks potentially useful data out of the private
144 structure.
145
146 Arguments:
147 external_re points to compiled code
148 optptr where to pass back the options
149 first_char where to pass back the first character,
150 or -1 if multiline and all branches start ^,
151 or -2 otherwise
152
153 Returns: number of identifying extraction brackets
154 or negative values on error
155 */
156
157 int
158 pcre_info(const pcre *external_re, int *optptr, int *first_char)
159 {
160 const real_pcre *re = (const real_pcre *)external_re;
161 if (re == NULL) return PCRE_ERROR_NULL;
162 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
163 if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
164 if (first_char != NULL)
165 *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
166 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
167 return re->top_bracket;
168 }
169
170
171
172
173 #ifdef DEBUG
174 /*************************************************
175 * Debugging function to print chars *
176 *************************************************/
177
178 /* Print a sequence of chars in printable format, stopping at the end of the
179 subject if the requested.
180
181 Arguments:
182 p points to characters
183 length number to print
184 is_subject TRUE if printing from within md->start_subject
185 md pointer to matching data block, if is_subject is TRUE
186
187 Returns: nothing
188 */
189
190 static pchars(uschar *p, int length, BOOL is_subject, match_data *md)
191 {
192 int c;
193 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
194 while (length-- > 0)
195 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
196 }
197 #endif
198
199
200
201
202 /*************************************************
203 * Check subpattern for empty operand *
204 *************************************************/
205
206 /* This function checks a bracketed subpattern to see if any of the paths
207 through it could match an empty string. This is used to diagnose an error if
208 such a subpattern is followed by a quantifier with an unlimited upper bound.
209
210 Argument:
211 code points to the opening bracket
212
213 Returns: TRUE or FALSE
214 */
215
216 static BOOL
217 could_be_empty(uschar *code)
218 {
219 do {
220 uschar *cc = code + 3;
221
222 /* Scan along the opcodes for this branch; as soon as we find something
223 that matches a non-empty string, break out and advance to test the next
224 branch. If we get to the end of the branch, return TRUE for the whole
225 sub-expression. */
226
227 for (;;)
228 {
229 /* Test an embedded subpattern; if it could not be empty, break the
230 loop. Otherwise carry on in the branch. */
231
232 if ((int)(*cc) >= OP_BRA || (int)(*cc) == OP_ONCE)
233 {
234 if (!could_be_empty(cc)) break;
235 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
236 cc += 3;
237 }
238
239 else switch (*cc)
240 {
241 /* Reached end of a branch: the subpattern may match the empty string */
242
243 case OP_ALT:
244 case OP_KET:
245 case OP_KETRMAX:
246 case OP_KETRMIN:
247 return TRUE;
248
249 /* Skip over assertive subpatterns */
250
251 case OP_ASSERT:
252 case OP_ASSERT_NOT:
253 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
254 cc += 3;
255 break;
256
257 /* Skip over things that don't match chars */
258
259 case OP_SOD:
260 case OP_EOD:
261 case OP_CIRC:
262 case OP_DOLL:
263 case OP_BRAZERO:
264 case OP_BRAMINZERO:
265 case OP_NOT_WORD_BOUNDARY:
266 case OP_WORD_BOUNDARY:
267 cc++;
268 break;
269
270 /* Skip over simple repeats with zero lower bound */
271
272 case OP_STAR:
273 case OP_MINSTAR:
274 case OP_QUERY:
275 case OP_MINQUERY:
276 case OP_NOTSTAR:
277 case OP_NOTMINSTAR:
278 case OP_NOTQUERY:
279 case OP_NOTMINQUERY:
280 case OP_TYPESTAR:
281 case OP_TYPEMINSTAR:
282 case OP_TYPEQUERY:
283 case OP_TYPEMINQUERY:
284 cc += 2;
285 break;
286
287 /* Skip over UPTOs (lower bound is zero) */
288
289 case OP_UPTO:
290 case OP_MINUPTO:
291 case OP_TYPEUPTO:
292 case OP_TYPEMINUPTO:
293 cc += 4;
294 break;
295
296 /* Check a class or a back reference for a zero minimum */
297
298 case OP_CLASS:
299 case OP_REF:
300 cc += (*cc == OP_REF)? 2 : 33;
301
302 switch (*cc)
303 {
304 case OP_CRSTAR:
305 case OP_CRMINSTAR:
306 case OP_CRQUERY:
307 case OP_CRMINQUERY:
308 cc++;
309 break;
310
311 case OP_CRRANGE:
312 case OP_CRMINRANGE:
313 if ((cc[1] << 8) + cc[2] != 0) goto NEXT_BRANCH;
314 cc += 3;
315 break;
316
317 default:
318 goto NEXT_BRANCH;
319 }
320 break;
321
322 /* Anything else matches at least one character */
323
324 default:
325 goto NEXT_BRANCH;
326 }
327 }
328
329 NEXT_BRANCH:
330 code += (code[1] << 8) + code[2];
331 }
332 while (*code == OP_ALT);
333
334 /* No branches match the empty string */
335
336 return FALSE;
337 }
338
339
340
341 /*************************************************
342 * Handle escapes *
343 *************************************************/
344
345 /* This function is called when a \ has been encountered. It either returns a
346 positive value for a simple escape such as \n, or a negative value which
347 encodes one of the more complicated things such as \d. On entry, ptr is
348 pointing at the \. On exit, it is on the final character of the escape
349 sequence.
350
351 Arguments:
352 ptrptr points to the pattern position pointer
353 errorptr points to the pointer to the error message
354 bracount number of previous extracting brackets
355 options the options bits
356 isclass TRUE if inside a character class
357
358 Returns: zero or positive => a data character
359 negative => a special escape sequence
360 on error, errorptr is set
361 */
362
363 static int
364 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
365 int options, BOOL isclass)
366 {
367 const uschar *ptr = *ptrptr;
368 int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
369 int i;
370
371 if (c == 0) *errorptr = ERR1;
372
373 /* Digits or letters may have special meaning; all others are literals. */
374
375 else if (c < '0' || c > 'z') {}
376
377 /* Do an initial lookup in a table. A non-zero result is something that can be
378 returned immediately. Otherwise further processing may be required. */
379
380 else if ((i = escapes[c - '0']) != 0) c = i;
381
382 /* Escapes that need further processing, or are illegal. */
383
384 else
385 {
386 const uschar *oldptr;
387 switch (c)
388 {
389 /* The handling of escape sequences consisting of a string of digits
390 starting with one that is not zero is not straightforward. By experiment,
391 the way Perl works seems to be as follows:
392
393 Outside a character class, the digits are read as a decimal number. If the
394 number is less than 10, or if there are that many previous extracting
395 left brackets, then it is a back reference. Otherwise, up to three octal
396 digits are read to form an escaped byte. Thus \123 is likely to be octal
397 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
398 value is greater than 377, the least significant 8 bits are taken. Inside a
399 character class, \ followed by a digit is always an octal number. */
400
401 case '1': case '2': case '3': case '4': case '5':
402 case '6': case '7': case '8': case '9':
403
404 if (!isclass)
405 {
406 oldptr = ptr;
407 c -= '0';
408 while ((pcre_ctypes[ptr[1]] & ctype_digit) != 0)
409 c = c * 10 + *(++ptr) - '0';
410 if (c < 10 || c <= bracount)
411 {
412 c = -(ESC_REF + c);
413 break;
414 }
415 ptr = oldptr; /* Put the pointer back and fall through */
416 }
417
418 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
419 generates a binary zero byte and treats the digit as a following literal.
420 Thus we have to pull back the pointer by one. */
421
422 if ((c = *ptr) >= '8')
423 {
424 ptr--;
425 c = 0;
426 break;
427 }
428
429 /* \0 always starts an octal number, but we may drop through to here with a
430 larger first octal digit */
431
432 case '0':
433 c -= '0';
434 while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 &&
435 ptr[1] != '8' && ptr[1] != '9')
436 c = c * 8 + *(++ptr) - '0';
437 break;
438
439 /* Special escapes not starting with a digit are straightforward */
440
441 case 'x':
442 c = 0;
443 while (i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0)
444 {
445 ptr++;
446 c = c * 16 + pcre_lcc[*ptr] -
447 (((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
448 }
449 break;
450
451 case 'c':
452 c = *(++ptr);
453 if (c == 0)
454 {
455 *errorptr = ERR2;
456 return 0;
457 }
458
459 /* A letter is upper-cased; then the 0x40 bit is flipped */
460
461 if (c >= 'a' && c <= 'z') c = pcre_fcc[c];
462 c ^= 0x40;
463 break;
464
465 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
466 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
467 for Perl compatibility, it is a literal. */
468
469 default:
470 if ((options & PCRE_EXTRA) != 0) switch(c)
471 {
472 case 'X':
473 c = -ESC_X; /* This could be a lookup if it ever got into Perl */
474 break;
475
476 default:
477 *errorptr = ERR3;
478 break;
479 }
480 break;
481 }
482 }
483
484 *ptrptr = ptr;
485 return c;
486 }
487
488
489
490 /*************************************************
491 * Check for counted repeat *
492 *************************************************/
493
494 /* This function is called when a '{' is encountered in a place where it might
495 start a quantifier. It looks ahead to see if it really is a quantifier or not.
496 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
497 where the ddds are digits.
498
499 Arguments:
500 p pointer to the first char after '{'
501
502 Returns: TRUE or FALSE
503 */
504
505 static BOOL
506 is_counted_repeat(const uschar *p)
507 {
508 if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
509 while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
510 if (*p == '}') return TRUE;
511
512 if (*p++ != ',') return FALSE;
513 if (*p == '}') return TRUE;
514
515 if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
516 while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
517 return (*p == '}');
518 }
519
520
521
522 /*************************************************
523 * Read repeat counts *
524 *************************************************/
525
526 /* Read an item of the form {n,m} and return the values. This is called only
527 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
528 so the syntax is guaranteed to be correct, but we need to check the values.
529
530 Arguments:
531 p pointer to first char after '{'
532 minp pointer to int for min
533 maxp pointer to int for max
534 returned as -1 if no max
535 errorptr points to pointer to error message
536
537 Returns: pointer to '}' on success;
538 current ptr on error, with errorptr set
539 */
540
541 static const uschar *
542 read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
543 {
544 int min = 0;
545 int max = -1;
546
547 while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
548
549 if (*p == '}') max = min; else
550 {
551 if (*(++p) != '}')
552 {
553 max = 0;
554 while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
555 if (max < min)
556 {
557 *errorptr = ERR4;
558 return p;
559 }
560 }
561 }
562
563 /* Do paranoid checks, then fill in the required variables, and pass back the
564 pointer to the terminating '}'. */
565
566 if (min > 65535 || max > 65535)
567 *errorptr = ERR5;
568 else
569 {
570 *minp = min;
571 *maxp = max;
572 }
573 return p;
574 }
575
576
577
578 /*************************************************
579 * Compile one branch *
580 *************************************************/
581
582 /* Scan the pattern, compiling it into the code vector.
583
584 Arguments:
585 options the option bits
586 bracket points to number of brackets used
587 code points to the pointer to the current code point
588 ptrptr points to the current pattern pointer
589 errorptr points to pointer to error message
590
591 Returns: TRUE on success
592 FALSE, with *errorptr set on error
593 */
594
595 static BOOL
596 compile_branch(int options, int *brackets, uschar **codeptr,
597 const uschar **ptrptr, const char **errorptr)
598 {
599 int repeat_type, op_type;
600 int repeat_min, repeat_max;
601 int bravalue, length;
602 register int c;
603 register uschar *code = *codeptr;
604 const uschar *ptr = *ptrptr;
605 const uschar *oldptr;
606 uschar *previous = NULL;
607 uschar class[32];
608
609 /* Switch on next character until the end of the branch */
610
611 for (;; ptr++)
612 {
613 BOOL negate_class;
614 int class_charcount;
615 int class_lastchar;
616
617 c = *ptr;
618 if ((options & PCRE_EXTENDED) != 0)
619 {
620 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
621 if (c == '#')
622 {
623 while ((c = *(++ptr)) != 0 && c != '\n');
624 continue;
625 }
626 }
627
628 switch(c)
629 {
630 /* The branch terminates at end of string, |, or ). */
631
632 case 0:
633 case '|':
634 case ')':
635 *codeptr = code;
636 *ptrptr = ptr;
637 return TRUE;
638
639 /* Handle single-character metacharacters */
640
641 case '^':
642 previous = NULL;
643 *code++ = OP_CIRC;
644 break;
645
646 case '$':
647 previous = NULL;
648 *code++ = OP_DOLL;
649 break;
650
651 case '.':
652 previous = code;
653 *code++ = OP_ANY;
654 break;
655
656 /* Character classes. These always build a 32-byte bitmap of the permitted
657 characters, except in the special case where there is only one character.
658 For negated classes, we build the map as usual, then invert it at the end.
659 */
660
661 case '[':
662 previous = code;
663 *code++ = OP_CLASS;
664
665 /* If the first character is '^', set the negation flag */
666
667 if ((c = *(++ptr)) == '^')
668 {
669 negate_class = TRUE;
670 c = *(++ptr);
671 }
672 else negate_class = FALSE;
673
674 /* Keep a count of chars so that we can optimize the case of just a single
675 character. */
676
677 class_charcount = 0;
678 class_lastchar = -1;
679
680 /* Initialize the 32-char bit map to all zeros. We have to build the
681 map in a temporary bit of store, in case the class contains only 1
682 character, because in that case the compiled code doesn't use the
683 bit map. */
684
685 memset(class, 0, 32 * sizeof(uschar));
686
687 /* Process characters until ] is reached. By writing this as a "do" it
688 means that an initial ] is taken as a data character. */
689
690 do
691 {
692 if (c == 0)
693 {
694 *errorptr = ERR6;
695 goto FAILED;
696 }
697
698 /* Backslash may introduce a single character, or it may introduce one
699 of the specials, which just set a flag. Escaped items are checked for
700 validity in the pre-compiling pass. The sequence \b is a special case.
701 Inside a class (and only there) it is treated as backspace. Elsewhere
702 it marks a word boundary. Other escapes have preset maps ready to
703 or into the one we are building. We assume they have more than one
704 character in them, so set class_count bigger than one. */
705
706 if (c == '\\')
707 {
708 c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
709 if (-c == ESC_b) c = '\b';
710 else if (c < 0)
711 {
712 class_charcount = 10;
713 switch (-c)
714 {
715 case ESC_d:
716 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit];
717 continue;
718
719 case ESC_D:
720 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit];
721 continue;
722
723 case ESC_w:
724 for (c = 0; c < 32; c++)
725 class[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
726 continue;
727
728 case ESC_W:
729 for (c = 0; c < 32; c++)
730 class[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
731 continue;
732
733 case ESC_s:
734 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space];
735 continue;
736
737 case ESC_S:
738 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space];
739 continue;
740
741 default:
742 *errorptr = ERR7;
743 goto FAILED;
744 }
745 }
746 /* Fall through if single character */
747 }
748
749 /* A single character may be followed by '-' to form a range. However,
750 Perl does not permit ']' to be the end of the range. A '-' character
751 here is treated as a literal. */
752
753 if (ptr[1] == '-' && ptr[2] != ']')
754 {
755 int d;
756 ptr += 2;
757 d = *ptr;
758
759 if (d == 0)
760 {
761 *errorptr = ERR6;
762 goto FAILED;
763 }
764
765 /* The second part of a range can be a single-character escape, but
766 not any of the other escapes. */
767
768 if (d == '\\')
769 {
770 d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
771 if (d < 0)
772 {
773 if (d == -ESC_b) d = '\b'; else
774 {
775 *errorptr = ERR7;
776 goto FAILED;
777 }
778 }
779 }
780
781 if (d < c)
782 {
783 *errorptr = ERR8;
784 goto FAILED;
785 }
786
787 for (; c <= d; c++)
788 {
789 class[c/8] |= (1 << (c&7));
790 if ((options & PCRE_CASELESS) != 0)
791 {
792 int uc = pcre_fcc[c]; /* flip case */
793 class[uc/8] |= (1 << (uc&7));
794 }
795 class_charcount++; /* in case a one-char range */
796 class_lastchar = c;
797 }
798 continue; /* Go get the next char in the class */
799 }
800
801 /* Handle a lone single character - we can get here for a normal
802 non-escape char, or after \ that introduces a single character. */
803
804 class [c/8] |= (1 << (c&7));
805 if ((options & PCRE_CASELESS) != 0)
806 {
807 c = pcre_fcc[c]; /* flip case */
808 class[c/8] |= (1 << (c&7));
809 }
810 class_charcount++;
811 class_lastchar = c;
812 }
813
814 /* Loop until ']' reached; the check for end of string happens inside the
815 loop. This "while" is the end of the "do" above. */
816
817 while ((c = *(++ptr)) != ']');
818
819 /* If class_charcount is 1 and class_lastchar is not negative, we saw
820 precisely one character. This doesn't need the whole 32-byte bit map.
821 We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
822 it's negative. */
823
824 if (class_charcount == 1 && class_lastchar >= 0)
825 {
826 if (negate_class)
827 {
828 code[-1] = OP_NOT;
829 }
830 else
831 {
832 code[-1] = OP_CHARS;
833 *code++ = 1;
834 }
835 *code++ = class_lastchar;
836 }
837
838 /* Otherwise, negate the 32-byte map if necessary, and copy it into
839 the code vector. */
840
841 else
842 {
843 if (negate_class)
844 for (c = 0; c < 32; c++) code[c] = ~class[c];
845 else
846 memcpy(code, class, 32);
847 code += 32;
848 }
849 break;
850
851 /* Various kinds of repeat */
852
853 case '{':
854 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
855 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
856 if (*errorptr != NULL) goto FAILED;
857 goto REPEAT;
858
859 case '*':
860 repeat_min = 0;
861 repeat_max = -1;
862 goto REPEAT;
863
864 case '+':
865 repeat_min = 1;
866 repeat_max = -1;
867 goto REPEAT;
868
869 case '?':
870 repeat_min = 0;
871 repeat_max = 1;
872
873 REPEAT:
874 if (previous == NULL)
875 {
876 *errorptr = ERR9;
877 goto FAILED;
878 }
879
880 /* If the next character is '?' this is a minimizing repeat. Advance to the
881 next character. */
882
883 if (ptr[1] == '?') { repeat_type = 1; ptr++; } else repeat_type = 0;
884
885 /* If the maximum is zero then the minimum must also be zero; Perl allows
886 this case, so we do too - by simply omitting the item altogether. */
887
888 if (repeat_max == 0) code = previous;
889
890 /* If previous was a string of characters, chop off the last one and use it
891 as the subject of the repeat. If there was only one character, we can
892 abolish the previous item altogether. */
893
894 else if (*previous == OP_CHARS)
895 {
896 int len = previous[1];
897 if (len == 1)
898 {
899 c = previous[2];
900 code = previous;
901 }
902 else
903 {
904 c = previous[len+1];
905 previous[1]--;
906 code--;
907 }
908 op_type = 0; /* Use single-char op codes */
909 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
910 }
911
912 /* If previous was a single negated character ([^a] or similar), we use
913 one of the special opcodes, replacing it. The code is shared with single-
914 character repeats by adding a suitable offset into repeat_type. */
915
916 else if ((int)*previous == OP_NOT)
917 {
918 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
919 c = previous[1];
920 code = previous;
921 goto OUTPUT_SINGLE_REPEAT;
922 }
923
924 /* If previous was a character type match (\d or similar), abolish it and
925 create a suitable repeat item. The code is shared with single-character
926 repeats by adding a suitable offset into repeat_type. */
927
928 else if ((int)*previous < OP_EOD || *previous == OP_ANY)
929 {
930 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
931 c = *previous;
932 code = previous;
933
934 OUTPUT_SINGLE_REPEAT:
935 repeat_type += op_type; /* Combine both values for many cases */
936
937 /* A minimum of zero is handled either as the special case * or ?, or as
938 an UPTO, with the maximum given. */
939
940 if (repeat_min == 0)
941 {
942 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
943 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
944 else
945 {
946 *code++ = OP_UPTO + repeat_type;
947 *code++ = repeat_max >> 8;
948 *code++ = (repeat_max & 255);
949 }
950 }
951
952 /* The case {1,} is handled as the special case + */
953
954 else if (repeat_min == 1 && repeat_max == -1)
955 *code++ = OP_PLUS + repeat_type;
956
957 /* The case {n,n} is just an EXACT, while the general case {n,m} is
958 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
959
960 else
961 {
962 if (repeat_min != 1)
963 {
964 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
965 *code++ = repeat_min >> 8;
966 *code++ = (repeat_min & 255);
967 }
968
969 /* If the mininum is 1 and the previous item was a character string,
970 we either have to put back the item that got cancelled if the string
971 length was 1, or add the character back onto the end of a longer
972 string. For a character type nothing need be done; it will just get put
973 back naturally. */
974
975 else if (*previous == OP_CHARS)
976 {
977 if (code == previous) code += 2; else previous[1]++;
978 }
979
980 /* Insert an UPTO if the max is greater than the min. */
981
982 if (repeat_max != repeat_min)
983 {
984 *code++ = c;
985 repeat_max -= repeat_min;
986 *code++ = OP_UPTO + repeat_type;
987 *code++ = repeat_max >> 8;
988 *code++ = (repeat_max & 255);
989 }
990 }
991
992 /* The character or character type itself comes last in all cases. */
993
994 *code++ = c;
995 }
996
997 /* If previous was a character class or a back reference, we put the repeat
998 stuff after it. */
999
1000 else if (*previous == OP_CLASS || *previous == OP_REF)
1001 {
1002 if (repeat_min == 0 && repeat_max == -1)
1003 *code++ = OP_CRSTAR + repeat_type;
1004 else if (repeat_min == 1 && repeat_max == -1)
1005 *code++ = OP_CRPLUS + repeat_type;
1006 else if (repeat_min == 0 && repeat_max == 1)
1007 *code++ = OP_CRQUERY + repeat_type;
1008 else
1009 {
1010 *code++ = OP_CRRANGE + repeat_type;
1011 *code++ = repeat_min >> 8;
1012 *code++ = repeat_min & 255;
1013 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1014 *code++ = repeat_max >> 8;
1015 *code++ = repeat_max & 255;
1016 }
1017 }
1018
1019 /* If previous was a bracket group, we may have to replicate it in certain
1020 cases. If the maximum repeat count is unlimited, check that the bracket
1021 group cannot match the empty string, and diagnose an error if it can. */
1022
1023 else if ((int)*previous >= OP_BRA)
1024 {
1025 int i;
1026 int length = code - previous;
1027
1028 if (repeat_max == -1 && could_be_empty(previous))
1029 {
1030 *errorptr = ERR10;
1031 goto FAILED;
1032 }
1033
1034 /* If the minimum is greater than zero, and the maximum is unlimited or
1035 equal to the minimum, the first copy remains where it is, and is
1036 replicated up to the minimum number of times. This case includes the +
1037 repeat, but of course no replication is needed in that case. */
1038
1039 if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1040 {
1041 for (i = 1; i < repeat_min; i++)
1042 {
1043 memcpy(code, previous, length);
1044 code += length;
1045 }
1046 }
1047
1048 /* If the minimum is zero, stick BRAZERO in front of the first copy.
1049 Then, if there is a fixed upper limit, replicated up to that many times,
1050 sticking BRAZERO in front of all the optional ones. */
1051
1052 else
1053 {
1054 if (repeat_min == 0)
1055 {
1056 memmove(previous+1, previous, length);
1057 code++;
1058 *previous++ = OP_BRAZERO + repeat_type;
1059 }
1060
1061 for (i = 1; i < repeat_min; i++)
1062 {
1063 memcpy(code, previous, length);
1064 code += length;
1065 }
1066
1067 for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1068 {
1069 *code++ = OP_BRAZERO + repeat_type;
1070 memcpy(code, previous, length);
1071 code += length;
1072 }
1073 }
1074
1075 /* If the maximum is unlimited, set a repeater in the final copy. */
1076
1077 if (repeat_max == -1) code[-3] = OP_KETRMAX + repeat_type;
1078 }
1079
1080 /* Else there's some kind of shambles */
1081
1082 else
1083 {
1084 *errorptr = ERR11;
1085 goto FAILED;
1086 }
1087
1088 /* In all case we no longer have a previous item. */
1089
1090 previous = NULL;
1091 break;
1092
1093
1094 /* Start of nested bracket sub-expression, or comment or lookahead.
1095 First deal with special things that can come after a bracket; all are
1096 introduced by ?, and the appearance of any of them means that this is not a
1097 referencing group. They were checked for validity in the first pass over
1098 the string, so we don't have to check for syntax errors here. */
1099
1100 case '(':
1101 previous = code; /* Only real brackets can be repeated */
1102 if (*(++ptr) == '?')
1103 {
1104 bravalue = OP_BRA;
1105
1106 switch (*(++ptr))
1107 {
1108 case '#':
1109 case 'i':
1110 case 'm':
1111 case 's':
1112 case 'x':
1113 ptr++;
1114 while (*ptr != ')') ptr++;
1115 previous = NULL;
1116 continue;
1117
1118 case ':': /* Non-extracting bracket */
1119 ptr++;
1120 break;
1121
1122 case '=': /* Assertions can't be repeated */
1123 bravalue = OP_ASSERT;
1124 ptr++;
1125 previous = NULL;
1126 break;
1127
1128 case '!':
1129 bravalue = OP_ASSERT_NOT;
1130 ptr++;
1131 previous = NULL;
1132 break;
1133
1134 case '>': /* "Match once" brackets */
1135 if ((options & PCRE_EXTRA) != 0) /* Not yet standard */
1136 {
1137 bravalue = OP_ONCE;
1138 ptr++;
1139 previous = NULL;
1140 break;
1141 }
1142 /* Else fall through */
1143
1144 default:
1145 *errorptr = ERR12;
1146 goto FAILED;
1147 }
1148 }
1149
1150 /* Else we have a referencing group */
1151
1152 else
1153 {
1154 if (++(*brackets) > EXTRACT_MAX)
1155 {
1156 *errorptr = ERR13;
1157 goto FAILED;
1158 }
1159 bravalue = OP_BRA + *brackets;
1160 }
1161
1162 /* Process nested bracketed re; at end pointer is on the bracket. We copy
1163 code into a non-register variable in order to be able to pass its address
1164 because some compilers complain otherwise. */
1165
1166 *code = bravalue;
1167 {
1168 uschar *mcode = code;
1169 if (!compile_regex(options, brackets, &mcode, &ptr, errorptr))
1170 goto FAILED;
1171 code = mcode;
1172 }
1173
1174 if (*ptr != ')')
1175 {
1176 *errorptr = ERR14;
1177 goto FAILED;
1178 }
1179 break;
1180
1181 /* Check \ for being a real metacharacter; if not, fall through and handle
1182 it as a data character at the start of a string. Escape items are checked
1183 for validity in the pre-compiling pass. */
1184
1185 case '\\':
1186 oldptr = ptr;
1187 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
1188
1189 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1190 are arranged to be the negation of the corresponding OP_values. For the
1191 back references, the values are ESC_REF plus the reference number. Only
1192 back references and those types that consume a character may be repeated.
1193 We can test for values between ESC_b and ESC_Z for the latter; this may
1194 have to change if any new ones are ever created. */
1195
1196 if (c < 0)
1197 {
1198 if (-c >= ESC_REF)
1199 {
1200 int refnum = -c - ESC_REF;
1201 if (*brackets < refnum)
1202 {
1203 *errorptr = ERR15;
1204 goto FAILED;
1205 }
1206 previous = code;
1207 *code++ = OP_REF;
1208 *code++ = refnum;
1209 }
1210 else
1211 {
1212 previous = (-c > ESC_b && -c < ESC_X)? code : NULL;
1213 *code++ = -c;
1214 }
1215 continue;
1216 }
1217
1218 /* Data character: reset and fall through */
1219
1220 ptr = oldptr;
1221 c = '\\';
1222
1223 /* Handle a run of data characters until a metacharacter is encountered.
1224 The first character is guaranteed not to be whitespace or # when the
1225 extended flag is set. */
1226
1227 NORMAL_CHAR:
1228 default:
1229 previous = code;
1230 *code = OP_CHARS;
1231 code += 2;
1232 length = 0;
1233
1234 do
1235 {
1236 if ((options & PCRE_EXTENDED) != 0)
1237 {
1238 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1239 if (c == '#')
1240 {
1241 while ((c = *(++ptr)) != 0 && c != '\n');
1242 if (c == 0) break;
1243 continue;
1244 }
1245 }
1246
1247 /* Backslash may introduce a data char or a metacharacter. Escaped items
1248 are checked for validity in the pre-compiling pass. Stop the string
1249 before a metaitem. */
1250
1251 if (c == '\\')
1252 {
1253 oldptr = ptr;
1254 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
1255 if (c < 0) { ptr = oldptr; break; }
1256 }
1257
1258 /* Ordinary character or single-char escape */
1259
1260 *code++ = c;
1261 length++;
1262 }
1263
1264 /* This "while" is the end of the "do" above. */
1265
1266 while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
1267
1268 /* Compute the length and set it in the data vector, and advance to
1269 the next state. */
1270
1271 previous[1] = length;
1272 ptr--;
1273 break;
1274 }
1275 } /* end of big loop */
1276
1277 /* Control never reaches here by falling through, only by a goto for all the
1278 error states. Pass back the position in the pattern so that it can be displayed
1279 to the user for diagnosing the error. */
1280
1281 FAILED:
1282 *ptrptr = ptr;
1283 return FALSE;
1284 }
1285
1286
1287
1288
1289 /*************************************************
1290 * Compile sequence of alternatives *
1291 *************************************************/
1292
1293 /* On entry, ptr is pointing past the bracket character, but on return
1294 it points to the closing bracket, or vertical bar, or end of string.
1295 The code variable is pointing at the byte into which the BRA operator has been
1296 stored.
1297
1298 Argument:
1299 options the option bits
1300 brackets -> int containing the number of extracting brackets used
1301 codeptr -> the address of the current code pointer
1302 ptrptr -> the address of the current pattern pointer
1303 errorptr -> pointer to error message
1304
1305 Returns: TRUE on success
1306 */
1307
1308 static BOOL
1309 compile_regex(int options, int *brackets, uschar **codeptr,
1310 const uschar **ptrptr, const char **errorptr)
1311 {
1312 const uschar *ptr = *ptrptr;
1313 uschar *code = *codeptr;
1314 uschar *start_bracket = code;
1315
1316 for (;;)
1317 {
1318 int length;
1319 uschar *last_branch = code;
1320
1321 code += 3;
1322 if (!compile_branch(options, brackets, &code, &ptr, errorptr))
1323 {
1324 *ptrptr = ptr;
1325 return FALSE;
1326 }
1327
1328 /* Fill in the length of the last branch */
1329
1330 length = code - last_branch;
1331 last_branch[1] = length >> 8;
1332 last_branch[2] = length & 255;
1333
1334 /* Reached end of expression, either ')' or end of pattern. Insert a
1335 terminating ket and the length of the whole bracketed item, and return,
1336 leaving the pointer at the terminating char. */
1337
1338 if (*ptr != '|')
1339 {
1340 length = code - start_bracket;
1341 *code++ = OP_KET;
1342 *code++ = length >> 8;
1343 *code++ = length & 255;
1344 *codeptr = code;
1345 *ptrptr = ptr;
1346 return TRUE;
1347 }
1348
1349 /* Another branch follows; insert an "or" node and advance the pointer. */
1350
1351 *code = OP_ALT;
1352 ptr++;
1353 }
1354 /* Control never reaches here */
1355 }
1356
1357
1358
1359 /*************************************************
1360 * Check for anchored expression *
1361 *************************************************/
1362
1363 /* Try to find out if this is an anchored regular expression. Consider each
1364 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
1365 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
1366 it's anchored. However, if this is a multiline pattern, then only OP_SOD
1367 counts, since OP_CIRC can match in the middle.
1368
1369 A branch is also implicitly anchored if it starts with .* because that will try
1370 the rest of the pattern at all possible matching points, so there is no point
1371 trying them again.
1372
1373 Argument: points to start of expression (the bracket)
1374 Returns: TRUE or FALSE
1375 */
1376
1377 static BOOL
1378 is_anchored(register const uschar *code, BOOL multiline)
1379 {
1380 do {
1381 int op = (int)code[3];
1382 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE)
1383 { if (!is_anchored(code+3, multiline)) return FALSE; }
1384 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
1385 { if (code[4] != OP_ANY) return FALSE; }
1386 else if (op != OP_SOD && (multiline || op != OP_CIRC)) return FALSE;
1387 code += (code[1] << 8) + code[2];
1388 }
1389 while (*code == OP_ALT);
1390 return TRUE;
1391 }
1392
1393
1394
1395 /*************************************************
1396 * Check for start with \n line expression *
1397 *************************************************/
1398
1399 /* This is called for multiline expressions to try to find out if every branch
1400 starts with ^ so that "first char" processing can be done to speed things up.
1401
1402 Argument: points to start of expression (the bracket)
1403 Returns: TRUE or FALSE
1404 */
1405
1406 static BOOL
1407 is_startline(const uschar *code)
1408 {
1409 do {
1410 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
1411 { if (!is_startline(code+3)) return FALSE; }
1412 else if (code[3] != OP_CIRC) return FALSE;
1413 code += (code[1] << 8) + code[2];
1414 }
1415 while (*code == OP_ALT);
1416 return TRUE;
1417 }
1418
1419
1420
1421 /*************************************************
1422 * Check for fixed first char *
1423 *************************************************/
1424
1425 /* Try to find out if there is a fixed first character. This is called for
1426 unanchored expressions, as it speeds up their processing quite considerably.
1427 Consider each alternative branch. If they all start with the same char, or with
1428 a bracket all of whose alternatives start with the same char (recurse ad lib),
1429 then we return that char, otherwise -1.
1430
1431 Argument: points to start of expression (the bracket)
1432 Returns: -1 or the fixed first char
1433 */
1434
1435 static int
1436 find_firstchar(uschar *code)
1437 {
1438 register int c = -1;
1439 do
1440 {
1441 register int charoffset = 4;
1442
1443 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
1444 {
1445 register int d;
1446 if ((d = find_firstchar(code+3)) < 0) return -1;
1447 if (c < 0) c = d; else if (c != d) return -1;
1448 }
1449
1450 else switch(code[3])
1451 {
1452 default:
1453 return -1;
1454
1455 case OP_EXACT: /* Fall through */
1456 charoffset++;
1457
1458 case OP_CHARS: /* Fall through */
1459 charoffset++;
1460
1461 case OP_PLUS:
1462 case OP_MINPLUS:
1463 if (c < 0) c = code[charoffset]; else if (c != code[charoffset]) return -1;
1464 break;
1465 }
1466 code += (code[1] << 8) + code[2];
1467 }
1468 while (*code == OP_ALT);
1469 return c;
1470 }
1471
1472
1473
1474 /*************************************************
1475 * Compile a Regular Expression *
1476 *************************************************/
1477
1478 /* This function takes a string and returns a pointer to a block of store
1479 holding a compiled version of the expression.
1480
1481 Arguments:
1482 pattern the regular expression
1483 options various option bits
1484 errorptr pointer to pointer to error text
1485 erroroffset ptr offset in pattern where error was detected
1486
1487 Returns: pointer to compiled data block, or NULL on error,
1488 with errorptr and erroroffset set
1489 */
1490
1491 pcre *
1492 pcre_compile(const char *pattern, int options, const char **errorptr,
1493 int *erroroffset)
1494 {
1495 real_pcre *re;
1496 int spaces = 0;
1497 int length = 3; /* For initial BRA plus length */
1498 int runlength;
1499 int c, size;
1500 int bracount = 0;
1501 int brastack[200];
1502 int top_backref = 0;
1503 unsigned int brastackptr = 0;
1504 uschar *code;
1505 const uschar *ptr;
1506
1507 #ifdef DEBUG
1508 uschar *code_base, *code_end;
1509 #endif
1510
1511 /* We can't pass back an error message if errorptr is NULL; I guess the best we
1512 can do is just return NULL. */
1513
1514 if (errorptr == NULL) return NULL;
1515 *errorptr = NULL;
1516
1517 /* However, we can give a message for this error */
1518
1519 if (erroroffset == NULL)
1520 {
1521 *errorptr = ERR16;
1522 return NULL;
1523 }
1524 *erroroffset = 0;
1525
1526 if ((options & ~PUBLIC_OPTIONS) != 0)
1527 {
1528 *errorptr = ERR17;
1529 return NULL;
1530 }
1531
1532 #ifdef DEBUG
1533 printf("------------------------------------------------------------------\n");
1534 printf("%s\n", pattern);
1535 #endif
1536
1537 /* The first thing to do is to make a pass over the pattern to compute the
1538 amount of store required to hold the compiled code. This does not have to be
1539 perfect as long as errors are overestimates. At the same time we can detect any
1540 internal flag settings. Make an attempt to correct for any counted white space
1541 if an "extended" flag setting appears late in the pattern. We can't be so
1542 clever for #-comments. */
1543
1544 ptr = (const uschar *)(pattern - 1);
1545 while ((c = *(++ptr)) != 0)
1546 {
1547 int min, max;
1548 int class_charcount;
1549
1550 if ((pcre_ctypes[c] & ctype_space) != 0)
1551 {
1552 if ((options & PCRE_EXTENDED) != 0) continue;
1553 spaces++;
1554 }
1555
1556 if (c == '#' && (options & PCRE_EXTENDED) != 0)
1557 {
1558 while ((c = *(++ptr)) != 0 && c != '\n');
1559 continue;
1560 }
1561
1562 switch(c)
1563 {
1564 /* A backslashed item may be an escaped "normal" character or a
1565 character type. For a "normal" character, put the pointers and
1566 character back so that tests for whitespace etc. in the input
1567 are done correctly. */
1568
1569 case '\\':
1570 {
1571 const uschar *save_ptr = ptr;
1572 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
1573 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1574 if (c >= 0)
1575 {
1576 ptr = save_ptr;
1577 c = '\\';
1578 goto NORMAL_CHAR;
1579 }
1580 }
1581 length++;
1582
1583 /* A back reference needs an additional char, plus either one or 5
1584 bytes for a repeat. We also need to keep the value of the highest
1585 back reference. */
1586
1587 if (c <= -ESC_REF)
1588 {
1589 int refnum = -c - ESC_REF;
1590 if (refnum > top_backref) top_backref = refnum;
1591 length++; /* For single back reference */
1592 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
1593 {
1594 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
1595 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1596 if ((min == 0 && (max == 1 || max == -1)) ||
1597 (min == 1 && max == -1))
1598 length++;
1599 else length += 5;
1600 if (ptr[1] == '?') ptr++;
1601 }
1602 }
1603 continue;
1604
1605 case '^':
1606 case '.':
1607 case '$':
1608 case '*': /* These repeats won't be after brackets; */
1609 case '+': /* those are handled separately */
1610 case '?':
1611 length++;
1612 continue;
1613
1614 /* This covers the cases of repeats after a single char, metachar, class,
1615 or back reference. */
1616
1617 case '{':
1618 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
1619 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
1620 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1621 if ((min == 0 && (max == 1 || max == -1)) ||
1622 (min == 1 && max == -1))
1623 length++;
1624 else
1625 {
1626 length--; /* Uncount the original char or metachar */
1627 if (min == 1) length++; else if (min > 0) length += 4;
1628 if (max > 0) length += 4; else length += 2;
1629 }
1630 if (ptr[1] == '?') ptr++;
1631 continue;
1632
1633 /* An alternation contains an offset to the next branch or ket. */
1634 case '|':
1635 length += 3;
1636 continue;
1637
1638 /* A character class uses 33 characters. Don't worry about character types
1639 that aren't allowed in classes - they'll get picked up during the compile.
1640 A character class that contains only one character uses 2 or 3 bytes,
1641 depending on whether it is negated or not. Notice this where we can. */
1642
1643 case '[':
1644 class_charcount = 0;
1645 if (*(++ptr) == '^') ptr++;
1646 do
1647 {
1648 if (*ptr == '\\')
1649 {
1650 int c = check_escape(&ptr, errorptr, bracount, options, TRUE);
1651 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1652 if (-c == ESC_b) class_charcount++; else class_charcount = 10;
1653 }
1654 else class_charcount++;
1655 ptr++;
1656 }
1657 while (*ptr != 0 && *ptr != ']');
1658
1659 /* Repeats for negated single chars are handled by the general code */
1660
1661 if (class_charcount == 1) length += 3; else
1662 {
1663 length += 33;
1664
1665 /* A repeat needs either 1 or 5 bytes. */
1666
1667 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
1668 {
1669 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
1670 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1671 if ((min == 0 && (max == 1 || max == -1)) ||
1672 (min == 1 && max == -1))
1673 length++;
1674 else length += 5;
1675 if (ptr[1] == '?') ptr++;
1676 }
1677 }
1678 continue;
1679
1680 /* Brackets may be genuine groups or special things */
1681
1682 case '(':
1683
1684 /* Handle special forms of bracket, which all start (? */
1685
1686 if (ptr[1] == '?') switch (c = ptr[2])
1687 {
1688 /* Skip over comments entirely */
1689 case '#':
1690 ptr += 3;
1691 while (*ptr != 0 && *ptr != ')') ptr++;
1692 if (*ptr == 0)
1693 {
1694 *errorptr = ERR18;
1695 goto PCRE_ERROR_RETURN;
1696 }
1697 continue;
1698
1699 /* Non-referencing groups and lookaheads just move the pointer on, and
1700 then behave like a non-special bracket, except that they don't increment
1701 the count of extracting brackets. */
1702
1703 case ':':
1704 case '=':
1705 case '!':
1706 ptr += 2;
1707 break;
1708
1709 /* Ditto for the "once only" bracket, allowed only if the extra bit
1710 is set. */
1711
1712 case '>':
1713 if ((options & PCRE_EXTRA) != 0)
1714 {
1715 ptr += 2;
1716 break;
1717 }
1718 /* Else fall thourh */
1719
1720 /* Else loop setting valid options until ) is met. Anything else is an
1721 error. */
1722
1723 default:
1724 ptr += 2;
1725 for (;; ptr++)
1726 {
1727 if ((c = *ptr) == 'i')
1728 {
1729 options |= PCRE_CASELESS;
1730 continue;
1731 }
1732 else if ((c = *ptr) == 'm')
1733 {
1734 options |= PCRE_MULTILINE;
1735 continue;
1736 }
1737 else if (c == 's')
1738 {
1739 options |= PCRE_DOTALL;
1740 continue;
1741 }
1742 else if (c == 'x')
1743 {
1744 options |= PCRE_EXTENDED;
1745 length -= spaces; /* Already counted spaces */
1746 continue;
1747 }
1748 else if (c == ')') break;
1749
1750 *errorptr = ERR12;
1751 goto PCRE_ERROR_RETURN;
1752 }
1753 continue; /* End of this bracket handling */
1754 }
1755
1756 /* Extracting brackets must be counted so we can process escapes in a
1757 Perlish way. */
1758
1759 else bracount++;
1760
1761 /* Non-special forms of bracket. Save length for computing whole length
1762 at end if there's a repeat that requires duplication of the group. */
1763
1764 if (brastackptr >= sizeof(brastack)/sizeof(int))
1765 {
1766 *errorptr = ERR19;
1767 goto PCRE_ERROR_RETURN;
1768 }
1769
1770 brastack[brastackptr++] = length;
1771 length += 3;
1772 continue;
1773
1774 /* Handle ket. Look for subsequent max/min; for certain sets of values we
1775 have to replicate this bracket up to that many times. */
1776
1777 case ')':
1778 length += 3;
1779 {
1780 int min = 1;
1781 int max = 1;
1782 int duplength = length - brastack[--brastackptr];
1783
1784 /* Leave ptr at the final char; for read_repeat_counts this happens
1785 automatically; for the others we need an increment. */
1786
1787 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
1788 {
1789 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
1790 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1791 }
1792 else if (c == '*') { min = 0; max = -1; ptr++; }
1793 else if (c == '+') { max = -1; ptr++; }
1794 else if (c == '?') { min = 0; ptr++; }
1795
1796 /* If there is a minimum > 1 we have to replicate up to min-1 times; if
1797 there is a limited maximum we have to replicate up to max-1 times and
1798 allow for a BRAZERO item before each optional copy, as we also have to
1799 do before the first copy if the minimum is zero. */
1800
1801 if (min == 0) length++;
1802 else if (min > 1) length += (min - 1) * duplength;
1803 if (max > min) length += (max - min) * (duplength + 1);
1804 }
1805
1806 continue;
1807
1808 /* Non-special character. For a run of such characters the length required
1809 is the number of characters + 2, except that the maximum run length is 255.
1810 We won't get a skipped space or a non-data escape or the start of a #
1811 comment as the first character, so the length can't be zero. */
1812
1813 NORMAL_CHAR:
1814 default:
1815 length += 2;
1816 runlength = 0;
1817 do
1818 {
1819 if ((pcre_ctypes[c] & ctype_space) != 0)
1820 {
1821 if ((options & PCRE_EXTENDED) != 0) continue;
1822 spaces++;
1823 }
1824
1825 if (c == '#' && (options & PCRE_EXTENDED) != 0)
1826 {
1827 while ((c = *(++ptr)) != 0 && c != '\n');
1828 continue;
1829 }
1830
1831 /* Backslash may introduce a data char or a metacharacter; stop the
1832 string before the latter. */
1833
1834 if (c == '\\')
1835 {
1836 const uschar *saveptr = ptr;
1837 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
1838 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1839 if (c < 0) { ptr = saveptr; break; }
1840 }
1841
1842 /* Ordinary character or single-char escape */
1843
1844 runlength++;
1845 }
1846
1847 /* This "while" is the end of the "do" above. */
1848
1849 while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
1850
1851 ptr--;
1852 length += runlength;
1853 continue;
1854 }
1855 }
1856
1857 length += 4; /* For final KET and END */
1858
1859 if (length > 65539)
1860 {
1861 *errorptr = ERR20;
1862 return NULL;
1863 }
1864
1865 /* Compute the size of data block needed and get it, either from malloc or
1866 externally provided function. Put in the magic number and the options. */
1867
1868 size = length + offsetof(real_pcre, code);
1869 re = (real_pcre *)(pcre_malloc)(size);
1870
1871 if (re == NULL)
1872 {
1873 *errorptr = ERR21;
1874 return NULL;
1875 }
1876
1877 re->magic_number = MAGIC_NUMBER;
1878 re->options = options;
1879
1880 /* Set up a starting, non-extracting bracket, then compile the expression. On
1881 error, *errorptr will be set non-NULL, so we don't need to look at the result
1882 of the function here. */
1883
1884 ptr = (const uschar *)pattern;
1885 code = re->code;
1886 *code = OP_BRA;
1887 bracount = 0;
1888 (void)compile_regex(options, &bracount, &code, &ptr, errorptr);
1889 re->top_bracket = bracount;
1890 re->top_backref = top_backref;
1891
1892 /* If not reached end of pattern on success, there's an excess bracket. */
1893
1894 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
1895
1896 /* Fill in the terminating state and check for disastrous overflow, but
1897 if debugging, leave the test till after things are printed out. */
1898
1899 *code++ = OP_END;
1900
1901 #ifndef DEBUG
1902 if (code - re->code > length) *errorptr = ERR23;
1903 #endif
1904
1905 /* Failed to compile */
1906
1907 if (*errorptr != NULL)
1908 {
1909 (pcre_free)(re);
1910 PCRE_ERROR_RETURN:
1911 *erroroffset = ptr - (const uschar *)pattern;
1912 return NULL;
1913 }
1914
1915 /* If the anchored option was not passed, set flag if we can determine that it
1916 is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
1917 we can determine what the first character has to be, because that speeds up
1918 unanchored matches no end. In the case of multiline matches, an alternative is
1919 to set the PCRE_STARTLINE flag if all branches start with ^. */
1920
1921 if ((options & PCRE_ANCHORED) == 0)
1922 {
1923 if (is_anchored(re->code, (options & PCRE_MULTILINE) != 0))
1924 re->options |= PCRE_ANCHORED;
1925 else
1926 {
1927 int c = find_firstchar(re->code);
1928 if (c >= 0)
1929 {
1930 re->first_char = c;
1931 re->options |= PCRE_FIRSTSET;
1932 }
1933 else if (is_startline(re->code))
1934 re->options |= PCRE_STARTLINE;
1935 }
1936 }
1937
1938 /* Print out the compiled data for debugging */
1939
1940 #ifdef DEBUG
1941
1942 printf("Length = %d top_bracket = %d top_backref=%d\n",
1943 length, re->top_bracket, re->top_backref);
1944
1945 if (re->options != 0)
1946 {
1947 printf("%s%s%s%s%s%s%s\n",
1948 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
1949 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
1950 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
1951 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
1952 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
1953 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
1954 ((re->options & PCRE_EXTRA) != 0)? "extra " : "");
1955 }
1956
1957 if ((re->options & PCRE_FIRSTSET) != 0)
1958 {
1959 if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
1960 else printf("First char = \\x%02x\n", re->first_char);
1961 }
1962
1963 code_end = code;
1964 code_base = code = re->code;
1965
1966 while (code < code_end)
1967 {
1968 int charlength;
1969
1970 printf("%3d ", code - code_base);
1971
1972 if (*code >= OP_BRA)
1973 {
1974 printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
1975 code += 2;
1976 }
1977
1978 else switch(*code)
1979 {
1980 case OP_CHARS:
1981 charlength = *(++code);
1982 printf("%3d ", charlength);
1983 while (charlength-- > 0)
1984 if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
1985 break;
1986
1987 case OP_KETRMAX:
1988 case OP_KETRMIN:
1989 case OP_ALT:
1990 case OP_KET:
1991 case OP_ASSERT:
1992 case OP_ASSERT_NOT:
1993 case OP_ONCE:
1994 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
1995 code += 2;
1996 break;
1997
1998 case OP_STAR:
1999 case OP_MINSTAR:
2000 case OP_PLUS:
2001 case OP_MINPLUS:
2002 case OP_QUERY:
2003 case OP_MINQUERY:
2004 case OP_TYPESTAR:
2005 case OP_TYPEMINSTAR:
2006 case OP_TYPEPLUS:
2007 case OP_TYPEMINPLUS:
2008 case OP_TYPEQUERY:
2009 case OP_TYPEMINQUERY:
2010 if (*code >= OP_TYPESTAR)
2011 printf(" %s", OP_names[code[1]]);
2012 else if (isprint(c = code[1])) printf(" %c", c);
2013 else printf(" \\x%02x", c);
2014 printf("%s", OP_names[*code++]);
2015 break;
2016
2017 case OP_EXACT:
2018 case OP_UPTO:
2019 case OP_MINUPTO:
2020 if (isprint(c = code[3])) printf(" %c{", c);
2021 else printf(" \\x%02x{", c);
2022 if (*code != OP_EXACT) printf(",");
2023 printf("%d}", (code[1] << 8) + code[2]);
2024 if (*code == OP_MINUPTO) printf("?");
2025 code += 3;
2026 break;
2027
2028 case OP_TYPEEXACT:
2029 case OP_TYPEUPTO:
2030 case OP_TYPEMINUPTO:
2031 printf(" %s{", OP_names[code[3]]);
2032 if (*code != OP_TYPEEXACT) printf(",");
2033 printf("%d}", (code[1] << 8) + code[2]);
2034 if (*code == OP_TYPEMINUPTO) printf("?");
2035 code += 3;
2036 break;
2037
2038 case OP_NOT:
2039 if (isprint(c = *(++code))) printf(" [^%c]", c);
2040 else printf(" [^\\x%02x]", c);
2041 break;
2042
2043 case OP_NOTSTAR:
2044 case OP_NOTMINSTAR:
2045 case OP_NOTPLUS:
2046 case OP_NOTMINPLUS:
2047 case OP_NOTQUERY:
2048 case OP_NOTMINQUERY:
2049 if (isprint(c = code[1])) printf(" [^%c]", c);
2050 else printf(" [^\\x%02x]", c);
2051 printf("%s", OP_names[*code++]);
2052 break;
2053
2054 case OP_NOTEXACT:
2055 case OP_NOTUPTO:
2056 case OP_NOTMINUPTO:
2057 if (isprint(c = code[3])) printf(" [^%c]{", c);
2058 else printf(" [^\\x%02x]{", c);
2059 if (*code != OP_NOTEXACT) printf(",");
2060 printf("%d}", (code[1] << 8) + code[2]);
2061 if (*code == OP_NOTMINUPTO) printf("?");
2062 code += 3;
2063 break;
2064
2065 case OP_REF:
2066 printf(" \\%d", *(++code));
2067 break;
2068
2069 case OP_CLASS:
2070 {
2071 int i, min, max;
2072
2073 code++;
2074 printf(" [");
2075
2076 for (i = 0; i < 256; i++)
2077 {
2078 if ((code[i/8] & (1 << (i&7))) != 0)
2079 {
2080 int j;
2081 for (j = i+1; j < 256; j++)
2082 if ((code[j/8] & (1 << (j&7))) == 0) break;
2083 if (i == '-' || i == ']') printf("\\");
2084 if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2085 if (--j > i)
2086 {
2087 printf("-");
2088 if (j == '-' || j == ']') printf("\\");
2089 if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2090 }
2091 i = j;
2092 }
2093 }
2094 printf("]");
2095 code += 32;
2096
2097 switch(*code)
2098 {
2099 case OP_CRSTAR:
2100 case OP_CRMINSTAR:
2101 case OP_CRPLUS:
2102 case OP_CRMINPLUS:
2103 case OP_CRQUERY:
2104 case OP_CRMINQUERY:
2105 printf("%s", OP_names[*code]);
2106 break;
2107
2108 case OP_CRRANGE:
2109 case OP_CRMINRANGE:
2110 min = (code[1] << 8) + code[2];
2111 max = (code[3] << 8) + code[4];
2112 if (max == 0) printf("{%d,}", min);
2113 else printf("{%d,%d}", min, max);
2114 if (*code == OP_CRMINRANGE) printf("?");
2115 code += 4;
2116 break;
2117
2118 default:
2119 code--;
2120 }
2121 }
2122 break;
2123
2124 /* Anything else is just a one-node item */
2125
2126 default:
2127 printf(" %s", OP_names[*code]);
2128 break;
2129 }
2130
2131 code++;
2132 printf("\n");
2133 }
2134 printf("------------------------------------------------------------------\n");
2135
2136 /* This check is done here in the debugging case so that the code that
2137 was compiled can be seen. */
2138
2139 if (code - re->code > length)
2140 {
2141 *errorptr = ERR23;
2142 (pcre_free)(re);
2143 *erroroffset = ptr - (uschar *)pattern;
2144 return NULL;
2145 }
2146 #endif
2147
2148 return (pcre *)re;
2149 }
2150
2151
2152
2153 /*************************************************
2154 * Match a character type *
2155 *************************************************/
2156
2157 /* Not used in all the places it might be as it's sometimes faster
2158 to put the code inline.
2159
2160 Arguments:
2161 type the character type
2162 c the character
2163 dotall the dotall flag
2164
2165 Returns: TRUE if character is of the type
2166 */
2167
2168 static BOOL
2169 match_type(int type, int c, BOOL dotall)
2170 {
2171
2172 #ifdef DEBUG
2173 if (isprint(c)) printf("matching subject %c against ", c);
2174 else printf("matching subject \\x%02x against ", c);
2175 printf("%s\n", OP_names[type]);
2176 #endif
2177
2178 switch(type)
2179 {
2180 case OP_ANY: return dotall || c != '\n';
2181 case OP_NOT_DIGIT: return (pcre_ctypes[c] & ctype_digit) == 0;
2182 case OP_DIGIT: return (pcre_ctypes[c] & ctype_digit) != 0;
2183 case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;
2184 case OP_WHITESPACE: return (pcre_ctypes[c] & ctype_space) != 0;
2185 case OP_NOT_WORDCHAR: return (pcre_ctypes[c] & ctype_word) == 0;
2186 case OP_WORDCHAR: return (pcre_ctypes[c] & ctype_word) != 0;
2187 }
2188 return FALSE;
2189 }
2190
2191
2192
2193 /*************************************************
2194 * Match a back-reference *
2195 *************************************************/
2196
2197 /* If a back reference hasn't been set, the match fails.
2198
2199 Arguments:
2200 number reference number
2201 eptr points into the subject
2202 length length to be matched
2203 md points to match data block
2204
2205 Returns: TRUE if matched
2206 */
2207
2208 static BOOL
2209 match_ref(int number, register const uschar *eptr, int length, match_data *md)
2210 {
2211 const uschar *p = md->start_subject + md->offset_vector[number];
2212
2213 #ifdef DEBUG
2214 if (eptr >= md->end_subject)
2215 printf("matching subject <null>");
2216 else
2217 {
2218 printf("matching subject ");
2219 pchars(eptr, length, TRUE, md);
2220 }
2221 printf(" against backref ");
2222 pchars(p, length, FALSE, md);
2223 printf("\n");
2224 #endif
2225
2226 /* Always fail if not enough characters left */
2227
2228 if (length > md->end_subject - p) return FALSE;
2229
2230 /* Separate the caselesss case for speed */
2231
2232 if (md->caseless)
2233 { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; }
2234 else
2235 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
2236
2237 return TRUE;
2238 }
2239
2240
2241
2242 /*************************************************
2243 * Match from current position *
2244 *************************************************/
2245
2246 /* On entry ecode points to the first opcode, and eptr to the first character.
2247
2248 Arguments:
2249 eptr pointer in subject
2250 ecode position in code
2251 offset_top current top pointer
2252 md pointer to "static" info for the match
2253
2254 Returns: TRUE if matched
2255 */
2256
2257 static BOOL
2258 match(register const uschar *eptr, register const uschar *ecode, int offset_top,
2259 match_data *md)
2260 {
2261 for (;;)
2262 {
2263 int min, max, ctype;
2264 register int i;
2265 register int c;
2266 BOOL minimize = FALSE;
2267
2268 /* Opening bracket. Check the alternative branches in turn, failing if none
2269 match. We have to set the start offset if required and there is space
2270 in the offset vector so that it is available for subsequent back references
2271 if the bracket matches. However, if the bracket fails, we must put back the
2272 previous value of both offsets in case they were set by a previous copy of
2273 the same bracket. Don't worry about setting the flag for the error case here;
2274 that is handled in the code for KET. */
2275
2276 if ((int)*ecode >= OP_BRA)
2277 {
2278 int number = (*ecode - OP_BRA) << 1;
2279 int save_offset1 = 0, save_offset2 = 0;
2280
2281 #ifdef DEBUG
2282 printf("start bracket %d\n", number/2);
2283 #endif
2284
2285 if (number > 0 && number < md->offset_end)
2286 {
2287 save_offset1 = md->offset_vector[number];
2288 save_offset2 = md->offset_vector[number+1];
2289 md->offset_vector[number] = eptr - md->start_subject;
2290
2291 #ifdef DEBUG
2292 printf("saving %d %d\n", save_offset1, save_offset2);
2293 #endif
2294 }
2295
2296 /* Recurse for all the alternatives. */
2297
2298 do
2299 {
2300 if (match(eptr, ecode+3, offset_top, md)) return TRUE;
2301 ecode += (ecode[1] << 8) + ecode[2];
2302 }
2303 while (*ecode == OP_ALT);
2304
2305 #ifdef DEBUG
2306 printf("bracket %d failed\n", number/2);
2307 #endif
2308
2309 if (number > 0 && number < md->offset_end)
2310 {
2311 md->offset_vector[number] = save_offset1;
2312 md->offset_vector[number+1] = save_offset2;
2313 }
2314
2315 return FALSE;
2316 }
2317
2318 /* Other types of node can be handled by a switch */
2319
2320 switch(*ecode)
2321 {
2322 case OP_END:
2323 md->end_match_ptr = eptr; /* Record where we ended */
2324 md->end_offset_top = offset_top; /* and how many extracts were taken */
2325 return TRUE;
2326
2327 /* The equivalent of Prolog's "cut" - if the rest doesn't match, the
2328 whole thing doesn't match, so we have to get out via a longjmp(). */
2329
2330 case OP_CUT:
2331 if (match(eptr, ecode+1, offset_top, md)) return TRUE;
2332 longjmp(md->fail_env, 1);
2333
2334 /* Assertion brackets. Check the alternative branches in turn - the
2335 matching won't pass the KET for an assertion. If any one branch matches,
2336 the assertion is true. */
2337
2338 case OP_ASSERT:
2339 do
2340 {
2341 if (match(eptr, ecode+3, offset_top, md)) break;
2342 ecode += (ecode[1] << 8) + ecode[2];
2343 }
2344 while (*ecode == OP_ALT);
2345 if (*ecode == OP_KET) return FALSE;
2346
2347 /* Continue from after the assertion, updating the offsets high water
2348 mark, since extracts may have been taken during the assertion. */
2349
2350 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2351 ecode += 3;
2352 offset_top = md->end_offset_top;
2353 continue;
2354
2355 /* Negative assertion: all branches must fail to match */
2356
2357 case OP_ASSERT_NOT:
2358 do
2359 {
2360 if (match(eptr, ecode+3, offset_top, md)) return FALSE;
2361 ecode += (ecode[1] << 8) + ecode[2];
2362 }
2363 while (*ecode == OP_ALT);
2364 ecode += 3;
2365 continue;
2366
2367 /* "Once" brackets are like assertion brackets except that after a match,
2368 the point in the subject string is not moved back. Thus there can never be
2369 a move back into the brackets. Check the alternative branches in turn - the
2370 matching won't pass the KET for this kind of subpattern. If any one branch
2371 matches, we carry on, leaving the subject pointer. */
2372
2373 case OP_ONCE:
2374 do
2375 {
2376 if (match(eptr, ecode+3, offset_top, md)) break;
2377 ecode += (ecode[1] << 8) + ecode[2];
2378 }
2379 while (*ecode == OP_ALT);
2380 if (*ecode == OP_KET) return FALSE;
2381
2382 /* Continue as from after the assertion, updating the offsets high water
2383 mark, since extracts may have been taken. */
2384
2385 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2386 ecode += 3;
2387 offset_top = md->end_offset_top;
2388 eptr = md->end_match_ptr;
2389 continue;
2390
2391 /* An alternation is the end of a branch; scan along to find the end of the
2392 bracketed group and go to there. */
2393
2394 case OP_ALT:
2395 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2396 break;
2397
2398 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
2399 that it may occur zero times. It may repeat infinitely, or not at all -
2400 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
2401 repeat limits are compiled as a number of copies, with the optional ones
2402 preceded by BRAZERO or BRAMINZERO. */
2403
2404 case OP_BRAZERO:
2405 {
2406 const uschar *next = ecode+1;
2407 if (match(eptr, next, offset_top, md)) return TRUE;
2408 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
2409 ecode = next + 3;
2410 }
2411 break;
2412
2413 case OP_BRAMINZERO:
2414 {
2415 const uschar *next = ecode+1;
2416 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
2417 if (match(eptr, next+3, offset_top, md)) return TRUE;
2418 ecode++;
2419 }
2420 break;;
2421
2422 /* End of a group, repeated or non-repeating. If we are at the end of
2423 an assertion "group", stop matching and return TRUE, but record the
2424 current high water mark for use by positive assertions. */
2425
2426 case OP_KET:
2427 case OP_KETRMIN:
2428 case OP_KETRMAX:
2429 {
2430 int number;
2431 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
2432
2433 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || *prev == OP_ONCE)
2434 {
2435 md->end_match_ptr = eptr; /* For ONCE */
2436 md->end_offset_top = offset_top;
2437 return TRUE;
2438 }
2439
2440 /* In all other cases we have to check the group number back at the
2441 start and if necessary complete handling an extraction by setting the
2442 final offset and bumping the high water mark. */
2443
2444 number = (*prev - OP_BRA) << 1;
2445
2446 #ifdef DEBUG
2447 printf("end bracket %d\n", number/2);
2448 #endif
2449
2450 if (number > 0)
2451 {
2452 if (number >= md->offset_end) md->offset_overflow = TRUE; else
2453 {
2454 md->offset_vector[number+1] = eptr - md->start_subject;
2455 if (offset_top <= number) offset_top = number + 2;
2456 }
2457 }
2458
2459 /* For a non-repeating ket, just advance to the next node and continue at
2460 this level. */
2461
2462 if (*ecode == OP_KET)
2463 {
2464 ecode += 3;
2465 break;
2466 }
2467
2468 /* The repeating kets try the rest of the pattern or restart from the
2469 preceding bracket, in the appropriate order. */
2470
2471 if (*ecode == OP_KETRMIN)
2472 {
2473 if (match(eptr, ecode+3, offset_top, md) ||
2474 match(eptr, prev, offset_top, md)) return TRUE;
2475 }
2476 else /* OP_KETRMAX */
2477 {
2478 if (match(eptr, prev, offset_top, md) ||
2479 match(eptr, ecode+3, offset_top, md)) return TRUE;
2480 }
2481 }
2482 return FALSE;
2483
2484 /* Start of subject unless notbol, or after internal newline if multiline */
2485
2486 case OP_CIRC:
2487 if (md->notbol && eptr == md->start_subject) return FALSE;
2488 if (md->multiline)
2489 {
2490 if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;
2491 ecode++;
2492 break;
2493 }
2494 /* ... else fall through */
2495
2496 /* Start of subject assertion */
2497
2498 case OP_SOD:
2499 if (eptr != md->start_subject) return FALSE;
2500 ecode++;
2501 break;
2502
2503 /* Assert before internal newline if multiline, or before
2504 a terminating newline unless endonly is set, else end of subject unless
2505 noteol is set. */
2506
2507 case OP_DOLL:
2508 if (md->noteol && eptr >= md->end_subject) return FALSE;
2509 if (md->multiline)
2510 {
2511 if (eptr < md->end_subject && *eptr != '\n') return FALSE;
2512 ecode++;
2513 break;
2514 }
2515 else if (!md->endonly)
2516 {
2517 if (eptr < md->end_subject - 1 ||
2518 (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
2519 ecode++;
2520 break;
2521 }
2522 /* ... else fall through */
2523
2524 /* End of subject assertion */
2525
2526 case OP_EOD:
2527 if (eptr < md->end_subject) return FALSE;
2528 ecode++;
2529 break;
2530
2531 /* Word boundary assertions */
2532
2533 case OP_NOT_WORD_BOUNDARY:
2534 case OP_WORD_BOUNDARY:
2535 {
2536 BOOL prev_is_word = (eptr != md->start_subject) &&
2537 ((pcre_ctypes[eptr[-1]] & ctype_word) != 0);
2538 BOOL cur_is_word = (eptr < md->end_subject) &&
2539 ((pcre_ctypes[*eptr] & ctype_word) != 0);
2540 if ((*ecode++ == OP_WORD_BOUNDARY)?
2541 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2542 return FALSE;
2543 }
2544 break;
2545
2546 /* Match a single character type; inline for speed */
2547
2548 case OP_ANY:
2549 if (!md->dotall && eptr < md->end_subject && *eptr == '\n') return FALSE;
2550 if (eptr++ >= md->end_subject) return FALSE;
2551 ecode++;
2552 break;
2553
2554 case OP_NOT_DIGIT:
2555 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) != 0)
2556 return FALSE;
2557 ecode++;
2558 break;
2559
2560 case OP_DIGIT:
2561 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) == 0)
2562 return FALSE;
2563 ecode++;
2564 break;
2565
2566 case OP_NOT_WHITESPACE:
2567 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) != 0)
2568 return FALSE;
2569 ecode++;
2570 break;
2571
2572 case OP_WHITESPACE:
2573 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) == 0)
2574 return FALSE;
2575 ecode++;
2576 break;
2577
2578 case OP_NOT_WORDCHAR:
2579 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) != 0)
2580 return FALSE;
2581 ecode++;
2582 break;
2583
2584 case OP_WORDCHAR:
2585 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) == 0)
2586 return FALSE;
2587 ecode++;
2588 break;
2589
2590 /* Match a back reference, possibly repeatedly. Look past the end of the
2591 item to see if there is repeat information following. The code is similar
2592 to that for character classes, but repeated for efficiency. Then obey
2593 similar code to character type repeats - written out again for speed.
2594 However, if the referenced string is the empty string, always treat
2595 it as matched, any number of times (otherwise there could be infinite
2596 loops). */
2597
2598 case OP_REF:
2599 {
2600 int length;
2601 int number = ecode[1] << 1; /* Doubled reference number */
2602 ecode += 2; /* Advance past the item */
2603
2604 if (number >= offset_top || md->offset_vector[number] < 0)
2605 {
2606 md->errorcode = PCRE_ERROR_BADREF;
2607 return FALSE;
2608 }
2609
2610 length = md->offset_vector[number+1] - md->offset_vector[number];
2611
2612 switch (*ecode)
2613 {
2614 case OP_CRSTAR:
2615 case OP_CRMINSTAR:
2616 case OP_CRPLUS:
2617 case OP_CRMINPLUS:
2618 case OP_CRQUERY:
2619 case OP_CRMINQUERY:
2620 c = *ecode++ - OP_CRSTAR;
2621 minimize = (c & 1) != 0;
2622 min = rep_min[c]; /* Pick up values from tables; */
2623 max = rep_max[c]; /* zero for max => infinity */
2624 if (max == 0) max = INT_MAX;
2625 break;
2626
2627 case OP_CRRANGE:
2628 case OP_CRMINRANGE:
2629 minimize = (*ecode == OP_CRMINRANGE);
2630 min = (ecode[1] << 8) + ecode[2];
2631 max = (ecode[3] << 8) + ecode[4];
2632 if (max == 0) max = INT_MAX;
2633 ecode += 5;
2634 break;
2635
2636 default: /* No repeat follows */
2637 if (!match_ref(number, eptr, length, md)) return FALSE;
2638 eptr += length;
2639 continue; /* With the main loop */
2640 }
2641
2642 /* If the length of the reference is zero, just continue with the
2643 main loop. */
2644
2645 if (length == 0) continue;
2646
2647 /* First, ensure the minimum number of matches are present. We get back
2648 the length of the reference string explicitly rather than passing the
2649 address of eptr, so that eptr can be a register variable. */
2650
2651 for (i = 1; i <= min; i++)
2652 {
2653 if (!match_ref(number, eptr, length, md)) return FALSE;
2654 eptr += length;
2655 }
2656
2657 /* If min = max, continue at the same level without recursion.
2658 They are not both allowed to be zero. */
2659
2660 if (min == max) continue;
2661
2662 /* If minimizing, keep trying and advancing the pointer */
2663
2664 if (minimize)
2665 {
2666 for (i = min;; i++)
2667 {
2668 if (match(eptr, ecode, offset_top, md)) return TRUE;
2669 if (i >= max || !match_ref(number, eptr, length, md))
2670 return FALSE;
2671 eptr += length;
2672 }
2673 /* Control never gets here */
2674 }
2675
2676 /* If maximizing, find the longest string and work backwards */
2677
2678 else
2679 {
2680 const uschar *pp = eptr;
2681 for (i = min; i < max; i++)
2682 {
2683 if (!match_ref(number, eptr, length, md)) break;
2684 eptr += length;
2685 }
2686 while (eptr >= pp)
2687 {
2688 if (match(eptr, ecode, offset_top, md)) return TRUE;
2689 eptr -= length;
2690 }
2691 return FALSE;
2692 }
2693 }
2694 /* Control never gets here */
2695
2696 /* Match a character class, possibly repeatedly. Look past the end of the
2697 item to see if there is repeat information following. Then obey similar
2698 code to character type repeats - written out again for speed. If caseless
2699 matching was set at runtime but not at compile time, we have to check both
2700 versions of a character. */
2701
2702 case OP_CLASS:
2703 {
2704 const uschar *data = ecode + 1; /* Save for matching */
2705 ecode += 33; /* Advance past the item */
2706
2707 switch (*ecode)
2708 {
2709 case OP_CRSTAR:
2710 case OP_CRMINSTAR:
2711 case OP_CRPLUS:
2712 case OP_CRMINPLUS:
2713 case OP_CRQUERY:
2714 case OP_CRMINQUERY:
2715 c = *ecode++ - OP_CRSTAR;
2716 minimize = (c & 1) != 0;
2717 min = rep_min[c]; /* Pick up values from tables; */
2718 max = rep_max[c]; /* zero for max => infinity */
2719 if (max == 0) max = INT_MAX;
2720 break;
2721
2722 case OP_CRRANGE:
2723 case OP_CRMINRANGE:
2724 minimize = (*ecode == OP_CRMINRANGE);
2725 min = (ecode[1] << 8) + ecode[2];
2726 max = (ecode[3] << 8) + ecode[4];
2727 if (max == 0) max = INT_MAX;
2728 ecode += 5;
2729 break;
2730
2731 default: /* No repeat follows */
2732 if (eptr >= md->end_subject) return FALSE;
2733 c = *eptr++;
2734 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
2735 if (md->runtime_caseless)
2736 {
2737 c = pcre_fcc[c];
2738 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
2739 }
2740 return FALSE;
2741 }
2742
2743 /* First, ensure the minimum number of matches are present. */
2744
2745 for (i = 1; i <= min; i++)
2746 {
2747 if (eptr >= md->end_subject) return FALSE;
2748 c = *eptr++;
2749 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2750 if (md->runtime_caseless)
2751 {
2752 c = pcre_fcc[c];
2753 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2754 }
2755 return FALSE;
2756 }
2757
2758 /* If max == min we can continue with the main loop without the
2759 need to recurse. */
2760
2761 if (min == max) continue;
2762
2763 /* If minimizing, keep testing the rest of the expression and advancing
2764 the pointer while it matches the class. */
2765
2766 if (minimize)
2767 {
2768 for (i = min;; i++)
2769 {
2770 if (match(eptr, ecode, offset_top, md)) return TRUE;
2771 if (i >= max || eptr >= md->end_subject) return FALSE;
2772 c = *eptr++;
2773 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2774 if (md->runtime_caseless)
2775 {
2776 c = pcre_fcc[c];
2777 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2778 }
2779 return FALSE;
2780 }
2781 /* Control never gets here */
2782 }
2783
2784 /* If maximizing, find the longest possible run, then work backwards. */
2785
2786 else
2787 {
2788 const uschar *pp = eptr;
2789 for (i = min; i < max; eptr++, i++)
2790 {
2791 if (eptr >= md->end_subject) break;
2792 c = *eptr;
2793 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2794 if (md->runtime_caseless)
2795 {
2796 c = pcre_fcc[c];
2797 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2798 }
2799 break;
2800 }
2801
2802 while (eptr >= pp)
2803 if (match(eptr--, ecode, offset_top, md)) return TRUE;
2804 return FALSE;
2805 }
2806 }
2807 /* Control never gets here */
2808
2809 /* Match a run of characters */
2810
2811 case OP_CHARS:
2812 {
2813 register int length = ecode[1];
2814 ecode += 2;
2815
2816 #ifdef DEBUG
2817 if (eptr >= md->end_subject)
2818 printf("matching subject <null> against pattern ");
2819 else
2820 {
2821 printf("matching subject ");
2822 pchars(eptr, length, TRUE, md);
2823 printf(" against pattern ");
2824 }
2825 pchars(ecode, length, FALSE, md);
2826 printf("\n");
2827 #endif
2828
2829 if (length > md->end_subject - eptr) return FALSE;
2830 if (md->caseless)
2831 {
2832 while (length-- > 0) if (pcre_lcc[*ecode++] != pcre_lcc[*eptr++]) return FALSE;
2833 }
2834 else
2835 {
2836 while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
2837 }
2838 }
2839 break;
2840
2841 /* Match a single character repeatedly; different opcodes share code. */
2842
2843 case OP_EXACT:
2844 min = max = (ecode[1] << 8) + ecode[2];
2845 ecode += 3;
2846 goto REPEATCHAR;
2847
2848 case OP_UPTO:
2849 case OP_MINUPTO:
2850 min = 0;
2851 max = (ecode[1] << 8) + ecode[2];
2852 minimize = *ecode == OP_MINUPTO;
2853 ecode += 3;
2854 goto REPEATCHAR;
2855
2856 case OP_STAR:
2857 case OP_MINSTAR:
2858 case OP_PLUS:
2859 case OP_MINPLUS:
2860 case OP_QUERY:
2861 case OP_MINQUERY:
2862 c = *ecode++ - OP_STAR;
2863 minimize = (c & 1) != 0;
2864 min = rep_min[c]; /* Pick up values from tables; */
2865 max = rep_max[c]; /* zero for max => infinity */
2866 if (max == 0) max = INT_MAX;
2867
2868 /* Common code for all repeated single-character matches. We can give
2869 up quickly if there are fewer than the minimum number of characters left in
2870 the subject. */
2871
2872 REPEATCHAR:
2873 if (min > md->end_subject - eptr) return FALSE;
2874 c = *ecode++;
2875
2876 /* The code is duplicated for the caseless and caseful cases, for speed,
2877 since matching characters is likely to be quite common. First, ensure the
2878 minimum number of matches are present. If min = max, continue at the same
2879 level without recursing. Otherwise, if minimizing, keep trying the rest of
2880 the expression and advancing one matching character if failing, up to the
2881 maximum. Alternatively, if maximizing, find the maximum number of
2882 characters and work backwards. */
2883
2884 #ifdef DEBUG
2885 printf("matching %c{%d,%d} against subject %.*s\n", c, min, max,
2886 max, eptr);
2887 #endif
2888
2889 if (md->caseless)
2890 {
2891 c = pcre_lcc[c];
2892 for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) return FALSE;
2893 if (min == max) continue;
2894 if (minimize)
2895 {
2896 for (i = min;; i++)
2897 {
2898 if (match(eptr, ecode, offset_top, md)) return TRUE;
2899 if (i >= max || eptr >= md->end_subject || c != pcre_lcc[*eptr++])
2900 return FALSE;
2901 }
2902 /* Control never gets here */
2903 }
2904 else
2905 {
2906 const uschar *pp = eptr;
2907 for (i = min; i < max; i++)
2908 {
2909 if (eptr >= md->end_subject || c != pcre_lcc[*eptr]) break;
2910 eptr++;
2911 }
2912 while (eptr >= pp)
2913 if (match(eptr--, ecode, offset_top, md)) return TRUE;
2914 return FALSE;
2915 }
2916 /* Control never gets here */
2917 }
2918
2919 /* Caseful comparisons */
2920
2921 else
2922 {
2923 for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
2924 if (min == max) continue;
2925 if (minimize)
2926 {
2927 for (i = min;; i++)
2928 {
2929 if (match(eptr, ecode, offset_top, md)) return TRUE;
2930 if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
2931 }
2932 /* Control never gets here */
2933 }
2934 else
2935 {
2936 const uschar *pp = eptr;
2937 for (i = min; i < max; i++)
2938 {
2939 if (eptr >= md->end_subject || c != *eptr) break;
2940 eptr++;
2941 }
2942 while (eptr >= pp)
2943 if (match(eptr--, ecode, offset_top, md)) return TRUE;
2944 return FALSE;
2945 }
2946 }
2947 /* Control never gets here */
2948
2949 /* Match a negated single character */
2950
2951 case OP_NOT:
2952 if (eptr > md->end_subject) return FALSE;
2953 ecode++;
2954 if (md->caseless)
2955 {
2956 if (pcre_lcc[*ecode++] == pcre_lcc[*eptr++]) return FALSE;
2957 }
2958 else
2959 {
2960 if (*ecode++ == *eptr++) return FALSE;
2961 }
2962 break;
2963
2964 /* Match a negated single character repeatedly. This is almost a repeat of
2965 the code for a repeated single character, but I haven't found a nice way of
2966 commoning these up that doesn't require a test of the positive/negative
2967 option for each character match. Maybe that wouldn't add very much to the
2968 time taken, but character matching *is* what this is all about... */
2969
2970 case OP_NOTEXACT:
2971 min = max = (ecode[1] << 8) + ecode[2];
2972 ecode += 3;
2973 goto REPEATNOTCHAR;
2974
2975 case OP_NOTUPTO:
2976 case OP_NOTMINUPTO:
2977 min = 0;
2978 max = (ecode[1] << 8) + ecode[2];
2979 minimize = *ecode == OP_NOTMINUPTO;
2980 ecode += 3;
2981 goto REPEATNOTCHAR;
2982
2983 case OP_NOTSTAR:
2984 case OP_NOTMINSTAR:
2985 case OP_NOTPLUS:
2986 case OP_NOTMINPLUS:
2987 case OP_NOTQUERY:
2988 case OP_NOTMINQUERY:
2989 c = *ecode++ - OP_NOTSTAR;
2990 minimize = (c & 1) != 0;
2991 min = rep_min[c]; /* Pick up values from tables; */
2992 max = rep_max[c]; /* zero for max => infinity */
2993 if (max == 0) max = INT_MAX;
2994
2995 /* Common code for all repeated single-character matches. We can give
2996 up quickly if there are fewer than the minimum number of characters left in
2997 the subject. */
2998
2999 REPEATNOTCHAR:
3000 if (min > md->end_subject - eptr) return FALSE;
3001 c = *ecode++;
3002
3003 /* The code is duplicated for the caseless and caseful cases, for speed,
3004 since matching characters is likely to be quite common. First, ensure the
3005 minimum number of matches are present. If min = max, continue at the same
3006 level without recursing. Otherwise, if minimizing, keep trying the rest of
3007 the expression and advancing one matching character if failing, up to the
3008 maximum. Alternatively, if maximizing, find the maximum number of
3009 characters and work backwards. */
3010
3011 #ifdef DEBUG
3012 printf("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
3013 max, eptr);
3014 #endif
3015
3016 if (md->caseless)
3017 {
3018 c = pcre_lcc[c];
3019 for (i = 1; i <= min; i++) if (c == pcre_lcc[*eptr++]) return FALSE;
3020 if (min == max) continue;
3021 if (minimize)
3022 {
3023 for (i = min;; i++)
3024 {
3025 if (match(eptr, ecode, offset_top, md)) return TRUE;
3026 if (i >= max || eptr >= md->end_subject || c == pcre_lcc[*eptr++])
3027 return FALSE;
3028 }
3029 /* Control never gets here */
3030 }
3031 else
3032 {
3033 const uschar *pp = eptr;
3034 for (i = min; i < max; i++)
3035 {
3036 if (eptr >= md->end_subject || c == pcre_lcc[*eptr]) break;
3037 eptr++;
3038 }
3039 while (eptr >= pp)
3040 if (match(eptr--, ecode, offset_top, md)) return TRUE;
3041 return FALSE;
3042 }
3043 /* Control never gets here */
3044 }
3045
3046 /* Caseful comparisons */
3047
3048 else
3049 {
3050 for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
3051 if (min == max) continue;
3052 if (minimize)
3053 {
3054 for (i = min;; i++)
3055 {
3056 if (match(eptr, ecode, offset_top, md)) return TRUE;
3057 if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
3058 }
3059 /* Control never gets here */
3060 }
3061 else
3062 {
3063 const uschar *pp = eptr;
3064 for (i = min; i < max; i++)
3065 {
3066 if (eptr >= md->end_subject || c == *eptr) break;
3067 eptr++;
3068 }
3069 while (eptr >= pp)
3070 if (match(eptr--, ecode, offset_top, md)) return TRUE;
3071 return FALSE;
3072 }
3073 }
3074 /* Control never gets here */
3075
3076 /* Match a single character type repeatedly; several different opcodes
3077 share code. This is very similar to the code for single characters, but we
3078 repeat it in the interests of efficiency. */
3079
3080 case OP_TYPEEXACT:
3081 min = max = (ecode[1] << 8) + ecode[2];
3082 minimize = TRUE;
3083 ecode += 3;
3084 goto REPEATTYPE;
3085
3086 case OP_TYPEUPTO:
3087 case OP_TYPEMINUPTO:
3088 min = 0;
3089 max = (ecode[1] << 8) + ecode[2];
3090 minimize = *ecode == OP_TYPEMINUPTO;
3091 ecode += 3;
3092 goto REPEATTYPE;
3093
3094 case OP_TYPESTAR:
3095 case OP_TYPEMINSTAR:
3096 case OP_TYPEPLUS:
3097 case OP_TYPEMINPLUS:
3098 case OP_TYPEQUERY:
3099 case OP_TYPEMINQUERY:
3100 c = *ecode++ - OP_TYPESTAR;
3101 minimize = (c & 1) != 0;
3102 min = rep_min[c]; /* Pick up values from tables; */
3103 max = rep_max[c]; /* zero for max => infinity */
3104 if (max == 0) max = INT_MAX;
3105
3106 /* Common code for all repeated single character type matches */
3107
3108 REPEATTYPE:
3109 ctype = *ecode++; /* Code for the character type */
3110
3111 /* First, ensure the minimum number of matches are present. Use inline
3112 code for maximizing the speed, and do the type test once at the start
3113 (i.e. keep it out of the loop). Also test that there are at least the
3114 minimum number of characters before we start. */
3115
3116 if (min > md->end_subject - eptr) return FALSE;
3117 if (min > 0) switch(ctype)
3118 {
3119 case OP_ANY:
3120 if (!md->dotall)
3121 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }
3122 else eptr += min;
3123 break;
3124
3125 case OP_NOT_DIGIT:
3126 for (i = 1; i <= min; i++)
3127 if ((pcre_ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
3128 break;
3129
3130 case OP_DIGIT:
3131 for (i = 1; i <= min; i++)
3132 if ((pcre_ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
3133 break;
3134
3135 case OP_NOT_WHITESPACE:
3136 for (i = 1; i <= min; i++)
3137 if ((pcre_ctypes[*eptr++] & ctype_space) != 0) return FALSE;
3138 break;
3139
3140 case OP_WHITESPACE:
3141 for (i = 1; i <= min; i++)
3142 if ((pcre_ctypes[*eptr++] & ctype_space) == 0) return FALSE;
3143 break;
3144
3145 case OP_NOT_WORDCHAR:
3146 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) != 0)
3147 return FALSE;
3148 break;
3149
3150 case OP_WORDCHAR:
3151 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) == 0)
3152 return FALSE;
3153 break;
3154 }
3155
3156 /* If min = max, continue at the same level without recursing */
3157
3158 if (min == max) continue;
3159
3160 /* If minimizing, we have to test the rest of the pattern before each
3161 subsequent match, so inlining isn't much help; just use the function. */
3162
3163 if (minimize)
3164 {
3165 for (i = min;; i++)
3166 {
3167 if (match(eptr, ecode, offset_top, md)) return TRUE;
3168 if (i >= max || eptr >= md->end_subject ||
3169 !match_type(ctype, *eptr++, md->dotall))
3170 return FALSE;
3171 }
3172 /* Control never gets here */
3173 }
3174
3175 /* If maximizing it is worth using inline code for speed, doing the type
3176 test once at the start (i.e. keep it out of the loop). */
3177
3178 else
3179 {
3180 const uschar *pp = eptr;
3181 switch(ctype)
3182 {
3183 case OP_ANY:
3184 if (!md->dotall)
3185 {
3186 for (i = min; i < max; i++)
3187 {
3188 if (eptr >= md->end_subject || *eptr == '\n') break;
3189 eptr++;
3190 }
3191 }
3192 else
3193 {
3194 c = max - min;
3195 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
3196 eptr += c;
3197 }
3198 break;
3199
3200 case OP_NOT_DIGIT:
3201 for (i = min; i < max; i++)
3202 {
3203 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) != 0)
3204 break;
3205 eptr++;
3206 }
3207 break;
3208
3209 case OP_DIGIT:
3210 for (i = min; i < max; i++)
3211 {
3212 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) == 0)
3213 break;
3214 eptr++;
3215 }
3216 break;
3217
3218 case OP_NOT_WHITESPACE:
3219 for (i = min; i < max; i++)
3220 {
3221 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) != 0)
3222 break;
3223 eptr++;
3224 }
3225 break;
3226
3227 case OP_WHITESPACE:
3228 for (i = min; i < max; i++)
3229 {
3230 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) == 0)
3231 break;
3232 eptr++;
3233 }
3234 break;
3235
3236 case OP_NOT_WORDCHAR:
3237 for (i = min; i < max; i++)
3238 {
3239 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) != 0)
3240 break;
3241 eptr++;
3242 }
3243 break;
3244
3245 case OP_WORDCHAR:
3246 for (i = min; i < max; i++)
3247 {
3248 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) == 0)
3249 break;
3250 eptr++;
3251 }
3252 break;
3253 }
3254
3255 while (eptr >= pp)
3256 if (match(eptr--, ecode, offset_top, md)) return TRUE;
3257 return FALSE;
3258 }
3259 /* Control never gets here */
3260
3261 /* There's been some horrible disaster. */
3262
3263 default:
3264 #ifdef DEBUG
3265 printf("Unknown opcode %d\n", *ecode);
3266 #endif
3267 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
3268 return FALSE;
3269 }
3270
3271 /* Do not stick any code in here without much thought; it is assumed
3272 that "continue" in the code above comes out to here to repeat the main
3273 loop. */
3274
3275 } /* End of main loop */
3276 /* Control never reaches here */
3277 }
3278
3279
3280
3281 /*************************************************
3282 * Execute a Regular Expression *
3283 *************************************************/
3284
3285 /* This function applies a compiled re to a subject string and picks out
3286 portions of the string if it matches. Two elements in the vector are set for
3287 each substring: the offsets to the start and end of the substring.
3288
3289 Arguments:
3290 external_re points to the compiled expression
3291 external_extra points to "hints" from pcre_study() or is NULL
3292 subject points to the subject string
3293 length length of subject string (may contain binary zeros)
3294 options option bits
3295 offsets points to a vector of ints to be filled in with offsets
3296 offsetcount the number of elements in the vector
3297
3298 Returns: > 0 => success; value is the number of elements filled in
3299 = 0 => success, but offsets is not big enough
3300 -1 => failed to match
3301 < -1 => some kind of unexpected problem
3302 */
3303
3304 int
3305 pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
3306 const char *subject, int length, int options, int *offsets, int offsetcount)
3307 {
3308 int resetcount;
3309 int ocount = offsetcount;
3310 int first_char = -1;
3311 match_data match_block;
3312 const uschar *start_bits = NULL;
3313 const uschar *start_match = (uschar *)subject;
3314 const uschar *end_subject;
3315 const real_pcre *re = (const real_pcre *)external_re;
3316 const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
3317 BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
3318 BOOL startline = (re->options & PCRE_STARTLINE) != 0;
3319
3320 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3321
3322 if (re == NULL || subject == NULL ||
3323 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3324 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
3325
3326 match_block.start_subject = (const uschar *)subject;
3327 match_block.end_subject = match_block.start_subject + length;
3328 end_subject = match_block.end_subject;
3329
3330 match_block.caseless = ((re->options | options) & PCRE_CASELESS) != 0;
3331 match_block.runtime_caseless = match_block.caseless &&
3332 (re->options & PCRE_CASELESS) == 0;
3333
3334 match_block.multiline = ((re->options | options) & PCRE_MULTILINE) != 0;
3335 match_block.dotall = ((re->options | options) & PCRE_DOTALL) != 0;
3336 match_block.endonly = ((re->options | options) & PCRE_DOLLAR_ENDONLY) != 0;
3337
3338 match_block.notbol = (options & PCRE_NOTBOL) != 0;
3339 match_block.noteol = (options & PCRE_NOTEOL) != 0;
3340
3341 match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
3342
3343 /* If the expression has got more back references than the offsets supplied can
3344 hold, we get a temporary bit of working store to use during the matching.
3345 Otherwise, we can use the vector supplied, rounding down the size of it to a
3346 multiple of 2. */
3347
3348 ocount &= (-2);
3349 if (re->top_backref > 0 && re->top_backref + 1 >= ocount/2)
3350 {
3351 ocount = re->top_backref * 2 + 2;
3352 match_block.offset_vector = (pcre_malloc)(ocount * sizeof(int));
3353 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
3354 #ifdef DEBUG
3355 printf("Got memory to hold back references\n");
3356 #endif
3357 }
3358 else match_block.offset_vector = offsets;
3359
3360 match_block.offset_end = ocount;
3361 match_block.offset_overflow = FALSE;
3362
3363 /* Compute the minimum number of offsets that we need to reset each time. Doing
3364 this makes a huge difference to execution time when there aren't many brackets
3365 in the pattern. */
3366
3367 resetcount = 2 + re->top_bracket * 2;
3368 if (resetcount > offsetcount) resetcount = ocount;
3369
3370 /* If MULTILINE is set at exec time but was not set at compile time, and the
3371 anchored flag is set, we must re-check because a setting provoked by ^ in the
3372 pattern is not right in multi-line mode. Calling is_anchored() again here does
3373 the right check, because multiline is now set. If it now yields FALSE, the
3374 expression must have had ^ starting some of its branches. Check to see if
3375 that is true for *all* branches, and if so, set the startline flag. */
3376
3377 if (match_block. multiline && anchored && (re->options & PCRE_MULTILINE) == 0 &&
3378 !is_anchored(re->code, match_block.multiline))
3379 {
3380 anchored = FALSE;
3381 if (is_startline(re->code)) startline = TRUE;
3382 }
3383
3384 /* Set up the first character to match, if available. The first_char value is
3385 never set for an anchored regular expression, but the anchoring may be forced
3386 at run time, so we have to test for anchoring. The first char may be unset for
3387 an unanchored pattern, of course. If there's no first char and the pattern was
3388 studied, the may be a bitmap of possible first characters. However, we can
3389 use this only if the caseless state of the studying was correct. */
3390
3391 if (!anchored)
3392 {
3393 if ((re->options & PCRE_FIRSTSET) != 0)
3394 {
3395 first_char = re->first_char;
3396 if (match_block.caseless) first_char = pcre_lcc[first_char];
3397 }
3398 else
3399 if (!startline && extra != NULL &&
3400 (extra->options & PCRE_STUDY_MAPPED) != 0 &&
3401 ((extra->options & PCRE_STUDY_CASELESS) != 0) == match_block.caseless)
3402 start_bits = extra->start_bits;
3403 }
3404
3405 /* Loop for unanchored matches; for anchored regexps the loop runs just once. */
3406
3407 do
3408 {
3409 register int *iptr = match_block.offset_vector;
3410 register int *iend = iptr + resetcount;
3411
3412 /* Reset the maximum number of extractions we might see. */
3413
3414 while (iptr < iend) *iptr++ = -1;
3415
3416 /* Advance to a unique first char if possible */
3417
3418 if (first_char >= 0)
3419 {
3420 if (match_block.caseless)
3421 while (start_match < end_subject && pcre_lcc[*start_match] != first_char)
3422 start_match++;
3423 else
3424 while (start_match < end_subject && *start_match != first_char)
3425 start_match++;
3426 }
3427
3428 /* Or to just after \n for a multiline match if possible */
3429
3430 else if (startline)
3431 {
3432 if (start_match > match_block.start_subject)
3433 {
3434 while (start_match < end_subject && start_match[-1] != '\n')
3435 start_match++;
3436 }
3437 }
3438
3439 /* Or to a non-unique first char */
3440
3441 else if (start_bits != NULL)
3442 {
3443 while (start_match < end_subject)
3444 {
3445 register int c = *start_match;
3446 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
3447 }
3448 }
3449
3450 #ifdef DEBUG
3451 printf(">>>> Match against: ");
3452 pchars(start_match, end_subject - start_match, TRUE, &match_block);
3453 printf("\n");
3454 #endif
3455
3456 /* When a match occurs, substrings will be set for all internal extractions;
3457 we just need to set up the whole thing as substring 0 before returning. If
3458 there were too many extractions, set the return code to zero. In the case
3459 where we had to get some local store to hold offsets for backreferences, copy
3460 those back references that we can. In this case there need not be overflow
3461 if certain parts of the pattern were not used.
3462
3463 Before starting the match, we have to set up a longjmp() target to enable
3464 the "cut" operation to fail a match completely without backtracking. */
3465
3466 if (setjmp(match_block.fail_env) == 0 &&
3467 match(start_match, re->code, 2, &match_block))
3468 {
3469 int rc;
3470
3471 if (ocount != offsetcount)
3472 {
3473 if (offsetcount >= 4)
3474 {
3475 memcpy(offsets + 2, match_block.offset_vector + 2,
3476 (offsetcount - 2) * sizeof(int));
3477 #ifdef DEBUG
3478 printf("Copied offsets; freeing temporary memory\n");
3479 #endif
3480 }
3481 if (match_block.end_offset_top > offsetcount)
3482 match_block.offset_overflow = TRUE;
3483
3484 #ifdef DEBUG
3485 printf("Freeing temporary memory\n");
3486 #endif
3487
3488 (pcre_free)(match_block.offset_vector);
3489 }
3490
3491 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
3492
3493 if (match_block.offset_end < 2) rc = 0; else
3494 {
3495 offsets[0] = start_match - match_block.start_subject;
3496 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
3497 }
3498
3499 #ifdef DEBUG
3500 printf(">>>> returning %d\n", rc);
3501 #endif
3502 return rc;
3503 }
3504 }
3505 while (!anchored &&
3506 match_block.errorcode == PCRE_ERROR_NOMATCH &&
3507 start_match++ < end_subject);
3508
3509 #ifdef DEBUG
3510 printf(">>>> returning %d\n", match_block.errorcode);
3511 #endif
3512
3513 return match_block.errorcode;
3514 }
3515
3516 /* End of pcre.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12