/[pcre]/code/trunk/pcre.c
ViewVC logotype

Contents of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 3 - (show annotations) (download)
Sat Feb 24 21:38:01 2007 UTC (7 years, 8 months ago) by nigel
File MIME type: text/plain
File size: 98918 byte(s)
Load pcre-1.00 into code/trunk.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /*
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
9
10 Written by: Philip Hazel <ph10@cam.ac.uk>
11
12 Copyright (c) 1997 University of Cambridge
13
14 -----------------------------------------------------------------------------
15 Permission is granted to anyone to use this software for any purpose on any
16 computer system, and to redistribute it freely, subject to the following
17 restrictions:
18
19 1. This software is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22
23 2. The origin of this software must not be misrepresented, either by
24 explicit claim or by omission.
25
26 3. Altered versions must be plainly marked as such, and must not be
27 misrepresented as being the original software.
28 -----------------------------------------------------------------------------
29 */
30
31
32 /* Define DEBUG to get debugging output on stdout. */
33
34 /* #define DEBUG */
35
36
37 /* Include the internals header, which itself includes Standard C headers plus
38 the external pcre header. */
39
40 #include "internal.h"
41
42
43 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
44
45 static char rep_min[] = { 0, 0, 1, 1, 0, 0 };
46 static char rep_max[] = { 0, 0, 0, 0, 1, 1 };
47
48 /* Text forms of OP_ values and things, for debugging */
49
50 #ifdef DEBUG
51 static char *OP_names[] = { "End", "\\A", "\\B", "\\b", "\\D", "\\d",
52 "\\S", "\\s", "\\W", "\\w", "Cut", "\\Z", "^", "$", "Any", "chars",
53 "not",
54 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
55 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
56 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
57 "*", "*?", "+", "+?", "?", "??", "{", "{",
58 "class", "Ref",
59 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",
60 "Brazero", "Braminzero", "Bra"
61 };
62 #endif
63
64 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
65 are simple data values; negative values are for special things like \d and so
66 on. Zero means further processing is needed (for things like \x), or the escape
67 is invalid. */
68
69 static short int escapes[] = {
70 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
71 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
72 '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
73 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
74 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
75 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
76 '`', 7, -ESC_b, 0, -ESC_d, 27, '\f', 0, /* ` - g */
77 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
78 0, 0, '\r', -ESC_s, '\t', 0, 0, -ESC_w, /* p - w */
79 0, 0, 0 /* x - z */
80 };
81
82 /* Definition to allow mutual recursion */
83
84 static BOOL compile_regex(int, int *,uschar **,uschar **,char **);
85
86 /* Structure for passing "static" information around between the functions
87 doing the matching, so that they are thread-safe. */
88
89 typedef struct match_data {
90 int errorcode; /* As it says */
91 int *offset_vector; /* Offset vector */
92 int offset_end; /* One past the end */
93 BOOL offset_overflow; /* Set if too many extractions */
94 BOOL caseless; /* Case-independent flag */
95 BOOL runtime_caseless; /* Caseless forced at run time */
96 BOOL multiline; /* Multiline flag */
97 BOOL notbol; /* NOTBOL flag */
98 BOOL noteol; /* NOTEOL flag */
99 BOOL dotall; /* Dot matches any char */
100 BOOL endonly; /* Dollar not before final \n */
101 uschar *start_subject; /* Start of the subject string */
102 uschar *end_subject; /* End of the subject string */
103 jmp_buf fail_env; /* Environment for longjump() break out */
104 uschar *end_match_ptr; /* Subject position at end match */
105 int end_offset_top; /* Highwater mark at end of match */
106 } match_data;
107
108
109
110 /*************************************************
111 * Global variables *
112 *************************************************/
113
114 /* PCRE is thread-clean and doesn't use any global variables in the normal
115 sense. However, it calls memory allocation and free functions via the two
116 indirections below, which are can be changed by the caller, but are shared
117 between all threads. */
118
119 void *(*pcre_malloc)(size_t) = malloc;
120 void (*pcre_free)(void *) = free;
121
122
123
124
125 /*************************************************
126 * Return version string *
127 *************************************************/
128
129 char *
130 pcre_version(void)
131 {
132 return PCRE_VERSION;
133 }
134
135
136
137
138 /*************************************************
139 * Return info about a compiled pattern *
140 *************************************************/
141
142 /* This function picks potentially useful data out of the private
143 structure.
144
145 Arguments:
146 external_re points to compiled code
147 optptr where to pass back the options
148 first_char where to pass back the first character,
149 or -1 if multiline and all branches start ^,
150 or -2 otherwise
151
152 Returns: number of identifying extraction brackets
153 or negative values on error
154 */
155
156 int
157 pcre_info(const pcre *external_re, int *optptr, int *first_char)
158 {
159 real_pcre *re = (real_pcre *)external_re;
160 if (re == NULL) return PCRE_ERROR_NULL;
161 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
162 if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
163 if (first_char != NULL)
164 *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
165 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
166 return re->top_bracket;
167 }
168
169
170
171
172 #ifdef DEBUG
173 /*************************************************
174 * Debugging function to print chars *
175 *************************************************/
176
177 /* Print a sequence of chars in printable format, stopping at the end of the
178 subject if the requested.
179
180 Arguments:
181 p points to characters
182 length number to print
183 is_subject TRUE if printing from within md->start_subject
184 md pointer to matching data block, if is_subject is TRUE
185
186 Returns: nothing
187 */
188
189 static pchars(uschar *p, int length, BOOL is_subject, match_data *md)
190 {
191 int c;
192 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
193 while (length-- > 0)
194 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
195 }
196 #endif
197
198
199
200
201 /*************************************************
202 * Check subpattern for empty operand *
203 *************************************************/
204
205 /* This function checks a bracketed subpattern to see if any of the paths
206 through it could match an empty string. This is used to diagnose an error if
207 such a subpattern is followed by a quantifier with an unlimited upper bound.
208
209 Argument:
210 code points to the opening bracket
211
212 Returns: TRUE or FALSE
213 */
214
215 static BOOL
216 could_be_empty(uschar *code)
217 {
218 do {
219 uschar *cc = code + 3;
220
221 /* Scan along the opcodes for this branch; as soon as we find something
222 that matches a non-empty string, break out and advance to test the next
223 branch. If we get to the end of the branch, return TRUE for the whole
224 sub-expression. */
225
226 for (;;)
227 {
228 /* Test an embedded subpattern; if it could not be empty, break the
229 loop. Otherwise carry on in the branch. */
230
231 if ((int)(*cc) >= OP_BRA)
232 {
233 if (!could_be_empty(cc)) break;
234 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
235 cc += 3;
236 }
237
238 else switch (*cc)
239 {
240 /* Reached end of a branch: the subpattern may match the empty string */
241
242 case OP_ALT:
243 case OP_KET:
244 case OP_KETRMAX:
245 case OP_KETRMIN:
246 return TRUE;
247
248 /* Skip over assertive subpatterns */
249
250 case OP_ASSERT:
251 case OP_ASSERT_NOT:
252 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
253 cc += 3;
254 break;
255
256 /* Skip over things that don't match chars */
257
258 case OP_SOD:
259 case OP_EOD:
260 case OP_CIRC:
261 case OP_DOLL:
262 case OP_BRAZERO:
263 case OP_BRAMINZERO:
264 case OP_NOT_WORD_BOUNDARY:
265 case OP_WORD_BOUNDARY:
266 cc++;
267 break;
268
269 /* Skip over simple repeats with zero lower bound */
270
271 case OP_STAR:
272 case OP_MINSTAR:
273 case OP_QUERY:
274 case OP_MINQUERY:
275 case OP_TYPESTAR:
276 case OP_TYPEMINSTAR:
277 case OP_TYPEQUERY:
278 case OP_TYPEMINQUERY:
279 cc += 2;
280 break;
281
282 /* Skip over UPTOs (lower bound is zero) */
283
284 case OP_UPTO:
285 case OP_MINUPTO:
286 case OP_TYPEUPTO:
287 case OP_TYPEMINUPTO:
288 cc += 4;
289 break;
290
291 /* Check a class or a back reference for a zero minimum */
292
293 case OP_CLASS:
294 case OP_REF:
295 cc += (*cc == OP_REF)? 2 : 4 + 2 * cc[2] + cc[3];
296
297 switch (*cc)
298 {
299 case OP_CRSTAR:
300 case OP_CRMINSTAR:
301 case OP_CRQUERY:
302 case OP_CRMINQUERY:
303 cc++;
304 break;
305
306 case OP_CRRANGE:
307 case OP_CRMINRANGE:
308 if ((cc[1] << 8) + cc[2] != 0) goto NEXT_BRANCH;
309 cc += 3;
310 break;
311
312 default:
313 goto NEXT_BRANCH;
314 }
315 break;
316
317 /* Anything else matches at least one character */
318
319 default:
320 goto NEXT_BRANCH;
321 }
322 }
323
324 NEXT_BRANCH:
325 code += (code[1] << 8) + code[2];
326 }
327 while (*code == OP_ALT);
328
329 /* No branches match the empty string */
330
331 return FALSE;
332 }
333
334
335
336 /*************************************************
337 * Handle escapes *
338 *************************************************/
339
340 /* This function is called when a \ has been encountered. It either returns a
341 positive value for a simple escape such as \n, or a negative value which
342 encodes one of the more complicated things such as \d. On entry, ptr is
343 pointing at the \. On exit, it is on the final character of the escape
344 sequence.
345
346 Arguments:
347 ptrptr points to the pattern position pointer
348 errorptr points to the pointer to the error message
349 bracount number of previous extracting brackets
350 options the options bits
351 isclass TRUE if inside a character class
352
353 Returns: zero or positive => a data character
354 negative => a special escape sequence
355 on error, errorptr is set
356 */
357
358 static int
359 check_escape(uschar **ptrptr, char **errorptr, int bracount, int options,
360 BOOL isclass)
361 {
362 uschar *ptr = *ptrptr;
363 int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
364 int i;
365
366 if (c == 0) *errorptr = ERR1;
367
368 /* Digits or letters may have special meaning; all others are literals. */
369
370 else if (c < '0' || c > 'z') {}
371
372 /* Do an initial lookup in a table. A non-zero result is something that can be
373 returned immediately. Otherwise further processing may be required. */
374
375 else if ((i = escapes[c - '0']) != 0) c = i;
376
377 /* Escapes that need further processing, or are illegal. */
378
379 else
380 {
381 uschar *oldptr;
382 switch (c)
383 {
384 /* The handling of escape sequences consisting of a string of digits
385 starting with one that is not zero is not straightforward. By experiment,
386 the way Perl works seems to be as follows:
387
388 Outside a character class, the digits are read as a decimal number. If the
389 number is less than 10, or if there are that many previous extracting
390 left brackets, then it is a back reference. Otherwise, up to three octal
391 digits are read to form an escaped byte. Thus \123 is likely to be octal
392 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
393 value is greater than 377, the least significant 8 bits are taken. Inside a
394 character class, \ followed by a digit is always an octal number. */
395
396 case '1': case '2': case '3': case '4': case '5':
397 case '6': case '7': case '8': case '9':
398
399 if (!isclass)
400 {
401 oldptr = ptr;
402 c -= '0';
403 while ((pcre_ctypes[ptr[1]] & ctype_digit) != 0)
404 c = c * 10 + *(++ptr) - '0';
405 if (c < 10 || c <= bracount)
406 {
407 c = -(ESC_REF + c);
408 break;
409 }
410 ptr = oldptr; /* Put the pointer back and fall through */
411 }
412
413 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
414 generates a binary zero byte and treats the digit as a following literal.
415 Thus we have to pull back the pointer by one. */
416
417 if ((c = *ptr) >= '8')
418 {
419 ptr--;
420 c = 0;
421 break;
422 }
423
424 /* \0 always starts an octal number, but we may drop through to here with a
425 larger first octal digit */
426
427 case '0':
428 c -= '0';
429 while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 &&
430 ptr[1] != '8' && ptr[1] != '9')
431 c = c * 8 + *(++ptr) - '0';
432 break;
433
434 /* Special escapes not starting with a digit are straightforward */
435
436 case 'x':
437 c = 0;
438 while (i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0)
439 {
440 ptr++;
441 c = c * 16 + pcre_lcc[*ptr] -
442 (((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
443 }
444 break;
445
446 case 'c':
447 c = *(++ptr);
448 if (c == 0)
449 {
450 *errorptr = ERR2;
451 return 0;
452 }
453
454 /* A letter is upper-cased; then the 0x40 bit is flipped */
455
456 if (c >= 'a' && c <= 'z') c = pcre_fcc[c];
457 c ^= 0x40;
458 break;
459
460 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
461 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
462 for Perl compatibility, it is a literal. */
463
464 default:
465 if ((options & PCRE_EXTRA) != 0) switch(c)
466 {
467 case 'X':
468 c = -ESC_X; /* This could be a lookup if it ever got into Perl */
469 break;
470
471 default:
472 *errorptr = ERR3;
473 break;
474 }
475 break;
476 }
477 }
478
479 *ptrptr = ptr;
480 return c;
481 }
482
483
484
485 /*************************************************
486 * Check for counted repeat *
487 *************************************************/
488
489 /* This function is called when a '{' is encountered in a place where it might
490 start a quantifier. It looks ahead to see if it really is a quantifier or not.
491 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
492 where the ddds are digits.
493
494 Arguments:
495 p pointer to the first char after '{'
496
497 Returns: TRUE or FALSE
498 */
499
500 static BOOL
501 is_counted_repeat(uschar *p)
502 {
503 if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
504 while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
505 if (*p == '}') return TRUE;
506
507 if (*p++ != ',') return FALSE;
508 if (*p == '}') return TRUE;
509
510 if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
511 while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
512 return (*p == '}');
513 }
514
515
516
517 /*************************************************
518 * Read repeat counts *
519 *************************************************/
520
521 /* Read an item of the form {n,m} and return the values. This is called only
522 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
523 so the syntax is guaranteed to be correct, but we need to check the values.
524
525 Arguments:
526 p pointer to first char after '{'
527 minp pointer to int for min
528 maxp pointer to int for max
529 returned as -1 if no max
530 errorptr points to pointer to error message
531
532 Returns: pointer to '}' on success;
533 current ptr on error, with errorptr set
534 */
535
536 static uschar *
537 read_repeat_counts(uschar *p, int *minp, int *maxp, char **errorptr)
538 {
539 int min = 0;
540 int max = -1;
541
542 while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
543
544 if (*p == '}') max = min; else
545 {
546 if (*(++p) != '}')
547 {
548 max = 0;
549 while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
550 if (max < min)
551 {
552 *errorptr = ERR4;
553 return p;
554 }
555 }
556 }
557
558 /* Do paranoid checks, then fill in the required variables, and pass back the
559 pointer to the terminating '}'. */
560
561 if (min > 65535 || max > 65535)
562 *errorptr = ERR5;
563 else
564 {
565 *minp = min;
566 *maxp = max;
567 }
568 return p;
569 }
570
571
572
573 /*************************************************
574 * Compile one branch *
575 *************************************************/
576
577 /* Scan the pattern, compiling it into the code vector.
578
579 Arguments:
580 options the option bits
581 bracket points to number of brackets used
582 code points to the pointer to the current code point
583 ptrptr points to the current pattern pointer
584 errorptr points to pointer to error message
585
586 Returns: TRUE on success
587 FALSE, with *errorptr set on error
588 */
589
590 static BOOL
591 compile_branch(int options, int *brackets, uschar **codeptr, uschar **ptrptr,
592 char **errorptr)
593 {
594 int repeat_type, op_type;
595 int repeat_min, repeat_max;
596 int bravalue, length;
597 register int c;
598 register uschar *code = *codeptr;
599 uschar *ptr = *ptrptr;
600 uschar *previous = NULL;
601 uschar *oldptr;
602 uschar class[32];
603
604 /* Switch on next character until the end of the branch */
605
606 for (;; ptr++)
607 {
608 BOOL negate_class;
609 int class_charcount;
610 int class_lastchar;
611
612 c = *ptr;
613 if ((options & PCRE_EXTENDED) != 0)
614 {
615 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
616 if (c == '#')
617 {
618 while ((c = *(++ptr)) != 0 && c != '\n');
619 continue;
620 }
621 }
622
623 switch(c)
624 {
625 /* The branch terminates at end of string, |, or ). */
626
627 case 0:
628 case '|':
629 case ')':
630 *codeptr = code;
631 *ptrptr = ptr;
632 return TRUE;
633
634 /* Handle single-character metacharacters */
635
636 case '^':
637 previous = NULL;
638 *code++ = OP_CIRC;
639 break;
640
641 case '$':
642 previous = NULL;
643 *code++ = OP_DOLL;
644 break;
645
646 case '.':
647 previous = code;
648 *code++ = OP_ANY;
649 break;
650
651 /* Character classes. These always build a 32-byte bitmap of the permitted
652 characters, except in the special case where there is only one character.
653 For negated classes, we build the map as usual, then invert it at the end.
654 */
655
656 case '[':
657 previous = code;
658 *code++ = OP_CLASS;
659
660 /* If the first character is '^', set the negation flag */
661
662 if ((c = *(++ptr)) == '^')
663 {
664 negate_class = TRUE;
665 c = *(++ptr);
666 }
667 else negate_class = FALSE;
668
669 /* Keep a count of chars so that we can optimize the case of just a single
670 character. */
671
672 class_charcount = 0;
673 class_lastchar = -1;
674
675 /* Initialize the 32-char bit map to all zeros. We have to build the
676 map in a temporary bit of store, in case the class contains only 1
677 character, because in that case the compiled code doesn't use the
678 bit map. */
679
680 memset(class, 0, 32 * sizeof(uschar));
681
682 /* Process characters until ] is reached. By writing this as a "do" it
683 means that an initial ] is taken as a data character. */
684
685 do
686 {
687 if (c == 0)
688 {
689 *errorptr = ERR6;
690 goto FAILED;
691 }
692
693 /* Backslash may introduce a single character, or it may introduce one
694 of the specials, which just set a flag. Escaped items are checked for
695 validity in the pre-compiling pass. The sequence \b is a special case.
696 Inside a class (and only there) it is treated as backslash. Elsewhere
697 it marks a word boundary. Other escapes have preset maps ready to
698 or into the one we are building. We assume they have more than one
699 character in them, so set class_count bigger than one. */
700
701 if (c == '\\')
702 {
703 c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
704 if (-c == ESC_b) c = '\b';
705 else if (c < 0)
706 {
707 class_charcount = 10;
708 switch (-c)
709 {
710 case ESC_d:
711 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit];
712 continue;
713
714 case ESC_D:
715 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit];
716 continue;
717
718 case ESC_w:
719 for (c = 0; c < 32; c++)
720 class[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
721 continue;
722
723 case ESC_W:
724 for (c = 0; c < 32; c++)
725 class[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
726 continue;
727
728 case ESC_s:
729 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space];
730 continue;
731
732 case ESC_S:
733 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space];
734 continue;
735
736 default:
737 *errorptr = ERR7;
738 goto FAILED;
739 }
740 }
741 /* Fall through if single character */
742 }
743
744 /* A single character may be followed by '-' to form a range. However,
745 Perl does not permit ']' to be the end of the range. A '-' character
746 here is treated as a literal. */
747
748 if (ptr[1] == '-' && ptr[2] != ']')
749 {
750 int d;
751 ptr += 2;
752 d = *ptr;
753
754 if (d == 0)
755 {
756 *errorptr = ERR6;
757 goto FAILED;
758 }
759
760 /* The second part of a range can be a single-character escape, but
761 not any of the other escapes. */
762
763 if (d == '\\')
764 {
765 d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
766 if (d < 0)
767 {
768 if (d == -ESC_b) d = '\b'; else
769 {
770 *errorptr = ERR7;
771 goto FAILED;
772 }
773 }
774 }
775
776 if (d < c)
777 {
778 *errorptr = ERR8;
779 goto FAILED;
780 }
781
782 for (; c <= d; c++)
783 {
784 class[c/8] |= (1 << (c&7));
785 if ((options & PCRE_CASELESS) != 0)
786 {
787 int uc = pcre_fcc[c]; /* flip case */
788 class[uc/8] |= (1 << (uc&7));
789 }
790 class_charcount++; /* in case a one-char range */
791 class_lastchar = c;
792 }
793 continue; /* Go get the next char in the class */
794 }
795
796 /* Handle a lone single character - we can get here for a normal
797 non-escape char, or after \ that introduces a single character. */
798
799 class [c/8] |= (1 << (c&7));
800 if ((options & PCRE_CASELESS) != 0)
801 {
802 c = pcre_fcc[c]; /* flip case */
803 class[c/8] |= (1 << (c&7));
804 }
805 class_charcount++;
806 class_lastchar = c;
807 }
808
809 /* Loop until ']' reached; the check for end of string happens inside the
810 loop. This "while" is the end of the "do" above. */
811
812 while ((c = *(++ptr)) != ']');
813
814 /* If class_charcount is 1 and class_lastchar is not negative, we saw
815 precisely one character. This doesn't need the whole 32-byte bit map.
816 We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
817 it's negative. */
818
819 if (class_charcount == 1 && class_lastchar >= 0)
820 {
821 if (negate_class)
822 {
823 code[-1] = OP_NOT;
824 }
825 else
826 {
827 code[-1] = OP_CHARS;
828 *code++ = 1;
829 }
830 *code++ = class_lastchar;
831 }
832
833 /* Otherwise, negate the 32-byte map if necessary, and copy it into
834 the code vector. */
835
836 else
837 {
838 if (negate_class)
839 for (c = 0; c < 32; c++) code[c] = ~class[c];
840 else
841 memcpy(code, class, 32);
842 code += 32;
843 }
844 break;
845
846 /* Various kinds of repeat */
847
848 case '{':
849 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
850 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
851 if (*errorptr != NULL) goto FAILED;
852 goto REPEAT;
853
854 case '*':
855 repeat_min = 0;
856 repeat_max = -1;
857 goto REPEAT;
858
859 case '+':
860 repeat_min = 1;
861 repeat_max = -1;
862 goto REPEAT;
863
864 case '?':
865 repeat_min = 0;
866 repeat_max = 1;
867
868 REPEAT:
869 if (previous == NULL)
870 {
871 *errorptr = ERR9;
872 goto FAILED;
873 }
874
875 /* If the next character is '?' this is a minimizing repeat. Advance to the
876 next character. */
877
878 if (ptr[1] == '?') { repeat_type = 1; ptr++; } else repeat_type = 0;
879
880 /* If the maximum is zero then the minimum must also be zero; Perl allows
881 this case, so we do too - by simply omitting the item altogether. */
882
883 if (repeat_max == 0) code = previous;
884
885 /* If previous was a string of characters, chop off the last one and use it
886 as the subject of the repeat. If there was only one character, we can
887 abolish the previous item altogether. */
888
889 else if (*previous == OP_CHARS)
890 {
891 int len = previous[1];
892 if (len == 1)
893 {
894 c = previous[2];
895 code = previous;
896 }
897 else
898 {
899 c = previous[len+1];
900 previous[1]--;
901 code--;
902 }
903 op_type = 0; /* Use single-char op codes */
904 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
905 }
906
907 /* If previous was a single negated character ([^a] or similar), we use
908 one of the special opcodes, replacing it. The code is shared with single-
909 character repeats by adding a suitable offset into repeat_type. */
910
911 else if ((int)*previous == OP_NOT)
912 {
913 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
914 c = previous[1];
915 code = previous;
916 goto OUTPUT_SINGLE_REPEAT;
917 }
918
919 /* If previous was a character type match (\d or similar), abolish it and
920 create a suitable repeat item. The code is shared with single-character
921 repeats by adding a suitable offset into repeat_type. */
922
923 else if ((int)*previous < OP_EOD || *previous == OP_ANY)
924 {
925 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
926 c = *previous;
927 code = previous;
928
929 OUTPUT_SINGLE_REPEAT:
930 repeat_type += op_type; /* Combine both values for many cases */
931
932 /* A minimum of zero is handled either as the special case * or ?, or as
933 an UPTO, with the maximum given. */
934
935 if (repeat_min == 0)
936 {
937 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
938 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
939 else
940 {
941 *code++ = OP_UPTO + repeat_type;
942 *code++ = repeat_max >> 8;
943 *code++ = (repeat_max & 255);
944 }
945 }
946
947 /* The case {1,} is handled as the special case + */
948
949 else if (repeat_min == 1 && repeat_max == -1)
950 *code++ = OP_PLUS + repeat_type;
951
952 /* The case {n,n} is just an EXACT, while the general case {n,m} is
953 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
954
955 else
956 {
957 if (repeat_min != 1)
958 {
959 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
960 *code++ = repeat_min >> 8;
961 *code++ = (repeat_min & 255);
962 }
963
964 /* If the mininum is 1 and the previous item was a character string,
965 we either have to put back the item that got cancelled if the string
966 length was 1, or add the character back onto the end of a longer
967 string. For a character type nothing need be done; it will just get put
968 back naturally. */
969
970 else if (*previous == OP_CHARS)
971 {
972 if (code == previous) code += 2; else previous[1]++;
973 }
974
975 /* Insert an UPTO if the max is greater than the min. */
976
977 if (repeat_max != repeat_min)
978 {
979 *code++ = c;
980 repeat_max -= repeat_min;
981 *code++ = OP_UPTO + repeat_type;
982 *code++ = repeat_max >> 8;
983 *code++ = (repeat_max & 255);
984 }
985 }
986
987 /* The character or character type itself comes last in all cases. */
988
989 *code++ = c;
990 }
991
992 /* If previous was a character class or a back reference, we put the repeat
993 stuff after it. */
994
995 else if (*previous == OP_CLASS || *previous == OP_REF)
996 {
997 if (repeat_min == 0 && repeat_max == -1)
998 *code++ = OP_CRSTAR + repeat_type;
999 else if (repeat_min == 1 && repeat_max == -1)
1000 *code++ = OP_CRPLUS + repeat_type;
1001 else if (repeat_min == 0 && repeat_max == 1)
1002 *code++ = OP_CRQUERY + repeat_type;
1003 else
1004 {
1005 *code++ = OP_CRRANGE + repeat_type;
1006 *code++ = repeat_min >> 8;
1007 *code++ = repeat_min & 255;
1008 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1009 *code++ = repeat_max >> 8;
1010 *code++ = repeat_max & 255;
1011 }
1012 }
1013
1014 /* If previous was a bracket group, we may have to replicate it in certain
1015 cases. If the maximum repeat count is unlimited, check that the bracket
1016 group cannot match the empty string, and diagnose an error if it can. */
1017
1018 else if ((int)*previous >= OP_BRA)
1019 {
1020 int i;
1021 int length = code - previous;
1022
1023 if (repeat_max == -1 && could_be_empty(previous))
1024 {
1025 *errorptr = ERR10;
1026 goto FAILED;
1027 }
1028
1029 /* If the minimum is greater than zero, and the maximum is unlimited or
1030 equal to the minimum, the first copy remains where it is, and is
1031 replicated up to the minimum number of times. This case includes the +
1032 repeat, but of course no replication is needed in that case. */
1033
1034 if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1035 {
1036 for (i = 1; i < repeat_min; i++)
1037 {
1038 memcpy(code, previous, length);
1039 code += length;
1040 }
1041 }
1042
1043 /* If the minimum is zero, stick BRAZERO in front of the first copy.
1044 Then, if there is a fixed upper limit, replicated up to that many times,
1045 sticking BRAZERO in front of all the optional ones. */
1046
1047 else
1048 {
1049 if (repeat_min == 0)
1050 {
1051 memmove(previous+1, previous, length);
1052 code++;
1053 *previous++ = OP_BRAZERO + repeat_type;
1054 }
1055
1056 for (i = 1; i < repeat_min; i++)
1057 {
1058 memcpy(code, previous, length);
1059 code += length;
1060 }
1061
1062 for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1063 {
1064 *code++ = OP_BRAZERO + repeat_type;
1065 memcpy(code, previous, length);
1066 code += length;
1067 }
1068 }
1069
1070 /* If the maximum is unlimited, set a repeater in the final copy. */
1071
1072 if (repeat_max == -1) code[-3] = OP_KETRMAX + repeat_type;
1073 }
1074
1075 /* Else there's some kind of shambles */
1076
1077 else
1078 {
1079 *errorptr = ERR11;
1080 goto FAILED;
1081 }
1082
1083 /* In all case we no longer have a previous item. */
1084
1085 previous = NULL;
1086 break;
1087
1088
1089 /* Start of nested bracket sub-expression, or comment or lookahead.
1090 First deal with special things that can come after a bracket; all are
1091 introduced by ?, and the appearance of any of them means that this is not a
1092 referencing group. They were checked for validity in the first pass over
1093 the string, so we don't have to check for syntax errors here. */
1094
1095 case '(':
1096 previous = code; /* Only real brackets can be repeated */
1097 if (*(++ptr) == '?')
1098 {
1099 bravalue = OP_BRA;
1100
1101 switch (*(++ptr))
1102 {
1103 case '#':
1104 case 'i':
1105 case 'm':
1106 case 's':
1107 case 'x':
1108 ptr++;
1109 while (*ptr != ')') ptr++;
1110 previous = NULL;
1111 continue;
1112
1113 case ':': /* Non-extracting bracket */
1114 ptr++;
1115 break;
1116
1117 case '=': /* Assertions can't be repeated */
1118 bravalue = OP_ASSERT;
1119 ptr++;
1120 previous = NULL;
1121 break;
1122
1123 case '!':
1124 bravalue = OP_ASSERT_NOT;
1125 ptr++;
1126 previous = NULL;
1127 break;
1128
1129 case '>': /* "Match once" brackets */
1130 if ((options & PCRE_EXTRA) != 0) /* Not yet standard */
1131 {
1132 bravalue = OP_ONCE;
1133 ptr++;
1134 previous = NULL;
1135 break;
1136 }
1137 /* Else fall through */
1138
1139 default:
1140 *errorptr = ERR12;
1141 goto FAILED;
1142 }
1143 }
1144
1145 /* Else we have a referencing group */
1146
1147 else
1148 {
1149 if (++(*brackets) > EXTRACT_MAX)
1150 {
1151 *errorptr = ERR13;
1152 goto FAILED;
1153 }
1154 bravalue = OP_BRA + *brackets;
1155 }
1156
1157 /* Process nested bracketed re; at end pointer is on the bracket. We copy
1158 code into a non-register variable in order to be able to pass its address
1159 because some compilers complain otherwise. */
1160
1161 *code = bravalue;
1162 {
1163 uschar *mcode = code;
1164 if (!compile_regex(options, brackets, &mcode, &ptr, errorptr))
1165 goto FAILED;
1166 code = mcode;
1167 }
1168
1169 if (*ptr != ')')
1170 {
1171 *errorptr = ERR14;
1172 goto FAILED;
1173 }
1174 break;
1175
1176 /* Check \ for being a real metacharacter; if not, fall through and handle
1177 it as a data character at the start of a string. Escape items are checked
1178 for validity in the pre-compiling pass. */
1179
1180 case '\\':
1181 oldptr = ptr;
1182 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
1183
1184 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1185 are arranged to be the negation of the corresponding OP_values. For the
1186 back references, the values are ESC_REF plus the reference number. Only
1187 back references and those types that consume a character may be repeated.
1188 We can test for values between ESC_b and ESC_Z for the latter; this may
1189 have to change if any new ones are ever created. */
1190
1191 if (c < 0)
1192 {
1193 if (-c >= ESC_REF)
1194 {
1195 int refnum = -c - ESC_REF;
1196 if (*brackets < refnum)
1197 {
1198 *errorptr = ERR15;
1199 goto FAILED;
1200 }
1201 previous = code;
1202 *code++ = OP_REF;
1203 *code++ = refnum;
1204 }
1205 else
1206 {
1207 previous = (-c > ESC_b && -c < ESC_X)? code : NULL;
1208 *code++ = -c;
1209 }
1210 continue;
1211 }
1212
1213 /* Reset and fall through */
1214
1215 ptr = oldptr;
1216 c = '\\';
1217
1218 /* Handle a run of data characters until a metacharacter is encountered.
1219 The first character is guaranteed not to be whitespace or # when the
1220 extended flag is set. */
1221
1222 NORMAL_CHAR:
1223 default:
1224 previous = code;
1225 *code = OP_CHARS;
1226 code += 2;
1227 length = 0;
1228
1229 do
1230 {
1231 if ((options & PCRE_EXTENDED) != 0)
1232 {
1233 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1234 if (c == '#')
1235 {
1236 while ((c = *(++ptr)) != 0 && c != '\n');
1237 if (c == 0) break;
1238 continue;
1239 }
1240 }
1241
1242 /* Backslash may introduce a data char or a metacharacter. Escaped items
1243 are checked for validity in the pre-compiling pass. Stop the string
1244 before a metaitem. */
1245
1246 if (c == '\\')
1247 {
1248 oldptr = ptr;
1249 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
1250 if (c < 0) { ptr = oldptr; break; }
1251 }
1252
1253 /* Ordinary character or single-char escape */
1254
1255 *code++ = c;
1256 length++;
1257 }
1258
1259 /* This "while" is the end of the "do" above. */
1260
1261 while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
1262
1263 /* Compute the length and set it in the data vector, and advance to
1264 the next state. */
1265
1266 previous[1] = length;
1267 ptr--;
1268 break;
1269 }
1270 } /* end of big loop */
1271
1272 /* Control never reaches here by falling through, only by a goto for all the
1273 error states. Pass back the position in the pattern so that it can be displayed
1274 to the user for diagnosing the error. */
1275
1276 FAILED:
1277 *ptrptr = ptr;
1278 return FALSE;
1279 }
1280
1281
1282
1283
1284 /*************************************************
1285 * Compile sequence of alternatives *
1286 *************************************************/
1287
1288 /* On entry, ptr is pointing past the bracket character, but on return
1289 it points to the closing bracket, or vertical bar, or end of string.
1290 The code variable is pointing at the byte into which the BRA operator has been
1291 stored.
1292
1293 Argument:
1294 options the option bits
1295 brackets -> int containing the number of extracting brackets used
1296 codeptr -> the address of the current code pointer
1297 ptrptr -> the address of the current pattern pointer
1298 errorptr -> pointer to error message
1299
1300 Returns: TRUE on success
1301 */
1302
1303 static BOOL
1304 compile_regex(int options, int *brackets, uschar **codeptr, uschar **ptrptr,
1305 char **errorptr)
1306 {
1307 uschar *ptr = *ptrptr;
1308 uschar *code = *codeptr;
1309 uschar *start_bracket = code;
1310
1311 for (;;)
1312 {
1313 int length;
1314 uschar *last_branch = code;
1315
1316 code += 3;
1317 if (!compile_branch(options, brackets, &code, &ptr, errorptr))
1318 {
1319 *ptrptr = ptr;
1320 return FALSE;
1321 }
1322
1323 /* Fill in the length of the last branch */
1324
1325 length = code - last_branch;
1326 last_branch[1] = length >> 8;
1327 last_branch[2] = length & 255;
1328
1329 /* Reached end of expression, either ')' or end of pattern. Insert a
1330 terminating ket and the length of the whole bracketed item, and return,
1331 leaving the pointer at the terminating char. */
1332
1333 if (*ptr != '|')
1334 {
1335 length = code - start_bracket;
1336 *code++ = OP_KET;
1337 *code++ = length >> 8;
1338 *code++ = length & 255;
1339 *codeptr = code;
1340 *ptrptr = ptr;
1341 return TRUE;
1342 }
1343
1344 /* Another branch follows; insert an "or" node and advance the pointer. */
1345
1346 *code = OP_ALT;
1347 ptr++;
1348 }
1349 /* Control never reaches here */
1350 }
1351
1352
1353
1354 /*************************************************
1355 * Check for anchored expression *
1356 *************************************************/
1357
1358 /* Try to find out if this is an anchored regular expression. Consider each
1359 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
1360 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
1361 it's anchored. However, if this is a multiline pattern, then only OP_SOD
1362 counts, since OP_CIRC can match in the middle.
1363
1364 A branch is also implicitly anchored if it starts with .* because that will try
1365 the rest of the pattern at all possible matching points, so there is no point
1366 trying them again.
1367
1368 Argument: points to start of expression (the bracket)
1369 Returns: TRUE or FALSE
1370 */
1371
1372 static BOOL
1373 is_anchored(register uschar *code, BOOL multiline)
1374 {
1375 do {
1376 int op = (int)code[3];
1377 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE)
1378 { if (!is_anchored(code+3, multiline)) return FALSE; }
1379 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
1380 { if (code[4] != OP_ANY) return FALSE; }
1381 else if (op != OP_SOD && (multiline || op != OP_CIRC)) return FALSE;
1382 code += (code[1] << 8) + code[2];
1383 }
1384 while (*code == OP_ALT);
1385 return TRUE;
1386 }
1387
1388
1389
1390 /*************************************************
1391 * Check for start with \n line expression *
1392 *************************************************/
1393
1394 /* This is called for multiline expressions to try to find out if every branch
1395 starts with ^ so that "first char" processing can be done to speed things up.
1396
1397 Argument: points to start of expression (the bracket)
1398 Returns: TRUE or FALSE
1399 */
1400
1401 static BOOL
1402 is_startline(uschar *code)
1403 {
1404 do {
1405 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
1406 { if (!is_startline(code+3)) return FALSE; }
1407 else if (code[3] != OP_CIRC) return FALSE;
1408 code += (code[1] << 8) + code[2];
1409 }
1410 while (*code == OP_ALT);
1411 return TRUE;
1412 }
1413
1414
1415
1416 /*************************************************
1417 * Check for fixed first char *
1418 *************************************************/
1419
1420 /* Try to find out if there is a fixed first character. This is called for
1421 unanchored expressions, as it speeds up their processing quite considerably.
1422 Consider each alternative branch. If they all start with the same char, or with
1423 a bracket all of whose alternatives start with the same char (recurse ad lib),
1424 then we return that char, otherwise -1.
1425
1426 Argument: points to start of expression (the bracket)
1427 Returns: -1 or the fixed first char
1428 */
1429
1430 static int
1431 find_firstchar(uschar *code)
1432 {
1433 register int c = -1;
1434 do
1435 {
1436 register int charoffset = 4;
1437
1438 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
1439 {
1440 register int d;
1441 if ((d = find_firstchar(code+3)) < 0) return -1;
1442 if (c < 0) c = d; else if (c != d) return -1;
1443 }
1444
1445 else switch(code[3])
1446 {
1447 default:
1448 return -1;
1449
1450 case OP_EXACT: /* Fall through */
1451 charoffset++;
1452
1453 case OP_CHARS: /* Fall through */
1454 charoffset++;
1455
1456 case OP_PLUS:
1457 case OP_MINPLUS:
1458 if (c < 0) c = code[charoffset]; else if (c != code[charoffset]) return -1;
1459 break;
1460 }
1461 code += (code[1] << 8) + code[2];
1462 }
1463 while (*code == OP_ALT);
1464 return c;
1465 }
1466
1467
1468
1469 /*************************************************
1470 * Compile a Regular Expression *
1471 *************************************************/
1472
1473 /* This function takes a string and returns a pointer to a block of store
1474 holding a compiled version of the expression.
1475
1476 Arguments:
1477 pattern the regular expression
1478 options various option bits
1479 errorptr pointer to pointer to error text
1480 erroroffset ptr offset in pattern where error was detected
1481
1482 Returns: pointer to compiled data block, or NULL on error,
1483 with errorptr and erroroffset set
1484 */
1485
1486 pcre *
1487 pcre_compile(const char *pattern, int options, char **errorptr,
1488 int *erroroffset)
1489 {
1490 real_pcre *re;
1491 int spaces = 0;
1492 int length = 3; /* For initial BRA plus length */
1493 int runlength;
1494 int c, size;
1495 int bracount = 0;
1496 int brastack[200];
1497 int brastackptr = 0;
1498 int top_backref = 0;
1499 uschar *code, *ptr;
1500
1501 #ifdef DEBUG
1502 uschar *code_base, *code_end;
1503 #endif
1504
1505 /* We can't pass back an error message if errorptr is NULL; I guess the best we
1506 can do is just return NULL. */
1507
1508 if (errorptr == NULL) return NULL;
1509 *errorptr = NULL;
1510
1511 /* However, we can give a message for this error */
1512
1513 if (erroroffset == NULL)
1514 {
1515 *errorptr = ERR16;
1516 return NULL;
1517 }
1518 *erroroffset = 0;
1519
1520 if ((options & ~PUBLIC_OPTIONS) != 0)
1521 {
1522 *errorptr = ERR17;
1523 return NULL;
1524 }
1525
1526 #ifdef DEBUG
1527 printf("------------------------------------------------------------------\n");
1528 printf("%s\n", pattern);
1529 #endif
1530
1531 /* The first thing to do is to make a pass over the pattern to compute the
1532 amount of store required to hold the compiled code. This does not have to be
1533 perfect as long as errors are overestimates. At the same time we can detect any
1534 internal flag settings. Make an attempt to correct for any counted white space
1535 if an "extended" flag setting appears late in the pattern. We can't be so
1536 clever for #-comments. */
1537
1538 ptr = (uschar *)(pattern - 1);
1539 while ((c = *(++ptr)) != 0)
1540 {
1541 int min, max;
1542 int class_charcount;
1543
1544 if ((pcre_ctypes[c] & ctype_space) != 0)
1545 {
1546 if ((options & PCRE_EXTENDED) != 0) continue;
1547 spaces++;
1548 }
1549
1550 if (c == '#' && (options & PCRE_EXTENDED) != 0)
1551 {
1552 while ((c = *(++ptr)) != 0 && c != '\n');
1553 continue;
1554 }
1555
1556 switch(c)
1557 {
1558 /* A backslashed item may be an escaped "normal" character or a
1559 character type. For a "normal" character, put the pointers and
1560 character back so that tests for whitespace etc. in the input
1561 are done correctly. */
1562
1563 case '\\':
1564 {
1565 uschar *save_ptr = ptr;
1566 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
1567 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1568 if (c >= 0)
1569 {
1570 ptr = save_ptr;
1571 c = '\\';
1572 goto NORMAL_CHAR;
1573 }
1574 }
1575 length++;
1576
1577 /* A back reference needs an additional char, plus either one or 5
1578 bytes for a repeat. We also need to keep the value of the highest
1579 back reference. */
1580
1581 if (c <= -ESC_REF)
1582 {
1583 int refnum = -c - ESC_REF;
1584 if (refnum > top_backref) top_backref = refnum;
1585 length++; /* For single back reference */
1586 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
1587 {
1588 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
1589 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1590 if ((min == 0 && (max == 1 || max == -1)) ||
1591 (min == 1 && max == -1))
1592 length++;
1593 else length += 5;
1594 if (ptr[1] == '?') ptr++;
1595 }
1596 }
1597 continue;
1598
1599 case '^':
1600 case '.':
1601 case '$':
1602 case '*': /* These repeats won't be after brackets; */
1603 case '+': /* those are handled separately */
1604 case '?':
1605 length++;
1606 continue;
1607
1608 /* This covers the cases of repeats after a single char, metachar, class,
1609 or back reference. */
1610
1611 case '{':
1612 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
1613 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
1614 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1615 if ((min == 0 && (max == 1 || max == -1)) ||
1616 (min == 1 && max == -1))
1617 length++;
1618 else
1619 {
1620 length--; /* Uncount the original char or metachar */
1621 if (min == 1) length++; else if (min > 0) length += 4;
1622 if (max > 0) length += 4; else length += 2;
1623 }
1624 if (ptr[1] == '?') ptr++;
1625 continue;
1626
1627 /* An alternation contains an offset to the next branch or ket. */
1628 case '|':
1629 length += 3;
1630 continue;
1631
1632 /* A character class uses 33 characters. Don't worry about character types
1633 that aren't allowed in classes - they'll get picked up during the compile.
1634 A character class that contains only one character uses 2 or 3 bytes,
1635 depending on whether it is negated or not. Notice this where we can. */
1636
1637 case '[':
1638 class_charcount = 0;
1639 if (*(++ptr) == '^') ptr++;
1640 do
1641 {
1642 if (*ptr == '\\')
1643 {
1644 int c = check_escape(&ptr, errorptr, bracount, options, TRUE);
1645 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1646 if (-c == ESC_b) class_charcount++; else class_charcount = 10;
1647 }
1648 else class_charcount++;
1649 ptr++;
1650 }
1651 while (*ptr != 0 && *ptr != ']');
1652
1653 /* Repeats for negated single chars are handled by the general code */
1654
1655 if (class_charcount == 1) length += 3; else
1656 {
1657 length += 33;
1658
1659 /* A repeat needs either 1 or 5 bytes. */
1660
1661 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
1662 {
1663 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
1664 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1665 if ((min == 0 && (max == 1 || max == -1)) ||
1666 (min == 1 && max == -1))
1667 length++;
1668 else length += 5;
1669 if (ptr[1] == '?') ptr++;
1670 }
1671 }
1672 continue;
1673
1674 /* Brackets may be genuine groups or special things */
1675
1676 case '(':
1677
1678 /* Handle special forms of bracket, which all start (? */
1679
1680 if (ptr[1] == '?') switch (c = ptr[2])
1681 {
1682 /* Skip over comments entirely */
1683 case '#':
1684 ptr += 3;
1685 while (*ptr != 0 && *ptr != ')') ptr++;
1686 if (*ptr == 0)
1687 {
1688 *errorptr = ERR18;
1689 goto PCRE_ERROR_RETURN;
1690 }
1691 continue;
1692
1693 /* Non-referencing groups and lookaheads just move the pointer on, and
1694 then behave like a non-special bracket, except that they don't increment
1695 the count of extracting brackets. */
1696
1697 case ':':
1698 case '=':
1699 case '!':
1700 ptr += 2;
1701 break;
1702
1703 /* Ditto for the "once only" bracket, allowed only if the extra bit
1704 is set. */
1705
1706 case '>':
1707 if ((options & PCRE_EXTRA) != 0)
1708 {
1709 ptr += 2;
1710 break;
1711 }
1712 /* Else fall thourh */
1713
1714 /* Else loop setting valid options until ) is met. Anything else is an
1715 error. */
1716
1717 default:
1718 ptr += 2;
1719 for (;; ptr++)
1720 {
1721 if ((c = *ptr) == 'i')
1722 {
1723 options |= PCRE_CASELESS;
1724 continue;
1725 }
1726 else if ((c = *ptr) == 'm')
1727 {
1728 options |= PCRE_MULTILINE;
1729 continue;
1730 }
1731 else if (c == 's')
1732 {
1733 options |= PCRE_DOTALL;
1734 continue;
1735 }
1736 else if (c == 'x')
1737 {
1738 options |= PCRE_EXTENDED;
1739 length -= spaces; /* Already counted spaces */
1740 continue;
1741 }
1742 else if (c == ')') break;
1743
1744 *errorptr = ERR12;
1745 goto PCRE_ERROR_RETURN;
1746 }
1747 continue; /* End of this bracket handling */
1748 }
1749
1750 /* Extracting brackets must be counted so we can process escapes in a
1751 Perlish way. */
1752
1753 else bracount++;
1754
1755 /* Non-special forms of bracket. Save length for computing whole length
1756 at end if there's a repeat that requires duplication of the group. */
1757
1758 if (brastackptr >= sizeof(brastack)/sizeof(int))
1759 {
1760 *errorptr = ERR19;
1761 goto PCRE_ERROR_RETURN;
1762 }
1763
1764 brastack[brastackptr++] = length;
1765 length += 3;
1766 continue;
1767
1768 /* Handle ket. Look for subsequent max/min; for certain sets of values we
1769 have to replicate this bracket up to that many times. */
1770
1771 case ')':
1772 length += 3;
1773 {
1774 int min = 1;
1775 int max = 1;
1776 int duplength = length - brastack[--brastackptr];
1777
1778 /* Leave ptr at the final char; for read_repeat_counts this happens
1779 automatically; for the others we need an increment. */
1780
1781 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
1782 {
1783 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
1784 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1785 }
1786 else if (c == '*') { min = 0; max = -1; ptr++; }
1787 else if (c == '+') { max = -1; ptr++; }
1788 else if (c == '?') { min = 0; ptr++; }
1789
1790 /* If there is a minimum > 1 we have to replicate up to min-1 times; if
1791 there is a limited maximum we have to replicate up to max-1 times and
1792 allow for a BRAZERO item before each optional copy, as we also have to
1793 do before the first copy if the minimum is zero. */
1794
1795 if (min == 0) length++;
1796 else if (min > 1) length += (min - 1) * duplength;
1797 if (max > min) length += (max - min) * (duplength + 1);
1798 }
1799
1800 continue;
1801
1802 /* Non-special character. For a run of such characters the length required
1803 is the number of characters + 2, except that the maximum run length is 255.
1804 We won't get a skipped space or a non-data escape or the start of a #
1805 comment as the first character, so the length can't be zero. */
1806
1807 NORMAL_CHAR:
1808 default:
1809 length += 2;
1810 runlength = 0;
1811 do
1812 {
1813 if ((pcre_ctypes[c] & ctype_space) != 0)
1814 {
1815 if ((options & PCRE_EXTENDED) != 0) continue;
1816 spaces++;
1817 }
1818
1819 if (c == '#' && (options & PCRE_EXTENDED) != 0)
1820 {
1821 while ((c = *(++ptr)) != 0 && c != '\n');
1822 continue;
1823 }
1824
1825 /* Backslash may introduce a data char or a metacharacter; stop the
1826 string before the latter. */
1827
1828 if (c == '\\')
1829 {
1830 uschar *saveptr = ptr;
1831 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
1832 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1833 if (c < 0) { ptr = saveptr; break; }
1834 }
1835
1836 /* Ordinary character or single-char escape */
1837
1838 runlength++;
1839 }
1840
1841 /* This "while" is the end of the "do" above. */
1842
1843 while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
1844
1845 ptr--;
1846 length += runlength;
1847 continue;
1848 }
1849 }
1850
1851 length += 4; /* For final KET and END */
1852
1853 if (length > 65539)
1854 {
1855 *errorptr = ERR20;
1856 return NULL;
1857 }
1858
1859 /* Compute the size of data block needed and get it, either from malloc or
1860 externally provided function. Put in the magic number and the options. */
1861
1862 size = length + offsetof(real_pcre, code);
1863 re = (real_pcre *)(pcre_malloc)(size);
1864
1865 if (re == NULL)
1866 {
1867 *errorptr = ERR21;
1868 return NULL;
1869 }
1870
1871 re->magic_number = MAGIC_NUMBER;
1872 re->options = options;
1873
1874 /* Set up a starting, non-extracting bracket, then compile the expression. On
1875 error, *errorptr will be set non-NULL, so we don't need to look at the result
1876 of the function here. */
1877
1878 ptr = (uschar *)pattern;
1879 code = re->code;
1880 *code = OP_BRA;
1881 bracount = 0;
1882 (void)compile_regex(options, &bracount, &code, &ptr, errorptr);
1883 re->top_bracket = bracount;
1884 re->top_backref = top_backref;
1885
1886 /* If not reached end of pattern on success, there's an excess bracket. */
1887
1888 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
1889
1890 /* Fill in the terminating state and check for disastrous overflow, but
1891 if debugging, leave the test till after things are printed out. */
1892
1893 *code++ = OP_END;
1894
1895 #ifndef DEBUG
1896 if (code - re->code > length) *errorptr = ERR23;
1897 #endif
1898
1899 /* Failed to compile */
1900
1901 if (*errorptr != NULL)
1902 {
1903 (pcre_free)(re);
1904 PCRE_ERROR_RETURN:
1905 *erroroffset = ptr - (uschar *)pattern;
1906 return NULL;
1907 }
1908
1909 /* If the anchored option was not passed, set flag if we can determine that it
1910 is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
1911 we can determine what the first character has to be, because that speeds up
1912 unanchored matches no end. In the case of multiline matches, an alternative is
1913 to set the PCRE_STARTLINE flag if all branches start with ^. */
1914
1915 if ((options & PCRE_ANCHORED) == 0)
1916 {
1917 if (is_anchored(re->code, (options & PCRE_MULTILINE) != 0))
1918 re->options |= PCRE_ANCHORED;
1919 else
1920 {
1921 int c = find_firstchar(re->code);
1922 if (c >= 0)
1923 {
1924 re->first_char = c;
1925 re->options |= PCRE_FIRSTSET;
1926 }
1927 else if (is_startline(re->code))
1928 re->options |= PCRE_STARTLINE;
1929 }
1930 }
1931
1932 /* Print out the compiled data for debugging */
1933
1934 #ifdef DEBUG
1935
1936 printf("Length = %d top_bracket = %d top_backref=%d\n",
1937 length, re->top_bracket, re->top_backref);
1938
1939 if (re->options != 0)
1940 {
1941 printf("%s%s%s%s%s%s%s\n",
1942 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
1943 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
1944 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
1945 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
1946 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
1947 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
1948 ((re->options & PCRE_EXTRA) != 0)? "extra " : "");
1949 }
1950
1951 if ((re->options & PCRE_FIRSTSET) != 0)
1952 {
1953 if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
1954 else printf("First char = \\x%02x\n", re->first_char);
1955 }
1956
1957 code_end = code;
1958 code_base = code = re->code;
1959
1960 while (code < code_end)
1961 {
1962 int charlength;
1963
1964 printf("%3d ", code - code_base);
1965
1966 if (*code >= OP_BRA)
1967 {
1968 printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
1969 code += 2;
1970 }
1971
1972 else switch(*code)
1973 {
1974 case OP_CHARS:
1975 charlength = *(++code);
1976 printf("%3d ", charlength);
1977 while (charlength-- > 0)
1978 if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
1979 break;
1980
1981 case OP_KETRMAX:
1982 case OP_KETRMIN:
1983 case OP_ALT:
1984 case OP_KET:
1985 case OP_ASSERT:
1986 case OP_ASSERT_NOT:
1987 case OP_ONCE:
1988 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
1989 code += 2;
1990 break;
1991
1992 case OP_STAR:
1993 case OP_MINSTAR:
1994 case OP_PLUS:
1995 case OP_MINPLUS:
1996 case OP_QUERY:
1997 case OP_MINQUERY:
1998 case OP_TYPESTAR:
1999 case OP_TYPEMINSTAR:
2000 case OP_TYPEPLUS:
2001 case OP_TYPEMINPLUS:
2002 case OP_TYPEQUERY:
2003 case OP_TYPEMINQUERY:
2004 if (*code >= OP_TYPESTAR)
2005 printf(" %s", OP_names[code[1]]);
2006 else if (isprint(c = code[1])) printf(" %c", c);
2007 else printf(" \\x%02x", c);
2008 printf("%s", OP_names[*code++]);
2009 break;
2010
2011 case OP_EXACT:
2012 case OP_UPTO:
2013 case OP_MINUPTO:
2014 if (isprint(c = code[3])) printf(" %c{", c);
2015 else printf(" \\x%02x{", c);
2016 if (*code != OP_EXACT) printf(",");
2017 printf("%d}", (code[1] << 8) + code[2]);
2018 if (*code == OP_MINUPTO) printf("?");
2019 code += 3;
2020 break;
2021
2022 case OP_TYPEEXACT:
2023 case OP_TYPEUPTO:
2024 case OP_TYPEMINUPTO:
2025 printf(" %s{", OP_names[code[3]]);
2026 if (*code != OP_TYPEEXACT) printf(",");
2027 printf("%d}", (code[1] << 8) + code[2]);
2028 if (*code == OP_TYPEMINUPTO) printf("?");
2029 code += 3;
2030 break;
2031
2032 case OP_NOT:
2033 if (isprint(c = *(++code))) printf(" [^%c]", c);
2034 else printf(" [^\\x%02x]", c);
2035 break;
2036
2037 case OP_NOTSTAR:
2038 case OP_NOTMINSTAR:
2039 case OP_NOTPLUS:
2040 case OP_NOTMINPLUS:
2041 case OP_NOTQUERY:
2042 case OP_NOTMINQUERY:
2043 if (isprint(c = code[1])) printf(" [^%c]", c);
2044 else printf(" [^\\x%02x]", c);
2045 printf("%s", OP_names[*code++]);
2046 break;
2047
2048 case OP_NOTEXACT:
2049 case OP_NOTUPTO:
2050 case OP_NOTMINUPTO:
2051 if (isprint(c = code[3])) printf(" [^%c]{", c);
2052 else printf(" [^\\x%02x]{", c);
2053 if (*code != OP_NOTEXACT) printf(",");
2054 printf("%d}", (code[1] << 8) + code[2]);
2055 if (*code == OP_NOTMINUPTO) printf("?");
2056 code += 3;
2057 break;
2058
2059 case OP_REF:
2060 printf(" \\%d", *(++code));
2061 break;
2062
2063 case OP_CLASS:
2064 {
2065 int i, min, max;
2066
2067 code++;
2068 printf(" [");
2069
2070 for (i = 0; i < 256; i++)
2071 {
2072 if ((code[i/8] & (1 << (i&7))) != 0)
2073 {
2074 int j;
2075 for (j = i+1; j < 256; j++)
2076 if ((code[j/8] & (1 << (j&7))) == 0) break;
2077 if (i == '-' || i == ']') printf("\\");
2078 if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2079 if (--j > i)
2080 {
2081 printf("-");
2082 if (j == '-' || j == ']') printf("\\");
2083 if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2084 }
2085 i = j;
2086 }
2087 }
2088 printf("]");
2089 code += 32;
2090
2091 switch(*code)
2092 {
2093 case OP_CRSTAR:
2094 case OP_CRMINSTAR:
2095 case OP_CRPLUS:
2096 case OP_CRMINPLUS:
2097 case OP_CRQUERY:
2098 case OP_CRMINQUERY:
2099 printf("%s", OP_names[*code]);
2100 break;
2101
2102 case OP_CRRANGE:
2103 case OP_CRMINRANGE:
2104 min = (code[1] << 8) + code[2];
2105 max = (code[3] << 8) + code[4];
2106 if (max == 0) printf("{%d,}", min);
2107 else printf("{%d,%d}", min, max);
2108 if (*code == OP_CRMINRANGE) printf("?");
2109 code += 4;
2110 break;
2111
2112 default:
2113 code--;
2114 }
2115 }
2116 break;
2117
2118 /* Anything else is just a one-node item */
2119
2120 default:
2121 printf(" %s", OP_names[*code]);
2122 break;
2123 }
2124
2125 code++;
2126 printf("\n");
2127 }
2128 printf("------------------------------------------------------------------\n");
2129
2130 /* This check is done here in the debugging case so that the code that
2131 was compiled can be seen. */
2132
2133 if (code - re->code > length)
2134 {
2135 *errorptr = ERR23;
2136 (pcre_free)(re);
2137 *erroroffset = ptr - (uschar *)pattern;
2138 return NULL;
2139 }
2140 #endif
2141
2142 return (pcre *)re;
2143 }
2144
2145
2146
2147 /*************************************************
2148 * Match a character type *
2149 *************************************************/
2150
2151 /* Not used in all the places it might be as it's sometimes faster
2152 to put the code inline.
2153
2154 Arguments:
2155 type the character type
2156 c the character
2157 dotall the dotall flag
2158
2159 Returns: TRUE if character is of the type
2160 */
2161
2162 static BOOL
2163 match_type(int type, int c, BOOL dotall)
2164 {
2165
2166 #ifdef DEBUG
2167 if (isprint(c)) printf("matching subject %c against ", c);
2168 else printf("matching subject \\x%02x against ", c);
2169 printf("%s\n", OP_names[type]);
2170 #endif
2171
2172 switch(type)
2173 {
2174 case OP_ANY: return dotall || c != '\n';
2175 case OP_NOT_DIGIT: return (pcre_ctypes[c] & ctype_digit) == 0;
2176 case OP_DIGIT: return (pcre_ctypes[c] & ctype_digit) != 0;
2177 case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;
2178 case OP_WHITESPACE: return (pcre_ctypes[c] & ctype_space) != 0;
2179 case OP_NOT_WORDCHAR: return (pcre_ctypes[c] & ctype_word) == 0;
2180 case OP_WORDCHAR: return (pcre_ctypes[c] & ctype_word) != 0;
2181 }
2182 return FALSE;
2183 }
2184
2185
2186
2187 /*************************************************
2188 * Match a back-reference *
2189 *************************************************/
2190
2191 /* If a back reference hasn't been set, the match fails.
2192
2193 Arguments:
2194 number reference number
2195 eptr points into the subject
2196 length length to be matched
2197 md points to match data block
2198
2199 Returns: TRUE if matched
2200 */
2201
2202 static BOOL
2203 match_ref(int number, register uschar *eptr, int length, match_data *md)
2204 {
2205 uschar *p = md->start_subject + md->offset_vector[number];
2206
2207 #ifdef DEBUG
2208 if (eptr >= md->end_subject)
2209 printf("matching subject <null>");
2210 else
2211 {
2212 printf("matching subject ");
2213 pchars(eptr, length, TRUE, md);
2214 }
2215 printf(" against backref ");
2216 pchars(p, length, FALSE, md);
2217 printf("\n");
2218 #endif
2219
2220 /* Always fail if not enough characters left */
2221
2222 if (length > md->end_subject - p) return FALSE;
2223
2224 /* Separate the caselesss case for speed */
2225
2226 if (md->caseless)
2227 { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; }
2228 else
2229 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
2230
2231 return TRUE;
2232 }
2233
2234
2235
2236 /*************************************************
2237 * Match from current position *
2238 *************************************************/
2239
2240 /* On entry ecode points to the first opcode, and eptr to the first character.
2241
2242 Arguments:
2243 eptr pointer in subject
2244 ecode position in code
2245 offset_top current top pointer
2246 md pointer to "static" info for the match
2247
2248 Returns: TRUE if matched
2249 */
2250
2251 static BOOL
2252 match(register uschar *eptr, register uschar *ecode, int offset_top,
2253 match_data *md)
2254 {
2255 for (;;)
2256 {
2257 int min, max, ctype;
2258 register int i;
2259 register int c;
2260 BOOL minimize;
2261
2262 /* Opening bracket. Check the alternative branches in turn, failing if none
2263 match. We have to set the start offset if required and there is space
2264 in the offset vector so that it is available for subsequent back references
2265 if the bracket matches. However, if the bracket fails, we must put back the
2266 previous value of both offsets in case they were set by a previous copy of
2267 the same bracket. Don't worry about setting the flag for the error case here;
2268 that is handled in the code for KET. */
2269
2270 if ((int)*ecode >= OP_BRA)
2271 {
2272 int number = (*ecode - OP_BRA) << 1;
2273 int save_offset1, save_offset2;
2274
2275 #ifdef DEBUG
2276 printf("start bracket %d\n", number/2);
2277 #endif
2278
2279 if (number > 0 && number < md->offset_end)
2280 {
2281 save_offset1 = md->offset_vector[number];
2282 save_offset2 = md->offset_vector[number+1];
2283 md->offset_vector[number] = eptr - md->start_subject;
2284
2285 #ifdef DEBUG
2286 printf("saving %d %d\n", save_offset1, save_offset2);
2287 #endif
2288 }
2289
2290 /* Recurse for all the alternatives. */
2291
2292 do
2293 {
2294 if (match(eptr, ecode+3, offset_top, md)) return TRUE;
2295 ecode += (ecode[1] << 8) + ecode[2];
2296 }
2297 while (*ecode == OP_ALT);
2298
2299 #ifdef DEBUG
2300 printf("bracket %d failed\n", number/2);
2301 #endif
2302
2303 if (number > 0 && number < md->offset_end)
2304 {
2305 md->offset_vector[number] = save_offset1;
2306 md->offset_vector[number+1] = save_offset2;
2307 }
2308
2309 return FALSE;
2310 }
2311
2312 /* Other types of node can be handled by a switch */
2313
2314 switch(*ecode)
2315 {
2316 case OP_END:
2317 md->end_match_ptr = eptr; /* Record where we ended */
2318 md->end_offset_top = offset_top; /* and how many extracts were taken */
2319 return TRUE;
2320
2321 /* The equivalent of Prolog's "cut" - if the rest doesn't match, the
2322 whole thing doesn't match, so we have to get out via a longjmp(). */
2323
2324 case OP_CUT:
2325 if (match(eptr, ecode+1, offset_top, md)) return TRUE;
2326 longjmp(md->fail_env, 1);
2327
2328 /* Assertion brackets. Check the alternative branches in turn - the
2329 matching won't pass the KET for an assertion. If any one branch matches,
2330 the assertion is true. */
2331
2332 case OP_ASSERT:
2333 do
2334 {
2335 if (match(eptr, ecode+3, offset_top, md)) break;
2336 ecode += (ecode[1] << 8) + ecode[2];
2337 }
2338 while (*ecode == OP_ALT);
2339 if (*ecode == OP_KET) return FALSE;
2340
2341 /* Continue from after the assertion, updating the offsets high water
2342 mark, since extracts may have been taken during the assertion. */
2343
2344 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2345 ecode += 3;
2346 offset_top = md->end_offset_top;
2347 continue;
2348
2349 /* Negative assertion: all branches must fail to match */
2350
2351 case OP_ASSERT_NOT:
2352 do
2353 {
2354 if (match(eptr, ecode+3, offset_top, md)) return FALSE;
2355 ecode += (ecode[1] << 8) + ecode[2];
2356 }
2357 while (*ecode == OP_ALT);
2358 ecode += 3;
2359 continue;
2360
2361 /* "Once" brackets are like assertion brackets except that after a match,
2362 the point in the subject string is not moved back. Thus there can never be
2363 a back into the brackets. Check the alternative branches in turn - the
2364 matching won't pass the KET for this kind of subpattern. If any one branch
2365 matches, we carry on, leaving the subject pointer. */
2366
2367 case OP_ONCE:
2368 do
2369 {
2370 if (match(eptr, ecode+3, offset_top, md)) break;
2371 ecode += (ecode[1] << 8) + ecode[2];
2372 }
2373 while (*ecode == OP_ALT);
2374 if (*ecode == OP_KET) return FALSE;
2375
2376 /* Continue as from after the assertion, updating the offsets high water
2377 mark, since extracts may have been taken. */
2378
2379 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2380 ecode += 3;
2381 offset_top = md->end_offset_top;
2382 eptr = md->end_match_ptr;
2383 continue;
2384
2385 /* An alternation is the end of a branch; scan along to find the end of the
2386 bracketed group and go to there. */
2387
2388 case OP_ALT:
2389 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2390 break;
2391
2392 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
2393 that it may occur zero times. It may repeat infinitely, or not at all -
2394 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
2395 repeat limits are compiled as a number of copies, with the optional ones
2396 preceded by BRAZERO or BRAMINZERO. */
2397
2398 case OP_BRAZERO:
2399 {
2400 uschar *next = ecode+1;
2401 if (match(eptr, next, offset_top, md)) return TRUE;
2402 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
2403 ecode = next + 3;
2404 }
2405 break;
2406
2407 case OP_BRAMINZERO:
2408 {
2409 uschar *next = ecode+1;
2410 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
2411 if (match(eptr, next+3, offset_top, md)) return TRUE;
2412 ecode++;
2413 }
2414 break;;
2415
2416 /* End of a group, repeated or non-repeating. If we are at the end of
2417 an assertion "group", stop matching and return TRUE, but record the
2418 current high water mark for use by positive assertions. */
2419
2420 case OP_KET:
2421 case OP_KETRMIN:
2422 case OP_KETRMAX:
2423 {
2424 int number;
2425 uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
2426
2427 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || *prev == OP_ONCE)
2428 {
2429 md->end_match_ptr = eptr; /* For ONCE */
2430 md->end_offset_top = offset_top;
2431 return TRUE;
2432 }
2433
2434 /* In all other cases we have to check the group number back at the
2435 start and if necessary complete handling an extraction by setting the
2436 final offset and bumping the high water mark. */
2437
2438 number = (*prev - OP_BRA) << 1;
2439
2440 #ifdef DEBUG
2441 printf("end bracket %d\n", number/2);
2442 #endif
2443
2444 if (number > 0)
2445 {
2446 if (number >= md->offset_end) md->offset_overflow = TRUE; else
2447 {
2448 md->offset_vector[number+1] = eptr - md->start_subject;
2449 if (offset_top <= number) offset_top = number + 2;
2450 }
2451 }
2452
2453 /* For a non-repeating ket, just advance to the next node and continue at
2454 this level. */
2455
2456 if (*ecode == OP_KET)
2457 {
2458 ecode += 3;
2459 break;
2460 }
2461
2462 /* The repeating kets try the rest of the pattern or restart from the
2463 preceding bracket, in the appropriate order. */
2464
2465 if (*ecode == OP_KETRMIN)
2466 {
2467 if (match(eptr, ecode+3, offset_top, md) ||
2468 match(eptr, prev, offset_top, md)) return TRUE;
2469 }
2470 else /* OP_KETRMAX */
2471 {
2472 if (match(eptr, prev, offset_top, md) ||
2473 match(eptr, ecode+3, offset_top, md)) return TRUE;
2474 }
2475 }
2476 return FALSE;
2477
2478 /* Start of subject unless notbol, or after internal newline if multiline */
2479
2480 case OP_CIRC:
2481 if (md->notbol && eptr == md->start_subject) return FALSE;
2482 if (md->multiline)
2483 {
2484 if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;
2485 ecode++;
2486 break;
2487 }
2488 /* ... else fall through */
2489
2490 /* Start of subject assertion */
2491
2492 case OP_SOD:
2493 if (eptr != md->start_subject) return FALSE;
2494 ecode++;
2495 break;
2496
2497 /* Assert before internal newline if multiline, or before
2498 a terminating newline unless endonly is set, else end of subject unless
2499 noteol is set. */
2500
2501 case OP_DOLL:
2502 if (md->noteol && eptr >= md->end_subject) return FALSE;
2503 if (md->multiline)
2504 {
2505 if (eptr < md->end_subject && *eptr != '\n') return FALSE;
2506 ecode++;
2507 break;
2508 }
2509 else if (!md->endonly)
2510 {
2511 if (eptr < md->end_subject - 1 ||
2512 (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
2513 ecode++;
2514 break;
2515 }
2516 /* ... else fall through */
2517
2518 /* End of subject assertion */
2519
2520 case OP_EOD:
2521 if (eptr < md->end_subject) return FALSE;
2522 ecode++;
2523 break;
2524
2525 /* Word boundary assertions */
2526
2527 case OP_NOT_WORD_BOUNDARY:
2528 case OP_WORD_BOUNDARY:
2529 {
2530 BOOL prev_is_word = (eptr != md->start_subject) &&
2531 ((pcre_ctypes[eptr[-1]] & ctype_word) != 0);
2532 BOOL cur_is_word = (eptr < md->end_subject) &&
2533 ((pcre_ctypes[*eptr] & ctype_word) != 0);
2534 if ((*ecode++ == OP_WORD_BOUNDARY)?
2535 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2536 return FALSE;
2537 }
2538 break;
2539
2540 /* Match a single character type; inline for speed */
2541
2542 case OP_ANY:
2543 if (!md->dotall && eptr < md->end_subject && *eptr == '\n') return FALSE;
2544 if (eptr++ >= md->end_subject) return FALSE;
2545 ecode++;
2546 break;
2547
2548 case OP_NOT_DIGIT:
2549 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) != 0)
2550 return FALSE;
2551 ecode++;
2552 break;
2553
2554 case OP_DIGIT:
2555 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) == 0)
2556 return FALSE;
2557 ecode++;
2558 break;
2559
2560 case OP_NOT_WHITESPACE:
2561 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) != 0)
2562 return FALSE;
2563 ecode++;
2564 break;
2565
2566 case OP_WHITESPACE:
2567 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) == 0)
2568 return FALSE;
2569 ecode++;
2570 break;
2571
2572 case OP_NOT_WORDCHAR:
2573 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) != 0)
2574 return FALSE;
2575 ecode++;
2576 break;
2577
2578 case OP_WORDCHAR:
2579 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) == 0)
2580 return FALSE;
2581 ecode++;
2582 break;
2583
2584 /* Match a back reference, possibly repeatedly. Look past the end of the
2585 item to see if there is repeat information following. The code is similar
2586 to that for character classes, but repeated for efficiency. Then obey
2587 similar code to character type repeats - written out again for speed.
2588 However, if the referenced string is the empty string, always treat
2589 it as matched, any number of times (otherwise there could be infinite
2590 loops). */
2591
2592 case OP_REF:
2593 {
2594 int length;
2595 int number = ecode[1] << 1; /* Doubled reference number */
2596 ecode += 2; /* Advance past the item */
2597
2598 if (number >= offset_top || md->offset_vector[number] < 0)
2599 {
2600 md->errorcode = PCRE_ERROR_BADREF;
2601 return FALSE;
2602 }
2603
2604 length = md->offset_vector[number+1] - md->offset_vector[number];
2605
2606 switch (*ecode)
2607 {
2608 case OP_CRSTAR:
2609 case OP_CRMINSTAR:
2610 case OP_CRPLUS:
2611 case OP_CRMINPLUS:
2612 case OP_CRQUERY:
2613 case OP_CRMINQUERY:
2614 c = *ecode++ - OP_CRSTAR;
2615 minimize = (c & 1) != 0;
2616 min = rep_min[c]; /* Pick up values from tables; */
2617 max = rep_max[c]; /* zero for max => infinity */
2618 if (max == 0) max = INT_MAX;
2619 break;
2620
2621 case OP_CRRANGE:
2622 case OP_CRMINRANGE:
2623 minimize = (*ecode == OP_CRMINRANGE);
2624 min = (ecode[1] << 8) + ecode[2];
2625 max = (ecode[3] << 8) + ecode[4];
2626 if (max == 0) max = INT_MAX;
2627 ecode += 5;
2628 break;
2629
2630 default: /* No repeat follows */
2631 if (!match_ref(number, eptr, length, md)) return FALSE;
2632 eptr += length;
2633 continue; /* With the main loop */
2634 }
2635
2636 /* If the length of the reference is zero, just continue with the
2637 main loop. */
2638
2639 if (length == 0) continue;
2640
2641 /* First, ensure the minimum number of matches are present. We get back
2642 the length of the reference string explicitly rather than passing the
2643 address of eptr, so that eptr can be a register variable. */
2644
2645 for (i = 1; i <= min; i++)
2646 {
2647 if (!match_ref(number, eptr, length, md)) return FALSE;
2648 eptr += length;
2649 }
2650
2651 /* If min = max, continue at the same level without recursion.
2652 They are not both allowed to be zero. */
2653
2654 if (min == max) continue;
2655
2656 /* If minimizing, keep trying and advancing the pointer */
2657
2658 if (minimize)
2659 {
2660 for (i = min;; i++)
2661 {
2662 if (match(eptr, ecode, offset_top, md)) return TRUE;
2663 if (i >= max || !match_ref(number, eptr, length, md))
2664 return FALSE;
2665 eptr += length;
2666 }
2667 /* Control never gets here */
2668 }
2669
2670 /* If maximizing, find the longest string and work backwards */
2671
2672 else
2673 {
2674 uschar *pp = eptr;
2675 for (i = min; i < max; i++)
2676 {
2677 if (!match_ref(number, eptr, length, md)) break;
2678 eptr += length;
2679 }
2680 while (eptr >= pp)
2681 {
2682 if (match(eptr, ecode, offset_top, md)) return TRUE;
2683 eptr -= length;
2684 }
2685 return FALSE;
2686 }
2687 }
2688 /* Control never gets here */
2689
2690 /* Match a character class, possibly repeatedly. Look past the end of the
2691 item to see if there is repeat information following. Then obey similar
2692 code to character type repeats - written out again for speed. If caseless
2693 matching was set at runtime but not at compile time, we have to check both
2694 versions of a character. */
2695
2696 case OP_CLASS:
2697 {
2698 uschar *data = ecode + 1; /* Save for matching */
2699 ecode += 33; /* Advance past the item */
2700
2701 switch (*ecode)
2702 {
2703 case OP_CRSTAR:
2704 case OP_CRMINSTAR:
2705 case OP_CRPLUS:
2706 case OP_CRMINPLUS:
2707 case OP_CRQUERY:
2708 case OP_CRMINQUERY:
2709 c = *ecode++ - OP_CRSTAR;
2710 minimize = (c & 1) != 0;
2711 min = rep_min[c]; /* Pick up values from tables; */
2712 max = rep_max[c]; /* zero for max => infinity */
2713 if (max == 0) max = INT_MAX;
2714 break;
2715
2716 case OP_CRRANGE:
2717 case OP_CRMINRANGE:
2718 minimize = (*ecode == OP_CRMINRANGE);
2719 min = (ecode[1] << 8) + ecode[2];
2720 max = (ecode[3] << 8) + ecode[4];
2721 if (max == 0) max = INT_MAX;
2722 ecode += 5;
2723 break;
2724
2725 default: /* No repeat follows */
2726 if (eptr >= md->end_subject) return FALSE;
2727 c = *eptr++;
2728 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
2729 if (md->runtime_caseless)
2730 {
2731 c = pcre_fcc[c];
2732 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
2733 }
2734 return FALSE;
2735 }
2736
2737 /* First, ensure the minimum number of matches are present. */
2738
2739 for (i = 1; i <= min; i++)
2740 {
2741 if (eptr >= md->end_subject) return FALSE;
2742 c = *eptr++;
2743 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2744 if (md->runtime_caseless)
2745 {
2746 c = pcre_fcc[c];
2747 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2748 }
2749 return FALSE;
2750 }
2751
2752 /* If max == min we can continue with the main loop without the
2753 need to recurse. */
2754
2755 if (min == max) continue;
2756
2757 /* If minimizing, keep testing the rest of the expression and advancing
2758 the pointer while it matches the class. */
2759
2760 if (minimize)
2761 {
2762 for (i = min;; i++)
2763 {
2764 if (match(eptr, ecode, offset_top, md)) return TRUE;
2765 if (i >= max || eptr >= md->end_subject) return FALSE;
2766 c = *eptr++;
2767 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2768 if (md->runtime_caseless)
2769 {
2770 c = pcre_fcc[c];
2771 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2772 }
2773 return FALSE;
2774 }
2775 /* Control never gets here */
2776 }
2777
2778 /* If maximizing, find the longest possible run, then work backwards. */
2779
2780 else
2781 {
2782 uschar *pp = eptr;
2783 for (i = min; i < max; eptr++, i++)
2784 {
2785 if (eptr >= md->end_subject) break;
2786 c = *eptr;
2787 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2788 if (md->runtime_caseless)
2789 {
2790 c = pcre_fcc[c];
2791 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2792 }
2793 break;
2794 }
2795
2796 while (eptr >= pp)
2797 if (match(eptr--, ecode, offset_top, md)) return TRUE;
2798 return FALSE;
2799 }
2800 }
2801 /* Control never gets here */
2802
2803 /* Match a run of characters */
2804
2805 case OP_CHARS:
2806 {
2807 register int length = ecode[1];
2808 ecode += 2;
2809
2810 #ifdef DEBUG
2811 if (eptr >= md->end_subject)
2812 printf("matching subject <null> against pattern ");
2813 else
2814 {
2815 printf("matching subject ");
2816 pchars(eptr, length, TRUE, md);
2817 printf(" against pattern ");
2818 }
2819 pchars(ecode, length, FALSE, md);
2820 printf("\n");
2821 #endif
2822
2823 if (length > md->end_subject - eptr) return FALSE;
2824 if (md->caseless)
2825 {
2826 while (length-- > 0) if (pcre_lcc[*ecode++] != pcre_lcc[*eptr++]) return FALSE;
2827 }
2828 else
2829 {
2830 while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
2831 }
2832 }
2833 break;
2834
2835 /* Match a single character repeatedly; different opcodes share code. */
2836
2837 case OP_EXACT:
2838 min = max = (ecode[1] << 8) + ecode[2];
2839 ecode += 3;
2840 goto REPEATCHAR;
2841
2842 case OP_UPTO:
2843 case OP_MINUPTO:
2844 min = 0;
2845 max = (ecode[1] << 8) + ecode[2];
2846 minimize = *ecode == OP_MINUPTO;
2847 ecode += 3;
2848 goto REPEATCHAR;
2849
2850 case OP_STAR:
2851 case OP_MINSTAR:
2852 case OP_PLUS:
2853 case OP_MINPLUS:
2854 case OP_QUERY:
2855 case OP_MINQUERY:
2856 c = *ecode++ - OP_STAR;
2857 minimize = (c & 1) != 0;
2858 min = rep_min[c]; /* Pick up values from tables; */
2859 max = rep_max[c]; /* zero for max => infinity */
2860 if (max == 0) max = INT_MAX;
2861
2862 /* Common code for all repeated single-character matches. We can give
2863 up quickly if there are fewer than the minimum number of characters left in
2864 the subject. */
2865
2866 REPEATCHAR:
2867 if (min > md->end_subject - eptr) return FALSE;
2868 c = *ecode++;
2869
2870 /* The code is duplicated for the caseless and caseful cases, for speed,
2871 since matching characters is likely to be quite common. First, ensure the
2872 minimum number of matches are present. If min = max, continue at the same
2873 level without recursing. Otherwise, if minimizing, keep trying the rest of
2874 the expression and advancing one matching character if failing, up to the
2875 maximum. Alternatively, if maximizing, find the maximum number of
2876 characters and work backwards. */
2877
2878 #ifdef DEBUG
2879 printf("matching %c{%d,%d} against subject %.*s\n", c, min, max,
2880 max, eptr);
2881 #endif
2882
2883 if (md->caseless)
2884 {
2885 c = pcre_lcc[c];
2886 for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) return FALSE;
2887 if (min == max) continue;
2888 if (minimize)
2889 {
2890 for (i = min;; i++)
2891 {
2892 if (match(eptr, ecode, offset_top, md)) return TRUE;
2893 if (i >= max || eptr >= md->end_subject || c != pcre_lcc[*eptr++])
2894 return FALSE;
2895 }
2896 /* Control never gets here */
2897 }
2898 else
2899 {
2900 uschar *pp = eptr;
2901 for (i = min; i < max; i++)
2902 {
2903 if (eptr >= md->end_subject || c != pcre_lcc[*eptr]) break;
2904 eptr++;
2905 }
2906 while (eptr >= pp)
2907 if (match(eptr--, ecode, offset_top, md)) return TRUE;
2908 return FALSE;
2909 }
2910 /* Control never gets here */
2911 }
2912
2913 /* Caseful comparisons */
2914
2915 else
2916 {
2917 for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
2918 if (min == max) continue;
2919 if (minimize)
2920 {
2921 for (i = min;; i++)
2922 {
2923 if (match(eptr, ecode, offset_top, md)) return TRUE;
2924 if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
2925 }
2926 /* Control never gets here */
2927 }
2928 else
2929 {
2930 uschar *pp = eptr;
2931 for (i = min; i < max; i++)
2932 {
2933 if (eptr >= md->end_subject || c != *eptr) break;
2934 eptr++;
2935 }
2936 while (eptr >= pp)
2937 if (match(eptr--, ecode, offset_top, md)) return TRUE;
2938 return FALSE;
2939 }
2940 }
2941 /* Control never gets here */
2942
2943 /* Match a negated single character */
2944
2945 case OP_NOT:
2946 if (eptr > md->end_subject) return FALSE;
2947 ecode++;
2948 if (md->caseless)
2949 {
2950 if (pcre_lcc[*ecode++] == pcre_lcc[*eptr++]) return FALSE;
2951 }
2952 else
2953 {
2954 if (*ecode++ == *eptr++) return FALSE;
2955 }
2956 break;
2957
2958 /* Match a negated single character repeatedly. This is almost a repeat of
2959 the code for a repeated single character, but I haven't found a nice way of
2960 commoning these up that doesn't require a test of the positive/negative
2961 option for each character match. Maybe that wouldn't add very much to the
2962 time taken, but character matching *is* what this is all about... */
2963
2964 case OP_NOTEXACT:
2965 min = max = (ecode[1] << 8) + ecode[2];
2966 ecode += 3;
2967 goto REPEATNOTCHAR;
2968
2969 case OP_NOTUPTO:
2970 case OP_NOTMINUPTO:
2971 min = 0;
2972 max = (ecode[1] << 8) + ecode[2];
2973 minimize = *ecode == OP_NOTMINUPTO;
2974 ecode += 3;
2975 goto REPEATNOTCHAR;
2976
2977 case OP_NOTSTAR:
2978 case OP_NOTMINSTAR:
2979 case OP_NOTPLUS:
2980 case OP_NOTMINPLUS:
2981 case OP_NOTQUERY:
2982 case OP_NOTMINQUERY:
2983 c = *ecode++ - OP_NOTSTAR;
2984 minimize = (c & 1) != 0;
2985 min = rep_min[c]; /* Pick up values from tables; */
2986 max = rep_max[c]; /* zero for max => infinity */
2987 if (max == 0) max = INT_MAX;
2988
2989 /* Common code for all repeated single-character matches. We can give
2990 up quickly if there are fewer than the minimum number of characters left in
2991 the subject. */
2992
2993 REPEATNOTCHAR:
2994 if (min > md->end_subject - eptr) return FALSE;
2995 c = *ecode++;
2996
2997 /* The code is duplicated for the caseless and caseful cases, for speed,
2998 since matching characters is likely to be quite common. First, ensure the
2999 minimum number of matches are present. If min = max, continue at the same
3000 level without recursing. Otherwise, if minimizing, keep trying the rest of
3001 the expression and advancing one matching character if failing, up to the
3002 maximum. Alternatively, if maximizing, find the maximum number of
3003 characters and work backwards. */
3004
3005 #ifdef DEBUG
3006 printf("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
3007 max, eptr);
3008 #endif
3009
3010 if (md->caseless)
3011 {
3012 c = pcre_lcc[c];
3013 for (i = 1; i <= min; i++) if (c == pcre_lcc[*eptr++]) return FALSE;
3014 if (min == max) continue;
3015 if (minimize)
3016 {
3017 for (i = min;; i++)
3018 {
3019 if (match(eptr, ecode, offset_top, md)) return TRUE;
3020 if (i >= max || eptr >= md->end_subject || c == pcre_lcc[*eptr++])
3021 return FALSE;
3022 }
3023 /* Control never gets here */
3024 }
3025 else
3026 {
3027 uschar *pp = eptr;
3028 for (i = min; i < max; i++)
3029 {
3030 if (eptr >= md->end_subject || c == pcre_lcc[*eptr]) break;
3031 eptr++;
3032 }
3033 while (eptr >= pp)
3034 if (match(eptr--, ecode, offset_top, md)) return TRUE;
3035 return FALSE;
3036 }
3037 /* Control never gets here */
3038 }
3039
3040 /* Caseful comparisons */
3041
3042 else
3043 {
3044 for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
3045 if (min == max) continue;
3046 if (minimize)
3047 {
3048 for (i = min;; i++)
3049 {
3050 if (match(eptr, ecode, offset_top, md)) return TRUE;
3051 if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
3052 }
3053 /* Control never gets here */
3054 }
3055 else
3056 {
3057 uschar *pp = eptr;
3058 for (i = min; i < max; i++)
3059 {
3060 if (eptr >= md->end_subject || c == *eptr) break;
3061 eptr++;
3062 }
3063 while (eptr >= pp)
3064 if (match(eptr--, ecode, offset_top, md)) return TRUE;
3065 return FALSE;
3066 }
3067 }
3068 /* Control never gets here */
3069
3070 /* Match a single character type repeatedly; several different opcodes
3071 share code. This is very similar to the code for single characters, but we
3072 repeat it in the interests of efficiency. */
3073
3074 case OP_TYPEEXACT:
3075 min = max = (ecode[1] << 8) + ecode[2];
3076 minimize = TRUE;
3077 ecode += 3;
3078 goto REPEATTYPE;
3079
3080 case OP_TYPEUPTO:
3081 case OP_TYPEMINUPTO:
3082 min = 0;
3083 max = (ecode[1] << 8) + ecode[2];
3084 minimize = *ecode == OP_TYPEMINUPTO;
3085 ecode += 3;
3086 goto REPEATTYPE;
3087
3088 case OP_TYPESTAR:
3089 case OP_TYPEMINSTAR:
3090 case OP_TYPEPLUS:
3091 case OP_TYPEMINPLUS:
3092 case OP_TYPEQUERY:
3093 case OP_TYPEMINQUERY:
3094 c = *ecode++ - OP_TYPESTAR;
3095 minimize = (c & 1) != 0;
3096 min = rep_min[c]; /* Pick up values from tables; */
3097 max = rep_max[c]; /* zero for max => infinity */
3098 if (max == 0) max = INT_MAX;
3099
3100 /* Common code for all repeated single character type matches */
3101
3102 REPEATTYPE:
3103 ctype = *ecode++; /* Code for the character type */
3104
3105 /* First, ensure the minimum number of matches are present. Use inline
3106 code for maximizing the speed, and do the type test once at the start
3107 (i.e. keep it out of the loop). Also test that there are at least the
3108 minimum number of characters before we start. */
3109
3110 if (min > md->end_subject - eptr) return FALSE;
3111 if (min > 0) switch(ctype)
3112 {
3113 case OP_ANY:
3114 if (!md->dotall)
3115 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }
3116 else eptr += min;
3117 break;
3118
3119 case OP_NOT_DIGIT:
3120 for (i = 1; i <= min; i++)
3121 if ((pcre_ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
3122 break;
3123
3124 case OP_DIGIT:
3125 for (i = 1; i <= min; i++)
3126 if ((pcre_ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
3127 break;
3128
3129 case OP_NOT_WHITESPACE:
3130 for (i = 1; i <= min; i++)
3131 if ((pcre_ctypes[*eptr++] & ctype_space) != 0) return FALSE;
3132 break;
3133
3134 case OP_WHITESPACE:
3135 for (i = 1; i <= min; i++)
3136 if ((pcre_ctypes[*eptr++] & ctype_space) == 0) return FALSE;
3137 break;
3138
3139 case OP_NOT_WORDCHAR:
3140 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) != 0)
3141 return FALSE;
3142 break;
3143
3144 case OP_WORDCHAR:
3145 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) == 0)
3146 return FALSE;
3147 break;
3148 }
3149
3150 /* If min = max, continue at the same level without recursing */
3151
3152 if (min == max) continue;
3153
3154 /* If minimizing, we have to test the rest of the pattern before each
3155 subsequent match, so inlining isn't much help; just use the function. */
3156
3157 if (minimize)
3158 {
3159 for (i = min;; i++)
3160 {
3161 if (match(eptr, ecode, offset_top, md)) return TRUE;
3162 if (i >= max || eptr >= md->end_subject ||
3163 !match_type(ctype, *eptr++, md->dotall))
3164 return FALSE;
3165 }
3166 /* Control never gets here */
3167 }
3168
3169 /* If maximizing it is worth using inline code for speed, doing the type
3170 test once at the start (i.e. keep it out of the loop). */
3171
3172 else
3173 {
3174 uschar *pp = eptr;
3175 switch(ctype)
3176 {
3177 case OP_ANY:
3178 if (!md->dotall)
3179 {
3180 for (i = min; i < max; i++)
3181 {
3182 if (eptr >= md->end_subject || *eptr == '\n') break;
3183 eptr++;
3184 }
3185 }
3186 else
3187 {
3188 c = max - min;
3189 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
3190 eptr += c;
3191 }
3192 break;
3193
3194 case OP_NOT_DIGIT:
3195 for (i = min; i < max; i++)
3196 {
3197 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) != 0)
3198 break;
3199 eptr++;
3200 }
3201 break;
3202
3203 case OP_DIGIT:
3204 for (i = min; i < max; i++)
3205 {
3206 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) == 0)
3207 break;
3208 eptr++;
3209 }
3210 break;
3211
3212 case OP_NOT_WHITESPACE:
3213 for (i = min; i < max; i++)
3214 {
3215 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) != 0)
3216 break;
3217 eptr++;
3218 }
3219 break;
3220
3221 case OP_WHITESPACE:
3222 for (i = min; i < max; i++)
3223 {
3224 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) == 0)
3225 break;
3226 eptr++;
3227 }
3228 break;
3229
3230 case OP_NOT_WORDCHAR:
3231 for (i = min; i < max; i++)
3232 {
3233 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) != 0)
3234 break;
3235 eptr++;
3236 }
3237 break;
3238
3239 case OP_WORDCHAR:
3240 for (i = min; i < max; i++)
3241 {
3242 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) == 0)
3243 break;
3244 eptr++;
3245 }
3246 break;
3247 }
3248
3249 while (eptr >= pp)
3250 if (match(eptr--, ecode, offset_top, md)) return TRUE;
3251 return FALSE;
3252 }
3253 /* Control never gets here */
3254
3255 /* There's been some horrible disaster. */
3256
3257 default:
3258 #ifdef DEBUG
3259 printf("Unknown opcode %d\n", *ecode);
3260 #endif
3261 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
3262 return FALSE;
3263 }
3264
3265 /* Do not stick any code in here without much thought; it is assumed
3266 that "continue" in the code above comes out to here to repeat the main
3267 loop. */
3268
3269 } /* End of main loop */
3270 /* Control never reaches here */
3271 }
3272
3273
3274
3275 /*************************************************
3276 * Execute a Regular Expression *
3277 *************************************************/
3278
3279 /* This function applies a compiled re to a subject string and picks out
3280 portions of the string if it matches. Two elements in the vector are set for
3281 each substring: the offsets to the start and end of the substring.
3282
3283 Arguments:
3284 external_re points to the compiled expression
3285 external_extra points to "hints" from pcre_study() or is NULL
3286 subject points to the subject string
3287 length length of subject string (may contain binary zeros)
3288 options option bits
3289 offsets points to a vector of ints to be filled in with offsets
3290 offsetcount the number of elements in the vector
3291
3292 Returns: > 0 => success; value is the number of elements filled in
3293 = 0 => success, but offsets is not big enough
3294 -1 => failed to match
3295 < -1 => some kind of unexpected problem
3296 */
3297
3298 int
3299 pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
3300 const char *subject, int length, int options, int *offsets, int offsetcount)
3301 {
3302 int resetcount;
3303 int ocount = offsetcount;
3304 int first_char = -1;
3305 match_data match_block;
3306 uschar *start_bits = NULL;
3307 uschar *start_match = (uschar *)subject;
3308 uschar *end_subject;
3309 real_pcre *re = (real_pcre *)external_re;
3310 real_pcre_extra *extra = (real_pcre_extra *)external_extra;
3311 BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
3312 BOOL startline = (re->options & PCRE_STARTLINE) != 0;
3313
3314 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3315
3316 if (re == NULL || subject == NULL ||
3317 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3318 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
3319
3320 match_block.start_subject = (uschar *)subject;
3321 match_block.end_subject = match_block.start_subject + length;
3322 end_subject = match_block.end_subject;
3323
3324 match_block.caseless = ((re->options | options) & PCRE_CASELESS) != 0;
3325 match_block.runtime_caseless = match_block.caseless &&
3326 (re->options & PCRE_CASELESS) == 0;
3327
3328 match_block.multiline = ((re->options | options) & PCRE_MULTILINE) != 0;
3329 match_block.dotall = ((re->options | options) & PCRE_DOTALL) != 0;
3330 match_block.endonly = ((re->options | options) & PCRE_DOLLAR_ENDONLY) != 0;
3331
3332 match_block.notbol = (options & PCRE_NOTBOL) != 0;
3333 match_block.noteol = (options & PCRE_NOTEOL) != 0;
3334
3335 match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
3336
3337 /* If the expression has got more back references than the offsets supplied can
3338 hold, we get a temporary bit of working store to use during the matching.
3339 Otherwise, we can use the vector supplied, rounding down the size of it to a
3340 multiple of 2. */
3341
3342 ocount &= (-2);
3343 if (re->top_backref > 0 && re->top_backref + 1 >= ocount/2)
3344 {
3345 ocount = re->top_backref * 2 + 2;
3346 match_block.offset_vector = (pcre_malloc)(ocount * sizeof(int));
3347 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
3348 #ifdef DEBUG
3349 printf("Got memory to hold back references\n");
3350 #endif
3351 }
3352 else match_block.offset_vector = offsets;
3353
3354 match_block.offset_end = ocount;
3355 match_block.offset_overflow = FALSE;
3356
3357 /* Compute the minimum number of offsets that we need to reset each time. Doing
3358 this makes a huge difference to execution time when there aren't many brackets
3359 in the pattern. */
3360
3361 resetcount = 2 + re->top_bracket * 2;
3362 if (resetcount > offsetcount) resetcount = ocount;
3363
3364 /* If MULTILINE is set at exec time but was not set at compile time, and the
3365 anchored flag is set, we must re-check because a setting provoked by ^ in the
3366 pattern is not right in multi-line mode. Calling is_anchored() again here does
3367 the right check, because multiline is now set. If it now yields FALSE, the
3368 expression must have had ^ starting some of its branches. Check to see if
3369 that is true for *all* branches, and if so, set the startline flag. */
3370
3371 if (match_block. multiline && anchored && (re->options & PCRE_MULTILINE) == 0 &&
3372 !is_anchored(re->code, match_block.multiline))
3373 {
3374 anchored = FALSE;
3375 if (is_startline(re->code)) startline = TRUE;
3376 }
3377
3378 /* Set up the first character to match, if available. The first_char value is
3379 never set for an anchored regular expression, but the anchoring may be forced
3380 at run time, so we have to test for anchoring. The first char may be unset for
3381 an unanchored pattern, of course. If there's no first char and the pattern was
3382 studied, the may be a bitmap of possible first characters. However, we can
3383 use this only if the caseless state of the studying was correct. */
3384
3385 if (!anchored)
3386 {
3387 if ((re->options & PCRE_FIRSTSET) != 0)
3388 {
3389 first_char = re->first_char;
3390 if (match_block.caseless) first_char = pcre_lcc[first_char];
3391 }
3392 else
3393 if (!startline && extra != NULL &&
3394 (extra->options & PCRE_STUDY_MAPPED) != 0 &&
3395 ((extra->options & PCRE_STUDY_CASELESS) != 0) == match_block.caseless)
3396 start_bits = extra->start_bits;
3397 }
3398
3399 /* Loop for unanchored matches; for anchored regexps the loop runs just once. */
3400
3401 do
3402 {
3403 register int *iptr = match_block.offset_vector;
3404 register int *iend = iptr + resetcount;
3405
3406 /* Reset the maximum number of extractions we might see. */
3407
3408 while (iptr < iend) *iptr++ = -1;
3409
3410 /* Advance to a unique first char if possible */
3411
3412 if (first_char >= 0)
3413 {
3414 if (match_block.caseless)
3415 while (start_match < end_subject && pcre_lcc[*start_match] != first_char)
3416 start_match++;
3417 else
3418 while (start_match < end_subject && *start_match != first_char)
3419 start_match++;
3420 }
3421
3422 /* Or to just after \n for a multiline match if possible */
3423
3424 else if (startline)
3425 {
3426 if (start_match > match_block.start_subject)
3427 {
3428 while (start_match < end_subject && start_match[-1] != '\n')
3429 start_match++;
3430 }
3431 }
3432
3433 /* Or to a non-unique first char */
3434
3435 else if (start_bits != NULL)
3436 {
3437 while (start_match < end_subject)
3438 {
3439 register int c = *start_match;
3440 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
3441 }
3442 }
3443
3444 #ifdef DEBUG
3445 printf(">>>> Match against: ");
3446 pchars(start_match, end_subject - start_match, TRUE, &match_block);
3447 printf("\n");
3448 #endif
3449
3450 /* When a match occurs, substrings will be set for all internal extractions;
3451 we just need to set up the whole thing as substring 0 before returning. If
3452 there were too many extractions, set the return code to zero. In the case
3453 where we had to get some local store to hold offsets for backreferences, copy
3454 those back references that we can. In this case there need not be overflow
3455 if certain parts of the pattern were not used.
3456
3457 Before starting the match, we have to set up a longjmp() target to enable
3458 the "cut" operation to fail a match completely without backtracking. */
3459
3460 if (setjmp(match_block.fail_env) == 0 &&
3461 match(start_match, re->code, 2, &match_block))
3462 {
3463 int rc;
3464
3465 if (ocount != offsetcount)
3466 {
3467 if (offsetcount >= 4)
3468 {
3469 memcpy(offsets + 2, match_block.offset_vector + 2,
3470 (offsetcount - 2) * sizeof(int));
3471 #ifdef DEBUG
3472 printf("Copied offsets; freeing temporary memory\n");
3473 #endif
3474 }
3475 if (match_block.end_offset_top > offsetcount)
3476 match_block.offset_overflow = TRUE;
3477
3478 #ifdef DEBUG
3479 printf("Freeing temporary memory\n");
3480 #endif
3481
3482 (pcre_free)(match_block.offset_vector);
3483 }
3484
3485 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
3486
3487 if (match_block.offset_end < 2) rc = 0; else
3488 {
3489 offsets[0] = start_match - match_block.start_subject;
3490 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
3491 }
3492
3493 #ifdef DEBUG
3494 printf(">>>> returning %d\n", rc);
3495 #endif
3496 return rc;
3497 }
3498 }
3499 while (!anchored &&
3500 match_block.errorcode == PCRE_ERROR_NOMATCH &&
3501 start_match++ < end_subject);
3502
3503 #ifdef DEBUG
3504 printf(">>>> returning %d\n", match_block.errorcode);
3505 #endif
3506
3507 return match_block.errorcode;
3508 }
3509
3510 /* End of pcre.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12