/[pcre]/code/trunk/pcre.c
ViewVC logotype

Contents of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 21 - (show annotations) (download)
Sat Feb 24 21:38:37 2007 UTC (7 years, 6 months ago) by nigel
File MIME type: text/plain
File size: 104554 byte(s)
Load pcre-1.09 into code/trunk.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /*
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
9
10 Written by: Philip Hazel <ph10@cam.ac.uk>
11
12 Copyright (c) 1998 University of Cambridge
13
14 -----------------------------------------------------------------------------
15 Permission is granted to anyone to use this software for any purpose on any
16 computer system, and to redistribute it freely, subject to the following
17 restrictions:
18
19 1. This software is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22
23 2. The origin of this software must not be misrepresented, either by
24 explicit claim or by omission.
25
26 3. Altered versions must be plainly marked as such, and must not be
27 misrepresented as being the original software.
28 -----------------------------------------------------------------------------
29 */
30
31
32 /* Define DEBUG to get debugging output on stdout. */
33
34 /* #define DEBUG */
35
36 /* Use a macro for debugging printing, 'cause that eliminates the the use
37 of #ifdef inline, and there are *still* stupid compilers about that don't like
38 indented pre-processor statements. I suppose it's only been 10 years... */
39
40 #ifdef DEBUG
41 #define DPRINTF(p) printf p
42 #else
43 #define DPRINTF(p) /*nothing*/
44 #endif
45
46 /* Include the internals header, which itself includes Standard C headers plus
47 the external pcre header. */
48
49 #include "internal.h"
50
51
52 /* Allow compilation as C++ source code, should anybody want to do that. */
53
54 #ifdef __cplusplus
55 #define class pcre_class
56 #endif
57
58
59 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
60
61 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
62 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
63
64 /* Text forms of OP_ values and things, for debugging (not all used) */
65
66 #ifdef DEBUG
67 static const char *OP_names[] = {
68 "End", "\\A", "\\B", "\\b", "\\D", "\\d",
69 "\\S", "\\s", "\\W", "\\w", "Cut", "\\Z", "^", "$", "Any", "chars",
70 "not",
71 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
72 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
73 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
74 "*", "*?", "+", "+?", "?", "??", "{", "{",
75 "class", "negclass", "Ref",
76 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",
77 "Brazero", "Braminzero", "Bra"
78 };
79 #endif
80
81 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
82 are simple data values; negative values are for special things like \d and so
83 on. Zero means further processing is needed (for things like \x), or the escape
84 is invalid. */
85
86 static const short int escapes[] = {
87 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
88 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
89 '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
90 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
91 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
92 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
93 '`', 7, -ESC_b, 0, -ESC_d, 27, '\f', 0, /* ` - g */
94 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
95 0, 0, '\r', -ESC_s, '\t', 0, 0, -ESC_w, /* p - w */
96 0, 0, 0 /* x - z */
97 };
98
99 /* Definition to allow mutual recursion */
100
101 static BOOL
102 compile_regex(int, int *, uschar **, const uschar **, const char **);
103
104 /* Structure for passing "static" information around between the functions
105 doing the matching, so that they are thread-safe. */
106
107 typedef struct match_data {
108 int errorcode; /* As it says */
109 int *offset_vector; /* Offset vector */
110 int offset_end; /* One past the end */
111 BOOL offset_overflow; /* Set if too many extractions */
112 BOOL caseless; /* Case-independent flag */
113 BOOL runtime_caseless; /* Caseless forced at run time */
114 BOOL multiline; /* Multiline flag */
115 BOOL notbol; /* NOTBOL flag */
116 BOOL noteol; /* NOTEOL flag */
117 BOOL dotall; /* Dot matches any char */
118 BOOL endonly; /* Dollar not before final \n */
119 const uschar *start_subject; /* Start of the subject string */
120 const uschar *end_subject; /* End of the subject string */
121 jmp_buf fail_env; /* Environment for longjump() break out */
122 const uschar *end_match_ptr; /* Subject position at end match */
123 int end_offset_top; /* Highwater mark at end of match */
124 } match_data;
125
126
127
128 /*************************************************
129 * Global variables *
130 *************************************************/
131
132 /* PCRE is thread-clean and doesn't use any global variables in the normal
133 sense. However, it calls memory allocation and free functions via the two
134 indirections below, which are can be changed by the caller, but are shared
135 between all threads. */
136
137 void *(*pcre_malloc)(size_t) = malloc;
138 void (*pcre_free)(void *) = free;
139
140
141
142
143 /*************************************************
144 * Return version string *
145 *************************************************/
146
147 const char *
148 pcre_version(void)
149 {
150 return PCRE_VERSION;
151 }
152
153
154
155
156 /*************************************************
157 * Return info about a compiled pattern *
158 *************************************************/
159
160 /* This function picks potentially useful data out of the private
161 structure.
162
163 Arguments:
164 external_re points to compiled code
165 optptr where to pass back the options
166 first_char where to pass back the first character,
167 or -1 if multiline and all branches start ^,
168 or -2 otherwise
169
170 Returns: number of identifying extraction brackets
171 or negative values on error
172 */
173
174 int
175 pcre_info(const pcre *external_re, int *optptr, int *first_char)
176 {
177 const real_pcre *re = (const real_pcre *)external_re;
178 if (re == NULL) return PCRE_ERROR_NULL;
179 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
180 if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
181 if (first_char != NULL)
182 *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
183 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
184 return re->top_bracket;
185 }
186
187
188
189
190 #ifdef DEBUG
191 /*************************************************
192 * Debugging function to print chars *
193 *************************************************/
194
195 /* Print a sequence of chars in printable format, stopping at the end of the
196 subject if the requested.
197
198 Arguments:
199 p points to characters
200 length number to print
201 is_subject TRUE if printing from within md->start_subject
202 md pointer to matching data block, if is_subject is TRUE
203
204 Returns: nothing
205 */
206
207 static void
208 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
209 {
210 int c;
211 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
212 while (length-- > 0)
213 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
214 }
215 #endif
216
217
218
219
220 /*************************************************
221 * Check subpattern for empty operand *
222 *************************************************/
223
224 /* This function checks a bracketed subpattern to see if any of the paths
225 through it could match an empty string. This is used to diagnose an error if
226 such a subpattern is followed by a quantifier with an unlimited upper bound.
227
228 Argument:
229 code points to the opening bracket
230
231 Returns: TRUE or FALSE
232 */
233
234 static BOOL
235 could_be_empty(uschar *code)
236 {
237 do {
238 uschar *cc = code + 3;
239
240 /* Scan along the opcodes for this branch; as soon as we find something
241 that matches a non-empty string, break out and advance to test the next
242 branch. If we get to the end of the branch, return TRUE for the whole
243 sub-expression. */
244
245 for (;;)
246 {
247 /* Test an embedded subpattern; if it could not be empty, break the
248 loop. Otherwise carry on in the branch. */
249
250 if ((int)(*cc) >= OP_BRA || (int)(*cc) == OP_ONCE)
251 {
252 if (!could_be_empty(cc)) break;
253 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
254 cc += 3;
255 }
256
257 else switch (*cc)
258 {
259 /* Reached end of a branch: the subpattern may match the empty string */
260
261 case OP_ALT:
262 case OP_KET:
263 case OP_KETRMAX:
264 case OP_KETRMIN:
265 return TRUE;
266
267 /* Skip over entire bracket groups with zero lower bound */
268
269 case OP_BRAZERO:
270 case OP_BRAMINZERO:
271 cc++;
272 /* Fall through */
273
274 /* Skip over assertive subpatterns */
275
276 case OP_ASSERT:
277 case OP_ASSERT_NOT:
278 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
279 cc += 3;
280 break;
281
282 /* Skip over things that don't match chars */
283
284 case OP_SOD:
285 case OP_EOD:
286 case OP_CIRC:
287 case OP_DOLL:
288 case OP_NOT_WORD_BOUNDARY:
289 case OP_WORD_BOUNDARY:
290 cc++;
291 break;
292
293 /* Skip over simple repeats with zero lower bound */
294
295 case OP_STAR:
296 case OP_MINSTAR:
297 case OP_QUERY:
298 case OP_MINQUERY:
299 case OP_NOTSTAR:
300 case OP_NOTMINSTAR:
301 case OP_NOTQUERY:
302 case OP_NOTMINQUERY:
303 case OP_TYPESTAR:
304 case OP_TYPEMINSTAR:
305 case OP_TYPEQUERY:
306 case OP_TYPEMINQUERY:
307 cc += 2;
308 break;
309
310 /* Skip over UPTOs (lower bound is zero) */
311
312 case OP_UPTO:
313 case OP_MINUPTO:
314 case OP_TYPEUPTO:
315 case OP_TYPEMINUPTO:
316 cc += 4;
317 break;
318
319 /* Check a class or a back reference for a zero minimum */
320
321 case OP_CLASS:
322 case OP_NEGCLASS:
323 case OP_REF:
324 cc += (*cc == OP_REF)? 2 : 33;
325
326 switch (*cc)
327 {
328 case OP_CRSTAR:
329 case OP_CRMINSTAR:
330 case OP_CRQUERY:
331 case OP_CRMINQUERY:
332 cc++;
333 break;
334
335 case OP_CRRANGE:
336 case OP_CRMINRANGE:
337 if ((cc[1] << 8) + cc[2] != 0) goto NEXT_BRANCH;
338 cc += 3;
339 break;
340
341 default:
342 goto NEXT_BRANCH;
343 }
344 break;
345
346 /* Anything else matches at least one character */
347
348 default:
349 goto NEXT_BRANCH;
350 }
351 }
352
353 NEXT_BRANCH:
354 code += (code[1] << 8) + code[2];
355 }
356 while (*code == OP_ALT);
357
358 /* No branches match the empty string */
359
360 return FALSE;
361 }
362
363
364
365 /*************************************************
366 * Handle escapes *
367 *************************************************/
368
369 /* This function is called when a \ has been encountered. It either returns a
370 positive value for a simple escape such as \n, or a negative value which
371 encodes one of the more complicated things such as \d. On entry, ptr is
372 pointing at the \. On exit, it is on the final character of the escape
373 sequence.
374
375 Arguments:
376 ptrptr points to the pattern position pointer
377 errorptr points to the pointer to the error message
378 bracount number of previous extracting brackets
379 options the options bits
380 isclass TRUE if inside a character class
381
382 Returns: zero or positive => a data character
383 negative => a special escape sequence
384 on error, errorptr is set
385 */
386
387 static int
388 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
389 int options, BOOL isclass)
390 {
391 const uschar *ptr = *ptrptr;
392 int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
393 int i;
394
395 if (c == 0) *errorptr = ERR1;
396
397 /* Digits or letters may have special meaning; all others are literals. */
398
399 else if (c < '0' || c > 'z') {}
400
401 /* Do an initial lookup in a table. A non-zero result is something that can be
402 returned immediately. Otherwise further processing may be required. */
403
404 else if ((i = escapes[c - '0']) != 0) c = i;
405
406 /* Escapes that need further processing, or are illegal. */
407
408 else
409 {
410 const uschar *oldptr;
411 switch (c)
412 {
413 /* The handling of escape sequences consisting of a string of digits
414 starting with one that is not zero is not straightforward. By experiment,
415 the way Perl works seems to be as follows:
416
417 Outside a character class, the digits are read as a decimal number. If the
418 number is less than 10, or if there are that many previous extracting
419 left brackets, then it is a back reference. Otherwise, up to three octal
420 digits are read to form an escaped byte. Thus \123 is likely to be octal
421 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
422 value is greater than 377, the least significant 8 bits are taken. Inside a
423 character class, \ followed by a digit is always an octal number. */
424
425 case '1': case '2': case '3': case '4': case '5':
426 case '6': case '7': case '8': case '9':
427
428 if (!isclass)
429 {
430 oldptr = ptr;
431 c -= '0';
432 while ((pcre_ctypes[ptr[1]] & ctype_digit) != 0)
433 c = c * 10 + *(++ptr) - '0';
434 if (c < 10 || c <= bracount)
435 {
436 c = -(ESC_REF + c);
437 break;
438 }
439 ptr = oldptr; /* Put the pointer back and fall through */
440 }
441
442 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
443 generates a binary zero byte and treats the digit as a following literal.
444 Thus we have to pull back the pointer by one. */
445
446 if ((c = *ptr) >= '8')
447 {
448 ptr--;
449 c = 0;
450 break;
451 }
452
453 /* \0 always starts an octal number, but we may drop through to here with a
454 larger first octal digit */
455
456 case '0':
457 c -= '0';
458 while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 &&
459 ptr[1] != '8' && ptr[1] != '9')
460 c = c * 8 + *(++ptr) - '0';
461 break;
462
463 /* Special escapes not starting with a digit are straightforward */
464
465 case 'x':
466 c = 0;
467 while (i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0)
468 {
469 ptr++;
470 c = c * 16 + pcre_lcc[*ptr] -
471 (((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
472 }
473 break;
474
475 case 'c':
476 c = *(++ptr);
477 if (c == 0)
478 {
479 *errorptr = ERR2;
480 return 0;
481 }
482
483 /* A letter is upper-cased; then the 0x40 bit is flipped */
484
485 if (c >= 'a' && c <= 'z') c = pcre_fcc[c];
486 c ^= 0x40;
487 break;
488
489 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
490 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
491 for Perl compatibility, it is a literal. */
492
493 default:
494 if ((options & PCRE_EXTRA) != 0) switch(c)
495 {
496 case 'X':
497 c = -ESC_X; /* This could be a lookup if it ever got into Perl */
498 break;
499
500 default:
501 *errorptr = ERR3;
502 break;
503 }
504 break;
505 }
506 }
507
508 *ptrptr = ptr;
509 return c;
510 }
511
512
513
514 /*************************************************
515 * Check for counted repeat *
516 *************************************************/
517
518 /* This function is called when a '{' is encountered in a place where it might
519 start a quantifier. It looks ahead to see if it really is a quantifier or not.
520 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
521 where the ddds are digits.
522
523 Arguments:
524 p pointer to the first char after '{'
525
526 Returns: TRUE or FALSE
527 */
528
529 static BOOL
530 is_counted_repeat(const uschar *p)
531 {
532 if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
533 while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
534 if (*p == '}') return TRUE;
535
536 if (*p++ != ',') return FALSE;
537 if (*p == '}') return TRUE;
538
539 if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
540 while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
541 return (*p == '}');
542 }
543
544
545
546 /*************************************************
547 * Read repeat counts *
548 *************************************************/
549
550 /* Read an item of the form {n,m} and return the values. This is called only
551 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
552 so the syntax is guaranteed to be correct, but we need to check the values.
553
554 Arguments:
555 p pointer to first char after '{'
556 minp pointer to int for min
557 maxp pointer to int for max
558 returned as -1 if no max
559 errorptr points to pointer to error message
560
561 Returns: pointer to '}' on success;
562 current ptr on error, with errorptr set
563 */
564
565 static const uschar *
566 read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
567 {
568 int min = 0;
569 int max = -1;
570
571 while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
572
573 if (*p == '}') max = min; else
574 {
575 if (*(++p) != '}')
576 {
577 max = 0;
578 while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
579 if (max < min)
580 {
581 *errorptr = ERR4;
582 return p;
583 }
584 }
585 }
586
587 /* Do paranoid checks, then fill in the required variables, and pass back the
588 pointer to the terminating '}'. */
589
590 if (min > 65535 || max > 65535)
591 *errorptr = ERR5;
592 else
593 {
594 *minp = min;
595 *maxp = max;
596 }
597 return p;
598 }
599
600
601
602 /*************************************************
603 * Compile one branch *
604 *************************************************/
605
606 /* Scan the pattern, compiling it into the code vector.
607
608 Arguments:
609 options the option bits
610 bracket points to number of brackets used
611 code points to the pointer to the current code point
612 ptrptr points to the current pattern pointer
613 errorptr points to pointer to error message
614
615 Returns: TRUE on success
616 FALSE, with *errorptr set on error
617 */
618
619 static BOOL
620 compile_branch(int options, int *brackets, uschar **codeptr,
621 const uschar **ptrptr, const char **errorptr)
622 {
623 int repeat_type, op_type;
624 int repeat_min, repeat_max;
625 int bravalue, length;
626 int greedy_default, greedy_non_default;
627 register int c;
628 register uschar *code = *codeptr;
629 const uschar *ptr = *ptrptr;
630 const uschar *oldptr;
631 uschar *previous = NULL;
632 uschar class[32];
633
634 /* Set up the default and non-default settings for greediness */
635
636 greedy_default = ((options & PCRE_UNGREEDY) != 0);
637 greedy_non_default = greedy_default ^ 1;
638
639 /* Switch on next character until the end of the branch */
640
641 for (;; ptr++)
642 {
643 BOOL negate_class;
644 int class_charcount;
645 int class_lastchar;
646
647 c = *ptr;
648 if ((options & PCRE_EXTENDED) != 0)
649 {
650 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
651 if (c == '#')
652 {
653 while ((c = *(++ptr)) != 0 && c != '\n');
654 continue;
655 }
656 }
657
658 switch(c)
659 {
660 /* The branch terminates at end of string, |, or ). */
661
662 case 0:
663 case '|':
664 case ')':
665 *codeptr = code;
666 *ptrptr = ptr;
667 return TRUE;
668
669 /* Handle single-character metacharacters */
670
671 case '^':
672 previous = NULL;
673 *code++ = OP_CIRC;
674 break;
675
676 case '$':
677 previous = NULL;
678 *code++ = OP_DOLL;
679 break;
680
681 case '.':
682 previous = code;
683 *code++ = OP_ANY;
684 break;
685
686 /* Character classes. These always build a 32-byte bitmap of the permitted
687 characters, except in the special case where there is only one character.
688 For negated classes, we build the map as usual, then invert it at the end.
689 */
690
691 case '[':
692 previous = code;
693
694 /* If the first character is '^', set the negation flag, and use a
695 different opcode. This only matters if caseless matching is specified at
696 runtime. */
697
698 if ((c = *(++ptr)) == '^')
699 {
700 negate_class = TRUE;
701 *code++ = OP_NEGCLASS;
702 c = *(++ptr);
703 }
704 else
705 {
706 negate_class = FALSE;
707 *code++ = OP_CLASS;
708 }
709
710 /* Keep a count of chars so that we can optimize the case of just a single
711 character. */
712
713 class_charcount = 0;
714 class_lastchar = -1;
715
716 /* Initialize the 32-char bit map to all zeros. We have to build the
717 map in a temporary bit of store, in case the class contains only 1
718 character, because in that case the compiled code doesn't use the
719 bit map. */
720
721 memset(class, 0, 32 * sizeof(uschar));
722
723 /* Process characters until ] is reached. By writing this as a "do" it
724 means that an initial ] is taken as a data character. */
725
726 do
727 {
728 if (c == 0)
729 {
730 *errorptr = ERR6;
731 goto FAILED;
732 }
733
734 /* Backslash may introduce a single character, or it may introduce one
735 of the specials, which just set a flag. Escaped items are checked for
736 validity in the pre-compiling pass. The sequence \b is a special case.
737 Inside a class (and only there) it is treated as backspace. Elsewhere
738 it marks a word boundary. Other escapes have preset maps ready to
739 or into the one we are building. We assume they have more than one
740 character in them, so set class_count bigger than one. */
741
742 if (c == '\\')
743 {
744 c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
745 if (-c == ESC_b) c = '\b';
746 else if (c < 0)
747 {
748 class_charcount = 10;
749 switch (-c)
750 {
751 case ESC_d:
752 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit];
753 continue;
754
755 case ESC_D:
756 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit];
757 continue;
758
759 case ESC_w:
760 for (c = 0; c < 32; c++)
761 class[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
762 continue;
763
764 case ESC_W:
765 for (c = 0; c < 32; c++)
766 class[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
767 continue;
768
769 case ESC_s:
770 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space];
771 continue;
772
773 case ESC_S:
774 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space];
775 continue;
776
777 default:
778 *errorptr = ERR7;
779 goto FAILED;
780 }
781 }
782 /* Fall through if single character */
783 }
784
785 /* A single character may be followed by '-' to form a range. However,
786 Perl does not permit ']' to be the end of the range. A '-' character
787 here is treated as a literal. */
788
789 if (ptr[1] == '-' && ptr[2] != ']')
790 {
791 int d;
792 ptr += 2;
793 d = *ptr;
794
795 if (d == 0)
796 {
797 *errorptr = ERR6;
798 goto FAILED;
799 }
800
801 /* The second part of a range can be a single-character escape, but
802 not any of the other escapes. */
803
804 if (d == '\\')
805 {
806 d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
807 if (d < 0)
808 {
809 if (d == -ESC_b) d = '\b'; else
810 {
811 *errorptr = ERR7;
812 goto FAILED;
813 }
814 }
815 }
816
817 if (d < c)
818 {
819 *errorptr = ERR8;
820 goto FAILED;
821 }
822
823 for (; c <= d; c++)
824 {
825 class[c/8] |= (1 << (c&7));
826 if ((options & PCRE_CASELESS) != 0)
827 {
828 int uc = pcre_fcc[c]; /* flip case */
829 class[uc/8] |= (1 << (uc&7));
830 }
831 class_charcount++; /* in case a one-char range */
832 class_lastchar = c;
833 }
834 continue; /* Go get the next char in the class */
835 }
836
837 /* Handle a lone single character - we can get here for a normal
838 non-escape char, or after \ that introduces a single character. */
839
840 class [c/8] |= (1 << (c&7));
841 if ((options & PCRE_CASELESS) != 0)
842 {
843 c = pcre_fcc[c]; /* flip case */
844 class[c/8] |= (1 << (c&7));
845 }
846 class_charcount++;
847 class_lastchar = c;
848 }
849
850 /* Loop until ']' reached; the check for end of string happens inside the
851 loop. This "while" is the end of the "do" above. */
852
853 while ((c = *(++ptr)) != ']');
854
855 /* If class_charcount is 1 and class_lastchar is not negative, we saw
856 precisely one character. This doesn't need the whole 32-byte bit map.
857 We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
858 it's negative. */
859
860 if (class_charcount == 1 && class_lastchar >= 0)
861 {
862 if (negate_class)
863 {
864 code[-1] = OP_NOT;
865 }
866 else
867 {
868 code[-1] = OP_CHARS;
869 *code++ = 1;
870 }
871 *code++ = class_lastchar;
872 }
873
874 /* Otherwise, negate the 32-byte map if necessary, and copy it into
875 the code vector. */
876
877 else
878 {
879 if (negate_class)
880 for (c = 0; c < 32; c++) code[c] = ~class[c];
881 else
882 memcpy(code, class, 32);
883 code += 32;
884 }
885 break;
886
887 /* Various kinds of repeat */
888
889 case '{':
890 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
891 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
892 if (*errorptr != NULL) goto FAILED;
893 goto REPEAT;
894
895 case '*':
896 repeat_min = 0;
897 repeat_max = -1;
898 goto REPEAT;
899
900 case '+':
901 repeat_min = 1;
902 repeat_max = -1;
903 goto REPEAT;
904
905 case '?':
906 repeat_min = 0;
907 repeat_max = 1;
908
909 REPEAT:
910 if (previous == NULL)
911 {
912 *errorptr = ERR9;
913 goto FAILED;
914 }
915
916 /* If the next character is '?' this is a minimizing repeat, by default,
917 but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
918 next character. */
919
920 if (ptr[1] == '?')
921 { repeat_type = greedy_non_default; ptr++; }
922 else repeat_type = greedy_default;
923
924 /* If the maximum is zero then the minimum must also be zero; Perl allows
925 this case, so we do too - by simply omitting the item altogether. */
926
927 if (repeat_max == 0) code = previous;
928
929 /* If previous was a string of characters, chop off the last one and use it
930 as the subject of the repeat. If there was only one character, we can
931 abolish the previous item altogether. */
932
933 else if (*previous == OP_CHARS)
934 {
935 int len = previous[1];
936 if (len == 1)
937 {
938 c = previous[2];
939 code = previous;
940 }
941 else
942 {
943 c = previous[len+1];
944 previous[1]--;
945 code--;
946 }
947 op_type = 0; /* Use single-char op codes */
948 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
949 }
950
951 /* If previous was a single negated character ([^a] or similar), we use
952 one of the special opcodes, replacing it. The code is shared with single-
953 character repeats by adding a suitable offset into repeat_type. */
954
955 else if ((int)*previous == OP_NOT)
956 {
957 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
958 c = previous[1];
959 code = previous;
960 goto OUTPUT_SINGLE_REPEAT;
961 }
962
963 /* If previous was a character type match (\d or similar), abolish it and
964 create a suitable repeat item. The code is shared with single-character
965 repeats by adding a suitable offset into repeat_type. */
966
967 else if ((int)*previous < OP_EOD || *previous == OP_ANY)
968 {
969 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
970 c = *previous;
971 code = previous;
972
973 OUTPUT_SINGLE_REPEAT:
974 repeat_type += op_type; /* Combine both values for many cases */
975
976 /* A minimum of zero is handled either as the special case * or ?, or as
977 an UPTO, with the maximum given. */
978
979 if (repeat_min == 0)
980 {
981 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
982 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
983 else
984 {
985 *code++ = OP_UPTO + repeat_type;
986 *code++ = repeat_max >> 8;
987 *code++ = (repeat_max & 255);
988 }
989 }
990
991 /* The case {1,} is handled as the special case + */
992
993 else if (repeat_min == 1 && repeat_max == -1)
994 *code++ = OP_PLUS + repeat_type;
995
996 /* The case {n,n} is just an EXACT, while the general case {n,m} is
997 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
998
999 else
1000 {
1001 if (repeat_min != 1)
1002 {
1003 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1004 *code++ = repeat_min >> 8;
1005 *code++ = (repeat_min & 255);
1006 }
1007
1008 /* If the mininum is 1 and the previous item was a character string,
1009 we either have to put back the item that got cancelled if the string
1010 length was 1, or add the character back onto the end of a longer
1011 string. For a character type nothing need be done; it will just get
1012 put back naturally. Note that the final character is always going to
1013 get added below. */
1014
1015 else if (*previous == OP_CHARS)
1016 {
1017 if (code == previous) code += 2; else previous[1]++;
1018 }
1019
1020 /* For a single negated character we also have to put back the
1021 item that got cancelled. */
1022
1023 else if (*previous == OP_NOT) code++;
1024
1025 /* If the maximum is unlimited, insert an OP_STAR. */
1026
1027 if (repeat_max < 0)
1028 {
1029 *code++ = c;
1030 *code++ = OP_STAR + repeat_type;
1031 }
1032
1033 /* Else insert an UPTO if the max is greater than the min. */
1034
1035 else if (repeat_max != repeat_min)
1036 {
1037 *code++ = c;
1038 repeat_max -= repeat_min;
1039 *code++ = OP_UPTO + repeat_type;
1040 *code++ = repeat_max >> 8;
1041 *code++ = (repeat_max & 255);
1042 }
1043 }
1044
1045 /* The character or character type itself comes last in all cases. */
1046
1047 *code++ = c;
1048 }
1049
1050 /* If previous was a character class or a back reference, we put the repeat
1051 stuff after it. */
1052
1053 else if (*previous == OP_CLASS || *previous == OP_NEGCLASS ||
1054 *previous == OP_REF)
1055 {
1056 if (repeat_min == 0 && repeat_max == -1)
1057 *code++ = OP_CRSTAR + repeat_type;
1058 else if (repeat_min == 1 && repeat_max == -1)
1059 *code++ = OP_CRPLUS + repeat_type;
1060 else if (repeat_min == 0 && repeat_max == 1)
1061 *code++ = OP_CRQUERY + repeat_type;
1062 else
1063 {
1064 *code++ = OP_CRRANGE + repeat_type;
1065 *code++ = repeat_min >> 8;
1066 *code++ = repeat_min & 255;
1067 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1068 *code++ = repeat_max >> 8;
1069 *code++ = repeat_max & 255;
1070 }
1071 }
1072
1073 /* If previous was a bracket group, we may have to replicate it in certain
1074 cases. If the maximum repeat count is unlimited, check that the bracket
1075 group cannot match the empty string, and diagnose an error if it can. */
1076
1077 else if ((int)*previous >= OP_BRA)
1078 {
1079 int i;
1080 int len = code - previous;
1081
1082 if (repeat_max == -1 && could_be_empty(previous))
1083 {
1084 *errorptr = ERR10;
1085 goto FAILED;
1086 }
1087
1088 /* If the minimum is greater than zero, and the maximum is unlimited or
1089 equal to the minimum, the first copy remains where it is, and is
1090 replicated up to the minimum number of times. This case includes the +
1091 repeat, but of course no replication is needed in that case. */
1092
1093 if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1094 {
1095 for (i = 1; i < repeat_min; i++)
1096 {
1097 memcpy(code, previous, len);
1098 code += len;
1099 }
1100 }
1101
1102 /* If the minimum is zero, stick BRAZERO in front of the first copy.
1103 Then, if there is a fixed upper limit, replicated up to that many times,
1104 sticking BRAZERO in front of all the optional ones. */
1105
1106 else
1107 {
1108 if (repeat_min == 0)
1109 {
1110 memmove(previous+1, previous, len);
1111 code++;
1112 *previous++ = OP_BRAZERO + repeat_type;
1113 }
1114
1115 for (i = 1; i < repeat_min; i++)
1116 {
1117 memcpy(code, previous, len);
1118 code += len;
1119 }
1120
1121 for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1122 {
1123 *code++ = OP_BRAZERO + repeat_type;
1124 memcpy(code, previous, len);
1125 code += len;
1126 }
1127 }
1128
1129 /* If the maximum is unlimited, set a repeater in the final copy. */
1130
1131 if (repeat_max == -1) code[-3] = OP_KETRMAX + repeat_type;
1132 }
1133
1134 /* Else there's some kind of shambles */
1135
1136 else
1137 {
1138 *errorptr = ERR11;
1139 goto FAILED;
1140 }
1141
1142 /* In all case we no longer have a previous item. */
1143
1144 previous = NULL;
1145 break;
1146
1147
1148 /* Start of nested bracket sub-expression, or comment or lookahead.
1149 First deal with special things that can come after a bracket; all are
1150 introduced by ?, and the appearance of any of them means that this is not a
1151 referencing group. They were checked for validity in the first pass over
1152 the string, so we don't have to check for syntax errors here. */
1153
1154 case '(':
1155 previous = code; /* Only real brackets can be repeated */
1156 if (*(++ptr) == '?')
1157 {
1158 bravalue = OP_BRA;
1159
1160 switch (*(++ptr))
1161 {
1162 case '#':
1163 case 'i':
1164 case 'm':
1165 case 's':
1166 case 'x':
1167 case 'U':
1168 case 'X':
1169 ptr++;
1170 while (*ptr != ')') ptr++;
1171 previous = NULL;
1172 continue;
1173
1174 case ':': /* Non-extracting bracket */
1175 ptr++;
1176 break;
1177
1178 case '=': /* Assertions can't be repeated */
1179 bravalue = OP_ASSERT;
1180 ptr++;
1181 previous = NULL;
1182 break;
1183
1184 case '!':
1185 bravalue = OP_ASSERT_NOT;
1186 ptr++;
1187 previous = NULL;
1188 break;
1189
1190 case '>': /* "Match once" brackets */
1191 if ((options & PCRE_EXTRA) != 0) /* Not yet standard */
1192 {
1193 bravalue = OP_ONCE;
1194 ptr++;
1195 previous = NULL;
1196 break;
1197 }
1198 /* Else fall through */
1199
1200 default:
1201 *errorptr = ERR12;
1202 goto FAILED;
1203 }
1204 }
1205
1206 /* Else we have a referencing group */
1207
1208 else
1209 {
1210 if (++(*brackets) > EXTRACT_MAX)
1211 {
1212 *errorptr = ERR13;
1213 goto FAILED;
1214 }
1215 bravalue = OP_BRA + *brackets;
1216 }
1217
1218 /* Process nested bracketed re; at end pointer is on the bracket. We copy
1219 code into a non-register variable in order to be able to pass its address
1220 because some compilers complain otherwise. */
1221
1222 *code = bravalue;
1223 {
1224 uschar *mcode = code;
1225 if (!compile_regex(options, brackets, &mcode, &ptr, errorptr))
1226 goto FAILED;
1227 code = mcode;
1228 }
1229
1230 if (*ptr != ')')
1231 {
1232 *errorptr = ERR14;
1233 goto FAILED;
1234 }
1235 break;
1236
1237 /* Check \ for being a real metacharacter; if not, fall through and handle
1238 it as a data character at the start of a string. Escape items are checked
1239 for validity in the pre-compiling pass. */
1240
1241 case '\\':
1242 oldptr = ptr;
1243 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
1244
1245 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1246 are arranged to be the negation of the corresponding OP_values. For the
1247 back references, the values are ESC_REF plus the reference number. Only
1248 back references and those types that consume a character may be repeated.
1249 We can test for values between ESC_b and ESC_Z for the latter; this may
1250 have to change if any new ones are ever created. */
1251
1252 if (c < 0)
1253 {
1254 if (-c >= ESC_REF)
1255 {
1256 int refnum = -c - ESC_REF;
1257 if (*brackets < refnum)
1258 {
1259 *errorptr = ERR15;
1260 goto FAILED;
1261 }
1262 previous = code;
1263 *code++ = OP_REF;
1264 *code++ = refnum;
1265 }
1266 else
1267 {
1268 previous = (-c > ESC_b && -c < ESC_X)? code : NULL;
1269 *code++ = -c;
1270 }
1271 continue;
1272 }
1273
1274 /* Data character: reset and fall through */
1275
1276 ptr = oldptr;
1277 c = '\\';
1278
1279 /* Handle a run of data characters until a metacharacter is encountered.
1280 The first character is guaranteed not to be whitespace or # when the
1281 extended flag is set. */
1282
1283 NORMAL_CHAR:
1284 default:
1285 previous = code;
1286 *code = OP_CHARS;
1287 code += 2;
1288 length = 0;
1289
1290 do
1291 {
1292 if ((options & PCRE_EXTENDED) != 0)
1293 {
1294 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1295 if (c == '#')
1296 {
1297 while ((c = *(++ptr)) != 0 && c != '\n');
1298 if (c == 0) break;
1299 continue;
1300 }
1301 }
1302
1303 /* Backslash may introduce a data char or a metacharacter. Escaped items
1304 are checked for validity in the pre-compiling pass. Stop the string
1305 before a metaitem. */
1306
1307 if (c == '\\')
1308 {
1309 oldptr = ptr;
1310 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
1311 if (c < 0) { ptr = oldptr; break; }
1312 }
1313
1314 /* Ordinary character or single-char escape */
1315
1316 *code++ = c;
1317 length++;
1318 }
1319
1320 /* This "while" is the end of the "do" above. */
1321
1322 while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
1323
1324 /* Compute the length and set it in the data vector, and advance to
1325 the next state. */
1326
1327 previous[1] = length;
1328 if (length < 255) ptr--;
1329 break;
1330 }
1331 } /* end of big loop */
1332
1333 /* Control never reaches here by falling through, only by a goto for all the
1334 error states. Pass back the position in the pattern so that it can be displayed
1335 to the user for diagnosing the error. */
1336
1337 FAILED:
1338 *ptrptr = ptr;
1339 return FALSE;
1340 }
1341
1342
1343
1344
1345 /*************************************************
1346 * Compile sequence of alternatives *
1347 *************************************************/
1348
1349 /* On entry, ptr is pointing past the bracket character, but on return
1350 it points to the closing bracket, or vertical bar, or end of string.
1351 The code variable is pointing at the byte into which the BRA operator has been
1352 stored.
1353
1354 Argument:
1355 options the option bits
1356 brackets -> int containing the number of extracting brackets used
1357 codeptr -> the address of the current code pointer
1358 ptrptr -> the address of the current pattern pointer
1359 errorptr -> pointer to error message
1360
1361 Returns: TRUE on success
1362 */
1363
1364 static BOOL
1365 compile_regex(int options, int *brackets, uschar **codeptr,
1366 const uschar **ptrptr, const char **errorptr)
1367 {
1368 const uschar *ptr = *ptrptr;
1369 uschar *code = *codeptr;
1370 uschar *start_bracket = code;
1371
1372 for (;;)
1373 {
1374 int length;
1375 uschar *last_branch = code;
1376
1377 code += 3;
1378 if (!compile_branch(options, brackets, &code, &ptr, errorptr))
1379 {
1380 *ptrptr = ptr;
1381 return FALSE;
1382 }
1383
1384 /* Fill in the length of the last branch */
1385
1386 length = code - last_branch;
1387 last_branch[1] = length >> 8;
1388 last_branch[2] = length & 255;
1389
1390 /* Reached end of expression, either ')' or end of pattern. Insert a
1391 terminating ket and the length of the whole bracketed item, and return,
1392 leaving the pointer at the terminating char. */
1393
1394 if (*ptr != '|')
1395 {
1396 length = code - start_bracket;
1397 *code++ = OP_KET;
1398 *code++ = length >> 8;
1399 *code++ = length & 255;
1400 *codeptr = code;
1401 *ptrptr = ptr;
1402 return TRUE;
1403 }
1404
1405 /* Another branch follows; insert an "or" node and advance the pointer. */
1406
1407 *code = OP_ALT;
1408 ptr++;
1409 }
1410 /* Control never reaches here */
1411 }
1412
1413
1414
1415 /*************************************************
1416 * Check for anchored expression *
1417 *************************************************/
1418
1419 /* Try to find out if this is an anchored regular expression. Consider each
1420 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
1421 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
1422 it's anchored. However, if this is a multiline pattern, then only OP_SOD
1423 counts, since OP_CIRC can match in the middle.
1424
1425 A branch is also implicitly anchored if it starts with .* because that will try
1426 the rest of the pattern at all possible matching points, so there is no point
1427 trying them again.
1428
1429 Argument: points to start of expression (the bracket)
1430 Returns: TRUE or FALSE
1431 */
1432
1433 static BOOL
1434 is_anchored(register const uschar *code, BOOL multiline)
1435 {
1436 do {
1437 int op = (int)code[3];
1438 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE)
1439 { if (!is_anchored(code+3, multiline)) return FALSE; }
1440 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
1441 { if (code[4] != OP_ANY) return FALSE; }
1442 else if (op != OP_SOD && (multiline || op != OP_CIRC)) return FALSE;
1443 code += (code[1] << 8) + code[2];
1444 }
1445 while (*code == OP_ALT);
1446 return TRUE;
1447 }
1448
1449
1450
1451 /*************************************************
1452 * Check for start with \n line expression *
1453 *************************************************/
1454
1455 /* This is called for multiline expressions to try to find out if every branch
1456 starts with ^ so that "first char" processing can be done to speed things up.
1457
1458 Argument: points to start of expression (the bracket)
1459 Returns: TRUE or FALSE
1460 */
1461
1462 static BOOL
1463 is_startline(const uschar *code)
1464 {
1465 do {
1466 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
1467 { if (!is_startline(code+3)) return FALSE; }
1468 else if (code[3] != OP_CIRC) return FALSE;
1469 code += (code[1] << 8) + code[2];
1470 }
1471 while (*code == OP_ALT);
1472 return TRUE;
1473 }
1474
1475
1476
1477 /*************************************************
1478 * Check for fixed first char *
1479 *************************************************/
1480
1481 /* Try to find out if there is a fixed first character. This is called for
1482 unanchored expressions, as it speeds up their processing quite considerably.
1483 Consider each alternative branch. If they all start with the same char, or with
1484 a bracket all of whose alternatives start with the same char (recurse ad lib),
1485 then we return that char, otherwise -1.
1486
1487 Argument: points to start of expression (the bracket)
1488 Returns: -1 or the fixed first char
1489 */
1490
1491 static int
1492 find_firstchar(uschar *code)
1493 {
1494 register int c = -1;
1495 do
1496 {
1497 register int charoffset = 4;
1498
1499 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
1500 {
1501 register int d;
1502 if ((d = find_firstchar(code+3)) < 0) return -1;
1503 if (c < 0) c = d; else if (c != d) return -1;
1504 }
1505
1506 else switch(code[3])
1507 {
1508 default:
1509 return -1;
1510
1511 case OP_EXACT: /* Fall through */
1512 charoffset++;
1513
1514 case OP_CHARS: /* Fall through */
1515 charoffset++;
1516
1517 case OP_PLUS:
1518 case OP_MINPLUS:
1519 if (c < 0) c = code[charoffset]; else if (c != code[charoffset]) return -1;
1520 break;
1521 }
1522 code += (code[1] << 8) + code[2];
1523 }
1524 while (*code == OP_ALT);
1525 return c;
1526 }
1527
1528
1529
1530 /*************************************************
1531 * Compile a Regular Expression *
1532 *************************************************/
1533
1534 /* This function takes a string and returns a pointer to a block of store
1535 holding a compiled version of the expression.
1536
1537 Arguments:
1538 pattern the regular expression
1539 options various option bits
1540 errorptr pointer to pointer to error text
1541 erroroffset ptr offset in pattern where error was detected
1542
1543 Returns: pointer to compiled data block, or NULL on error,
1544 with errorptr and erroroffset set
1545 */
1546
1547 pcre *
1548 pcre_compile(const char *pattern, int options, const char **errorptr,
1549 int *erroroffset)
1550 {
1551 real_pcre *re;
1552 int spaces = 0;
1553 int length = 3; /* For initial BRA plus length */
1554 int runlength;
1555 int c, size;
1556 int bracount = 0;
1557 int brastack[200];
1558 int top_backref = 0;
1559 unsigned int brastackptr = 0;
1560 uschar *code;
1561 const uschar *ptr;
1562
1563 #ifdef DEBUG
1564 uschar *code_base, *code_end;
1565 #endif
1566
1567 /* We can't pass back an error message if errorptr is NULL; I guess the best we
1568 can do is just return NULL. */
1569
1570 if (errorptr == NULL) return NULL;
1571 *errorptr = NULL;
1572
1573 /* However, we can give a message for this error */
1574
1575 if (erroroffset == NULL)
1576 {
1577 *errorptr = ERR16;
1578 return NULL;
1579 }
1580 *erroroffset = 0;
1581
1582 if ((options & ~PUBLIC_OPTIONS) != 0)
1583 {
1584 *errorptr = ERR17;
1585 return NULL;
1586 }
1587
1588 DPRINTF(("------------------------------------------------------------------\n"));
1589 DPRINTF(("%s\n", pattern));
1590
1591 /* The first thing to do is to make a pass over the pattern to compute the
1592 amount of store required to hold the compiled code. This does not have to be
1593 perfect as long as errors are overestimates. At the same time we can detect any
1594 internal flag settings. Make an attempt to correct for any counted white space
1595 if an "extended" flag setting appears late in the pattern. We can't be so
1596 clever for #-comments. */
1597
1598 ptr = (const uschar *)(pattern - 1);
1599 while ((c = *(++ptr)) != 0)
1600 {
1601 int min, max;
1602 int class_charcount;
1603
1604 if ((pcre_ctypes[c] & ctype_space) != 0)
1605 {
1606 if ((options & PCRE_EXTENDED) != 0) continue;
1607 spaces++;
1608 }
1609
1610 if (c == '#' && (options & PCRE_EXTENDED) != 0)
1611 {
1612 while ((c = *(++ptr)) != 0 && c != '\n');
1613 continue;
1614 }
1615
1616 switch(c)
1617 {
1618 /* A backslashed item may be an escaped "normal" character or a
1619 character type. For a "normal" character, put the pointers and
1620 character back so that tests for whitespace etc. in the input
1621 are done correctly. */
1622
1623 case '\\':
1624 {
1625 const uschar *save_ptr = ptr;
1626 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
1627 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1628 if (c >= 0)
1629 {
1630 ptr = save_ptr;
1631 c = '\\';
1632 goto NORMAL_CHAR;
1633 }
1634 }
1635 length++;
1636
1637 /* A back reference needs an additional char, plus either one or 5
1638 bytes for a repeat. We also need to keep the value of the highest
1639 back reference. */
1640
1641 if (c <= -ESC_REF)
1642 {
1643 int refnum = -c - ESC_REF;
1644 if (refnum > top_backref) top_backref = refnum;
1645 length++; /* For single back reference */
1646 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
1647 {
1648 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
1649 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1650 if ((min == 0 && (max == 1 || max == -1)) ||
1651 (min == 1 && max == -1))
1652 length++;
1653 else length += 5;
1654 if (ptr[1] == '?') ptr++;
1655 }
1656 }
1657 continue;
1658
1659 case '^':
1660 case '.':
1661 case '$':
1662 case '*': /* These repeats won't be after brackets; */
1663 case '+': /* those are handled separately */
1664 case '?':
1665 length++;
1666 continue;
1667
1668 /* This covers the cases of repeats after a single char, metachar, class,
1669 or back reference. */
1670
1671 case '{':
1672 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
1673 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
1674 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1675 if ((min == 0 && (max == 1 || max == -1)) ||
1676 (min == 1 && max == -1))
1677 length++;
1678 else
1679 {
1680 length--; /* Uncount the original char or metachar */
1681 if (min == 1) length++; else if (min > 0) length += 4;
1682 if (max > 0) length += 4; else length += 2;
1683 }
1684 if (ptr[1] == '?') ptr++;
1685 continue;
1686
1687 /* An alternation contains an offset to the next branch or ket. */
1688 case '|':
1689 length += 3;
1690 continue;
1691
1692 /* A character class uses 33 characters. Don't worry about character types
1693 that aren't allowed in classes - they'll get picked up during the compile.
1694 A character class that contains only one character uses 2 or 3 bytes,
1695 depending on whether it is negated or not. Notice this where we can. */
1696
1697 case '[':
1698 class_charcount = 0;
1699 if (*(++ptr) == '^') ptr++;
1700 do
1701 {
1702 if (*ptr == '\\')
1703 {
1704 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE);
1705 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1706 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
1707 }
1708 else class_charcount++;
1709 ptr++;
1710 }
1711 while (*ptr != 0 && *ptr != ']');
1712
1713 /* Repeats for negated single chars are handled by the general code */
1714
1715 if (class_charcount == 1) length += 3; else
1716 {
1717 length += 33;
1718
1719 /* A repeat needs either 1 or 5 bytes. */
1720
1721 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
1722 {
1723 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
1724 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1725 if ((min == 0 && (max == 1 || max == -1)) ||
1726 (min == 1 && max == -1))
1727 length++;
1728 else length += 5;
1729 if (ptr[1] == '?') ptr++;
1730 }
1731 }
1732 continue;
1733
1734 /* Brackets may be genuine groups or special things */
1735
1736 case '(':
1737
1738 /* Handle special forms of bracket, which all start (? */
1739
1740 if (ptr[1] == '?') switch (c = ptr[2])
1741 {
1742 /* Skip over comments entirely */
1743 case '#':
1744 ptr += 3;
1745 while (*ptr != 0 && *ptr != ')') ptr++;
1746 if (*ptr == 0)
1747 {
1748 *errorptr = ERR18;
1749 goto PCRE_ERROR_RETURN;
1750 }
1751 continue;
1752
1753 /* Non-referencing groups and lookaheads just move the pointer on, and
1754 then behave like a non-special bracket, except that they don't increment
1755 the count of extracting brackets. */
1756
1757 case ':':
1758 case '=':
1759 case '!':
1760 ptr += 2;
1761 break;
1762
1763 /* Ditto for the "once only" bracket, allowed only if the extra bit
1764 is set. */
1765
1766 case '>':
1767 if ((options & PCRE_EXTRA) != 0)
1768 {
1769 ptr += 2;
1770 break;
1771 }
1772 /* Else fall through */
1773
1774 /* Else loop setting valid options until ) is met. Anything else is an
1775 error. */
1776
1777 default:
1778 ptr += 2;
1779 for (;; ptr++)
1780 {
1781 if ((c = *ptr) == 'i')
1782 {
1783 options |= PCRE_CASELESS;
1784 continue;
1785 }
1786 else if ((c = *ptr) == 'm')
1787 {
1788 options |= PCRE_MULTILINE;
1789 continue;
1790 }
1791 else if (c == 's')
1792 {
1793 options |= PCRE_DOTALL;
1794 continue;
1795 }
1796 else if (c == 'x')
1797 {
1798 options |= PCRE_EXTENDED;
1799 length -= spaces; /* Already counted spaces */
1800 continue;
1801 }
1802 else if (c == 'X')
1803 {
1804 options |= PCRE_EXTRA;
1805 continue;
1806 }
1807 else if (c == 'U')
1808 {
1809 options |= PCRE_UNGREEDY;
1810 continue;
1811 }
1812 else if (c == ')') break;
1813
1814 *errorptr = ERR12;
1815 goto PCRE_ERROR_RETURN;
1816 }
1817 continue; /* End of this bracket handling */
1818 }
1819
1820 /* Extracting brackets must be counted so we can process escapes in a
1821 Perlish way. */
1822
1823 else bracount++;
1824
1825 /* Non-special forms of bracket. Save length for computing whole length
1826 at end if there's a repeat that requires duplication of the group. */
1827
1828 if (brastackptr >= sizeof(brastack)/sizeof(int))
1829 {
1830 *errorptr = ERR19;
1831 goto PCRE_ERROR_RETURN;
1832 }
1833
1834 brastack[brastackptr++] = length;
1835 length += 3;
1836 continue;
1837
1838 /* Handle ket. Look for subsequent max/min; for certain sets of values we
1839 have to replicate this bracket up to that many times. If brastackptr is
1840 0 this is an unmatched bracket which will generate an error, but take care
1841 not to try to access brastack[-1]. */
1842
1843 case ')':
1844 length += 3;
1845 {
1846 int minval = 1;
1847 int maxval = 1;
1848 int duplength = (brastackptr > 0)? length - brastack[--brastackptr] : 0;
1849
1850 /* Leave ptr at the final char; for read_repeat_counts this happens
1851 automatically; for the others we need an increment. */
1852
1853 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
1854 {
1855 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr);
1856 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1857 }
1858 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
1859 else if (c == '+') { maxval = -1; ptr++; }
1860 else if (c == '?') { minval = 0; ptr++; }
1861
1862 /* If there is a minimum > 1 we have to replicate up to minval-1 times;
1863 if there is a limited maximum we have to replicate up to maxval-1 times
1864 and allow for a BRAZERO item before each optional copy, as we also have
1865 to do before the first copy if the minimum is zero. */
1866
1867 if (minval == 0) length++;
1868 else if (minval > 1) length += (minval - 1) * duplength;
1869 if (maxval > minval) length += (maxval - minval) * (duplength + 1);
1870 }
1871 continue;
1872
1873 /* Non-special character. For a run of such characters the length required
1874 is the number of characters + 2, except that the maximum run length is 255.
1875 We won't get a skipped space or a non-data escape or the start of a #
1876 comment as the first character, so the length can't be zero. */
1877
1878 NORMAL_CHAR:
1879 default:
1880 length += 2;
1881 runlength = 0;
1882 do
1883 {
1884 if ((pcre_ctypes[c] & ctype_space) != 0)
1885 {
1886 if ((options & PCRE_EXTENDED) != 0) continue;
1887 spaces++;
1888 }
1889
1890 if (c == '#' && (options & PCRE_EXTENDED) != 0)
1891 {
1892 while ((c = *(++ptr)) != 0 && c != '\n');
1893 continue;
1894 }
1895
1896 /* Backslash may introduce a data char or a metacharacter; stop the
1897 string before the latter. */
1898
1899 if (c == '\\')
1900 {
1901 const uschar *saveptr = ptr;
1902 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
1903 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1904 if (c < 0) { ptr = saveptr; break; }
1905 }
1906
1907 /* Ordinary character or single-char escape */
1908
1909 runlength++;
1910 }
1911
1912 /* This "while" is the end of the "do" above. */
1913
1914 while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
1915
1916 ptr--;
1917 length += runlength;
1918 continue;
1919 }
1920 }
1921
1922 length += 4; /* For final KET and END */
1923
1924 if (length > 65539)
1925 {
1926 *errorptr = ERR20;
1927 return NULL;
1928 }
1929
1930 /* Compute the size of data block needed and get it, either from malloc or
1931 externally provided function. We specify "code[0]" in the offsetof() expression
1932 rather than just "code", because it has been reported that one broken compiler
1933 fails on "code" because it is also an independent variable. It should make no
1934 difference to the value of the offsetof(). */
1935
1936 size = length + offsetof(real_pcre, code[0]);
1937 re = (real_pcre *)(pcre_malloc)(size);
1938
1939 if (re == NULL)
1940 {
1941 *errorptr = ERR21;
1942 return NULL;
1943 }
1944
1945 /* Put in the magic number and the options. */
1946
1947 re->magic_number = MAGIC_NUMBER;
1948 re->options = options;
1949
1950 /* Set up a starting, non-extracting bracket, then compile the expression. On
1951 error, *errorptr will be set non-NULL, so we don't need to look at the result
1952 of the function here. */
1953
1954 ptr = (const uschar *)pattern;
1955 code = re->code;
1956 *code = OP_BRA;
1957 bracount = 0;
1958 (void)compile_regex(options, &bracount, &code, &ptr, errorptr);
1959 re->top_bracket = bracount;
1960 re->top_backref = top_backref;
1961
1962 /* If not reached end of pattern on success, there's an excess bracket. */
1963
1964 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
1965
1966 /* Fill in the terminating state and check for disastrous overflow, but
1967 if debugging, leave the test till after things are printed out. */
1968
1969 *code++ = OP_END;
1970
1971 #ifndef DEBUG
1972 if (code - re->code > length) *errorptr = ERR23;
1973 #endif
1974
1975 /* Failed to compile */
1976
1977 if (*errorptr != NULL)
1978 {
1979 (pcre_free)(re);
1980 PCRE_ERROR_RETURN:
1981 *erroroffset = ptr - (const uschar *)pattern;
1982 return NULL;
1983 }
1984
1985 /* If the anchored option was not passed, set flag if we can determine that it
1986 is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
1987 we can determine what the first character has to be, because that speeds up
1988 unanchored matches no end. In the case of multiline matches, an alternative is
1989 to set the PCRE_STARTLINE flag if all branches start with ^. */
1990
1991 if ((options & PCRE_ANCHORED) == 0)
1992 {
1993 if (is_anchored(re->code, (options & PCRE_MULTILINE) != 0))
1994 re->options |= PCRE_ANCHORED;
1995 else
1996 {
1997 int ch = find_firstchar(re->code);
1998 if (ch >= 0)
1999 {
2000 re->first_char = ch;
2001 re->options |= PCRE_FIRSTSET;
2002 }
2003 else if (is_startline(re->code))
2004 re->options |= PCRE_STARTLINE;
2005 }
2006 }
2007
2008 /* Print out the compiled data for debugging */
2009
2010 #ifdef DEBUG
2011
2012 printf("Length = %d top_bracket = %d top_backref=%d\n",
2013 length, re->top_bracket, re->top_backref);
2014
2015 if (re->options != 0)
2016 {
2017 printf("%s%s%s%s%s%s%s%s\n",
2018 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2019 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2020 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2021 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2022 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
2023 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
2024 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
2025 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
2026 }
2027
2028 if ((re->options & PCRE_FIRSTSET) != 0)
2029 {
2030 if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
2031 else printf("First char = \\x%02x\n", re->first_char);
2032 }
2033
2034 code_end = code;
2035 code_base = code = re->code;
2036
2037 while (code < code_end)
2038 {
2039 int charlength;
2040
2041 printf("%3d ", code - code_base);
2042
2043 if (*code >= OP_BRA)
2044 {
2045 printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
2046 code += 2;
2047 }
2048
2049 else switch(*code)
2050 {
2051 case OP_CHARS:
2052 charlength = *(++code);
2053 printf("%3d ", charlength);
2054 while (charlength-- > 0)
2055 if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2056 break;
2057
2058 case OP_KETRMAX:
2059 case OP_KETRMIN:
2060 case OP_ALT:
2061 case OP_KET:
2062 case OP_ASSERT:
2063 case OP_ASSERT_NOT:
2064 case OP_ONCE:
2065 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2066 code += 2;
2067 break;
2068
2069 case OP_STAR:
2070 case OP_MINSTAR:
2071 case OP_PLUS:
2072 case OP_MINPLUS:
2073 case OP_QUERY:
2074 case OP_MINQUERY:
2075 case OP_TYPESTAR:
2076 case OP_TYPEMINSTAR:
2077 case OP_TYPEPLUS:
2078 case OP_TYPEMINPLUS:
2079 case OP_TYPEQUERY:
2080 case OP_TYPEMINQUERY:
2081 if (*code >= OP_TYPESTAR)
2082 printf(" %s", OP_names[code[1]]);
2083 else if (isprint(c = code[1])) printf(" %c", c);
2084 else printf(" \\x%02x", c);
2085 printf("%s", OP_names[*code++]);
2086 break;
2087
2088 case OP_EXACT:
2089 case OP_UPTO:
2090 case OP_MINUPTO:
2091 if (isprint(c = code[3])) printf(" %c{", c);
2092 else printf(" \\x%02x{", c);
2093 if (*code != OP_EXACT) printf("0,");
2094 printf("%d}", (code[1] << 8) + code[2]);
2095 if (*code == OP_MINUPTO) printf("?");
2096 code += 3;
2097 break;
2098
2099 case OP_TYPEEXACT:
2100 case OP_TYPEUPTO:
2101 case OP_TYPEMINUPTO:
2102 printf(" %s{", OP_names[code[3]]);
2103 if (*code != OP_TYPEEXACT) printf(",");
2104 printf("%d}", (code[1] << 8) + code[2]);
2105 if (*code == OP_TYPEMINUPTO) printf("?");
2106 code += 3;
2107 break;
2108
2109 case OP_NOT:
2110 if (isprint(c = *(++code))) printf(" [^%c]", c);
2111 else printf(" [^\\x%02x]", c);
2112 break;
2113
2114 case OP_NOTSTAR:
2115 case OP_NOTMINSTAR:
2116 case OP_NOTPLUS:
2117 case OP_NOTMINPLUS:
2118 case OP_NOTQUERY:
2119 case OP_NOTMINQUERY:
2120 if (isprint(c = code[1])) printf(" [^%c]", c);
2121 else printf(" [^\\x%02x]", c);
2122 printf("%s", OP_names[*code++]);
2123 break;
2124
2125 case OP_NOTEXACT:
2126 case OP_NOTUPTO:
2127 case OP_NOTMINUPTO:
2128 if (isprint(c = code[3])) printf(" [^%c]{", c);
2129 else printf(" [^\\x%02x]{", c);
2130 if (*code != OP_NOTEXACT) printf(",");
2131 printf("%d}", (code[1] << 8) + code[2]);
2132 if (*code == OP_NOTMINUPTO) printf("?");
2133 code += 3;
2134 break;
2135
2136 case OP_REF:
2137 printf(" \\%d", *(++code));
2138 code ++;
2139 goto CLASS_REF_REPEAT;
2140
2141 case OP_CLASS:
2142 case OP_NEGCLASS:
2143 {
2144 int i, min, max;
2145
2146 if (*code++ == OP_CLASS) printf(" [");
2147 else printf(" ^[");
2148
2149 for (i = 0; i < 256; i++)
2150 {
2151 if ((code[i/8] & (1 << (i&7))) != 0)
2152 {
2153 int j;
2154 for (j = i+1; j < 256; j++)
2155 if ((code[j/8] & (1 << (j&7))) == 0) break;
2156 if (i == '-' || i == ']') printf("\\");
2157 if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2158 if (--j > i)
2159 {
2160 printf("-");
2161 if (j == '-' || j == ']') printf("\\");
2162 if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2163 }
2164 i = j;
2165 }
2166 }
2167 printf("]");
2168 code += 32;
2169
2170 CLASS_REF_REPEAT:
2171
2172 switch(*code)
2173 {
2174 case OP_CRSTAR:
2175 case OP_CRMINSTAR:
2176 case OP_CRPLUS:
2177 case OP_CRMINPLUS:
2178 case OP_CRQUERY:
2179 case OP_CRMINQUERY:
2180 printf("%s", OP_names[*code]);
2181 break;
2182
2183 case OP_CRRANGE:
2184 case OP_CRMINRANGE:
2185 min = (code[1] << 8) + code[2];
2186 max = (code[3] << 8) + code[4];
2187 if (max == 0) printf("{%d,}", min);
2188 else printf("{%d,%d}", min, max);
2189 if (*code == OP_CRMINRANGE) printf("?");
2190 code += 4;
2191 break;
2192
2193 default:
2194 code--;
2195 }
2196 }
2197 break;
2198
2199 /* Anything else is just a one-node item */
2200
2201 default:
2202 printf(" %s", OP_names[*code]);
2203 break;
2204 }
2205
2206 code++;
2207 printf("\n");
2208 }
2209 printf("------------------------------------------------------------------\n");
2210
2211 /* This check is done here in the debugging case so that the code that
2212 was compiled can be seen. */
2213
2214 if (code - re->code > length)
2215 {
2216 *errorptr = ERR23;
2217 (pcre_free)(re);
2218 *erroroffset = ptr - (uschar *)pattern;
2219 return NULL;
2220 }
2221 #endif
2222
2223 return (pcre *)re;
2224 }
2225
2226
2227
2228 /*************************************************
2229 * Match a character type *
2230 *************************************************/
2231
2232 /* Not used in all the places it might be as it's sometimes faster
2233 to put the code inline.
2234
2235 Arguments:
2236 type the character type
2237 c the character
2238 dotall the dotall flag
2239
2240 Returns: TRUE if character is of the type
2241 */
2242
2243 static BOOL
2244 match_type(int type, int c, BOOL dotall)
2245 {
2246
2247 #ifdef DEBUG
2248 if (isprint(c)) printf("matching subject %c against ", c);
2249 else printf("matching subject \\x%02x against ", c);
2250 printf("%s\n", OP_names[type]);
2251 #endif
2252
2253 switch(type)
2254 {
2255 case OP_ANY: return dotall || c != '\n';
2256 case OP_NOT_DIGIT: return (pcre_ctypes[c] & ctype_digit) == 0;
2257 case OP_DIGIT: return (pcre_ctypes[c] & ctype_digit) != 0;
2258 case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;
2259 case OP_WHITESPACE: return (pcre_ctypes[c] & ctype_space) != 0;
2260 case OP_NOT_WORDCHAR: return (pcre_ctypes[c] & ctype_word) == 0;
2261 case OP_WORDCHAR: return (pcre_ctypes[c] & ctype_word) != 0;
2262 }
2263 return FALSE;
2264 }
2265
2266
2267
2268 /*************************************************
2269 * Match a back-reference *
2270 *************************************************/
2271
2272 /* If a back reference hasn't been set, the match fails.
2273
2274 Arguments:
2275 number reference number
2276 eptr points into the subject
2277 length length to be matched
2278 md points to match data block
2279
2280 Returns: TRUE if matched
2281 */
2282
2283 static BOOL
2284 match_ref(int number, register const uschar *eptr, int length, match_data *md)
2285 {
2286 const uschar *p = md->start_subject + md->offset_vector[number];
2287
2288 #ifdef DEBUG
2289 if (eptr >= md->end_subject)
2290 printf("matching subject <null>");
2291 else
2292 {
2293 printf("matching subject ");
2294 pchars(eptr, length, TRUE, md);
2295 }
2296 printf(" against backref ");
2297 pchars(p, length, FALSE, md);
2298 printf("\n");
2299 #endif
2300
2301 /* Always fail if not enough characters left */
2302
2303 if (length > md->end_subject - p) return FALSE;
2304
2305 /* Separate the caselesss case for speed */
2306
2307 if (md->caseless)
2308 { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; }
2309 else
2310 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
2311
2312 return TRUE;
2313 }
2314
2315
2316
2317 /*************************************************
2318 * Match from current position *
2319 *************************************************/
2320
2321 /* On entry ecode points to the first opcode, and eptr to the first character.
2322
2323 Arguments:
2324 eptr pointer in subject
2325 ecode position in code
2326 offset_top current top pointer
2327 md pointer to "static" info for the match
2328
2329 Returns: TRUE if matched
2330 */
2331
2332 static BOOL
2333 match(register const uschar *eptr, register const uschar *ecode, int offset_top,
2334 match_data *md)
2335 {
2336 for (;;)
2337 {
2338 int min, max, ctype;
2339 register int i;
2340 register int c;
2341 BOOL minimize = FALSE;
2342
2343 /* Opening bracket. Check the alternative branches in turn, failing if none
2344 match. We have to set the start offset if required and there is space
2345 in the offset vector so that it is available for subsequent back references
2346 if the bracket matches. However, if the bracket fails, we must put back the
2347 previous value of both offsets in case they were set by a previous copy of
2348 the same bracket. Don't worry about setting the flag for the error case here;
2349 that is handled in the code for KET. */
2350
2351 if ((int)*ecode >= OP_BRA)
2352 {
2353 int number = (*ecode - OP_BRA) << 1;
2354 int save_offset1 = 0, save_offset2 = 0;
2355
2356 DPRINTF(("start bracket %d\n", number/2));
2357
2358 if (number > 0 && number < md->offset_end)
2359 {
2360 save_offset1 = md->offset_vector[number];
2361 save_offset2 = md->offset_vector[number+1];
2362 md->offset_vector[number] = eptr - md->start_subject;
2363
2364 DPRINTF(("saving %d %d\n", save_offset1, save_offset2));
2365 }
2366
2367 /* Recurse for all the alternatives. */
2368
2369 do
2370 {
2371 if (match(eptr, ecode+3, offset_top, md)) return TRUE;
2372 ecode += (ecode[1] << 8) + ecode[2];
2373 }
2374 while (*ecode == OP_ALT);
2375
2376 DPRINTF(("bracket %d failed\n", number/2));
2377
2378 if (number > 0 && number < md->offset_end)
2379 {
2380 md->offset_vector[number] = save_offset1;
2381 md->offset_vector[number+1] = save_offset2;
2382 }
2383
2384 return FALSE;
2385 }
2386
2387 /* Other types of node can be handled by a switch */
2388
2389 switch(*ecode)
2390 {
2391 case OP_END:
2392 md->end_match_ptr = eptr; /* Record where we ended */
2393 md->end_offset_top = offset_top; /* and how many extracts were taken */
2394 return TRUE;
2395
2396 /* The equivalent of Prolog's "cut" - if the rest doesn't match, the
2397 whole thing doesn't match, so we have to get out via a longjmp(). */
2398
2399 case OP_CUT:
2400 if (match(eptr, ecode+1, offset_top, md)) return TRUE;
2401 longjmp(md->fail_env, 1);
2402
2403 /* Assertion brackets. Check the alternative branches in turn - the
2404 matching won't pass the KET for an assertion. If any one branch matches,
2405 the assertion is true. */
2406
2407 case OP_ASSERT:
2408 do
2409 {
2410 if (match(eptr, ecode+3, offset_top, md)) break;
2411 ecode += (ecode[1] << 8) + ecode[2];
2412 }
2413 while (*ecode == OP_ALT);
2414 if (*ecode == OP_KET) return FALSE;
2415
2416 /* Continue from after the assertion, updating the offsets high water
2417 mark, since extracts may have been taken during the assertion. */
2418
2419 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2420 ecode += 3;
2421 offset_top = md->end_offset_top;
2422 continue;
2423
2424 /* Negative assertion: all branches must fail to match */
2425
2426 case OP_ASSERT_NOT:
2427 do
2428 {
2429 if (match(eptr, ecode+3, offset_top, md)) return FALSE;
2430 ecode += (ecode[1] << 8) + ecode[2];
2431 }
2432 while (*ecode == OP_ALT);
2433 ecode += 3;
2434 continue;
2435
2436 /* "Once" brackets are like assertion brackets except that after a match,
2437 the point in the subject string is not moved back. Thus there can never be
2438 a move back into the brackets. Check the alternative branches in turn - the
2439 matching won't pass the KET for this kind of subpattern. If any one branch
2440 matches, we carry on, leaving the subject pointer. */
2441
2442 case OP_ONCE:
2443 do
2444 {
2445 if (match(eptr, ecode+3, offset_top, md)) break;
2446 ecode += (ecode[1] << 8) + ecode[2];
2447 }
2448 while (*ecode == OP_ALT);
2449 if (*ecode == OP_KET) return FALSE;
2450
2451 /* Continue as from after the assertion, updating the offsets high water
2452 mark, since extracts may have been taken. */
2453
2454 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2455 ecode += 3;
2456 offset_top = md->end_offset_top;
2457 eptr = md->end_match_ptr;
2458 continue;
2459
2460 /* An alternation is the end of a branch; scan along to find the end of the
2461 bracketed group and go to there. */
2462
2463 case OP_ALT:
2464 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2465 break;
2466
2467 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
2468 that it may occur zero times. It may repeat infinitely, or not at all -
2469 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
2470 repeat limits are compiled as a number of copies, with the optional ones
2471 preceded by BRAZERO or BRAMINZERO. */
2472
2473 case OP_BRAZERO:
2474 {
2475 const uschar *next = ecode+1;
2476 if (match(eptr, next, offset_top, md)) return TRUE;
2477 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
2478 ecode = next + 3;
2479 }
2480 break;
2481
2482 case OP_BRAMINZERO:
2483 {
2484 const uschar *next = ecode+1;
2485 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
2486 if (match(eptr, next+3, offset_top, md)) return TRUE;
2487 ecode++;
2488 }
2489 break;;
2490
2491 /* End of a group, repeated or non-repeating. If we are at the end of
2492 an assertion "group", stop matching and return TRUE, but record the
2493 current high water mark for use by positive assertions. */
2494
2495 case OP_KET:
2496 case OP_KETRMIN:
2497 case OP_KETRMAX:
2498 {
2499 int number;
2500 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
2501
2502 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || *prev == OP_ONCE)
2503 {
2504 md->end_match_ptr = eptr; /* For ONCE */
2505 md->end_offset_top = offset_top;
2506 return TRUE;
2507 }
2508
2509 /* In all other cases we have to check the group number back at the
2510 start and if necessary complete handling an extraction by setting the
2511 final offset and bumping the high water mark. */
2512
2513 number = (*prev - OP_BRA) << 1;
2514
2515 DPRINTF(("end bracket %d\n", number/2));
2516
2517 if (number > 0)
2518 {
2519 if (number >= md->offset_end) md->offset_overflow = TRUE; else
2520 {
2521 md->offset_vector[number+1] = eptr - md->start_subject;
2522 if (offset_top <= number) offset_top = number + 2;
2523 }
2524 }
2525
2526 /* For a non-repeating ket, just advance to the next node and continue at
2527 this level. */
2528
2529 if (*ecode == OP_KET)
2530 {
2531 ecode += 3;
2532 break;
2533 }
2534
2535 /* The repeating kets try the rest of the pattern or restart from the
2536 preceding bracket, in the appropriate order. */
2537
2538 if (*ecode == OP_KETRMIN)
2539 {
2540 if (match(eptr, ecode+3, offset_top, md) ||
2541 match(eptr, prev, offset_top, md)) return TRUE;
2542 }
2543 else /* OP_KETRMAX */
2544 {
2545 if (match(eptr, prev, offset_top, md) ||
2546 match(eptr, ecode+3, offset_top, md)) return TRUE;
2547 }
2548 }
2549 return FALSE;
2550
2551 /* Start of subject unless notbol, or after internal newline if multiline */
2552
2553 case OP_CIRC:
2554 if (md->notbol && eptr == md->start_subject) return FALSE;
2555 if (md->multiline)
2556 {
2557 if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;
2558 ecode++;
2559 break;
2560 }
2561 /* ... else fall through */
2562
2563 /* Start of subject assertion */
2564
2565 case OP_SOD:
2566 if (eptr != md->start_subject) return FALSE;
2567 ecode++;
2568 break;
2569
2570 /* Assert before internal newline if multiline, or before
2571 a terminating newline unless endonly is set, else end of subject unless
2572 noteol is set. */
2573
2574 case OP_DOLL:
2575 if (md->noteol && eptr >= md->end_subject) return FALSE;
2576 if (md->multiline)
2577 {
2578 if (eptr < md->end_subject && *eptr != '\n') return FALSE;
2579 ecode++;
2580 break;
2581 }
2582 else if (!md->endonly)
2583 {
2584 if (eptr < md->end_subject - 1 ||
2585 (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
2586 ecode++;
2587 break;
2588 }
2589 /* ... else fall through */
2590
2591 /* End of subject assertion */
2592
2593 case OP_EOD:
2594 if (eptr < md->end_subject) return FALSE;
2595 ecode++;
2596 break;
2597
2598 /* Word boundary assertions */
2599
2600 case OP_NOT_WORD_BOUNDARY:
2601 case OP_WORD_BOUNDARY:
2602 {
2603 BOOL prev_is_word = (eptr != md->start_subject) &&
2604 ((pcre_ctypes[eptr[-1]] & ctype_word) != 0);
2605 BOOL cur_is_word = (eptr < md->end_subject) &&
2606 ((pcre_ctypes[*eptr] & ctype_word) != 0);
2607 if ((*ecode++ == OP_WORD_BOUNDARY)?
2608 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2609 return FALSE;
2610 }
2611 break;
2612
2613 /* Match a single character type; inline for speed */
2614
2615 case OP_ANY:
2616 if (!md->dotall && eptr < md->end_subject && *eptr == '\n') return FALSE;
2617 if (eptr++ >= md->end_subject) return FALSE;
2618 ecode++;
2619 break;
2620
2621 case OP_NOT_DIGIT:
2622 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) != 0)
2623 return FALSE;
2624 ecode++;
2625 break;
2626
2627 case OP_DIGIT:
2628 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) == 0)
2629 return FALSE;
2630 ecode++;
2631 break;
2632
2633 case OP_NOT_WHITESPACE:
2634 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) != 0)
2635 return FALSE;
2636 ecode++;
2637 break;
2638
2639 case OP_WHITESPACE:
2640 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) == 0)
2641 return FALSE;
2642 ecode++;
2643 break;
2644
2645 case OP_NOT_WORDCHAR:
2646 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) != 0)
2647 return FALSE;
2648 ecode++;
2649 break;
2650
2651 case OP_WORDCHAR:
2652 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) == 0)
2653 return FALSE;
2654 ecode++;
2655 break;
2656
2657 /* Match a back reference, possibly repeatedly. Look past the end of the
2658 item to see if there is repeat information following. The code is similar
2659 to that for character classes, but repeated for efficiency. Then obey
2660 similar code to character type repeats - written out again for speed.
2661 However, if the referenced string is the empty string, always treat
2662 it as matched, any number of times (otherwise there could be infinite
2663 loops). */
2664
2665 case OP_REF:
2666 {
2667 int length;
2668 int number = ecode[1] << 1; /* Doubled reference number */
2669 ecode += 2; /* Advance past the item */
2670
2671 if (number >= offset_top || md->offset_vector[number] < 0)
2672 {
2673 md->errorcode = PCRE_ERROR_BADREF;
2674 return FALSE;
2675 }
2676
2677 length = md->offset_vector[number+1] - md->offset_vector[number];
2678
2679 switch (*ecode)
2680 {
2681 case OP_CRSTAR:
2682 case OP_CRMINSTAR:
2683 case OP_CRPLUS:
2684 case OP_CRMINPLUS:
2685 case OP_CRQUERY:
2686 case OP_CRMINQUERY:
2687 c = *ecode++ - OP_CRSTAR;
2688 minimize = (c & 1) != 0;
2689 min = rep_min[c]; /* Pick up values from tables; */
2690 max = rep_max[c]; /* zero for max => infinity */
2691 if (max == 0) max = INT_MAX;
2692 break;
2693
2694 case OP_CRRANGE:
2695 case OP_CRMINRANGE:
2696 minimize = (*ecode == OP_CRMINRANGE);
2697 min = (ecode[1] << 8) + ecode[2];
2698 max = (ecode[3] << 8) + ecode[4];
2699 if (max == 0) max = INT_MAX;
2700 ecode += 5;
2701 break;
2702
2703 default: /* No repeat follows */
2704 if (!match_ref(number, eptr, length, md)) return FALSE;
2705 eptr += length;
2706 continue; /* With the main loop */
2707 }
2708
2709 /* If the length of the reference is zero, just continue with the
2710 main loop. */
2711
2712 if (length == 0) continue;
2713
2714 /* First, ensure the minimum number of matches are present. We get back
2715 the length of the reference string explicitly rather than passing the
2716 address of eptr, so that eptr can be a register variable. */
2717
2718 for (i = 1; i <= min; i++)
2719 {
2720 if (!match_ref(number, eptr, length, md)) return FALSE;
2721 eptr += length;
2722 }
2723
2724 /* If min = max, continue at the same level without recursion.
2725 They are not both allowed to be zero. */
2726
2727 if (min == max) continue;
2728
2729 /* If minimizing, keep trying and advancing the pointer */
2730
2731 if (minimize)
2732 {
2733 for (i = min;; i++)
2734 {
2735 if (match(eptr, ecode, offset_top, md)) return TRUE;
2736 if (i >= max || !match_ref(number, eptr, length, md))
2737 return FALSE;
2738 eptr += length;
2739 }
2740 /* Control never gets here */
2741 }
2742
2743 /* If maximizing, find the longest string and work backwards */
2744
2745 else
2746 {
2747 const uschar *pp = eptr;
2748 for (i = min; i < max; i++)
2749 {
2750 if (!match_ref(number, eptr, length, md)) break;
2751 eptr += length;
2752 }
2753 while (eptr >= pp)
2754 {
2755 if (match(eptr, ecode, offset_top, md)) return TRUE;
2756 eptr -= length;
2757 }
2758 return FALSE;
2759 }
2760 }
2761 /* Control never gets here */
2762
2763 /* Match a character class, possibly repeatedly. Look past the end of the
2764 item to see if there is repeat information following. Then obey similar
2765 code to character type repeats - written out again for speed. If caseless
2766 matching was set at runtime but not at compile time, we have to check both
2767 versions of a character, and we have to behave differently for positive and
2768 negative classes. This is the only time where OP_CLASS and OP_NEGCLASS are
2769 treated differently. */
2770
2771 case OP_CLASS:
2772 case OP_NEGCLASS:
2773 {
2774 BOOL nasty_case = *ecode == OP_NEGCLASS && md->runtime_caseless;
2775 const uschar *data = ecode + 1; /* Save for matching */
2776 ecode += 33; /* Advance past the item */
2777
2778 switch (*ecode)
2779 {
2780 case OP_CRSTAR:
2781 case OP_CRMINSTAR:
2782 case OP_CRPLUS:
2783 case OP_CRMINPLUS:
2784 case OP_CRQUERY:
2785 case OP_CRMINQUERY:
2786 c = *ecode++ - OP_CRSTAR;
2787 minimize = (c & 1) != 0;
2788 min = rep_min[c]; /* Pick up values from tables; */
2789 max = rep_max[c]; /* zero for max => infinity */
2790 if (max == 0) max = INT_MAX;
2791 break;
2792
2793 case OP_CRRANGE:
2794 case OP_CRMINRANGE:
2795 minimize = (*ecode == OP_CRMINRANGE);
2796 min = (ecode[1] << 8) + ecode[2];
2797 max = (ecode[3] << 8) + ecode[4];
2798 if (max == 0) max = INT_MAX;
2799 ecode += 5;
2800 break;
2801
2802 default: /* No repeat follows */
2803 min = max = 1;
2804 break;
2805 }
2806
2807 /* First, ensure the minimum number of matches are present. */
2808
2809 for (i = 1; i <= min; i++)
2810 {
2811 if (eptr >= md->end_subject) return FALSE;
2812 c = *eptr++;
2813
2814 /* Either not runtime caseless, or it was a positive class. For
2815 runtime caseless, continue if either case is in the map. */
2816
2817 if (!nasty_case)
2818 {
2819 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2820 if (md->runtime_caseless)
2821 {
2822 c = pcre_fcc[c];
2823 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2824 }
2825 }
2826
2827 /* Runtime caseless and it was a negative class. Continue only if
2828 both cases are in the map. */
2829
2830 else
2831 {
2832 if ((data[c/8] & (1 << (c&7))) == 0) return FALSE;
2833 c = pcre_fcc[c];
2834 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2835 }
2836
2837 return FALSE;
2838 }
2839
2840 /* If max == min we can continue with the main loop without the
2841 need to recurse. */
2842
2843 if (min == max) continue;
2844
2845 /* If minimizing, keep testing the rest of the expression and advancing
2846 the pointer while it matches the class. */
2847
2848 if (minimize)
2849 {
2850 for (i = min;; i++)
2851 {
2852 if (match(eptr, ecode, offset_top, md)) return TRUE;
2853 if (i >= max || eptr >= md->end_subject) return FALSE;
2854 c = *eptr++;
2855
2856 /* Either not runtime caseless, or it was a positive class. For
2857 runtime caseless, continue if either case is in the map. */
2858
2859 if (!nasty_case)
2860 {
2861 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2862 if (md->runtime_caseless)
2863 {
2864 c = pcre_fcc[c];
2865 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2866 }
2867 }
2868
2869 /* Runtime caseless and it was a negative class. Continue only if
2870 both cases are in the map. */
2871
2872 else
2873 {
2874 if ((data[c/8] & (1 << (c&7))) == 0) return FALSE;
2875 c = pcre_fcc[c];
2876 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2877 }
2878
2879 return FALSE;
2880 }
2881 /* Control never gets here */
2882 }
2883
2884 /* If maximizing, find the longest possible run, then work backwards. */
2885
2886 else
2887 {
2888 const uschar *pp = eptr;
2889 for (i = min; i < max; eptr++, i++)
2890 {
2891 if (eptr >= md->end_subject) break;
2892 c = *eptr;
2893
2894 /* Either not runtime caseless, or it was a positive class. For
2895 runtime caseless, continue if either case is in the map. */
2896
2897 if (!nasty_case)
2898 {
2899 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2900 if (md->runtime_caseless)
2901 {
2902 c = pcre_fcc[c];
2903 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2904 }
2905 }
2906
2907 /* Runtime caseless and it was a negative class. Continue only if
2908 both cases are in the map. */
2909
2910 else
2911 {
2912 if ((data[c/8] & (1 << (c&7))) == 0) break;
2913 c = pcre_fcc[c];
2914 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2915 }
2916
2917 break;
2918 }
2919
2920 while (eptr >= pp)
2921 if (match(eptr--, ecode, offset_top, md)) return TRUE;
2922 return FALSE;
2923 }
2924 }
2925 /* Control never gets here */
2926
2927 /* Match a run of characters */
2928
2929 case OP_CHARS:
2930 {
2931 register int length = ecode[1];
2932 ecode += 2;
2933
2934 #ifdef DEBUG /* Sigh. Some compilers never learn. */
2935 if (eptr >= md->end_subject)
2936 printf("matching subject <null> against pattern ");
2937 else
2938 {
2939 printf("matching subject ");
2940 pchars(eptr, length, TRUE, md);
2941 printf(" against pattern ");
2942 }
2943 pchars(ecode, length, FALSE, md);
2944 printf("\n");
2945 #endif
2946
2947 if (length > md->end_subject - eptr) return FALSE;
2948 if (md->caseless)
2949 {
2950 while (length-- > 0) if (pcre_lcc[*ecode++] != pcre_lcc[*eptr++]) return FALSE;
2951 }
2952 else
2953 {
2954 while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
2955 }
2956 }
2957 break;
2958
2959 /* Match a single character repeatedly; different opcodes share code. */
2960
2961 case OP_EXACT:
2962 min = max = (ecode[1] << 8) + ecode[2];
2963 ecode += 3;
2964 goto REPEATCHAR;
2965
2966 case OP_UPTO:
2967 case OP_MINUPTO:
2968 min = 0;
2969 max = (ecode[1] << 8) + ecode[2];
2970 minimize = *ecode == OP_MINUPTO;
2971 ecode += 3;
2972 goto REPEATCHAR;
2973
2974 case OP_STAR:
2975 case OP_MINSTAR:
2976 case OP_PLUS:
2977 case OP_MINPLUS:
2978 case OP_QUERY:
2979 case OP_MINQUERY:
2980 c = *ecode++ - OP_STAR;
2981 minimize = (c & 1) != 0;
2982 min = rep_min[c]; /* Pick up values from tables; */
2983 max = rep_max[c]; /* zero for max => infinity */
2984 if (max == 0) max = INT_MAX;
2985
2986 /* Common code for all repeated single-character matches. We can give
2987 up quickly if there are fewer than the minimum number of characters left in
2988 the subject. */
2989
2990 REPEATCHAR:
2991 if (min > md->end_subject - eptr) return FALSE;
2992 c = *ecode++;
2993
2994 /* The code is duplicated for the caseless and caseful cases, for speed,
2995 since matching characters is likely to be quite common. First, ensure the
2996 minimum number of matches are present. If min = max, continue at the same
2997 level without recursing. Otherwise, if minimizing, keep trying the rest of
2998 the expression and advancing one matching character if failing, up to the
2999 maximum. Alternatively, if maximizing, find the maximum number of
3000 characters and work backwards. */
3001
3002 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
3003 max, eptr));
3004
3005 if (md->caseless)
3006 {
3007 c = pcre_lcc[c];
3008 for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) return FALSE;
3009 if (min == max) continue;
3010 if (minimize)
3011 {
3012 for (i = min;; i++)
3013 {
3014 if (match(eptr, ecode, offset_top, md)) return TRUE;
3015 if (i >= max || eptr >= md->end_subject || c != pcre_lcc[*eptr++])
3016 return FALSE;
3017 }
3018 /* Control never gets here */
3019 }
3020 else
3021 {
3022 const uschar *pp = eptr;
3023 for (i = min; i < max; i++)
3024 {
3025 if (eptr >= md->end_subject || c != pcre_lcc[*eptr]) break;
3026 eptr++;
3027 }
3028 while (eptr >= pp)
3029 if (match(eptr--, ecode, offset_top, md)) return TRUE;
3030 return FALSE;
3031 }
3032 /* Control never gets here */
3033 }
3034
3035 /* Caseful comparisons */
3036
3037 else
3038 {
3039 for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
3040 if (min == max) continue;
3041 if (minimize)
3042 {
3043 for (i = min;; i++)
3044 {
3045 if (match(eptr, ecode, offset_top, md)) return TRUE;
3046 if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
3047 }
3048 /* Control never gets here */
3049 }
3050 else
3051 {
3052 const uschar *pp = eptr;
3053 for (i = min; i < max; i++)
3054 {
3055 if (eptr >= md->end_subject || c != *eptr) break;
3056 eptr++;
3057 }
3058 while (eptr >= pp)
3059 if (match(eptr--, ecode, offset_top, md)) return TRUE;
3060 return FALSE;
3061 }
3062 }
3063 /* Control never gets here */
3064
3065 /* Match a negated single character */
3066
3067 case OP_NOT:
3068 if (eptr >= md->end_subject) return FALSE;
3069 ecode++;
3070 if (md->caseless)
3071 {
3072 if (pcre_lcc[*ecode++] == pcre_lcc[*eptr++]) return FALSE;
3073 }
3074 else
3075 {
3076 if (*ecode++ == *eptr++) return FALSE;
3077 }
3078 break;
3079
3080 /* Match a negated single character repeatedly. This is almost a repeat of
3081 the code for a repeated single character, but I haven't found a nice way of
3082 commoning these up that doesn't require a test of the positive/negative
3083 option for each character match. Maybe that wouldn't add very much to the
3084 time taken, but character matching *is* what this is all about... */
3085
3086 case OP_NOTEXACT:
3087 min = max = (ecode[1] << 8) + ecode[2];
3088 ecode += 3;
3089 goto REPEATNOTCHAR;
3090
3091 case OP_NOTUPTO:
3092 case OP_NOTMINUPTO:
3093 min = 0;
3094 max = (ecode[1] << 8) + ecode[2];
3095 minimize = *ecode == OP_NOTMINUPTO;
3096 ecode += 3;
3097 goto REPEATNOTCHAR;
3098
3099 case OP_NOTSTAR:
3100 case OP_NOTMINSTAR:
3101 case OP_NOTPLUS:
3102 case OP_NOTMINPLUS:
3103 case OP_NOTQUERY:
3104 case OP_NOTMINQUERY:
3105 c = *ecode++ - OP_NOTSTAR;
3106 minimize = (c & 1) != 0;
3107 min = rep_min[c]; /* Pick up values from tables; */
3108 max = rep_max[c]; /* zero for max => infinity */
3109 if (max == 0) max = INT_MAX;
3110
3111 /* Common code for all repeated single-character matches. We can give
3112 up quickly if there are fewer than the minimum number of characters left in
3113 the subject. */
3114
3115 REPEATNOTCHAR:
3116 if (min > md->end_subject - eptr) return FALSE;
3117 c = *ecode++;
3118
3119 /* The code is duplicated for the caseless and caseful cases, for speed,
3120 since matching characters is likely to be quite common. First, ensure the
3121 minimum number of matches are present. If min = max, continue at the same
3122 level without recursing. Otherwise, if minimizing, keep trying the rest of
3123 the expression and advancing one matching character if failing, up to the
3124 maximum. Alternatively, if maximizing, find the maximum number of
3125 characters and work backwards. */
3126
3127 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
3128 max, eptr));
3129
3130 if (md->caseless)
3131 {
3132 c = pcre_lcc[c];
3133 for (i = 1; i <= min; i++) if (c == pcre_lcc[*eptr++]) return FALSE;
3134 if (min == max) continue;
3135 if (minimize)
3136 {
3137 for (i = min;; i++)
3138 {
3139 if (match(eptr, ecode, offset_top, md)) return TRUE;
3140 if (i >= max || eptr >= md->end_subject || c == pcre_lcc[*eptr++])
3141 return FALSE;
3142 }
3143 /* Control never gets here */
3144 }
3145 else
3146 {
3147 const uschar *pp = eptr;
3148 for (i = min; i < max; i++)
3149 {
3150 if (eptr >= md->end_subject || c == pcre_lcc[*eptr]) break;
3151 eptr++;
3152 }
3153 while (eptr >= pp)
3154 if (match(eptr--, ecode, offset_top, md)) return TRUE;
3155 return FALSE;
3156 }
3157 /* Control never gets here */
3158 }
3159
3160 /* Caseful comparisons */
3161
3162 else
3163 {
3164 for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
3165 if (min == max) continue;
3166 if (minimize)
3167 {
3168 for (i = min;; i++)
3169 {
3170 if (match(eptr, ecode, offset_top, md)) return TRUE;
3171 if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
3172 }
3173 /* Control never gets here */
3174 }
3175 else
3176 {
3177 const uschar *pp = eptr;
3178 for (i = min; i < max; i++)
3179 {
3180 if (eptr >= md->end_subject || c == *eptr) break;
3181 eptr++;
3182 }
3183 while (eptr >= pp)
3184 if (match(eptr--, ecode, offset_top, md)) return TRUE;
3185 return FALSE;
3186 }
3187 }
3188 /* Control never gets here */
3189
3190 /* Match a single character type repeatedly; several different opcodes
3191 share code. This is very similar to the code for single characters, but we
3192 repeat it in the interests of efficiency. */
3193
3194 case OP_TYPEEXACT:
3195 min = max = (ecode[1] << 8) + ecode[2];
3196 minimize = TRUE;
3197 ecode += 3;
3198 goto REPEATTYPE;
3199
3200 case OP_TYPEUPTO:
3201 case OP_TYPEMINUPTO:
3202 min = 0;
3203 max = (ecode[1] << 8) + ecode[2];
3204 minimize = *ecode == OP_TYPEMINUPTO;
3205 ecode += 3;
3206 goto REPEATTYPE;
3207
3208 case OP_TYPESTAR:
3209 case OP_TYPEMINSTAR:
3210 case OP_TYPEPLUS:
3211 case OP_TYPEMINPLUS:
3212 case OP_TYPEQUERY:
3213 case OP_TYPEMINQUERY:
3214 c = *ecode++ - OP_TYPESTAR;
3215 minimize = (c & 1) != 0;
3216 min = rep_min[c]; /* Pick up values from tables; */
3217 max = rep_max[c]; /* zero for max => infinity */
3218 if (max == 0) max = INT_MAX;
3219
3220 /* Common code for all repeated single character type matches */
3221
3222 REPEATTYPE:
3223 ctype = *ecode++; /* Code for the character type */
3224
3225 /* First, ensure the minimum number of matches are present. Use inline
3226 code for maximizing the speed, and do the type test once at the start
3227 (i.e. keep it out of the loop). Also test that there are at least the
3228 minimum number of characters before we start. */
3229
3230 if (min > md->end_subject - eptr) return FALSE;
3231 if (min > 0) switch(ctype)
3232 {
3233 case OP_ANY:
3234 if (!md->dotall)
3235 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }
3236 else eptr += min;
3237 break;
3238
3239 case OP_NOT_DIGIT:
3240 for (i = 1; i <= min; i++)
3241 if ((pcre_ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
3242 break;
3243
3244 case OP_DIGIT:
3245 for (i = 1; i <= min; i++)
3246 if ((pcre_ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
3247 break;
3248
3249 case OP_NOT_WHITESPACE:
3250 for (i = 1; i <= min; i++)
3251 if ((pcre_ctypes[*eptr++] & ctype_space) != 0) return FALSE;
3252 break;
3253
3254 case OP_WHITESPACE:
3255 for (i = 1; i <= min; i++)
3256 if ((pcre_ctypes[*eptr++] & ctype_space) == 0) return FALSE;
3257 break;
3258
3259 case OP_NOT_WORDCHAR:
3260 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) != 0)
3261 return FALSE;
3262 break;
3263
3264 case OP_WORDCHAR:
3265 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) == 0)
3266 return FALSE;
3267 break;
3268 }
3269
3270 /* If min = max, continue at the same level without recursing */
3271
3272 if (min == max) continue;
3273
3274 /* If minimizing, we have to test the rest of the pattern before each
3275 subsequent match, so inlining isn't much help; just use the function. */
3276
3277 if (minimize)
3278 {
3279 for (i = min;; i++)
3280 {
3281 if (match(eptr, ecode, offset_top, md)) return TRUE;
3282 if (i >= max || eptr >= md->end_subject ||
3283 !match_type(ctype, *eptr++, md->dotall))
3284 return FALSE;
3285 }
3286 /* Control never gets here */
3287 }
3288
3289 /* If maximizing it is worth using inline code for speed, doing the type
3290 test once at the start (i.e. keep it out of the loop). */
3291
3292 else
3293 {
3294 const uschar *pp = eptr;
3295 switch(ctype)
3296 {
3297 case OP_ANY:
3298 if (!md->dotall)
3299 {
3300 for (i = min; i < max; i++)
3301 {
3302 if (eptr >= md->end_subject || *eptr == '\n') break;
3303 eptr++;
3304 }
3305 }
3306 else
3307 {
3308 c = max - min;
3309 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
3310 eptr += c;
3311 }
3312 break;
3313
3314 case OP_NOT_DIGIT:
3315 for (i = min; i < max; i++)
3316 {
3317 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) != 0)
3318 break;
3319 eptr++;
3320 }
3321 break;
3322
3323 case OP_DIGIT:
3324 for (i = min; i < max; i++)
3325 {
3326 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) == 0)
3327 break;
3328 eptr++;
3329 }
3330 break;
3331
3332 case OP_NOT_WHITESPACE:
3333 for (i = min; i < max; i++)
3334 {
3335 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) != 0)
3336 break;
3337 eptr++;
3338 }
3339 break;
3340
3341 case OP_WHITESPACE:
3342 for (i = min; i < max; i++)
3343 {
3344 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) == 0)
3345 break;
3346 eptr++;
3347 }
3348 break;
3349
3350 case OP_NOT_WORDCHAR:
3351 for (i = min; i < max; i++)
3352 {
3353 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) != 0)
3354 break;
3355 eptr++;
3356 }
3357 break;
3358
3359 case OP_WORDCHAR:
3360 for (i = min; i < max; i++)
3361 {
3362 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) == 0)
3363 break;
3364 eptr++;
3365 }
3366 break;
3367 }
3368
3369 while (eptr >= pp)
3370 if (match(eptr--, ecode, offset_top, md)) return TRUE;
3371 return FALSE;
3372 }
3373 /* Control never gets here */
3374
3375 /* There's been some horrible disaster. */
3376
3377 default:
3378 DPRINTF(("Unknown opcode %d\n", *ecode));
3379 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
3380 return FALSE;
3381 }
3382
3383 /* Do not stick any code in here without much thought; it is assumed
3384 that "continue" in the code above comes out to here to repeat the main
3385 loop. */
3386
3387 } /* End of main loop */
3388 /* Control never reaches here */
3389 }
3390
3391
3392
3393 /*************************************************
3394 * Segregate setjmp() *
3395 *************************************************/
3396
3397 /* The -Wall option of gcc gives warnings for all local variables when setjmp()
3398 is used, even if the coding conforms to the rules of ANSI C. To avoid this, we
3399 hide it in a separate function. This is called only when PCRE_EXTRA is set,
3400 since it's needed only for the extension \X option, and with any luck, a good
3401 compiler will spot the tail recursion and compile it efficiently.
3402
3403 Arguments:
3404 eptr pointer in subject
3405 ecode position in code
3406 offset_top current top pointer
3407 md pointer to "static" info for the match
3408
3409 Returns: TRUE if matched
3410 */
3411
3412 static BOOL
3413 match_with_setjmp(const uschar *eptr, const uschar *ecode, int offset_top,
3414 match_data *match_block)
3415 {
3416 return setjmp(match_block->fail_env) == 0 &&
3417 match(eptr, ecode, offset_top, match_block);
3418 }
3419
3420
3421
3422 /*************************************************
3423 * Execute a Regular Expression *
3424 *************************************************/
3425
3426 /* This function applies a compiled re to a subject string and picks out
3427 portions of the string if it matches. Two elements in the vector are set for
3428 each substring: the offsets to the start and end of the substring.
3429
3430 Arguments:
3431 external_re points to the compiled expression
3432 external_extra points to "hints" from pcre_study() or is NULL
3433 subject points to the subject string
3434 length length of subject string (may contain binary zeros)
3435 options option bits
3436 offsets points to a vector of ints to be filled in with offsets
3437 offsetcount the number of elements in the vector
3438
3439 Returns: > 0 => success; value is the number of elements filled in
3440 = 0 => success, but offsets is not big enough
3441 -1 => failed to match
3442 < -1 => some kind of unexpected problem
3443 */
3444
3445 int
3446 pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
3447 const char *subject, int length, int options, int *offsets, int offsetcount)
3448 {
3449 int resetcount, ocount;
3450 int first_char = -1;
3451 match_data match_block;
3452 const uschar *start_bits = NULL;
3453 const uschar *start_match = (const uschar *)subject;
3454 const uschar *end_subject;
3455 const real_pcre *re = (const real_pcre *)external_re;
3456 const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
3457 BOOL using_temporary_offsets = FALSE;
3458 BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
3459 BOOL startline = (re->options & PCRE_STARTLINE) != 0;
3460
3461 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3462
3463 if (re == NULL || subject == NULL ||
3464 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3465 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
3466
3467 match_block.start_subject = (const uschar *)subject;
3468 match_block.end_subject = match_block.start_subject + length;
3469 end_subject = match_block.end_subject;
3470
3471 match_block.caseless = ((re->options | options) & PCRE_CASELESS) != 0;
3472 match_block.runtime_caseless = match_block.caseless &&
3473 (re->options & PCRE_CASELESS) == 0;
3474
3475 match_block.multiline = ((re->options | options) & PCRE_MULTILINE) != 0;
3476 match_block.dotall = ((re->options | options) & PCRE_DOTALL) != 0;
3477 match_block.endonly = ((re->options | options) & PCRE_DOLLAR_ENDONLY) != 0;
3478
3479 match_block.notbol = (options & PCRE_NOTBOL) != 0;
3480 match_block.noteol = (options & PCRE_NOTEOL) != 0;
3481
3482 match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
3483
3484 /* If the expression has got more back references than the offsets supplied can
3485 hold, we get a temporary bit of working store to use during the matching.
3486 Otherwise, we can use the vector supplied, rounding down its size to a multiple
3487 of 2. */
3488
3489 ocount = offsetcount & (-2);
3490 if (re->top_backref > 0 && re->top_backref >= ocount/2)
3491 {
3492 ocount = re->top_backref * 2 + 2;
3493 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
3494 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
3495 using_temporary_offsets = TRUE;
3496 DPRINTF(("Got memory to hold back references\n"));
3497 }
3498 else match_block.offset_vector = offsets;
3499
3500 match_block.offset_end = ocount;
3501 match_block.offset_overflow = FALSE;
3502
3503 /* Compute the minimum number of offsets that we need to reset each time. Doing
3504 this makes a huge difference to execution time when there aren't many brackets
3505 in the pattern. */
3506
3507 resetcount = 2 + re->top_bracket * 2;
3508 if (resetcount > offsetcount) resetcount = ocount;
3509
3510 /* If MULTILINE is set at exec time but was not set at compile time, and the
3511 anchored flag is set, we must re-check because a setting provoked by ^ in the
3512 pattern is not right in multi-line mode. Calling is_anchored() again here does
3513 the right check, because multiline is now set. If it now yields FALSE, the
3514 expression must have had ^ starting some of its branches. Check to see if
3515 that is true for *all* branches, and if so, set the startline flag. */
3516
3517 if (match_block. multiline && anchored && (re->options & PCRE_MULTILINE) == 0 &&
3518 !is_anchored(re->code, match_block.multiline))
3519 {
3520 anchored = FALSE;
3521 if (is_startline(re->code)) startline = TRUE;
3522 }
3523
3524 /* Set up the first character to match, if available. The first_char value is
3525 never set for an anchored regular expression, but the anchoring may be forced
3526 at run time, so we have to test for anchoring. The first char may be unset for
3527 an unanchored pattern, of course. If there's no first char and the pattern was
3528 studied, the may be a bitmap of possible first characters. However, we can
3529 use this only if the caseless state of the studying was correct. */
3530
3531 if (!anchored)
3532 {
3533 if ((re->options & PCRE_FIRSTSET) != 0)
3534 {
3535 first_char = re->first_char;
3536 if (match_block.caseless) first_char = pcre_lcc[first_char];
3537 }
3538 else
3539 if (!startline && extra != NULL &&
3540 (extra->options & PCRE_STUDY_MAPPED) != 0 &&
3541 ((extra->options & PCRE_STUDY_CASELESS) != 0) == match_block.caseless)
3542 start_bits = extra->start_bits;
3543 }
3544
3545 /* Loop for unanchored matches; for anchored regexps the loop runs just once. */
3546
3547 do
3548 {
3549 int rc;
3550 register int *iptr = match_block.offset_vector;
3551 register int *iend = iptr + resetcount;
3552
3553 /* Reset the maximum number of extractions we might see. */
3554
3555 while (iptr < iend) *iptr++ = -1;
3556
3557 /* Advance to a unique first char if possible */
3558
3559 if (first_char >= 0)
3560 {
3561 if (match_block.caseless)
3562 while (start_match < end_subject && pcre_lcc[*start_match] != first_char)
3563 start_match++;
3564 else
3565 while (start_match < end_subject && *start_match != first_char)
3566 start_match++;
3567 }
3568
3569 /* Or to just after \n for a multiline match if possible */
3570
3571 else if (startline)
3572 {
3573 if (start_match > match_block.start_subject)
3574 {
3575 while (start_match < end_subject && start_match[-1] != '\n')
3576 start_match++;
3577 }
3578 }
3579
3580 /* Or to a non-unique first char */
3581
3582 else if (start_bits != NULL)
3583 {
3584 while (start_match < end_subject)
3585 {
3586 register int c = *start_match;
3587 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
3588 }
3589 }
3590
3591 #ifdef DEBUG /* Sigh. Some compilers never learn. */
3592 printf(">>>> Match against: ");
3593 pchars(start_match, end_subject - start_match, TRUE, &match_block);
3594 printf("\n");
3595 #endif
3596
3597 /* When a match occurs, substrings will be set for all internal extractions;
3598 we just need to set up the whole thing as substring 0 before returning. If
3599 there were too many extractions, set the return code to zero. In the case
3600 where we had to get some local store to hold offsets for backreferences, copy
3601 those back references that we can. In this case there need not be overflow
3602 if certain parts of the pattern were not used.
3603
3604 Before starting the match, we have to set up a longjmp() target to enable
3605 the "cut" operation to fail a match completely without backtracking. This
3606 is done in a separate function to avoid compiler warnings. We need not do
3607 it unless PCRE_EXTRA is set, since only in that case is the "cut" operation
3608 enabled. */
3609
3610 if ((re->options & PCRE_EXTRA) != 0)
3611 {
3612 if (!match_with_setjmp(start_match, re->code, 2, &match_block))
3613 continue;
3614 }
3615 else if (!match(start_match, re->code, 2, &match_block)) continue;
3616
3617 /* Copy the offset information from temporary store if necessary */
3618
3619 if (using_temporary_offsets)
3620 {
3621 if (offsetcount >= 4)
3622 {
3623 memcpy(offsets + 2, match_block.offset_vector + 2,
3624 (offsetcount - 2) * sizeof(int));
3625 DPRINTF(("Copied offsets from temporary memory\n"));
3626 }
3627 if (match_block.end_offset_top > offsetcount)
3628 match_block.offset_overflow = TRUE;
3629
3630 DPRINTF(("Freeing temporary memory\n"));
3631 (pcre_free)(match_block.offset_vector);
3632 }
3633
3634 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
3635
3636 if (match_block.offset_end < 2) rc = 0; else
3637 {
3638 offsets[0] = start_match - match_block.start_subject;
3639 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
3640 }
3641
3642 DPRINTF((">>>> returning %d\n", rc));
3643 return rc;
3644 }
3645 while (!anchored &&
3646 match_block.errorcode == PCRE_ERROR_NOMATCH &&
3647 start_match++ < end_subject);
3648
3649 if (using_temporary_offsets)
3650 {
3651 DPRINTF(("Freeing temporary memory\n"));
3652 (pcre_free)(match_block.offset_vector);
3653 }
3654
3655 DPRINTF((">>>> returning %d\n", match_block.errorcode));
3656
3657 return match_block.errorcode;
3658 }
3659
3660 /* End of pcre.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12