/[pcre]/code/trunk/pcre.c
ViewVC logotype

Contents of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 19 - (show annotations) (download)
Sat Feb 24 21:38:33 2007 UTC (7 years, 6 months ago) by nigel
File MIME type: text/plain
File size: 104324 byte(s)
Load pcre-1.08 into code/trunk.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /*
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
9
10 Written by: Philip Hazel <ph10@cam.ac.uk>
11
12 Copyright (c) 1998 University of Cambridge
13
14 -----------------------------------------------------------------------------
15 Permission is granted to anyone to use this software for any purpose on any
16 computer system, and to redistribute it freely, subject to the following
17 restrictions:
18
19 1. This software is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22
23 2. The origin of this software must not be misrepresented, either by
24 explicit claim or by omission.
25
26 3. Altered versions must be plainly marked as such, and must not be
27 misrepresented as being the original software.
28 -----------------------------------------------------------------------------
29 */
30
31
32 /* Define DEBUG to get debugging output on stdout. */
33
34 /* #define DEBUG */
35
36 /* Use a macro for debugging printing, 'cause that eliminates the the use
37 of #ifdef inline, and there are *still* stupid compilers about that don't like
38 indented pre-processor statements. I suppose it's only been 10 years... */
39
40 #ifdef DEBUG
41 #define DPRINTF(p) printf p
42 #else
43 #define DPRINTF(p) /*nothing*/
44 #endif
45
46 /* Include the internals header, which itself includes Standard C headers plus
47 the external pcre header. */
48
49 #include "internal.h"
50
51
52 /* Allow compilation as C++ source code, should anybody want to do that. */
53
54 #ifdef __cplusplus
55 #define class pcre_class
56 #endif
57
58
59 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
60
61 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
62 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
63
64 /* Text forms of OP_ values and things, for debugging (not all used) */
65
66 #ifdef DEBUG
67 static const char *OP_names[] = {
68 "End", "\\A", "\\B", "\\b", "\\D", "\\d",
69 "\\S", "\\s", "\\W", "\\w", "Cut", "\\Z", "^", "$", "Any", "chars",
70 "not",
71 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
72 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
73 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
74 "*", "*?", "+", "+?", "?", "??", "{", "{",
75 "class", "negclass", "Ref",
76 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",
77 "Brazero", "Braminzero", "Bra"
78 };
79 #endif
80
81 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
82 are simple data values; negative values are for special things like \d and so
83 on. Zero means further processing is needed (for things like \x), or the escape
84 is invalid. */
85
86 static const short int escapes[] = {
87 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
88 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
89 '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
90 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
91 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
92 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
93 '`', 7, -ESC_b, 0, -ESC_d, 27, '\f', 0, /* ` - g */
94 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
95 0, 0, '\r', -ESC_s, '\t', 0, 0, -ESC_w, /* p - w */
96 0, 0, 0 /* x - z */
97 };
98
99 /* Definition to allow mutual recursion */
100
101 static BOOL
102 compile_regex(int, int *, uschar **, const uschar **, const char **);
103
104 /* Structure for passing "static" information around between the functions
105 doing the matching, so that they are thread-safe. */
106
107 typedef struct match_data {
108 int errorcode; /* As it says */
109 int *offset_vector; /* Offset vector */
110 int offset_end; /* One past the end */
111 BOOL offset_overflow; /* Set if too many extractions */
112 BOOL caseless; /* Case-independent flag */
113 BOOL runtime_caseless; /* Caseless forced at run time */
114 BOOL multiline; /* Multiline flag */
115 BOOL notbol; /* NOTBOL flag */
116 BOOL noteol; /* NOTEOL flag */
117 BOOL dotall; /* Dot matches any char */
118 BOOL endonly; /* Dollar not before final \n */
119 const uschar *start_subject; /* Start of the subject string */
120 const uschar *end_subject; /* End of the subject string */
121 jmp_buf fail_env; /* Environment for longjump() break out */
122 const uschar *end_match_ptr; /* Subject position at end match */
123 int end_offset_top; /* Highwater mark at end of match */
124 } match_data;
125
126
127
128 /*************************************************
129 * Global variables *
130 *************************************************/
131
132 /* PCRE is thread-clean and doesn't use any global variables in the normal
133 sense. However, it calls memory allocation and free functions via the two
134 indirections below, which are can be changed by the caller, but are shared
135 between all threads. */
136
137 void *(*pcre_malloc)(size_t) = malloc;
138 void (*pcre_free)(void *) = free;
139
140
141
142
143 /*************************************************
144 * Return version string *
145 *************************************************/
146
147 const char *
148 pcre_version(void)
149 {
150 return PCRE_VERSION;
151 }
152
153
154
155
156 /*************************************************
157 * Return info about a compiled pattern *
158 *************************************************/
159
160 /* This function picks potentially useful data out of the private
161 structure.
162
163 Arguments:
164 external_re points to compiled code
165 optptr where to pass back the options
166 first_char where to pass back the first character,
167 or -1 if multiline and all branches start ^,
168 or -2 otherwise
169
170 Returns: number of identifying extraction brackets
171 or negative values on error
172 */
173
174 int
175 pcre_info(const pcre *external_re, int *optptr, int *first_char)
176 {
177 const real_pcre *re = (const real_pcre *)external_re;
178 if (re == NULL) return PCRE_ERROR_NULL;
179 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
180 if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
181 if (first_char != NULL)
182 *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
183 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
184 return re->top_bracket;
185 }
186
187
188
189
190 #ifdef DEBUG
191 /*************************************************
192 * Debugging function to print chars *
193 *************************************************/
194
195 /* Print a sequence of chars in printable format, stopping at the end of the
196 subject if the requested.
197
198 Arguments:
199 p points to characters
200 length number to print
201 is_subject TRUE if printing from within md->start_subject
202 md pointer to matching data block, if is_subject is TRUE
203
204 Returns: nothing
205 */
206
207 static void
208 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
209 {
210 int c;
211 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
212 while (length-- > 0)
213 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
214 }
215 #endif
216
217
218
219
220 /*************************************************
221 * Check subpattern for empty operand *
222 *************************************************/
223
224 /* This function checks a bracketed subpattern to see if any of the paths
225 through it could match an empty string. This is used to diagnose an error if
226 such a subpattern is followed by a quantifier with an unlimited upper bound.
227
228 Argument:
229 code points to the opening bracket
230
231 Returns: TRUE or FALSE
232 */
233
234 static BOOL
235 could_be_empty(uschar *code)
236 {
237 do {
238 uschar *cc = code + 3;
239
240 /* Scan along the opcodes for this branch; as soon as we find something
241 that matches a non-empty string, break out and advance to test the next
242 branch. If we get to the end of the branch, return TRUE for the whole
243 sub-expression. */
244
245 for (;;)
246 {
247 /* Test an embedded subpattern; if it could not be empty, break the
248 loop. Otherwise carry on in the branch. */
249
250 if ((int)(*cc) >= OP_BRA || (int)(*cc) == OP_ONCE)
251 {
252 if (!could_be_empty(cc)) break;
253 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
254 cc += 3;
255 }
256
257 else switch (*cc)
258 {
259 /* Reached end of a branch: the subpattern may match the empty string */
260
261 case OP_ALT:
262 case OP_KET:
263 case OP_KETRMAX:
264 case OP_KETRMIN:
265 return TRUE;
266
267 /* Skip over entire bracket groups with zero lower bound */
268
269 case OP_BRAZERO:
270 case OP_BRAMINZERO:
271 cc++;
272 /* Fall through */
273
274 /* Skip over assertive subpatterns */
275
276 case OP_ASSERT:
277 case OP_ASSERT_NOT:
278 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
279 cc += 3;
280 break;
281
282 /* Skip over things that don't match chars */
283
284 case OP_SOD:
285 case OP_EOD:
286 case OP_CIRC:
287 case OP_DOLL:
288 case OP_NOT_WORD_BOUNDARY:
289 case OP_WORD_BOUNDARY:
290 cc++;
291 break;
292
293 /* Skip over simple repeats with zero lower bound */
294
295 case OP_STAR:
296 case OP_MINSTAR:
297 case OP_QUERY:
298 case OP_MINQUERY:
299 case OP_NOTSTAR:
300 case OP_NOTMINSTAR:
301 case OP_NOTQUERY:
302 case OP_NOTMINQUERY:
303 case OP_TYPESTAR:
304 case OP_TYPEMINSTAR:
305 case OP_TYPEQUERY:
306 case OP_TYPEMINQUERY:
307 cc += 2;
308 break;
309
310 /* Skip over UPTOs (lower bound is zero) */
311
312 case OP_UPTO:
313 case OP_MINUPTO:
314 case OP_TYPEUPTO:
315 case OP_TYPEMINUPTO:
316 cc += 4;
317 break;
318
319 /* Check a class or a back reference for a zero minimum */
320
321 case OP_CLASS:
322 case OP_NEGCLASS:
323 case OP_REF:
324 cc += (*cc == OP_REF)? 2 : 33;
325
326 switch (*cc)
327 {
328 case OP_CRSTAR:
329 case OP_CRMINSTAR:
330 case OP_CRQUERY:
331 case OP_CRMINQUERY:
332 cc++;
333 break;
334
335 case OP_CRRANGE:
336 case OP_CRMINRANGE:
337 if ((cc[1] << 8) + cc[2] != 0) goto NEXT_BRANCH;
338 cc += 3;
339 break;
340
341 default:
342 goto NEXT_BRANCH;
343 }
344 break;
345
346 /* Anything else matches at least one character */
347
348 default:
349 goto NEXT_BRANCH;
350 }
351 }
352
353 NEXT_BRANCH:
354 code += (code[1] << 8) + code[2];
355 }
356 while (*code == OP_ALT);
357
358 /* No branches match the empty string */
359
360 return FALSE;
361 }
362
363
364
365 /*************************************************
366 * Handle escapes *
367 *************************************************/
368
369 /* This function is called when a \ has been encountered. It either returns a
370 positive value for a simple escape such as \n, or a negative value which
371 encodes one of the more complicated things such as \d. On entry, ptr is
372 pointing at the \. On exit, it is on the final character of the escape
373 sequence.
374
375 Arguments:
376 ptrptr points to the pattern position pointer
377 errorptr points to the pointer to the error message
378 bracount number of previous extracting brackets
379 options the options bits
380 isclass TRUE if inside a character class
381
382 Returns: zero or positive => a data character
383 negative => a special escape sequence
384 on error, errorptr is set
385 */
386
387 static int
388 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
389 int options, BOOL isclass)
390 {
391 const uschar *ptr = *ptrptr;
392 int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
393 int i;
394
395 if (c == 0) *errorptr = ERR1;
396
397 /* Digits or letters may have special meaning; all others are literals. */
398
399 else if (c < '0' || c > 'z') {}
400
401 /* Do an initial lookup in a table. A non-zero result is something that can be
402 returned immediately. Otherwise further processing may be required. */
403
404 else if ((i = escapes[c - '0']) != 0) c = i;
405
406 /* Escapes that need further processing, or are illegal. */
407
408 else
409 {
410 const uschar *oldptr;
411 switch (c)
412 {
413 /* The handling of escape sequences consisting of a string of digits
414 starting with one that is not zero is not straightforward. By experiment,
415 the way Perl works seems to be as follows:
416
417 Outside a character class, the digits are read as a decimal number. If the
418 number is less than 10, or if there are that many previous extracting
419 left brackets, then it is a back reference. Otherwise, up to three octal
420 digits are read to form an escaped byte. Thus \123 is likely to be octal
421 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
422 value is greater than 377, the least significant 8 bits are taken. Inside a
423 character class, \ followed by a digit is always an octal number. */
424
425 case '1': case '2': case '3': case '4': case '5':
426 case '6': case '7': case '8': case '9':
427
428 if (!isclass)
429 {
430 oldptr = ptr;
431 c -= '0';
432 while ((pcre_ctypes[ptr[1]] & ctype_digit) != 0)
433 c = c * 10 + *(++ptr) - '0';
434 if (c < 10 || c <= bracount)
435 {
436 c = -(ESC_REF + c);
437 break;
438 }
439 ptr = oldptr; /* Put the pointer back and fall through */
440 }
441
442 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
443 generates a binary zero byte and treats the digit as a following literal.
444 Thus we have to pull back the pointer by one. */
445
446 if ((c = *ptr) >= '8')
447 {
448 ptr--;
449 c = 0;
450 break;
451 }
452
453 /* \0 always starts an octal number, but we may drop through to here with a
454 larger first octal digit */
455
456 case '0':
457 c -= '0';
458 while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 &&
459 ptr[1] != '8' && ptr[1] != '9')
460 c = c * 8 + *(++ptr) - '0';
461 break;
462
463 /* Special escapes not starting with a digit are straightforward */
464
465 case 'x':
466 c = 0;
467 while (i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0)
468 {
469 ptr++;
470 c = c * 16 + pcre_lcc[*ptr] -
471 (((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
472 }
473 break;
474
475 case 'c':
476 c = *(++ptr);
477 if (c == 0)
478 {
479 *errorptr = ERR2;
480 return 0;
481 }
482
483 /* A letter is upper-cased; then the 0x40 bit is flipped */
484
485 if (c >= 'a' && c <= 'z') c = pcre_fcc[c];
486 c ^= 0x40;
487 break;
488
489 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
490 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
491 for Perl compatibility, it is a literal. */
492
493 default:
494 if ((options & PCRE_EXTRA) != 0) switch(c)
495 {
496 case 'X':
497 c = -ESC_X; /* This could be a lookup if it ever got into Perl */
498 break;
499
500 default:
501 *errorptr = ERR3;
502 break;
503 }
504 break;
505 }
506 }
507
508 *ptrptr = ptr;
509 return c;
510 }
511
512
513
514 /*************************************************
515 * Check for counted repeat *
516 *************************************************/
517
518 /* This function is called when a '{' is encountered in a place where it might
519 start a quantifier. It looks ahead to see if it really is a quantifier or not.
520 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
521 where the ddds are digits.
522
523 Arguments:
524 p pointer to the first char after '{'
525
526 Returns: TRUE or FALSE
527 */
528
529 static BOOL
530 is_counted_repeat(const uschar *p)
531 {
532 if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
533 while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
534 if (*p == '}') return TRUE;
535
536 if (*p++ != ',') return FALSE;
537 if (*p == '}') return TRUE;
538
539 if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
540 while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
541 return (*p == '}');
542 }
543
544
545
546 /*************************************************
547 * Read repeat counts *
548 *************************************************/
549
550 /* Read an item of the form {n,m} and return the values. This is called only
551 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
552 so the syntax is guaranteed to be correct, but we need to check the values.
553
554 Arguments:
555 p pointer to first char after '{'
556 minp pointer to int for min
557 maxp pointer to int for max
558 returned as -1 if no max
559 errorptr points to pointer to error message
560
561 Returns: pointer to '}' on success;
562 current ptr on error, with errorptr set
563 */
564
565 static const uschar *
566 read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
567 {
568 int min = 0;
569 int max = -1;
570
571 while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
572
573 if (*p == '}') max = min; else
574 {
575 if (*(++p) != '}')
576 {
577 max = 0;
578 while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
579 if (max < min)
580 {
581 *errorptr = ERR4;
582 return p;
583 }
584 }
585 }
586
587 /* Do paranoid checks, then fill in the required variables, and pass back the
588 pointer to the terminating '}'. */
589
590 if (min > 65535 || max > 65535)
591 *errorptr = ERR5;
592 else
593 {
594 *minp = min;
595 *maxp = max;
596 }
597 return p;
598 }
599
600
601
602 /*************************************************
603 * Compile one branch *
604 *************************************************/
605
606 /* Scan the pattern, compiling it into the code vector.
607
608 Arguments:
609 options the option bits
610 bracket points to number of brackets used
611 code points to the pointer to the current code point
612 ptrptr points to the current pattern pointer
613 errorptr points to pointer to error message
614
615 Returns: TRUE on success
616 FALSE, with *errorptr set on error
617 */
618
619 static BOOL
620 compile_branch(int options, int *brackets, uschar **codeptr,
621 const uschar **ptrptr, const char **errorptr)
622 {
623 int repeat_type, op_type;
624 int repeat_min, repeat_max;
625 int bravalue, length;
626 int greedy_default, greedy_non_default;
627 register int c;
628 register uschar *code = *codeptr;
629 const uschar *ptr = *ptrptr;
630 const uschar *oldptr;
631 uschar *previous = NULL;
632 uschar class[32];
633
634 /* Set up the default and non-default settings for greediness */
635
636 greedy_default = ((options & PCRE_UNGREEDY) != 0);
637 greedy_non_default = greedy_default ^ 1;
638
639 /* Switch on next character until the end of the branch */
640
641 for (;; ptr++)
642 {
643 BOOL negate_class;
644 int class_charcount;
645 int class_lastchar;
646
647 c = *ptr;
648 if ((options & PCRE_EXTENDED) != 0)
649 {
650 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
651 if (c == '#')
652 {
653 while ((c = *(++ptr)) != 0 && c != '\n');
654 continue;
655 }
656 }
657
658 switch(c)
659 {
660 /* The branch terminates at end of string, |, or ). */
661
662 case 0:
663 case '|':
664 case ')':
665 *codeptr = code;
666 *ptrptr = ptr;
667 return TRUE;
668
669 /* Handle single-character metacharacters */
670
671 case '^':
672 previous = NULL;
673 *code++ = OP_CIRC;
674 break;
675
676 case '$':
677 previous = NULL;
678 *code++ = OP_DOLL;
679 break;
680
681 case '.':
682 previous = code;
683 *code++ = OP_ANY;
684 break;
685
686 /* Character classes. These always build a 32-byte bitmap of the permitted
687 characters, except in the special case where there is only one character.
688 For negated classes, we build the map as usual, then invert it at the end.
689 */
690
691 case '[':
692 previous = code;
693
694 /* If the first character is '^', set the negation flag, and use a
695 different opcode. This only matters if caseless matching is specified at
696 runtime. */
697
698 if ((c = *(++ptr)) == '^')
699 {
700 negate_class = TRUE;
701 *code++ = OP_NEGCLASS;
702 c = *(++ptr);
703 }
704 else
705 {
706 negate_class = FALSE;
707 *code++ = OP_CLASS;
708 }
709
710 /* Keep a count of chars so that we can optimize the case of just a single
711 character. */
712
713 class_charcount = 0;
714 class_lastchar = -1;
715
716 /* Initialize the 32-char bit map to all zeros. We have to build the
717 map in a temporary bit of store, in case the class contains only 1
718 character, because in that case the compiled code doesn't use the
719 bit map. */
720
721 memset(class, 0, 32 * sizeof(uschar));
722
723 /* Process characters until ] is reached. By writing this as a "do" it
724 means that an initial ] is taken as a data character. */
725
726 do
727 {
728 if (c == 0)
729 {
730 *errorptr = ERR6;
731 goto FAILED;
732 }
733
734 /* Backslash may introduce a single character, or it may introduce one
735 of the specials, which just set a flag. Escaped items are checked for
736 validity in the pre-compiling pass. The sequence \b is a special case.
737 Inside a class (and only there) it is treated as backspace. Elsewhere
738 it marks a word boundary. Other escapes have preset maps ready to
739 or into the one we are building. We assume they have more than one
740 character in them, so set class_count bigger than one. */
741
742 if (c == '\\')
743 {
744 c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
745 if (-c == ESC_b) c = '\b';
746 else if (c < 0)
747 {
748 class_charcount = 10;
749 switch (-c)
750 {
751 case ESC_d:
752 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit];
753 continue;
754
755 case ESC_D:
756 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit];
757 continue;
758
759 case ESC_w:
760 for (c = 0; c < 32; c++)
761 class[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
762 continue;
763
764 case ESC_W:
765 for (c = 0; c < 32; c++)
766 class[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
767 continue;
768
769 case ESC_s:
770 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space];
771 continue;
772
773 case ESC_S:
774 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space];
775 continue;
776
777 default:
778 *errorptr = ERR7;
779 goto FAILED;
780 }
781 }
782 /* Fall through if single character */
783 }
784
785 /* A single character may be followed by '-' to form a range. However,
786 Perl does not permit ']' to be the end of the range. A '-' character
787 here is treated as a literal. */
788
789 if (ptr[1] == '-' && ptr[2] != ']')
790 {
791 int d;
792 ptr += 2;
793 d = *ptr;
794
795 if (d == 0)
796 {
797 *errorptr = ERR6;
798 goto FAILED;
799 }
800
801 /* The second part of a range can be a single-character escape, but
802 not any of the other escapes. */
803
804 if (d == '\\')
805 {
806 d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
807 if (d < 0)
808 {
809 if (d == -ESC_b) d = '\b'; else
810 {
811 *errorptr = ERR7;
812 goto FAILED;
813 }
814 }
815 }
816
817 if (d < c)
818 {
819 *errorptr = ERR8;
820 goto FAILED;
821 }
822
823 for (; c <= d; c++)
824 {
825 class[c/8] |= (1 << (c&7));
826 if ((options & PCRE_CASELESS) != 0)
827 {
828 int uc = pcre_fcc[c]; /* flip case */
829 class[uc/8] |= (1 << (uc&7));
830 }
831 class_charcount++; /* in case a one-char range */
832 class_lastchar = c;
833 }
834 continue; /* Go get the next char in the class */
835 }
836
837 /* Handle a lone single character - we can get here for a normal
838 non-escape char, or after \ that introduces a single character. */
839
840 class [c/8] |= (1 << (c&7));
841 if ((options & PCRE_CASELESS) != 0)
842 {
843 c = pcre_fcc[c]; /* flip case */
844 class[c/8] |= (1 << (c&7));
845 }
846 class_charcount++;
847 class_lastchar = c;
848 }
849
850 /* Loop until ']' reached; the check for end of string happens inside the
851 loop. This "while" is the end of the "do" above. */
852
853 while ((c = *(++ptr)) != ']');
854
855 /* If class_charcount is 1 and class_lastchar is not negative, we saw
856 precisely one character. This doesn't need the whole 32-byte bit map.
857 We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
858 it's negative. */
859
860 if (class_charcount == 1 && class_lastchar >= 0)
861 {
862 if (negate_class)
863 {
864 code[-1] = OP_NOT;
865 }
866 else
867 {
868 code[-1] = OP_CHARS;
869 *code++ = 1;
870 }
871 *code++ = class_lastchar;
872 }
873
874 /* Otherwise, negate the 32-byte map if necessary, and copy it into
875 the code vector. */
876
877 else
878 {
879 if (negate_class)
880 for (c = 0; c < 32; c++) code[c] = ~class[c];
881 else
882 memcpy(code, class, 32);
883 code += 32;
884 }
885 break;
886
887 /* Various kinds of repeat */
888
889 case '{':
890 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
891 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
892 if (*errorptr != NULL) goto FAILED;
893 goto REPEAT;
894
895 case '*':
896 repeat_min = 0;
897 repeat_max = -1;
898 goto REPEAT;
899
900 case '+':
901 repeat_min = 1;
902 repeat_max = -1;
903 goto REPEAT;
904
905 case '?':
906 repeat_min = 0;
907 repeat_max = 1;
908
909 REPEAT:
910 if (previous == NULL)
911 {
912 *errorptr = ERR9;
913 goto FAILED;
914 }
915
916 /* If the next character is '?' this is a minimizing repeat, by default,
917 but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
918 next character. */
919
920 if (ptr[1] == '?')
921 { repeat_type = greedy_non_default; ptr++; }
922 else repeat_type = greedy_default;
923
924 /* If the maximum is zero then the minimum must also be zero; Perl allows
925 this case, so we do too - by simply omitting the item altogether. */
926
927 if (repeat_max == 0) code = previous;
928
929 /* If previous was a string of characters, chop off the last one and use it
930 as the subject of the repeat. If there was only one character, we can
931 abolish the previous item altogether. */
932
933 else if (*previous == OP_CHARS)
934 {
935 int len = previous[1];
936 if (len == 1)
937 {
938 c = previous[2];
939 code = previous;
940 }
941 else
942 {
943 c = previous[len+1];
944 previous[1]--;
945 code--;
946 }
947 op_type = 0; /* Use single-char op codes */
948 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
949 }
950
951 /* If previous was a single negated character ([^a] or similar), we use
952 one of the special opcodes, replacing it. The code is shared with single-
953 character repeats by adding a suitable offset into repeat_type. */
954
955 else if ((int)*previous == OP_NOT)
956 {
957 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
958 c = previous[1];
959 code = previous;
960 goto OUTPUT_SINGLE_REPEAT;
961 }
962
963 /* If previous was a character type match (\d or similar), abolish it and
964 create a suitable repeat item. The code is shared with single-character
965 repeats by adding a suitable offset into repeat_type. */
966
967 else if ((int)*previous < OP_EOD || *previous == OP_ANY)
968 {
969 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
970 c = *previous;
971 code = previous;
972
973 OUTPUT_SINGLE_REPEAT:
974 repeat_type += op_type; /* Combine both values for many cases */
975
976 /* A minimum of zero is handled either as the special case * or ?, or as
977 an UPTO, with the maximum given. */
978
979 if (repeat_min == 0)
980 {
981 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
982 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
983 else
984 {
985 *code++ = OP_UPTO + repeat_type;
986 *code++ = repeat_max >> 8;
987 *code++ = (repeat_max & 255);
988 }
989 }
990
991 /* The case {1,} is handled as the special case + */
992
993 else if (repeat_min == 1 && repeat_max == -1)
994 *code++ = OP_PLUS + repeat_type;
995
996 /* The case {n,n} is just an EXACT, while the general case {n,m} is
997 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
998
999 else
1000 {
1001 if (repeat_min != 1)
1002 {
1003 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1004 *code++ = repeat_min >> 8;
1005 *code++ = (repeat_min & 255);
1006 }
1007
1008 /* If the mininum is 1 and the previous item was a character string,
1009 we either have to put back the item that got cancelled if the string
1010 length was 1, or add the character back onto the end of a longer
1011 string. For a character type nothing need be done; it will just get put
1012 back naturally. */
1013
1014 else if (*previous == OP_CHARS)
1015 {
1016 if (code == previous) code += 2; else previous[1]++;
1017 }
1018
1019 /* If the maximum is unlimited, insert an OP_STAR. */
1020
1021 if (repeat_max < 0)
1022 {
1023 *code++ = c;
1024 *code++ = OP_STAR + repeat_type;
1025 }
1026
1027 /* Else insert an UPTO if the max is greater than the min. */
1028
1029 else if (repeat_max != repeat_min)
1030 {
1031 *code++ = c;
1032 repeat_max -= repeat_min;
1033 *code++ = OP_UPTO + repeat_type;
1034 *code++ = repeat_max >> 8;
1035 *code++ = (repeat_max & 255);
1036 }
1037 }
1038
1039 /* The character or character type itself comes last in all cases. */
1040
1041 *code++ = c;
1042 }
1043
1044 /* If previous was a character class or a back reference, we put the repeat
1045 stuff after it. */
1046
1047 else if (*previous == OP_CLASS || *previous == OP_NEGCLASS ||
1048 *previous == OP_REF)
1049 {
1050 if (repeat_min == 0 && repeat_max == -1)
1051 *code++ = OP_CRSTAR + repeat_type;
1052 else if (repeat_min == 1 && repeat_max == -1)
1053 *code++ = OP_CRPLUS + repeat_type;
1054 else if (repeat_min == 0 && repeat_max == 1)
1055 *code++ = OP_CRQUERY + repeat_type;
1056 else
1057 {
1058 *code++ = OP_CRRANGE + repeat_type;
1059 *code++ = repeat_min >> 8;
1060 *code++ = repeat_min & 255;
1061 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1062 *code++ = repeat_max >> 8;
1063 *code++ = repeat_max & 255;
1064 }
1065 }
1066
1067 /* If previous was a bracket group, we may have to replicate it in certain
1068 cases. If the maximum repeat count is unlimited, check that the bracket
1069 group cannot match the empty string, and diagnose an error if it can. */
1070
1071 else if ((int)*previous >= OP_BRA)
1072 {
1073 int i;
1074 int len = code - previous;
1075
1076 if (repeat_max == -1 && could_be_empty(previous))
1077 {
1078 *errorptr = ERR10;
1079 goto FAILED;
1080 }
1081
1082 /* If the minimum is greater than zero, and the maximum is unlimited or
1083 equal to the minimum, the first copy remains where it is, and is
1084 replicated up to the minimum number of times. This case includes the +
1085 repeat, but of course no replication is needed in that case. */
1086
1087 if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1088 {
1089 for (i = 1; i < repeat_min; i++)
1090 {
1091 memcpy(code, previous, len);
1092 code += len;
1093 }
1094 }
1095
1096 /* If the minimum is zero, stick BRAZERO in front of the first copy.
1097 Then, if there is a fixed upper limit, replicated up to that many times,
1098 sticking BRAZERO in front of all the optional ones. */
1099
1100 else
1101 {
1102 if (repeat_min == 0)
1103 {
1104 memmove(previous+1, previous, len);
1105 code++;
1106 *previous++ = OP_BRAZERO + repeat_type;
1107 }
1108
1109 for (i = 1; i < repeat_min; i++)
1110 {
1111 memcpy(code, previous, len);
1112 code += len;
1113 }
1114
1115 for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1116 {
1117 *code++ = OP_BRAZERO + repeat_type;
1118 memcpy(code, previous, len);
1119 code += len;
1120 }
1121 }
1122
1123 /* If the maximum is unlimited, set a repeater in the final copy. */
1124
1125 if (repeat_max == -1) code[-3] = OP_KETRMAX + repeat_type;
1126 }
1127
1128 /* Else there's some kind of shambles */
1129
1130 else
1131 {
1132 *errorptr = ERR11;
1133 goto FAILED;
1134 }
1135
1136 /* In all case we no longer have a previous item. */
1137
1138 previous = NULL;
1139 break;
1140
1141
1142 /* Start of nested bracket sub-expression, or comment or lookahead.
1143 First deal with special things that can come after a bracket; all are
1144 introduced by ?, and the appearance of any of them means that this is not a
1145 referencing group. They were checked for validity in the first pass over
1146 the string, so we don't have to check for syntax errors here. */
1147
1148 case '(':
1149 previous = code; /* Only real brackets can be repeated */
1150 if (*(++ptr) == '?')
1151 {
1152 bravalue = OP_BRA;
1153
1154 switch (*(++ptr))
1155 {
1156 case '#':
1157 case 'i':
1158 case 'm':
1159 case 's':
1160 case 'x':
1161 case 'U':
1162 case 'X':
1163 ptr++;
1164 while (*ptr != ')') ptr++;
1165 previous = NULL;
1166 continue;
1167
1168 case ':': /* Non-extracting bracket */
1169 ptr++;
1170 break;
1171
1172 case '=': /* Assertions can't be repeated */
1173 bravalue = OP_ASSERT;
1174 ptr++;
1175 previous = NULL;
1176 break;
1177
1178 case '!':
1179 bravalue = OP_ASSERT_NOT;
1180 ptr++;
1181 previous = NULL;
1182 break;
1183
1184 case '>': /* "Match once" brackets */
1185 if ((options & PCRE_EXTRA) != 0) /* Not yet standard */
1186 {
1187 bravalue = OP_ONCE;
1188 ptr++;
1189 previous = NULL;
1190 break;
1191 }
1192 /* Else fall through */
1193
1194 default:
1195 *errorptr = ERR12;
1196 goto FAILED;
1197 }
1198 }
1199
1200 /* Else we have a referencing group */
1201
1202 else
1203 {
1204 if (++(*brackets) > EXTRACT_MAX)
1205 {
1206 *errorptr = ERR13;
1207 goto FAILED;
1208 }
1209 bravalue = OP_BRA + *brackets;
1210 }
1211
1212 /* Process nested bracketed re; at end pointer is on the bracket. We copy
1213 code into a non-register variable in order to be able to pass its address
1214 because some compilers complain otherwise. */
1215
1216 *code = bravalue;
1217 {
1218 uschar *mcode = code;
1219 if (!compile_regex(options, brackets, &mcode, &ptr, errorptr))
1220 goto FAILED;
1221 code = mcode;
1222 }
1223
1224 if (*ptr != ')')
1225 {
1226 *errorptr = ERR14;
1227 goto FAILED;
1228 }
1229 break;
1230
1231 /* Check \ for being a real metacharacter; if not, fall through and handle
1232 it as a data character at the start of a string. Escape items are checked
1233 for validity in the pre-compiling pass. */
1234
1235 case '\\':
1236 oldptr = ptr;
1237 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
1238
1239 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1240 are arranged to be the negation of the corresponding OP_values. For the
1241 back references, the values are ESC_REF plus the reference number. Only
1242 back references and those types that consume a character may be repeated.
1243 We can test for values between ESC_b and ESC_Z for the latter; this may
1244 have to change if any new ones are ever created. */
1245
1246 if (c < 0)
1247 {
1248 if (-c >= ESC_REF)
1249 {
1250 int refnum = -c - ESC_REF;
1251 if (*brackets < refnum)
1252 {
1253 *errorptr = ERR15;
1254 goto FAILED;
1255 }
1256 previous = code;
1257 *code++ = OP_REF;
1258 *code++ = refnum;
1259 }
1260 else
1261 {
1262 previous = (-c > ESC_b && -c < ESC_X)? code : NULL;
1263 *code++ = -c;
1264 }
1265 continue;
1266 }
1267
1268 /* Data character: reset and fall through */
1269
1270 ptr = oldptr;
1271 c = '\\';
1272
1273 /* Handle a run of data characters until a metacharacter is encountered.
1274 The first character is guaranteed not to be whitespace or # when the
1275 extended flag is set. */
1276
1277 NORMAL_CHAR:
1278 default:
1279 previous = code;
1280 *code = OP_CHARS;
1281 code += 2;
1282 length = 0;
1283
1284 do
1285 {
1286 if ((options & PCRE_EXTENDED) != 0)
1287 {
1288 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1289 if (c == '#')
1290 {
1291 while ((c = *(++ptr)) != 0 && c != '\n');
1292 if (c == 0) break;
1293 continue;
1294 }
1295 }
1296
1297 /* Backslash may introduce a data char or a metacharacter. Escaped items
1298 are checked for validity in the pre-compiling pass. Stop the string
1299 before a metaitem. */
1300
1301 if (c == '\\')
1302 {
1303 oldptr = ptr;
1304 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
1305 if (c < 0) { ptr = oldptr; break; }
1306 }
1307
1308 /* Ordinary character or single-char escape */
1309
1310 *code++ = c;
1311 length++;
1312 }
1313
1314 /* This "while" is the end of the "do" above. */
1315
1316 while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
1317
1318 /* Compute the length and set it in the data vector, and advance to
1319 the next state. */
1320
1321 previous[1] = length;
1322 if (length < 255) ptr--;
1323 break;
1324 }
1325 } /* end of big loop */
1326
1327 /* Control never reaches here by falling through, only by a goto for all the
1328 error states. Pass back the position in the pattern so that it can be displayed
1329 to the user for diagnosing the error. */
1330
1331 FAILED:
1332 *ptrptr = ptr;
1333 return FALSE;
1334 }
1335
1336
1337
1338
1339 /*************************************************
1340 * Compile sequence of alternatives *
1341 *************************************************/
1342
1343 /* On entry, ptr is pointing past the bracket character, but on return
1344 it points to the closing bracket, or vertical bar, or end of string.
1345 The code variable is pointing at the byte into which the BRA operator has been
1346 stored.
1347
1348 Argument:
1349 options the option bits
1350 brackets -> int containing the number of extracting brackets used
1351 codeptr -> the address of the current code pointer
1352 ptrptr -> the address of the current pattern pointer
1353 errorptr -> pointer to error message
1354
1355 Returns: TRUE on success
1356 */
1357
1358 static BOOL
1359 compile_regex(int options, int *brackets, uschar **codeptr,
1360 const uschar **ptrptr, const char **errorptr)
1361 {
1362 const uschar *ptr = *ptrptr;
1363 uschar *code = *codeptr;
1364 uschar *start_bracket = code;
1365
1366 for (;;)
1367 {
1368 int length;
1369 uschar *last_branch = code;
1370
1371 code += 3;
1372 if (!compile_branch(options, brackets, &code, &ptr, errorptr))
1373 {
1374 *ptrptr = ptr;
1375 return FALSE;
1376 }
1377
1378 /* Fill in the length of the last branch */
1379
1380 length = code - last_branch;
1381 last_branch[1] = length >> 8;
1382 last_branch[2] = length & 255;
1383
1384 /* Reached end of expression, either ')' or end of pattern. Insert a
1385 terminating ket and the length of the whole bracketed item, and return,
1386 leaving the pointer at the terminating char. */
1387
1388 if (*ptr != '|')
1389 {
1390 length = code - start_bracket;
1391 *code++ = OP_KET;
1392 *code++ = length >> 8;
1393 *code++ = length & 255;
1394 *codeptr = code;
1395 *ptrptr = ptr;
1396 return TRUE;
1397 }
1398
1399 /* Another branch follows; insert an "or" node and advance the pointer. */
1400
1401 *code = OP_ALT;
1402 ptr++;
1403 }
1404 /* Control never reaches here */
1405 }
1406
1407
1408
1409 /*************************************************
1410 * Check for anchored expression *
1411 *************************************************/
1412
1413 /* Try to find out if this is an anchored regular expression. Consider each
1414 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
1415 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
1416 it's anchored. However, if this is a multiline pattern, then only OP_SOD
1417 counts, since OP_CIRC can match in the middle.
1418
1419 A branch is also implicitly anchored if it starts with .* because that will try
1420 the rest of the pattern at all possible matching points, so there is no point
1421 trying them again.
1422
1423 Argument: points to start of expression (the bracket)
1424 Returns: TRUE or FALSE
1425 */
1426
1427 static BOOL
1428 is_anchored(register const uschar *code, BOOL multiline)
1429 {
1430 do {
1431 int op = (int)code[3];
1432 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE)
1433 { if (!is_anchored(code+3, multiline)) return FALSE; }
1434 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
1435 { if (code[4] != OP_ANY) return FALSE; }
1436 else if (op != OP_SOD && (multiline || op != OP_CIRC)) return FALSE;
1437 code += (code[1] << 8) + code[2];
1438 }
1439 while (*code == OP_ALT);
1440 return TRUE;
1441 }
1442
1443
1444
1445 /*************************************************
1446 * Check for start with \n line expression *
1447 *************************************************/
1448
1449 /* This is called for multiline expressions to try to find out if every branch
1450 starts with ^ so that "first char" processing can be done to speed things up.
1451
1452 Argument: points to start of expression (the bracket)
1453 Returns: TRUE or FALSE
1454 */
1455
1456 static BOOL
1457 is_startline(const uschar *code)
1458 {
1459 do {
1460 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
1461 { if (!is_startline(code+3)) return FALSE; }
1462 else if (code[3] != OP_CIRC) return FALSE;
1463 code += (code[1] << 8) + code[2];
1464 }
1465 while (*code == OP_ALT);
1466 return TRUE;
1467 }
1468
1469
1470
1471 /*************************************************
1472 * Check for fixed first char *
1473 *************************************************/
1474
1475 /* Try to find out if there is a fixed first character. This is called for
1476 unanchored expressions, as it speeds up their processing quite considerably.
1477 Consider each alternative branch. If they all start with the same char, or with
1478 a bracket all of whose alternatives start with the same char (recurse ad lib),
1479 then we return that char, otherwise -1.
1480
1481 Argument: points to start of expression (the bracket)
1482 Returns: -1 or the fixed first char
1483 */
1484
1485 static int
1486 find_firstchar(uschar *code)
1487 {
1488 register int c = -1;
1489 do
1490 {
1491 register int charoffset = 4;
1492
1493 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
1494 {
1495 register int d;
1496 if ((d = find_firstchar(code+3)) < 0) return -1;
1497 if (c < 0) c = d; else if (c != d) return -1;
1498 }
1499
1500 else switch(code[3])
1501 {
1502 default:
1503 return -1;
1504
1505 case OP_EXACT: /* Fall through */
1506 charoffset++;
1507
1508 case OP_CHARS: /* Fall through */
1509 charoffset++;
1510
1511 case OP_PLUS:
1512 case OP_MINPLUS:
1513 if (c < 0) c = code[charoffset]; else if (c != code[charoffset]) return -1;
1514 break;
1515 }
1516 code += (code[1] << 8) + code[2];
1517 }
1518 while (*code == OP_ALT);
1519 return c;
1520 }
1521
1522
1523
1524 /*************************************************
1525 * Compile a Regular Expression *
1526 *************************************************/
1527
1528 /* This function takes a string and returns a pointer to a block of store
1529 holding a compiled version of the expression.
1530
1531 Arguments:
1532 pattern the regular expression
1533 options various option bits
1534 errorptr pointer to pointer to error text
1535 erroroffset ptr offset in pattern where error was detected
1536
1537 Returns: pointer to compiled data block, or NULL on error,
1538 with errorptr and erroroffset set
1539 */
1540
1541 pcre *
1542 pcre_compile(const char *pattern, int options, const char **errorptr,
1543 int *erroroffset)
1544 {
1545 real_pcre *re;
1546 int spaces = 0;
1547 int length = 3; /* For initial BRA plus length */
1548 int runlength;
1549 int c, size;
1550 int bracount = 0;
1551 int brastack[200];
1552 int top_backref = 0;
1553 unsigned int brastackptr = 0;
1554 uschar *code;
1555 const uschar *ptr;
1556
1557 #ifdef DEBUG
1558 uschar *code_base, *code_end;
1559 #endif
1560
1561 /* We can't pass back an error message if errorptr is NULL; I guess the best we
1562 can do is just return NULL. */
1563
1564 if (errorptr == NULL) return NULL;
1565 *errorptr = NULL;
1566
1567 /* However, we can give a message for this error */
1568
1569 if (erroroffset == NULL)
1570 {
1571 *errorptr = ERR16;
1572 return NULL;
1573 }
1574 *erroroffset = 0;
1575
1576 if ((options & ~PUBLIC_OPTIONS) != 0)
1577 {
1578 *errorptr = ERR17;
1579 return NULL;
1580 }
1581
1582 DPRINTF(("------------------------------------------------------------------\n"));
1583 DPRINTF(("%s\n", pattern));
1584
1585 /* The first thing to do is to make a pass over the pattern to compute the
1586 amount of store required to hold the compiled code. This does not have to be
1587 perfect as long as errors are overestimates. At the same time we can detect any
1588 internal flag settings. Make an attempt to correct for any counted white space
1589 if an "extended" flag setting appears late in the pattern. We can't be so
1590 clever for #-comments. */
1591
1592 ptr = (const uschar *)(pattern - 1);
1593 while ((c = *(++ptr)) != 0)
1594 {
1595 int min, max;
1596 int class_charcount;
1597
1598 if ((pcre_ctypes[c] & ctype_space) != 0)
1599 {
1600 if ((options & PCRE_EXTENDED) != 0) continue;
1601 spaces++;
1602 }
1603
1604 if (c == '#' && (options & PCRE_EXTENDED) != 0)
1605 {
1606 while ((c = *(++ptr)) != 0 && c != '\n');
1607 continue;
1608 }
1609
1610 switch(c)
1611 {
1612 /* A backslashed item may be an escaped "normal" character or a
1613 character type. For a "normal" character, put the pointers and
1614 character back so that tests for whitespace etc. in the input
1615 are done correctly. */
1616
1617 case '\\':
1618 {
1619 const uschar *save_ptr = ptr;
1620 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
1621 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1622 if (c >= 0)
1623 {
1624 ptr = save_ptr;
1625 c = '\\';
1626 goto NORMAL_CHAR;
1627 }
1628 }
1629 length++;
1630
1631 /* A back reference needs an additional char, plus either one or 5
1632 bytes for a repeat. We also need to keep the value of the highest
1633 back reference. */
1634
1635 if (c <= -ESC_REF)
1636 {
1637 int refnum = -c - ESC_REF;
1638 if (refnum > top_backref) top_backref = refnum;
1639 length++; /* For single back reference */
1640 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
1641 {
1642 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
1643 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1644 if ((min == 0 && (max == 1 || max == -1)) ||
1645 (min == 1 && max == -1))
1646 length++;
1647 else length += 5;
1648 if (ptr[1] == '?') ptr++;
1649 }
1650 }
1651 continue;
1652
1653 case '^':
1654 case '.':
1655 case '$':
1656 case '*': /* These repeats won't be after brackets; */
1657 case '+': /* those are handled separately */
1658 case '?':
1659 length++;
1660 continue;
1661
1662 /* This covers the cases of repeats after a single char, metachar, class,
1663 or back reference. */
1664
1665 case '{':
1666 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
1667 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
1668 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1669 if ((min == 0 && (max == 1 || max == -1)) ||
1670 (min == 1 && max == -1))
1671 length++;
1672 else
1673 {
1674 length--; /* Uncount the original char or metachar */
1675 if (min == 1) length++; else if (min > 0) length += 4;
1676 if (max > 0) length += 4; else length += 2;
1677 }
1678 if (ptr[1] == '?') ptr++;
1679 continue;
1680
1681 /* An alternation contains an offset to the next branch or ket. */
1682 case '|':
1683 length += 3;
1684 continue;
1685
1686 /* A character class uses 33 characters. Don't worry about character types
1687 that aren't allowed in classes - they'll get picked up during the compile.
1688 A character class that contains only one character uses 2 or 3 bytes,
1689 depending on whether it is negated or not. Notice this where we can. */
1690
1691 case '[':
1692 class_charcount = 0;
1693 if (*(++ptr) == '^') ptr++;
1694 do
1695 {
1696 if (*ptr == '\\')
1697 {
1698 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE);
1699 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1700 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
1701 }
1702 else class_charcount++;
1703 ptr++;
1704 }
1705 while (*ptr != 0 && *ptr != ']');
1706
1707 /* Repeats for negated single chars are handled by the general code */
1708
1709 if (class_charcount == 1) length += 3; else
1710 {
1711 length += 33;
1712
1713 /* A repeat needs either 1 or 5 bytes. */
1714
1715 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
1716 {
1717 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
1718 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1719 if ((min == 0 && (max == 1 || max == -1)) ||
1720 (min == 1 && max == -1))
1721 length++;
1722 else length += 5;
1723 if (ptr[1] == '?') ptr++;
1724 }
1725 }
1726 continue;
1727
1728 /* Brackets may be genuine groups or special things */
1729
1730 case '(':
1731
1732 /* Handle special forms of bracket, which all start (? */
1733
1734 if (ptr[1] == '?') switch (c = ptr[2])
1735 {
1736 /* Skip over comments entirely */
1737 case '#':
1738 ptr += 3;
1739 while (*ptr != 0 && *ptr != ')') ptr++;
1740 if (*ptr == 0)
1741 {
1742 *errorptr = ERR18;
1743 goto PCRE_ERROR_RETURN;
1744 }
1745 continue;
1746
1747 /* Non-referencing groups and lookaheads just move the pointer on, and
1748 then behave like a non-special bracket, except that they don't increment
1749 the count of extracting brackets. */
1750
1751 case ':':
1752 case '=':
1753 case '!':
1754 ptr += 2;
1755 break;
1756
1757 /* Ditto for the "once only" bracket, allowed only if the extra bit
1758 is set. */
1759
1760 case '>':
1761 if ((options & PCRE_EXTRA) != 0)
1762 {
1763 ptr += 2;
1764 break;
1765 }
1766 /* Else fall through */
1767
1768 /* Else loop setting valid options until ) is met. Anything else is an
1769 error. */
1770
1771 default:
1772 ptr += 2;
1773 for (;; ptr++)
1774 {
1775 if ((c = *ptr) == 'i')
1776 {
1777 options |= PCRE_CASELESS;
1778 continue;
1779 }
1780 else if ((c = *ptr) == 'm')
1781 {
1782 options |= PCRE_MULTILINE;
1783 continue;
1784 }
1785 else if (c == 's')
1786 {
1787 options |= PCRE_DOTALL;
1788 continue;
1789 }
1790 else if (c == 'x')
1791 {
1792 options |= PCRE_EXTENDED;
1793 length -= spaces; /* Already counted spaces */
1794 continue;
1795 }
1796 else if (c == 'X')
1797 {
1798 options |= PCRE_EXTRA;
1799 continue;
1800 }
1801 else if (c == 'U')
1802 {
1803 options |= PCRE_UNGREEDY;
1804 continue;
1805 }
1806 else if (c == ')') break;
1807
1808 *errorptr = ERR12;
1809 goto PCRE_ERROR_RETURN;
1810 }
1811 continue; /* End of this bracket handling */
1812 }
1813
1814 /* Extracting brackets must be counted so we can process escapes in a
1815 Perlish way. */
1816
1817 else bracount++;
1818
1819 /* Non-special forms of bracket. Save length for computing whole length
1820 at end if there's a repeat that requires duplication of the group. */
1821
1822 if (brastackptr >= sizeof(brastack)/sizeof(int))
1823 {
1824 *errorptr = ERR19;
1825 goto PCRE_ERROR_RETURN;
1826 }
1827
1828 brastack[brastackptr++] = length;
1829 length += 3;
1830 continue;
1831
1832 /* Handle ket. Look for subsequent max/min; for certain sets of values we
1833 have to replicate this bracket up to that many times. If brastackptr is
1834 0 this is an unmatched bracket which will generate an error, but take care
1835 not to try to access brastack[-1]. */
1836
1837 case ')':
1838 length += 3;
1839 {
1840 int minval = 1;
1841 int maxval = 1;
1842 int duplength = (brastackptr > 0)? length - brastack[--brastackptr] : 0;
1843
1844 /* Leave ptr at the final char; for read_repeat_counts this happens
1845 automatically; for the others we need an increment. */
1846
1847 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
1848 {
1849 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr);
1850 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1851 }
1852 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
1853 else if (c == '+') { maxval = -1; ptr++; }
1854 else if (c == '?') { minval = 0; ptr++; }
1855
1856 /* If there is a minimum > 1 we have to replicate up to minval-1 times;
1857 if there is a limited maximum we have to replicate up to maxval-1 times
1858 and allow for a BRAZERO item before each optional copy, as we also have
1859 to do before the first copy if the minimum is zero. */
1860
1861 if (minval == 0) length++;
1862 else if (minval > 1) length += (minval - 1) * duplength;
1863 if (maxval > minval) length += (maxval - minval) * (duplength + 1);
1864 }
1865 continue;
1866
1867 /* Non-special character. For a run of such characters the length required
1868 is the number of characters + 2, except that the maximum run length is 255.
1869 We won't get a skipped space or a non-data escape or the start of a #
1870 comment as the first character, so the length can't be zero. */
1871
1872 NORMAL_CHAR:
1873 default:
1874 length += 2;
1875 runlength = 0;
1876 do
1877 {
1878 if ((pcre_ctypes[c] & ctype_space) != 0)
1879 {
1880 if ((options & PCRE_EXTENDED) != 0) continue;
1881 spaces++;
1882 }
1883
1884 if (c == '#' && (options & PCRE_EXTENDED) != 0)
1885 {
1886 while ((c = *(++ptr)) != 0 && c != '\n');
1887 continue;
1888 }
1889
1890 /* Backslash may introduce a data char or a metacharacter; stop the
1891 string before the latter. */
1892
1893 if (c == '\\')
1894 {
1895 const uschar *saveptr = ptr;
1896 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
1897 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1898 if (c < 0) { ptr = saveptr; break; }
1899 }
1900
1901 /* Ordinary character or single-char escape */
1902
1903 runlength++;
1904 }
1905
1906 /* This "while" is the end of the "do" above. */
1907
1908 while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
1909
1910 ptr--;
1911 length += runlength;
1912 continue;
1913 }
1914 }
1915
1916 length += 4; /* For final KET and END */
1917
1918 if (length > 65539)
1919 {
1920 *errorptr = ERR20;
1921 return NULL;
1922 }
1923
1924 /* Compute the size of data block needed and get it, either from malloc or
1925 externally provided function. We specify "code[0]" in the offsetof() expression
1926 rather than just "code", because it has been reported that one broken compiler
1927 fails on "code" because it is also an independent variable. It should make no
1928 difference to the value of the offsetof(). */
1929
1930 size = length + offsetof(real_pcre, code[0]);
1931 re = (real_pcre *)(pcre_malloc)(size);
1932
1933 if (re == NULL)
1934 {
1935 *errorptr = ERR21;
1936 return NULL;
1937 }
1938
1939 /* Put in the magic number and the options. */
1940
1941 re->magic_number = MAGIC_NUMBER;
1942 re->options = options;
1943
1944 /* Set up a starting, non-extracting bracket, then compile the expression. On
1945 error, *errorptr will be set non-NULL, so we don't need to look at the result
1946 of the function here. */
1947
1948 ptr = (const uschar *)pattern;
1949 code = re->code;
1950 *code = OP_BRA;
1951 bracount = 0;
1952 (void)compile_regex(options, &bracount, &code, &ptr, errorptr);
1953 re->top_bracket = bracount;
1954 re->top_backref = top_backref;
1955
1956 /* If not reached end of pattern on success, there's an excess bracket. */
1957
1958 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
1959
1960 /* Fill in the terminating state and check for disastrous overflow, but
1961 if debugging, leave the test till after things are printed out. */
1962
1963 *code++ = OP_END;
1964
1965 #ifndef DEBUG
1966 if (code - re->code > length) *errorptr = ERR23;
1967 #endif
1968
1969 /* Failed to compile */
1970
1971 if (*errorptr != NULL)
1972 {
1973 (pcre_free)(re);
1974 PCRE_ERROR_RETURN:
1975 *erroroffset = ptr - (const uschar *)pattern;
1976 return NULL;
1977 }
1978
1979 /* If the anchored option was not passed, set flag if we can determine that it
1980 is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
1981 we can determine what the first character has to be, because that speeds up
1982 unanchored matches no end. In the case of multiline matches, an alternative is
1983 to set the PCRE_STARTLINE flag if all branches start with ^. */
1984
1985 if ((options & PCRE_ANCHORED) == 0)
1986 {
1987 if (is_anchored(re->code, (options & PCRE_MULTILINE) != 0))
1988 re->options |= PCRE_ANCHORED;
1989 else
1990 {
1991 int ch = find_firstchar(re->code);
1992 if (ch >= 0)
1993 {
1994 re->first_char = ch;
1995 re->options |= PCRE_FIRSTSET;
1996 }
1997 else if (is_startline(re->code))
1998 re->options |= PCRE_STARTLINE;
1999 }
2000 }
2001
2002 /* Print out the compiled data for debugging */
2003
2004 #ifdef DEBUG
2005
2006 printf("Length = %d top_bracket = %d top_backref=%d\n",
2007 length, re->top_bracket, re->top_backref);
2008
2009 if (re->options != 0)
2010 {
2011 printf("%s%s%s%s%s%s%s%s\n",
2012 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2013 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2014 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2015 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2016 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
2017 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
2018 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
2019 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
2020 }
2021
2022 if ((re->options & PCRE_FIRSTSET) != 0)
2023 {
2024 if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
2025 else printf("First char = \\x%02x\n", re->first_char);
2026 }
2027
2028 code_end = code;
2029 code_base = code = re->code;
2030
2031 while (code < code_end)
2032 {
2033 int charlength;
2034
2035 printf("%3d ", code - code_base);
2036
2037 if (*code >= OP_BRA)
2038 {
2039 printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
2040 code += 2;
2041 }
2042
2043 else switch(*code)
2044 {
2045 case OP_CHARS:
2046 charlength = *(++code);
2047 printf("%3d ", charlength);
2048 while (charlength-- > 0)
2049 if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2050 break;
2051
2052 case OP_KETRMAX:
2053 case OP_KETRMIN:
2054 case OP_ALT:
2055 case OP_KET:
2056 case OP_ASSERT:
2057 case OP_ASSERT_NOT:
2058 case OP_ONCE:
2059 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2060 code += 2;
2061 break;
2062
2063 case OP_STAR:
2064 case OP_MINSTAR:
2065 case OP_PLUS:
2066 case OP_MINPLUS:
2067 case OP_QUERY:
2068 case OP_MINQUERY:
2069 case OP_TYPESTAR:
2070 case OP_TYPEMINSTAR:
2071 case OP_TYPEPLUS:
2072 case OP_TYPEMINPLUS:
2073 case OP_TYPEQUERY:
2074 case OP_TYPEMINQUERY:
2075 if (*code >= OP_TYPESTAR)
2076 printf(" %s", OP_names[code[1]]);
2077 else if (isprint(c = code[1])) printf(" %c", c);
2078 else printf(" \\x%02x", c);
2079 printf("%s", OP_names[*code++]);
2080 break;
2081
2082 case OP_EXACT:
2083 case OP_UPTO:
2084 case OP_MINUPTO:
2085 if (isprint(c = code[3])) printf(" %c{", c);
2086 else printf(" \\x%02x{", c);
2087 if (*code != OP_EXACT) printf("0,");
2088 printf("%d}", (code[1] << 8) + code[2]);
2089 if (*code == OP_MINUPTO) printf("?");
2090 code += 3;
2091 break;
2092
2093 case OP_TYPEEXACT:
2094 case OP_TYPEUPTO:
2095 case OP_TYPEMINUPTO:
2096 printf(" %s{", OP_names[code[3]]);
2097 if (*code != OP_TYPEEXACT) printf(",");
2098 printf("%d}", (code[1] << 8) + code[2]);
2099 if (*code == OP_TYPEMINUPTO) printf("?");
2100 code += 3;
2101 break;
2102
2103 case OP_NOT:
2104 if (isprint(c = *(++code))) printf(" [^%c]", c);
2105 else printf(" [^\\x%02x]", c);
2106 break;
2107
2108 case OP_NOTSTAR:
2109 case OP_NOTMINSTAR:
2110 case OP_NOTPLUS:
2111 case OP_NOTMINPLUS:
2112 case OP_NOTQUERY:
2113 case OP_NOTMINQUERY:
2114 if (isprint(c = code[1])) printf(" [^%c]", c);
2115 else printf(" [^\\x%02x]", c);
2116 printf("%s", OP_names[*code++]);
2117 break;
2118
2119 case OP_NOTEXACT:
2120 case OP_NOTUPTO:
2121 case OP_NOTMINUPTO:
2122 if (isprint(c = code[3])) printf(" [^%c]{", c);
2123 else printf(" [^\\x%02x]{", c);
2124 if (*code != OP_NOTEXACT) printf(",");
2125 printf("%d}", (code[1] << 8) + code[2]);
2126 if (*code == OP_NOTMINUPTO) printf("?");
2127 code += 3;
2128 break;
2129
2130 case OP_REF:
2131 printf(" \\%d", *(++code));
2132 code ++;
2133 goto CLASS_REF_REPEAT;
2134
2135 case OP_CLASS:
2136 case OP_NEGCLASS:
2137 {
2138 int i, min, max;
2139
2140 if (*code++ == OP_CLASS) printf(" [");
2141 else printf(" ^[");
2142
2143 for (i = 0; i < 256; i++)
2144 {
2145 if ((code[i/8] & (1 << (i&7))) != 0)
2146 {
2147 int j;
2148 for (j = i+1; j < 256; j++)
2149 if ((code[j/8] & (1 << (j&7))) == 0) break;
2150 if (i == '-' || i == ']') printf("\\");
2151 if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2152 if (--j > i)
2153 {
2154 printf("-");
2155 if (j == '-' || j == ']') printf("\\");
2156 if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2157 }
2158 i = j;
2159 }
2160 }
2161 printf("]");
2162 code += 32;
2163
2164 CLASS_REF_REPEAT:
2165
2166 switch(*code)
2167 {
2168 case OP_CRSTAR:
2169 case OP_CRMINSTAR:
2170 case OP_CRPLUS:
2171 case OP_CRMINPLUS:
2172 case OP_CRQUERY:
2173 case OP_CRMINQUERY:
2174 printf("%s", OP_names[*code]);
2175 break;
2176
2177 case OP_CRRANGE:
2178 case OP_CRMINRANGE:
2179 min = (code[1] << 8) + code[2];
2180 max = (code[3] << 8) + code[4];
2181 if (max == 0) printf("{%d,}", min);
2182 else printf("{%d,%d}", min, max);
2183 if (*code == OP_CRMINRANGE) printf("?");
2184 code += 4;
2185 break;
2186
2187 default:
2188 code--;
2189 }
2190 }
2191 break;
2192
2193 /* Anything else is just a one-node item */
2194
2195 default:
2196 printf(" %s", OP_names[*code]);
2197 break;
2198 }
2199
2200 code++;
2201 printf("\n");
2202 }
2203 printf("------------------------------------------------------------------\n");
2204
2205 /* This check is done here in the debugging case so that the code that
2206 was compiled can be seen. */
2207
2208 if (code - re->code > length)
2209 {
2210 *errorptr = ERR23;
2211 (pcre_free)(re);
2212 *erroroffset = ptr - (uschar *)pattern;
2213 return NULL;
2214 }
2215 #endif
2216
2217 return (pcre *)re;
2218 }
2219
2220
2221
2222 /*************************************************
2223 * Match a character type *
2224 *************************************************/
2225
2226 /* Not used in all the places it might be as it's sometimes faster
2227 to put the code inline.
2228
2229 Arguments:
2230 type the character type
2231 c the character
2232 dotall the dotall flag
2233
2234 Returns: TRUE if character is of the type
2235 */
2236
2237 static BOOL
2238 match_type(int type, int c, BOOL dotall)
2239 {
2240
2241 #ifdef DEBUG
2242 if (isprint(c)) printf("matching subject %c against ", c);
2243 else printf("matching subject \\x%02x against ", c);
2244 printf("%s\n", OP_names[type]);
2245 #endif
2246
2247 switch(type)
2248 {
2249 case OP_ANY: return dotall || c != '\n';
2250 case OP_NOT_DIGIT: return (pcre_ctypes[c] & ctype_digit) == 0;
2251 case OP_DIGIT: return (pcre_ctypes[c] & ctype_digit) != 0;
2252 case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;
2253 case OP_WHITESPACE: return (pcre_ctypes[c] & ctype_space) != 0;
2254 case OP_NOT_WORDCHAR: return (pcre_ctypes[c] & ctype_word) == 0;
2255 case OP_WORDCHAR: return (pcre_ctypes[c] & ctype_word) != 0;
2256 }
2257 return FALSE;
2258 }
2259
2260
2261
2262 /*************************************************
2263 * Match a back-reference *
2264 *************************************************/
2265
2266 /* If a back reference hasn't been set, the match fails.
2267
2268 Arguments:
2269 number reference number
2270 eptr points into the subject
2271 length length to be matched
2272 md points to match data block
2273
2274 Returns: TRUE if matched
2275 */
2276
2277 static BOOL
2278 match_ref(int number, register const uschar *eptr, int length, match_data *md)
2279 {
2280 const uschar *p = md->start_subject + md->offset_vector[number];
2281
2282 #ifdef DEBUG
2283 if (eptr >= md->end_subject)
2284 printf("matching subject <null>");
2285 else
2286 {
2287 printf("matching subject ");
2288 pchars(eptr, length, TRUE, md);
2289 }
2290 printf(" against backref ");
2291 pchars(p, length, FALSE, md);
2292 printf("\n");
2293 #endif
2294
2295 /* Always fail if not enough characters left */
2296
2297 if (length > md->end_subject - p) return FALSE;
2298
2299 /* Separate the caselesss case for speed */
2300
2301 if (md->caseless)
2302 { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; }
2303 else
2304 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
2305
2306 return TRUE;
2307 }
2308
2309
2310
2311 /*************************************************
2312 * Match from current position *
2313 *************************************************/
2314
2315 /* On entry ecode points to the first opcode, and eptr to the first character.
2316
2317 Arguments:
2318 eptr pointer in subject
2319 ecode position in code
2320 offset_top current top pointer
2321 md pointer to "static" info for the match
2322
2323 Returns: TRUE if matched
2324 */
2325
2326 static BOOL
2327 match(register const uschar *eptr, register const uschar *ecode, int offset_top,
2328 match_data *md)
2329 {
2330 for (;;)
2331 {
2332 int min, max, ctype;
2333 register int i;
2334 register int c;
2335 BOOL minimize = FALSE;
2336
2337 /* Opening bracket. Check the alternative branches in turn, failing if none
2338 match. We have to set the start offset if required and there is space
2339 in the offset vector so that it is available for subsequent back references
2340 if the bracket matches. However, if the bracket fails, we must put back the
2341 previous value of both offsets in case they were set by a previous copy of
2342 the same bracket. Don't worry about setting the flag for the error case here;
2343 that is handled in the code for KET. */
2344
2345 if ((int)*ecode >= OP_BRA)
2346 {
2347 int number = (*ecode - OP_BRA) << 1;
2348 int save_offset1 = 0, save_offset2 = 0;
2349
2350 DPRINTF(("start bracket %d\n", number/2));
2351
2352 if (number > 0 && number < md->offset_end)
2353 {
2354 save_offset1 = md->offset_vector[number];
2355 save_offset2 = md->offset_vector[number+1];
2356 md->offset_vector[number] = eptr - md->start_subject;
2357
2358 DPRINTF(("saving %d %d\n", save_offset1, save_offset2));
2359 }
2360
2361 /* Recurse for all the alternatives. */
2362
2363 do
2364 {
2365 if (match(eptr, ecode+3, offset_top, md)) return TRUE;
2366 ecode += (ecode[1] << 8) + ecode[2];
2367 }
2368 while (*ecode == OP_ALT);
2369
2370 DPRINTF(("bracket %d failed\n", number/2));
2371
2372 if (number > 0 && number < md->offset_end)
2373 {
2374 md->offset_vector[number] = save_offset1;
2375 md->offset_vector[number+1] = save_offset2;
2376 }
2377
2378 return FALSE;
2379 }
2380
2381 /* Other types of node can be handled by a switch */
2382
2383 switch(*ecode)
2384 {
2385 case OP_END:
2386 md->end_match_ptr = eptr; /* Record where we ended */
2387 md->end_offset_top = offset_top; /* and how many extracts were taken */
2388 return TRUE;
2389
2390 /* The equivalent of Prolog's "cut" - if the rest doesn't match, the
2391 whole thing doesn't match, so we have to get out via a longjmp(). */
2392
2393 case OP_CUT:
2394 if (match(eptr, ecode+1, offset_top, md)) return TRUE;
2395 longjmp(md->fail_env, 1);
2396
2397 /* Assertion brackets. Check the alternative branches in turn - the
2398 matching won't pass the KET for an assertion. If any one branch matches,
2399 the assertion is true. */
2400
2401 case OP_ASSERT:
2402 do
2403 {
2404 if (match(eptr, ecode+3, offset_top, md)) break;
2405 ecode += (ecode[1] << 8) + ecode[2];
2406 }
2407 while (*ecode == OP_ALT);
2408 if (*ecode == OP_KET) return FALSE;
2409
2410 /* Continue from after the assertion, updating the offsets high water
2411 mark, since extracts may have been taken during the assertion. */
2412
2413 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2414 ecode += 3;
2415 offset_top = md->end_offset_top;
2416 continue;
2417
2418 /* Negative assertion: all branches must fail to match */
2419
2420 case OP_ASSERT_NOT:
2421 do
2422 {
2423 if (match(eptr, ecode+3, offset_top, md)) return FALSE;
2424 ecode += (ecode[1] << 8) + ecode[2];
2425 }
2426 while (*ecode == OP_ALT);
2427 ecode += 3;
2428 continue;
2429
2430 /* "Once" brackets are like assertion brackets except that after a match,
2431 the point in the subject string is not moved back. Thus there can never be
2432 a move back into the brackets. Check the alternative branches in turn - the
2433 matching won't pass the KET for this kind of subpattern. If any one branch
2434 matches, we carry on, leaving the subject pointer. */
2435
2436 case OP_ONCE:
2437 do
2438 {
2439 if (match(eptr, ecode+3, offset_top, md)) break;
2440 ecode += (ecode[1] << 8) + ecode[2];
2441 }
2442 while (*ecode == OP_ALT);
2443 if (*ecode == OP_KET) return FALSE;
2444
2445 /* Continue as from after the assertion, updating the offsets high water
2446 mark, since extracts may have been taken. */
2447
2448 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2449 ecode += 3;
2450 offset_top = md->end_offset_top;
2451 eptr = md->end_match_ptr;
2452 continue;
2453
2454 /* An alternation is the end of a branch; scan along to find the end of the
2455 bracketed group and go to there. */
2456
2457 case OP_ALT:
2458 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2459 break;
2460
2461 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
2462 that it may occur zero times. It may repeat infinitely, or not at all -
2463 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
2464 repeat limits are compiled as a number of copies, with the optional ones
2465 preceded by BRAZERO or BRAMINZERO. */
2466
2467 case OP_BRAZERO:
2468 {
2469 const uschar *next = ecode+1;
2470 if (match(eptr, next, offset_top, md)) return TRUE;
2471 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
2472 ecode = next + 3;
2473 }
2474 break;
2475
2476 case OP_BRAMINZERO:
2477 {
2478 const uschar *next = ecode+1;
2479 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
2480 if (match(eptr, next+3, offset_top, md)) return TRUE;
2481 ecode++;
2482 }
2483 break;;
2484
2485 /* End of a group, repeated or non-repeating. If we are at the end of
2486 an assertion "group", stop matching and return TRUE, but record the
2487 current high water mark for use by positive assertions. */
2488
2489 case OP_KET:
2490 case OP_KETRMIN:
2491 case OP_KETRMAX:
2492 {
2493 int number;
2494 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
2495
2496 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || *prev == OP_ONCE)
2497 {
2498 md->end_match_ptr = eptr; /* For ONCE */
2499 md->end_offset_top = offset_top;
2500 return TRUE;
2501 }
2502
2503 /* In all other cases we have to check the group number back at the
2504 start and if necessary complete handling an extraction by setting the
2505 final offset and bumping the high water mark. */
2506
2507 number = (*prev - OP_BRA) << 1;
2508
2509 DPRINTF(("end bracket %d\n", number/2));
2510
2511 if (number > 0)
2512 {
2513 if (number >= md->offset_end) md->offset_overflow = TRUE; else
2514 {
2515 md->offset_vector[number+1] = eptr - md->start_subject;
2516 if (offset_top <= number) offset_top = number + 2;
2517 }
2518 }
2519
2520 /* For a non-repeating ket, just advance to the next node and continue at
2521 this level. */
2522
2523 if (*ecode == OP_KET)
2524 {
2525 ecode += 3;
2526 break;
2527 }
2528
2529 /* The repeating kets try the rest of the pattern or restart from the
2530 preceding bracket, in the appropriate order. */
2531
2532 if (*ecode == OP_KETRMIN)
2533 {
2534 if (match(eptr, ecode+3, offset_top, md) ||
2535 match(eptr, prev, offset_top, md)) return TRUE;
2536 }
2537 else /* OP_KETRMAX */
2538 {
2539 if (match(eptr, prev, offset_top, md) ||
2540 match(eptr, ecode+3, offset_top, md)) return TRUE;
2541 }
2542 }
2543 return FALSE;
2544
2545 /* Start of subject unless notbol, or after internal newline if multiline */
2546
2547 case OP_CIRC:
2548 if (md->notbol && eptr == md->start_subject) return FALSE;
2549 if (md->multiline)
2550 {
2551 if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;
2552 ecode++;
2553 break;
2554 }
2555 /* ... else fall through */
2556
2557 /* Start of subject assertion */
2558
2559 case OP_SOD:
2560 if (eptr != md->start_subject) return FALSE;
2561 ecode++;
2562 break;
2563
2564 /* Assert before internal newline if multiline, or before
2565 a terminating newline unless endonly is set, else end of subject unless
2566 noteol is set. */
2567
2568 case OP_DOLL:
2569 if (md->noteol && eptr >= md->end_subject) return FALSE;
2570 if (md->multiline)
2571 {
2572 if (eptr < md->end_subject && *eptr != '\n') return FALSE;
2573 ecode++;
2574 break;
2575 }
2576 else if (!md->endonly)
2577 {
2578 if (eptr < md->end_subject - 1 ||
2579 (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
2580 ecode++;
2581 break;
2582 }
2583 /* ... else fall through */
2584
2585 /* End of subject assertion */
2586
2587 case OP_EOD:
2588 if (eptr < md->end_subject) return FALSE;
2589 ecode++;
2590 break;
2591
2592 /* Word boundary assertions */
2593
2594 case OP_NOT_WORD_BOUNDARY:
2595 case OP_WORD_BOUNDARY:
2596 {
2597 BOOL prev_is_word = (eptr != md->start_subject) &&
2598 ((pcre_ctypes[eptr[-1]] & ctype_word) != 0);
2599 BOOL cur_is_word = (eptr < md->end_subject) &&
2600 ((pcre_ctypes[*eptr] & ctype_word) != 0);
2601 if ((*ecode++ == OP_WORD_BOUNDARY)?
2602 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2603 return FALSE;
2604 }
2605 break;
2606
2607 /* Match a single character type; inline for speed */
2608
2609 case OP_ANY:
2610 if (!md->dotall && eptr < md->end_subject && *eptr == '\n') return FALSE;
2611 if (eptr++ >= md->end_subject) return FALSE;
2612 ecode++;
2613 break;
2614
2615 case OP_NOT_DIGIT:
2616 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) != 0)
2617 return FALSE;
2618 ecode++;
2619 break;
2620
2621 case OP_DIGIT:
2622 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) == 0)
2623 return FALSE;
2624 ecode++;
2625 break;
2626
2627 case OP_NOT_WHITESPACE:
2628 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) != 0)
2629 return FALSE;
2630 ecode++;
2631 break;
2632
2633 case OP_WHITESPACE:
2634 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) == 0)
2635 return FALSE;
2636 ecode++;
2637 break;
2638
2639 case OP_NOT_WORDCHAR:
2640 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) != 0)
2641 return FALSE;
2642 ecode++;
2643 break;
2644
2645 case OP_WORDCHAR:
2646 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) == 0)
2647 return FALSE;
2648 ecode++;
2649 break;
2650
2651 /* Match a back reference, possibly repeatedly. Look past the end of the
2652 item to see if there is repeat information following. The code is similar
2653 to that for character classes, but repeated for efficiency. Then obey
2654 similar code to character type repeats - written out again for speed.
2655 However, if the referenced string is the empty string, always treat
2656 it as matched, any number of times (otherwise there could be infinite
2657 loops). */
2658
2659 case OP_REF:
2660 {
2661 int length;
2662 int number = ecode[1] << 1; /* Doubled reference number */
2663 ecode += 2; /* Advance past the item */
2664
2665 if (number >= offset_top || md->offset_vector[number] < 0)
2666 {
2667 md->errorcode = PCRE_ERROR_BADREF;
2668 return FALSE;
2669 }
2670
2671 length = md->offset_vector[number+1] - md->offset_vector[number];
2672
2673 switch (*ecode)
2674 {
2675 case OP_CRSTAR:
2676 case OP_CRMINSTAR:
2677 case OP_CRPLUS:
2678 case OP_CRMINPLUS:
2679 case OP_CRQUERY:
2680 case OP_CRMINQUERY:
2681 c = *ecode++ - OP_CRSTAR;
2682 minimize = (c & 1) != 0;
2683 min = rep_min[c]; /* Pick up values from tables; */
2684 max = rep_max[c]; /* zero for max => infinity */
2685 if (max == 0) max = INT_MAX;
2686 break;
2687
2688 case OP_CRRANGE:
2689 case OP_CRMINRANGE:
2690 minimize = (*ecode == OP_CRMINRANGE);
2691 min = (ecode[1] << 8) + ecode[2];
2692 max = (ecode[3] << 8) + ecode[4];
2693 if (max == 0) max = INT_MAX;
2694 ecode += 5;
2695 break;
2696
2697 default: /* No repeat follows */
2698 if (!match_ref(number, eptr, length, md)) return FALSE;
2699 eptr += length;
2700 continue; /* With the main loop */
2701 }
2702
2703 /* If the length of the reference is zero, just continue with the
2704 main loop. */
2705
2706 if (length == 0) continue;
2707
2708 /* First, ensure the minimum number of matches are present. We get back
2709 the length of the reference string explicitly rather than passing the
2710 address of eptr, so that eptr can be a register variable. */
2711
2712 for (i = 1; i <= min; i++)
2713 {
2714 if (!match_ref(number, eptr, length, md)) return FALSE;
2715 eptr += length;
2716 }
2717
2718 /* If min = max, continue at the same level without recursion.
2719 They are not both allowed to be zero. */
2720
2721 if (min == max) continue;
2722
2723 /* If minimizing, keep trying and advancing the pointer */
2724
2725 if (minimize)
2726 {
2727 for (i = min;; i++)
2728 {
2729 if (match(eptr, ecode, offset_top, md)) return TRUE;
2730 if (i >= max || !match_ref(number, eptr, length, md))
2731 return FALSE;
2732 eptr += length;
2733 }
2734 /* Control never gets here */
2735 }
2736
2737 /* If maximizing, find the longest string and work backwards */
2738
2739 else
2740 {
2741 const uschar *pp = eptr;
2742 for (i = min; i < max; i++)
2743 {
2744 if (!match_ref(number, eptr, length, md)) break;
2745 eptr += length;
2746 }
2747 while (eptr >= pp)
2748 {
2749 if (match(eptr, ecode, offset_top, md)) return TRUE;
2750 eptr -= length;
2751 }
2752 return FALSE;
2753 }
2754 }
2755 /* Control never gets here */
2756
2757 /* Match a character class, possibly repeatedly. Look past the end of the
2758 item to see if there is repeat information following. Then obey similar
2759 code to character type repeats - written out again for speed. If caseless
2760 matching was set at runtime but not at compile time, we have to check both
2761 versions of a character, and we have to behave differently for positive and
2762 negative classes. This is the only time where OP_CLASS and OP_NEGCLASS are
2763 treated differently. */
2764
2765 case OP_CLASS:
2766 case OP_NEGCLASS:
2767 {
2768 BOOL nasty_case = *ecode == OP_NEGCLASS && md->runtime_caseless;
2769 const uschar *data = ecode + 1; /* Save for matching */
2770 ecode += 33; /* Advance past the item */
2771
2772 switch (*ecode)
2773 {
2774 case OP_CRSTAR:
2775 case OP_CRMINSTAR:
2776 case OP_CRPLUS:
2777 case OP_CRMINPLUS:
2778 case OP_CRQUERY:
2779 case OP_CRMINQUERY:
2780 c = *ecode++ - OP_CRSTAR;
2781 minimize = (c & 1) != 0;
2782 min = rep_min[c]; /* Pick up values from tables; */
2783 max = rep_max[c]; /* zero for max => infinity */
2784 if (max == 0) max = INT_MAX;
2785 break;
2786
2787 case OP_CRRANGE:
2788 case OP_CRMINRANGE:
2789 minimize = (*ecode == OP_CRMINRANGE);
2790 min = (ecode[1] << 8) + ecode[2];
2791 max = (ecode[3] << 8) + ecode[4];
2792 if (max == 0) max = INT_MAX;
2793 ecode += 5;
2794 break;
2795
2796 default: /* No repeat follows */
2797 min = max = 1;
2798 break;
2799 }
2800
2801 /* First, ensure the minimum number of matches are present. */
2802
2803 for (i = 1; i <= min; i++)
2804 {
2805 if (eptr >= md->end_subject) return FALSE;
2806 c = *eptr++;
2807
2808 /* Either not runtime caseless, or it was a positive class. For
2809 runtime caseless, continue if either case is in the map. */
2810
2811 if (!nasty_case)
2812 {
2813 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2814 if (md->runtime_caseless)
2815 {
2816 c = pcre_fcc[c];
2817 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2818 }
2819 }
2820
2821 /* Runtime caseless and it was a negative class. Continue only if
2822 both cases are in the map. */
2823
2824 else
2825 {
2826 if ((data[c/8] & (1 << (c&7))) == 0) return FALSE;
2827 c = pcre_fcc[c];
2828 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2829 }
2830
2831 return FALSE;
2832 }
2833
2834 /* If max == min we can continue with the main loop without the
2835 need to recurse. */
2836
2837 if (min == max) continue;
2838
2839 /* If minimizing, keep testing the rest of the expression and advancing
2840 the pointer while it matches the class. */
2841
2842 if (minimize)
2843 {
2844 for (i = min;; i++)
2845 {
2846 if (match(eptr, ecode, offset_top, md)) return TRUE;
2847 if (i >= max || eptr >= md->end_subject) return FALSE;
2848 c = *eptr++;
2849
2850 /* Either not runtime caseless, or it was a positive class. For
2851 runtime caseless, continue if either case is in the map. */
2852
2853 if (!nasty_case)
2854 {
2855 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2856 if (md->runtime_caseless)
2857 {
2858 c = pcre_fcc[c];
2859 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2860 }
2861 }
2862
2863 /* Runtime caseless and it was a negative class. Continue only if
2864 both cases are in the map. */
2865
2866 else
2867 {
2868 if ((data[c/8] & (1 << (c&7))) == 0) return FALSE;
2869 c = pcre_fcc[c];
2870 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2871 }
2872
2873 return FALSE;
2874 }
2875 /* Control never gets here */
2876 }
2877
2878 /* If maximizing, find the longest possible run, then work backwards. */
2879
2880 else
2881 {
2882 const uschar *pp = eptr;
2883 for (i = min; i < max; eptr++, i++)
2884 {
2885 if (eptr >= md->end_subject) break;
2886 c = *eptr;
2887
2888 /* Either not runtime caseless, or it was a positive class. For
2889 runtime caseless, continue if either case is in the map. */
2890
2891 if (!nasty_case)
2892 {
2893 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2894 if (md->runtime_caseless)
2895 {
2896 c = pcre_fcc[c];
2897 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2898 }
2899 }
2900
2901 /* Runtime caseless and it was a negative class. Continue only if
2902 both cases are in the map. */
2903
2904 else
2905 {
2906 if ((data[c/8] & (1 << (c&7))) == 0) break;
2907 c = pcre_fcc[c];
2908 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2909 }
2910
2911 break;
2912 }
2913
2914 while (eptr >= pp)
2915 if (match(eptr--, ecode, offset_top, md)) return TRUE;
2916 return FALSE;
2917 }
2918 }
2919 /* Control never gets here */
2920
2921 /* Match a run of characters */
2922
2923 case OP_CHARS:
2924 {
2925 register int length = ecode[1];
2926 ecode += 2;
2927
2928 #ifdef DEBUG /* Sigh. Some compilers never learn. */
2929 if (eptr >= md->end_subject)
2930 printf("matching subject <null> against pattern ");
2931 else
2932 {
2933 printf("matching subject ");
2934 pchars(eptr, length, TRUE, md);
2935 printf(" against pattern ");
2936 }
2937 pchars(ecode, length, FALSE, md);
2938 printf("\n");
2939 #endif
2940
2941 if (length > md->end_subject - eptr) return FALSE;
2942 if (md->caseless)
2943 {
2944 while (length-- > 0) if (pcre_lcc[*ecode++] != pcre_lcc[*eptr++]) return FALSE;
2945 }
2946 else
2947 {
2948 while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
2949 }
2950 }
2951 break;
2952
2953 /* Match a single character repeatedly; different opcodes share code. */
2954
2955 case OP_EXACT:
2956 min = max = (ecode[1] << 8) + ecode[2];
2957 ecode += 3;
2958 goto REPEATCHAR;
2959
2960 case OP_UPTO:
2961 case OP_MINUPTO:
2962 min = 0;
2963 max = (ecode[1] << 8) + ecode[2];
2964 minimize = *ecode == OP_MINUPTO;
2965 ecode += 3;
2966 goto REPEATCHAR;
2967
2968 case OP_STAR:
2969 case OP_MINSTAR:
2970 case OP_PLUS:
2971 case OP_MINPLUS:
2972 case OP_QUERY:
2973 case OP_MINQUERY:
2974 c = *ecode++ - OP_STAR;
2975 minimize = (c & 1) != 0;
2976 min = rep_min[c]; /* Pick up values from tables; */
2977 max = rep_max[c]; /* zero for max => infinity */
2978 if (max == 0) max = INT_MAX;
2979
2980 /* Common code for all repeated single-character matches. We can give
2981 up quickly if there are fewer than the minimum number of characters left in
2982 the subject. */
2983
2984 REPEATCHAR:
2985 if (min > md->end_subject - eptr) return FALSE;
2986 c = *ecode++;
2987
2988 /* The code is duplicated for the caseless and caseful cases, for speed,
2989 since matching characters is likely to be quite common. First, ensure the
2990 minimum number of matches are present. If min = max, continue at the same
2991 level without recursing. Otherwise, if minimizing, keep trying the rest of
2992 the expression and advancing one matching character if failing, up to the
2993 maximum. Alternatively, if maximizing, find the maximum number of
2994 characters and work backwards. */
2995
2996 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
2997 max, eptr));
2998
2999 if (md->caseless)
3000 {
3001 c = pcre_lcc[c];
3002 for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) return FALSE;
3003 if (min == max) continue;
3004 if (minimize)
3005 {
3006 for (i = min;; i++)
3007 {
3008 if (match(eptr, ecode, offset_top, md)) return TRUE;
3009 if (i >= max || eptr >= md->end_subject || c != pcre_lcc[*eptr++])
3010 return FALSE;
3011 }
3012 /* Control never gets here */
3013 }
3014 else
3015 {
3016 const uschar *pp = eptr;
3017 for (i = min; i < max; i++)
3018 {
3019 if (eptr >= md->end_subject || c != pcre_lcc[*eptr]) break;
3020 eptr++;
3021 }
3022 while (eptr >= pp)
3023 if (match(eptr--, ecode, offset_top, md)) return TRUE;
3024 return FALSE;
3025 }
3026 /* Control never gets here */
3027 }
3028
3029 /* Caseful comparisons */
3030
3031 else
3032 {
3033 for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
3034 if (min == max) continue;
3035 if (minimize)
3036 {
3037 for (i = min;; i++)
3038 {
3039 if (match(eptr, ecode, offset_top, md)) return TRUE;
3040 if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
3041 }
3042 /* Control never gets here */
3043 }
3044 else
3045 {
3046 const uschar *pp = eptr;
3047 for (i = min; i < max; i++)
3048 {
3049 if (eptr >= md->end_subject || c != *eptr) break;
3050 eptr++;
3051 }
3052 while (eptr >= pp)
3053 if (match(eptr--, ecode, offset_top, md)) return TRUE;
3054 return FALSE;
3055 }
3056 }
3057 /* Control never gets here */
3058
3059 /* Match a negated single character */
3060
3061 case OP_NOT:
3062 if (eptr >= md->end_subject) return FALSE;
3063 ecode++;
3064 if (md->caseless)
3065 {
3066 if (pcre_lcc[*ecode++] == pcre_lcc[*eptr++]) return FALSE;
3067 }
3068 else
3069 {
3070 if (*ecode++ == *eptr++) return FALSE;
3071 }
3072 break;
3073
3074 /* Match a negated single character repeatedly. This is almost a repeat of
3075 the code for a repeated single character, but I haven't found a nice way of
3076 commoning these up that doesn't require a test of the positive/negative
3077 option for each character match. Maybe that wouldn't add very much to the
3078 time taken, but character matching *is* what this is all about... */
3079
3080 case OP_NOTEXACT:
3081 min = max = (ecode[1] << 8) + ecode[2];
3082 ecode += 3;
3083 goto REPEATNOTCHAR;
3084
3085 case OP_NOTUPTO:
3086 case OP_NOTMINUPTO:
3087 min = 0;
3088 max = (ecode[1] << 8) + ecode[2];
3089 minimize = *ecode == OP_NOTMINUPTO;
3090 ecode += 3;
3091 goto REPEATNOTCHAR;
3092
3093 case OP_NOTSTAR:
3094 case OP_NOTMINSTAR:
3095 case OP_NOTPLUS:
3096 case OP_NOTMINPLUS:
3097 case OP_NOTQUERY:
3098 case OP_NOTMINQUERY:
3099 c = *ecode++ - OP_NOTSTAR;
3100 minimize = (c & 1) != 0;
3101 min = rep_min[c]; /* Pick up values from tables; */
3102 max = rep_max[c]; /* zero for max => infinity */
3103 if (max == 0) max = INT_MAX;
3104
3105 /* Common code for all repeated single-character matches. We can give
3106 up quickly if there are fewer than the minimum number of characters left in
3107 the subject. */
3108
3109 REPEATNOTCHAR:
3110 if (min > md->end_subject - eptr) return FALSE;
3111 c = *ecode++;
3112
3113 /* The code is duplicated for the caseless and caseful cases, for speed,
3114 since matching characters is likely to be quite common. First, ensure the
3115 minimum number of matches are present. If min = max, continue at the same
3116 level without recursing. Otherwise, if minimizing, keep trying the rest of
3117 the expression and advancing one matching character if failing, up to the
3118 maximum. Alternatively, if maximizing, find the maximum number of
3119 characters and work backwards. */
3120
3121 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
3122 max, eptr));
3123
3124 if (md->caseless)
3125 {
3126 c = pcre_lcc[c];
3127 for (i = 1; i <= min; i++) if (c == pcre_lcc[*eptr++]) return FALSE;
3128 if (min == max) continue;
3129 if (minimize)
3130 {
3131 for (i = min;; i++)
3132 {
3133 if (match(eptr, ecode, offset_top, md)) return TRUE;
3134 if (i >= max || eptr >= md->end_subject || c == pcre_lcc[*eptr++])
3135 return FALSE;
3136 }
3137 /* Control never gets here */
3138 }
3139 else
3140 {
3141 const uschar *pp = eptr;
3142 for (i = min; i < max; i++)
3143 {
3144 if (eptr >= md->end_subject || c == pcre_lcc[*eptr]) break;
3145 eptr++;
3146 }
3147 while (eptr >= pp)
3148 if (match(eptr--, ecode, offset_top, md)) return TRUE;
3149 return FALSE;
3150 }
3151 /* Control never gets here */
3152 }
3153
3154 /* Caseful comparisons */
3155
3156 else
3157 {
3158 for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
3159 if (min == max) continue;
3160 if (minimize)
3161 {
3162 for (i = min;; i++)
3163 {
3164 if (match(eptr, ecode, offset_top, md)) return TRUE;
3165 if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
3166 }
3167 /* Control never gets here */
3168 }
3169 else
3170 {
3171 const uschar *pp = eptr;
3172 for (i = min; i < max; i++)
3173 {
3174 if (eptr >= md->end_subject || c == *eptr) break;
3175 eptr++;
3176 }
3177 while (eptr >= pp)
3178 if (match(eptr--, ecode, offset_top, md)) return TRUE;
3179 return FALSE;
3180 }
3181 }
3182 /* Control never gets here */
3183
3184 /* Match a single character type repeatedly; several different opcodes
3185 share code. This is very similar to the code for single characters, but we
3186 repeat it in the interests of efficiency. */
3187
3188 case OP_TYPEEXACT:
3189 min = max = (ecode[1] << 8) + ecode[2];
3190 minimize = TRUE;
3191 ecode += 3;
3192 goto REPEATTYPE;
3193
3194 case OP_TYPEUPTO:
3195 case OP_TYPEMINUPTO:
3196 min = 0;
3197 max = (ecode[1] << 8) + ecode[2];
3198 minimize = *ecode == OP_TYPEMINUPTO;
3199 ecode += 3;
3200 goto REPEATTYPE;
3201
3202 case OP_TYPESTAR:
3203 case OP_TYPEMINSTAR:
3204 case OP_TYPEPLUS:
3205 case OP_TYPEMINPLUS:
3206 case OP_TYPEQUERY:
3207 case OP_TYPEMINQUERY:
3208 c = *ecode++ - OP_TYPESTAR;
3209 minimize = (c & 1) != 0;
3210 min = rep_min[c]; /* Pick up values from tables; */
3211 max = rep_max[c]; /* zero for max => infinity */
3212 if (max == 0) max = INT_MAX;
3213
3214 /* Common code for all repeated single character type matches */
3215
3216 REPEATTYPE:
3217 ctype = *ecode++; /* Code for the character type */
3218
3219 /* First, ensure the minimum number of matches are present. Use inline
3220 code for maximizing the speed, and do the type test once at the start
3221 (i.e. keep it out of the loop). Also test that there are at least the
3222 minimum number of characters before we start. */
3223
3224 if (min > md->end_subject - eptr) return FALSE;
3225 if (min > 0) switch(ctype)
3226 {
3227 case OP_ANY:
3228 if (!md->dotall)
3229 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }
3230 else eptr += min;
3231 break;
3232
3233 case OP_NOT_DIGIT:
3234 for (i = 1; i <= min; i++)
3235 if ((pcre_ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
3236 break;
3237
3238 case OP_DIGIT:
3239 for (i = 1; i <= min; i++)
3240 if ((pcre_ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
3241 break;
3242
3243 case OP_NOT_WHITESPACE:
3244 for (i = 1; i <= min; i++)
3245 if ((pcre_ctypes[*eptr++] & ctype_space) != 0) return FALSE;
3246 break;
3247
3248 case OP_WHITESPACE:
3249 for (i = 1; i <= min; i++)
3250 if ((pcre_ctypes[*eptr++] & ctype_space) == 0) return FALSE;
3251 break;
3252
3253 case OP_NOT_WORDCHAR:
3254 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) != 0)
3255 return FALSE;
3256 break;
3257
3258 case OP_WORDCHAR:
3259 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) == 0)
3260 return FALSE;
3261 break;
3262 }
3263
3264 /* If min = max, continue at the same level without recursing */
3265
3266 if (min == max) continue;
3267
3268 /* If minimizing, we have to test the rest of the pattern before each
3269 subsequent match, so inlining isn't much help; just use the function. */
3270
3271 if (minimize)
3272 {
3273 for (i = min;; i++)
3274 {
3275 if (match(eptr, ecode, offset_top, md)) return TRUE;
3276 if (i >= max || eptr >= md->end_subject ||
3277 !match_type(ctype, *eptr++, md->dotall))
3278 return FALSE;
3279 }
3280 /* Control never gets here */
3281 }
3282
3283 /* If maximizing it is worth using inline code for speed, doing the type
3284 test once at the start (i.e. keep it out of the loop). */
3285
3286 else
3287 {
3288 const uschar *pp = eptr;
3289 switch(ctype)
3290 {
3291 case OP_ANY:
3292 if (!md->dotall)
3293 {
3294 for (i = min; i < max; i++)
3295 {
3296 if (eptr >= md->end_subject || *eptr == '\n') break;
3297 eptr++;
3298 }
3299 }
3300 else
3301 {
3302 c = max - min;
3303 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
3304 eptr += c;
3305 }
3306 break;
3307
3308 case OP_NOT_DIGIT:
3309 for (i = min; i < max; i++)
3310 {
3311 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) != 0)
3312 break;
3313 eptr++;
3314 }
3315 break;
3316
3317 case OP_DIGIT:
3318 for (i = min; i < max; i++)
3319 {
3320 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) == 0)
3321 break;
3322 eptr++;
3323 }
3324 break;
3325
3326 case OP_NOT_WHITESPACE:
3327 for (i = min; i < max; i++)
3328 {
3329 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) != 0)
3330 break;
3331 eptr++;
3332 }
3333 break;
3334
3335 case OP_WHITESPACE:
3336 for (i = min; i < max; i++)
3337 {
3338 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) == 0)
3339 break;
3340 eptr++;
3341 }
3342 break;
3343
3344 case OP_NOT_WORDCHAR:
3345 for (i = min; i < max; i++)
3346 {
3347 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) != 0)
3348 break;
3349 eptr++;
3350 }
3351 break;
3352
3353 case OP_WORDCHAR:
3354 for (i = min; i < max; i++)
3355 {
3356 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) == 0)
3357 break;
3358 eptr++;
3359 }
3360 break;
3361 }
3362
3363 while (eptr >= pp)
3364 if (match(eptr--, ecode, offset_top, md)) return TRUE;
3365 return FALSE;
3366 }
3367 /* Control never gets here */
3368
3369 /* There's been some horrible disaster. */
3370
3371 default:
3372 DPRINTF(("Unknown opcode %d\n", *ecode));
3373 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
3374 return FALSE;
3375 }
3376
3377 /* Do not stick any code in here without much thought; it is assumed
3378 that "continue" in the code above comes out to here to repeat the main
3379 loop. */
3380
3381 } /* End of main loop */
3382 /* Control never reaches here */
3383 }
3384
3385
3386
3387 /*************************************************
3388 * Segregate setjmp() *
3389 *************************************************/
3390
3391 /* The -Wall option of gcc gives warnings for all local variables when setjmp()
3392 is used, even if the coding conforms to the rules of ANSI C. To avoid this, we
3393 hide it in a separate function. This is called only when PCRE_EXTRA is set,
3394 since it's needed only for the extension \X option, and with any luck, a good
3395 compiler will spot the tail recursion and compile it efficiently.
3396
3397 Arguments:
3398 eptr pointer in subject
3399 ecode position in code
3400 offset_top current top pointer
3401 md pointer to "static" info for the match
3402
3403 Returns: TRUE if matched
3404 */
3405
3406 static BOOL
3407 match_with_setjmp(const uschar *eptr, const uschar *ecode, int offset_top,
3408 match_data *match_block)
3409 {
3410 return setjmp(match_block->fail_env) == 0 &&
3411 match(eptr, ecode, offset_top, match_block);
3412 }
3413
3414
3415
3416 /*************************************************
3417 * Execute a Regular Expression *
3418 *************************************************/
3419
3420 /* This function applies a compiled re to a subject string and picks out
3421 portions of the string if it matches. Two elements in the vector are set for
3422 each substring: the offsets to the start and end of the substring.
3423
3424 Arguments:
3425 external_re points to the compiled expression
3426 external_extra points to "hints" from pcre_study() or is NULL
3427 subject points to the subject string
3428 length length of subject string (may contain binary zeros)
3429 options option bits
3430 offsets points to a vector of ints to be filled in with offsets
3431 offsetcount the number of elements in the vector
3432
3433 Returns: > 0 => success; value is the number of elements filled in
3434 = 0 => success, but offsets is not big enough
3435 -1 => failed to match
3436 < -1 => some kind of unexpected problem
3437 */
3438
3439 int
3440 pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
3441 const char *subject, int length, int options, int *offsets, int offsetcount)
3442 {
3443 int resetcount, ocount;
3444 int first_char = -1;
3445 match_data match_block;
3446 const uschar *start_bits = NULL;
3447 const uschar *start_match = (const uschar *)subject;
3448 const uschar *end_subject;
3449 const real_pcre *re = (const real_pcre *)external_re;
3450 const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
3451 BOOL using_temporary_offsets = FALSE;
3452 BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
3453 BOOL startline = (re->options & PCRE_STARTLINE) != 0;
3454
3455 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3456
3457 if (re == NULL || subject == NULL ||
3458 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3459 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
3460
3461 match_block.start_subject = (const uschar *)subject;
3462 match_block.end_subject = match_block.start_subject + length;
3463 end_subject = match_block.end_subject;
3464
3465 match_block.caseless = ((re->options | options) & PCRE_CASELESS) != 0;
3466 match_block.runtime_caseless = match_block.caseless &&
3467 (re->options & PCRE_CASELESS) == 0;
3468
3469 match_block.multiline = ((re->options | options) & PCRE_MULTILINE) != 0;
3470 match_block.dotall = ((re->options | options) & PCRE_DOTALL) != 0;
3471 match_block.endonly = ((re->options | options) & PCRE_DOLLAR_ENDONLY) != 0;
3472
3473 match_block.notbol = (options & PCRE_NOTBOL) != 0;
3474 match_block.noteol = (options & PCRE_NOTEOL) != 0;
3475
3476 match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
3477
3478 /* If the expression has got more back references than the offsets supplied can
3479 hold, we get a temporary bit of working store to use during the matching.
3480 Otherwise, we can use the vector supplied, rounding down its size to a multiple
3481 of 2. */
3482
3483 ocount = offsetcount & (-2);
3484 if (re->top_backref > 0 && re->top_backref >= ocount/2)
3485 {
3486 ocount = re->top_backref * 2 + 2;
3487 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
3488 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
3489 using_temporary_offsets = TRUE;
3490 DPRINTF(("Got memory to hold back references\n"));
3491 }
3492 else match_block.offset_vector = offsets;
3493
3494 match_block.offset_end = ocount;
3495 match_block.offset_overflow = FALSE;
3496
3497 /* Compute the minimum number of offsets that we need to reset each time. Doing
3498 this makes a huge difference to execution time when there aren't many brackets
3499 in the pattern. */
3500
3501 resetcount = 2 + re->top_bracket * 2;
3502 if (resetcount > offsetcount) resetcount = ocount;
3503
3504 /* If MULTILINE is set at exec time but was not set at compile time, and the
3505 anchored flag is set, we must re-check because a setting provoked by ^ in the
3506 pattern is not right in multi-line mode. Calling is_anchored() again here does
3507 the right check, because multiline is now set. If it now yields FALSE, the
3508 expression must have had ^ starting some of its branches. Check to see if
3509 that is true for *all* branches, and if so, set the startline flag. */
3510
3511 if (match_block. multiline && anchored && (re->options & PCRE_MULTILINE) == 0 &&
3512 !is_anchored(re->code, match_block.multiline))
3513 {
3514 anchored = FALSE;
3515 if (is_startline(re->code)) startline = TRUE;
3516 }
3517
3518 /* Set up the first character to match, if available. The first_char value is
3519 never set for an anchored regular expression, but the anchoring may be forced
3520 at run time, so we have to test for anchoring. The first char may be unset for
3521 an unanchored pattern, of course. If there's no first char and the pattern was
3522 studied, the may be a bitmap of possible first characters. However, we can
3523 use this only if the caseless state of the studying was correct. */
3524
3525 if (!anchored)
3526 {
3527 if ((re->options & PCRE_FIRSTSET) != 0)
3528 {
3529 first_char = re->first_char;
3530 if (match_block.caseless) first_char = pcre_lcc[first_char];
3531 }
3532 else
3533 if (!startline && extra != NULL &&
3534 (extra->options & PCRE_STUDY_MAPPED) != 0 &&
3535 ((extra->options & PCRE_STUDY_CASELESS) != 0) == match_block.caseless)
3536 start_bits = extra->start_bits;
3537 }
3538
3539 /* Loop for unanchored matches; for anchored regexps the loop runs just once. */
3540
3541 do
3542 {
3543 int rc;
3544 register int *iptr = match_block.offset_vector;
3545 register int *iend = iptr + resetcount;
3546
3547 /* Reset the maximum number of extractions we might see. */
3548
3549 while (iptr < iend) *iptr++ = -1;
3550
3551 /* Advance to a unique first char if possible */
3552
3553 if (first_char >= 0)
3554 {
3555 if (match_block.caseless)
3556 while (start_match < end_subject && pcre_lcc[*start_match] != first_char)
3557 start_match++;
3558 else
3559 while (start_match < end_subject && *start_match != first_char)
3560 start_match++;
3561 }
3562
3563 /* Or to just after \n for a multiline match if possible */
3564
3565 else if (startline)
3566 {
3567 if (start_match > match_block.start_subject)
3568 {
3569 while (start_match < end_subject && start_match[-1] != '\n')
3570 start_match++;
3571 }
3572 }
3573
3574 /* Or to a non-unique first char */
3575
3576 else if (start_bits != NULL)
3577 {
3578 while (start_match < end_subject)
3579 {
3580 register int c = *start_match;
3581 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
3582 }
3583 }
3584
3585 #ifdef DEBUG /* Sigh. Some compilers never learn. */
3586 printf(">>>> Match against: ");
3587 pchars(start_match, end_subject - start_match, TRUE, &match_block);
3588 printf("\n");
3589 #endif
3590
3591 /* When a match occurs, substrings will be set for all internal extractions;
3592 we just need to set up the whole thing as substring 0 before returning. If
3593 there were too many extractions, set the return code to zero. In the case
3594 where we had to get some local store to hold offsets for backreferences, copy
3595 those back references that we can. In this case there need not be overflow
3596 if certain parts of the pattern were not used.
3597
3598 Before starting the match, we have to set up a longjmp() target to enable
3599 the "cut" operation to fail a match completely without backtracking. This
3600 is done in a separate function to avoid compiler warnings. We need not do
3601 it unless PCRE_EXTRA is set, since only in that case is the "cut" operation
3602 enabled. */
3603
3604 if ((re->options & PCRE_EXTRA) != 0)
3605 {
3606 if (!match_with_setjmp(start_match, re->code, 2, &match_block))
3607 continue;
3608 }
3609 else if (!match(start_match, re->code, 2, &match_block)) continue;
3610
3611 /* Copy the offset information from temporary store if necessary */
3612
3613 if (using_temporary_offsets)
3614 {
3615 if (offsetcount >= 4)
3616 {
3617 memcpy(offsets + 2, match_block.offset_vector + 2,
3618 (offsetcount - 2) * sizeof(int));
3619 DPRINTF(("Copied offsets from temporary memory\n"));
3620 }
3621 if (match_block.end_offset_top > offsetcount)
3622 match_block.offset_overflow = TRUE;
3623
3624 DPRINTF(("Freeing temporary memory\n"));
3625 (pcre_free)(match_block.offset_vector);
3626 }
3627
3628 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
3629
3630 if (match_block.offset_end < 2) rc = 0; else
3631 {
3632 offsets[0] = start_match - match_block.start_subject;
3633 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
3634 }
3635
3636 DPRINTF((">>>> returning %d\n", rc));
3637 return rc;
3638 }
3639 while (!anchored &&
3640 match_block.errorcode == PCRE_ERROR_NOMATCH &&
3641 start_match++ < end_subject);
3642
3643 if (using_temporary_offsets)
3644 {
3645 DPRINTF(("Freeing temporary memory\n"));
3646 (pcre_free)(match_block.offset_vector);
3647 }
3648
3649 DPRINTF((">>>> returning %d\n", match_block.errorcode));
3650
3651 return match_block.errorcode;
3652 }
3653
3654 /* End of pcre.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12