/[pcre]/code/trunk/pcre.c
ViewVC logotype

Contents of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 17 - (show annotations) (download)
Sat Feb 24 21:38:29 2007 UTC (7 years, 9 months ago) by nigel
File MIME type: text/plain
File size: 103700 byte(s)
Load pcre-1.07 into code/trunk.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /*
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
9
10 Written by: Philip Hazel <ph10@cam.ac.uk>
11
12 Copyright (c) 1998 University of Cambridge
13
14 -----------------------------------------------------------------------------
15 Permission is granted to anyone to use this software for any purpose on any
16 computer system, and to redistribute it freely, subject to the following
17 restrictions:
18
19 1. This software is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22
23 2. The origin of this software must not be misrepresented, either by
24 explicit claim or by omission.
25
26 3. Altered versions must be plainly marked as such, and must not be
27 misrepresented as being the original software.
28 -----------------------------------------------------------------------------
29 */
30
31
32 /* Define DEBUG to get debugging output on stdout. */
33
34 /* #define DEBUG */
35
36 /* Use a macro for debugging printing, 'cause that eliminates the the use
37 of #ifdef inline, and there are *still* stupid compilers about that don't like
38 indented pre-processor statements. I suppose it's only been 10 years... */
39
40 #ifdef DEBUG
41 #define DPRINTF(p) printf p
42 #else
43 #define DPRINTF(p) /*nothing*/
44 #endif
45
46 /* Include the internals header, which itself includes Standard C headers plus
47 the external pcre header. */
48
49 #include "internal.h"
50
51
52 /* Allow compilation as C++ source code, should anybody want to do that. */
53
54 #ifdef __cplusplus
55 #define class pcre_class
56 #endif
57
58
59 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
60
61 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
62 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
63
64 /* Text forms of OP_ values and things, for debugging (not all used) */
65
66 #ifdef DEBUG
67 static const char *OP_names[] = {
68 "End", "\\A", "\\B", "\\b", "\\D", "\\d",
69 "\\S", "\\s", "\\W", "\\w", "Cut", "\\Z", "^", "$", "Any", "chars",
70 "not",
71 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
72 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
73 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
74 "*", "*?", "+", "+?", "?", "??", "{", "{",
75 "class", "negclass", "Ref",
76 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",
77 "Brazero", "Braminzero", "Bra"
78 };
79 #endif
80
81 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
82 are simple data values; negative values are for special things like \d and so
83 on. Zero means further processing is needed (for things like \x), or the escape
84 is invalid. */
85
86 static const short int escapes[] = {
87 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
88 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
89 '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
90 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
91 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
92 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
93 '`', 7, -ESC_b, 0, -ESC_d, 27, '\f', 0, /* ` - g */
94 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
95 0, 0, '\r', -ESC_s, '\t', 0, 0, -ESC_w, /* p - w */
96 0, 0, 0 /* x - z */
97 };
98
99 /* Definition to allow mutual recursion */
100
101 static BOOL
102 compile_regex(int, int *, uschar **, const uschar **, const char **);
103
104 /* Structure for passing "static" information around between the functions
105 doing the matching, so that they are thread-safe. */
106
107 typedef struct match_data {
108 int errorcode; /* As it says */
109 int *offset_vector; /* Offset vector */
110 int offset_end; /* One past the end */
111 BOOL offset_overflow; /* Set if too many extractions */
112 BOOL caseless; /* Case-independent flag */
113 BOOL runtime_caseless; /* Caseless forced at run time */
114 BOOL multiline; /* Multiline flag */
115 BOOL notbol; /* NOTBOL flag */
116 BOOL noteol; /* NOTEOL flag */
117 BOOL dotall; /* Dot matches any char */
118 BOOL endonly; /* Dollar not before final \n */
119 const uschar *start_subject; /* Start of the subject string */
120 const uschar *end_subject; /* End of the subject string */
121 jmp_buf fail_env; /* Environment for longjump() break out */
122 const uschar *end_match_ptr; /* Subject position at end match */
123 int end_offset_top; /* Highwater mark at end of match */
124 } match_data;
125
126
127
128 /*************************************************
129 * Global variables *
130 *************************************************/
131
132 /* PCRE is thread-clean and doesn't use any global variables in the normal
133 sense. However, it calls memory allocation and free functions via the two
134 indirections below, which are can be changed by the caller, but are shared
135 between all threads. */
136
137 void *(*pcre_malloc)(size_t) = malloc;
138 void (*pcre_free)(void *) = free;
139
140
141
142
143 /*************************************************
144 * Return version string *
145 *************************************************/
146
147 const char *
148 pcre_version(void)
149 {
150 return PCRE_VERSION;
151 }
152
153
154
155
156 /*************************************************
157 * Return info about a compiled pattern *
158 *************************************************/
159
160 /* This function picks potentially useful data out of the private
161 structure.
162
163 Arguments:
164 external_re points to compiled code
165 optptr where to pass back the options
166 first_char where to pass back the first character,
167 or -1 if multiline and all branches start ^,
168 or -2 otherwise
169
170 Returns: number of identifying extraction brackets
171 or negative values on error
172 */
173
174 int
175 pcre_info(const pcre *external_re, int *optptr, int *first_char)
176 {
177 const real_pcre *re = (const real_pcre *)external_re;
178 if (re == NULL) return PCRE_ERROR_NULL;
179 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
180 if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
181 if (first_char != NULL)
182 *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
183 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
184 return re->top_bracket;
185 }
186
187
188
189
190 #ifdef DEBUG
191 /*************************************************
192 * Debugging function to print chars *
193 *************************************************/
194
195 /* Print a sequence of chars in printable format, stopping at the end of the
196 subject if the requested.
197
198 Arguments:
199 p points to characters
200 length number to print
201 is_subject TRUE if printing from within md->start_subject
202 md pointer to matching data block, if is_subject is TRUE
203
204 Returns: nothing
205 */
206
207 static void
208 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
209 {
210 int c;
211 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
212 while (length-- > 0)
213 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
214 }
215 #endif
216
217
218
219
220 /*************************************************
221 * Check subpattern for empty operand *
222 *************************************************/
223
224 /* This function checks a bracketed subpattern to see if any of the paths
225 through it could match an empty string. This is used to diagnose an error if
226 such a subpattern is followed by a quantifier with an unlimited upper bound.
227
228 Argument:
229 code points to the opening bracket
230
231 Returns: TRUE or FALSE
232 */
233
234 static BOOL
235 could_be_empty(uschar *code)
236 {
237 do {
238 uschar *cc = code + 3;
239
240 /* Scan along the opcodes for this branch; as soon as we find something
241 that matches a non-empty string, break out and advance to test the next
242 branch. If we get to the end of the branch, return TRUE for the whole
243 sub-expression. */
244
245 for (;;)
246 {
247 /* Test an embedded subpattern; if it could not be empty, break the
248 loop. Otherwise carry on in the branch. */
249
250 if ((int)(*cc) >= OP_BRA || (int)(*cc) == OP_ONCE)
251 {
252 if (!could_be_empty(cc)) break;
253 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
254 cc += 3;
255 }
256
257 else switch (*cc)
258 {
259 /* Reached end of a branch: the subpattern may match the empty string */
260
261 case OP_ALT:
262 case OP_KET:
263 case OP_KETRMAX:
264 case OP_KETRMIN:
265 return TRUE;
266
267 /* Skip over entire bracket groups with zero lower bound */
268
269 case OP_BRAZERO:
270 case OP_BRAMINZERO:
271 cc++;
272 /* Fall through */
273
274 /* Skip over assertive subpatterns */
275
276 case OP_ASSERT:
277 case OP_ASSERT_NOT:
278 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
279 cc += 3;
280 break;
281
282 /* Skip over things that don't match chars */
283
284 case OP_SOD:
285 case OP_EOD:
286 case OP_CIRC:
287 case OP_DOLL:
288 case OP_NOT_WORD_BOUNDARY:
289 case OP_WORD_BOUNDARY:
290 cc++;
291 break;
292
293 /* Skip over simple repeats with zero lower bound */
294
295 case OP_STAR:
296 case OP_MINSTAR:
297 case OP_QUERY:
298 case OP_MINQUERY:
299 case OP_NOTSTAR:
300 case OP_NOTMINSTAR:
301 case OP_NOTQUERY:
302 case OP_NOTMINQUERY:
303 case OP_TYPESTAR:
304 case OP_TYPEMINSTAR:
305 case OP_TYPEQUERY:
306 case OP_TYPEMINQUERY:
307 cc += 2;
308 break;
309
310 /* Skip over UPTOs (lower bound is zero) */
311
312 case OP_UPTO:
313 case OP_MINUPTO:
314 case OP_TYPEUPTO:
315 case OP_TYPEMINUPTO:
316 cc += 4;
317 break;
318
319 /* Check a class or a back reference for a zero minimum */
320
321 case OP_CLASS:
322 case OP_NEGCLASS:
323 case OP_REF:
324 cc += (*cc == OP_REF)? 2 : 33;
325
326 switch (*cc)
327 {
328 case OP_CRSTAR:
329 case OP_CRMINSTAR:
330 case OP_CRQUERY:
331 case OP_CRMINQUERY:
332 cc++;
333 break;
334
335 case OP_CRRANGE:
336 case OP_CRMINRANGE:
337 if ((cc[1] << 8) + cc[2] != 0) goto NEXT_BRANCH;
338 cc += 3;
339 break;
340
341 default:
342 goto NEXT_BRANCH;
343 }
344 break;
345
346 /* Anything else matches at least one character */
347
348 default:
349 goto NEXT_BRANCH;
350 }
351 }
352
353 NEXT_BRANCH:
354 code += (code[1] << 8) + code[2];
355 }
356 while (*code == OP_ALT);
357
358 /* No branches match the empty string */
359
360 return FALSE;
361 }
362
363
364
365 /*************************************************
366 * Handle escapes *
367 *************************************************/
368
369 /* This function is called when a \ has been encountered. It either returns a
370 positive value for a simple escape such as \n, or a negative value which
371 encodes one of the more complicated things such as \d. On entry, ptr is
372 pointing at the \. On exit, it is on the final character of the escape
373 sequence.
374
375 Arguments:
376 ptrptr points to the pattern position pointer
377 errorptr points to the pointer to the error message
378 bracount number of previous extracting brackets
379 options the options bits
380 isclass TRUE if inside a character class
381
382 Returns: zero or positive => a data character
383 negative => a special escape sequence
384 on error, errorptr is set
385 */
386
387 static int
388 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
389 int options, BOOL isclass)
390 {
391 const uschar *ptr = *ptrptr;
392 int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
393 int i;
394
395 if (c == 0) *errorptr = ERR1;
396
397 /* Digits or letters may have special meaning; all others are literals. */
398
399 else if (c < '0' || c > 'z') {}
400
401 /* Do an initial lookup in a table. A non-zero result is something that can be
402 returned immediately. Otherwise further processing may be required. */
403
404 else if ((i = escapes[c - '0']) != 0) c = i;
405
406 /* Escapes that need further processing, or are illegal. */
407
408 else
409 {
410 const uschar *oldptr;
411 switch (c)
412 {
413 /* The handling of escape sequences consisting of a string of digits
414 starting with one that is not zero is not straightforward. By experiment,
415 the way Perl works seems to be as follows:
416
417 Outside a character class, the digits are read as a decimal number. If the
418 number is less than 10, or if there are that many previous extracting
419 left brackets, then it is a back reference. Otherwise, up to three octal
420 digits are read to form an escaped byte. Thus \123 is likely to be octal
421 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
422 value is greater than 377, the least significant 8 bits are taken. Inside a
423 character class, \ followed by a digit is always an octal number. */
424
425 case '1': case '2': case '3': case '4': case '5':
426 case '6': case '7': case '8': case '9':
427
428 if (!isclass)
429 {
430 oldptr = ptr;
431 c -= '0';
432 while ((pcre_ctypes[ptr[1]] & ctype_digit) != 0)
433 c = c * 10 + *(++ptr) - '0';
434 if (c < 10 || c <= bracount)
435 {
436 c = -(ESC_REF + c);
437 break;
438 }
439 ptr = oldptr; /* Put the pointer back and fall through */
440 }
441
442 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
443 generates a binary zero byte and treats the digit as a following literal.
444 Thus we have to pull back the pointer by one. */
445
446 if ((c = *ptr) >= '8')
447 {
448 ptr--;
449 c = 0;
450 break;
451 }
452
453 /* \0 always starts an octal number, but we may drop through to here with a
454 larger first octal digit */
455
456 case '0':
457 c -= '0';
458 while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 &&
459 ptr[1] != '8' && ptr[1] != '9')
460 c = c * 8 + *(++ptr) - '0';
461 break;
462
463 /* Special escapes not starting with a digit are straightforward */
464
465 case 'x':
466 c = 0;
467 while (i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0)
468 {
469 ptr++;
470 c = c * 16 + pcre_lcc[*ptr] -
471 (((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
472 }
473 break;
474
475 case 'c':
476 c = *(++ptr);
477 if (c == 0)
478 {
479 *errorptr = ERR2;
480 return 0;
481 }
482
483 /* A letter is upper-cased; then the 0x40 bit is flipped */
484
485 if (c >= 'a' && c <= 'z') c = pcre_fcc[c];
486 c ^= 0x40;
487 break;
488
489 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
490 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
491 for Perl compatibility, it is a literal. */
492
493 default:
494 if ((options & PCRE_EXTRA) != 0) switch(c)
495 {
496 case 'X':
497 c = -ESC_X; /* This could be a lookup if it ever got into Perl */
498 break;
499
500 default:
501 *errorptr = ERR3;
502 break;
503 }
504 break;
505 }
506 }
507
508 *ptrptr = ptr;
509 return c;
510 }
511
512
513
514 /*************************************************
515 * Check for counted repeat *
516 *************************************************/
517
518 /* This function is called when a '{' is encountered in a place where it might
519 start a quantifier. It looks ahead to see if it really is a quantifier or not.
520 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
521 where the ddds are digits.
522
523 Arguments:
524 p pointer to the first char after '{'
525
526 Returns: TRUE or FALSE
527 */
528
529 static BOOL
530 is_counted_repeat(const uschar *p)
531 {
532 if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
533 while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
534 if (*p == '}') return TRUE;
535
536 if (*p++ != ',') return FALSE;
537 if (*p == '}') return TRUE;
538
539 if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
540 while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
541 return (*p == '}');
542 }
543
544
545
546 /*************************************************
547 * Read repeat counts *
548 *************************************************/
549
550 /* Read an item of the form {n,m} and return the values. This is called only
551 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
552 so the syntax is guaranteed to be correct, but we need to check the values.
553
554 Arguments:
555 p pointer to first char after '{'
556 minp pointer to int for min
557 maxp pointer to int for max
558 returned as -1 if no max
559 errorptr points to pointer to error message
560
561 Returns: pointer to '}' on success;
562 current ptr on error, with errorptr set
563 */
564
565 static const uschar *
566 read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
567 {
568 int min = 0;
569 int max = -1;
570
571 while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
572
573 if (*p == '}') max = min; else
574 {
575 if (*(++p) != '}')
576 {
577 max = 0;
578 while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
579 if (max < min)
580 {
581 *errorptr = ERR4;
582 return p;
583 }
584 }
585 }
586
587 /* Do paranoid checks, then fill in the required variables, and pass back the
588 pointer to the terminating '}'. */
589
590 if (min > 65535 || max > 65535)
591 *errorptr = ERR5;
592 else
593 {
594 *minp = min;
595 *maxp = max;
596 }
597 return p;
598 }
599
600
601
602 /*************************************************
603 * Compile one branch *
604 *************************************************/
605
606 /* Scan the pattern, compiling it into the code vector.
607
608 Arguments:
609 options the option bits
610 bracket points to number of brackets used
611 code points to the pointer to the current code point
612 ptrptr points to the current pattern pointer
613 errorptr points to pointer to error message
614
615 Returns: TRUE on success
616 FALSE, with *errorptr set on error
617 */
618
619 static BOOL
620 compile_branch(int options, int *brackets, uschar **codeptr,
621 const uschar **ptrptr, const char **errorptr)
622 {
623 int repeat_type, op_type;
624 int repeat_min, repeat_max;
625 int bravalue, length;
626 register int c;
627 register uschar *code = *codeptr;
628 const uschar *ptr = *ptrptr;
629 const uschar *oldptr;
630 uschar *previous = NULL;
631 uschar class[32];
632
633 /* Switch on next character until the end of the branch */
634
635 for (;; ptr++)
636 {
637 BOOL negate_class;
638 int class_charcount;
639 int class_lastchar;
640
641 c = *ptr;
642 if ((options & PCRE_EXTENDED) != 0)
643 {
644 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
645 if (c == '#')
646 {
647 while ((c = *(++ptr)) != 0 && c != '\n');
648 continue;
649 }
650 }
651
652 switch(c)
653 {
654 /* The branch terminates at end of string, |, or ). */
655
656 case 0:
657 case '|':
658 case ')':
659 *codeptr = code;
660 *ptrptr = ptr;
661 return TRUE;
662
663 /* Handle single-character metacharacters */
664
665 case '^':
666 previous = NULL;
667 *code++ = OP_CIRC;
668 break;
669
670 case '$':
671 previous = NULL;
672 *code++ = OP_DOLL;
673 break;
674
675 case '.':
676 previous = code;
677 *code++ = OP_ANY;
678 break;
679
680 /* Character classes. These always build a 32-byte bitmap of the permitted
681 characters, except in the special case where there is only one character.
682 For negated classes, we build the map as usual, then invert it at the end.
683 */
684
685 case '[':
686 previous = code;
687
688 /* If the first character is '^', set the negation flag, and use a
689 different opcode. This only matters if caseless matching is specified at
690 runtime. */
691
692 if ((c = *(++ptr)) == '^')
693 {
694 negate_class = TRUE;
695 *code++ = OP_NEGCLASS;
696 c = *(++ptr);
697 }
698 else
699 {
700 negate_class = FALSE;
701 *code++ = OP_CLASS;
702 }
703
704 /* Keep a count of chars so that we can optimize the case of just a single
705 character. */
706
707 class_charcount = 0;
708 class_lastchar = -1;
709
710 /* Initialize the 32-char bit map to all zeros. We have to build the
711 map in a temporary bit of store, in case the class contains only 1
712 character, because in that case the compiled code doesn't use the
713 bit map. */
714
715 memset(class, 0, 32 * sizeof(uschar));
716
717 /* Process characters until ] is reached. By writing this as a "do" it
718 means that an initial ] is taken as a data character. */
719
720 do
721 {
722 if (c == 0)
723 {
724 *errorptr = ERR6;
725 goto FAILED;
726 }
727
728 /* Backslash may introduce a single character, or it may introduce one
729 of the specials, which just set a flag. Escaped items are checked for
730 validity in the pre-compiling pass. The sequence \b is a special case.
731 Inside a class (and only there) it is treated as backspace. Elsewhere
732 it marks a word boundary. Other escapes have preset maps ready to
733 or into the one we are building. We assume they have more than one
734 character in them, so set class_count bigger than one. */
735
736 if (c == '\\')
737 {
738 c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
739 if (-c == ESC_b) c = '\b';
740 else if (c < 0)
741 {
742 class_charcount = 10;
743 switch (-c)
744 {
745 case ESC_d:
746 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit];
747 continue;
748
749 case ESC_D:
750 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit];
751 continue;
752
753 case ESC_w:
754 for (c = 0; c < 32; c++)
755 class[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
756 continue;
757
758 case ESC_W:
759 for (c = 0; c < 32; c++)
760 class[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
761 continue;
762
763 case ESC_s:
764 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space];
765 continue;
766
767 case ESC_S:
768 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space];
769 continue;
770
771 default:
772 *errorptr = ERR7;
773 goto FAILED;
774 }
775 }
776 /* Fall through if single character */
777 }
778
779 /* A single character may be followed by '-' to form a range. However,
780 Perl does not permit ']' to be the end of the range. A '-' character
781 here is treated as a literal. */
782
783 if (ptr[1] == '-' && ptr[2] != ']')
784 {
785 int d;
786 ptr += 2;
787 d = *ptr;
788
789 if (d == 0)
790 {
791 *errorptr = ERR6;
792 goto FAILED;
793 }
794
795 /* The second part of a range can be a single-character escape, but
796 not any of the other escapes. */
797
798 if (d == '\\')
799 {
800 d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
801 if (d < 0)
802 {
803 if (d == -ESC_b) d = '\b'; else
804 {
805 *errorptr = ERR7;
806 goto FAILED;
807 }
808 }
809 }
810
811 if (d < c)
812 {
813 *errorptr = ERR8;
814 goto FAILED;
815 }
816
817 for (; c <= d; c++)
818 {
819 class[c/8] |= (1 << (c&7));
820 if ((options & PCRE_CASELESS) != 0)
821 {
822 int uc = pcre_fcc[c]; /* flip case */
823 class[uc/8] |= (1 << (uc&7));
824 }
825 class_charcount++; /* in case a one-char range */
826 class_lastchar = c;
827 }
828 continue; /* Go get the next char in the class */
829 }
830
831 /* Handle a lone single character - we can get here for a normal
832 non-escape char, or after \ that introduces a single character. */
833
834 class [c/8] |= (1 << (c&7));
835 if ((options & PCRE_CASELESS) != 0)
836 {
837 c = pcre_fcc[c]; /* flip case */
838 class[c/8] |= (1 << (c&7));
839 }
840 class_charcount++;
841 class_lastchar = c;
842 }
843
844 /* Loop until ']' reached; the check for end of string happens inside the
845 loop. This "while" is the end of the "do" above. */
846
847 while ((c = *(++ptr)) != ']');
848
849 /* If class_charcount is 1 and class_lastchar is not negative, we saw
850 precisely one character. This doesn't need the whole 32-byte bit map.
851 We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
852 it's negative. */
853
854 if (class_charcount == 1 && class_lastchar >= 0)
855 {
856 if (negate_class)
857 {
858 code[-1] = OP_NOT;
859 }
860 else
861 {
862 code[-1] = OP_CHARS;
863 *code++ = 1;
864 }
865 *code++ = class_lastchar;
866 }
867
868 /* Otherwise, negate the 32-byte map if necessary, and copy it into
869 the code vector. */
870
871 else
872 {
873 if (negate_class)
874 for (c = 0; c < 32; c++) code[c] = ~class[c];
875 else
876 memcpy(code, class, 32);
877 code += 32;
878 }
879 break;
880
881 /* Various kinds of repeat */
882
883 case '{':
884 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
885 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
886 if (*errorptr != NULL) goto FAILED;
887 goto REPEAT;
888
889 case '*':
890 repeat_min = 0;
891 repeat_max = -1;
892 goto REPEAT;
893
894 case '+':
895 repeat_min = 1;
896 repeat_max = -1;
897 goto REPEAT;
898
899 case '?':
900 repeat_min = 0;
901 repeat_max = 1;
902
903 REPEAT:
904 if (previous == NULL)
905 {
906 *errorptr = ERR9;
907 goto FAILED;
908 }
909
910 /* If the next character is '?' this is a minimizing repeat. Advance to the
911 next character. */
912
913 if (ptr[1] == '?') { repeat_type = 1; ptr++; } else repeat_type = 0;
914
915 /* If the maximum is zero then the minimum must also be zero; Perl allows
916 this case, so we do too - by simply omitting the item altogether. */
917
918 if (repeat_max == 0) code = previous;
919
920 /* If previous was a string of characters, chop off the last one and use it
921 as the subject of the repeat. If there was only one character, we can
922 abolish the previous item altogether. */
923
924 else if (*previous == OP_CHARS)
925 {
926 int len = previous[1];
927 if (len == 1)
928 {
929 c = previous[2];
930 code = previous;
931 }
932 else
933 {
934 c = previous[len+1];
935 previous[1]--;
936 code--;
937 }
938 op_type = 0; /* Use single-char op codes */
939 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
940 }
941
942 /* If previous was a single negated character ([^a] or similar), we use
943 one of the special opcodes, replacing it. The code is shared with single-
944 character repeats by adding a suitable offset into repeat_type. */
945
946 else if ((int)*previous == OP_NOT)
947 {
948 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
949 c = previous[1];
950 code = previous;
951 goto OUTPUT_SINGLE_REPEAT;
952 }
953
954 /* If previous was a character type match (\d or similar), abolish it and
955 create a suitable repeat item. The code is shared with single-character
956 repeats by adding a suitable offset into repeat_type. */
957
958 else if ((int)*previous < OP_EOD || *previous == OP_ANY)
959 {
960 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
961 c = *previous;
962 code = previous;
963
964 OUTPUT_SINGLE_REPEAT:
965 repeat_type += op_type; /* Combine both values for many cases */
966
967 /* A minimum of zero is handled either as the special case * or ?, or as
968 an UPTO, with the maximum given. */
969
970 if (repeat_min == 0)
971 {
972 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
973 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
974 else
975 {
976 *code++ = OP_UPTO + repeat_type;
977 *code++ = repeat_max >> 8;
978 *code++ = (repeat_max & 255);
979 }
980 }
981
982 /* The case {1,} is handled as the special case + */
983
984 else if (repeat_min == 1 && repeat_max == -1)
985 *code++ = OP_PLUS + repeat_type;
986
987 /* The case {n,n} is just an EXACT, while the general case {n,m} is
988 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
989
990 else
991 {
992 if (repeat_min != 1)
993 {
994 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
995 *code++ = repeat_min >> 8;
996 *code++ = (repeat_min & 255);
997 }
998
999 /* If the mininum is 1 and the previous item was a character string,
1000 we either have to put back the item that got cancelled if the string
1001 length was 1, or add the character back onto the end of a longer
1002 string. For a character type nothing need be done; it will just get put
1003 back naturally. */
1004
1005 else if (*previous == OP_CHARS)
1006 {
1007 if (code == previous) code += 2; else previous[1]++;
1008 }
1009
1010 /* If the maximum is unlimited, insert an OP_STAR. */
1011
1012 if (repeat_max < 0)
1013 {
1014 *code++ = c;
1015 *code++ = OP_STAR + repeat_type;
1016 }
1017
1018 /* Else insert an UPTO if the max is greater than the min. */
1019
1020 else if (repeat_max != repeat_min)
1021 {
1022 *code++ = c;
1023 repeat_max -= repeat_min;
1024 *code++ = OP_UPTO + repeat_type;
1025 *code++ = repeat_max >> 8;
1026 *code++ = (repeat_max & 255);
1027 }
1028 }
1029
1030 /* The character or character type itself comes last in all cases. */
1031
1032 *code++ = c;
1033 }
1034
1035 /* If previous was a character class or a back reference, we put the repeat
1036 stuff after it. */
1037
1038 else if (*previous == OP_CLASS || *previous == OP_NEGCLASS ||
1039 *previous == OP_REF)
1040 {
1041 if (repeat_min == 0 && repeat_max == -1)
1042 *code++ = OP_CRSTAR + repeat_type;
1043 else if (repeat_min == 1 && repeat_max == -1)
1044 *code++ = OP_CRPLUS + repeat_type;
1045 else if (repeat_min == 0 && repeat_max == 1)
1046 *code++ = OP_CRQUERY + repeat_type;
1047 else
1048 {
1049 *code++ = OP_CRRANGE + repeat_type;
1050 *code++ = repeat_min >> 8;
1051 *code++ = repeat_min & 255;
1052 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1053 *code++ = repeat_max >> 8;
1054 *code++ = repeat_max & 255;
1055 }
1056 }
1057
1058 /* If previous was a bracket group, we may have to replicate it in certain
1059 cases. If the maximum repeat count is unlimited, check that the bracket
1060 group cannot match the empty string, and diagnose an error if it can. */
1061
1062 else if ((int)*previous >= OP_BRA)
1063 {
1064 int i;
1065 int len = code - previous;
1066
1067 if (repeat_max == -1 && could_be_empty(previous))
1068 {
1069 *errorptr = ERR10;
1070 goto FAILED;
1071 }
1072
1073 /* If the minimum is greater than zero, and the maximum is unlimited or
1074 equal to the minimum, the first copy remains where it is, and is
1075 replicated up to the minimum number of times. This case includes the +
1076 repeat, but of course no replication is needed in that case. */
1077
1078 if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1079 {
1080 for (i = 1; i < repeat_min; i++)
1081 {
1082 memcpy(code, previous, len);
1083 code += len;
1084 }
1085 }
1086
1087 /* If the minimum is zero, stick BRAZERO in front of the first copy.
1088 Then, if there is a fixed upper limit, replicated up to that many times,
1089 sticking BRAZERO in front of all the optional ones. */
1090
1091 else
1092 {
1093 if (repeat_min == 0)
1094 {
1095 memmove(previous+1, previous, len);
1096 code++;
1097 *previous++ = OP_BRAZERO + repeat_type;
1098 }
1099
1100 for (i = 1; i < repeat_min; i++)
1101 {
1102 memcpy(code, previous, len);
1103 code += len;
1104 }
1105
1106 for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1107 {
1108 *code++ = OP_BRAZERO + repeat_type;
1109 memcpy(code, previous, len);
1110 code += len;
1111 }
1112 }
1113
1114 /* If the maximum is unlimited, set a repeater in the final copy. */
1115
1116 if (repeat_max == -1) code[-3] = OP_KETRMAX + repeat_type;
1117 }
1118
1119 /* Else there's some kind of shambles */
1120
1121 else
1122 {
1123 *errorptr = ERR11;
1124 goto FAILED;
1125 }
1126
1127 /* In all case we no longer have a previous item. */
1128
1129 previous = NULL;
1130 break;
1131
1132
1133 /* Start of nested bracket sub-expression, or comment or lookahead.
1134 First deal with special things that can come after a bracket; all are
1135 introduced by ?, and the appearance of any of them means that this is not a
1136 referencing group. They were checked for validity in the first pass over
1137 the string, so we don't have to check for syntax errors here. */
1138
1139 case '(':
1140 previous = code; /* Only real brackets can be repeated */
1141 if (*(++ptr) == '?')
1142 {
1143 bravalue = OP_BRA;
1144
1145 switch (*(++ptr))
1146 {
1147 case '#':
1148 case 'i':
1149 case 'm':
1150 case 's':
1151 case 'x':
1152 ptr++;
1153 while (*ptr != ')') ptr++;
1154 previous = NULL;
1155 continue;
1156
1157 case ':': /* Non-extracting bracket */
1158 ptr++;
1159 break;
1160
1161 case '=': /* Assertions can't be repeated */
1162 bravalue = OP_ASSERT;
1163 ptr++;
1164 previous = NULL;
1165 break;
1166
1167 case '!':
1168 bravalue = OP_ASSERT_NOT;
1169 ptr++;
1170 previous = NULL;
1171 break;
1172
1173 case '>': /* "Match once" brackets */
1174 if ((options & PCRE_EXTRA) != 0) /* Not yet standard */
1175 {
1176 bravalue = OP_ONCE;
1177 ptr++;
1178 previous = NULL;
1179 break;
1180 }
1181 /* Else fall through */
1182
1183 default:
1184 *errorptr = ERR12;
1185 goto FAILED;
1186 }
1187 }
1188
1189 /* Else we have a referencing group */
1190
1191 else
1192 {
1193 if (++(*brackets) > EXTRACT_MAX)
1194 {
1195 *errorptr = ERR13;
1196 goto FAILED;
1197 }
1198 bravalue = OP_BRA + *brackets;
1199 }
1200
1201 /* Process nested bracketed re; at end pointer is on the bracket. We copy
1202 code into a non-register variable in order to be able to pass its address
1203 because some compilers complain otherwise. */
1204
1205 *code = bravalue;
1206 {
1207 uschar *mcode = code;
1208 if (!compile_regex(options, brackets, &mcode, &ptr, errorptr))
1209 goto FAILED;
1210 code = mcode;
1211 }
1212
1213 if (*ptr != ')')
1214 {
1215 *errorptr = ERR14;
1216 goto FAILED;
1217 }
1218 break;
1219
1220 /* Check \ for being a real metacharacter; if not, fall through and handle
1221 it as a data character at the start of a string. Escape items are checked
1222 for validity in the pre-compiling pass. */
1223
1224 case '\\':
1225 oldptr = ptr;
1226 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
1227
1228 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1229 are arranged to be the negation of the corresponding OP_values. For the
1230 back references, the values are ESC_REF plus the reference number. Only
1231 back references and those types that consume a character may be repeated.
1232 We can test for values between ESC_b and ESC_Z for the latter; this may
1233 have to change if any new ones are ever created. */
1234
1235 if (c < 0)
1236 {
1237 if (-c >= ESC_REF)
1238 {
1239 int refnum = -c - ESC_REF;
1240 if (*brackets < refnum)
1241 {
1242 *errorptr = ERR15;
1243 goto FAILED;
1244 }
1245 previous = code;
1246 *code++ = OP_REF;
1247 *code++ = refnum;
1248 }
1249 else
1250 {
1251 previous = (-c > ESC_b && -c < ESC_X)? code : NULL;
1252 *code++ = -c;
1253 }
1254 continue;
1255 }
1256
1257 /* Data character: reset and fall through */
1258
1259 ptr = oldptr;
1260 c = '\\';
1261
1262 /* Handle a run of data characters until a metacharacter is encountered.
1263 The first character is guaranteed not to be whitespace or # when the
1264 extended flag is set. */
1265
1266 NORMAL_CHAR:
1267 default:
1268 previous = code;
1269 *code = OP_CHARS;
1270 code += 2;
1271 length = 0;
1272
1273 do
1274 {
1275 if ((options & PCRE_EXTENDED) != 0)
1276 {
1277 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1278 if (c == '#')
1279 {
1280 while ((c = *(++ptr)) != 0 && c != '\n');
1281 if (c == 0) break;
1282 continue;
1283 }
1284 }
1285
1286 /* Backslash may introduce a data char or a metacharacter. Escaped items
1287 are checked for validity in the pre-compiling pass. Stop the string
1288 before a metaitem. */
1289
1290 if (c == '\\')
1291 {
1292 oldptr = ptr;
1293 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
1294 if (c < 0) { ptr = oldptr; break; }
1295 }
1296
1297 /* Ordinary character or single-char escape */
1298
1299 *code++ = c;
1300 length++;
1301 }
1302
1303 /* This "while" is the end of the "do" above. */
1304
1305 while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
1306
1307 /* Compute the length and set it in the data vector, and advance to
1308 the next state. */
1309
1310 previous[1] = length;
1311 if (length < 255) ptr--;
1312 break;
1313 }
1314 } /* end of big loop */
1315
1316 /* Control never reaches here by falling through, only by a goto for all the
1317 error states. Pass back the position in the pattern so that it can be displayed
1318 to the user for diagnosing the error. */
1319
1320 FAILED:
1321 *ptrptr = ptr;
1322 return FALSE;
1323 }
1324
1325
1326
1327
1328 /*************************************************
1329 * Compile sequence of alternatives *
1330 *************************************************/
1331
1332 /* On entry, ptr is pointing past the bracket character, but on return
1333 it points to the closing bracket, or vertical bar, or end of string.
1334 The code variable is pointing at the byte into which the BRA operator has been
1335 stored.
1336
1337 Argument:
1338 options the option bits
1339 brackets -> int containing the number of extracting brackets used
1340 codeptr -> the address of the current code pointer
1341 ptrptr -> the address of the current pattern pointer
1342 errorptr -> pointer to error message
1343
1344 Returns: TRUE on success
1345 */
1346
1347 static BOOL
1348 compile_regex(int options, int *brackets, uschar **codeptr,
1349 const uschar **ptrptr, const char **errorptr)
1350 {
1351 const uschar *ptr = *ptrptr;
1352 uschar *code = *codeptr;
1353 uschar *start_bracket = code;
1354
1355 for (;;)
1356 {
1357 int length;
1358 uschar *last_branch = code;
1359
1360 code += 3;
1361 if (!compile_branch(options, brackets, &code, &ptr, errorptr))
1362 {
1363 *ptrptr = ptr;
1364 return FALSE;
1365 }
1366
1367 /* Fill in the length of the last branch */
1368
1369 length = code - last_branch;
1370 last_branch[1] = length >> 8;
1371 last_branch[2] = length & 255;
1372
1373 /* Reached end of expression, either ')' or end of pattern. Insert a
1374 terminating ket and the length of the whole bracketed item, and return,
1375 leaving the pointer at the terminating char. */
1376
1377 if (*ptr != '|')
1378 {
1379 length = code - start_bracket;
1380 *code++ = OP_KET;
1381 *code++ = length >> 8;
1382 *code++ = length & 255;
1383 *codeptr = code;
1384 *ptrptr = ptr;
1385 return TRUE;
1386 }
1387
1388 /* Another branch follows; insert an "or" node and advance the pointer. */
1389
1390 *code = OP_ALT;
1391 ptr++;
1392 }
1393 /* Control never reaches here */
1394 }
1395
1396
1397
1398 /*************************************************
1399 * Check for anchored expression *
1400 *************************************************/
1401
1402 /* Try to find out if this is an anchored regular expression. Consider each
1403 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
1404 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
1405 it's anchored. However, if this is a multiline pattern, then only OP_SOD
1406 counts, since OP_CIRC can match in the middle.
1407
1408 A branch is also implicitly anchored if it starts with .* because that will try
1409 the rest of the pattern at all possible matching points, so there is no point
1410 trying them again.
1411
1412 Argument: points to start of expression (the bracket)
1413 Returns: TRUE or FALSE
1414 */
1415
1416 static BOOL
1417 is_anchored(register const uschar *code, BOOL multiline)
1418 {
1419 do {
1420 int op = (int)code[3];
1421 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE)
1422 { if (!is_anchored(code+3, multiline)) return FALSE; }
1423 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
1424 { if (code[4] != OP_ANY) return FALSE; }
1425 else if (op != OP_SOD && (multiline || op != OP_CIRC)) return FALSE;
1426 code += (code[1] << 8) + code[2];
1427 }
1428 while (*code == OP_ALT);
1429 return TRUE;
1430 }
1431
1432
1433
1434 /*************************************************
1435 * Check for start with \n line expression *
1436 *************************************************/
1437
1438 /* This is called for multiline expressions to try to find out if every branch
1439 starts with ^ so that "first char" processing can be done to speed things up.
1440
1441 Argument: points to start of expression (the bracket)
1442 Returns: TRUE or FALSE
1443 */
1444
1445 static BOOL
1446 is_startline(const uschar *code)
1447 {
1448 do {
1449 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
1450 { if (!is_startline(code+3)) return FALSE; }
1451 else if (code[3] != OP_CIRC) return FALSE;
1452 code += (code[1] << 8) + code[2];
1453 }
1454 while (*code == OP_ALT);
1455 return TRUE;
1456 }
1457
1458
1459
1460 /*************************************************
1461 * Check for fixed first char *
1462 *************************************************/
1463
1464 /* Try to find out if there is a fixed first character. This is called for
1465 unanchored expressions, as it speeds up their processing quite considerably.
1466 Consider each alternative branch. If they all start with the same char, or with
1467 a bracket all of whose alternatives start with the same char (recurse ad lib),
1468 then we return that char, otherwise -1.
1469
1470 Argument: points to start of expression (the bracket)
1471 Returns: -1 or the fixed first char
1472 */
1473
1474 static int
1475 find_firstchar(uschar *code)
1476 {
1477 register int c = -1;
1478 do
1479 {
1480 register int charoffset = 4;
1481
1482 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
1483 {
1484 register int d;
1485 if ((d = find_firstchar(code+3)) < 0) return -1;
1486 if (c < 0) c = d; else if (c != d) return -1;
1487 }
1488
1489 else switch(code[3])
1490 {
1491 default:
1492 return -1;
1493
1494 case OP_EXACT: /* Fall through */
1495 charoffset++;
1496
1497 case OP_CHARS: /* Fall through */
1498 charoffset++;
1499
1500 case OP_PLUS:
1501 case OP_MINPLUS:
1502 if (c < 0) c = code[charoffset]; else if (c != code[charoffset]) return -1;
1503 break;
1504 }
1505 code += (code[1] << 8) + code[2];
1506 }
1507 while (*code == OP_ALT);
1508 return c;
1509 }
1510
1511
1512
1513 /*************************************************
1514 * Compile a Regular Expression *
1515 *************************************************/
1516
1517 /* This function takes a string and returns a pointer to a block of store
1518 holding a compiled version of the expression.
1519
1520 Arguments:
1521 pattern the regular expression
1522 options various option bits
1523 errorptr pointer to pointer to error text
1524 erroroffset ptr offset in pattern where error was detected
1525
1526 Returns: pointer to compiled data block, or NULL on error,
1527 with errorptr and erroroffset set
1528 */
1529
1530 pcre *
1531 pcre_compile(const char *pattern, int options, const char **errorptr,
1532 int *erroroffset)
1533 {
1534 real_pcre *re;
1535 int spaces = 0;
1536 int length = 3; /* For initial BRA plus length */
1537 int runlength;
1538 int c, size;
1539 int bracount = 0;
1540 int brastack[200];
1541 int top_backref = 0;
1542 unsigned int brastackptr = 0;
1543 uschar *code;
1544 const uschar *ptr;
1545
1546 #ifdef DEBUG
1547 uschar *code_base, *code_end;
1548 #endif
1549
1550 /* We can't pass back an error message if errorptr is NULL; I guess the best we
1551 can do is just return NULL. */
1552
1553 if (errorptr == NULL) return NULL;
1554 *errorptr = NULL;
1555
1556 /* However, we can give a message for this error */
1557
1558 if (erroroffset == NULL)
1559 {
1560 *errorptr = ERR16;
1561 return NULL;
1562 }
1563 *erroroffset = 0;
1564
1565 if ((options & ~PUBLIC_OPTIONS) != 0)
1566 {
1567 *errorptr = ERR17;
1568 return NULL;
1569 }
1570
1571 DPRINTF(("------------------------------------------------------------------\n"));
1572 DPRINTF(("%s\n", pattern));
1573
1574 /* The first thing to do is to make a pass over the pattern to compute the
1575 amount of store required to hold the compiled code. This does not have to be
1576 perfect as long as errors are overestimates. At the same time we can detect any
1577 internal flag settings. Make an attempt to correct for any counted white space
1578 if an "extended" flag setting appears late in the pattern. We can't be so
1579 clever for #-comments. */
1580
1581 ptr = (const uschar *)(pattern - 1);
1582 while ((c = *(++ptr)) != 0)
1583 {
1584 int min, max;
1585 int class_charcount;
1586
1587 if ((pcre_ctypes[c] & ctype_space) != 0)
1588 {
1589 if ((options & PCRE_EXTENDED) != 0) continue;
1590 spaces++;
1591 }
1592
1593 if (c == '#' && (options & PCRE_EXTENDED) != 0)
1594 {
1595 while ((c = *(++ptr)) != 0 && c != '\n');
1596 continue;
1597 }
1598
1599 switch(c)
1600 {
1601 /* A backslashed item may be an escaped "normal" character or a
1602 character type. For a "normal" character, put the pointers and
1603 character back so that tests for whitespace etc. in the input
1604 are done correctly. */
1605
1606 case '\\':
1607 {
1608 const uschar *save_ptr = ptr;
1609 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
1610 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1611 if (c >= 0)
1612 {
1613 ptr = save_ptr;
1614 c = '\\';
1615 goto NORMAL_CHAR;
1616 }
1617 }
1618 length++;
1619
1620 /* A back reference needs an additional char, plus either one or 5
1621 bytes for a repeat. We also need to keep the value of the highest
1622 back reference. */
1623
1624 if (c <= -ESC_REF)
1625 {
1626 int refnum = -c - ESC_REF;
1627 if (refnum > top_backref) top_backref = refnum;
1628 length++; /* For single back reference */
1629 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
1630 {
1631 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
1632 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1633 if ((min == 0 && (max == 1 || max == -1)) ||
1634 (min == 1 && max == -1))
1635 length++;
1636 else length += 5;
1637 if (ptr[1] == '?') ptr++;
1638 }
1639 }
1640 continue;
1641
1642 case '^':
1643 case '.':
1644 case '$':
1645 case '*': /* These repeats won't be after brackets; */
1646 case '+': /* those are handled separately */
1647 case '?':
1648 length++;
1649 continue;
1650
1651 /* This covers the cases of repeats after a single char, metachar, class,
1652 or back reference. */
1653
1654 case '{':
1655 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
1656 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
1657 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1658 if ((min == 0 && (max == 1 || max == -1)) ||
1659 (min == 1 && max == -1))
1660 length++;
1661 else
1662 {
1663 length--; /* Uncount the original char or metachar */
1664 if (min == 1) length++; else if (min > 0) length += 4;
1665 if (max > 0) length += 4; else length += 2;
1666 }
1667 if (ptr[1] == '?') ptr++;
1668 continue;
1669
1670 /* An alternation contains an offset to the next branch or ket. */
1671 case '|':
1672 length += 3;
1673 continue;
1674
1675 /* A character class uses 33 characters. Don't worry about character types
1676 that aren't allowed in classes - they'll get picked up during the compile.
1677 A character class that contains only one character uses 2 or 3 bytes,
1678 depending on whether it is negated or not. Notice this where we can. */
1679
1680 case '[':
1681 class_charcount = 0;
1682 if (*(++ptr) == '^') ptr++;
1683 do
1684 {
1685 if (*ptr == '\\')
1686 {
1687 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE);
1688 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1689 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
1690 }
1691 else class_charcount++;
1692 ptr++;
1693 }
1694 while (*ptr != 0 && *ptr != ']');
1695
1696 /* Repeats for negated single chars are handled by the general code */
1697
1698 if (class_charcount == 1) length += 3; else
1699 {
1700 length += 33;
1701
1702 /* A repeat needs either 1 or 5 bytes. */
1703
1704 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
1705 {
1706 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
1707 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1708 if ((min == 0 && (max == 1 || max == -1)) ||
1709 (min == 1 && max == -1))
1710 length++;
1711 else length += 5;
1712 if (ptr[1] == '?') ptr++;
1713 }
1714 }
1715 continue;
1716
1717 /* Brackets may be genuine groups or special things */
1718
1719 case '(':
1720
1721 /* Handle special forms of bracket, which all start (? */
1722
1723 if (ptr[1] == '?') switch (c = ptr[2])
1724 {
1725 /* Skip over comments entirely */
1726 case '#':
1727 ptr += 3;
1728 while (*ptr != 0 && *ptr != ')') ptr++;
1729 if (*ptr == 0)
1730 {
1731 *errorptr = ERR18;
1732 goto PCRE_ERROR_RETURN;
1733 }
1734 continue;
1735
1736 /* Non-referencing groups and lookaheads just move the pointer on, and
1737 then behave like a non-special bracket, except that they don't increment
1738 the count of extracting brackets. */
1739
1740 case ':':
1741 case '=':
1742 case '!':
1743 ptr += 2;
1744 break;
1745
1746 /* Ditto for the "once only" bracket, allowed only if the extra bit
1747 is set. */
1748
1749 case '>':
1750 if ((options & PCRE_EXTRA) != 0)
1751 {
1752 ptr += 2;
1753 break;
1754 }
1755 /* Else fall thourh */
1756
1757 /* Else loop setting valid options until ) is met. Anything else is an
1758 error. */
1759
1760 default:
1761 ptr += 2;
1762 for (;; ptr++)
1763 {
1764 if ((c = *ptr) == 'i')
1765 {
1766 options |= PCRE_CASELESS;
1767 continue;
1768 }
1769 else if ((c = *ptr) == 'm')
1770 {
1771 options |= PCRE_MULTILINE;
1772 continue;
1773 }
1774 else if (c == 's')
1775 {
1776 options |= PCRE_DOTALL;
1777 continue;
1778 }
1779 else if (c == 'x')
1780 {
1781 options |= PCRE_EXTENDED;
1782 length -= spaces; /* Already counted spaces */
1783 continue;
1784 }
1785 else if (c == ')') break;
1786
1787 *errorptr = ERR12;
1788 goto PCRE_ERROR_RETURN;
1789 }
1790 continue; /* End of this bracket handling */
1791 }
1792
1793 /* Extracting brackets must be counted so we can process escapes in a
1794 Perlish way. */
1795
1796 else bracount++;
1797
1798 /* Non-special forms of bracket. Save length for computing whole length
1799 at end if there's a repeat that requires duplication of the group. */
1800
1801 if (brastackptr >= sizeof(brastack)/sizeof(int))
1802 {
1803 *errorptr = ERR19;
1804 goto PCRE_ERROR_RETURN;
1805 }
1806
1807 brastack[brastackptr++] = length;
1808 length += 3;
1809 continue;
1810
1811 /* Handle ket. Look for subsequent max/min; for certain sets of values we
1812 have to replicate this bracket up to that many times. If brastackptr is
1813 0 this is an unmatched bracket which will generate an error, but take care
1814 not to try to access brastack[-1]. */
1815
1816 case ')':
1817 length += 3;
1818 {
1819 int minval = 1;
1820 int maxval = 1;
1821 int duplength = (brastackptr > 0)? length - brastack[--brastackptr] : 0;
1822
1823 /* Leave ptr at the final char; for read_repeat_counts this happens
1824 automatically; for the others we need an increment. */
1825
1826 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
1827 {
1828 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr);
1829 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1830 }
1831 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
1832 else if (c == '+') { maxval = -1; ptr++; }
1833 else if (c == '?') { minval = 0; ptr++; }
1834
1835 /* If there is a minimum > 1 we have to replicate up to minval-1 times;
1836 if there is a limited maximum we have to replicate up to maxval-1 times
1837 and allow for a BRAZERO item before each optional copy, as we also have
1838 to do before the first copy if the minimum is zero. */
1839
1840 if (minval == 0) length++;
1841 else if (minval > 1) length += (minval - 1) * duplength;
1842 if (maxval > minval) length += (maxval - minval) * (duplength + 1);
1843 }
1844 continue;
1845
1846 /* Non-special character. For a run of such characters the length required
1847 is the number of characters + 2, except that the maximum run length is 255.
1848 We won't get a skipped space or a non-data escape or the start of a #
1849 comment as the first character, so the length can't be zero. */
1850
1851 NORMAL_CHAR:
1852 default:
1853 length += 2;
1854 runlength = 0;
1855 do
1856 {
1857 if ((pcre_ctypes[c] & ctype_space) != 0)
1858 {
1859 if ((options & PCRE_EXTENDED) != 0) continue;
1860 spaces++;
1861 }
1862
1863 if (c == '#' && (options & PCRE_EXTENDED) != 0)
1864 {
1865 while ((c = *(++ptr)) != 0 && c != '\n');
1866 continue;
1867 }
1868
1869 /* Backslash may introduce a data char or a metacharacter; stop the
1870 string before the latter. */
1871
1872 if (c == '\\')
1873 {
1874 const uschar *saveptr = ptr;
1875 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
1876 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1877 if (c < 0) { ptr = saveptr; break; }
1878 }
1879
1880 /* Ordinary character or single-char escape */
1881
1882 runlength++;
1883 }
1884
1885 /* This "while" is the end of the "do" above. */
1886
1887 while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
1888
1889 ptr--;
1890 length += runlength;
1891 continue;
1892 }
1893 }
1894
1895 length += 4; /* For final KET and END */
1896
1897 if (length > 65539)
1898 {
1899 *errorptr = ERR20;
1900 return NULL;
1901 }
1902
1903 /* Compute the size of data block needed and get it, either from malloc or
1904 externally provided function. We specify "code[0]" in the offsetof() expression
1905 rather than just "code", because it has been reported that one broken compiler
1906 fails on "code" because it is also an independent variable. It should make no
1907 difference to the value of the offsetof(). */
1908
1909 size = length + offsetof(real_pcre, code[0]);
1910 re = (real_pcre *)(pcre_malloc)(size);
1911
1912 if (re == NULL)
1913 {
1914 *errorptr = ERR21;
1915 return NULL;
1916 }
1917
1918 /* Put in the magic number and the options. */
1919
1920 re->magic_number = MAGIC_NUMBER;
1921 re->options = options;
1922
1923 /* Set up a starting, non-extracting bracket, then compile the expression. On
1924 error, *errorptr will be set non-NULL, so we don't need to look at the result
1925 of the function here. */
1926
1927 ptr = (const uschar *)pattern;
1928 code = re->code;
1929 *code = OP_BRA;
1930 bracount = 0;
1931 (void)compile_regex(options, &bracount, &code, &ptr, errorptr);
1932 re->top_bracket = bracount;
1933 re->top_backref = top_backref;
1934
1935 /* If not reached end of pattern on success, there's an excess bracket. */
1936
1937 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
1938
1939 /* Fill in the terminating state and check for disastrous overflow, but
1940 if debugging, leave the test till after things are printed out. */
1941
1942 *code++ = OP_END;
1943
1944 #ifndef DEBUG
1945 if (code - re->code > length) *errorptr = ERR23;
1946 #endif
1947
1948 /* Failed to compile */
1949
1950 if (*errorptr != NULL)
1951 {
1952 (pcre_free)(re);
1953 PCRE_ERROR_RETURN:
1954 *erroroffset = ptr - (const uschar *)pattern;
1955 return NULL;
1956 }
1957
1958 /* If the anchored option was not passed, set flag if we can determine that it
1959 is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
1960 we can determine what the first character has to be, because that speeds up
1961 unanchored matches no end. In the case of multiline matches, an alternative is
1962 to set the PCRE_STARTLINE flag if all branches start with ^. */
1963
1964 if ((options & PCRE_ANCHORED) == 0)
1965 {
1966 if (is_anchored(re->code, (options & PCRE_MULTILINE) != 0))
1967 re->options |= PCRE_ANCHORED;
1968 else
1969 {
1970 int ch = find_firstchar(re->code);
1971 if (ch >= 0)
1972 {
1973 re->first_char = ch;
1974 re->options |= PCRE_FIRSTSET;
1975 }
1976 else if (is_startline(re->code))
1977 re->options |= PCRE_STARTLINE;
1978 }
1979 }
1980
1981 /* Print out the compiled data for debugging */
1982
1983 #ifdef DEBUG
1984
1985 printf("Length = %d top_bracket = %d top_backref=%d\n",
1986 length, re->top_bracket, re->top_backref);
1987
1988 if (re->options != 0)
1989 {
1990 printf("%s%s%s%s%s%s%s\n",
1991 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
1992 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
1993 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
1994 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
1995 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
1996 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
1997 ((re->options & PCRE_EXTRA) != 0)? "extra " : "");
1998 }
1999
2000 if ((re->options & PCRE_FIRSTSET) != 0)
2001 {
2002 if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
2003 else printf("First char = \\x%02x\n", re->first_char);
2004 }
2005
2006 code_end = code;
2007 code_base = code = re->code;
2008
2009 while (code < code_end)
2010 {
2011 int charlength;
2012
2013 printf("%3d ", code - code_base);
2014
2015 if (*code >= OP_BRA)
2016 {
2017 printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
2018 code += 2;
2019 }
2020
2021 else switch(*code)
2022 {
2023 case OP_CHARS:
2024 charlength = *(++code);
2025 printf("%3d ", charlength);
2026 while (charlength-- > 0)
2027 if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2028 break;
2029
2030 case OP_KETRMAX:
2031 case OP_KETRMIN:
2032 case OP_ALT:
2033 case OP_KET:
2034 case OP_ASSERT:
2035 case OP_ASSERT_NOT:
2036 case OP_ONCE:
2037 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2038 code += 2;
2039 break;
2040
2041 case OP_STAR:
2042 case OP_MINSTAR:
2043 case OP_PLUS:
2044 case OP_MINPLUS:
2045 case OP_QUERY:
2046 case OP_MINQUERY:
2047 case OP_TYPESTAR:
2048 case OP_TYPEMINSTAR:
2049 case OP_TYPEPLUS:
2050 case OP_TYPEMINPLUS:
2051 case OP_TYPEQUERY:
2052 case OP_TYPEMINQUERY:
2053 if (*code >= OP_TYPESTAR)
2054 printf(" %s", OP_names[code[1]]);
2055 else if (isprint(c = code[1])) printf(" %c", c);
2056 else printf(" \\x%02x", c);
2057 printf("%s", OP_names[*code++]);
2058 break;
2059
2060 case OP_EXACT:
2061 case OP_UPTO:
2062 case OP_MINUPTO:
2063 if (isprint(c = code[3])) printf(" %c{", c);
2064 else printf(" \\x%02x{", c);
2065 if (*code != OP_EXACT) printf("0,");
2066 printf("%d}", (code[1] << 8) + code[2]);
2067 if (*code == OP_MINUPTO) printf("?");
2068 code += 3;
2069 break;
2070
2071 case OP_TYPEEXACT:
2072 case OP_TYPEUPTO:
2073 case OP_TYPEMINUPTO:
2074 printf(" %s{", OP_names[code[3]]);
2075 if (*code != OP_TYPEEXACT) printf(",");
2076 printf("%d}", (code[1] << 8) + code[2]);
2077 if (*code == OP_TYPEMINUPTO) printf("?");
2078 code += 3;
2079 break;
2080
2081 case OP_NOT:
2082 if (isprint(c = *(++code))) printf(" [^%c]", c);
2083 else printf(" [^\\x%02x]", c);
2084 break;
2085
2086 case OP_NOTSTAR:
2087 case OP_NOTMINSTAR:
2088 case OP_NOTPLUS:
2089 case OP_NOTMINPLUS:
2090 case OP_NOTQUERY:
2091 case OP_NOTMINQUERY:
2092 if (isprint(c = code[1])) printf(" [^%c]", c);
2093 else printf(" [^\\x%02x]", c);
2094 printf("%s", OP_names[*code++]);
2095 break;
2096
2097 case OP_NOTEXACT:
2098 case OP_NOTUPTO:
2099 case OP_NOTMINUPTO:
2100 if (isprint(c = code[3])) printf(" [^%c]{", c);
2101 else printf(" [^\\x%02x]{", c);
2102 if (*code != OP_NOTEXACT) printf(",");
2103 printf("%d}", (code[1] << 8) + code[2]);
2104 if (*code == OP_NOTMINUPTO) printf("?");
2105 code += 3;
2106 break;
2107
2108 case OP_REF:
2109 printf(" \\%d", *(++code));
2110 code ++;
2111 goto CLASS_REF_REPEAT;
2112
2113 case OP_CLASS:
2114 case OP_NEGCLASS:
2115 {
2116 int i, min, max;
2117
2118 if (*code++ == OP_CLASS) printf(" [");
2119 else printf(" ^[");
2120
2121 for (i = 0; i < 256; i++)
2122 {
2123 if ((code[i/8] & (1 << (i&7))) != 0)
2124 {
2125 int j;
2126 for (j = i+1; j < 256; j++)
2127 if ((code[j/8] & (1 << (j&7))) == 0) break;
2128 if (i == '-' || i == ']') printf("\\");
2129 if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2130 if (--j > i)
2131 {
2132 printf("-");
2133 if (j == '-' || j == ']') printf("\\");
2134 if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2135 }
2136 i = j;
2137 }
2138 }
2139 printf("]");
2140 code += 32;
2141
2142 CLASS_REF_REPEAT:
2143
2144 switch(*code)
2145 {
2146 case OP_CRSTAR:
2147 case OP_CRMINSTAR:
2148 case OP_CRPLUS:
2149 case OP_CRMINPLUS:
2150 case OP_CRQUERY:
2151 case OP_CRMINQUERY:
2152 printf("%s", OP_names[*code]);
2153 break;
2154
2155 case OP_CRRANGE:
2156 case OP_CRMINRANGE:
2157 min = (code[1] << 8) + code[2];
2158 max = (code[3] << 8) + code[4];
2159 if (max == 0) printf("{%d,}", min);
2160 else printf("{%d,%d}", min, max);
2161 if (*code == OP_CRMINRANGE) printf("?");
2162 code += 4;
2163 break;
2164
2165 default:
2166 code--;
2167 }
2168 }
2169 break;
2170
2171 /* Anything else is just a one-node item */
2172
2173 default:
2174 printf(" %s", OP_names[*code]);
2175 break;
2176 }
2177
2178 code++;
2179 printf("\n");
2180 }
2181 printf("------------------------------------------------------------------\n");
2182
2183 /* This check is done here in the debugging case so that the code that
2184 was compiled can be seen. */
2185
2186 if (code - re->code > length)
2187 {
2188 *errorptr = ERR23;
2189 (pcre_free)(re);
2190 *erroroffset = ptr - (uschar *)pattern;
2191 return NULL;
2192 }
2193 #endif
2194
2195 return (pcre *)re;
2196 }
2197
2198
2199
2200 /*************************************************
2201 * Match a character type *
2202 *************************************************/
2203
2204 /* Not used in all the places it might be as it's sometimes faster
2205 to put the code inline.
2206
2207 Arguments:
2208 type the character type
2209 c the character
2210 dotall the dotall flag
2211
2212 Returns: TRUE if character is of the type
2213 */
2214
2215 static BOOL
2216 match_type(int type, int c, BOOL dotall)
2217 {
2218
2219 #ifdef DEBUG
2220 if (isprint(c)) printf("matching subject %c against ", c);
2221 else printf("matching subject \\x%02x against ", c);
2222 printf("%s\n", OP_names[type]);
2223 #endif
2224
2225 switch(type)
2226 {
2227 case OP_ANY: return dotall || c != '\n';
2228 case OP_NOT_DIGIT: return (pcre_ctypes[c] & ctype_digit) == 0;
2229 case OP_DIGIT: return (pcre_ctypes[c] & ctype_digit) != 0;
2230 case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;
2231 case OP_WHITESPACE: return (pcre_ctypes[c] & ctype_space) != 0;
2232 case OP_NOT_WORDCHAR: return (pcre_ctypes[c] & ctype_word) == 0;
2233 case OP_WORDCHAR: return (pcre_ctypes[c] & ctype_word) != 0;
2234 }
2235 return FALSE;
2236 }
2237
2238
2239
2240 /*************************************************
2241 * Match a back-reference *
2242 *************************************************/
2243
2244 /* If a back reference hasn't been set, the match fails.
2245
2246 Arguments:
2247 number reference number
2248 eptr points into the subject
2249 length length to be matched
2250 md points to match data block
2251
2252 Returns: TRUE if matched
2253 */
2254
2255 static BOOL
2256 match_ref(int number, register const uschar *eptr, int length, match_data *md)
2257 {
2258 const uschar *p = md->start_subject + md->offset_vector[number];
2259
2260 #ifdef DEBUG
2261 if (eptr >= md->end_subject)
2262 printf("matching subject <null>");
2263 else
2264 {
2265 printf("matching subject ");
2266 pchars(eptr, length, TRUE, md);
2267 }
2268 printf(" against backref ");
2269 pchars(p, length, FALSE, md);
2270 printf("\n");
2271 #endif
2272
2273 /* Always fail if not enough characters left */
2274
2275 if (length > md->end_subject - p) return FALSE;
2276
2277 /* Separate the caselesss case for speed */
2278
2279 if (md->caseless)
2280 { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; }
2281 else
2282 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
2283
2284 return TRUE;
2285 }
2286
2287
2288
2289 /*************************************************
2290 * Match from current position *
2291 *************************************************/
2292
2293 /* On entry ecode points to the first opcode, and eptr to the first character.
2294
2295 Arguments:
2296 eptr pointer in subject
2297 ecode position in code
2298 offset_top current top pointer
2299 md pointer to "static" info for the match
2300
2301 Returns: TRUE if matched
2302 */
2303
2304 static BOOL
2305 match(register const uschar *eptr, register const uschar *ecode, int offset_top,
2306 match_data *md)
2307 {
2308 for (;;)
2309 {
2310 int min, max, ctype;
2311 register int i;
2312 register int c;
2313 BOOL minimize = FALSE;
2314
2315 /* Opening bracket. Check the alternative branches in turn, failing if none
2316 match. We have to set the start offset if required and there is space
2317 in the offset vector so that it is available for subsequent back references
2318 if the bracket matches. However, if the bracket fails, we must put back the
2319 previous value of both offsets in case they were set by a previous copy of
2320 the same bracket. Don't worry about setting the flag for the error case here;
2321 that is handled in the code for KET. */
2322
2323 if ((int)*ecode >= OP_BRA)
2324 {
2325 int number = (*ecode - OP_BRA) << 1;
2326 int save_offset1 = 0, save_offset2 = 0;
2327
2328 DPRINTF(("start bracket %d\n", number/2));
2329
2330 if (number > 0 && number < md->offset_end)
2331 {
2332 save_offset1 = md->offset_vector[number];
2333 save_offset2 = md->offset_vector[number+1];
2334 md->offset_vector[number] = eptr - md->start_subject;
2335
2336 DPRINTF(("saving %d %d\n", save_offset1, save_offset2));
2337 }
2338
2339 /* Recurse for all the alternatives. */
2340
2341 do
2342 {
2343 if (match(eptr, ecode+3, offset_top, md)) return TRUE;
2344 ecode += (ecode[1] << 8) + ecode[2];
2345 }
2346 while (*ecode == OP_ALT);
2347
2348 DPRINTF(("bracket %d failed\n", number/2));
2349
2350 if (number > 0 && number < md->offset_end)
2351 {
2352 md->offset_vector[number] = save_offset1;
2353 md->offset_vector[number+1] = save_offset2;
2354 }
2355
2356 return FALSE;
2357 }
2358
2359 /* Other types of node can be handled by a switch */
2360
2361 switch(*ecode)
2362 {
2363 case OP_END:
2364 md->end_match_ptr = eptr; /* Record where we ended */
2365 md->end_offset_top = offset_top; /* and how many extracts were taken */
2366 return TRUE;
2367
2368 /* The equivalent of Prolog's "cut" - if the rest doesn't match, the
2369 whole thing doesn't match, so we have to get out via a longjmp(). */
2370
2371 case OP_CUT:
2372 if (match(eptr, ecode+1, offset_top, md)) return TRUE;
2373 longjmp(md->fail_env, 1);
2374
2375 /* Assertion brackets. Check the alternative branches in turn - the
2376 matching won't pass the KET for an assertion. If any one branch matches,
2377 the assertion is true. */
2378
2379 case OP_ASSERT:
2380 do
2381 {
2382 if (match(eptr, ecode+3, offset_top, md)) break;
2383 ecode += (ecode[1] << 8) + ecode[2];
2384 }
2385 while (*ecode == OP_ALT);
2386 if (*ecode == OP_KET) return FALSE;
2387
2388 /* Continue from after the assertion, updating the offsets high water
2389 mark, since extracts may have been taken during the assertion. */
2390
2391 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2392 ecode += 3;
2393 offset_top = md->end_offset_top;
2394 continue;
2395
2396 /* Negative assertion: all branches must fail to match */
2397
2398 case OP_ASSERT_NOT:
2399 do
2400 {
2401 if (match(eptr, ecode+3, offset_top, md)) return FALSE;
2402 ecode += (ecode[1] << 8) + ecode[2];
2403 }
2404 while (*ecode == OP_ALT);
2405 ecode += 3;
2406 continue;
2407
2408 /* "Once" brackets are like assertion brackets except that after a match,
2409 the point in the subject string is not moved back. Thus there can never be
2410 a move back into the brackets. Check the alternative branches in turn - the
2411 matching won't pass the KET for this kind of subpattern. If any one branch
2412 matches, we carry on, leaving the subject pointer. */
2413
2414 case OP_ONCE:
2415 do
2416 {
2417 if (match(eptr, ecode+3, offset_top, md)) break;
2418 ecode += (ecode[1] << 8) + ecode[2];
2419 }
2420 while (*ecode == OP_ALT);
2421 if (*ecode == OP_KET) return FALSE;
2422
2423 /* Continue as from after the assertion, updating the offsets high water
2424 mark, since extracts may have been taken. */
2425
2426 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2427 ecode += 3;
2428 offset_top = md->end_offset_top;
2429 eptr = md->end_match_ptr;
2430 continue;
2431
2432 /* An alternation is the end of a branch; scan along to find the end of the
2433 bracketed group and go to there. */
2434
2435 case OP_ALT:
2436 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2437 break;
2438
2439 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
2440 that it may occur zero times. It may repeat infinitely, or not at all -
2441 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
2442 repeat limits are compiled as a number of copies, with the optional ones
2443 preceded by BRAZERO or BRAMINZERO. */
2444
2445 case OP_BRAZERO:
2446 {
2447 const uschar *next = ecode+1;
2448 if (match(eptr, next, offset_top, md)) return TRUE;
2449 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
2450 ecode = next + 3;
2451 }
2452 break;
2453
2454 case OP_BRAMINZERO:
2455 {
2456 const uschar *next = ecode+1;
2457 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
2458 if (match(eptr, next+3, offset_top, md)) return TRUE;
2459 ecode++;
2460 }
2461 break;;
2462
2463 /* End of a group, repeated or non-repeating. If we are at the end of
2464 an assertion "group", stop matching and return TRUE, but record the
2465 current high water mark for use by positive assertions. */
2466
2467 case OP_KET:
2468 case OP_KETRMIN:
2469 case OP_KETRMAX:
2470 {
2471 int number;
2472 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
2473
2474 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || *prev == OP_ONCE)
2475 {
2476 md->end_match_ptr = eptr; /* For ONCE */
2477 md->end_offset_top = offset_top;
2478 return TRUE;
2479 }
2480
2481 /* In all other cases we have to check the group number back at the
2482 start and if necessary complete handling an extraction by setting the
2483 final offset and bumping the high water mark. */
2484
2485 number = (*prev - OP_BRA) << 1;
2486
2487 DPRINTF(("end bracket %d\n", number/2));
2488
2489 if (number > 0)
2490 {
2491 if (number >= md->offset_end) md->offset_overflow = TRUE; else
2492 {
2493 md->offset_vector[number+1] = eptr - md->start_subject;
2494 if (offset_top <= number) offset_top = number + 2;
2495 }
2496 }
2497
2498 /* For a non-repeating ket, just advance to the next node and continue at
2499 this level. */
2500
2501 if (*ecode == OP_KET)
2502 {
2503 ecode += 3;
2504 break;
2505 }
2506
2507 /* The repeating kets try the rest of the pattern or restart from the
2508 preceding bracket, in the appropriate order. */
2509
2510 if (*ecode == OP_KETRMIN)
2511 {
2512 if (match(eptr, ecode+3, offset_top, md) ||
2513 match(eptr, prev, offset_top, md)) return TRUE;
2514 }
2515 else /* OP_KETRMAX */
2516 {
2517 if (match(eptr, prev, offset_top, md) ||
2518 match(eptr, ecode+3, offset_top, md)) return TRUE;
2519 }
2520 }
2521 return FALSE;
2522
2523 /* Start of subject unless notbol, or after internal newline if multiline */
2524
2525 case OP_CIRC:
2526 if (md->notbol && eptr == md->start_subject) return FALSE;
2527 if (md->multiline)
2528 {
2529 if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;
2530 ecode++;
2531 break;
2532 }
2533 /* ... else fall through */
2534
2535 /* Start of subject assertion */
2536
2537 case OP_SOD:
2538 if (eptr != md->start_subject) return FALSE;
2539 ecode++;
2540 break;
2541
2542 /* Assert before internal newline if multiline, or before
2543 a terminating newline unless endonly is set, else end of subject unless
2544 noteol is set. */
2545
2546 case OP_DOLL:
2547 if (md->noteol && eptr >= md->end_subject) return FALSE;
2548 if (md->multiline)
2549 {
2550 if (eptr < md->end_subject && *eptr != '\n') return FALSE;
2551 ecode++;
2552 break;
2553 }
2554 else if (!md->endonly)
2555 {
2556 if (eptr < md->end_subject - 1 ||
2557 (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
2558 ecode++;
2559 break;
2560 }
2561 /* ... else fall through */
2562
2563 /* End of subject assertion */
2564
2565 case OP_EOD:
2566 if (eptr < md->end_subject) return FALSE;
2567 ecode++;
2568 break;
2569
2570 /* Word boundary assertions */
2571
2572 case OP_NOT_WORD_BOUNDARY:
2573 case OP_WORD_BOUNDARY:
2574 {
2575 BOOL prev_is_word = (eptr != md->start_subject) &&
2576 ((pcre_ctypes[eptr[-1]] & ctype_word) != 0);
2577 BOOL cur_is_word = (eptr < md->end_subject) &&
2578 ((pcre_ctypes[*eptr] & ctype_word) != 0);
2579 if ((*ecode++ == OP_WORD_BOUNDARY)?
2580 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2581 return FALSE;
2582 }
2583 break;
2584
2585 /* Match a single character type; inline for speed */
2586
2587 case OP_ANY:
2588 if (!md->dotall && eptr < md->end_subject && *eptr == '\n') return FALSE;
2589 if (eptr++ >= md->end_subject) return FALSE;
2590 ecode++;
2591 break;
2592
2593 case OP_NOT_DIGIT:
2594 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) != 0)
2595 return FALSE;
2596 ecode++;
2597 break;
2598
2599 case OP_DIGIT:
2600 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) == 0)
2601 return FALSE;
2602 ecode++;
2603 break;
2604
2605 case OP_NOT_WHITESPACE:
2606 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) != 0)
2607 return FALSE;
2608 ecode++;
2609 break;
2610
2611 case OP_WHITESPACE:
2612 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) == 0)
2613 return FALSE;
2614 ecode++;
2615 break;
2616
2617 case OP_NOT_WORDCHAR:
2618 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) != 0)
2619 return FALSE;
2620 ecode++;
2621 break;
2622
2623 case OP_WORDCHAR:
2624 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) == 0)
2625 return FALSE;
2626 ecode++;
2627 break;
2628
2629 /* Match a back reference, possibly repeatedly. Look past the end of the
2630 item to see if there is repeat information following. The code is similar
2631 to that for character classes, but repeated for efficiency. Then obey
2632 similar code to character type repeats - written out again for speed.
2633 However, if the referenced string is the empty string, always treat
2634 it as matched, any number of times (otherwise there could be infinite
2635 loops). */
2636
2637 case OP_REF:
2638 {
2639 int length;
2640 int number = ecode[1] << 1; /* Doubled reference number */
2641 ecode += 2; /* Advance past the item */
2642
2643 if (number >= offset_top || md->offset_vector[number] < 0)
2644 {
2645 md->errorcode = PCRE_ERROR_BADREF;
2646 return FALSE;
2647 }
2648
2649 length = md->offset_vector[number+1] - md->offset_vector[number];
2650
2651 switch (*ecode)
2652 {
2653 case OP_CRSTAR:
2654 case OP_CRMINSTAR:
2655 case OP_CRPLUS:
2656 case OP_CRMINPLUS:
2657 case OP_CRQUERY:
2658 case OP_CRMINQUERY:
2659 c = *ecode++ - OP_CRSTAR;
2660 minimize = (c & 1) != 0;
2661 min = rep_min[c]; /* Pick up values from tables; */
2662 max = rep_max[c]; /* zero for max => infinity */
2663 if (max == 0) max = INT_MAX;
2664 break;
2665
2666 case OP_CRRANGE:
2667 case OP_CRMINRANGE:
2668 minimize = (*ecode == OP_CRMINRANGE);
2669 min = (ecode[1] << 8) + ecode[2];
2670 max = (ecode[3] << 8) + ecode[4];
2671 if (max == 0) max = INT_MAX;
2672 ecode += 5;
2673 break;
2674
2675 default: /* No repeat follows */
2676 if (!match_ref(number, eptr, length, md)) return FALSE;
2677 eptr += length;
2678 continue; /* With the main loop */
2679 }
2680
2681 /* If the length of the reference is zero, just continue with the
2682 main loop. */
2683
2684 if (length == 0) continue;
2685
2686 /* First, ensure the minimum number of matches are present. We get back
2687 the length of the reference string explicitly rather than passing the
2688 address of eptr, so that eptr can be a register variable. */
2689
2690 for (i = 1; i <= min; i++)
2691 {
2692 if (!match_ref(number, eptr, length, md)) return FALSE;
2693 eptr += length;
2694 }
2695
2696 /* If min = max, continue at the same level without recursion.
2697 They are not both allowed to be zero. */
2698
2699 if (min == max) continue;
2700
2701 /* If minimizing, keep trying and advancing the pointer */
2702
2703 if (minimize)
2704 {
2705 for (i = min;; i++)
2706 {
2707 if (match(eptr, ecode, offset_top, md)) return TRUE;
2708 if (i >= max || !match_ref(number, eptr, length, md))
2709 return FALSE;
2710 eptr += length;
2711 }
2712 /* Control never gets here */
2713 }
2714
2715 /* If maximizing, find the longest string and work backwards */
2716
2717 else
2718 {
2719 const uschar *pp = eptr;
2720 for (i = min; i < max; i++)
2721 {
2722 if (!match_ref(number, eptr, length, md)) break;
2723 eptr += length;
2724 }
2725 while (eptr >= pp)
2726 {
2727 if (match(eptr, ecode, offset_top, md)) return TRUE;
2728 eptr -= length;
2729 }
2730 return FALSE;
2731 }
2732 }
2733 /* Control never gets here */
2734
2735 /* Match a character class, possibly repeatedly. Look past the end of the
2736 item to see if there is repeat information following. Then obey similar
2737 code to character type repeats - written out again for speed. If caseless
2738 matching was set at runtime but not at compile time, we have to check both
2739 versions of a character, and we have to behave differently for positive and
2740 negative classes. This is the only time where OP_CLASS and OP_NEGCLASS are
2741 treated differently. */
2742
2743 case OP_CLASS:
2744 case OP_NEGCLASS:
2745 {
2746 BOOL nasty_case = *ecode == OP_NEGCLASS && md->runtime_caseless;
2747 const uschar *data = ecode + 1; /* Save for matching */
2748 ecode += 33; /* Advance past the item */
2749
2750 switch (*ecode)
2751 {
2752 case OP_CRSTAR:
2753 case OP_CRMINSTAR:
2754 case OP_CRPLUS:
2755 case OP_CRMINPLUS:
2756 case OP_CRQUERY:
2757 case OP_CRMINQUERY:
2758 c = *ecode++ - OP_CRSTAR;
2759 minimize = (c & 1) != 0;
2760 min = rep_min[c]; /* Pick up values from tables; */
2761 max = rep_max[c]; /* zero for max => infinity */
2762 if (max == 0) max = INT_MAX;
2763 break;
2764
2765 case OP_CRRANGE:
2766 case OP_CRMINRANGE:
2767 minimize = (*ecode == OP_CRMINRANGE);
2768 min = (ecode[1] << 8) + ecode[2];
2769 max = (ecode[3] << 8) + ecode[4];
2770 if (max == 0) max = INT_MAX;
2771 ecode += 5;
2772 break;
2773
2774 default: /* No repeat follows */
2775 min = max = 1;
2776 break;
2777 }
2778
2779 /* First, ensure the minimum number of matches are present. */
2780
2781 for (i = 1; i <= min; i++)
2782 {
2783 if (eptr >= md->end_subject) return FALSE;
2784 c = *eptr++;
2785
2786 /* Either not runtime caseless, or it was a positive class. For
2787 runtime caseless, continue if either case is in the map. */
2788
2789 if (!nasty_case)
2790 {
2791 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2792 if (md->runtime_caseless)
2793 {
2794 c = pcre_fcc[c];
2795 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2796 }
2797 }
2798
2799 /* Runtime caseless and it was a negative class. Continue only if
2800 both cases are in the map. */
2801
2802 else
2803 {
2804 if ((data[c/8] & (1 << (c&7))) == 0) return FALSE;
2805 c = pcre_fcc[c];
2806 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2807 }
2808
2809 return FALSE;
2810 }
2811
2812 /* If max == min we can continue with the main loop without the
2813 need to recurse. */
2814
2815 if (min == max) continue;
2816
2817 /* If minimizing, keep testing the rest of the expression and advancing
2818 the pointer while it matches the class. */
2819
2820 if (minimize)
2821 {
2822 for (i = min;; i++)
2823 {
2824 if (match(eptr, ecode, offset_top, md)) return TRUE;
2825 if (i >= max || eptr >= md->end_subject) return FALSE;
2826 c = *eptr++;
2827
2828 /* Either not runtime caseless, or it was a positive class. For
2829 runtime caseless, continue if either case is in the map. */
2830
2831 if (!nasty_case)
2832 {
2833 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2834 if (md->runtime_caseless)
2835 {
2836 c = pcre_fcc[c];
2837 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2838 }
2839 }
2840
2841 /* Runtime caseless and it was a negative class. Continue only if
2842 both cases are in the map. */
2843
2844 else
2845 {
2846 if ((data[c/8] & (1 << (c&7))) == 0) return FALSE;
2847 c = pcre_fcc[c];
2848 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2849 }
2850
2851 return FALSE;
2852 }
2853 /* Control never gets here */
2854 }
2855
2856 /* If maximizing, find the longest possible run, then work backwards. */
2857
2858 else
2859 {
2860 const uschar *pp = eptr;
2861 for (i = min; i < max; eptr++, i++)
2862 {
2863 if (eptr >= md->end_subject) break;
2864 c = *eptr;
2865
2866 /* Either not runtime caseless, or it was a positive class. For
2867 runtime caseless, continue if either case is in the map. */
2868
2869 if (!nasty_case)
2870 {
2871 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2872 if (md->runtime_caseless)
2873 {
2874 c = pcre_fcc[c];
2875 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2876 }
2877 }
2878
2879 /* Runtime caseless and it was a negative class. Continue only if
2880 both cases are in the map. */
2881
2882 else
2883 {
2884 if ((data[c/8] & (1 << (c&7))) == 0) break;
2885 c = pcre_fcc[c];
2886 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2887 }
2888
2889 break;
2890 }
2891
2892 while (eptr >= pp)
2893 if (match(eptr--, ecode, offset_top, md)) return TRUE;
2894 return FALSE;
2895 }
2896 }
2897 /* Control never gets here */
2898
2899 /* Match a run of characters */
2900
2901 case OP_CHARS:
2902 {
2903 register int length = ecode[1];
2904 ecode += 2;
2905
2906 #ifdef DEBUG /* Sigh. Some compilers never learn. */
2907 if (eptr >= md->end_subject)
2908 printf("matching subject <null> against pattern ");
2909 else
2910 {
2911 printf("matching subject ");
2912 pchars(eptr, length, TRUE, md);
2913 printf(" against pattern ");
2914 }
2915 pchars(ecode, length, FALSE, md);
2916 printf("\n");
2917 #endif
2918
2919 if (length > md->end_subject - eptr) return FALSE;
2920 if (md->caseless)
2921 {
2922 while (length-- > 0) if (pcre_lcc[*ecode++] != pcre_lcc[*eptr++]) return FALSE;
2923 }
2924 else
2925 {
2926 while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
2927 }
2928 }
2929 break;
2930
2931 /* Match a single character repeatedly; different opcodes share code. */
2932
2933 case OP_EXACT:
2934 min = max = (ecode[1] << 8) + ecode[2];
2935 ecode += 3;
2936 goto REPEATCHAR;
2937
2938 case OP_UPTO:
2939 case OP_MINUPTO:
2940 min = 0;
2941 max = (ecode[1] << 8) + ecode[2];
2942 minimize = *ecode == OP_MINUPTO;
2943 ecode += 3;
2944 goto REPEATCHAR;
2945
2946 case OP_STAR:
2947 case OP_MINSTAR:
2948 case OP_PLUS:
2949 case OP_MINPLUS:
2950 case OP_QUERY:
2951 case OP_MINQUERY:
2952 c = *ecode++ - OP_STAR;
2953 minimize = (c & 1) != 0;
2954 min = rep_min[c]; /* Pick up values from tables; */
2955 max = rep_max[c]; /* zero for max => infinity */
2956 if (max == 0) max = INT_MAX;
2957
2958 /* Common code for all repeated single-character matches. We can give
2959 up quickly if there are fewer than the minimum number of characters left in
2960 the subject. */
2961
2962 REPEATCHAR:
2963 if (min > md->end_subject - eptr) return FALSE;
2964 c = *ecode++;
2965
2966 /* The code is duplicated for the caseless and caseful cases, for speed,
2967 since matching characters is likely to be quite common. First, ensure the
2968 minimum number of matches are present. If min = max, continue at the same
2969 level without recursing. Otherwise, if minimizing, keep trying the rest of
2970 the expression and advancing one matching character if failing, up to the
2971 maximum. Alternatively, if maximizing, find the maximum number of
2972 characters and work backwards. */
2973
2974 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
2975 max, eptr));
2976
2977 if (md->caseless)
2978 {
2979 c = pcre_lcc[c];
2980 for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) return FALSE;
2981 if (min == max) continue;
2982 if (minimize)
2983 {
2984 for (i = min;; i++)
2985 {
2986 if (match(eptr, ecode, offset_top, md)) return TRUE;
2987 if (i >= max || eptr >= md->end_subject || c != pcre_lcc[*eptr++])
2988 return FALSE;
2989 }
2990 /* Control never gets here */
2991 }
2992 else
2993 {
2994 const uschar *pp = eptr;
2995 for (i = min; i < max; i++)
2996 {
2997 if (eptr >= md->end_subject || c != pcre_lcc[*eptr]) break;
2998 eptr++;
2999 }
3000 while (eptr >= pp)
3001 if (match(eptr--, ecode, offset_top, md)) return TRUE;
3002 return FALSE;
3003 }
3004 /* Control never gets here */
3005 }
3006
3007 /* Caseful comparisons */
3008
3009 else
3010 {
3011 for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
3012 if (min == max) continue;
3013 if (minimize)
3014 {
3015 for (i = min;; i++)
3016 {
3017 if (match(eptr, ecode, offset_top, md)) return TRUE;
3018 if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
3019 }
3020 /* Control never gets here */
3021 }
3022 else
3023 {
3024 const uschar *pp = eptr;
3025 for (i = min; i < max; i++)
3026 {
3027 if (eptr >= md->end_subject || c != *eptr) break;
3028 eptr++;
3029 }
3030 while (eptr >= pp)
3031 if (match(eptr--, ecode, offset_top, md)) return TRUE;
3032 return FALSE;
3033 }
3034 }
3035 /* Control never gets here */
3036
3037 /* Match a negated single character */
3038
3039 case OP_NOT:
3040 if (eptr >= md->end_subject) return FALSE;
3041 ecode++;
3042 if (md->caseless)
3043 {
3044 if (pcre_lcc[*ecode++] == pcre_lcc[*eptr++]) return FALSE;
3045 }
3046 else
3047 {
3048 if (*ecode++ == *eptr++) return FALSE;
3049 }
3050 break;
3051
3052 /* Match a negated single character repeatedly. This is almost a repeat of
3053 the code for a repeated single character, but I haven't found a nice way of
3054 commoning these up that doesn't require a test of the positive/negative
3055 option for each character match. Maybe that wouldn't add very much to the
3056 time taken, but character matching *is* what this is all about... */
3057
3058 case OP_NOTEXACT:
3059 min = max = (ecode[1] << 8) + ecode[2];
3060 ecode += 3;
3061 goto REPEATNOTCHAR;
3062
3063 case OP_NOTUPTO:
3064 case OP_NOTMINUPTO:
3065 min = 0;
3066 max = (ecode[1] << 8) + ecode[2];
3067 minimize = *ecode == OP_NOTMINUPTO;
3068 ecode += 3;
3069 goto REPEATNOTCHAR;
3070
3071 case OP_NOTSTAR:
3072 case OP_NOTMINSTAR:
3073 case OP_NOTPLUS:
3074 case OP_NOTMINPLUS:
3075 case OP_NOTQUERY:
3076 case OP_NOTMINQUERY:
3077 c = *ecode++ - OP_NOTSTAR;
3078 minimize = (c & 1) != 0;
3079 min = rep_min[c]; /* Pick up values from tables; */
3080 max = rep_max[c]; /* zero for max => infinity */
3081 if (max == 0) max = INT_MAX;
3082
3083 /* Common code for all repeated single-character matches. We can give
3084 up quickly if there are fewer than the minimum number of characters left in
3085 the subject. */
3086
3087 REPEATNOTCHAR:
3088 if (min > md->end_subject - eptr) return FALSE;
3089 c = *ecode++;
3090
3091 /* The code is duplicated for the caseless and caseful cases, for speed,
3092 since matching characters is likely to be quite common. First, ensure the
3093 minimum number of matches are present. If min = max, continue at the same
3094 level without recursing. Otherwise, if minimizing, keep trying the rest of
3095 the expression and advancing one matching character if failing, up to the
3096 maximum. Alternatively, if maximizing, find the maximum number of
3097 characters and work backwards. */
3098
3099 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
3100 max, eptr));
3101
3102 if (md->caseless)
3103 {
3104 c = pcre_lcc[c];
3105 for (i = 1; i <= min; i++) if (c == pcre_lcc[*eptr++]) return FALSE;
3106 if (min == max) continue;
3107 if (minimize)
3108 {
3109 for (i = min;; i++)
3110 {
3111 if (match(eptr, ecode, offset_top, md)) return TRUE;
3112 if (i >= max || eptr >= md->end_subject || c == pcre_lcc[*eptr++])
3113 return FALSE;
3114 }
3115 /* Control never gets here */
3116 }
3117 else
3118 {
3119 const uschar *pp = eptr;
3120 for (i = min; i < max; i++)
3121 {
3122 if (eptr >= md->end_subject || c == pcre_lcc[*eptr]) break;
3123 eptr++;
3124 }
3125 while (eptr >= pp)
3126 if (match(eptr--, ecode, offset_top, md)) return TRUE;
3127 return FALSE;
3128 }
3129 /* Control never gets here */
3130 }
3131
3132 /* Caseful comparisons */
3133
3134 else
3135 {
3136 for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
3137 if (min == max) continue;
3138 if (minimize)
3139 {
3140 for (i = min;; i++)
3141 {
3142 if (match(eptr, ecode, offset_top, md)) return TRUE;
3143 if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
3144 }
3145 /* Control never gets here */
3146 }
3147 else
3148 {
3149 const uschar *pp = eptr;
3150 for (i = min; i < max; i++)
3151 {
3152 if (eptr >= md->end_subject || c == *eptr) break;
3153 eptr++;
3154 }
3155 while (eptr >= pp)
3156 if (match(eptr--, ecode, offset_top, md)) return TRUE;
3157 return FALSE;
3158 }
3159 }
3160 /* Control never gets here */
3161
3162 /* Match a single character type repeatedly; several different opcodes
3163 share code. This is very similar to the code for single characters, but we
3164 repeat it in the interests of efficiency. */
3165
3166 case OP_TYPEEXACT:
3167 min = max = (ecode[1] << 8) + ecode[2];
3168 minimize = TRUE;
3169 ecode += 3;
3170 goto REPEATTYPE;
3171
3172 case OP_TYPEUPTO:
3173 case OP_TYPEMINUPTO:
3174 min = 0;
3175 max = (ecode[1] << 8) + ecode[2];
3176 minimize = *ecode == OP_TYPEMINUPTO;
3177 ecode += 3;
3178 goto REPEATTYPE;
3179
3180 case OP_TYPESTAR:
3181 case OP_TYPEMINSTAR:
3182 case OP_TYPEPLUS:
3183 case OP_TYPEMINPLUS:
3184 case OP_TYPEQUERY:
3185 case OP_TYPEMINQUERY:
3186 c = *ecode++ - OP_TYPESTAR;
3187 minimize = (c & 1) != 0;
3188 min = rep_min[c]; /* Pick up values from tables; */
3189 max = rep_max[c]; /* zero for max => infinity */
3190 if (max == 0) max = INT_MAX;
3191
3192 /* Common code for all repeated single character type matches */
3193
3194 REPEATTYPE:
3195 ctype = *ecode++; /* Code for the character type */
3196
3197 /* First, ensure the minimum number of matches are present. Use inline
3198 code for maximizing the speed, and do the type test once at the start
3199 (i.e. keep it out of the loop). Also test that there are at least the
3200 minimum number of characters before we start. */
3201
3202 if (min > md->end_subject - eptr) return FALSE;
3203 if (min > 0) switch(ctype)
3204 {
3205 case OP_ANY:
3206 if (!md->dotall)
3207 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }
3208 else eptr += min;
3209 break;
3210
3211 case OP_NOT_DIGIT:
3212 for (i = 1; i <= min; i++)
3213 if ((pcre_ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
3214 break;
3215
3216 case OP_DIGIT:
3217 for (i = 1; i <= min; i++)
3218 if ((pcre_ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
3219 break;
3220
3221 case OP_NOT_WHITESPACE:
3222 for (i = 1; i <= min; i++)
3223 if ((pcre_ctypes[*eptr++] & ctype_space) != 0) return FALSE;
3224 break;
3225
3226 case OP_WHITESPACE:
3227 for (i = 1; i <= min; i++)
3228 if ((pcre_ctypes[*eptr++] & ctype_space) == 0) return FALSE;
3229 break;
3230
3231 case OP_NOT_WORDCHAR:
3232 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) != 0)
3233 return FALSE;
3234 break;
3235
3236 case OP_WORDCHAR:
3237 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) == 0)
3238 return FALSE;
3239 break;
3240 }
3241
3242 /* If min = max, continue at the same level without recursing */
3243
3244 if (min == max) continue;
3245
3246 /* If minimizing, we have to test the rest of the pattern before each
3247 subsequent match, so inlining isn't much help; just use the function. */
3248
3249 if (minimize)
3250 {
3251 for (i = min;; i++)
3252 {
3253 if (match(eptr, ecode, offset_top, md)) return TRUE;
3254 if (i >= max || eptr >= md->end_subject ||
3255 !match_type(ctype, *eptr++, md->dotall))
3256 return FALSE;
3257 }
3258 /* Control never gets here */
3259 }
3260
3261 /* If maximizing it is worth using inline code for speed, doing the type
3262 test once at the start (i.e. keep it out of the loop). */
3263
3264 else
3265 {
3266 const uschar *pp = eptr;
3267 switch(ctype)
3268 {
3269 case OP_ANY:
3270 if (!md->dotall)
3271 {
3272 for (i = min; i < max; i++)
3273 {
3274 if (eptr >= md->end_subject || *eptr == '\n') break;
3275 eptr++;
3276 }
3277 }
3278 else
3279 {
3280 c = max - min;
3281 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
3282 eptr += c;
3283 }
3284 break;
3285
3286 case OP_NOT_DIGIT:
3287 for (i = min; i < max; i++)
3288 {
3289 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) != 0)
3290 break;
3291 eptr++;
3292 }
3293 break;
3294
3295 case OP_DIGIT:
3296 for (i = min; i < max; i++)
3297 {
3298 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) == 0)
3299 break;
3300 eptr++;
3301 }
3302 break;
3303
3304 case OP_NOT_WHITESPACE:
3305 for (i = min; i < max; i++)
3306 {
3307 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) != 0)
3308 break;
3309 eptr++;
3310 }
3311 break;
3312
3313 case OP_WHITESPACE:
3314 for (i = min; i < max; i++)
3315 {
3316 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) == 0)
3317 break;
3318 eptr++;
3319 }
3320 break;
3321
3322 case OP_NOT_WORDCHAR:
3323 for (i = min; i < max; i++)
3324 {
3325 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) != 0)
3326 break;
3327 eptr++;
3328 }
3329 break;
3330
3331 case OP_WORDCHAR:
3332 for (i = min; i < max; i++)
3333 {
3334 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) == 0)
3335 break;
3336 eptr++;
3337 }
3338 break;
3339 }
3340
3341 while (eptr >= pp)
3342 if (match(eptr--, ecode, offset_top, md)) return TRUE;
3343 return FALSE;
3344 }
3345 /* Control never gets here */
3346
3347 /* There's been some horrible disaster. */
3348
3349 default:
3350 DPRINTF(("Unknown opcode %d\n", *ecode));
3351 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
3352 return FALSE;
3353 }
3354
3355 /* Do not stick any code in here without much thought; it is assumed
3356 that "continue" in the code above comes out to here to repeat the main
3357 loop. */
3358
3359 } /* End of main loop */
3360 /* Control never reaches here */
3361 }
3362
3363
3364
3365 /*************************************************
3366 * Segregate setjmp() *
3367 *************************************************/
3368
3369 /* The -Wall option of gcc gives warnings for all local variables when setjmp()
3370 is used, even if the coding conforms to the rules of ANSI C. To avoid this, we
3371 hide it in a separate function. This is called only when PCRE_EXTRA is set,
3372 since it's needed only for the extension \X option, and with any luck, a good
3373 compiler will spot the tail recursion and compile it efficiently.
3374
3375 Arguments:
3376 eptr pointer in subject
3377 ecode position in code
3378 offset_top current top pointer
3379 md pointer to "static" info for the match
3380
3381 Returns: TRUE if matched
3382 */
3383
3384 static BOOL
3385 match_with_setjmp(const uschar *eptr, const uschar *ecode, int offset_top,
3386 match_data *match_block)
3387 {
3388 return setjmp(match_block->fail_env) == 0 &&
3389 match(eptr, ecode, offset_top, match_block);
3390 }
3391
3392
3393
3394 /*************************************************
3395 * Execute a Regular Expression *
3396 *************************************************/
3397
3398 /* This function applies a compiled re to a subject string and picks out
3399 portions of the string if it matches. Two elements in the vector are set for
3400 each substring: the offsets to the start and end of the substring.
3401
3402 Arguments:
3403 external_re points to the compiled expression
3404 external_extra points to "hints" from pcre_study() or is NULL
3405 subject points to the subject string
3406 length length of subject string (may contain binary zeros)
3407 options option bits
3408 offsets points to a vector of ints to be filled in with offsets
3409 offsetcount the number of elements in the vector
3410
3411 Returns: > 0 => success; value is the number of elements filled in
3412 = 0 => success, but offsets is not big enough
3413 -1 => failed to match
3414 < -1 => some kind of unexpected problem
3415 */
3416
3417 int
3418 pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
3419 const char *subject, int length, int options, int *offsets, int offsetcount)
3420 {
3421 int resetcount, ocount;
3422 int first_char = -1;
3423 match_data match_block;
3424 const uschar *start_bits = NULL;
3425 const uschar *start_match = (const uschar *)subject;
3426 const uschar *end_subject;
3427 const real_pcre *re = (const real_pcre *)external_re;
3428 const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
3429 BOOL using_temporary_offsets = FALSE;
3430 BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
3431 BOOL startline = (re->options & PCRE_STARTLINE) != 0;
3432
3433 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3434
3435 if (re == NULL || subject == NULL ||
3436 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3437 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
3438
3439 match_block.start_subject = (const uschar *)subject;
3440 match_block.end_subject = match_block.start_subject + length;
3441 end_subject = match_block.end_subject;
3442
3443 match_block.caseless = ((re->options | options) & PCRE_CASELESS) != 0;
3444 match_block.runtime_caseless = match_block.caseless &&
3445 (re->options & PCRE_CASELESS) == 0;
3446
3447 match_block.multiline = ((re->options | options) & PCRE_MULTILINE) != 0;
3448 match_block.dotall = ((re->options | options) & PCRE_DOTALL) != 0;
3449 match_block.endonly = ((re->options | options) & PCRE_DOLLAR_ENDONLY) != 0;
3450
3451 match_block.notbol = (options & PCRE_NOTBOL) != 0;
3452 match_block.noteol = (options & PCRE_NOTEOL) != 0;
3453
3454 match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
3455
3456 /* If the expression has got more back references than the offsets supplied can
3457 hold, we get a temporary bit of working store to use during the matching.
3458 Otherwise, we can use the vector supplied, rounding down its size to a multiple
3459 of 2. */
3460
3461 ocount = offsetcount & (-2);
3462 if (re->top_backref > 0 && re->top_backref >= ocount/2)
3463 {
3464 ocount = re->top_backref * 2 + 2;
3465 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
3466 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
3467 using_temporary_offsets = TRUE;
3468 DPRINTF(("Got memory to hold back references\n"));
3469 }
3470 else match_block.offset_vector = offsets;
3471
3472 match_block.offset_end = ocount;
3473 match_block.offset_overflow = FALSE;
3474
3475 /* Compute the minimum number of offsets that we need to reset each time. Doing
3476 this makes a huge difference to execution time when there aren't many brackets
3477 in the pattern. */
3478
3479 resetcount = 2 + re->top_bracket * 2;
3480 if (resetcount > offsetcount) resetcount = ocount;
3481
3482 /* If MULTILINE is set at exec time but was not set at compile time, and the
3483 anchored flag is set, we must re-check because a setting provoked by ^ in the
3484 pattern is not right in multi-line mode. Calling is_anchored() again here does
3485 the right check, because multiline is now set. If it now yields FALSE, the
3486 expression must have had ^ starting some of its branches. Check to see if
3487 that is true for *all* branches, and if so, set the startline flag. */
3488
3489 if (match_block. multiline && anchored && (re->options & PCRE_MULTILINE) == 0 &&
3490 !is_anchored(re->code, match_block.multiline))
3491 {
3492 anchored = FALSE;
3493 if (is_startline(re->code)) startline = TRUE;
3494 }
3495
3496 /* Set up the first character to match, if available. The first_char value is
3497 never set for an anchored regular expression, but the anchoring may be forced
3498 at run time, so we have to test for anchoring. The first char may be unset for
3499 an unanchored pattern, of course. If there's no first char and the pattern was
3500 studied, the may be a bitmap of possible first characters. However, we can
3501 use this only if the caseless state of the studying was correct. */
3502
3503 if (!anchored)
3504 {
3505 if ((re->options & PCRE_FIRSTSET) != 0)
3506 {
3507 first_char = re->first_char;
3508 if (match_block.caseless) first_char = pcre_lcc[first_char];
3509 }
3510 else
3511 if (!startline && extra != NULL &&
3512 (extra->options & PCRE_STUDY_MAPPED) != 0 &&
3513 ((extra->options & PCRE_STUDY_CASELESS) != 0) == match_block.caseless)
3514 start_bits = extra->start_bits;
3515 }
3516
3517 /* Loop for unanchored matches; for anchored regexps the loop runs just once. */
3518
3519 do
3520 {
3521 int rc;
3522 register int *iptr = match_block.offset_vector;
3523 register int *iend = iptr + resetcount;
3524
3525 /* Reset the maximum number of extractions we might see. */
3526
3527 while (iptr < iend) *iptr++ = -1;
3528
3529 /* Advance to a unique first char if possible */
3530
3531 if (first_char >= 0)
3532 {
3533 if (match_block.caseless)
3534 while (start_match < end_subject && pcre_lcc[*start_match] != first_char)
3535 start_match++;
3536 else
3537 while (start_match < end_subject && *start_match != first_char)
3538 start_match++;
3539 }
3540
3541 /* Or to just after \n for a multiline match if possible */
3542
3543 else if (startline)
3544 {
3545 if (start_match > match_block.start_subject)
3546 {
3547 while (start_match < end_subject && start_match[-1] != '\n')
3548 start_match++;
3549 }
3550 }
3551
3552 /* Or to a non-unique first char */
3553
3554 else if (start_bits != NULL)
3555 {
3556 while (start_match < end_subject)
3557 {
3558 register int c = *start_match;
3559 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
3560 }
3561 }
3562
3563 #ifdef DEBUG /* Sigh. Some compilers never learn. */
3564 printf(">>>> Match against: ");
3565 pchars(start_match, end_subject - start_match, TRUE, &match_block);
3566 printf("\n");
3567 #endif
3568
3569 /* When a match occurs, substrings will be set for all internal extractions;
3570 we just need to set up the whole thing as substring 0 before returning. If
3571 there were too many extractions, set the return code to zero. In the case
3572 where we had to get some local store to hold offsets for backreferences, copy
3573 those back references that we can. In this case there need not be overflow
3574 if certain parts of the pattern were not used.
3575
3576 Before starting the match, we have to set up a longjmp() target to enable
3577 the "cut" operation to fail a match completely without backtracking. This
3578 is done in a separate function to avoid compiler warnings. We need not do
3579 it unless PCRE_EXTRA is set, since only in that case is the "cut" operation
3580 enabled. */
3581
3582 if ((re->options & PCRE_EXTRA) != 0)
3583 {
3584 if (!match_with_setjmp(start_match, re->code, 2, &match_block))
3585 continue;
3586 }
3587 else if (!match(start_match, re->code, 2, &match_block)) continue;
3588
3589 /* Copy the offset information from temporary store if necessary */
3590
3591 if (using_temporary_offsets)
3592 {
3593 if (offsetcount >= 4)
3594 {
3595 memcpy(offsets + 2, match_block.offset_vector + 2,
3596 (offsetcount - 2) * sizeof(int));
3597 DPRINTF(("Copied offsets from temporary memory\n"));
3598 }
3599 if (match_block.end_offset_top > offsetcount)
3600 match_block.offset_overflow = TRUE;
3601
3602 DPRINTF(("Freeing temporary memory\n"));
3603 (pcre_free)(match_block.offset_vector);
3604 }
3605
3606 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
3607
3608 if (match_block.offset_end < 2) rc = 0; else
3609 {
3610 offsets[0] = start_match - match_block.start_subject;
3611 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
3612 }
3613
3614 DPRINTF((">>>> returning %d\n", rc));
3615 return rc;
3616 }
3617 while (!anchored &&
3618 match_block.errorcode == PCRE_ERROR_NOMATCH &&
3619 start_match++ < end_subject);
3620
3621 if (using_temporary_offsets)
3622 {
3623 DPRINTF(("Freeing temporary memory\n"));
3624 (pcre_free)(match_block.offset_vector);
3625 }
3626
3627 DPRINTF((">>>> returning %d\n", match_block.errorcode));
3628
3629 return match_block.errorcode;
3630 }
3631
3632 /* End of pcre.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12