/[pcre]/code/trunk/pcre.c
ViewVC logotype

Contents of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 15 - (show annotations) (download)
Sat Feb 24 21:38:25 2007 UTC (7 years, 5 months ago) by nigel
File MIME type: text/plain
File size: 103595 byte(s)
Load pcre-1.06 into code/trunk.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /*
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
9
10 Written by: Philip Hazel <ph10@cam.ac.uk>
11
12 Copyright (c) 1998 University of Cambridge
13
14 -----------------------------------------------------------------------------
15 Permission is granted to anyone to use this software for any purpose on any
16 computer system, and to redistribute it freely, subject to the following
17 restrictions:
18
19 1. This software is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22
23 2. The origin of this software must not be misrepresented, either by
24 explicit claim or by omission.
25
26 3. Altered versions must be plainly marked as such, and must not be
27 misrepresented as being the original software.
28 -----------------------------------------------------------------------------
29 */
30
31
32 /* Define DEBUG to get debugging output on stdout. */
33
34 /* #define DEBUG */
35
36 /* Use a macro for debugging printing, 'cause that eliminates the the use
37 of #ifdef inline, and there are *still* stupid compilers about that don't like
38 indented pre-processor statements. I suppose it's only been 10 years... */
39
40 #ifdef DEBUG
41 #define DPRINTF(p) printf p
42 #else
43 #define DPRINTF(p) /*nothing*/
44 #endif
45
46 /* Include the internals header, which itself includes Standard C headers plus
47 the external pcre header. */
48
49 #include "internal.h"
50
51
52 /* Allow compilation as C++ source code, should anybody want to do that. */
53
54 #ifdef __cplusplus
55 #define class pcre_class
56 #endif
57
58
59 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
60
61 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
62 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
63
64 /* Text forms of OP_ values and things, for debugging (not all used) */
65
66 #ifdef DEBUG
67 static const char *OP_names[] = {
68 "End", "\\A", "\\B", "\\b", "\\D", "\\d",
69 "\\S", "\\s", "\\W", "\\w", "Cut", "\\Z", "^", "$", "Any", "chars",
70 "not",
71 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
72 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
73 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
74 "*", "*?", "+", "+?", "?", "??", "{", "{",
75 "class", "negclass", "Ref",
76 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",
77 "Brazero", "Braminzero", "Bra"
78 };
79 #endif
80
81 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
82 are simple data values; negative values are for special things like \d and so
83 on. Zero means further processing is needed (for things like \x), or the escape
84 is invalid. */
85
86 static const short int escapes[] = {
87 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
88 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
89 '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
90 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
91 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
92 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
93 '`', 7, -ESC_b, 0, -ESC_d, 27, '\f', 0, /* ` - g */
94 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
95 0, 0, '\r', -ESC_s, '\t', 0, 0, -ESC_w, /* p - w */
96 0, 0, 0 /* x - z */
97 };
98
99 /* Definition to allow mutual recursion */
100
101 static BOOL
102 compile_regex(int, int *, uschar **, const uschar **, const char **);
103
104 /* Structure for passing "static" information around between the functions
105 doing the matching, so that they are thread-safe. */
106
107 typedef struct match_data {
108 int errorcode; /* As it says */
109 int *offset_vector; /* Offset vector */
110 int offset_end; /* One past the end */
111 BOOL offset_overflow; /* Set if too many extractions */
112 BOOL caseless; /* Case-independent flag */
113 BOOL runtime_caseless; /* Caseless forced at run time */
114 BOOL multiline; /* Multiline flag */
115 BOOL notbol; /* NOTBOL flag */
116 BOOL noteol; /* NOTEOL flag */
117 BOOL dotall; /* Dot matches any char */
118 BOOL endonly; /* Dollar not before final \n */
119 const uschar *start_subject; /* Start of the subject string */
120 const uschar *end_subject; /* End of the subject string */
121 jmp_buf fail_env; /* Environment for longjump() break out */
122 const uschar *end_match_ptr; /* Subject position at end match */
123 int end_offset_top; /* Highwater mark at end of match */
124 } match_data;
125
126
127
128 /*************************************************
129 * Global variables *
130 *************************************************/
131
132 /* PCRE is thread-clean and doesn't use any global variables in the normal
133 sense. However, it calls memory allocation and free functions via the two
134 indirections below, which are can be changed by the caller, but are shared
135 between all threads. */
136
137 void *(*pcre_malloc)(size_t) = malloc;
138 void (*pcre_free)(void *) = free;
139
140
141
142
143 /*************************************************
144 * Return version string *
145 *************************************************/
146
147 const char *
148 pcre_version(void)
149 {
150 return PCRE_VERSION;
151 }
152
153
154
155
156 /*************************************************
157 * Return info about a compiled pattern *
158 *************************************************/
159
160 /* This function picks potentially useful data out of the private
161 structure.
162
163 Arguments:
164 external_re points to compiled code
165 optptr where to pass back the options
166 first_char where to pass back the first character,
167 or -1 if multiline and all branches start ^,
168 or -2 otherwise
169
170 Returns: number of identifying extraction brackets
171 or negative values on error
172 */
173
174 int
175 pcre_info(const pcre *external_re, int *optptr, int *first_char)
176 {
177 const real_pcre *re = (const real_pcre *)external_re;
178 if (re == NULL) return PCRE_ERROR_NULL;
179 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
180 if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
181 if (first_char != NULL)
182 *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
183 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
184 return re->top_bracket;
185 }
186
187
188
189
190 #ifdef DEBUG
191 /*************************************************
192 * Debugging function to print chars *
193 *************************************************/
194
195 /* Print a sequence of chars in printable format, stopping at the end of the
196 subject if the requested.
197
198 Arguments:
199 p points to characters
200 length number to print
201 is_subject TRUE if printing from within md->start_subject
202 md pointer to matching data block, if is_subject is TRUE
203
204 Returns: nothing
205 */
206
207 static void
208 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
209 {
210 int c;
211 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
212 while (length-- > 0)
213 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
214 }
215 #endif
216
217
218
219
220 /*************************************************
221 * Check subpattern for empty operand *
222 *************************************************/
223
224 /* This function checks a bracketed subpattern to see if any of the paths
225 through it could match an empty string. This is used to diagnose an error if
226 such a subpattern is followed by a quantifier with an unlimited upper bound.
227
228 Argument:
229 code points to the opening bracket
230
231 Returns: TRUE or FALSE
232 */
233
234 static BOOL
235 could_be_empty(uschar *code)
236 {
237 do {
238 uschar *cc = code + 3;
239
240 /* Scan along the opcodes for this branch; as soon as we find something
241 that matches a non-empty string, break out and advance to test the next
242 branch. If we get to the end of the branch, return TRUE for the whole
243 sub-expression. */
244
245 for (;;)
246 {
247 /* Test an embedded subpattern; if it could not be empty, break the
248 loop. Otherwise carry on in the branch. */
249
250 if ((int)(*cc) >= OP_BRA || (int)(*cc) == OP_ONCE)
251 {
252 if (!could_be_empty(cc)) break;
253 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
254 cc += 3;
255 }
256
257 else switch (*cc)
258 {
259 /* Reached end of a branch: the subpattern may match the empty string */
260
261 case OP_ALT:
262 case OP_KET:
263 case OP_KETRMAX:
264 case OP_KETRMIN:
265 return TRUE;
266
267 /* Skip over assertive subpatterns */
268
269 case OP_ASSERT:
270 case OP_ASSERT_NOT:
271 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
272 cc += 3;
273 break;
274
275 /* Skip over things that don't match chars */
276
277 case OP_SOD:
278 case OP_EOD:
279 case OP_CIRC:
280 case OP_DOLL:
281 case OP_BRAZERO:
282 case OP_BRAMINZERO:
283 case OP_NOT_WORD_BOUNDARY:
284 case OP_WORD_BOUNDARY:
285 cc++;
286 break;
287
288 /* Skip over simple repeats with zero lower bound */
289
290 case OP_STAR:
291 case OP_MINSTAR:
292 case OP_QUERY:
293 case OP_MINQUERY:
294 case OP_NOTSTAR:
295 case OP_NOTMINSTAR:
296 case OP_NOTQUERY:
297 case OP_NOTMINQUERY:
298 case OP_TYPESTAR:
299 case OP_TYPEMINSTAR:
300 case OP_TYPEQUERY:
301 case OP_TYPEMINQUERY:
302 cc += 2;
303 break;
304
305 /* Skip over UPTOs (lower bound is zero) */
306
307 case OP_UPTO:
308 case OP_MINUPTO:
309 case OP_TYPEUPTO:
310 case OP_TYPEMINUPTO:
311 cc += 4;
312 break;
313
314 /* Check a class or a back reference for a zero minimum */
315
316 case OP_CLASS:
317 case OP_NEGCLASS:
318 case OP_REF:
319 cc += (*cc == OP_REF)? 2 : 33;
320
321 switch (*cc)
322 {
323 case OP_CRSTAR:
324 case OP_CRMINSTAR:
325 case OP_CRQUERY:
326 case OP_CRMINQUERY:
327 cc++;
328 break;
329
330 case OP_CRRANGE:
331 case OP_CRMINRANGE:
332 if ((cc[1] << 8) + cc[2] != 0) goto NEXT_BRANCH;
333 cc += 3;
334 break;
335
336 default:
337 goto NEXT_BRANCH;
338 }
339 break;
340
341 /* Anything else matches at least one character */
342
343 default:
344 goto NEXT_BRANCH;
345 }
346 }
347
348 NEXT_BRANCH:
349 code += (code[1] << 8) + code[2];
350 }
351 while (*code == OP_ALT);
352
353 /* No branches match the empty string */
354
355 return FALSE;
356 }
357
358
359
360 /*************************************************
361 * Handle escapes *
362 *************************************************/
363
364 /* This function is called when a \ has been encountered. It either returns a
365 positive value for a simple escape such as \n, or a negative value which
366 encodes one of the more complicated things such as \d. On entry, ptr is
367 pointing at the \. On exit, it is on the final character of the escape
368 sequence.
369
370 Arguments:
371 ptrptr points to the pattern position pointer
372 errorptr points to the pointer to the error message
373 bracount number of previous extracting brackets
374 options the options bits
375 isclass TRUE if inside a character class
376
377 Returns: zero or positive => a data character
378 negative => a special escape sequence
379 on error, errorptr is set
380 */
381
382 static int
383 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
384 int options, BOOL isclass)
385 {
386 const uschar *ptr = *ptrptr;
387 int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
388 int i;
389
390 if (c == 0) *errorptr = ERR1;
391
392 /* Digits or letters may have special meaning; all others are literals. */
393
394 else if (c < '0' || c > 'z') {}
395
396 /* Do an initial lookup in a table. A non-zero result is something that can be
397 returned immediately. Otherwise further processing may be required. */
398
399 else if ((i = escapes[c - '0']) != 0) c = i;
400
401 /* Escapes that need further processing, or are illegal. */
402
403 else
404 {
405 const uschar *oldptr;
406 switch (c)
407 {
408 /* The handling of escape sequences consisting of a string of digits
409 starting with one that is not zero is not straightforward. By experiment,
410 the way Perl works seems to be as follows:
411
412 Outside a character class, the digits are read as a decimal number. If the
413 number is less than 10, or if there are that many previous extracting
414 left brackets, then it is a back reference. Otherwise, up to three octal
415 digits are read to form an escaped byte. Thus \123 is likely to be octal
416 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
417 value is greater than 377, the least significant 8 bits are taken. Inside a
418 character class, \ followed by a digit is always an octal number. */
419
420 case '1': case '2': case '3': case '4': case '5':
421 case '6': case '7': case '8': case '9':
422
423 if (!isclass)
424 {
425 oldptr = ptr;
426 c -= '0';
427 while ((pcre_ctypes[ptr[1]] & ctype_digit) != 0)
428 c = c * 10 + *(++ptr) - '0';
429 if (c < 10 || c <= bracount)
430 {
431 c = -(ESC_REF + c);
432 break;
433 }
434 ptr = oldptr; /* Put the pointer back and fall through */
435 }
436
437 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
438 generates a binary zero byte and treats the digit as a following literal.
439 Thus we have to pull back the pointer by one. */
440
441 if ((c = *ptr) >= '8')
442 {
443 ptr--;
444 c = 0;
445 break;
446 }
447
448 /* \0 always starts an octal number, but we may drop through to here with a
449 larger first octal digit */
450
451 case '0':
452 c -= '0';
453 while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 &&
454 ptr[1] != '8' && ptr[1] != '9')
455 c = c * 8 + *(++ptr) - '0';
456 break;
457
458 /* Special escapes not starting with a digit are straightforward */
459
460 case 'x':
461 c = 0;
462 while (i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0)
463 {
464 ptr++;
465 c = c * 16 + pcre_lcc[*ptr] -
466 (((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
467 }
468 break;
469
470 case 'c':
471 c = *(++ptr);
472 if (c == 0)
473 {
474 *errorptr = ERR2;
475 return 0;
476 }
477
478 /* A letter is upper-cased; then the 0x40 bit is flipped */
479
480 if (c >= 'a' && c <= 'z') c = pcre_fcc[c];
481 c ^= 0x40;
482 break;
483
484 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
485 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
486 for Perl compatibility, it is a literal. */
487
488 default:
489 if ((options & PCRE_EXTRA) != 0) switch(c)
490 {
491 case 'X':
492 c = -ESC_X; /* This could be a lookup if it ever got into Perl */
493 break;
494
495 default:
496 *errorptr = ERR3;
497 break;
498 }
499 break;
500 }
501 }
502
503 *ptrptr = ptr;
504 return c;
505 }
506
507
508
509 /*************************************************
510 * Check for counted repeat *
511 *************************************************/
512
513 /* This function is called when a '{' is encountered in a place where it might
514 start a quantifier. It looks ahead to see if it really is a quantifier or not.
515 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
516 where the ddds are digits.
517
518 Arguments:
519 p pointer to the first char after '{'
520
521 Returns: TRUE or FALSE
522 */
523
524 static BOOL
525 is_counted_repeat(const uschar *p)
526 {
527 if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
528 while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
529 if (*p == '}') return TRUE;
530
531 if (*p++ != ',') return FALSE;
532 if (*p == '}') return TRUE;
533
534 if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
535 while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
536 return (*p == '}');
537 }
538
539
540
541 /*************************************************
542 * Read repeat counts *
543 *************************************************/
544
545 /* Read an item of the form {n,m} and return the values. This is called only
546 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
547 so the syntax is guaranteed to be correct, but we need to check the values.
548
549 Arguments:
550 p pointer to first char after '{'
551 minp pointer to int for min
552 maxp pointer to int for max
553 returned as -1 if no max
554 errorptr points to pointer to error message
555
556 Returns: pointer to '}' on success;
557 current ptr on error, with errorptr set
558 */
559
560 static const uschar *
561 read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
562 {
563 int min = 0;
564 int max = -1;
565
566 while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
567
568 if (*p == '}') max = min; else
569 {
570 if (*(++p) != '}')
571 {
572 max = 0;
573 while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
574 if (max < min)
575 {
576 *errorptr = ERR4;
577 return p;
578 }
579 }
580 }
581
582 /* Do paranoid checks, then fill in the required variables, and pass back the
583 pointer to the terminating '}'. */
584
585 if (min > 65535 || max > 65535)
586 *errorptr = ERR5;
587 else
588 {
589 *minp = min;
590 *maxp = max;
591 }
592 return p;
593 }
594
595
596
597 /*************************************************
598 * Compile one branch *
599 *************************************************/
600
601 /* Scan the pattern, compiling it into the code vector.
602
603 Arguments:
604 options the option bits
605 bracket points to number of brackets used
606 code points to the pointer to the current code point
607 ptrptr points to the current pattern pointer
608 errorptr points to pointer to error message
609
610 Returns: TRUE on success
611 FALSE, with *errorptr set on error
612 */
613
614 static BOOL
615 compile_branch(int options, int *brackets, uschar **codeptr,
616 const uschar **ptrptr, const char **errorptr)
617 {
618 int repeat_type, op_type;
619 int repeat_min, repeat_max;
620 int bravalue, length;
621 register int c;
622 register uschar *code = *codeptr;
623 const uschar *ptr = *ptrptr;
624 const uschar *oldptr;
625 uschar *previous = NULL;
626 uschar class[32];
627
628 /* Switch on next character until the end of the branch */
629
630 for (;; ptr++)
631 {
632 BOOL negate_class;
633 int class_charcount;
634 int class_lastchar;
635
636 c = *ptr;
637 if ((options & PCRE_EXTENDED) != 0)
638 {
639 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
640 if (c == '#')
641 {
642 while ((c = *(++ptr)) != 0 && c != '\n');
643 continue;
644 }
645 }
646
647 switch(c)
648 {
649 /* The branch terminates at end of string, |, or ). */
650
651 case 0:
652 case '|':
653 case ')':
654 *codeptr = code;
655 *ptrptr = ptr;
656 return TRUE;
657
658 /* Handle single-character metacharacters */
659
660 case '^':
661 previous = NULL;
662 *code++ = OP_CIRC;
663 break;
664
665 case '$':
666 previous = NULL;
667 *code++ = OP_DOLL;
668 break;
669
670 case '.':
671 previous = code;
672 *code++ = OP_ANY;
673 break;
674
675 /* Character classes. These always build a 32-byte bitmap of the permitted
676 characters, except in the special case where there is only one character.
677 For negated classes, we build the map as usual, then invert it at the end.
678 */
679
680 case '[':
681 previous = code;
682
683 /* If the first character is '^', set the negation flag, and use a
684 different opcode. This only matters if caseless matching is specified at
685 runtime. */
686
687 if ((c = *(++ptr)) == '^')
688 {
689 negate_class = TRUE;
690 *code++ = OP_NEGCLASS;
691 c = *(++ptr);
692 }
693 else
694 {
695 negate_class = FALSE;
696 *code++ = OP_CLASS;
697 }
698
699 /* Keep a count of chars so that we can optimize the case of just a single
700 character. */
701
702 class_charcount = 0;
703 class_lastchar = -1;
704
705 /* Initialize the 32-char bit map to all zeros. We have to build the
706 map in a temporary bit of store, in case the class contains only 1
707 character, because in that case the compiled code doesn't use the
708 bit map. */
709
710 memset(class, 0, 32 * sizeof(uschar));
711
712 /* Process characters until ] is reached. By writing this as a "do" it
713 means that an initial ] is taken as a data character. */
714
715 do
716 {
717 if (c == 0)
718 {
719 *errorptr = ERR6;
720 goto FAILED;
721 }
722
723 /* Backslash may introduce a single character, or it may introduce one
724 of the specials, which just set a flag. Escaped items are checked for
725 validity in the pre-compiling pass. The sequence \b is a special case.
726 Inside a class (and only there) it is treated as backspace. Elsewhere
727 it marks a word boundary. Other escapes have preset maps ready to
728 or into the one we are building. We assume they have more than one
729 character in them, so set class_count bigger than one. */
730
731 if (c == '\\')
732 {
733 c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
734 if (-c == ESC_b) c = '\b';
735 else if (c < 0)
736 {
737 class_charcount = 10;
738 switch (-c)
739 {
740 case ESC_d:
741 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit];
742 continue;
743
744 case ESC_D:
745 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit];
746 continue;
747
748 case ESC_w:
749 for (c = 0; c < 32; c++)
750 class[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
751 continue;
752
753 case ESC_W:
754 for (c = 0; c < 32; c++)
755 class[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
756 continue;
757
758 case ESC_s:
759 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space];
760 continue;
761
762 case ESC_S:
763 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space];
764 continue;
765
766 default:
767 *errorptr = ERR7;
768 goto FAILED;
769 }
770 }
771 /* Fall through if single character */
772 }
773
774 /* A single character may be followed by '-' to form a range. However,
775 Perl does not permit ']' to be the end of the range. A '-' character
776 here is treated as a literal. */
777
778 if (ptr[1] == '-' && ptr[2] != ']')
779 {
780 int d;
781 ptr += 2;
782 d = *ptr;
783
784 if (d == 0)
785 {
786 *errorptr = ERR6;
787 goto FAILED;
788 }
789
790 /* The second part of a range can be a single-character escape, but
791 not any of the other escapes. */
792
793 if (d == '\\')
794 {
795 d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
796 if (d < 0)
797 {
798 if (d == -ESC_b) d = '\b'; else
799 {
800 *errorptr = ERR7;
801 goto FAILED;
802 }
803 }
804 }
805
806 if (d < c)
807 {
808 *errorptr = ERR8;
809 goto FAILED;
810 }
811
812 for (; c <= d; c++)
813 {
814 class[c/8] |= (1 << (c&7));
815 if ((options & PCRE_CASELESS) != 0)
816 {
817 int uc = pcre_fcc[c]; /* flip case */
818 class[uc/8] |= (1 << (uc&7));
819 }
820 class_charcount++; /* in case a one-char range */
821 class_lastchar = c;
822 }
823 continue; /* Go get the next char in the class */
824 }
825
826 /* Handle a lone single character - we can get here for a normal
827 non-escape char, or after \ that introduces a single character. */
828
829 class [c/8] |= (1 << (c&7));
830 if ((options & PCRE_CASELESS) != 0)
831 {
832 c = pcre_fcc[c]; /* flip case */
833 class[c/8] |= (1 << (c&7));
834 }
835 class_charcount++;
836 class_lastchar = c;
837 }
838
839 /* Loop until ']' reached; the check for end of string happens inside the
840 loop. This "while" is the end of the "do" above. */
841
842 while ((c = *(++ptr)) != ']');
843
844 /* If class_charcount is 1 and class_lastchar is not negative, we saw
845 precisely one character. This doesn't need the whole 32-byte bit map.
846 We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
847 it's negative. */
848
849 if (class_charcount == 1 && class_lastchar >= 0)
850 {
851 if (negate_class)
852 {
853 code[-1] = OP_NOT;
854 }
855 else
856 {
857 code[-1] = OP_CHARS;
858 *code++ = 1;
859 }
860 *code++ = class_lastchar;
861 }
862
863 /* Otherwise, negate the 32-byte map if necessary, and copy it into
864 the code vector. */
865
866 else
867 {
868 if (negate_class)
869 for (c = 0; c < 32; c++) code[c] = ~class[c];
870 else
871 memcpy(code, class, 32);
872 code += 32;
873 }
874 break;
875
876 /* Various kinds of repeat */
877
878 case '{':
879 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
880 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
881 if (*errorptr != NULL) goto FAILED;
882 goto REPEAT;
883
884 case '*':
885 repeat_min = 0;
886 repeat_max = -1;
887 goto REPEAT;
888
889 case '+':
890 repeat_min = 1;
891 repeat_max = -1;
892 goto REPEAT;
893
894 case '?':
895 repeat_min = 0;
896 repeat_max = 1;
897
898 REPEAT:
899 if (previous == NULL)
900 {
901 *errorptr = ERR9;
902 goto FAILED;
903 }
904
905 /* If the next character is '?' this is a minimizing repeat. Advance to the
906 next character. */
907
908 if (ptr[1] == '?') { repeat_type = 1; ptr++; } else repeat_type = 0;
909
910 /* If the maximum is zero then the minimum must also be zero; Perl allows
911 this case, so we do too - by simply omitting the item altogether. */
912
913 if (repeat_max == 0) code = previous;
914
915 /* If previous was a string of characters, chop off the last one and use it
916 as the subject of the repeat. If there was only one character, we can
917 abolish the previous item altogether. */
918
919 else if (*previous == OP_CHARS)
920 {
921 int len = previous[1];
922 if (len == 1)
923 {
924 c = previous[2];
925 code = previous;
926 }
927 else
928 {
929 c = previous[len+1];
930 previous[1]--;
931 code--;
932 }
933 op_type = 0; /* Use single-char op codes */
934 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
935 }
936
937 /* If previous was a single negated character ([^a] or similar), we use
938 one of the special opcodes, replacing it. The code is shared with single-
939 character repeats by adding a suitable offset into repeat_type. */
940
941 else if ((int)*previous == OP_NOT)
942 {
943 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
944 c = previous[1];
945 code = previous;
946 goto OUTPUT_SINGLE_REPEAT;
947 }
948
949 /* If previous was a character type match (\d or similar), abolish it and
950 create a suitable repeat item. The code is shared with single-character
951 repeats by adding a suitable offset into repeat_type. */
952
953 else if ((int)*previous < OP_EOD || *previous == OP_ANY)
954 {
955 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
956 c = *previous;
957 code = previous;
958
959 OUTPUT_SINGLE_REPEAT:
960 repeat_type += op_type; /* Combine both values for many cases */
961
962 /* A minimum of zero is handled either as the special case * or ?, or as
963 an UPTO, with the maximum given. */
964
965 if (repeat_min == 0)
966 {
967 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
968 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
969 else
970 {
971 *code++ = OP_UPTO + repeat_type;
972 *code++ = repeat_max >> 8;
973 *code++ = (repeat_max & 255);
974 }
975 }
976
977 /* The case {1,} is handled as the special case + */
978
979 else if (repeat_min == 1 && repeat_max == -1)
980 *code++ = OP_PLUS + repeat_type;
981
982 /* The case {n,n} is just an EXACT, while the general case {n,m} is
983 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
984
985 else
986 {
987 if (repeat_min != 1)
988 {
989 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
990 *code++ = repeat_min >> 8;
991 *code++ = (repeat_min & 255);
992 }
993
994 /* If the mininum is 1 and the previous item was a character string,
995 we either have to put back the item that got cancelled if the string
996 length was 1, or add the character back onto the end of a longer
997 string. For a character type nothing need be done; it will just get put
998 back naturally. */
999
1000 else if (*previous == OP_CHARS)
1001 {
1002 if (code == previous) code += 2; else previous[1]++;
1003 }
1004
1005 /* If the maximum is unlimited, insert an OP_STAR. */
1006
1007 if (repeat_max < 0)
1008 {
1009 *code++ = c;
1010 *code++ = OP_STAR + repeat_type;
1011 }
1012
1013 /* Else insert an UPTO if the max is greater than the min. */
1014
1015 else if (repeat_max != repeat_min)
1016 {
1017 *code++ = c;
1018 repeat_max -= repeat_min;
1019 *code++ = OP_UPTO + repeat_type;
1020 *code++ = repeat_max >> 8;
1021 *code++ = (repeat_max & 255);
1022 }
1023 }
1024
1025 /* The character or character type itself comes last in all cases. */
1026
1027 *code++ = c;
1028 }
1029
1030 /* If previous was a character class or a back reference, we put the repeat
1031 stuff after it. */
1032
1033 else if (*previous == OP_CLASS || *previous == OP_NEGCLASS ||
1034 *previous == OP_REF)
1035 {
1036 if (repeat_min == 0 && repeat_max == -1)
1037 *code++ = OP_CRSTAR + repeat_type;
1038 else if (repeat_min == 1 && repeat_max == -1)
1039 *code++ = OP_CRPLUS + repeat_type;
1040 else if (repeat_min == 0 && repeat_max == 1)
1041 *code++ = OP_CRQUERY + repeat_type;
1042 else
1043 {
1044 *code++ = OP_CRRANGE + repeat_type;
1045 *code++ = repeat_min >> 8;
1046 *code++ = repeat_min & 255;
1047 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1048 *code++ = repeat_max >> 8;
1049 *code++ = repeat_max & 255;
1050 }
1051 }
1052
1053 /* If previous was a bracket group, we may have to replicate it in certain
1054 cases. If the maximum repeat count is unlimited, check that the bracket
1055 group cannot match the empty string, and diagnose an error if it can. */
1056
1057 else if ((int)*previous >= OP_BRA)
1058 {
1059 int i;
1060 int len = code - previous;
1061
1062 if (repeat_max == -1 && could_be_empty(previous))
1063 {
1064 *errorptr = ERR10;
1065 goto FAILED;
1066 }
1067
1068 /* If the minimum is greater than zero, and the maximum is unlimited or
1069 equal to the minimum, the first copy remains where it is, and is
1070 replicated up to the minimum number of times. This case includes the +
1071 repeat, but of course no replication is needed in that case. */
1072
1073 if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1074 {
1075 for (i = 1; i < repeat_min; i++)
1076 {
1077 memcpy(code, previous, len);
1078 code += len;
1079 }
1080 }
1081
1082 /* If the minimum is zero, stick BRAZERO in front of the first copy.
1083 Then, if there is a fixed upper limit, replicated up to that many times,
1084 sticking BRAZERO in front of all the optional ones. */
1085
1086 else
1087 {
1088 if (repeat_min == 0)
1089 {
1090 memmove(previous+1, previous, len);
1091 code++;
1092 *previous++ = OP_BRAZERO + repeat_type;
1093 }
1094
1095 for (i = 1; i < repeat_min; i++)
1096 {
1097 memcpy(code, previous, len);
1098 code += len;
1099 }
1100
1101 for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1102 {
1103 *code++ = OP_BRAZERO + repeat_type;
1104 memcpy(code, previous, len);
1105 code += len;
1106 }
1107 }
1108
1109 /* If the maximum is unlimited, set a repeater in the final copy. */
1110
1111 if (repeat_max == -1) code[-3] = OP_KETRMAX + repeat_type;
1112 }
1113
1114 /* Else there's some kind of shambles */
1115
1116 else
1117 {
1118 *errorptr = ERR11;
1119 goto FAILED;
1120 }
1121
1122 /* In all case we no longer have a previous item. */
1123
1124 previous = NULL;
1125 break;
1126
1127
1128 /* Start of nested bracket sub-expression, or comment or lookahead.
1129 First deal with special things that can come after a bracket; all are
1130 introduced by ?, and the appearance of any of them means that this is not a
1131 referencing group. They were checked for validity in the first pass over
1132 the string, so we don't have to check for syntax errors here. */
1133
1134 case '(':
1135 previous = code; /* Only real brackets can be repeated */
1136 if (*(++ptr) == '?')
1137 {
1138 bravalue = OP_BRA;
1139
1140 switch (*(++ptr))
1141 {
1142 case '#':
1143 case 'i':
1144 case 'm':
1145 case 's':
1146 case 'x':
1147 ptr++;
1148 while (*ptr != ')') ptr++;
1149 previous = NULL;
1150 continue;
1151
1152 case ':': /* Non-extracting bracket */
1153 ptr++;
1154 break;
1155
1156 case '=': /* Assertions can't be repeated */
1157 bravalue = OP_ASSERT;
1158 ptr++;
1159 previous = NULL;
1160 break;
1161
1162 case '!':
1163 bravalue = OP_ASSERT_NOT;
1164 ptr++;
1165 previous = NULL;
1166 break;
1167
1168 case '>': /* "Match once" brackets */
1169 if ((options & PCRE_EXTRA) != 0) /* Not yet standard */
1170 {
1171 bravalue = OP_ONCE;
1172 ptr++;
1173 previous = NULL;
1174 break;
1175 }
1176 /* Else fall through */
1177
1178 default:
1179 *errorptr = ERR12;
1180 goto FAILED;
1181 }
1182 }
1183
1184 /* Else we have a referencing group */
1185
1186 else
1187 {
1188 if (++(*brackets) > EXTRACT_MAX)
1189 {
1190 *errorptr = ERR13;
1191 goto FAILED;
1192 }
1193 bravalue = OP_BRA + *brackets;
1194 }
1195
1196 /* Process nested bracketed re; at end pointer is on the bracket. We copy
1197 code into a non-register variable in order to be able to pass its address
1198 because some compilers complain otherwise. */
1199
1200 *code = bravalue;
1201 {
1202 uschar *mcode = code;
1203 if (!compile_regex(options, brackets, &mcode, &ptr, errorptr))
1204 goto FAILED;
1205 code = mcode;
1206 }
1207
1208 if (*ptr != ')')
1209 {
1210 *errorptr = ERR14;
1211 goto FAILED;
1212 }
1213 break;
1214
1215 /* Check \ for being a real metacharacter; if not, fall through and handle
1216 it as a data character at the start of a string. Escape items are checked
1217 for validity in the pre-compiling pass. */
1218
1219 case '\\':
1220 oldptr = ptr;
1221 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
1222
1223 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1224 are arranged to be the negation of the corresponding OP_values. For the
1225 back references, the values are ESC_REF plus the reference number. Only
1226 back references and those types that consume a character may be repeated.
1227 We can test for values between ESC_b and ESC_Z for the latter; this may
1228 have to change if any new ones are ever created. */
1229
1230 if (c < 0)
1231 {
1232 if (-c >= ESC_REF)
1233 {
1234 int refnum = -c - ESC_REF;
1235 if (*brackets < refnum)
1236 {
1237 *errorptr = ERR15;
1238 goto FAILED;
1239 }
1240 previous = code;
1241 *code++ = OP_REF;
1242 *code++ = refnum;
1243 }
1244 else
1245 {
1246 previous = (-c > ESC_b && -c < ESC_X)? code : NULL;
1247 *code++ = -c;
1248 }
1249 continue;
1250 }
1251
1252 /* Data character: reset and fall through */
1253
1254 ptr = oldptr;
1255 c = '\\';
1256
1257 /* Handle a run of data characters until a metacharacter is encountered.
1258 The first character is guaranteed not to be whitespace or # when the
1259 extended flag is set. */
1260
1261 NORMAL_CHAR:
1262 default:
1263 previous = code;
1264 *code = OP_CHARS;
1265 code += 2;
1266 length = 0;
1267
1268 do
1269 {
1270 if ((options & PCRE_EXTENDED) != 0)
1271 {
1272 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1273 if (c == '#')
1274 {
1275 while ((c = *(++ptr)) != 0 && c != '\n');
1276 if (c == 0) break;
1277 continue;
1278 }
1279 }
1280
1281 /* Backslash may introduce a data char or a metacharacter. Escaped items
1282 are checked for validity in the pre-compiling pass. Stop the string
1283 before a metaitem. */
1284
1285 if (c == '\\')
1286 {
1287 oldptr = ptr;
1288 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
1289 if (c < 0) { ptr = oldptr; break; }
1290 }
1291
1292 /* Ordinary character or single-char escape */
1293
1294 *code++ = c;
1295 length++;
1296 }
1297
1298 /* This "while" is the end of the "do" above. */
1299
1300 while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
1301
1302 /* Compute the length and set it in the data vector, and advance to
1303 the next state. */
1304
1305 previous[1] = length;
1306 if (length < 255) ptr--;
1307 break;
1308 }
1309 } /* end of big loop */
1310
1311 /* Control never reaches here by falling through, only by a goto for all the
1312 error states. Pass back the position in the pattern so that it can be displayed
1313 to the user for diagnosing the error. */
1314
1315 FAILED:
1316 *ptrptr = ptr;
1317 return FALSE;
1318 }
1319
1320
1321
1322
1323 /*************************************************
1324 * Compile sequence of alternatives *
1325 *************************************************/
1326
1327 /* On entry, ptr is pointing past the bracket character, but on return
1328 it points to the closing bracket, or vertical bar, or end of string.
1329 The code variable is pointing at the byte into which the BRA operator has been
1330 stored.
1331
1332 Argument:
1333 options the option bits
1334 brackets -> int containing the number of extracting brackets used
1335 codeptr -> the address of the current code pointer
1336 ptrptr -> the address of the current pattern pointer
1337 errorptr -> pointer to error message
1338
1339 Returns: TRUE on success
1340 */
1341
1342 static BOOL
1343 compile_regex(int options, int *brackets, uschar **codeptr,
1344 const uschar **ptrptr, const char **errorptr)
1345 {
1346 const uschar *ptr = *ptrptr;
1347 uschar *code = *codeptr;
1348 uschar *start_bracket = code;
1349
1350 for (;;)
1351 {
1352 int length;
1353 uschar *last_branch = code;
1354
1355 code += 3;
1356 if (!compile_branch(options, brackets, &code, &ptr, errorptr))
1357 {
1358 *ptrptr = ptr;
1359 return FALSE;
1360 }
1361
1362 /* Fill in the length of the last branch */
1363
1364 length = code - last_branch;
1365 last_branch[1] = length >> 8;
1366 last_branch[2] = length & 255;
1367
1368 /* Reached end of expression, either ')' or end of pattern. Insert a
1369 terminating ket and the length of the whole bracketed item, and return,
1370 leaving the pointer at the terminating char. */
1371
1372 if (*ptr != '|')
1373 {
1374 length = code - start_bracket;
1375 *code++ = OP_KET;
1376 *code++ = length >> 8;
1377 *code++ = length & 255;
1378 *codeptr = code;
1379 *ptrptr = ptr;
1380 return TRUE;
1381 }
1382
1383 /* Another branch follows; insert an "or" node and advance the pointer. */
1384
1385 *code = OP_ALT;
1386 ptr++;
1387 }
1388 /* Control never reaches here */
1389 }
1390
1391
1392
1393 /*************************************************
1394 * Check for anchored expression *
1395 *************************************************/
1396
1397 /* Try to find out if this is an anchored regular expression. Consider each
1398 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
1399 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
1400 it's anchored. However, if this is a multiline pattern, then only OP_SOD
1401 counts, since OP_CIRC can match in the middle.
1402
1403 A branch is also implicitly anchored if it starts with .* because that will try
1404 the rest of the pattern at all possible matching points, so there is no point
1405 trying them again.
1406
1407 Argument: points to start of expression (the bracket)
1408 Returns: TRUE or FALSE
1409 */
1410
1411 static BOOL
1412 is_anchored(register const uschar *code, BOOL multiline)
1413 {
1414 do {
1415 int op = (int)code[3];
1416 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE)
1417 { if (!is_anchored(code+3, multiline)) return FALSE; }
1418 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
1419 { if (code[4] != OP_ANY) return FALSE; }
1420 else if (op != OP_SOD && (multiline || op != OP_CIRC)) return FALSE;
1421 code += (code[1] << 8) + code[2];
1422 }
1423 while (*code == OP_ALT);
1424 return TRUE;
1425 }
1426
1427
1428
1429 /*************************************************
1430 * Check for start with \n line expression *
1431 *************************************************/
1432
1433 /* This is called for multiline expressions to try to find out if every branch
1434 starts with ^ so that "first char" processing can be done to speed things up.
1435
1436 Argument: points to start of expression (the bracket)
1437 Returns: TRUE or FALSE
1438 */
1439
1440 static BOOL
1441 is_startline(const uschar *code)
1442 {
1443 do {
1444 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
1445 { if (!is_startline(code+3)) return FALSE; }
1446 else if (code[3] != OP_CIRC) return FALSE;
1447 code += (code[1] << 8) + code[2];
1448 }
1449 while (*code == OP_ALT);
1450 return TRUE;
1451 }
1452
1453
1454
1455 /*************************************************
1456 * Check for fixed first char *
1457 *************************************************/
1458
1459 /* Try to find out if there is a fixed first character. This is called for
1460 unanchored expressions, as it speeds up their processing quite considerably.
1461 Consider each alternative branch. If they all start with the same char, or with
1462 a bracket all of whose alternatives start with the same char (recurse ad lib),
1463 then we return that char, otherwise -1.
1464
1465 Argument: points to start of expression (the bracket)
1466 Returns: -1 or the fixed first char
1467 */
1468
1469 static int
1470 find_firstchar(uschar *code)
1471 {
1472 register int c = -1;
1473 do
1474 {
1475 register int charoffset = 4;
1476
1477 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
1478 {
1479 register int d;
1480 if ((d = find_firstchar(code+3)) < 0) return -1;
1481 if (c < 0) c = d; else if (c != d) return -1;
1482 }
1483
1484 else switch(code[3])
1485 {
1486 default:
1487 return -1;
1488
1489 case OP_EXACT: /* Fall through */
1490 charoffset++;
1491
1492 case OP_CHARS: /* Fall through */
1493 charoffset++;
1494
1495 case OP_PLUS:
1496 case OP_MINPLUS:
1497 if (c < 0) c = code[charoffset]; else if (c != code[charoffset]) return -1;
1498 break;
1499 }
1500 code += (code[1] << 8) + code[2];
1501 }
1502 while (*code == OP_ALT);
1503 return c;
1504 }
1505
1506
1507
1508 /*************************************************
1509 * Compile a Regular Expression *
1510 *************************************************/
1511
1512 /* This function takes a string and returns a pointer to a block of store
1513 holding a compiled version of the expression.
1514
1515 Arguments:
1516 pattern the regular expression
1517 options various option bits
1518 errorptr pointer to pointer to error text
1519 erroroffset ptr offset in pattern where error was detected
1520
1521 Returns: pointer to compiled data block, or NULL on error,
1522 with errorptr and erroroffset set
1523 */
1524
1525 pcre *
1526 pcre_compile(const char *pattern, int options, const char **errorptr,
1527 int *erroroffset)
1528 {
1529 real_pcre *re;
1530 int spaces = 0;
1531 int length = 3; /* For initial BRA plus length */
1532 int runlength;
1533 int c, size;
1534 int bracount = 0;
1535 int brastack[200];
1536 int top_backref = 0;
1537 unsigned int brastackptr = 0;
1538 uschar *code;
1539 const uschar *ptr;
1540
1541 #ifdef DEBUG
1542 uschar *code_base, *code_end;
1543 #endif
1544
1545 /* We can't pass back an error message if errorptr is NULL; I guess the best we
1546 can do is just return NULL. */
1547
1548 if (errorptr == NULL) return NULL;
1549 *errorptr = NULL;
1550
1551 /* However, we can give a message for this error */
1552
1553 if (erroroffset == NULL)
1554 {
1555 *errorptr = ERR16;
1556 return NULL;
1557 }
1558 *erroroffset = 0;
1559
1560 if ((options & ~PUBLIC_OPTIONS) != 0)
1561 {
1562 *errorptr = ERR17;
1563 return NULL;
1564 }
1565
1566 DPRINTF(("------------------------------------------------------------------\n"));
1567 DPRINTF(("%s\n", pattern));
1568
1569 /* The first thing to do is to make a pass over the pattern to compute the
1570 amount of store required to hold the compiled code. This does not have to be
1571 perfect as long as errors are overestimates. At the same time we can detect any
1572 internal flag settings. Make an attempt to correct for any counted white space
1573 if an "extended" flag setting appears late in the pattern. We can't be so
1574 clever for #-comments. */
1575
1576 ptr = (const uschar *)(pattern - 1);
1577 while ((c = *(++ptr)) != 0)
1578 {
1579 int min, max;
1580 int class_charcount;
1581
1582 if ((pcre_ctypes[c] & ctype_space) != 0)
1583 {
1584 if ((options & PCRE_EXTENDED) != 0) continue;
1585 spaces++;
1586 }
1587
1588 if (c == '#' && (options & PCRE_EXTENDED) != 0)
1589 {
1590 while ((c = *(++ptr)) != 0 && c != '\n');
1591 continue;
1592 }
1593
1594 switch(c)
1595 {
1596 /* A backslashed item may be an escaped "normal" character or a
1597 character type. For a "normal" character, put the pointers and
1598 character back so that tests for whitespace etc. in the input
1599 are done correctly. */
1600
1601 case '\\':
1602 {
1603 const uschar *save_ptr = ptr;
1604 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
1605 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1606 if (c >= 0)
1607 {
1608 ptr = save_ptr;
1609 c = '\\';
1610 goto NORMAL_CHAR;
1611 }
1612 }
1613 length++;
1614
1615 /* A back reference needs an additional char, plus either one or 5
1616 bytes for a repeat. We also need to keep the value of the highest
1617 back reference. */
1618
1619 if (c <= -ESC_REF)
1620 {
1621 int refnum = -c - ESC_REF;
1622 if (refnum > top_backref) top_backref = refnum;
1623 length++; /* For single back reference */
1624 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
1625 {
1626 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
1627 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1628 if ((min == 0 && (max == 1 || max == -1)) ||
1629 (min == 1 && max == -1))
1630 length++;
1631 else length += 5;
1632 if (ptr[1] == '?') ptr++;
1633 }
1634 }
1635 continue;
1636
1637 case '^':
1638 case '.':
1639 case '$':
1640 case '*': /* These repeats won't be after brackets; */
1641 case '+': /* those are handled separately */
1642 case '?':
1643 length++;
1644 continue;
1645
1646 /* This covers the cases of repeats after a single char, metachar, class,
1647 or back reference. */
1648
1649 case '{':
1650 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
1651 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
1652 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1653 if ((min == 0 && (max == 1 || max == -1)) ||
1654 (min == 1 && max == -1))
1655 length++;
1656 else
1657 {
1658 length--; /* Uncount the original char or metachar */
1659 if (min == 1) length++; else if (min > 0) length += 4;
1660 if (max > 0) length += 4; else length += 2;
1661 }
1662 if (ptr[1] == '?') ptr++;
1663 continue;
1664
1665 /* An alternation contains an offset to the next branch or ket. */
1666 case '|':
1667 length += 3;
1668 continue;
1669
1670 /* A character class uses 33 characters. Don't worry about character types
1671 that aren't allowed in classes - they'll get picked up during the compile.
1672 A character class that contains only one character uses 2 or 3 bytes,
1673 depending on whether it is negated or not. Notice this where we can. */
1674
1675 case '[':
1676 class_charcount = 0;
1677 if (*(++ptr) == '^') ptr++;
1678 do
1679 {
1680 if (*ptr == '\\')
1681 {
1682 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE);
1683 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1684 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
1685 }
1686 else class_charcount++;
1687 ptr++;
1688 }
1689 while (*ptr != 0 && *ptr != ']');
1690
1691 /* Repeats for negated single chars are handled by the general code */
1692
1693 if (class_charcount == 1) length += 3; else
1694 {
1695 length += 33;
1696
1697 /* A repeat needs either 1 or 5 bytes. */
1698
1699 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
1700 {
1701 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
1702 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1703 if ((min == 0 && (max == 1 || max == -1)) ||
1704 (min == 1 && max == -1))
1705 length++;
1706 else length += 5;
1707 if (ptr[1] == '?') ptr++;
1708 }
1709 }
1710 continue;
1711
1712 /* Brackets may be genuine groups or special things */
1713
1714 case '(':
1715
1716 /* Handle special forms of bracket, which all start (? */
1717
1718 if (ptr[1] == '?') switch (c = ptr[2])
1719 {
1720 /* Skip over comments entirely */
1721 case '#':
1722 ptr += 3;
1723 while (*ptr != 0 && *ptr != ')') ptr++;
1724 if (*ptr == 0)
1725 {
1726 *errorptr = ERR18;
1727 goto PCRE_ERROR_RETURN;
1728 }
1729 continue;
1730
1731 /* Non-referencing groups and lookaheads just move the pointer on, and
1732 then behave like a non-special bracket, except that they don't increment
1733 the count of extracting brackets. */
1734
1735 case ':':
1736 case '=':
1737 case '!':
1738 ptr += 2;
1739 break;
1740
1741 /* Ditto for the "once only" bracket, allowed only if the extra bit
1742 is set. */
1743
1744 case '>':
1745 if ((options & PCRE_EXTRA) != 0)
1746 {
1747 ptr += 2;
1748 break;
1749 }
1750 /* Else fall thourh */
1751
1752 /* Else loop setting valid options until ) is met. Anything else is an
1753 error. */
1754
1755 default:
1756 ptr += 2;
1757 for (;; ptr++)
1758 {
1759 if ((c = *ptr) == 'i')
1760 {
1761 options |= PCRE_CASELESS;
1762 continue;
1763 }
1764 else if ((c = *ptr) == 'm')
1765 {
1766 options |= PCRE_MULTILINE;
1767 continue;
1768 }
1769 else if (c == 's')
1770 {
1771 options |= PCRE_DOTALL;
1772 continue;
1773 }
1774 else if (c == 'x')
1775 {
1776 options |= PCRE_EXTENDED;
1777 length -= spaces; /* Already counted spaces */
1778 continue;
1779 }
1780 else if (c == ')') break;
1781
1782 *errorptr = ERR12;
1783 goto PCRE_ERROR_RETURN;
1784 }
1785 continue; /* End of this bracket handling */
1786 }
1787
1788 /* Extracting brackets must be counted so we can process escapes in a
1789 Perlish way. */
1790
1791 else bracount++;
1792
1793 /* Non-special forms of bracket. Save length for computing whole length
1794 at end if there's a repeat that requires duplication of the group. */
1795
1796 if (brastackptr >= sizeof(brastack)/sizeof(int))
1797 {
1798 *errorptr = ERR19;
1799 goto PCRE_ERROR_RETURN;
1800 }
1801
1802 brastack[brastackptr++] = length;
1803 length += 3;
1804 continue;
1805
1806 /* Handle ket. Look for subsequent max/min; for certain sets of values we
1807 have to replicate this bracket up to that many times. If brastackptr is
1808 0 this is an unmatched bracket which will generate an error, but take care
1809 not to try to access brastack[-1]. */
1810
1811 case ')':
1812 length += 3;
1813 {
1814 int minval = 1;
1815 int maxval = 1;
1816 int duplength = (brastackptr > 0)? length - brastack[--brastackptr] : 0;
1817
1818 /* Leave ptr at the final char; for read_repeat_counts this happens
1819 automatically; for the others we need an increment. */
1820
1821 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
1822 {
1823 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr);
1824 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1825 }
1826 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
1827 else if (c == '+') { maxval = -1; ptr++; }
1828 else if (c == '?') { minval = 0; ptr++; }
1829
1830 /* If there is a minimum > 1 we have to replicate up to minval-1 times;
1831 if there is a limited maximum we have to replicate up to maxval-1 times
1832 and allow for a BRAZERO item before each optional copy, as we also have
1833 to do before the first copy if the minimum is zero. */
1834
1835 if (minval == 0) length++;
1836 else if (minval > 1) length += (minval - 1) * duplength;
1837 if (maxval > minval) length += (maxval - minval) * (duplength + 1);
1838 }
1839 continue;
1840
1841 /* Non-special character. For a run of such characters the length required
1842 is the number of characters + 2, except that the maximum run length is 255.
1843 We won't get a skipped space or a non-data escape or the start of a #
1844 comment as the first character, so the length can't be zero. */
1845
1846 NORMAL_CHAR:
1847 default:
1848 length += 2;
1849 runlength = 0;
1850 do
1851 {
1852 if ((pcre_ctypes[c] & ctype_space) != 0)
1853 {
1854 if ((options & PCRE_EXTENDED) != 0) continue;
1855 spaces++;
1856 }
1857
1858 if (c == '#' && (options & PCRE_EXTENDED) != 0)
1859 {
1860 while ((c = *(++ptr)) != 0 && c != '\n');
1861 continue;
1862 }
1863
1864 /* Backslash may introduce a data char or a metacharacter; stop the
1865 string before the latter. */
1866
1867 if (c == '\\')
1868 {
1869 const uschar *saveptr = ptr;
1870 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
1871 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
1872 if (c < 0) { ptr = saveptr; break; }
1873 }
1874
1875 /* Ordinary character or single-char escape */
1876
1877 runlength++;
1878 }
1879
1880 /* This "while" is the end of the "do" above. */
1881
1882 while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
1883
1884 ptr--;
1885 length += runlength;
1886 continue;
1887 }
1888 }
1889
1890 length += 4; /* For final KET and END */
1891
1892 if (length > 65539)
1893 {
1894 *errorptr = ERR20;
1895 return NULL;
1896 }
1897
1898 /* Compute the size of data block needed and get it, either from malloc or
1899 externally provided function. We specify "code[0]" in the offsetof() expression
1900 rather than just "code", because it has been reported that one broken compiler
1901 fails on "code" because it is also an independent variable. It should make no
1902 difference to the value of the offsetof(). */
1903
1904 size = length + offsetof(real_pcre, code[0]);
1905 re = (real_pcre *)(pcre_malloc)(size);
1906
1907 if (re == NULL)
1908 {
1909 *errorptr = ERR21;
1910 return NULL;
1911 }
1912
1913 /* Put in the magic number and the options. */
1914
1915 re->magic_number = MAGIC_NUMBER;
1916 re->options = options;
1917
1918 /* Set up a starting, non-extracting bracket, then compile the expression. On
1919 error, *errorptr will be set non-NULL, so we don't need to look at the result
1920 of the function here. */
1921
1922 ptr = (const uschar *)pattern;
1923 code = re->code;
1924 *code = OP_BRA;
1925 bracount = 0;
1926 (void)compile_regex(options, &bracount, &code, &ptr, errorptr);
1927 re->top_bracket = bracount;
1928 re->top_backref = top_backref;
1929
1930 /* If not reached end of pattern on success, there's an excess bracket. */
1931
1932 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
1933
1934 /* Fill in the terminating state and check for disastrous overflow, but
1935 if debugging, leave the test till after things are printed out. */
1936
1937 *code++ = OP_END;
1938
1939 #ifndef DEBUG
1940 if (code - re->code > length) *errorptr = ERR23;
1941 #endif
1942
1943 /* Failed to compile */
1944
1945 if (*errorptr != NULL)
1946 {
1947 (pcre_free)(re);
1948 PCRE_ERROR_RETURN:
1949 *erroroffset = ptr - (const uschar *)pattern;
1950 return NULL;
1951 }
1952
1953 /* If the anchored option was not passed, set flag if we can determine that it
1954 is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
1955 we can determine what the first character has to be, because that speeds up
1956 unanchored matches no end. In the case of multiline matches, an alternative is
1957 to set the PCRE_STARTLINE flag if all branches start with ^. */
1958
1959 if ((options & PCRE_ANCHORED) == 0)
1960 {
1961 if (is_anchored(re->code, (options & PCRE_MULTILINE) != 0))
1962 re->options |= PCRE_ANCHORED;
1963 else
1964 {
1965 int ch = find_firstchar(re->code);
1966 if (ch >= 0)
1967 {
1968 re->first_char = ch;
1969 re->options |= PCRE_FIRSTSET;
1970 }
1971 else if (is_startline(re->code))
1972 re->options |= PCRE_STARTLINE;
1973 }
1974 }
1975
1976 /* Print out the compiled data for debugging */
1977
1978 #ifdef DEBUG
1979
1980 printf("Length = %d top_bracket = %d top_backref=%d\n",
1981 length, re->top_bracket, re->top_backref);
1982
1983 if (re->options != 0)
1984 {
1985 printf("%s%s%s%s%s%s%s\n",
1986 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
1987 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
1988 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
1989 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
1990 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
1991 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
1992 ((re->options & PCRE_EXTRA) != 0)? "extra " : "");
1993 }
1994
1995 if ((re->options & PCRE_FIRSTSET) != 0)
1996 {
1997 if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
1998 else printf("First char = \\x%02x\n", re->first_char);
1999 }
2000
2001 code_end = code;
2002 code_base = code = re->code;
2003
2004 while (code < code_end)
2005 {
2006 int charlength;
2007
2008 printf("%3d ", code - code_base);
2009
2010 if (*code >= OP_BRA)
2011 {
2012 printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
2013 code += 2;
2014 }
2015
2016 else switch(*code)
2017 {
2018 case OP_CHARS:
2019 charlength = *(++code);
2020 printf("%3d ", charlength);
2021 while (charlength-- > 0)
2022 if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2023 break;
2024
2025 case OP_KETRMAX:
2026 case OP_KETRMIN:
2027 case OP_ALT:
2028 case OP_KET:
2029 case OP_ASSERT:
2030 case OP_ASSERT_NOT:
2031 case OP_ONCE:
2032 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2033 code += 2;
2034 break;
2035
2036 case OP_STAR:
2037 case OP_MINSTAR:
2038 case OP_PLUS:
2039 case OP_MINPLUS:
2040 case OP_QUERY:
2041 case OP_MINQUERY:
2042 case OP_TYPESTAR:
2043 case OP_TYPEMINSTAR:
2044 case OP_TYPEPLUS:
2045 case OP_TYPEMINPLUS:
2046 case OP_TYPEQUERY:
2047 case OP_TYPEMINQUERY:
2048 if (*code >= OP_TYPESTAR)
2049 printf(" %s", OP_names[code[1]]);
2050 else if (isprint(c = code[1])) printf(" %c", c);
2051 else printf(" \\x%02x", c);
2052 printf("%s", OP_names[*code++]);
2053 break;
2054
2055 case OP_EXACT:
2056 case OP_UPTO:
2057 case OP_MINUPTO:
2058 if (isprint(c = code[3])) printf(" %c{", c);
2059 else printf(" \\x%02x{", c);
2060 if (*code != OP_EXACT) printf("0,");
2061 printf("%d}", (code[1] << 8) + code[2]);
2062 if (*code == OP_MINUPTO) printf("?");
2063 code += 3;
2064 break;
2065
2066 case OP_TYPEEXACT:
2067 case OP_TYPEUPTO:
2068 case OP_TYPEMINUPTO:
2069 printf(" %s{", OP_names[code[3]]);
2070 if (*code != OP_TYPEEXACT) printf(",");
2071 printf("%d}", (code[1] << 8) + code[2]);
2072 if (*code == OP_TYPEMINUPTO) printf("?");
2073 code += 3;
2074 break;
2075
2076 case OP_NOT:
2077 if (isprint(c = *(++code))) printf(" [^%c]", c);
2078 else printf(" [^\\x%02x]", c);
2079 break;
2080
2081 case OP_NOTSTAR:
2082 case OP_NOTMINSTAR:
2083 case OP_NOTPLUS:
2084 case OP_NOTMINPLUS:
2085 case OP_NOTQUERY:
2086 case OP_NOTMINQUERY:
2087 if (isprint(c = code[1])) printf(" [^%c]", c);
2088 else printf(" [^\\x%02x]", c);
2089 printf("%s", OP_names[*code++]);
2090 break;
2091
2092 case OP_NOTEXACT:
2093 case OP_NOTUPTO:
2094 case OP_NOTMINUPTO:
2095 if (isprint(c = code[3])) printf(" [^%c]{", c);
2096 else printf(" [^\\x%02x]{", c);
2097 if (*code != OP_NOTEXACT) printf(",");
2098 printf("%d}", (code[1] << 8) + code[2]);
2099 if (*code == OP_NOTMINUPTO) printf("?");
2100 code += 3;
2101 break;
2102
2103 case OP_REF:
2104 printf(" \\%d", *(++code));
2105 code ++;
2106 goto CLASS_REF_REPEAT;
2107
2108 case OP_CLASS:
2109 case OP_NEGCLASS:
2110 {
2111 int i, min, max;
2112
2113 if (*code++ == OP_CLASS) printf(" [");
2114 else printf(" ^[");
2115
2116 for (i = 0; i < 256; i++)
2117 {
2118 if ((code[i/8] & (1 << (i&7))) != 0)
2119 {
2120 int j;
2121 for (j = i+1; j < 256; j++)
2122 if ((code[j/8] & (1 << (j&7))) == 0) break;
2123 if (i == '-' || i == ']') printf("\\");
2124 if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2125 if (--j > i)
2126 {
2127 printf("-");
2128 if (j == '-' || j == ']') printf("\\");
2129 if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2130 }
2131 i = j;
2132 }
2133 }
2134 printf("]");
2135 code += 32;
2136
2137 CLASS_REF_REPEAT:
2138
2139 switch(*code)
2140 {
2141 case OP_CRSTAR:
2142 case OP_CRMINSTAR:
2143 case OP_CRPLUS:
2144 case OP_CRMINPLUS:
2145 case OP_CRQUERY:
2146 case OP_CRMINQUERY:
2147 printf("%s", OP_names[*code]);
2148 break;
2149
2150 case OP_CRRANGE:
2151 case OP_CRMINRANGE:
2152 min = (code[1] << 8) + code[2];
2153 max = (code[3] << 8) + code[4];
2154 if (max == 0) printf("{%d,}", min);
2155 else printf("{%d,%d}", min, max);
2156 if (*code == OP_CRMINRANGE) printf("?");
2157 code += 4;
2158 break;
2159
2160 default:
2161 code--;
2162 }
2163 }
2164 break;
2165
2166 /* Anything else is just a one-node item */
2167
2168 default:
2169 printf(" %s", OP_names[*code]);
2170 break;
2171 }
2172
2173 code++;
2174 printf("\n");
2175 }
2176 printf("------------------------------------------------------------------\n");
2177
2178 /* This check is done here in the debugging case so that the code that
2179 was compiled can be seen. */
2180
2181 if (code - re->code > length)
2182 {
2183 *errorptr = ERR23;
2184 (pcre_free)(re);
2185 *erroroffset = ptr - (uschar *)pattern;
2186 return NULL;
2187 }
2188 #endif
2189
2190 return (pcre *)re;
2191 }
2192
2193
2194
2195 /*************************************************
2196 * Match a character type *
2197 *************************************************/
2198
2199 /* Not used in all the places it might be as it's sometimes faster
2200 to put the code inline.
2201
2202 Arguments:
2203 type the character type
2204 c the character
2205 dotall the dotall flag
2206
2207 Returns: TRUE if character is of the type
2208 */
2209
2210 static BOOL
2211 match_type(int type, int c, BOOL dotall)
2212 {
2213
2214 #ifdef DEBUG
2215 if (isprint(c)) printf("matching subject %c against ", c);
2216 else printf("matching subject \\x%02x against ", c);
2217 printf("%s\n", OP_names[type]);
2218 #endif
2219
2220 switch(type)
2221 {
2222 case OP_ANY: return dotall || c != '\n';
2223 case OP_NOT_DIGIT: return (pcre_ctypes[c] & ctype_digit) == 0;
2224 case OP_DIGIT: return (pcre_ctypes[c] & ctype_digit) != 0;
2225 case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;
2226 case OP_WHITESPACE: return (pcre_ctypes[c] & ctype_space) != 0;
2227 case OP_NOT_WORDCHAR: return (pcre_ctypes[c] & ctype_word) == 0;
2228 case OP_WORDCHAR: return (pcre_ctypes[c] & ctype_word) != 0;
2229 }
2230 return FALSE;
2231 }
2232
2233
2234
2235 /*************************************************
2236 * Match a back-reference *
2237 *************************************************/
2238
2239 /* If a back reference hasn't been set, the match fails.
2240
2241 Arguments:
2242 number reference number
2243 eptr points into the subject
2244 length length to be matched
2245 md points to match data block
2246
2247 Returns: TRUE if matched
2248 */
2249
2250 static BOOL
2251 match_ref(int number, register const uschar *eptr, int length, match_data *md)
2252 {
2253 const uschar *p = md->start_subject + md->offset_vector[number];
2254
2255 #ifdef DEBUG
2256 if (eptr >= md->end_subject)
2257 printf("matching subject <null>");
2258 else
2259 {
2260 printf("matching subject ");
2261 pchars(eptr, length, TRUE, md);
2262 }
2263 printf(" against backref ");
2264 pchars(p, length, FALSE, md);
2265 printf("\n");
2266 #endif
2267
2268 /* Always fail if not enough characters left */
2269
2270 if (length > md->end_subject - p) return FALSE;
2271
2272 /* Separate the caselesss case for speed */
2273
2274 if (md->caseless)
2275 { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; }
2276 else
2277 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
2278
2279 return TRUE;
2280 }
2281
2282
2283
2284 /*************************************************
2285 * Match from current position *
2286 *************************************************/
2287
2288 /* On entry ecode points to the first opcode, and eptr to the first character.
2289
2290 Arguments:
2291 eptr pointer in subject
2292 ecode position in code
2293 offset_top current top pointer
2294 md pointer to "static" info for the match
2295
2296 Returns: TRUE if matched
2297 */
2298
2299 static BOOL
2300 match(register const uschar *eptr, register const uschar *ecode, int offset_top,
2301 match_data *md)
2302 {
2303 for (;;)
2304 {
2305 int min, max, ctype;
2306 register int i;
2307 register int c;
2308 BOOL minimize = FALSE;
2309
2310 /* Opening bracket. Check the alternative branches in turn, failing if none
2311 match. We have to set the start offset if required and there is space
2312 in the offset vector so that it is available for subsequent back references
2313 if the bracket matches. However, if the bracket fails, we must put back the
2314 previous value of both offsets in case they were set by a previous copy of
2315 the same bracket. Don't worry about setting the flag for the error case here;
2316 that is handled in the code for KET. */
2317
2318 if ((int)*ecode >= OP_BRA)
2319 {
2320 int number = (*ecode - OP_BRA) << 1;
2321 int save_offset1 = 0, save_offset2 = 0;
2322
2323 DPRINTF(("start bracket %d\n", number/2));
2324
2325 if (number > 0 && number < md->offset_end)
2326 {
2327 save_offset1 = md->offset_vector[number];
2328 save_offset2 = md->offset_vector[number+1];
2329 md->offset_vector[number] = eptr - md->start_subject;
2330
2331 DPRINTF(("saving %d %d\n", save_offset1, save_offset2));
2332 }
2333
2334 /* Recurse for all the alternatives. */
2335
2336 do
2337 {
2338 if (match(eptr, ecode+3, offset_top, md)) return TRUE;
2339 ecode += (ecode[1] << 8) + ecode[2];
2340 }
2341 while (*ecode == OP_ALT);
2342
2343 DPRINTF(("bracket %d failed\n", number/2));
2344
2345 if (number > 0 && number < md->offset_end)
2346 {
2347 md->offset_vector[number] = save_offset1;
2348 md->offset_vector[number+1] = save_offset2;
2349 }
2350
2351 return FALSE;
2352 }
2353
2354 /* Other types of node can be handled by a switch */
2355
2356 switch(*ecode)
2357 {
2358 case OP_END:
2359 md->end_match_ptr = eptr; /* Record where we ended */
2360 md->end_offset_top = offset_top; /* and how many extracts were taken */
2361 return TRUE;
2362
2363 /* The equivalent of Prolog's "cut" - if the rest doesn't match, the
2364 whole thing doesn't match, so we have to get out via a longjmp(). */
2365
2366 case OP_CUT:
2367 if (match(eptr, ecode+1, offset_top, md)) return TRUE;
2368 longjmp(md->fail_env, 1);
2369
2370 /* Assertion brackets. Check the alternative branches in turn - the
2371 matching won't pass the KET for an assertion. If any one branch matches,
2372 the assertion is true. */
2373
2374 case OP_ASSERT:
2375 do
2376 {
2377 if (match(eptr, ecode+3, offset_top, md)) break;
2378 ecode += (ecode[1] << 8) + ecode[2];
2379 }
2380 while (*ecode == OP_ALT);
2381 if (*ecode == OP_KET) return FALSE;
2382
2383 /* Continue from after the assertion, updating the offsets high water
2384 mark, since extracts may have been taken during the assertion. */
2385
2386 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2387 ecode += 3;
2388 offset_top = md->end_offset_top;
2389 continue;
2390
2391 /* Negative assertion: all branches must fail to match */
2392
2393 case OP_ASSERT_NOT:
2394 do
2395 {
2396 if (match(eptr, ecode+3, offset_top, md)) return FALSE;
2397 ecode += (ecode[1] << 8) + ecode[2];
2398 }
2399 while (*ecode == OP_ALT);
2400 ecode += 3;
2401 continue;
2402
2403 /* "Once" brackets are like assertion brackets except that after a match,
2404 the point in the subject string is not moved back. Thus there can never be
2405 a move back into the brackets. Check the alternative branches in turn - the
2406 matching won't pass the KET for this kind of subpattern. If any one branch
2407 matches, we carry on, leaving the subject pointer. */
2408
2409 case OP_ONCE:
2410 do
2411 {
2412 if (match(eptr, ecode+3, offset_top, md)) break;
2413 ecode += (ecode[1] << 8) + ecode[2];
2414 }
2415 while (*ecode == OP_ALT);
2416 if (*ecode == OP_KET) return FALSE;
2417
2418 /* Continue as from after the assertion, updating the offsets high water
2419 mark, since extracts may have been taken. */
2420
2421 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2422 ecode += 3;
2423 offset_top = md->end_offset_top;
2424 eptr = md->end_match_ptr;
2425 continue;
2426
2427 /* An alternation is the end of a branch; scan along to find the end of the
2428 bracketed group and go to there. */
2429
2430 case OP_ALT:
2431 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
2432 break;
2433
2434 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
2435 that it may occur zero times. It may repeat infinitely, or not at all -
2436 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
2437 repeat limits are compiled as a number of copies, with the optional ones
2438 preceded by BRAZERO or BRAMINZERO. */
2439
2440 case OP_BRAZERO:
2441 {
2442 const uschar *next = ecode+1;
2443 if (match(eptr, next, offset_top, md)) return TRUE;
2444 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
2445 ecode = next + 3;
2446 }
2447 break;
2448
2449 case OP_BRAMINZERO:
2450 {
2451 const uschar *next = ecode+1;
2452 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
2453 if (match(eptr, next+3, offset_top, md)) return TRUE;
2454 ecode++;
2455 }
2456 break;;
2457
2458 /* End of a group, repeated or non-repeating. If we are at the end of
2459 an assertion "group", stop matching and return TRUE, but record the
2460 current high water mark for use by positive assertions. */
2461
2462 case OP_KET:
2463 case OP_KETRMIN:
2464 case OP_KETRMAX:
2465 {
2466 int number;
2467 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
2468
2469 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || *prev == OP_ONCE)
2470 {
2471 md->end_match_ptr = eptr; /* For ONCE */
2472 md->end_offset_top = offset_top;
2473 return TRUE;
2474 }
2475
2476 /* In all other cases we have to check the group number back at the
2477 start and if necessary complete handling an extraction by setting the
2478 final offset and bumping the high water mark. */
2479
2480 number = (*prev - OP_BRA) << 1;
2481
2482 DPRINTF(("end bracket %d\n", number/2));
2483
2484 if (number > 0)
2485 {
2486 if (number >= md->offset_end) md->offset_overflow = TRUE; else
2487 {
2488 md->offset_vector[number+1] = eptr - md->start_subject;
2489 if (offset_top <= number) offset_top = number + 2;
2490 }
2491 }
2492
2493 /* For a non-repeating ket, just advance to the next node and continue at
2494 this level. */
2495
2496 if (*ecode == OP_KET)
2497 {
2498 ecode += 3;
2499 break;
2500 }
2501
2502 /* The repeating kets try the rest of the pattern or restart from the
2503 preceding bracket, in the appropriate order. */
2504
2505 if (*ecode == OP_KETRMIN)
2506 {
2507 if (match(eptr, ecode+3, offset_top, md) ||
2508 match(eptr, prev, offset_top, md)) return TRUE;
2509 }
2510 else /* OP_KETRMAX */
2511 {
2512 if (match(eptr, prev, offset_top, md) ||
2513 match(eptr, ecode+3, offset_top, md)) return TRUE;
2514 }
2515 }
2516 return FALSE;
2517
2518 /* Start of subject unless notbol, or after internal newline if multiline */
2519
2520 case OP_CIRC:
2521 if (md->notbol && eptr == md->start_subject) return FALSE;
2522 if (md->multiline)
2523 {
2524 if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;
2525 ecode++;
2526 break;
2527 }
2528 /* ... else fall through */
2529
2530 /* Start of subject assertion */
2531
2532 case OP_SOD:
2533 if (eptr != md->start_subject) return FALSE;
2534 ecode++;
2535 break;
2536
2537 /* Assert before internal newline if multiline, or before
2538 a terminating newline unless endonly is set, else end of subject unless
2539 noteol is set. */
2540
2541 case OP_DOLL:
2542 if (md->noteol && eptr >= md->end_subject) return FALSE;
2543 if (md->multiline)
2544 {
2545 if (eptr < md->end_subject && *eptr != '\n') return FALSE;
2546 ecode++;
2547 break;
2548 }
2549 else if (!md->endonly)
2550 {
2551 if (eptr < md->end_subject - 1 ||
2552 (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
2553 ecode++;
2554 break;
2555 }
2556 /* ... else fall through */
2557
2558 /* End of subject assertion */
2559
2560 case OP_EOD:
2561 if (eptr < md->end_subject) return FALSE;
2562 ecode++;
2563 break;
2564
2565 /* Word boundary assertions */
2566
2567 case OP_NOT_WORD_BOUNDARY:
2568 case OP_WORD_BOUNDARY:
2569 {
2570 BOOL prev_is_word = (eptr != md->start_subject) &&
2571 ((pcre_ctypes[eptr[-1]] & ctype_word) != 0);
2572 BOOL cur_is_word = (eptr < md->end_subject) &&
2573 ((pcre_ctypes[*eptr] & ctype_word) != 0);
2574 if ((*ecode++ == OP_WORD_BOUNDARY)?
2575 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2576 return FALSE;
2577 }
2578 break;
2579
2580 /* Match a single character type; inline for speed */
2581
2582 case OP_ANY:
2583 if (!md->dotall && eptr < md->end_subject && *eptr == '\n') return FALSE;
2584 if (eptr++ >= md->end_subject) return FALSE;
2585 ecode++;
2586 break;
2587
2588 case OP_NOT_DIGIT:
2589 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) != 0)
2590 return FALSE;
2591 ecode++;
2592 break;
2593
2594 case OP_DIGIT:
2595 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) == 0)
2596 return FALSE;
2597 ecode++;
2598 break;
2599
2600 case OP_NOT_WHITESPACE:
2601 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) != 0)
2602 return FALSE;
2603 ecode++;
2604 break;
2605
2606 case OP_WHITESPACE:
2607 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) == 0)
2608 return FALSE;
2609 ecode++;
2610 break;
2611
2612 case OP_NOT_WORDCHAR:
2613 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) != 0)
2614 return FALSE;
2615 ecode++;
2616 break;
2617
2618 case OP_WORDCHAR:
2619 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) == 0)
2620 return FALSE;
2621 ecode++;
2622 break;
2623
2624 /* Match a back reference, possibly repeatedly. Look past the end of the
2625 item to see if there is repeat information following. The code is similar
2626 to that for character classes, but repeated for efficiency. Then obey
2627 similar code to character type repeats - written out again for speed.
2628 However, if the referenced string is the empty string, always treat
2629 it as matched, any number of times (otherwise there could be infinite
2630 loops). */
2631
2632 case OP_REF:
2633 {
2634 int length;
2635 int number = ecode[1] << 1; /* Doubled reference number */
2636 ecode += 2; /* Advance past the item */
2637
2638 if (number >= offset_top || md->offset_vector[number] < 0)
2639 {
2640 md->errorcode = PCRE_ERROR_BADREF;
2641 return FALSE;
2642 }
2643
2644 length = md->offset_vector[number+1] - md->offset_vector[number];
2645
2646 switch (*ecode)
2647 {
2648 case OP_CRSTAR:
2649 case OP_CRMINSTAR:
2650 case OP_CRPLUS:
2651 case OP_CRMINPLUS:
2652 case OP_CRQUERY:
2653 case OP_CRMINQUERY:
2654 c = *ecode++ - OP_CRSTAR;
2655 minimize = (c & 1) != 0;
2656 min = rep_min[c]; /* Pick up values from tables; */
2657 max = rep_max[c]; /* zero for max => infinity */
2658 if (max == 0) max = INT_MAX;
2659 break;
2660
2661 case OP_CRRANGE:
2662 case OP_CRMINRANGE:
2663 minimize = (*ecode == OP_CRMINRANGE);
2664 min = (ecode[1] << 8) + ecode[2];
2665 max = (ecode[3] << 8) + ecode[4];
2666 if (max == 0) max = INT_MAX;
2667 ecode += 5;
2668 break;
2669
2670 default: /* No repeat follows */
2671 if (!match_ref(number, eptr, length, md)) return FALSE;
2672 eptr += length;
2673 continue; /* With the main loop */
2674 }
2675
2676 /* If the length of the reference is zero, just continue with the
2677 main loop. */
2678
2679 if (length == 0) continue;
2680
2681 /* First, ensure the minimum number of matches are present. We get back
2682 the length of the reference string explicitly rather than passing the
2683 address of eptr, so that eptr can be a register variable. */
2684
2685 for (i = 1; i <= min; i++)
2686 {
2687 if (!match_ref(number, eptr, length, md)) return FALSE;
2688 eptr += length;
2689 }
2690
2691 /* If min = max, continue at the same level without recursion.
2692 They are not both allowed to be zero. */
2693
2694 if (min == max) continue;
2695
2696 /* If minimizing, keep trying and advancing the pointer */
2697
2698 if (minimize)
2699 {
2700 for (i = min;; i++)
2701 {
2702 if (match(eptr, ecode, offset_top, md)) return TRUE;
2703 if (i >= max || !match_ref(number, eptr, length, md))
2704 return FALSE;
2705 eptr += length;
2706 }
2707 /* Control never gets here */
2708 }
2709
2710 /* If maximizing, find the longest string and work backwards */
2711
2712 else
2713 {
2714 const uschar *pp = eptr;
2715 for (i = min; i < max; i++)
2716 {
2717 if (!match_ref(number, eptr, length, md)) break;
2718 eptr += length;
2719 }
2720 while (eptr >= pp)
2721 {
2722 if (match(eptr, ecode, offset_top, md)) return TRUE;
2723 eptr -= length;
2724 }
2725 return FALSE;
2726 }
2727 }
2728 /* Control never gets here */
2729
2730 /* Match a character class, possibly repeatedly. Look past the end of the
2731 item to see if there is repeat information following. Then obey similar
2732 code to character type repeats - written out again for speed. If caseless
2733 matching was set at runtime but not at compile time, we have to check both
2734 versions of a character, and we have to behave differently for positive and
2735 negative classes. This is the only time where OP_CLASS and OP_NEGCLASS are
2736 treated differently. */
2737
2738 case OP_CLASS:
2739 case OP_NEGCLASS:
2740 {
2741 BOOL nasty_case = *ecode == OP_NEGCLASS && md->runtime_caseless;
2742 const uschar *data = ecode + 1; /* Save for matching */
2743 ecode += 33; /* Advance past the item */
2744
2745 switch (*ecode)
2746 {
2747 case OP_CRSTAR:
2748 case OP_CRMINSTAR:
2749 case OP_CRPLUS:
2750 case OP_CRMINPLUS:
2751 case OP_CRQUERY:
2752 case OP_CRMINQUERY:
2753 c = *ecode++ - OP_CRSTAR;
2754 minimize = (c & 1) != 0;
2755 min = rep_min[c]; /* Pick up values from tables; */
2756 max = rep_max[c]; /* zero for max => infinity */
2757 if (max == 0) max = INT_MAX;
2758 break;
2759
2760 case OP_CRRANGE:
2761 case OP_CRMINRANGE:
2762 minimize = (*ecode == OP_CRMINRANGE);
2763 min = (ecode[1] << 8) + ecode[2];
2764 max = (ecode[3] << 8) + ecode[4];
2765 if (max == 0) max = INT_MAX;
2766 ecode += 5;
2767 break;
2768
2769 default: /* No repeat follows */
2770 min = max = 1;
2771 break;
2772 }
2773
2774 /* First, ensure the minimum number of matches are present. */
2775
2776 for (i = 1; i <= min; i++)
2777 {
2778 if (eptr >= md->end_subject) return FALSE;
2779 c = *eptr++;
2780
2781 /* Either not runtime caseless, or it was a positive class. For
2782 runtime caseless, continue if either case is in the map. */
2783
2784 if (!nasty_case)
2785 {
2786 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2787 if (md->runtime_caseless)
2788 {
2789 c = pcre_fcc[c];
2790 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2791 }
2792 }
2793
2794 /* Runtime caseless and it was a negative class. Continue only if
2795 both cases are in the map. */
2796
2797 else
2798 {
2799 if ((data[c/8] & (1 << (c&7))) == 0) return FALSE;
2800 c = pcre_fcc[c];
2801 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2802 }
2803
2804 return FALSE;
2805 }
2806
2807 /* If max == min we can continue with the main loop without the
2808 need to recurse. */
2809
2810 if (min == max) continue;
2811
2812 /* If minimizing, keep testing the rest of the expression and advancing
2813 the pointer while it matches the class. */
2814
2815 if (minimize)
2816 {
2817 for (i = min;; i++)
2818 {
2819 if (match(eptr, ecode, offset_top, md)) return TRUE;
2820 if (i >= max || eptr >= md->end_subject) return FALSE;
2821 c = *eptr++;
2822
2823 /* Either not runtime caseless, or it was a positive class. For
2824 runtime caseless, continue if either case is in the map. */
2825
2826 if (!nasty_case)
2827 {
2828 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2829 if (md->runtime_caseless)
2830 {
2831 c = pcre_fcc[c];
2832 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2833 }
2834 }
2835
2836 /* Runtime caseless and it was a negative class. Continue only if
2837 both cases are in the map. */
2838
2839 else
2840 {
2841 if ((data[c/8] & (1 << (c&7))) == 0) return FALSE;
2842 c = pcre_fcc[c];
2843 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2844 }
2845
2846 return FALSE;
2847 }
2848 /* Control never gets here */
2849 }
2850
2851 /* If maximizing, find the longest possible run, then work backwards. */
2852
2853 else
2854 {
2855 const uschar *pp = eptr;
2856 for (i = min; i < max; eptr++, i++)
2857 {
2858 if (eptr >= md->end_subject) break;
2859 c = *eptr;
2860
2861 /* Either not runtime caseless, or it was a positive class. For
2862 runtime caseless, continue if either case is in the map. */
2863
2864 if (!nasty_case)
2865 {
2866 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2867 if (md->runtime_caseless)
2868 {
2869 c = pcre_fcc[c];
2870 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2871 }
2872 }
2873
2874 /* Runtime caseless and it was a negative class. Continue only if
2875 both cases are in the map. */
2876
2877 else
2878 {
2879 if ((data[c/8] & (1 << (c&7))) == 0) break;
2880 c = pcre_fcc[c];
2881 if ((data[c/8] & (1 << (c&7))) != 0) continue;
2882 }
2883
2884 break;
2885 }
2886
2887 while (eptr >= pp)
2888 if (match(eptr--, ecode, offset_top, md)) return TRUE;
2889 return FALSE;
2890 }
2891 }
2892 /* Control never gets here */
2893
2894 /* Match a run of characters */
2895
2896 case OP_CHARS:
2897 {
2898 register int length = ecode[1];
2899 ecode += 2;
2900
2901 #ifdef DEBUG /* Sigh. Some compilers never learn. */
2902 if (eptr >= md->end_subject)
2903 printf("matching subject <null> against pattern ");
2904 else
2905 {
2906 printf("matching subject ");
2907 pchars(eptr, length, TRUE, md);
2908 printf(" against pattern ");
2909 }
2910 pchars(ecode, length, FALSE, md);
2911 printf("\n");
2912 #endif
2913
2914 if (length > md->end_subject - eptr) return FALSE;
2915 if (md->caseless)
2916 {
2917 while (length-- > 0) if (pcre_lcc[*ecode++] != pcre_lcc[*eptr++]) return FALSE;
2918 }
2919 else
2920 {
2921 while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
2922 }
2923 }
2924 break;
2925
2926 /* Match a single character repeatedly; different opcodes share code. */
2927
2928 case OP_EXACT:
2929 min = max = (ecode[1] << 8) + ecode[2];
2930 ecode += 3;
2931 goto REPEATCHAR;
2932
2933 case OP_UPTO:
2934 case OP_MINUPTO:
2935 min = 0;
2936 max = (ecode[1] << 8) + ecode[2];
2937 minimize = *ecode == OP_MINUPTO;
2938 ecode += 3;
2939 goto REPEATCHAR;
2940
2941 case OP_STAR:
2942 case OP_MINSTAR:
2943 case OP_PLUS:
2944 case OP_MINPLUS:
2945 case OP_QUERY:
2946 case OP_MINQUERY:
2947 c = *ecode++ - OP_STAR;
2948 minimize = (c & 1) != 0;
2949 min = rep_min[c]; /* Pick up values from tables; */
2950 max = rep_max[c]; /* zero for max => infinity */
2951 if (max == 0) max = INT_MAX;
2952
2953 /* Common code for all repeated single-character matches. We can give
2954 up quickly if there are fewer than the minimum number of characters left in
2955 the subject. */
2956
2957 REPEATCHAR:
2958 if (min > md->end_subject - eptr) return FALSE;
2959 c = *ecode++;
2960
2961 /* The code is duplicated for the caseless and caseful cases, for speed,
2962 since matching characters is likely to be quite common. First, ensure the
2963 minimum number of matches are present. If min = max, continue at the same
2964 level without recursing. Otherwise, if minimizing, keep trying the rest of
2965 the expression and advancing one matching character if failing, up to the
2966 maximum. Alternatively, if maximizing, find the maximum number of
2967 characters and work backwards. */
2968
2969 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
2970 max, eptr));
2971
2972 if (md->caseless)
2973 {
2974 c = pcre_lcc[c];
2975 for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) return FALSE;
2976 if (min == max) continue;
2977 if (minimize)
2978 {
2979 for (i = min;; i++)
2980 {
2981 if (match(eptr, ecode, offset_top, md)) return TRUE;
2982 if (i >= max || eptr >= md->end_subject || c != pcre_lcc[*eptr++])
2983 return FALSE;
2984 }
2985 /* Control never gets here */
2986 }
2987 else
2988 {
2989 const uschar *pp = eptr;
2990 for (i = min; i < max; i++)
2991 {
2992 if (eptr >= md->end_subject || c != pcre_lcc[*eptr]) break;
2993 eptr++;
2994 }
2995 while (eptr >= pp)
2996 if (match(eptr--, ecode, offset_top, md)) return TRUE;
2997 return FALSE;
2998 }
2999 /* Control never gets here */
3000 }
3001
3002 /* Caseful comparisons */
3003
3004 else
3005 {
3006 for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
3007 if (min == max) continue;
3008 if (minimize)
3009 {
3010 for (i = min;; i++)
3011 {
3012 if (match(eptr, ecode, offset_top, md)) return TRUE;
3013 if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
3014 }
3015 /* Control never gets here */
3016 }
3017 else
3018 {
3019 const uschar *pp = eptr;
3020 for (i = min; i < max; i++)
3021 {
3022 if (eptr >= md->end_subject || c != *eptr) break;
3023 eptr++;
3024 }
3025 while (eptr >= pp)
3026 if (match(eptr--, ecode, offset_top, md)) return TRUE;
3027 return FALSE;
3028 }
3029 }
3030 /* Control never gets here */
3031
3032 /* Match a negated single character */
3033
3034 case OP_NOT:
3035 if (eptr >= md->end_subject) return FALSE;
3036 ecode++;
3037 if (md->caseless)
3038 {
3039 if (pcre_lcc[*ecode++] == pcre_lcc[*eptr++]) return FALSE;
3040 }
3041 else
3042 {
3043 if (*ecode++ == *eptr++) return FALSE;
3044 }
3045 break;
3046
3047 /* Match a negated single character repeatedly. This is almost a repeat of
3048 the code for a repeated single character, but I haven't found a nice way of
3049 commoning these up that doesn't require a test of the positive/negative
3050 option for each character match. Maybe that wouldn't add very much to the
3051 time taken, but character matching *is* what this is all about... */
3052
3053 case OP_NOTEXACT:
3054 min = max = (ecode[1] << 8) + ecode[2];
3055 ecode += 3;
3056 goto REPEATNOTCHAR;
3057
3058 case OP_NOTUPTO:
3059 case OP_NOTMINUPTO:
3060 min = 0;
3061 max = (ecode[1] << 8) + ecode[2];
3062 minimize = *ecode == OP_NOTMINUPTO;
3063 ecode += 3;
3064 goto REPEATNOTCHAR;
3065
3066 case OP_NOTSTAR:
3067 case OP_NOTMINSTAR:
3068 case OP_NOTPLUS:
3069 case OP_NOTMINPLUS:
3070 case OP_NOTQUERY:
3071 case OP_NOTMINQUERY:
3072 c = *ecode++ - OP_NOTSTAR;
3073 minimize = (c & 1) != 0;
3074 min = rep_min[c]; /* Pick up values from tables; */
3075 max = rep_max[c]; /* zero for max => infinity */
3076 if (max == 0) max = INT_MAX;
3077
3078 /* Common code for all repeated single-character matches. We can give
3079 up quickly if there are fewer than the minimum number of characters left in
3080 the subject. */
3081
3082 REPEATNOTCHAR:
3083 if (min > md->end_subject - eptr) return FALSE;
3084 c = *ecode++;
3085
3086 /* The code is duplicated for the caseless and caseful cases, for speed,
3087 since matching characters is likely to be quite common. First, ensure the
3088 minimum number of matches are present. If min = max, continue at the same
3089 level without recursing. Otherwise, if minimizing, keep trying the rest of
3090 the expression and advancing one matching character if failing, up to the
3091 maximum. Alternatively, if maximizing, find the maximum number of
3092 characters and work backwards. */
3093
3094 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
3095 max, eptr));
3096
3097 if (md->caseless)
3098 {
3099 c = pcre_lcc[c];
3100 for (i = 1; i <= min; i++) if (c == pcre_lcc[*eptr++]) return FALSE;
3101 if (min == max) continue;
3102 if (minimize)
3103 {
3104 for (i = min;; i++)
3105 {
3106 if (match(eptr, ecode, offset_top, md)) return TRUE;
3107 if (i >= max || eptr >= md->end_subject || c == pcre_lcc[*eptr++])
3108 return FALSE;
3109 }
3110 /* Control never gets here */
3111 }
3112 else
3113 {
3114 const uschar *pp = eptr;
3115 for (i = min; i < max; i++)
3116 {
3117 if (eptr >= md->end_subject || c == pcre_lcc[*eptr]) break;
3118 eptr++;
3119 }
3120 while (eptr >= pp)
3121 if (match(eptr--, ecode, offset_top, md)) return TRUE;
3122 return FALSE;
3123 }
3124 /* Control never gets here */
3125 }
3126
3127 /* Caseful comparisons */
3128
3129 else
3130 {
3131 for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
3132 if (min == max) continue;
3133 if (minimize)
3134 {
3135 for (i = min;; i++)
3136 {
3137 if (match(eptr, ecode, offset_top, md)) return TRUE;
3138 if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
3139 }
3140 /* Control never gets here */
3141 }
3142 else
3143 {
3144 const uschar *pp = eptr;
3145 for (i = min; i < max; i++)
3146 {
3147 if (eptr >= md->end_subject || c == *eptr) break;
3148 eptr++;
3149 }
3150 while (eptr >= pp)
3151 if (match(eptr--, ecode, offset_top, md)) return TRUE;
3152 return FALSE;
3153 }
3154 }
3155 /* Control never gets here */
3156
3157 /* Match a single character type repeatedly; several different opcodes
3158 share code. This is very similar to the code for single characters, but we
3159 repeat it in the interests of efficiency. */
3160
3161 case OP_TYPEEXACT:
3162 min = max = (ecode[1] << 8) + ecode[2];
3163 minimize = TRUE;
3164 ecode += 3;
3165 goto REPEATTYPE;
3166
3167 case OP_TYPEUPTO:
3168 case OP_TYPEMINUPTO:
3169 min = 0;
3170 max = (ecode[1] << 8) + ecode[2];
3171 minimize = *ecode == OP_TYPEMINUPTO;
3172 ecode += 3;
3173 goto REPEATTYPE;
3174
3175 case OP_TYPESTAR:
3176 case OP_TYPEMINSTAR:
3177 case OP_TYPEPLUS:
3178 case OP_TYPEMINPLUS:
3179 case OP_TYPEQUERY:
3180 case OP_TYPEMINQUERY:
3181 c = *ecode++ - OP_TYPESTAR;
3182 minimize = (c & 1) != 0;
3183 min = rep_min[c]; /* Pick up values from tables; */
3184 max = rep_max[c]; /* zero for max => infinity */
3185 if (max == 0) max = INT_MAX;
3186
3187 /* Common code for all repeated single character type matches */
3188
3189 REPEATTYPE:
3190 ctype = *ecode++; /* Code for the character type */
3191
3192 /* First, ensure the minimum number of matches are present. Use inline
3193 code for maximizing the speed, and do the type test once at the start
3194 (i.e. keep it out of the loop). Also test that there are at least the
3195 minimum number of characters before we start. */
3196
3197 if (min > md->end_subject - eptr) return FALSE;
3198 if (min > 0) switch(ctype)
3199 {
3200 case OP_ANY:
3201 if (!md->dotall)
3202 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }
3203 else eptr += min;
3204 break;
3205
3206 case OP_NOT_DIGIT:
3207 for (i = 1; i <= min; i++)
3208 if ((pcre_ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
3209 break;
3210
3211 case OP_DIGIT:
3212 for (i = 1; i <= min; i++)
3213 if ((pcre_ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
3214 break;
3215
3216 case OP_NOT_WHITESPACE:
3217 for (i = 1; i <= min; i++)
3218 if ((pcre_ctypes[*eptr++] & ctype_space) != 0) return FALSE;
3219 break;
3220
3221 case OP_WHITESPACE:
3222 for (i = 1; i <= min; i++)
3223 if ((pcre_ctypes[*eptr++] & ctype_space) == 0) return FALSE;
3224 break;
3225
3226 case OP_NOT_WORDCHAR:
3227 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) != 0)
3228 return FALSE;
3229 break;
3230
3231 case OP_WORDCHAR:
3232 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) == 0)
3233 return FALSE;
3234 break;
3235 }
3236
3237 /* If min = max, continue at the same level without recursing */
3238
3239 if (min == max) continue;
3240
3241 /* If minimizing, we have to test the rest of the pattern before each
3242 subsequent match, so inlining isn't much help; just use the function. */
3243
3244 if (minimize)
3245 {
3246 for (i = min;; i++)
3247 {
3248 if (match(eptr, ecode, offset_top, md)) return TRUE;
3249 if (i >= max || eptr >= md->end_subject ||
3250 !match_type(ctype, *eptr++, md->dotall))
3251 return FALSE;
3252 }
3253 /* Control never gets here */
3254 }
3255
3256 /* If maximizing it is worth using inline code for speed, doing the type
3257 test once at the start (i.e. keep it out of the loop). */
3258
3259 else
3260 {
3261 const uschar *pp = eptr;
3262 switch(ctype)
3263 {
3264 case OP_ANY:
3265 if (!md->dotall)
3266 {
3267 for (i = min; i < max; i++)
3268 {
3269 if (eptr >= md->end_subject || *eptr == '\n') break;
3270 eptr++;
3271 }
3272 }
3273 else
3274 {
3275 c = max - min;
3276 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
3277 eptr += c;
3278 }
3279 break;
3280
3281 case OP_NOT_DIGIT:
3282 for (i = min; i < max; i++)
3283 {
3284 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) != 0)
3285 break;
3286 eptr++;
3287 }
3288 break;
3289
3290 case OP_DIGIT:
3291 for (i = min; i < max; i++)
3292 {
3293 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) == 0)
3294 break;
3295 eptr++;
3296 }
3297 break;
3298
3299 case OP_NOT_WHITESPACE:
3300 for (i = min; i < max; i++)
3301 {
3302 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) != 0)
3303 break;
3304 eptr++;
3305 }
3306 break;
3307
3308 case OP_WHITESPACE:
3309 for (i = min; i < max; i++)
3310 {
3311 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) == 0)
3312 break;
3313 eptr++;
3314 }
3315 break;
3316
3317 case OP_NOT_WORDCHAR:
3318 for (i = min; i < max; i++)
3319 {
3320 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) != 0)
3321 break;
3322 eptr++;
3323 }
3324 break;
3325
3326 case OP_WORDCHAR:
3327 for (i = min; i < max; i++)
3328 {
3329 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) == 0)
3330 break;
3331 eptr++;
3332 }
3333 break;
3334 }
3335
3336 while (eptr >= pp)
3337 if (match(eptr--, ecode, offset_top, md)) return TRUE;
3338 return FALSE;
3339 }
3340 /* Control never gets here */
3341
3342 /* There's been some horrible disaster. */
3343
3344 default:
3345 DPRINTF(("Unknown opcode %d\n", *ecode));
3346 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
3347 return FALSE;
3348 }
3349
3350 /* Do not stick any code in here without much thought; it is assumed
3351 that "continue" in the code above comes out to here to repeat the main
3352 loop. */
3353
3354 } /* End of main loop */
3355 /* Control never reaches here */
3356 }
3357
3358
3359
3360 /*************************************************
3361 * Segregate setjmp() *
3362 *************************************************/
3363
3364 /* The -Wall option of gcc gives warnings for all local variables when setjmp()
3365 is used, even if the coding conforms to the rules of ANSI C. To avoid this, we
3366 hide it in a separate function. This is called only when PCRE_EXTRA is set,
3367 since it's needed only for the extension \X option, and with any luck, a good
3368 compiler will spot the tail recursion and compile it efficiently.
3369
3370 Arguments:
3371 eptr pointer in subject
3372 ecode position in code
3373 offset_top current top pointer
3374 md pointer to "static" info for the match
3375
3376 Returns: TRUE if matched
3377 */
3378
3379 static BOOL
3380 match_with_setjmp(const uschar *eptr, const uschar *ecode, int offset_top,
3381 match_data *match_block)
3382 {
3383 return setjmp(match_block->fail_env) == 0 &&
3384 match(eptr, ecode, offset_top, match_block);
3385 }
3386
3387
3388
3389 /*************************************************
3390 * Execute a Regular Expression *
3391 *************************************************/
3392
3393 /* This function applies a compiled re to a subject string and picks out
3394 portions of the string if it matches. Two elements in the vector are set for
3395 each substring: the offsets to the start and end of the substring.
3396
3397 Arguments:
3398 external_re points to the compiled expression
3399 external_extra points to "hints" from pcre_study() or is NULL
3400 subject points to the subject string
3401 length length of subject string (may contain binary zeros)
3402 options option bits
3403 offsets points to a vector of ints to be filled in with offsets
3404 offsetcount the number of elements in the vector
3405
3406 Returns: > 0 => success; value is the number of elements filled in
3407 = 0 => success, but offsets is not big enough
3408 -1 => failed to match
3409 < -1 => some kind of unexpected problem
3410 */
3411
3412 int
3413 pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
3414 const char *subject, int length, int options, int *offsets, int offsetcount)
3415 {
3416 int resetcount, ocount;
3417 int first_char = -1;
3418 match_data match_block;
3419 const uschar *start_bits = NULL;
3420 const uschar *start_match = (const uschar *)subject;
3421 const uschar *end_subject;
3422 const real_pcre *re = (const real_pcre *)external_re;
3423 const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
3424 BOOL using_temporary_offsets = FALSE;
3425 BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
3426 BOOL startline = (re->options & PCRE_STARTLINE) != 0;
3427
3428 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3429
3430 if (re == NULL || subject == NULL ||
3431 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3432 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
3433
3434 match_block.start_subject = (const uschar *)subject;
3435 match_block.end_subject = match_block.start_subject + length;
3436 end_subject = match_block.end_subject;
3437
3438 match_block.caseless = ((re->options | options) & PCRE_CASELESS) != 0;
3439 match_block.runtime_caseless = match_block.caseless &&
3440 (re->options & PCRE_CASELESS) == 0;
3441
3442 match_block.multiline = ((re->options | options) & PCRE_MULTILINE) != 0;
3443 match_block.dotall = ((re->options | options) & PCRE_DOTALL) != 0;
3444 match_block.endonly = ((re->options | options) & PCRE_DOLLAR_ENDONLY) != 0;
3445
3446 match_block.notbol = (options & PCRE_NOTBOL) != 0;
3447 match_block.noteol = (options & PCRE_NOTEOL) != 0;
3448
3449 match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
3450
3451 /* If the expression has got more back references than the offsets supplied can
3452 hold, we get a temporary bit of working store to use during the matching.
3453 Otherwise, we can use the vector supplied, rounding down its size to a multiple
3454 of 2. */
3455
3456 ocount = offsetcount & (-2);
3457 if (re->top_backref > 0 && re->top_backref >= ocount/2)
3458 {
3459 ocount = re->top_backref * 2 + 2;
3460 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
3461 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
3462 using_temporary_offsets = TRUE;
3463 DPRINTF(("Got memory to hold back references\n"));
3464 }
3465 else match_block.offset_vector = offsets;
3466
3467 match_block.offset_end = ocount;
3468 match_block.offset_overflow = FALSE;
3469
3470 /* Compute the minimum number of offsets that we need to reset each time. Doing
3471 this makes a huge difference to execution time when there aren't many brackets
3472 in the pattern. */
3473
3474 resetcount = 2 + re->top_bracket * 2;
3475 if (resetcount > offsetcount) resetcount = ocount;
3476
3477 /* If MULTILINE is set at exec time but was not set at compile time, and the
3478 anchored flag is set, we must re-check because a setting provoked by ^ in the
3479 pattern is not right in multi-line mode. Calling is_anchored() again here does
3480 the right check, because multiline is now set. If it now yields FALSE, the
3481 expression must have had ^ starting some of its branches. Check to see if
3482 that is true for *all* branches, and if so, set the startline flag. */
3483
3484 if (match_block. multiline && anchored && (re->options & PCRE_MULTILINE) == 0 &&
3485 !is_anchored(re->code, match_block.multiline))
3486 {
3487 anchored = FALSE;
3488 if (is_startline(re->code)) startline = TRUE;
3489 }
3490
3491 /* Set up the first character to match, if available. The first_char value is
3492 never set for an anchored regular expression, but the anchoring may be forced
3493 at run time, so we have to test for anchoring. The first char may be unset for
3494 an unanchored pattern, of course. If there's no first char and the pattern was
3495 studied, the may be a bitmap of possible first characters. However, we can
3496 use this only if the caseless state of the studying was correct. */
3497
3498 if (!anchored)
3499 {
3500 if ((re->options & PCRE_FIRSTSET) != 0)
3501 {
3502 first_char = re->first_char;
3503 if (match_block.caseless) first_char = pcre_lcc[first_char];
3504 }
3505 else
3506 if (!startline && extra != NULL &&
3507 (extra->options & PCRE_STUDY_MAPPED) != 0 &&
3508 ((extra->options & PCRE_STUDY_CASELESS) != 0) == match_block.caseless)
3509 start_bits = extra->start_bits;
3510 }
3511
3512 /* Loop for unanchored matches; for anchored regexps the loop runs just once. */
3513
3514 do
3515 {
3516 int rc;
3517 register int *iptr = match_block.offset_vector;
3518 register int *iend = iptr + resetcount;
3519
3520 /* Reset the maximum number of extractions we might see. */
3521
3522 while (iptr < iend) *iptr++ = -1;
3523
3524 /* Advance to a unique first char if possible */
3525
3526 if (first_char >= 0)
3527 {
3528 if (match_block.caseless)
3529 while (start_match < end_subject && pcre_lcc[*start_match] != first_char)
3530 start_match++;
3531 else
3532 while (start_match < end_subject && *start_match != first_char)
3533 start_match++;
3534 }
3535
3536 /* Or to just after \n for a multiline match if possible */
3537
3538 else if (startline)
3539 {
3540 if (start_match > match_block.start_subject)
3541 {
3542 while (start_match < end_subject && start_match[-1] != '\n')
3543 start_match++;
3544 }
3545 }
3546
3547 /* Or to a non-unique first char */
3548
3549 else if (start_bits != NULL)
3550 {
3551 while (start_match < end_subject)
3552 {
3553 register int c = *start_match;
3554 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
3555 }
3556 }
3557
3558 #ifdef DEBUG /* Sigh. Some compilers never learn. */
3559 printf(">>>> Match against: ");
3560 pchars(start_match, end_subject - start_match, TRUE, &match_block);
3561 printf("\n");
3562 #endif
3563
3564 /* When a match occurs, substrings will be set for all internal extractions;
3565 we just need to set up the whole thing as substring 0 before returning. If
3566 there were too many extractions, set the return code to zero. In the case
3567 where we had to get some local store to hold offsets for backreferences, copy
3568 those back references that we can. In this case there need not be overflow
3569 if certain parts of the pattern were not used.
3570
3571 Before starting the match, we have to set up a longjmp() target to enable
3572 the "cut" operation to fail a match completely without backtracking. This
3573 is done in a separate function to avoid compiler warnings. We need not do
3574 it unless PCRE_EXTRA is set, since only in that case is the "cut" operation
3575 enabled. */
3576
3577 if ((re->options & PCRE_EXTRA) != 0)
3578 {
3579 if (!match_with_setjmp(start_match, re->code, 2, &match_block))
3580 continue;
3581 }
3582 else if (!match(start_match, re->code, 2, &match_block)) continue;
3583
3584 /* Copy the offset information from temporary store if necessary */
3585
3586 if (using_temporary_offsets)
3587 {
3588 if (offsetcount >= 4)
3589 {
3590 memcpy(offsets + 2, match_block.offset_vector + 2,
3591 (offsetcount - 2) * sizeof(int));
3592 DPRINTF(("Copied offsets from temporary memory\n"));
3593 }
3594 if (match_block.end_offset_top > offsetcount)
3595 match_block.offset_overflow = TRUE;
3596
3597 DPRINTF(("Freeing temporary memory\n"));
3598 (pcre_free)(match_block.offset_vector);
3599 }
3600
3601 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
3602
3603 if (match_block.offset_end < 2) rc = 0; else
3604 {
3605 offsets[0] = start_match - match_block.start_subject;
3606 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
3607 }
3608
3609 DPRINTF((">>>> returning %d\n", rc));
3610 return rc;
3611 }
3612 while (!anchored &&
3613 match_block.errorcode == PCRE_ERROR_NOMATCH &&
3614 start_match++ < end_subject);
3615
3616 if (using_temporary_offsets)
3617 {
3618 DPRINTF(("Freeing temporary memory\n"));
3619 (pcre_free)(match_block.offset_vector);
3620 }
3621
3622 DPRINTF((">>>> returning %d\n", match_block.errorcode));
3623
3624 return match_block.errorcode;
3625 }
3626
3627 /* End of pcre.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12