/[pcre]/code/trunk/pcre.c
ViewVC logotype

Contents of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 47 - (show annotations) (download)
Sat Feb 24 21:39:29 2007 UTC (7 years, 9 months ago) by nigel
File MIME type: text/plain
File size: 142941 byte(s)
Load pcre-3.2 into code/trunk.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /*
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
9
10 Written by: Philip Hazel <ph10@cam.ac.uk>
11
12 Copyright (c) 1997-2000 University of Cambridge
13
14 -----------------------------------------------------------------------------
15 Permission is granted to anyone to use this software for any purpose on any
16 computer system, and to redistribute it freely, subject to the following
17 restrictions:
18
19 1. This software is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22
23 2. The origin of this software must not be misrepresented, either by
24 explicit claim or by omission.
25
26 3. Altered versions must be plainly marked as such, and must not be
27 misrepresented as being the original software.
28
29 4. If PCRE is embedded in any software that is released under the GNU
30 General Purpose Licence (GPL), then the terms of that licence shall
31 supersede any condition above with which it is incompatible.
32 -----------------------------------------------------------------------------
33 */
34
35
36 /* Define DEBUG to get debugging output on stdout. */
37
38 /* #define DEBUG */
39
40 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
41 inline, and there are *still* stupid compilers about that don't like indented
42 pre-processor statements. I suppose it's only been 10 years... */
43
44 #ifdef DEBUG
45 #define DPRINTF(p) printf p
46 #else
47 #define DPRINTF(p) /*nothing*/
48 #endif
49
50 /* Include the internals header, which itself includes Standard C headers plus
51 the external pcre header. */
52
53 #include "internal.h"
54
55
56 /* Allow compilation as C++ source code, should anybody want to do that. */
57
58 #ifdef __cplusplus
59 #define class pcre_class
60 #endif
61
62
63 /* Number of items on the nested bracket stacks at compile time. This should
64 not be set greater than 200. */
65
66 #define BRASTACK_SIZE 200
67
68
69 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
70
71 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
72 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
73
74 /* Text forms of OP_ values and things, for debugging (not all used) */
75
76 #ifdef DEBUG
77 static const char *OP_names[] = {
78 "End", "\\A", "\\B", "\\b", "\\D", "\\d",
79 "\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",
80 "Opt", "^", "$", "Any", "chars", "not",
81 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
82 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
83 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
84 "*", "*?", "+", "+?", "?", "??", "{", "{",
85 "class", "Ref", "Recurse",
86 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
87 "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
88 "Brazero", "Braminzero", "Bra"
89 };
90 #endif
91
92 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
93 are simple data values; negative values are for special things like \d and so
94 on. Zero means further processing is needed (for things like \x), or the escape
95 is invalid. */
96
97 static const short int escapes[] = {
98 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
99 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
100 '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
101 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
102 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
103 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
104 '`', 7, -ESC_b, 0, -ESC_d, 27, '\f', 0, /* ` - g */
105 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
106 0, 0, '\r', -ESC_s, '\t', 0, 0, -ESC_w, /* p - w */
107 0, 0, -ESC_z /* x - z */
108 };
109
110 /* Tables of names of POSIX character classes and their lengths. The list is
111 terminated by a zero length entry. The first three must be alpha, upper, lower,
112 as this is assumed for handling case independence. */
113
114 static const char *posix_names[] = {
115 "alpha", "lower", "upper",
116 "alnum", "ascii", "cntrl", "digit", "graph",
117 "print", "punct", "space", "word", "xdigit" };
118
119 static const uschar posix_name_lengths[] = {
120 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
121
122 /* Table of class bit maps for each POSIX class; up to three may be combined
123 to form the class. */
124
125 static const int posix_class_maps[] = {
126 cbit_lower, cbit_upper, -1, /* alpha */
127 cbit_lower, -1, -1, /* lower */
128 cbit_upper, -1, -1, /* upper */
129 cbit_digit, cbit_lower, cbit_upper, /* alnum */
130 cbit_print, cbit_cntrl, -1, /* ascii */
131 cbit_cntrl, -1, -1, /* cntrl */
132 cbit_digit, -1, -1, /* digit */
133 cbit_graph, -1, -1, /* graph */
134 cbit_print, -1, -1, /* print */
135 cbit_punct, -1, -1, /* punct */
136 cbit_space, -1, -1, /* space */
137 cbit_word, -1, -1, /* word */
138 cbit_xdigit,-1, -1 /* xdigit */
139 };
140
141
142 /* Definition to allow mutual recursion */
143
144 static BOOL
145 compile_regex(int, int, int *, uschar **, const uschar **, const char **,
146 BOOL, int, int *, int *, compile_data *);
147
148 /* Structure for building a chain of data that actually lives on the
149 stack, for holding the values of the subject pointer at the start of each
150 subpattern, so as to detect when an empty string has been matched by a
151 subpattern - to break infinite loops. */
152
153 typedef struct eptrblock {
154 struct eptrblock *prev;
155 const uschar *saved_eptr;
156 } eptrblock;
157
158 /* Flag bits for the match() function */
159
160 #define match_condassert 0x01 /* Called to check a condition assertion */
161 #define match_isgroup 0x02 /* Set if start of bracketed group */
162
163
164
165 /*************************************************
166 * Global variables *
167 *************************************************/
168
169 /* PCRE is thread-clean and doesn't use any global variables in the normal
170 sense. However, it calls memory allocation and free functions via the two
171 indirections below, which are can be changed by the caller, but are shared
172 between all threads. */
173
174 void *(*pcre_malloc)(size_t) = malloc;
175 void (*pcre_free)(void *) = free;
176
177
178
179
180 /*************************************************
181 * Default character tables *
182 *************************************************/
183
184 /* A default set of character tables is included in the PCRE binary. Its source
185 is built by the maketables auxiliary program, which uses the default C ctypes
186 functions, and put in the file chartables.c. These tables are used by PCRE
187 whenever the caller of pcre_compile() does not provide an alternate set of
188 tables. */
189
190 #include "chartables.c"
191
192
193
194 /*************************************************
195 * Return version string *
196 *************************************************/
197
198 #define STRING(a) # a
199 #define XSTRING(s) STRING(s)
200
201 const char *
202 pcre_version(void)
203 {
204 return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
205 }
206
207
208
209
210 /*************************************************
211 * (Obsolete) Return info about compiled pattern *
212 *************************************************/
213
214 /* This is the original "info" function. It picks potentially useful data out
215 of the private structure, but its interface was too rigid. It remains for
216 backwards compatibility. The public options are passed back in an int - though
217 the re->options field has been expanded to a long int, all the public options
218 at the low end of it, and so even on 16-bit systems this will still be OK.
219 Therefore, I haven't changed the API for pcre_info().
220
221 Arguments:
222 external_re points to compiled code
223 optptr where to pass back the options
224 first_char where to pass back the first character,
225 or -1 if multiline and all branches start ^,
226 or -2 otherwise
227
228 Returns: number of capturing subpatterns
229 or negative values on error
230 */
231
232 int
233 pcre_info(const pcre *external_re, int *optptr, int *first_char)
234 {
235 const real_pcre *re = (const real_pcre *)external_re;
236 if (re == NULL) return PCRE_ERROR_NULL;
237 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
238 if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
239 if (first_char != NULL)
240 *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
241 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
242 return re->top_bracket;
243 }
244
245
246
247 /*************************************************
248 * Return info about compiled pattern *
249 *************************************************/
250
251 /* This is a newer "info" function which has an extensible interface so
252 that additional items can be added compatibly.
253
254 Arguments:
255 external_re points to compiled code
256 external_study points to study data, or NULL
257 what what information is required
258 where where to put the information
259
260 Returns: 0 if data returned, negative on error
261 */
262
263 int
264 pcre_fullinfo(const pcre *external_re, const pcre_extra *study_data, int what,
265 void *where)
266 {
267 const real_pcre *re = (const real_pcre *)external_re;
268 const real_pcre_extra *study = (const real_pcre_extra *)study_data;
269
270 if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
271 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
272
273 switch (what)
274 {
275 case PCRE_INFO_OPTIONS:
276 *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
277 break;
278
279 case PCRE_INFO_SIZE:
280 *((size_t *)where) = re->size;
281 break;
282
283 case PCRE_INFO_CAPTURECOUNT:
284 *((int *)where) = re->top_bracket;
285 break;
286
287 case PCRE_INFO_BACKREFMAX:
288 *((int *)where) = re->top_backref;
289 break;
290
291 case PCRE_INFO_FIRSTCHAR:
292 *((int *)where) =
293 ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
294 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
295 break;
296
297 case PCRE_INFO_FIRSTTABLE:
298 *((const uschar **)where) =
299 (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
300 study->start_bits : NULL;
301 break;
302
303 case PCRE_INFO_LASTLITERAL:
304 *((int *)where) =
305 ((re->options & PCRE_REQCHSET) != 0)? re->req_char : -1;
306 break;
307
308 default: return PCRE_ERROR_BADOPTION;
309 }
310
311 return 0;
312 }
313
314
315
316 #ifdef DEBUG
317 /*************************************************
318 * Debugging function to print chars *
319 *************************************************/
320
321 /* Print a sequence of chars in printable format, stopping at the end of the
322 subject if the requested.
323
324 Arguments:
325 p points to characters
326 length number to print
327 is_subject TRUE if printing from within md->start_subject
328 md pointer to matching data block, if is_subject is TRUE
329
330 Returns: nothing
331 */
332
333 static void
334 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
335 {
336 int c;
337 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
338 while (length-- > 0)
339 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
340 }
341 #endif
342
343
344
345
346 /*************************************************
347 * Handle escapes *
348 *************************************************/
349
350 /* This function is called when a \ has been encountered. It either returns a
351 positive value for a simple escape such as \n, or a negative value which
352 encodes one of the more complicated things such as \d. On entry, ptr is
353 pointing at the \. On exit, it is on the final character of the escape
354 sequence.
355
356 Arguments:
357 ptrptr points to the pattern position pointer
358 errorptr points to the pointer to the error message
359 bracount number of previous extracting brackets
360 options the options bits
361 isclass TRUE if inside a character class
362 cd pointer to char tables block
363
364 Returns: zero or positive => a data character
365 negative => a special escape sequence
366 on error, errorptr is set
367 */
368
369 static int
370 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
371 int options, BOOL isclass, compile_data *cd)
372 {
373 const uschar *ptr = *ptrptr;
374 int c, i;
375
376 c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
377 if (c == 0) *errorptr = ERR1;
378
379 /* Digits or letters may have special meaning; all others are literals. */
380
381 else if (c < '0' || c > 'z') {}
382
383 /* Do an initial lookup in a table. A non-zero result is something that can be
384 returned immediately. Otherwise further processing may be required. */
385
386 else if ((i = escapes[c - '0']) != 0) c = i;
387
388 /* Escapes that need further processing, or are illegal. */
389
390 else
391 {
392 const uschar *oldptr;
393 switch (c)
394 {
395 /* The handling of escape sequences consisting of a string of digits
396 starting with one that is not zero is not straightforward. By experiment,
397 the way Perl works seems to be as follows:
398
399 Outside a character class, the digits are read as a decimal number. If the
400 number is less than 10, or if there are that many previous extracting
401 left brackets, then it is a back reference. Otherwise, up to three octal
402 digits are read to form an escaped byte. Thus \123 is likely to be octal
403 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
404 value is greater than 377, the least significant 8 bits are taken. Inside a
405 character class, \ followed by a digit is always an octal number. */
406
407 case '1': case '2': case '3': case '4': case '5':
408 case '6': case '7': case '8': case '9':
409
410 if (!isclass)
411 {
412 oldptr = ptr;
413 c -= '0';
414 while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
415 c = c * 10 + *(++ptr) - '0';
416 if (c < 10 || c <= bracount)
417 {
418 c = -(ESC_REF + c);
419 break;
420 }
421 ptr = oldptr; /* Put the pointer back and fall through */
422 }
423
424 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
425 generates a binary zero byte and treats the digit as a following literal.
426 Thus we have to pull back the pointer by one. */
427
428 if ((c = *ptr) >= '8')
429 {
430 ptr--;
431 c = 0;
432 break;
433 }
434
435 /* \0 always starts an octal number, but we may drop through to here with a
436 larger first octal digit */
437
438 case '0':
439 c -= '0';
440 while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
441 ptr[1] != '8' && ptr[1] != '9')
442 c = c * 8 + *(++ptr) - '0';
443 break;
444
445 /* Special escapes not starting with a digit are straightforward */
446
447 case 'x':
448 c = 0;
449 while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
450 {
451 ptr++;
452 c = c * 16 + cd->lcc[*ptr] -
453 (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
454 }
455 break;
456
457 case 'c':
458 c = *(++ptr);
459 if (c == 0)
460 {
461 *errorptr = ERR2;
462 return 0;
463 }
464
465 /* A letter is upper-cased; then the 0x40 bit is flipped */
466
467 if (c >= 'a' && c <= 'z') c = cd->fcc[c];
468 c ^= 0x40;
469 break;
470
471 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
472 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
473 for Perl compatibility, it is a literal. This code looks a bit odd, but
474 there used to be some cases other than the default, and there may be again
475 in future, so I haven't "optimized" it. */
476
477 default:
478 if ((options & PCRE_EXTRA) != 0) switch(c)
479 {
480 default:
481 *errorptr = ERR3;
482 break;
483 }
484 break;
485 }
486 }
487
488 *ptrptr = ptr;
489 return c;
490 }
491
492
493
494 /*************************************************
495 * Check for counted repeat *
496 *************************************************/
497
498 /* This function is called when a '{' is encountered in a place where it might
499 start a quantifier. It looks ahead to see if it really is a quantifier or not.
500 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
501 where the ddds are digits.
502
503 Arguments:
504 p pointer to the first char after '{'
505 cd pointer to char tables block
506
507 Returns: TRUE or FALSE
508 */
509
510 static BOOL
511 is_counted_repeat(const uschar *p, compile_data *cd)
512 {
513 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
514 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
515 if (*p == '}') return TRUE;
516
517 if (*p++ != ',') return FALSE;
518 if (*p == '}') return TRUE;
519
520 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
521 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
522 return (*p == '}');
523 }
524
525
526
527 /*************************************************
528 * Read repeat counts *
529 *************************************************/
530
531 /* Read an item of the form {n,m} and return the values. This is called only
532 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
533 so the syntax is guaranteed to be correct, but we need to check the values.
534
535 Arguments:
536 p pointer to first char after '{'
537 minp pointer to int for min
538 maxp pointer to int for max
539 returned as -1 if no max
540 errorptr points to pointer to error message
541 cd pointer to character tables clock
542
543 Returns: pointer to '}' on success;
544 current ptr on error, with errorptr set
545 */
546
547 static const uschar *
548 read_repeat_counts(const uschar *p, int *minp, int *maxp,
549 const char **errorptr, compile_data *cd)
550 {
551 int min = 0;
552 int max = -1;
553
554 while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
555
556 if (*p == '}') max = min; else
557 {
558 if (*(++p) != '}')
559 {
560 max = 0;
561 while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
562 if (max < min)
563 {
564 *errorptr = ERR4;
565 return p;
566 }
567 }
568 }
569
570 /* Do paranoid checks, then fill in the required variables, and pass back the
571 pointer to the terminating '}'. */
572
573 if (min > 65535 || max > 65535)
574 *errorptr = ERR5;
575 else
576 {
577 *minp = min;
578 *maxp = max;
579 }
580 return p;
581 }
582
583
584
585 /*************************************************
586 * Find the fixed length of a pattern *
587 *************************************************/
588
589 /* Scan a pattern and compute the fixed length of subject that will match it,
590 if the length is fixed. This is needed for dealing with backward assertions.
591
592 Arguments:
593 code points to the start of the pattern (the bracket)
594
595 Returns: the fixed length, or -1 if there is no fixed length
596 */
597
598 static int
599 find_fixedlength(uschar *code)
600 {
601 int length = -1;
602
603 register int branchlength = 0;
604 register uschar *cc = code + 3;
605
606 /* Scan along the opcodes for this branch. If we get to the end of the
607 branch, check the length against that of the other branches. */
608
609 for (;;)
610 {
611 int d;
612 register int op = *cc;
613 if (op >= OP_BRA) op = OP_BRA;
614
615 switch (op)
616 {
617 case OP_BRA:
618 case OP_ONCE:
619 case OP_COND:
620 d = find_fixedlength(cc);
621 if (d < 0) return -1;
622 branchlength += d;
623 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
624 cc += 3;
625 break;
626
627 /* Reached end of a branch; if it's a ket it is the end of a nested
628 call. If it's ALT it is an alternation in a nested call. If it is
629 END it's the end of the outer call. All can be handled by the same code. */
630
631 case OP_ALT:
632 case OP_KET:
633 case OP_KETRMAX:
634 case OP_KETRMIN:
635 case OP_END:
636 if (length < 0) length = branchlength;
637 else if (length != branchlength) return -1;
638 if (*cc != OP_ALT) return length;
639 cc += 3;
640 branchlength = 0;
641 break;
642
643 /* Skip over assertive subpatterns */
644
645 case OP_ASSERT:
646 case OP_ASSERT_NOT:
647 case OP_ASSERTBACK:
648 case OP_ASSERTBACK_NOT:
649 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
650 cc += 3;
651 break;
652
653 /* Skip over things that don't match chars */
654
655 case OP_REVERSE:
656 cc++;
657 /* Fall through */
658
659 case OP_CREF:
660 case OP_OPT:
661 cc++;
662 /* Fall through */
663
664 case OP_SOD:
665 case OP_EOD:
666 case OP_EODN:
667 case OP_CIRC:
668 case OP_DOLL:
669 case OP_NOT_WORD_BOUNDARY:
670 case OP_WORD_BOUNDARY:
671 cc++;
672 break;
673
674 /* Handle char strings */
675
676 case OP_CHARS:
677 branchlength += *(++cc);
678 cc += *cc + 1;
679 break;
680
681 /* Handle exact repetitions */
682
683 case OP_EXACT:
684 case OP_TYPEEXACT:
685 branchlength += (cc[1] << 8) + cc[2];
686 cc += 4;
687 break;
688
689 /* Handle single-char matchers */
690
691 case OP_NOT_DIGIT:
692 case OP_DIGIT:
693 case OP_NOT_WHITESPACE:
694 case OP_WHITESPACE:
695 case OP_NOT_WORDCHAR:
696 case OP_WORDCHAR:
697 case OP_ANY:
698 branchlength++;
699 cc++;
700 break;
701
702
703 /* Check a class for variable quantification */
704
705 case OP_CLASS:
706 cc += (*cc == OP_REF)? 2 : 33;
707
708 switch (*cc)
709 {
710 case OP_CRSTAR:
711 case OP_CRMINSTAR:
712 case OP_CRQUERY:
713 case OP_CRMINQUERY:
714 return -1;
715
716 case OP_CRRANGE:
717 case OP_CRMINRANGE:
718 if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;
719 branchlength += (cc[1] << 8) + cc[2];
720 cc += 5;
721 break;
722
723 default:
724 branchlength++;
725 }
726 break;
727
728 /* Anything else is variable length */
729
730 default:
731 return -1;
732 }
733 }
734 /* Control never gets here */
735 }
736
737
738
739
740 /*************************************************
741 * Check for POSIX class syntax *
742 *************************************************/
743
744 /* This function is called when the sequence "[:" or "[." or "[=" is
745 encountered in a character class. It checks whether this is followed by an
746 optional ^ and then a sequence of letters, terminated by a matching ":]" or
747 ".]" or "=]".
748
749 Argument:
750 ptr pointer to the initial [
751 endptr where to return the end pointer
752 cd pointer to compile data
753
754 Returns: TRUE or FALSE
755 */
756
757 static BOOL
758 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
759 {
760 int terminator; /* Don't combine these lines; the Solaris cc */
761 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
762 if (*(++ptr) == '^') ptr++;
763 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
764 if (*ptr == terminator && ptr[1] == ']')
765 {
766 *endptr = ptr;
767 return TRUE;
768 }
769 return FALSE;
770 }
771
772
773
774
775 /*************************************************
776 * Check POSIX class name *
777 *************************************************/
778
779 /* This function is called to check the name given in a POSIX-style class entry
780 such as [:alnum:].
781
782 Arguments:
783 ptr points to the first letter
784 len the length of the name
785
786 Returns: a value representing the name, or -1 if unknown
787 */
788
789 static int
790 check_posix_name(const uschar *ptr, int len)
791 {
792 register int yield = 0;
793 while (posix_name_lengths[yield] != 0)
794 {
795 if (len == posix_name_lengths[yield] &&
796 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
797 yield++;
798 }
799 return -1;
800 }
801
802
803
804
805 /*************************************************
806 * Compile one branch *
807 *************************************************/
808
809 /* Scan the pattern, compiling it into the code vector.
810
811 Arguments:
812 options the option bits
813 brackets points to number of brackets used
814 code points to the pointer to the current code point
815 ptrptr points to the current pattern pointer
816 errorptr points to pointer to error message
817 optchanged set to the value of the last OP_OPT item compiled
818 reqchar set to the last literal character required, else -1
819 countlits set to count of mandatory literal characters
820 cd contains pointers to tables
821
822 Returns: TRUE on success
823 FALSE, with *errorptr set on error
824 */
825
826 static BOOL
827 compile_branch(int options, int *brackets, uschar **codeptr,
828 const uschar **ptrptr, const char **errorptr, int *optchanged,
829 int *reqchar, int *countlits, compile_data *cd)
830 {
831 int repeat_type, op_type;
832 int repeat_min, repeat_max;
833 int bravalue, length;
834 int greedy_default, greedy_non_default;
835 int prevreqchar;
836 int condcount = 0;
837 int subcountlits = 0;
838 register int c;
839 register uschar *code = *codeptr;
840 uschar *tempcode;
841 const uschar *ptr = *ptrptr;
842 const uschar *tempptr;
843 uschar *previous = NULL;
844 uschar class[32];
845
846 /* Set up the default and non-default settings for greediness */
847
848 greedy_default = ((options & PCRE_UNGREEDY) != 0);
849 greedy_non_default = greedy_default ^ 1;
850
851 /* Initialize no required char, and count of literals */
852
853 *reqchar = prevreqchar = -1;
854 *countlits = 0;
855
856 /* Switch on next character until the end of the branch */
857
858 for (;; ptr++)
859 {
860 BOOL negate_class;
861 int class_charcount;
862 int class_lastchar;
863 int newoptions;
864 int condref;
865 int subreqchar;
866
867 c = *ptr;
868 if ((options & PCRE_EXTENDED) != 0)
869 {
870 if ((cd->ctypes[c] & ctype_space) != 0) continue;
871 if (c == '#')
872 {
873 /* The space before the ; is to avoid a warning on a silly compiler
874 on the Macintosh. */
875 while ((c = *(++ptr)) != 0 && c != '\n') ;
876 continue;
877 }
878 }
879
880 switch(c)
881 {
882 /* The branch terminates at end of string, |, or ). */
883
884 case 0:
885 case '|':
886 case ')':
887 *codeptr = code;
888 *ptrptr = ptr;
889 return TRUE;
890
891 /* Handle single-character metacharacters */
892
893 case '^':
894 previous = NULL;
895 *code++ = OP_CIRC;
896 break;
897
898 case '$':
899 previous = NULL;
900 *code++ = OP_DOLL;
901 break;
902
903 case '.':
904 previous = code;
905 *code++ = OP_ANY;
906 break;
907
908 /* Character classes. These always build a 32-byte bitmap of the permitted
909 characters, except in the special case where there is only one character.
910 For negated classes, we build the map as usual, then invert it at the end.
911 */
912
913 case '[':
914 previous = code;
915 *code++ = OP_CLASS;
916
917 /* If the first character is '^', set the negation flag and skip it. */
918
919 if ((c = *(++ptr)) == '^')
920 {
921 negate_class = TRUE;
922 c = *(++ptr);
923 }
924 else negate_class = FALSE;
925
926 /* Keep a count of chars so that we can optimize the case of just a single
927 character. */
928
929 class_charcount = 0;
930 class_lastchar = -1;
931
932 /* Initialize the 32-char bit map to all zeros. We have to build the
933 map in a temporary bit of store, in case the class contains only 1
934 character, because in that case the compiled code doesn't use the
935 bit map. */
936
937 memset(class, 0, 32 * sizeof(uschar));
938
939 /* Process characters until ] is reached. By writing this as a "do" it
940 means that an initial ] is taken as a data character. */
941
942 do
943 {
944 if (c == 0)
945 {
946 *errorptr = ERR6;
947 goto FAILED;
948 }
949
950 /* Handle POSIX class names. Perl allows a negation extension of the
951 form [:^name]. A square bracket that doesn't match the syntax is
952 treated as a literal. We also recognize the POSIX constructions
953 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
954 5.6 does. */
955
956 if (c == '[' &&
957 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
958 check_posix_syntax(ptr, &tempptr, cd))
959 {
960 BOOL local_negate = FALSE;
961 int posix_class, i;
962 register const uschar *cbits = cd->cbits;
963
964 if (ptr[1] != ':')
965 {
966 *errorptr = ERR31;
967 goto FAILED;
968 }
969
970 ptr += 2;
971 if (*ptr == '^')
972 {
973 local_negate = TRUE;
974 ptr++;
975 }
976
977 posix_class = check_posix_name(ptr, tempptr - ptr);
978 if (posix_class < 0)
979 {
980 *errorptr = ERR30;
981 goto FAILED;
982 }
983
984 /* If matching is caseless, upper and lower are converted to
985 alpha. This relies on the fact that the class table starts with
986 alpha, lower, upper as the first 3 entries. */
987
988 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
989 posix_class = 0;
990
991 /* Or into the map we are building up to 3 of the static class
992 tables, or their negations. */
993
994 posix_class *= 3;
995 for (i = 0; i < 3; i++)
996 {
997 int taboffset = posix_class_maps[posix_class + i];
998 if (taboffset < 0) break;
999 if (local_negate)
1000 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
1001 else
1002 for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
1003 }
1004
1005 ptr = tempptr + 1;
1006 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
1007 continue;
1008 }
1009
1010 /* Backslash may introduce a single character, or it may introduce one
1011 of the specials, which just set a flag. Escaped items are checked for
1012 validity in the pre-compiling pass. The sequence \b is a special case.
1013 Inside a class (and only there) it is treated as backspace. Elsewhere
1014 it marks a word boundary. Other escapes have preset maps ready to
1015 or into the one we are building. We assume they have more than one
1016 character in them, so set class_count bigger than one. */
1017
1018 if (c == '\\')
1019 {
1020 c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1021 if (-c == ESC_b) c = '\b';
1022 else if (c < 0)
1023 {
1024 register const uschar *cbits = cd->cbits;
1025 class_charcount = 10;
1026 switch (-c)
1027 {
1028 case ESC_d:
1029 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
1030 continue;
1031
1032 case ESC_D:
1033 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
1034 continue;
1035
1036 case ESC_w:
1037 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
1038 continue;
1039
1040 case ESC_W:
1041 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
1042 continue;
1043
1044 case ESC_s:
1045 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
1046 continue;
1047
1048 case ESC_S:
1049 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
1050 continue;
1051
1052 default:
1053 *errorptr = ERR7;
1054 goto FAILED;
1055 }
1056 }
1057 /* Fall through if single character */
1058 }
1059
1060 /* A single character may be followed by '-' to form a range. However,
1061 Perl does not permit ']' to be the end of the range. A '-' character
1062 here is treated as a literal. */
1063
1064 if (ptr[1] == '-' && ptr[2] != ']')
1065 {
1066 int d;
1067 ptr += 2;
1068 d = *ptr;
1069
1070 if (d == 0)
1071 {
1072 *errorptr = ERR6;
1073 goto FAILED;
1074 }
1075
1076 /* The second part of a range can be a single-character escape, but
1077 not any of the other escapes. */
1078
1079 if (d == '\\')
1080 {
1081 d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1082 if (d < 0)
1083 {
1084 if (d == -ESC_b) d = '\b'; else
1085 {
1086 *errorptr = ERR7;
1087 goto FAILED;
1088 }
1089 }
1090 }
1091
1092 if (d < c)
1093 {
1094 *errorptr = ERR8;
1095 goto FAILED;
1096 }
1097
1098 for (; c <= d; c++)
1099 {
1100 class[c/8] |= (1 << (c&7));
1101 if ((options & PCRE_CASELESS) != 0)
1102 {
1103 int uc = cd->fcc[c]; /* flip case */
1104 class[uc/8] |= (1 << (uc&7));
1105 }
1106 class_charcount++; /* in case a one-char range */
1107 class_lastchar = c;
1108 }
1109 continue; /* Go get the next char in the class */
1110 }
1111
1112 /* Handle a lone single character - we can get here for a normal
1113 non-escape char, or after \ that introduces a single character. */
1114
1115 class [c/8] |= (1 << (c&7));
1116 if ((options & PCRE_CASELESS) != 0)
1117 {
1118 c = cd->fcc[c]; /* flip case */
1119 class[c/8] |= (1 << (c&7));
1120 }
1121 class_charcount++;
1122 class_lastchar = c;
1123 }
1124
1125 /* Loop until ']' reached; the check for end of string happens inside the
1126 loop. This "while" is the end of the "do" above. */
1127
1128 while ((c = *(++ptr)) != ']');
1129
1130 /* If class_charcount is 1 and class_lastchar is not negative, we saw
1131 precisely one character. This doesn't need the whole 32-byte bit map.
1132 We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
1133 it's negative. */
1134
1135 if (class_charcount == 1 && class_lastchar >= 0)
1136 {
1137 if (negate_class)
1138 {
1139 code[-1] = OP_NOT;
1140 }
1141 else
1142 {
1143 code[-1] = OP_CHARS;
1144 *code++ = 1;
1145 }
1146 *code++ = class_lastchar;
1147 }
1148
1149 /* Otherwise, negate the 32-byte map if necessary, and copy it into
1150 the code vector. */
1151
1152 else
1153 {
1154 if (negate_class)
1155 for (c = 0; c < 32; c++) code[c] = ~class[c];
1156 else
1157 memcpy(code, class, 32);
1158 code += 32;
1159 }
1160 break;
1161
1162 /* Various kinds of repeat */
1163
1164 case '{':
1165 if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
1166 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
1167 if (*errorptr != NULL) goto FAILED;
1168 goto REPEAT;
1169
1170 case '*':
1171 repeat_min = 0;
1172 repeat_max = -1;
1173 goto REPEAT;
1174
1175 case '+':
1176 repeat_min = 1;
1177 repeat_max = -1;
1178 goto REPEAT;
1179
1180 case '?':
1181 repeat_min = 0;
1182 repeat_max = 1;
1183
1184 REPEAT:
1185 if (previous == NULL)
1186 {
1187 *errorptr = ERR9;
1188 goto FAILED;
1189 }
1190
1191 /* If the next character is '?' this is a minimizing repeat, by default,
1192 but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
1193 next character. */
1194
1195 if (ptr[1] == '?')
1196 { repeat_type = greedy_non_default; ptr++; }
1197 else repeat_type = greedy_default;
1198
1199 /* If previous was a string of characters, chop off the last one and use it
1200 as the subject of the repeat. If there was only one character, we can
1201 abolish the previous item altogether. A repeat with a zero minimum wipes
1202 out any reqchar setting, backing up to the previous value. We must also
1203 adjust the countlits value. */
1204
1205 if (*previous == OP_CHARS)
1206 {
1207 int len = previous[1];
1208
1209 if (repeat_min == 0) *reqchar = prevreqchar;
1210 *countlits += repeat_min - 1;
1211
1212 if (len == 1)
1213 {
1214 c = previous[2];
1215 code = previous;
1216 }
1217 else
1218 {
1219 c = previous[len+1];
1220 previous[1]--;
1221 code--;
1222 }
1223 op_type = 0; /* Use single-char op codes */
1224 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
1225 }
1226
1227 /* If previous was a single negated character ([^a] or similar), we use
1228 one of the special opcodes, replacing it. The code is shared with single-
1229 character repeats by adding a suitable offset into repeat_type. */
1230
1231 else if ((int)*previous == OP_NOT)
1232 {
1233 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
1234 c = previous[1];
1235 code = previous;
1236 goto OUTPUT_SINGLE_REPEAT;
1237 }
1238
1239 /* If previous was a character type match (\d or similar), abolish it and
1240 create a suitable repeat item. The code is shared with single-character
1241 repeats by adding a suitable offset into repeat_type. */
1242
1243 else if ((int)*previous < OP_EODN || *previous == OP_ANY)
1244 {
1245 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
1246 c = *previous;
1247 code = previous;
1248
1249 OUTPUT_SINGLE_REPEAT:
1250
1251 /* If the maximum is zero then the minimum must also be zero; Perl allows
1252 this case, so we do too - by simply omitting the item altogether. */
1253
1254 if (repeat_max == 0) goto END_REPEAT;
1255
1256 /* Combine the op_type with the repeat_type */
1257
1258 repeat_type += op_type;
1259
1260 /* A minimum of zero is handled either as the special case * or ?, or as
1261 an UPTO, with the maximum given. */
1262
1263 if (repeat_min == 0)
1264 {
1265 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1266 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1267 else
1268 {
1269 *code++ = OP_UPTO + repeat_type;
1270 *code++ = repeat_max >> 8;
1271 *code++ = (repeat_max & 255);
1272 }
1273 }
1274
1275 /* The case {1,} is handled as the special case + */
1276
1277 else if (repeat_min == 1 && repeat_max == -1)
1278 *code++ = OP_PLUS + repeat_type;
1279
1280 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1281 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1282
1283 else
1284 {
1285 if (repeat_min != 1)
1286 {
1287 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1288 *code++ = repeat_min >> 8;
1289 *code++ = (repeat_min & 255);
1290 }
1291
1292 /* If the mininum is 1 and the previous item was a character string,
1293 we either have to put back the item that got cancelled if the string
1294 length was 1, or add the character back onto the end of a longer
1295 string. For a character type nothing need be done; it will just get
1296 put back naturally. Note that the final character is always going to
1297 get added below. */
1298
1299 else if (*previous == OP_CHARS)
1300 {
1301 if (code == previous) code += 2; else previous[1]++;
1302 }
1303
1304 /* For a single negated character we also have to put back the
1305 item that got cancelled. */
1306
1307 else if (*previous == OP_NOT) code++;
1308
1309 /* If the maximum is unlimited, insert an OP_STAR. */
1310
1311 if (repeat_max < 0)
1312 {
1313 *code++ = c;
1314 *code++ = OP_STAR + repeat_type;
1315 }
1316
1317 /* Else insert an UPTO if the max is greater than the min. */
1318
1319 else if (repeat_max != repeat_min)
1320 {
1321 *code++ = c;
1322 repeat_max -= repeat_min;
1323 *code++ = OP_UPTO + repeat_type;
1324 *code++ = repeat_max >> 8;
1325 *code++ = (repeat_max & 255);
1326 }
1327 }
1328
1329 /* The character or character type itself comes last in all cases. */
1330
1331 *code++ = c;
1332 }
1333
1334 /* If previous was a character class or a back reference, we put the repeat
1335 stuff after it, but just skip the item if the repeat was {0,0}. */
1336
1337 else if (*previous == OP_CLASS || *previous == OP_REF)
1338 {
1339 if (repeat_max == 0)
1340 {
1341 code = previous;
1342 goto END_REPEAT;
1343 }
1344 if (repeat_min == 0 && repeat_max == -1)
1345 *code++ = OP_CRSTAR + repeat_type;
1346 else if (repeat_min == 1 && repeat_max == -1)
1347 *code++ = OP_CRPLUS + repeat_type;
1348 else if (repeat_min == 0 && repeat_max == 1)
1349 *code++ = OP_CRQUERY + repeat_type;
1350 else
1351 {
1352 *code++ = OP_CRRANGE + repeat_type;
1353 *code++ = repeat_min >> 8;
1354 *code++ = repeat_min & 255;
1355 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1356 *code++ = repeat_max >> 8;
1357 *code++ = repeat_max & 255;
1358 }
1359 }
1360
1361 /* If previous was a bracket group, we may have to replicate it in certain
1362 cases. */
1363
1364 else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
1365 (int)*previous == OP_COND)
1366 {
1367 register int i;
1368 int ketoffset = 0;
1369 int len = code - previous;
1370 uschar *bralink = NULL;
1371
1372 /* If the maximum repeat count is unlimited, find the end of the bracket
1373 by scanning through from the start, and compute the offset back to it
1374 from the current code pointer. There may be an OP_OPT setting following
1375 the final KET, so we can't find the end just by going back from the code
1376 pointer. */
1377
1378 if (repeat_max == -1)
1379 {
1380 register uschar *ket = previous;
1381 do ket += (ket[1] << 8) + ket[2]; while (*ket != OP_KET);
1382 ketoffset = code - ket;
1383 }
1384
1385 /* The case of a zero minimum is special because of the need to stick
1386 OP_BRAZERO in front of it, and because the group appears once in the
1387 data, whereas in other cases it appears the minimum number of times. For
1388 this reason, it is simplest to treat this case separately, as otherwise
1389 the code gets far too mess. There are several special subcases when the
1390 minimum is zero. */
1391
1392 if (repeat_min == 0)
1393 {
1394 /* If we set up a required char from the bracket, we must back off
1395 to the previous value and reset the countlits value too. */
1396
1397 if (subcountlits > 0)
1398 {
1399 *reqchar = prevreqchar;
1400 *countlits -= subcountlits;
1401 }
1402
1403 /* If the maximum is also zero, we just omit the group from the output
1404 altogether. */
1405
1406 if (repeat_max == 0)
1407 {
1408 code = previous;
1409 goto END_REPEAT;
1410 }
1411
1412 /* If the maximum is 1 or unlimited, we just have to stick in the
1413 BRAZERO and do no more at this point. */
1414
1415 if (repeat_max <= 1)
1416 {
1417 memmove(previous+1, previous, len);
1418 code++;
1419 *previous++ = OP_BRAZERO + repeat_type;
1420 }
1421
1422 /* If the maximum is greater than 1 and limited, we have to replicate
1423 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1424 The first one has to be handled carefully because it's the original
1425 copy, which has to be moved up. The remainder can be handled by code
1426 that is common with the non-zero minimum case below. We just have to
1427 adjust the value or repeat_max, since one less copy is required. */
1428
1429 else
1430 {
1431 int offset;
1432 memmove(previous+4, previous, len);
1433 code += 4;
1434 *previous++ = OP_BRAZERO + repeat_type;
1435 *previous++ = OP_BRA;
1436
1437 /* We chain together the bracket offset fields that have to be
1438 filled in later when the ends of the brackets are reached. */
1439
1440 offset = (bralink == NULL)? 0 : previous - bralink;
1441 bralink = previous;
1442 *previous++ = offset >> 8;
1443 *previous++ = offset & 255;
1444 }
1445
1446 repeat_max--;
1447 }
1448
1449 /* If the minimum is greater than zero, replicate the group as many
1450 times as necessary, and adjust the maximum to the number of subsequent
1451 copies that we need. */
1452
1453 else
1454 {
1455 for (i = 1; i < repeat_min; i++)
1456 {
1457 memcpy(code, previous, len);
1458 code += len;
1459 }
1460 if (repeat_max > 0) repeat_max -= repeat_min;
1461 }
1462
1463 /* This code is common to both the zero and non-zero minimum cases. If
1464 the maximum is limited, it replicates the group in a nested fashion,
1465 remembering the bracket starts on a stack. In the case of a zero minimum,
1466 the first one was set up above. In all cases the repeat_max now specifies
1467 the number of additional copies needed. */
1468
1469 if (repeat_max >= 0)
1470 {
1471 for (i = repeat_max - 1; i >= 0; i--)
1472 {
1473 *code++ = OP_BRAZERO + repeat_type;
1474
1475 /* All but the final copy start a new nesting, maintaining the
1476 chain of brackets outstanding. */
1477
1478 if (i != 0)
1479 {
1480 int offset;
1481 *code++ = OP_BRA;
1482 offset = (bralink == NULL)? 0 : code - bralink;
1483 bralink = code;
1484 *code++ = offset >> 8;
1485 *code++ = offset & 255;
1486 }
1487
1488 memcpy(code, previous, len);
1489 code += len;
1490 }
1491
1492 /* Now chain through the pending brackets, and fill in their length
1493 fields (which are holding the chain links pro tem). */
1494
1495 while (bralink != NULL)
1496 {
1497 int oldlinkoffset;
1498 int offset = code - bralink + 1;
1499 uschar *bra = code - offset;
1500 oldlinkoffset = (bra[1] << 8) + bra[2];
1501 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
1502 *code++ = OP_KET;
1503 *code++ = bra[1] = offset >> 8;
1504 *code++ = bra[2] = (offset & 255);
1505 }
1506 }
1507
1508 /* If the maximum is unlimited, set a repeater in the final copy. We
1509 can't just offset backwards from the current code point, because we
1510 don't know if there's been an options resetting after the ket. The
1511 correct offset was computed above. */
1512
1513 else code[-ketoffset] = OP_KETRMAX + repeat_type;
1514 }
1515
1516 /* Else there's some kind of shambles */
1517
1518 else
1519 {
1520 *errorptr = ERR11;
1521 goto FAILED;
1522 }
1523
1524 /* In all case we no longer have a previous item. */
1525
1526 END_REPEAT:
1527 previous = NULL;
1528 break;
1529
1530
1531 /* Start of nested bracket sub-expression, or comment or lookahead or
1532 lookbehind or option setting or condition. First deal with special things
1533 that can come after a bracket; all are introduced by ?, and the appearance
1534 of any of them means that this is not a referencing group. They were
1535 checked for validity in the first pass over the string, so we don't have to
1536 check for syntax errors here. */
1537
1538 case '(':
1539 newoptions = options;
1540 condref = -1;
1541
1542 if (*(++ptr) == '?')
1543 {
1544 int set, unset;
1545 int *optset;
1546
1547 switch (*(++ptr))
1548 {
1549 case '#': /* Comment; skip to ket */
1550 ptr++;
1551 while (*ptr != ')') ptr++;
1552 continue;
1553
1554 case ':': /* Non-extracting bracket */
1555 bravalue = OP_BRA;
1556 ptr++;
1557 break;
1558
1559 case '(':
1560 bravalue = OP_COND; /* Conditional group */
1561 if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
1562 {
1563 condref = *ptr - '0';
1564 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
1565 ptr++;
1566 }
1567 else ptr--;
1568 break;
1569
1570 case '=': /* Positive lookahead */
1571 bravalue = OP_ASSERT;
1572 ptr++;
1573 break;
1574
1575 case '!': /* Negative lookahead */
1576 bravalue = OP_ASSERT_NOT;
1577 ptr++;
1578 break;
1579
1580 case '<': /* Lookbehinds */
1581 switch (*(++ptr))
1582 {
1583 case '=': /* Positive lookbehind */
1584 bravalue = OP_ASSERTBACK;
1585 ptr++;
1586 break;
1587
1588 case '!': /* Negative lookbehind */
1589 bravalue = OP_ASSERTBACK_NOT;
1590 ptr++;
1591 break;
1592
1593 default: /* Syntax error */
1594 *errorptr = ERR24;
1595 goto FAILED;
1596 }
1597 break;
1598
1599 case '>': /* One-time brackets */
1600 bravalue = OP_ONCE;
1601 ptr++;
1602 break;
1603
1604 case 'R': /* Pattern recursion */
1605 *code++ = OP_RECURSE;
1606 ptr++;
1607 continue;
1608
1609 default: /* Option setting */
1610 set = unset = 0;
1611 optset = &set;
1612
1613 while (*ptr != ')' && *ptr != ':')
1614 {
1615 switch (*ptr++)
1616 {
1617 case '-': optset = &unset; break;
1618
1619 case 'i': *optset |= PCRE_CASELESS; break;
1620 case 'm': *optset |= PCRE_MULTILINE; break;
1621 case 's': *optset |= PCRE_DOTALL; break;
1622 case 'x': *optset |= PCRE_EXTENDED; break;
1623 case 'U': *optset |= PCRE_UNGREEDY; break;
1624 case 'X': *optset |= PCRE_EXTRA; break;
1625
1626 default:
1627 *errorptr = ERR12;
1628 goto FAILED;
1629 }
1630 }
1631
1632 /* Set up the changed option bits, but don't change anything yet. */
1633
1634 newoptions = (options | set) & (~unset);
1635
1636 /* If the options ended with ')' this is not the start of a nested
1637 group with option changes, so the options change at this level. At top
1638 level there is nothing else to be done (the options will in fact have
1639 been set from the start of compiling as a result of the first pass) but
1640 at an inner level we must compile code to change the ims options if
1641 necessary, and pass the new setting back so that it can be put at the
1642 start of any following branches, and when this group ends, a resetting
1643 item can be compiled. */
1644
1645 if (*ptr == ')')
1646 {
1647 if ((options & PCRE_INGROUP) != 0 &&
1648 (options & PCRE_IMS) != (newoptions & PCRE_IMS))
1649 {
1650 *code++ = OP_OPT;
1651 *code++ = *optchanged = newoptions & PCRE_IMS;
1652 }
1653 options = newoptions; /* Change options at this level */
1654 previous = NULL; /* This item can't be repeated */
1655 continue; /* It is complete */
1656 }
1657
1658 /* If the options ended with ':' we are heading into a nested group
1659 with possible change of options. Such groups are non-capturing and are
1660 not assertions of any kind. All we need to do is skip over the ':';
1661 the newoptions value is handled below. */
1662
1663 bravalue = OP_BRA;
1664 ptr++;
1665 }
1666 }
1667
1668 /* Else we have a referencing group; adjust the opcode. */
1669
1670 else
1671 {
1672 if (++(*brackets) > EXTRACT_MAX)
1673 {
1674 *errorptr = ERR13;
1675 goto FAILED;
1676 }
1677 bravalue = OP_BRA + *brackets;
1678 }
1679
1680 /* Process nested bracketed re. Assertions may not be repeated, but other
1681 kinds can be. We copy code into a non-register variable in order to be able
1682 to pass its address because some compilers complain otherwise. Pass in a
1683 new setting for the ims options if they have changed. */
1684
1685 previous = (bravalue >= OP_ONCE)? code : NULL;
1686 *code = bravalue;
1687 tempcode = code;
1688
1689 if (!compile_regex(
1690 options | PCRE_INGROUP, /* Set for all nested groups */
1691 ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
1692 newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
1693 brackets, /* Bracket level */
1694 &tempcode, /* Where to put code (updated) */
1695 &ptr, /* Input pointer (updated) */
1696 errorptr, /* Where to put an error message */
1697 (bravalue == OP_ASSERTBACK ||
1698 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1699 condref, /* Condition reference number */
1700 &subreqchar, /* For possible last char */
1701 &subcountlits, /* For literal count */
1702 cd)) /* Tables block */
1703 goto FAILED;
1704
1705 /* At the end of compiling, code is still pointing to the start of the
1706 group, while tempcode has been updated to point past the end of the group
1707 and any option resetting that may follow it. The pattern pointer (ptr)
1708 is on the bracket. */
1709
1710 /* If this is a conditional bracket, check that there are no more than
1711 two branches in the group. */
1712
1713 if (bravalue == OP_COND)
1714 {
1715 uschar *tc = code;
1716 condcount = 0;
1717
1718 do {
1719 condcount++;
1720 tc += (tc[1] << 8) | tc[2];
1721 }
1722 while (*tc != OP_KET);
1723
1724 if (condcount > 2)
1725 {
1726 *errorptr = ERR27;
1727 goto FAILED;
1728 }
1729 }
1730
1731 /* Handle updating of the required character. If the subpattern didn't
1732 set one, leave it as it was. Otherwise, update it for normal brackets of
1733 all kinds, forward assertions, and conditions with two branches. Don't
1734 update the literal count for forward assertions, however. If the bracket
1735 is followed by a quantifier with zero repeat, we have to back off. Hence
1736 the definition of prevreqchar and subcountlits outside the main loop so
1737 that they can be accessed for the back off. */
1738
1739 if (subreqchar > 0 &&
1740 (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT ||
1741 (bravalue == OP_COND && condcount == 2)))
1742 {
1743 prevreqchar = *reqchar;
1744 *reqchar = subreqchar;
1745 if (bravalue != OP_ASSERT) *countlits += subcountlits;
1746 }
1747
1748 /* Now update the main code pointer to the end of the group. */
1749
1750 code = tempcode;
1751
1752 /* Error if hit end of pattern */
1753
1754 if (*ptr != ')')
1755 {
1756 *errorptr = ERR14;
1757 goto FAILED;
1758 }
1759 break;
1760
1761 /* Check \ for being a real metacharacter; if not, fall through and handle
1762 it as a data character at the start of a string. Escape items are checked
1763 for validity in the pre-compiling pass. */
1764
1765 case '\\':
1766 tempptr = ptr;
1767 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1768
1769 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1770 are arranged to be the negation of the corresponding OP_values. For the
1771 back references, the values are ESC_REF plus the reference number. Only
1772 back references and those types that consume a character may be repeated.
1773 We can test for values between ESC_b and ESC_Z for the latter; this may
1774 have to change if any new ones are ever created. */
1775
1776 if (c < 0)
1777 {
1778 if (-c >= ESC_REF)
1779 {
1780 previous = code;
1781 *code++ = OP_REF;
1782 *code++ = -c - ESC_REF;
1783 }
1784 else
1785 {
1786 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
1787 *code++ = -c;
1788 }
1789 continue;
1790 }
1791
1792 /* Data character: reset and fall through */
1793
1794 ptr = tempptr;
1795 c = '\\';
1796
1797 /* Handle a run of data characters until a metacharacter is encountered.
1798 The first character is guaranteed not to be whitespace or # when the
1799 extended flag is set. */
1800
1801 NORMAL_CHAR:
1802 default:
1803 previous = code;
1804 *code = OP_CHARS;
1805 code += 2;
1806 length = 0;
1807
1808 do
1809 {
1810 if ((options & PCRE_EXTENDED) != 0)
1811 {
1812 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1813 if (c == '#')
1814 {
1815 /* The space before the ; is to avoid a warning on a silly compiler
1816 on the Macintosh. */
1817 while ((c = *(++ptr)) != 0 && c != '\n') ;
1818 if (c == 0) break;
1819 continue;
1820 }
1821 }
1822
1823 /* Backslash may introduce a data char or a metacharacter. Escaped items
1824 are checked for validity in the pre-compiling pass. Stop the string
1825 before a metaitem. */
1826
1827 if (c == '\\')
1828 {
1829 tempptr = ptr;
1830 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1831 if (c < 0) { ptr = tempptr; break; }
1832 }
1833
1834 /* Ordinary character or single-char escape */
1835
1836 *code++ = c;
1837 length++;
1838 }
1839
1840 /* This "while" is the end of the "do" above. */
1841
1842 while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
1843
1844 /* Update the last character and the count of literals */
1845
1846 prevreqchar = (length > 1)? code[-2] : *reqchar;
1847 *reqchar = code[-1];
1848 *countlits += length;
1849
1850 /* Compute the length and set it in the data vector, and advance to
1851 the next state. */
1852
1853 previous[1] = length;
1854 if (length < 255) ptr--;
1855 break;
1856 }
1857 } /* end of big loop */
1858
1859 /* Control never reaches here by falling through, only by a goto for all the
1860 error states. Pass back the position in the pattern so that it can be displayed
1861 to the user for diagnosing the error. */
1862
1863 FAILED:
1864 *ptrptr = ptr;
1865 return FALSE;
1866 }
1867
1868
1869
1870
1871 /*************************************************
1872 * Compile sequence of alternatives *
1873 *************************************************/
1874
1875 /* On entry, ptr is pointing past the bracket character, but on return
1876 it points to the closing bracket, or vertical bar, or end of string.
1877 The code variable is pointing at the byte into which the BRA operator has been
1878 stored. If the ims options are changed at the start (for a (?ims: group) or
1879 during any branch, we need to insert an OP_OPT item at the start of every
1880 following branch to ensure they get set correctly at run time, and also pass
1881 the new options into every subsequent branch compile.
1882
1883 Argument:
1884 options the option bits
1885 optchanged new ims options to set as if (?ims) were at the start, or -1
1886 for no change
1887 brackets -> int containing the number of extracting brackets used
1888 codeptr -> the address of the current code pointer
1889 ptrptr -> the address of the current pattern pointer
1890 errorptr -> pointer to error message
1891 lookbehind TRUE if this is a lookbehind assertion
1892 condref > 0 for OPT_CREF setting at start of conditional group
1893 reqchar -> place to put the last required character, or a negative number
1894 countlits -> place to put the shortest literal count of any branch
1895 cd points to the data block with tables pointers
1896
1897 Returns: TRUE on success
1898 */
1899
1900 static BOOL
1901 compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
1902 const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,
1903 int *reqchar, int *countlits, compile_data *cd)
1904 {
1905 const uschar *ptr = *ptrptr;
1906 uschar *code = *codeptr;
1907 uschar *last_branch = code;
1908 uschar *start_bracket = code;
1909 uschar *reverse_count = NULL;
1910 int oldoptions = options & PCRE_IMS;
1911 int branchreqchar, branchcountlits;
1912
1913 *reqchar = -1;
1914 *countlits = INT_MAX;
1915 code += 3;
1916
1917 /* At the start of a reference-based conditional group, insert the reference
1918 number as an OP_CREF item. */
1919
1920 if (condref > 0)
1921 {
1922 *code++ = OP_CREF;
1923 *code++ = condref;
1924 }
1925
1926 /* Loop for each alternative branch */
1927
1928 for (;;)
1929 {
1930 int length;
1931
1932 /* Handle change of options */
1933
1934 if (optchanged >= 0)
1935 {
1936 *code++ = OP_OPT;
1937 *code++ = optchanged;
1938 options = (options & ~PCRE_IMS) | optchanged;
1939 }
1940
1941 /* Set up dummy OP_REVERSE if lookbehind assertion */
1942
1943 if (lookbehind)
1944 {
1945 *code++ = OP_REVERSE;
1946 reverse_count = code;
1947 *code++ = 0;
1948 *code++ = 0;
1949 }
1950
1951 /* Now compile the branch */
1952
1953 if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged,
1954 &branchreqchar, &branchcountlits, cd))
1955 {
1956 *ptrptr = ptr;
1957 return FALSE;
1958 }
1959
1960 /* Fill in the length of the last branch */
1961
1962 length = code - last_branch;
1963 last_branch[1] = length >> 8;
1964 last_branch[2] = length & 255;
1965
1966 /* Save the last required character if all branches have the same; a current
1967 value of -1 means unset, while -2 means "previous branch had no last required
1968 char". */
1969
1970 if (*reqchar != -2)
1971 {
1972 if (branchreqchar >= 0)
1973 {
1974 if (*reqchar == -1) *reqchar = branchreqchar;
1975 else if (*reqchar != branchreqchar) *reqchar = -2;
1976 }
1977 else *reqchar = -2;
1978 }
1979
1980 /* Keep the shortest literal count */
1981
1982 if (branchcountlits < *countlits) *countlits = branchcountlits;
1983 DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits));
1984
1985 /* If lookbehind, check that this branch matches a fixed-length string,
1986 and put the length into the OP_REVERSE item. Temporarily mark the end of
1987 the branch with OP_END. */
1988
1989 if (lookbehind)
1990 {
1991 *code = OP_END;
1992 length = find_fixedlength(last_branch);
1993 DPRINTF(("fixed length = %d\n", length));
1994 if (length < 0)
1995 {
1996 *errorptr = ERR25;
1997 *ptrptr = ptr;
1998 return FALSE;
1999 }
2000 reverse_count[0] = (length >> 8);
2001 reverse_count[1] = length & 255;
2002 }
2003
2004 /* Reached end of expression, either ')' or end of pattern. Insert a
2005 terminating ket and the length of the whole bracketed item, and return,
2006 leaving the pointer at the terminating char. If any of the ims options
2007 were changed inside the group, compile a resetting op-code following. */
2008
2009 if (*ptr != '|')
2010 {
2011 length = code - start_bracket;
2012 *code++ = OP_KET;
2013 *code++ = length >> 8;
2014 *code++ = length & 255;
2015 if (optchanged >= 0)
2016 {
2017 *code++ = OP_OPT;
2018 *code++ = oldoptions;
2019 }
2020 *codeptr = code;
2021 *ptrptr = ptr;
2022 return TRUE;
2023 }
2024
2025 /* Another branch follows; insert an "or" node and advance the pointer. */
2026
2027 *code = OP_ALT;
2028 last_branch = code;
2029 code += 3;
2030 ptr++;
2031 }
2032 /* Control never reaches here */
2033 }
2034
2035
2036
2037
2038 /*************************************************
2039 * Find first significant op code *
2040 *************************************************/
2041
2042 /* This is called by several functions that scan a compiled expression looking
2043 for a fixed first character, or an anchoring op code etc. It skips over things
2044 that do not influence this. For one application, a change of caseless option is
2045 important.
2046
2047 Arguments:
2048 code pointer to the start of the group
2049 options pointer to external options
2050 optbit the option bit whose changing is significant, or
2051 zero if none are
2052 optstop TRUE to return on option change, otherwise change the options
2053 value and continue
2054
2055 Returns: pointer to the first significant opcode
2056 */
2057
2058 static const uschar*
2059 first_significant_code(const uschar *code, int *options, int optbit,
2060 BOOL optstop)
2061 {
2062 for (;;)
2063 {
2064 switch ((int)*code)
2065 {
2066 case OP_OPT:
2067 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
2068 {
2069 if (optstop) return code;
2070 *options = (int)code[1];
2071 }
2072 code += 2;
2073 break;
2074
2075 case OP_CREF:
2076 code += 2;
2077 break;
2078
2079 case OP_WORD_BOUNDARY:
2080 case OP_NOT_WORD_BOUNDARY:
2081 code++;
2082 break;
2083
2084 case OP_ASSERT_NOT:
2085 case OP_ASSERTBACK:
2086 case OP_ASSERTBACK_NOT:
2087 do code += (code[1] << 8) + code[2]; while (*code == OP_ALT);
2088 code += 3;
2089 break;
2090
2091 default:
2092 return code;
2093 }
2094 }
2095 /* Control never reaches here */
2096 }
2097
2098
2099
2100
2101 /*************************************************
2102 * Check for anchored expression *
2103 *************************************************/
2104
2105 /* Try to find out if this is an anchored regular expression. Consider each
2106 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2107 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2108 it's anchored. However, if this is a multiline pattern, then only OP_SOD
2109 counts, since OP_CIRC can match in the middle.
2110
2111 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
2112 because that will try the rest of the pattern at all possible matching points,
2113 so there is no point trying them again.
2114
2115 Arguments:
2116 code points to start of expression (the bracket)
2117 options points to the options setting
2118
2119 Returns: TRUE or FALSE
2120 */
2121
2122 static BOOL
2123 is_anchored(register const uschar *code, int *options)
2124 {
2125 do {
2126 const uschar *scode = first_significant_code(code + 3, options,
2127 PCRE_MULTILINE, FALSE);
2128 register int op = *scode;
2129 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2130 { if (!is_anchored(scode, options)) return FALSE; }
2131 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
2132 (*options & PCRE_DOTALL) != 0)
2133 { if (scode[1] != OP_ANY) return FALSE; }
2134 else if (op != OP_SOD &&
2135 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
2136 return FALSE;
2137 code += (code[1] << 8) + code[2];
2138 }
2139 while (*code == OP_ALT);
2140 return TRUE;
2141 }
2142
2143
2144
2145 /*************************************************
2146 * Check for starting with ^ or .* *
2147 *************************************************/
2148
2149 /* This is called to find out if every branch starts with ^ or .* so that
2150 "first char" processing can be done to speed things up in multiline
2151 matching and for non-DOTALL patterns that start with .* (which must start at
2152 the beginning or after \n).
2153
2154 Argument: points to start of expression (the bracket)
2155 Returns: TRUE or FALSE
2156 */
2157
2158 static BOOL
2159 is_startline(const uschar *code)
2160 {
2161 do {
2162 const uschar *scode = first_significant_code(code + 3, NULL, 0, FALSE);
2163 register int op = *scode;
2164 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2165 { if (!is_startline(scode)) return FALSE; }
2166 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2167 { if (scode[1] != OP_ANY) return FALSE; }
2168 else if (op != OP_CIRC) return FALSE;
2169 code += (code[1] << 8) + code[2];
2170 }
2171 while (*code == OP_ALT);
2172 return TRUE;
2173 }
2174
2175
2176
2177 /*************************************************
2178 * Check for fixed first char *
2179 *************************************************/
2180
2181 /* Try to find out if there is a fixed first character. This is called for
2182 unanchored expressions, as it speeds up their processing quite considerably.
2183 Consider each alternative branch. If they all start with the same char, or with
2184 a bracket all of whose alternatives start with the same char (recurse ad lib),
2185 then we return that char, otherwise -1.
2186
2187 Arguments:
2188 code points to start of expression (the bracket)
2189 options pointer to the options (used to check casing changes)
2190
2191 Returns: -1 or the fixed first char
2192 */
2193
2194 static int
2195 find_firstchar(const uschar *code, int *options)
2196 {
2197 register int c = -1;
2198 do {
2199 int d;
2200 const uschar *scode = first_significant_code(code + 3, options,
2201 PCRE_CASELESS, TRUE);
2202 register int op = *scode;
2203
2204 if (op >= OP_BRA) op = OP_BRA;
2205
2206 switch(op)
2207 {
2208 default:
2209 return -1;
2210
2211 case OP_BRA:
2212 case OP_ASSERT:
2213 case OP_ONCE:
2214 case OP_COND:
2215 if ((d = find_firstchar(scode, options)) < 0) return -1;
2216 if (c < 0) c = d; else if (c != d) return -1;
2217 break;
2218
2219 case OP_EXACT: /* Fall through */
2220 scode++;
2221
2222 case OP_CHARS: /* Fall through */
2223 scode++;
2224
2225 case OP_PLUS:
2226 case OP_MINPLUS:
2227 if (c < 0) c = scode[1]; else if (c != scode[1]) return -1;
2228 break;
2229 }
2230
2231 code += (code[1] << 8) + code[2];
2232 }
2233 while (*code == OP_ALT);
2234 return c;
2235 }
2236
2237
2238
2239
2240
2241 /*************************************************
2242 * Compile a Regular Expression *
2243 *************************************************/
2244
2245 /* This function takes a string and returns a pointer to a block of store
2246 holding a compiled version of the expression.
2247
2248 Arguments:
2249 pattern the regular expression
2250 options various option bits
2251 errorptr pointer to pointer to error text
2252 erroroffset ptr offset in pattern where error was detected
2253 tables pointer to character tables or NULL
2254
2255 Returns: pointer to compiled data block, or NULL on error,
2256 with errorptr and erroroffset set
2257 */
2258
2259 pcre *
2260 pcre_compile(const char *pattern, int options, const char **errorptr,
2261 int *erroroffset, const unsigned char *tables)
2262 {
2263 real_pcre *re;
2264 int length = 3; /* For initial BRA plus length */
2265 int runlength;
2266 int c, reqchar, countlits;
2267 int bracount = 0;
2268 int top_backref = 0;
2269 int branch_extra = 0;
2270 int branch_newextra;
2271 unsigned int brastackptr = 0;
2272 size_t size;
2273 uschar *code;
2274 const uschar *ptr;
2275 compile_data compile_block;
2276 int brastack[BRASTACK_SIZE];
2277 uschar bralenstack[BRASTACK_SIZE];
2278
2279 #ifdef DEBUG
2280 uschar *code_base, *code_end;
2281 #endif
2282
2283 /* We can't pass back an error message if errorptr is NULL; I guess the best we
2284 can do is just return NULL. */
2285
2286 if (errorptr == NULL) return NULL;
2287 *errorptr = NULL;
2288
2289 /* However, we can give a message for this error */
2290
2291 if (erroroffset == NULL)
2292 {
2293 *errorptr = ERR16;
2294 return NULL;
2295 }
2296 *erroroffset = 0;
2297
2298 if ((options & ~PUBLIC_OPTIONS) != 0)
2299 {
2300 *errorptr = ERR17;
2301 return NULL;
2302 }
2303
2304 /* Set up pointers to the individual character tables */
2305
2306 if (tables == NULL) tables = pcre_default_tables;
2307 compile_block.lcc = tables + lcc_offset;
2308 compile_block.fcc = tables + fcc_offset;
2309 compile_block.cbits = tables + cbits_offset;
2310 compile_block.ctypes = tables + ctypes_offset;
2311
2312 /* Reflect pattern for debugging output */
2313
2314 DPRINTF(("------------------------------------------------------------------\n"));
2315 DPRINTF(("%s\n", pattern));
2316
2317 /* The first thing to do is to make a pass over the pattern to compute the
2318 amount of store required to hold the compiled code. This does not have to be
2319 perfect as long as errors are overestimates. At the same time we can detect any
2320 internal flag settings. Make an attempt to correct for any counted white space
2321 if an "extended" flag setting appears late in the pattern. We can't be so
2322 clever for #-comments. */
2323
2324 ptr = (const uschar *)(pattern - 1);
2325 while ((c = *(++ptr)) != 0)
2326 {
2327 int min, max;
2328 int class_charcount;
2329
2330 if ((options & PCRE_EXTENDED) != 0)
2331 {
2332 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2333 if (c == '#')
2334 {
2335 /* The space before the ; is to avoid a warning on a silly compiler
2336 on the Macintosh. */
2337 while ((c = *(++ptr)) != 0 && c != '\n') ;
2338 continue;
2339 }
2340 }
2341
2342 switch(c)
2343 {
2344 /* A backslashed item may be an escaped "normal" character or a
2345 character type. For a "normal" character, put the pointers and
2346 character back so that tests for whitespace etc. in the input
2347 are done correctly. */
2348
2349 case '\\':
2350 {
2351 const uschar *save_ptr = ptr;
2352 c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
2353 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2354 if (c >= 0)
2355 {
2356 ptr = save_ptr;
2357 c = '\\';
2358 goto NORMAL_CHAR;
2359 }
2360 }
2361 length++;
2362
2363 /* A back reference needs an additional char, plus either one or 5
2364 bytes for a repeat. We also need to keep the value of the highest
2365 back reference. */
2366
2367 if (c <= -ESC_REF)
2368 {
2369 int refnum = -c - ESC_REF;
2370 if (refnum > top_backref) top_backref = refnum;
2371 length++; /* For single back reference */
2372 if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2373 {
2374 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2375 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2376 if ((min == 0 && (max == 1 || max == -1)) ||
2377 (min == 1 && max == -1))
2378 length++;
2379 else length += 5;
2380 if (ptr[1] == '?') ptr++;
2381 }
2382 }
2383 continue;
2384
2385 case '^':
2386 case '.':
2387 case '$':
2388 case '*': /* These repeats won't be after brackets; */
2389 case '+': /* those are handled separately */
2390 case '?':
2391 length++;
2392 continue;
2393
2394 /* This covers the cases of repeats after a single char, metachar, class,
2395 or back reference. */
2396
2397 case '{':
2398 if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
2399 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
2400 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2401 if ((min == 0 && (max == 1 || max == -1)) ||
2402 (min == 1 && max == -1))
2403 length++;
2404 else
2405 {
2406 length--; /* Uncount the original char or metachar */
2407 if (min == 1) length++; else if (min > 0) length += 4;
2408 if (max > 0) length += 4; else length += 2;
2409 }
2410 if (ptr[1] == '?') ptr++;
2411 continue;
2412
2413 /* An alternation contains an offset to the next branch or ket. If any ims
2414 options changed in the previous branch(es), and/or if we are in a
2415 lookbehind assertion, extra space will be needed at the start of the
2416 branch. This is handled by branch_extra. */
2417
2418 case '|':
2419 length += 3 + branch_extra;
2420 continue;
2421
2422 /* A character class uses 33 characters. Don't worry about character types
2423 that aren't allowed in classes - they'll get picked up during the compile.
2424 A character class that contains only one character uses 2 or 3 bytes,
2425 depending on whether it is negated or not. Notice this where we can. */
2426
2427 case '[':
2428 class_charcount = 0;
2429 if (*(++ptr) == '^') ptr++;
2430 do
2431 {
2432 if (*ptr == '\\')
2433 {
2434 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
2435 &compile_block);
2436 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2437 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2438 }
2439 else class_charcount++;
2440 ptr++;
2441 }
2442 while (*ptr != 0 && *ptr != ']');
2443
2444 /* Repeats for negated single chars are handled by the general code */
2445
2446 if (class_charcount == 1) length += 3; else
2447 {
2448 length += 33;
2449
2450 /* A repeat needs either 1 or 5 bytes. */
2451
2452 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2453 {
2454 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2455 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2456 if ((min == 0 && (max == 1 || max == -1)) ||
2457 (min == 1 && max == -1))
2458 length++;
2459 else length += 5;
2460 if (ptr[1] == '?') ptr++;
2461 }
2462 }
2463 continue;
2464
2465 /* Brackets may be genuine groups or special things */
2466
2467 case '(':
2468 branch_newextra = 0;
2469
2470 /* Handle special forms of bracket, which all start (? */
2471
2472 if (ptr[1] == '?')
2473 {
2474 int set, unset;
2475 int *optset;
2476
2477 switch (c = ptr[2])
2478 {
2479 /* Skip over comments entirely */
2480 case '#':
2481 ptr += 3;
2482 while (*ptr != 0 && *ptr != ')') ptr++;
2483 if (*ptr == 0)
2484 {
2485 *errorptr = ERR18;
2486 goto PCRE_ERROR_RETURN;
2487 }
2488 continue;
2489
2490 /* Non-referencing groups and lookaheads just move the pointer on, and
2491 then behave like a non-special bracket, except that they don't increment
2492 the count of extracting brackets. Ditto for the "once only" bracket,
2493 which is in Perl from version 5.005. */
2494
2495 case ':':
2496 case '=':
2497 case '!':
2498 case '>':
2499 ptr += 2;
2500 break;
2501
2502 /* A recursive call to the regex is an extension, to provide the
2503 facility which can be obtained by $(?p{perl-code}) in Perl 5.6. */
2504
2505 case 'R':
2506 if (ptr[3] != ')')
2507 {
2508 *errorptr = ERR29;
2509 goto PCRE_ERROR_RETURN;
2510 }
2511 ptr += 3;
2512 length += 1;
2513 break;
2514
2515 /* Lookbehinds are in Perl from version 5.005 */
2516
2517 case '<':
2518 if (ptr[3] == '=' || ptr[3] == '!')
2519 {
2520 ptr += 3;
2521 branch_newextra = 3;
2522 length += 3; /* For the first branch */
2523 break;
2524 }
2525 *errorptr = ERR24;
2526 goto PCRE_ERROR_RETURN;
2527
2528 /* Conditionals are in Perl from version 5.005. The bracket must either
2529 be followed by a number (for bracket reference) or by an assertion
2530 group. */
2531
2532 case '(':
2533 if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
2534 {
2535 ptr += 4;
2536 length += 2;
2537 while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
2538 if (*ptr != ')')
2539 {
2540 *errorptr = ERR26;
2541 goto PCRE_ERROR_RETURN;
2542 }
2543 }
2544 else /* An assertion must follow */
2545 {
2546 ptr++; /* Can treat like ':' as far as spacing is concerned */
2547 if (ptr[2] != '?' ||
2548 (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
2549 {
2550 ptr += 2; /* To get right offset in message */
2551 *errorptr = ERR28;
2552 goto PCRE_ERROR_RETURN;
2553 }
2554 }
2555 break;
2556
2557 /* Else loop checking valid options until ) is met. Anything else is an
2558 error. If we are without any brackets, i.e. at top level, the settings
2559 act as if specified in the options, so massage the options immediately.
2560 This is for backward compatibility with Perl 5.004. */
2561
2562 default:
2563 set = unset = 0;
2564 optset = &set;
2565 ptr += 2;
2566
2567 for (;; ptr++)
2568 {
2569 c = *ptr;
2570 switch (c)
2571 {
2572 case 'i':
2573 *optset |= PCRE_CASELESS;
2574 continue;
2575
2576 case 'm':
2577 *optset |= PCRE_MULTILINE;
2578 continue;
2579
2580 case 's':
2581 *optset |= PCRE_DOTALL;
2582 continue;
2583
2584 case 'x':
2585 *optset |= PCRE_EXTENDED;
2586 continue;
2587
2588 case 'X':
2589 *optset |= PCRE_EXTRA;
2590 continue;
2591
2592 case 'U':
2593 *optset |= PCRE_UNGREEDY;
2594 continue;
2595
2596 case '-':
2597 optset = &unset;
2598 continue;
2599
2600 /* A termination by ')' indicates an options-setting-only item;
2601 this is global at top level; otherwise nothing is done here and
2602 it is handled during the compiling process on a per-bracket-group
2603 basis. */
2604
2605 case ')':
2606 if (brastackptr == 0)
2607 {
2608 options = (options | set) & (~unset);
2609 set = unset = 0; /* To save length */
2610 }
2611 /* Fall through */
2612
2613 /* A termination by ':' indicates the start of a nested group with
2614 the given options set. This is again handled at compile time, but
2615 we must allow for compiled space if any of the ims options are
2616 set. We also have to allow for resetting space at the end of
2617 the group, which is why 4 is added to the length and not just 2.
2618 If there are several changes of options within the same group, this
2619 will lead to an over-estimate on the length, but this shouldn't
2620 matter very much. We also have to allow for resetting options at
2621 the start of any alternations, which we do by setting
2622 branch_newextra to 2. Finally, we record whether the case-dependent
2623 flag ever changes within the regex. This is used by the "required
2624 character" code. */
2625
2626 case ':':
2627 if (((set|unset) & PCRE_IMS) != 0)
2628 {
2629 length += 4;
2630 branch_newextra = 2;
2631 if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
2632 }
2633 goto END_OPTIONS;
2634
2635 /* Unrecognized option character */
2636
2637 default:
2638 *errorptr = ERR12;
2639 goto PCRE_ERROR_RETURN;
2640 }
2641 }
2642
2643 /* If we hit a closing bracket, that's it - this is a freestanding
2644 option-setting. We need to ensure that branch_extra is updated if
2645 necessary. The only values branch_newextra can have here are 0 or 2.
2646 If the value is 2, then branch_extra must either be 2 or 5, depending
2647 on whether this is a lookbehind group or not. */
2648
2649 END_OPTIONS:
2650 if (c == ')')
2651 {
2652 if (branch_newextra == 2 && (branch_extra == 0 || branch_extra == 3))
2653 branch_extra += branch_newextra;
2654 continue;
2655 }
2656
2657 /* If options were terminated by ':' control comes here. Fall through
2658 to handle the group below. */
2659 }
2660 }
2661
2662 /* Extracting brackets must be counted so we can process escapes in a
2663 Perlish way. */
2664
2665 else bracount++;
2666
2667 /* Non-special forms of bracket. Save length for computing whole length
2668 at end if there's a repeat that requires duplication of the group. Also
2669 save the current value of branch_extra, and start the new group with
2670 the new value. If non-zero, this will either be 2 for a (?imsx: group, or 3
2671 for a lookbehind assertion. */
2672
2673 if (brastackptr >= sizeof(brastack)/sizeof(int))
2674 {
2675 *errorptr = ERR19;
2676 goto PCRE_ERROR_RETURN;
2677 }
2678
2679 bralenstack[brastackptr] = branch_extra;
2680 branch_extra = branch_newextra;
2681
2682 brastack[brastackptr++] = length;
2683 length += 3;
2684 continue;
2685
2686 /* Handle ket. Look for subsequent max/min; for certain sets of values we
2687 have to replicate this bracket up to that many times. If brastackptr is
2688 0 this is an unmatched bracket which will generate an error, but take care
2689 not to try to access brastack[-1] when computing the length and restoring
2690 the branch_extra value. */
2691
2692 case ')':
2693 length += 3;
2694 {
2695 int minval = 1;
2696 int maxval = 1;
2697 int duplength;
2698
2699 if (brastackptr > 0)
2700 {
2701 duplength = length - brastack[--brastackptr];
2702 branch_extra = bralenstack[brastackptr];
2703 }
2704 else duplength = 0;
2705
2706 /* Leave ptr at the final char; for read_repeat_counts this happens
2707 automatically; for the others we need an increment. */
2708
2709 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
2710 {
2711 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr,
2712 &compile_block);
2713 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2714 }
2715 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2716 else if (c == '+') { maxval = -1; ptr++; }
2717 else if (c == '?') { minval = 0; ptr++; }
2718
2719 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2720 group, and if the maximum is greater than zero, we have to replicate
2721 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2722 bracket set - hence the 7. */
2723
2724 if (minval == 0)
2725 {
2726 length++;
2727 if (maxval > 0) length += (maxval - 1) * (duplength + 7);
2728 }
2729
2730 /* When the minimum is greater than zero, 1 we have to replicate up to
2731 minval-1 times, with no additions required in the copies. Then, if
2732 there is a limited maximum we have to replicate up to maxval-1 times
2733 allowing for a BRAZERO item before each optional copy and nesting
2734 brackets for all but one of the optional copies. */
2735
2736 else
2737 {
2738 length += (minval - 1) * duplength;
2739 if (maxval > minval) /* Need this test as maxval=-1 means no limit */
2740 length += (maxval - minval) * (duplength + 7) - 6;
2741 }
2742 }
2743 continue;
2744
2745 /* Non-special character. For a run of such characters the length required
2746 is the number of characters + 2, except that the maximum run length is 255.
2747 We won't get a skipped space or a non-data escape or the start of a #
2748 comment as the first character, so the length can't be zero. */
2749
2750 NORMAL_CHAR:
2751 default:
2752 length += 2;
2753 runlength = 0;
2754 do
2755 {
2756 if ((options & PCRE_EXTENDED) != 0)
2757 {
2758 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2759 if (c == '#')
2760 {
2761 /* The space before the ; is to avoid a warning on a silly compiler
2762 on the Macintosh. */
2763 while ((c = *(++ptr)) != 0 && c != '\n') ;
2764 continue;
2765 }
2766 }
2767
2768 /* Backslash may introduce a data char or a metacharacter; stop the
2769 string before the latter. */
2770
2771 if (c == '\\')
2772 {
2773 const uschar *saveptr = ptr;
2774 c = check_escape(&ptr, errorptr, bracount, options, FALSE,
2775 &compile_block);
2776 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2777 if (c < 0) { ptr = saveptr; break; }
2778 }
2779
2780 /* Ordinary character or single-char escape */
2781
2782 runlength++;
2783 }
2784
2785 /* This "while" is the end of the "do" above. */
2786
2787 while (runlength < 255 &&
2788 (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
2789
2790 ptr--;
2791 length += runlength;
2792 continue;
2793 }
2794 }
2795
2796 length += 4; /* For final KET and END */
2797
2798 if (length > 65539)
2799 {
2800 *errorptr = ERR20;
2801 return NULL;
2802 }
2803
2804 /* Compute the size of data block needed and get it, either from malloc or
2805 externally provided function. We specify "code[0]" in the offsetof() expression
2806 rather than just "code", because it has been reported that one broken compiler
2807 fails on "code" because it is also an independent variable. It should make no
2808 difference to the value of the offsetof(). */
2809
2810 size = length + offsetof(real_pcre, code[0]);
2811 re = (real_pcre *)(pcre_malloc)(size);
2812
2813 if (re == NULL)
2814 {
2815 *errorptr = ERR21;
2816 return NULL;
2817 }
2818
2819 /* Put in the magic number, and save the size, options, and table pointer */
2820
2821 re->magic_number = MAGIC_NUMBER;
2822 re->size = size;
2823 re->options = options;
2824 re->tables = tables;
2825
2826 /* Set up a starting, non-extracting bracket, then compile the expression. On
2827 error, *errorptr will be set non-NULL, so we don't need to look at the result
2828 of the function here. */
2829
2830 ptr = (const uschar *)pattern;
2831 code = re->code;
2832 *code = OP_BRA;
2833 bracount = 0;
2834 (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,
2835 &reqchar, &countlits, &compile_block);
2836 re->top_bracket = bracount;
2837 re->top_backref = top_backref;
2838
2839 /* If not reached end of pattern on success, there's an excess bracket. */
2840
2841 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
2842
2843 /* Fill in the terminating state and check for disastrous overflow, but
2844 if debugging, leave the test till after things are printed out. */
2845
2846 *code++ = OP_END;
2847
2848 #ifndef DEBUG
2849 if (code - re->code > length) *errorptr = ERR23;
2850 #endif
2851
2852 /* Give an error if there's back reference to a non-existent capturing
2853 subpattern. */
2854
2855 if (top_backref > re->top_bracket) *errorptr = ERR15;
2856
2857 /* Failed to compile */
2858
2859 if (*errorptr != NULL)
2860 {
2861 (pcre_free)(re);
2862 PCRE_ERROR_RETURN:
2863 *erroroffset = ptr - (const uschar *)pattern;
2864 return NULL;
2865 }
2866
2867 /* If the anchored option was not passed, set flag if we can determine that the
2868 pattern is anchored by virtue of ^ characters or \A or anything else (such as
2869 starting with .* when DOTALL is set).
2870
2871 Otherwise, see if we can determine what the first character has to be, because
2872 that speeds up unanchored matches no end. If not, see if we can set the
2873 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
2874 start with ^. and also when all branches start with .* for non-DOTALL matches.
2875 */
2876
2877 if ((options & PCRE_ANCHORED) == 0)
2878 {
2879 int temp_options = options;
2880 if (is_anchored(re->code, &temp_options))
2881 re->options |= PCRE_ANCHORED;
2882 else
2883 {
2884 int ch = find_firstchar(re->code, &temp_options);
2885 if (ch >= 0)
2886 {
2887 re->first_char = ch;
2888 re->options |= PCRE_FIRSTSET;
2889 }
2890 else if (is_startline(re->code))
2891 re->options |= PCRE_STARTLINE;
2892 }
2893 }
2894
2895 /* Save the last required character if there are at least two literal
2896 characters on all paths, or if there is no first character setting. */
2897
2898 if (reqchar >= 0 && (countlits > 1 || (re->options & PCRE_FIRSTSET) == 0))
2899 {
2900 re->req_char = reqchar;
2901 re->options |= PCRE_REQCHSET;
2902 }
2903
2904 /* Print out the compiled data for debugging */
2905
2906 #ifdef DEBUG
2907
2908 printf("Length = %d top_bracket = %d top_backref = %d\n",
2909 length, re->top_bracket, re->top_backref);
2910
2911 if (re->options != 0)
2912 {
2913 printf("%s%s%s%s%s%s%s%s%s\n",
2914 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2915 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2916 ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
2917 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2918 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2919 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
2920 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
2921 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
2922 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
2923 }
2924
2925 if ((re->options & PCRE_FIRSTSET) != 0)
2926 {
2927 if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
2928 else printf("First char = \\x%02x\n", re->first_char);
2929 }
2930
2931 if ((re->options & PCRE_REQCHSET) != 0)
2932 {
2933 if (isprint(re->req_char)) printf("Req char = %c\n", re->req_char);
2934 else printf("Req char = \\x%02x\n", re->req_char);
2935 }
2936
2937 code_end = code;
2938 code_base = code = re->code;
2939
2940 while (code < code_end)
2941 {
2942 int charlength;
2943
2944 printf("%3d ", code - code_base);
2945
2946 if (*code >= OP_BRA)
2947 {
2948 printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
2949 code += 2;
2950 }
2951
2952 else switch(*code)
2953 {
2954 case OP_OPT:
2955 printf(" %.2x %s", code[1], OP_names[*code]);
2956 code++;
2957 break;
2958
2959 case OP_COND:
2960 printf("%3d Cond", (code[1] << 8) + code[2]);
2961 code += 2;
2962 break;
2963
2964 case OP_CREF:
2965 printf(" %.2d %s", code[1], OP_names[*code]);
2966 code++;
2967 break;
2968
2969 case OP_CHARS:
2970 charlength = *(++code);
2971 printf("%3d ", charlength);
2972 while (charlength-- > 0)
2973 if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2974 break;
2975
2976 case OP_KETRMAX:
2977 case OP_KETRMIN:
2978 case OP_ALT:
2979 case OP_KET:
2980 case OP_ASSERT:
2981 case OP_ASSERT_NOT:
2982 case OP_ASSERTBACK:
2983 case OP_ASSERTBACK_NOT:
2984 case OP_ONCE:
2985 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2986 code += 2;
2987 break;
2988
2989 case OP_REVERSE:
2990 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2991 code += 2;
2992 break;
2993
2994 case OP_STAR:
2995 case OP_MINSTAR:
2996 case OP_PLUS:
2997 case OP_MINPLUS:
2998 case OP_QUERY:
2999 case OP_MINQUERY:
3000 case OP_TYPESTAR:
3001 case OP_TYPEMINSTAR:
3002 case OP_TYPEPLUS:
3003 case OP_TYPEMINPLUS:
3004 case OP_TYPEQUERY:
3005 case OP_TYPEMINQUERY:
3006 if (*code >= OP_TYPESTAR)
3007 printf(" %s", OP_names[code[1]]);
3008 else if (isprint(c = code[1])) printf(" %c", c);
3009 else printf(" \\x%02x", c);
3010 printf("%s", OP_names[*code++]);
3011 break;
3012
3013 case OP_EXACT:
3014 case OP_UPTO:
3015 case OP_MINUPTO:
3016 if (isprint(c = code[3])) printf(" %c{", c);
3017 else printf(" \\x%02x{", c);
3018 if (*code != OP_EXACT) printf("0,");
3019 printf("%d}", (code[1] << 8) + code[2]);
3020 if (*code == OP_MINUPTO) printf("?");
3021 code += 3;
3022 break;
3023
3024 case OP_TYPEEXACT:
3025 case OP_TYPEUPTO:
3026 case OP_TYPEMINUPTO:
3027 printf(" %s{", OP_names[code[3]]);
3028 if (*code != OP_TYPEEXACT) printf(",");
3029 printf("%d}", (code[1] << 8) + code[2]);
3030 if (*code == OP_TYPEMINUPTO) printf("?");
3031 code += 3;
3032 break;
3033
3034 case OP_NOT:
3035 if (isprint(c = *(++code))) printf(" [^%c]", c);
3036 else printf(" [^\\x%02x]", c);
3037 break;
3038
3039 case OP_NOTSTAR:
3040 case OP_NOTMINSTAR:
3041 case OP_NOTPLUS:
3042 case OP_NOTMINPLUS:
3043 case OP_NOTQUERY:
3044 case OP_NOTMINQUERY:
3045 if (isprint(c = code[1])) printf(" [^%c]", c);
3046 else printf(" [^\\x%02x]", c);
3047 printf("%s", OP_names[*code++]);
3048 break;
3049
3050 case OP_NOTEXACT:
3051 case OP_NOTUPTO:
3052 case OP_NOTMINUPTO:
3053 if (isprint(c = code[3])) printf(" [^%c]{", c);
3054 else printf(" [^\\x%02x]{", c);
3055 if (*code != OP_NOTEXACT) printf(",");
3056 printf("%d}", (code[1] << 8) + code[2]);
3057 if (*code == OP_NOTMINUPTO) printf("?");
3058 code += 3;
3059 break;
3060
3061 case OP_REF:
3062 printf(" \\%d", *(++code));
3063 code ++;
3064 goto CLASS_REF_REPEAT;
3065
3066 case OP_CLASS:
3067 {
3068 int i, min, max;
3069 code++;
3070 printf(" [");
3071
3072 for (i = 0; i < 256; i++)
3073 {
3074 if ((code[i/8] & (1 << (i&7))) != 0)
3075 {
3076 int j;
3077 for (j = i+1; j < 256; j++)
3078 if ((code[j/8] & (1 << (j&7))) == 0) break;
3079 if (i == '-' || i == ']') printf("\\");
3080 if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
3081 if (--j > i)
3082 {
3083 printf("-");
3084 if (j == '-' || j == ']') printf("\\");
3085 if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
3086 }
3087 i = j;
3088 }
3089 }
3090 printf("]");
3091 code += 32;
3092
3093 CLASS_REF_REPEAT:
3094
3095 switch(*code)
3096 {
3097 case OP_CRSTAR:
3098 case OP_CRMINSTAR:
3099 case OP_CRPLUS:
3100 case OP_CRMINPLUS:
3101 case OP_CRQUERY:
3102 case OP_CRMINQUERY:
3103 printf("%s", OP_names[*code]);
3104 break;
3105
3106 case OP_CRRANGE:
3107 case OP_CRMINRANGE:
3108 min = (code[1] << 8) + code[2];
3109 max = (code[3] << 8) + code[4];
3110 if (max == 0) printf("{%d,}", min);
3111 else printf("{%d,%d}", min, max);
3112 if (*code == OP_CRMINRANGE) printf("?");
3113 code += 4;
3114 break;
3115
3116 default:
3117 code--;
3118 }
3119 }
3120 break;
3121
3122 /* Anything else is just a one-node item */
3123
3124 default:
3125 printf(" %s", OP_names[*code]);
3126 break;
3127 }
3128
3129 code++;
3130 printf("\n");
3131 }
3132 printf("------------------------------------------------------------------\n");
3133
3134 /* This check is done here in the debugging case so that the code that
3135 was compiled can be seen. */
3136
3137 if (code - re->code > length)
3138 {
3139 *errorptr = ERR23;
3140 (pcre_free)(re);
3141 *erroroffset = ptr - (uschar *)pattern;
3142 return NULL;
3143 }
3144 #endif
3145
3146 return (pcre *)re;
3147 }
3148
3149
3150
3151 /*************************************************
3152 * Match a back-reference *
3153 *************************************************/
3154
3155 /* If a back reference hasn't been set, the length that is passed is greater
3156 than the number of characters left in the string, so the match fails.
3157
3158 Arguments:
3159 offset index into the offset vector
3160 eptr points into the subject
3161 length length to be matched
3162 md points to match data block
3163 ims the ims flags
3164
3165 Returns: TRUE if matched
3166 */
3167
3168 static BOOL
3169 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
3170 unsigned long int ims)
3171 {
3172 const uschar *p = md->start_subject + md->offset_vector[offset];
3173
3174 #ifdef DEBUG
3175 if (eptr >= md->end_subject)
3176 printf("matching subject <null>");
3177 else
3178 {
3179 printf("matching subject ");
3180 pchars(eptr, length, TRUE, md);
3181 }
3182 printf(" against backref ");
3183 pchars(p, length, FALSE, md);
3184 printf("\n");
3185 #endif
3186
3187 /* Always fail if not enough characters left */
3188
3189 if (length > md->end_subject - eptr) return FALSE;
3190
3191 /* Separate the caselesss case for speed */
3192
3193 if ((ims & PCRE_CASELESS) != 0)
3194 {
3195 while (length-- > 0)
3196 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
3197 }
3198 else
3199 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
3200
3201 return TRUE;
3202 }
3203
3204
3205
3206 /*************************************************
3207 * Match from current position *
3208 *************************************************/
3209
3210 /* On entry ecode points to the first opcode, and eptr to the first character
3211 in the subject string, while eptrb holds the value of eptr at the start of the
3212 last bracketed group - used for breaking infinite loops matching zero-length
3213 strings.
3214
3215 Arguments:
3216 eptr pointer in subject
3217 ecode position in code
3218 offset_top current top pointer
3219 md pointer to "static" info for the match
3220 ims current /i, /m, and /s options
3221 eptrb pointer to chain of blocks containing eptr at start of
3222 brackets - for testing for empty matches
3223 flags can contain
3224 match_condassert - this is an assertion condition
3225 match_isgroup - this is the start of a bracketed group
3226
3227 Returns: TRUE if matched
3228 */
3229
3230 static BOOL
3231 match(register const uschar *eptr, register const uschar *ecode,
3232 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
3233 int flags)
3234 {
3235 unsigned long int original_ims = ims; /* Save for resetting on ')' */
3236 eptrblock newptrb;
3237
3238 /* At the start of a bracketed group, add the current subject pointer to the
3239 stack of such pointers, to be re-instated at the end of the group when we hit
3240 the closing ket. When match() is called in other circumstances, we don't add to
3241 the stack. */
3242
3243 if ((flags & match_isgroup) != 0)
3244 {
3245 newptrb.prev = eptrb;
3246 newptrb.saved_eptr = eptr;
3247 eptrb = &newptrb;
3248 }
3249
3250 /* Now start processing the operations. */
3251
3252 for (;;)
3253 {
3254 int op = (int)*ecode;
3255 int min, max, ctype;
3256 register int i;
3257 register int c;
3258 BOOL minimize = FALSE;
3259
3260 /* Opening capturing bracket. If there is space in the offset vector, save
3261 the current subject position in the working slot at the top of the vector. We
3262 mustn't change the current values of the data slot, because they may be set
3263 from a previous iteration of this group, and be referred to by a reference
3264 inside the group.
3265
3266 If the bracket fails to match, we need to restore this value and also the
3267 values of the final offsets, in case they were set by a previous iteration of
3268 the same bracket.
3269
3270 If there isn't enough space in the offset vector, treat this as if it were a
3271 non-capturing bracket. Don't worry about setting the flag for the error case
3272 here; that is handled in the code for KET. */
3273
3274 if (op > OP_BRA)
3275 {
3276 int number = op - OP_BRA;
3277 int offset = number << 1;
3278
3279 #ifdef DEBUG
3280 printf("start bracket %d subject=", number);
3281 pchars(eptr, 16, TRUE, md);
3282 printf("\n");
3283 #endif
3284
3285 if (offset < md->offset_max)
3286 {
3287 int save_offset1 = md->offset_vector[offset];
3288 int save_offset2 = md->offset_vector[offset+1];
3289 int save_offset3 = md->offset_vector[md->offset_end - number];
3290
3291 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
3292 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
3293
3294 do
3295 {
3296 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3297 return TRUE;
3298 ecode += (ecode[1] << 8) + ecode[2];
3299 }
3300 while (*ecode == OP_ALT);
3301
3302 DPRINTF(("bracket %d failed\n", number));
3303
3304 md->offset_vector[offset] = save_offset1;
3305 md->offset_vector[offset+1] = save_offset2;
3306 md->offset_vector[md->offset_end - number] = save_offset3;
3307 return FALSE;
3308 }
3309
3310 /* Insufficient room for saving captured contents */
3311
3312 else op = OP_BRA;
3313 }
3314
3315 /* Other types of node can be handled by a switch */
3316
3317 switch(op)
3318 {
3319 case OP_BRA: /* Non-capturing bracket: optimized */
3320 DPRINTF(("start bracket 0\n"));
3321 do
3322 {
3323 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3324 return TRUE;
3325 ecode += (ecode[1] << 8) + ecode[2];
3326 }
3327 while (*ecode == OP_ALT);
3328 DPRINTF(("bracket 0 failed\n"));
3329 return FALSE;
3330
3331 /* Conditional group: compilation checked that there are no more than
3332 two branches. If the condition is false, skipping the first branch takes us
3333 past the end if there is only one branch, but that's OK because that is
3334 exactly what going to the ket would do. */
3335
3336 case OP_COND:
3337 if (ecode[3] == OP_CREF) /* Condition is extraction test */
3338 {
3339 int offset = ecode[4] << 1; /* Doubled reference number */
3340 return match(eptr,
3341 ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
3342 5 : 3 + (ecode[1] << 8) + ecode[2]),
3343 offset_top, md, ims, eptrb, match_isgroup);
3344 }
3345
3346 /* The condition is an assertion. Call match() to evaluate it - setting
3347 the final argument TRUE causes it to stop at the end of an assertion. */
3348
3349 else
3350 {
3351 if (match(eptr, ecode+3, offset_top, md, ims, NULL,
3352 match_condassert | match_isgroup))
3353 {
3354 ecode += 3 + (ecode[4] << 8) + ecode[5];
3355 while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
3356 }
3357 else ecode += (ecode[1] << 8) + ecode[2];
3358 return match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup);
3359 }
3360 /* Control never reaches here */
3361
3362 /* Skip over conditional reference data if encountered (should not be) */
3363
3364 case OP_CREF:
3365 ecode += 2;
3366 break;
3367
3368 /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
3369 an empty string - recursion will then try other alternatives, if any. */
3370
3371 case OP_END:
3372 if (md->notempty && eptr == md->start_match) return FALSE;
3373 md->end_match_ptr = eptr; /* Record where we ended */
3374 md->end_offset_top = offset_top; /* and how many extracts were taken */
3375 return TRUE;
3376
3377 /* Change option settings */
3378
3379 case OP_OPT:
3380 ims = ecode[1];
3381 ecode += 2;
3382 DPRINTF(("ims set to %02lx\n", ims));
3383 break;
3384
3385 /* Assertion brackets. Check the alternative branches in turn - the
3386 matching won't pass the KET for an assertion. If any one branch matches,
3387 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
3388 start of each branch to move the current point backwards, so the code at
3389 this level is identical to the lookahead case. */
3390
3391 case OP_ASSERT:
3392 case OP_ASSERTBACK:
3393 do
3394 {
3395 if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup)) break;
3396 ecode += (ecode[1] << 8) + ecode[2];
3397 }
3398 while (*ecode == OP_ALT);
3399 if (*ecode == OP_KET) return FALSE;
3400
3401 /* If checking an assertion for a condition, return TRUE. */
3402
3403 if ((flags & match_condassert) != 0) return TRUE;
3404
3405 /* Continue from after the assertion, updating the offsets high water
3406 mark, since extracts may have been taken during the assertion. */
3407
3408 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3409 ecode += 3;
3410 offset_top = md->end_offset_top;
3411 continue;
3412
3413 /* Negative assertion: all branches must fail to match */
3414
3415 case OP_ASSERT_NOT:
3416 case OP_ASSERTBACK_NOT:
3417 do
3418 {
3419 if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup))
3420 return FALSE;
3421 ecode += (ecode[1] << 8) + ecode[2];
3422 }
3423 while (*ecode == OP_ALT);
3424
3425 if ((flags & match_condassert) != 0) return TRUE;
3426
3427 ecode += 3;
3428 continue;
3429
3430 /* Move the subject pointer back. This occurs only at the start of
3431 each branch of a lookbehind assertion. If we are too close to the start to
3432 move back, this match function fails. */
3433
3434 case OP_REVERSE:
3435 eptr -= (ecode[1] << 8) + ecode[2];
3436 if (eptr < md->start_subject) return FALSE;
3437 ecode += 3;
3438 break;
3439
3440 /* Recursion matches the current regex, nested. If there are any capturing
3441 brackets started but not finished, we have to save their starting points
3442 and reinstate them after the recursion. However, we don't know how many
3443 such there are (offset_top records the completed total) so we just have
3444 to save all the potential data. There may be up to 99 such values, which
3445 is a bit large to put on the stack, but using malloc for small numbers
3446 seems expensive. As a compromise, the stack is used when there are fewer
3447 than 16 values to store; otherwise malloc is used. A problem is what to do
3448 if the malloc fails ... there is no way of returning to the top level with
3449 an error. Save the top 15 values on the stack, and accept that the rest
3450 may be wrong. */
3451
3452 case OP_RECURSE:
3453 {
3454 BOOL rc;
3455 int *save;
3456 int stacksave[15];
3457
3458 c = md->offset_max;
3459
3460 if (c < 16) save = stacksave; else
3461 {
3462 save = (int *)(pcre_malloc)((c+1) * sizeof(int));
3463 if (save == NULL)
3464 {
3465 save = stacksave;
3466 c = 15;
3467 }
3468 }
3469
3470 for (i = 1; i <= c; i++)
3471 save[i] = md->offset_vector[md->offset_end - i];
3472 rc = match(eptr, md->start_pattern, offset_top, md, ims, eptrb,
3473 match_isgroup);
3474 for (i = 1; i <= c; i++)
3475 md->offset_vector[md->offset_end - i] = save[i];
3476 if (save != stacksave) (pcre_free)(save);
3477 if (!rc) return FALSE;
3478
3479 /* In case the recursion has set more capturing values, save the final
3480 number, then move along the subject till after the recursive match,
3481 and advance one byte in the pattern code. */
3482
3483 offset_top = md->end_offset_top;
3484 eptr = md->end_match_ptr;
3485 ecode++;
3486 }
3487 break;
3488
3489 /* "Once" brackets are like assertion brackets except that after a match,
3490 the point in the subject string is not moved back. Thus there can never be
3491 a move back into the brackets. Check the alternative branches in turn - the
3492 matching won't pass the KET for this kind of subpattern. If any one branch
3493 matches, we carry on as at the end of a normal bracket, leaving the subject
3494 pointer. */
3495
3496 case OP_ONCE:
3497 {
3498 const uschar *prev = ecode;
3499 const uschar *saved_eptr = eptr;
3500
3501 do
3502 {
3503 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3504 break;
3505 ecode += (ecode[1] << 8) + ecode[2];
3506 }
3507 while (*ecode == OP_ALT);
3508
3509 /* If hit the end of the group (which could be repeated), fail */
3510
3511 if (*ecode != OP_ONCE && *ecode != OP_ALT) return FALSE;
3512
3513 /* Continue as from after the assertion, updating the offsets high water
3514 mark, since extracts may have been taken. */
3515
3516 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3517
3518 offset_top = md->end_offset_top;
3519 eptr = md->end_match_ptr;
3520
3521 /* For a non-repeating ket, just continue at this level. This also
3522 happens for a repeating ket if no characters were matched in the group.
3523 This is the forcible breaking of infinite loops as implemented in Perl
3524 5.005. If there is an options reset, it will get obeyed in the normal
3525 course of events. */
3526
3527 if (*ecode == OP_KET || eptr == saved_eptr)
3528 {
3529 ecode += 3;
3530 break;
3531 }
3532
3533 /* The repeating kets try the rest of the pattern or restart from the
3534 preceding bracket, in the appropriate order. We need to reset any options
3535 that changed within the bracket before re-running it, so check the next
3536 opcode. */
3537
3538 if (ecode[3] == OP_OPT)
3539 {
3540 ims = (ims & ~PCRE_IMS) | ecode[4];
3541 DPRINTF(("ims set to %02lx at group repeat\n", ims));
3542 }
3543
3544 if (*ecode == OP_KETRMIN)
3545 {
3546 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3547 match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3548 return TRUE;
3549 }
3550 else /* OP_KETRMAX */
3551 {
3552 if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3553 match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3554 }
3555 }
3556 return FALSE;
3557
3558 /* An alternation is the end of a branch; scan along to find the end of the
3559 bracketed group and go to there. */
3560
3561 case OP_ALT:
3562 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3563 break;
3564
3565 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3566 that it may occur zero times. It may repeat infinitely, or not at all -
3567 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3568 repeat limits are compiled as a number of copies, with the optional ones
3569 preceded by BRAZERO or BRAMINZERO. */
3570
3571 case OP_BRAZERO:
3572 {
3573 const uschar *next = ecode+1;
3574 if (match(eptr, next, offset_top, md, ims, eptrb, match_isgroup))
3575 return TRUE;
3576 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3577 ecode = next + 3;
3578 }
3579 break;
3580
3581 case OP_BRAMINZERO:
3582 {
3583 const uschar *next = ecode+1;
3584 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3585 if (match(eptr, next+3, offset_top, md, ims, eptrb, match_isgroup))
3586 return TRUE;
3587 ecode++;
3588 }
3589 break;
3590
3591 /* End of a group, repeated or non-repeating. If we are at the end of
3592 an assertion "group", stop matching and return TRUE, but record the
3593 current high water mark for use by positive assertions. Do this also
3594 for the "once" (not-backup up) groups. */
3595
3596 case OP_KET:
3597 case OP_KETRMIN:
3598 case OP_KETRMAX:
3599 {
3600 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3601 const uschar *saved_eptr = eptrb->saved_eptr;
3602
3603 eptrb = eptrb->prev; /* Back up the stack of bracket start pointers */
3604
3605 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
3606 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
3607 *prev == OP_ONCE)
3608 {
3609 md->end_match_ptr = eptr; /* For ONCE */
3610 md->end_offset_top = offset_top;
3611 return TRUE;
3612 }
3613
3614 /* In all other cases except a conditional group we have to check the
3615 group number back at the start and if necessary complete handling an
3616 extraction by setting the offsets and bumping the high water mark. */
3617
3618 if (*prev != OP_COND)
3619 {
3620 int number = *prev - OP_BRA;
3621 int offset = number << 1;
3622
3623 #ifdef DEBUG
3624 printf("end bracket %d", number);
3625 printf("\n");
3626 #endif
3627
3628 if (number > 0)
3629 {
3630 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
3631 {
3632 md->offset_vector[offset] =
3633 md->offset_vector[md->offset_end - number];
3634 md->offset_vector[offset+1] = eptr - md->start_subject;
3635 if (offset_top <= offset) offset_top = offset + 2;
3636 }
3637 }
3638 }
3639
3640 /* Reset the value of the ims flags, in case they got changed during
3641 the group. */
3642
3643 ims = original_ims;
3644 DPRINTF(("ims reset to %02lx\n", ims));
3645
3646 /* For a non-repeating ket, just continue at this level. This also
3647 happens for a repeating ket if no characters were matched in the group.
3648 This is the forcible breaking of infinite loops as implemented in Perl
3649 5.005. If there is an options reset, it will get obeyed in the normal
3650 course of events. */
3651
3652 if (*ecode == OP_KET || eptr == saved_eptr)
3653 {
3654 ecode += 3;
3655 break;
3656 }
3657
3658 /* The repeating kets try the rest of the pattern or restart from the
3659 preceding bracket, in the appropriate order. */
3660
3661 if (*ecode == OP_KETRMIN)
3662 {
3663 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3664 match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3665 return TRUE;
3666 }
3667 else /* OP_KETRMAX */
3668 {
3669 if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3670 match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3671 }
3672 }
3673 return FALSE;
3674
3675 /* Start of subject unless notbol, or after internal newline if multiline */
3676
3677 case OP_CIRC:
3678 if (md->notbol && eptr == md->start_subject) return FALSE;
3679 if ((ims & PCRE_MULTILINE) != 0)
3680 {
3681 if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;
3682 ecode++;
3683 break;
3684 }
3685 /* ... else fall through */
3686
3687 /* Start of subject assertion */
3688
3689 case OP_SOD:
3690 if (eptr != md->start_subject) return FALSE;
3691 ecode++;
3692 break;
3693
3694 /* Assert before internal newline if multiline, or before a terminating
3695 newline unless endonly is set, else end of subject unless noteol is set. */
3696
3697 case OP_DOLL:
3698 if ((ims & PCRE_MULTILINE) != 0)
3699 {
3700 if (eptr < md->end_subject) { if (*eptr != '\n') return FALSE; }
3701 else { if (md->noteol) return FALSE; }
3702 ecode++;
3703 break;
3704 }
3705 else
3706 {
3707 if (md->noteol) return FALSE;
3708 if (!md->endonly)
3709 {
3710 if (eptr < md->end_subject - 1 ||
3711 (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3712
3713 ecode++;
3714 break;
3715 }
3716 }
3717 /* ... else fall through */
3718
3719 /* End of subject assertion (\z) */
3720
3721 case OP_EOD:
3722 if (eptr < md->end_subject) return FALSE;
3723 ecode++;
3724 break;
3725
3726 /* End of subject or ending \n assertion (\Z) */
3727
3728 case OP_EODN:
3729 if (eptr < md->end_subject - 1 ||
3730 (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3731 ecode++;
3732 break;
3733
3734 /* Word boundary assertions */
3735
3736 case OP_NOT_WORD_BOUNDARY:
3737 case OP_WORD_BOUNDARY:
3738 {
3739 BOOL prev_is_word = (eptr != md->start_subject) &&
3740 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
3741 BOOL cur_is_word = (eptr < md->end_subject) &&
3742 ((md->ctypes[*eptr] & ctype_word) != 0);
3743 if ((*ecode++ == OP_WORD_BOUNDARY)?
3744 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3745 return FALSE;
3746 }
3747 break;
3748
3749 /* Match a single character type; inline for speed */
3750
3751 case OP_ANY:
3752 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == '\n')
3753 return FALSE;
3754 if (eptr++ >= md->end_subject) return FALSE;
3755 ecode++;
3756 break;
3757
3758 case OP_NOT_DIGIT:
3759 if (eptr >= md->end_subject ||
3760 (md->ctypes[*eptr++] & ctype_digit) != 0)
3761 return FALSE;
3762 ecode++;
3763 break;
3764
3765 case OP_DIGIT:
3766 if (eptr >= md->end_subject ||
3767 (md->ctypes[*eptr++] & ctype_digit) == 0)
3768 return FALSE;
3769 ecode++;
3770 break;
3771
3772 case OP_NOT_WHITESPACE:
3773 if (eptr >= md->end_subject ||
3774 (md->ctypes[*eptr++] & ctype_space) != 0)
3775 return FALSE;
3776 ecode++;
3777 break;
3778
3779 case OP_WHITESPACE:
3780 if (eptr >= md->end_subject ||
3781 (md->ctypes[*eptr++] & ctype_space) == 0)
3782 return FALSE;
3783 ecode++;
3784 break;
3785
3786 case OP_NOT_WORDCHAR:
3787 if (eptr >= md->end_subject ||
3788 (md->ctypes[*eptr++] & ctype_word) != 0)
3789 return FALSE;
3790 ecode++;
3791 break;
3792
3793 case OP_WORDCHAR:
3794 if (eptr >= md->end_subject ||
3795 (md->ctypes[*eptr++] & ctype_word) == 0)
3796 return FALSE;
3797 ecode++;
3798 break;
3799
3800 /* Match a back reference, possibly repeatedly. Look past the end of the
3801 item to see if there is repeat information following. The code is similar
3802 to that for character classes, but repeated for efficiency. Then obey
3803 similar code to character type repeats - written out again for speed.
3804 However, if the referenced string is the empty string, always treat
3805 it as matched, any number of times (otherwise there could be infinite
3806 loops). */
3807
3808 case OP_REF:
3809 {
3810 int length;
3811 int offset = ecode[1] << 1; /* Doubled reference number */
3812 ecode += 2; /* Advance past the item */
3813
3814 /* If the reference is unset, set the length to be longer than the amount
3815 of subject left; this ensures that every attempt at a match fails. We
3816 can't just fail here, because of the possibility of quantifiers with zero
3817 minima. */
3818
3819 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
3820 md->end_subject - eptr + 1 :
3821 md->offset_vector[offset+1] - md->offset_vector[offset];
3822
3823 /* Set up for repetition, or handle the non-repeated case */
3824
3825 switch (*ecode)
3826 {
3827 case OP_CRSTAR:
3828 case OP_CRMINSTAR:
3829 case OP_CRPLUS:
3830 case OP_CRMINPLUS:
3831 case OP_CRQUERY:
3832 case OP_CRMINQUERY:
3833 c = *ecode++ - OP_CRSTAR;
3834 minimize = (c & 1) != 0;
3835 min = rep_min[c]; /* Pick up values from tables; */
3836 max = rep_max[c]; /* zero for max => infinity */
3837 if (max == 0) max = INT_MAX;
3838 break;
3839
3840 case OP_CRRANGE:
3841 case OP_CRMINRANGE:
3842 minimize = (*ecode == OP_CRMINRANGE);
3843 min = (ecode[1] << 8) + ecode[2];
3844 max = (ecode[3] << 8) + ecode[4];
3845 if (max == 0) max = INT_MAX;
3846 ecode += 5;
3847 break;
3848
3849 default: /* No repeat follows */
3850 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
3851 eptr += length;
3852 continue; /* With the main loop */
3853 }
3854
3855 /* If the length of the reference is zero, just continue with the
3856 main loop. */
3857
3858 if (length == 0) continue;
3859
3860 /* First, ensure the minimum number of matches are present. We get back
3861 the length of the reference string explicitly rather than passing the
3862 address of eptr, so that eptr can be a register variable. */
3863
3864 for (i = 1; i <= min; i++)
3865 {
3866 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
3867 eptr += length;
3868 }
3869
3870 /* If min = max, continue at the same level without recursion.
3871 They are not both allowed to be zero. */
3872
3873 if (min == max) continue;
3874
3875 /* If minimizing, keep trying and advancing the pointer */
3876
3877 if (minimize)
3878 {
3879 for (i = min;; i++)
3880 {
3881 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
3882 return TRUE;
3883 if (i >= max || !match_ref(offset, eptr, length, md, ims))
3884 return FALSE;
3885 eptr += length;
3886 }
3887 /* Control never gets here */
3888 }
3889
3890 /* If maximizing, find the longest string and work backwards */
3891
3892 else
3893 {
3894 const uschar *pp = eptr;
3895 for (i = min; i < max; i++)
3896 {
3897 if (!match_ref(offset, eptr, length, md, ims)) break;
3898 eptr += length;
3899 }
3900 while (eptr >= pp)
3901 {
3902 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
3903 return TRUE;
3904 eptr -= length;
3905 }
3906 return FALSE;
3907 }
3908 }
3909 /* Control never gets here */
3910
3911
3912
3913 /* Match a character class, possibly repeatedly. Look past the end of the
3914 item to see if there is repeat information following. Then obey similar
3915 code to character type repeats - written out again for speed. */
3916
3917 case OP_CLASS:
3918 {
3919 const uschar *data = ecode + 1; /* Save for matching */
3920 ecode += 33; /* Advance past the item */
3921
3922 switch (*ecode)
3923 {
3924 case OP_CRSTAR:
3925 case OP_CRMINSTAR:
3926 case OP_CRPLUS:
3927 case OP_CRMINPLUS:
3928 case OP_CRQUERY:
3929 case OP_CRMINQUERY:
3930 c = *ecode++ - OP_CRSTAR;
3931 minimize = (c & 1) != 0;
3932 min = rep_min[c]; /* Pick up values from tables; */
3933 max = rep_max[c]; /* zero for max => infinity */
3934 if (max == 0) max = INT_MAX;
3935 break;
3936
3937 case OP_CRRANGE:
3938 case OP_CRMINRANGE:
3939 minimize = (*ecode == OP_CRMINRANGE);
3940 min = (ecode[1] << 8) + ecode[2];
3941 max = (ecode[3] << 8) + ecode[4];
3942 if (max == 0) max = INT_MAX;
3943 ecode += 5;
3944 break;
3945
3946 default: /* No repeat follows */
3947 min = max = 1;
3948 break;
3949 }
3950
3951 /* First, ensure the minimum number of matches are present. */
3952
3953 for (i = 1; i <= min; i++)
3954 {
3955 if (eptr >= md->end_subject) return FALSE;
3956 c = *eptr++;
3957 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3958 return FALSE;
3959 }
3960
3961 /* If max == min we can continue with the main loop without the
3962 need to recurse. */
3963
3964 if (min == max) continue;
3965
3966 /* If minimizing, keep testing the rest of the expression and advancing
3967 the pointer while it matches the class. */
3968
3969 if (minimize)
3970 {
3971 for (i = min;; i++)
3972 {
3973 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
3974 return TRUE;
3975 if (i >= max || eptr >= md->end_subject) return FALSE;
3976 c = *eptr++;
3977 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3978 return FALSE;
3979 }
3980 /* Control never gets here */
3981 }
3982
3983 /* If maximizing, find the longest possible run, then work backwards. */
3984
3985 else
3986 {
3987 const uschar *pp = eptr;
3988 for (i = min; i < max; eptr++, i++)
3989 {
3990 if (eptr >= md->end_subject) break;
3991 c = *eptr;
3992 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3993 break;
3994 }
3995
3996 while (eptr >= pp)
3997 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
3998 return TRUE;
3999 return FALSE;
4000 }
4001 }
4002 /* Control never gets here */
4003
4004 /* Match a run of characters */
4005
4006 case OP_CHARS:
4007 {
4008 register int length = ecode[1];
4009 ecode += 2;
4010
4011 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4012 if (eptr >= md->end_subject)
4013 printf("matching subject <null> against pattern ");
4014 else
4015 {
4016 printf("matching subject ");
4017 pchars(eptr, length, TRUE, md);
4018 printf(" against pattern ");
4019 }
4020 pchars(ecode, length, FALSE, md);
4021 printf("\n");
4022 #endif
4023
4024 if (length > md->end_subject - eptr) return FALSE;
4025 if ((ims & PCRE_CASELESS) != 0)
4026 {
4027 while (length-- > 0)
4028 if (md->lcc[*ecode++] != md->lcc[*eptr++])
4029 return FALSE;
4030 }
4031 else
4032 {
4033 while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
4034 }
4035 }
4036 break;
4037
4038 /* Match a single character repeatedly; different opcodes share code. */
4039
4040 case OP_EXACT:
4041 min = max = (ecode[1] << 8) + ecode[2];
4042 ecode += 3;
4043 goto REPEATCHAR;
4044
4045 case OP_UPTO:
4046 case OP_MINUPTO:
4047 min = 0;
4048 max = (ecode[1] << 8) + ecode[2];
4049 minimize = *ecode == OP_MINUPTO;
4050 ecode += 3;
4051 goto REPEATCHAR;
4052
4053 case OP_STAR:
4054 case OP_MINSTAR:
4055 case OP_PLUS:
4056 case OP_MINPLUS:
4057 case OP_QUERY:
4058 case OP_MINQUERY:
4059 c = *ecode++ - OP_STAR;
4060 minimize = (c & 1) != 0;
4061 min = rep_min[c]; /* Pick up values from tables; */
4062 max = rep_max[c]; /* zero for max => infinity */
4063 if (max == 0) max = INT_MAX;
4064
4065 /* Common code for all repeated single-character matches. We can give
4066 up quickly if there are fewer than the minimum number of characters left in
4067 the subject. */
4068
4069 REPEATCHAR:
4070 if (min > md->end_subject - eptr) return FALSE;
4071 c = *ecode++;
4072
4073 /* The code is duplicated for the caseless and caseful cases, for speed,
4074 since matching characters is likely to be quite common. First, ensure the
4075 minimum number of matches are present. If min = max, continue at the same
4076 level without recursing. Otherwise, if minimizing, keep trying the rest of
4077 the expression and advancing one matching character if failing, up to the
4078 maximum. Alternatively, if maximizing, find the maximum number of
4079 characters and work backwards. */
4080
4081 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
4082 max, eptr));
4083
4084 if ((ims & PCRE_CASELESS) != 0)
4085 {
4086 c = md->lcc[c];
4087 for (i = 1; i <= min; i++)
4088 if (c != md->lcc[*eptr++]) return FALSE;
4089 if (min == max) continue;
4090 if (minimize)
4091 {
4092 for (i = min;; i++)
4093 {
4094 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4095 return TRUE;
4096 if (i >= max || eptr >= md->end_subject ||
4097 c != md->lcc[*eptr++])
4098 return FALSE;
4099 }
4100 /* Control never gets here */
4101 }
4102 else
4103 {
4104 const uschar *pp = eptr;
4105 for (i = min; i < max; i++)
4106 {
4107 if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
4108 eptr++;
4109 }
4110 while (eptr >= pp)
4111 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4112 return TRUE;
4113 return FALSE;
4114 }
4115 /* Control never gets here */
4116 }
4117
4118 /* Caseful comparisons */
4119
4120 else
4121 {
4122 for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
4123 if (min == max) continue;
4124 if (minimize)
4125 {
4126 for (i = min;; i++)
4127 {
4128 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4129 return TRUE;
4130 if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
4131 }
4132 /* Control never gets here */
4133 }
4134 else
4135 {
4136 const uschar *pp = eptr;
4137 for (i = min; i < max; i++)
4138 {
4139 if (eptr >= md->end_subject || c != *eptr) break;
4140 eptr++;
4141 }
4142 while (eptr >= pp)
4143 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4144 return TRUE;
4145 return FALSE;
4146 }
4147 }
4148 /* Control never gets here */
4149
4150 /* Match a negated single character */
4151
4152 case OP_NOT:
4153 if (eptr >= md->end_subject) return FALSE;
4154 ecode++;
4155 if ((ims & PCRE_CASELESS) != 0)
4156 {
4157 if (md->lcc[*ecode++] == md->lcc[*eptr++]) return FALSE;
4158 }
4159 else
4160 {
4161 if (*ecode++ == *eptr++) return FALSE;
4162 }
4163 break;
4164
4165 /* Match a negated single character repeatedly. This is almost a repeat of
4166 the code for a repeated single character, but I haven't found a nice way of
4167 commoning these up that doesn't require a test of the positive/negative
4168 option for each character match. Maybe that wouldn't add very much to the
4169 time taken, but character matching *is* what this is all about... */
4170
4171 case OP_NOTEXACT:
4172 min = max = (ecode[1] << 8) + ecode[2];
4173 ecode += 3;
4174 goto REPEATNOTCHAR;
4175
4176 case OP_NOTUPTO:
4177 case OP_NOTMINUPTO:
4178 min = 0;
4179 max = (ecode[1] << 8) + ecode[2];
4180 minimize = *ecode == OP_NOTMINUPTO;
4181 ecode += 3;
4182 goto REPEATNOTCHAR;
4183
4184 case OP_NOTSTAR:
4185 case OP_NOTMINSTAR:
4186 case OP_NOTPLUS:
4187 case OP_NOTMINPLUS:
4188 case OP_NOTQUERY:
4189 case OP_NOTMINQUERY:
4190 c = *ecode++ - OP_NOTSTAR;
4191 minimize = (c & 1) != 0;
4192 min = rep_min[c]; /* Pick up values from tables; */
4193 max = rep_max[c]; /* zero for max => infinity */
4194 if (max == 0) max = INT_MAX;
4195
4196 /* Common code for all repeated single-character matches. We can give
4197 up quickly if there are fewer than the minimum number of characters left in
4198 the subject. */
4199
4200 REPEATNOTCHAR:
4201 if (min > md->end_subject - eptr) return FALSE;
4202 c = *ecode++;
4203
4204 /* The code is duplicated for the caseless and caseful cases, for speed,
4205 since matching characters is likely to be quite common. First, ensure the
4206 minimum number of matches are present. If min = max, continue at the same
4207 level without recursing. Otherwise, if minimizing, keep trying the rest of
4208 the expression and advancing one matching character if failing, up to the
4209 maximum. Alternatively, if maximizing, find the maximum number of
4210 characters and work backwards. */
4211
4212 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
4213 max, eptr));
4214
4215 if ((ims & PCRE_CASELESS) != 0)
4216 {
4217 c = md->lcc[c];
4218 for (i = 1; i <= min; i++)
4219 if (c == md->lcc[*eptr++]) return FALSE;
4220 if (min == max) continue;
4221 if (minimize)
4222 {
4223 for (i = min;; i++)
4224 {
4225 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4226 return TRUE;
4227 if (i >= max || eptr >= md->end_subject ||
4228 c == md->lcc[*eptr++])
4229 return FALSE;
4230 }
4231 /* Control never gets here */
4232 }
4233 else
4234 {
4235 const uschar *pp = eptr;
4236 for (i = min; i < max; i++)
4237 {
4238 if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
4239 eptr++;
4240 }
4241 while (eptr >= pp)
4242 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4243 return TRUE;
4244 return FALSE;
4245 }
4246 /* Control never gets here */
4247 }
4248
4249 /* Caseful comparisons */
4250
4251 else
4252 {
4253 for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
4254 if (min == max) continue;
4255 if (minimize)
4256 {
4257 for (i = min;; i++)
4258 {
4259 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4260 return TRUE;
4261 if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
4262 }
4263 /* Control never gets here */
4264 }
4265 else
4266 {
4267 const uschar *pp = eptr;
4268 for (i = min; i < max; i++)
4269 {
4270 if (eptr >= md->end_subject || c == *eptr) break;
4271 eptr++;
4272 }
4273 while (eptr >= pp)
4274 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4275 return TRUE;
4276 return FALSE;
4277 }
4278 }
4279 /* Control never gets here */
4280
4281 /* Match a single character type repeatedly; several different opcodes
4282 share code. This is very similar to the code for single characters, but we
4283 repeat it in the interests of efficiency. */
4284
4285 case OP_TYPEEXACT:
4286 min = max = (ecode[1] << 8) + ecode[2];
4287 minimize = TRUE;
4288 ecode += 3;
4289 goto REPEATTYPE;
4290
4291 case OP_TYPEUPTO:
4292 case OP_TYPEMINUPTO:
4293 min = 0;
4294 max = (ecode[1] << 8) + ecode[2];
4295 minimize = *ecode == OP_TYPEMINUPTO;
4296 ecode += 3;
4297 goto REPEATTYPE;
4298
4299 case OP_TYPESTAR:
4300 case OP_TYPEMINSTAR:
4301 case OP_TYPEPLUS:
4302 case OP_TYPEMINPLUS:
4303 case OP_TYPEQUERY:
4304 case OP_TYPEMINQUERY:
4305 c = *ecode++ - OP_TYPESTAR;
4306 minimize = (c & 1) != 0;
4307 min = rep_min[c]; /* Pick up values from tables; */
4308 max = rep_max[c]; /* zero for max => infinity */
4309 if (max == 0) max = INT_MAX;
4310
4311 /* Common code for all repeated single character type matches */
4312
4313 REPEATTYPE:
4314 ctype = *ecode++; /* Code for the character type */
4315
4316 /* First, ensure the minimum number of matches are present. Use inline
4317 code for maximizing the speed, and do the type test once at the start
4318 (i.e. keep it out of the loop). Also test that there are at least the
4319 minimum number of characters before we start. */
4320
4321 if (min > md->end_subject - eptr) return FALSE;
4322 if (min > 0) switch(ctype)
4323 {
4324 case OP_ANY:
4325 if ((ims & PCRE_DOTALL) == 0)
4326 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }
4327 else eptr += min;
4328 break;
4329
4330 case OP_NOT_DIGIT:
4331 for (i = 1; i <= min; i++)
4332 if ((md->ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
4333 break;
4334
4335 case OP_DIGIT:
4336 for (i = 1; i <= min; i++)
4337 if ((md->ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
4338 break;
4339
4340 case OP_NOT_WHITESPACE:
4341 for (i = 1; i <= min; i++)
4342 if ((md->ctypes[*eptr++] & ctype_space) != 0) return FALSE;
4343 break;
4344
4345 case OP_WHITESPACE:
4346 for (i = 1; i <= min; i++)
4347 if ((md->ctypes[*eptr++] & ctype_space) == 0) return FALSE;
4348 break;
4349
4350 case OP_NOT_WORDCHAR:
4351 for (i = 1; i <= min; i++)
4352 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4353 return FALSE;
4354 break;
4355
4356 case OP_WORDCHAR:
4357 for (i = 1; i <= min; i++)
4358 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4359 return FALSE;
4360 break;
4361 }
4362
4363 /* If min = max, continue at the same level without recursing */
4364
4365 if (min == max) continue;
4366
4367 /* If minimizing, we have to test the rest of the pattern before each
4368 subsequent match. */
4369
4370 if (minimize)
4371 {
4372 for (i = min;; i++)
4373 {
4374 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0)) return TRUE;
4375 if (i >= max || eptr >= md->end_subject) return FALSE;
4376
4377 c = *eptr++;
4378 switch(ctype)
4379 {
4380 case OP_ANY:
4381 if ((ims & PCRE_DOTALL) == 0 && c == '\n') return FALSE;
4382 break;
4383
4384 case OP_NOT_DIGIT:
4385 if ((md->ctypes[c] & ctype_digit) != 0) return FALSE;
4386 break;
4387
4388 case OP_DIGIT:
4389 if ((md->ctypes[c] & ctype_digit) == 0) return FALSE;
4390 break;
4391
4392 case OP_NOT_WHITESPACE:
4393 if ((md->ctypes[c] & ctype_space) != 0) return FALSE;
4394 break;
4395
4396 case OP_WHITESPACE:
4397 if ((md->ctypes[c] & ctype_space) == 0) return FALSE;
4398 break;
4399
4400 case OP_NOT_WORDCHAR:
4401 if ((md->ctypes[c] & ctype_word) != 0) return FALSE;
4402 break;
4403
4404 case OP_WORDCHAR:
4405 if ((md->ctypes[c] & ctype_word) == 0) return FALSE;
4406 break;
4407 }
4408 }
4409 /* Control never gets here */
4410 }
4411
4412 /* If maximizing it is worth using inline code for speed, doing the type
4413 test once at the start (i.e. keep it out of the loop). */
4414
4415 else
4416 {
4417 const uschar *pp = eptr;
4418 switch(ctype)
4419 {
4420 case OP_ANY:
4421 if ((ims & PCRE_DOTALL) == 0)
4422 {
4423 for (i = min; i < max; i++)
4424 {
4425 if (eptr >= md->end_subject || *eptr == '\n') break;
4426 eptr++;
4427 }
4428 }
4429 else
4430 {
4431 c = max - min;
4432 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
4433 eptr += c;
4434 }
4435 break;
4436
4437 case OP_NOT_DIGIT:
4438 for (i = min; i < max; i++)
4439 {
4440 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4441 break;
4442 eptr++;
4443 }
4444 break;
4445
4446 case OP_DIGIT:
4447 for (i = min; i < max; i++)
4448 {
4449 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4450 break;
4451 eptr++;
4452 }
4453 break;
4454
4455 case OP_NOT_WHITESPACE:
4456 for (i = min; i < max; i++)
4457 {
4458 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4459 break;
4460 eptr++;
4461 }
4462 break;
4463
4464 case OP_WHITESPACE:
4465 for (i = min; i < max; i++)
4466 {
4467 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4468 break;
4469 eptr++;
4470 }
4471 break;
4472
4473 case OP_NOT_WORDCHAR:
4474 for (i = min; i < max; i++)
4475 {
4476 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4477 break;
4478 eptr++;
4479 }
4480 break;
4481
4482 case OP_WORDCHAR:
4483 for (i = min; i < max; i++)
4484 {
4485 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4486 break;
4487 eptr++;
4488 }
4489 break;
4490 }
4491
4492 while (eptr >= pp)
4493 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4494 return TRUE;
4495 return FALSE;
4496 }
4497 /* Control never gets here */
4498
4499 /* There's been some horrible disaster. */
4500
4501 default:
4502 DPRINTF(("Unknown opcode %d\n", *ecode));
4503 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
4504 return FALSE;
4505 }
4506
4507 /* Do not stick any code in here without much thought; it is assumed
4508 that "continue" in the code above comes out to here to repeat the main
4509 loop. */
4510
4511 } /* End of main loop */
4512 /* Control never reaches here */
4513 }
4514
4515
4516
4517
4518 /*************************************************
4519 * Execute a Regular Expression *
4520 *************************************************/
4521
4522 /* This function applies a compiled re to a subject string and picks out
4523 portions of the string if it matches. Two elements in the vector are set for
4524 each substring: the offsets to the start and end of the substring.
4525
4526 Arguments:
4527 external_re points to the compiled expression
4528 external_extra points to "hints" from pcre_study() or is NULL
4529 subject points to the subject string
4530 length length of subject string (may contain binary zeros)
4531 start_offset where to start in the subject string
4532 options option bits
4533 offsets points to a vector of ints to be filled in with offsets
4534 offsetcount the number of elements in the vector
4535
4536 Returns: > 0 => success; value is the number of elements filled in
4537 = 0 => success, but offsets is not big enough
4538 -1 => failed to match
4539 < -1 => some kind of unexpected problem
4540 */
4541
4542 int
4543 pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
4544 const char *subject, int length, int start_offset, int options, int *offsets,
4545 int offsetcount)
4546 {
4547 int resetcount, ocount;
4548 int first_char = -1;
4549 int req_char = -1;
4550 int req_char2 = -1;
4551 unsigned long int ims = 0;
4552 match_data match_block;
4553 const uschar *start_bits = NULL;
4554 const uschar *start_match = (const uschar *)subject + start_offset;
4555 const uschar *end_subject;
4556 const uschar *req_char_ptr = start_match - 1;
4557 const real_pcre *re = (const real_pcre *)external_re;
4558 const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
4559 BOOL using_temporary_offsets = FALSE;
4560 BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4561 BOOL startline = (re->options & PCRE_STARTLINE) != 0;
4562
4563 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4564
4565 if (re == NULL || subject == NULL ||
4566 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4567 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4568
4569 match_block.start_pattern = re->code;
4570 match_block.start_subject = (const uschar *)subject;
4571 match_block.end_subject = match_block.start_subject + length;
4572 end_subject = match_block.end_subject;
4573
4574 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4575
4576 match_block.notbol = (options & PCRE_NOTBOL) != 0;
4577 match_block.noteol = (options & PCRE_NOTEOL) != 0;
4578 match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
4579
4580 match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
4581
4582 match_block.lcc = re->tables + lcc_offset;
4583 match_block.ctypes = re->tables + ctypes_offset;
4584
4585 /* The ims options can vary during the matching as a result of the presence
4586 of (?ims) items in the pattern. They are kept in a local variable so that
4587 restoring at the exit of a group is easy. */
4588
4589 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4590
4591 /* If the expression has got more back references than the offsets supplied can
4592 hold, we get a temporary bit of working store to use during the matching.
4593 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4594 of 3. */
4595
4596 ocount = offsetcount - (offsetcount % 3);
4597
4598 if (re->top_backref > 0 && re->top_backref >= ocount/3)
4599 {
4600 ocount = re->top_backref * 3 + 3;
4601 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4602 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4603 using_temporary_offsets = TRUE;
4604 DPRINTF(("Got memory to hold back references\n"));
4605 }
4606 else match_block.offset_vector = offsets;
4607
4608 match_block.offset_end = ocount;
4609 match_block.offset_max = (2*ocount)/3;
4610 match_block.offset_overflow = FALSE;
4611
4612 /* Compute the minimum number of offsets that we need to reset each time. Doing
4613 this makes a huge difference to execution time when there aren't many brackets
4614 in the pattern. */
4615
4616 resetcount = 2 + re->top_bracket * 2;
4617 if (resetcount > offsetcount) resetcount = ocount;
4618
4619 /* Reset the working variable associated with each extraction. These should
4620 never be used unless previously set, but they get saved and restored, and so we
4621 initialize them to avoid reading uninitialized locations. */
4622
4623 if (match_block.offset_vector != NULL)
4624 {
4625 register int *iptr = match_block.offset_vector + ocount;
4626 register int *iend = iptr - resetcount/2 + 1;
4627 while (--iptr >= iend) *iptr = -1;
4628 }
4629
4630 /* Set up the first character to match, if available. The first_char value is
4631 never set for an anchored regular expression, but the anchoring may be forced
4632 at run time, so we have to test for anchoring. The first char may be unset for
4633 an unanchored pattern, of course. If there's no first char and the pattern was
4634 studied, there may be a bitmap of possible first characters. */
4635
4636 if (!anchored)
4637 {
4638 if ((re->options & PCRE_FIRSTSET) != 0)
4639 {
4640 first_char = re->first_char;
4641 if ((ims & PCRE_CASELESS) != 0) first_char = match_block.lcc[first_char];
4642 }
4643 else
4644 if (!startline && extra != NULL &&
4645 (extra->options & PCRE_STUDY_MAPPED) != 0)
4646 start_bits = extra->start_bits;
4647 }
4648
4649 /* For anchored or unanchored matches, there may be a "last known required
4650 character" set. If the PCRE_CASELESS is set, implying that the match starts
4651 caselessly, or if there are any changes of this flag within the regex, set up
4652 both cases of the character. Otherwise set the two values the same, which will
4653 avoid duplicate testing (which takes significant time). This covers the vast
4654 majority of cases. It will be suboptimal when the case flag changes in a regex
4655 and the required character in fact is caseful. */
4656
4657 if ((re->options & PCRE_REQCHSET) != 0)
4658 {
4659 req_char = re->req_char;
4660 req_char2 = ((re->options & (PCRE_CASELESS | PCRE_ICHANGED)) != 0)?
4661 (re->tables + fcc_offset)[req_char] : req_char;
4662 }
4663
4664 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
4665 the loop runs just once. */
4666
4667 do
4668 {
4669 int rc;
4670 register int *iptr = match_block.offset_vector;
4671 register int *iend = iptr + resetcount;
4672
4673 /* Reset the maximum number of extractions we might see. */
4674
4675 while (iptr < iend) *iptr++ = -1;
4676
4677 /* Advance to a unique first char if possible */
4678
4679 if (first_char >= 0)
4680 {
4681 if ((ims & PCRE_CASELESS) != 0)
4682 while (start_match < end_subject &&
4683 match_block.lcc[*start_match] != first_char)
4684 start_match++;
4685 else
4686 while (start_match < end_subject && *start_match != first_char)
4687 start_match++;
4688 }
4689
4690 /* Or to just after \n for a multiline match if possible */
4691
4692 else if (startline)
4693 {
4694 if (start_match > match_block.start_subject + start_offset)
4695 {
4696 while (start_match < end_subject && start_match[-1] != '\n')
4697 start_match++;
4698 }
4699 }
4700
4701 /* Or to a non-unique first char after study */
4702
4703 else if (start_bits != NULL)
4704 {
4705 while (start_match < end_subject)
4706 {
4707 register int c = *start_match;
4708 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
4709 }
4710 }
4711
4712 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4713 printf(">>>> Match against: ");
4714 pchars(start_match, end_subject - start_match, TRUE, &match_block);
4715 printf("\n");
4716 #endif
4717
4718 /* If req_char is set, we know that that character must appear in the subject
4719 for the match to succeed. If the first character is set, req_char must be
4720 later in the subject; otherwise the test starts at the match point. This
4721 optimization can save a huge amount of backtracking in patterns with nested
4722 unlimited repeats that aren't going to match. We don't know what the state of
4723 case matching may be when this character is hit, so test for it in both its
4724 cases if necessary. However, the different cased versions will not be set up
4725 unless PCRE_CASELESS was given or the casing state changes within the regex.
4726 Writing separate code makes it go faster, as does using an autoincrement and
4727 backing off on a match. */
4728
4729 if (req_char >= 0)
4730 {
4731 register const uschar *p = start_match + ((first_char >= 0)? 1 : 0);
4732
4733 /* We don't need to repeat the search if we haven't yet reached the
4734 place we found it at last time. */
4735
4736 if (p > req_char_ptr)
4737 {
4738 /* Do a single test if no case difference is set up */
4739
4740 if (req_char == req_char2)
4741 {
4742 while (p < end_subject)
4743 {
4744 if (*p++ == req_char) { p--; break; }
4745 }
4746 }
4747
4748 /* Otherwise test for either case */
4749
4750 else
4751 {
4752 while (p < end_subject)
4753 {
4754 register int pp = *p++;
4755 if (pp == req_char || pp == req_char2) { p--; break; }
4756 }
4757 }
4758
4759 /* If we can't find the required character, break the matching loop */
4760
4761 if (p >= end_subject) break;
4762
4763 /* If we have found the required character, save the point where we
4764 found it, so that we don't search again next time round the loop if
4765 the start hasn't passed this character yet. */
4766
4767 req_char_ptr = p;
4768 }
4769 }
4770
4771 /* When a match occurs, substrings will be set for all internal extractions;
4772 we just need to set up the whole thing as substring 0 before returning. If
4773 there were too many extractions, set the return code to zero. In the case
4774 where we had to get some local store to hold offsets for backreferences, copy
4775 those back references that we can. In this case there need not be overflow
4776 if certain parts of the pattern were not used. */
4777
4778 match_block.start_match = start_match;
4779 if (!match(start_match, re->code, 2, &match_block, ims, NULL, match_isgroup))
4780 continue;
4781
4782 /* Copy the offset information from temporary store if necessary */
4783
4784 if (using_temporary_offsets)
4785 {
4786 if (offsetcount >= 4)
4787 {
4788 memcpy(offsets + 2, match_block.offset_vector + 2,
4789 (offsetcount - 2) * sizeof(int));
4790 DPRINTF(("Copied offsets from temporary memory\n"));
4791 }
4792 if (match_block.end_offset_top > offsetcount)
4793 match_block.offset_overflow = TRUE;
4794
4795 DPRINTF(("Freeing temporary memory\n"));
4796 (pcre_free)(match_block.offset_vector);
4797 }
4798
4799 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
4800
4801 if (match_block.offset_end < 2) rc = 0; else
4802 {
4803 offsets[0] = start_match - match_block.start_subject;
4804 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
4805 }
4806
4807 DPRINTF((">>>> returning %d\n", rc));
4808 return rc;
4809 }
4810
4811 /* This "while" is the end of the "do" above */
4812
4813 while (!anchored &&
4814 match_block.errorcode == PCRE_ERROR_NOMATCH &&
4815 start_match++ < end_subject);
4816
4817 if (using_temporary_offsets)
4818 {
4819 DPRINTF(("Freeing temporary memory\n"));
4820 (pcre_free)(match_block.offset_vector);
4821 }
4822
4823 DPRINTF((">>>> returning %d\n", match_block.errorcode));
4824
4825 return match_block.errorcode;
4826 }
4827
4828 /* End of pcre.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12