/[pcre]/code/tags/pcre-3.3/pcre.c
ViewVC logotype

Contents of /code/tags/pcre-3.3/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 50 - (show annotations) (download)
Sat Feb 24 21:39:35 2007 UTC (7 years, 9 months ago) by nigel
File MIME type: text/plain
File size: 151967 byte(s)
Tag code/trunk as code/tags/pcre-3.3.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /*
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
9
10 Written by: Philip Hazel <ph10@cam.ac.uk>
11
12 Copyright (c) 1997-2000 University of Cambridge
13
14 -----------------------------------------------------------------------------
15 Permission is granted to anyone to use this software for any purpose on any
16 computer system, and to redistribute it freely, subject to the following
17 restrictions:
18
19 1. This software is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22
23 2. The origin of this software must not be misrepresented, either by
24 explicit claim or by omission.
25
26 3. Altered versions must be plainly marked as such, and must not be
27 misrepresented as being the original software.
28
29 4. If PCRE is embedded in any software that is released under the GNU
30 General Purpose Licence (GPL), then the terms of that licence shall
31 supersede any condition above with which it is incompatible.
32 -----------------------------------------------------------------------------
33 */
34
35
36 /* Define DEBUG to get debugging output on stdout. */
37
38 /* #define DEBUG */
39
40 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
41 inline, and there are *still* stupid compilers about that don't like indented
42 pre-processor statements. I suppose it's only been 10 years... */
43
44 #ifdef DEBUG
45 #define DPRINTF(p) printf p
46 #else
47 #define DPRINTF(p) /*nothing*/
48 #endif
49
50 /* Include the internals header, which itself includes Standard C headers plus
51 the external pcre header. */
52
53 #include "internal.h"
54
55
56 /* Allow compilation as C++ source code, should anybody want to do that. */
57
58 #ifdef __cplusplus
59 #define class pcre_class
60 #endif
61
62
63 /* Number of items on the nested bracket stacks at compile time. This should
64 not be set greater than 200. */
65
66 #define BRASTACK_SIZE 200
67
68
69 /* The number of bytes in a literal character string above which we can't add
70 any more is different when UTF-8 characters may be encountered. */
71
72 #ifdef SUPPORT_UTF8
73 #define MAXLIT 250
74 #else
75 #define MAXLIT 255
76 #endif
77
78
79 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
80
81 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
82 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
83
84 /* Text forms of OP_ values and things, for debugging (not all used) */
85
86 #ifdef DEBUG
87 static const char *OP_names[] = {
88 "End", "\\A", "\\B", "\\b", "\\D", "\\d",
89 "\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",
90 "Opt", "^", "$", "Any", "chars", "not",
91 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
92 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
93 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
94 "*", "*?", "+", "+?", "?", "??", "{", "{",
95 "class", "Ref", "Recurse",
96 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
97 "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
98 "Brazero", "Braminzero", "Bra"
99 };
100 #endif
101
102 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
103 are simple data values; negative values are for special things like \d and so
104 on. Zero means further processing is needed (for things like \x), or the escape
105 is invalid. */
106
107 static const short int escapes[] = {
108 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
109 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
110 '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
111 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
112 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
113 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
114 '`', 7, -ESC_b, 0, -ESC_d, 27, '\f', 0, /* ` - g */
115 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
116 0, 0, '\r', -ESC_s, '\t', 0, 0, -ESC_w, /* p - w */
117 0, 0, -ESC_z /* x - z */
118 };
119
120 /* Tables of names of POSIX character classes and their lengths. The list is
121 terminated by a zero length entry. The first three must be alpha, upper, lower,
122 as this is assumed for handling case independence. */
123
124 static const char *posix_names[] = {
125 "alpha", "lower", "upper",
126 "alnum", "ascii", "cntrl", "digit", "graph",
127 "print", "punct", "space", "word", "xdigit" };
128
129 static const uschar posix_name_lengths[] = {
130 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
131
132 /* Table of class bit maps for each POSIX class; up to three may be combined
133 to form the class. */
134
135 static const int posix_class_maps[] = {
136 cbit_lower, cbit_upper, -1, /* alpha */
137 cbit_lower, -1, -1, /* lower */
138 cbit_upper, -1, -1, /* upper */
139 cbit_digit, cbit_lower, cbit_upper, /* alnum */
140 cbit_print, cbit_cntrl, -1, /* ascii */
141 cbit_cntrl, -1, -1, /* cntrl */
142 cbit_digit, -1, -1, /* digit */
143 cbit_graph, -1, -1, /* graph */
144 cbit_print, -1, -1, /* print */
145 cbit_punct, -1, -1, /* punct */
146 cbit_space, -1, -1, /* space */
147 cbit_word, -1, -1, /* word */
148 cbit_xdigit,-1, -1 /* xdigit */
149 };
150
151
152 /* Definition to allow mutual recursion */
153
154 static BOOL
155 compile_regex(int, int, int *, uschar **, const uschar **, const char **,
156 BOOL, int, int *, int *, compile_data *);
157
158 /* Structure for building a chain of data that actually lives on the
159 stack, for holding the values of the subject pointer at the start of each
160 subpattern, so as to detect when an empty string has been matched by a
161 subpattern - to break infinite loops. */
162
163 typedef struct eptrblock {
164 struct eptrblock *prev;
165 const uschar *saved_eptr;
166 } eptrblock;
167
168 /* Flag bits for the match() function */
169
170 #define match_condassert 0x01 /* Called to check a condition assertion */
171 #define match_isgroup 0x02 /* Set if start of bracketed group */
172
173
174
175 /*************************************************
176 * Global variables *
177 *************************************************/
178
179 /* PCRE is thread-clean and doesn't use any global variables in the normal
180 sense. However, it calls memory allocation and free functions via the two
181 indirections below, which are can be changed by the caller, but are shared
182 between all threads. */
183
184 void *(*pcre_malloc)(size_t) = malloc;
185 void (*pcre_free)(void *) = free;
186
187
188
189 /*************************************************
190 * Macros and tables for character handling *
191 *************************************************/
192
193 /* When UTF-8 encoding is being used, a character is no longer just a single
194 byte. The macros for character handling generate simple sequences when used in
195 byte-mode, and more complicated ones for UTF-8 characters. */
196
197 #ifndef SUPPORT_UTF8
198 #define GETCHARINC(c, eptr) c = *eptr++;
199 #define GETCHARLEN(c, eptr, len) c = *eptr;
200 #define BACKCHAR(eptr)
201
202 #else /* SUPPORT_UTF8 */
203
204 /* Get the next UTF-8 character, advancing the pointer */
205
206 #define GETCHARINC(c, eptr) \
207 c = *eptr++; \
208 if (md->utf8 && (c & 0xc0) == 0xc0) \
209 { \
210 int a = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
211 int s = 6 - a; /* Amount to shift next byte */ \
212 c &= utf8_table3[a]; /* Low order bits from first byte */ \
213 while (a-- > 0) \
214 { \
215 c |= (*eptr++ & 0x3f) << s; \
216 s += 6; \
217 } \
218 }
219
220 /* Get the next UTF-8 character, not advancing the pointer, setting length */
221
222 #define GETCHARLEN(c, eptr, len) \
223 c = *eptr; \
224 len = 1; \
225 if (md->utf8 && (c & 0xc0) == 0xc0) \
226 { \
227 int i; \
228 int a = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
229 int s = 6 - a; /* Amount to shift next byte */ \
230 c &= utf8_table3[a]; /* Low order bits from first byte */ \
231 for (i = 1; i <= a; i++) \
232 { \
233 c |= (eptr[i] & 0x3f) << s; \
234 s += 6; \
235 } \
236 len += a; \
237 }
238
239 /* If the pointer is not at the start of a character, move it back until
240 it is. */
241
242 #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
243
244 #endif
245
246
247
248 /*************************************************
249 * Default character tables *
250 *************************************************/
251
252 /* A default set of character tables is included in the PCRE binary. Its source
253 is built by the maketables auxiliary program, which uses the default C ctypes
254 functions, and put in the file chartables.c. These tables are used by PCRE
255 whenever the caller of pcre_compile() does not provide an alternate set of
256 tables. */
257
258 #include "chartables.c"
259
260
261
262 #ifdef SUPPORT_UTF8
263 /*************************************************
264 * Tables for UTF-8 support *
265 *************************************************/
266
267 /* These are the breakpoints for different numbers of bytes in a UTF-8
268 character. */
269
270 static int utf8_table1[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
271
272 /* These are the indicator bits and the mask for the data bits to set in the
273 first byte of a character, indexed by the number of additional bytes. */
274
275 static int utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
276 static int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
277
278 /* Table of the number of extra characters, indexed by the first character
279 masked with 0x3f. The highest number for a valid UTF-8 character is in fact
280 0x3d. */
281
282 static uschar utf8_table4[] = {
283 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
284 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
285 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
286 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
287
288
289 /*************************************************
290 * Convert character value to UTF-8 *
291 *************************************************/
292
293 /* This function takes an integer value in the range 0 - 0x7fffffff
294 and encodes it as a UTF-8 character in 0 to 6 bytes.
295
296 Arguments:
297 cvalue the character value
298 buffer pointer to buffer for result - at least 6 bytes long
299
300 Returns: number of characters placed in the buffer
301 */
302
303 static int
304 ord2utf8(int cvalue, uschar *buffer)
305 {
306 register int i, j;
307 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
308 if (cvalue <= utf8_table1[i]) break;
309 *buffer++ = utf8_table2[i] | (cvalue & utf8_table3[i]);
310 cvalue >>= 6 - i;
311 for (j = 0; j < i; j++)
312 {
313 *buffer++ = 0x80 | (cvalue & 0x3f);
314 cvalue >>= 6;
315 }
316 return i + 1;
317 }
318 #endif
319
320
321
322 /*************************************************
323 * Return version string *
324 *************************************************/
325
326 #define STRING(a) # a
327 #define XSTRING(s) STRING(s)
328
329 const char *
330 pcre_version(void)
331 {
332 return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
333 }
334
335
336
337
338 /*************************************************
339 * (Obsolete) Return info about compiled pattern *
340 *************************************************/
341
342 /* This is the original "info" function. It picks potentially useful data out
343 of the private structure, but its interface was too rigid. It remains for
344 backwards compatibility. The public options are passed back in an int - though
345 the re->options field has been expanded to a long int, all the public options
346 at the low end of it, and so even on 16-bit systems this will still be OK.
347 Therefore, I haven't changed the API for pcre_info().
348
349 Arguments:
350 external_re points to compiled code
351 optptr where to pass back the options
352 first_char where to pass back the first character,
353 or -1 if multiline and all branches start ^,
354 or -2 otherwise
355
356 Returns: number of capturing subpatterns
357 or negative values on error
358 */
359
360 int
361 pcre_info(const pcre *external_re, int *optptr, int *first_char)
362 {
363 const real_pcre *re = (const real_pcre *)external_re;
364 if (re == NULL) return PCRE_ERROR_NULL;
365 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
366 if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
367 if (first_char != NULL)
368 *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
369 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
370 return re->top_bracket;
371 }
372
373
374
375 /*************************************************
376 * Return info about compiled pattern *
377 *************************************************/
378
379 /* This is a newer "info" function which has an extensible interface so
380 that additional items can be added compatibly.
381
382 Arguments:
383 external_re points to compiled code
384 external_study points to study data, or NULL
385 what what information is required
386 where where to put the information
387
388 Returns: 0 if data returned, negative on error
389 */
390
391 int
392 pcre_fullinfo(const pcre *external_re, const pcre_extra *study_data, int what,
393 void *where)
394 {
395 const real_pcre *re = (const real_pcre *)external_re;
396 const real_pcre_extra *study = (const real_pcre_extra *)study_data;
397
398 if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
399 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
400
401 switch (what)
402 {
403 case PCRE_INFO_OPTIONS:
404 *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
405 break;
406
407 case PCRE_INFO_SIZE:
408 *((size_t *)where) = re->size;
409 break;
410
411 case PCRE_INFO_CAPTURECOUNT:
412 *((int *)where) = re->top_bracket;
413 break;
414
415 case PCRE_INFO_BACKREFMAX:
416 *((int *)where) = re->top_backref;
417 break;
418
419 case PCRE_INFO_FIRSTCHAR:
420 *((int *)where) =
421 ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
422 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
423 break;
424
425 case PCRE_INFO_FIRSTTABLE:
426 *((const uschar **)where) =
427 (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
428 study->start_bits : NULL;
429 break;
430
431 case PCRE_INFO_LASTLITERAL:
432 *((int *)where) =
433 ((re->options & PCRE_REQCHSET) != 0)? re->req_char : -1;
434 break;
435
436 default: return PCRE_ERROR_BADOPTION;
437 }
438
439 return 0;
440 }
441
442
443
444 #ifdef DEBUG
445 /*************************************************
446 * Debugging function to print chars *
447 *************************************************/
448
449 /* Print a sequence of chars in printable format, stopping at the end of the
450 subject if the requested.
451
452 Arguments:
453 p points to characters
454 length number to print
455 is_subject TRUE if printing from within md->start_subject
456 md pointer to matching data block, if is_subject is TRUE
457
458 Returns: nothing
459 */
460
461 static void
462 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
463 {
464 int c;
465 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
466 while (length-- > 0)
467 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
468 }
469 #endif
470
471
472
473
474 /*************************************************
475 * Handle escapes *
476 *************************************************/
477
478 /* This function is called when a \ has been encountered. It either returns a
479 positive value for a simple escape such as \n, or a negative value which
480 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
481 a positive value greater than 255 may be returned. On entry, ptr is pointing at
482 the \. On exit, it is on the final character of the escape sequence.
483
484 Arguments:
485 ptrptr points to the pattern position pointer
486 errorptr points to the pointer to the error message
487 bracount number of previous extracting brackets
488 options the options bits
489 isclass TRUE if inside a character class
490 cd pointer to char tables block
491
492 Returns: zero or positive => a data character
493 negative => a special escape sequence
494 on error, errorptr is set
495 */
496
497 static int
498 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
499 int options, BOOL isclass, compile_data *cd)
500 {
501 const uschar *ptr = *ptrptr;
502 int c, i;
503
504 /* If backslash is at the end of the pattern, it's an error. */
505
506 c = *(++ptr);
507 if (c == 0) *errorptr = ERR1;
508
509 /* Digits or letters may have special meaning; all others are literals. */
510
511 else if (c < '0' || c > 'z') {}
512
513 /* Do an initial lookup in a table. A non-zero result is something that can be
514 returned immediately. Otherwise further processing may be required. */
515
516 else if ((i = escapes[c - '0']) != 0) c = i;
517
518 /* Escapes that need further processing, or are illegal. */
519
520 else
521 {
522 const uschar *oldptr;
523 switch (c)
524 {
525 /* The handling of escape sequences consisting of a string of digits
526 starting with one that is not zero is not straightforward. By experiment,
527 the way Perl works seems to be as follows:
528
529 Outside a character class, the digits are read as a decimal number. If the
530 number is less than 10, or if there are that many previous extracting
531 left brackets, then it is a back reference. Otherwise, up to three octal
532 digits are read to form an escaped byte. Thus \123 is likely to be octal
533 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
534 value is greater than 377, the least significant 8 bits are taken. Inside a
535 character class, \ followed by a digit is always an octal number. */
536
537 case '1': case '2': case '3': case '4': case '5':
538 case '6': case '7': case '8': case '9':
539
540 if (!isclass)
541 {
542 oldptr = ptr;
543 c -= '0';
544 while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
545 c = c * 10 + *(++ptr) - '0';
546 if (c < 10 || c <= bracount)
547 {
548 c = -(ESC_REF + c);
549 break;
550 }
551 ptr = oldptr; /* Put the pointer back and fall through */
552 }
553
554 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
555 generates a binary zero byte and treats the digit as a following literal.
556 Thus we have to pull back the pointer by one. */
557
558 if ((c = *ptr) >= '8')
559 {
560 ptr--;
561 c = 0;
562 break;
563 }
564
565 /* \0 always starts an octal number, but we may drop through to here with a
566 larger first octal digit. */
567
568 case '0':
569 c -= '0';
570 while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
571 ptr[1] != '8' && ptr[1] != '9')
572 c = c * 8 + *(++ptr) - '0';
573 c &= 255; /* Take least significant 8 bits */
574 break;
575
576 /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
577 which can be greater than 0xff, but only if the ddd are hex digits. */
578
579 case 'x':
580 #ifdef SUPPORT_UTF8
581 if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
582 {
583 const uschar *pt = ptr + 2;
584 register int count = 0;
585 c = 0;
586 while ((cd->ctypes[*pt] & ctype_xdigit) != 0)
587 {
588 count++;
589 c = c * 16 + cd->lcc[*pt] -
590 (((cd->ctypes[*pt] & ctype_digit) != 0)? '0' : 'W');
591 pt++;
592 }
593 if (*pt == '}')
594 {
595 if (c < 0 || count > 8) *errorptr = ERR34;
596 ptr = pt;
597 break;
598 }
599 /* If the sequence of hex digits does not end with '}', then we don't
600 recognize this construct; fall through to the normal \x handling. */
601 }
602 #endif
603
604 /* Read just a single hex char */
605
606 c = 0;
607 while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
608 {
609 ptr++;
610 c = c * 16 + cd->lcc[*ptr] -
611 (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
612 }
613 break;
614
615 /* Other special escapes not starting with a digit are straightforward */
616
617 case 'c':
618 c = *(++ptr);
619 if (c == 0)
620 {
621 *errorptr = ERR2;
622 return 0;
623 }
624
625 /* A letter is upper-cased; then the 0x40 bit is flipped */
626
627 if (c >= 'a' && c <= 'z') c = cd->fcc[c];
628 c ^= 0x40;
629 break;
630
631 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
632 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
633 for Perl compatibility, it is a literal. This code looks a bit odd, but
634 there used to be some cases other than the default, and there may be again
635 in future, so I haven't "optimized" it. */
636
637 default:
638 if ((options & PCRE_EXTRA) != 0) switch(c)
639 {
640 default:
641 *errorptr = ERR3;
642 break;
643 }
644 break;
645 }
646 }
647
648 *ptrptr = ptr;
649 return c;
650 }
651
652
653
654 /*************************************************
655 * Check for counted repeat *
656 *************************************************/
657
658 /* This function is called when a '{' is encountered in a place where it might
659 start a quantifier. It looks ahead to see if it really is a quantifier or not.
660 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
661 where the ddds are digits.
662
663 Arguments:
664 p pointer to the first char after '{'
665 cd pointer to char tables block
666
667 Returns: TRUE or FALSE
668 */
669
670 static BOOL
671 is_counted_repeat(const uschar *p, compile_data *cd)
672 {
673 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
674 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
675 if (*p == '}') return TRUE;
676
677 if (*p++ != ',') return FALSE;
678 if (*p == '}') return TRUE;
679
680 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
681 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
682 return (*p == '}');
683 }
684
685
686
687 /*************************************************
688 * Read repeat counts *
689 *************************************************/
690
691 /* Read an item of the form {n,m} and return the values. This is called only
692 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
693 so the syntax is guaranteed to be correct, but we need to check the values.
694
695 Arguments:
696 p pointer to first char after '{'
697 minp pointer to int for min
698 maxp pointer to int for max
699 returned as -1 if no max
700 errorptr points to pointer to error message
701 cd pointer to character tables clock
702
703 Returns: pointer to '}' on success;
704 current ptr on error, with errorptr set
705 */
706
707 static const uschar *
708 read_repeat_counts(const uschar *p, int *minp, int *maxp,
709 const char **errorptr, compile_data *cd)
710 {
711 int min = 0;
712 int max = -1;
713
714 while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
715
716 if (*p == '}') max = min; else
717 {
718 if (*(++p) != '}')
719 {
720 max = 0;
721 while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
722 if (max < min)
723 {
724 *errorptr = ERR4;
725 return p;
726 }
727 }
728 }
729
730 /* Do paranoid checks, then fill in the required variables, and pass back the
731 pointer to the terminating '}'. */
732
733 if (min > 65535 || max > 65535)
734 *errorptr = ERR5;
735 else
736 {
737 *minp = min;
738 *maxp = max;
739 }
740 return p;
741 }
742
743
744
745 /*************************************************
746 * Find the fixed length of a pattern *
747 *************************************************/
748
749 /* Scan a pattern and compute the fixed length of subject that will match it,
750 if the length is fixed. This is needed for dealing with backward assertions.
751
752 Arguments:
753 code points to the start of the pattern (the bracket)
754 options the compiling options
755
756 Returns: the fixed length, or -1 if there is no fixed length
757 */
758
759 static int
760 find_fixedlength(uschar *code, int options)
761 {
762 int length = -1;
763
764 register int branchlength = 0;
765 register uschar *cc = code + 3;
766
767 /* Scan along the opcodes for this branch. If we get to the end of the
768 branch, check the length against that of the other branches. */
769
770 for (;;)
771 {
772 int d;
773 register int op = *cc;
774 if (op >= OP_BRA) op = OP_BRA;
775
776 switch (op)
777 {
778 case OP_BRA:
779 case OP_ONCE:
780 case OP_COND:
781 d = find_fixedlength(cc, options);
782 if (d < 0) return -1;
783 branchlength += d;
784 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
785 cc += 3;
786 break;
787
788 /* Reached end of a branch; if it's a ket it is the end of a nested
789 call. If it's ALT it is an alternation in a nested call. If it is
790 END it's the end of the outer call. All can be handled by the same code. */
791
792 case OP_ALT:
793 case OP_KET:
794 case OP_KETRMAX:
795 case OP_KETRMIN:
796 case OP_END:
797 if (length < 0) length = branchlength;
798 else if (length != branchlength) return -1;
799 if (*cc != OP_ALT) return length;
800 cc += 3;
801 branchlength = 0;
802 break;
803
804 /* Skip over assertive subpatterns */
805
806 case OP_ASSERT:
807 case OP_ASSERT_NOT:
808 case OP_ASSERTBACK:
809 case OP_ASSERTBACK_NOT:
810 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
811 cc += 3;
812 break;
813
814 /* Skip over things that don't match chars */
815
816 case OP_REVERSE:
817 cc++;
818 /* Fall through */
819
820 case OP_CREF:
821 case OP_OPT:
822 cc++;
823 /* Fall through */
824
825 case OP_SOD:
826 case OP_EOD:
827 case OP_EODN:
828 case OP_CIRC:
829 case OP_DOLL:
830 case OP_NOT_WORD_BOUNDARY:
831 case OP_WORD_BOUNDARY:
832 cc++;
833 break;
834
835 /* Handle char strings. In UTF-8 mode we must count characters, not bytes.
836 This requires a scan of the string, unfortunately. We assume valid UTF-8
837 strings, so all we do is reduce the length by one for byte whose bits are
838 10xxxxxx. */
839
840 case OP_CHARS:
841 branchlength += *(++cc);
842 #ifdef SUPPORT_UTF8
843 for (d = 1; d <= *cc; d++)
844 if ((cc[d] & 0xc0) == 0x80) branchlength--;
845 #endif
846 cc += *cc + 1;
847 break;
848
849 /* Handle exact repetitions */
850
851 case OP_EXACT:
852 case OP_TYPEEXACT:
853 branchlength += (cc[1] << 8) + cc[2];
854 cc += 4;
855 break;
856
857 /* Handle single-char matchers */
858
859 case OP_NOT_DIGIT:
860 case OP_DIGIT:
861 case OP_NOT_WHITESPACE:
862 case OP_WHITESPACE:
863 case OP_NOT_WORDCHAR:
864 case OP_WORDCHAR:
865 case OP_ANY:
866 branchlength++;
867 cc++;
868 break;
869
870
871 /* Check a class for variable quantification */
872
873 case OP_CLASS:
874 cc += (*cc == OP_REF)? 2 : 33;
875
876 switch (*cc)
877 {
878 case OP_CRSTAR:
879 case OP_CRMINSTAR:
880 case OP_CRQUERY:
881 case OP_CRMINQUERY:
882 return -1;
883
884 case OP_CRRANGE:
885 case OP_CRMINRANGE:
886 if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;
887 branchlength += (cc[1] << 8) + cc[2];
888 cc += 5;
889 break;
890
891 default:
892 branchlength++;
893 }
894 break;
895
896 /* Anything else is variable length */
897
898 default:
899 return -1;
900 }
901 }
902 /* Control never gets here */
903 }
904
905
906
907
908 /*************************************************
909 * Check for POSIX class syntax *
910 *************************************************/
911
912 /* This function is called when the sequence "[:" or "[." or "[=" is
913 encountered in a character class. It checks whether this is followed by an
914 optional ^ and then a sequence of letters, terminated by a matching ":]" or
915 ".]" or "=]".
916
917 Argument:
918 ptr pointer to the initial [
919 endptr where to return the end pointer
920 cd pointer to compile data
921
922 Returns: TRUE or FALSE
923 */
924
925 static BOOL
926 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
927 {
928 int terminator; /* Don't combine these lines; the Solaris cc */
929 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
930 if (*(++ptr) == '^') ptr++;
931 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
932 if (*ptr == terminator && ptr[1] == ']')
933 {
934 *endptr = ptr;
935 return TRUE;
936 }
937 return FALSE;
938 }
939
940
941
942
943 /*************************************************
944 * Check POSIX class name *
945 *************************************************/
946
947 /* This function is called to check the name given in a POSIX-style class entry
948 such as [:alnum:].
949
950 Arguments:
951 ptr points to the first letter
952 len the length of the name
953
954 Returns: a value representing the name, or -1 if unknown
955 */
956
957 static int
958 check_posix_name(const uschar *ptr, int len)
959 {
960 register int yield = 0;
961 while (posix_name_lengths[yield] != 0)
962 {
963 if (len == posix_name_lengths[yield] &&
964 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
965 yield++;
966 }
967 return -1;
968 }
969
970
971
972
973 /*************************************************
974 * Compile one branch *
975 *************************************************/
976
977 /* Scan the pattern, compiling it into the code vector.
978
979 Arguments:
980 options the option bits
981 brackets points to number of brackets used
982 code points to the pointer to the current code point
983 ptrptr points to the current pattern pointer
984 errorptr points to pointer to error message
985 optchanged set to the value of the last OP_OPT item compiled
986 reqchar set to the last literal character required, else -1
987 countlits set to count of mandatory literal characters
988 cd contains pointers to tables
989
990 Returns: TRUE on success
991 FALSE, with *errorptr set on error
992 */
993
994 static BOOL
995 compile_branch(int options, int *brackets, uschar **codeptr,
996 const uschar **ptrptr, const char **errorptr, int *optchanged,
997 int *reqchar, int *countlits, compile_data *cd)
998 {
999 int repeat_type, op_type;
1000 int repeat_min, repeat_max;
1001 int bravalue, length;
1002 int greedy_default, greedy_non_default;
1003 int prevreqchar;
1004 int condcount = 0;
1005 int subcountlits = 0;
1006 register int c;
1007 register uschar *code = *codeptr;
1008 uschar *tempcode;
1009 const uschar *ptr = *ptrptr;
1010 const uschar *tempptr;
1011 uschar *previous = NULL;
1012 uschar class[32];
1013
1014 /* Set up the default and non-default settings for greediness */
1015
1016 greedy_default = ((options & PCRE_UNGREEDY) != 0);
1017 greedy_non_default = greedy_default ^ 1;
1018
1019 /* Initialize no required char, and count of literals */
1020
1021 *reqchar = prevreqchar = -1;
1022 *countlits = 0;
1023
1024 /* Switch on next character until the end of the branch */
1025
1026 for (;; ptr++)
1027 {
1028 BOOL negate_class;
1029 int class_charcount;
1030 int class_lastchar;
1031 int newoptions;
1032 int condref;
1033 int subreqchar;
1034
1035 c = *ptr;
1036 if ((options & PCRE_EXTENDED) != 0)
1037 {
1038 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1039 if (c == '#')
1040 {
1041 /* The space before the ; is to avoid a warning on a silly compiler
1042 on the Macintosh. */
1043 while ((c = *(++ptr)) != 0 && c != '\n') ;
1044 continue;
1045 }
1046 }
1047
1048 switch(c)
1049 {
1050 /* The branch terminates at end of string, |, or ). */
1051
1052 case 0:
1053 case '|':
1054 case ')':
1055 *codeptr = code;
1056 *ptrptr = ptr;
1057 return TRUE;
1058
1059 /* Handle single-character metacharacters */
1060
1061 case '^':
1062 previous = NULL;
1063 *code++ = OP_CIRC;
1064 break;
1065
1066 case '$':
1067 previous = NULL;
1068 *code++ = OP_DOLL;
1069 break;
1070
1071 case '.':
1072 previous = code;
1073 *code++ = OP_ANY;
1074 break;
1075
1076 /* Character classes. These always build a 32-byte bitmap of the permitted
1077 characters, except in the special case where there is only one character.
1078 For negated classes, we build the map as usual, then invert it at the end.
1079 */
1080
1081 case '[':
1082 previous = code;
1083 *code++ = OP_CLASS;
1084
1085 /* If the first character is '^', set the negation flag and skip it. */
1086
1087 if ((c = *(++ptr)) == '^')
1088 {
1089 negate_class = TRUE;
1090 c = *(++ptr);
1091 }
1092 else negate_class = FALSE;
1093
1094 /* Keep a count of chars so that we can optimize the case of just a single
1095 character. */
1096
1097 class_charcount = 0;
1098 class_lastchar = -1;
1099
1100 /* Initialize the 32-char bit map to all zeros. We have to build the
1101 map in a temporary bit of store, in case the class contains only 1
1102 character, because in that case the compiled code doesn't use the
1103 bit map. */
1104
1105 memset(class, 0, 32 * sizeof(uschar));
1106
1107 /* Process characters until ] is reached. By writing this as a "do" it
1108 means that an initial ] is taken as a data character. */
1109
1110 do
1111 {
1112 if (c == 0)
1113 {
1114 *errorptr = ERR6;
1115 goto FAILED;
1116 }
1117
1118 /* Handle POSIX class names. Perl allows a negation extension of the
1119 form [:^name]. A square bracket that doesn't match the syntax is
1120 treated as a literal. We also recognize the POSIX constructions
1121 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1122 5.6 does. */
1123
1124 if (c == '[' &&
1125 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1126 check_posix_syntax(ptr, &tempptr, cd))
1127 {
1128 BOOL local_negate = FALSE;
1129 int posix_class, i;
1130 register const uschar *cbits = cd->cbits;
1131
1132 if (ptr[1] != ':')
1133 {
1134 *errorptr = ERR31;
1135 goto FAILED;
1136 }
1137
1138 ptr += 2;
1139 if (*ptr == '^')
1140 {
1141 local_negate = TRUE;
1142 ptr++;
1143 }
1144
1145 posix_class = check_posix_name(ptr, tempptr - ptr);
1146 if (posix_class < 0)
1147 {
1148 *errorptr = ERR30;
1149 goto FAILED;
1150 }
1151
1152 /* If matching is caseless, upper and lower are converted to
1153 alpha. This relies on the fact that the class table starts with
1154 alpha, lower, upper as the first 3 entries. */
1155
1156 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1157 posix_class = 0;
1158
1159 /* Or into the map we are building up to 3 of the static class
1160 tables, or their negations. */
1161
1162 posix_class *= 3;
1163 for (i = 0; i < 3; i++)
1164 {
1165 int taboffset = posix_class_maps[posix_class + i];
1166 if (taboffset < 0) break;
1167 if (local_negate)
1168 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
1169 else
1170 for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
1171 }
1172
1173 ptr = tempptr + 1;
1174 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
1175 continue;
1176 }
1177
1178 /* Backslash may introduce a single character, or it may introduce one
1179 of the specials, which just set a flag. Escaped items are checked for
1180 validity in the pre-compiling pass. The sequence \b is a special case.
1181 Inside a class (and only there) it is treated as backspace. Elsewhere
1182 it marks a word boundary. Other escapes have preset maps ready to
1183 or into the one we are building. We assume they have more than one
1184 character in them, so set class_count bigger than one. */
1185
1186 if (c == '\\')
1187 {
1188 c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1189 if (-c == ESC_b) c = '\b';
1190 else if (c < 0)
1191 {
1192 register const uschar *cbits = cd->cbits;
1193 class_charcount = 10;
1194 switch (-c)
1195 {
1196 case ESC_d:
1197 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
1198 continue;
1199
1200 case ESC_D:
1201 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
1202 continue;
1203
1204 case ESC_w:
1205 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
1206 continue;
1207
1208 case ESC_W:
1209 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
1210 continue;
1211
1212 case ESC_s:
1213 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
1214 continue;
1215
1216 case ESC_S:
1217 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
1218 continue;
1219
1220 default:
1221 *errorptr = ERR7;
1222 goto FAILED;
1223 }
1224 }
1225
1226 /* Fall through if single character, but don't at present allow
1227 chars > 255 in UTF-8 mode. */
1228
1229 #ifdef SUPPORT_UTF8
1230 if (c > 255)
1231 {
1232 *errorptr = ERR33;
1233 goto FAILED;
1234 }
1235 #endif
1236 }
1237
1238 /* A single character may be followed by '-' to form a range. However,
1239 Perl does not permit ']' to be the end of the range. A '-' character
1240 here is treated as a literal. */
1241
1242 if (ptr[1] == '-' && ptr[2] != ']')
1243 {
1244 int d;
1245 ptr += 2;
1246 d = *ptr;
1247
1248 if (d == 0)
1249 {
1250 *errorptr = ERR6;
1251 goto FAILED;
1252 }
1253
1254 /* The second part of a range can be a single-character escape, but
1255 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1256 in such circumstances. */
1257
1258 if (d == '\\')
1259 {
1260 const uschar *oldptr = ptr;
1261 d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1262
1263 #ifdef SUPPORT_UTF8
1264 if (d > 255)
1265 {
1266 *errorptr = ERR33;
1267 goto FAILED;
1268 }
1269 #endif
1270 /* \b is backslash; any other special means the '-' was literal */
1271
1272 if (d < 0)
1273 {
1274 if (d == -ESC_b) d = '\b'; else
1275 {
1276 ptr = oldptr - 2;
1277 goto SINGLE_CHARACTER; /* A few lines below */
1278 }
1279 }
1280 }
1281
1282 if (d < c)
1283 {
1284 *errorptr = ERR8;
1285 goto FAILED;
1286 }
1287
1288 for (; c <= d; c++)
1289 {
1290 class[c/8] |= (1 << (c&7));
1291 if ((options & PCRE_CASELESS) != 0)
1292 {
1293 int uc = cd->fcc[c]; /* flip case */
1294 class[uc/8] |= (1 << (uc&7));
1295 }
1296 class_charcount++; /* in case a one-char range */
1297 class_lastchar = c;
1298 }
1299 continue; /* Go get the next char in the class */
1300 }
1301
1302 /* Handle a lone single character - we can get here for a normal
1303 non-escape char, or after \ that introduces a single character. */
1304
1305 SINGLE_CHARACTER:
1306
1307 class [c/8] |= (1 << (c&7));
1308 if ((options & PCRE_CASELESS) != 0)
1309 {
1310 c = cd->fcc[c]; /* flip case */
1311 class[c/8] |= (1 << (c&7));
1312 }
1313 class_charcount++;
1314 class_lastchar = c;
1315 }
1316
1317 /* Loop until ']' reached; the check for end of string happens inside the
1318 loop. This "while" is the end of the "do" above. */
1319
1320 while ((c = *(++ptr)) != ']');
1321
1322 /* If class_charcount is 1 and class_lastchar is not negative, we saw
1323 precisely one character. This doesn't need the whole 32-byte bit map.
1324 We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
1325 it's negative. */
1326
1327 if (class_charcount == 1 && class_lastchar >= 0)
1328 {
1329 if (negate_class)
1330 {
1331 code[-1] = OP_NOT;
1332 }
1333 else
1334 {
1335 code[-1] = OP_CHARS;
1336 *code++ = 1;
1337 }
1338 *code++ = class_lastchar;
1339 }
1340
1341 /* Otherwise, negate the 32-byte map if necessary, and copy it into
1342 the code vector. */
1343
1344 else
1345 {
1346 if (negate_class)
1347 for (c = 0; c < 32; c++) code[c] = ~class[c];
1348 else
1349 memcpy(code, class, 32);
1350 code += 32;
1351 }
1352 break;
1353
1354 /* Various kinds of repeat */
1355
1356 case '{':
1357 if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
1358 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
1359 if (*errorptr != NULL) goto FAILED;
1360 goto REPEAT;
1361
1362 case '*':
1363 repeat_min = 0;
1364 repeat_max = -1;
1365 goto REPEAT;
1366
1367 case '+':
1368 repeat_min = 1;
1369 repeat_max = -1;
1370 goto REPEAT;
1371
1372 case '?':
1373 repeat_min = 0;
1374 repeat_max = 1;
1375
1376 REPEAT:
1377 if (previous == NULL)
1378 {
1379 *errorptr = ERR9;
1380 goto FAILED;
1381 }
1382
1383 /* If the next character is '?' this is a minimizing repeat, by default,
1384 but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
1385 next character. */
1386
1387 if (ptr[1] == '?')
1388 { repeat_type = greedy_non_default; ptr++; }
1389 else repeat_type = greedy_default;
1390
1391 /* If previous was a string of characters, chop off the last one and use it
1392 as the subject of the repeat. If there was only one character, we can
1393 abolish the previous item altogether. A repeat with a zero minimum wipes
1394 out any reqchar setting, backing up to the previous value. We must also
1395 adjust the countlits value. */
1396
1397 if (*previous == OP_CHARS)
1398 {
1399 int len = previous[1];
1400
1401 if (repeat_min == 0) *reqchar = prevreqchar;
1402 *countlits += repeat_min - 1;
1403
1404 if (len == 1)
1405 {
1406 c = previous[2];
1407 code = previous;
1408 }
1409 else
1410 {
1411 c = previous[len+1];
1412 previous[1]--;
1413 code--;
1414 }
1415 op_type = 0; /* Use single-char op codes */
1416 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
1417 }
1418
1419 /* If previous was a single negated character ([^a] or similar), we use
1420 one of the special opcodes, replacing it. The code is shared with single-
1421 character repeats by adding a suitable offset into repeat_type. */
1422
1423 else if ((int)*previous == OP_NOT)
1424 {
1425 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
1426 c = previous[1];
1427 code = previous;
1428 goto OUTPUT_SINGLE_REPEAT;
1429 }
1430
1431 /* If previous was a character type match (\d or similar), abolish it and
1432 create a suitable repeat item. The code is shared with single-character
1433 repeats by adding a suitable offset into repeat_type. */
1434
1435 else if ((int)*previous < OP_EODN || *previous == OP_ANY)
1436 {
1437 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
1438 c = *previous;
1439 code = previous;
1440
1441 OUTPUT_SINGLE_REPEAT:
1442
1443 /* If the maximum is zero then the minimum must also be zero; Perl allows
1444 this case, so we do too - by simply omitting the item altogether. */
1445
1446 if (repeat_max == 0) goto END_REPEAT;
1447
1448 /* Combine the op_type with the repeat_type */
1449
1450 repeat_type += op_type;
1451
1452 /* A minimum of zero is handled either as the special case * or ?, or as
1453 an UPTO, with the maximum given. */
1454
1455 if (repeat_min == 0)
1456 {
1457 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1458 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1459 else
1460 {
1461 *code++ = OP_UPTO + repeat_type;
1462 *code++ = repeat_max >> 8;
1463 *code++ = (repeat_max & 255);
1464 }
1465 }
1466
1467 /* The case {1,} is handled as the special case + */
1468
1469 else if (repeat_min == 1 && repeat_max == -1)
1470 *code++ = OP_PLUS + repeat_type;
1471
1472 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1473 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1474
1475 else
1476 {
1477 if (repeat_min != 1)
1478 {
1479 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1480 *code++ = repeat_min >> 8;
1481 *code++ = (repeat_min & 255);
1482 }
1483
1484 /* If the mininum is 1 and the previous item was a character string,
1485 we either have to put back the item that got cancelled if the string
1486 length was 1, or add the character back onto the end of a longer
1487 string. For a character type nothing need be done; it will just get
1488 put back naturally. Note that the final character is always going to
1489 get added below. */
1490
1491 else if (*previous == OP_CHARS)
1492 {
1493 if (code == previous) code += 2; else previous[1]++;
1494 }
1495
1496 /* For a single negated character we also have to put back the
1497 item that got cancelled. */
1498
1499 else if (*previous == OP_NOT) code++;
1500
1501 /* If the maximum is unlimited, insert an OP_STAR. */
1502
1503 if (repeat_max < 0)
1504 {
1505 *code++ = c;
1506 *code++ = OP_STAR + repeat_type;
1507 }
1508
1509 /* Else insert an UPTO if the max is greater than the min. */
1510
1511 else if (repeat_max != repeat_min)
1512 {
1513 *code++ = c;
1514 repeat_max -= repeat_min;
1515 *code++ = OP_UPTO + repeat_type;
1516 *code++ = repeat_max >> 8;
1517 *code++ = (repeat_max & 255);
1518 }
1519 }
1520
1521 /* The character or character type itself comes last in all cases. */
1522
1523 *code++ = c;
1524 }
1525
1526 /* If previous was a character class or a back reference, we put the repeat
1527 stuff after it, but just skip the item if the repeat was {0,0}. */
1528
1529 else if (*previous == OP_CLASS || *previous == OP_REF)
1530 {
1531 if (repeat_max == 0)
1532 {
1533 code = previous;
1534 goto END_REPEAT;
1535 }
1536 if (repeat_min == 0 && repeat_max == -1)
1537 *code++ = OP_CRSTAR + repeat_type;
1538 else if (repeat_min == 1 && repeat_max == -1)
1539 *code++ = OP_CRPLUS + repeat_type;
1540 else if (repeat_min == 0 && repeat_max == 1)
1541 *code++ = OP_CRQUERY + repeat_type;
1542 else
1543 {
1544 *code++ = OP_CRRANGE + repeat_type;
1545 *code++ = repeat_min >> 8;
1546 *code++ = repeat_min & 255;
1547 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1548 *code++ = repeat_max >> 8;
1549 *code++ = repeat_max & 255;
1550 }
1551 }
1552
1553 /* If previous was a bracket group, we may have to replicate it in certain
1554 cases. */
1555
1556 else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
1557 (int)*previous == OP_COND)
1558 {
1559 register int i;
1560 int ketoffset = 0;
1561 int len = code - previous;
1562 uschar *bralink = NULL;
1563
1564 /* If the maximum repeat count is unlimited, find the end of the bracket
1565 by scanning through from the start, and compute the offset back to it
1566 from the current code pointer. There may be an OP_OPT setting following
1567 the final KET, so we can't find the end just by going back from the code
1568 pointer. */
1569
1570 if (repeat_max == -1)
1571 {
1572 register uschar *ket = previous;
1573 do ket += (ket[1] << 8) + ket[2]; while (*ket != OP_KET);
1574 ketoffset = code - ket;
1575 }
1576
1577 /* The case of a zero minimum is special because of the need to stick
1578 OP_BRAZERO in front of it, and because the group appears once in the
1579 data, whereas in other cases it appears the minimum number of times. For
1580 this reason, it is simplest to treat this case separately, as otherwise
1581 the code gets far too mess. There are several special subcases when the
1582 minimum is zero. */
1583
1584 if (repeat_min == 0)
1585 {
1586 /* If we set up a required char from the bracket, we must back off
1587 to the previous value and reset the countlits value too. */
1588
1589 if (subcountlits > 0)
1590 {
1591 *reqchar = prevreqchar;
1592 *countlits -= subcountlits;
1593 }
1594
1595 /* If the maximum is also zero, we just omit the group from the output
1596 altogether. */
1597
1598 if (repeat_max == 0)
1599 {
1600 code = previous;
1601 goto END_REPEAT;
1602 }
1603
1604 /* If the maximum is 1 or unlimited, we just have to stick in the
1605 BRAZERO and do no more at this point. */
1606
1607 if (repeat_max <= 1)
1608 {
1609 memmove(previous+1, previous, len);
1610 code++;
1611 *previous++ = OP_BRAZERO + repeat_type;
1612 }
1613
1614 /* If the maximum is greater than 1 and limited, we have to replicate
1615 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1616 The first one has to be handled carefully because it's the original
1617 copy, which has to be moved up. The remainder can be handled by code
1618 that is common with the non-zero minimum case below. We just have to
1619 adjust the value or repeat_max, since one less copy is required. */
1620
1621 else
1622 {
1623 int offset;
1624 memmove(previous+4, previous, len);
1625 code += 4;
1626 *previous++ = OP_BRAZERO + repeat_type;
1627 *previous++ = OP_BRA;
1628
1629 /* We chain together the bracket offset fields that have to be
1630 filled in later when the ends of the brackets are reached. */
1631
1632 offset = (bralink == NULL)? 0 : previous - bralink;
1633 bralink = previous;
1634 *previous++ = offset >> 8;
1635 *previous++ = offset & 255;
1636 }
1637
1638 repeat_max--;
1639 }
1640
1641 /* If the minimum is greater than zero, replicate the group as many
1642 times as necessary, and adjust the maximum to the number of subsequent
1643 copies that we need. */
1644
1645 else
1646 {
1647 for (i = 1; i < repeat_min; i++)
1648 {
1649 memcpy(code, previous, len);
1650 code += len;
1651 }
1652 if (repeat_max > 0) repeat_max -= repeat_min;
1653 }
1654
1655 /* This code is common to both the zero and non-zero minimum cases. If
1656 the maximum is limited, it replicates the group in a nested fashion,
1657 remembering the bracket starts on a stack. In the case of a zero minimum,
1658 the first one was set up above. In all cases the repeat_max now specifies
1659 the number of additional copies needed. */
1660
1661 if (repeat_max >= 0)
1662 {
1663 for (i = repeat_max - 1; i >= 0; i--)
1664 {
1665 *code++ = OP_BRAZERO + repeat_type;
1666
1667 /* All but the final copy start a new nesting, maintaining the
1668 chain of brackets outstanding. */
1669
1670 if (i != 0)
1671 {
1672 int offset;
1673 *code++ = OP_BRA;
1674 offset = (bralink == NULL)? 0 : code - bralink;
1675 bralink = code;
1676 *code++ = offset >> 8;
1677 *code++ = offset & 255;
1678 }
1679
1680 memcpy(code, previous, len);
1681 code += len;
1682 }
1683
1684 /* Now chain through the pending brackets, and fill in their length
1685 fields (which are holding the chain links pro tem). */
1686
1687 while (bralink != NULL)
1688 {
1689 int oldlinkoffset;
1690 int offset = code - bralink + 1;
1691 uschar *bra = code - offset;
1692 oldlinkoffset = (bra[1] << 8) + bra[2];
1693 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
1694 *code++ = OP_KET;
1695 *code++ = bra[1] = offset >> 8;
1696 *code++ = bra[2] = (offset & 255);
1697 }
1698 }
1699
1700 /* If the maximum is unlimited, set a repeater in the final copy. We
1701 can't just offset backwards from the current code point, because we
1702 don't know if there's been an options resetting after the ket. The
1703 correct offset was computed above. */
1704
1705 else code[-ketoffset] = OP_KETRMAX + repeat_type;
1706 }
1707
1708 /* Else there's some kind of shambles */
1709
1710 else
1711 {
1712 *errorptr = ERR11;
1713 goto FAILED;
1714 }
1715
1716 /* In all case we no longer have a previous item. */
1717
1718 END_REPEAT:
1719 previous = NULL;
1720 break;
1721
1722
1723 /* Start of nested bracket sub-expression, or comment or lookahead or
1724 lookbehind or option setting or condition. First deal with special things
1725 that can come after a bracket; all are introduced by ?, and the appearance
1726 of any of them means that this is not a referencing group. They were
1727 checked for validity in the first pass over the string, so we don't have to
1728 check for syntax errors here. */
1729
1730 case '(':
1731 newoptions = options;
1732 condref = -1;
1733
1734 if (*(++ptr) == '?')
1735 {
1736 int set, unset;
1737 int *optset;
1738
1739 switch (*(++ptr))
1740 {
1741 case '#': /* Comment; skip to ket */
1742 ptr++;
1743 while (*ptr != ')') ptr++;
1744 continue;
1745
1746 case ':': /* Non-extracting bracket */
1747 bravalue = OP_BRA;
1748 ptr++;
1749 break;
1750
1751 case '(':
1752 bravalue = OP_COND; /* Conditional group */
1753 if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
1754 {
1755 condref = *ptr - '0';
1756 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
1757 ptr++;
1758 }
1759 else ptr--;
1760 break;
1761
1762 case '=': /* Positive lookahead */
1763 bravalue = OP_ASSERT;
1764 ptr++;
1765 break;
1766
1767 case '!': /* Negative lookahead */
1768 bravalue = OP_ASSERT_NOT;
1769 ptr++;
1770 break;
1771
1772 case '<': /* Lookbehinds */
1773 switch (*(++ptr))
1774 {
1775 case '=': /* Positive lookbehind */
1776 bravalue = OP_ASSERTBACK;
1777 ptr++;
1778 break;
1779
1780 case '!': /* Negative lookbehind */
1781 bravalue = OP_ASSERTBACK_NOT;
1782 ptr++;
1783 break;
1784
1785 default: /* Syntax error */
1786 *errorptr = ERR24;
1787 goto FAILED;
1788 }
1789 break;
1790
1791 case '>': /* One-time brackets */
1792 bravalue = OP_ONCE;
1793 ptr++;
1794 break;
1795
1796 case 'R': /* Pattern recursion */
1797 *code++ = OP_RECURSE;
1798 ptr++;
1799 continue;
1800
1801 default: /* Option setting */
1802 set = unset = 0;
1803 optset = &set;
1804
1805 while (*ptr != ')' && *ptr != ':')
1806 {
1807 switch (*ptr++)
1808 {
1809 case '-': optset = &unset; break;
1810
1811 case 'i': *optset |= PCRE_CASELESS; break;
1812 case 'm': *optset |= PCRE_MULTILINE; break;
1813 case 's': *optset |= PCRE_DOTALL; break;
1814 case 'x': *optset |= PCRE_EXTENDED; break;
1815 case 'U': *optset |= PCRE_UNGREEDY; break;
1816 case 'X': *optset |= PCRE_EXTRA; break;
1817
1818 default:
1819 *errorptr = ERR12;
1820 goto FAILED;
1821 }
1822 }
1823
1824 /* Set up the changed option bits, but don't change anything yet. */
1825
1826 newoptions = (options | set) & (~unset);
1827
1828 /* If the options ended with ')' this is not the start of a nested
1829 group with option changes, so the options change at this level. At top
1830 level there is nothing else to be done (the options will in fact have
1831 been set from the start of compiling as a result of the first pass) but
1832 at an inner level we must compile code to change the ims options if
1833 necessary, and pass the new setting back so that it can be put at the
1834 start of any following branches, and when this group ends, a resetting
1835 item can be compiled. */
1836
1837 if (*ptr == ')')
1838 {
1839 if ((options & PCRE_INGROUP) != 0 &&
1840 (options & PCRE_IMS) != (newoptions & PCRE_IMS))
1841 {
1842 *code++ = OP_OPT;
1843 *code++ = *optchanged = newoptions & PCRE_IMS;
1844 }
1845 options = newoptions; /* Change options at this level */
1846 previous = NULL; /* This item can't be repeated */
1847 continue; /* It is complete */
1848 }
1849
1850 /* If the options ended with ':' we are heading into a nested group
1851 with possible change of options. Such groups are non-capturing and are
1852 not assertions of any kind. All we need to do is skip over the ':';
1853 the newoptions value is handled below. */
1854
1855 bravalue = OP_BRA;
1856 ptr++;
1857 }
1858 }
1859
1860 /* Else we have a referencing group; adjust the opcode. */
1861
1862 else
1863 {
1864 if (++(*brackets) > EXTRACT_MAX)
1865 {
1866 *errorptr = ERR13;
1867 goto FAILED;
1868 }
1869 bravalue = OP_BRA + *brackets;
1870 }
1871
1872 /* Process nested bracketed re. Assertions may not be repeated, but other
1873 kinds can be. We copy code into a non-register variable in order to be able
1874 to pass its address because some compilers complain otherwise. Pass in a
1875 new setting for the ims options if they have changed. */
1876
1877 previous = (bravalue >= OP_ONCE)? code : NULL;
1878 *code = bravalue;
1879 tempcode = code;
1880
1881 if (!compile_regex(
1882 options | PCRE_INGROUP, /* Set for all nested groups */
1883 ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
1884 newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
1885 brackets, /* Bracket level */
1886 &tempcode, /* Where to put code (updated) */
1887 &ptr, /* Input pointer (updated) */
1888 errorptr, /* Where to put an error message */
1889 (bravalue == OP_ASSERTBACK ||
1890 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1891 condref, /* Condition reference number */
1892 &subreqchar, /* For possible last char */
1893 &subcountlits, /* For literal count */
1894 cd)) /* Tables block */
1895 goto FAILED;
1896
1897 /* At the end of compiling, code is still pointing to the start of the
1898 group, while tempcode has been updated to point past the end of the group
1899 and any option resetting that may follow it. The pattern pointer (ptr)
1900 is on the bracket. */
1901
1902 /* If this is a conditional bracket, check that there are no more than
1903 two branches in the group. */
1904
1905 if (bravalue == OP_COND)
1906 {
1907 uschar *tc = code;
1908 condcount = 0;
1909
1910 do {
1911 condcount++;
1912 tc += (tc[1] << 8) | tc[2];
1913 }
1914 while (*tc != OP_KET);
1915
1916 if (condcount > 2)
1917 {
1918 *errorptr = ERR27;
1919 goto FAILED;
1920 }
1921 }
1922
1923 /* Handle updating of the required character. If the subpattern didn't
1924 set one, leave it as it was. Otherwise, update it for normal brackets of
1925 all kinds, forward assertions, and conditions with two branches. Don't
1926 update the literal count for forward assertions, however. If the bracket
1927 is followed by a quantifier with zero repeat, we have to back off. Hence
1928 the definition of prevreqchar and subcountlits outside the main loop so
1929 that they can be accessed for the back off. */
1930
1931 if (subreqchar > 0 &&
1932 (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT ||
1933 (bravalue == OP_COND && condcount == 2)))
1934 {
1935 prevreqchar = *reqchar;
1936 *reqchar = subreqchar;
1937 if (bravalue != OP_ASSERT) *countlits += subcountlits;
1938 }
1939
1940 /* Now update the main code pointer to the end of the group. */
1941
1942 code = tempcode;
1943
1944 /* Error if hit end of pattern */
1945
1946 if (*ptr != ')')
1947 {
1948 *errorptr = ERR14;
1949 goto FAILED;
1950 }
1951 break;
1952
1953 /* Check \ for being a real metacharacter; if not, fall through and handle
1954 it as a data character at the start of a string. Escape items are checked
1955 for validity in the pre-compiling pass. */
1956
1957 case '\\':
1958 tempptr = ptr;
1959 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1960
1961 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1962 are arranged to be the negation of the corresponding OP_values. For the
1963 back references, the values are ESC_REF plus the reference number. Only
1964 back references and those types that consume a character may be repeated.
1965 We can test for values between ESC_b and ESC_Z for the latter; this may
1966 have to change if any new ones are ever created. */
1967
1968 if (c < 0)
1969 {
1970 if (-c >= ESC_REF)
1971 {
1972 previous = code;
1973 *code++ = OP_REF;
1974 *code++ = -c - ESC_REF;
1975 }
1976 else
1977 {
1978 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
1979 *code++ = -c;
1980 }
1981 continue;
1982 }
1983
1984 /* Data character: reset and fall through */
1985
1986 ptr = tempptr;
1987 c = '\\';
1988
1989 /* Handle a run of data characters until a metacharacter is encountered.
1990 The first character is guaranteed not to be whitespace or # when the
1991 extended flag is set. */
1992
1993 NORMAL_CHAR:
1994 default:
1995 previous = code;
1996 *code = OP_CHARS;
1997 code += 2;
1998 length = 0;
1999
2000 do
2001 {
2002 if ((options & PCRE_EXTENDED) != 0)
2003 {
2004 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2005 if (c == '#')
2006 {
2007 /* The space before the ; is to avoid a warning on a silly compiler
2008 on the Macintosh. */
2009 while ((c = *(++ptr)) != 0 && c != '\n') ;
2010 if (c == 0) break;
2011 continue;
2012 }
2013 }
2014
2015 /* Backslash may introduce a data char or a metacharacter. Escaped items
2016 are checked for validity in the pre-compiling pass. Stop the string
2017 before a metaitem. */
2018
2019 if (c == '\\')
2020 {
2021 tempptr = ptr;
2022 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
2023 if (c < 0) { ptr = tempptr; break; }
2024
2025 /* If a character is > 127 in UTF-8 mode, we have to turn it into
2026 two or more characters in the UTF-8 encoding. */
2027
2028 #ifdef SUPPORT_UTF8
2029 if (c > 127 && (options & PCRE_UTF8) != 0)
2030 {
2031 uschar buffer[8];
2032 int len = ord2utf8(c, buffer);
2033 for (c = 0; c < len; c++) *code++ = buffer[c];
2034 length += len;
2035 continue;
2036 }
2037 #endif
2038 }
2039
2040 /* Ordinary character or single-char escape */
2041
2042 *code++ = c;
2043 length++;
2044 }
2045
2046 /* This "while" is the end of the "do" above. */
2047
2048 while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
2049
2050 /* Update the last character and the count of literals */
2051
2052 prevreqchar = (length > 1)? code[-2] : *reqchar;
2053 *reqchar = code[-1];
2054 *countlits += length;
2055
2056 /* Compute the length and set it in the data vector, and advance to
2057 the next state. */
2058
2059 previous[1] = length;
2060 if (length < MAXLIT) ptr--;
2061 break;
2062 }
2063 } /* end of big loop */
2064
2065 /* Control never reaches here by falling through, only by a goto for all the
2066 error states. Pass back the position in the pattern so that it can be displayed
2067 to the user for diagnosing the error. */
2068
2069 FAILED:
2070 *ptrptr = ptr;
2071 return FALSE;
2072 }
2073
2074
2075
2076
2077 /*************************************************
2078 * Compile sequence of alternatives *
2079 *************************************************/
2080
2081 /* On entry, ptr is pointing past the bracket character, but on return
2082 it points to the closing bracket, or vertical bar, or end of string.
2083 The code variable is pointing at the byte into which the BRA operator has been
2084 stored. If the ims options are changed at the start (for a (?ims: group) or
2085 during any branch, we need to insert an OP_OPT item at the start of every
2086 following branch to ensure they get set correctly at run time, and also pass
2087 the new options into every subsequent branch compile.
2088
2089 Argument:
2090 options the option bits
2091 optchanged new ims options to set as if (?ims) were at the start, or -1
2092 for no change
2093 brackets -> int containing the number of extracting brackets used
2094 codeptr -> the address of the current code pointer
2095 ptrptr -> the address of the current pattern pointer
2096 errorptr -> pointer to error message
2097 lookbehind TRUE if this is a lookbehind assertion
2098 condref > 0 for OPT_CREF setting at start of conditional group
2099 reqchar -> place to put the last required character, or a negative number
2100 countlits -> place to put the shortest literal count of any branch
2101 cd points to the data block with tables pointers
2102
2103 Returns: TRUE on success
2104 */
2105
2106 static BOOL
2107 compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
2108 const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,
2109 int *reqchar, int *countlits, compile_data *cd)
2110 {
2111 const uschar *ptr = *ptrptr;
2112 uschar *code = *codeptr;
2113 uschar *last_branch = code;
2114 uschar *start_bracket = code;
2115 uschar *reverse_count = NULL;
2116 int oldoptions = options & PCRE_IMS;
2117 int branchreqchar, branchcountlits;
2118
2119 *reqchar = -1;
2120 *countlits = INT_MAX;
2121 code += 3;
2122
2123 /* At the start of a reference-based conditional group, insert the reference
2124 number as an OP_CREF item. */
2125
2126 if (condref > 0)
2127 {
2128 *code++ = OP_CREF;
2129 *code++ = condref;
2130 }
2131
2132 /* Loop for each alternative branch */
2133
2134 for (;;)
2135 {
2136 int length;
2137
2138 /* Handle change of options */
2139
2140 if (optchanged >= 0)
2141 {
2142 *code++ = OP_OPT;
2143 *code++ = optchanged;
2144 options = (options & ~PCRE_IMS) | optchanged;
2145 }
2146
2147 /* Set up dummy OP_REVERSE if lookbehind assertion */
2148
2149 if (lookbehind)
2150 {
2151 *code++ = OP_REVERSE;
2152 reverse_count = code;
2153 *code++ = 0;
2154 *code++ = 0;
2155 }
2156
2157 /* Now compile the branch */
2158
2159 if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged,
2160 &branchreqchar, &branchcountlits, cd))
2161 {
2162 *ptrptr = ptr;
2163 return FALSE;
2164 }
2165
2166 /* Fill in the length of the last branch */
2167
2168 length = code - last_branch;
2169 last_branch[1] = length >> 8;
2170 last_branch[2] = length & 255;
2171
2172 /* Save the last required character if all branches have the same; a current
2173 value of -1 means unset, while -2 means "previous branch had no last required
2174 char". */
2175
2176 if (*reqchar != -2)
2177 {
2178 if (branchreqchar >= 0)
2179 {
2180 if (*reqchar == -1) *reqchar = branchreqchar;
2181 else if (*reqchar != branchreqchar) *reqchar = -2;
2182 }
2183 else *reqchar = -2;
2184 }
2185
2186 /* Keep the shortest literal count */
2187
2188 if (branchcountlits < *countlits) *countlits = branchcountlits;
2189 DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits));
2190
2191 /* If lookbehind, check that this branch matches a fixed-length string,
2192 and put the length into the OP_REVERSE item. Temporarily mark the end of
2193 the branch with OP_END. */
2194
2195 if (lookbehind)
2196 {
2197 *code = OP_END;
2198 length = find_fixedlength(last_branch, options);
2199 DPRINTF(("fixed length = %d\n", length));
2200 if (length < 0)
2201 {
2202 *errorptr = ERR25;
2203 *ptrptr = ptr;
2204 return FALSE;
2205 }
2206 reverse_count[0] = (length >> 8);
2207 reverse_count[1] = length & 255;
2208 }
2209
2210 /* Reached end of expression, either ')' or end of pattern. Insert a
2211 terminating ket and the length of the whole bracketed item, and return,
2212 leaving the pointer at the terminating char. If any of the ims options
2213 were changed inside the group, compile a resetting op-code following. */
2214
2215 if (*ptr != '|')
2216 {
2217 length = code - start_bracket;
2218 *code++ = OP_KET;
2219 *code++ = length >> 8;
2220 *code++ = length & 255;
2221 if (optchanged >= 0)
2222 {
2223 *code++ = OP_OPT;
2224 *code++ = oldoptions;
2225 }
2226 *codeptr = code;
2227 *ptrptr = ptr;
2228 return TRUE;
2229 }
2230
2231 /* Another branch follows; insert an "or" node and advance the pointer. */
2232
2233 *code = OP_ALT;
2234 last_branch = code;
2235 code += 3;
2236 ptr++;
2237 }
2238 /* Control never reaches here */
2239 }
2240
2241
2242
2243
2244 /*************************************************
2245 * Find first significant op code *
2246 *************************************************/
2247
2248 /* This is called by several functions that scan a compiled expression looking
2249 for a fixed first character, or an anchoring op code etc. It skips over things
2250 that do not influence this. For one application, a change of caseless option is
2251 important.
2252
2253 Arguments:
2254 code pointer to the start of the group
2255 options pointer to external options
2256 optbit the option bit whose changing is significant, or
2257 zero if none are
2258 optstop TRUE to return on option change, otherwise change the options
2259 value and continue
2260
2261 Returns: pointer to the first significant opcode
2262 */
2263
2264 static const uschar*
2265 first_significant_code(const uschar *code, int *options, int optbit,
2266 BOOL optstop)
2267 {
2268 for (;;)
2269 {
2270 switch ((int)*code)
2271 {
2272 case OP_OPT:
2273 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
2274 {
2275 if (optstop) return code;
2276 *options = (int)code[1];
2277 }
2278 code += 2;
2279 break;
2280
2281 case OP_CREF:
2282 code += 2;
2283 break;
2284
2285 case OP_WORD_BOUNDARY:
2286 case OP_NOT_WORD_BOUNDARY:
2287 code++;
2288 break;
2289
2290 case OP_ASSERT_NOT:
2291 case OP_ASSERTBACK:
2292 case OP_ASSERTBACK_NOT:
2293 do code += (code[1] << 8) + code[2]; while (*code == OP_ALT);
2294 code += 3;
2295 break;
2296
2297 default:
2298 return code;
2299 }
2300 }
2301 /* Control never reaches here */
2302 }
2303
2304
2305
2306
2307 /*************************************************
2308 * Check for anchored expression *
2309 *************************************************/
2310
2311 /* Try to find out if this is an anchored regular expression. Consider each
2312 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2313 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2314 it's anchored. However, if this is a multiline pattern, then only OP_SOD
2315 counts, since OP_CIRC can match in the middle.
2316
2317 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
2318 because that will try the rest of the pattern at all possible matching points,
2319 so there is no point trying them again.
2320
2321 Arguments:
2322 code points to start of expression (the bracket)
2323 options points to the options setting
2324
2325 Returns: TRUE or FALSE
2326 */
2327
2328 static BOOL
2329 is_anchored(register const uschar *code, int *options)
2330 {
2331 do {
2332 const uschar *scode = first_significant_code(code + 3, options,
2333 PCRE_MULTILINE, FALSE);
2334 register int op = *scode;
2335 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2336 { if (!is_anchored(scode, options)) return FALSE; }
2337 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
2338 (*options & PCRE_DOTALL) != 0)
2339 { if (scode[1] != OP_ANY) return FALSE; }
2340 else if (op != OP_SOD &&
2341 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
2342 return FALSE;
2343 code += (code[1] << 8) + code[2];
2344 }
2345 while (*code == OP_ALT);
2346 return TRUE;
2347 }
2348
2349
2350
2351 /*************************************************
2352 * Check for starting with ^ or .* *
2353 *************************************************/
2354
2355 /* This is called to find out if every branch starts with ^ or .* so that
2356 "first char" processing can be done to speed things up in multiline
2357 matching and for non-DOTALL patterns that start with .* (which must start at
2358 the beginning or after \n).
2359
2360 Argument: points to start of expression (the bracket)
2361 Returns: TRUE or FALSE
2362 */
2363
2364 static BOOL
2365 is_startline(const uschar *code)
2366 {
2367 do {
2368 const uschar *scode = first_significant_code(code + 3, NULL, 0, FALSE);
2369 register int op = *scode;
2370 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2371 { if (!is_startline(scode)) return FALSE; }
2372 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2373 { if (scode[1] != OP_ANY) return FALSE; }
2374 else if (op != OP_CIRC) return FALSE;
2375 code += (code[1] << 8) + code[2];
2376 }
2377 while (*code == OP_ALT);
2378 return TRUE;
2379 }
2380
2381
2382
2383 /*************************************************
2384 * Check for fixed first char *
2385 *************************************************/
2386
2387 /* Try to find out if there is a fixed first character. This is called for
2388 unanchored expressions, as it speeds up their processing quite considerably.
2389 Consider each alternative branch. If they all start with the same char, or with
2390 a bracket all of whose alternatives start with the same char (recurse ad lib),
2391 then we return that char, otherwise -1.
2392
2393 Arguments:
2394 code points to start of expression (the bracket)
2395 options pointer to the options (used to check casing changes)
2396
2397 Returns: -1 or the fixed first char
2398 */
2399
2400 static int
2401 find_firstchar(const uschar *code, int *options)
2402 {
2403 register int c = -1;
2404 do {
2405 int d;
2406 const uschar *scode = first_significant_code(code + 3, options,
2407 PCRE_CASELESS, TRUE);
2408 register int op = *scode;
2409
2410 if (op >= OP_BRA) op = OP_BRA;
2411
2412 switch(op)
2413 {
2414 default:
2415 return -1;
2416
2417 case OP_BRA:
2418 case OP_ASSERT:
2419 case OP_ONCE:
2420 case OP_COND:
2421 if ((d = find_firstchar(scode, options)) < 0) return -1;
2422 if (c < 0) c = d; else if (c != d) return -1;
2423 break;
2424
2425 case OP_EXACT: /* Fall through */
2426 scode++;
2427
2428 case OP_CHARS: /* Fall through */
2429 scode++;
2430
2431 case OP_PLUS:
2432 case OP_MINPLUS:
2433 if (c < 0) c = scode[1]; else if (c != scode[1]) return -1;
2434 break;
2435 }
2436
2437 code += (code[1] << 8) + code[2];
2438 }
2439 while (*code == OP_ALT);
2440 return c;
2441 }
2442
2443
2444
2445
2446
2447 /*************************************************
2448 * Compile a Regular Expression *
2449 *************************************************/
2450
2451 /* This function takes a string and returns a pointer to a block of store
2452 holding a compiled version of the expression.
2453
2454 Arguments:
2455 pattern the regular expression
2456 options various option bits
2457 errorptr pointer to pointer to error text
2458 erroroffset ptr offset in pattern where error was detected
2459 tables pointer to character tables or NULL
2460
2461 Returns: pointer to compiled data block, or NULL on error,
2462 with errorptr and erroroffset set
2463 */
2464
2465 pcre *
2466 pcre_compile(const char *pattern, int options, const char **errorptr,
2467 int *erroroffset, const unsigned char *tables)
2468 {
2469 real_pcre *re;
2470 int length = 3; /* For initial BRA plus length */
2471 int runlength;
2472 int c, reqchar, countlits;
2473 int bracount = 0;
2474 int top_backref = 0;
2475 int branch_extra = 0;
2476 int branch_newextra;
2477 unsigned int brastackptr = 0;
2478 size_t size;
2479 uschar *code;
2480 const uschar *ptr;
2481 compile_data compile_block;
2482 int brastack[BRASTACK_SIZE];
2483 uschar bralenstack[BRASTACK_SIZE];
2484
2485 #ifdef DEBUG
2486 uschar *code_base, *code_end;
2487 #endif
2488
2489 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
2490
2491 #ifndef SUPPORT_UTF8
2492 if ((options & PCRE_UTF8) != 0)
2493 {
2494 *errorptr = ERR32;
2495 return NULL;
2496 }
2497 #endif
2498
2499 /* We can't pass back an error message if errorptr is NULL; I guess the best we
2500 can do is just return NULL. */
2501
2502 if (errorptr == NULL) return NULL;
2503 *errorptr = NULL;
2504
2505 /* However, we can give a message for this error */
2506
2507 if (erroroffset == NULL)
2508 {
2509 *errorptr = ERR16;
2510 return NULL;
2511 }
2512 *erroroffset = 0;
2513
2514 if ((options & ~PUBLIC_OPTIONS) != 0)
2515 {
2516 *errorptr = ERR17;
2517 return NULL;
2518 }
2519
2520 /* Set up pointers to the individual character tables */
2521
2522 if (tables == NULL) tables = pcre_default_tables;
2523 compile_block.lcc = tables + lcc_offset;
2524 compile_block.fcc = tables + fcc_offset;
2525 compile_block.cbits = tables + cbits_offset;
2526 compile_block.ctypes = tables + ctypes_offset;
2527
2528 /* Reflect pattern for debugging output */
2529
2530 DPRINTF(("------------------------------------------------------------------\n"));
2531 DPRINTF(("%s\n", pattern));
2532
2533 /* The first thing to do is to make a pass over the pattern to compute the
2534 amount of store required to hold the compiled code. This does not have to be
2535 perfect as long as errors are overestimates. At the same time we can detect any
2536 internal flag settings. Make an attempt to correct for any counted white space
2537 if an "extended" flag setting appears late in the pattern. We can't be so
2538 clever for #-comments. */
2539
2540 ptr = (const uschar *)(pattern - 1);
2541 while ((c = *(++ptr)) != 0)
2542 {
2543 int min, max;
2544 int class_charcount;
2545
2546 if ((options & PCRE_EXTENDED) != 0)
2547 {
2548 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2549 if (c == '#')
2550 {
2551 /* The space before the ; is to avoid a warning on a silly compiler
2552 on the Macintosh. */
2553 while ((c = *(++ptr)) != 0 && c != '\n') ;
2554 continue;
2555 }
2556 }
2557
2558 switch(c)
2559 {
2560 /* A backslashed item may be an escaped "normal" character or a
2561 character type. For a "normal" character, put the pointers and
2562 character back so that tests for whitespace etc. in the input
2563 are done correctly. */
2564
2565 case '\\':
2566 {
2567 const uschar *save_ptr = ptr;
2568 c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
2569 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2570 if (c >= 0)
2571 {
2572 ptr = save_ptr;
2573 c = '\\';
2574 goto NORMAL_CHAR;
2575 }
2576 }
2577 length++;
2578
2579 /* A back reference needs an additional char, plus either one or 5
2580 bytes for a repeat. We also need to keep the value of the highest
2581 back reference. */
2582
2583 if (c <= -ESC_REF)
2584 {
2585 int refnum = -c - ESC_REF;
2586 if (refnum > top_backref) top_backref = refnum;
2587 length++; /* For single back reference */
2588 if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2589 {
2590 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2591 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2592 if ((min == 0 && (max == 1 || max == -1)) ||
2593 (min == 1 && max == -1))
2594 length++;
2595 else length += 5;
2596 if (ptr[1] == '?') ptr++;
2597 }
2598 }
2599 continue;
2600
2601 case '^':
2602 case '.':
2603 case '$':
2604 case '*': /* These repeats won't be after brackets; */
2605 case '+': /* those are handled separately */
2606 case '?':
2607 length++;
2608 continue;
2609
2610 /* This covers the cases of repeats after a single char, metachar, class,
2611 or back reference. */
2612
2613 case '{':
2614 if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
2615 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
2616 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2617 if ((min == 0 && (max == 1 || max == -1)) ||
2618 (min == 1 && max == -1))
2619 length++;
2620 else
2621 {
2622 length--; /* Uncount the original char or metachar */
2623 if (min == 1) length++; else if (min > 0) length += 4;
2624 if (max > 0) length += 4; else length += 2;
2625 }
2626 if (ptr[1] == '?') ptr++;
2627 continue;
2628
2629 /* An alternation contains an offset to the next branch or ket. If any ims
2630 options changed in the previous branch(es), and/or if we are in a
2631 lookbehind assertion, extra space will be needed at the start of the
2632 branch. This is handled by branch_extra. */
2633
2634 case '|':
2635 length += 3 + branch_extra;
2636 continue;
2637
2638 /* A character class uses 33 characters. Don't worry about character types
2639 that aren't allowed in classes - they'll get picked up during the compile.
2640 A character class that contains only one character uses 2 or 3 bytes,
2641 depending on whether it is negated or not. Notice this where we can. */
2642
2643 case '[':
2644 class_charcount = 0;
2645 if (*(++ptr) == '^') ptr++;
2646 do
2647 {
2648 if (*ptr == '\\')
2649 {
2650 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
2651 &compile_block);
2652 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2653 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2654 }
2655 else class_charcount++;
2656 ptr++;
2657 }
2658 while (*ptr != 0 && *ptr != ']');
2659
2660 /* Repeats for negated single chars are handled by the general code */
2661
2662 if (class_charcount == 1) length += 3; else
2663 {
2664 length += 33;
2665
2666 /* A repeat needs either 1 or 5 bytes. */
2667
2668 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2669 {
2670 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2671 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2672 if ((min == 0 && (max == 1 || max == -1)) ||
2673 (min == 1 && max == -1))
2674 length++;
2675 else length += 5;
2676 if (ptr[1] == '?') ptr++;
2677 }
2678 }
2679 continue;
2680
2681 /* Brackets may be genuine groups or special things */
2682
2683 case '(':
2684 branch_newextra = 0;
2685
2686 /* Handle special forms of bracket, which all start (? */
2687
2688 if (ptr[1] == '?')
2689 {
2690 int set, unset;
2691 int *optset;
2692
2693 switch (c = ptr[2])
2694 {
2695 /* Skip over comments entirely */
2696 case '#':
2697 ptr += 3;
2698 while (*ptr != 0 && *ptr != ')') ptr++;
2699 if (*ptr == 0)
2700 {
2701 *errorptr = ERR18;
2702 goto PCRE_ERROR_RETURN;
2703 }
2704 continue;
2705
2706 /* Non-referencing groups and lookaheads just move the pointer on, and
2707 then behave like a non-special bracket, except that they don't increment
2708 the count of extracting brackets. Ditto for the "once only" bracket,
2709 which is in Perl from version 5.005. */
2710
2711 case ':':
2712 case '=':
2713 case '!':
2714 case '>':
2715 ptr += 2;
2716 break;
2717
2718 /* A recursive call to the regex is an extension, to provide the
2719 facility which can be obtained by $(?p{perl-code}) in Perl 5.6. */
2720
2721 case 'R':
2722 if (ptr[3] != ')')
2723 {
2724 *errorptr = ERR29;
2725 goto PCRE_ERROR_RETURN;
2726 }
2727 ptr += 3;
2728 length += 1;
2729 break;
2730
2731 /* Lookbehinds are in Perl from version 5.005 */
2732
2733 case '<':
2734 if (ptr[3] == '=' || ptr[3] == '!')
2735 {
2736 ptr += 3;
2737 branch_newextra = 3;
2738 length += 3; /* For the first branch */
2739 break;
2740 }
2741 *errorptr = ERR24;
2742 goto PCRE_ERROR_RETURN;
2743
2744 /* Conditionals are in Perl from version 5.005. The bracket must either
2745 be followed by a number (for bracket reference) or by an assertion
2746 group. */
2747
2748 case '(':
2749 if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
2750 {
2751 ptr += 4;
2752 length += 2;
2753 while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
2754 if (*ptr != ')')
2755 {
2756 *errorptr = ERR26;
2757 goto PCRE_ERROR_RETURN;
2758 }
2759 }
2760 else /* An assertion must follow */
2761 {
2762 ptr++; /* Can treat like ':' as far as spacing is concerned */
2763 if (ptr[2] != '?' ||
2764 (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
2765 {
2766 ptr += 2; /* To get right offset in message */
2767 *errorptr = ERR28;
2768 goto PCRE_ERROR_RETURN;
2769 }
2770 }
2771 break;
2772
2773 /* Else loop checking valid options until ) is met. Anything else is an
2774 error. If we are without any brackets, i.e. at top level, the settings
2775 act as if specified in the options, so massage the options immediately.
2776 This is for backward compatibility with Perl 5.004. */
2777
2778 default:
2779 set = unset = 0;
2780 optset = &set;
2781 ptr += 2;
2782
2783 for (;; ptr++)
2784 {
2785 c = *ptr;
2786 switch (c)
2787 {
2788 case 'i':
2789 *optset |= PCRE_CASELESS;
2790 continue;
2791
2792 case 'm':
2793 *optset |= PCRE_MULTILINE;
2794 continue;
2795
2796 case 's':
2797 *optset |= PCRE_DOTALL;
2798 continue;
2799
2800 case 'x':
2801 *optset |= PCRE_EXTENDED;
2802 continue;
2803
2804 case 'X':
2805 *optset |= PCRE_EXTRA;
2806 continue;
2807
2808 case 'U':
2809 *optset |= PCRE_UNGREEDY;
2810 continue;
2811
2812 case '-':
2813 optset = &unset;
2814 continue;
2815
2816 /* A termination by ')' indicates an options-setting-only item;
2817 this is global at top level; otherwise nothing is done here and
2818 it is handled during the compiling process on a per-bracket-group
2819 basis. */
2820
2821 case ')':
2822 if (brastackptr == 0)
2823 {
2824 options = (options | set) & (~unset);
2825 set = unset = 0; /* To save length */
2826 }
2827 /* Fall through */
2828
2829 /* A termination by ':' indicates the start of a nested group with
2830 the given options set. This is again handled at compile time, but
2831 we must allow for compiled space if any of the ims options are
2832 set. We also have to allow for resetting space at the end of
2833 the group, which is why 4 is added to the length and not just 2.
2834 If there are several changes of options within the same group, this
2835 will lead to an over-estimate on the length, but this shouldn't
2836 matter very much. We also have to allow for resetting options at
2837 the start of any alternations, which we do by setting
2838 branch_newextra to 2. Finally, we record whether the case-dependent
2839 flag ever changes within the regex. This is used by the "required
2840 character" code. */
2841
2842 case ':':
2843 if (((set|unset) & PCRE_IMS) != 0)
2844 {
2845 length += 4;
2846 branch_newextra = 2;
2847 if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
2848 }
2849 goto END_OPTIONS;
2850
2851 /* Unrecognized option character */
2852
2853 default:
2854 *errorptr = ERR12;
2855 goto PCRE_ERROR_RETURN;
2856 }
2857 }
2858
2859 /* If we hit a closing bracket, that's it - this is a freestanding
2860 option-setting. We need to ensure that branch_extra is updated if
2861 necessary. The only values branch_newextra can have here are 0 or 2.
2862 If the value is 2, then branch_extra must either be 2 or 5, depending
2863 on whether this is a lookbehind group or not. */
2864
2865 END_OPTIONS:
2866 if (c == ')')
2867 {
2868 if (branch_newextra == 2 && (branch_extra == 0 || branch_extra == 3))
2869 branch_extra += branch_newextra;
2870 continue;
2871 }
2872
2873 /* If options were terminated by ':' control comes here. Fall through
2874 to handle the group below. */
2875 }
2876 }
2877
2878 /* Extracting brackets must be counted so we can process escapes in a
2879 Perlish way. */
2880
2881 else bracount++;
2882
2883 /* Non-special forms of bracket. Save length for computing whole length
2884 at end if there's a repeat that requires duplication of the group. Also
2885 save the current value of branch_extra, and start the new group with
2886 the new value. If non-zero, this will either be 2 for a (?imsx: group, or 3
2887 for a lookbehind assertion. */
2888
2889 if (brastackptr >= sizeof(brastack)/sizeof(int))
2890 {
2891 *errorptr = ERR19;
2892 goto PCRE_ERROR_RETURN;
2893 }
2894
2895 bralenstack[brastackptr] = branch_extra;
2896 branch_extra = branch_newextra;
2897
2898 brastack[brastackptr++] = length;
2899 length += 3;
2900 continue;
2901
2902 /* Handle ket. Look for subsequent max/min; for certain sets of values we
2903 have to replicate this bracket up to that many times. If brastackptr is
2904 0 this is an unmatched bracket which will generate an error, but take care
2905 not to try to access brastack[-1] when computing the length and restoring
2906 the branch_extra value. */
2907
2908 case ')':
2909 length += 3;
2910 {
2911 int minval = 1;
2912 int maxval = 1;
2913 int duplength;
2914
2915 if (brastackptr > 0)
2916 {
2917 duplength = length - brastack[--brastackptr];
2918 branch_extra = bralenstack[brastackptr];
2919 }
2920 else duplength = 0;
2921
2922 /* Leave ptr at the final char; for read_repeat_counts this happens
2923 automatically; for the others we need an increment. */
2924
2925 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
2926 {
2927 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr,
2928 &compile_block);
2929 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2930 }
2931 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2932 else if (c == '+') { maxval = -1; ptr++; }
2933 else if (c == '?') { minval = 0; ptr++; }
2934
2935 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2936 group, and if the maximum is greater than zero, we have to replicate
2937 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2938 bracket set - hence the 7. */
2939
2940 if (minval == 0)
2941 {
2942 length++;
2943 if (maxval > 0) length += (maxval - 1) * (duplength + 7);
2944 }
2945
2946 /* When the minimum is greater than zero, 1 we have to replicate up to
2947 minval-1 times, with no additions required in the copies. Then, if
2948 there is a limited maximum we have to replicate up to maxval-1 times
2949 allowing for a BRAZERO item before each optional copy and nesting
2950 brackets for all but one of the optional copies. */
2951
2952 else
2953 {
2954 length += (minval - 1) * duplength;
2955 if (maxval > minval) /* Need this test as maxval=-1 means no limit */
2956 length += (maxval - minval) * (duplength + 7) - 6;
2957 }
2958 }
2959 continue;
2960
2961 /* Non-special character. For a run of such characters the length required
2962 is the number of characters + 2, except that the maximum run length is 255.
2963 We won't get a skipped space or a non-data escape or the start of a #
2964 comment as the first character, so the length can't be zero. */
2965
2966 NORMAL_CHAR:
2967 default:
2968 length += 2;
2969 runlength = 0;
2970 do
2971 {
2972 if ((options & PCRE_EXTENDED) != 0)
2973 {
2974 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2975 if (c == '#')
2976 {
2977 /* The space before the ; is to avoid a warning on a silly compiler
2978 on the Macintosh. */
2979 while ((c = *(++ptr)) != 0 && c != '\n') ;
2980 continue;
2981 }
2982 }
2983
2984 /* Backslash may introduce a data char or a metacharacter; stop the
2985 string before the latter. */
2986
2987 if (c == '\\')
2988 {
2989 const uschar *saveptr = ptr;
2990 c = check_escape(&ptr, errorptr, bracount, options, FALSE,
2991 &compile_block);
2992 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2993 if (c < 0) { ptr = saveptr; break; }
2994
2995 #ifdef SUPPORT_UTF8
2996 if (c > 127 && (options & PCRE_UTF8) != 0)
2997 {
2998 int i;
2999 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
3000 if (c <= utf8_table1[i]) break;
3001 runlength += i;
3002 }
3003 #endif
3004 }
3005
3006 /* Ordinary character or single-char escape */
3007
3008 runlength++;
3009 }
3010
3011 /* This "while" is the end of the "do" above. */
3012
3013 while (runlength < MAXLIT &&
3014 (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
3015
3016 ptr--;
3017 length += runlength;
3018 continue;
3019 }
3020 }
3021
3022 length += 4; /* For final KET and END */
3023
3024 if (length > 65539)
3025 {
3026 *errorptr = ERR20;
3027 return NULL;
3028 }
3029
3030 /* Compute the size of data block needed and get it, either from malloc or
3031 externally provided function. We specify "code[0]" in the offsetof() expression
3032 rather than just "code", because it has been reported that one broken compiler
3033 fails on "code" because it is also an independent variable. It should make no
3034 difference to the value of the offsetof(). */
3035
3036 size = length + offsetof(real_pcre, code[0]);
3037 re = (real_pcre *)(pcre_malloc)(size);
3038
3039 if (re == NULL)
3040 {
3041 *errorptr = ERR21;
3042 return NULL;
3043 }
3044
3045 /* Put in the magic number, and save the size, options, and table pointer */
3046
3047 re->magic_number = MAGIC_NUMBER;
3048 re->size = size;
3049 re->options = options;
3050 re->tables = tables;
3051
3052 /* Set up a starting, non-extracting bracket, then compile the expression. On
3053 error, *errorptr will be set non-NULL, so we don't need to look at the result
3054 of the function here. */
3055
3056 ptr = (const uschar *)pattern;
3057 code = re->code;
3058 *code = OP_BRA;
3059 bracount = 0;
3060 (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,
3061 &reqchar, &countlits, &compile_block);
3062 re->top_bracket = bracount;
3063 re->top_backref = top_backref;
3064
3065 /* If not reached end of pattern on success, there's an excess bracket. */
3066
3067 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
3068
3069 /* Fill in the terminating state and check for disastrous overflow, but
3070 if debugging, leave the test till after things are printed out. */
3071
3072 *code++ = OP_END;
3073
3074 #ifndef DEBUG
3075 if (code - re->code > length) *errorptr = ERR23;
3076 #endif
3077
3078 /* Give an error if there's back reference to a non-existent capturing
3079 subpattern. */
3080
3081 if (top_backref > re->top_bracket) *errorptr = ERR15;
3082
3083 /* Failed to compile */
3084
3085 if (*errorptr != NULL)
3086 {
3087 (pcre_free)(re);
3088 PCRE_ERROR_RETURN:
3089 *erroroffset = ptr - (const uschar *)pattern;
3090 return NULL;
3091 }
3092
3093 /* If the anchored option was not passed, set flag if we can determine that the
3094 pattern is anchored by virtue of ^ characters or \A or anything else (such as
3095 starting with .* when DOTALL is set).
3096
3097 Otherwise, see if we can determine what the first character has to be, because
3098 that speeds up unanchored matches no end. If not, see if we can set the
3099 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
3100 start with ^. and also when all branches start with .* for non-DOTALL matches.
3101 */
3102
3103 if ((options & PCRE_ANCHORED) == 0)
3104 {
3105 int temp_options = options;
3106 if (is_anchored(re->code, &temp_options))
3107 re->options |= PCRE_ANCHORED;
3108 else
3109 {
3110 int ch = find_firstchar(re->code, &temp_options);
3111 if (ch >= 0)
3112 {
3113 re->first_char = ch;
3114 re->options |= PCRE_FIRSTSET;
3115 }
3116 else if (is_startline(re->code))
3117 re->options |= PCRE_STARTLINE;
3118 }
3119 }
3120
3121 /* Save the last required character if there are at least two literal
3122 characters on all paths, or if there is no first character setting. */
3123
3124 if (reqchar >= 0 && (countlits > 1 || (re->options & PCRE_FIRSTSET) == 0))
3125 {
3126 re->req_char = reqchar;
3127 re->options |= PCRE_REQCHSET;
3128 }
3129
3130 /* Print out the compiled data for debugging */
3131
3132 #ifdef DEBUG
3133
3134 printf("Length = %d top_bracket = %d top_backref = %d\n",
3135 length, re->top_bracket, re->top_backref);
3136
3137 if (re->options != 0)
3138 {
3139 printf("%s%s%s%s%s%s%s%s%s\n",
3140 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
3141 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
3142 ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
3143 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
3144 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
3145 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
3146 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
3147 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
3148 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
3149 }
3150
3151 if ((re->options & PCRE_FIRSTSET) != 0)
3152 {
3153 if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
3154 else printf("First char = \\x%02x\n", re->first_char);
3155 }
3156
3157 if ((re->options & PCRE_REQCHSET) != 0)
3158 {
3159 if (isprint(re->req_char)) printf("Req char = %c\n", re->req_char);
3160 else printf("Req char = \\x%02x\n", re->req_char);
3161 }
3162
3163 code_end = code;
3164 code_base = code = re->code;
3165
3166 while (code < code_end)
3167 {
3168 int charlength;
3169
3170 printf("%3d ", code - code_base);
3171
3172 if (*code >= OP_BRA)
3173 {
3174 printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
3175 code += 2;
3176 }
3177
3178 else switch(*code)
3179 {
3180 case OP_OPT:
3181 printf(" %.2x %s", code[1], OP_names[*code]);
3182 code++;
3183 break;
3184
3185 case OP_COND:
3186 printf("%3d Cond", (code[1] << 8) + code[2]);
3187 code += 2;
3188 break;
3189
3190 case OP_CREF:
3191 printf(" %.2d %s", code[1], OP_names[*code]);
3192 code++;
3193 break;
3194
3195 case OP_CHARS:
3196 charlength = *(++code);
3197 printf("%3d ", charlength);
3198 while (charlength-- > 0)
3199 if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
3200 break;
3201
3202 case OP_KETRMAX:
3203 case OP_KETRMIN:
3204 case OP_ALT:
3205 case OP_KET:
3206 case OP_ASSERT:
3207 case OP_ASSERT_NOT:
3208 case OP_ASSERTBACK:
3209 case OP_ASSERTBACK_NOT:
3210 case OP_ONCE:
3211 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
3212 code += 2;
3213 break;
3214
3215 case OP_REVERSE:
3216 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
3217 code += 2;
3218 break;
3219
3220 case OP_STAR:
3221 case OP_MINSTAR:
3222 case OP_PLUS:
3223 case OP_MINPLUS:
3224 case OP_QUERY:
3225 case OP_MINQUERY:
3226 case OP_TYPESTAR:
3227 case OP_TYPEMINSTAR:
3228 case OP_TYPEPLUS:
3229 case OP_TYPEMINPLUS:
3230 case OP_TYPEQUERY:
3231 case OP_TYPEMINQUERY:
3232 if (*code >= OP_TYPESTAR)
3233 printf(" %s", OP_names[code[1]]);
3234 else if (isprint(c = code[1])) printf(" %c", c);
3235 else printf(" \\x%02x", c);
3236 printf("%s", OP_names[*code++]);
3237 break;
3238
3239 case OP_EXACT:
3240 case OP_UPTO:
3241 case OP_MINUPTO:
3242 if (isprint(c = code[3])) printf(" %c{", c);
3243 else printf(" \\x%02x{", c);
3244 if (*code != OP_EXACT) printf("0,");
3245 printf("%d}", (code[1] << 8) + code[2]);
3246 if (*code == OP_MINUPTO) printf("?");
3247 code += 3;
3248 break;
3249
3250 case OP_TYPEEXACT:
3251 case OP_TYPEUPTO:
3252 case OP_TYPEMINUPTO:
3253 printf(" %s{", OP_names[code[3]]);
3254 if (*code != OP_TYPEEXACT) printf(",");
3255 printf("%d}", (code[1] << 8) + code[2]);
3256 if (*code == OP_TYPEMINUPTO) printf("?");
3257 code += 3;
3258 break;
3259
3260 case OP_NOT:
3261 if (isprint(c = *(++code))) printf(" [^%c]", c);
3262 else printf(" [^\\x%02x]", c);
3263 break;
3264
3265 case OP_NOTSTAR:
3266 case OP_NOTMINSTAR:
3267 case OP_NOTPLUS:
3268 case OP_NOTMINPLUS:
3269 case OP_NOTQUERY:
3270 case OP_NOTMINQUERY:
3271 if (isprint(c = code[1])) printf(" [^%c]", c);
3272 else printf(" [^\\x%02x]", c);
3273 printf("%s", OP_names[*code++]);
3274 break;
3275
3276 case OP_NOTEXACT:
3277 case OP_NOTUPTO:
3278 case OP_NOTMINUPTO:
3279 if (isprint(c = code[3])) printf(" [^%c]{", c);
3280 else printf(" [^\\x%02x]{", c);
3281 if (*code != OP_NOTEXACT) printf(",");
3282 printf("%d}", (code[1] << 8) + code[2]);
3283 if (*code == OP_NOTMINUPTO) printf("?");
3284 code += 3;
3285 break;
3286
3287 case OP_REF:
3288 printf(" \\%d", *(++code));
3289 code ++;
3290 goto CLASS_REF_REPEAT;
3291
3292 case OP_CLASS:
3293 {
3294 int i, min, max;
3295 code++;
3296 printf(" [");
3297
3298 for (i = 0; i < 256; i++)
3299 {
3300 if ((code[i/8] & (1 << (i&7))) != 0)
3301 {
3302 int j;
3303 for (j = i+1; j < 256; j++)
3304 if ((code[j/8] & (1 << (j&7))) == 0) break;
3305 if (i == '-' || i == ']') printf("\\");
3306 if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
3307 if (--j > i)
3308 {
3309 printf("-");
3310 if (j == '-' || j == ']') printf("\\");
3311 if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
3312 }
3313 i = j;
3314 }
3315 }
3316 printf("]");
3317 code += 32;
3318
3319 CLASS_REF_REPEAT:
3320
3321 switch(*code)
3322 {
3323 case OP_CRSTAR:
3324 case OP_CRMINSTAR:
3325 case OP_CRPLUS:
3326 case OP_CRMINPLUS:
3327 case OP_CRQUERY:
3328 case OP_CRMINQUERY:
3329 printf("%s", OP_names[*code]);
3330 break;
3331
3332 case OP_CRRANGE:
3333 case OP_CRMINRANGE:
3334 min = (code[1] << 8) + code[2];
3335 max = (code[3] << 8) + code[4];
3336 if (max == 0) printf("{%d,}", min);
3337 else printf("{%d,%d}", min, max);
3338 if (*code == OP_CRMINRANGE) printf("?");
3339 code += 4;
3340 break;
3341
3342 default:
3343 code--;
3344 }
3345 }
3346 break;
3347
3348 /* Anything else is just a one-node item */
3349
3350 default:
3351 printf(" %s", OP_names[*code]);
3352 break;
3353 }
3354
3355 code++;
3356 printf("\n");
3357 }
3358 printf("------------------------------------------------------------------\n");
3359
3360 /* This check is done here in the debugging case so that the code that
3361 was compiled can be seen. */
3362
3363 if (code - re->code > length)
3364 {
3365 *errorptr = ERR23;
3366 (pcre_free)(re);
3367 *erroroffset = ptr - (uschar *)pattern;
3368 return NULL;
3369 }
3370 #endif
3371
3372 return (pcre *)re;
3373 }
3374
3375
3376
3377 /*************************************************
3378 * Match a back-reference *
3379 *************************************************/
3380
3381 /* If a back reference hasn't been set, the length that is passed is greater
3382 than the number of characters left in the string, so the match fails.
3383
3384 Arguments:
3385 offset index into the offset vector
3386 eptr points into the subject
3387 length length to be matched
3388 md points to match data block
3389 ims the ims flags
3390
3391 Returns: TRUE if matched
3392 */
3393
3394 static BOOL
3395 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
3396 unsigned long int ims)
3397 {
3398 const uschar *p = md->start_subject + md->offset_vector[offset];
3399
3400 #ifdef DEBUG
3401 if (eptr >= md->end_subject)
3402 printf("matching subject <null>");
3403 else
3404 {
3405 printf("matching subject ");
3406 pchars(eptr, length, TRUE, md);
3407 }
3408 printf(" against backref ");
3409 pchars(p, length, FALSE, md);
3410 printf("\n");
3411 #endif
3412
3413 /* Always fail if not enough characters left */
3414
3415 if (length > md->end_subject - eptr) return FALSE;
3416
3417 /* Separate the caselesss case for speed */
3418
3419 if ((ims & PCRE_CASELESS) != 0)
3420 {
3421 while (length-- > 0)
3422 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
3423 }
3424 else
3425 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
3426
3427 return TRUE;
3428 }
3429
3430
3431
3432 /*************************************************
3433 * Match from current position *
3434 *************************************************/
3435
3436 /* On entry ecode points to the first opcode, and eptr to the first character
3437 in the subject string, while eptrb holds the value of eptr at the start of the
3438 last bracketed group - used for breaking infinite loops matching zero-length
3439 strings.
3440
3441 Arguments:
3442 eptr pointer in subject
3443 ecode position in code
3444 offset_top current top pointer
3445 md pointer to "static" info for the match
3446 ims current /i, /m, and /s options
3447 eptrb pointer to chain of blocks containing eptr at start of
3448 brackets - for testing for empty matches
3449 flags can contain
3450 match_condassert - this is an assertion condition
3451 match_isgroup - this is the start of a bracketed group
3452
3453 Returns: TRUE if matched
3454 */
3455
3456 static BOOL
3457 match(register const uschar *eptr, register const uschar *ecode,
3458 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
3459 int flags)
3460 {
3461 unsigned long int original_ims = ims; /* Save for resetting on ')' */
3462 eptrblock newptrb;
3463
3464 /* At the start of a bracketed group, add the current subject pointer to the
3465 stack of such pointers, to be re-instated at the end of the group when we hit
3466 the closing ket. When match() is called in other circumstances, we don't add to
3467 the stack. */
3468
3469 if ((flags & match_isgroup) != 0)
3470 {
3471 newptrb.prev = eptrb;
3472 newptrb.saved_eptr = eptr;
3473 eptrb = &newptrb;
3474 }
3475
3476 /* Now start processing the operations. */
3477
3478 for (;;)
3479 {
3480 int op = (int)*ecode;
3481 int min, max, ctype;
3482 register int i;
3483 register int c;
3484 BOOL minimize = FALSE;
3485
3486 /* Opening capturing bracket. If there is space in the offset vector, save
3487 the current subject position in the working slot at the top of the vector. We
3488 mustn't change the current values of the data slot, because they may be set
3489 from a previous iteration of this group, and be referred to by a reference
3490 inside the group.
3491
3492 If the bracket fails to match, we need to restore this value and also the
3493 values of the final offsets, in case they were set by a previous iteration of
3494 the same bracket.
3495
3496 If there isn't enough space in the offset vector, treat this as if it were a
3497 non-capturing bracket. Don't worry about setting the flag for the error case
3498 here; that is handled in the code for KET. */
3499
3500 if (op > OP_BRA)
3501 {
3502 int number = op - OP_BRA;
3503 int offset = number << 1;
3504
3505 #ifdef DEBUG
3506 printf("start bracket %d subject=", number);
3507 pchars(eptr, 16, TRUE, md);
3508 printf("\n");
3509 #endif
3510
3511 if (offset < md->offset_max)
3512 {
3513 int save_offset1 = md->offset_vector[offset];
3514 int save_offset2 = md->offset_vector[offset+1];
3515 int save_offset3 = md->offset_vector[md->offset_end - number];
3516
3517 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
3518 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
3519
3520 do
3521 {
3522 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3523 return TRUE;
3524 ecode += (ecode[1] << 8) + ecode[2];
3525 }
3526 while (*ecode == OP_ALT);
3527
3528 DPRINTF(("bracket %d failed\n", number));
3529
3530 md->offset_vector[offset] = save_offset1;
3531 md->offset_vector[offset+1] = save_offset2;
3532 md->offset_vector[md->offset_end - number] = save_offset3;
3533 return FALSE;
3534 }
3535
3536 /* Insufficient room for saving captured contents */
3537
3538 else op = OP_BRA;
3539 }
3540
3541 /* Other types of node can be handled by a switch */
3542
3543 switch(op)
3544 {
3545 case OP_BRA: /* Non-capturing bracket: optimized */
3546 DPRINTF(("start bracket 0\n"));
3547 do
3548 {
3549 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3550 return TRUE;
3551 ecode += (ecode[1] << 8) + ecode[2];
3552 }
3553 while (*ecode == OP_ALT);
3554 DPRINTF(("bracket 0 failed\n"));
3555 return FALSE;
3556
3557 /* Conditional group: compilation checked that there are no more than
3558 two branches. If the condition is false, skipping the first branch takes us
3559 past the end if there is only one branch, but that's OK because that is
3560 exactly what going to the ket would do. */
3561
3562 case OP_COND:
3563 if (ecode[3] == OP_CREF) /* Condition is extraction test */
3564 {
3565 int offset = ecode[4] << 1; /* Doubled reference number */
3566 return match(eptr,
3567 ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
3568 5 : 3 + (ecode[1] << 8) + ecode[2]),
3569 offset_top, md, ims, eptrb, match_isgroup);
3570 }
3571
3572 /* The condition is an assertion. Call match() to evaluate it - setting
3573 the final argument TRUE causes it to stop at the end of an assertion. */
3574
3575 else
3576 {
3577 if (match(eptr, ecode+3, offset_top, md, ims, NULL,
3578 match_condassert | match_isgroup))
3579 {
3580 ecode += 3 + (ecode[4] << 8) + ecode[5];
3581 while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
3582 }
3583 else ecode += (ecode[1] << 8) + ecode[2];
3584 return match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup);
3585 }
3586 /* Control never reaches here */
3587
3588 /* Skip over conditional reference data if encountered (should not be) */
3589
3590 case OP_CREF:
3591 ecode += 2;
3592 break;
3593
3594 /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
3595 an empty string - recursion will then try other alternatives, if any. */
3596
3597 case OP_END:
3598 if (md->notempty && eptr == md->start_match) return FALSE;
3599 md->end_match_ptr = eptr; /* Record where we ended */
3600 md->end_offset_top = offset_top; /* and how many extracts were taken */
3601 return TRUE;
3602
3603 /* Change option settings */
3604
3605 case OP_OPT:
3606 ims = ecode[1];
3607 ecode += 2;
3608 DPRINTF(("ims set to %02lx\n", ims));
3609 break;
3610
3611 /* Assertion brackets. Check the alternative branches in turn - the
3612 matching won't pass the KET for an assertion. If any one branch matches,
3613 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
3614 start of each branch to move the current point backwards, so the code at
3615 this level is identical to the lookahead case. */
3616
3617 case OP_ASSERT:
3618 case OP_ASSERTBACK:
3619 do
3620 {
3621 if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup)) break;
3622 ecode += (ecode[1] << 8) + ecode[2];
3623 }
3624 while (*ecode == OP_ALT);
3625 if (*ecode == OP_KET) return FALSE;
3626
3627 /* If checking an assertion for a condition, return TRUE. */
3628
3629 if ((flags & match_condassert) != 0) return TRUE;
3630
3631 /* Continue from after the assertion, updating the offsets high water
3632 mark, since extracts may have been taken during the assertion. */
3633
3634 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3635 ecode += 3;
3636 offset_top = md->end_offset_top;
3637 continue;
3638
3639 /* Negative assertion: all branches must fail to match */
3640
3641 case OP_ASSERT_NOT:
3642 case OP_ASSERTBACK_NOT:
3643 do
3644 {
3645 if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup))
3646 return FALSE;
3647 ecode += (ecode[1] << 8) + ecode[2];
3648 }
3649 while (*ecode == OP_ALT);
3650
3651 if ((flags & match_condassert) != 0) return TRUE;
3652
3653 ecode += 3;
3654 continue;
3655
3656 /* Move the subject pointer back. This occurs only at the start of
3657 each branch of a lookbehind assertion. If we are too close to the start to
3658 move back, this match function fails. When working with UTF-8 we move
3659 back a number of characters, not bytes. */
3660
3661 case OP_REVERSE:
3662 #ifdef SUPPORT_UTF8
3663 c = (ecode[1] << 8) + ecode[2];
3664 for (i = 0; i < c; i++)
3665 {
3666 eptr--;
3667 BACKCHAR(eptr)
3668 }
3669 #else
3670 eptr -= (ecode[1] << 8) + ecode[2];
3671 #endif
3672
3673 if (eptr < md->start_subject) return FALSE;
3674 ecode += 3;
3675 break;
3676
3677 /* Recursion matches the current regex, nested. If there are any capturing
3678 brackets started but not finished, we have to save their starting points
3679 and reinstate them after the recursion. However, we don't know how many
3680 such there are (offset_top records the completed total) so we just have
3681 to save all the potential data. There may be up to 99 such values, which
3682 is a bit large to put on the stack, but using malloc for small numbers
3683 seems expensive. As a compromise, the stack is used when there are fewer
3684 than 16 values to store; otherwise malloc is used. A problem is what to do
3685 if the malloc fails ... there is no way of returning to the top level with
3686 an error. Save the top 15 values on the stack, and accept that the rest
3687 may be wrong. */
3688
3689 case OP_RECURSE:
3690 {
3691 BOOL rc;
3692 int *save;
3693 int stacksave[15];
3694
3695 c = md->offset_max;
3696
3697 if (c < 16) save = stacksave; else
3698 {
3699 save = (int *)(pcre_malloc)((c+1) * sizeof(int));
3700 if (save == NULL)
3701 {
3702 save = stacksave;
3703 c = 15;
3704 }
3705 }
3706
3707 for (i = 1; i <= c; i++)
3708 save[i] = md->offset_vector[md->offset_end - i];
3709 rc = match(eptr, md->start_pattern, offset_top, md, ims, eptrb,
3710 match_isgroup);
3711 for (i = 1; i <= c; i++)
3712 md->offset_vector[md->offset_end - i] = save[i];
3713 if (save != stacksave) (pcre_free)(save);
3714 if (!rc) return FALSE;
3715
3716 /* In case the recursion has set more capturing values, save the final
3717 number, then move along the subject till after the recursive match,
3718 and advance one byte in the pattern code. */
3719
3720 offset_top = md->end_offset_top;
3721 eptr = md->end_match_ptr;
3722 ecode++;
3723 }
3724 break;
3725
3726 /* "Once" brackets are like assertion brackets except that after a match,
3727 the point in the subject string is not moved back. Thus there can never be
3728 a move back into the brackets. Check the alternative branches in turn - the
3729 matching won't pass the KET for this kind of subpattern. If any one branch
3730 matches, we carry on as at the end of a normal bracket, leaving the subject
3731 pointer. */
3732
3733 case OP_ONCE:
3734 {
3735 const uschar *prev = ecode;
3736 const uschar *saved_eptr = eptr;
3737
3738 do
3739 {
3740 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3741 break;
3742 ecode += (ecode[1] << 8) + ecode[2];
3743 }
3744 while (*ecode == OP_ALT);
3745
3746 /* If hit the end of the group (which could be repeated), fail */
3747
3748 if (*ecode != OP_ONCE && *ecode != OP_ALT) return FALSE;
3749
3750 /* Continue as from after the assertion, updating the offsets high water
3751 mark, since extracts may have been taken. */
3752
3753 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3754
3755 offset_top = md->end_offset_top;
3756 eptr = md->end_match_ptr;
3757
3758 /* For a non-repeating ket, just continue at this level. This also
3759 happens for a repeating ket if no characters were matched in the group.
3760 This is the forcible breaking of infinite loops as implemented in Perl
3761 5.005. If there is an options reset, it will get obeyed in the normal
3762 course of events. */
3763
3764 if (*ecode == OP_KET || eptr == saved_eptr)
3765 {
3766 ecode += 3;
3767 break;
3768 }
3769
3770 /* The repeating kets try the rest of the pattern or restart from the
3771 preceding bracket, in the appropriate order. We need to reset any options
3772 that changed within the bracket before re-running it, so check the next
3773 opcode. */
3774
3775 if (ecode[3] == OP_OPT)
3776 {
3777 ims = (ims & ~PCRE_IMS) | ecode[4];
3778 DPRINTF(("ims set to %02lx at group repeat\n", ims));
3779 }
3780
3781 if (*ecode == OP_KETRMIN)
3782 {
3783 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3784 match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3785 return TRUE;
3786 }
3787 else /* OP_KETRMAX */
3788 {
3789 if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3790 match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3791 }
3792 }
3793 return FALSE;
3794
3795 /* An alternation is the end of a branch; scan along to find the end of the
3796 bracketed group and go to there. */
3797
3798 case OP_ALT:
3799 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3800 break;
3801
3802 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3803 that it may occur zero times. It may repeat infinitely, or not at all -
3804 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3805 repeat limits are compiled as a number of copies, with the optional ones
3806 preceded by BRAZERO or BRAMINZERO. */
3807
3808 case OP_BRAZERO:
3809 {
3810 const uschar *next = ecode+1;
3811 if (match(eptr, next, offset_top, md, ims, eptrb, match_isgroup))
3812 return TRUE;
3813 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3814 ecode = next + 3;
3815 }
3816 break;
3817
3818 case OP_BRAMINZERO:
3819 {
3820 const uschar *next = ecode+1;
3821 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3822 if (match(eptr, next+3, offset_top, md, ims, eptrb, match_isgroup))
3823 return TRUE;
3824 ecode++;
3825 }
3826 break;
3827
3828 /* End of a group, repeated or non-repeating. If we are at the end of
3829 an assertion "group", stop matching and return TRUE, but record the
3830 current high water mark for use by positive assertions. Do this also
3831 for the "once" (not-backup up) groups. */
3832
3833 case OP_KET:
3834 case OP_KETRMIN:
3835 case OP_KETRMAX:
3836 {
3837 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3838 const uschar *saved_eptr = eptrb->saved_eptr;
3839
3840 eptrb = eptrb->prev; /* Back up the stack of bracket start pointers */
3841
3842 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
3843 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
3844 *prev == OP_ONCE)
3845 {
3846 md->end_match_ptr = eptr; /* For ONCE */
3847 md->end_offset_top = offset_top;
3848 return TRUE;
3849 }
3850
3851 /* In all other cases except a conditional group we have to check the
3852 group number back at the start and if necessary complete handling an
3853 extraction by setting the offsets and bumping the high water mark. */
3854
3855 if (*prev != OP_COND)
3856 {
3857 int number = *prev - OP_BRA;
3858 int offset = number << 1;
3859
3860 #ifdef DEBUG
3861 printf("end bracket %d", number);
3862 printf("\n");
3863 #endif
3864
3865 if (number > 0)
3866 {
3867 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
3868 {
3869 md->offset_vector[offset] =
3870 md->offset_vector[md->offset_end - number];
3871 md->offset_vector[offset+1] = eptr - md->start_subject;
3872 if (offset_top <= offset) offset_top = offset + 2;
3873 }
3874 }
3875 }
3876
3877 /* Reset the value of the ims flags, in case they got changed during
3878 the group. */
3879
3880 ims = original_ims;
3881 DPRINTF(("ims reset to %02lx\n", ims));
3882
3883 /* For a non-repeating ket, just continue at this level. This also
3884 happens for a repeating ket if no characters were matched in the group.
3885 This is the forcible breaking of infinite loops as implemented in Perl
3886 5.005. If there is an options reset, it will get obeyed in the normal
3887 course of events. */
3888
3889 if (*ecode == OP_KET || eptr == saved_eptr)
3890 {
3891 ecode += 3;
3892 break;
3893 }
3894
3895 /* The repeating kets try the rest of the pattern or restart from the
3896 preceding bracket, in the appropriate order. */
3897
3898 if (*ecode == OP_KETRMIN)
3899 {
3900 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3901 match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3902 return TRUE;
3903 }
3904 else /* OP_KETRMAX */
3905 {
3906 if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3907 match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3908 }
3909 }
3910 return FALSE;
3911
3912 /* Start of subject unless notbol, or after internal newline if multiline */
3913
3914 case OP_CIRC:
3915 if (md->notbol && eptr == md->start_subject) return FALSE;
3916 if ((ims & PCRE_MULTILINE) != 0)
3917 {
3918 if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;
3919 ecode++;
3920 break;
3921 }
3922 /* ... else fall through */
3923
3924 /* Start of subject assertion */
3925
3926 case OP_SOD:
3927 if (eptr != md->start_subject) return FALSE;
3928 ecode++;
3929 break;
3930
3931 /* Assert before internal newline if multiline, or before a terminating
3932 newline unless endonly is set, else end of subject unless noteol is set. */
3933
3934 case OP_DOLL:
3935 if ((ims & PCRE_MULTILINE) != 0)
3936 {
3937 if (eptr < md->end_subject) { if (*eptr != '\n') return FALSE; }
3938 else { if (md->noteol) return FALSE; }
3939 ecode++;
3940 break;
3941 }
3942 else
3943 {
3944 if (md->noteol) return FALSE;
3945 if (!md->endonly)
3946 {
3947 if (eptr < md->end_subject - 1 ||
3948 (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3949
3950 ecode++;
3951 break;
3952 }
3953 }
3954 /* ... else fall through */
3955
3956 /* End of subject assertion (\z) */
3957
3958 case OP_EOD:
3959 if (eptr < md->end_subject) return FALSE;
3960 ecode++;
3961 break;
3962
3963 /* End of subject or ending \n assertion (\Z) */
3964
3965 case OP_EODN:
3966 if (eptr < md->end_subject - 1 ||
3967 (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3968 ecode++;
3969 break;
3970
3971 /* Word boundary assertions */
3972
3973 case OP_NOT_WORD_BOUNDARY:
3974 case OP_WORD_BOUNDARY:
3975 {
3976 BOOL prev_is_word = (eptr != md->start_subject) &&
3977 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
3978 BOOL cur_is_word = (eptr < md->end_subject) &&
3979 ((md->ctypes[*eptr] & ctype_word) != 0);
3980 if ((*ecode++ == OP_WORD_BOUNDARY)?
3981 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3982 return FALSE;
3983 }
3984 break;
3985
3986 /* Match a single character type; inline for speed */
3987
3988 case OP_ANY:
3989 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == '\n')
3990 return FALSE;
3991 if (eptr++ >= md->end_subject) return FALSE;
3992 #ifdef SUPPORT_UTF8
3993 if (md->utf8)
3994 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3995 #endif
3996 ecode++;
3997 break;
3998
3999 case OP_NOT_DIGIT:
4000 if (eptr >= md->end_subject ||
4001 (md->ctypes[*eptr++] & ctype_digit) != 0)
4002 return FALSE;
4003 ecode++;
4004 break;
4005
4006 case OP_DIGIT:
4007 if (eptr >= md->end_subject ||
4008 (md->ctypes[*eptr++] & ctype_digit) == 0)
4009 return FALSE;
4010 ecode++;
4011 break;
4012
4013 case OP_NOT_WHITESPACE:
4014 if (eptr >= md->end_subject ||
4015 (md->ctypes[*eptr++] & ctype_space) != 0)
4016 return FALSE;
4017 ecode++;
4018 break;
4019
4020 case OP_WHITESPACE:
4021 if (eptr >= md->end_subject ||
4022 (md->ctypes[*eptr++] & ctype_space) == 0)
4023 return FALSE;
4024 ecode++;
4025 break;
4026
4027 case OP_NOT_WORDCHAR:
4028 if (eptr >= md->end_subject ||
4029 (md->ctypes[*eptr++] & ctype_word) != 0)
4030 return FALSE;
4031 ecode++;
4032 break;
4033
4034 case OP_WORDCHAR:
4035 if (eptr >= md->end_subject ||
4036 (md->ctypes[*eptr++] & ctype_word) == 0)
4037 return FALSE;
4038 ecode++;
4039 break;
4040
4041 /* Match a back reference, possibly repeatedly. Look past the end of the
4042 item to see if there is repeat information following. The code is similar
4043 to that for character classes, but repeated for efficiency. Then obey
4044 similar code to character type repeats - written out again for speed.
4045 However, if the referenced string is the empty string, always treat
4046 it as matched, any number of times (otherwise there could be infinite
4047 loops). */
4048
4049 case OP_REF:
4050 {
4051 int length;
4052 int offset = ecode[1] << 1; /* Doubled reference number */
4053 ecode += 2; /* Advance past the item */
4054
4055 /* If the reference is unset, set the length to be longer than the amount
4056 of subject left; this ensures that every attempt at a match fails. We
4057 can't just fail here, because of the possibility of quantifiers with zero
4058 minima. */
4059
4060 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
4061 md->end_subject - eptr + 1 :
4062 md->offset_vector[offset+1] - md->offset_vector[offset];
4063
4064 /* Set up for repetition, or handle the non-repeated case */
4065
4066 switch (*ecode)
4067 {
4068 case OP_CRSTAR:
4069 case OP_CRMINSTAR:
4070 case OP_CRPLUS:
4071 case OP_CRMINPLUS:
4072 case OP_CRQUERY:
4073 case OP_CRMINQUERY:
4074 c = *ecode++ - OP_CRSTAR;
4075 minimize = (c & 1) != 0;
4076 min = rep_min[c]; /* Pick up values from tables; */
4077 max = rep_max[c]; /* zero for max => infinity */
4078 if (max == 0) max = INT_MAX;
4079 break;
4080
4081 case OP_CRRANGE:
4082 case OP_CRMINRANGE:
4083 minimize = (*ecode == OP_CRMINRANGE);
4084 min = (ecode[1] << 8) + ecode[2];
4085 max = (ecode[3] << 8) + ecode[4];
4086 if (max == 0) max = INT_MAX;
4087 ecode += 5;
4088 break;
4089
4090 default: /* No repeat follows */
4091 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
4092 eptr += length;
4093 continue; /* With the main loop */
4094 }
4095
4096 /* If the length of the reference is zero, just continue with the
4097 main loop. */
4098
4099 if (length == 0) continue;
4100
4101 /* First, ensure the minimum number of matches are present. We get back
4102 the length of the reference string explicitly rather than passing the
4103 address of eptr, so that eptr can be a register variable. */
4104
4105 for (i = 1; i <= min; i++)
4106 {
4107 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
4108 eptr += length;
4109 }
4110
4111 /* If min = max, continue at the same level without recursion.
4112 They are not both allowed to be zero. */
4113
4114 if (min == max) continue;
4115
4116 /* If minimizing, keep trying and advancing the pointer */
4117
4118 if (minimize)
4119 {
4120 for (i = min;; i++)
4121 {
4122 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4123 return TRUE;
4124 if (i >= max || !match_ref(offset, eptr, length, md, ims))
4125 return FALSE;
4126 eptr += length;
4127 }
4128 /* Control never gets here */
4129 }
4130
4131 /* If maximizing, find the longest string and work backwards */
4132
4133 else
4134 {
4135 const uschar *pp = eptr;
4136 for (i = min; i < max; i++)
4137 {
4138 if (!match_ref(offset, eptr, length, md, ims)) break;
4139 eptr += length;
4140 }
4141 while (eptr >= pp)
4142 {
4143 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4144 return TRUE;
4145 eptr -= length;
4146 }
4147 return FALSE;
4148 }
4149 }
4150 /* Control never gets here */
4151
4152
4153
4154 /* Match a character class, possibly repeatedly. Look past the end of the
4155 item to see if there is repeat information following. Then obey similar
4156 code to character type repeats - written out again for speed. */
4157
4158 case OP_CLASS:
4159 {
4160 const uschar *data = ecode + 1; /* Save for matching */
4161 ecode += 33; /* Advance past the item */
4162
4163 switch (*ecode)
4164 {
4165 case OP_CRSTAR:
4166 case OP_CRMINSTAR:
4167 case OP_CRPLUS:
4168 case OP_CRMINPLUS:
4169 case OP_CRQUERY:
4170 case OP_CRMINQUERY:
4171 c = *ecode++ - OP_CRSTAR;
4172 minimize = (c & 1) != 0;
4173 min = rep_min[c]; /* Pick up values from tables; */
4174 max = rep_max[c]; /* zero for max => infinity */
4175 if (max == 0) max = INT_MAX;
4176 break;
4177
4178 case OP_CRRANGE:
4179 case OP_CRMINRANGE:
4180 minimize = (*ecode == OP_CRMINRANGE);
4181 min = (ecode[1] << 8) + ecode[2];
4182 max = (ecode[3] << 8) + ecode[4];
4183 if (max == 0) max = INT_MAX;
4184 ecode += 5;
4185 break;
4186
4187 default: /* No repeat follows */
4188 min = max = 1;
4189 break;
4190 }
4191
4192 /* First, ensure the minimum number of matches are present. */
4193
4194 for (i = 1; i <= min; i++)
4195 {
4196 if (eptr >= md->end_subject) return FALSE;
4197 GETCHARINC(c, eptr) /* Get character; increment eptr */
4198
4199 #ifdef SUPPORT_UTF8
4200 /* We do not yet support class members > 255 */
4201 if (c > 255) return FALSE;
4202 #endif
4203
4204 if ((data[c/8] & (1 << (c&7))) != 0) continue;
4205 return FALSE;
4206 }
4207
4208 /* If max == min we can continue with the main loop without the
4209 need to recurse. */
4210
4211 if (min == max) continue;
4212
4213 /* If minimizing, keep testing the rest of the expression and advancing
4214 the pointer while it matches the class. */
4215
4216 if (minimize)
4217 {
4218 for (i = min;; i++)
4219 {
4220 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4221 return TRUE;
4222 if (i >= max || eptr >= md->end_subject) return FALSE;
4223 GETCHARINC(c, eptr) /* Get character; increment eptr */
4224
4225 #ifdef SUPPORT_UTF8
4226 /* We do not yet support class members > 255 */
4227 if (c > 255) return FALSE;
4228 #endif
4229 if ((data[c/8] & (1 << (c&7))) != 0) continue;
4230 return FALSE;
4231 }
4232 /* Control never gets here */
4233 }
4234
4235 /* If maximizing, find the longest possible run, then work backwards. */
4236
4237 else
4238 {
4239 const uschar *pp = eptr;
4240 int len = 1;
4241 for (i = min; i < max; i++)
4242 {
4243 if (eptr >= md->end_subject) break;
4244 GETCHARLEN(c, eptr, len) /* Get character, set length if UTF-8 */
4245
4246 #ifdef SUPPORT_UTF8
4247 /* We do not yet support class members > 255 */
4248 if (c > 255) break;
4249 #endif
4250 if ((data[c/8] & (1 << (c&7))) == 0) break;
4251 eptr += len;
4252 }
4253
4254 while (eptr >= pp)
4255 {
4256 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4257 return TRUE;
4258
4259 #ifdef SUPPORT_UTF8
4260 BACKCHAR(eptr)
4261 #endif
4262 }
4263 return FALSE;
4264 }
4265 }
4266 /* Control never gets here */
4267
4268 /* Match a run of characters */
4269
4270 case OP_CHARS:
4271 {
4272 register int length = ecode[1];
4273 ecode += 2;
4274
4275 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4276 if (eptr >= md->end_subject)
4277 printf("matching subject <null> against pattern ");
4278 else
4279 {
4280 printf("matching subject ");
4281 pchars(eptr, length, TRUE, md);
4282 printf(" against pattern ");
4283 }
4284 pchars(ecode, length, FALSE, md);
4285 printf("\n");
4286 #endif
4287
4288 if (length > md->end_subject - eptr) return FALSE;
4289 if ((ims & PCRE_CASELESS) != 0)
4290 {
4291 while (length-- > 0)
4292 if (md->lcc[*ecode++] != md->lcc[*eptr++])
4293 return FALSE;
4294 }
4295 else
4296 {
4297 while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
4298 }
4299 }
4300 break;
4301
4302 /* Match a single character repeatedly; different opcodes share code. */
4303
4304 case OP_EXACT:
4305 min = max = (ecode[1] << 8) + ecode[2];
4306 ecode += 3;
4307 goto REPEATCHAR;
4308
4309 case OP_UPTO:
4310 case OP_MINUPTO:
4311 min = 0;
4312 max = (ecode[1] << 8) + ecode[2];
4313 minimize = *ecode == OP_MINUPTO;
4314 ecode += 3;
4315 goto REPEATCHAR;
4316
4317 case OP_STAR:
4318 case OP_MINSTAR:
4319 case OP_PLUS:
4320 case OP_MINPLUS:
4321 case OP_QUERY:
4322 case OP_MINQUERY:
4323 c = *ecode++ - OP_STAR;
4324 minimize = (c & 1) != 0;
4325 min = rep_min[c]; /* Pick up values from tables; */
4326 max = rep_max[c]; /* zero for max => infinity */
4327 if (max == 0) max = INT_MAX;
4328
4329 /* Common code for all repeated single-character matches. We can give
4330 up quickly if there are fewer than the minimum number of characters left in
4331 the subject. */
4332
4333 REPEATCHAR:
4334 if (min > md->end_subject - eptr) return FALSE;
4335 c = *ecode++;
4336
4337 /* The code is duplicated for the caseless and caseful cases, for speed,
4338 since matching characters is likely to be quite common. First, ensure the
4339 minimum number of matches are present. If min = max, continue at the same
4340 level without recursing. Otherwise, if minimizing, keep trying the rest of
4341 the expression and advancing one matching character if failing, up to the
4342 maximum. Alternatively, if maximizing, find the maximum number of
4343 characters and work backwards. */
4344
4345 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
4346 max, eptr));
4347
4348 if ((ims & PCRE_CASELESS) != 0)
4349 {
4350 c = md->lcc[c];
4351 for (i = 1; i <= min; i++)
4352 if (c != md->lcc[*eptr++]) return FALSE;
4353 if (min == max) continue;
4354 if (minimize)
4355 {
4356 for (i = min;; i++)
4357 {
4358 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4359 return TRUE;
4360 if (i >= max || eptr >= md->end_subject ||
4361 c != md->lcc[*eptr++])
4362 return FALSE;
4363 }
4364 /* Control never gets here */
4365 }
4366 else
4367 {
4368 const uschar *pp = eptr;
4369 for (i = min; i < max; i++)
4370 {
4371 if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
4372 eptr++;
4373 }
4374 while (eptr >= pp)
4375 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4376 return TRUE;
4377 return FALSE;
4378 }
4379 /* Control never gets here */
4380 }
4381
4382 /* Caseful comparisons */
4383
4384 else
4385 {
4386 for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
4387 if (min == max) continue;
4388 if (minimize)
4389 {
4390 for (i = min;; i++)
4391 {
4392 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4393 return TRUE;
4394 if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
4395 }
4396 /* Control never gets here */
4397 }
4398 else
4399 {
4400 const uschar *pp = eptr;
4401 for (i = min; i < max; i++)
4402 {
4403 if (eptr >= md->end_subject || c != *eptr) break;
4404 eptr++;
4405 }
4406 while (eptr >= pp)
4407 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4408 return TRUE;
4409 return FALSE;
4410 }
4411 }
4412 /* Control never gets here */
4413
4414 /* Match a negated single character */
4415
4416 case OP_NOT:
4417 if (eptr >= md->end_subject) return FALSE;
4418 ecode++;
4419 if ((ims & PCRE_CASELESS) != 0)
4420 {
4421 if (md->lcc[*ecode++] == md->lcc[*eptr++]) return FALSE;
4422 }
4423 else
4424 {
4425 if (*ecode++ == *eptr++) return FALSE;
4426 }
4427 break;
4428
4429 /* Match a negated single character repeatedly. This is almost a repeat of
4430 the code for a repeated single character, but I haven't found a nice way of
4431 commoning these up that doesn't require a test of the positive/negative
4432 option for each character match. Maybe that wouldn't add very much to the
4433 time taken, but character matching *is* what this is all about... */
4434
4435 case OP_NOTEXACT:
4436 min = max = (ecode[1] << 8) + ecode[2];
4437 ecode += 3;
4438 goto REPEATNOTCHAR;
4439
4440 case OP_NOTUPTO:
4441 case OP_NOTMINUPTO:
4442 min = 0;
4443 max = (ecode[1] << 8) + ecode[2];
4444 minimize = *ecode == OP_NOTMINUPTO;
4445 ecode += 3;
4446 goto REPEATNOTCHAR;
4447
4448 case OP_NOTSTAR:
4449 case OP_NOTMINSTAR:
4450 case OP_NOTPLUS:
4451 case OP_NOTMINPLUS:
4452 case OP_NOTQUERY:
4453 case OP_NOTMINQUERY:
4454 c = *ecode++ - OP_NOTSTAR;
4455 minimize = (c & 1) != 0;
4456 min = rep_min[c]; /* Pick up values from tables; */
4457 max = rep_max[c]; /* zero for max => infinity */
4458 if (max == 0) max = INT_MAX;
4459
4460 /* Common code for all repeated single-character matches. We can give
4461 up quickly if there are fewer than the minimum number of characters left in
4462 the subject. */
4463
4464 REPEATNOTCHAR:
4465 if (min > md->end_subject - eptr) return FALSE;
4466 c = *ecode++;
4467
4468 /* The code is duplicated for the caseless and caseful cases, for speed,
4469 since matching characters is likely to be quite common. First, ensure the
4470 minimum number of matches are present. If min = max, continue at the same
4471 level without recursing. Otherwise, if minimizing, keep trying the rest of
4472 the expression and advancing one matching character if failing, up to the
4473 maximum. Alternatively, if maximizing, find the maximum number of
4474 characters and work backwards. */
4475
4476 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
4477 max, eptr));
4478
4479 if ((ims & PCRE_CASELESS) != 0)
4480 {
4481 c = md->lcc[c];
4482 for (i = 1; i <= min; i++)
4483 if (c == md->lcc[*eptr++]) return FALSE;
4484 if (min == max) continue;
4485 if (minimize)
4486 {
4487 for (i = min;; i++)
4488 {
4489 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4490 return TRUE;
4491 if (i >= max || eptr >= md->end_subject ||
4492 c == md->lcc[*eptr++])
4493 return FALSE;
4494 }
4495 /* Control never gets here */
4496 }
4497 else
4498 {
4499 const uschar *pp = eptr;
4500 for (i = min; i < max; i++)
4501 {
4502 if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
4503 eptr++;
4504 }
4505 while (eptr >= pp)
4506 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4507 return TRUE;
4508 return FALSE;
4509 }
4510 /* Control never gets here */
4511 }
4512
4513 /* Caseful comparisons */
4514
4515 else
4516 {
4517 for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
4518 if (min == max) continue;
4519 if (minimize)
4520 {
4521 for (i = min;; i++)
4522 {
4523 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4524 return TRUE;
4525 if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
4526 }
4527 /* Control never gets here */
4528 }
4529 else
4530 {
4531 const uschar *pp = eptr;
4532 for (i = min; i < max; i++)
4533 {
4534 if (eptr >= md->end_subject || c == *eptr) break;
4535 eptr++;
4536 }
4537 while (eptr >= pp)
4538 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4539 return TRUE;
4540 return FALSE;
4541 }
4542 }
4543 /* Control never gets here */
4544
4545 /* Match a single character type repeatedly; several different opcodes
4546 share code. This is very similar to the code for single characters, but we
4547 repeat it in the interests of efficiency. */
4548
4549 case OP_TYPEEXACT:
4550 min = max = (ecode[1] << 8) + ecode[2];
4551 minimize = TRUE;
4552 ecode += 3;
4553 goto REPEATTYPE;
4554
4555 case OP_TYPEUPTO:
4556 case OP_TYPEMINUPTO:
4557 min = 0;
4558 max = (ecode[1] << 8) + ecode[2];
4559 minimize = *ecode == OP_TYPEMINUPTO;
4560 ecode += 3;
4561 goto REPEATTYPE;
4562
4563 case OP_TYPESTAR:
4564 case OP_TYPEMINSTAR:
4565 case OP_TYPEPLUS:
4566 case OP_TYPEMINPLUS:
4567 case OP_TYPEQUERY:
4568 case OP_TYPEMINQUERY:
4569 c = *ecode++ - OP_TYPESTAR;
4570 minimize = (c & 1) != 0;
4571 min = rep_min[c]; /* Pick up values from tables; */
4572 max = rep_max[c]; /* zero for max => infinity */
4573 if (max == 0) max = INT_MAX;
4574
4575 /* Common code for all repeated single character type matches */
4576
4577 REPEATTYPE:
4578 ctype = *ecode++; /* Code for the character type */
4579
4580 /* First, ensure the minimum number of matches are present. Use inline
4581 code for maximizing the speed, and do the type test once at the start
4582 (i.e. keep it out of the loop). Also we can test that there are at least
4583 the minimum number of bytes before we start, except when doing '.' in
4584 UTF8 mode. Leave the test in in all cases; in the special case we have
4585 to test after each character. */
4586
4587 if (min > md->end_subject - eptr) return FALSE;
4588 if (min > 0) switch(ctype)
4589 {
4590 case OP_ANY:
4591 #ifdef SUPPORT_UTF8
4592 if (md->utf8)
4593 {
4594 for (i = 1; i <= min; i++)
4595 {
4596 if (eptr >= md->end_subject ||
4597 (*eptr++ == '\n' && (ims & PCRE_DOTALL) == 0))
4598 return FALSE;
4599 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4600 }
4601 break;
4602 }
4603 #endif
4604 /* Non-UTF8 can be faster */
4605 if ((ims & PCRE_DOTALL) == 0)
4606 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }
4607 else eptr += min;
4608 break;
4609
4610 case OP_NOT_DIGIT:
4611 for (i = 1; i <= min; i++)
4612 if ((md->ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
4613 break;
4614
4615 case OP_DIGIT:
4616 for (i = 1; i <= min; i++)
4617 if ((md->ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
4618 break;
4619
4620 case OP_NOT_WHITESPACE:
4621 for (i = 1; i <= min; i++)
4622 if ((md->ctypes[*eptr++] & ctype_space) != 0) return FALSE;
4623 break;
4624
4625 case OP_WHITESPACE:
4626 for (i = 1; i <= min; i++)
4627 if ((md->ctypes[*eptr++] & ctype_space) == 0) return FALSE;
4628 break;
4629
4630 case OP_NOT_WORDCHAR:
4631 for (i = 1; i <= min; i++)
4632 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4633 return FALSE;
4634 break;
4635
4636 case OP_WORDCHAR:
4637 for (i = 1; i <= min; i++)
4638 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4639 return FALSE;
4640 break;
4641 }
4642
4643 /* If min = max, continue at the same level without recursing */
4644
4645 if (min == max) continue;
4646
4647 /* If minimizing, we have to test the rest of the pattern before each
4648 subsequent match. */
4649
4650 if (minimize)
4651 {
4652 for (i = min;; i++)
4653 {
4654 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0)) return TRUE;
4655 if (i >= max || eptr >= md->end_subject) return FALSE;
4656
4657 c = *eptr++;
4658 switch(ctype)
4659 {
4660 case OP_ANY:
4661 if ((ims & PCRE_DOTALL) == 0 && c == '\n') return FALSE;
4662 #ifdef SUPPORT_UTF8
4663 if (md->utf8)
4664 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4665 #endif
4666 break;
4667
4668 case OP_NOT_DIGIT:
4669 if ((md->ctypes[c] & ctype_digit) != 0) return FALSE;
4670 break;
4671
4672 case OP_DIGIT:
4673 if ((md->ctypes[c] & ctype_digit) == 0) return FALSE;
4674 break;
4675
4676 case OP_NOT_WHITESPACE:
4677 if ((md->ctypes[c] & ctype_space) != 0) return FALSE;
4678 break;
4679
4680 case OP_WHITESPACE:
4681 if ((md->ctypes[c] & ctype_space) == 0) return FALSE;
4682 break;
4683
4684 case OP_NOT_WORDCHAR:
4685 if ((md->ctypes[c] & ctype_word) != 0) return FALSE;
4686 break;
4687
4688 case OP_WORDCHAR:
4689 if ((md->ctypes[c] & ctype_word) == 0) return FALSE;
4690 break;
4691 }
4692 }
4693 /* Control never gets here */
4694 }
4695
4696 /* If maximizing it is worth using inline code for speed, doing the type
4697 test once at the start (i.e. keep it out of the loop). */
4698
4699 else
4700 {
4701 const uschar *pp = eptr;
4702 switch(ctype)
4703 {
4704 case OP_ANY:
4705
4706 /* Special code is required for UTF8, but when the maximum is unlimited
4707 we don't need it. */
4708
4709 #ifdef SUPPORT_UTF8
4710 if (md->utf8 && max < INT_MAX)
4711 {
4712 if ((ims & PCRE_DOTALL) == 0)
4713 {
4714 for (i = min; i < max; i++)
4715 {
4716 if (eptr >= md->end_subject || *eptr++ == '\n') break;
4717 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4718 }
4719 }
4720 else
4721 {
4722 for (i = min; i < max; i++)
4723 {
4724 eptr++;
4725 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4726 }
4727 }
4728 break;
4729 }
4730 #endif
4731 /* Non-UTF8 can be faster */
4732 if ((ims & PCRE_DOTALL) == 0)
4733 {
4734 for (i = min; i < max; i++)
4735 {
4736 if (eptr >= md->end_subject || *eptr == '\n') break;
4737 eptr++;
4738 }
4739 }
4740 else
4741 {
4742 c = max - min;
4743 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
4744 eptr += c;
4745 }
4746 break;
4747
4748 case OP_NOT_DIGIT:
4749 for (i = min; i < max; i++)
4750 {
4751 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4752 break;
4753 eptr++;
4754 }
4755 break;
4756
4757 case OP_DIGIT:
4758 for (i = min; i < max; i++)
4759 {
4760 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4761 break;
4762 eptr++;
4763 }
4764 break;
4765
4766 case OP_NOT_WHITESPACE:
4767 for (i = min; i < max; i++)
4768 {
4769 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4770 break;
4771 eptr++;
4772 }
4773 break;
4774
4775 case OP_WHITESPACE:
4776 for (i = min; i < max; i++)
4777 {
4778 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4779 break;
4780 eptr++;
4781 }
4782 break;
4783
4784 case OP_NOT_WORDCHAR:
4785 for (i = min; i < max; i++)
4786 {
4787 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4788 break;
4789 eptr++;
4790 }
4791 break;
4792
4793 case OP_WORDCHAR:
4794 for (i = min; i < max; i++)
4795 {
4796 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4797 break;
4798 eptr++;
4799 }
4800 break;
4801 }
4802
4803 while (eptr >= pp)
4804 {
4805 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4806 return TRUE;
4807 #ifdef SUPPORT_UTF8
4808 if (md->utf8)
4809 while (eptr > pp && (*eptr & 0xc0) == 0x80) eptr--;
4810 #endif
4811 }
4812 return FALSE;
4813 }
4814 /* Control never gets here */
4815
4816 /* There's been some horrible disaster. */
4817
4818 default:
4819 DPRINTF(("Unknown opcode %d\n", *ecode));
4820 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
4821 return FALSE;
4822 }
4823
4824 /* Do not stick any code in here without much thought; it is assumed
4825 that "continue" in the code above comes out to here to repeat the main
4826 loop. */
4827
4828 } /* End of main loop */
4829 /* Control never reaches here */
4830 }
4831
4832
4833
4834
4835 /*************************************************
4836 * Execute a Regular Expression *
4837 *************************************************/
4838
4839 /* This function applies a compiled re to a subject string and picks out
4840 portions of the string if it matches. Two elements in the vector are set for
4841 each substring: the offsets to the start and end of the substring.
4842
4843 Arguments:
4844 external_re points to the compiled expression
4845 external_extra points to "hints" from pcre_study() or is NULL
4846 subject points to the subject string
4847 length length of subject string (may contain binary zeros)
4848 start_offset where to start in the subject string
4849 options option bits
4850 offsets points to a vector of ints to be filled in with offsets
4851 offsetcount the number of elements in the vector
4852
4853 Returns: > 0 => success; value is the number of elements filled in
4854 = 0 => success, but offsets is not big enough
4855 -1 => failed to match
4856 < -1 => some kind of unexpected problem
4857 */
4858
4859 int
4860 pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
4861 const char *subject, int length, int start_offset, int options, int *offsets,
4862 int offsetcount)
4863 {
4864 int resetcount, ocount;
4865 int first_char = -1;
4866 int req_char = -1;
4867 int req_char2 = -1;
4868 unsigned long int ims = 0;
4869 match_data match_block;
4870 const uschar *start_bits = NULL;
4871 const uschar *start_match = (const uschar *)subject + start_offset;
4872 const uschar *end_subject;
4873 const uschar *req_char_ptr = start_match - 1;
4874 const real_pcre *re = (const real_pcre *)external_re;
4875 const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
4876 BOOL using_temporary_offsets = FALSE;
4877 BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4878 BOOL startline = (re->options & PCRE_STARTLINE) != 0;
4879
4880 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4881
4882 if (re == NULL || subject == NULL ||
4883 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4884 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4885
4886 match_block.start_pattern = re->code;
4887 match_block.start_subject = (const uschar *)subject;
4888 match_block.end_subject = match_block.start_subject + length;
4889 end_subject = match_block.end_subject;
4890
4891 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4892 match_block.utf8 = (re->options & PCRE_UTF8) != 0;
4893
4894 match_block.notbol = (options & PCRE_NOTBOL) != 0;
4895 match_block.noteol = (options & PCRE_NOTEOL) != 0;
4896 match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
4897
4898 match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
4899
4900 match_block.lcc = re->tables + lcc_offset;
4901 match_block.ctypes = re->tables + ctypes_offset;
4902
4903 /* The ims options can vary during the matching as a result of the presence
4904 of (?ims) items in the pattern. They are kept in a local variable so that
4905 restoring at the exit of a group is easy. */
4906
4907 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4908
4909 /* If the expression has got more back references than the offsets supplied can
4910 hold, we get a temporary bit of working store to use during the matching.
4911 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4912 of 3. */
4913
4914 ocount = offsetcount - (offsetcount % 3);
4915
4916 if (re->top_backref > 0 && re->top_backref >= ocount/3)
4917 {
4918 ocount = re->top_backref * 3 + 3;
4919 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4920 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4921 using_temporary_offsets = TRUE;
4922 DPRINTF(("Got memory to hold back references\n"));
4923 }
4924 else match_block.offset_vector = offsets;
4925
4926 match_block.offset_end = ocount;
4927 match_block.offset_max = (2*ocount)/3;
4928 match_block.offset_overflow = FALSE;
4929
4930 /* Compute the minimum number of offsets that we need to reset each time. Doing
4931 this makes a huge difference to execution time when there aren't many brackets
4932 in the pattern. */
4933
4934 resetcount = 2 + re->top_bracket * 2;
4935 if (resetcount > offsetcount) resetcount = ocount;
4936
4937 /* Reset the working variable associated with each extraction. These should
4938 never be used unless previously set, but they get saved and restored, and so we
4939 initialize them to avoid reading uninitialized locations. */
4940
4941 if (match_block.offset_vector != NULL)
4942 {
4943 register int *iptr = match_block.offset_vector + ocount;
4944 register int *iend = iptr - resetcount/2 + 1;
4945 while (--iptr >= iend) *iptr = -1;
4946 }
4947
4948 /* Set up the first character to match, if available. The first_char value is
4949 never set for an anchored regular expression, but the anchoring may be forced
4950 at run time, so we have to test for anchoring. The first char may be unset for
4951 an unanchored pattern, of course. If there's no first char and the pattern was
4952 studied, there may be a bitmap of possible first characters. */
4953
4954 if (!anchored)
4955 {
4956 if ((re->options & PCRE_FIRSTSET) != 0)
4957 {
4958 first_char = re->first_char;
4959 if ((ims & PCRE_CASELESS) != 0) first_char = match_block.lcc[first_char];
4960 }
4961 else
4962 if (!startline && extra != NULL &&
4963 (extra->options & PCRE_STUDY_MAPPED) != 0)
4964 start_bits = extra->start_bits;
4965 }
4966
4967 /* For anchored or unanchored matches, there may be a "last known required
4968 character" set. If the PCRE_CASELESS is set, implying that the match starts
4969 caselessly, or if there are any changes of this flag within the regex, set up
4970 both cases of the character. Otherwise set the two values the same, which will
4971 avoid duplicate testing (which takes significant time). This covers the vast
4972 majority of cases. It will be suboptimal when the case flag changes in a regex
4973 and the required character in fact is caseful. */
4974
4975 if ((re->options & PCRE_REQCHSET) != 0)
4976 {
4977 req_char = re->req_char;
4978 req_char2 = ((re->options & (PCRE_CASELESS | PCRE_ICHANGED)) != 0)?
4979 (re->tables + fcc_offset)[req_char] : req_char;
4980 }
4981
4982 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
4983 the loop runs just once. */
4984
4985 do
4986 {
4987 int rc;
4988 register int *iptr = match_block.offset_vector;
4989 register int *iend = iptr + resetcount;
4990
4991 /* Reset the maximum number of extractions we might see. */
4992
4993 while (iptr < iend) *iptr++ = -1;
4994
4995 /* Advance to a unique first char if possible */
4996
4997 if (first_char >= 0)
4998 {
4999 if ((ims & PCRE_CASELESS) != 0)
5000 while (start_match < end_subject &&
5001 match_block.lcc[*start_match] != first_char)
5002 start_match++;
5003 else
5004 while (start_match < end_subject && *start_match != first_char)
5005 start_match++;
5006 }
5007
5008 /* Or to just after \n for a multiline match if possible */
5009
5010 else if (startline)
5011 {
5012 if (start_match > match_block.start_subject + start_offset)
5013 {
5014 while (start_match < end_subject && start_match[-1] != '\n')
5015 start_match++;
5016 }
5017 }
5018
5019 /* Or to a non-unique first char after study */
5020
5021 else if (start_bits != NULL)
5022 {
5023 while (start_match < end_subject)
5024 {
5025 register int c = *start_match;
5026 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
5027 }
5028 }
5029
5030 #ifdef DEBUG /* Sigh. Some compilers never learn. */
5031 printf(">>>> Match against: ");
5032 pchars(start_match, end_subject - start_match, TRUE, &match_block);
5033 printf("\n");
5034 #endif
5035
5036 /* If req_char is set, we know that that character must appear in the subject
5037 for the match to succeed. If the first character is set, req_char must be
5038 later in the subject; otherwise the test starts at the match point. This
5039 optimization can save a huge amount of backtracking in patterns with nested
5040 unlimited repeats that aren't going to match. We don't know what the state of
5041 case matching may be when this character is hit, so test for it in both its
5042 cases if necessary. However, the different cased versions will not be set up
5043 unless PCRE_CASELESS was given or the casing state changes within the regex.
5044 Writing separate code makes it go faster, as does using an autoincrement and
5045 backing off on a match. */
5046
5047 if (req_char >= 0)
5048 {
5049 register const uschar *p = start_match + ((first_char >= 0)? 1 : 0);
5050
5051 /* We don't need to repeat the search if we haven't yet reached the
5052 place we found it at last time. */
5053
5054 if (p > req_char_ptr)
5055 {
5056 /* Do a single test if no case difference is set up */
5057
5058 if (req_char == req_char2)
5059 {
5060 while (p < end_subject)
5061 {
5062 if (*p++ == req_char) { p--; break; }
5063 }
5064 }
5065
5066 /* Otherwise test for either case */
5067
5068 else
5069 {
5070 while (p < end_subject)
5071 {
5072 register int pp = *p++;
5073 if (pp == req_char || pp == req_char2) { p--; break; }
5074 }
5075 }
5076
5077 /* If we can't find the required character, break the matching loop */
5078
5079 if (p >= end_subject) break;
5080
5081 /* If we have found the required character, save the point where we
5082 found it, so that we don't search again next time round the loop if
5083 the start hasn't passed this character yet. */
5084
5085 req_char_ptr = p;
5086 }
5087 }
5088
5089 /* When a match occurs, substrings will be set for all internal extractions;
5090 we just need to set up the whole thing as substring 0 before returning. If
5091 there were too many extractions, set the return code to zero. In the case
5092 where we had to get some local store to hold offsets for backreferences, copy
5093 those back references that we can. In this case there need not be overflow
5094 if certain parts of the pattern were not used. */
5095
5096 match_block.start_match = start_match;
5097 if (!match(start_match, re->code, 2, &match_block, ims, NULL, match_isgroup))
5098 continue;
5099
5100 /* Copy the offset information from temporary store if necessary */
5101
5102 if (using_temporary_offsets)
5103 {
5104 if (offsetcount >= 4)
5105 {
5106 memcpy(offsets + 2, match_block.offset_vector + 2,
5107 (offsetcount - 2) * sizeof(int));
5108 DPRINTF(("Copied offsets from temporary memory\n"));
5109 }
5110 if (match_block.end_offset_top > offsetcount)
5111 match_block.offset_overflow = TRUE;
5112
5113 DPRINTF(("Freeing temporary memory\n"));
5114 (pcre_free)(match_block.offset_vector);
5115 }
5116
5117 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
5118
5119 if (match_block.offset_end < 2) rc = 0; else
5120 {
5121 offsets[0] = start_match - match_block.start_subject;
5122 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
5123 }
5124
5125 DPRINTF((">>>> returning %d\n", rc));
5126 return rc;
5127 }
5128
5129 /* This "while" is the end of the "do" above */
5130
5131 while (!anchored &&
5132 match_block.errorcode == PCRE_ERROR_NOMATCH &&
5133 start_match++ < end_subject);
5134
5135 if (using_temporary_offsets)
5136 {
5137 DPRINTF(("Freeing temporary memory\n"));
5138 (pcre_free)(match_block.offset_vector);
5139 }
5140
5141 DPRINTF((">>>> returning %d\n", match_block.errorcode));
5142
5143 return match_block.errorcode;
5144 }
5145
5146 /* End of pcre.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12