/[pcre]/code/trunk/pcre.c
ViewVC logotype

Contents of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 63 - (show annotations) (download)
Sat Feb 24 21:40:03 2007 UTC (7 years, 7 months ago) by nigel
File MIME type: text/plain
File size: 226050 byte(s)
Load pcre-4.0 into code/trunk.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /*
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
9
10 Written by: Philip Hazel <ph10@cam.ac.uk>
11
12 Copyright (c) 1997-2003 University of Cambridge
13
14 -----------------------------------------------------------------------------
15 Permission is granted to anyone to use this software for any purpose on any
16 computer system, and to redistribute it freely, subject to the following
17 restrictions:
18
19 1. This software is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22
23 2. The origin of this software must not be misrepresented, either by
24 explicit claim or by omission.
25
26 3. Altered versions must be plainly marked as such, and must not be
27 misrepresented as being the original software.
28
29 4. If PCRE is embedded in any software that is released under the GNU
30 General Purpose Licence (GPL), then the terms of that licence shall
31 supersede any condition above with which it is incompatible.
32 -----------------------------------------------------------------------------
33 */
34
35 /* Define DEBUG to get debugging output on stdout. */
36
37 /* #define DEBUG */
38
39 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
40 inline, and there are *still* stupid compilers about that don't like indented
41 pre-processor statements. I suppose it's only been 10 years... */
42
43 #ifdef DEBUG
44 #define DPRINTF(p) printf p
45 #else
46 #define DPRINTF(p) /*nothing*/
47 #endif
48
49 /* Include the internals header, which itself includes Standard C headers plus
50 the external pcre header. */
51
52 #include "internal.h"
53
54
55 /* Allow compilation as C++ source code, should anybody want to do that. */
56
57 #ifdef __cplusplus
58 #define class pcre_class
59 #endif
60
61
62 /* Maximum number of items on the nested bracket stacks at compile time. This
63 applies to the nesting of all kinds of parentheses. It does not limit
64 un-nested, non-capturing parentheses. This number can be made bigger if
65 necessary - it is used to dimension one int and one unsigned char vector at
66 compile time. */
67
68 #define BRASTACK_SIZE 200
69
70
71
72 /* Maximum number of ints of offset to save on the stack for recursive calls.
73 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
74 because the offset vector is always a multiple of 3 long. */
75
76 #define REC_STACK_SAVE_MAX 30
77
78
79 /* The number of bytes in a literal character string above which we can't add
80 any more is set at 250 in order to allow for UTF-8 characters. (In theory it
81 could be 255 when UTF-8 support is excluded, but that means that some of the
82 test output would be different, which just complicates things.) */
83
84 #define MAXLIT 250
85
86
87 /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
88 the definition is next to the definition of the opcodes in internal.h. */
89
90 static uschar OP_lengths[] = { OP_LENGTHS };
91
92 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
93
94 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
95 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
96
97 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
98 are simple data values; negative values are for special things like \d and so
99 on. Zero means further processing is needed (for things like \x), or the escape
100 is invalid. */
101
102 static const short int escapes[] = {
103 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
104 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
105 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
106 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
107 0, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
108 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
109 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
110 0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
111 0, 0, ESC_r, -ESC_s, ESC_t, 0, 0, -ESC_w, /* p - w */
112 0, 0, -ESC_z /* x - z */
113 };
114
115 /* Tables of names of POSIX character classes and their lengths. The list is
116 terminated by a zero length entry. The first three must be alpha, lower, upper,
117 as this is assumed for handling case independence. */
118
119 static const char *posix_names[] = {
120 "alpha", "lower", "upper",
121 "alnum", "ascii", "blank", "cntrl", "digit", "graph",
122 "print", "punct", "space", "word", "xdigit" };
123
124 static const uschar posix_name_lengths[] = {
125 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
126
127 /* Table of class bit maps for each POSIX class; up to three may be combined
128 to form the class. The table for [:blank:] is dynamically modified to remove
129 the vertical space characters. */
130
131 static const int posix_class_maps[] = {
132 cbit_lower, cbit_upper, -1, /* alpha */
133 cbit_lower, -1, -1, /* lower */
134 cbit_upper, -1, -1, /* upper */
135 cbit_digit, cbit_lower, cbit_upper, /* alnum */
136 cbit_print, cbit_cntrl, -1, /* ascii */
137 cbit_space, -1, -1, /* blank - a GNU extension */
138 cbit_cntrl, -1, -1, /* cntrl */
139 cbit_digit, -1, -1, /* digit */
140 cbit_graph, -1, -1, /* graph */
141 cbit_print, -1, -1, /* print */
142 cbit_punct, -1, -1, /* punct */
143 cbit_space, -1, -1, /* space */
144 cbit_word, -1, -1, /* word - a Perl extension */
145 cbit_xdigit,-1, -1 /* xdigit */
146 };
147
148
149 /* Definition to allow mutual recursion */
150
151 static BOOL
152 compile_regex(int, int, int *, uschar **, const uschar **, const char **,
153 BOOL, int, int *, int *, branch_chain *, compile_data *);
154
155 /* Structure for building a chain of data that actually lives on the
156 stack, for holding the values of the subject pointer at the start of each
157 subpattern, so as to detect when an empty string has been matched by a
158 subpattern - to break infinite loops. */
159
160 typedef struct eptrblock {
161 struct eptrblock *prev;
162 const uschar *saved_eptr;
163 } eptrblock;
164
165 /* Flag bits for the match() function */
166
167 #define match_condassert 0x01 /* Called to check a condition assertion */
168 #define match_isgroup 0x02 /* Set if start of bracketed group */
169
170 /* Non-error returns from the match() function. Error returns are externally
171 defined PCRE_ERROR_xxx codes, which are all negative. */
172
173 #define MATCH_MATCH 1
174 #define MATCH_NOMATCH 0
175
176
177
178 /*************************************************
179 * Global variables *
180 *************************************************/
181
182 /* PCRE is thread-clean and doesn't use any global variables in the normal
183 sense. However, it calls memory allocation and free functions via the two
184 indirections below, and it can optionally do callouts. These values can be
185 changed by the caller, but are shared between all threads. However, when
186 compiling for Virtual Pascal, things are done differently (see pcre.in). */
187
188 #ifndef VPCOMPAT
189 void *(*pcre_malloc)(size_t) = malloc;
190 void (*pcre_free)(void *) = free;
191 int (*pcre_callout)(pcre_callout_block *) = NULL;
192 #endif
193
194
195 /*************************************************
196 * Macros and tables for character handling *
197 *************************************************/
198
199 /* When UTF-8 encoding is being used, a character is no longer just a single
200 byte. The macros for character handling generate simple sequences when used in
201 byte-mode, and more complicated ones for UTF-8 characters. */
202
203 #ifndef SUPPORT_UTF8
204 #define GETCHAR(c, eptr) c = *eptr;
205 #define GETCHARINC(c, eptr) c = *eptr++;
206 #define GETCHARINCTEST(c, eptr) c = *eptr++;
207 #define GETCHARLEN(c, eptr, len) c = *eptr;
208 #define BACKCHAR(eptr)
209
210 #else /* SUPPORT_UTF8 */
211
212 /* Get the next UTF-8 character, not advancing the pointer. This is called when
213 we know we are in UTF-8 mode. */
214
215 #define GETCHAR(c, eptr) \
216 c = *eptr; \
217 if ((c & 0xc0) == 0xc0) \
218 { \
219 int i; \
220 int a = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
221 int s = 6*a; \
222 c = (c & utf8_table3[a]) << s; \
223 for (i = 1; i <= a; i++) \
224 { \
225 s -= 6; \
226 c |= (eptr[i] & 0x3f) << s; \
227 } \
228 }
229
230 /* Get the next UTF-8 character, advancing the pointer. This is called when we
231 know we are in UTF-8 mode. */
232
233 #define GETCHARINC(c, eptr) \
234 c = *eptr++; \
235 if ((c & 0xc0) == 0xc0) \
236 { \
237 int a = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
238 int s = 6*a; \
239 c = (c & utf8_table3[a]) << s; \
240 while (a-- > 0) \
241 { \
242 s -= 6; \
243 c |= (*eptr++ & 0x3f) << s; \
244 } \
245 }
246
247 /* Get the next character, testing for UTF-8 mode, and advancing the pointer */
248
249 #define GETCHARINCTEST(c, eptr) \
250 c = *eptr++; \
251 if (md->utf8 && (c & 0xc0) == 0xc0) \
252 { \
253 int a = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
254 int s = 6*a; \
255 c = (c & utf8_table3[a]) << s; \
256 while (a-- > 0) \
257 { \
258 s -= 6; \
259 c |= (*eptr++ & 0x3f) << s; \
260 } \
261 }
262
263 /* Get the next UTF-8 character, not advancing the pointer, incrementing length
264 if there are extra bytes. This is called when we know we are in UTF-8 mode. */
265
266 #define GETCHARLEN(c, eptr, len) \
267 c = *eptr; \
268 if ((c & 0xc0) == 0xc0) \
269 { \
270 int i; \
271 int a = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
272 int s = 6*a; \
273 c = (c & utf8_table3[a]) << s; \
274 for (i = 1; i <= a; i++) \
275 { \
276 s -= 6; \
277 c |= (eptr[i] & 0x3f) << s; \
278 } \
279 len += a; \
280 }
281
282 /* If the pointer is not at the start of a character, move it back until
283 it is. Called only in UTF-8 mode. */
284
285 #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
286
287 #endif
288
289
290
291 /*************************************************
292 * Default character tables *
293 *************************************************/
294
295 /* A default set of character tables is included in the PCRE binary. Its source
296 is built by the maketables auxiliary program, which uses the default C ctypes
297 functions, and put in the file chartables.c. These tables are used by PCRE
298 whenever the caller of pcre_compile() does not provide an alternate set of
299 tables. */
300
301 #include "chartables.c"
302
303
304
305 #ifdef SUPPORT_UTF8
306 /*************************************************
307 * Tables for UTF-8 support *
308 *************************************************/
309
310 /* These are the breakpoints for different numbers of bytes in a UTF-8
311 character. */
312
313 static int utf8_table1[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
314
315 /* These are the indicator bits and the mask for the data bits to set in the
316 first byte of a character, indexed by the number of additional bytes. */
317
318 static int utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
319 static int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
320
321 /* Table of the number of extra characters, indexed by the first character
322 masked with 0x3f. The highest number for a valid UTF-8 character is in fact
323 0x3d. */
324
325 static uschar utf8_table4[] = {
326 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
327 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
328 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
329 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
330
331
332 /*************************************************
333 * Convert character value to UTF-8 *
334 *************************************************/
335
336 /* This function takes an integer value in the range 0 - 0x7fffffff
337 and encodes it as a UTF-8 character in 1 to 6 bytes.
338
339 Arguments:
340 cvalue the character value
341 buffer pointer to buffer for result - at least 6 bytes long
342
343 Returns: number of bytes placed in the buffer
344 */
345
346 static int
347 ord2utf8(int cvalue, uschar *buffer)
348 {
349 register int i, j;
350 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
351 if (cvalue <= utf8_table1[i]) break;
352 buffer += i;
353 for (j = i; j > 0; j--)
354 {
355 *buffer-- = 0x80 | (cvalue & 0x3f);
356 cvalue >>= 6;
357 }
358 *buffer = utf8_table2[i] | cvalue;
359 return i + 1;
360 }
361 #endif
362
363
364
365 /*************************************************
366 * Print compiled regex *
367 *************************************************/
368
369 /* The code for doing this is held in a separate file that is also included in
370 pcretest.c. It defines a function called print_internals(). */
371
372 #ifdef DEBUG
373 #include "printint.c"
374 #endif
375
376
377
378 /*************************************************
379 * Return version string *
380 *************************************************/
381
/* Two-level stringification, so that the version macros are expanded before
being turned into string literals. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* Returns a pointer to a static string of the form "major.minor date". */

const char *
pcre_version(void)
{
static const char version_text[] =
  XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
return version_text;
}
390
391
392
393
394 /*************************************************
395 * (Obsolete) Return info about compiled pattern *
396 *************************************************/
397
398 /* This is the original "info" function. It picks potentially useful data out
399 of the private structure, but its interface was too rigid. It remains for
400 backwards compatibility. The public options are passed back in an int - though
401 the re->options field has been expanded to a long int, all the public options
402 are at the low end of it, and so even on 16-bit systems this will still be OK.
403 Therefore, I haven't changed the API for pcre_info().
404
405 Arguments:
406 external_re points to compiled code
407 optptr where to pass back the options
408 first_byte where to pass back the first character,
409 or -1 if multiline and all branches start ^,
410 or -2 otherwise
411
412 Returns: number of capturing subpatterns
413 or negative values on error
414 */
415
416 int
417 pcre_info(const pcre *external_re, int *optptr, int *first_byte)
418 {
419 const real_pcre *re = (const real_pcre *)external_re;
420 if (re == NULL) return PCRE_ERROR_NULL;
421 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
422 if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
423 if (first_byte != NULL)
424 *first_byte = ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
425 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
426 return re->top_bracket;
427 }
428
429
430
431 /*************************************************
432 * Return info about compiled pattern *
433 *************************************************/
434
435 /* This is a newer "info" function which has an extensible interface so
436 that additional items can be added compatibly.
437
438 Arguments:
439 external_re points to compiled code
440 extra_data points to extra data, or NULL
441 what what information is required
442 where where to put the information
443
444 Returns: 0 if data returned, negative on error
445 */
446
447 int
448 pcre_fullinfo(const pcre *external_re, const pcre_extra *extra_data, int what,
449 void *where)
450 {
451 const real_pcre *re = (const real_pcre *)external_re;
452 const pcre_study_data *study = NULL;
453
454 if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
455 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
456
457 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0)
458 study = extra_data->study_data;
459
460 switch (what)
461 {
462 case PCRE_INFO_OPTIONS:
463 *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
464 break;
465
466 case PCRE_INFO_SIZE:
467 *((size_t *)where) = re->size;
468 break;
469
470 case PCRE_INFO_STUDYSIZE:
471 *((size_t *)where) = (study == NULL)? 0 : study->size;
472 break;
473
474 case PCRE_INFO_CAPTURECOUNT:
475 *((int *)where) = re->top_bracket;
476 break;
477
478 case PCRE_INFO_BACKREFMAX:
479 *((int *)where) = re->top_backref;
480 break;
481
482 case PCRE_INFO_FIRSTBYTE:
483 *((int *)where) =
484 ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
485 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
486 break;
487
488 case PCRE_INFO_FIRSTTABLE:
489 *((const uschar **)where) =
490 (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
491 study->start_bits : NULL;
492 break;
493
494 case PCRE_INFO_LASTLITERAL:
495 *((int *)where) =
496 ((re->options & PCRE_REQCHSET) != 0)? re->req_byte : -1;
497 break;
498
499 case PCRE_INFO_NAMEENTRYSIZE:
500 *((int *)where) = re->name_entry_size;
501 break;
502
503 case PCRE_INFO_NAMECOUNT:
504 *((int *)where) = re->name_count;
505 break;
506
507 case PCRE_INFO_NAMETABLE:
508 *((const uschar **)where) = (const uschar *)re + sizeof(real_pcre);
509 break;
510
511 default: return PCRE_ERROR_BADOPTION;
512 }
513
514 return 0;
515 }
516
517
518
519 /*************************************************
520 * Return info about what features are configured *
521 *************************************************/
522
523 /* This is a function which has an extensible interface so that additional items
524 can be added compatibly.
525
526 Arguments:
527 what what information is required
528 where where to put the information
529
530 Returns: 0 if data returned, negative on error
531 */
532
533 int
534 pcre_config(int what, void *where)
535 {
536 switch (what)
537 {
538 case PCRE_CONFIG_UTF8:
539 #ifdef SUPPORT_UTF8
540 *((int *)where) = 1;
541 #else
542 *((int *)where) = 0;
543 #endif
544 break;
545
546 case PCRE_CONFIG_NEWLINE:
547 *((int *)where) = NEWLINE;
548 break;
549
550 case PCRE_CONFIG_LINK_SIZE:
551 *((int *)where) = LINK_SIZE;
552 break;
553
554 case PCRE_CONFIG_POSIX_MALLOC_THRESHOLD:
555 *((int *)where) = POSIX_MALLOC_THRESHOLD;
556 break;
557
558 case PCRE_CONFIG_MATCH_LIMIT:
559 *((unsigned int *)where) = MATCH_LIMIT;
560 break;
561
562 default: return PCRE_ERROR_BADOPTION;
563 }
564
565 return 0;
566 }
567
568
569
570 #ifdef DEBUG
571 /*************************************************
572 * Debugging function to print chars *
573 *************************************************/
574
575 /* Print a sequence of chars in printable format, stopping at the end of the
576 subject if requested.
577
578 Arguments:
579 p points to characters
580 length number to print
581 is_subject TRUE if printing from within md->start_subject
582 md pointer to matching data block, if is_subject is TRUE
583
584 Returns: nothing
585 */
586
587 static void
588 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
589 {
590 int c;
591 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
592 while (length-- > 0)
593 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
594 }
595 #endif
596
597
598
599
600 /*************************************************
601 * Handle escapes *
602 *************************************************/
603
604 /* This function is called when a \ has been encountered. It either returns a
605 positive value for a simple escape such as \n, or a negative value which
606 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
607 a positive value greater than 255 may be returned. On entry, ptr is pointing at
608 the \. On exit, it is on the final character of the escape sequence.
609
610 Arguments:
611 ptrptr points to the pattern position pointer
612 errorptr points to the pointer to the error message
613 bracount number of previous extracting brackets
614 options the options bits
615 isclass TRUE if inside a character class
616 cd pointer to char tables block
617
618 Returns: zero or positive => a data character
619 negative => a special escape sequence
620 on error, errorptr is set
621 */
622
623 static int
624 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
625 int options, BOOL isclass, compile_data *cd)
626 {
627 const uschar *ptr = *ptrptr;
628 int c, i;
629
630 /* If backslash is at the end of the pattern, it's an error. */
631
632 c = *(++ptr);
633 if (c == 0) *errorptr = ERR1;
634
635 /* Digits or letters may have special meaning; all others are literals. */
636
637 else if (c < '0' || c > 'z') {}
638
639 /* Do an initial lookup in a table. A non-zero result is something that can be
640 returned immediately. Otherwise further processing may be required. */
641
642 else if ((i = escapes[c - '0']) != 0) c = i;
643
644 /* Escapes that need further processing, or are illegal. */
645
646 else
647 {
648 const uschar *oldptr;
649 switch (c)
650 {
651 /* A number of Perl escapes are not handled by PCRE. We give an explicit
652 error. */
653
654 case 'l':
655 case 'L':
656 case 'N':
657 case 'p':
658 case 'P':
659 case 'u':
660 case 'U':
661 case 'X':
662 *errorptr = ERR37;
663 break;
664
665 /* The handling of escape sequences consisting of a string of digits
666 starting with one that is not zero is not straightforward. By experiment,
667 the way Perl works seems to be as follows:
668
669 Outside a character class, the digits are read as a decimal number. If the
670 number is less than 10, or if there are that many previous extracting
671 left brackets, then it is a back reference. Otherwise, up to three octal
672 digits are read to form an escaped byte. Thus \123 is likely to be octal
673 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
674 value is greater than 377, the least significant 8 bits are taken. Inside a
675 character class, \ followed by a digit is always an octal number. */
676
677 case '1': case '2': case '3': case '4': case '5':
678 case '6': case '7': case '8': case '9':
679
680 if (!isclass)
681 {
682 oldptr = ptr;
683 c -= '0';
684 while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
685 c = c * 10 + *(++ptr) - '0';
686 if (c < 10 || c <= bracount)
687 {
688 c = -(ESC_REF + c);
689 break;
690 }
691 ptr = oldptr; /* Put the pointer back and fall through */
692 }
693
694 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
695 generates a binary zero byte and treats the digit as a following literal.
696 Thus we have to pull back the pointer by one. */
697
698 if ((c = *ptr) >= '8')
699 {
700 ptr--;
701 c = 0;
702 break;
703 }
704
705 /* \0 always starts an octal number, but we may drop through to here with a
706 larger first octal digit. */
707
708 case '0':
709 c -= '0';
710 while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
711 ptr[1] != '8' && ptr[1] != '9')
712 c = c * 8 + *(++ptr) - '0';
713 c &= 255; /* Take least significant 8 bits */
714 break;
715
716 /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
717 which can be greater than 0xff, but only if the ddd are hex digits. */
718
719 case 'x':
720 #ifdef SUPPORT_UTF8
721 if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
722 {
723 const uschar *pt = ptr + 2;
724 register int count = 0;
725 c = 0;
726 while ((cd->ctypes[*pt] & ctype_xdigit) != 0)
727 {
728 count++;
729 c = c * 16 + cd->lcc[*pt] -
730 (((cd->ctypes[*pt] & ctype_digit) != 0)? '0' : 'W');
731 pt++;
732 }
733 if (*pt == '}')
734 {
735 if (c < 0 || count > 8) *errorptr = ERR34;
736 ptr = pt;
737 break;
738 }
739 /* If the sequence of hex digits does not end with '}', then we don't
740 recognize this construct; fall through to the normal \x handling. */
741 }
742 #endif
743
744 /* Read just a single hex char */
745
746 c = 0;
747 while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
748 {
749 ptr++;
750 c = c * 16 + cd->lcc[*ptr] -
751 (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
752 }
753 break;
754
755 /* Other special escapes not starting with a digit are straightforward */
756
757 case 'c':
758 c = *(++ptr);
759 if (c == 0)
760 {
761 *errorptr = ERR2;
762 return 0;
763 }
764
765 /* A letter is upper-cased; then the 0x40 bit is flipped */
766
767 if (c >= 'a' && c <= 'z') c = cd->fcc[c];
768 c ^= 0x40;
769 break;
770
771 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
772 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
773 for Perl compatibility, it is a literal. This code looks a bit odd, but
774 there used to be some cases other than the default, and there may be again
775 in future, so I haven't "optimized" it. */
776
777 default:
778 if ((options & PCRE_EXTRA) != 0) switch(c)
779 {
780 default:
781 *errorptr = ERR3;
782 break;
783 }
784 break;
785 }
786 }
787
788 *ptrptr = ptr;
789 return c;
790 }
791
792
793
794 /*************************************************
795 * Check for counted repeat *
796 *************************************************/
797
798 /* This function is called when a '{' is encountered in a place where it might
799 start a quantifier. It looks ahead to see if it really is a quantifier or not.
800 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
801 where the ddds are digits.
802
803 Arguments:
804 p pointer to the first char after '{'
805 cd pointer to char tables block
806
807 Returns: TRUE or FALSE
808 */
809
810 static BOOL
811 is_counted_repeat(const uschar *p, compile_data *cd)
812 {
813 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
814 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
815 if (*p == '}') return TRUE;
816
817 if (*p++ != ',') return FALSE;
818 if (*p == '}') return TRUE;
819
820 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
821 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
822 return (*p == '}');
823 }
824
825
826
827 /*************************************************
828 * Read repeat counts *
829 *************************************************/
830
831 /* Read an item of the form {n,m} and return the values. This is called only
832 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
833 so the syntax is guaranteed to be correct, but we need to check the values.
834
835 Arguments:
836 p pointer to first char after '{'
837 minp pointer to int for min
838 maxp pointer to int for max
839 returned as -1 if no max
840 errorptr points to pointer to error message
841 cd pointer to character tables block
842
843 Returns: pointer to '}' on success;
844 current ptr on error, with errorptr set
845 */
846
847 static const uschar *
848 read_repeat_counts(const uschar *p, int *minp, int *maxp,
849 const char **errorptr, compile_data *cd)
850 {
851 int min = 0;
852 int max = -1;
853
854 while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
855
856 if (*p == '}') max = min; else
857 {
858 if (*(++p) != '}')
859 {
860 max = 0;
861 while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
862 if (max < min)
863 {
864 *errorptr = ERR4;
865 return p;
866 }
867 }
868 }
869
870 /* Do paranoid checks, then fill in the required variables, and pass back the
871 pointer to the terminating '}'. */
872
873 if (min > 65535 || max > 65535)
874 *errorptr = ERR5;
875 else
876 {
877 *minp = min;
878 *maxp = max;
879 }
880 return p;
881 }
882
883
884
885 /*************************************************
886 * Find first significant op code *
887 *************************************************/
888
889 /* This is called by several functions that scan a compiled expression looking
890 for a fixed first character, or an anchoring op code etc. It skips over things
891 that do not influence this. For some calls, a change of option is important.
892
893 Arguments:
894 code pointer to the start of the group
895 options pointer to external options
896 optbit the option bit whose changing is significant, or
897 zero if none are
898
899 Returns: pointer to the first significant opcode
900 */
901
902 static const uschar*
903 first_significant_code(const uschar *code, int *options, int optbit)
904 {
905 for (;;)
906 {
907 switch ((int)*code)
908 {
909 case OP_OPT:
910 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
911 *options = (int)code[1];
912 code += 2;
913 break;
914
915 case OP_ASSERT_NOT:
916 case OP_ASSERTBACK:
917 case OP_ASSERTBACK_NOT:
918 do code += GET(code, 1); while (*code == OP_ALT);
919 /* Fall through */
920
921 case OP_CALLOUT:
922 case OP_CREF:
923 case OP_BRANUMBER:
924 case OP_WORD_BOUNDARY:
925 case OP_NOT_WORD_BOUNDARY:
926 code += OP_lengths[*code];
927 break;
928
929 default:
930 return code;
931 }
932 }
933 /* Control never reaches here */
934 }
935
936
937
938
939 /*************************************************
940 * Find the fixed length of a pattern *
941 *************************************************/
942
943 /* Scan a pattern and compute the fixed length of subject that will match it,
944 if the length is fixed. This is needed for dealing with backward assertions.
945 In UTF8 mode, the result is in characters rather than bytes.
946
947 Arguments:
948 code points to the start of the pattern (the bracket)
949 options the compiling options
950
951 Returns: the fixed length, or -1 if there is no fixed length,
952 or -2 if \C was encountered
953 */
954
955 static int
956 find_fixedlength(uschar *code, int options)
957 {
958 int length = -1;
959
960 register int branchlength = 0;
961 register uschar *cc = code + 1 + LINK_SIZE;
962
963 /* Scan along the opcodes for this branch. If we get to the end of the
964 branch, check the length against that of the other branches. */
965
966 for (;;)
967 {
968 int d;
969 register int op = *cc;
970 if (op >= OP_BRA) op = OP_BRA;
971
972 switch (op)
973 {
974 case OP_BRA:
975 case OP_ONCE:
976 case OP_COND:
977 d = find_fixedlength(cc, options);
978 if (d < 0) return d;
979 branchlength += d;
980 do cc += GET(cc, 1); while (*cc == OP_ALT);
981 cc += 1 + LINK_SIZE;
982 break;
983
984 /* Reached end of a branch; if it's a ket it is the end of a nested
985 call. If it's ALT it is an alternation in a nested call. If it is
986 END it's the end of the outer call. All can be handled by the same code. */
987
988 case OP_ALT:
989 case OP_KET:
990 case OP_KETRMAX:
991 case OP_KETRMIN:
992 case OP_END:
993 if (length < 0) length = branchlength;
994 else if (length != branchlength) return -1;
995 if (*cc != OP_ALT) return length;
996 cc += 1 + LINK_SIZE;
997 branchlength = 0;
998 break;
999
1000 /* Skip over assertive subpatterns */
1001
1002 case OP_ASSERT:
1003 case OP_ASSERT_NOT:
1004 case OP_ASSERTBACK:
1005 case OP_ASSERTBACK_NOT:
1006 do cc += GET(cc, 1); while (*cc == OP_ALT);
1007 /* Fall through */
1008
1009 /* Skip over things that don't match chars */
1010
1011 case OP_REVERSE:
1012 case OP_BRANUMBER:
1013 case OP_CREF:
1014 case OP_OPT:
1015 case OP_CALLOUT:
1016 case OP_SOD:
1017 case OP_SOM:
1018 case OP_EOD:
1019 case OP_EODN:
1020 case OP_CIRC:
1021 case OP_DOLL:
1022 case OP_NOT_WORD_BOUNDARY:
1023 case OP_WORD_BOUNDARY:
1024 cc += OP_lengths[*cc];
1025 break;
1026
1027 /* Handle char strings. In UTF-8 mode we must count characters, not bytes.
1028 This requires a scan of the string, unfortunately. We assume valid UTF-8
1029 strings, so all we do is reduce the length by one for every byte whose bits
1030 are 10xxxxxx. */
1031
1032 case OP_CHARS:
1033 branchlength += *(++cc);
1034 #ifdef SUPPORT_UTF8
1035 if ((options & PCRE_UTF8) != 0)
1036 for (d = 1; d <= *cc; d++)
1037 if ((cc[d] & 0xc0) == 0x80) branchlength--;
1038 #endif
1039 cc += *cc + 1;
1040 break;
1041
1042 /* Handle exact repetitions. The count is already in characters, but we
1043 need to skip over a multibyte character in UTF8 mode. */
1044
1045 case OP_EXACT:
1046 branchlength += GET2(cc,1);
1047 cc += 4;
1048 #ifdef SUPPORT_UTF8
1049 if ((options & PCRE_UTF8) != 0)
1050 {
1051 while((*cc & 0x80) == 0x80) cc++;
1052 }
1053 #endif
1054 break;
1055
1056 case OP_TYPEEXACT:
1057 branchlength += GET2(cc,1);
1058 cc += 4;
1059 break;
1060
1061 /* Handle single-char matchers */
1062
1063 case OP_NOT_DIGIT:
1064 case OP_DIGIT:
1065 case OP_NOT_WHITESPACE:
1066 case OP_WHITESPACE:
1067 case OP_NOT_WORDCHAR:
1068 case OP_WORDCHAR:
1069 case OP_ANY:
1070 branchlength++;
1071 cc++;
1072 break;
1073
1074 /* The single-byte matcher isn't allowed */
1075
1076 case OP_ANYBYTE:
1077 return -2;
1078
1079 /* Check a class for variable quantification */
1080
1081 #ifdef SUPPORT_UTF8
1082 case OP_XCLASS:
1083 cc += GET(cc, 1) - 33;
1084 /* Fall through */
1085 #endif
1086
1087 case OP_CLASS:
1088 case OP_NCLASS:
1089 cc += 33;
1090
1091 switch (*cc)
1092 {
1093 case OP_CRSTAR:
1094 case OP_CRMINSTAR:
1095 case OP_CRQUERY:
1096 case OP_CRMINQUERY:
1097 return -1;
1098
1099 case OP_CRRANGE:
1100 case OP_CRMINRANGE:
1101 if (GET2(cc,1) != GET2(cc,3)) return -1;
1102 branchlength += GET2(cc,1);
1103 cc += 5;
1104 break;
1105
1106 default:
1107 branchlength++;
1108 }
1109 break;
1110
1111 /* Anything else is variable length */
1112
1113 default:
1114 return -1;
1115 }
1116 }
1117 /* Control never gets here */
1118 }
1119
1120
1121
1122
1123 /*************************************************
1124 * Scan compiled regex for numbered bracket *
1125 *************************************************/
1126
1127 /* This little function scans through a compiled pattern until it finds a
1128 capturing bracket with the given number.
1129
1130 Arguments:
1131 code points to start of expression
1132 utf8 TRUE in UTF-8 mode
1133 number the required bracket number
1134
1135 Returns: pointer to the opcode for the bracket, or NULL if not found
1136 */
1137
1138 static const uschar *
1139 find_bracket(const uschar *code, BOOL utf8, int number)
1140 {
1141 for (;;)
1142 {
1143 register int c = *code;
1144 if (c == OP_END) return NULL;
1145 else if (c == OP_CHARS) code += code[1] + OP_lengths[c];
1146 else if (c > OP_BRA)
1147 {
1148 int n = c - OP_BRA;
1149 if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1150 if (n == number) return (uschar *)code;
1151 code += OP_lengths[OP_BRA];
1152 }
1153 else
1154 {
1155 code += OP_lengths[c];
1156
1157 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1158 by a multi-byte character. The length in the table is a minimum, so we have
1159 to scan along to skip the extra characters. All opcodes are less than 128,
1160 so we can use relatively efficient code. */
1161
1162 #ifdef SUPPORT_UTF8
1163 if (utf8) switch(c)
1164 {
1165 case OP_EXACT:
1166 case OP_UPTO:
1167 case OP_MINUPTO:
1168 case OP_STAR:
1169 case OP_MINSTAR:
1170 case OP_PLUS:
1171 case OP_MINPLUS:
1172 case OP_QUERY:
1173 case OP_MINQUERY:
1174 while ((*code & 0xc0) == 0x80) code++;
1175 break;
1176 }
1177 #endif
1178 }
1179 }
1180 }
1181
1182
1183
1184 /*************************************************
1185 * Scan compiled branch for non-emptiness *
1186 *************************************************/
1187
1188 /* This function scans through a branch of a compiled pattern to see whether it
1189 can match the empty string or not. It is called only from could_be_empty()
1190 below. Note that first_significant_code() skips over assertions. If we hit an
1191 unclosed bracket, we return "empty" - this means we've struck an inner bracket
1192 whose current branch will already have been scanned.
1193
1194 Arguments:
1195 code points to start of search
1196 endcode points to where to stop
1197 utf8 TRUE if in UTF8 mode
1198
1199 Returns: TRUE if what is matched could be empty
1200 */
1201
1202 static BOOL
1203 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1204 {
1205 register int c;
1206 for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0);
1207 code < endcode;
1208 code = first_significant_code(code + OP_lengths[c], NULL, 0))
1209 {
1210 const uschar *ccode;
1211
1212 c = *code;
1213
1214 if (c >= OP_BRA)
1215 {
1216 BOOL empty_branch;
1217 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1218
1219 /* Scan a closed bracket */
1220
1221 empty_branch = FALSE;
1222 do
1223 {
1224 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1225 empty_branch = TRUE;
1226 code += GET(code, 1);
1227 }
1228 while (*code == OP_ALT);
1229 if (!empty_branch) return FALSE; /* All branches are non-empty */
1230 code += 1 + LINK_SIZE;
1231 c = *code;
1232 }
1233
1234 else switch (c)
1235 {
1236 /* Check for quantifiers after a class */
1237
1238 #ifdef SUPPORT_UTF8
1239 case OP_XCLASS:
1240 ccode = code + GET(code, 1);
1241 goto CHECK_CLASS_REPEAT;
1242 #endif
1243
1244 case OP_CLASS:
1245 case OP_NCLASS:
1246 ccode = code + 33;
1247
1248 #ifdef SUPPORT_UTF8
1249 CHECK_CLASS_REPEAT:
1250 #endif
1251
1252 switch (*ccode)
1253 {
1254 case OP_CRSTAR: /* These could be empty; continue */
1255 case OP_CRMINSTAR:
1256 case OP_CRQUERY:
1257 case OP_CRMINQUERY:
1258 break;
1259
1260 default: /* Non-repeat => class must match */
1261 case OP_CRPLUS: /* These repeats aren't empty */
1262 case OP_CRMINPLUS:
1263 return FALSE;
1264
1265 case OP_CRRANGE:
1266 case OP_CRMINRANGE:
1267 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1268 break;
1269 }
1270 break;
1271
1272 /* Opcodes that must match a character */
1273
1274 case OP_NOT_DIGIT:
1275 case OP_DIGIT:
1276 case OP_NOT_WHITESPACE:
1277 case OP_WHITESPACE:
1278 case OP_NOT_WORDCHAR:
1279 case OP_WORDCHAR:
1280 case OP_ANY:
1281 case OP_ANYBYTE:
1282 case OP_CHARS:
1283 case OP_NOT:
1284 case OP_PLUS:
1285 case OP_MINPLUS:
1286 case OP_EXACT:
1287 case OP_NOTPLUS:
1288 case OP_NOTMINPLUS:
1289 case OP_NOTEXACT:
1290 case OP_TYPEPLUS:
1291 case OP_TYPEMINPLUS:
1292 case OP_TYPEEXACT:
1293 return FALSE;
1294
1295 /* End of branch */
1296
1297 case OP_KET:
1298 case OP_KETRMAX:
1299 case OP_KETRMIN:
1300 case OP_ALT:
1301 return TRUE;
1302
1303 /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be
1304 followed by a multibyte character */
1305
1306 #ifdef SUPPORT_UTF8
1307 case OP_STAR:
1308 case OP_MINSTAR:
1309 case OP_QUERY:
1310 case OP_MINQUERY:
1311 case OP_UPTO:
1312 case OP_MINUPTO:
1313 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1314 break;
1315 #endif
1316 }
1317 }
1318
1319 return TRUE;
1320 }
1321
1322
1323
1324 /*************************************************
1325 * Scan compiled regex for non-emptiness *
1326 *************************************************/
1327
1328 /* This function is called to check for left recursive calls. We want to check
1329 the current branch of the current pattern to see if it could match the empty
1330 string. If it could, we must look outwards for branches at other levels,
1331 stopping when we pass beyond the bracket which is the subject of the recursion.
1332
1333 Arguments:
1334 code points to start of the recursion
1335 endcode points to where to stop (current RECURSE item)
1336 bcptr points to the chain of current (unclosed) branch starts
1337 utf8 TRUE if in UTF-8 mode
1338
1339 Returns: TRUE if what is matched could be empty
1340 */
1341
1342 static BOOL
1343 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1344 BOOL utf8)
1345 {
1346 while (bcptr != NULL && bcptr->current >= code)
1347 {
1348 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1349 bcptr = bcptr->outer;
1350 }
1351 return TRUE;
1352 }
1353
1354
1355
1356 /*************************************************
1357 * Check for POSIX class syntax *
1358 *************************************************/
1359
1360 /* This function is called when the sequence "[:" or "[." or "[=" is
1361 encountered in a character class. It checks whether this is followed by an
1362 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1363 ".]" or "=]".
1364
1365 Argument:
1366 ptr pointer to the initial [
1367 endptr where to return the end pointer
1368 cd pointer to compile data
1369
1370 Returns: TRUE or FALSE
1371 */
1372
1373 static BOOL
1374 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1375 {
1376 int terminator; /* Don't combine these lines; the Solaris cc */
1377 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1378 if (*(++ptr) == '^') ptr++;
1379 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1380 if (*ptr == terminator && ptr[1] == ']')
1381 {
1382 *endptr = ptr;
1383 return TRUE;
1384 }
1385 return FALSE;
1386 }
1387
1388
1389
1390
1391 /*************************************************
1392 * Check POSIX class name *
1393 *************************************************/
1394
1395 /* This function is called to check the name given in a POSIX-style class entry
1396 such as [:alnum:].
1397
1398 Arguments:
1399 ptr points to the first letter
1400 len the length of the name
1401
1402 Returns: a value representing the name, or -1 if unknown
1403 */
1404
1405 static int
1406 check_posix_name(const uschar *ptr, int len)
1407 {
1408 register int yield = 0;
1409 while (posix_name_lengths[yield] != 0)
1410 {
1411 if (len == posix_name_lengths[yield] &&
1412 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1413 yield++;
1414 }
1415 return -1;
1416 }
1417
1418
1419
1420
1421 /*************************************************
1422 * Compile one branch *
1423 *************************************************/
1424
1425 /* Scan the pattern, compiling it into the code vector. If the options are
1426 changed during the branch, the pointer is used to change the external options
1427 bits.
1428
1429 Arguments:
1430 optionsptr pointer to the option bits
1431 brackets points to number of extracting brackets used
1432 code points to the pointer to the current code point
1433 ptrptr points to the current pattern pointer
1434 errorptr points to pointer to error message
1435 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
1436 reqbyteptr set to the last literal character required, else < 0
1437 bcptr points to current branch chain
1438 cd contains pointers to tables etc.
1439
1440 Returns: TRUE on success
1441 FALSE, with *errorptr set on error
1442 */
1443
1444 static BOOL
1445 compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
1446 const uschar **ptrptr, const char **errorptr, int *firstbyteptr,
1447 int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
1448 {
1449 int repeat_type, op_type;
1450 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
1451 int bravalue = 0;
1452 int length;
1453 int greedy_default, greedy_non_default;
1454 int firstbyte, reqbyte;
1455 int zeroreqbyte, zerofirstbyte;
1456 int req_caseopt;
1457 int condcount = 0;
1458 int options = *optionsptr;
1459 register int c;
1460 register uschar *code = *codeptr;
1461 uschar *tempcode;
1462 BOOL inescq = FALSE;
1463 BOOL groupsetfirstbyte = FALSE;
1464 const uschar *ptr = *ptrptr;
1465 const uschar *tempptr;
1466 uschar *previous = NULL;
1467 uschar class[32];
1468
1469 #ifdef SUPPORT_UTF8
1470 BOOL class_utf8;
1471 BOOL utf8 = (options & PCRE_UTF8) != 0;
1472 uschar *class_utf8data;
1473 uschar utf8_char[6];
1474 #else
1475 BOOL utf8 = FALSE;
1476 #endif
1477
1478 /* Set up the default and non-default settings for greediness */
1479
1480 greedy_default = ((options & PCRE_UNGREEDY) != 0);
1481 greedy_non_default = greedy_default ^ 1;
1482
1483 /* Initialize no first char, no required char. REQ_UNSET means "no char
1484 matching encountered yet". It gets changed to REQ_NONE if we hit something that
1485 matches a non-fixed char first char; reqbyte just remains unset if we never
1486 find one.
1487
1488 When we hit a repeat whose minimum is zero, we may have to adjust these values
1489 to take the zero repeat into account. This is implemented by setting them to
1490 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
1491 item types that can be repeated set these backoff variables appropriately. */
1492
1493 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
1494
1495 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
1496 according to the current setting of the caseless flag. REQ_CASELESS is a bit
1497 value > 255. It is added into the firstbyte or reqbyte variables to record the
1498 case status of the value. */
1499
1500 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
1501
1502 /* Switch on next character until the end of the branch */
1503
1504 for (;; ptr++)
1505 {
1506 BOOL negate_class;
1507 BOOL possessive_quantifier;
1508 int class_charcount;
1509 int class_lastchar;
1510 int newoptions;
1511 int recno;
1512 int skipbytes;
1513 int subreqbyte;
1514 int subfirstbyte;
1515
1516 c = *ptr;
1517 if (inescq && c != 0) goto NORMAL_CHAR;
1518
1519 if ((options & PCRE_EXTENDED) != 0)
1520 {
1521 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1522 if (c == '#')
1523 {
1524 /* The space before the ; is to avoid a warning on a silly compiler
1525 on the Macintosh. */
1526 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
1527 if (c != 0) continue; /* Else fall through to handle end of string */
1528 }
1529 }
1530
1531 switch(c)
1532 {
1533 /* The branch terminates at end of string, |, or ). */
1534
1535 case 0:
1536 case '|':
1537 case ')':
1538 *firstbyteptr = firstbyte;
1539 *reqbyteptr = reqbyte;
1540 *codeptr = code;
1541 *ptrptr = ptr;
1542 return TRUE;
1543
1544 /* Handle single-character metacharacters. In multiline mode, ^ disables
1545 the setting of any following char as a first character. */
1546
1547 case '^':
1548 if ((options & PCRE_MULTILINE) != 0)
1549 {
1550 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1551 }
1552 previous = NULL;
1553 *code++ = OP_CIRC;
1554 break;
1555
1556 case '$':
1557 previous = NULL;
1558 *code++ = OP_DOLL;
1559 break;
1560
1561 /* There can never be a first char if '.' is first, whatever happens about
1562 repeats. The value of reqbyte doesn't change either. */
1563
1564 case '.':
1565 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1566 zerofirstbyte = firstbyte;
1567 zeroreqbyte = reqbyte;
1568 previous = code;
1569 *code++ = OP_ANY;
1570 break;
1571
1572 /* Character classes. If the included characters are all < 255 in value, we
1573 build a 32-byte bitmap of the permitted characters, except in the special
1574 case where there is only one such character. For negated classes, we build
1575 the map as usual, then invert it at the end. However, we use a different
1576 opcode so that data characters > 255 can be handled correctly.
1577
1578 If the class contains characters outside the 0-255 range, a different
1579 opcode is compiled. It may optionally have a bit map for characters < 256,
1580 but those above are explicitly listed afterwards. A flag byte tells
1581 whether the bitmap is present, and whether this is a negated class or not.
1582 */
1583
1584 case '[':
1585 previous = code;
1586
1587 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
1588 they are encountered at the top level, so we'll do that too. */
1589
1590 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1591 check_posix_syntax(ptr, &tempptr, cd))
1592 {
1593 *errorptr = (ptr[1] == ':')? ERR13 : ERR31;
1594 goto FAILED;
1595 }
1596
1597 /* If the first character is '^', set the negation flag and skip it. */
1598
1599 if ((c = *(++ptr)) == '^')
1600 {
1601 negate_class = TRUE;
1602 c = *(++ptr);
1603 }
1604 else
1605 {
1606 negate_class = FALSE;
1607 }
1608
1609 /* Keep a count of chars with values < 256 so that we can optimize the case
1610 of just a single character (as long as it's < 256). For higher valued UTF-8
1611 characters, we don't yet do any optimization. */
1612
1613 class_charcount = 0;
1614 class_lastchar = -1;
1615
1616 #ifdef SUPPORT_UTF8
1617 class_utf8 = FALSE; /* No chars >= 256 */
1618 class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
1619 #endif
1620
1621 /* Initialize the 32-char bit map to all zeros. We have to build the
1622 map in a temporary bit of store, in case the class contains only 1
1623 character (< 256), because in that case the compiled code doesn't use the
1624 bit map. */
1625
1626 memset(class, 0, 32 * sizeof(uschar));
1627
1628 /* Process characters until ] is reached. By writing this as a "do" it
1629 means that an initial ] is taken as a data character. The first pass
1630 through the regex checked the overall syntax, so we don't need to be very
1631 strict here. At the start of the loop, c contains the first byte of the
1632 character. */
1633
1634 do
1635 {
1636 #ifdef SUPPORT_UTF8
1637 if (utf8 && c > 127) GETCHARLEN(c, ptr, ptr);
1638 #endif
1639
1640 /* Inside \Q...\E everything is literal except \E */
1641
1642 if (inescq)
1643 {
1644 if (c == '\\' && ptr[1] == 'E')
1645 {
1646 inescq = FALSE;
1647 ptr++;
1648 continue;
1649 }
1650 else goto LONE_SINGLE_CHARACTER;
1651 }
1652
1653 /* Handle POSIX class names. Perl allows a negation extension of the
1654 form [:^name:]. A square bracket that doesn't match the syntax is
1655 treated as a literal. We also recognize the POSIX constructions
1656 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1657 5.6 and 5.8 do. */
1658
1659 if (c == '[' &&
1660 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1661 check_posix_syntax(ptr, &tempptr, cd))
1662 {
1663 BOOL local_negate = FALSE;
1664 int posix_class, i;
1665 register const uschar *cbits = cd->cbits;
1666
1667 if (ptr[1] != ':')
1668 {
1669 *errorptr = ERR31;
1670 goto FAILED;
1671 }
1672
1673 ptr += 2;
1674 if (*ptr == '^')
1675 {
1676 local_negate = TRUE;
1677 ptr++;
1678 }
1679
1680 posix_class = check_posix_name(ptr, tempptr - ptr);
1681 if (posix_class < 0)
1682 {
1683 *errorptr = ERR30;
1684 goto FAILED;
1685 }
1686
1687 /* If matching is caseless, upper and lower are converted to
1688 alpha. This relies on the fact that the class table starts with
1689 alpha, lower, upper as the first 3 entries. */
1690
1691 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1692 posix_class = 0;
1693
1694 /* Or into the map we are building up to 3 of the static class
1695 tables, or their negations. The [:blank:] class sets up the same
1696 chars as the [:space:] class (all white space). We remove the vertical
1697 white space chars afterwards. */
1698
1699 posix_class *= 3;
1700 for (i = 0; i < 3; i++)
1701 {
1702 BOOL isblank = strncmp(ptr, "blank", 5) == 0;
1703 int taboffset = posix_class_maps[posix_class + i];
1704 if (taboffset < 0) break;
1705 if (local_negate)
1706 {
1707 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
1708 if (isblank) class[1] |= 0x3c;
1709 }
1710 else
1711 {
1712 for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
1713 if (isblank) class[1] &= ~0x3c;
1714 }
1715 }
1716
1717 ptr = tempptr + 1;
1718 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
1719 continue; /* End of POSIX syntax handling */
1720 }
1721
1722 /* Backslash may introduce a single character, or it may introduce one
1723 of the specials, which just set a flag. Escaped items are checked for
1724 validity in the pre-compiling pass. The sequence \b is a special case.
1725 Inside a class (and only there) it is treated as backspace. Elsewhere
1726 it marks a word boundary. Other escapes have preset maps ready to
1727 or into the one we are building. We assume they have more than one
1728 character in them, so set class_charcount bigger than one. */
1729
1730 if (c == '\\')
1731 {
1732 c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1733 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
1734
1735 if (-c == ESC_Q) /* Handle start of quoted string */
1736 {
1737 if (ptr[1] == '\\' && ptr[2] == 'E')
1738 {
1739 ptr += 2; /* avoid empty string */
1740 }
1741 else inescq = TRUE;
1742 continue;
1743 }
1744
1745 else if (c < 0)
1746 {
1747 register const uschar *cbits = cd->cbits;
1748 class_charcount = 10; /* Greater than 1 is what matters */
1749 switch (-c)
1750 {
1751 case ESC_d:
1752 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
1753 continue;
1754
1755 case ESC_D:
1756 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
1757 continue;
1758
1759 case ESC_w:
1760 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
1761 continue;
1762
1763 case ESC_W:
1764 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
1765 continue;
1766
1767 case ESC_s:
1768 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
1769 class[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
1770 continue;
1771
1772 case ESC_S:
1773 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
1774 class[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
1775 continue;
1776
1777 /* Unrecognized escapes are faulted if PCRE is running in its
1778 strict mode. By default, for compatibility with Perl, they are
1779 treated as literals. */
1780
1781 default:
1782 if ((options & PCRE_EXTRA) != 0)
1783 {
1784 *errorptr = ERR7;
1785 goto FAILED;
1786 }
1787 c = *ptr; /* The final character */
1788 }
1789 }
1790
1791 /* Fall through if we have a single character (c >= 0). This may be
1792 > 256 in UTF-8 mode. */
1793
1794 } /* End of backslash handling */
1795
1796 /* A single character may be followed by '-' to form a range. However,
1797 Perl does not permit ']' to be the end of the range. A '-' character
1798 here is treated as a literal. */
1799
1800 if (ptr[1] == '-' && ptr[2] != ']')
1801 {
1802 int d;
1803 ptr += 2;
1804
1805 #ifdef SUPPORT_UTF8
1806 if (utf8)
1807 { /* Braces are required because the */
1808 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
1809 }
1810 else
1811 #endif
1812 d = *ptr;
1813
1814 /* The second part of a range can be a single-character escape, but
1815 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1816 in such circumstances. */
1817
1818 if (d == '\\')
1819 {
1820 const uschar *oldptr = ptr;
1821 d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1822
1823 /* \b is backspace; any other special means the '-' was literal */
1824
1825 if (d < 0)
1826 {
1827 if (d == -ESC_b) d = '\b'; else
1828 {
1829 ptr = oldptr - 2;
1830 goto LONE_SINGLE_CHARACTER; /* A few lines below */
1831 }
1832 }
1833 }
1834
1835 /* Check that the two values are in the correct order */
1836
1837 if (d < c)
1838 {
1839 *errorptr = ERR8;
1840 goto FAILED;
1841 }
1842
1843 /* If d is greater than 255, we can't just use the bit map, so set up
1844 for the UTF-8 supporting class type. If we are not caseless, we can
1845 just set up a single range. If we are caseless, the characters < 256
1846 are handled with a bitmap, in order to get the case-insensitive
1847 handling. */
1848
1849 #ifdef SUPPORT_UTF8
1850 if (d > 255)
1851 {
1852 class_utf8 = TRUE;
1853 *class_utf8data++ = XCL_RANGE;
1854 if ((options & PCRE_CASELESS) == 0)
1855 {
1856 class_utf8data += ord2utf8(c, class_utf8data);
1857 class_utf8data += ord2utf8(d, class_utf8data);
1858 continue; /* Go get the next char in the class */
1859 }
1860 class_utf8data += ord2utf8(256, class_utf8data);
1861 class_utf8data += ord2utf8(d, class_utf8data);
1862 d = 255;
1863 /* Fall through */
1864 }
1865 #endif
1866 /* We use the bit map if the range is entirely < 255, or if part of it
1867 is < 255 and matching is caseless. */
1868
1869 for (; c <= d; c++)
1870 {
1871 class[c/8] |= (1 << (c&7));
1872 if ((options & PCRE_CASELESS) != 0)
1873 {
1874 int uc = cd->fcc[c]; /* flip case */
1875 class[uc/8] |= (1 << (uc&7));
1876 }
1877 class_charcount++; /* in case a one-char range */
1878 class_lastchar = c;
1879 }
1880
1881 continue; /* Go get the next char in the class */
1882 }
1883
1884 /* Handle a lone single character - we can get here for a normal
1885 non-escape char, or after \ that introduces a single character. */
1886
1887 LONE_SINGLE_CHARACTER:
1888
1889 /* Handle a multibyte character */
1890
1891 #ifdef SUPPORT_UTF8
1892 if (utf8 && c > 255)
1893 {
1894 class_utf8 = TRUE;
1895 *class_utf8data++ = XCL_SINGLE;
1896 class_utf8data += ord2utf8(c, class_utf8data);
1897 }
1898 else
1899 #endif
1900 /* Handle a single-byte character */
1901 {
1902 class [c/8] |= (1 << (c&7));
1903 if ((options & PCRE_CASELESS) != 0)
1904 {
1905 c = cd->fcc[c]; /* flip case */
1906 class[c/8] |= (1 << (c&7));
1907 }
1908 class_charcount++;
1909 class_lastchar = c;
1910 }
1911 }
1912
1913 /* Loop until ']' reached; the check for end of string happens inside the
1914 loop. This "while" is the end of the "do" above. */
1915
1916 while ((c = *(++ptr)) != ']' || inescq);
1917
1918 /* If class_charcount is 1, we saw precisely one character with a value <
1919 256. In UTF-8 mode, we can optimize if there were no characters >= 256 and
1920 the one character is < 128. In non-UTF-8 mode we can always optimize.
1921
1922 The optimization throws away the bit map. We turn the item into a
1923 1-character OP_CHARS if it's positive, or OP_NOT if it's negative. Note
1924 that OP_NOT does not support multibyte characters. In the positive case, it
1925 can cause firstbyte to be set. Otherwise, there can be no first char if
1926 this item is first, whatever repeat count may follow. In the case of
1927 reqbyte, save the previous value for reinstating. */
1928
1929 #ifdef SUPPORT_UTF8
1930 if (!class_utf8 && class_charcount == 1 && class_lastchar < 128)
1931 #else
1932 if (class_charcount == 1)
1933 #endif
1934 {
1935 zeroreqbyte = reqbyte;
1936 if (negate_class)
1937 {
1938 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1939 zerofirstbyte = firstbyte;
1940 *code++ = OP_NOT;
1941 }
1942 else
1943 {
1944 if (firstbyte == REQ_UNSET)
1945 {
1946 zerofirstbyte = REQ_NONE;
1947 firstbyte = class_lastchar | req_caseopt;
1948 }
1949 else
1950 {
1951 zerofirstbyte = firstbyte;
1952 reqbyte = class_lastchar | req_caseopt;
1953 }
1954 *code++ = OP_CHARS;
1955 *code++ = 1;
1956 }
1957 *code++ = class_lastchar;
1958 break; /* End of class handling */
1959 } /* End of 1-byte optimization */
1960
1961 /* Otherwise, if this is the first thing in the branch, there can be no
1962 first char setting, whatever the repeat count. Any reqbyte setting must
1963 remain unchanged after any kind of repeat. */
1964
1965 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1966 zerofirstbyte = firstbyte;
1967 zeroreqbyte = reqbyte;
1968
1969 /* If there are characters with values > 255, we have to compile an
1970 extended class, with its own opcode. If there are no characters < 256,
1971 we can omit the bitmap. */
1972
1973 #ifdef SUPPORT_UTF8
1974 if (class_utf8)
1975 {
1976 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
1977 *code++ = OP_XCLASS;
1978 code += LINK_SIZE;
1979 *code = negate_class? XCL_NOT : 0;
1980
1981 /* If the map is required, install it, and move on to the end of
1982 the extra data */
1983
1984 if (class_charcount > 0)
1985 {
1986 *code++ |= XCL_MAP;
1987 memcpy(code, class, 32);
1988 code = class_utf8data;
1989 }
1990
1991 /* If the map is not required, slide down the extra data. */
1992
1993 else
1994 {
1995 int len = class_utf8data - (code + 33);
1996 memmove(code + 1, code + 33, len);
1997 code += len + 1;
1998 }
1999
2000 /* Now fill in the complete length of the item */
2001
2002 PUT(previous, 1, code - previous);
2003 break; /* End of class handling */
2004 }
2005 #endif
2006
2007 /* If there are no characters > 255, negate the 32-byte map if necessary,
2008 and copy it into the code vector. If this is the first thing in the branch,
2009 there can be no first char setting, whatever the repeat count. Any reqbyte
2010 setting must remain unchanged after any kind of repeat. */
2011
2012 if (negate_class)
2013 {
2014 *code++ = OP_NCLASS;
2015 for (c = 0; c < 32; c++) code[c] = ~class[c];
2016 }
2017 else
2018 {
2019 *code++ = OP_CLASS;
2020 memcpy(code, class, 32);
2021 }
2022 code += 32;
2023 break;
2024
2025 /* Various kinds of repeat */
2026
2027 case '{':
2028 if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
2029 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
2030 if (*errorptr != NULL) goto FAILED;
2031 goto REPEAT;
2032
2033 case '*':
2034 repeat_min = 0;
2035 repeat_max = -1;
2036 goto REPEAT;
2037
2038 case '+':
2039 repeat_min = 1;
2040 repeat_max = -1;
2041 goto REPEAT;
2042
2043 case '?':
2044 repeat_min = 0;
2045 repeat_max = 1;
2046
2047 REPEAT:
2048 if (previous == NULL)
2049 {
2050 *errorptr = ERR9;
2051 goto FAILED;
2052 }
2053
2054 if (repeat_min == 0)
2055 {
2056 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2057 reqbyte = zeroreqbyte; /* Ditto */
2058 }
2059
2060 op_type = 0; /* Default single-char op codes */
2061 possessive_quantifier = FALSE; /* Default not possessive quantifier */
2062
2063 /* Save start of previous item, in case we have to move it up to make space
2064 for an inserted OP_ONCE for the additional '+' extension. */
2065
2066 tempcode = previous;
2067
2068 /* If the next character is '+', we have a possessive quantifier. This
2069 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2070 If the next character is '?' this is a minimizing repeat, by default,
2071 but if PCRE_UNGREEDY is set, it works the other way round. We change the
2072 repeat type to the non-default. */
2073
2074 if (ptr[1] == '+')
2075 {
2076 repeat_type = 0; /* Force greedy */
2077 possessive_quantifier = TRUE;
2078 ptr++;
2079 }
2080 else if (ptr[1] == '?')
2081 {
2082 repeat_type = greedy_non_default;
2083 ptr++;
2084 }
2085 else repeat_type = greedy_default;
2086
2087 /* If previous was a recursion, we need to wrap it inside brackets so that
2088 it can be replicated if necessary. */
2089
2090 if (*previous == OP_RECURSE)
2091 {
2092 memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2093 code += 1 + LINK_SIZE;
2094 *previous = OP_BRA;
2095 PUT(previous, 1, code - previous);
2096 *code = OP_KET;
2097 PUT(code, 1, code - previous);
2098 code += 1 + LINK_SIZE;
2099 }
2100
2101 /* If previous was a string of characters, chop off the last one and use it
2102 as the subject of the repeat. If there was only one character, we can
2103 abolish the previous item altogether. If a one-char item has a minimum of
2104 more than one, ensure that it is set in reqbyte - it might not be if a
2105 sequence such as x{3} is the first thing in a branch because the x will
2106 have gone into firstbyte instead. */
2107
2108 if (*previous == OP_CHARS)
2109 {
2110 /* Deal with UTF-8 characters that take up more than one byte. It's
2111 easier to write this out separately than try to macrify it. Use c to
2112 hold the length of the character in bytes, plus 0x80 to flag that it's a
2113 length rather than a small character. */
2114
2115 #ifdef SUPPORT_UTF8
2116 if (utf8 && (code[-1] & 0x80) != 0)
2117 {
2118 uschar *lastchar = code - 1;
2119 while((*lastchar & 0xc0) == 0x80) lastchar--;
2120 c = code - lastchar; /* Length of UTF-8 character */
2121 memcpy(utf8_char, lastchar, c); /* Save the char */
2122 if (lastchar == previous + 2) /* There was only one character */
2123 {
2124 code = previous; /* Abolish the previous item */
2125 }
2126 else
2127 {
2128 previous[1] -= c; /* Adjust length of previous */
2129 code = lastchar; /* Lost char off the end */
2130 tempcode = code; /* Adjust position to be moved for '+' */
2131 }
2132 c |= 0x80; /* Flag c as a length */
2133 }
2134 else
2135 #endif
2136
2137 /* Handle the case of a single byte - either with no UTF8 support, or
2138 with UTF-8 disabled, or for a UTF-8 character < 128. */
2139
2140 {
2141 c = *(--code);
2142 if (code == previous + 2) /* There was only one character */
2143 {
2144 code = previous; /* Abolish the previous item */
2145 if (repeat_min > 1) reqbyte = c | req_caseopt;
2146 }
2147 else
2148 {
2149 previous[1]--; /* adjust length */
2150 tempcode = code; /* Adjust position to be moved for '+' */
2151 }
2152 }
2153
2154 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
2155 }
2156
2157 /* If previous was a single negated character ([^a] or similar), we use
2158 one of the special opcodes, replacing it. The code is shared with single-
2159 character repeats by setting opt_type to add a suitable offset into
2160 repeat_type. OP_NOT is currently used only for single-byte chars. */
2161
2162 else if (*previous == OP_NOT)
2163 {
2164 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
2165 c = previous[1];
2166 code = previous;
2167 goto OUTPUT_SINGLE_REPEAT;
2168 }
2169
2170 /* If previous was a character type match (\d or similar), abolish it and
2171 create a suitable repeat item. The code is shared with single-character
2172 repeats by setting op_type to add a suitable offset into repeat_type. */
2173
2174 else if (*previous < OP_EODN)
2175 {
2176 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
2177 c = *previous;
2178 code = previous;
2179
2180 OUTPUT_SINGLE_REPEAT:
2181
2182 /* If the maximum is zero then the minimum must also be zero; Perl allows
2183 this case, so we do too - by simply omitting the item altogether. */
2184
2185 if (repeat_max == 0) goto END_REPEAT;
2186
2187 /* Combine the op_type with the repeat_type */
2188
2189 repeat_type += op_type;
2190
2191 /* A minimum of zero is handled either as the special case * or ?, or as
2192 an UPTO, with the maximum given. */
2193
2194 if (repeat_min == 0)
2195 {
2196 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
2197 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
2198 else
2199 {
2200 *code++ = OP_UPTO + repeat_type;
2201 PUT2INC(code, 0, repeat_max);
2202 }
2203 }
2204
2205 /* The case {1,} is handled as the special case + */
2206
2207 else if (repeat_min == 1 && repeat_max == -1)
2208 *code++ = OP_PLUS + repeat_type;
2209
2210 /* The case {n,n} is just an EXACT, while the general case {n,m} is
2211 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
2212
2213 else
2214 {
2215 if (repeat_min != 1)
2216 {
2217 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
2218 PUT2INC(code, 0, repeat_min);
2219 }
2220
2221 /* If the minimum is 1 and the previous item was a character string,
2222 we either have to put back the item that got cancelled if the string
2223 length was 1, or add the character back onto the end of a longer
2224 string. For a character type nothing need be done; it will just get
2225 put back naturally. Note that the final character is always going to
2226 get added below, so we leave code ready for its insertion. */
2227
2228 else if (*previous == OP_CHARS)
2229 {
2230 if (code == previous) code += 2; else
2231
2232 /* In UTF-8 mode, a multibyte char has its length in c, with the 0x80
2233 bit set as a flag. The length will always be between 2 and 6. */
2234
2235 #ifdef SUPPORT_UTF8
2236 if (utf8 && c >= 128) previous[1] += c & 7; else
2237 #endif
2238 previous[1]++;
2239 }
2240
2241 /* For a single negated character we also have to put back the
2242 item that got cancelled. At present this applies only to single byte
2243 characters in any mode. */
2244
2245 else if (*previous == OP_NOT) code++;
2246
2247 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
2248 we have to insert the character for the previous code. In UTF-8 mode,
2249 long characters have their length in c, with the 0x80 bit as a flag. */
2250
2251 if (repeat_max < 0)
2252 {
2253 #ifdef SUPPORT_UTF8
2254 if (utf8 && c >= 128)
2255 {
2256 memcpy(code, utf8_char, c & 7);
2257 code += c & 7;
2258 }
2259 else
2260 #endif
2261 *code++ = c;
2262 *code++ = OP_STAR + repeat_type;
2263 }
2264
2265 /* Else insert an UPTO if the max is greater than the min, again
2266 preceded by the character, for the previously inserted code. */
2267
2268 else if (repeat_max != repeat_min)
2269 {
2270 #ifdef SUPPORT_UTF8
2271 if (utf8 && c >= 128)
2272 {
2273 memcpy(code, utf8_char, c & 7);
2274 code += c & 7;
2275 }
2276 else
2277 #endif
2278 *code++ = c;
2279 repeat_max -= repeat_min;
2280 *code++ = OP_UPTO + repeat_type;
2281 PUT2INC(code, 0, repeat_max);
2282 }
2283 }
2284
2285 /* The character or character type itself comes last in all cases. */
2286
2287 #ifdef SUPPORT_UTF8
2288 if (utf8 && c >= 128)
2289 {
2290 memcpy(code, utf8_char, c & 7);
2291 code += c & 7;
2292 }
2293 else
2294 #endif
2295
2296 *code++ = c;
2297 }
2298
2299 /* If previous was a character class or a back reference, we put the repeat
2300 stuff after it, but just skip the item if the repeat was {0,0}. */
2301
2302 else if (*previous == OP_CLASS ||
2303 *previous == OP_NCLASS ||
2304 #ifdef SUPPORT_UTF8
2305 *previous == OP_XCLASS ||
2306 #endif
2307 *previous == OP_REF)
2308 {
2309 if (repeat_max == 0)
2310 {
2311 code = previous;
2312 goto END_REPEAT;
2313 }
2314 if (repeat_min == 0 && repeat_max == -1)
2315 *code++ = OP_CRSTAR + repeat_type;
2316 else if (repeat_min == 1 && repeat_max == -1)
2317 *code++ = OP_CRPLUS + repeat_type;
2318 else if (repeat_min == 0 && repeat_max == 1)
2319 *code++ = OP_CRQUERY + repeat_type;
2320 else
2321 {
2322 *code++ = OP_CRRANGE + repeat_type;
2323 PUT2INC(code, 0, repeat_min);
2324 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
2325 PUT2INC(code, 0, repeat_max);
2326 }
2327 }
2328
2329 /* If previous was a bracket group, we may have to replicate it in certain
2330 cases. */
2331
2332 else if (*previous >= OP_BRA || *previous == OP_ONCE ||
2333 *previous == OP_COND)
2334 {
2335 register int i;
2336 int ketoffset = 0;
2337 int len = code - previous;
2338 uschar *bralink = NULL;
2339
2340 /* If the maximum repeat count is unlimited, find the end of the bracket
2341 by scanning through from the start, and compute the offset back to it
2342 from the current code pointer. There may be an OP_OPT setting following
2343 the final KET, so we can't find the end just by going back from the code
2344 pointer. */
2345
2346 if (repeat_max == -1)
2347 {
2348 register uschar *ket = previous;
2349 do ket += GET(ket, 1); while (*ket != OP_KET);
2350 ketoffset = code - ket;
2351 }
2352
2353 /* The case of a zero minimum is special because of the need to stick
2354 OP_BRAZERO in front of it, and because the group appears once in the
2355 data, whereas in other cases it appears the minimum number of times. For
2356 this reason, it is simplest to treat this case separately, as otherwise
2357 the code gets far too messy. There are several special subcases when the
2358 minimum is zero. */
2359
2360 if (repeat_min == 0)
2361 {
2362 /* If the maximum is also zero, we just omit the group from the output
2363 altogether. */
2364
2365 if (repeat_max == 0)
2366 {
2367 code = previous;
2368 goto END_REPEAT;
2369 }
2370
2371 /* If the maximum is 1 or unlimited, we just have to stick in the
2372 BRAZERO and do no more at this point. */
2373
2374 if (repeat_max <= 1)
2375 {
2376 memmove(previous+1, previous, len);
2377 code++;
2378 *previous++ = OP_BRAZERO + repeat_type;
2379 }
2380
2381 /* If the maximum is greater than 1 and limited, we have to replicate
2382 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
2383 The first one has to be handled carefully because it's the original
2384 copy, which has to be moved up. The remainder can be handled by code
2385 that is common with the non-zero minimum case below. We just have to
2386 adjust the value of repeat_max, since one less copy is required. */
2387
2388 else
2389 {
2390 int offset;
2391 memmove(previous + 2 + LINK_SIZE, previous, len);
2392 code += 2 + LINK_SIZE;
2393 *previous++ = OP_BRAZERO + repeat_type;
2394 *previous++ = OP_BRA;
2395
2396 /* We chain together the bracket offset fields that have to be
2397 filled in later when the ends of the brackets are reached. */
2398
2399 offset = (bralink == NULL)? 0 : previous - bralink;
2400 bralink = previous;
2401 PUTINC(previous, 0, offset);
2402 }
2403
2404 repeat_max--;
2405 }
2406
2407 /* If the minimum is greater than zero, replicate the group as many
2408 times as necessary, and adjust the maximum to the number of subsequent
2409 copies that we need. If we set a first char from the group, and didn't
2410 set a required char, copy the latter from the former. */
2411
2412 else
2413 {
2414 if (repeat_min > 1)
2415 {
2416 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
2417 for (i = 1; i < repeat_min; i++)
2418 {
2419 memcpy(code, previous, len);
2420 code += len;
2421 }
2422 }
2423 if (repeat_max > 0) repeat_max -= repeat_min;
2424 }
2425
2426 /* This code is common to both the zero and non-zero minimum cases. If
2427 the maximum is limited, it replicates the group in a nested fashion,
2428 remembering the bracket starts on a stack. In the case of a zero minimum,
2429 the first one was set up above. In all cases the repeat_max now specifies
2430 the number of additional copies needed. */
2431
2432 if (repeat_max >= 0)
2433 {
2434 for (i = repeat_max - 1; i >= 0; i--)
2435 {
2436 *code++ = OP_BRAZERO + repeat_type;
2437
2438 /* All but the final copy start a new nesting, maintaining the
2439 chain of brackets outstanding. */
2440
2441 if (i != 0)
2442 {
2443 int offset;
2444 *code++ = OP_BRA;
2445 offset = (bralink == NULL)? 0 : code - bralink;
2446 bralink = code;
2447 PUTINC(code, 0, offset);
2448 }
2449
2450 memcpy(code, previous, len);
2451 code += len;
2452 }
2453
2454 /* Now chain through the pending brackets, and fill in their length
2455 fields (which are holding the chain links pro tem). */
2456
2457 while (bralink != NULL)
2458 {
2459 int oldlinkoffset;
2460 int offset = code - bralink + 1;
2461 uschar *bra = code - offset;
2462 oldlinkoffset = GET(bra, 1);
2463 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
2464 *code++ = OP_KET;
2465 PUTINC(code, 0, offset);
2466 PUT(bra, 1, offset);
2467 }
2468 }
2469
2470 /* If the maximum is unlimited, set a repeater in the final copy. We
2471 can't just offset backwards from the current code point, because we
2472 don't know if there's been an options resetting after the ket. The
2473 correct offset was computed above. */
2474
2475 else code[-ketoffset] = OP_KETRMAX + repeat_type;
2476 }
2477
2478 /* Else there's some kind of shambles */
2479
2480 else
2481 {
2482 *errorptr = ERR11;
2483 goto FAILED;
2484 }
2485
2486 /* If the character following a repeat is '+', we wrap the entire repeated
2487 item inside OP_ONCE brackets. This is just syntactic sugar, taken from
2488 Sun's Java package. The repeated item starts at tempcode, not at previous,
2489 which might be the first part of a string whose (former) last char we
2490 repeated. However, we don't support '+' after a greediness '?'. */
2491
2492 if (possessive_quantifier)
2493 {
2494 int len = code - tempcode;
2495 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
2496 code += 1 + LINK_SIZE;
2497 len += 1 + LINK_SIZE;
2498 tempcode[0] = OP_ONCE;
2499 *code++ = OP_KET;
2500 PUTINC(code, 0, len);
2501 PUT(tempcode, 1, len);
2502 }
2503
2504 /* In all cases we no longer have a previous item. */
2505
2506 END_REPEAT:
2507 previous = NULL;
2508 break;
2509
2510
2511 /* Start of nested bracket sub-expression, or comment or lookahead or
2512 lookbehind or option setting or condition. First deal with special things
2513 that can come after a bracket; all are introduced by ?, and the appearance
2514 of any of them means that this is not a referencing group. They were
2515 checked for validity in the first pass over the string, so we don't have to
2516 check for syntax errors here. */
2517
2518 case '(':
2519 newoptions = options;
2520 skipbytes = 0;
2521
2522 if (*(++ptr) == '?')
2523 {
2524 int set, unset;
2525 int *optset;
2526
2527 switch (*(++ptr))
2528 {
2529 case '#': /* Comment; skip to ket */
2530 ptr++;
2531 while (*ptr != ')') ptr++;
2532 continue;
2533
2534 case ':': /* Non-extracting bracket */
2535 bravalue = OP_BRA;
2536 ptr++;
2537 break;
2538
2539 case '(':
2540 bravalue = OP_COND; /* Conditional group */
2541
2542 /* Condition to test for recursion */
2543
2544 if (ptr[1] == 'R')
2545 {
2546 code[1+LINK_SIZE] = OP_CREF;
2547 PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
2548 skipbytes = 3;
2549 ptr += 3;
2550 }
2551
2552 /* Condition to test for a numbered subpattern match */
2553
2554 else if ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
2555 {
2556 int condref = *(++ptr) - '0';
2557 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
2558 if (condref == 0)
2559 {
2560 *errorptr = ERR35;
2561 goto FAILED;
2562 }
2563 ptr++;
2564 code[1+LINK_SIZE] = OP_CREF;
2565 PUT2(code, 2+LINK_SIZE, condref);
2566 skipbytes = 3;
2567 }
2568 /* For conditions that are assertions, we just fall through, having
2569 set bravalue above. */
2570 break;
2571
2572 case '=': /* Positive lookahead */
2573 bravalue = OP_ASSERT;
2574 ptr++;
2575 break;
2576
2577 case '!': /* Negative lookahead */
2578 bravalue = OP_ASSERT_NOT;
2579 ptr++;
2580 break;
2581
2582 case '<': /* Lookbehinds */
2583 switch (*(++ptr))
2584 {
2585 case '=': /* Positive lookbehind */
2586 bravalue = OP_ASSERTBACK;
2587 ptr++;
2588 break;
2589
2590 case '!': /* Negative lookbehind */
2591 bravalue = OP_ASSERTBACK_NOT;
2592 ptr++;
2593 break;
2594 }
2595 break;
2596
2597 case '>': /* One-time brackets */
2598 bravalue = OP_ONCE;
2599 ptr++;
2600 break;
2601
2602 case 'C': /* Callout - may be followed by digits */
2603 *code++ = OP_CALLOUT;
2604 {
2605 int n = 0;
2606 while ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
2607 n = n * 10 + *ptr - '0';
2608 if (n > 255)
2609 {
2610 *errorptr = ERR38;
2611 goto FAILED;
2612 }
2613 *code++ = n;
2614 }
2615 previous = NULL;
2616 continue;
2617
2618 case 'P': /* Named subpattern handling */
2619 if (*(++ptr) == '<') /* Definition */
2620 {
2621 int i, namelen;
2622 const uschar *name = ++ptr;
2623 uschar *slot = cd->name_table;
2624
2625 while (*ptr++ != '>');
2626 namelen = ptr - name - 1;
2627
2628 for (i = 0; i < cd->names_found; i++)
2629 {
2630 int c = memcmp(name, slot+2, namelen + 1);
2631 if (c == 0)
2632 {
2633 *errorptr = ERR43;
2634 goto FAILED;
2635 }
2636 if (c < 0)
2637 {
2638 memmove(slot + cd->name_entry_size, slot,
2639 (cd->names_found - i) * cd->name_entry_size);
2640 break;
2641 }
2642 slot += cd->name_entry_size;
2643 }
2644
2645 PUT2(slot, 0, *brackets + 1);
2646 memcpy(slot + 2, name, namelen);
2647 slot[2+namelen] = 0;
2648 cd->names_found++;
2649 goto NUMBERED_GROUP;
2650 }
2651
2652 if (*ptr == '=' || *ptr == '>') /* Reference or recursion */
2653 {
2654 int i, namelen;
2655 int type = *ptr++;
2656 const uschar *name = ptr;
2657 uschar *slot = cd->name_table;
2658
2659 while (*ptr != ')') ptr++;
2660 namelen = ptr - name;
2661
2662 for (i = 0; i < cd->names_found; i++)
2663 {
2664 if (strncmp(name, slot+2, namelen) == 0) break;
2665 slot += cd->name_entry_size;
2666 }
2667 if (i >= cd->names_found)
2668 {
2669 *errorptr = ERR15;
2670 goto FAILED;
2671 }
2672
2673 recno = GET2(slot, 0);
2674
2675 if (type == '>') goto HANDLE_RECURSION; /* A few lines below */
2676
2677 /* Back reference */
2678
2679 previous = code;
2680 *code++ = OP_REF;
2681 PUT2INC(code, 0, recno);
2682 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
2683 if (recno > cd->top_backref) cd->top_backref = recno;
2684 continue;
2685 }
2686
2687 /* Should never happen */
2688 break;
2689
2690 case 'R': /* Pattern recursion */
2691 ptr++; /* Same as (?0) */
2692 /* Fall through */
2693
2694 /* Recursion or "subroutine" call */
2695
2696 case '0': case '1': case '2': case '3': case '4':
2697 case '5': case '6': case '7': case '8': case '9':
2698 {
2699 const uschar *called;
2700 recno = 0;
2701
2702 while ((cd->ctypes[*ptr] & ctype_digit) != 0)
2703 recno = recno * 10 + *ptr++ - '0';
2704
2705 /* Come here from code above that handles a named recursion */
2706
2707 HANDLE_RECURSION:
2708
2709 previous = code;
2710
2711 /* Find the bracket that is being referenced. Temporarily end the
2712 regex in case it doesn't exist. */
2713
2714 *code = OP_END;
2715 called = (recno == 0)?
2716 cd->start_code : find_bracket(cd->start_code, utf8, recno);
2717
2718 if (called == NULL)
2719 {
2720 *errorptr = ERR15;
2721 goto FAILED;
2722 }
2723
2724 /* If the subpattern is still open, this is a recursive call. We
2725 check to see if this is a left recursion that could loop for ever,
2726 and diagnose that case. */
2727
2728 if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
2729 {
2730 *errorptr = ERR40;
2731 goto FAILED;
2732 }
2733
2734 /* Insert the recursion/subroutine item */
2735
2736 *code = OP_RECURSE;
2737 PUT(code, 1, called - cd->start_code);
2738 code += 1 + LINK_SIZE;
2739 }
2740 continue;
2741
2742 /* Character after (? not specially recognized */
2743
2744 default: /* Option setting */
2745 set = unset = 0;
2746 optset = &set;
2747
2748 while (*ptr != ')' && *ptr != ':')
2749 {
2750 switch (*ptr++)
2751 {
2752 case '-': optset = &unset; break;
2753
2754 case 'i': *optset |= PCRE_CASELESS; break;
2755 case 'm': *optset |= PCRE_MULTILINE; break;
2756 case 's': *optset |= PCRE_DOTALL; break;
2757 case 'x': *optset |= PCRE_EXTENDED; break;
2758 case 'U': *optset |= PCRE_UNGREEDY; break;
2759 case 'X': *optset |= PCRE_EXTRA; break;
2760 }
2761 }
2762
2763 /* Set up the changed option bits, but don't change anything yet. */
2764
2765 newoptions = (options | set) & (~unset);
2766
2767 /* If the options ended with ')' this is not the start of a nested
2768 group with option changes, so the options change at this level. Compile
2769 code to change the ims options if this setting actually changes any of
2770 them. We also pass the new setting back so that it can be put at the
2771 start of any following branches, and when this group ends (if we are in
2772 a group), a resetting item can be compiled.
2773
2774 Note that if this item is right at the start of the pattern, the
2775 options will have been abstracted and made global, so there will be no
2776 change to compile. */
2777
2778 if (*ptr == ')')
2779 {
2780 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
2781 {
2782 *code++ = OP_OPT;
2783 *code++ = newoptions & PCRE_IMS;
2784 }
2785
2786 /* Change options at this level, and pass them back for use
2787 in subsequent branches. Reset the greedy defaults and the case
2788 value for firstbyte and reqbyte. */
2789
2790 *optionsptr = options = newoptions;
2791 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
2792 greedy_non_default = greedy_default ^ 1;
2793 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2794
2795 previous = NULL; /* This item can't be repeated */
2796 continue; /* It is complete */
2797 }
2798
2799 /* If the options ended with ':' we are heading into a nested group
2800 with possible change of options. Such groups are non-capturing and are
2801 not assertions of any kind. All we need to do is skip over the ':';
2802 the newoptions value is handled below. */
2803
2804 bravalue = OP_BRA;
2805 ptr++;
2806 }
2807 }
2808
2809 /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
2810 non-capturing and behave like (?:...) brackets */
2811
2812 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
2813 {
2814 bravalue = OP_BRA;
2815 }
2816
2817 /* Else we have a referencing group; adjust the opcode. If the bracket
2818 number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
2819 arrange for the true number to follow later, in an OP_BRANUMBER item. */
2820
2821 else
2822 {
2823 NUMBERED_GROUP:
2824 if (++(*brackets) > EXTRACT_BASIC_MAX)
2825 {
2826 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
2827 code[1+LINK_SIZE] = OP_BRANUMBER;
2828 PUT2(code, 2+LINK_SIZE, *brackets);
2829 skipbytes = 3;
2830 }
2831 else bravalue = OP_BRA + *brackets;
2832 }
2833
2834 /* Process nested bracketed re. Assertions may not be repeated, but other
2835 kinds can be. We copy code into a non-register variable in order to be able
2836 to pass its address because some compilers complain otherwise. Pass in a
2837 new setting for the ims options if they have changed. */
2838
2839 previous = (bravalue >= OP_ONCE)? code : NULL;
2840 *code = bravalue;
2841 tempcode = code;
2842
2843 if (!compile_regex(
2844 newoptions, /* The complete new option state */
2845 options & PCRE_IMS, /* The previous ims option state */
2846 brackets, /* Extracting bracket count */
2847 &tempcode, /* Where to put code (updated) */
2848 &ptr, /* Input pointer (updated) */
2849 errorptr, /* Where to put an error message */
2850 (bravalue == OP_ASSERTBACK ||
2851 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
2852 skipbytes, /* Skip over OP_COND/OP_BRANUMBER */
2853 &subfirstbyte, /* For possible first char */
2854 &subreqbyte, /* For possible last char */
2855 bcptr, /* Current branch chain */
2856 cd)) /* Tables block */
2857 goto FAILED;
2858
2859 /* At the end of compiling, code is still pointing to the start of the
2860 group, while tempcode has been updated to point past the end of the group
2861 and any option resetting that may follow it. The pattern pointer (ptr)
2862 is on the bracket. */
2863
2864 /* If this is a conditional bracket, check that there are no more than
2865 two branches in the group. */
2866
2867 else if (bravalue == OP_COND)
2868 {
2869 uschar *tc = code;
2870 condcount = 0;
2871
2872 do {
2873 condcount++;
2874 tc += GET(tc,1);
2875 }
2876 while (*tc != OP_KET);
2877
2878 if (condcount > 2)
2879 {
2880 *errorptr = ERR27;
2881 goto FAILED;
2882 }
2883
2884 /* If there is just one branch, we must not make use of its firstbyte or
2885 reqbyte, because this is equivalent to an empty second branch. */
2886
2887 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
2888 }
2889
2890 /* Handle updating of the required and first characters. Update for normal
2891 brackets of all kinds, and conditions with two branches (see code above).
2892 If the bracket is followed by a quantifier with zero repeat, we have to
2893 back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
2894 main loop so that they can be accessed for the back off. */
2895
2896 zeroreqbyte = reqbyte;
2897 zerofirstbyte = firstbyte;
2898 groupsetfirstbyte = FALSE;
2899
2900 if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
2901 {
2902 /* If we have not yet set a firstbyte in this branch, take it from the
2903 subpattern, remembering that it was set here so that a repeat of more
2904 than one can replicate it as reqbyte if necessary. If the subpattern has
2905 no firstbyte, set "none" for the whole branch. In both cases, a zero
2906 repeat forces firstbyte to "none". */
2907
2908 if (firstbyte == REQ_UNSET)
2909 {
2910 if (subfirstbyte >= 0)
2911 {
2912 firstbyte = subfirstbyte;
2913 groupsetfirstbyte = TRUE;
2914 }
2915 else firstbyte = REQ_NONE;
2916 zerofirstbyte = REQ_NONE;
2917 }
2918
2919 /* If firstbyte was previously set, convert the subpattern's firstbyte
2920 into reqbyte if there wasn't one. */
2921
2922 else if (subfirstbyte >= 0 && subreqbyte < 0) subreqbyte = subfirstbyte;
2923
2924 /* If the subpattern set a required char (or set a first char that isn't
2925 really the first char - see above), set it. */
2926
2927 if (subreqbyte >= 0) reqbyte = subreqbyte;
2928 }
2929
2930 /* For a forward assertion, we take the reqbyte, if set. This can be
2931 helpful if the pattern that follows the assertion doesn't set a different
2932 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
2933 for an assertion, however, because it leads to an incorrect effect for patterns
2934 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
2935 of a firstbyte. This is overcome by a scan at the end if there's no
2936 firstbyte, looking for an asserted first char. */
2937
2938 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
2939
2940 /* Now update the main code pointer to the end of the group. */
2941
2942 code = tempcode;
2943
2944 /* Error if hit end of pattern */
2945
2946 if (*ptr != ')')
2947 {
2948 *errorptr = ERR14;
2949 goto FAILED;
2950 }
2951 break;
2952
2953 /* Check \ for being a real metacharacter; if not, fall through and handle
2954 it as a data character at the start of a string. Escape items are checked
2955 for validity in the pre-compiling pass. */
2956
2957 case '\\':
2958 tempptr = ptr;
2959 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
2960
2961 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
2962 are arranged to be the negation of the corresponding OP_values. For the
2963 back references, the values are ESC_REF plus the reference number. Only
2964 back references and those types that consume a character may be repeated.
2965 We can test for values between ESC_b and ESC_Z for the latter; this may
2966 have to change if any new ones are ever created. */
2967
2968 if (c < 0)
2969 {
2970 if (-c == ESC_Q) /* Handle start of quoted string */
2971 {
2972 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
2973 else inescq = TRUE;
2974 continue;
2975 }
2976
2977 /* For metasequences that actually match a character, we disable the
2978 setting of a first character if it hasn't already been set. */
2979
2980 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
2981 firstbyte = REQ_NONE;
2982
2983 /* Set values to reset to if this is followed by a zero repeat. */
2984
2985 zerofirstbyte = firstbyte;
2986 zeroreqbyte = reqbyte;
2987
2988 /* Back references are handled specially */
2989
2990 if (-c >= ESC_REF)
2991 {
2992 int number = -c - ESC_REF;
2993 previous = code;
2994 *code++ = OP_REF;
2995 PUT2INC(code, 0, number);
2996 }
2997 else
2998 {
2999 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
3000 *code++ = -c;
3001 }
3002 continue;
3003 }
3004
3005 /* Data character: reset and fall through */
3006
3007 ptr = tempptr;
3008 c = '\\';
3009
3010 /* Handle a run of data characters until a metacharacter is encountered.
3011 The first character is guaranteed not to be whitespace or # when the
3012 extended flag is set. */
3013
3014 NORMAL_CHAR:
3015 default:
3016 previous = code;
3017 *code = OP_CHARS;
3018 code += 2;
3019 length = 0;
3020
3021 do
3022 {
3023 /* If in \Q...\E, check for the end; if not, we always have a literal */
3024
3025 if (inescq)
3026 {
3027 if (c == '\\' && ptr[1] == 'E')
3028 {
3029 inescq = FALSE;
3030 ptr++;
3031 }
3032 else
3033 {
3034 *code++ = c;
3035 length++;
3036 }
3037 continue;
3038 }
3039
3040 /* Skip white space and comments for /x patterns */
3041
3042 if ((options & PCRE_EXTENDED) != 0)
3043 {
3044 if ((cd->ctypes[c] & ctype_space) != 0) continue;
3045 if (c == '#')
3046 {
3047 /* The space before the ; is to avoid a warning on a silly compiler
3048 on the Macintosh. */
3049 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
3050 if (c == 0) break;
3051 continue;
3052 }
3053 }
3054
3055 /* Backslash may introduce a data char or a metacharacter. Escaped items
3056 are checked for validity in the pre-compiling pass. Stop the string
3057 before a metaitem. */
3058
3059 if (c == '\\')
3060 {
3061 tempptr = ptr;
3062 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
3063 if (c < 0) { ptr = tempptr; break; }
3064
3065 /* If a character is > 127 in UTF-8 mode, we have to turn it into
3066 two or more characters in the UTF-8 encoding. */
3067
3068 #ifdef SUPPORT_UTF8
3069 if (utf8 && c > 127)
3070 {
3071 uschar buffer[8];
3072 int len = ord2utf8(c, buffer);
3073 for (c = 0; c < len; c++) *code++ = buffer[c];
3074 length += len;
3075 continue;
3076 }
3077 #endif
3078 }
3079
3080 /* Ordinary character or single-char escape */
3081
3082 *code++ = c;
3083 length++;
3084 }
3085
3086 /* This "while" is the end of the "do" above. */
3087
3088 while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
3089
3090 /* Update the first and last requirements. These are always bytes, even in
3091 UTF-8 mode. However, there is a special case to be considered when there
3092 are only one or two characters. Because this gets messy in UTF-8 mode, the
3093 code is kept separate. When we get here "length" contains the number of
3094 bytes. */
3095
3096 #ifdef SUPPORT_UTF8
3097 if (utf8 && length > 1)
3098 {
3099 uschar *t = previous + 3; /* After this code, t */
3100 while (t < code && (*t & 0xc0) == 0x80) t++; /* follows the 1st char */
3101
3102 /* Handle the case when there is only one multibyte character. It must
3103 have at least two bytes because of the "length > 1" test above. */
3104
3105 if (t == code)
3106 {
3107 /* If no previous first byte, set it from this character, but revert to
3108 none on a zero repeat. */
3109
3110 if (firstbyte == REQ_UNSET)
3111 {
3112 zerofirstbyte = REQ_NONE;
3113 firstbyte = previous[2];
3114 }
3115
3116 /* Otherwise, leave the first byte value alone, and don't change it on
3117 a zero repeat */
3118
3119 else zerofirstbyte = firstbyte;
3120
3121 /* In both cases, a zero repeat resets the previous required byte */
3122
3123 zeroreqbyte = reqbyte;
3124 }
3125
3126 /* Handle the case when there is more than one character. These may be
3127 single-byte or multibyte characters */
3128
3129 else
3130 {
3131 uschar *t = code - 1; /* After this code, t is at the */
3132 while ((*t & 0xc0) == 0x80) t--; /* start of the last character */
3133
3134 /* If no previous first byte, set it from the first character, and
3135 retain it on a zero repeat (of the last character). The required byte
3136 is reset on a zero repeat to the byte before the last
3137 character, unless this is the first byte of the string. In that case,
3138 it reverts to its previous value. */
3139
3140 if (firstbyte == REQ_UNSET)
3141 {
3142 zerofirstbyte = firstbyte = previous[2] | req_caseopt;
3143 zeroreqbyte = (t - 1 == previous + 2)? reqbyte : t[-1] | req_caseopt;
3144 }
3145
3146 /* If there was a previous first byte, leave it alone, and don't change
3147 it on a zero repeat. The required byte is reset on a zero repeat to the
3148 byte before the last character. */
3149
3150 else
3151 {
3152 zerofirstbyte = firstbyte;
3153 zeroreqbyte = t[-1] | req_caseopt;
3154 }
3155 }
3156
3157 /* In all cases (we know length > 1), the new required byte is the last
3158 byte of the string. */
3159
3160 reqbyte = code[-1] | req_caseopt;
3161 }
3162
3163 else /* End of UTF-8 coding */
3164 #endif
3165
3166 /* This is the code for non-UTF-8 operation, either without UTF-8 support,
3167 or when UTF-8 is not enabled. */
3168
3169 {
3170 /* firstbyte was not previously set; take it from this string */
3171
3172 if (firstbyte == REQ_UNSET)
3173 {
3174 if (length == 1)
3175 {
3176 zerofirstbyte = REQ_NONE;
3177 firstbyte = previous[2] | req_caseopt;
3178 zeroreqbyte = reqbyte;
3179 }
3180 else
3181 {
3182 zerofirstbyte = firstbyte = previous[2] | req_caseopt;
3183 zeroreqbyte = (length > 2)? (code[-2] | req_caseopt) : reqbyte;
3184 reqbyte = code[-1] | req_caseopt;
3185 }
3186 }
3187
3188 /* firstbyte was previously set */
3189
3190 else
3191 {
3192 zerofirstbyte = firstbyte;
3193 zeroreqbyte = (length == 1)? reqbyte : code[-2] | req_caseopt;
3194 reqbyte = code[-1] | req_caseopt;
3195 }
3196 }
3197
3198 /* Set the length in the data vector, and advance to the next state. */
3199
3200 previous[1] = length;
3201 if (length < MAXLIT) ptr--;
3202 break;
3203 }
3204 } /* end of big loop */
3205
3206 /* Control never reaches here by falling through, only by a goto for all the
3207 error states. Pass back the position in the pattern so that it can be displayed
3208 to the user for diagnosing the error. */
3209
3210 FAILED:
3211 *ptrptr = ptr;
3212 return FALSE;
3213 }
3214
3215
3216
3217
3218 /*************************************************
3219 * Compile sequence of alternatives *
3220 *************************************************/
3221
3222 /* On entry, ptr is pointing past the bracket character, but on return
3223 it points to the closing bracket, or vertical bar, or end of string.
3224 The code variable is pointing at the byte into which the BRA operator has been
3225 stored. If the ims options are changed at the start (for a (?ims: group) or
3226 during any branch, we need to insert an OP_OPT item at the start of every
3227 following branch to ensure they get set correctly at run time, and also pass
3228 the new options into every subsequent branch compile.
3229
3230 Argument:
3231 options option bits, including any changes for this subpattern
3232 oldims previous settings of ims option bits
3233 brackets -> int containing the number of extracting brackets used
3234 codeptr -> the address of the current code pointer
3235 ptrptr -> the address of the current pattern pointer
3236 errorptr -> pointer to error message
3237 lookbehind TRUE if this is a lookbehind assertion
3238 skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER)
3239 firstbyteptr place to put the first required character, or a negative number
3240 reqbyteptr place to put the last required character, or a negative number
3241 bcptr pointer to the chain of currently open branches
3242 cd points to the data block with tables pointers etc.
3243
3244 Returns: TRUE on success
3245 */
3246
3247 static BOOL
3248 compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3249 const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes,
3250 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
3251 {
3252 const uschar *ptr = *ptrptr;
3253 uschar *code = *codeptr;
3254 uschar *last_branch = code;
3255 uschar *start_bracket = code;
3256 uschar *reverse_count = NULL;
3257 int firstbyte, reqbyte;
3258 int branchfirstbyte, branchreqbyte;
3259 branch_chain bc;
3260
3261 bc.outer = bcptr;
3262 bc.current = code;
3263
3264 firstbyte = reqbyte = REQ_UNSET;
3265
3266 /* Offset is set zero to mark that this bracket is still open */
3267
3268 PUT(code, 1, 0);
3269 code += 1 + LINK_SIZE + skipbytes;
3270
3271 /* Loop for each alternative branch */
3272
3273 for (;;)
3274 {
3275 /* Handle a change of ims options at the start of the branch */
3276
3277 if ((options & PCRE_IMS) != oldims)
3278 {
3279 *code++ = OP_OPT;
3280 *code++ = options & PCRE_IMS;
3281 }
3282
3283 /* Set up dummy OP_REVERSE if lookbehind assertion */
3284
3285 if (lookbehind)
3286 {
3287 *code++ = OP_REVERSE;
3288 reverse_count = code;
3289 PUTINC(code, 0, 0);
3290 }
3291
3292 /* Now compile the branch */
3293
3294 if (!compile_branch(&options, brackets, &code, &ptr, errorptr,
3295 &branchfirstbyte, &branchreqbyte, &bc, cd))
3296 {
3297 *ptrptr = ptr;
3298 return FALSE;
3299 }
3300
3301 /* If this is the first branch, the firstbyte and reqbyte values for the
3302 branch become the values for the regex. */
3303
3304 if (*last_branch != OP_ALT)
3305 {
3306 firstbyte = branchfirstbyte;
3307 reqbyte = branchreqbyte;
3308 }
3309
3310 /* If this is not the first branch, the first char and reqbyte have to
3311 match the values from all the previous branches. */
3312
3313 else
3314 {
3315 /* If we previously had a firstbyte, but it doesn't match the new branch,
3316 we have to abandon the firstbyte for the regex, but if there was previously
3317 no reqbyte, it takes on the value of the old firstbyte. */
3318
3319 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
3320 {
3321 if (reqbyte < 0) reqbyte = firstbyte;
3322 firstbyte = REQ_NONE;
3323 }
3324
3325 /* If we (now or from before) have no firstbyte, a firstbyte from the
3326 branch becomes a reqbyte if there isn't a branch reqbyte. */
3327
3328 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
3329 branchreqbyte = branchfirstbyte;
3330
3331 /* Now ensure that the reqbytes match */
3332
3333 if (reqbyte != branchreqbyte) reqbyte = REQ_NONE;
3334 }
3335
3336 /* If lookbehind, check that this branch matches a fixed-length string,
3337 and put the length into the OP_REVERSE item. Temporarily mark the end of
3338 the branch with OP_END. */
3339
3340 if (lookbehind)
3341 {
3342 int length;
3343 *code = OP_END;
3344 length = find_fixedlength(last_branch, options);
3345 DPRINTF(("fixed length = %d\n", length));
3346 if (length < 0)
3347 {
3348 *errorptr = (length == -2)? ERR36 : ERR25;
3349 *ptrptr = ptr;
3350 return FALSE;
3351 }
3352 PUT(reverse_count, 0, length);
3353 }
3354
3355 /* Reached end of expression, either ')' or end of pattern. Go back through
3356 the alternative branches and reverse the chain of offsets, with the field in
3357 the BRA item now becoming an offset to the first alternative. If there are
3358 no alternatives, it points to the end of the group. The length in the
3359 terminating ket is always the length of the whole bracketed item. If any of
3360 the ims options were changed inside the group, compile a resetting op-code
3361 following, except at the very end of the pattern. Return leaving the pointer
3362 at the terminating char. */
3363
3364 if (*ptr != '|')
3365 {
3366 int length = code - last_branch;
3367 do
3368 {
3369 int prev_length = GET(last_branch, 1);
3370 PUT(last_branch, 1, length);
3371 length = prev_length;
3372 last_branch -= length;
3373 }
3374 while (length > 0);
3375
3376 /* Fill in the ket */
3377
3378 *code = OP_KET;
3379 PUT(code, 1, code - start_bracket);
3380 code += 1 + LINK_SIZE;
3381
3382 /* Resetting option if needed */
3383
3384 if ((options & PCRE_IMS) != oldims && *ptr == ')')
3385 {
3386 *code++ = OP_OPT;
3387 *code++ = oldims;
3388 }
3389
3390 /* Set values to pass back */
3391
3392 *codeptr = code;
3393 *ptrptr = ptr;
3394 *firstbyteptr = firstbyte;
3395 *reqbyteptr = reqbyte;
3396 return TRUE;
3397 }
3398
3399 /* Another branch follows; insert an "or" node. Its length field points back
3400 to the previous branch while the bracket remains open. At the end the chain
3401 is reversed. It's done like this so that the start of the bracket has a
3402 zero offset until it is closed, making it possible to detect recursion. */
3403
3404 *code = OP_ALT;
3405 PUT(code, 1, code - last_branch);
3406 bc.current = last_branch = code;
3407 code += 1 + LINK_SIZE;
3408 ptr++;
3409 }
3410 /* Control never reaches here */
3411 }
3412
3413
3414
3415
3416 /*************************************************
3417 * Check for anchored expression *
3418 *************************************************/
3419
3420 /* Try to find out if this is an anchored regular expression. Consider each
3421 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
3422 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
3423 it's anchored. However, if this is a multiline pattern, then only OP_SOD
3424 counts, since OP_CIRC can match in the middle.
3425
3426 We can also consider a regex to be anchored if OP_SOM starts all its branches.
3427 This is the code for \G, which means "match at start of match position, taking
3428 into account the match offset".
3429
3430 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
3431 because that will try the rest of the pattern at all possible matching points,
3432 so there is no point trying again.... er ....
3433
3434 .... except when the .* appears inside capturing parentheses, and there is a
3435 subsequent back reference to those parentheses. We haven't enough information
3436 to catch that case precisely.
3437
3438 At first, the best we could do was to detect when .* was in capturing brackets
3439 and the highest back reference was greater than or equal to that level.
3440 However, by keeping a bitmap of the first 31 back references, we can catch some
3441 of the more common cases more precisely.
3442
3443 Arguments:
3444 code points to start of expression (the bracket)
3445 options points to the options setting
3446 bracket_map a bitmap of which brackets we are inside while testing; this
3447 handles up to substring 31; after that we just have to take
3448 the less precise approach
3449 backref_map the back reference bitmap
3450
3451 Returns: TRUE or FALSE
3452 */
3453
3454 static BOOL
3455 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
3456 unsigned int backref_map)
3457 {
3458 do {
3459 const uschar *scode =
3460 first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE);
3461 register int op = *scode;
3462
3463 /* Capturing brackets */
3464
3465 if (op > OP_BRA)
3466 {
3467 int new_map;
3468 op -= OP_BRA;
3469 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3470 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3471 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
3472 }
3473
3474 /* Other brackets */
3475
3476 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3477 {
3478 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
3479 }
3480
3481 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
3482 are or may be referenced. */
3483
3484 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
3485 (*options & PCRE_DOTALL) != 0)
3486 {
3487 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3488 }
3489
3490 /* Check for explicit anchoring */
3491
3492 else if (op != OP_SOD && op != OP_SOM &&
3493 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
3494 return FALSE;
3495 code += GET(code, 1);
3496 }
3497 while (*code == OP_ALT); /* Loop for each alternative */
3498 return TRUE;
3499 }
3500
3501
3502
3503 /*************************************************
3504 * Check for starting with ^ or .* *
3505 *************************************************/
3506
3507 /* This is called to find out if every branch starts with ^ or .* so that
3508 "first char" processing can be done to speed things up in multiline
3509 matching and for non-DOTALL patterns that start with .* (which must start at
3510 the beginning or after \n). As in the case of is_anchored() (see above), we
3511 have to take account of back references to capturing brackets that contain .*
3512 because in that case we can't make the assumption.
3513
3514 Arguments:
3515 code points to start of expression (the bracket)
3516 bracket_map a bitmap of which brackets we are inside while testing; this
3517 handles up to substring 31; after that we just have to take
3518 the less precise approach
3519 backref_map the back reference bitmap
3520
3521 Returns: TRUE or FALSE
3522 */
3523
3524 static BOOL
3525 is_startline(const uschar *code, unsigned int bracket_map,
3526 unsigned int backref_map)
3527 {
3528 do {
3529 const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0);
3530 register int op = *scode;
3531
3532 /* Capturing brackets */
3533
3534 if (op > OP_BRA)
3535 {
3536 int new_map;
3537 op -= OP_BRA;
3538 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3539 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3540 if (!is_startline(scode, new_map, backref_map)) return FALSE;
3541 }
3542
3543 /* Other brackets */
3544
3545 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3546 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
3547
3548 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
3549 may be referenced. */
3550
3551 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
3552 {
3553 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3554 }
3555
3556 /* Check for explicit circumflex */
3557
3558 else if (op != OP_CIRC) return FALSE;
3559 code += GET(code, 1);
3560 }
3561 while (*code == OP_ALT); /* Loop for each alternative */
3562 return TRUE;
3563 }
3564
3565
3566
3567 /*************************************************
3568 * Check for asserted fixed first char *
3569 *************************************************/
3570
3571 /* During compilation, the "first char" settings from forward assertions are
3572 discarded, because they can cause conflicts with actual literals that follow.
3573 However, if we end up without a first char setting for an unanchored pattern,
3574 it is worth scanning the regex to see if there is an initial asserted first
3575 char. If all branches start with the same asserted char, or with a bracket all
3576 of whose alternatives start with the same asserted char (recurse ad lib), then
3577 we return that char, otherwise -1.
3578
3579 Arguments:
3580 code points to start of expression (the bracket)
3581 options pointer to the options (used to check casing changes)
3582 inassert TRUE if in an assertion
3583
3584 Returns: -1 or the fixed first char
3585 */
3586
3587 static int
3588 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
3589 {
3590 register int c = -1;
3591 do {
3592 int d;
3593 const uschar *scode =
3594 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS);
3595 register int op = *scode;
3596
3597 if (op >= OP_BRA) op = OP_BRA;
3598
3599 switch(op)
3600 {
3601 default:
3602 return -1;
3603
3604 case OP_BRA:
3605 case OP_ASSERT:
3606 case OP_ONCE:
3607 case OP_COND:
3608 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
3609 return -1;
3610 if (c < 0) c = d; else if (c != d) return -1;
3611 break;
3612
3613 case OP_EXACT: /* Fall through */
3614 scode++;
3615
3616 case OP_CHARS: /* Fall through */
3617 scode++;
3618
3619 case OP_PLUS:
3620 case OP_MINPLUS:
3621 if (!inassert) return -1;
3622 if (c < 0)
3623 {
3624 c = scode[1];
3625 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
3626 }
3627 else if (c != scode[1]) return -1;
3628 break;
3629 }
3630
3631 code += GET(code, 1);
3632 }
3633 while (*code == OP_ALT);
3634 return c;
3635 }
3636
3637
3638
3639
3640 /*************************************************
3641 * Compile a Regular Expression *
3642 *************************************************/
3643
3644 /* This function takes a string and returns a pointer to a block of store
3645 holding a compiled version of the expression.
3646
3647 Arguments:
3648 pattern the regular expression
3649 options various option bits
3650 errorptr pointer to pointer to error text
3651 erroroffset ptr offset in pattern where error was detected
3652 tables pointer to character tables or NULL
3653
3654 Returns: pointer to compiled data block, or NULL on error,
3655 with errorptr and erroroffset set
3656 */
3657
3658 pcre *
3659 pcre_compile(const char *pattern, int options, const char **errorptr,
3660 int *erroroffset, const unsigned char *tables)
3661 {
3662 real_pcre *re;
3663 int length = 1 + LINK_SIZE; /* For initial BRA plus length */
3664 int runlength;
3665 int c, firstbyte, reqbyte;
3666 int bracount = 0;
3667 int branch_extra = 0;
3668 int branch_newextra;
3669 int item_count = -1;
3670 int name_count = 0;
3671 int max_name_size = 0;
3672 #ifdef SUPPORT_UTF8
3673 int lastcharlength = 0;
3674 BOOL utf8;
3675 BOOL class_utf8;
3676 #endif
3677 BOOL inescq = FALSE;
3678 unsigned int brastackptr = 0;
3679 size_t size;
3680 uschar *code;
3681 const uschar *codestart;
3682 const uschar *ptr;
3683 compile_data compile_block;
3684 int brastack[BRASTACK_SIZE];
3685 uschar bralenstack[BRASTACK_SIZE];
3686
3687 /* We can't pass back an error message if errorptr is NULL; I guess the best we
3688 can do is just return NULL. */
3689
3690 if (errorptr == NULL) return NULL;
3691 *errorptr = NULL;
3692
3693 /* However, we can give a message for this error */
3694
3695 if (erroroffset == NULL)
3696 {
3697 *errorptr = ERR16;
3698 return NULL;
3699 }
3700 *erroroffset = 0;
3701
3702 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
3703
3704 #ifdef SUPPORT_UTF8
3705 utf8 = (options & PCRE_UTF8) != 0;
3706 #else
3707 if ((options & PCRE_UTF8) != 0)
3708 {
3709 *errorptr = ERR32;
3710 return NULL;
3711 }
3712 #endif
3713
3714 if ((options & ~PUBLIC_OPTIONS) != 0)
3715 {
3716 *errorptr = ERR17;
3717 return NULL;
3718 }
3719
3720 /* Set up pointers to the individual character tables */
3721
3722 if (tables == NULL) tables = pcre_default_tables;
3723 compile_block.lcc = tables + lcc_offset;
3724 compile_block.fcc = tables + fcc_offset;
3725 compile_block.cbits = tables + cbits_offset;
3726 compile_block.ctypes = tables + ctypes_offset;
3727
3728 /* Maximum back reference and backref bitmap. This is updated for numeric
3729 references during the first pass, but for named references during the actual
3730 compile pass. The bitmap records up to 31 back references to help in deciding
3731 whether (.*) can be treated as anchored or not. */
3732
3733 compile_block.top_backref = 0;
3734 compile_block.backref_map = 0;
3735
3736 /* Reflect pattern for debugging output */
3737
3738 DPRINTF(("------------------------------------------------------------------\n"));
3739 DPRINTF(("%s\n", pattern));
3740
3741 /* The first thing to do is to make a pass over the pattern to compute the
3742 amount of store required to hold the compiled code. This does not have to be
3743 perfect as long as errors are overestimates. At the same time we can detect any
3744 flag settings right at the start, and extract them. Make an attempt to correct
3745 for any counted white space if an "extended" flag setting appears late in the
3746 pattern. We can't be so clever for #-comments. */
3747
3748 ptr = (const uschar *)(pattern - 1);
3749 while ((c = *(++ptr)) != 0)
3750 {
3751 int min, max;
3752 int class_optcount;
3753 int bracket_length;
3754 int duplength;
3755
3756 /* If we are inside a \Q...\E sequence, all chars are literal */
3757
3758 if (inescq) goto NORMAL_CHAR;
3759
3760 /* Otherwise, first check for ignored whitespace and comments */
3761
3762 if ((options & PCRE_EXTENDED) != 0)
3763 {
3764 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
3765 if (c == '#')
3766 {
3767 /* The space before the ; is to avoid a warning on a silly compiler
3768 on the Macintosh. */
3769 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
3770 if (c == 0) break;
3771 continue;
3772 }
3773 }
3774
3775 item_count++; /* Is zero for the first non-comment item */
3776
3777 switch(c)
3778 {
3779 /* A backslashed item may be an escaped "normal" character or a
3780 character type. For a "normal" character, put the pointers and
3781 character back so that tests for whitespace etc. in the input
3782 are done correctly. */
3783
3784 case '\\':
3785 {
3786 const uschar *save_ptr = ptr;
3787 c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
3788 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3789 if (c >= 0)
3790 {
3791 ptr = save_ptr;
3792 c = '\\';
3793 goto NORMAL_CHAR;
3794 }
3795 }
3796
3797 /* If \Q, enter "literal" mode */
3798
3799 if (-c == ESC_Q)
3800 {
3801 inescq = TRUE;
3802 continue;
3803 }
3804
3805 /* Other escapes need one byte, and are of length one for repeats */
3806
3807 length++;
3808 #ifdef SUPPORT_UTF8
3809 lastcharlength = 1;
3810 #endif
3811
3812 /* A back reference needs an additional 2 bytes, plus either one or 5
3813 bytes for a repeat. We also need to keep the value of the highest
3814 back reference. */
3815
3816 if (c <= -ESC_REF)
3817 {
3818 int refnum = -c - ESC_REF;
3819 compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
3820 if (refnum > compile_block.top_backref)
3821 compile_block.top_backref = refnum;
3822 length += 2; /* For single back reference */
3823 if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
3824 {
3825 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
3826 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3827 if ((min == 0 && (max == 1 || max == -1)) ||
3828 (min == 1 && max == -1))
3829 length++;
3830 else length += 5;
3831 if (ptr[1] == '?') ptr++;
3832 }
3833 }
3834 continue;
3835
3836 case '^': /* Single-byte metacharacters */
3837 case '.':
3838 case '$':
3839 length++;
3840 #ifdef SUPPORT_UTF8
3841 lastcharlength = 1;
3842 #endif
3843 continue;
3844
3845 case '*': /* These repeats won't be after brackets; */
3846 case '+': /* those are handled separately */
3847 case '?':
3848 length++;
3849 goto POSESSIVE; /* A few lines below */
3850
3851 /* This covers the cases of braced repeats after a single char, metachar,
3852 class, or back reference. */
3853
3854 case '{':
3855 if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
3856 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
3857 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3858
3859 /* These special cases just insert one extra opcode */
3860
3861 if ((min == 0 && (max == 1 || max == -1)) ||
3862 (min == 1 && max == -1))
3863 length++;
3864
3865 /* These cases might insert additional copies of a preceding character. */
3866
3867 else
3868 {
3869 #ifdef SUPPORT_UTF8
3870 /* In UTF-8 mode, we should find the length in lastcharlength */
3871 if (utf8)
3872 {
3873 if (min != 1)
3874 {
3875 length -= lastcharlength; /* Uncount the original char or metachar */
3876 if (min > 0) length += 3 + lastcharlength;
3877 }
3878 length += lastcharlength + ((max > 0)? 3 : 1);
3879 }
3880 else
3881 #endif
3882
3883 /* Not UTF-8 mode: all characters are one byte */
3884 {
3885 if (min != 1)
3886 {
3887 length--; /* Uncount the original char or metachar */
3888 if (min > 0) length += 4;
3889 }
3890
3891 length += (max > 0)? 4 : 2;
3892 }
3893 }
3894
3895 if (ptr[1] == '?') ptr++; /* Needs no extra length */
3896
3897 POSESSIVE: /* Test for possessive quantifier */
3898 if (ptr[1] == '+')
3899 {
3900 ptr++;
3901 length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */
3902 }
3903 continue;
3904
3905 /* An alternation contains an offset to the next branch or ket. If any ims
3906 options changed in the previous branch(es), and/or if we are in a
3907 lookbehind assertion, extra space will be needed at the start of the
3908 branch. This is handled by branch_extra. */
3909
3910 case '|':
3911 length += 1 + LINK_SIZE + branch_extra;
3912 continue;
3913
3914 /* A character class uses 33 characters provided that all the character
3915 values are less than 256. Otherwise, it uses a bit map for low valued
3916 characters, and individual items for others. Don't worry about character
3917 types that aren't allowed in classes - they'll get picked up during the
3918 compile. A character class that contains only one single-byte character
3919 uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
3920 where we can. (In UTF-8 mode we can do this only for chars < 128.) */
3921
3922 case '[':
3923 class_optcount = 0;
3924
3925 #ifdef SUPPORT_UTF8
3926 class_utf8 = FALSE;
3927 #endif
3928
3929 if (*(++ptr) == '^') ptr++;
3930
3931 /* Written as a "do" so that an initial ']' is taken as data */
3932
3933 if (*ptr != 0) do
3934 {
3935 /* Inside \Q...\E everything is literal except \E */
3936
3937 if (inescq)
3938 {
3939 if (*ptr != '\\' || ptr[1] != 'E') goto NON_SPECIAL_CHARACTER;
3940 inescq = FALSE;
3941 ptr += 1;
3942 continue;
3943 }
3944
3945 /* Outside \Q...\E, check for escapes */
3946
3947 if (*ptr == '\\')
3948 {
3949 #ifdef SUPPORT_UTF8
3950 int prevchar = ptr[-1];
3951 #endif
3952 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
3953 &compile_block);
3954 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3955
3956 /* \b is backspace inside a class */
3957
3958 if (-ch == ESC_b) ch = '\b';
3959
3960 /* \Q enters quoting mode */
3961
3962 if (-ch == ESC_Q)
3963 {
3964 inescq = TRUE;
3965 continue;
3966 }
3967
3968 /* Handle escapes that turn into characters */
3969
3970 if (ch >= 0)
3971 {
3972 #ifdef SUPPORT_UTF8
3973 if (utf8)
3974 {
3975 if (ch > 127) class_optcount = 10; /* Ensure > 1 */
3976 if (ch > 255)
3977 {
3978 uschar buffer[6];
3979 if (!class_utf8)
3980 {
3981 class_utf8 = TRUE;
3982 length += LINK_SIZE + 1 + 1;
3983 }
3984 length += 1 + ord2utf8(ch, buffer);
3985
3986 /* If this wide character is preceded by '-', add an extra 2 to
3987 the length in case the previous character was < 128, because in
3988 this case the whole range will be put into the list. */
3989
3990 if (prevchar == '-') length += 2;
3991 }
3992 }
3993 #endif
3994 class_optcount++; /* for possible optimization */
3995 }
3996 else class_optcount = 10; /* \d, \s etc; make sure > 1 */
3997 }
3998
3999 /* Check the syntax for POSIX stuff. The bits we actually handle are
4000 checked during the real compile phase. */
4001
4002 else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))
4003 {
4004 ptr++;
4005 class_optcount = 10; /* Make sure > 1 */
4006 }
4007
4008 /* Anything else just increments the possible optimization count. If
4009 there are wide characters, we are going to have to use an XCLASS. */
4010
4011 else
4012 {
4013 NON_SPECIAL_CHARACTER:
4014 class_optcount++;
4015
4016 #ifdef SUPPORT_UTF8
4017 if (utf8)
4018 {
4019 int c;
4020 int extra = 0;
4021 GETCHARLEN(c, ptr, extra);
4022 if (c > 127) class_optcount = 10; /* No optimization possible */
4023 if (c > 255)
4024 {
4025 if (!class_utf8)
4026 {
4027 class_utf8 = TRUE;
4028 length += LINK_SIZE + 1 + 1;
4029 }
4030 length += 2 + extra;
4031
4032 /* If this wide character is preceded by '-', add an extra 2 to
4033 the length in case the previous character was < 128, because in
4034 this case the whole range will be put into the list. */
4035
4036 if (ptr[-1] == '-') length += 2;
4037
4038 /* Advance to the end of this character */
4039
4040 ptr += extra;
4041 }
4042 }
4043 #endif
4044 }
4045 }
4046 while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */
4047
4048 if (*ptr == 0) /* Missing terminating ']' */
4049 {
4050 *errorptr = ERR6;
4051 goto PCRE_ERROR_RETURN;
4052 }
4053
4054 /* We can optimize when there was only one optimizable character. Repeats
4055 for positive and negated single one-byte chars are handled by the general
4056 code. Here, we handle repeats for the class opcodes. */
4057
4058 if (class_optcount == 1) length += 3; else
4059 {
4060 length += 33;
4061
4062 /* A repeat needs either 1 or 5 bytes. */
4063
4064 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
4065 {
4066 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
4067 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4068 if ((min == 0 && (max == 1 || max == -1)) ||
4069 (min == 1 && max == -1))
4070 length++;
4071 else length += 5;
4072 if (ptr[1] == '?') ptr++;
4073 }
4074 }
4075 continue;
4076
4077 /* Brackets may be genuine groups or special things */
4078
4079 case '(':
4080 branch_newextra = 0;
4081 bracket_length = 1 + LINK_SIZE;
4082
4083 /* Handle special forms of bracket, which all start (? */
4084
4085 if (ptr[1] == '?')
4086 {
4087 int set, unset;
4088 int *optset;
4089
4090 switch (c = ptr[2])
4091 {
4092 /* Skip over comments entirely */
4093 case '#':
4094 ptr += 3;
4095 while (*ptr != 0 && *ptr != ')') ptr++;
4096 if (*ptr == 0)
4097 {
4098 *errorptr = ERR18;
4099 goto PCRE_ERROR_RETURN;
4100 }
4101 continue;
4102
4103 /* Non-referencing groups and lookaheads just move the pointer on, and
4104 then behave like a non-special bracket, except that they don't increment
4105 the count of extracting brackets. Ditto for the "once only" bracket,
4106 which is in Perl from version 5.005. */
4107
4108 case ':':
4109 case '=':
4110 case '!':
4111 case '>':
4112 ptr += 2;
4113 break;
4114
4115 /* (?R) specifies a recursive call to the regex, which is an extension
4116 to provide the facility which can be obtained by (?p{perl-code}) in
4117 Perl 5.6. In Perl 5.8 this has become (??{perl-code}).
4118
4119 From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to
4120 the appropriate numbered brackets. This includes both recursive and
4121 non-recursive calls. (?R) is now synonymous with (?0). */
4122
4123 case 'R':
4124 ptr++;
4125
4126 case '0': case '1': case '2': case '3': case '4':
4127 case '5': case '6': case '7': case '8': case '9':
4128 ptr += 2;
4129 if (c != 'R')
4130 while ((compile_block.ctypes[*(++ptr)] & ctype_digit) != 0);
4131 if (*ptr != ')')
4132 {
4133 *errorptr = ERR29;
4134 goto PCRE_ERROR_RETURN;
4135 }
4136 length += 1 + LINK_SIZE;
4137
4138 /* If this item is quantified, it will get wrapped inside brackets so
4139 as to use the code for quantified brackets. We jump down and use the
4140 code that handles this for real brackets. */
4141
4142 if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')
4143 {
4144 length += 2 + 2 * LINK_SIZE; /* to make bracketed */
4145 duplength = 5 + 3 * LINK_SIZE;
4146 goto HANDLE_QUANTIFIED_BRACKETS;
4147 }
4148 continue;
4149
4150 /* (?C) is an extension which provides "callout" - to provide a bit of
4151 the functionality of the Perl (?{...}) feature. An optional number may
4152 follow (default is zero). */
4153
4154 case 'C':
4155 ptr += 2;
4156 while ((compile_block.ctypes[*(++ptr)] & ctype_digit) != 0);
4157 if (*ptr != ')')
4158 {
4159 *errorptr = ERR39;
4160 goto PCRE_ERROR_RETURN;
4161 }
4162 length += 2;
4163 continue;
4164
4165 /* Named subpatterns are an extension copied from Python */
4166
4167 case 'P':
4168 ptr += 3;
4169 if (*ptr == '<')
4170 {
4171 const uschar *p = ++ptr;
4172 while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;
4173 if (*ptr != '>')
4174 {
4175 *errorptr = ERR42;
4176 goto PCRE_ERROR_RETURN;
4177 }
4178 name_count++;
4179 if (ptr - p > max_name_size) max_name_size = (ptr - p);
4180 break;
4181 }
4182
4183 if (*ptr == '=' || *ptr == '>')
4184 {
4185 while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0);
4186 if (*ptr != ')')
4187 {
4188 *errorptr = ERR42;
4189 goto PCRE_ERROR_RETURN;
4190 }
4191 break;
4192 }
4193
4194 /* Unknown character after (?P */
4195
4196 *errorptr = ERR41;
4197 goto PCRE_ERROR_RETURN;
4198
4199 /* Lookbehinds are in Perl from version 5.005 */
4200
4201 case '<':
4202 ptr += 3;
4203 if (*ptr == '=' || *ptr == '!')
4204 {
4205 branch_newextra = 1 + LINK_SIZE;
4206 length += 1 + LINK_SIZE; /* For the first branch */
4207 break;
4208 }
4209 *errorptr = ERR24;
4210 goto PCRE_ERROR_RETURN;
4211
4212 /* Conditionals are in Perl from version 5.005. The bracket must either
4213 be followed by a number (for bracket reference) or by an assertion
4214 group, or (a PCRE extension) by 'R' for a recursion test. */
4215
4216 case '(':
4217 if (ptr[3] == 'R' && ptr[4] == ')')
4218 {
4219 ptr += 4;
4220 length += 3;
4221 }
4222 else if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
4223 {
4224 ptr += 4;
4225 length += 3;
4226 while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
4227 if (*ptr != ')')
4228 {
4229 *errorptr = ERR26;
4230 goto PCRE_ERROR_RETURN;
4231 }
4232 }
4233 else /* An assertion must follow */
4234 {
4235 ptr++; /* Can treat like ':' as far as spacing is concerned */
4236 if (ptr[2] != '?' ||
4237 (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
4238 {
4239 ptr += 2; /* To get right offset in message */
4240 *errorptr = ERR28;
4241 goto PCRE_ERROR_RETURN;
4242 }
4243 }
4244 break;
4245
4246 /* Else loop checking valid options until ) is met. Anything else is an
4247 error. If we are without any brackets, i.e. at top level, the settings
4248 act as if specified in the options, so massage the options immediately.
4249 This is for backward compatibility with Perl 5.004. */
4250
4251 default:
4252 set = unset = 0;
4253 optset = &set;
4254 ptr += 2;
4255
4256 for (;; ptr++)
4257 {
4258 c = *ptr;
4259 switch (c)
4260 {
4261 case 'i':
4262 *optset |= PCRE_CASELESS;
4263 continue;
4264
4265 case 'm':
4266 *optset |= PCRE_MULTILINE;
4267 continue;
4268
4269 case 's':
4270 *optset |= PCRE_DOTALL;
4271 continue;
4272
4273 case 'x':
4274 *optset |= PCRE_EXTENDED;
4275 continue;
4276
4277 case 'X':
4278 *optset |= PCRE_EXTRA;
4279 continue;
4280
4281 case 'U':
4282 *optset |= PCRE_UNGREEDY;
4283 continue;
4284
4285 case '-':
4286 optset = &unset;
4287 continue;
4288
4289 /* A termination by ')' indicates an options-setting-only item; if
4290 this is at the very start of the pattern (indicated by item_count
4291 being zero), we use it to set the global options. This is helpful
4292 when analyzing the pattern for first characters, etc. Otherwise
4293 nothing is done here and it is handled during the compiling
4294 process.
4295
4296 [Historical note: Up to Perl 5.8, options settings at top level
4297 were always global settings, wherever they appeared in the pattern.
4298 That is, they were equivalent to an external setting. From 5.8
4299 onwards, they apply only to what follows (which is what you might
4300 expect).] */
4301
4302 case ')':
4303 if (item_count == 0)
4304 {
4305 options = (options | set) & (~unset);
4306 set = unset = 0; /* To save length */
4307 item_count--; /* To allow for several */
4308 }
4309
4310 /* Fall through */
4311
4312 /* A termination by ':' indicates the start of a nested group with
4313 the given options set. This is again handled at compile time, but
4314 we must allow for compiled space if any of the ims options are
4315 set. We also have to allow for resetting space at the end of
4316 the group, which is why 4 is added to the length and not just 2.
4317 If there are several changes of options within the same group, this
4318 will lead to an over-estimate on the length, but this shouldn't
4319 matter very much. We also have to allow for resetting options at
4320 the start of any alternations, which we do by setting
4321 branch_newextra to 2. Finally, we record whether the case-dependent
4322 flag ever changes within the regex. This is used by the "required
4323 character" code. */
4324
4325 case ':':
4326 if (((set|unset) & PCRE_IMS) != 0)
4327 {
4328 length += 4;
4329 branch_newextra = 2;
4330 if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
4331 }
4332 goto END_OPTIONS;
4333
4334 /* Unrecognized option character */
4335
4336 default:
4337 *errorptr = ERR12;
4338 goto PCRE_ERROR_RETURN;
4339 }
4340 }
4341
4342 /* If we hit a closing bracket, that's it - this is a freestanding
4343 option-setting. We need to ensure that branch_extra is updated if
4344 necessary. The only values branch_newextra can have here are 0 or 2.
4345 If the value is 2, then branch_extra must either be 2 or 5, depending
4346 on whether this is a lookbehind group or not. */
4347
4348 END_OPTIONS:
4349 if (c == ')')
4350 {
4351 if (branch_newextra == 2 &&
4352 (branch_extra == 0 || branch_extra == 1+LINK_SIZE))
4353 branch_extra += branch_newextra;
4354 continue;
4355 }
4356
4357 /* If options were terminated by ':' control comes here. Fall through
4358 to handle the group below. */
4359 }
4360 }
4361
4362 /* Extracting brackets must be counted so we can process escapes in a
4363 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
4364 need an additional 3 bytes of store per extracting bracket. However, if
4365 PCRE_NO_AUTO_CAPTURE is set, unadorned brackets become non-capturing, so we
4366 must leave the count alone (it will always be zero). */
4367
4368 else if ((options & PCRE_NO_AUTO_CAPTURE) == 0)
4369 {
4370 bracount++;
4371 if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
4372 }
4373
4374 /* Save length for computing whole length at end if there's a repeat that
4375 requires duplication of the group. Also save the current value of
4376 branch_extra, and start the new group with the new value. If non-zero, this
4377 will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
4378
4379 if (brastackptr >= sizeof(brastack)/sizeof(int))
4380 {
4381 *errorptr = ERR19;
4382 goto PCRE_ERROR_RETURN;
4383 }
4384
4385 bralenstack[brastackptr] = branch_extra;
4386 branch_extra = branch_newextra;
4387
4388 brastack[brastackptr++] = length;
4389 length += bracket_length;
4390 continue;
4391
4392 /* Handle ket. Look for subsequent max/min; for certain sets of values we
4393 have to replicate this bracket up to that many times. If brastackptr is
4394 0 this is an unmatched bracket which will generate an error, but take care
4395 not to try to access brastack[-1] when computing the length and restoring
4396 the branch_extra value. */
4397
4398 case ')':
4399 length += 1 + LINK_SIZE;
4400 if (brastackptr > 0)
4401 {
4402 duplength = length - brastack[--brastackptr];
4403 branch_extra = bralenstack[brastackptr];
4404 }
4405 else duplength = 0;
4406
4407 /* The following code is also used when a recursion such as (?3) is
4408 followed by a quantifier, because in that case, it has to be wrapped inside
4409 brackets so that the quantifier works. The value of duplength must be
4410 set before arrival. */
4411
4412 HANDLE_QUANTIFIED_BRACKETS:
4413
4414 /* Leave ptr at the final char; for read_repeat_counts this happens
4415 automatically; for the others we need an increment. */
4416
4417 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
4418 {
4419 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
4420 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4421 }
4422 else if (c == '*') { min = 0; max = -1; ptr++; }
4423 else if (c == '+') { min = 1; max = -1; ptr++; }
4424 else if (c == '?') { min = 0; max = 1; ptr++; }
4425 else { min = 1; max = 1; }
4426
4427 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
4428 group, and if the maximum is greater than zero, we have to replicate
4429 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
4430 bracket set. */
4431
4432 if (min == 0)
4433 {
4434 length++;
4435 if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
4436 }
4437
4438 /* When the minimum is greater than zero, we have to replicate up to
4439 minval-1 times, with no additions required in the copies. Then, if there
4440 is a limited maximum we have to replicate up to maxval-1 times allowing
4441 for a BRAZERO item before each optional copy and nesting brackets for all
4442 but one of the optional copies. */
4443
4444 else
4445 {
4446 length += (min - 1) * duplength;
4447 if (max > min) /* Need this test as max=-1 means no limit */
4448 length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
4449 - (2 + 2*LINK_SIZE);
4450 }
4451
4452 /* Allow space for once brackets for "possessive quantifier" */
4453
4454 if (ptr[1] == '+')
4455 {
4456 ptr++;
4457 length += 2 + 2*LINK_SIZE;
4458 }
4459 continue;
4460
4461 /* Non-special character. For a run of such characters the length required
4462 is the number of characters + 2, except that the maximum run length is
4463 MAXLIT. We won't get a skipped space or a non-data escape or the start of a
4464 # comment as the first character, so the length can't be zero. */
4465
4466 NORMAL_CHAR:
4467 default:
4468 length += 2;
4469 runlength = 0;
4470 do
4471 {
4472 #ifdef SUPPORT_UTF8
4473 lastcharlength = 1; /* Need length of last char for UTF-8 repeats */
4474 #endif
4475
4476 /* If in a \Q...\E sequence, check for end; otherwise it's a literal */
4477 if (inescq)
4478 {
4479 if (c == '\\' && ptr[1] == 'E')
4480 {
4481 inescq = FALSE;
4482 ptr++;
4483 }
4484 else runlength++;
4485 continue;
4486 }
4487
4488 /* Skip whitespace and comments for /x */
4489
4490 if ((options & PCRE_EXTENDED) != 0)
4491 {
4492 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
4493 if (c == '#')
4494 {
4495 /* The space before the ; is to avoid a warning on a silly compiler
4496 on the Macintosh. */
4497 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
4498 continue;
4499 }
4500 }
4501
4502 /* Backslash may introduce a data char or a metacharacter; stop the
4503 string before the latter. */
4504
4505 if (c == '\\')
4506 {
4507 const uschar *saveptr = ptr;
4508 c = check_escape(&ptr, errorptr, bracount, options, FALSE,
4509 &compile_block);
4510 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4511 if (c < 0) { ptr = saveptr; break; }
4512
4513 /* In UTF-8 mode, add on the number of additional bytes needed to
4514 encode this character, and save the total length in case this is a
4515 final char that is repeated. */
4516
4517 #ifdef SUPPORT_UTF8
4518 if (utf8 && c > 127)
4519 {
4520 int i;
4521 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
4522 if (c <= utf8_table1[i]) break;
4523 runlength += i;
4524 lastcharlength += i;
4525 }
4526 #endif
4527 }
4528
4529 /* Ordinary character or single-char escape */
4530
4531 runlength++;
4532 }
4533
4534 /* This "while" is the end of the "do" above. */
4535
4536 while (runlength < MAXLIT &&
4537 (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
4538
4539 /* If we hit a meta-character, back off to point to it */
4540
4541 if (runlength < MAXLIT) ptr--;
4542
4543 /* If the last char in the string is a UTF-8 multibyte character, we must
4544 set lastcharlength correctly. If it was specified as an escape, this will
4545 already have been done above. However, we also have to support in-line
4546 UTF-8 characters, so check backwards from where we are. */
4547
4548 #ifdef SUPPORT_UTF8
4549 if (utf8)
4550 {
4551 const uschar *lastptr = ptr - 1;
4552 if ((*lastptr & 0x80) != 0)
4553 {
4554 while((*lastptr & 0xc0) == 0x80) lastptr--;
4555 lastcharlength = ptr - lastptr;
4556 }
4557 }
4558 #endif
4559
4560 length += runlength;
4561 continue;
4562 }
4563 }
4564
4565 length += 2 + LINK_SIZE; /* For final KET and END */
4566
4567 if (length > MAX_PATTERN_SIZE)
4568 {
4569 *errorptr = ERR20;
4570 return NULL;
4571 }
4572
4573 /* Compute the size of data block needed and get it, either from malloc or
4574 externally provided function. */
4575
4576 size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
4577 re = (real_pcre *)(pcre_malloc)(size);
4578
4579 if (re == NULL)
4580 {
4581 *errorptr = ERR21;
4582 return NULL;
4583 }
4584
4585 /* Put in the magic number, and save the size, options, and table pointer */
4586
4587 re->magic_number = MAGIC_NUMBER;
4588 re->size = size;
4589 re->options = options;
4590 re->tables = tables;
4591 re->name_entry_size = max_name_size + 3;
4592 re->name_count = name_count;
4593
4594 /* The starting points of the name/number translation table and of the code are
4595 passed around in the compile data block. */
4596
4597 compile_block.names_found = 0;
4598 compile_block.name_entry_size = max_name_size + 3;
4599 compile_block.name_table = (uschar *)re + sizeof(real_pcre);
4600 codestart = compile_block.name_table + re->name_entry_size * re->name_count;
4601 compile_block.start_code = codestart;
4602
4603 /* Set up a starting, non-extracting bracket, then compile the expression. On
4604 error, *errorptr will be set non-NULL, so we don't need to look at the result
4605 of the function here. */
4606
4607 ptr = (const uschar *)pattern;
4608 code = (uschar *)codestart;
4609 *code = OP_BRA;
4610 bracount = 0;
4611 (void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,
4612 errorptr, FALSE, 0, &firstbyte, &reqbyte, NULL, &compile_block);
4613 re->top_bracket = bracount;
4614 re->top_backref = compile_block.top_backref;
4615
4616 /* If not reached end of pattern on success, there's an excess bracket. */
4617
4618 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
4619
4620 /* Fill in the terminating state and check for disastrous overflow, but
4621 if debugging, leave the test till after things are printed out. */
4622
4623 *code++ = OP_END;
4624
4625 #ifndef DEBUG
4626 if (code - codestart > length) *errorptr = ERR23;
4627 #endif
4628
4629 /* Give an error if there's back reference to a non-existent capturing
4630 subpattern. */
4631
4632 if (re->top_backref > re->top_bracket) *errorptr = ERR15;
4633
4634 /* Failed to compile, or error while post-processing */
4635
4636 if (*errorptr != NULL)
4637 {
4638 (pcre_free)(re);
4639 PCRE_ERROR_RETURN:
4640 *erroroffset = ptr - (const uschar *)pattern;
4641 return NULL;
4642 }
4643
4644 /* If the anchored option was not passed, set the flag if we can determine that
4645 the pattern is anchored by virtue of ^ characters or \A or anything else (such
4646 as starting with .* when DOTALL is set).
4647
4648 Otherwise, if we know what the first character has to be, save it, because that
4649 speeds up unanchored matches no end. If not, see if we can set the
4650 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
4651 start with ^. and also when all branches start with .* for non-DOTALL matches.
4652 */
4653
4654 if ((options & PCRE_ANCHORED) == 0)
4655 {
4656 int temp_options = options;
4657 if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map))
4658 re->options |= PCRE_ANCHORED;
4659 else
4660 {
4661 if (firstbyte < 0)
4662 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
4663 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
4664 {
4665 int ch = firstbyte & 255;
4666 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
4667 compile_block.fcc[ch] == ch)? ch : firstbyte;
4668 re->options |= PCRE_FIRSTSET;
4669 }
4670 else if (is_startline(codestart, 0, compile_block.backref_map))
4671 re->options |= PCRE_STARTLINE;
4672 }
4673 }
4674
4675 /* Save the last required character if any. Remove caseless flag for
4676 non-caseable chars. */
4677
4678 if ((re->options & PCRE_ANCHORED) != 0 && reqbyte < 0 && firstbyte >= 0)
4679 reqbyte = firstbyte;
4680
4681 if (reqbyte >= 0)
4682 {
4683 int ch = reqbyte & 255;
4684 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
4685 compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
4686 re->options |= PCRE_REQCHSET;
4687 }
4688
4689 /* Print out the compiled data for debugging */
4690
4691 #ifdef DEBUG
4692
4693 printf("Length = %d top_bracket = %d top_backref = %d\n",
4694 length, re->top_bracket, re->top_backref);
4695
4696 if (re->options != 0)
4697 {
4698 printf("%s%s%s%s%s%s%s%s%s\n",
4699 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
4700 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
4701 ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
4702 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
4703 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
4704 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
4705 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
4706 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
4707 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
4708 }
4709
4710 if ((re->options & PCRE_FIRSTSET) != 0)
4711 {
4712 int ch = re->first_byte & 255;
4713 char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
4714 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
4715 else printf("First char = \\x%02x%s\n", ch, caseless);
4716 }
4717
4718 if ((re->options & PCRE_REQCHSET) != 0)
4719 {
4720 int ch = re->req_byte & 255;
4721 char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
4722 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
4723 else printf("Req char = \\x%02x%s\n", ch, caseless);
4724 }
4725
4726 print_internals(re, stdout);
4727
4728 /* This check is done here in the debugging case so that the code that
4729 was compiled can be seen. */
4730
4731 if (code - codestart > length)
4732 {
4733 *errorptr = ERR23;
4734 (pcre_free)(re);
4735 *erroroffset = ptr - (uschar *)pattern;
4736 return NULL;
4737 }
4738 #endif
4739
4740 return (pcre *)re;
4741 }
4742
4743
4744
4745 /*************************************************
4746 * Match a back-reference *
4747 *************************************************/
4748
4749 /* If a back reference hasn't been set, the length that is passed is greater
4750 than the number of characters left in the string, so the match fails.
4751
4752 Arguments:
4753 offset index into the offset vector
4754 eptr points into the subject
4755 length length to be matched
4756 md points to match data block
4757 ims the ims flags
4758
4759 Returns: TRUE if matched
4760 */
4761
4762 static BOOL
4763 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
4764 unsigned long int ims)
4765 {
4766 const uschar *p = md->start_subject + md->offset_vector[offset];
4767
4768 #ifdef DEBUG
4769 if (eptr >= md->end_subject)
4770 printf("matching subject <null>");
4771 else
4772 {
4773 printf("matching subject ");
4774 pchars(eptr, length, TRUE, md);
4775 }
4776 printf(" against backref ");
4777 pchars(p, length, FALSE, md);
4778 printf("\n");
4779 #endif
4780
4781 /* Always fail if not enough characters left */
4782
4783 if (length > md->end_subject - eptr) return FALSE;
4784
4785 /* Separate the caselesss case for speed */
4786
4787 if ((ims & PCRE_CASELESS) != 0)
4788 {
4789 while (length-- > 0)
4790 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
4791 }
4792 else
4793 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
4794
4795 return TRUE;
4796 }
4797
4798
#ifdef SUPPORT_UTF8
/*************************************************
*       Match character against an XCLASS        *
*************************************************/

/* Test a character against an extended class, which might match values > 255.
The class data, as read by this function, is laid out as follows:

  . one flag byte, tested for XCL_NOT (negated class) and XCL_MAP (bitmap
    present);
  . if XCL_MAP is set, a 32-byte bitmap that is consulted for characters below
    256;
  . a list of items terminated by XCL_END. An XCL_SINGLE item carries one
    character value; any other item carries two values that are the inclusive
    ends of a range.

Characters below 256 that are not in the bitmap are still checked against the
list, because a range in the list may start below 256.

Arguments:
  c           the character
  data        points to the flag byte of the XCLASS data

Returns:      TRUE if character matches, else FALSE
*/

static BOOL
match_xclass(int c, const uschar *data)
{
const int flags = *data++;                  /* data now points past the flags */
const BOOL hit = (flags & XCL_NOT) == 0;    /* result when c is in the list */

/* Consult the bitmap, if there is one, for small values, then step over it. */

if ((flags & XCL_MAP) != 0)
  {
  if (c < 256 && (data[c/8] & (1 << (c&7))) != 0) return hit;
  data += 32;
  }

/* Walk the list of single characters and ranges. GETCHARINC is kept as a
statement of its own inside braces, in case the macro expands to more than one
statement. */

for (;;)
  {
  int lo, hi;
  int kind = *data++;

  if (kind == XCL_END) break;

  GETCHARINC(lo, data);
  if (kind == XCL_SINGLE)
    {
    hi = lo;                  /* a single character is a one-element range */
    }
  else
    {
    GETCHARINC(hi, data);
    }

  if (c >= lo && c <= hi) return hit;
  }

return !hit;   /* char was not found */
}
#endif
4853
4854
4855
4856
4857 /*************************************************
4858 * Match from current position *
4859 *************************************************/
4860
4861 /* On entry ecode points to the first opcode, and eptr to the first character
4862 in the subject string, while eptrb holds the value of eptr at the start of the
4863 last bracketed group - used for breaking infinite loops matching zero-length
4864 strings. This function is called recursively in many circumstances. Whenever it
4865 returns a negative (error) response, the outer incarnation must also return the
4866 same response.
4867
4868 Performance note: It might be tempting to extract commonly used fields from the
4869 md structure (e.g. utf8, end_subject) into individual variables to improve
4870 performance. Tests using gcc on a SPARC disproved this; in the first case, it
4871 made performance worse.
4872
4873 Arguments:
4874 eptr pointer in subject
4875 ecode position in code
4876 offset_top current top pointer
4877 md pointer to "static" info for the match
4878 ims current /i, /m, and /s options
4879 eptrb pointer to chain of blocks containing eptr at start of
4880 brackets - for testing for empty matches
4881 flags can contain
4882 match_condassert - this is an assertion condition
4883 match_isgroup - this is the start of a bracketed group
4884
4885 Returns: MATCH_MATCH if matched ) these values are >= 0
4886 MATCH_NOMATCH if failed to match )
4887 a negative PCRE_ERROR_xxx value if aborted by an error condition
4888 (e.g. stopped by recursion limit)
4889 */
4890
4891 static int
4892 match(register const uschar *eptr, register const uschar *ecode,
4893 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
4894 int flags)
4895 {
4896 unsigned long int original_ims = ims; /* Save for resetting on ')' */
4897 register int rrc;
4898 eptrblock newptrb;
4899
4900 if (md->match_call_count++ >= md->match_limit) return PCRE_ERROR_MATCHLIMIT;
4901
4902 /* At the start of a bracketed group, add the current subject pointer to the
4903 stack of such pointers, to be re-instated at the end of the group when we hit
4904 the closing ket. When match() is called in other circumstances, we don't add to
4905 the stack. */
4906
4907 if ((flags & match_isgroup) != 0)
4908 {
4909 newptrb.prev = eptrb;
4910 newptrb.saved_eptr = eptr;
4911 eptrb = &newptrb;
4912 }
4913
4914 /* Now start processing the operations. */
4915
4916 for (;;)
4917 {
4918 int op = (int)*ecode;
4919 int min, max, ctype;
4920 register int i;
4921 register int c;
4922 BOOL minimize = FALSE;
4923
4924 /* Opening capturing bracket. If there is space in the offset vector, save
4925 the current subject position in the working slot at the top of the vector. We
4926 mustn't change the current values of the data slot, because they may be set
4927 from a previous iteration of this group, and be referred to by a reference
4928 inside the group.
4929
4930 If the bracket fails to match, we need to restore this value and also the
4931 values of the final offsets, in case they were set by a previous iteration of
4932 the same bracket.
4933
4934 If there isn't enough space in the offset vector, treat this as if it were a
4935 non-capturing bracket. Don't worry about setting the flag for the error case
4936 here; that is handled in the code for KET. */
4937
4938 if (op > OP_BRA)
4939 {
4940 int offset;
4941 int number = op - OP_BRA;
4942
4943 /* For extended extraction brackets (large number), we have to fish out the
4944 number from a dummy opcode at the start. */
4945
4946 if (number > EXTRACT_BASIC_MAX)
4947 number = GET2(ecode, 2+LINK_SIZE);
4948 offset = number << 1;
4949
4950 #ifdef DEBUG
4951 printf("start bracket %d subject=", number);
4952 pchars(eptr, 16, TRUE, md);
4953 printf("\n");
4954 #endif
4955
4956 if (offset < md->offset_max)
4957 {
4958 int save_offset1 = md->offset_vector[offset];
4959 int save_offset2 = md->offset_vector[offset+1];
4960 int save_offset3 = md->offset_vector[md->offset_end - number];
4961 int save_capture_last = md->capture_last;
4962
4963 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
4964 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
4965
4966 do
4967 {
4968 if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
4969 eptrb, match_isgroup)) != MATCH_NOMATCH) return rrc;
4970 md->capture_last = save_capture_last;
4971 ecode += GET(ecode, 1);
4972 }
4973 while (*ecode == OP_ALT);
4974
4975 DPRINTF(("bracket %d failed\n", number));
4976
4977 md->offset_vector[offset] = save_offset1;
4978 md->offset_vector[offset+1] = save_offset2;
4979 md->offset_vector[md->offset_end - number] = save_offset3;
4980
4981 return MATCH_NOMATCH;
4982 }
4983
4984 /* Insufficient room for saving captured contents */
4985
4986 else op = OP_BRA;
4987 }
4988
4989 /* Other types of node can be handled by a switch */
4990
4991 switch(op)
4992 {
4993 case OP_BRA: /* Non-capturing bracket: optimized */
4994 DPRINTF(("start bracket 0\n"));
4995 do
4996 {
4997 if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
4998 match_isgroup)) != MATCH_NOMATCH) return rrc;
4999 ecode += GET(ecode, 1);
5000 }
5001 while (*ecode == OP_ALT);
5002 DPRINTF(("bracket 0 failed\n"));
5003 return MATCH_NOMATCH;
5004
5005 /* Conditional group: compilation checked that there are no more than
5006 two branches. If the condition is false, skipping the first branch takes us
5007 past the end if there is only one branch, but that's OK because that is
5008 exactly what going to the ket would do. */
5009
5010 case OP_COND:
5011 if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */
5012 {
5013 int offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
5014 BOOL condition = (offset == CREF_RECURSE * 2)?
5015 (md->recursive != NULL) :
5016 (offset < offset_top && md->offset_vector[offset] >= 0);
5017 return match(eptr, ecode + (condition?
5018 (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1))),
5019 offset_top, md, ims, eptrb, match_isgroup);
5020 }
5021
5022 /* The condition is an assertion. Call match() to evaluate it - setting
5023 the final argument TRUE causes it to stop at the end of an assertion. */
5024
5025 else
5026 {
5027 if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
5028 match_condassert | match_isgroup)) == MATCH_MATCH)
5029 {
5030 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2);
5031 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
5032 }
5033 else if (rrc != MATCH_NOMATCH) return rrc;
5034 else ecode += GET(ecode, 1);
5035 return match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
5036 match_isgroup);
5037 }
5038 /* Control never reaches here */
5039
5040 /* Skip over conditional reference or large extraction number data if
5041 encountered. */
5042
5043 case OP_CREF:
5044 case OP_BRANUMBER:
5045 ecode += 3;
5046 break;
5047
5048 /* End of the pattern. If we are in a recursion, we should restore the
5049 offsets appropriately and continue from after the call. */
5050
5051 case OP_END:
5052 if (md->recursive != NULL && md->recursive->group_num == 0)
5053 {
5054 recursion_info *rec = md->recursive;
5055 DPRINTF(("Hit the end in a (?0) recursion\n"));
5056 md->recursive = rec->prev;
5057 memmove(md->offset_vector, rec->offset_save,
5058 rec->saved_max * sizeof(int));
5059 md->start_match = rec->save_start;
5060 ims = original_ims;
5061 ecode = rec->after_call;
5062 break;
5063 }
5064
5065 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
5066 string - backtracking will then try other alternatives, if any. */
5067
5068 if (md->notempty && eptr == md->start_match) return MATCH_NOMATCH;
5069 md->end_match_ptr = eptr; /* Record where we ended */
5070 md->end_offset_top = offset_top; /* and how many extracts were taken */
5071 return MATCH_MATCH;
5072
5073 /* Change option settings */
5074
5075 case OP_OPT:
5076 ims = ecode[1];
5077 ecode += 2;
5078 DPRINTF(("ims set to %02lx\n", ims));
5079 break;
5080
5081 /* Assertion brackets. Check the alternative branches in turn - the
5082 matching won't pass the KET for an assertion. If any one branch matches,
5083 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
5084 start of each branch to move the current point backwards, so the code at
5085 this level is identical to the lookahead case. */
5086
5087 case OP_ASSERT:
5088 case OP_ASSERTBACK:
5089 do
5090 {
5091 if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
5092 match_isgroup)) == MATCH_MATCH) break;
5093 if (rrc != MATCH_NOMATCH) return rrc;
5094 ecode += GET(ecode, 1);
5095 }
5096 while (*ecode == OP_ALT);
5097 if (*ecode == OP_KET) return MATCH_NOMATCH;
5098
5099 /* If checking an assertion for a condition, return MATCH_MATCH. */
5100
5101 if ((flags & match_condassert) != 0) return MATCH_MATCH;
5102
5103 /* Continue from after the assertion, updating the offsets high water
5104 mark, since extracts may have been taken during the assertion. */
5105
5106 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
5107 ecode += 1 + LINK_SIZE;
5108 offset_top = md->end_offset_top;
5109 continue;
5110
5111 /* Negative assertion: all branches must fail to match */
5112
5113 case OP_ASSERT_NOT:
5114 case OP_ASSERTBACK_NOT:
5115 do
5116 {
5117 if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
5118 match_isgroup)) == MATCH_MATCH) return MATCH_NOMATCH;
5119 if (rrc != MATCH_NOMATCH) return rrc;
5120 ecode += GET(ecode,1);
5121 }
5122 while (*ecode == OP_ALT);
5123
5124 if ((flags & match_condassert) != 0) return MATCH_MATCH;
5125
5126 ecode += 1 + LINK_SIZE;
5127 continue;
5128
5129 /* Move the subject pointer back. This occurs only at the start of
5130 each branch of a lookbehind assertion. If we are too close to the start to
5131 move back, this match function fails. When working with UTF-8 we move
5132 back a number of characters, not bytes. */
5133
5134 case OP_REVERSE:
5135 #ifdef SUPPORT_UTF8
5136 c = GET(ecode,1);
5137 for (i = 0; i < c; i++)
5138 {
5139 eptr--;
5140 BACKCHAR(eptr)
5141 }
5142 #else
5143 eptr -= GET(ecode,1);
5144 #endif
5145
5146 if (eptr < md->start_subject) return MATCH_NOMATCH;
5147 ecode += 1 + LINK_SIZE;
5148 break;
5149
5150 /* The callout item calls an external function, if one is provided, passing
5151 details of the match so far. This is mainly for debugging, though the
5152 function is able to force a failure. */
5153
5154 case OP_CALLOUT:
5155 if (pcre_callout != NULL)
5156 {
5157 pcre_callout_block cb;
5158 cb.version = 0; /* Version 0 of the callout block */
5159 cb.callout_number = ecode[1];
5160 cb.offset_vector = md->offset_vector;
5161 cb.subject = (const char *)md->start_subject;
5162 cb.subject_length = md->end_subject - md->start_subject;
5163 cb.start_match = md->start_match - md->start_subject;
5164 cb.current_position = eptr - md->start_subject;
5165 cb.capture_top = offset_top/2;
5166 cb.capture_last = md->capture_last;
5167 cb.callout_data = md->callout_data;
5168 if ((rrc = (*pcre_callout)(&cb)) > 0) return MATCH_NOMATCH;
5169 if (rrc < 0) return rrc;
5170 }
5171 ecode += 2;
5172 break;
5173
5174 /* Recursion either matches the current regex, or some subexpression. The
5175 offset data is the offset to the starting bracket from the start of the
5176 whole pattern. However, it is possible that a BRAZERO was inserted before
5177 this bracket after we took the offset - we just skip it if encountered.
5178
5179 If there are any capturing brackets started but not finished, we have to
5180 save their starting points and reinstate them after the recursion. However,
5181 we don't know how many such there are (offset_top records the completed
5182 total) so we just have to save all the potential data. There may be up to
5183 65535 such values, which is too large to put on the stack, but using malloc
5184 for small numbers seems expensive. As a compromise, the stack is used when
5185 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
5186 is used. A problem is what to do if the malloc fails ... there is no way of
5187 returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
5188 values on the stack, and accept that the rest may be wrong.
5189
5190 There are also other values that have to be saved. We use a chained
5191 sequence of blocks that actually live on the stack. Thanks to Robin Houston
5192 for the original version of this logic. */
5193
5194 case OP_RECURSE:
5195 {
5196 int stacksave[REC_STACK_SAVE_MAX];
5197 recursion_info new_recursive;
5198 const uschar *callpat = md->start_code + GET(ecode, 1);
5199
5200 if (*callpat == OP_BRAZERO) callpat++;
5201
5202 new_recursive.group_num = *callpat - OP_BRA;
5203
5204 /* For extended extraction brackets (large number), we have to fish out
5205 the number from a dummy opcode at the start. */
5206
5207 if (new_recursive.group_num > EXTRACT_BASIC_MAX)
5208 new_recursive.group_num = GET2(callpat, 2+LINK_SIZE);
5209
5210 /* Add to "recursing stack" */
5211
5212 new_recursive.prev = md->recursive;
5213 md->recursive = &new_recursive;
5214
5215 /* Find where to continue from afterwards */
5216
5217 ecode += 1 + LINK_SIZE;
5218 new_recursive.after_call = ecode;
5219
5220 /* Now save the offset data. */
5221
5222 new_recursive.saved_max = md->offset_end;
5223 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
5224 new_recursive.offset_save = stacksave;
5225 else
5226 {
5227 new_recursive.offset_save =
5228 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
5229 if (new_recursive.offset_save == NULL) return PCRE_ERROR_NOMEMORY;
5230 }
5231
5232 memcpy(new_recursive.offset_save, md->offset_vector,
5233 new_recursive.saved_max * sizeof(int));
5234 new_recursive.save_start = md->start_match;
5235 md->start_match = eptr;
5236
5237 /* OK, now we can do the recursion. For each top-level alternative we
5238 restore the offset and recursion data. */
5239
5240 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
5241 do
5242 {
5243 if ((rrc = match(eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims,
5244 eptrb, match_isgroup)) == MATCH_MATCH)
5245 {
5246 md->recursive = new_recursive.prev;
5247 if (new_recursive.offset_save != stacksave)
5248 (pcre_free)(new_recursive.offset_save);
5249 return MATCH_MATCH;
5250 }
5251 else if (rrc != MATCH_NOMATCH) return rrc;
5252
5253 md->recursive = &new_recursive;
5254 memcpy(md->offset_vector, new_recursive.offset_save,
5255 new_recursive.saved_max * sizeof(int));
5256 callpat += GET(callpat, 1);
5257 }
5258 while (*callpat == OP_ALT);
5259
5260 DPRINTF(("Recursion didn't match\n"));
5261 md->recursive = new_recursive.prev;
5262 if (new_recursive.offset_save != stacksave)
5263 (pcre_free)(new_recursive.offset_save);
5264 return MATCH_NOMATCH;
5265 }
5266 break;
5267
5268 /* "Once" brackets are like assertion brackets except that after a match,
5269 the point in the subject string is not moved back. Thus there can never be
5270 a move back into the brackets. Friedl calls these "atomic" subpatterns.
5271 Check the alternative branches in turn - the matching won't pass the KET
5272 for this kind of subpattern. If any one branch matches, we carry on as at
5273 the end of a normal bracket, leaving the subject pointer. */
5274
5275 case OP_ONCE:
5276 {
5277 const uschar *prev = ecode;
5278 const uschar *saved_eptr = eptr;
5279
5280 do
5281 {
5282 if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
5283 eptrb, match_isgroup)) == MATCH_MATCH) break;
5284 if (rrc != MATCH_NOMATCH) return rrc;
5285 ecode += GET(ecode,1);
5286 }
5287 while (*ecode == OP_ALT);
5288
5289 /* If hit the end of the group (which could be repeated), fail */
5290
5291 if (*ecode != OP_ONCE && *ecode != OP_ALT) return MATCH_NOMATCH;
5292
5293 /* Continue as from after the assertion, updating the offsets high water
5294 mark, since extracts may have been taken. */
5295
5296 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
5297
5298 offset_top = md->end_offset_top;
5299 eptr = md->end_match_ptr;
5300
5301 /* For a non-repeating ket, just continue at this level. This also
5302 happens for a repeating ket if no characters were matched in the group.
5303 This is the forcible breaking of infinite loops as implemented in Perl
5304 5.005. If there is an options reset, it will get obeyed in the normal
5305 course of events. */
5306
5307 if (*ecode == OP_KET || eptr == saved_eptr)
5308 {
5309 ecode += 1+LINK_SIZE;
5310 break;
5311 }
5312
5313 /* The repeating kets try the rest of the pattern or restart from the
5314 preceding bracket, in the appropriate order. We need to reset any options
5315 that changed within the bracket before re-running it, so check the next
5316 opcode. */
5317
5318 if (ecode[1+LINK_SIZE] == OP_OPT)
5319 {
5320 ims = (ims & ~PCRE_IMS) | ecode[4];
5321 DPRINTF(("ims set to %02lx at group repeat\n", ims));
5322 }
5323
5324 if (*ecode == OP_KETRMIN)
5325 {
5326 if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
5327 eptrb, 0)) != MATCH_NOMATCH) return rrc;
5328 if ((rrc = match(eptr, prev, offset_top, md, ims, eptrb,
5329 match_isgroup)) != MATCH_NOMATCH) return rrc;
5330 }
5331 else /* OP_KETRMAX */
5332 {
5333 if ((rrc = match(eptr, prev, offset_top, md, ims, eptrb,
5334 match_isgroup)) != MATCH_NOMATCH) return rrc;
5335 if ((rrc = match(eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb,
5336 0)) != MATCH_NOMATCH) return rrc;
5337 }
5338 }
5339 return MATCH_NOMATCH;
5340
5341 /* An alternation is the end of a branch; scan along to find the end of the
5342 bracketed group and go to there. */
5343
5344 case OP_ALT:
5345 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
5346 break;
5347
5348 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
5349 that it may occur zero times. It may repeat infinitely, or not at all -
5350 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
5351 repeat limits are compiled as a number of copies, with the optional ones
5352 preceded by BRAZERO or BRAMINZERO. */
5353
5354 case OP_BRAZERO:
5355 {
5356 const uschar *next = ecode+1;
5357 if ((rrc = match(eptr, next, offset_top, md, ims, eptrb, match_isgroup))
5358 != MATCH_NOMATCH) return rrc;
5359 do next += GET(next,1); while (*next == OP_ALT);
5360 ecode = next + 1+LINK_SIZE;
5361 }
5362 break;
5363
5364 case OP_BRAMINZERO:
5365 {
5366 const uschar *next = ecode+1;
5367 do next += GET(next,1); while (*next == OP_ALT);
5368 if ((rrc = match(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb,
5369 match_isgroup)) != MATCH_NOMATCH) return rrc;
5370 ecode++;
5371 }
5372 break;
5373
5374 /* End of a group, repeated or non-repeating. If we are at the end of
5375 an assertion "group", stop matching and return MATCH_MATCH, but record the
5376 current high water mark for use by positive assertions. Do this also
5377 for the "once" (not backing up) groups. */
5378
5379 case OP_KET:
5380 case OP_KETRMIN:
5381 case OP_KETRMAX:
5382 {
5383 const uschar *prev = ecode - GET(ecode, 1);
5384 const uschar *saved_eptr = eptrb->saved_eptr;
5385
5386 eptrb = eptrb->prev; /* Back up the stack of bracket start pointers */
5387
5388 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
5389 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
5390 *prev == OP_ONCE)
5391 {
5392 md->end_match_ptr = eptr; /* For ONCE */
5393 md->end_offset_top = offset_top;
5394 return MATCH_MATCH;
5395 }
5396
5397 /* In all other cases except a conditional group we have to check the
5398 group number back at the start and if necessary complete handling an
5399 extraction by setting the offsets and bumping the high water mark. */
5400
5401 if (*prev != OP_COND)
5402 {
5403 int offset;
5404 int number = *prev - OP_BRA;
5405
5406 /* For extended extraction brackets (large number), we have to fish out
5407 the number from a dummy opcode at the start. */
5408
5409 if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);
5410 offset = number << 1;
5411
5412 #ifdef DEBUG
5413 printf("end bracket %d", number);
5414 printf("\n");
5415 #endif
5416
5417 /* Test for a numbered group. This includes groups called as a result
5418 of recursion. Note that whole-pattern recursion is coded as a recurse
5419 into group 0, so it won't be picked up here. Instead, we catch it when
5420 the OP_END is reached. */
5421
5422 if (number > 0)
5423 {
5424 md->capture_last = number;
5425 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
5426 {
5427 md->offset_vector[offset] =
5428 md->offset_vector[md->offset_end - number];
5429 md->offset_vector[offset+1] = eptr - md->start_subject;
5430 if (offset_top <= offset) offset_top = offset + 2;
5431 }
5432
5433 /* Handle a recursively called group. Restore the offsets
5434 appropriately and continue from after the call. */
5435
5436 if (md->recursive != NULL && md->recursive->group_num == number)
5437 {
5438 recursion_info *rec = md->recursive;
5439 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
5440 md->recursive = rec->prev;
5441 md->start_match = rec->save_start;
5442 memcpy(md->offset_vector, rec->offset_save,
5443 rec->saved_max * sizeof(int));
5444 ecode = rec->after_call;
5445 ims = original_ims;
5446 break;
5447 }
5448 }
5449 }
5450
5451 /* Reset the value of the ims flags, in case they got changed during
5452 the group. */
5453
5454 ims = original_ims;
5455 DPRINTF(("ims reset to %02lx\n", ims));
5456
5457 /* For a non-repeating ket, just continue at this level. This also
5458 happens for a repeating ket if no characters were matched in the group.
5459 This is the forcible breaking of infinite loops as implemented in Perl
5460 5.005. If there is an options reset, it will get obeyed in the normal
5461 course of events. */
5462
5463 if (*ecode == OP_KET || eptr == saved_eptr)
5464 {
5465 ecode += 1 + LINK_SIZE;
5466 break;
5467 }
5468
5469 /* The repeating kets try the rest of the pattern or restart from the
5470 preceding bracket, in the appropriate order. */
5471
5472 if (*ecode == OP_KETRMIN)
5473 {
5474 if ((rrc = match(eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb,
5475 0)) != MATCH_NOMATCH) return rrc;
5476 if ((rrc = match(eptr, prev, offset_top, md, ims, eptrb,
5477 match_isgroup)) != MATCH_NOMATCH) return rrc;
5478 }
5479 else /* OP_KETRMAX */
5480 {
5481 if ((rrc = match(eptr, prev, offset_top, md, ims, eptrb,
5482 match_isgroup)) != MATCH_NOMATCH) return rrc;
5483 if ((rrc = match(eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb,
5484 0)) != MATCH_NOMATCH) return rrc;
5485 }
5486 }
5487 return MATCH_NOMATCH;
5488
5489 /* Start of subject unless notbol, or after internal newline if multiline */
5490
5491 case OP_CIRC:
5492 if (md->notbol && eptr == md->start_subject) return MATCH_NOMATCH;
5493 if ((ims & PCRE_MULTILINE) != 0)
5494 {
5495 if (eptr != md->start_subject && eptr[-1] != NEWLINE)
5496 return MATCH_NOMATCH;
5497 ecode++;
5498 break;
5499 }
5500 /* ... else fall through */
5501
5502 /* Start of subject assertion */
5503
5504 case OP_SOD:
5505 if (eptr != md->start_subject) return MATCH_NOMATCH;
5506 ecode++;
5507 break;
5508
5509 /* Start of match assertion */
5510
5511 case OP_SOM:
5512 if (eptr != md->start_subject + md->start_offset) return MATCH_NOMATCH;
5513 ecode++;
5514 break;
5515
5516 /* Assert before internal newline if multiline, or before a terminating
5517 newline unless endonly is set, else end of subject unless noteol is set. */
5518
5519 case OP_DOLL:
5520 if ((ims & PCRE_MULTILINE) != 0)
5521 {
5522 if (eptr < md->end_subject)
5523 { if (*eptr != NEWLINE) return MATCH_NOMATCH; }
5524 else
5525 { if (md->noteol) return MATCH_NOMATCH; }
5526 ecode++;
5527 break;
5528 }
5529 else
5530 {
5531 if (md->noteol) return MATCH_NOMATCH;
5532 if (!md->endonly)
5533 {
5534 if (eptr < md->end_subject - 1 ||
5535 (eptr == md->end_subject - 1 && *eptr != NEWLINE))
5536 return MATCH_NOMATCH;
5537 ecode++;
5538 break;
5539 }
5540 }
5541 /* ... else fall through */
5542
5543 /* End of subject assertion (\z) */
5544
5545 case OP_EOD:
5546 if (eptr < md->end_subject) return MATCH_NOMATCH;
5547 ecode++;
5548 break;
5549
5550 /* End of subject or ending \n assertion (\Z) */
5551
5552 case OP_EODN:
5553 if (eptr < md->end_subject - 1 ||
5554 (eptr == md->end_subject - 1 && *eptr != NEWLINE)) return MATCH_NOMATCH;
5555 ecode++;
5556 break;
5557
5558 /* Word boundary assertions */
5559
5560 case OP_NOT_WORD_BOUNDARY:
5561 case OP_WORD_BOUNDARY:
5562 {
5563 BOOL prev_is_word, cur_is_word;
5564
5565 /* Find out if the previous and current characters are "word" characters.
5566 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
5567 be "non-word" characters. */
5568
5569 #if SUPPORT_UTF8
5570 if (md->utf8)
5571 {
5572 if (eptr == md->start_subject) prev_is_word = FALSE; else
5573 {
5574 const uschar *lastptr = eptr - 1;
5575 while((*lastptr & 0xc0) == 0x80) lastptr--;
5576 GETCHAR(c, lastptr);
5577 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
5578 }
5579 if (eptr >= md->end_subject) cur_is_word = FALSE; else
5580 {
5581 GETCHAR(c, eptr);
5582 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
5583 }
5584 }
5585 else
5586 #endif
5587
5588 /* More streamlined when not in UTF-8 mode */
5589
5590 {
5591 prev_is_word = (eptr != md->start_subject) &&
5592 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
5593 cur_is_word = (eptr < md->end_subject) &&
5594 ((md->ctypes[*eptr] & ctype_word) != 0);
5595 }
5596
5597 /* Now see if the situation is what we want */
5598
5599 if ((*ecode++ == OP_WORD_BOUNDARY)?
5600 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
5601 return MATCH_NOMATCH;
5602 }
5603 break;
5604
5605 /* Match a single character type; inline for speed */
5606
5607 case OP_ANY:
5608 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
5609 return MATCH_NOMATCH;
5610 if (eptr++ >= md->end_subject) return MATCH_NOMATCH;
5611 #ifdef SUPPORT_UTF8
5612 if (md->utf8)
5613 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5614 #endif
5615 ecode++;
5616 break;
5617
5618 /* Match a single byte, even in UTF-8 mode. This opcode really does match
5619 any byte, even newline, independent of the setting of PCRE_DOTALL. */
5620
5621 case OP_ANYBYTE:
5622 if (eptr++ >= md->end_subject) return MATCH_NOMATCH;
5623 ecode++;
5624 break;
5625
5626 case OP_NOT_DIGIT:
5627 if (eptr >= md->end_subject) return MATCH_NOMATCH;
5628 GETCHARINCTEST(c, eptr);
5629 if (
5630 #ifdef SUPPORT_UTF8
5631 c < 256 &&
5632 #endif
5633 (md->ctypes[c] & ctype_digit) != 0
5634 )
5635 return MATCH_NOMATCH;
5636 ecode++;
5637 break;
5638
5639 case OP_DIGIT:
5640 if (eptr >= md->end_subject) return MATCH_NOMATCH;
5641 GETCHARINCTEST(c, eptr);
5642 if (
5643 #ifdef SUPPORT_UTF8
5644 c >= 256 ||
5645 #endif
5646 (md->ctypes[c] & ctype_digit) == 0
5647 )
5648 return MATCH_NOMATCH;
5649 ecode++;
5650 break;
5651
5652 case OP_NOT_WHITESPACE:
5653 if (eptr >= md->end_subject) return MATCH_NOMATCH;
5654 GETCHARINCTEST(c, eptr);
5655 if (
5656 #ifdef SUPPORT_UTF8
5657 c < 256 &&
5658 #endif
5659 (md->ctypes[c] & ctype_space) != 0
5660 )
5661 return MATCH_NOMATCH;
5662 ecode++;
5663 break;
5664
5665 case OP_WHITESPACE:
5666 if (eptr >= md->end_subject) return MATCH_NOMATCH;
5667 GETCHARINCTEST(c, eptr);
5668 if (
5669 #ifdef SUPPORT_UTF8
5670 c >= 256 ||
5671 #endif
5672 (md->ctypes[c] & ctype_space) == 0
5673 )
5674 return MATCH_NOMATCH;
5675 ecode++;
5676 break;
5677
5678 case OP_NOT_WORDCHAR:
5679 if (eptr >= md->end_subject) return MATCH_NOMATCH;
5680 GETCHARINCTEST(c, eptr);
5681 if (
5682 #ifdef SUPPORT_UTF8
5683 c < 256 &&
5684 #endif
5685 (md->ctypes[c] & ctype_word) != 0
5686 )
5687 return MATCH_NOMATCH;
5688 ecode++;
5689 break;
5690
5691 case OP_WORDCHAR:
5692 if (eptr >= md->end_subject) return MATCH_NOMATCH;
5693 GETCHARINCTEST(c, eptr);
5694 if (
5695 #ifdef SUPPORT_UTF8
5696 c >= 256 ||
5697 #endif
5698 (md->ctypes[c] & ctype_word) == 0
5699 )
5700 return MATCH_NOMATCH;
5701 ecode++;
5702 break;
5703
5704 /* Match a back reference, possibly repeatedly. Look past the end of the
5705 item to see if there is repeat information following. The code is similar
5706 to that for character classes, but repeated for efficiency. Then obey
5707 similar code to character type repeats - written out again for speed.
5708 However, if the referenced string is the empty string, always treat
5709 it as matched, any number of times (otherwise there could be infinite
5710 loops). */
5711
5712 case OP_REF:
5713 {
5714 int length;
5715 int offset = GET2(ecode, 1) << 1; /* Doubled ref number */
5716 ecode += 3; /* Advance past item */
5717
5718 /* If the reference is unset, set the length to be longer than the amount
5719 of subject left; this ensures that every attempt at a match fails. We
5720 can't just fail here, because of the possibility of quantifiers with zero
5721 minima. */
5722
5723 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
5724 md->end_subject - eptr + 1 :
5725 md->offset_vector[offset+1] - md->offset_vector[offset];
5726
5727 /* Set up for repetition, or handle the non-repeated case */
5728
5729 switch (*ecode)
5730 {
5731 case OP_CRSTAR:
5732 case OP_CRMINSTAR:
5733 case OP_CRPLUS:
5734 case OP_CRMINPLUS:
5735 case OP_CRQUERY:
5736 case OP_CRMINQUERY:
5737 c = *ecode++ - OP_CRSTAR;
5738 minimize = (c & 1) != 0;
5739 min = rep_min[c]; /* Pick up values from tables; */
5740 max = rep_max[c]; /* zero for max => infinity */
5741 if (max == 0) max = INT_MAX;
5742 break;
5743
5744 case OP_CRRANGE:
5745 case OP_CRMINRANGE:
5746 minimize = (*ecode == OP_CRMINRANGE);
5747 min = GET2(ecode, 1);
5748 max = GET2(ecode, 3);
5749 if (max == 0) max = INT_MAX;
5750 ecode += 5;
5751 break;
5752
5753 default: /* No repeat follows */
5754 if (!match_ref(offset, eptr, length, md, ims)) return MATCH_NOMATCH;
5755 eptr += length;
5756 continue; /* With the main loop */
5757 }
5758
5759 /* If the length of the reference is zero, just continue with the
5760 main loop. */
5761
5762 if (length == 0) continue;
5763
5764 /* First, ensure the minimum number of matches are present. We get back
5765 the length of the reference string explicitly rather than passing the
5766 address of eptr, so that eptr can be a register variable. */
5767
5768 for (i = 1; i <= min; i++)
5769 {
5770 if (!match_ref(offset, eptr, length, md, ims)) return MATCH_NOMATCH;
5771 eptr += length;
5772 }
5773
5774 /* If min = max, continue at the same level without recursion.
5775 They are not both allowed to be zero. */
5776
5777 if (min == max) continue;
5778
5779 /* If minimizing, keep trying and advancing the pointer */
5780
5781 if (minimize)
5782 {
5783 for (i = min;; i++)
5784 {
5785 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
5786 MATCH_NOMATCH) return rrc;
5787 if (i >= max || !match_ref(offset, eptr, length, md, ims))
5788 return MATCH_NOMATCH;
5789 eptr += length;
5790 }
5791 /* Control never gets here */
5792 }
5793
5794 /* If maximizing, find the longest string and work backwards */
5795
5796 else
5797 {
5798 const uschar *pp = eptr;
5799 for (i = min; i < max; i++)
5800 {
5801 if (!match_ref(offset, eptr, length, md, ims)) break;
5802 eptr += length;
5803 }
5804 while (eptr >= pp)
5805 {
5806 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
5807 MATCH_NOMATCH) return rrc;
5808 eptr -= length;
5809 }
5810 return MATCH_NOMATCH;
5811 }
5812 }
5813 /* Control never gets here */
5814
5815
5816
5817 /* Match a bit-mapped character class, possibly repeatedly. This op code is
5818 used when all the characters in the class have values in the range 0-255.
5819 The only difference between OP_CLASS and OP_NCLASS occurs when a data
5820 character outside the range is encountered.
5821
5822 First, look past the end of the item to see if there is repeat information
5823 following. Then obey similar code to character type repeats - written out
5824 again for speed. */
5825
5826 case OP_NCLASS:
5827 case OP_CLASS:
5828 {
5829 const uschar *data = ecode + 1; /* Save for matching */
5830 ecode += 33; /* Advance past the item */
5831
5832 switch (*ecode)
5833 {
5834 case OP_CRSTAR:
5835 case OP_CRMINSTAR:
5836 case OP_CRPLUS:
5837 case OP_CRMINPLUS:
5838 case OP_CRQUERY:
5839 case OP_CRMINQUERY:
5840 c = *ecode++ - OP_CRSTAR;
5841 minimize = (c & 1) != 0;
5842 min = rep_min[c]; /* Pick up values from tables; */
5843 max = rep_max[c]; /* zero for max => infinity */
5844 if (max == 0) max = INT_MAX;
5845 break;
5846
5847 case OP_CRRANGE:
5848 case OP_CRMINRANGE:
5849 minimize = (*ecode == OP_CRMINRANGE);
5850 min = GET2(ecode, 1);
5851 max = GET2(ecode, 3);
5852 if (max == 0) max = INT_MAX;
5853 ecode += 5;
5854 break;
5855
5856 default: /* No repeat follows */
5857 min = max = 1;
5858 break;
5859 }
5860
5861 /* First, ensure the minimum number of matches are present. */
5862
5863 #ifdef SUPPORT_UTF8
5864 /* UTF-8 mode */
5865 if (md->utf8)
5866 {
5867 for (i = 1; i <= min; i++)
5868 {
5869 if (eptr >= md->end_subject) return MATCH_NOMATCH;
5870 GETCHARINC(c, eptr);
5871 if (c > 255)
5872 {
5873 if (op == OP_CLASS) return MATCH_NOMATCH;
5874 }
5875 else
5876 {
5877 if ((data[c/8] & (1 << (c&7))) == 0) return MATCH_NOMATCH;
5878 }
5879 }
5880 }
5881 else
5882 #endif
5883 /* Not UTF-8 mode */
5884 {
5885 for (i = 1; i <= min; i++)
5886 {
5887 if (eptr >= md->end_subject) return MATCH_NOMATCH;
5888 c = *eptr++;
5889 if ((data[c/8] & (1 << (c&7))) == 0) return MATCH_NOMATCH;
5890 }
5891 }
5892
5893 /* If max == min we can continue with the main loop without the
5894 need to recurse. */
5895
5896 if (min == max) continue;
5897
5898 /* If minimizing, keep testing the rest of the expression and advancing
5899 the pointer while it matches the class. */
5900
5901 if (minimize)
5902 {
5903 #ifdef SUPPORT_UTF8
5904 /* UTF-8 mode */
5905 if (md->utf8)
5906 {
5907 for (i = min;; i++)
5908 {
5909 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
5910 MATCH_NOMATCH) return rrc;
5911 if (i >= max || eptr >= md->end_subject) return MATCH_NOMATCH;
5912 GETCHARINC(c, eptr);
5913 if (c > 255)
5914 {
5915 if (op == OP_CLASS) return MATCH_NOMATCH;
5916 }
5917 else
5918 {
5919 if ((data[c/8] & (1 << (c&7))) == 0) return MATCH_NOMATCH;
5920 }
5921 }
5922 }
5923 else
5924 #endif
5925 /* Not UTF-8 mode */
5926 {
5927 for (i = min;; i++)
5928 {
5929 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
5930 MATCH_NOMATCH) return rrc;
5931 if (i >= max || eptr >= md->end_subject) return MATCH_NOMATCH;
5932 c = *eptr++;
5933 if ((data[c/8] & (1 << (c&7))) == 0) return MATCH_NOMATCH;
5934 }
5935 }
5936 /* Control never gets here */
5937 }
5938
5939 /* If maximizing, find the longest possible run, then work backwards. */
5940
5941 else
5942 {
5943 const uschar *pp = eptr;
5944
5945 #ifdef SUPPORT_UTF8
5946 /* UTF-8 mode */
5947 if (md->utf8)
5948 {
5949 for (i = min; i < max; i++)
5950 {
5951 int len = 1;
5952 if (eptr >= md->end_subject) break;
5953 GETCHARLEN(c, eptr, len);
5954 if (c > 255)
5955 {
5956 if (op == OP_CLASS) break;
5957 }
5958 else
5959 {
5960 if ((data[c/8] & (1 << (c&7))) == 0) break;
5961 }
5962 eptr += len;
5963 }
5964 while (eptr >= pp)
5965 {
5966 if ((rrc = match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) !=
5967 MATCH_NOMATCH) return rrc;
5968 BACKCHAR(eptr)
5969 }
5970 }
5971 else
5972 #endif
5973 /* Not UTF-8 mode */
5974 {
5975 for (i = min; i < max; i++)
5976 {
5977 if (eptr >= md->end_subject) break;
5978 c = *eptr;
5979 if ((data[c/8] & (1 << (c&7))) == 0) break;
5980 eptr++;
5981 }
5982 while (eptr >= pp)
5983 {
5984 if ((rrc = match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) !=
5985 MATCH_NOMATCH) return rrc;
5986 }
5987 }
5988
5989 return MATCH_NOMATCH;
5990 }
5991 }
5992 /* Control never gets here */
5993
5994
5995 /* Match an extended character class. This opcode is encountered only
5996 in UTF-8 mode, because that's the only time it is compiled. */
5997
5998 #ifdef SUPPORT_UTF8
5999 case OP_XCLASS:
6000 {
6001 const uschar *data = ecode + 1 + LINK_SIZE; /* Save for matching */
6002 ecode += GET(ecode, 1); /* Advance past the item */
6003
6004 switch (*ecode)
6005 {
6006 case OP_CRSTAR:
6007 case OP_CRMINSTAR:
6008 case OP_CRPLUS:
6009 case OP_CRMINPLUS:
6010 case OP_CRQUERY:
6011 case OP_CRMINQUERY:
6012 c = *ecode++ - OP_CRSTAR;
6013 minimize = (c & 1) != 0;
6014 min = rep_min[c]; /* Pick up values from tables; */
6015 max = rep_max[c]; /* zero for max => infinity */
6016 if (max == 0) max = INT_MAX;
6017 break;
6018
6019 case OP_CRRANGE:
6020 case OP_CRMINRANGE:
6021 minimize = (*ecode == OP_CRMINRANGE);
6022 min = GET2(ecode, 1);
6023 max = GET2(ecode, 3);
6024 if (max == 0) max = INT_MAX;
6025 ecode += 5;
6026 break;
6027
6028 default: /* No repeat follows */
6029 min = max = 1;
6030 break;
6031 }
6032
6033 /* First, ensure the minimum number of matches are present. */
6034
6035 for (i = 1; i <= min; i++)
6036 {
6037 if (eptr >= md->end_subject) return MATCH_NOMATCH;
6038 GETCHARINC(c, eptr);
6039 if (!match_xclass(c, data)) return MATCH_NOMATCH;
6040 }
6041
6042 /* If max == min we can continue with the main loop without the
6043 need to recurse. */
6044
6045 if (min == max) continue;
6046
6047 /* If minimizing, keep testing the rest of the expression and advancing
6048 the pointer while it matches the class. */
6049
6050 if (minimize)
6051 {
6052 for (i = min;; i++)
6053 {
6054 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6055 MATCH_NOMATCH) return rrc;
6056 if (i >= max || eptr >= md->end_subject) return MATCH_NOMATCH;
6057 GETCHARINC(c, eptr);
6058 if (!match_xclass(c, data)) return MATCH_NOMATCH;
6059 }
6060 /* Control never gets here */
6061 }
6062
6063 /* If maximizing, find the longest possible run, then work backwards. */
6064
6065 else
6066 {
6067 const uschar *pp = eptr;
6068 for (i = min; i < max; i++)
6069 {
6070 int len = 1;
6071 if (eptr >= md->end_subject) break;
6072 GETCHARLEN(c, eptr, len);
6073 if (!match_xclass(c, data)) break;
6074 eptr += len;
6075 }
6076 while (eptr >= pp)
6077 {
6078 if ((rrc = match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) !=
6079 MATCH_NOMATCH) return rrc;
6080 BACKCHAR(eptr)
6081 }
6082 return MATCH_NOMATCH;
6083 }
6084
6085 /* Control never gets here */
6086 }
6087 #endif /* End of XCLASS */
6088
6089 /* Match a run of characters */
6090
6091 case OP_CHARS:
6092 {
6093 register int length = ecode[1];
6094 ecode += 2;
6095
6096 #ifdef DEBUG /* Sigh. Some compilers never learn. */
6097 if (eptr >= md->end_subject)
6098 printf("matching subject <null> against pattern ");
6099 else
6100 {
6101 printf("matching subject ");
6102 pchars(eptr, length, TRUE, md);
6103 printf(" against pattern ");
6104 }
6105 pchars(ecode, length, FALSE, md);
6106 printf("\n");
6107 #endif
6108
6109 if (length > md->end_subject - eptr) return MATCH_NOMATCH;
6110 if ((ims & PCRE_CASELESS) != 0)
6111 {
6112 while (length-- > 0)
6113 if (md->lcc[*ecode++] != md->lcc[*eptr++])
6114 return MATCH_NOMATCH;
6115 }
6116 else
6117 {
6118 while (length-- > 0) if (*ecode++ != *eptr++) return MATCH_NOMATCH;
6119 }
6120 }
6121 break;
6122
6123 /* Match a single character repeatedly; different opcodes share code. */
6124
6125 case OP_EXACT:
6126 min = max = GET2(ecode, 1);
6127 ecode += 3;
6128 goto REPEATCHAR;
6129
6130 case OP_UPTO:
6131 case OP_MINUPTO:
6132 min = 0;
6133 max = GET2(ecode, 1);
6134 minimize = *ecode == OP_MINUPTO;
6135 ecode += 3;
6136 goto REPEATCHAR;
6137
6138 case OP_STAR:
6139 case OP_MINSTAR:
6140 case OP_PLUS:
6141 case OP_MINPLUS:
6142 case OP_QUERY:
6143 case OP_MINQUERY:
6144 c = *ecode++ - OP_STAR;
6145 minimize = (c & 1) != 0;
6146 min = rep_min[c]; /* Pick up values from tables; */
6147 max = rep_max[c]; /* zero for max => infinity */
6148 if (max == 0) max = INT_MAX;
6149
6150 /* Common code for all repeated single-character matches. We can give
6151 up quickly if there are fewer than the minimum number of characters left in
6152 the subject. */
6153
6154 REPEATCHAR:
6155 #ifdef SUPPORT_UTF8
6156 if (md->utf8)
6157 {
6158 int len = 1;
6159 const uschar *charptr = ecode;
6160 GETCHARLEN(c, ecode, len);
6161 if (min * len > md->end_subject - eptr) return MATCH_NOMATCH;
6162 ecode += len;
6163
6164 /* Handle multibyte character matching specially here. There is no
6165 support for any kind of casing for multibyte characters. */
6166
6167 if (len > 1)
6168 {
6169 for (i = 1; i <= min; i++)
6170 {
6171 if (memcmp(eptr, charptr, len) != 0) return MATCH_NOMATCH;
6172 eptr += len;
6173 }
6174
6175 if (min == max) continue;
6176
6177 if (minimize)
6178 {
6179 for (i = min;; i++)
6180 {
6181 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6182 MATCH_NOMATCH) return rrc;
6183 if (i >= max ||
6184 eptr >= md->end_subject ||
6185 memcmp(eptr, charptr, len) != 0)
6186 return MATCH_NOMATCH;
6187 eptr += len;
6188 }
6189 /* Control never gets here */
6190 }
6191 else
6192 {
6193 const uschar *pp = eptr;
6194 for (i = min; i < max; i++)
6195 {
6196 if (eptr > md->end_subject - len ||
6197 memcmp(eptr, charptr, len) != 0)
6198 break;
6199 eptr += len;
6200 }
6201 while (eptr >= pp)
6202 {
6203 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6204 MATCH_NOMATCH) return rrc;
6205 eptr -= len;
6206 }
6207 return MATCH_NOMATCH;
6208 }
6209 /* Control never gets here */
6210 }
6211
6212 /* If the length of a UTF-8 character is 1, we fall through here, and
6213 obey the code as for non-UTF-8 characters below, though in this case the
6214 value of c will always be < 128. */
6215 }
6216 else
6217 #endif
6218
6219 /* When not in UTF-8 mode, load a single-byte character. */
6220 {
6221 if (min > md->end_subject - eptr) return MATCH_NOMATCH;
6222 c = *ecode++;
6223 }
6224
6225 /* The value of c at this point is always less than 256, though we may or
6226 may not be in UTF-8 mode. The code is duplicated for the caseless and
6227 caseful cases, for speed, since matching characters is likely to be quite
6228 common. First, ensure the minimum number of matches are present. If min =
6229 max, continue at the same level without recursing. Otherwise, if
6230 minimizing, keep trying the rest of the expression and advancing one
6231 matching character if failing, up to the maximum. Alternatively, if
6232 maximizing, find the maximum number of characters and work backwards. */
6233
6234 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
6235 max, eptr));
6236
6237 if ((ims & PCRE_CASELESS) != 0)
6238 {
6239 c = md->lcc[c];
6240 for (i = 1; i <= min; i++)
6241 if (c != md->lcc[*eptr++]) return MATCH_NOMATCH;
6242 if (min == max) continue;
6243 if (minimize)
6244 {
6245 for (i = min;; i++)
6246 {
6247 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6248 MATCH_NOMATCH) return rrc;
6249 if (i >= max || eptr >= md->end_subject ||
6250 c != md->lcc[*eptr++])
6251 return MATCH_NOMATCH;
6252 }
6253 /* Control never gets here */
6254 }
6255 else
6256 {
6257 const uschar *pp = eptr;
6258 for (i = min; i < max; i++)
6259 {
6260 if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
6261 eptr++;
6262 }
6263 while (eptr >= pp)
6264 if ((rrc = match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) !=
6265 MATCH_NOMATCH) return rrc;
6266 return MATCH_NOMATCH;
6267 }
6268 /* Control never gets here */
6269 }
6270
6271 /* Caseful comparisons (includes all multi-byte characters) */
6272
6273 else
6274 {
6275 for (i = 1; i <= min; i++) if (c != *eptr++) return MATCH_NOMATCH;
6276 if (min == max) continue;
6277 if (minimize)
6278 {
6279 for (i = min;; i++)
6280 {
6281 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6282 MATCH_NOMATCH) return rrc;
6283 if (i >= max || eptr >= md->end_subject || c != *eptr++)
6284 return MATCH_NOMATCH;
6285 }
6286 /* Control never gets here */
6287 }
6288 else
6289 {
6290 const uschar *pp = eptr;
6291 for (i = min; i < max; i++)
6292 {
6293 if (eptr >= md->end_subject || c != *eptr) break;
6294 eptr++;
6295 }
6296 while (eptr >= pp)
6297 if ((rrc = match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) !=
6298 MATCH_NOMATCH) return rrc;
6299 return MATCH_NOMATCH;
6300 }
6301 }
6302 /* Control never gets here */
6303
6304 /* Match a negated single one-byte character. The character we are
6305 checking can be multibyte. */
6306
6307 case OP_NOT:
6308 if (eptr >= md->end_subject) return MATCH_NOMATCH;
6309 ecode++;
6310 GETCHARINCTEST(c, eptr);
6311 if ((ims & PCRE_CASELESS) != 0)
6312 {
6313 #ifdef SUPPORT_UTF8
6314 if (c < 256)
6315 #endif
6316 c = md->lcc[c];
6317 if (md->lcc[*ecode++] == c) return MATCH_NOMATCH;
6318 }
6319 else
6320 {
6321 if (*ecode++ == c) return MATCH_NOMATCH;
6322 }
6323 break;
6324
6325 /* Match a negated single one-byte character repeatedly. This is almost a
6326 repeat of the code for a repeated single character, but I haven't found a
6327 nice way of commoning these up that doesn't require a test of the
6328 positive/negative option for each character match. Maybe that wouldn't add
6329 very much to the time taken, but character matching *is* what this is all
6330 about... */
6331
6332 case OP_NOTEXACT:
6333 min = max = GET2(ecode, 1);
6334 ecode += 3;
6335 goto REPEATNOTCHAR;
6336
6337 case OP_NOTUPTO:
6338 case OP_NOTMINUPTO:
6339 min = 0;
6340 max = GET2(ecode, 1);
6341 minimize = *ecode == OP_NOTMINUPTO;
6342 ecode += 3;
6343 goto REPEATNOTCHAR;
6344
6345 case OP_NOTSTAR:
6346 case OP_NOTMINSTAR:
6347 case OP_NOTPLUS:
6348 case OP_NOTMINPLUS:
6349 case OP_NOTQUERY:
6350 case OP_NOTMINQUERY:
6351 c = *ecode++ - OP_NOTSTAR;
6352 minimize = (c & 1) != 0;
6353 min = rep_min[c]; /* Pick up values from tables; */
6354 max = rep_max[c]; /* zero for max => infinity */
6355 if (max == 0) max = INT_MAX;
6356
6357 /* Common code for all repeated single-character (less than 256) matches.
6358 We can give up quickly if there are fewer than the minimum number of
6359 characters left in the subject. */
6360
6361 REPEATNOTCHAR:
6362 if (min > md->end_subject - eptr) return MATCH_NOMATCH;
6363 c = *ecode++;
6364
6365 /* The code is duplicated for the caseless and caseful cases, for speed,
6366 since matching characters is likely to be quite common. First, ensure the
6367 minimum number of matches are present. If min = max, continue at the same
6368 level without recursing. Otherwise, if minimizing, keep trying the rest of
6369 the expression and advancing one matching character if failing, up to the
6370 maximum. Alternatively, if maximizing, find the maximum number of
6371 characters and work backwards. */
6372
6373 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
6374 max, eptr));
6375
6376 if ((ims & PCRE_CASELESS) != 0)
6377 {
6378 c = md->lcc[c];
6379
6380 #ifdef SUPPORT_UTF8
6381 /* UTF-8 mode */
6382 if (md->utf8)
6383 {
6384 register int d;
6385 for (i = 1; i <= min; i++)
6386 {
6387 GETCHARINC(d, eptr);
6388 if (d < 256) d = md->lcc[d];
6389 if (c == d) return MATCH_NOMATCH;
6390 }
6391 }
6392 else
6393 #endif
6394
6395 /* Not UTF-8 mode */
6396 {
6397 for (i = 1; i <= min; i++)
6398 if (c == md->lcc[*eptr++]) return MATCH_NOMATCH;
6399 }
6400
6401 if (min == max) continue;
6402
6403 if (minimize)
6404 {
6405 #ifdef SUPPORT_UTF8
6406 /* UTF-8 mode */
6407 if (md->utf8)
6408 {
6409 register int d;
6410 for (i = min;; i++)
6411 {
6412 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6413 MATCH_NOMATCH) return rrc;
6414 GETCHARINC(d, eptr);
6415 if (d < 256) d = md->lcc[d];
6416 if (i >= max || eptr >= md->end_subject || c == d)
6417 return MATCH_NOMATCH;
6418 }
6419 }
6420 else
6421 #endif
6422 /* Not UTF-8 mode */
6423 {
6424 for (i = min;; i++)
6425 {
6426 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6427 MATCH_NOMATCH) return rrc;
6428 if (i >= max || eptr >= md->end_subject || c == md->lcc[*eptr++])
6429 return MATCH_NOMATCH;
6430 }
6431 }
6432 /* Control never gets here */
6433 }
6434
6435 /* Maximize case */
6436
6437 else
6438 {
6439 const uschar *pp = eptr;
6440
6441 #ifdef SUPPORT_UTF8
6442 /* UTF-8 mode */
6443 if (md->utf8)
6444 {
6445 register int d;
6446 for (i = min; i < max; i++)
6447 {
6448 int len = 1;
6449 if (eptr >= md->end_subject) break;
6450 GETCHARLEN(d, eptr, len);
6451 if (d < 256) d = md->lcc[d];
6452 if (c == d) break;
6453 eptr += len;
6454 }
6455 while (eptr >= pp)
6456 {
6457 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6458 MATCH_NOMATCH) return rrc;
6459 eptr--;
6460 BACKCHAR(eptr);
6461 }
6462 }
6463 else
6464 #endif
6465 /* Not UTF-8 mode */
6466 {
6467 for (i = min; i < max; i++)
6468 {
6469 if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
6470 eptr++;
6471 }
6472 while (eptr >= pp)
6473 {
6474 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6475 MATCH_NOMATCH) return rrc;
6476 eptr--;
6477 }
6478 }
6479
6480 return MATCH_NOMATCH;
6481 }
6482 /* Control never gets here */
6483 }
6484
6485 /* Caseful comparisons */
6486
6487 else
6488 {
6489 #ifdef SUPPORT_UTF8
6490 /* UTF-8 mode */
6491 if (md->utf8)
6492 {
6493 register int d;
6494 for (i = 1; i <= min; i++)
6495 {
6496 GETCHARINC(d, eptr);
6497 if (c == d) return MATCH_NOMATCH;
6498 }
6499 }
6500 else
6501 #endif
6502 /* Not UTF-8 mode */
6503 {
6504 for (i = 1; i <= min; i++)
6505 if (c == *eptr++) return MATCH_NOMATCH;
6506 }
6507
6508 if (min == max) continue;
6509
6510 if (minimize)
6511 {
6512 #ifdef SUPPORT_UTF8
6513 /* UTF-8 mode */
6514 if (md->utf8)
6515 {
6516 register int d;
6517 for (i = min;; i++)
6518 {
6519 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6520 MATCH_NOMATCH) return rrc;
6521 GETCHARINC(d, eptr);
6522 if (i >= max || eptr >= md->end_subject || c == d)
6523 return MATCH_NOMATCH;
6524 }
6525 }
6526 else
6527 #endif
6528 /* Not UTF-8 mode */
6529 {
6530 for (i = min;; i++)
6531 {
6532 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6533 MATCH_NOMATCH) return rrc;
6534 if (i >= max || eptr >= md->end_subject || c == *eptr++)
6535 return MATCH_NOMATCH;
6536 }
6537 }
6538 /* Control never gets here */
6539 }
6540
6541 /* Maximize case */
6542
6543 else
6544 {
6545 const uschar *pp = eptr;
6546
6547 #ifdef SUPPORT_UTF8
6548 /* UTF-8 mode */
6549 if (md->utf8)
6550 {
6551 register int d;
6552 for (i = min; i < max; i++)
6553 {
6554 int len = 1;
6555 if (eptr >= md->end_subject) break;
6556 GETCHARLEN(d, eptr, len);
6557 if (c == d) break;
6558 eptr += len;
6559 }
6560 while (eptr >= pp)
6561 {
6562 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6563 MATCH_NOMATCH) return rrc;
6564 eptr--;
6565 BACKCHAR(eptr);
6566 }
6567 }
6568 else
6569 #endif
6570 /* Not UTF-8 mode */
6571 {
6572 for (i = min; i < max; i++)
6573 {
6574 if (eptr >= md->end_subject || c == *eptr) break;
6575 eptr++;
6576 }
6577 while (eptr >= pp)
6578 {
6579 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6580 MATCH_NOMATCH) return rrc;
6581 eptr--;
6582 }
6583 }
6584
6585 return MATCH_NOMATCH;
6586 }
6587 }
6588 /* Control never gets here */
6589
6590 /* Match a single character type repeatedly; several different opcodes
6591 share code. This is very similar to the code for single characters, but we
6592 repeat it in the interests of efficiency. */
6593
6594 case OP_TYPEEXACT:
6595 min = max = GET2(ecode, 1);
6596 minimize = TRUE;
6597 ecode += 3;
6598 goto REPEATTYPE;
6599
6600 case OP_TYPEUPTO:
6601 case OP_TYPEMINUPTO:
6602 min = 0;
6603 max = GET2(ecode, 1);
6604 minimize = *ecode == OP_TYPEMINUPTO;
6605 ecode += 3;
6606 goto REPEATTYPE;
6607
6608 case OP_TYPESTAR:
6609 case OP_TYPEMINSTAR:
6610 case OP_TYPEPLUS:
6611 case OP_TYPEMINPLUS:
6612 case OP_TYPEQUERY:
6613 case OP_TYPEMINQUERY:
6614 c = *ecode++ - OP_TYPESTAR;
6615 minimize = (c & 1) != 0;
6616 min = rep_min[c]; /* Pick up values from tables; */
6617 max = rep_max[c]; /* zero for max => infinity */
6618 if (max == 0) max = INT_MAX;
6619
6620 /* Common code for all repeated single character type matches. Note that
6621 in UTF-8 mode, '.' matches a character of any length, but for the other
6622 character types, the valid characters are all one-byte long. */
6623
6624 REPEATTYPE:
6625 ctype = *ecode++; /* Code for the character type */
6626
6627 /* First, ensure the minimum number of matches are present. Use inline
6628 code for maximizing the speed, and do the type test once at the start
6629 (i.e. keep it out of the loop). Also we can test that there are at least
6630 the minimum number of bytes before we start. This isn't as effective in
6631 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
6632 is tidier. */
6633
6634 if (min > md->end_subject - eptr) return MATCH_NOMATCH;
6635 if (min > 0)
6636 {
6637 #ifdef SUPPORT_UTF8
6638 if (md->utf8) switch(ctype)
6639 {
6640 case OP_ANY:
6641 for (i = 1; i <= min; i++)
6642 {
6643 if (eptr >= md->end_subject ||
6644 (*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0))
6645 return MATCH_NOMATCH;
6646 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
6647 }
6648 break;
6649
6650 case OP_ANYBYTE:
6651 eptr += min;
6652 break;
6653
6654 case OP_NOT_DIGIT:
6655 for (i = 1; i <= min; i++)
6656 {
6657 if (eptr >= md->end_subject) return MATCH_NOMATCH;
6658 GETCHARINC(c, eptr);
6659 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
6660 return MATCH_NOMATCH;
6661 }
6662 break;
6663
6664 case OP_DIGIT:
6665 for (i = 1; i <= min; i++)
6666 {
6667 if (eptr >= md->end_subject ||
6668 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
6669 return MATCH_NOMATCH;
6670 /* No need to skip more bytes - we know it's a 1-byte character */
6671 }
6672 break;
6673
6674 case OP_NOT_WHITESPACE:
6675 for (i = 1; i <= min; i++)
6676 {
6677 if (eptr >= md->end_subject ||
6678 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
6679 return MATCH_NOMATCH;
6680 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
6681 }
6682 break;
6683
6684 case OP_WHITESPACE:
6685 for (i = 1; i <= min; i++)
6686 {
6687 if (eptr >= md->end_subject ||
6688 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
6689 return MATCH_NOMATCH;
6690 /* No need to skip more bytes - we know it's a 1-byte character */
6691 }
6692 break;
6693
6694 case OP_NOT_WORDCHAR:
6695 for (i = 1; i <= min; i++)
6696 {
6697 if (eptr >= md->end_subject ||
6698 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
6699 return MATCH_NOMATCH;
6700 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
6701 }
6702 break;
6703
6704 case OP_WORDCHAR:
6705 for (i = 1; i <= min; i++)
6706 {
6707 if (eptr >= md->end_subject ||
6708 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
6709 return MATCH_NOMATCH;
6710 /* No need to skip more bytes - we know it's a 1-byte character */
6711 }
6712 break;
6713 }
6714 else
6715 #endif
6716
6717 /* Code for the non-UTF-8 case for minimum matching */
6718
6719 switch(ctype)
6720 {
6721 case OP_ANY:
6722 if ((ims & PCRE_DOTALL) == 0)
6723 {
6724 for (i = 1; i <= min; i++)
6725 if (*eptr++ == NEWLINE) return MATCH_NOMATCH;
6726 }
6727 else eptr += min;
6728 break;
6729
6730 case OP_ANYBYTE:
6731 eptr += min;
6732 break;
6733
6734 case OP_NOT_DIGIT:
6735 for (i = 1; i <= min; i++)
6736 if ((md->ctypes[*eptr++] & ctype_digit) != 0) return MATCH_NOMATCH;
6737 break;
6738
6739 case OP_DIGIT:
6740 for (i = 1; i <= min; i++)
6741 if ((md->ctypes[*eptr++] & ctype_digit) == 0) return MATCH_NOMATCH;
6742 break;
6743
6744 case OP_NOT_WHITESPACE:
6745 for (i = 1; i <= min; i++)
6746 if ((md->ctypes[*eptr++] & ctype_space) != 0) return MATCH_NOMATCH;
6747 break;
6748
6749 case OP_WHITESPACE:
6750 for (i = 1; i <= min; i++)