/[pcre]/code/tags/pcre-4.3/pcre.c
ViewVC logotype

Contents of /code/tags/pcre-4.3/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 67 - (show annotations) (download)
Sat Feb 24 21:40:13 2007 UTC (7 years, 8 months ago) by nigel
Original Path: code/trunk/pcre.c
File MIME type: text/plain
File size: 228448 byte(s)
Load pcre-4.2 into code/trunk.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /*
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
9
10 Written by: Philip Hazel <ph10@cam.ac.uk>
11
12 Copyright (c) 1997-2003 University of Cambridge
13
14 -----------------------------------------------------------------------------
15 Permission is granted to anyone to use this software for any purpose on any
16 computer system, and to redistribute it freely, subject to the following
17 restrictions:
18
19 1. This software is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22
23 2. The origin of this software must not be misrepresented, either by
24 explicit claim or by omission.
25
26 3. Altered versions must be plainly marked as such, and must not be
27 misrepresented as being the original software.
28
29 4. If PCRE is embedded in any software that is released under the GNU
30 General Purpose Licence (GPL), then the terms of that licence shall
31 supersede any condition above with which it is incompatible.
32 -----------------------------------------------------------------------------
33 */
34
35 /* Define DEBUG to get debugging output on stdout. */
36
37 /* #define DEBUG */
38
39 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
40 inline, and there are *still* stupid compilers about that don't like indented
41 pre-processor statements. I suppose it's only been 10 years... */
42
43 #ifdef DEBUG
44 #define DPRINTF(p) printf p
45 #else
46 #define DPRINTF(p) /*nothing*/
47 #endif
48
49 /* Include the internals header, which itself includes Standard C headers plus
50 the external pcre header. */
51
52 #include "internal.h"
53
54
55 /* Allow compilation as C++ source code, should anybody want to do that. */
56
57 #ifdef __cplusplus
58 #define class pcre_class
59 #endif
60
61
62 /* Maximum number of items on the nested bracket stacks at compile time. This
63 applies to the nesting of all kinds of parentheses. It does not limit
64 un-nested, non-capturing parentheses. This number can be made bigger if
65 necessary - it is used to dimension one int and one unsigned char vector at
66 compile time. */
67
68 #define BRASTACK_SIZE 200
69
70
71 /* Maximum number of ints of offset to save on the stack for recursive calls.
72 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
73 because the offset vector is always a multiple of 3 long. */
74
75 #define REC_STACK_SAVE_MAX 30
76
77
78 /* The number of bytes in a literal character string above which we can't add
79 any more is set at 250 in order to allow for UTF-8 characters. (In theory it
80 could be 255 when UTF-8 support is excluded, but that means that some of the
81 test output would be different, which just complicates things.) */
82
83 #define MAXLIT 250
84
85
86 /* The maximum remaining length of subject we are prepared to search for a
87 req_byte match. */
88
89 #define REQ_BYTE_MAX 1000
90
91
92 /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
93 the definition is next to the definition of the opcodes in internal.h. */
94
95 static uschar OP_lengths[] = { OP_LENGTHS };
96
97 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
98
99 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
100 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
101
102 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
103 are simple data values; negative values are for special things like \d and so
104 on. Zero means further processing is needed (for things like \x), or the escape
105 is invalid. */
106
107 static const short int escapes[] = {
108 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
109 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
110 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
111 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
112 0, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
113 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
114 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
115 0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
116 0, 0, ESC_r, -ESC_s, ESC_t, 0, 0, -ESC_w, /* p - w */
117 0, 0, -ESC_z /* x - z */
118 };
119
120 /* Tables of names of POSIX character classes and their lengths. The list is
121 terminated by a zero length entry. The first three must be alpha, upper, lower,
122 as this is assumed for handling case independence. */
123
124 static const char *posix_names[] = {
125 "alpha", "lower", "upper",
126 "alnum", "ascii", "blank", "cntrl", "digit", "graph",
127 "print", "punct", "space", "word", "xdigit" };
128
129 static const uschar posix_name_lengths[] = {
130 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
131
132 /* Table of class bit maps for each POSIX class; up to three may be combined
133 to form the class. The table for [:blank:] is dynamically modified to remove
134 the vertical space characters. */
135
136 static const int posix_class_maps[] = {
137 cbit_lower, cbit_upper, -1, /* alpha */
138 cbit_lower, -1, -1, /* lower */
139 cbit_upper, -1, -1, /* upper */
140 cbit_digit, cbit_lower, cbit_upper, /* alnum */
141 cbit_print, cbit_cntrl, -1, /* ascii */
142 cbit_space, -1, -1, /* blank - a GNU extension */
143 cbit_cntrl, -1, -1, /* cntrl */
144 cbit_digit, -1, -1, /* digit */
145 cbit_graph, -1, -1, /* graph */
146 cbit_print, -1, -1, /* print */
147 cbit_punct, -1, -1, /* punct */
148 cbit_space, -1, -1, /* space */
149 cbit_word, -1, -1, /* word - a Perl extension */
150 cbit_xdigit,-1, -1 /* xdigit */
151 };
152
153
154 /* Definition to allow mutual recursion */
155
156 static BOOL
157 compile_regex(int, int, int *, uschar **, const uschar **, const char **,
158 BOOL, int, int *, int *, branch_chain *, compile_data *);
159
160 /* Structure for building a chain of data that actually lives on the
161 stack, for holding the values of the subject pointer at the start of each
162 subpattern, so as to detect when an empty string has been matched by a
163 subpattern - to break infinite loops. */
164
165 typedef struct eptrblock {
166 struct eptrblock *prev;
167 const uschar *saved_eptr;
168 } eptrblock;
169
170 /* Flag bits for the match() function */
171
172 #define match_condassert 0x01 /* Called to check a condition assertion */
173 #define match_isgroup 0x02 /* Set if start of bracketed group */
174
175 /* Non-error returns from the match() function. Error returns are externally
176 defined PCRE_ERROR_xxx codes, which are all negative. */
177
178 #define MATCH_MATCH 1
179 #define MATCH_NOMATCH 0
180
181
182
183 /*************************************************
184 * Global variables *
185 *************************************************/
186
187 /* PCRE is thread-clean and doesn't use any global variables in the normal
188 sense. However, it calls memory allocation and free functions via the two
189 indirections below, and it can optionally do callouts. These values can be
190 changed by the caller, but are shared between all threads. However, when
191 compiling for Virtual Pascal, things are done differently (see pcre.in). */
192
193 #ifndef VPCOMPAT
194 void *(*pcre_malloc)(size_t) = malloc;
195 void (*pcre_free)(void *) = free;
196 int (*pcre_callout)(pcre_callout_block *) = NULL;
197 #endif
198
199
200 /*************************************************
201 * Macros and tables for character handling *
202 *************************************************/
203
204 /* When UTF-8 encoding is being used, a character is no longer just a single
205 byte. The macros for character handling generate simple sequences when used in
206 byte-mode, and more complicated ones for UTF-8 characters. */
207
208 #ifndef SUPPORT_UTF8
209 #define GETCHAR(c, eptr) c = *eptr;
210 #define GETCHARINC(c, eptr) c = *eptr++;
211 #define GETCHARINCTEST(c, eptr) c = *eptr++;
212 #define GETCHARLEN(c, eptr, len) c = *eptr;
213 #define BACKCHAR(eptr)
214
215 #else /* SUPPORT_UTF8 */
216
217 /* Get the next UTF-8 character, not advancing the pointer. This is called when
218 we know we are in UTF-8 mode. */
219
220 #define GETCHAR(c, eptr) \
221 c = *eptr; \
222 if ((c & 0xc0) == 0xc0) \
223 { \
224 int gcii; \
225 int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
226 int gcss = 6*gcaa; \
227 c = (c & utf8_table3[gcaa]) << gcss; \
228 for (gcii = 1; gcii <= gcaa; gcii++) \
229 { \
230 gcss -= 6; \
231 c |= (eptr[gcii] & 0x3f) << gcss; \
232 } \
233 }
234
235 /* Get the next UTF-8 character, advancing the pointer. This is called when we
236 know we are in UTF-8 mode. */
237
238 #define GETCHARINC(c, eptr) \
239 c = *eptr++; \
240 if ((c & 0xc0) == 0xc0) \
241 { \
242 int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
243 int gcss = 6*gcaa; \
244 c = (c & utf8_table3[gcaa]) << gcss; \
245 while (gcaa-- > 0) \
246 { \
247 gcss -= 6; \
248 c |= (*eptr++ & 0x3f) << gcss; \
249 } \
250 }
251
252 /* Get the next character, testing for UTF-8 mode, and advancing the pointer */
253
254 #define GETCHARINCTEST(c, eptr) \
255 c = *eptr++; \
256 if (md->utf8 && (c & 0xc0) == 0xc0) \
257 { \
258 int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
259 int gcss = 6*gcaa; \
260 c = (c & utf8_table3[gcaa]) << gcss; \
261 while (gcaa-- > 0) \
262 { \
263 gcss -= 6; \
264 c |= (*eptr++ & 0x3f) << gcss; \
265 } \
266 }
267
268 /* Get the next UTF-8 character, not advancing the pointer, incrementing length
269 if there are extra bytes. This is called when we know we are in UTF-8 mode. */
270
271 #define GETCHARLEN(c, eptr, len) \
272 c = *eptr; \
273 if ((c & 0xc0) == 0xc0) \
274 { \
275 int gcii; \
276 int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
277 int gcss = 6*gcaa; \
278 c = (c & utf8_table3[gcaa]) << gcss; \
279 for (gcii = 1; gcii <= gcaa; gcii++) \
280 { \
281 gcss -= 6; \
282 c |= (eptr[gcii] & 0x3f) << gcss; \
283 } \
284 len += gcaa; \
285 }
286
287 /* If the pointer is not at the start of a character, move it back until
288 it is. Called only in UTF-8 mode. */
289
290 #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
291
292 #endif
293
294
295
296 /*************************************************
297 * Default character tables *
298 *************************************************/
299
300 /* A default set of character tables is included in the PCRE binary. Its source
301 is built by the maketables auxiliary program, which uses the default C ctypes
302 functions, and put in the file chartables.c. These tables are used by PCRE
303 whenever the caller of pcre_compile() does not provide an alternate set of
304 tables. */
305
306 #include "chartables.c"
307
308
309
310 #ifdef SUPPORT_UTF8
311 /*************************************************
312 * Tables for UTF-8 support *
313 *************************************************/
314
315 /* These are the breakpoints for different numbers of bytes in a UTF-8
316 character. */
317
318 static int utf8_table1[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
319
320 /* These are the indicator bits and the mask for the data bits to set in the
321 first byte of a character, indexed by the number of additional bytes. */
322
323 static int utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
324 static int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
325
326 /* Table of the number of extra characters, indexed by the first character
327 masked with 0x3f. The highest number for a valid UTF-8 character is in fact
328 0x3d. */
329
330 static uschar utf8_table4[] = {
331 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
332 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
333 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
334 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
335
336
337 /*************************************************
338 * Convert character value to UTF-8 *
339 *************************************************/
340
341 /* This function takes an integer value in the range 0 - 0x7fffffff
342 and encodes it as a UTF-8 character in 0 to 6 bytes.
343
344 Arguments:
345 cvalue the character value
346 buffer pointer to buffer for result - at least 6 bytes long
347
348 Returns: number of characters placed in the buffer
349 */
350
351 static int
352 ord2utf8(int cvalue, uschar *buffer)
353 {
354 register int i, j;
355 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
356 if (cvalue <= utf8_table1[i]) break;
357 buffer += i;
358 for (j = i; j > 0; j--)
359 {
360 *buffer-- = 0x80 | (cvalue & 0x3f);
361 cvalue >>= 6;
362 }
363 *buffer = utf8_table2[i] | cvalue;
364 return i + 1;
365 }
366 #endif
367
368
369
370 /*************************************************
371 * Print compiled regex *
372 *************************************************/
373
374 /* The code for doing this is held in a separate file that is also included in
375 pcretest.c. It defines a function called print_internals(). */
376
377 #ifdef DEBUG
378 #include "printint.c"
379 #endif
380
381
382
383 /*************************************************
384 * Return version string *
385 *************************************************/
386
387 #define STRING(a) # a
388 #define XSTRING(s) STRING(s)
389
390 const char *
391 pcre_version(void)
392 {
393 return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
394 }
395
396
397
398
399 /*************************************************
400 * (Obsolete) Return info about compiled pattern *
401 *************************************************/
402
403 /* This is the original "info" function. It picks potentially useful data out
404 of the private structure, but its interface was too rigid. It remains for
405 backwards compatibility. The public options are passed back in an int - though
406 the re->options field has been expanded to a long int, all the public options
407 at the low end of it, and so even on 16-bit systems this will still be OK.
408 Therefore, I haven't changed the API for pcre_info().
409
410 Arguments:
411 external_re points to compiled code
412 optptr where to pass back the options
413 first_byte where to pass back the first character,
414 or -1 if multiline and all branches start ^,
415 or -2 otherwise
416
417 Returns: number of capturing subpatterns
418 or negative values on error
419 */
420
421 int
422 pcre_info(const pcre *external_re, int *optptr, int *first_byte)
423 {
424 const real_pcre *re = (const real_pcre *)external_re;
425 if (re == NULL) return PCRE_ERROR_NULL;
426 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
427 if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
428 if (first_byte != NULL)
429 *first_byte = ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
430 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
431 return re->top_bracket;
432 }
433
434
435
436 /*************************************************
437 * Return info about compiled pattern *
438 *************************************************/
439
440 /* This is a newer "info" function which has an extensible interface so
441 that additional items can be added compatibly.
442
443 Arguments:
444 external_re points to compiled code
445 extra_data points extra data, or NULL
446 what what information is required
447 where where to put the information
448
449 Returns: 0 if data returned, negative on error
450 */
451
452 int
453 pcre_fullinfo(const pcre *external_re, const pcre_extra *extra_data, int what,
454 void *where)
455 {
456 const real_pcre *re = (const real_pcre *)external_re;
457 const pcre_study_data *study = NULL;
458
459 if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
460 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
461
462 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0)
463 study = extra_data->study_data;
464
465 switch (what)
466 {
467 case PCRE_INFO_OPTIONS:
468 *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
469 break;
470
471 case PCRE_INFO_SIZE:
472 *((size_t *)where) = re->size;
473 break;
474
475 case PCRE_INFO_STUDYSIZE:
476 *((size_t *)where) = (study == NULL)? 0 : study->size;
477 break;
478
479 case PCRE_INFO_CAPTURECOUNT:
480 *((int *)where) = re->top_bracket;
481 break;
482
483 case PCRE_INFO_BACKREFMAX:
484 *((int *)where) = re->top_backref;
485 break;
486
487 case PCRE_INFO_FIRSTBYTE:
488 *((int *)where) =
489 ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
490 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
491 break;
492
493 case PCRE_INFO_FIRSTTABLE:
494 *((const uschar **)where) =
495 (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
496 study->start_bits : NULL;
497 break;
498
499 case PCRE_INFO_LASTLITERAL:
500 *((int *)where) =
501 ((re->options & PCRE_REQCHSET) != 0)? re->req_byte : -1;
502 break;
503
504 case PCRE_INFO_NAMEENTRYSIZE:
505 *((int *)where) = re->name_entry_size;
506 break;
507
508 case PCRE_INFO_NAMECOUNT:
509 *((int *)where) = re->name_count;
510 break;
511
512 case PCRE_INFO_NAMETABLE:
513 *((const uschar **)where) = (const uschar *)re + sizeof(real_pcre);
514 break;
515
516 default: return PCRE_ERROR_BADOPTION;
517 }
518
519 return 0;
520 }
521
522
523
524 /*************************************************
525 * Return info about what features are configured *
526 *************************************************/
527
528 /* This is function which has an extensible interface so that additional items
529 can be added compatibly.
530
531 Arguments:
532 what what information is required
533 where where to put the information
534
535 Returns: 0 if data returned, negative on error
536 */
537
538 int
539 pcre_config(int what, void *where)
540 {
541 switch (what)
542 {
543 case PCRE_CONFIG_UTF8:
544 #ifdef SUPPORT_UTF8
545 *((int *)where) = 1;
546 #else
547 *((int *)where) = 0;
548 #endif
549 break;
550
551 case PCRE_CONFIG_NEWLINE:
552 *((int *)where) = NEWLINE;
553 break;
554
555 case PCRE_CONFIG_LINK_SIZE:
556 *((int *)where) = LINK_SIZE;
557 break;
558
559 case PCRE_CONFIG_POSIX_MALLOC_THRESHOLD:
560 *((int *)where) = POSIX_MALLOC_THRESHOLD;
561 break;
562
563 case PCRE_CONFIG_MATCH_LIMIT:
564 *((unsigned int *)where) = MATCH_LIMIT;
565 break;
566
567 default: return PCRE_ERROR_BADOPTION;
568 }
569
570 return 0;
571 }
572
573
574
575 #ifdef DEBUG
576 /*************************************************
577 * Debugging function to print chars *
578 *************************************************/
579
580 /* Print a sequence of chars in printable format, stopping at the end of the
581 subject if the requested.
582
583 Arguments:
584 p points to characters
585 length number to print
586 is_subject TRUE if printing from within md->start_subject
587 md pointer to matching data block, if is_subject is TRUE
588
589 Returns: nothing
590 */
591
592 static void
593 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
594 {
595 int c;
596 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
597 while (length-- > 0)
598 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
599 }
600 #endif
601
602
603
604
605 /*************************************************
606 * Handle escapes *
607 *************************************************/
608
609 /* This function is called when a \ has been encountered. It either returns a
610 positive value for a simple escape such as \n, or a negative value which
611 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
612 a positive value greater than 255 may be returned. On entry, ptr is pointing at
613 the \. On exit, it is on the final character of the escape sequence.
614
615 Arguments:
616 ptrptr points to the pattern position pointer
617 errorptr points to the pointer to the error message
618 bracount number of previous extracting brackets
619 options the options bits
620 isclass TRUE if inside a character class
621 cd pointer to char tables block
622
623 Returns: zero or positive => a data character
624 negative => a special escape sequence
625 on error, errorptr is set
626 */
627
628 static int
629 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
630 int options, BOOL isclass, compile_data *cd)
631 {
632 const uschar *ptr = *ptrptr;
633 int c, i;
634
635 /* If backslash is at the end of the pattern, it's an error. */
636
637 c = *(++ptr);
638 if (c == 0) *errorptr = ERR1;
639
640 /* Digits or letters may have special meaning; all others are literals. */
641
642 else if (c < '0' || c > 'z') {}
643
644 /* Do an initial lookup in a table. A non-zero result is something that can be
645 returned immediately. Otherwise further processing may be required. */
646
647 else if ((i = escapes[c - '0']) != 0) c = i;
648
649 /* Escapes that need further processing, or are illegal. */
650
651 else
652 {
653 const uschar *oldptr;
654 switch (c)
655 {
656 /* A number of Perl escapes are not handled by PCRE. We give an explicit
657 error. */
658
659 case 'l':
660 case 'L':
661 case 'N':
662 case 'p':
663 case 'P':
664 case 'u':
665 case 'U':
666 case 'X':
667 *errorptr = ERR37;
668 break;
669
670 /* The handling of escape sequences consisting of a string of digits
671 starting with one that is not zero is not straightforward. By experiment,
672 the way Perl works seems to be as follows:
673
674 Outside a character class, the digits are read as a decimal number. If the
675 number is less than 10, or if there are that many previous extracting
676 left brackets, then it is a back reference. Otherwise, up to three octal
677 digits are read to form an escaped byte. Thus \123 is likely to be octal
678 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
679 value is greater than 377, the least significant 8 bits are taken. Inside a
680 character class, \ followed by a digit is always an octal number. */
681
682 case '1': case '2': case '3': case '4': case '5':
683 case '6': case '7': case '8': case '9':
684
685 if (!isclass)
686 {
687 oldptr = ptr;
688 c -= '0';
689 while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
690 c = c * 10 + *(++ptr) - '0';
691 if (c < 10 || c <= bracount)
692 {
693 c = -(ESC_REF + c);
694 break;
695 }
696 ptr = oldptr; /* Put the pointer back and fall through */
697 }
698
699 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
700 generates a binary zero byte and treats the digit as a following literal.
701 Thus we have to pull back the pointer by one. */
702
703 if ((c = *ptr) >= '8')
704 {
705 ptr--;
706 c = 0;
707 break;
708 }
709
710 /* \0 always starts an octal number, but we may drop through to here with a
711 larger first octal digit. */
712
713 case '0':
714 c -= '0';
715 while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
716 ptr[1] != '8' && ptr[1] != '9')
717 c = c * 8 + *(++ptr) - '0';
718 c &= 255; /* Take least significant 8 bits */
719 break;
720
721 /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
722 which can be greater than 0xff, but only if the ddd are hex digits. */
723
724 case 'x':
725 #ifdef SUPPORT_UTF8
726 if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
727 {
728 const uschar *pt = ptr + 2;
729 register int count = 0;
730 c = 0;
731 while ((cd->ctypes[*pt] & ctype_xdigit) != 0)
732 {
733 count++;
734 c = c * 16 + cd->lcc[*pt] -
735 (((cd->ctypes[*pt] & ctype_digit) != 0)? '0' : 'W');
736 pt++;
737 }
738 if (*pt == '}')
739 {
740 if (c < 0 || count > 8) *errorptr = ERR34;
741 ptr = pt;
742 break;
743 }
744 /* If the sequence of hex digits does not end with '}', then we don't
745 recognize this construct; fall through to the normal \x handling. */
746 }
747 #endif
748
749 /* Read just a single hex char */
750
751 c = 0;
752 while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
753 {
754 ptr++;
755 c = c * 16 + cd->lcc[*ptr] -
756 (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
757 }
758 break;
759
760 /* Other special escapes not starting with a digit are straightforward */
761
762 case 'c':
763 c = *(++ptr);
764 if (c == 0)
765 {
766 *errorptr = ERR2;
767 return 0;
768 }
769
770 /* A letter is upper-cased; then the 0x40 bit is flipped */
771
772 if (c >= 'a' && c <= 'z') c = cd->fcc[c];
773 c ^= 0x40;
774 break;
775
776 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
777 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
778 for Perl compatibility, it is a literal. This code looks a bit odd, but
779 there used to be some cases other than the default, and there may be again
780 in future, so I haven't "optimized" it. */
781
782 default:
783 if ((options & PCRE_EXTRA) != 0) switch(c)
784 {
785 default:
786 *errorptr = ERR3;
787 break;
788 }
789 break;
790 }
791 }
792
793 *ptrptr = ptr;
794 return c;
795 }
796
797
798
799 /*************************************************
800 * Check for counted repeat *
801 *************************************************/
802
803 /* This function is called when a '{' is encountered in a place where it might
804 start a quantifier. It looks ahead to see if it really is a quantifier or not.
805 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
806 where the ddds are digits.
807
808 Arguments:
809 p pointer to the first char after '{'
810 cd pointer to char tables block
811
812 Returns: TRUE or FALSE
813 */
814
815 static BOOL
816 is_counted_repeat(const uschar *p, compile_data *cd)
817 {
818 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
819 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
820 if (*p == '}') return TRUE;
821
822 if (*p++ != ',') return FALSE;
823 if (*p == '}') return TRUE;
824
825 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
826 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
827 return (*p == '}');
828 }
829
830
831
832 /*************************************************
833 * Read repeat counts *
834 *************************************************/
835
836 /* Read an item of the form {n,m} and return the values. This is called only
837 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
838 so the syntax is guaranteed to be correct, but we need to check the values.
839
840 Arguments:
841 p pointer to first char after '{'
842 minp pointer to int for min
843 maxp pointer to int for max
844 returned as -1 if no max
845 errorptr points to pointer to error message
846 cd pointer to character tables clock
847
848 Returns: pointer to '}' on success;
849 current ptr on error, with errorptr set
850 */
851
852 static const uschar *
853 read_repeat_counts(const uschar *p, int *minp, int *maxp,
854 const char **errorptr, compile_data *cd)
855 {
856 int min = 0;
857 int max = -1;
858
859 while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
860
861 if (*p == '}') max = min; else
862 {
863 if (*(++p) != '}')
864 {
865 max = 0;
866 while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
867 if (max < min)
868 {
869 *errorptr = ERR4;
870 return p;
871 }
872 }
873 }
874
875 /* Do paranoid checks, then fill in the required variables, and pass back the
876 pointer to the terminating '}'. */
877
878 if (min > 65535 || max > 65535)
879 *errorptr = ERR5;
880 else
881 {
882 *minp = min;
883 *maxp = max;
884 }
885 return p;
886 }
887
888
889
890 /*************************************************
891 * Find first significant op code *
892 *************************************************/
893
894 /* This is called by several functions that scan a compiled expression looking
895 for a fixed first character, or an anchoring op code etc. It skips over things
896 that do not influence this. For some calls, a change of option is important.
897
898 Arguments:
899 code pointer to the start of the group
900 options pointer to external options
901 optbit the option bit whose changing is significant, or
902 zero if none are
903
904 Returns: pointer to the first significant opcode
905 */
906
907 static const uschar*
908 first_significant_code(const uschar *code, int *options, int optbit)
909 {
910 for (;;)
911 {
912 switch ((int)*code)
913 {
914 case OP_OPT:
915 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
916 *options = (int)code[1];
917 code += 2;
918 break;
919
920 case OP_ASSERT_NOT:
921 case OP_ASSERTBACK:
922 case OP_ASSERTBACK_NOT:
923 do code += GET(code, 1); while (*code == OP_ALT);
924 /* Fall through */
925
926 case OP_CALLOUT:
927 case OP_CREF:
928 case OP_BRANUMBER:
929 case OP_WORD_BOUNDARY:
930 case OP_NOT_WORD_BOUNDARY:
931 code += OP_lengths[*code];
932 break;
933
934 default:
935 return code;
936 }
937 }
938 /* Control never reaches here */
939 }
940
941
942
943
944 /*************************************************
945 * Find the fixed length of a pattern *
946 *************************************************/
947
948 /* Scan a pattern and compute the fixed length of subject that will match it,
949 if the length is fixed. This is needed for dealing with backward assertions.
950 In UTF8 mode, the result is in characters rather than bytes.
951
952 Arguments:
953 code points to the start of the pattern (the bracket)
954 options the compiling options
955
956 Returns: the fixed length, or -1 if there is no fixed length,
957 or -2 if \C was encountered
958 */
959
960 static int
961 find_fixedlength(uschar *code, int options)
962 {
963 int length = -1;
964
965 register int branchlength = 0;
966 register uschar *cc = code + 1 + LINK_SIZE;
967
968 /* Scan along the opcodes for this branch. If we get to the end of the
969 branch, check the length against that of the other branches. */
970
971 for (;;)
972 {
973 int d;
974 register int op = *cc;
975 if (op >= OP_BRA) op = OP_BRA;
976
977 switch (op)
978 {
979 case OP_BRA:
980 case OP_ONCE:
981 case OP_COND:
982 d = find_fixedlength(cc, options);
983 if (d < 0) return d;
984 branchlength += d;
985 do cc += GET(cc, 1); while (*cc == OP_ALT);
986 cc += 1 + LINK_SIZE;
987 break;
988
989 /* Reached end of a branch; if it's a ket it is the end of a nested
990 call. If it's ALT it is an alternation in a nested call. If it is
991 END it's the end of the outer call. All can be handled by the same code. */
992
993 case OP_ALT:
994 case OP_KET:
995 case OP_KETRMAX:
996 case OP_KETRMIN:
997 case OP_END:
998 if (length < 0) length = branchlength;
999 else if (length != branchlength) return -1;
1000 if (*cc != OP_ALT) return length;
1001 cc += 1 + LINK_SIZE;
1002 branchlength = 0;
1003 break;
1004
1005 /* Skip over assertive subpatterns */
1006
1007 case OP_ASSERT:
1008 case OP_ASSERT_NOT:
1009 case OP_ASSERTBACK:
1010 case OP_ASSERTBACK_NOT:
1011 do cc += GET(cc, 1); while (*cc == OP_ALT);
1012 /* Fall through */
1013
1014 /* Skip over things that don't match chars */
1015
1016 case OP_REVERSE:
1017 case OP_BRANUMBER:
1018 case OP_CREF:
1019 case OP_OPT:
1020 case OP_CALLOUT:
1021 case OP_SOD:
1022 case OP_SOM:
1023 case OP_EOD:
1024 case OP_EODN:
1025 case OP_CIRC:
1026 case OP_DOLL:
1027 case OP_NOT_WORD_BOUNDARY:
1028 case OP_WORD_BOUNDARY:
1029 cc += OP_lengths[*cc];
1030 break;
1031
1032 /* Handle char strings. In UTF-8 mode we must count characters, not bytes.
1033 This requires a scan of the string, unfortunately. We assume valid UTF-8
1034 strings, so all we do is reduce the length by one for every byte whose bits
1035 are 10xxxxxx. */
1036
1037 case OP_CHARS:
1038 branchlength += *(++cc);
1039 #ifdef SUPPORT_UTF8
1040 if ((options & PCRE_UTF8) != 0)
1041 for (d = 1; d <= *cc; d++)
1042 if ((cc[d] & 0xc0) == 0x80) branchlength--;
1043 #endif
1044 cc += *cc + 1;
1045 break;
1046
1047 /* Handle exact repetitions. The count is already in characters, but we
1048 need to skip over a multibyte character in UTF8 mode. */
1049
1050 case OP_EXACT:
1051 branchlength += GET2(cc,1);
1052 cc += 4;
1053 #ifdef SUPPORT_UTF8
1054 if ((options & PCRE_UTF8) != 0)
1055 {
1056 while((*cc & 0x80) == 0x80) cc++;
1057 }
1058 #endif
1059 break;
1060
1061 case OP_TYPEEXACT:
1062 branchlength += GET2(cc,1);
1063 cc += 4;
1064 break;
1065
1066 /* Handle single-char matchers */
1067
1068 case OP_NOT_DIGIT:
1069 case OP_DIGIT:
1070 case OP_NOT_WHITESPACE:
1071 case OP_WHITESPACE:
1072 case OP_NOT_WORDCHAR:
1073 case OP_WORDCHAR:
1074 case OP_ANY:
1075 branchlength++;
1076 cc++;
1077 break;
1078
1079 /* The single-byte matcher isn't allowed */
1080
1081 case OP_ANYBYTE:
1082 return -2;
1083
1084 /* Check a class for variable quantification */
1085
1086 #ifdef SUPPORT_UTF8
1087 case OP_XCLASS:
1088 cc += GET(cc, 1) - 33;
1089 /* Fall through */
1090 #endif
1091
1092 case OP_CLASS:
1093 case OP_NCLASS:
1094 cc += 33;
1095
1096 switch (*cc)
1097 {
1098 case OP_CRSTAR:
1099 case OP_CRMINSTAR:
1100 case OP_CRQUERY:
1101 case OP_CRMINQUERY:
1102 return -1;
1103
1104 case OP_CRRANGE:
1105 case OP_CRMINRANGE:
1106 if (GET2(cc,1) != GET2(cc,3)) return -1;
1107 branchlength += GET2(cc,1);
1108 cc += 5;
1109 break;
1110
1111 default:
1112 branchlength++;
1113 }
1114 break;
1115
1116 /* Anything else is variable length */
1117
1118 default:
1119 return -1;
1120 }
1121 }
1122 /* Control never gets here */
1123 }
1124
1125
1126
1127
1128 /*************************************************
1129 * Scan compiled regex for numbered bracket *
1130 *************************************************/
1131
1132 /* This little function scans through a compiled pattern until it finds a
1133 capturing bracket with the given number.
1134
1135 Arguments:
1136 code points to start of expression
1137 utf8 TRUE in UTF-8 mode
1138 number the required bracket number
1139
1140 Returns: pointer to the opcode for the bracket, or NULL if not found
1141 */
1142
1143 static const uschar *
1144 find_bracket(const uschar *code, BOOL utf8, int number)
1145 {
1146 #ifndef SUPPORT_UTF8
1147 utf8 = utf8; /* Stop pedantic compilers complaining */
1148 #endif
1149
1150 for (;;)
1151 {
1152 register int c = *code;
1153 if (c == OP_END) return NULL;
1154 else if (c == OP_CHARS) code += code[1] + OP_lengths[c];
1155 else if (c > OP_BRA)
1156 {
1157 int n = c - OP_BRA;
1158 if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1159 if (n == number) return (uschar *)code;
1160 code += OP_lengths[OP_BRA];
1161 }
1162 else
1163 {
1164 code += OP_lengths[c];
1165
1166 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1167 by a multi-byte character. The length in the table is a minimum, so we have
1168 to scan along to skip the extra characters. All opcodes are less than 128,
1169 so we can use relatively efficient code. */
1170
1171 #ifdef SUPPORT_UTF8
1172 if (utf8) switch(c)
1173 {
1174 case OP_EXACT:
1175 case OP_UPTO:
1176 case OP_MINUPTO:
1177 case OP_STAR:
1178 case OP_MINSTAR:
1179 case OP_PLUS:
1180 case OP_MINPLUS:
1181 case OP_QUERY:
1182 case OP_MINQUERY:
1183 while ((*code & 0xc0) == 0x80) code++;
1184 break;
1185 }
1186 #endif
1187 }
1188 }
1189 }
1190
1191
1192
1193 /*************************************************
1194 * Scan compiled branch for non-emptiness *
1195 *************************************************/
1196
1197 /* This function scans through a branch of a compiled pattern to see whether it
1198 can match the empty string or not. It is called only from could_be_empty()
1199 below. Note that first_significant_code() skips over assertions. If we hit an
1200 unclosed bracket, we return "empty" - this means we've struck an inner bracket
1201 whose current branch will already have been scanned.
1202
1203 Arguments:
1204 code points to start of search
1205 endcode points to where to stop
1206 utf8 TRUE if in UTF8 mode
1207
1208 Returns: TRUE if what is matched could be empty
1209 */
1210
1211 static BOOL
1212 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1213 {
1214 register int c;
1215 for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0);
1216 code < endcode;
1217 code = first_significant_code(code + OP_lengths[c], NULL, 0))
1218 {
1219 const uschar *ccode;
1220
1221 c = *code;
1222
1223 if (c >= OP_BRA)
1224 {
1225 BOOL empty_branch;
1226 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1227
1228 /* Scan a closed bracket */
1229
1230 empty_branch = FALSE;
1231 do
1232 {
1233 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1234 empty_branch = TRUE;
1235 code += GET(code, 1);
1236 }
1237 while (*code == OP_ALT);
1238 if (!empty_branch) return FALSE; /* All branches are non-empty */
1239 code += 1 + LINK_SIZE;
1240 c = *code;
1241 }
1242
1243 else switch (c)
1244 {
1245 /* Check for quantifiers after a class */
1246
1247 #ifdef SUPPORT_UTF8
1248 case OP_XCLASS:
1249 ccode = code + GET(code, 1);
1250 goto CHECK_CLASS_REPEAT;
1251 #endif
1252
1253 case OP_CLASS:
1254 case OP_NCLASS:
1255 ccode = code + 33;
1256
1257 #ifdef SUPPORT_UTF8
1258 CHECK_CLASS_REPEAT:
1259 #endif
1260
1261 switch (*ccode)
1262 {
1263 case OP_CRSTAR: /* These could be empty; continue */
1264 case OP_CRMINSTAR:
1265 case OP_CRQUERY:
1266 case OP_CRMINQUERY:
1267 break;
1268
1269 default: /* Non-repeat => class must match */
1270 case OP_CRPLUS: /* These repeats aren't empty */
1271 case OP_CRMINPLUS:
1272 return FALSE;
1273
1274 case OP_CRRANGE:
1275 case OP_CRMINRANGE:
1276 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1277 break;
1278 }
1279 break;
1280
1281 /* Opcodes that must match a character */
1282
1283 case OP_NOT_DIGIT:
1284 case OP_DIGIT:
1285 case OP_NOT_WHITESPACE:
1286 case OP_WHITESPACE:
1287 case OP_NOT_WORDCHAR:
1288 case OP_WORDCHAR:
1289 case OP_ANY:
1290 case OP_ANYBYTE:
1291 case OP_CHARS:
1292 case OP_NOT:
1293 case OP_PLUS:
1294 case OP_MINPLUS:
1295 case OP_EXACT:
1296 case OP_NOTPLUS:
1297 case OP_NOTMINPLUS:
1298 case OP_NOTEXACT:
1299 case OP_TYPEPLUS:
1300 case OP_TYPEMINPLUS:
1301 case OP_TYPEEXACT:
1302 return FALSE;
1303
1304 /* End of branch */
1305
1306 case OP_KET:
1307 case OP_KETRMAX:
1308 case OP_KETRMIN:
1309 case OP_ALT:
1310 return TRUE;
1311
1312 /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be
1313 followed by a multibyte character */
1314
1315 #ifdef SUPPORT_UTF8
1316 case OP_STAR:
1317 case OP_MINSTAR:
1318 case OP_QUERY:
1319 case OP_MINQUERY:
1320 case OP_UPTO:
1321 case OP_MINUPTO:
1322 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1323 break;
1324 #endif
1325 }
1326 }
1327
1328 return TRUE;
1329 }
1330
1331
1332
1333 /*************************************************
1334 * Scan compiled regex for non-emptiness *
1335 *************************************************/
1336
1337 /* This function is called to check for left recursive calls. We want to check
1338 the current branch of the current pattern to see if it could match the empty
1339 string. If it could, we must look outwards for branches at other levels,
1340 stopping when we pass beyond the bracket which is the subject of the recursion.
1341
1342 Arguments:
1343 code points to start of the recursion
1344 endcode points to where to stop (current RECURSE item)
1345 bcptr points to the chain of current (unclosed) branch starts
1346 utf8 TRUE if in UTF-8 mode
1347
1348 Returns: TRUE if what is matched could be empty
1349 */
1350
1351 static BOOL
1352 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1353 BOOL utf8)
1354 {
1355 while (bcptr != NULL && bcptr->current >= code)
1356 {
1357 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1358 bcptr = bcptr->outer;
1359 }
1360 return TRUE;
1361 }
1362
1363
1364
1365 /*************************************************
1366 * Check for POSIX class syntax *
1367 *************************************************/
1368
1369 /* This function is called when the sequence "[:" or "[." or "[=" is
1370 encountered in a character class. It checks whether this is followed by an
1371 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1372 ".]" or "=]".
1373
1374 Argument:
1375 ptr pointer to the initial [
1376 endptr where to return the end pointer
1377 cd pointer to compile data
1378
1379 Returns: TRUE or FALSE
1380 */
1381
1382 static BOOL
1383 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1384 {
1385 int terminator; /* Don't combine these lines; the Solaris cc */
1386 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1387 if (*(++ptr) == '^') ptr++;
1388 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1389 if (*ptr == terminator && ptr[1] == ']')
1390 {
1391 *endptr = ptr;
1392 return TRUE;
1393 }
1394 return FALSE;
1395 }
1396
1397
1398
1399
1400 /*************************************************
1401 * Check POSIX class name *
1402 *************************************************/
1403
1404 /* This function is called to check the name given in a POSIX-style class entry
1405 such as [:alnum:].
1406
1407 Arguments:
1408 ptr points to the first letter
1409 len the length of the name
1410
1411 Returns: a value representing the name, or -1 if unknown
1412 */
1413
1414 static int
1415 check_posix_name(const uschar *ptr, int len)
1416 {
1417 register int yield = 0;
1418 while (posix_name_lengths[yield] != 0)
1419 {
1420 if (len == posix_name_lengths[yield] &&
1421 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1422 yield++;
1423 }
1424 return -1;
1425 }
1426
1427
1428
1429
1430 /*************************************************
1431 * Compile one branch *
1432 *************************************************/
1433
1434 /* Scan the pattern, compiling it into the code vector. If the options are
1435 changed during the branch, the pointer is used to change the external options
1436 bits.
1437
1438 Arguments:
1439 optionsptr pointer to the option bits
1440 brackets points to number of extracting brackets used
1441 code points to the pointer to the current code point
1442 ptrptr points to the current pattern pointer
1443 errorptr points to pointer to error message
1444 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
1445 reqbyteptr set to the last literal character required, else < 0
1446 bcptr points to current branch chain
1447 cd contains pointers to tables etc.
1448
1449 Returns: TRUE on success
1450 FALSE, with *errorptr set on error
1451 */
1452
1453 static BOOL
1454 compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
1455 const uschar **ptrptr, const char **errorptr, int *firstbyteptr,
1456 int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
1457 {
1458 int repeat_type, op_type;
1459 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
1460 int bravalue = 0;
1461 int length;
1462 int greedy_default, greedy_non_default;
1463 int firstbyte, reqbyte;
1464 int zeroreqbyte, zerofirstbyte;
1465 int req_caseopt, reqvary, tempreqvary;
1466 int condcount = 0;
1467 int options = *optionsptr;
1468 register int c;
1469 register uschar *code = *codeptr;
1470 uschar *tempcode;
1471 BOOL inescq = FALSE;
1472 BOOL groupsetfirstbyte = FALSE;
1473 const uschar *ptr = *ptrptr;
1474 const uschar *tempptr;
1475 uschar *previous = NULL;
1476 uschar class[32];
1477
1478 #ifdef SUPPORT_UTF8
1479 BOOL class_utf8;
1480 BOOL utf8 = (options & PCRE_UTF8) != 0;
1481 uschar *class_utf8data;
1482 uschar utf8_char[6];
1483 #else
1484 BOOL utf8 = FALSE;
1485 #endif
1486
1487 /* Set up the default and non-default settings for greediness */
1488
1489 greedy_default = ((options & PCRE_UNGREEDY) != 0);
1490 greedy_non_default = greedy_default ^ 1;
1491
1492 /* Initialize no first char, no required char. REQ_UNSET means "no char
1493 matching encountered yet". It gets changed to REQ_NONE if we hit something that
1494 matches a non-fixed char first char; reqbyte just remains unset if we never
1495 find one.
1496
1497 When we hit a repeat whose minimum is zero, we may have to adjust these values
1498 to take the zero repeat into account. This is implemented by setting them to
1499 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
1500 item types that can be repeated set these backoff variables appropriately. */
1501
1502 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
1503
1504 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
1505 according to the current setting of the caseless flag. REQ_CASELESS is a bit
1506 value > 255. It is added into the firstbyte or reqbyte variables to record the
1507 case status of the value. */
1508
1509 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
1510
1511 /* Switch on next character until the end of the branch */
1512
1513 for (;; ptr++)
1514 {
1515 BOOL negate_class;
1516 BOOL possessive_quantifier;
1517 int class_charcount;
1518 int class_lastchar;
1519 int newoptions;
1520 int recno;
1521 int skipbytes;
1522 int subreqbyte;
1523 int subfirstbyte;
1524
1525 c = *ptr;
1526 if (inescq && c != 0) goto NORMAL_CHAR;
1527
1528 if ((options & PCRE_EXTENDED) != 0)
1529 {
1530 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1531 if (c == '#')
1532 {
1533 /* The space before the ; is to avoid a warning on a silly compiler
1534 on the Macintosh. */
1535 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
1536 if (c != 0) continue; /* Else fall through to handle end of string */
1537 }
1538 }
1539
1540 switch(c)
1541 {
1542 /* The branch terminates at end of string, |, or ). */
1543
1544 case 0:
1545 case '|':
1546 case ')':
1547 *firstbyteptr = firstbyte;
1548 *reqbyteptr = reqbyte;
1549 *codeptr = code;
1550 *ptrptr = ptr;
1551 return TRUE;
1552
1553 /* Handle single-character metacharacters. In multiline mode, ^ disables
1554 the setting of any following char as a first character. */
1555
1556 case '^':
1557 if ((options & PCRE_MULTILINE) != 0)
1558 {
1559 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1560 }
1561 previous = NULL;
1562 *code++ = OP_CIRC;
1563 break;
1564
1565 case '$':
1566 previous = NULL;
1567 *code++ = OP_DOLL;
1568 break;
1569
1570 /* There can never be a first char if '.' is first, whatever happens about
1571 repeats. The value of reqbyte doesn't change either. */
1572
1573 case '.':
1574 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1575 zerofirstbyte = firstbyte;
1576 zeroreqbyte = reqbyte;
1577 previous = code;
1578 *code++ = OP_ANY;
1579 break;
1580
1581 /* Character classes. If the included characters are all < 255 in value, we
1582 build a 32-byte bitmap of the permitted characters, except in the special
1583 case where there is only one such character. For negated classes, we build
1584 the map as usual, then invert it at the end. However, we use a different
1585 opcode so that data characters > 255 can be handled correctly.
1586
1587 If the class contains characters outside the 0-255 range, a different
1588 opcode is compiled. It may optionally have a bit map for characters < 256,
1589 but those above are are explicitly listed afterwards. A flag byte tells
1590 whether the bitmap is present, and whether this is a negated class or not.
1591 */
1592
1593 case '[':
1594 previous = code;
1595
1596 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
1597 they are encountered at the top level, so we'll do that too. */
1598
1599 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1600 check_posix_syntax(ptr, &tempptr, cd))
1601 {
1602 *errorptr = (ptr[1] == ':')? ERR13 : ERR31;
1603 goto FAILED;
1604 }
1605
1606 /* If the first character is '^', set the negation flag and skip it. */
1607
1608 if ((c = *(++ptr)) == '^')
1609 {
1610 negate_class = TRUE;
1611 c = *(++ptr);
1612 }
1613 else
1614 {
1615 negate_class = FALSE;
1616 }
1617
1618 /* Keep a count of chars with values < 256 so that we can optimize the case
1619 of just a single character (as long as it's < 256). For higher valued UTF-8
1620 characters, we don't yet do any optimization. */
1621
1622 class_charcount = 0;
1623 class_lastchar = -1;
1624
1625 #ifdef SUPPORT_UTF8
1626 class_utf8 = FALSE; /* No chars >= 256 */
1627 class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
1628 #endif
1629
1630 /* Initialize the 32-char bit map to all zeros. We have to build the
1631 map in a temporary bit of store, in case the class contains only 1
1632 character (< 256), because in that case the compiled code doesn't use the
1633 bit map. */
1634
1635 memset(class, 0, 32 * sizeof(uschar));
1636
1637 /* Process characters until ] is reached. By writing this as a "do" it
1638 means that an initial ] is taken as a data character. The first pass
1639 through the regex checked the overall syntax, so we don't need to be very
1640 strict here. At the start of the loop, c contains the first byte of the
1641 character. */
1642
1643 do
1644 {
1645 #ifdef SUPPORT_UTF8
1646 if (utf8 && c > 127)
1647 { /* Braces are required because the */
1648 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
1649 }
1650 #endif
1651
1652 /* Inside \Q...\E everything is literal except \E */
1653
1654 if (inescq)
1655 {
1656 if (c == '\\' && ptr[1] == 'E')
1657 {
1658 inescq = FALSE;
1659 ptr++;
1660 continue;
1661 }
1662 else goto LONE_SINGLE_CHARACTER;
1663 }
1664
1665 /* Handle POSIX class names. Perl allows a negation extension of the
1666 form [:^name:]. A square bracket that doesn't match the syntax is
1667 treated as a literal. We also recognize the POSIX constructions
1668 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1669 5.6 and 5.8 do. */
1670
1671 if (c == '[' &&
1672 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1673 check_posix_syntax(ptr, &tempptr, cd))
1674 {
1675 BOOL local_negate = FALSE;
1676 int posix_class, i;
1677 register const uschar *cbits = cd->cbits;
1678
1679 if (ptr[1] != ':')
1680 {
1681 *errorptr = ERR31;
1682 goto FAILED;
1683 }
1684
1685 ptr += 2;
1686 if (*ptr == '^')
1687 {
1688 local_negate = TRUE;
1689 ptr++;
1690 }
1691
1692 posix_class = check_posix_name(ptr, tempptr - ptr);
1693 if (posix_class < 0)
1694 {
1695 *errorptr = ERR30;
1696 goto FAILED;
1697 }
1698
1699 /* If matching is caseless, upper and lower are converted to
1700 alpha. This relies on the fact that the class table starts with
1701 alpha, lower, upper as the first 3 entries. */
1702
1703 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1704 posix_class = 0;
1705
1706 /* Or into the map we are building up to 3 of the static class
1707 tables, or their negations. The [:blank:] class sets up the same
1708 chars as the [:space:] class (all white space). We remove the vertical
1709 white space chars afterwards. */
1710
1711 posix_class *= 3;
1712 for (i = 0; i < 3; i++)
1713 {
1714 BOOL isblank = strncmp((char *)ptr, "blank", 5) == 0;
1715 int taboffset = posix_class_maps[posix_class + i];
1716 if (taboffset < 0) break;
1717 if (local_negate)
1718 {
1719 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
1720 if (isblank) class[1] |= 0x3c;
1721 }
1722 else
1723 {
1724 for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
1725 if (isblank) class[1] &= ~0x3c;
1726 }
1727 }
1728
1729 ptr = tempptr + 1;
1730 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
1731 continue; /* End of POSIX syntax handling */
1732 }
1733
1734 /* Backslash may introduce a single character, or it may introduce one
1735 of the specials, which just set a flag. Escaped items are checked for
1736 validity in the pre-compiling pass. The sequence \b is a special case.
1737 Inside a class (and only there) it is treated as backspace. Elsewhere
1738 it marks a word boundary. Other escapes have preset maps ready to
1739 or into the one we are building. We assume they have more than one
1740 character in them, so set class_charcount bigger than one. */
1741
1742 if (c == '\\')
1743 {
1744 c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1745 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
1746
1747 if (-c == ESC_Q) /* Handle start of quoted string */
1748 {
1749 if (ptr[1] == '\\' && ptr[2] == 'E')
1750 {
1751 ptr += 2; /* avoid empty string */
1752 }
1753 else inescq = TRUE;
1754 continue;
1755 }
1756
1757 else if (c < 0)
1758 {
1759 register const uschar *cbits = cd->cbits;
1760 class_charcount = 10; /* Greater than 1 is what matters */
1761 switch (-c)
1762 {
1763 case ESC_d:
1764 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
1765 continue;
1766
1767 case ESC_D:
1768 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
1769 continue;
1770
1771 case ESC_w:
1772 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
1773 continue;
1774
1775 case ESC_W:
1776 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
1777 continue;
1778
1779 case ESC_s:
1780 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
1781 class[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
1782 continue;
1783
1784 case ESC_S:
1785 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
1786 class[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
1787 continue;
1788
1789 /* Unrecognized escapes are faulted if PCRE is running in its
1790 strict mode. By default, for compatibility with Perl, they are
1791 treated as literals. */
1792
1793 default:
1794 if ((options & PCRE_EXTRA) != 0)
1795 {
1796 *errorptr = ERR7;
1797 goto FAILED;
1798 }
1799 c = *ptr; /* The final character */
1800 }
1801 }
1802
1803 /* Fall through if we have a single character (c >= 0). This may be
1804 > 256 in UTF-8 mode. */
1805
1806 } /* End of backslash handling */
1807
1808 /* A single character may be followed by '-' to form a range. However,
1809 Perl does not permit ']' to be the end of the range. A '-' character
1810 here is treated as a literal. */
1811
1812 if (ptr[1] == '-' && ptr[2] != ']')
1813 {
1814 int d;
1815 ptr += 2;
1816
1817 #ifdef SUPPORT_UTF8
1818 if (utf8)
1819 { /* Braces are required because the */
1820 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
1821 }
1822 else
1823 #endif
1824 d = *ptr;
1825
1826 /* The second part of a range can be a single-character escape, but
1827 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1828 in such circumstances. */
1829
1830 if (d == '\\')
1831 {
1832 const uschar *oldptr = ptr;
1833 d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1834
1835 /* \b is backslash; any other special means the '-' was literal */
1836
1837 if (d < 0)
1838 {
1839 if (d == -ESC_b) d = '\b'; else
1840 {
1841 ptr = oldptr - 2;
1842 goto LONE_SINGLE_CHARACTER; /* A few lines below */
1843 }
1844 }
1845 }
1846
1847 /* Check that the two values are in the correct order */
1848
1849 if (d < c)
1850 {
1851 *errorptr = ERR8;
1852 goto FAILED;
1853 }
1854
1855 /* If d is greater than 255, we can't just use the bit map, so set up
1856 for the UTF-8 supporting class type. If we are not caseless, we can
1857 just set up a single range. If we are caseless, the characters < 256
1858 are handled with a bitmap, in order to get the case-insensitive
1859 handling. */
1860
1861 #ifdef SUPPORT_UTF8
1862 if (d > 255)
1863 {
1864 class_utf8 = TRUE;
1865 *class_utf8data++ = XCL_RANGE;
1866 if ((options & PCRE_CASELESS) == 0)
1867 {
1868 class_utf8data += ord2utf8(c, class_utf8data);
1869 class_utf8data += ord2utf8(d, class_utf8data);
1870 continue; /* Go get the next char in the class */
1871 }
1872 class_utf8data += ord2utf8(256, class_utf8data);
1873 class_utf8data += ord2utf8(d, class_utf8data);
1874 d = 255;
1875 /* Fall through */
1876 }
1877 #endif
1878 /* We use the bit map if the range is entirely < 255, or if part of it
1879 is < 255 and matching is caseless. */
1880
1881 for (; c <= d; c++)
1882 {
1883 class[c/8] |= (1 << (c&7));
1884 if ((options & PCRE_CASELESS) != 0)
1885 {
1886 int uc = cd->fcc[c]; /* flip case */
1887 class[uc/8] |= (1 << (uc&7));
1888 }
1889 class_charcount++; /* in case a one-char range */
1890 class_lastchar = c;
1891 }
1892
1893 continue; /* Go get the next char in the class */
1894 }
1895
1896 /* Handle a lone single character - we can get here for a normal
1897 non-escape char, or after \ that introduces a single character. */
1898
1899 LONE_SINGLE_CHARACTER:
1900
1901 /* Handle a multibyte character */
1902
1903 #ifdef SUPPORT_UTF8
1904 if (utf8 && c > 255)
1905 {
1906 class_utf8 = TRUE;
1907 *class_utf8data++ = XCL_SINGLE;
1908 class_utf8data += ord2utf8(c, class_utf8data);
1909 }
1910 else
1911 #endif
1912 /* Handle a single-byte character */
1913 {
1914 class [c/8] |= (1 << (c&7));
1915 if ((options & PCRE_CASELESS) != 0)
1916 {
1917 c = cd->fcc[c]; /* flip case */
1918 class[c/8] |= (1 << (c&7));
1919 }
1920 class_charcount++;
1921 class_lastchar = c;
1922 }
1923 }
1924
1925 /* Loop until ']' reached; the check for end of string happens inside the
1926 loop. This "while" is the end of the "do" above. */
1927
1928 while ((c = *(++ptr)) != ']' || inescq);
1929
1930 /* If class_charcount is 1, we saw precisely one character with a value <
1931 256. In UTF-8 mode, we can optimize if there were no characters >= 256 and
1932 the one character is < 128. In non-UTF-8 mode we can always optimize.
1933
1934 The optimization throws away the bit map. We turn the item into a
1935 1-character OP_CHARS if it's positive, or OP_NOT if it's negative. Note
1936 that OP_NOT does not support multibyte characters. In the positive case, it
1937 can cause firstbyte to be set. Otherwise, there can be no first char if
1938 this item is first, whatever repeat count may follow. In the case of
1939 reqbyte, save the previous value for reinstating. */
1940
1941 #ifdef SUPPORT_UTF8
1942 if (class_charcount == 1 &&
1943 (!utf8 ||
1944 (!class_utf8 && class_lastchar < 128)))
1945 #else
1946 if (class_charcount == 1)
1947 #endif
1948 {
1949 zeroreqbyte = reqbyte;
1950 if (negate_class)
1951 {
1952 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1953 zerofirstbyte = firstbyte;
1954 *code++ = OP_NOT;
1955 }
1956 else
1957 {
1958 if (firstbyte == REQ_UNSET)
1959 {
1960 zerofirstbyte = REQ_NONE;
1961 firstbyte = class_lastchar | req_caseopt;
1962 }
1963 else
1964 {
1965 zerofirstbyte = firstbyte;
1966 reqbyte = class_lastchar | req_caseopt | cd->req_varyopt;
1967 }
1968 *code++ = OP_CHARS;
1969 *code++ = 1;
1970 }
1971 *code++ = class_lastchar;
1972 break; /* End of class handling */
1973 } /* End of 1-byte optimization */
1974
1975 /* Otherwise, if this is the first thing in the branch, there can be no
1976 first char setting, whatever the repeat count. Any reqbyte setting must
1977 remain unchanged after any kind of repeat. */
1978
1979 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1980 zerofirstbyte = firstbyte;
1981 zeroreqbyte = reqbyte;
1982
1983 /* If there are characters with values > 255, we have to compile an
1984 extended class, with its own opcode. If there are no characters < 256,
1985 we can omit the bitmap. */
1986
1987 #ifdef SUPPORT_UTF8
1988 if (class_utf8)
1989 {
1990 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
1991 *code++ = OP_XCLASS;
1992 code += LINK_SIZE;
1993 *code = negate_class? XCL_NOT : 0;
1994
1995 /* If the map is required, install it, and move on to the end of
1996 the extra data */
1997
1998 if (class_charcount > 0)
1999 {
2000 *code++ |= XCL_MAP;
2001 memcpy(code, class, 32);
2002 code = class_utf8data;
2003 }
2004
2005 /* If the map is not required, slide down the extra data. */
2006
2007 else
2008 {
2009 int len = class_utf8data - (code + 33);
2010 memmove(code + 1, code + 33, len);
2011 code += len + 1;
2012 }
2013
2014 /* Now fill in the complete length of the item */
2015
2016 PUT(previous, 1, code - previous);
2017 break; /* End of class handling */
2018 }
2019 #endif
2020
2021 /* If there are no characters > 255, negate the 32-byte map if necessary,
2022 and copy it into the code vector. If this is the first thing in the branch,
2023 there can be no first char setting, whatever the repeat count. Any reqbyte
2024 setting must remain unchanged after any kind of repeat. */
2025
2026 if (negate_class)
2027 {
2028 *code++ = OP_NCLASS;
2029 for (c = 0; c < 32; c++) code[c] = ~class[c];
2030 }
2031 else
2032 {
2033 *code++ = OP_CLASS;
2034 memcpy(code, class, 32);
2035 }
2036 code += 32;
2037 break;
2038
2039 /* Various kinds of repeat */
2040
2041 case '{':
2042 if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
2043 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
2044 if (*errorptr != NULL) goto FAILED;
2045 goto REPEAT;
2046
2047 case '*':
2048 repeat_min = 0;
2049 repeat_max = -1;
2050 goto REPEAT;
2051
2052 case '+':
2053 repeat_min = 1;
2054 repeat_max = -1;
2055 goto REPEAT;
2056
2057 case '?':
2058 repeat_min = 0;
2059 repeat_max = 1;
2060
2061 REPEAT:
2062 if (previous == NULL)
2063 {
2064 *errorptr = ERR9;
2065 goto FAILED;
2066 }
2067
2068 if (repeat_min == 0)
2069 {
2070 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2071 reqbyte = zeroreqbyte; /* Ditto */
2072 }
2073
2074 /* Remember whether this is a variable length repeat */
2075
2076 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2077
2078 op_type = 0; /* Default single-char op codes */
2079 possessive_quantifier = FALSE; /* Default not possessive quantifier */
2080
2081 /* Save start of previous item, in case we have to move it up to make space
2082 for an inserted OP_ONCE for the additional '+' extension. */
2083
2084 tempcode = previous;
2085
2086 /* If the next character is '+', we have a possessive quantifier. This
2087 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2088 If the next character is '?' this is a minimizing repeat, by default,
2089 but if PCRE_UNGREEDY is set, it works the other way round. We change the
2090 repeat type to the non-default. */
2091
2092 if (ptr[1] == '+')
2093 {
2094 repeat_type = 0; /* Force greedy */
2095 possessive_quantifier = TRUE;
2096 ptr++;
2097 }
2098 else if (ptr[1] == '?')
2099 {
2100 repeat_type = greedy_non_default;
2101 ptr++;
2102 }
2103 else repeat_type = greedy_default;
2104
2105 /* If previous was a recursion, we need to wrap it inside brackets so that
2106 it can be replicated if necessary. */
2107
2108 if (*previous == OP_RECURSE)
2109 {
2110 memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2111 code += 1 + LINK_SIZE;
2112 *previous = OP_BRA;
2113 PUT(previous, 1, code - previous);
2114 *code = OP_KET;
2115 PUT(code, 1, code - previous);
2116 code += 1 + LINK_SIZE;
2117 }
2118
2119 /* If previous was a string of characters, chop off the last one and use it
2120 as the subject of the repeat. If there was only one character, we can
2121 abolish the previous item altogether. If a one-char item has a minumum of
2122 more than one, ensure that it is set in reqbyte - it might not be if a
2123 sequence such as x{3} is the first thing in a branch because the x will
2124 have gone into firstbyte instead. */
2125
2126 if (*previous == OP_CHARS)
2127 {
2128 /* Deal with UTF-8 characters that take up more than one byte. It's
2129 easier to write this out separately than try to macrify it. Use c to
2130 hold the length of the character in bytes, plus 0x80 to flag that it's a
2131 length rather than a small character. */
2132
2133 #ifdef SUPPORT_UTF8
2134 if (utf8 && (code[-1] & 0x80) != 0)
2135 {
2136 uschar *lastchar = code - 1;
2137 while((*lastchar & 0xc0) == 0x80) lastchar--;
2138 c = code - lastchar; /* Length of UTF-8 character */
2139 memcpy(utf8_char, lastchar, c); /* Save the char */
2140 if (lastchar == previous + 2) /* There was only one character */
2141 {
2142 code = previous; /* Abolish the previous item */
2143 }
2144 else
2145 {
2146 previous[1] -= c; /* Adjust length of previous */
2147 code = lastchar; /* Lost char off the end */
2148 tempcode = code; /* Adjust position to be moved for '+' */
2149 }
2150 c |= 0x80; /* Flag c as a length */
2151 }
2152 else
2153 #endif
2154
2155 /* Handle the case of a single byte - either with no UTF8 support, or
2156 with UTF-8 disabled, or for a UTF-8 character < 128. */
2157
2158 {
2159 c = *(--code);
2160 if (code == previous + 2) /* There was only one character */
2161 {
2162 code = previous; /* Abolish the previous item */
2163 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2164 }
2165 else
2166 {
2167 previous[1]--; /* adjust length */
2168 tempcode = code; /* Adjust position to be moved for '+' */
2169 }
2170 }
2171
2172 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
2173 }
2174
2175 /* If previous was a single negated character ([^a] or similar), we use
2176 one of the special opcodes, replacing it. The code is shared with single-
2177 character repeats by setting opt_type to add a suitable offset into
2178 repeat_type. OP_NOT is currently used only for single-byte chars. */
2179
2180 else if (*previous == OP_NOT)
2181 {
2182 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
2183 c = previous[1];
2184 code = previous;
2185 goto OUTPUT_SINGLE_REPEAT;
2186 }
2187
2188 /* If previous was a character type match (\d or similar), abolish it and
2189 create a suitable repeat item. The code is shared with single-character
2190 repeats by setting op_type to add a suitable offset into repeat_type. */
2191
2192 else if (*previous < OP_EODN)
2193 {
2194 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
2195 c = *previous;
2196 code = previous;
2197
2198 OUTPUT_SINGLE_REPEAT:
2199
2200 /* If the maximum is zero then the minimum must also be zero; Perl allows
2201 this case, so we do too - by simply omitting the item altogether. */
2202
2203 if (repeat_max == 0) goto END_REPEAT;
2204
2205 /* Combine the op_type with the repeat_type */
2206
2207 repeat_type += op_type;
2208
2209 /* A minimum of zero is handled either as the special case * or ?, or as
2210 an UPTO, with the maximum given. */
2211
2212 if (repeat_min == 0)
2213 {
2214 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
2215 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
2216 else
2217 {
2218 *code++ = OP_UPTO + repeat_type;
2219 PUT2INC(code, 0, repeat_max);
2220 }
2221 }
2222
2223 /* The case {1,} is handled as the special case + */
2224
2225 else if (repeat_min == 1 && repeat_max == -1)
2226 *code++ = OP_PLUS + repeat_type;
2227
2228 /* The case {n,n} is just an EXACT, while the general case {n,m} is
2229 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
2230
2231 else
2232 {
2233 if (repeat_min != 1)
2234 {
2235 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
2236 PUT2INC(code, 0, repeat_min);
2237 }
2238
2239 /* If the mininum is 1 and the previous item was a character string,
2240 we either have to put back the item that got cancelled if the string
2241 length was 1, or add the character back onto the end of a longer
2242 string. For a character type nothing need be done; it will just get
2243 put back naturally. Note that the final character is always going to
2244 get added below, so we leave code ready for its insertion. */
2245
2246 else if (*previous == OP_CHARS)
2247 {
2248 if (code == previous) code += 2; else
2249
2250 /* In UTF-8 mode, a multibyte char has its length in c, with the 0x80
2251 bit set as a flag. The length will always be between 2 and 6. */
2252
2253 #ifdef SUPPORT_UTF8
2254 if (utf8 && c >= 128) previous[1] += c & 7; else
2255 #endif
2256 previous[1]++;
2257 }
2258
2259 /* For a single negated character we also have to put back the
2260 item that got cancelled. At present this applies only to single byte
2261 characters in any mode. */
2262
2263 else if (*previous == OP_NOT) code++;
2264
2265 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
2266 we have to insert the character for the previous code. In UTF-8 mode,
2267 long characters have their length in c, with the 0x80 bit as a flag. */
2268
2269 if (repeat_max < 0)
2270 {
2271 #ifdef SUPPORT_UTF8
2272 if (utf8 && c >= 128)
2273 {
2274 memcpy(code, utf8_char, c & 7);
2275 code += c & 7;
2276 }
2277 else
2278 #endif
2279 *code++ = c;
2280 *code++ = OP_STAR + repeat_type;
2281 }
2282
2283 /* Else insert an UPTO if the max is greater than the min, again
2284 preceded by the character, for the previously inserted code. */
2285
2286 else if (repeat_max != repeat_min)
2287 {
2288 #ifdef SUPPORT_UTF8
2289 if (utf8 && c >= 128)
2290 {
2291 memcpy(code, utf8_char, c & 7);
2292 code += c & 7;
2293 }
2294 else
2295 #endif
2296 *code++ = c;
2297 repeat_max -= repeat_min;
2298 *code++ = OP_UPTO + repeat_type;
2299 PUT2INC(code, 0, repeat_max);
2300 }
2301 }
2302
2303 /* The character or character type itself comes last in all cases. */
2304
2305 #ifdef SUPPORT_UTF8
2306 if (utf8 && c >= 128)
2307 {
2308 memcpy(code, utf8_char, c & 7);
2309 code += c & 7;
2310 }
2311 else
2312 #endif
2313
2314 *code++ = c;
2315 }
2316
2317 /* If previous was a character class or a back reference, we put the repeat
2318 stuff after it, but just skip the item if the repeat was {0,0}. */
2319
2320 else if (*previous == OP_CLASS ||
2321 *previous == OP_NCLASS ||
2322 #ifdef SUPPORT_UTF8
2323 *previous == OP_XCLASS ||
2324 #endif
2325 *previous == OP_REF)
2326 {
2327 if (repeat_max == 0)
2328 {
2329 code = previous;
2330 goto END_REPEAT;
2331 }
2332 if (repeat_min == 0 && repeat_max == -1)
2333 *code++ = OP_CRSTAR + repeat_type;
2334 else if (repeat_min == 1 && repeat_max == -1)
2335 *code++ = OP_CRPLUS + repeat_type;
2336 else if (repeat_min == 0 && repeat_max == 1)
2337 *code++ = OP_CRQUERY + repeat_type;
2338 else
2339 {
2340 *code++ = OP_CRRANGE + repeat_type;
2341 PUT2INC(code, 0, repeat_min);
2342 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
2343 PUT2INC(code, 0, repeat_max);
2344 }
2345 }
2346
2347 /* If previous was a bracket group, we may have to replicate it in certain
2348 cases. */
2349
2350 else if (*previous >= OP_BRA || *previous == OP_ONCE ||
2351 *previous == OP_COND)
2352 {
2353 register int i;
2354 int ketoffset = 0;
2355 int len = code - previous;
2356 uschar *bralink = NULL;
2357
2358 /* If the maximum repeat count is unlimited, find the end of the bracket
2359 by scanning through from the start, and compute the offset back to it
2360 from the current code pointer. There may be an OP_OPT setting following
2361 the final KET, so we can't find the end just by going back from the code
2362 pointer. */
2363
2364 if (repeat_max == -1)
2365 {
2366 register uschar *ket = previous;
2367 do ket += GET(ket, 1); while (*ket != OP_KET);
2368 ketoffset = code - ket;
2369 }
2370
2371 /* The case of a zero minimum is special because of the need to stick
2372 OP_BRAZERO in front of it, and because the group appears once in the
2373 data, whereas in other cases it appears the minimum number of times. For
2374 this reason, it is simplest to treat this case separately, as otherwise
2375 the code gets far too messy. There are several special subcases when the
2376 minimum is zero. */
2377
2378 if (repeat_min == 0)
2379 {
2380 /* If the maximum is also zero, we just omit the group from the output
2381 altogether. */
2382
2383 if (repeat_max == 0)
2384 {
2385 code = previous;
2386 goto END_REPEAT;
2387 }
2388
2389 /* If the maximum is 1 or unlimited, we just have to stick in the
2390 BRAZERO and do no more at this point. */
2391
2392 if (repeat_max <= 1)
2393 {
2394 memmove(previous+1, previous, len);
2395 code++;
2396 *previous++ = OP_BRAZERO + repeat_type;
2397 }
2398
2399 /* If the maximum is greater than 1 and limited, we have to replicate
2400 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
2401 The first one has to be handled carefully because it's the original
2402 copy, which has to be moved up. The remainder can be handled by code
2403 that is common with the non-zero minimum case below. We just have to
2404 adjust the value or repeat_max, since one less copy is required. */
2405
2406 else
2407 {
2408 int offset;
2409 memmove(previous + 2 + LINK_SIZE, previous, len);
2410 code += 2 + LINK_SIZE;
2411 *previous++ = OP_BRAZERO + repeat_type;
2412 *previous++ = OP_BRA;
2413
2414 /* We chain together the bracket offset fields that have to be
2415 filled in later when the ends of the brackets are reached. */
2416
2417 offset = (bralink == NULL)? 0 : previous - bralink;
2418 bralink = previous;
2419 PUTINC(previous, 0, offset);
2420 }
2421
2422 repeat_max--;
2423 }
2424
2425 /* If the minimum is greater than zero, replicate the group as many
2426 times as necessary, and adjust the maximum to the number of subsequent
2427 copies that we need. If we set a first char from the group, and didn't
2428 set a required char, copy the latter from the former. */
2429
2430 else
2431 {
2432 if (repeat_min > 1)
2433 {
2434 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
2435 for (i = 1; i < repeat_min; i++)
2436 {
2437 memcpy(code, previous, len);
2438 code += len;
2439 }
2440 }
2441 if (repeat_max > 0) repeat_max -= repeat_min;
2442 }
2443
2444 /* This code is common to both the zero and non-zero minimum cases. If
2445 the maximum is limited, it replicates the group in a nested fashion,
2446 remembering the bracket starts on a stack. In the case of a zero minimum,
2447 the first one was set up above. In all cases the repeat_max now specifies
2448 the number of additional copies needed. */
2449
2450 if (repeat_max >= 0)
2451 {
2452 for (i = repeat_max - 1; i >= 0; i--)
2453 {
2454 *code++ = OP_BRAZERO + repeat_type;
2455
2456 /* All but the final copy start a new nesting, maintaining the
2457 chain of brackets outstanding. */
2458
2459 if (i != 0)
2460 {
2461 int offset;
2462 *code++ = OP_BRA;
2463 offset = (bralink == NULL)? 0 : code - bralink;
2464 bralink = code;
2465 PUTINC(code, 0, offset);
2466 }
2467
2468 memcpy(code, previous, len);
2469 code += len;
2470 }
2471
2472 /* Now chain through the pending brackets, and fill in their length
2473 fields (which are holding the chain links pro tem). */
2474
2475 while (bralink != NULL)
2476 {
2477 int oldlinkoffset;
2478 int offset = code - bralink + 1;
2479 uschar *bra = code - offset;
2480 oldlinkoffset = GET(bra, 1);
2481 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
2482 *code++ = OP_KET;
2483 PUTINC(code, 0, offset);
2484 PUT(bra, 1, offset);
2485 }
2486 }
2487
2488 /* If the maximum is unlimited, set a repeater in the final copy. We
2489 can't just offset backwards from the current code point, because we
2490 don't know if there's been an options resetting after the ket. The
2491 correct offset was computed above. */
2492
2493 else code[-ketoffset] = OP_KETRMAX + repeat_type;
2494 }
2495
2496 /* Else there's some kind of shambles */
2497
2498 else
2499 {
2500 *errorptr = ERR11;
2501 goto FAILED;
2502 }
2503
2504 /* If the character following a repeat is '+', we wrap the entire repeated
2505 item inside OP_ONCE brackets. This is just syntactic sugar, taken from
2506 Sun's Java package. The repeated item starts at tempcode, not at previous,
2507 which might be the first part of a string whose (former) last char we
2508 repeated. However, we don't support '+' after a greediness '?'. */
2509
2510 if (possessive_quantifier)
2511 {
2512 int len = code - tempcode;
2513 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
2514 code += 1 + LINK_SIZE;
2515 len += 1 + LINK_SIZE;
2516 tempcode[0] = OP_ONCE;
2517 *code++ = OP_KET;
2518 PUTINC(code, 0, len);
2519 PUT(tempcode, 1, len);
2520 }
2521
2522 /* In all case we no longer have a previous item. We also set the
2523 "follows varying string" flag for subsequently encountered reqbytes if
2524 it isn't already set and we have just passed a varying length item. */
2525
2526 END_REPEAT:
2527 previous = NULL;
2528 cd->req_varyopt |= reqvary;
2529 break;
2530
2531
2532 /* Start of nested bracket sub-expression, or comment or lookahead or
2533 lookbehind or option setting or condition. First deal with special things
2534 that can come after a bracket; all are introduced by ?, and the appearance
2535 of any of them means that this is not a referencing group. They were
2536 checked for validity in the first pass over the string, so we don't have to
2537 check for syntax errors here. */
2538
2539 case '(':
2540 newoptions = options;
2541 skipbytes = 0;
2542
2543 if (*(++ptr) == '?')
2544 {
2545 int set, unset;
2546 int *optset;
2547
2548 switch (*(++ptr))
2549 {
2550 case '#': /* Comment; skip to ket */
2551 ptr++;
2552 while (*ptr != ')') ptr++;
2553 continue;
2554
2555 case ':': /* Non-extracting bracket */
2556 bravalue = OP_BRA;
2557 ptr++;
2558 break;
2559
2560 case '(':
2561 bravalue = OP_COND; /* Conditional group */
2562
2563 /* Condition to test for recursion */
2564
2565 if (ptr[1] == 'R')
2566 {
2567 code[1+LINK_SIZE] = OP_CREF;
2568 PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
2569 skipbytes = 3;
2570 ptr += 3;
2571 }
2572
2573 /* Condition to test for a numbered subpattern match */
2574
2575 else if ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
2576 {
2577 int condref; /* Don't amalgamate; some compilers */
2578 condref = *(++ptr) - '0'; /* grumble at autoincrement in declaration */
2579 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
2580 if (condref == 0)
2581 {
2582 *errorptr = ERR35;
2583 goto FAILED;
2584 }
2585 ptr++;
2586 code[1+LINK_SIZE] = OP_CREF;
2587 PUT2(code, 2+LINK_SIZE, condref);
2588 skipbytes = 3;
2589 }
2590 /* For conditions that are assertions, we just fall through, having
2591 set bravalue above. */
2592 break;
2593
2594 case '=': /* Positive lookahead */
2595 bravalue = OP_ASSERT;
2596 ptr++;
2597 break;
2598
2599 case '!': /* Negative lookahead */
2600 bravalue = OP_ASSERT_NOT;
2601 ptr++;
2602 break;
2603
2604 case '<': /* Lookbehinds */
2605 switch (*(++ptr))
2606 {
2607 case '=': /* Positive lookbehind */
2608 bravalue = OP_ASSERTBACK;
2609 ptr++;
2610 break;
2611
2612 case '!': /* Negative lookbehind */
2613 bravalue = OP_ASSERTBACK_NOT;
2614 ptr++;
2615 break;
2616 }
2617 break;
2618
2619 case '>': /* One-time brackets */
2620 bravalue = OP_ONCE;
2621 ptr++;
2622 break;
2623
2624 case 'C': /* Callout - may be followed by digits */
2625 *code++ = OP_CALLOUT;
2626 {
2627 int n = 0;
2628 while ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
2629 n = n * 10 + *ptr - '0';
2630 if (n > 255)
2631 {
2632 *errorptr = ERR38;
2633 goto FAILED;
2634 }
2635 *code++ = n;
2636 }
2637 previous = NULL;
2638 continue;
2639
2640 case 'P': /* Named subpattern handling */
2641 if (*(++ptr) == '<') /* Definition */
2642 {
2643 int i, namelen;
2644 uschar *slot = cd->name_table;
2645 const uschar *name; /* Don't amalgamate; some compilers */
2646 name = ++ptr; /* grumble at autoincrement in declaration */
2647
2648 while (*ptr++ != '>');
2649 namelen = ptr - name - 1;
2650
2651 for (i = 0; i < cd->names_found; i++)
2652 {
2653 int crc = memcmp(name, slot+2, namelen);
2654 if (crc == 0)
2655 {
2656 if (slot[2+namelen] == 0)
2657 {
2658 *errorptr = ERR43;
2659 goto FAILED;
2660 }
2661 crc = -1; /* Current name is substring */
2662 }
2663 if (crc < 0)
2664 {
2665 memmove(slot + cd->name_entry_size, slot,
2666 (cd->names_found - i) * cd->name_entry_size);
2667 break;
2668 }
2669 slot += cd->name_entry_size;
2670 }
2671
2672 PUT2(slot, 0, *brackets + 1);
2673 memcpy(slot + 2, name, namelen);
2674 slot[2+namelen] = 0;
2675 cd->names_found++;
2676 goto NUMBERED_GROUP;
2677 }
2678
2679 if (*ptr == '=' || *ptr == '>') /* Reference or recursion */
2680 {
2681 int i, namelen;
2682 int type = *ptr++;
2683 const uschar *name = ptr;
2684 uschar *slot = cd->name_table;
2685
2686 while (*ptr != ')') ptr++;
2687 namelen = ptr - name;
2688
2689 for (i = 0; i < cd->names_found; i++)
2690 {
2691 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
2692 slot += cd->name_entry_size;
2693 }
2694 if (i >= cd->names_found)
2695 {
2696 *errorptr = ERR15;
2697 goto FAILED;
2698 }
2699
2700 recno = GET2(slot, 0);
2701
2702 if (type == '>') goto HANDLE_RECURSION; /* A few lines below */
2703
2704 /* Back reference */
2705
2706 previous = code;
2707 *code++ = OP_REF;
2708 PUT2INC(code, 0, recno);
2709 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
2710 if (recno > cd->top_backref) cd->top_backref = recno;
2711 continue;
2712 }
2713
2714 /* Should never happen */
2715 break;
2716
2717 case 'R': /* Pattern recursion */
2718 ptr++; /* Same as (?0) */
2719 /* Fall through */
2720
2721 /* Recursion or "subroutine" call */
2722
2723 case '0': case '1': case '2': case '3': case '4':
2724 case '5': case '6': case '7': case '8': case '9':
2725 {
2726 const uschar *called;
2727 recno = 0;
2728
2729 while ((cd->ctypes[*ptr] & ctype_digit) != 0)
2730 recno = recno * 10 + *ptr++ - '0';
2731
2732 /* Come here from code above that handles a named recursion */
2733
2734 HANDLE_RECURSION:
2735
2736 previous = code;
2737
2738 /* Find the bracket that is being referenced. Temporarily end the
2739 regex in case it doesn't exist. */
2740
2741 *code = OP_END;
2742 called = (recno == 0)?
2743 cd->start_code : find_bracket(cd->start_code, utf8, recno);
2744
2745 if (called == NULL)
2746 {
2747 *errorptr = ERR15;
2748 goto FAILED;
2749 }
2750
2751 /* If the subpattern is still open, this is a recursive call. We
2752 check to see if this is a left recursion that could loop for ever,
2753 and diagnose that case. */
2754
2755 if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
2756 {
2757 *errorptr = ERR40;
2758 goto FAILED;
2759 }
2760
2761 /* Insert the recursion/subroutine item */
2762
2763 *code = OP_RECURSE;
2764 PUT(code, 1, called - cd->start_code);
2765 code += 1 + LINK_SIZE;
2766 }
2767 continue;
2768
2769 /* Character after (? not specially recognized */
2770
2771 default: /* Option setting */
2772 set = unset = 0;
2773 optset = &set;
2774
2775 while (*ptr != ')' && *ptr != ':')
2776 {
2777 switch (*ptr++)
2778 {
2779 case '-': optset = &unset; break;
2780
2781 case 'i': *optset |= PCRE_CASELESS; break;
2782 case 'm': *optset |= PCRE_MULTILINE; break;
2783 case 's': *optset |= PCRE_DOTALL; break;
2784 case 'x': *optset |= PCRE_EXTENDED; break;
2785 case 'U': *optset |= PCRE_UNGREEDY; break;
2786 case 'X': *optset |= PCRE_EXTRA; break;
2787 }
2788 }
2789
2790 /* Set up the changed option bits, but don't change anything yet. */
2791
2792 newoptions = (options | set) & (~unset);
2793
2794 /* If the options ended with ')' this is not the start of a nested
2795 group with option changes, so the options change at this level. Compile
2796 code to change the ims options if this setting actually changes any of
2797 them. We also pass the new setting back so that it can be put at the
2798 start of any following branches, and when this group ends (if we are in
2799 a group), a resetting item can be compiled.
2800
2801 Note that if this item is right at the start of the pattern, the
2802 options will have been abstracted and made global, so there will be no
2803 change to compile. */
2804
2805 if (*ptr == ')')
2806 {
2807 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
2808 {
2809 *code++ = OP_OPT;
2810 *code++ = newoptions & PCRE_IMS;
2811 }
2812
2813 /* Change options at this level, and pass them back for use
2814 in subsequent branches. Reset the greedy defaults and the case
2815 value for firstbyte and reqbyte. */
2816
2817 *optionsptr = options = newoptions;
2818 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
2819 greedy_non_default = greedy_default ^ 1;
2820 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2821
2822 previous = NULL; /* This item can't be repeated */
2823 continue; /* It is complete */
2824 }
2825
2826 /* If the options ended with ':' we are heading into a nested group
2827 with possible change of options. Such groups are non-capturing and are
2828 not assertions of any kind. All we need to do is skip over the ':';
2829 the newoptions value is handled below. */
2830
2831 bravalue = OP_BRA;
2832 ptr++;
2833 }
2834 }
2835
2836 /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
2837 non-capturing and behave like (?:...) brackets */
2838
2839 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
2840 {
2841 bravalue = OP_BRA;
2842 }
2843
2844 /* Else we have a referencing group; adjust the opcode. If the bracket
2845 number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
2846 arrange for the true number to follow later, in an OP_BRANUMBER item. */
2847
2848 else
2849 {
2850 NUMBERED_GROUP:
2851 if (++(*brackets) > EXTRACT_BASIC_MAX)
2852 {
2853 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
2854 code[1+LINK_SIZE] = OP_BRANUMBER;
2855 PUT2(code, 2+LINK_SIZE, *brackets);
2856 skipbytes = 3;
2857 }
2858 else bravalue = OP_BRA + *brackets;
2859 }
2860
2861 /* Process nested bracketed re. Assertions may not be repeated, but other
2862 kinds can be. We copy code into a non-register variable in order to be able
2863 to pass its address because some compilers complain otherwise. Pass in a
2864 new setting for the ims options if they have changed. */
2865
2866 previous = (bravalue >= OP_ONCE)? code : NULL;
2867 *code = bravalue;
2868 tempcode = code;
2869 tempreqvary = cd->req_varyopt; /* Save value before bracket */
2870
2871 if (!compile_regex(
2872 newoptions, /* The complete new option state */
2873 options & PCRE_IMS, /* The previous ims option state */
2874 brackets, /* Extracting bracket count */
2875 &tempcode, /* Where to put code (updated) */
2876 &ptr, /* Input pointer (updated) */
2877 errorptr, /* Where to put an error message */
2878 (bravalue == OP_ASSERTBACK ||
2879 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
2880 skipbytes, /* Skip over OP_COND/OP_BRANUMBER */
2881 &subfirstbyte, /* For possible first char */
2882 &subreqbyte, /* For possible last char */
2883 bcptr, /* Current branch chain */
2884 cd)) /* Tables block */
2885 goto FAILED;
2886
2887 /* At the end of compiling, code is still pointing to the start of the
2888 group, while tempcode has been updated to point past the end of the group
2889 and any option resetting that may follow it. The pattern pointer (ptr)
2890 is on the bracket. */
2891
2892 /* If this is a conditional bracket, check that there are no more than
2893 two branches in the group. */
2894
2895 else if (bravalue == OP_COND)
2896 {
2897 uschar *tc = code;
2898 condcount = 0;
2899
2900 do {
2901 condcount++;
2902 tc += GET(tc,1);
2903 }
2904 while (*tc != OP_KET);
2905
2906 if (condcount > 2)
2907 {
2908 *errorptr = ERR27;
2909 goto FAILED;
2910 }
2911
2912 /* If there is just one branch, we must not make use of its firstbyte or
2913 reqbyte, because this is equivalent to an empty second branch. */
2914
2915 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
2916 }
2917
2918 /* Handle updating of the required and first characters. Update for normal
2919 brackets of all kinds, and conditions with two branches (see code above).
2920 If the bracket is followed by a quantifier with zero repeat, we have to
2921 back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
2922 main loop so that they can be accessed for the back off. */
2923
2924 zeroreqbyte = reqbyte;
2925 zerofirstbyte = firstbyte;
2926 groupsetfirstbyte = FALSE;
2927
2928 if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
2929 {
2930 /* If we have not yet set a firstbyte in this branch, take it from the
2931 subpattern, remembering that it was set here so that a repeat of more
2932 than one can replicate it as reqbyte if necessary. If the subpattern has
2933 no firstbyte, set "none" for the whole branch. In both cases, a zero
2934 repeat forces firstbyte to "none". */
2935
2936 if (firstbyte == REQ_UNSET)
2937 {
2938 if (subfirstbyte >= 0)
2939 {
2940 firstbyte = subfirstbyte;
2941 groupsetfirstbyte = TRUE;
2942 }
2943 else firstbyte = REQ_NONE;
2944 zerofirstbyte = REQ_NONE;
2945 }
2946
2947 /* If firstbyte was previously set, convert the subpattern's firstbyte
2948 into reqbyte if there wasn't one, using the vary flag that was in
2949 existence beforehand. */
2950
2951 else if (subfirstbyte >= 0 && subreqbyte < 0)
2952 subreqbyte = subfirstbyte | tempreqvary;
2953
2954 /* If the subpattern set a required byte (or set a first byte that isn't
2955 really the first byte - see above), set it. */
2956
2957 if (subreqbyte >= 0) reqbyte = subreqbyte;
2958 }
2959
2960 /* For a forward assertion, we take the reqbyte, if set. This can be
2961 helpful if the pattern that follows the assertion doesn't set a different
2962 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
2963 for an assertion, however because it leads to incorrect effect for patterns
2964 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
2965 of a firstbyte. This is overcome by a scan at the end if there's no
2966 firstbyte, looking for an asserted first char. */
2967
2968 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
2969
2970 /* Now update the main code pointer to the end of the group. */
2971
2972 code = tempcode;
2973
2974 /* Error if hit end of pattern */
2975
2976 if (*ptr != ')')
2977 {
2978 *errorptr = ERR14;
2979 goto FAILED;
2980 }
2981 break;
2982
2983 /* Check \ for being a real metacharacter; if not, fall through and handle
2984 it as a data character at the start of a string. Escape items are checked
2985 for validity in the pre-compiling pass. */
2986
2987 case '\\':
2988 tempptr = ptr;
2989 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
2990
2991 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
2992 are arranged to be the negation of the corresponding OP_values. For the
2993 back references, the values are ESC_REF plus the reference number. Only
2994 back references and those types that consume a character may be repeated.
2995 We can test for values between ESC_b and ESC_Z for the latter; this may
2996 have to change if any new ones are ever created. */
2997
2998 if (c < 0)
2999 {
3000 if (-c == ESC_Q) /* Handle start of quoted string */
3001 {
3002 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
3003 else inescq = TRUE;
3004 continue;
3005 }
3006
3007 /* For metasequences that actually match a character, we disable the
3008 setting of a first character if it hasn't already been set. */
3009
3010 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
3011 firstbyte = REQ_NONE;
3012
3013 /* Set values to reset to if this is followed by a zero repeat. */
3014
3015 zerofirstbyte = firstbyte;
3016 zeroreqbyte = reqbyte;
3017
3018 /* Back references are handled specially */
3019
3020 if (-c >= ESC_REF)
3021 {
3022 int number = -c - ESC_REF;
3023 previous = code;
3024 *code++ = OP_REF;
3025 PUT2INC(code, 0, number);
3026 }
3027 else
3028 {
3029 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
3030 *code++ = -c;
3031 }
3032 continue;
3033 }
3034
3035 /* Data character: reset and fall through */
3036
3037 ptr = tempptr;
3038 c = '\\';
3039
3040 /* Handle a run of data characters until a metacharacter is encountered.
3041 The first character is guaranteed not to be whitespace or # when the
3042 extended flag is set. */
3043
3044 NORMAL_CHAR:
3045 default:
3046 previous = code;
3047 *code = OP_CHARS;
3048 code += 2;
3049 length = 0;
3050
3051 do
3052 {
3053 /* If in \Q...\E, check for the end; if not, we always have a literal */
3054
3055 if (inescq)
3056 {
3057 if (c == '\\' && ptr[1] == 'E')
3058 {
3059 inescq = FALSE;
3060 ptr++;
3061 }
3062 else
3063 {
3064 *code++ = c;
3065 length++;
3066 }
3067 continue;
3068 }
3069
3070 /* Skip white space and comments for /x patterns */
3071
3072 if ((options & PCRE_EXTENDED) != 0)
3073 {
3074 if ((cd->ctypes[c] & ctype_space) != 0) continue;
3075 if (c == '#')
3076 {
3077 /* The space before the ; is to avoid a warning on a silly compiler
3078 on the Macintosh. */
3079 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
3080 if (c == 0) break;
3081 continue;
3082 }
3083 }
3084
3085 /* Backslash may introduce a data char or a metacharacter. Escaped items
3086 are checked for validity in the pre-compiling pass. Stop the string
3087 before a metaitem. */
3088
3089 if (c == '\\')
3090 {
3091 tempptr = ptr;
3092 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
3093 if (c < 0) { ptr = tempptr; break; }
3094
3095 /* If a character is > 127 in UTF-8 mode, we have to turn it into
3096 two or more characters in the UTF-8 encoding. */
3097
3098 #ifdef SUPPORT_UTF8
3099 if (utf8 && c > 127)
3100 {
3101 uschar buffer[8];
3102 int len = ord2utf8(c, buffer);
3103 for (c = 0; c < len; c++) *code++ = buffer[c];
3104 length += len;
3105 continue;
3106 }
3107 #endif
3108 }
3109
3110 /* Ordinary character or single-char escape */
3111
3112 *code++ = c;
3113 length++;
3114 }
3115
3116 /* This "while" is the end of the "do" above. */
3117
3118 while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
3119
3120 /* Update the first and last requirements. These are always bytes, even in
3121 UTF-8 mode. However, there is a special case to be considered when there
3122 are only one or two characters. Because this gets messy in UTF-8 mode, the
3123 code is kept separate. When we get here "length" contains the number of
3124 bytes. */
3125
3126 #ifdef SUPPORT_UTF8
3127 if (utf8 && length > 1)
3128 {
3129 uschar *t = previous + 3; /* After this code, t */
3130 while (t < code && (*t & 0xc0) == 0x80) t++; /* follows the 1st char */
3131
3132 /* Handle the case when there is only one multibyte character. It must
3133 have at least two bytes because of the "length > 1" test above. */
3134
3135 if (t == code)
3136 {
3137 /* If no previous first byte, set it from this character, but revert to
3138 none on a zero repeat. */
3139
3140 if (firstbyte == REQ_UNSET)
3141 {
3142 zerofirstbyte = REQ_NONE;
3143 firstbyte = previous[2];
3144 }
3145
3146 /* Otherwise, leave the first byte value alone, and don't change it on
3147 a zero repeat */
3148
3149 else zerofirstbyte = firstbyte;
3150
3151 /* In both cases, a zero repeat resets the previous required byte */
3152
3153 zeroreqbyte = reqbyte;
3154 }
3155
3156 /* Handle the case when there is more than one character. These may be
3157 single-byte or multibyte characters */
3158
3159 else
3160 {
3161 t = code - 1; /* After this code, t is at the */
3162 while ((*t & 0xc0) == 0x80) t--; /* start of the last character */
3163
3164 /* If no previous first byte, set it from the first character, and
3165 retain it on a zero repeat (of the last character). The required byte
3166 is reset on a zero repeat, either to the byte before the last
3167 character, unless this is the first byte of the string. In that case,
3168 it reverts to its previous value. */
3169
3170 if (firstbyte == REQ_UNSET)
3171 {
3172 zerofirstbyte = firstbyte = previous[2] | req_caseopt;
3173 zeroreqbyte = (t - 1 == previous + 2)?
3174 reqbyte : t[-1] | req_caseopt | cd->req_varyopt;
3175 }
3176
3177 /* If there was a previous first byte, leave it alone, and don't change
3178 it on a zero repeat. The required byte is reset on a zero repeat to the
3179 byte before the last character. */
3180
3181 else
3182 {
3183 zerofirstbyte = firstbyte;
3184 zeroreqbyte = t[-1] | req_caseopt | cd->req_varyopt;
3185 }
3186 }
3187
3188 /* In all cases (we know length > 1), the new required byte is the last
3189 byte of the string. */
3190
3191 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3192 }
3193
3194 else /* End of UTF-8 coding */
3195 #endif
3196
3197 /* This is the code for non-UTF-8 operation, either without UTF-8 support,
3198 or when UTF-8 is not enabled. */
3199
3200 {
3201 /* firstbyte was not previously set; take it from this string */
3202
3203 if (firstbyte == REQ_UNSET)
3204 {
3205 if (length == 1)
3206 {
3207 zerofirstbyte = REQ_NONE;
3208 firstbyte = previous[2] | req_caseopt;
3209 zeroreqbyte = reqbyte;
3210 }
3211 else
3212 {
3213 zerofirstbyte = firstbyte = previous[2] | req_caseopt;
3214 zeroreqbyte = (length > 2)?
3215 (code[-2] | req_caseopt | cd->req_varyopt) : reqbyte;
3216 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3217 }
3218 }
3219
3220 /* firstbyte was previously set */
3221
3222 else
3223 {
3224 zerofirstbyte = firstbyte;
3225 zeroreqbyte = (length == 1)? reqbyte :
3226 code[-2] | req_caseopt | cd->req_varyopt;
3227 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3228 }
3229 }
3230
3231 /* Set the length in the data vector, and advance to the next state. */
3232
3233 previous[1] = length;
3234 if (length < MAXLIT) ptr--;
3235 break;
3236 }
3237 } /* end of big loop */
3238
3239 /* Control never reaches here by falling through, only by a goto for all the
3240 error states. Pass back the position in the pattern so that it can be displayed
3241 to the user for diagnosing the error. */
3242
3243 FAILED:
3244 *ptrptr = ptr;
3245 return FALSE;
3246 }
3247
3248
3249
3250
3251 /*************************************************
3252 * Compile sequence of alternatives *
3253 *************************************************/
3254
3255 /* On entry, ptr is pointing past the bracket character, but on return
3256 it points to the closing bracket, or vertical bar, or end of string.
3257 The code variable is pointing at the byte into which the BRA operator has been
3258 stored. If the ims options are changed at the start (for a (?ims: group) or
3259 during any branch, we need to insert an OP_OPT item at the start of every
3260 following branch to ensure they get set correctly at run time, and also pass
3261 the new options into every subsequent branch compile.
3262
3263 Argument:
3264 options option bits, including any changes for this subpattern
3265 oldims previous settings of ims option bits
3266 brackets -> int containing the number of extracting brackets used
3267 codeptr -> the address of the current code pointer
3268 ptrptr -> the address of the current pattern pointer
3269 errorptr -> pointer to error message
3270 lookbehind TRUE if this is a lookbehind assertion
3271 skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER)
3272 firstbyteptr place to put the first required character, or a negative number
3273 reqbyteptr place to put the last required character, or a negative number
3274 bcptr pointer to the chain of currently open branches
3275 cd points to the data block with tables pointers etc.
3276
3277 Returns: TRUE on success
3278 */
3279
3280 static BOOL
3281 compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3282 const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes,
3283 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
3284 {
3285 const uschar *ptr = *ptrptr;
3286 uschar *code = *codeptr;
3287 uschar *last_branch = code;
3288 uschar *start_bracket = code;
3289 uschar *reverse_count = NULL;
3290 int firstbyte, reqbyte;
3291 int branchfirstbyte, branchreqbyte;
3292 branch_chain bc;
3293
3294 bc.outer = bcptr;
3295 bc.current = code;
3296
3297 firstbyte = reqbyte = REQ_UNSET;
3298
3299 /* Offset is set zero to mark that this bracket is still open */
3300
3301 PUT(code, 1, 0);
3302 code += 1 + LINK_SIZE + skipbytes;
3303
3304 /* Loop for each alternative branch */
3305
3306 for (;;)
3307 {
3308 /* Handle a change of ims options at the start of the branch */
3309
3310 if ((options & PCRE_IMS) != oldims)
3311 {
3312 *code++ = OP_OPT;
3313 *code++ = options & PCRE_IMS;
3314 }
3315
3316 /* Set up dummy OP_REVERSE if lookbehind assertion */
3317
3318 if (lookbehind)
3319 {
3320 *code++ = OP_REVERSE;
3321 reverse_count = code;
3322 PUTINC(code, 0, 0);
3323 }
3324
3325 /* Now compile the branch */
3326
3327 if (!compile_branch(&options, brackets, &code, &ptr, errorptr,
3328 &branchfirstbyte, &branchreqbyte, &bc, cd))
3329 {
3330 *ptrptr = ptr;
3331 return FALSE;
3332 }
3333
3334 /* If this is the first branch, the firstbyte and reqbyte values for the
3335 branch become the values for the regex. */
3336
3337 if (*last_branch != OP_ALT)
3338 {
3339 firstbyte = branchfirstbyte;
3340 reqbyte = branchreqbyte;
3341 }
3342
3343 /* If this is not the first branch, the first char and reqbyte have to
3344 match the values from all the previous branches, except that if the previous
3345 value for reqbyte didn't have REQ_VARY set, it can still match, and we set
3346 REQ_VARY for the regex. */
3347
3348 else
3349 {
3350 /* If we previously had a firstbyte, but it doesn't match the new branch,
3351 we have to abandon the firstbyte for the regex, but if there was previously
3352 no reqbyte, it takes on the value of the old firstbyte. */
3353
3354 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
3355 {
3356 if (reqbyte < 0) reqbyte = firstbyte;
3357 firstbyte = REQ_NONE;
3358 }
3359
3360 /* If we (now or from before) have no firstbyte, a firstbyte from the
3361 branch becomes a reqbyte if there isn't a branch reqbyte. */
3362
3363 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
3364 branchreqbyte = branchfirstbyte;
3365
3366 /* Now ensure that the reqbytes match */
3367
3368 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
3369 reqbyte = REQ_NONE;
3370 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
3371 }
3372
3373 /* If lookbehind, check that this branch matches a fixed-length string,
3374 and put the length into the OP_REVERSE item. Temporarily mark the end of
3375 the branch with OP_END. */
3376
3377 if (lookbehind)
3378 {
3379 int length;
3380 *code = OP_END;
3381 length = find_fixedlength(last_branch, options);
3382 DPRINTF(("fixed length = %d\n", length));
3383 if (length < 0)
3384 {
3385 *errorptr = (length == -2)? ERR36 : ERR25;
3386 *ptrptr = ptr;
3387 return FALSE;
3388 }
3389 PUT(reverse_count, 0, length);
3390 }
3391
3392 /* Reached end of expression, either ')' or end of pattern. Go back through
3393 the alternative branches and reverse the chain of offsets, with the field in
3394 the BRA item now becoming an offset to the first alternative. If there are
3395 no alternatives, it points to the end of the group. The length in the
3396 terminating ket is always the length of the whole bracketed item. If any of
3397 the ims options were changed inside the group, compile a resetting op-code
3398 following, except at the very end of the pattern. Return leaving the pointer
3399 at the terminating char. */
3400
3401 if (*ptr != '|')
3402 {
3403 int length = code - last_branch;
3404 do
3405 {
3406 int prev_length = GET(last_branch, 1);
3407 PUT(last_branch, 1, length);
3408 length = prev_length;
3409 last_branch -= length;
3410 }
3411 while (length > 0);
3412
3413 /* Fill in the ket */
3414
3415 *code = OP_KET;
3416 PUT(code, 1, code - start_bracket);
3417 code += 1 + LINK_SIZE;
3418
3419 /* Resetting option if needed */
3420
3421 if ((options & PCRE_IMS) != oldims && *ptr == ')')
3422 {
3423 *code++ = OP_OPT;
3424 *code++ = oldims;
3425 }
3426
3427 /* Set values to pass back */
3428
3429 *codeptr = code;
3430 *ptrptr = ptr;
3431 *firstbyteptr = firstbyte;
3432 *reqbyteptr = reqbyte;
3433 return TRUE;
3434 }
3435
3436 /* Another branch follows; insert an "or" node. Its length field points back
3437 to the previous branch while the bracket remains open. At the end the chain
3438 is reversed. It's done like this so that the start of the bracket has a
3439 zero offset until it is closed, making it possible to detect recursion. */
3440
3441 *code = OP_ALT;
3442 PUT(code, 1, code - last_branch);
3443 bc.current = last_branch = code;
3444 code += 1 + LINK_SIZE;
3445 ptr++;
3446 }
3447 /* Control never reaches here */
3448 }
3449
3450
3451
3452
3453 /*************************************************
3454 * Check for anchored expression *
3455 *************************************************/
3456
3457 /* Try to find out if this is an anchored regular expression. Consider each
3458 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
3459 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
3460 it's anchored. However, if this is a multiline pattern, then only OP_SOD
3461 counts, since OP_CIRC can match in the middle.
3462
3463 We can also consider a regex to be anchored if OP_SOM starts all its branches.
3464 This is the code for \G, which means "match at start of match position, taking
3465 into account the match offset".
3466
3467 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
3468 because that will try the rest of the pattern at all possible matching points,
3469 so there is no point trying again.... er ....
3470
3471 .... except when the .* appears inside capturing parentheses, and there is a
3472 subsequent back reference to those parentheses. We haven't enough information
3473 to catch that case precisely.
3474
3475 At first, the best we could do was to detect when .* was in capturing brackets
3476 and the highest back reference was greater than or equal to that level.
3477 However, by keeping a bitmap of the first 31 back references, we can catch some
3478 of the more common cases more precisely.
3479
3480 Arguments:
3481 code points to start of expression (the bracket)
3482 options points to the options setting
3483 bracket_map a bitmap of which brackets we are inside while testing; this
3484 handles up to substring 31; after that we just have to take
3485 the less precise approach
3486 backref_map the back reference bitmap
3487
3488 Returns: TRUE or FALSE
3489 */
3490
3491 static BOOL
3492 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
3493 unsigned int backref_map)
3494 {
3495 do {
3496 const uschar *scode =
3497 first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE);
3498 register int op = *scode;
3499
3500 /* Capturing brackets */
3501
3502 if (op > OP_BRA)
3503 {
3504 int new_map;
3505 op -= OP_BRA;
3506 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3507 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3508 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
3509 }
3510
3511 /* Other brackets */
3512
3513 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3514 {
3515 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
3516 }
3517
3518 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
3519 are or may be referenced. */
3520
3521 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
3522 (*options & PCRE_DOTALL) != 0)
3523 {
3524 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3525 }
3526
3527 /* Check for explicit anchoring */
3528
3529 else if (op != OP_SOD && op != OP_SOM &&
3530 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
3531 return FALSE;
3532 code += GET(code, 1);
3533 }
3534 while (*code == OP_ALT); /* Loop for each alternative */
3535 return TRUE;
3536 }
3537
3538
3539
3540 /*************************************************
3541 * Check for starting with ^ or .* *
3542 *************************************************/
3543
3544 /* This is called to find out if every branch starts with ^ or .* so that
3545 "first char" processing can be done to speed things up in multiline
3546 matching and for non-DOTALL patterns that start with .* (which must start at
3547 the beginning or after \n). As in the case of is_anchored() (see above), we
3548 have to take account of back references to capturing brackets that contain .*
3549 because in that case we can't make the assumption.
3550
3551 Arguments:
3552 code points to start of expression (the bracket)
3553 bracket_map a bitmap of which brackets we are inside while testing; this
3554 handles up to substring 31; after that we just have to take
3555 the less precise approach
3556 backref_map the back reference bitmap
3557
3558 Returns: TRUE or FALSE
3559 */
3560
3561 static BOOL
3562 is_startline(const uschar *code, unsigned int bracket_map,
3563 unsigned int backref_map)
3564 {
3565 do {
3566 const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0);
3567 register int op = *scode;
3568
3569 /* Capturing brackets */
3570
3571 if (op > OP_BRA)
3572 {
3573 int new_map;
3574 op -= OP_BRA;
3575 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3576 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3577 if (!is_startline(scode, new_map, backref_map)) return FALSE;
3578 }
3579
3580 /* Other brackets */
3581
3582 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3583 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
3584
3585 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
3586 may be referenced. */
3587
3588 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
3589 {
3590 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3591 }
3592
3593 /* Check for explicit circumflex */
3594
3595 else if (op != OP_CIRC) return FALSE;
3596 code += GET(code, 1);
3597 }
3598 while (*code == OP_ALT); /* Loop for each alternative */
3599 return TRUE;
3600 }
3601
3602
3603
3604 /*************************************************
3605 * Check for asserted fixed first char *
3606 *************************************************/
3607
3608 /* During compilation, the "first char" settings from forward assertions are
3609 discarded, because they can cause conflicts with actual literals that follow.
3610 However, if we end up without a first char setting for an unanchored pattern,
3611 it is worth scanning the regex to see if there is an initial asserted first
3612 char. If all branches start with the same asserted char, or with a bracket all
3613 of whose alternatives start with the same asserted char (recurse ad lib), then
3614 we return that char, otherwise -1.
3615
3616 Arguments:
3617 code points to start of expression (the bracket)
3618 options pointer to the options (used to check casing changes)
3619 inassert TRUE if in an assertion
3620
3621 Returns: -1 or the fixed first char
3622 */
3623
3624 static int
3625 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
3626 {
3627 register int c = -1;
3628 do {
3629 int d;
3630 const uschar *scode =
3631 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS);
3632 register int op = *scode;
3633
3634 if (op >= OP_BRA) op = OP_BRA;
3635
3636 switch(op)
3637 {
3638 default:
3639 return -1;
3640
3641 case OP_BRA:
3642 case OP_ASSERT:
3643 case OP_ONCE:
3644 case OP_COND:
3645 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
3646 return -1;
3647 if (c < 0) c = d; else if (c != d) return -1;
3648 break;
3649
3650 case OP_EXACT: /* Fall through */
3651 scode++;
3652
3653 case OP_CHARS: /* Fall through */
3654 scode++;
3655
3656 case OP_PLUS:
3657 case OP_MINPLUS:
3658 if (!inassert) return -1;
3659 if (c < 0)
3660 {
3661 c = scode[1];
3662 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
3663 }
3664 else if (c != scode[1]) return -1;
3665 break;
3666 }
3667
3668 code += GET(code, 1);
3669 }
3670 while (*code == OP_ALT);
3671 return c;
3672 }
3673
3674
3675
3676
3677 /*************************************************
3678 * Compile a Regular Expression *
3679 *************************************************/
3680
3681 /* This function takes a string and returns a pointer to a block of store
3682 holding a compiled version of the expression.
3683
3684 Arguments:
3685 pattern the regular expression
3686 options various option bits
3687 errorptr pointer to pointer to error text
3688 erroroffset ptr offset in pattern where error was detected
3689 tables pointer to character tables or NULL
3690
3691 Returns: pointer to compiled data block, or NULL on error,
3692 with errorptr and erroroffset set
3693 */
3694
3695 pcre *
3696 pcre_compile(const char *pattern, int options, const char **errorptr,
3697 int *erroroffset, const unsigned char *tables)
3698 {
3699 real_pcre *re;
3700 int length = 1 + LINK_SIZE; /* For initial BRA plus length */
3701 int runlength;
3702 int c, firstbyte, reqbyte;
3703 int bracount = 0;
3704 int branch_extra = 0;
3705 int branch_newextra;
3706 int item_count = -1;
3707 int name_count = 0;
3708 int max_name_size = 0;
3709 #ifdef SUPPORT_UTF8
3710 int lastcharlength = 0;
3711 BOOL utf8;
3712 BOOL class_utf8;
3713 #endif
3714 BOOL inescq = FALSE;
3715 unsigned int brastackptr = 0;
3716 size_t size;
3717 uschar *code;
3718 const uschar *codestart;
3719 const uschar *ptr;
3720 compile_data compile_block;
3721 int brastack[BRASTACK_SIZE];
3722 uschar bralenstack[BRASTACK_SIZE];
3723
3724 /* We can't pass back an error message if errorptr is NULL; I guess the best we
3725 can do is just return NULL. */
3726
3727 if (errorptr == NULL) return NULL;
3728 *errorptr = NULL;
3729
3730 /* However, we can give a message for this error */
3731
3732 if (erroroffset == NULL)
3733 {
3734 *errorptr = ERR16;
3735 return NULL;
3736 }
3737 *erroroffset = 0;
3738
3739 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
3740
3741 #ifdef SUPPORT_UTF8
3742 utf8 = (options & PCRE_UTF8) != 0;
3743 #else
3744 if ((options & PCRE_UTF8) != 0)
3745 {
3746 *errorptr = ERR32;
3747 return NULL;
3748 }
3749 #endif
3750
3751 if ((options & ~PUBLIC_OPTIONS) != 0)
3752 {
3753 *errorptr = ERR17;
3754 return NULL;
3755 }
3756
3757 /* Set up pointers to the individual character tables */
3758
3759 if (tables == NULL) tables = pcre_default_tables;
3760 compile_block.lcc = tables + lcc_offset;
3761 compile_block.fcc = tables + fcc_offset;
3762 compile_block.cbits = tables + cbits_offset;
3763 compile_block.ctypes = tables + ctypes_offset;
3764
3765 /* Maximum back reference and backref bitmap. This is updated for numeric
3766 references during the first pass, but for named references during the actual
3767 compile pass. The bitmap records up to 31 back references to help in deciding
3768 whether (.*) can be treated as anchored or not. */
3769
3770 compile_block.top_backref = 0;
3771 compile_block.backref_map = 0;
3772
3773 /* Reflect pattern for debugging output */
3774
3775 DPRINTF(("------------------------------------------------------------------\n"));
3776 DPRINTF(("%s\n", pattern));
3777
3778 /* The first thing to do is to make a pass over the pattern to compute the
3779 amount of store required to hold the compiled code. This does not have to be
3780 perfect as long as errors are overestimates. At the same time we can detect any
3781 flag settings right at the start, and extract them. Make an attempt to correct
3782 for any counted white space if an "extended" flag setting appears late in the
3783 pattern. We can't be so clever for #-comments. */
3784
3785 ptr = (const uschar *)(pattern - 1);
3786 while ((c = *(++ptr)) != 0)
3787 {
3788 int min, max;
3789 int class_optcount;
3790 int bracket_length;
3791 int duplength;
3792
3793 /* If we are inside a \Q...\E sequence, all chars are literal */
3794
3795 if (inescq) goto NORMAL_CHAR;
3796
3797 /* Otherwise, first check for ignored whitespace and comments */
3798
3799 if ((options & PCRE_EXTENDED) != 0)
3800 {
3801 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
3802 if (c == '#')
3803 {
3804 /* The space before the ; is to avoid a warning on a silly compiler
3805 on the Macintosh. */
3806 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
3807 if (c == 0) break;
3808 continue;
3809 }
3810 }
3811
3812 item_count++; /* Is zero for the first non-comment item */
3813
3814 switch(c)
3815 {
3816 /* A backslashed item may be an escaped "normal" character or a
3817 character type. For a "normal" character, put the pointers and
3818 character back so that tests for whitespace etc. in the input
3819 are done correctly. */
3820
3821 case '\\':
3822 {
3823 const uschar *save_ptr = ptr;
3824 c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
3825 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3826 if (c >= 0)
3827 {
3828 ptr = save_ptr;
3829 c = '\\';
3830 goto NORMAL_CHAR;
3831 }
3832 }
3833
3834 /* If \Q, enter "literal" mode */
3835
3836 if (-c == ESC_Q)
3837 {
3838 inescq = TRUE;
3839 continue;
3840 }
3841
3842 /* Other escapes need one byte, and are of length one for repeats */
3843
3844 length++;
3845 #ifdef SUPPORT_UTF8
3846 lastcharlength = 1;
3847 #endif
3848
3849 /* A back reference needs an additional 2 bytes, plus either one or 5
3850 bytes for a repeat. We also need to keep the value of the highest
3851 back reference. */
3852
3853 if (c <= -ESC_REF)
3854 {
3855 int refnum = -c - ESC_REF;
3856 compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
3857 if (refnum > compile_block.top_backref)
3858 compile_block.top_backref = refnum;
3859 length += 2; /* For single back reference */
3860 if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
3861 {
3862 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
3863 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3864 if ((min == 0 && (max == 1 || max == -1)) ||
3865 (min == 1 && max == -1))
3866 length++;
3867 else length += 5;
3868 if (ptr[1] == '?') ptr++;
3869 }
3870 }
3871 continue;
3872
3873 case '^': /* Single-byte metacharacters */
3874 case '.':
3875 case '$':
3876 length++;
3877 #ifdef SUPPORT_UTF8
3878 lastcharlength = 1;
3879 #endif
3880 continue;
3881
3882 case '*': /* These repeats won't be after brackets; */
3883 case '+': /* those are handled separately */
3884 case '?':
3885 length++;
3886 goto POSESSIVE; /* A few lines below */
3887
3888 /* This covers the cases of braced repeats after a single char, metachar,
3889 class, or back reference. */
3890
3891 case '{':
3892 if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
3893 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
3894 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3895
3896 /* These special cases just insert one extra opcode */
3897
3898 if ((min == 0 && (max == 1 || max == -1)) ||
3899 (min == 1 && max == -1))
3900 length++;
3901
3902 /* These cases might insert additional copies of a preceding character. */
3903
3904 else
3905 {
3906 #ifdef SUPPORT_UTF8
3907 /* In UTF-8 mode, we should find the length in lastcharlength */
3908 if (utf8)
3909 {
3910 if (min != 1)
3911 {
3912 length -= lastcharlength; /* Uncount the original char or metachar */
3913 if (min > 0) length += 3 + lastcharlength;
3914 }
3915 length += lastcharlength + ((max > 0)? 3 : 1);
3916 }
3917 else
3918 #endif
3919
3920 /* Not UTF-8 mode: all characters are one byte */
3921 {
3922 if (min != 1)
3923 {
3924 length--; /* Uncount the original char or metachar */
3925 if (min > 0) length += 4;
3926 }
3927
3928 length += (max > 0)? 4 : 2;
3929 }
3930 }
3931
3932 if (ptr[1] == '?') ptr++; /* Needs no extra length */
3933
3934 POSESSIVE: /* Test for possessive quantifier */
3935 if (ptr[1] == '+')
3936 {
3937 ptr++;
3938 length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */
3939 }
3940 continue;
3941
3942 /* An alternation contains an offset to the next branch or ket. If any ims
3943 options changed in the previous branch(es), and/or if we are in a
3944 lookbehind assertion, extra space will be needed at the start of the
3945 branch. This is handled by branch_extra. */
3946
3947 case '|':
3948 length += 1 + LINK_SIZE + branch_extra;
3949 continue;
3950
3951 /* A character class uses 33 characters provided that all the character
3952 values are less than 256. Otherwise, it uses a bit map for low valued
3953 characters, and individual items for others. Don't worry about character
3954 types that aren't allowed in classes - they'll get picked up during the
3955 compile. A character class that contains only one single-byte character
3956 uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
3957 where we can. (In UTF-8 mode we can do this only for chars < 128.) */
3958
3959 case '[':
3960 class_optcount = 0;
3961
3962 #ifdef SUPPORT_UTF8
3963 class_utf8 = FALSE;
3964 #endif
3965
3966 if (*(++ptr) == '^') ptr++;
3967
3968 /* Written as a "do" so that an initial ']' is taken as data */
3969
3970 if (*ptr != 0) do
3971 {
3972 /* Inside \Q...\E everything is literal except \E */
3973
3974 if (inescq)
3975 {
3976 if (*ptr != '\\' || ptr[1] != 'E') goto NON_SPECIAL_CHARACTER;
3977 inescq = FALSE;
3978 ptr += 1;
3979 continue;
3980 }
3981
3982 /* Outside \Q...\E, check for escapes */
3983
3984 if (*ptr == '\\')
3985 {
3986 #ifdef SUPPORT_UTF8
3987 int prevchar = ptr[-1];
3988 #endif
3989 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
3990 &compile_block);
3991 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3992
3993 /* \b is backspace inside a class */
3994
3995 if (-ch == ESC_b) ch = '\b';
3996
3997 /* \Q enters quoting mode */
3998
3999 if (-ch == ESC_Q)
4000 {
4001 inescq = TRUE;
4002 continue;
4003 }
4004
4005 /* Handle escapes that turn into characters */
4006
4007 if (ch >= 0)
4008 {
4009 #ifdef SUPPORT_UTF8
4010 if (utf8)
4011 {
4012 if (ch > 127) class_optcount = 10; /* Ensure > 1 */
4013 if (ch > 255)
4014 {
4015 uschar buffer[6];
4016 if (!class_utf8)
4017 {
4018 class_utf8 = TRUE;
4019 length += LINK_SIZE + 1 + 1;
4020 }
4021 length += 1 + ord2utf8(ch, buffer);
4022
4023 /* If this wide character is preceded by '-', add an extra 2 to
4024 the length in case the previous character was < 128, because in
4025 this case the whole range will be put into the list. */
4026
4027 if (prevchar == '-') length += 2;
4028 }
4029 }
4030 #endif
4031 class_optcount++; /* for possible optimization */
4032 }
4033 else class_optcount = 10; /* \d, \s etc; make sure > 1 */
4034 }
4035
4036 /* Check the syntax for POSIX stuff. The bits we actually handle are
4037 checked during the real compile phase. */
4038
4039 else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))
4040 {
4041 ptr++;
4042 class_optcount = 10; /* Make sure > 1 */
4043 }
4044
4045 /* Anything else just increments the possible optimization count. If
4046 there are wide characters, we are going to have to use an XCLASS. */
4047
4048 else
4049 {
4050 NON_SPECIAL_CHARACTER:
4051 class_optcount++;
4052
4053 #ifdef SUPPORT_UTF8
4054 if (utf8)
4055 {
4056 int ch;
4057 int extra = 0;
4058 GETCHARLEN(ch, ptr, extra);
4059 if (ch > 127) class_optcount = 10; /* No optimization possible */
4060 if (ch > 255)
4061 {
4062 if (!class_utf8)
4063 {
4064 class_utf8 = TRUE;
4065 length += LINK_SIZE + 1 + 1;
4066 }
4067 length += 2 + extra;
4068
4069 /* If this wide character is preceded by '-', add an extra 2 to
4070 the length in case the previous character was < 128, because in
4071 this case the whole range will be put into the list. */
4072
4073 if (ptr[-1] == '-') length += 2;
4074
4075 /* Advance to the end of this character */
4076
4077 ptr += extra;
4078 }
4079 }
4080 #endif
4081 }
4082 }
4083 while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */
4084
4085 if (*ptr == 0) /* Missing terminating ']' */
4086 {
4087 *errorptr = ERR6;
4088 goto PCRE_ERROR_RETURN;
4089 }
4090
4091 /* We can optimize when there was only one optimizable character. Repeats
4092 for positive and negated single one-byte chars are handled by the general
4093 code. Here, we handle repeats for the class opcodes. */
4094
4095 if (class_optcount == 1) length += 3; else
4096 {
4097 length += 33;
4098
4099 /* A repeat needs either 1 or 5 bytes. */
4100
4101 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
4102 {
4103 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
4104 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4105 if ((min == 0 && (max == 1 || max == -1)) ||
4106 (min == 1 && max == -1))
4107 length++;
4108 else length += 5;
4109 if (ptr[1] == '?') ptr++;
4110 }
4111 }
4112 continue;
4113
4114 /* Brackets may be genuine groups or special things */
4115
4116 case '(':
4117 branch_newextra = 0;
4118 bracket_length = 1 + LINK_SIZE;
4119
4120 /* Handle special forms of bracket, which all start (? */
4121
4122 if (ptr[1] == '?')
4123 {
4124 int set, unset;
4125 int *optset;
4126
4127 switch (c = ptr[2])
4128 {
4129 /* Skip over comments entirely */
4130 case '#':
4131 ptr += 3;
4132 while (*ptr != 0 && *ptr != ')') ptr++;
4133 if (*ptr == 0)
4134 {
4135 *errorptr = ERR18;
4136 goto PCRE_ERROR_RETURN;
4137 }
4138 continue;
4139
4140 /* Non-referencing groups and lookaheads just move the pointer on, and
4141 then behave like a non-special bracket, except that they don't increment
4142 the count of extracting brackets. Ditto for the "once only" bracket,
4143 which is in Perl from version 5.005. */
4144
4145 case ':':
4146 case '=':
4147 case '!':
4148 case '>':
4149 ptr += 2;
4150 break;
4151
4152 /* (?R) specifies a recursive call to the regex, which is an extension
4153 to provide the facility which can be obtained by (?p{perl-code}) in
4154 Perl 5.6. In Perl 5.8 this has become (??{perl-code}).
4155
4156 From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to
4157 the appropriate numbered brackets. This includes both recursive and
4158 non-recursive calls. (?R) is now synonymous with (?0). */
4159
4160 case 'R':
4161 ptr++;
4162
4163 case '0': case '1': case '2': case '3': case '4':
4164 case '5': case '6': case '7': case '8': case '9':
4165 ptr += 2;
4166 if (c != 'R')
4167 while ((compile_block.ctypes[*(++ptr)] & ctype_digit) != 0);
4168 if (*ptr != ')')
4169 {
4170 *errorptr = ERR29;
4171 goto PCRE_ERROR_RETURN;
4172 }
4173 length += 1 + LINK_SIZE;
4174
4175 /* If this item is quantified, it will get wrapped inside brackets so
4176 as to use the code for quantified brackets. We jump down and use the
4177 code that handles this for real brackets. */
4178
4179 if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')
4180 {
4181 length += 2 + 2 * LINK_SIZE; /* to make bracketed */
4182 duplength = 5 + 3 * LINK_SIZE;
4183 goto HANDLE_QUANTIFIED_BRACKETS;
4184 }
4185 continue;
4186
4187 /* (?C) is an extension which provides "callout" - to provide a bit of
4188 the functionality of the Perl (?{...}) feature. An optional number may
4189 follow (default is zero). */
4190
4191 case 'C':
4192 ptr += 2;
4193 while ((compile_block.ctypes[*(++ptr)] & ctype_digit) != 0);
4194 if (*ptr != ')')
4195 {
4196 *errorptr = ERR39;
4197 goto PCRE_ERROR_RETURN;
4198 }
4199 length += 2;
4200 continue;
4201
4202 /* Named subpatterns are an extension copied from Python */
4203
4204 case 'P':
4205 ptr += 3;
4206 if (*ptr == '<')
4207 {
4208 const uschar *p; /* Don't amalgamate; some compilers */
4209 p = ++ptr; /* grumble at autoincrement in declaration */
4210 while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;
4211 if (*ptr != '>')
4212 {
4213 *errorptr = ERR42;
4214 goto PCRE_ERROR_RETURN;
4215 }
4216 name_count++;
4217 if (ptr - p > max_name_size) max_name_size = (ptr - p);
4218 break;
4219 }
4220
4221 if (*ptr == '=' || *ptr == '>')
4222 {
4223 while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0);
4224 if (*ptr != ')')
4225 {
4226 *errorptr = ERR42;
4227 goto PCRE_ERROR_RETURN;
4228 }
4229 break;
4230 }
4231
4232 /* Unknown character after (?P */
4233
4234 *errorptr = ERR41;
4235 goto PCRE_ERROR_RETURN;
4236
4237 /* Lookbehinds are in Perl from version 5.005 */
4238
4239 case '<':
4240 ptr += 3;
4241 if (*ptr == '=' || *ptr == '!')
4242 {
4243 branch_newextra = 1 + LINK_SIZE;
4244 length += 1 + LINK_SIZE; /* For the first branch */
4245 break;
4246 }
4247 *errorptr = ERR24;
4248 goto PCRE_ERROR_RETURN;
4249
4250 /* Conditionals are in Perl from version 5.005. The bracket must either
4251 be followed by a number (for bracket reference) or by an assertion
4252 group, or (a PCRE extension) by 'R' for a recursion test. */
4253
4254 case '(':
4255 if (ptr[3] == 'R' && ptr[4] == ')')
4256 {
4257 ptr += 4;
4258 length += 3;
4259 }
4260 else if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
4261 {
4262 ptr += 4;
4263 length += 3;
4264 while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
4265 if (*ptr != ')')
4266 {
4267 *errorptr = ERR26;
4268 goto PCRE_ERROR_RETURN;
4269 }
4270 }
4271 else /* An assertion must follow */
4272 {
4273 ptr++; /* Can treat like ':' as far as spacing is concerned */
4274 if (ptr[2] != '?' ||
4275 (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
4276 {
4277 ptr += 2; /* To get right offset in message */
4278 *errorptr = ERR28;
4279 goto PCRE_ERROR_RETURN;
4280 }
4281 }
4282 break;
4283
4284 /* Else loop checking valid options until ) is met. Anything else is an
4285 error. If we are without any brackets, i.e. at top level, the settings
4286 act as if specified in the options, so massage the options immediately.
4287 This is for backward compatibility with Perl 5.004. */
4288
4289 default:
4290 set = unset = 0;
4291 optset = &set;
4292 ptr += 2;
4293
4294 for (;; ptr++)
4295 {
4296 c = *ptr;
4297 switch (c)
4298 {
4299 case 'i':
4300 *optset |= PCRE_CASELESS;
4301 continue;
4302
4303 case 'm':
4304 *optset |= PCRE_MULTILINE;
4305 continue;
4306
4307 case 's':
4308 *optset |= PCRE_DOTALL;
4309 continue;
4310
4311 case 'x':
4312 *optset |= PCRE_EXTENDED;
4313 continue;
4314
4315 case 'X':
4316 *optset |= PCRE_EXTRA;
4317 continue;
4318
4319 case 'U':
4320 *optset |= PCRE_UNGREEDY;
4321 continue;
4322
4323 case '-':
4324 optset = &unset;
4325 continue;
4326
4327 /* A termination by ')' indicates an options-setting-only item; if
4328 this is at the very start of the pattern (indicated by item_count
4329 being zero), we use it to set the global options. This is helpful
4330 when analyzing the pattern for first characters, etc. Otherwise
4331 nothing is done here and it is handled during the compiling
4332 process.
4333
4334 [Historical note: Up to Perl 5.8, options settings at top level
4335 were always global settings, wherever they appeared in the pattern.
4336 That is, they were equivalent to an external setting. From 5.8
4337 onwards, they apply only to what follows (which is what you might
4338 expect).] */
4339
4340 case ')':
4341 if (item_count == 0)
4342 {
4343 options = (options | set) & (~unset);
4344 set = unset = 0; /* To save length */
4345 item_count--; /* To allow for several */
4346 }
4347
4348 /* Fall through */
4349
4350 /* A termination by ':' indicates the start of a nested group with
4351 the given options set. This is again handled at compile time, but
4352 we must allow for compiled space if any of the ims options are
4353 set. We also have to allow for resetting space at the end of
4354 the group, which is why 4 is added to the length and not just 2.
4355 If there are several changes of options within the same group, this
4356 will lead to an over-estimate on the length, but this shouldn't
4357 matter very much. We also have to allow for resetting options at
4358 the start of any alternations, which we do by setting
4359 branch_newextra to 2. Finally, we record whether the case-dependent
4360 flag ever changes within the regex. This is used by the "required
4361 character" code. */
4362
4363 case ':':
4364 if (((set|unset) & PCRE_IMS) != 0)
4365 {
4366 length += 4;
4367 branch_newextra = 2;
4368 if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
4369 }
4370 goto END_OPTIONS;
4371
4372 /* Unrecognized option character */
4373
4374 default:
4375 *errorptr = ERR12;
4376 goto PCRE_ERROR_RETURN;
4377 }
4378 }
4379
4380 /* If we hit a closing bracket, that's it - this is a freestanding
4381 option-setting. We need to ensure that branch_extra is updated if
4382 necessary. The only values branch_newextra can have here are 0 or 2.
4383 If the value is 2, then branch_extra must either be 2 or 5, depending
4384 on whether this is a lookbehind group or not. */
4385
4386 END_OPTIONS:
4387 if (c == ')')
4388 {
4389 if (branch_newextra == 2 &&
4390 (branch_extra == 0 || branch_extra == 1+LINK_SIZE))
4391 branch_extra += branch_newextra;
4392 continue;
4393 }
4394
4395 /* If options were terminated by ':' control comes here. Fall through
4396 to handle the group below. */
4397 }
4398 }
4399
4400 /* Extracting brackets must be counted so we can process escapes in a
4401 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
4402 need an additional 3 bytes of store per extracting bracket. However, if
4403 PCRE_NO_AUTO)CAPTURE is set, unadorned brackets become non-capturing, so we
4404 must leave the count alone (it will aways be zero). */
4405
4406 else if ((options & PCRE_NO_AUTO_CAPTURE) == 0)
4407 {
4408 bracount++;
4409 if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
4410 }
4411
4412 /* Save length for computing whole length at end if there's a repeat that
4413 requires duplication of the group. Also save the current value of
4414 branch_extra, and start the new group with the new value. If non-zero, this
4415 will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
4416
4417 if (brastackptr >= sizeof(brastack)/sizeof(int))
4418 {
4419 *errorptr = ERR19;
4420 goto PCRE_ERROR_RETURN;
4421 }
4422
4423 bralenstack[brastackptr] = branch_extra;
4424 branch_extra = branch_newextra;
4425
4426 brastack[brastackptr++] = length;
4427 length += bracket_length;
4428 continue;
4429
4430 /* Handle ket. Look for subsequent max/min; for certain sets of values we
4431 have to replicate this bracket up to that many times. If brastackptr is
4432 0 this is an unmatched bracket which will generate an error, but take care
4433 not to try to access brastack[-1] when computing the length and restoring
4434 the branch_extra value. */
4435
4436 case ')':
4437 length += 1 + LINK_SIZE;
4438 if (brastackptr > 0)
4439 {
4440 duplength = length - brastack[--brastackptr];
4441 branch_extra = bralenstack[brastackptr];
4442 }
4443 else duplength = 0;
4444
4445 /* The following code is also used when a recursion such as (?3) is
4446 followed by a quantifier, because in that case, it has to be wrapped inside
4447 brackets so that the quantifier works. The value of duplength must be
4448 set before arrival. */
4449
4450 HANDLE_QUANTIFIED_BRACKETS:
4451
4452 /* Leave ptr at the final char; for read_repeat_counts this happens
4453 automatically; for the others we need an increment. */
4454
4455 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
4456 {
4457 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
4458 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4459 }
4460 else if (c == '*') { min = 0; max = -1; ptr++; }
4461 else if (c == '+') { min = 1; max = -1; ptr++; }
4462 else if (c == '?') { min = 0; max = 1; ptr++; }
4463 else { min = 1; max = 1; }
4464
4465 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
4466 group, and if the maximum is greater than zero, we have to replicate
4467 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
4468 bracket set. */
4469
4470 if (min == 0)
4471 {
4472 length++;
4473 if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
4474 }
4475
4476 /* When the minimum is greater than zero, we have to replicate up to
4477 minval-1 times, with no additions required in the copies. Then, if there
4478 is a limited maximum we have to replicate up to maxval-1 times allowing
4479 for a BRAZERO item before each optional copy and nesting brackets for all
4480 but one of the optional copies. */
4481
4482 else
4483 {
4484 length += (min - 1) * duplength;
4485 if (max > min) /* Need this test as max=-1 means no limit */
4486 length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
4487 - (2 + 2*LINK_SIZE);
4488 }
4489
4490 /* Allow space for once brackets for "possessive quantifier" */
4491
4492 if (ptr[1] == '+')
4493 {
4494 ptr++;
4495 length += 2 + 2*LINK_SIZE;
4496 }
4497 continue;
4498
4499 /* Non-special character. For a run of such characters the length required
4500 is the number of characters + 2, except that the maximum run length is
4501 MAXLIT. We won't get a skipped space or a non-data escape or the start of a
4502 # comment as the first character, so the length can't be zero. */
4503
4504 NORMAL_CHAR:
4505 default:
4506 length += 2;
4507 runlength = 0;
4508 do
4509 {
4510 #ifdef SUPPORT_UTF8
4511 lastcharlength = 1; /* Need length of last char for UTF-8 repeats */
4512 #endif
4513
4514 /* If in a \Q...\E sequence, check for end; otherwise it's a literal */
4515 if (inescq)
4516 {
4517 if (c == '\\' && ptr[1] == 'E')
4518 {
4519 inescq = FALSE;
4520 ptr++;
4521 }
4522 else runlength++;
4523 continue;
4524 }
4525
4526 /* Skip whitespace and comments for /x */
4527
4528 if ((options & PCRE_EXTENDED) != 0)
4529 {
4530 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
4531 if (c == '#')
4532 {
4533 /* The space before the ; is to avoid a warning on a silly compiler
4534 on the Macintosh. */
4535 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
4536 continue;
4537 }
4538 }
4539
4540 /* Backslash may introduce a data char or a metacharacter; stop the
4541 string before the latter. */
4542
4543 if (c == '\\')
4544 {
4545 const uschar *saveptr = ptr;
4546 c = check_escape(&ptr, errorptr, bracount, options, FALSE,
4547 &compile_block);
4548 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4549 if (c < 0) { ptr = saveptr; break; }
4550
4551 /* In UTF-8 mode, add on the number of additional bytes needed to
4552 encode this character, and save the total length in case this is a
4553 final char that is repeated. */
4554
4555 #ifdef SUPPORT_UTF8
4556 if (utf8 && c > 127)
4557 {
4558 int i;
4559 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
4560 if (c <= utf8_table1[i]) break;
4561 runlength += i;
4562 lastcharlength += i;
4563 }
4564 #endif
4565 }
4566
4567 /* Ordinary character or single-char escape */
4568
4569 runlength++;
4570 }
4571
4572 /* This "while" is the end of the "do" above. */
4573
4574 while (runlength < MAXLIT &&
4575 (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
4576
4577 /* If we hit a meta-character, back off to point to it */
4578
4579 if (runlength < MAXLIT) ptr--;
4580
4581 /* If the last char in the string is a UTF-8 multibyte character, we must
4582 set lastcharlength correctly. If it was specified as an escape, this will
4583 already have been done above. However, we also have to support in-line
4584 UTF-8 characters, so check backwards from where we are. */
4585
4586 #ifdef SUPPORT_UTF8
4587 if (utf8)
4588 {
4589 const uschar *lastptr = ptr - 1;
4590 if ((*lastptr & 0x80) != 0)
4591 {
4592 while((*lastptr & 0xc0) == 0x80) lastptr--;
4593 lastcharlength = ptr - lastptr;
4594 }
4595 }
4596 #endif
4597
4598 length += runlength;
4599 continue;
4600 }
4601 }
4602
4603 length += 2 + LINK_SIZE; /* For final KET and END */
4604
4605 if (length > MAX_PATTERN_SIZE)
4606 {
4607 *errorptr = ERR20;
4608 return NULL;
4609 }
4610
4611 /* Compute the size of data block needed and get it, either from malloc or
4612 externally provided function. */
4613
4614 size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
4615 re = (real_pcre *)(pcre_malloc)(size);
4616
4617 if (re == NULL)
4618 {
4619 *errorptr = ERR21;
4620 return NULL;
4621 }
4622
4623 /* Put in the magic number, and save the size, options, and table pointer */
4624
4625 re->magic_number = MAGIC_NUMBER;
4626 re->size = size;
4627 re->options = options;
4628 re->tables = tables;
4629 re->name_entry_size = max_name_size + 3;
4630 re->name_count = name_count;
4631
4632 /* The starting points of the name/number translation table and of the code are
4633 passed around in the compile data block. */
4634
4635 compile_block.names_found = 0;
4636 compile_block.name_entry_size = max_name_size + 3;
4637 compile_block.name_table = (uschar *)re + sizeof(real_pcre);
4638 codestart = compile_block.name_table + re->name_entry_size * re->name_count;
4639 compile_block.start_code = codestart;
4640 compile_block.req_varyopt = 0;
4641
4642 /* Set up a starting, non-extracting bracket, then compile the expression. On
4643 error, *errorptr will be set non-NULL, so we don't need to look at the result
4644 of the function here. */
4645
4646 ptr = (const uschar *)pattern;
4647 code = (uschar *)codestart;
4648 *code = OP_BRA;
4649 bracount = 0;
4650 (void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,
4651 errorptr, FALSE, 0, &firstbyte, &reqbyte, NULL, &compile_block);
4652 re->top_bracket = bracount;
4653 re->top_backref = compile_block.top_backref;
4654
4655 /* If not reached end of pattern on success, there's an excess bracket. */
4656
4657 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
4658
4659 /* Fill in the terminating state and check for disastrous overflow, but
4660 if debugging, leave the test till after things are printed out. */
4661
4662 *code++ = OP_END;
4663
4664 #ifndef DEBUG
4665 if (code - codestart > length) *errorptr = ERR23;
4666 #endif
4667
4668 /* Give an error if there's back reference to a non-existent capturing
4669 subpattern. */
4670
4671 if (re->top_backref > re->top_bracket) *errorptr = ERR15;
4672
4673 /* Failed to compile, or error while post-processing */
4674
4675 if (*errorptr != NULL)
4676 {
4677 (pcre_free)(re);
4678 PCRE_ERROR_RETURN:
4679 *erroroffset = ptr - (const uschar *)pattern;
4680 return NULL;
4681 }
4682
4683 /* If the anchored option was not passed, set the flag if we can determine that
4684 the pattern is anchored by virtue of ^ characters or \A or anything else (such
4685 as starting with .* when DOTALL is set).
4686
4687 Otherwise, if we know what the first character has to be, save it, because that
4688 speeds up unanchored matches no end. If not, see if we can set the
4689 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
4690 start with ^. and also when all branches start with .* for non-DOTALL matches.
4691 */
4692
4693 if ((options & PCRE_ANCHORED) == 0)
4694 {
4695 int temp_options = options;
4696 if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map))
4697 re->options |= PCRE_ANCHORED;
4698 else
4699 {
4700 if (firstbyte < 0)
4701 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
4702 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
4703 {
4704 int ch = firstbyte & 255;
4705 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
4706 compile_block.fcc[ch] == ch)? ch : firstbyte;
4707 re->options |= PCRE_FIRSTSET;
4708 }
4709 else if (is_startline(codestart, 0, compile_block.backref_map))
4710 re->options |= PCRE_STARTLINE;
4711 }
4712 }
4713
4714 /* For an anchored pattern, we use the "required byte" only if it follows a
4715 variable length item in the regex. Remove the caseless flag for non-caseable
4716 chars. */
4717
4718 if (reqbyte >= 0 &&
4719 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
4720 {
4721 int ch = reqbyte & 255;
4722 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
4723 compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
4724 re->options |= PCRE_REQCHSET;
4725 }
4726
4727 /* Print out the compiled data for debugging */
4728
4729 #ifdef DEBUG
4730
4731 printf("Length = %d top_bracket = %d top_backref = %d\n",
4732 length, re->top_bracket, re->top_backref);
4733
4734 if (re->options != 0)
4735 {
4736 printf("%s%s%s%s%s%s%s%s%s\n",
4737 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
4738 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
4739 ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
4740 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
4741 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
4742 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
4743 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
4744 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
4745 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
4746 }
4747
4748 if ((re->options & PCRE_FIRSTSET) != 0)
4749 {
4750 int ch = re->first_byte & 255;
4751 char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
4752 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
4753 else printf("First char = \\x%02x%s\n", ch, caseless);
4754 }
4755
4756 if ((re->options & PCRE_REQCHSET) != 0)
4757 {
4758 int ch = re->req_byte & 255;
4759 char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
4760 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
4761 else printf("Req char = \\x%02x%s\n", ch, caseless);
4762 }
4763
4764 print_internals(re, stdout);
4765
4766 /* This check is done here in the debugging case so that the code that
4767 was compiled can be seen. */
4768
4769 if (code - codestart > length)
4770 {
4771 *errorptr = ERR23;
4772 (pcre_free)(re);
4773 *erroroffset = ptr - (uschar *)pattern;
4774 return NULL;
4775 }
4776 #endif
4777
4778 return (pcre *)re;
4779 }
4780
4781
4782
4783 /*************************************************
4784 * Match a back-reference *
4785 *************************************************/
4786
4787 /* If a back reference hasn't been set, the length that is passed is greater
4788 than the number of characters left in the string, so the match fails.
4789
4790 Arguments:
4791 offset index into the offset vector
4792 eptr points into the subject
4793 length length to be matched
4794 md points to match data block
4795 ims the ims flags
4796
4797 Returns: TRUE if matched
4798 */
4799
4800 static BOOL
4801 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
4802 unsigned long int ims)
4803 {
4804 const uschar *p = md->start_subject + md->offset_vector[offset];
4805
4806 #ifdef DEBUG
4807 if (eptr >= md->end_subject)
4808 printf("matching subject <null>");
4809 else
4810 {
4811 printf("matching subject ");
4812 pchars(eptr, length, TRUE, md);
4813 }
4814 printf(" against backref ");
4815 pchars(p, length, FALSE, md);
4816 printf("\n");
4817 #endif
4818
4819 /* Always fail if not enough characters left */
4820
4821 if (length > md->end_subject - eptr) return FALSE;
4822
4823 /* Separate the caselesss case for speed */
4824
4825 if ((ims & PCRE_CASELESS) != 0)
4826 {
4827 while (length-- > 0)
4828 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
4829 }
4830 else
4831 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
4832
4833 return TRUE;
4834 }
4835
4836
4837 #ifdef SUPPORT_UTF8
4838 /*************************************************
4839 * Match character against an XCLASS *
4840 *************************************************/
4841
4842 /* This function is called from within the XCLASS code below, to match a
4843 character against an extended class which might match values > 255.
4844
4845 Arguments:
4846 c the character
4847 data points to the flag byte of the XCLASS data
4848
4849 Returns: TRUE if character matches, else FALSE
4850 */
4851
4852 static BOOL
4853 match_xclass(int c, const uschar *data)
4854 {
4855 int t;
4856 BOOL negated = (*data & XCL_NOT) != 0;
4857
4858 /* Character values < 256 are matched against a bitmap, if one is present. If
4859 not, we still carry on, because there may be ranges that start below 256 in the
4860 additional data. */
4861
4862 if (c < 256)
4863 {
4864 if ((*data & XCL_MAP) != 0 && (data[1 + c/8] & (1 << (c&7))) != 0)
4865 return !negated; /* char found */
4866 }
4867
4868 /* Now match against the list of large chars or ranges that end with a large
4869 char. First skip the bit map if present. */
4870
4871 if ((*data++ & XCL_MAP) != 0) data += 32;
4872
4873 while ((t = *data++) != XCL_END)
4874 {
4875 int x, y;
4876 GETCHARINC(x, data);
4877 if (t == XCL_SINGLE)
4878 {
4879 if (c == x) return !negated;
4880 }
4881 else
4882 {
4883 GETCHARINC(y, data);
4884 if (c >= x && c <= y) return !negated;
4885 }
4886 }
4887
4888 return negated; /* char was not found */
4889 }
4890 #endif
4891
4892
4893
4894
4895 /*************************************************
4896 * Match from current position *
4897 *************************************************/
4898
4899 /* On entry ecode points to the first opcode, and eptr to the first character
4900 in the subject string, while eptrb holds the value of eptr at the start of the
4901 last bracketed group - used for breaking infinite loops matching zero-length
4902 strings. This function is called recursively in many circumstances. Whenever it
4903 returns a negative (error) response, the outer incarnation must also return the
4904 same response.
4905
4906 Performance note: It might be tempting to extract commonly used fields from the
4907 md structure (e.g. utf8, end_subject) into individual variables to improve
4908 performance. Tests using gcc on a SPARC disproved this; in the first case, it
4909 made performance worse.
4910
4911 Arguments:
4912 eptr pointer in subject
4913 ecode position in code
4914 offset_top current top pointer
4915 md pointer to "static" info for the match
4916 ims current /i, /m, and /s options
4917 eptrb pointer to chain of blocks containing eptr at start of
4918 brackets - for testing for empty matches
4919 flags can contain
4920 match_condassert - this is an assertion condition
4921 match_isgroup - this is the start of a bracketed group
4922
4923 Returns: MATCH_MATCH if matched ) these values are >= 0
4924 MATCH_NOMATCH if failed to match )
4925 a negative PCRE_ERROR_xxx value if aborted by an error condition
4926 (e.g. stopped by recursion limit)
4927 */
4928
4929 static int
4930 match(register const uschar *eptr, register const uschar *ecode,
4931 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
4932 int flags)
4933 {
4934 unsigned long int original_ims = ims; /* Save for resetting on ')' */
4935 register int rrc;
4936 eptrblock newptrb;
4937
4938 if (md->match_call_count++ >= md->match_limit) return PCRE_ERROR_MATCHLIMIT;
4939
4940 /* At the start of a bracketed group, add the current subject pointer to the
4941 stack of such pointers, to be re-instated at the end of the group when we hit
4942 the closing ket. When match() is called in other circumstances, we don't add to
4943 the stack. */
4944
4945 if ((flags & match_isgroup) != 0)
4946 {
4947 newptrb.prev = eptrb;
4948 newptrb.saved_eptr = eptr;
4949 eptrb = &newptrb;
4950 }
4951
4952 /* Now start processing the operations. */
4953
4954 for (;;)
4955 {
4956 int op = (int)*ecode;
4957 int min, max, ctype;
4958 register int i;
4959 register int c;
4960 BOOL minimize = FALSE;
4961
4962 /* Opening capturing bracket. If there is space in the offset vector, save
4963 the current subject position in the working slot at the top of the vector. We
4964 mustn't change the current values of the data slot, because they may be set
4965 from a previous iteration of this group, and be referred to by a reference
4966 inside the group.
4967
4968 If the bracket fails to match, we need to restore this value and also the
4969 values of the final offsets, in case they were set by a previous iteration of
4970 the same bracket.
4971
4972 If there isn't enough space in the offset vector, treat this as if it were a
4973 non-capturing bracket. Don't worry about setting the flag for the error case
4974 here; that is handled in the code for KET. */
4975
4976 if (op > OP_BRA)
4977 {
4978 int offset;
4979 int number = op - OP_BRA;
4980
4981 /* For extended extraction brackets (large number), we have to fish out the
4982 number from a dummy opcode at the start. */
4983
4984 if (number > EXTRACT_BASIC_MAX)
4985 number = GET2(ecode, 2+LINK_SIZE);
4986 offset = number << 1;
4987
4988 #ifdef DEBUG
4989 printf("start bracket %d subject=", number);
4990 pchars(eptr, 16, TRUE, md);
4991 printf("\n");
4992 #endif
4993
4994 if (offset < md->offset_max)
4995 {
4996 int save_offset1 = md->offset_vector[offset];
4997 int save_offset2 = md->offset_vector[offset+1];
4998 int save_offset3 = md->offset_vector[md->offset_end - number];
4999 int save_capture_last = md->capture_last;
5000
5001 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
5002 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
5003
5004 do
5005 {
5006 if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
5007 eptrb, match_isgroup)) != MATCH_NOMATCH) return rrc;
5008 md->capture_last = save_capture_last;
5009 ecode += GET(ecode, 1);
5010 }
5011 while (*ecode == OP_ALT);
5012
5013 DPRINTF(("bracket %d failed\n", number));
5014
5015 md->offset_vector[offset] = save_offset1;
5016 md->offset_vector[offset+1] = save_offset2;
5017 md->offset_vector[md->offset_end - number] = save_offset3;
5018
5019 return MATCH_NOMATCH;
5020 }
5021
5022 /* Insufficient room for saving captured contents */
5023
5024 else op = OP_BRA;
5025 }
5026
5027 /* Other types of node can be handled by a switch */
5028
5029 switch(op)
5030 {
5031 case OP_BRA: /* Non-capturing bracket: optimized */
5032 DPRINTF(("start bracket 0\n"));
5033 do
5034 {
5035 if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
5036 match_isgroup)) != MATCH_NOMATCH) return rrc;
5037 ecode += GET(ecode, 1);
5038 }
5039 while (*ecode == OP_ALT);
5040 DPRINTF(("bracket 0 failed\n"));
5041 return MATCH_NOMATCH;
5042
5043 /* Conditional group: compilation checked that there are no more than
5044 two branches. If the condition is false, skipping the first branch takes us
5045 past the end if there is only one branch, but that's OK because that is
5046 exactly what going to the ket would do. */
5047
5048 case OP_COND:
5049 if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */
5050 {
5051 int offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
5052 BOOL condition = (offset == CREF_RECURSE * 2)?
5053 (md->recursive != NULL) :
5054 (offset < offset_top && md->offset_vector[offset] >= 0);
5055 return match(eptr, ecode + (condition?
5056 (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1))),
5057 offset_top, md, ims, eptrb, match_isgroup);
5058 }
5059
5060 /* The condition is an assertion. Call match() to evaluate it - setting
5061 the final argument TRUE causes it to stop at the end of an assertion. */
5062
5063 else
5064 {
5065 if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
5066 match_condassert | match_isgroup)) == MATCH_MATCH)
5067 {
5068 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2);
5069 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
5070 }
5071 else if (rrc != MATCH_NOMATCH) return rrc;
5072 else ecode += GET(ecode, 1);
5073 return match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
5074 match_isgroup);
5075 }
5076 /* Control never reaches here */
5077
5078 /* Skip over conditional reference or large extraction number data if
5079 encountered. */
5080
5081 case OP_CREF:
5082 case OP_BRANUMBER:
5083 ecode += 3;
5084 break;
5085
5086 /* End of the pattern. If we are in a recursion, we should restore the
5087 offsets appropriately and continue from after the call. */
5088
5089 case OP_END:
5090 if (md->recursive != NULL && md->recursive->group_num == 0)
5091 {
5092 recursion_info *rec = md->recursive;
5093 DPRINTF(("Hit the end in a (?0) recursion\n"));
5094 md->recursive = rec->prev;
5095 memmove(md->offset_vector, rec->offset_save,
5096 rec->saved_max * sizeof(int));
5097 md->start_match = rec->save_start;
5098 ims = original_ims;
5099 ecode = rec->after_call;
5100 break;
5101 }
5102
5103 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
5104 string - backtracking will then try other alternatives, if any. */
5105
5106 if (md->notempty && eptr == md->start_match) return MATCH_NOMATCH;
5107 md->end_match_ptr = eptr; /* Record where we ended */
5108 md->end_offset_top = offset_top; /* and how many extracts were taken */
5109 return MATCH_MATCH;
5110
5111 /* Change option settings */
5112
5113 case OP_OPT:
5114 ims = ecode[1];
5115 ecode += 2;
5116 DPRINTF(("ims set to %02lx\n", ims));
5117 break;
5118
5119 /* Assertion brackets. Check the alternative branches in turn - the
5120 matching won't pass the KET for an assertion. If any one branch matches,
5121 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
5122 start of each branch to move the current point backwards, so the code at
5123 this level is identical to the lookahead case. */
5124
5125 case OP_ASSERT:
5126 case OP_ASSERTBACK:
5127 do
5128 {
5129 if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
5130 match_isgroup)) == MATCH_MATCH) break;
5131 if (rrc != MATCH_NOMATCH) return rrc;
5132 ecode += GET(ecode, 1);
5133 }
5134 while (*ecode == OP_ALT);
5135 if (*ecode == OP_KET) return MATCH_NOMATCH;
5136
5137 /* If checking an assertion for a condition, return MATCH_MATCH. */
5138
5139 if ((flags & match_condassert) != 0) return MATCH_MATCH;
5140
5141 /* Continue from after the assertion, updating the offsets high water
5142 mark, since extracts may have been taken during the assertion. */
5143
5144 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
5145 ecode += 1 + LINK_SIZE;
5146 offset_top = md->end_offset_top;
5147 continue;
5148
5149 /* Negative assertion: all branches must fail to match */
5150
5151 case OP_ASSERT_NOT:
5152 case OP_ASSERTBACK_NOT:
5153 do
5154 {
5155 if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
5156 match_isgroup)) == MATCH_MATCH) return MATCH_NOMATCH;
5157 if (rrc != MATCH_NOMATCH) return rrc;
5158 ecode += GET(ecode,1);
5159 }
5160 while (*ecode == OP_ALT);
5161
5162 if ((flags & match_condassert) != 0) return MATCH_MATCH;
5163
5164 ecode += 1 + LINK_SIZE;
5165 continue;
5166
5167 /* Move the subject pointer back. This occurs only at the start of
5168 each branch of a lookbehind assertion. If we are too close to the start to
5169 move back, this match function fails. When working with UTF-8 we move
5170 back a number of characters, not bytes. */
5171
5172 case OP_REVERSE:
5173 #ifdef SUPPORT_UTF8
5174 c = GET(ecode,1);
5175 for (i = 0; i < c; i++)
5176 {
5177 eptr--;
5178 BACKCHAR(eptr)
5179 }
5180 #else
5181 eptr -= GET(ecode,1);
5182 #endif
5183
5184 if (eptr < md->start_subject) return MATCH_NOMATCH;
5185 ecode += 1 + LINK_SIZE;
5186 break;
5187
5188 /* The callout item calls an external function, if one is provided, passing
5189 details of the match so far. This is mainly for debugging, though the
5190 function is able to force a failure. */
5191
5192 case OP_CALLOUT:
5193 if (pcre_callout != NULL)
5194 {
5195 pcre_callout_block cb;
5196 cb.version = 0; /* Version 0 of the callout block */
5197 cb.callout_number = ecode[1];
5198 cb.offset_vector = md->offset_vector;
5199 cb.subject = (const char *)md->start_subject;
5200 cb.subject_length = md->end_subject - md->start_subject;
5201 cb.start_match = md->start_match - md->start_subject;
5202 cb.current_position = eptr - md->start_subject;
5203 cb.capture_top = offset_top/2;
5204 cb.capture_last = md->capture_last;
5205 cb.callout_data = md->callout_data;
5206 if ((rrc = (*pcre_callout)(&cb)) > 0) return MATCH_NOMATCH;
5207 if (rrc < 0) return rrc;
5208 }
5209 ecode += 2;
5210 break;
5211
5212 /* Recursion either matches the current regex, or some subexpression. The
5213 offset data is the offset to the starting bracket from the start of the
5214 whole pattern. However, it is possible that a BRAZERO was inserted before
5215 this bracket after we took the offset - we just skip it if encountered.
5216
5217 If there are any capturing brackets started but not finished, we have to
5218 save their starting points and reinstate them after the recursion. However,
5219 we don't know how many such there are (offset_top records the completed
5220 total) so we just have to save all the potential data. There may be up to
5221 65535 such values, which is too large to put on the stack, but using malloc
5222 for small numbers seems expensive. As a compromise, the stack is used when
5223 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
5224 is used. A problem is what to do if the malloc fails ... there is no way of
5225 returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
5226 values on the stack, and accept that the rest may be wrong.
5227
5228 There are also other values that have to be saved. We use a chained
5229 sequence of blocks that actually live on the stack. Thanks to Robin Houston
5230 for the original version of this logic. */
5231
5232 case OP_RECURSE:
5233 {
5234 int stacksave[REC_STACK_SAVE_MAX];
5235 recursion_info new_recursive;
5236 const uschar *callpat = md->start_code + GET(ecode, 1);
5237
5238 if (*callpat == OP_BRAZERO) callpat++;
5239
5240 new_recursive.group_num = *callpat - OP_BRA;
5241
5242 /* For extended extraction brackets (large number), we have to fish out
5243 the number from a dummy opcode at the start. */
5244
5245 if (new_recursive.group_num > EXTRACT_BASIC_MAX)
5246 new_recursive.group_num = GET2(callpat, 2+LINK_SIZE);
5247
5248 /* Add to "recursing stack" */
5249
5250 new_recursive.prev = md->recursive;
5251 md->recursive = &new_recursive;
5252
5253 /* Find where to continue from afterwards */
5254
5255 ecode += 1 + LINK_SIZE;
5256 new_recursive.after_call = ecode;
5257
5258 /* Now save the offset data. */
5259
5260 new_recursive.saved_max = md->offset_end;
5261 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
5262 new_recursive.offset_save = stacksave;
5263 else
5264 {
5265 new_recursive.offset_save =
5266 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
5267 if (new_recursive.offset_save == NULL) return PCRE_ERROR_NOMEMORY;
5268 }
5269
5270 memcpy(new_recursive.offset_save, md->offset_vector,
5271 new_recursive.saved_max * sizeof(int));
5272 new_recursive.save_start = md->start_match;
5273 md->start_match = eptr;
5274
5275 /* OK, now we can do the recursion. For each top-level alternative we
5276 restore the offset and recursion data. */
5277
5278 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
5279 do
5280 {
5281 if ((rrc = match(eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims,
5282 eptrb, match_isgroup)) == MATCH_MATCH)
5283 {
5284 md->recursive = new_recursive.prev;
5285 if (new_recursive.offset_save != stacksave)
5286 (pcre_free)(new_recursive.offset_save);
5287 return MATCH_MATCH;
5288 }
5289 else if (rrc != MATCH_NOMATCH) return rrc;
5290
5291 md->recursive = &new_recursive;
5292 memcpy(md->offset_vector, new_recursive.offset_save,
5293 new_recursive.saved_max * sizeof(int));
5294 callpat += GET(callpat, 1);
5295 }
5296 while (*callpat == OP_ALT);
5297
5298 DPRINTF(("Recursion didn't match\n"));
5299 md->recursive = new_recursive.prev;
5300 if (new_recursive.offset_save != stacksave)
5301 (pcre_free)(new_recursive.offset_save);
5302 return MATCH_NOMATCH;
5303 }
5304 /* Control never reaches here */
5305
5306 /* "Once" brackets are like assertion brackets except that after a match,
5307 the point in the subject string is not moved back. Thus there can never be
5308 a move back into the brackets. Friedl calls these "atomic" subpatterns.
5309 Check the alternative branches in turn - the matching won't pass the KET
5310 for this kind of subpattern. If any one branch matches, we carry on as at
5311 the end of a normal bracket, leaving the subject pointer. */
5312
5313 case OP_ONCE:
5314 {
5315 const uschar *prev = ecode;
5316 const uschar *saved_eptr = eptr;
5317
5318 do
5319 {
5320 if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
5321 eptrb, match_isgroup)) == MATCH_MATCH) break;
5322 if (rrc != MATCH_NOMATCH) return rrc;
5323 ecode += GET(ecode,1);
5324 }
5325 while (*ecode == OP_ALT);
5326
5327 /* If hit the end of the group (which could be repeated), fail */
5328
5329 if (*ecode != OP_ONCE && *ecode != OP_ALT) return MATCH_NOMATCH;
5330
5331 /* Continue as from after the assertion, updating the offsets high water
5332 mark, since extracts may have been taken. */
5333
5334 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
5335
5336 offset_top = md->end_offset_top;
5337 eptr = md->end_match_ptr;
5338
5339 /* For a non-repeating ket, just continue at this level. This also
5340 happens for a repeating ket if no characters were matched in the group.
5341 This is the forcible breaking of infinite loops as implemented in Perl
5342 5.005. If there is an options reset, it will get obeyed in the normal
5343 course of events. */
5344
5345 if (*ecode == OP_KET || eptr == saved_eptr)
5346 {
5347 ecode += 1+LINK_SIZE;
5348 break;
5349 }
5350
5351 /* The repeating kets try the rest of the pattern or restart from the
5352 preceding bracket, in the appropriate order. We need to reset any options
5353 that changed within the bracket before re-running it, so check the next
5354 opcode. */
5355
5356 if (ecode[1+LINK_SIZE] == OP_OPT)
5357 {
5358 ims = (ims & ~PCRE_IMS) | ecode[4];
5359 DPRINTF(("ims set to %02lx at group repeat\n", ims));
5360 }
5361
5362 if (*ecode == OP_KETRMIN)
5363 {
5364 if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
5365 eptrb, 0)) != MATCH_NOMATCH) return rrc;
5366 if ((rrc = match(eptr, prev, offset_top, md, ims, eptrb,
5367 match_isgroup)) != MATCH_NOMATCH) return rrc;
5368 }
5369 else /* OP_KETRMAX */
5370 {
5371 if ((rrc = match(eptr, prev, offset_top, md, ims, eptrb,
5372 match_isgroup)) != MATCH_NOMATCH) return rrc;
5373 if ((rrc = match(eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb,
5374 0)) != MATCH_NOMATCH) return rrc;
5375 }
5376 }
5377 return MATCH_NOMATCH;
5378
5379 /* An alternation is the end of a branch; scan along to find the end of the
5380 bracketed group and go to there. */
5381
5382 case OP_ALT:
5383 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
5384 break;
5385
5386 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
5387 that it may occur zero times. It may repeat infinitely, or not at all -
5388 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
5389 repeat limits are compiled as a number of copies, with the optional ones
5390 preceded by BRAZERO or BRAMINZERO. */
5391
5392 case OP_BRAZERO:
5393 {
5394 const uschar *next = ecode+1;
5395 if ((rrc = match(eptr, next, offset_top, md, ims, eptrb, match_isgroup))
5396 != MATCH_NOMATCH) return rrc;
5397 do next += GET(next,1); while (*next == OP_ALT);
5398 ecode = next + 1+LINK_SIZE;
5399 }
5400 break;
5401
5402 case OP_BRAMINZERO:
5403 {
5404 const uschar *next = ecode+1;
5405 do next += GET(next,1); while (*next == OP_ALT);
5406 if ((rrc = match(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb,
5407 match_isgroup)) != MATCH_NOMATCH) return rrc;
5408 ecode++;
5409 }
5410 break;
5411
5412 /* End of a group, repeated or non-repeating. If we are at the end of
5413 an assertion "group", stop matching and return MATCH_MATCH, but record the
5414 current high water mark for use by positive assertions. Do this also
5415 for the "once" (not-backup up) groups. */
5416
5417 case OP_KET:
5418 case OP_KETRMIN:
5419 case OP_KETRMAX:
5420 {
5421 const uschar *prev = ecode - GET(ecode, 1);
5422 const uschar *saved_eptr = eptrb->saved_eptr;
5423
5424 eptrb = eptrb->prev; /* Back up the stack of bracket start pointers */
5425
5426 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
5427 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
5428 *prev == OP_ONCE)
5429 {
5430 md->end_match_ptr = eptr; /* For ONCE */
5431 md->end_offset_top = offset_top;
5432 return MATCH_MATCH;
5433 }
5434
5435 /* In all other cases except a conditional group we have to check the
5436 group number back at the start and if necessary complete handling an
5437 extraction by setting the offsets and bumping the high water mark. */
5438
5439 if (*prev != OP_COND)
5440 {
5441 int offset;
5442 int number = *prev - OP_BRA;
5443
5444 /* For extended extraction brackets (large number), we have to fish out
5445 the number from a dummy opcode at the start. */
5446
5447 if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);
5448 offset = number << 1;
5449
5450 #ifdef DEBUG
5451 printf("end bracket %d", number);
5452 printf("\n");
5453 #endif
5454
5455 /* Test for a numbered group. This includes groups called as a result
5456 of recursion. Note that whole-pattern recursion is coded as a recurse
5457 into group 0, so it won't be picked up here. Instead, we catch it when
5458 the OP_END is reached. */
5459
5460 if (number > 0)
5461 {
5462 md->capture_last = number;
5463 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
5464 {
5465 md->offset_vector[offset] =
5466 md->offset_vector[md->offset_end - number];
5467 md->offset_vector[offset+1] = eptr - md->start_subject;
5468 if (offset_top <= offset) offset_top = offset + 2;
5469 }
5470
5471 /* Handle a recursively called group. Restore the offsets
5472 appropriately and continue from after the call. */
5473
5474 if (md->recursive != NULL && md->recursive->group_num == number)
5475 {
5476 recursion_info *rec = md->recursive;
5477 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
5478 md->recursive = rec->prev;
5479 md->start_match = rec->save_start;
5480 memcpy(md->offset_vector, rec->offset_save,
5481 rec->saved_max * sizeof(int));
5482 ecode = rec->after_call;
5483 ims = original_ims;
5484 break;
5485 }
5486 }
5487 }
5488
5489 /* Reset the value of the ims flags, in case they got changed during
5490 the group. */
5491
5492 ims = original_ims;
5493 DPRINTF(("ims reset to %02lx\n", ims));
5494
5495 /* For a non-repeating ket, just continue at this level. This also
5496 happens for a repeating ket if no characters were matched in the group.
5497 This is the forcible breaking of infinite loops as implemented in Perl
5498 5.005. If there is an options reset, it will get obeyed in the normal
5499 course of events. */
5500
5501 if (*ecode == OP_KET || eptr == saved_eptr)
5502 {
5503 ecode += 1 + LINK_SIZE;
5504 break;
5505 }
5506
5507 /* The repeating kets try the rest of the pattern or restart from the
5508 preceding bracket, in the appropriate order. */
5509
5510 if (*ecode == OP_KETRMIN)
5511 {
5512 if ((rrc = match(eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb,
5513 0)) != MATCH_NOMATCH) return rrc;
5514 if ((rrc = match(eptr, prev, offset_top, md, ims, eptrb,
5515 match_isgroup)) != MATCH_NOMATCH) return rrc;
5516 }
5517 else /* OP_KETRMAX */
5518 {
5519 if ((rrc = match(eptr, prev, offset_top, md, ims, eptrb,
5520 match_isgroup)) != MATCH_NOMATCH) return rrc;
5521 if ((rrc = match(eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb,
5522 0)) != MATCH_NOMATCH) return rrc;
5523 }
5524 }
5525 return MATCH_NOMATCH;
5526
5527 /* Start of subject unless notbol, or after internal newline if multiline */
5528
5529 case OP_CIRC:
5530 if (md->notbol && eptr == md->start_subject) return MATCH_NOMATCH;
5531 if ((ims & PCRE_MULTILINE) != 0)
5532 {
5533 if (eptr != md->start_subject && eptr[-1] != NEWLINE)
5534 return MATCH_NOMATCH;
5535 ecode++;
5536 break;
5537 }
5538 /* ... else fall through */
5539
5540 /* Start of subject assertion */
5541
5542 case OP_SOD:
5543 if (eptr != md->start_subject) return MATCH_NOMATCH;
5544 ecode++;
5545 break;
5546
5547 /* Start of match assertion */
5548
5549 case OP_SOM:
5550 if (eptr != md->start_subject + md->start_offset) return MATCH_NOMATCH;
5551 ecode++;
5552 break;
5553
5554 /* Assert before internal newline if multiline, or before a terminating
5555 newline unless endonly is set, else end of subject unless noteol is set. */
5556
5557 case OP_DOLL:
5558 if ((ims & PCRE_MULTILINE) != 0)
5559 {
5560 if (eptr < md->end_subject)
5561 { if (*eptr != NEWLINE) return MATCH_NOMATCH; }
5562 else
5563 { if (md->noteol) return MATCH_NOMATCH; }
5564 ecode++;
5565 break;
5566 }
5567 else
5568 {
5569 if (md->noteol) return MATCH_NOMATCH;
5570 if (!md->endonly)
5571 {
5572 if (eptr < md->end_subject - 1 ||
5573 (eptr == md->end_subject - 1 && *eptr != NEWLINE))
5574 return MATCH_NOMATCH;
5575 ecode++;
5576 break;
5577 }
5578 }
5579 /* ... else fall through */
5580
5581 /* End of subject assertion (\z) */
5582
5583 case OP_EOD:
5584 if (eptr < md->end_subject) return MATCH_NOMATCH;
5585 ecode++;
5586 break;
5587
5588 /* End of subject or ending \n assertion (\Z) */
5589
5590 case OP_EODN:
5591 if (eptr < md->end_subject - 1 ||
5592 (eptr == md->end_subject - 1 && *eptr != NEWLINE)) return MATCH_NOMATCH;
5593 ecode++;
5594 break;
5595
5596 /* Word boundary assertions */
5597
5598 case OP_NOT_WORD_BOUNDARY:
5599 case OP_WORD_BOUNDARY:
5600 {
5601 BOOL prev_is_word, cur_is_word;
5602
5603 /* Find out if the previous and current characters are "word" characters.
5604 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
5605 be "non-word" characters. */
5606
5607 #ifdef SUPPORT_UTF8
5608 if (md->utf8)
5609 {
5610 if (eptr == md->start_subject) prev_is_word = FALSE; else
5611 {
5612 const uschar *lastptr = eptr - 1;
5613 while((*lastptr & 0xc0) == 0x80) lastptr--;
5614 GETCHAR(c, lastptr);
5615 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
5616 }
5617 if (eptr >= md->end_subject) cur_is_word = FALSE; else
5618 {
5619 GETCHAR(c, eptr);
5620 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
5621 }
5622 }
5623 else
5624 #endif
5625
5626 /* More streamlined when not in UTF-8 mode */
5627
5628 {
5629 prev_is_word = (eptr != md->start_subject) &&
5630 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
5631 cur_is_word = (eptr < md->end_subject) &&
5632 ((md->ctypes[*eptr] & ctype_word) != 0);
5633 }
5634
5635 /* Now see if the situation is what we want */
5636
5637 if ((*ecode++ == OP_WORD_BOUNDARY)?
5638 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
5639 return MATCH_NOMATCH;
5640 }
5641 break;
5642
5643 /* Match a single character type; inline for speed */
5644
5645 case OP_ANY:
5646 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
5647 return MATCH_NOMATCH;
5648 if (eptr++ >= md->end_subject) return MATCH_NOMATCH;
5649 #ifdef SUPPORT_UTF8
5650 if (md->utf8)
5651 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5652 #endif
5653 ecode++;
5654 break;
5655
5656 /* Match a single byte, even in UTF-8 mode. This opcode really does match
5657 any byte, even newline, independent of the setting of PCRE_DOTALL. */
5658
5659 case OP_ANYBYTE:
5660 if (eptr++ >= md->end_subject) return MATCH_NOMATCH;
5661 ecode++;
5662 break;
5663
5664 case OP_NOT_DIGIT:
5665 if (eptr >= md->end_subject) return MATCH_NOMATCH;
5666 GETCHARINCTEST(c, eptr);
5667 if (
5668 #ifdef SUPPORT_UTF8
5669 c < 256 &&
5670 #endif
5671 (md->ctypes[c] & ctype_digit) != 0
5672 )
5673 return MATCH_NOMATCH;
5674 ecode++;
5675 break;
5676
5677 case OP_DIGIT:
5678 if (eptr >= md->end_subject) return MATCH_NOMATCH;
5679 GETCHARINCTEST(c, eptr);
5680 if (
5681 #ifdef SUPPORT_UTF8
5682 c >= 256 ||
5683 #endif
5684 (md->ctypes[c] & ctype_digit) == 0
5685 )
5686 return MATCH_NOMATCH;
5687 ecode++;
5688 break;
5689
5690 case OP_NOT_WHITESPACE:
5691 if (eptr >= md->end_subject) return MATCH_NOMATCH;
5692 GETCHARINCTEST(c, eptr);
5693 if (
5694 #ifdef SUPPORT_UTF8
5695 c < 256 &&
5696 #endif
5697 (md->ctypes[c] & ctype_space) != 0
5698 )
5699 return MATCH_NOMATCH;
5700 ecode++;
5701 break;
5702
5703 case OP_WHITESPACE:
5704 if (eptr >= md->end_subject) return MATCH_NOMATCH;
5705 GETCHARINCTEST(c, eptr);
5706 if (
5707 #ifdef SUPPORT_UTF8
5708 c >= 256 ||
5709 #endif
5710 (md->ctypes[c] & ctype_space) == 0
5711 )
5712 return MATCH_NOMATCH;
5713 ecode++;
5714 break;
5715
5716 case OP_NOT_WORDCHAR:
5717 if (eptr >= md->end_subject) return MATCH_NOMATCH;
5718 GETCHARINCTEST(c, eptr);
5719 if (
5720 #ifdef SUPPORT_UTF8
5721 c < 256 &&
5722 #endif
5723 (md->ctypes[c] & ctype_word) != 0
5724 )
5725 return MATCH_NOMATCH;
5726 ecode++;
5727 break;
5728
5729 case OP_WORDCHAR:
5730 if (eptr >= md->end_subject) return MATCH_NOMATCH;
5731 GETCHARINCTEST(c, eptr);
5732 if (
5733 #ifdef SUPPORT_UTF8
5734 c >= 256 ||
5735 #endif
5736 (md->ctypes[c] & ctype_word) == 0
5737 )
5738 return MATCH_NOMATCH;
5739 ecode++;
5740 break;
5741
5742 /* Match a back reference, possibly repeatedly. Look past the end of the
5743 item to see if there is repeat information following. The code is similar
5744 to that for character classes, but repeated for efficiency. Then obey
5745 similar code to character type repeats - written out again for speed.
5746 However, if the referenced string is the empty string, always treat
5747 it as matched, any number of times (otherwise there could be infinite
5748 loops). */
5749
5750 case OP_REF:
5751 {
5752 int length;
5753 int offset = GET2(ecode, 1) << 1; /* Doubled ref number */
5754 ecode += 3; /* Advance past item */
5755
5756 /* If the reference is unset, set the length to be longer than the amount
5757 of subject left; this ensures that every attempt at a match fails. We
5758 can't just fail here, because of the possibility of quantifiers with zero
5759 minima. */
5760
5761 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
5762 md->end_subject - eptr + 1 :
5763 md->offset_vector[offset+1] - md->offset_vector[offset];
5764
5765 /* Set up for repetition, or handle the non-repeated case */
5766
5767 switch (*ecode)
5768 {
5769 case OP_CRSTAR:
5770 case OP_CRMINSTAR:
5771 case OP_CRPLUS:
5772 case OP_CRMINPLUS:
5773 case OP_CRQUERY:
5774 case OP_CRMINQUERY:
5775 c = *ecode++ - OP_CRSTAR;
5776 minimize = (c & 1) != 0;
5777 min = rep_min[c]; /* Pick up values from tables; */
5778 max = rep_max[c]; /* zero for max => infinity */
5779 if (max == 0) max = INT_MAX;
5780 break;
5781
5782 case OP_CRRANGE:
5783 case OP_CRMINRANGE:
5784 minimize = (*ecode == OP_CRMINRANGE);
5785 min = GET2(ecode, 1);
5786 max = GET2(ecode, 3);
5787 if (max == 0) max = INT_MAX;
5788 ecode += 5;
5789 break;
5790
5791 default: /* No repeat follows */
5792 if (!match_ref(offset, eptr, length, md, ims)) return MATCH_NOMATCH;
5793 eptr += length;
5794 continue; /* With the main loop */
5795 }
5796
5797 /* If the length of the reference is zero, just continue with the
5798 main loop. */
5799
5800 if (length == 0) continue;
5801
5802 /* First, ensure the minimum number of matches are present. We get back
5803 the length of the reference string explicitly rather than passing the
5804 address of eptr, so that eptr can be a register variable. */
5805
5806 for (i = 1; i <= min; i++)
5807 {
5808 if (!match_ref(offset, eptr, length, md, ims)) return MATCH_NOMATCH;
5809 eptr += length;
5810 }
5811
5812 /* If min = max, continue at the same level without recursion.
5813 They are not both allowed to be zero. */
5814
5815 if (min == max) continue;
5816
5817 /* If minimizing, keep trying and advancing the pointer */
5818
5819 if (minimize)
5820 {
5821 for (i = min;; i++)
5822 {
5823 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
5824 MATCH_NOMATCH) return rrc;
5825 if (i >= max || !match_ref(offset, eptr, length, md, ims))
5826 return MATCH_NOMATCH;
5827 eptr += length;
5828 }
5829 /* Control never gets here */
5830 }
5831
5832 /* If maximizing, find the longest string and work backwards */
5833
5834 else
5835 {
5836 const uschar *pp = eptr;
5837 for (i = min; i < max; i++)
5838 {
5839 if (!match_ref(offset, eptr, length, md, ims)) break;
5840 eptr += length;
5841 }
5842 while (eptr >= pp)
5843 {
5844 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
5845 MATCH_NOMATCH) return rrc;
5846 eptr -= length;
5847 }
5848 return MATCH_NOMATCH;
5849 }
5850 }
5851 /* Control never gets here */
5852
5853
5854
5855 /* Match a bit-mapped character class, possibly repeatedly. This op code is
5856 used when all the characters in the class have values in the range 0-255.
5857 The only difference between OP_CLASS and OP_NCLASS occurs when a data
5858 character outside the range is encountered.
5859
5860 First, look past the end of the item to see if there is repeat information
5861 following. Then obey similar code to character type repeats - written out
5862 again for speed. */
5863
5864 case OP_NCLASS:
5865 case OP_CLASS:
5866 {
5867 const uschar *data = ecode + 1; /* Save for matching */
5868 ecode += 33; /* Advance past the item */
5869
5870 switch (*ecode)
5871 {
5872 case OP_CRSTAR:
5873 case OP_CRMINSTAR:
5874 case OP_CRPLUS:
5875 case OP_CRMINPLUS:
5876 case OP_CRQUERY:
5877 case OP_CRMINQUERY:
5878 c = *ecode++ - OP_CRSTAR;
5879 minimize = (c & 1) != 0;
5880 min = rep_min[c]; /* Pick up values from tables; */
5881 max = rep_max[c]; /* zero for max => infinity */
5882 if (max == 0) max = INT_MAX;
5883 break;
5884
5885 case OP_CRRANGE:
5886 case OP_CRMINRANGE:
5887 minimize = (*ecode == OP_CRMINRANGE);
5888 min = GET2(ecode, 1);
5889 max = GET2(ecode, 3);
5890 if (max == 0) max = INT_MAX;
5891 ecode += 5;
5892 break;
5893
5894 default: /* No repeat follows */
5895 min = max = 1;
5896 break;
5897 }
5898
5899 /* First, ensure the minimum number of matches are present. */
5900
5901 #ifdef SUPPORT_UTF8
5902 /* UTF-8 mode */
5903 if (md->utf8)
5904 {
5905 for (i = 1; i <= min; i++)
5906 {
5907 if (eptr >= md->end_subject) return MATCH_NOMATCH;
5908 GETCHARINC(c, eptr);
5909 if (c > 255)
5910 {
5911 if (op == OP_CLASS) return MATCH_NOMATCH;
5912 }
5913 else
5914 {
5915 if ((data[c/8] & (1 << (c&7))) == 0) return MATCH_NOMATCH;
5916 }
5917 }
5918 }
5919 else
5920 #endif
5921 /* Not UTF-8 mode */
5922 {
5923 for (i = 1; i <= min; i++)
5924 {
5925 if (eptr >= md->end_subject) return MATCH_NOMATCH;
5926 c = *eptr++;
5927 if ((data[c/8] & (1 << (c&7))) == 0) return MATCH_NOMATCH;
5928 }
5929 }
5930
5931 /* If max == min we can continue with the main loop without the
5932 need to recurse. */
5933
5934 if (min == max) continue;
5935
5936 /* If minimizing, keep testing the rest of the expression and advancing
5937 the pointer while it matches the class. */
5938
5939 if (minimize)
5940 {
5941 #ifdef SUPPORT_UTF8
5942 /* UTF-8 mode */
5943 if (md->utf8)
5944 {
5945 for (i = min;; i++)
5946 {
5947 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
5948 MATCH_NOMATCH) return rrc;
5949 if (i >= max || eptr >= md->end_subject) return MATCH_NOMATCH;
5950 GETCHARINC(c, eptr);
5951 if (c > 255)
5952 {
5953 if (op == OP_CLASS) return MATCH_NOMATCH;
5954 }
5955 else
5956 {
5957 if ((data[c/8] & (1 << (c&7))) == 0) return MATCH_NOMATCH;
5958 }
5959 }
5960 }
5961 else
5962 #endif
5963 /* Not UTF-8 mode */
5964 {
5965 for (i = min;; i++)
5966 {
5967 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
5968 MATCH_NOMATCH) return rrc;
5969 if (i >= max || eptr >= md->end_subject) return MATCH_NOMATCH;
5970 c = *eptr++;
5971 if ((data[c/8] & (1 << (c&7))) == 0) return MATCH_NOMATCH;
5972 }
5973 }
5974 /* Control never gets here */
5975 }
5976
5977 /* If maximizing, find the longest possible run, then work backwards. */
5978
5979 else
5980 {
5981 const uschar *pp = eptr;
5982
5983 #ifdef SUPPORT_UTF8
5984 /* UTF-8 mode */
5985 if (md->utf8)
5986 {
5987 for (i = min; i < max; i++)
5988 {
5989 int len = 1;
5990 if (eptr >= md->end_subject) break;
5991 GETCHARLEN(c, eptr, len);
5992 if (c > 255)
5993 {
5994 if (op == OP_CLASS) break;
5995 }
5996 else
5997 {
5998 if ((data[c/8] & (1 << (c&7))) == 0) break;
5999 }
6000 eptr += len;
6001 }
6002 while (eptr >= pp)
6003 {
6004 if ((rrc = match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) !=
6005 MATCH_NOMATCH) return rrc;
6006 BACKCHAR(eptr)
6007 }
6008 }
6009 else
6010 #endif
6011 /* Not UTF-8 mode */
6012 {
6013 for (i = min; i < max; i++)
6014 {
6015 if (eptr >= md->end_subject) break;
6016 c = *eptr;
6017 if ((data[c/8] & (1 << (c&7))) == 0) break;
6018 eptr++;
6019 }
6020 while (eptr >= pp)
6021 {
6022 if ((rrc = match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) !=
6023 MATCH_NOMATCH) return rrc;
6024 }
6025 }
6026
6027 return MATCH_NOMATCH;
6028 }
6029 }
6030 /* Control never gets here */
6031
6032
6033 /* Match an extended character class. This opcode is encountered only
6034 in UTF-8 mode, because that's the only time it is compiled. */
6035
6036 #ifdef SUPPORT_UTF8
6037 case OP_XCLASS:
6038 {
6039 const uschar *data = ecode + 1 + LINK_SIZE; /* Save for matching */
6040 ecode += GET(ecode, 1); /* Advance past the item */
6041
6042 switch (*ecode)
6043 {
6044 case OP_CRSTAR:
6045 case OP_CRMINSTAR:
6046 case OP_CRPLUS:
6047 case OP_CRMINPLUS:
6048 case OP_CRQUERY:
6049 case OP_CRMINQUERY:
6050 c = *ecode++ - OP_CRSTAR;
6051 minimize = (c & 1) != 0;
6052 min = rep_min[c]; /* Pick up values from tables; */
6053 max = rep_max[c]; /* zero for max => infinity */
6054 if (max == 0) max = INT_MAX;
6055 break;
6056
6057 case OP_CRRANGE:
6058 case OP_CRMINRANGE:
6059 minimize = (*ecode == OP_CRMINRANGE);
6060 min = GET2(ecode, 1);
6061 max = GET2(ecode, 3);
6062 if (max == 0) max = INT_MAX;
6063 ecode += 5;
6064 break;
6065
6066 default: /* No repeat follows */
6067 min = max = 1;
6068 break;
6069 }
6070
6071 /* First, ensure the minimum number of matches are present. */
6072
6073 for (i = 1; i <= min; i++)
6074 {
6075 if (eptr >= md->end_subject) return MATCH_NOMATCH;
6076 GETCHARINC(c, eptr);
6077 if (!match_xclass(c, data)) return MATCH_NOMATCH;
6078 }
6079
6080 /* If max == min we can continue with the main loop without the
6081 need to recurse. */
6082
6083 if (min == max) continue;
6084
6085 /* If minimizing, keep testing the rest of the expression and advancing
6086 the pointer while it matches the class. */
6087
6088 if (minimize)
6089 {
6090 for (i = min;; i++)
6091 {
6092 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6093 MATCH_NOMATCH) return rrc;
6094 if (i >= max || eptr >= md->end_subject) return MATCH_NOMATCH;
6095 GETCHARINC(c, eptr);
6096 if (!match_xclass(c, data)) return MATCH_NOMATCH;
6097 }
6098 /* Control never gets here */
6099 }
6100
6101 /* If maximizing, find the longest possible run, then work backwards. */
6102
6103 else
6104 {
6105 const uschar *pp = eptr;
6106 for (i = min; i < max; i++)
6107 {
6108 int len = 1;
6109 if (eptr >= md->end_subject) break;
6110 GETCHARLEN(c, eptr, len);
6111 if (!match_xclass(c, data)) break;
6112 eptr += len;
6113 }
6114 while (eptr >= pp)
6115 {
6116 if ((rrc = match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) !=
6117 MATCH_NOMATCH) return rrc;
6118 BACKCHAR(eptr)
6119 }
6120 return MATCH_NOMATCH;
6121 }
6122
6123 /* Control never gets here */
6124 }
6125 #endif /* End of XCLASS */
6126
6127 /* Match a run of characters */
6128
6129 case OP_CHARS:
6130 {
6131 register int length = ecode[1];
6132 ecode += 2;
6133
6134 #ifdef DEBUG /* Sigh. Some compilers never learn. */
6135 if (eptr >= md->end_subject)
6136 printf("matching subject <null> against pattern ");
6137 else
6138 {
6139 printf("matching subject ");
6140 pchars(eptr, length, TRUE, md);
6141 printf(" against pattern ");
6142 }
6143 pchars(ecode, length, FALSE, md);
6144 printf("\n");
6145 #endif
6146
6147 if (length > md->end_subject - eptr) return MATCH_NOMATCH;
6148 if ((ims & PCRE_CASELESS) != 0)
6149 {
6150 while (length-- > 0)
6151 if (md->lcc[*ecode++] != md->lcc[*eptr++])
6152 return MATCH_NOMATCH;
6153 }
6154 else
6155 {
6156 while (length-- > 0) if (*ecode++ != *eptr++) return MATCH_NOMATCH;
6157 }
6158 }
6159 break;
6160
6161 /* Match a single character repeatedly; different opcodes share code. */
6162
6163 case OP_EXACT:
6164 min = max = GET2(ecode, 1);
6165 ecode += 3;
6166 goto REPEATCHAR;
6167
6168 case OP_UPTO:
6169 case OP_MINUPTO:
6170 min = 0;
6171 max = GET2(ecode, 1);
6172 minimize = *ecode == OP_MINUPTO;
6173 ecode += 3;
6174 goto REPEATCHAR;
6175
6176 case OP_STAR:
6177 case OP_MINSTAR:
6178 case OP_PLUS:
6179 case OP_MINPLUS:
6180 case OP_QUERY:
6181 case OP_MINQUERY:
6182 c = *ecode++ - OP_STAR;
6183 minimize = (c & 1) != 0;
6184 min = rep_min[c]; /* Pick up values from tables; */
6185 max = rep_max[c]; /* zero for max => infinity */
6186 if (max == 0) max = INT_MAX;
6187
6188 /* Common code for all repeated single-character matches. We can give
6189 up quickly if there are fewer than the minimum number of characters left in
6190 the subject. */
6191
6192 REPEATCHAR:
6193 #ifdef SUPPORT_UTF8
6194 if (md->utf8)
6195 {
6196 int len = 1;
6197 const uschar *charptr = ecode;
6198 GETCHARLEN(c, ecode, len);
6199 if (min * len > md->end_subject - eptr) return MATCH_NOMATCH;
6200 ecode += len;
6201
6202 /* Handle multibyte character matching specially here. There is no
6203 support for any kind of casing for multibyte characters. */
6204
6205 if (len > 1)
6206 {
6207 for (i = 1; i <= min; i++)
6208 {
6209 if (memcmp(eptr, charptr, len) != 0) return MATCH_NOMATCH;
6210 eptr += len;
6211 }
6212
6213 if (min == max) continue;
6214
6215 if (minimize)
6216 {
6217 for (i = min;; i++)
6218 {
6219 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6220 MATCH_NOMATCH) return rrc;
6221 if (i >= max ||
6222 eptr >= md->end_subject ||
6223 memcmp(eptr, charptr, len) != 0)
6224 return MATCH_NOMATCH;
6225 eptr += len;
6226 }
6227 /* Control never gets here */
6228 }
6229 else
6230 {
6231 const uschar *pp = eptr;
6232 for (i = min; i < max; i++)
6233 {
6234 if (eptr > md->end_subject - len ||
6235 memcmp(eptr, charptr, len) != 0)
6236 break;
6237 eptr += len;
6238 }
6239 while (eptr >= pp)
6240 {
6241 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6242 MATCH_NOMATCH) return rrc;
6243 eptr -= len;
6244 }
6245 return MATCH_NOMATCH;
6246 }
6247 /* Control never gets here */
6248 }
6249
6250 /* If the length of a UTF-8 character is 1, we fall through here, and
6251 obey the code as for non-UTF-8 characters below, though in this case the
6252 value of c will always be < 128. */
6253 }
6254 else
6255 #endif
6256
6257 /* When not in UTF-8 mode, load a single-byte character. */
6258 {
6259 if (min > md->end_subject - eptr) return MATCH_NOMATCH;
6260 c = *ecode++;
6261 }
6262
6263 /* The value of c at this point is always less than 256, though we may or
6264 may not be in UTF-8 mode. The code is duplicated for the caseless and
6265 caseful cases, for speed, since matching characters is likely to be quite
6266 common. First, ensure the minimum number of matches are present. If min =
6267 max, continue at the same level without recursing. Otherwise, if
6268 minimizing, keep trying the rest of the expression and advancing one
6269 matching character if failing, up to the maximum. Alternatively, if
6270 maximizing, find the maximum number of characters and work backwards. */
6271
6272 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
6273 max, eptr));
6274
6275 if ((ims & PCRE_CASELESS) != 0)
6276 {
6277 c = md->lcc[c];
6278 for (i = 1; i <= min; i++)
6279 if (c != md->lcc[*eptr++]) return MATCH_NOMATCH;
6280 if (min == max) continue;
6281 if (minimize)
6282 {
6283 for (i = min;; i++)
6284 {
6285 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6286 MATCH_NOMATCH) return rrc;
6287 if (i >= max || eptr >= md->end_subject ||
6288 c != md->lcc[*eptr++])
6289 return MATCH_NOMATCH;
6290 }
6291 /* Control never gets here */
6292 }
6293 else
6294 {
6295 const uschar *pp = eptr;
6296 for (i = min; i < max; i++)
6297 {
6298 if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
6299 eptr++;
6300 }
6301 while (eptr >= pp)
6302 if ((rrc = match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) !=
6303 MATCH_NOMATCH) return rrc;
6304 return MATCH_NOMATCH;
6305 }
6306 /* Control never gets here */
6307 }
6308
6309 /* Caseful comparisons (includes all multi-byte characters) */
6310
6311 else
6312 {
6313 for (i = 1; i <= min; i++) if (c != *eptr++) return MATCH_NOMATCH;
6314 if (min == max) continue;
6315 if (minimize)
6316 {
6317 for (i = min;; i++)
6318 {
6319 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6320 MATCH_NOMATCH) return rrc;
6321 if (i >= max || eptr >= md->end_subject || c != *eptr++)
6322 return MATCH_NOMATCH;
6323 }
6324 /* Control never gets here */
6325 }
6326 else
6327 {
6328 const uschar *pp = eptr;
6329 for (i = min; i < max; i++)
6330 {
6331 if (eptr >= md->end_subject || c != *eptr) break;
6332 eptr++;
6333 }
6334 while (eptr >= pp)
6335 if ((rrc = match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) !=
6336 MATCH_NOMATCH) return rrc;
6337 return MATCH_NOMATCH;
6338 }
6339 }
6340 /* Control never gets here */
6341
6342 /* Match a negated single one-byte character. The character we are
6343 checking can be multibyte. */
6344
6345 case OP_NOT:
6346 if (eptr >= md->end_subject) return MATCH_NOMATCH;
6347 ecode++;
6348 GETCHARINCTEST(c, eptr);
6349 if ((ims & PCRE_CASELESS) != 0)
6350 {
6351 #ifdef SUPPORT_UTF8
6352 if (c < 256)
6353 #endif
6354 c = md->lcc[c];
6355 if (md->lcc[*ecode++] == c) return MATCH_NOMATCH;
6356 }
6357 else
6358 {
6359 if (*ecode++ == c) return MATCH_NOMATCH;
6360 }
6361 break;
6362
6363 /* Match a negated single one-byte character repeatedly. This is almost a
6364 repeat of the code for a repeated single character, but I haven't found a
6365 nice way of commoning these up that doesn't require a test of the
6366 positive/negative option for each character match. Maybe that wouldn't add
6367 very much to the time taken, but character matching *is* what this is all
6368 about... */
6369
6370 case OP_NOTEXACT:
6371 min = max = GET2(ecode, 1);
6372 ecode += 3;
6373 goto REPEATNOTCHAR;
6374
6375 case OP_NOTUPTO:
6376 case OP_NOTMINUPTO:
6377 min = 0;
6378 max = GET2(ecode, 1);
6379 minimize = *ecode == OP_NOTMINUPTO;
6380 ecode += 3;
6381 goto REPEATNOTCHAR;
6382
6383 case OP_NOTSTAR:
6384 case OP_NOTMINSTAR:
6385 case OP_NOTPLUS:
6386 case OP_NOTMINPLUS:
6387 case OP_NOTQUERY:
6388 case OP_NOTMINQUERY:
6389 c = *ecode++ - OP_NOTSTAR;
6390 minimize = (c & 1) != 0;
6391 min = rep_min[c]; /* Pick up values from tables; */
6392 max = rep_max[c]; /* zero for max => infinity */
6393 if (max == 0) max = INT_MAX;
6394
6395 /* Common code for all repeated single-character (less than 255) matches.
6396 We can give up quickly if there are fewer than the minimum number of
6397 characters left in the subject. */
6398
6399 REPEATNOTCHAR:
6400 if (min > md->end_subject - eptr) return MATCH_NOMATCH;
6401 c = *ecode++;
6402
6403 /* The code is duplicated for the caseless and caseful cases, for speed,
6404 since matching characters is likely to be quite common. First, ensure the
6405 minimum number of matches are present. If min = max, continue at the same
6406 level without recursing. Otherwise, if minimizing, keep trying the rest of
6407 the expression and advancing one matching character if failing, up to the
6408 maximum. Alternatively, if maximizing, find the maximum number of
6409 characters and work backwards. */
6410
6411 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
6412 max, eptr));
6413
6414 if ((ims & PCRE_CASELESS) != 0)
6415 {
6416 c = md->lcc[c];
6417
6418 #ifdef SUPPORT_UTF8
6419 /* UTF-8 mode */
6420 if (md->utf8)
6421 {
6422 register int d;
6423 for (i = 1; i <= min; i++)
6424 {
6425 GETCHARINC(d, eptr);
6426 if (d < 256) d = md->lcc[d];
6427 if (c == d) return MATCH_NOMATCH;
6428 }
6429 }
6430 else
6431 #endif
6432
6433 /* Not UTF-8 mode */
6434 {
6435 for (i = 1; i <= min; i++)
6436 if (c == md->lcc[*eptr++]) return MATCH_NOMATCH;
6437 }
6438
6439 if (min == max) continue;
6440
6441 if (minimize)
6442 {
6443 #ifdef SUPPORT_UTF8
6444 /* UTF-8 mode */
6445 if (md->utf8)
6446 {
6447 register int d;
6448 for (i = min;; i++)
6449 {
6450 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6451 MATCH_NOMATCH) return rrc;
6452 GETCHARINC(d, eptr);
6453 if (d < 256) d = md->lcc[d];
6454 if (i >= max || eptr >= md->end_subject || c == d)
6455 return MATCH_NOMATCH;
6456 }
6457 }
6458 else
6459 #endif
6460 /* Not UTF-8 mode */
6461 {
6462 for (i = min;; i++)
6463 {
6464 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6465 MATCH_NOMATCH) return rrc;
6466 if (i >= max || eptr >= md->end_subject || c == md->lcc[*eptr++])
6467 return MATCH_NOMATCH;
6468 }
6469 }
6470 /* Control never gets here */
6471 }
6472
6473 /* Maximize case */
6474
6475 else
6476 {
6477 const uschar *pp = eptr;
6478
6479 #ifdef SUPPORT_UTF8
6480 /* UTF-8 mode */
6481 if (md->utf8)
6482 {
6483 register int d;
6484 for (i = min; i < max; i++)
6485 {
6486 int len = 1;
6487 if (eptr >= md->end_subject) break;
6488 GETCHARLEN(d, eptr, len);
6489 if (d < 256) d = md->lcc[d];
6490 if (c == d) break;
6491 eptr += len;
6492 }
6493 while (eptr >= pp)
6494 {
6495 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6496 MATCH_NOMATCH) return rrc;
6497 eptr--;
6498 BACKCHAR(eptr);
6499 }
6500 }
6501 else
6502 #endif
6503 /* Not UTF-8 mode */
6504 {
6505 for (i = min; i < max; i++)
6506 {
6507 if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
6508 eptr++;
6509 }
6510 while (eptr >= pp)
6511 {
6512 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6513 MATCH_NOMATCH) return rrc;
6514 eptr--;
6515 }
6516 }
6517
6518 return MATCH_NOMATCH;
6519 }
6520 /* Control never gets here */
6521 }
6522
6523 /* Caseful comparisons */
6524
6525 else
6526 {
6527 #ifdef SUPPORT_UTF8
6528 /* UTF-8 mode */
6529 if (md->utf8)
6530 {
6531 register int d;
6532 for (i = 1; i <= min; i++)
6533 {
6534 GETCHARINC(d, eptr);
6535 if (c == d) return MATCH_NOMATCH;
6536 }
6537 }
6538 else
6539 #endif
6540 /* Not UTF-8 mode */
6541 {
6542 for (i = 1; i <= min; i++)
6543 if (c == *eptr++) return MATCH_NOMATCH;
6544 }
6545
6546 if (min == max) continue;
6547
6548 if (minimize)
6549 {
6550 #ifdef SUPPORT_UTF8
6551 /* UTF-8 mode */
6552 if (md->utf8)
6553 {
6554 register int d;
6555 for (i = min;; i++)
6556 {
6557 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6558 MATCH_NOMATCH) return rrc;
6559 GETCHARINC(d, eptr);
6560 if (i >= max || eptr >= md->end_subject || c == d)
6561 return MATCH_NOMATCH;
6562 }
6563 }
6564 else
6565 #endif
6566 /* Not UTF-8 mode */
6567 {
6568 for (i = min;; i++)
6569 {
6570 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6571 MATCH_NOMATCH) return rrc;
6572 if (i >= max || eptr >= md->end_subject || c == *eptr++)
6573 return MATCH_NOMATCH;
6574 }
6575 }
6576 /* Control never gets here */
6577 }
6578
6579 /* Maximize case */
6580
6581 else
6582 {
6583 const uschar *pp = eptr;
6584
6585 #ifdef SUPPORT_UTF8
6586 /* UTF-8 mode */
6587 if (md->utf8)
6588 {
6589 register int d;
6590 for (i = min; i < max; i++)
6591 {
6592 int len = 1;
6593 if (eptr >= md->end_subject) break;
6594 GETCHARLEN(d, eptr, len);
6595 if (c == d) break;
6596 eptr += len;
6597 }
6598 while (eptr >= pp)
6599 {
6600 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6601 MATCH_NOMATCH) return rrc;
6602 eptr--;
6603 BACKCHAR(eptr);
6604 }
6605 }
6606 else
6607 #endif
6608 /* Not UTF-8 mode */
6609 {
6610 for (i = min; i < max; i++)
6611 {
6612 if (eptr >= md->end_subject || c == *eptr) break;
6613 eptr++;
6614 }
6615 while (eptr >= pp)
6616 {
6617 if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6618 MATCH_NOMATCH) return rrc;
6619 eptr--;
6620 }
6621 }
6622
6623 return MATCH_NOMATCH;
6624 }
6625 }
6626 /* Control never gets here */
6627
6628 /* Match a single character type repeatedly; several different opcodes
6629 share code. This is very similar to the code for single characters, but we
6630 repeat it in the interests of efficiency. */
6631
6632 case OP_TYPEEXACT:
6633 min = max = GET2(ecode, 1);
6634 minimize = TRUE;
6635 ecode += 3;
6636 goto REPEATTYPE;
6637
6638 case OP_TYPEUPTO:
6639 case OP_TYPEMINUPTO:
6640 min = 0;
6641 max = GET2(ecode, 1);
6642 minimize = *ecode == OP_TYPEMINUPTO;
6643 ecode += 3;
6644 goto REPEATTYPE;
6645
6646 case OP_TYPESTAR:
6647 case OP_TYPEMINSTAR:
6648 case OP_TYPEPLUS:
6649 case OP_TYPEMINPLUS:
6650 case OP_TYPEQUERY:
6651 case OP_TYPEMINQUERY:
6652 c = *ecode++ - OP_TYPESTAR;
6653 minimize = (c & 1) != 0;
6654 min = rep_min[c]; /* Pick up values from tables; */
6655 max = rep_max[c]; /* zero for max => infinity */
6656 if (max == 0) max = INT_MAX;
6657
6658 /* Common code for all repeated single character type matches. Note that
6659 in UTF-8 mode, '.' matches a character of any length, but for the other
6660 character types, the valid characters are all one-byte long. */
6661
6662 REPEATTYPE:
6663 ctype = *ecode++; /* Code for the character type */
6664
6665 /* First, ensure the minimum number of matches are present. Use inline
6666 code for maximizing the speed, and do the type test once at the start
6667 (i.e. keep it out of the loop). Also we can test that there are at least
6668 the minimum number of bytes before we start. This isn't as effective in
6669 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
6670 is tidier. */
6671
6672 if (min > md->end_subject - eptr) return MATCH_NOMATCH;
6673 if (min > 0)
6674 {
6675 #ifdef SUPPORT_UTF8
6676 if (md->utf8) switch(ctype)
6677 {
6678 case OP_ANY:
6679 for (i = 1; i <= min; i++)
6680 {
6681 if (eptr >= md->end_subject ||
6682 (*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0))
6683 return MATCH_NOMATCH;
6684 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
6685 }
6686 break;
6687
6688 case OP_ANYBYTE:
6689 eptr += min;
6690 break;
6691
6692 case OP_NOT_DIGIT:
6693 for (i = 1; i <= min; i++)
6694 {
6695 if (eptr >= md->end_subject) return MATCH_NOMATCH;
6696 GETCHARINC(c, eptr);
6697 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
6698 return MATCH_NOMATCH;
6699 }
6700 break;
6701
6702 case OP_DIGIT:
6703 for (i = 1; i <= min; i++)
6704 {
6705 if (eptr >= md->end_subject ||
6706 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
6707 return MATCH_NOMATCH;
6708 /* No need to skip more bytes - we know it's a 1-byte character */
6709 }
6710 break;
6711
6712 case OP_NOT_WHITESPACE:
6713 for (i = 1; i <= min; i++)
6714 {
6715 if (eptr >= md->end_subject ||
6716 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
6717 return MATCH_NOMATCH;
6718 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
6719 }
6720 break;
6721
6722 case OP_WHITESPACE:
6723 for (i = 1; i <= min; i++)
6724 {
6725 if (eptr >= md->end_subject ||
6726 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
6727 return MATCH_NOMATCH;
6728 /* No need to skip more bytes - we know it's a 1-byte character */
6729 }
6730 break;
6731
6732 case OP_NOT_WORDCHAR:
6733 for (i = 1; i <= min; i++)
6734 {
6735 if (eptr >= md->end_subject ||
6736 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
6737 return MATCH_NOMATCH;
6738 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
6739 }
6740 break;
6741
6742 case OP_WORDCHAR:
6743 for (i = 1; i <= min; i++)
6744 {
6745 if (eptr >= md->end_subject ||
6746 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
6747 return MATCH_NOMATCH;