/[pcre]/code/tags/pcre-3.5/pcre.c
ViewVC logotype

Contents of /code/tags/pcre-3.5/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 54 - (show annotations) (download)
Sat Feb 24 21:39:44 2007 UTC (7 years, 7 months ago) by nigel
File MIME type: text/plain
File size: 153401 byte(s)
Tag code/trunk as code/tags/pcre-3.5.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /*
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
9
10 Written by: Philip Hazel <ph10@cam.ac.uk>
11
12 Copyright (c) 1997-2001 University of Cambridge
13
14 -----------------------------------------------------------------------------
15 Permission is granted to anyone to use this software for any purpose on any
16 computer system, and to redistribute it freely, subject to the following
17 restrictions:
18
19 1. This software is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22
23 2. The origin of this software must not be misrepresented, either by
24 explicit claim or by omission.
25
26 3. Altered versions must be plainly marked as such, and must not be
27 misrepresented as being the original software.
28
29 4. If PCRE is embedded in any software that is released under the GNU
30 General Purpose Licence (GPL), then the terms of that licence shall
31 supersede any condition above with which it is incompatible.
32 -----------------------------------------------------------------------------
33 */
34
35
36 /* Define DEBUG to get debugging output on stdout. */
37
38 /* #define DEBUG */
39
40 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
41 inline, and there are *still* stupid compilers about that don't like indented
42 pre-processor statements. I suppose it's only been 10 years... */
43
44 #ifdef DEBUG
45 #define DPRINTF(p) printf p
46 #else
47 #define DPRINTF(p) /*nothing*/
48 #endif
49
50 /* Include the internals header, which itself includes Standard C headers plus
51 the external pcre header. */
52
53 #include "internal.h"
54
55
56 /* Allow compilation as C++ source code, should anybody want to do that. */
57
58 #ifdef __cplusplus
59 #define class pcre_class
60 #endif
61
62
63 /* Maximum number of items on the nested bracket stacks at compile time. This
64 applies to the nesting of all kinds of parentheses. It does not limit
65 un-nested, non-capturing parentheses. This number can be made bigger if
66 necessary - it is used to dimension one int and one unsigned char vector at
67 compile time. */
68
69 #define BRASTACK_SIZE 200
70
71
72 /* The number of bytes in a literal character string above which we can't add
73 any more is different when UTF-8 characters may be encountered. */
74
75 #ifdef SUPPORT_UTF8
76 #define MAXLIT 250
77 #else
78 #define MAXLIT 255
79 #endif
80
81
82 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
83
/* The six entries of each table line up with the quantifier order
*, *?, +, +?, ?, ?? (the same order in which the repeat names appear in the
debugging table OP_names). NOTE(review): the exact opcode used as the index
base is defined outside this file section - confirm in internal.h. */
84 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
85 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
86
87 /* Text forms of OP_ values and things, for debugging (not all used) */
88
89 #ifdef DEBUG
/* Printable names for the opcodes, compiled in only when DEBUG is defined.
NOTE(review): presumably indexed by opcode value, so the order has to track
the OP_ definitions in internal.h - confirm before adding or moving entries. */
90 static const char *OP_names[] = {
91 "End", "\\A", "\\B", "\\b", "\\D", "\\d",
92 "\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",
93 "Opt", "^", "$", "Any", "chars", "not",
94 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
95 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
96 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
97 "*", "*?", "+", "+?", "?", "??", "{", "{",
98 "class", "Ref", "Recurse",
99 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
100 "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
101 "Brazero", "Braminzero", "Branumber", "Bra"
102 };
103 #endif
104
105 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
106 are simple data values; negative values are for special things like \d and so
107 on. Zero means further processing is needed (for things like \x), or the escape
108 is invalid. */
109
/* check_escape() indexes this table with (c - '0'), and only after it has
checked that c lies in the range '0' to 'z'. Each row below covers eight
consecutive characters, as labelled on the right. The entry for 'a' is 7,
the alarm (BEL) character. */
110 static const short int escapes[] = {
111 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
112 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
113 '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
114 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
115 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
116 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
117 '`', 7, -ESC_b, 0, -ESC_d, ESC_E, ESC_F, 0, /* ` - g */
118 0, 0, 0, 0, 0, 0, ESC_N, 0, /* h - o */
119 0, 0, ESC_R, -ESC_s, ESC_T, 0, 0, -ESC_w, /* p - w */
120 0, 0, -ESC_z /* x - z */
121 };
122
123 /* Tables of names of POSIX character classes and their lengths. The list is
124 terminated by a zero length entry. The first three must be alpha, lower, upper,
125 as this is assumed for handling case independence. */
126
127 static const char *posix_names[] = {
128 "alpha", "lower", "upper",
129 "alnum", "ascii", "cntrl", "digit", "graph",
130 "print", "punct", "space", "word", "xdigit" };
131
/* One length per name above, in the same order. Only this table carries the
terminating zero entry; posix_names itself has no end marker. */
132 static const uschar posix_name_lengths[] = {
133 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
134
135 /* Table of class bit maps for each POSIX class; up to three may be combined
136 to form the class. */
137
/* Three entries per class, with the rows in the same order as posix_names.
NOTE(review): -1 appears to mark an unused slot; the code that consumes this
table is outside the visible part of the file - confirm there. */
138 static const int posix_class_maps[] = {
139 cbit_lower, cbit_upper, -1, /* alpha */
140 cbit_lower, -1, -1, /* lower */
141 cbit_upper, -1, -1, /* upper */
142 cbit_digit, cbit_lower, cbit_upper, /* alnum */
143 cbit_print, cbit_cntrl, -1, /* ascii */
144 cbit_cntrl, -1, -1, /* cntrl */
145 cbit_digit, -1, -1, /* digit */
146 cbit_graph, -1, -1, /* graph */
147 cbit_print, -1, -1, /* print */
148 cbit_punct, -1, -1, /* punct */
149 cbit_space, -1, -1, /* space */
150 cbit_word, -1, -1, /* word */
151 cbit_xdigit,-1, -1 /* xdigit */
152 };
153
154
155 /* Forward declaration to allow mutual recursion: compile_regex() is
referred to before the point in this file where it is defined. */
156
157 static BOOL
158 compile_regex(int, int, int *, uschar **, const uschar **, const char **,
159 BOOL, int, int *, int *, compile_data *);
160
161 /* Structure for building a chain of data that actually lives on the
162 stack, for holding the values of the subject pointer at the start of each
163 subpattern, so as to detect when an empty string has been matched by a
164 subpattern - to break infinite loops. */
165
166 typedef struct eptrblock {
167 struct eptrblock *prev; /* link to the previous block in the chain */
168 const uschar *saved_eptr; /* subject pointer saved at the start of a subpattern */
169 } eptrblock;
170
171 /* Flag bits for the match() function */
172
173 #define match_condassert 0x01 /* Called to check a condition assertion */
174 #define match_isgroup 0x02 /* Set if start of bracketed group */
175
176
177
178 /*************************************************
179 * Global variables *
180 *************************************************/
181
182 /* PCRE is thread-clean and doesn't use any global variables in the normal
183 sense. However, it calls memory allocation and free functions via the two
184 indirections below, which can be changed by the caller, but are shared
185 between all threads. They default to the standard malloc() and free(). */
186
187 void *(*pcre_malloc)(size_t) = malloc;
188 void (*pcre_free)(void *) = free;
189
190
191
192 /*************************************************
193 * Macros and tables for character handling *
194 *************************************************/
195
196 /* When UTF-8 encoding is being used, a character is no longer just a single
197 byte. The macros for character handling generate simple sequences when used in
198 byte-mode, and more complicated ones for UTF-8 characters. */
199
/* Character fetching macros. Each expands to one or more complete statements
(there is no do/while wrapper), and the UTF-8 forms expect a variable called
md, with a utf8 field, to be in scope at the point of use.

  GETCHARINC(c, eptr)        fetch the character at eptr into c and advance
                             eptr past it
  GETCHARLEN(c, eptr, len)   fetch the character at eptr into c without moving
                             eptr; in the UTF-8 build len is set to the number
                             of bytes the character occupies, in the byte-only
                             build len is left untouched
  BACKCHAR(eptr)             if eptr is on a continuation byte, move it back
                             to the first byte of the character; nothing in
                             the byte-only build
*/

#ifndef SUPPORT_UTF8
#define GETCHARINC(c, eptr) c = *eptr++;
#define GETCHARLEN(c, eptr, len) c = *eptr;
#define BACKCHAR(eptr)

#else   /* SUPPORT_UTF8 */

/* Get the next UTF-8 character, advancing the pointer. In UTF-8 the lead byte
holds the MOST significant bits of the value and each continuation byte holds
the next lower six bits, so the lead byte's data bits are shifted up by six
bits per additional byte and the shift shrinks as the remaining bytes are
read. */

#define GETCHARINC(c, eptr) \
  c = *eptr++; \
  if (md->utf8 && (c & 0xc0) == 0xc0) \
    { \
    int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int s = 6*a;                    /* Shift for the lead byte's bits */ \
    c = (c & utf8_table3[a]) << s;  /* High order bits from first byte */ \
    while (a-- > 0) \
      { \
      s -= 6; \
      c |= (*eptr++ & 0x3f) << s; \
      } \
    }

/* Get the next UTF-8 character, not advancing the pointer, setting length.
The decoding is the same as for GETCHARINC. */

#define GETCHARLEN(c, eptr, len) \
  c = *eptr; \
  len = 1; \
  if (md->utf8 && (c & 0xc0) == 0xc0) \
    { \
    int i; \
    int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int s = 6*a;                    /* Shift for the lead byte's bits */ \
    c = (c & utf8_table3[a]) << s;  /* High order bits from first byte */ \
    for (i = 1; i <= a; i++) \
      { \
      s -= 6; \
      c |= (eptr[i] & 0x3f) << s; \
      } \
    len += a; \
    }

/* If the pointer is not at the start of a character, move it back until
it is. Continuation bytes are those of the form 10xxxxxx. */

#define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;

#endif
248
249
250
251 /*************************************************
252 * Default character tables *
253 *************************************************/
254
255 /* A default set of character tables is included in the PCRE binary. Its source
256 is built by the maketables auxiliary program, which uses the default C ctypes
257 functions, and put in the file chartables.c. These tables are used by PCRE
258 whenever the caller of pcre_compile() does not provide an alternate set of
259 tables. */
260
261 #include "chartables.c"
262
263
264
265 #ifdef SUPPORT_UTF8
266 /*************************************************
267 * Tables for UTF-8 support *
268 *************************************************/
269
270 /* These are the breakpoints for different numbers of bytes in a UTF-8
271 character: entry n is the largest value that fits in n+1 bytes. */
272
273 static int utf8_table1[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
274
275 /* These are the indicator bits and the mask for the data bits to set in the
276 first byte of a character, indexed by the number of additional bytes. */
277
278 static int utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
279 static int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
280
281 /* Table of the number of extra bytes, indexed by the first byte of the
282 character masked with 0x3f. The highest number for a valid UTF-8 character is
283 in fact 0x3d. */
284
285 static uschar utf8_table4[] = {
286 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
287 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
288 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
289 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
290
291
292 /*************************************************
293 * Convert character value to UTF-8 *
294 *************************************************/
295
296 /* This function takes an integer value in the range 0 - 0x7fffffff
297 and encodes it as a UTF-8 character in 0 to 6 bytes.
298
299 Arguments:
300 cvalue the character value
301 buffer pointer to buffer for result - at least 6 bytes long
302
303 Returns: number of characters placed in the buffer
304 */
305
306 static int
307 ord2utf8(int cvalue, uschar *buffer)
308 {
309 register int i, j;
310 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
311 if (cvalue <= utf8_table1[i]) break;
312 *buffer++ = utf8_table2[i] | (cvalue & utf8_table3[i]);
313 cvalue >>= 6 - i;
314 for (j = 0; j < i; j++)
315 {
316 *buffer++ = 0x80 | (cvalue & 0x3f);
317 cvalue >>= 6;
318 }
319 return i + 1;
320 }
321 #endif
322
323
324
325 /*************************************************
326 * Return version string *
327 *************************************************/
328
/* Two levels of macro are needed so that the version macros are expanded
before they are turned into string literals. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* Returns: a pointer to a static string of the form "<major>.<minor> <date>",
            built from PCRE_MAJOR, PCRE_MINOR and PCRE_DATE */

const char *
pcre_version(void)
{
static const char version_text[] =
  XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
return version_text;
}
337
338
339
340
341 /*************************************************
342 * (Obsolete) Return info about compiled pattern *
343 *************************************************/
344
345 /* This is the original "info" function. It picks potentially useful data out
346 of the private structure, but its interface was too rigid. It remains for
347 backwards compatibility. The public options are passed back in an int - though
348 the re->options field has been expanded to a long int, all the public options
349 at the low end of it, and so even on 16-bit systems this will still be OK.
350 Therefore, I haven't changed the API for pcre_info().
351
352 Arguments:
353 external_re points to compiled code
354 optptr where to pass back the options
355 first_char where to pass back the first character,
356 or -1 if multiline and all branches start ^,
357 or -2 otherwise
358
359 Returns: number of capturing subpatterns
360 or negative values on error
361 */
362
363 int
364 pcre_info(const pcre *external_re, int *optptr, int *first_char)
365 {
366 const real_pcre *re = (const real_pcre *)external_re;
367 if (re == NULL) return PCRE_ERROR_NULL;
368 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
369 if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
370 if (first_char != NULL)
371 *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
372 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
373 return re->top_bracket;
374 }
375
376
377
378 /*************************************************
379 * Return info about compiled pattern *
380 *************************************************/
381
382 /* This is a newer "info" function which has an extensible interface so
383 that additional items can be added compatibly.
384
385 Arguments:
386 external_re points to compiled code
387 external_study points to study data, or NULL
388 what what information is required
389 where where to put the information
390
391 Returns: 0 if data returned, negative on error
392 */
393
394 int
395 pcre_fullinfo(const pcre *external_re, const pcre_extra *study_data, int what,
396 void *where)
397 {
398 const real_pcre *re = (const real_pcre *)external_re;
399 const real_pcre_extra *study = (const real_pcre_extra *)study_data;
400
401 if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
402 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
403
404 switch (what)
405 {
406 case PCRE_INFO_OPTIONS:
407 *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
408 break;
409
410 case PCRE_INFO_SIZE:
411 *((size_t *)where) = re->size;
412 break;
413
414 case PCRE_INFO_CAPTURECOUNT:
415 *((int *)where) = re->top_bracket;
416 break;
417
418 case PCRE_INFO_BACKREFMAX:
419 *((int *)where) = re->top_backref;
420 break;
421
422 case PCRE_INFO_FIRSTCHAR:
423 *((int *)where) =
424 ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
425 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
426 break;
427
428 case PCRE_INFO_FIRSTTABLE:
429 *((const uschar **)where) =
430 (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
431 study->start_bits : NULL;
432 break;
433
434 case PCRE_INFO_LASTLITERAL:
435 *((int *)where) =
436 ((re->options & PCRE_REQCHSET) != 0)? re->req_char : -1;
437 break;
438
439 default: return PCRE_ERROR_BADOPTION;
440 }
441
442 return 0;
443 }
444
445
446
447 #ifdef DEBUG
448 /*************************************************
449 * Debugging function to print chars *
450 *************************************************/
451
452 /* Print a sequence of chars in printable format, stopping at the end of the
453 subject if the requested.
454
455 Arguments:
456 p points to characters
457 length number to print
458 is_subject TRUE if printing from within md->start_subject
459 md pointer to matching data block, if is_subject is TRUE
460
461 Returns: nothing
462 */
463
464 static void
465 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
466 {
467 int c;
468 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
469 while (length-- > 0)
470 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
471 }
472 #endif
473
474
475
476
477 /*************************************************
478 * Handle escapes *
479 *************************************************/
480
481 /* This function is called when a \ has been encountered. It either returns a
482 positive value for a simple escape such as \n, or a negative value which
483 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
484 a positive value greater than 255 may be returned. On entry, ptr is pointing at
485 the \. On exit, it is on the final character of the escape sequence.
486
487 Arguments:
488 ptrptr points to the pattern position pointer
489 errorptr points to the pointer to the error message
490 bracount number of previous extracting brackets
491 options the options bits
492 isclass TRUE if inside a character class
493 cd pointer to char tables block
494
495 Returns: zero or positive => a data character
496 negative => a special escape sequence
497 on error, errorptr is set
498 */
499
500 static int
501 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
502 int options, BOOL isclass, compile_data *cd)
503 {
504 const uschar *ptr = *ptrptr;
505 int c, i;
506
507 /* If backslash is at the end of the pattern, it's an error. */
508
509 c = *(++ptr);
510 if (c == 0) *errorptr = ERR1;
511
512 /* Digits or letters may have special meaning; all others are literals. */
513
514 else if (c < '0' || c > 'z') {}
515
516 /* Do an initial lookup in a table. A non-zero result is something that can be
517 returned immediately. Otherwise further processing may be required. */
518
519 else if ((i = escapes[c - '0']) != 0) c = i;
520
521 /* Escapes that need further processing, or are illegal. Note that the table
lookup above has left i set to zero on entry to this branch; the loops below
that count octal and hex digits with i++ rely on that. */
522
523 else
524 {
525 const uschar *oldptr;
526 switch (c)
527 {
528 /* The handling of escape sequences consisting of a string of digits
529 starting with one that is not zero is not straightforward. By experiment,
530 the way Perl works seems to be as follows:
531
532 Outside a character class, the digits are read as a decimal number. If the
533 number is less than 10, or if there are that many previous extracting
534 left brackets, then it is a back reference. Otherwise, up to three octal
535 digits are read to form an escaped byte. Thus \123 is likely to be octal
536 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
537 value is greater than 377, the least significant 8 bits are taken. Inside a
538 character class, \ followed by a digit is always an octal number. */
539
540 case '1': case '2': case '3': case '4': case '5':
541 case '6': case '7': case '8': case '9':
542
543 if (!isclass)
544 {
545 oldptr = ptr;
546 c -= '0';
547 while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
548 c = c * 10 + *(++ptr) - '0';
549 if (c < 10 || c <= bracount)
550 {
551 c = -(ESC_REF + c);
552 break;
553 }
554 ptr = oldptr; /* Put the pointer back and fall through */
555 }
556
557 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
558 generates a binary zero byte and treats the digit as a following literal.
559 Thus we have to pull back the pointer by one. The digit is re-read from the
pattern here because c may have been overwritten by the decimal scan above. */
560
561 if ((c = *ptr) >= '8')
562 {
563 ptr--;
564 c = 0;
565 break;
566 }
567
568 /* \0 always starts an octal number, but we may drop through to here with a
569 larger first octal digit. At most two further octal digits are read. */
570
571 case '0':
572 c -= '0';
573 while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
574 ptr[1] != '8' && ptr[1] != '9')
575 c = c * 8 + *(++ptr) - '0';
576 c &= 255; /* Take least significant 8 bits */
577 break;
578
579 /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
580 which can be greater than 0xff, but only if the ddd are hex digits. */
581
582 case 'x':
583 #ifdef SUPPORT_UTF8
584 if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
585 {
586 const uschar *pt = ptr + 2;
587 register int count = 0;
588 c = 0;
/* The digit value is the lower-cased character minus '0' for 0-9, or minus
'W' for a letter: 'W' is 'a' - 10 in ASCII, so a-f give 10-15.
NOTE(review): cd->lcc is presumably the lower-casing table - confirm. */
589 while ((cd->ctypes[*pt] & ctype_xdigit) != 0)
590 {
591 count++;
592 c = c * 16 + cd->lcc[*pt] -
593 (((cd->ctypes[*pt] & ctype_digit) != 0)? '0' : 'W');
594 pt++;
595 }
596 if (*pt == '}')
597 {
598 if (c < 0 || count > 8) *errorptr = ERR34;
599 ptr = pt;
600 break;
601 }
602 /* If the sequence of hex digits does not end with '}', then we don't
603 recognize this construct; fall through to the normal \x handling. */
604 }
605 #endif
606
607 /* Read a byte value of up to two hex digits; no digits at all gives zero */
608
609 c = 0;
610 while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
611 {
612 ptr++;
613 c = c * 16 + cd->lcc[*ptr] -
614 (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
615 }
616 break;
617
618 /* Other special escapes not starting with a digit are straightforward */
619
620 case 'c':
621 c = *(++ptr);
622 if (c == 0)
623 {
624 *errorptr = ERR2;
625 return 0;
626 }
627
628 /* A letter is upper-cased; then the 0x40 bit is flipped.
NOTE(review): cd->fcc is presumably the case-flipping table - confirm. */
629
630 if (c >= 'a' && c <= 'z') c = cd->fcc[c];
631 c ^= 0x40;
632 break;
633
634 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
635 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
636 for Perl compatibility, it is a literal. This code looks a bit odd, but
637 there used to be some cases other than the default, and there may be again
638 in future, so I haven't "optimized" it. */
639
640 default:
641 if ((options & PCRE_EXTRA) != 0) switch(c)
642 {
643 default:
644 *errorptr = ERR3;
645 break;
646 }
647 break;
648 }
649 }
650
/* Except for the early return in the \c case, the caller's pattern pointer is
updated on every path, including those that have set *errorptr. */
651 *ptrptr = ptr;
652 return c;
653 }
654
655
656
657 /*************************************************
658 * Check for counted repeat *
659 *************************************************/
660
661 /* This function is called when a '{' is encountered in a place where it might
662 start a quantifier. It looks ahead to see if it really is a quantifier or not.
663 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
664 where the ddds are digits.
665
666 Arguments:
667 p pointer to the first char after '{'
668 cd pointer to char tables block
669
670 Returns: TRUE or FALSE
671 */
672
673 static BOOL
674 is_counted_repeat(const uschar *p, compile_data *cd)
675 {
676 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
677 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
678 if (*p == '}') return TRUE;
679
680 if (*p++ != ',') return FALSE;
681 if (*p == '}') return TRUE;
682
683 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
684 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
685 return (*p == '}');
686 }
687
688
689
690 /*************************************************
691 * Read repeat counts *
692 *************************************************/
693
694 /* Read an item of the form {n,m} and return the values. This is called only
695 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
696 so the syntax is guaranteed to be correct, but we need to check the values.
697
698 Arguments:
699 p pointer to first char after '{'
700 minp pointer to int for min
701 maxp pointer to int for max
702 returned as -1 if no max
703 errorptr points to pointer to error message
704 cd pointer to character tables clock
705
706 Returns: pointer to '}' on success;
707 current ptr on error, with errorptr set
708 */
709
710 static const uschar *
711 read_repeat_counts(const uschar *p, int *minp, int *maxp,
712 const char **errorptr, compile_data *cd)
713 {
714 int min = 0;
715 int max = -1;
716
717 while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
718
719 if (*p == '}') max = min; else
720 {
721 if (*(++p) != '}')
722 {
723 max = 0;
724 while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
725 if (max < min)
726 {
727 *errorptr = ERR4;
728 return p;
729 }
730 }
731 }
732
733 /* Do paranoid checks, then fill in the required variables, and pass back the
734 pointer to the terminating '}'. */
735
736 if (min > 65535 || max > 65535)
737 *errorptr = ERR5;
738 else
739 {
740 *minp = min;
741 *maxp = max;
742 }
743 return p;
744 }
745
746
747
748 /*************************************************
749 * Find the fixed length of a pattern *
750 *************************************************/
751
752 /* Scan a pattern and compute the fixed length of subject that will match it,
753 if the length is fixed. This is needed for dealing with backward assertions.
754
755 Arguments:
756 code points to the start of the pattern (the bracket)
757 options the compiling options
758
759 Returns: the fixed length, or -1 if there is no fixed length
760 */
761
762 static int
763 find_fixedlength(uschar *code, int options)
764 {
/* length stays at -1 until the first branch has been measured; after that
every other branch has to come out at the same value. */
765 int length = -1;
766
767 register int branchlength = 0;
/* Step over the opening bracket opcode and the two bytes that follow it */
768 register uschar *cc = code + 3;
769
770 /* Scan along the opcodes for this branch. If we get to the end of the
771 branch, check the length against that of the other branches. */
772
773 for (;;)
774 {
775 int d;
776 register int op = *cc;
/* Every opcode from OP_BRA upwards is handled as OP_BRA. NOTE(review):
presumably these are the numbered capturing brackets - confirm in internal.h. */
777 if (op >= OP_BRA) op = OP_BRA;
778
779 switch (op)
780 {
/* A nested group: measure it recursively, then move past it by following the
two-byte (high byte first) offsets through each alternative and stepping over
the three bytes of the closing ket. */
781 case OP_BRA:
782 case OP_ONCE:
783 case OP_COND:
784 d = find_fixedlength(cc, options);
785 if (d < 0) return -1;
786 branchlength += d;
787 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
788 cc += 3;
789 break;
790
791 /* Reached end of a branch; if it's a ket it is the end of a nested
792 call. If it's ALT it is an alternation in a nested call. If it is
793 END it's the end of the outer call. All can be handled by the same code. */
794
795 case OP_ALT:
796 case OP_KET:
797 case OP_KETRMAX:
798 case OP_KETRMIN:
799 case OP_END:
800 if (length < 0) length = branchlength;
801 else if (length != branchlength) return -1;
802 if (*cc != OP_ALT) return length;
803 cc += 3;
804 branchlength = 0;
805 break;
806
807 /* Skip over assertive subpatterns; they add nothing to the length */
808
809 case OP_ASSERT:
810 case OP_ASSERT_NOT:
811 case OP_ASSERTBACK:
812 case OP_ASSERTBACK_NOT:
813 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
814 cc += 3;
815 break;
816
817 /* Skip over things that don't match chars. The cases below fall through
on purpose, so the first group advances three bytes, OP_OPT two, and the
remainder one. */
818
819 case OP_REVERSE:
820 case OP_BRANUMBER:
821 case OP_CREF:
822 cc++;
823 /* Fall through */
824
825 case OP_OPT:
826 cc++;
827 /* Fall through */
828
829 case OP_SOD:
830 case OP_EOD:
831 case OP_EODN:
832 case OP_CIRC:
833 case OP_DOLL:
834 case OP_NOT_WORD_BOUNDARY:
835 case OP_WORD_BOUNDARY:
836 cc++;
837 break;
838
839 /* Handle char strings. In UTF-8 mode we must count characters, not bytes.
840 This requires a scan of the string, unfortunately. We assume valid UTF-8
841 strings, so all we do is reduce the length by one for byte whose bits are
842 10xxxxxx. The byte after the opcode is the length of the string in bytes. */
843
844 case OP_CHARS:
845 branchlength += *(++cc);
846 #ifdef SUPPORT_UTF8
847 for (d = 1; d <= *cc; d++)
848 if ((cc[d] & 0xc0) == 0x80) branchlength--;
849 #endif
850 cc += *cc + 1;
851 break;
852
853 /* Handle exact repetitions: a two-byte count follows the opcode, and the
whole item is four bytes long */
854
855 case OP_EXACT:
856 case OP_TYPEEXACT:
857 branchlength += (cc[1] << 8) + cc[2];
858 cc += 4;
859 break;
860
861 /* Handle single-char matchers */
862
863 case OP_NOT_DIGIT:
864 case OP_DIGIT:
865 case OP_NOT_WHITESPACE:
866 case OP_WHITESPACE:
867 case OP_NOT_WORDCHAR:
868 case OP_WORDCHAR:
869 case OP_ANY:
870 branchlength++;
871 cc++;
872 break;
873
874
875 /* Check a class for variable quantification. The 33 bytes skipped are the
opcode and the 32-byte bit map, leaving cc on whatever follows the class. */
876
877 case OP_CLASS:
878 cc += 33;
879
880 switch (*cc)
881 {
882 case OP_CRSTAR:
883 case OP_CRMINSTAR:
884 case OP_CRQUERY:
885 case OP_CRMINQUERY:
886 return -1;
887
/* A range is fixed length only if its minimum and maximum are equal */
888 case OP_CRRANGE:
889 case OP_CRMINRANGE:
890 if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;
891 branchlength += (cc[1] << 8) + cc[2];
892 cc += 5;
893 break;
894
/* Nothing handled here follows the class: count one character and leave cc
where it is, so that the following opcode is examined by the outer switch on
the next pass round the loop. */
895 default:
896 branchlength++;
897 }
898 break;
899
900 /* Anything else is variable length */
901
902 default:
903 return -1;
904 }
905 }
906 /* Control never gets here */
907 }
908
909
910
911
912 /*************************************************
913 * Check for POSIX class syntax *
914 *************************************************/
915
916 /* This function is called when the sequence "[:" or "[." or "[=" is
917 encountered in a character class. It checks whether this is followed by an
918 optional ^ and then a sequence of letters, terminated by a matching ":]" or
919 ".]" or "=]".
920
921 Argument:
922 ptr pointer to the initial [
923 endptr where to return the end pointer
924 cd pointer to compile data
925
926 Returns: TRUE or FALSE
927 */
928
929 static BOOL
930 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
931 {
932 int terminator; /* Don't combine these lines; the Solaris cc */
933 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
934 if (*(++ptr) == '^') ptr++;
935 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
936 if (*ptr == terminator && ptr[1] == ']')
937 {
938 *endptr = ptr;
939 return TRUE;
940 }
941 return FALSE;
942 }
943
944
945
946
947 /*************************************************
948 * Check POSIX class name *
949 *************************************************/
950
951 /* This function is called to check the name given in a POSIX-style class entry
952 such as [:alnum:].
953
954 Arguments:
955 ptr points to the first letter
956 len the length of the name
957
958 Returns: a value representing the name, or -1 if unknown
959 */
960
961 static int
962 check_posix_name(const uschar *ptr, int len)
963 {
964 register int yield = 0;
965 while (posix_name_lengths[yield] != 0)
966 {
967 if (len == posix_name_lengths[yield] &&
968 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
969 yield++;
970 }
971 return -1;
972 }
973
974
975
976
977 /*************************************************
978 * Compile one branch *
979 *************************************************/
980
981 /* Scan the pattern, compiling it into the code vector.
982
983 Arguments:
984 options the option bits
985 brackets points to number of extracting brackets used
986 code points to the pointer to the current code point
987 ptrptr points to the current pattern pointer
988 errorptr points to pointer to error message
989 optchanged set to the value of the last OP_OPT item compiled
990 reqchar set to the last literal character required, else -1
991 countlits set to count of mandatory literal characters
992 cd contains pointers to tables
993
994 Returns: TRUE on success
995 FALSE, with *errorptr set on error
996 */
997
/* compile_branch() compiles a single alternative. It reads pattern characters
starting at *ptrptr and appends compiled items at *codeptr until it meets end
of string, '|' or ')' in the main loop; it then stores the updated code and
pattern pointers back through codeptr and ptrptr and returns TRUE. Every error
path jumps to the FAILED label at the bottom, which stores the pattern position
reached in *ptrptr and returns FALSE; the message is in *errorptr, put there
either by this function (an ERRnn value) or by the routine that reported the
failure (read_repeat_counts, compile_regex). *reqchar and *countlits are reset
to -1 and 0 on entry and are updated as literal strings, repeats and bracket
groups are compiled. */
998 static BOOL
999 compile_branch(int options, int *brackets, uschar **codeptr,
1000 const uschar **ptrptr, const char **errorptr, int *optchanged,
1001 int *reqchar, int *countlits, compile_data *cd)
1002 {
1003 int repeat_type, op_type;
1004 int repeat_min, repeat_max;
1005 int bravalue, length;
1006 int greedy_default, greedy_non_default;
1007 int prevreqchar; /* Value to restore to *reqchar if the last item turns out optional */
1008 int condcount = 0; /* Branch count of the most recent conditional group */
1009 int subcountlits = 0; /* Literal count returned for the most recent bracket group */
1010 register int c;
1011 register uschar *code = *codeptr;
1012 uschar *tempcode;
1013 const uschar *ptr = *ptrptr;
1014 const uschar *tempptr;
1015 uschar *previous = NULL; /* Start of the item a following quantifier applies to, or NULL */
1016 uschar class[32]; /* Work area for building a character class bit map */
1017
1018 /* Set up the default and non-default settings for greediness */
1019
1020 greedy_default = ((options & PCRE_UNGREEDY) != 0);
1021 greedy_non_default = greedy_default ^ 1;
1022
1023 /* Initialize no required char, and count of literals */
1024
1025 *reqchar = prevreqchar = -1;
1026 *countlits = 0;
1027
1028 /* Switch on next character until the end of the branch */
1029
1030 for (;; ptr++)
1031 {
1032 BOOL negate_class;
1033 int class_charcount;
1034 int class_lastchar;
1035 int newoptions;
1036 int skipbytes;
1037 int subreqchar;
1038
1039 c = *ptr;
/* When PCRE_EXTENDED is set, white space between items is skipped, and so is
everything from a '#' up to the next NEWLINE (or the end of the pattern). */
1040 if ((options & PCRE_EXTENDED) != 0)
1041 {
1042 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1043 if (c == '#')
1044 {
1045 /* The space before the ; is to avoid a warning on a silly compiler
1046 on the Macintosh. */
1047 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
1048 continue;
1049 }
1050 }
1051
1052 switch(c)
1053 {
1054 /* The branch terminates at end of string, |, or ). */
1055
1056 case 0:
1057 case '|':
1058 case ')':
1059 *codeptr = code;
1060 *ptrptr = ptr;
1061 return TRUE;
1062
1063 /* Handle single-character metacharacters */
1064
1065 case '^':
1066 previous = NULL;
1067 *code++ = OP_CIRC;
1068 break;
1069
1070 case '$':
1071 previous = NULL;
1072 *code++ = OP_DOLL;
1073 break;
1074
1075 case '.':
1076 previous = code;
1077 *code++ = OP_ANY;
1078 break;
1079
1080 /* Character classes. These always build a 32-byte bitmap of the permitted
1081 characters, except in the special case where there is only one character.
1082 For negated classes, we build the map as usual, then invert it at the end.
1083 */
1084
1085 case '[':
1086 previous = code;
1087 *code++ = OP_CLASS;
1088
1089 /* If the first character is '^', set the negation flag and skip it. */
1090
1091 if ((c = *(++ptr)) == '^')
1092 {
1093 negate_class = TRUE;
1094 c = *(++ptr);
1095 }
1096 else negate_class = FALSE;
1097
1098 /* Keep a count of chars so that we can optimize the case of just a single
1099 character. */
1100
1101 class_charcount = 0;
1102 class_lastchar = -1;
1103
1104 /* Initialize the 32-char bit map to all zeros. We have to build the
1105 map in a temporary bit of store, in case the class contains only 1
1106 character, because in that case the compiled code doesn't use the
1107 bit map. */
1108
1109 memset(class, 0, 32 * sizeof(uschar));
1110
1111 /* Process characters until ] is reached. By writing this as a "do" it
1112 means that an initial ] is taken as a data character. */
1113
1114 do
1115 {
1116 if (c == 0)
1117 {
1118 *errorptr = ERR6;
1119 goto FAILED;
1120 }
1121
1122 /* Handle POSIX class names. Perl allows a negation extension of the
1123 form [:^name]. A square bracket that doesn't match the syntax is
1124 treated as a literal. We also recognize the POSIX constructions
1125 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1126 5.6 does. */
1127
1128 if (c == '[' &&
1129 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1130 check_posix_syntax(ptr, &tempptr, cd))
1131 {
1132 BOOL local_negate = FALSE;
1133 int posix_class, i;
1134 register const uschar *cbits = cd->cbits;
1135
1136 if (ptr[1] != ':')
1137 {
1138 *errorptr = ERR31;
1139 goto FAILED;
1140 }
1141
1142 ptr += 2;
1143 if (*ptr == '^')
1144 {
1145 local_negate = TRUE;
1146 ptr++;
1147 }
1148
1149 posix_class = check_posix_name(ptr, tempptr - ptr);
1150 if (posix_class < 0)
1151 {
1152 *errorptr = ERR30;
1153 goto FAILED;
1154 }
1155
1156 /* If matching is caseless, upper and lower are converted to
1157 alpha. This relies on the fact that the class table starts with
1158 alpha, lower, upper as the first 3 entries. */
1159
1160 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1161 posix_class = 0;
1162
1163 /* Or into the map we are building up to 3 of the static class
1164 tables, or their negations. */
1165
1166 posix_class *= 3;
1167 for (i = 0; i < 3; i++)
1168 {
1169 int taboffset = posix_class_maps[posix_class + i];
1170 if (taboffset < 0) break;
1171 if (local_negate)
1172 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
1173 else
1174 for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
1175 }
1176
1177 ptr = tempptr + 1;
1178 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
1179 continue;
1180 }
1181
1182 /* Backslash may introduce a single character, or it may introduce one
1183 of the specials, which just set a flag. Escaped items are checked for
1184 validity in the pre-compiling pass. The sequence \b is a special case.
1185 Inside a class (and only there) it is treated as backspace. Elsewhere
1186 it marks a word boundary. Other escapes have preset maps ready to
1187 or into the one we are building. We assume they have more than one
1188 character in them, so set class_charcount bigger than one. */
1189
1190 if (c == '\\')
1191 {
1192 c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1193 if (-c == ESC_b) c = '\b';
1194 else if (c < 0)
1195 {
1196 register const uschar *cbits = cd->cbits;
1197 class_charcount = 10;
/* Each "continue" below applies to the enclosing do-loop over the class
contents, not to the switch; c is reused as the byte index for the map. */
1198 switch (-c)
1199 {
1200 case ESC_d:
1201 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
1202 continue;
1203
1204 case ESC_D:
1205 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
1206 continue;
1207
1208 case ESC_w:
1209 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
1210 continue;
1211
1212 case ESC_W:
1213 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
1214 continue;
1215
1216 case ESC_s:
1217 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
1218 continue;
1219
1220 case ESC_S:
1221 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
1222 continue;
1223
1224 default:
1225 *errorptr = ERR7;
1226 goto FAILED;
1227 }
1228 }
1229
1230 /* Fall through if single character, but don't at present allow
1231 chars > 255 in UTF-8 mode. */
1232
1233 #ifdef SUPPORT_UTF8
1234 if (c > 255)
1235 {
1236 *errorptr = ERR33;
1237 goto FAILED;
1238 }
1239 #endif
1240 }
1241
1242 /* A single character may be followed by '-' to form a range. However,
1243 Perl does not permit ']' to be the end of the range. A '-' character
1244 here is treated as a literal. */
1245
1246 if (ptr[1] == '-' && ptr[2] != ']')
1247 {
1248 int d;
1249 ptr += 2;
1250 d = *ptr;
1251
1252 if (d == 0)
1253 {
1254 *errorptr = ERR6;
1255 goto FAILED;
1256 }
1257
1258 /* The second part of a range can be a single-character escape, but
1259 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1260 in such circumstances. */
1261
1262 if (d == '\\')
1263 {
1264 const uschar *oldptr = ptr;
1265 d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1266
1267 #ifdef SUPPORT_UTF8
1268 if (d > 255)
1269 {
1270 *errorptr = ERR33;
1271 goto FAILED;
1272 }
1273 #endif
1274 /* \b is backspace; any other special means the '-' was literal */
1275
1276 if (d < 0)
1277 {
1278 if (d == -ESC_b) d = '\b'; else
1279 {
/* oldptr is on the backslash, two past the first character. Backing up by
two leaves c (still the first character) to be added on its own; the loop's
++ptr then reads the '-' and the escape as separate class items. */
1280 ptr = oldptr - 2;
1281 goto SINGLE_CHARACTER; /* A few lines below */
1282 }
1283 }
1284 }
1285
1286 if (d < c)
1287 {
1288 *errorptr = ERR8;
1289 goto FAILED;
1290 }
1291
1292 for (; c <= d; c++)
1293 {
1294 class[c/8] |= (1 << (c&7));
1295 if ((options & PCRE_CASELESS) != 0)
1296 {
1297 int uc = cd->fcc[c]; /* flip case */
1298 class[uc/8] |= (1 << (uc&7));
1299 }
1300 class_charcount++; /* in case a one-char range */
1301 class_lastchar = c;
1302 }
1303 continue; /* Go get the next char in the class */
1304 }
1305
1306 /* Handle a lone single character - we can get here for a normal
1307 non-escape char, or after \ that introduces a single character. */
1308
1309 SINGLE_CHARACTER:
1310
1311 class [c/8] |= (1 << (c&7));
1312 if ((options & PCRE_CASELESS) != 0)
1313 {
1314 c = cd->fcc[c]; /* flip case; class_lastchar below then records the flipped character */
1315 class[c/8] |= (1 << (c&7));
1316 }
1317 class_charcount++;
1318 class_lastchar = c;
1319 }
1320
1321 /* Loop until ']' reached; the check for end of string happens inside the
1322 loop. This "while" is the end of the "do" above. */
1323
1324 while ((c = *(++ptr)) != ']');
1325
1326 /* If class_charcount is 1 and class_lastchar is not negative, we saw
1327 precisely one character. This doesn't need the whole 32-byte bit map.
1328 We turn it into a 1-character OP_CHARS if the class is not negated, or
1329 OP_NOT if it is negated. */
1330
1331 if (class_charcount == 1 && class_lastchar >= 0)
1332 {
1333 if (negate_class)
1334 {
1335 code[-1] = OP_NOT;
1336 }
1337 else
1338 {
1339 code[-1] = OP_CHARS;
1340 *code++ = 1;
1341 }
1342 *code++ = class_lastchar;
1343 }
1344
1345 /* Otherwise, negate the 32-byte map if necessary, and copy it into
1346 the code vector. */
1347
1348 else
1349 {
1350 if (negate_class)
1351 for (c = 0; c < 32; c++) code[c] = ~class[c];
1352 else
1353 memcpy(code, class, 32);
1354 code += 32;
1355 }
1356 break;
1357
1358 /* Various kinds of repeat */
1359
/* A '{' that does not start a counted repeat is an ordinary data character */
1360 case '{':
1361 if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
1362 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
1363 if (*errorptr != NULL) goto FAILED;
1364 goto REPEAT;
1365
/* In the cases below a repeat_max of -1 stands for "no upper limit" */
1366 case '*':
1367 repeat_min = 0;
1368 repeat_max = -1;
1369 goto REPEAT;
1370
1371 case '+':
1372 repeat_min = 1;
1373 repeat_max = -1;
1374 goto REPEAT;
1375
1376 case '?':
1377 repeat_min = 0;
1378 repeat_max = 1;
1379
1380 REPEAT:
1381 if (previous == NULL)
1382 {
1383 *errorptr = ERR9;
1384 goto FAILED;
1385 }
1386
1387 /* If the next character is '?' this is a minimizing repeat, by default,
1388 but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
1389 next character. */
1390
1391 if (ptr[1] == '?')
1392 { repeat_type = greedy_non_default; ptr++; }
1393 else repeat_type = greedy_default;
1394
1395 /* If previous was a string of characters, chop off the last one and use it
1396 as the subject of the repeat. If there was only one character, we can
1397 abolish the previous item altogether. A repeat with a zero minimum wipes
1398 out any reqchar setting, backing up to the previous value. We must also
1399 adjust the countlits value. */
1400
1401 if (*previous == OP_CHARS)
1402 {
1403 int len = previous[1];
1404
/* The adjustment below replaces a count of one for the repeated character by
a count of repeat_min; literal strings add their length to *countlits when
they are compiled. NOTE(review): an OP_CHARS item made from a one-character
class also arrives here, although the class code does not touch *reqchar or
*countlits - confirm the adjustment is acceptable in that case. */
1405 if (repeat_min == 0) *reqchar = prevreqchar;
1406 *countlits += repeat_min - 1;
1407
1408 if (len == 1)
1409 {
1410 c = previous[2];
1411 code = previous;
1412 }
1413 else
1414 {
1415 c = previous[len+1];
1416 previous[1]--;
1417 code--;
1418 }
1419 op_type = 0; /* Use single-char op codes */
1420 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
1421 }
1422
1423 /* If previous was a single negated character ([^a] or similar), we use
1424 one of the special opcodes, replacing it. The code is shared with single-
1425 character repeats by adding a suitable offset into repeat_type. */
1426
1427 else if ((int)*previous == OP_NOT)
1428 {
1429 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
1430 c = previous[1];
1431 code = previous;
1432 goto OUTPUT_SINGLE_REPEAT;
1433 }
1434
1435 /* If previous was a character type match (\d or similar), abolish it and
1436 create a suitable repeat item. The code is shared with single-character
1437 repeats by adding a suitable offset into repeat_type. */
1438
1439 else if ((int)*previous < OP_EODN || *previous == OP_ANY)
1440 {
1441 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
1442 c = *previous;
1443 code = previous;
1444
/* The two branches above jump into the middle of this block. At this label
c holds the character (or the type opcode) to be repeated and op_type holds
the offset that selects the opcode family. */
1445 OUTPUT_SINGLE_REPEAT:
1446
1447 /* If the maximum is zero then the minimum must also be zero; Perl allows
1448 this case, so we do too - by simply omitting the item altogether. */
1449
1450 if (repeat_max == 0) goto END_REPEAT;
1451
1452 /* Combine the op_type with the repeat_type */
1453
1454 repeat_type += op_type;
1455
1456 /* A minimum of zero is handled either as the special case * or ?, or as
1457 an UPTO, with the maximum given. */
1458
1459 if (repeat_min == 0)
1460 {
1461 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1462 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1463 else
1464 {
1465 *code++ = OP_UPTO + repeat_type;
1466 *code++ = repeat_max >> 8;
1467 *code++ = (repeat_max & 255);
1468 }
1469 }
1470
1471 /* The case {1,} is handled as the special case + */
1472
1473 else if (repeat_min == 1 && repeat_max == -1)
1474 *code++ = OP_PLUS + repeat_type;
1475
1476 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1477 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1478
1479 else
1480 {
1481 if (repeat_min != 1)
1482 {
1483 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1484 *code++ = repeat_min >> 8;
1485 *code++ = (repeat_min & 255);
1486 }
1487
1488 /* If the minimum is 1 and the previous item was a character string,
1489 we either have to put back the item that got cancelled if the string
1490 length was 1, or add the character back onto the end of a longer
1491 string. For a character type nothing need be done; it will just get
1492 put back naturally. Note that the final character is always going to
1493 get added below. */
1494
1495 else if (*previous == OP_CHARS)
1496 {
1497 if (code == previous) code += 2; else previous[1]++;
1498 }
1499
1500 /* For a single negated character we also have to put back the
1501 item that got cancelled. */
1502
1503 else if (*previous == OP_NOT) code++;
1504
1505 /* If the maximum is unlimited, insert an OP_STAR. */
1506
1507 if (repeat_max < 0)
1508 {
1509 *code++ = c;
1510 *code++ = OP_STAR + repeat_type;
1511 }
1512
1513 /* Else insert an UPTO if the max is greater than the min. */
1514
1515 else if (repeat_max != repeat_min)
1516 {
1517 *code++ = c;
1518 repeat_max -= repeat_min;
1519 *code++ = OP_UPTO + repeat_type;
1520 *code++ = repeat_max >> 8;
1521 *code++ = (repeat_max & 255);
1522 }
1523 }
1524
1525 /* The character or character type itself comes last in all cases. */
1526
1527 *code++ = c;
1528 }
1529
1530 /* If previous was a character class or a back reference, we put the repeat
1531 stuff after it, but just skip the item if the repeat was {0,0}. */
1532
1533 else if (*previous == OP_CLASS || *previous == OP_REF)
1534 {
1535 if (repeat_max == 0)
1536 {
1537 code = previous;
1538 goto END_REPEAT;
1539 }
1540 if (repeat_min == 0 && repeat_max == -1)
1541 *code++ = OP_CRSTAR + repeat_type;
1542 else if (repeat_min == 1 && repeat_max == -1)
1543 *code++ = OP_CRPLUS + repeat_type;
1544 else if (repeat_min == 0 && repeat_max == 1)
1545 *code++ = OP_CRQUERY + repeat_type;
1546 else
1547 {
1548 *code++ = OP_CRRANGE + repeat_type;
1549 *code++ = repeat_min >> 8;
1550 *code++ = repeat_min & 255;
1551 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1552 *code++ = repeat_max >> 8;
1553 *code++ = repeat_max & 255;
1554 }
1555 }
1556
1557 /* If previous was a bracket group, we may have to replicate it in certain
1558 cases. */
1559
1560 else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
1561 (int)*previous == OP_COND)
1562 {
1563 register int i;
1564 int ketoffset = 0;
1565 int len = code - previous;
1566 uschar *bralink = NULL;
1567
1568 /* If the maximum repeat count is unlimited, find the end of the bracket
1569 by scanning through from the start, and compute the offset back to it
1570 from the current code pointer. There may be an OP_OPT setting following
1571 the final KET, so we can't find the end just by going back from the code
1572 pointer. */
1573
1574 if (repeat_max == -1)
1575 {
1576 register uschar *ket = previous;
1577 do ket += (ket[1] << 8) + ket[2]; while (*ket != OP_KET);
1578 ketoffset = code - ket;
1579 }
1580
1581 /* The case of a zero minimum is special because of the need to stick
1582 OP_BRAZERO in front of it, and because the group appears once in the
1583 data, whereas in other cases it appears the minimum number of times. For
1584 this reason, it is simplest to treat this case separately, as otherwise
1585 the code gets far too messy. There are several special subcases when the
1586 minimum is zero. */
1587
1588 if (repeat_min == 0)
1589 {
1590 /* If we set up a required char from the bracket, we must back off
1591 to the previous value and reset the countlits value too. */
1592
/* NOTE(review): subcountlits is tested on its own here, but when the group
was compiled *reqchar and *countlits were only updated if its required
character passed the subreqchar test - confirm that backing off is safe
when they were not updated. */
1593 if (subcountlits > 0)
1594 {
1595 *reqchar = prevreqchar;
1596 *countlits -= subcountlits;
1597 }
1598
1599 /* If the maximum is also zero, we just omit the group from the output
1600 altogether. */
1601
1602 if (repeat_max == 0)
1603 {
1604 code = previous;
1605 goto END_REPEAT;
1606 }
1607
1608 /* If the maximum is 1 or unlimited, we just have to stick in the
1609 BRAZERO and do no more at this point. */
1610
1611 if (repeat_max <= 1)
1612 {
/* Shift the group up by one byte to make room for the BRAZERO */
1613 memmove(previous+1, previous, len);
1614 code++;
1615 *previous++ = OP_BRAZERO + repeat_type;
1616 }
1617
1618 /* If the maximum is greater than 1 and limited, we have to replicate
1619 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1620 The first one has to be handled carefully because it's the original
1621 copy, which has to be moved up. The remainder can be handled by code
1622 that is common with the non-zero minimum case below. We just have to
1623 adjust the value of repeat_max, since one less copy is required. */
1624
1625 else
1626 {
1627 int offset;
/* Shift the group up by four bytes: BRAZERO, BRA and a 2-byte link field */
1628 memmove(previous+4, previous, len);
1629 code += 4;
1630 *previous++ = OP_BRAZERO + repeat_type;
1631 *previous++ = OP_BRA;
1632
1633 /* We chain together the bracket offset fields that have to be
1634 filled in later when the ends of the brackets are reached. */
1635
/* bralink has not been assigned since it was initialized to NULL, so the
link stored here is zero, which marks the end of the chain when it is
unwound further down. */
1636 offset = (bralink == NULL)? 0 : previous - bralink;
1637 bralink = previous;
1638 *previous++ = offset >> 8;
1639 *previous++ = offset & 255;
1640 }
1641
1642 repeat_max--;
1643 }
1644
1645 /* If the minimum is greater than zero, replicate the group as many
1646 times as necessary, and adjust the maximum to the number of subsequent
1647 copies that we need. */
1648
1649 else
1650 {
1651 for (i = 1; i < repeat_min; i++)
1652 {
1653 memcpy(code, previous, len);
1654 code += len;
1655 }
/* An unlimited maximum (-1) is left as it is by this test */
1656 if (repeat_max > 0) repeat_max -= repeat_min;
1657 }
1658
1659 /* This code is common to both the zero and non-zero minimum cases. If
1660 the maximum is limited, it replicates the group in a nested fashion,
1661 remembering the bracket starts on a stack. In the case of a zero minimum,
1662 the first one was set up above. In all cases the repeat_max now specifies
1663 the number of additional copies needed. */
1664
1665 if (repeat_max >= 0)
1666 {
1667 for (i = repeat_max - 1; i >= 0; i--)
1668 {
1669 *code++ = OP_BRAZERO + repeat_type;
1670
1671 /* All but the final copy start a new nesting, maintaining the
1672 chain of brackets outstanding. */
1673
1674 if (i != 0)
1675 {
1676 int offset;
1677 *code++ = OP_BRA;
1678 offset = (bralink == NULL)? 0 : code - bralink;
1679 bralink = code;
1680 *code++ = offset >> 8;
1681 *code++ = offset & 255;
1682 }
1683
1684 memcpy(code, previous, len);
1685 code += len;
1686 }
1687
1688 /* Now chain through the pending brackets, and fill in their length
1689 fields (which are holding the chain links pro tem). */
1690
/* bralink addresses the 2-byte field that follows an OP_BRA, and bra is that
OP_BRA byte. The link is read out first, then the field is overwritten with
the real length, which is also written after the OP_KET. */
1691 while (bralink != NULL)
1692 {
1693 int oldlinkoffset;
1694 int offset = code - bralink + 1;
1695 uschar *bra = code - offset;
1696 oldlinkoffset = (bra[1] << 8) + bra[2];
1697 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
1698 *code++ = OP_KET;
1699 *code++ = bra[1] = offset >> 8;
1700 *code++ = bra[2] = (offset & 255);
1701 }
1702 }
1703
1704 /* If the maximum is unlimited, set a repeater in the final copy. We
1705 can't just offset backwards from the current code point, because we
1706 don't know if there's been an options resetting after the ket. The
1707 correct offset was computed above. */
1708
1709 else code[-ketoffset] = OP_KETRMAX + repeat_type;
1710 }
1711
1712 /* Else there's some kind of shambles */
1713
1714 else
1715 {
1716 *errorptr = ERR11;
1717 goto FAILED;
1718 }
1719
1720 /* In all cases we no longer have a previous item. */
1721
1722 END_REPEAT:
1723 previous = NULL;
1724 break;
1725
1726
1727 /* Start of nested bracket sub-expression, or comment or lookahead or
1728 lookbehind or option setting or condition. First deal with special things
1729 that can come after a bracket; all are introduced by ?, and the appearance
1730 of any of them means that this is not a referencing group. They were
1731 checked for validity in the first pass over the string, so we don't have to
1732 check for syntax errors here. */
1733
1734 case '(':
1735 newoptions = options;
1736 skipbytes = 0;
1737
1738 if (*(++ptr) == '?')
1739 {
1740 int set, unset;
1741 int *optset;
1742
1743 switch (*(++ptr))
1744 {
1745 case '#': /* Comment; skip to ket */
1746 ptr++;
1747 while (*ptr != ')') ptr++;
1748 continue;
1749
1750 case ':': /* Non-extracting bracket */
1751 bravalue = OP_BRA;
1752 ptr++;
1753 break;
1754
1755 case '(':
1756 bravalue = OP_COND; /* Conditional group */
/* A condition that starts with a digit is a group number: it is compiled as
an OP_CREF item placed just after the 3 bytes reserved at the start of the
group, and skipbytes tells compile_regex to step over it. Otherwise ptr is
moved back so that the '(' is compiled as the start of an ordinary nested
item. */
1757 if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
1758 {
1759 int condref = *ptr - '0';
1760 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
1761 if (condref == 0)
1762 {
1763 *errorptr = ERR35;
1764 goto FAILED;
1765 }
1766 ptr++;
1767 code[3] = OP_CREF;
1768 code[4] = condref >> 8;
1769 code[5] = condref & 255;
1770 skipbytes = 3;
1771 }
1772 else ptr--;
1773 break;
1774
1775 case '=': /* Positive lookahead */
1776 bravalue = OP_ASSERT;
1777 ptr++;
1778 break;
1779
1780 case '!': /* Negative lookahead */
1781 bravalue = OP_ASSERT_NOT;
1782 ptr++;
1783 break;
1784
1785 case '<': /* Lookbehinds */
1786 switch (*(++ptr))
1787 {
1788 case '=': /* Positive lookbehind */
1789 bravalue = OP_ASSERTBACK;
1790 ptr++;
1791 break;
1792
1793 case '!': /* Negative lookbehind */
1794 bravalue = OP_ASSERTBACK_NOT;
1795 ptr++;
1796 break;
1797
1798 default: /* Syntax error */
1799 *errorptr = ERR24;
1800 goto FAILED;
1801 }
1802 break;
1803
1804 case '>': /* One-time brackets */
1805 bravalue = OP_ONCE;
1806 ptr++;
1807 break;
1808
/* NOTE(review): this path does not change "previous", so a quantifier
straight after (?R) would act on whatever item came before it - presumably
the pre-compile pass rules this out; confirm. */
1809 case 'R': /* Pattern recursion */
1810 *code++ = OP_RECURSE;
1811 ptr++;
1812 continue;
1813
1814 default: /* Option setting */
1815 set = unset = 0;
1816 optset = &set;
1817
/* Letters seen before a '-' are collected in "set", those after it in "unset" */
1818 while (*ptr != ')' && *ptr != ':')
1819 {
1820 switch (*ptr++)
1821 {
1822 case '-': optset = &unset; break;
1823
1824 case 'i': *optset |= PCRE_CASELESS; break;
1825 case 'm': *optset |= PCRE_MULTILINE; break;
1826 case 's': *optset |= PCRE_DOTALL; break;
1827 case 'x': *optset |= PCRE_EXTENDED; break;
1828 case 'U': *optset |= PCRE_UNGREEDY; break;
1829 case 'X': *optset |= PCRE_EXTRA; break;
1830
1831 default:
1832 *errorptr = ERR12;
1833 goto FAILED;
1834 }
1835 }
1836
1837 /* Set up the changed option bits, but don't change anything yet. */
1838
1839 newoptions = (options | set) & (~unset);
1840
1841 /* If the options ended with ')' this is not the start of a nested
1842 group with option changes, so the options change at this level. At top
1843 level there is nothing else to be done (the options will in fact have
1844 been set from the start of compiling as a result of the first pass) but
1845 at an inner level we must compile code to change the ims options if
1846 necessary, and pass the new setting back so that it can be put at the
1847 start of any following branches, and when this group ends, a resetting
1848 item can be compiled. */
1849
1850 if (*ptr == ')')
1851 {
1852 if ((options & PCRE_INGROUP) != 0 &&
1853 (options & PCRE_IMS) != (newoptions & PCRE_IMS))
1854 {
1855 *code++ = OP_OPT;
1856 *code++ = *optchanged = newoptions & PCRE_IMS;
1857 }
1858 options = newoptions; /* Change options at this level */
1859 previous = NULL; /* This item can't be repeated */
1860 continue; /* It is complete */
1861 }
1862
1863 /* If the options ended with ':' we are heading into a nested group
1864 with possible change of options. Such groups are non-capturing and are
1865 not assertions of any kind. All we need to do is skip over the ':';
1866 the newoptions value is handled below. */
1867
1868 bravalue = OP_BRA;
1869 ptr++;
1870 }
1871 }
1872
1873 /* Else we have a referencing group; adjust the opcode. If the bracket
1874 number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
1875 arrange for the true number to follow later, in an OP_BRANUMBER item. */
1876
1877 else
1878 {
1879 if (++(*brackets) > EXTRACT_BASIC_MAX)
1880 {
1881 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
1882 code[3] = OP_BRANUMBER;
1883 code[4] = *brackets >> 8;
1884 code[5] = *brackets & 255;
1885 skipbytes = 3;
1886 }
1887 else bravalue = OP_BRA + *brackets;
1888 }
1889
1890 /* Process nested bracketed re. Assertions may not be repeated, but other
1891 kinds can be. We copy code into a non-register variable in order to be able
1892 to pass its address because some compilers complain otherwise. Pass in a
1893 new setting for the ims options if they have changed. */
1894
1895 previous = (bravalue >= OP_ONCE)? code : NULL;
1896 *code = bravalue;
1897 tempcode = code;
1898
1899 if (!compile_regex(
1900 options | PCRE_INGROUP, /* Set for all nested groups */
1901 ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
1902 newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
1903 brackets, /* Extracting bracket count */
1904 &tempcode, /* Where to put code (updated) */
1905 &ptr, /* Input pointer (updated) */
1906 errorptr, /* Where to put an error message */
1907 (bravalue == OP_ASSERTBACK ||
1908 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1909 skipbytes, /* Skip over OP_COND/OP_BRANUMBER */
1910 &subreqchar, /* For possible last char */
1911 &subcountlits, /* For literal count */
1912 cd)) /* Tables block */
1913 goto FAILED;
1914
1915 /* At the end of compiling, code is still pointing to the start of the
1916 group, while tempcode has been updated to point past the end of the group
1917 and any option resetting that may follow it. The pattern pointer (ptr)
1918 is on the bracket. */
1919
1920 /* If this is a conditional bracket, check that there are no more than
1921 two branches in the group. */
1922
/* The "else" below pairs with the "if (!compile_regex(...))" above. Branches
are counted by following the 2-byte length stored after each branch opcode
until the OP_KET is reached. */
1923 else if (bravalue == OP_COND)
1924 {
1925 uschar *tc = code;
1926 condcount = 0;
1927
1928 do {
1929 condcount++;
1930 tc += (tc[1] << 8) | tc[2];
1931 }
1932 while (*tc != OP_KET);
1933
1934 if (condcount > 2)
1935 {
1936 *errorptr = ERR27;
1937 goto FAILED;
1938 }
1939 }
1940
1941 /* Handle updating of the required character. If the subpattern didn't
1942 set one, leave it as it was. Otherwise, update it for normal brackets of
1943 all kinds, forward assertions, and conditions with two branches. Don't
1944 update the literal count for forward assertions, however. If the bracket
1945 is followed by a quantifier with zero repeat, we have to back off. Hence
1946 the definition of prevreqchar and subcountlits outside the main loop so
1947 that they can be accessed for the back off. */
1948
/* NOTE(review): the test is "> 0", so a required character of zero from the
group is ignored along with the negative "none" values - confirm intended. */
1949 if (subreqchar > 0 &&
1950 (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT ||
1951 (bravalue == OP_COND && condcount == 2)))
1952 {
1953 prevreqchar = *reqchar;
1954 *reqchar = subreqchar;
1955 if (bravalue != OP_ASSERT) *countlits += subcountlits;
1956 }
1957
1958 /* Now update the main code pointer to the end of the group. */
1959
1960 code = tempcode;
1961
1962 /* Error if hit end of pattern */
1963
1964 if (*ptr != ')')
1965 {
1966 *errorptr = ERR14;
1967 goto FAILED;
1968 }
1969 break;
1970
1971 /* Check \ for being a real metacharacter; if not, fall through and handle
1972 it as a data character at the start of a string. Escape items are checked
1973 for validity in the pre-compiling pass. */
1974
1975 case '\\':
1976 tempptr = ptr;
1977 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1978
1979 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1980 are arranged to be the negation of the corresponding OP_values. For the
1981 back references, the values are ESC_REF plus the reference number. Only
1982 back references and those types that consume a character may be repeated.
1983 We can test for values between ESC_b and ESC_Z for the latter; this may
1984 have to change if any new ones are ever created. */
1985
1986 if (c < 0)
1987 {
1988 if (-c >= ESC_REF)
1989 {
1990 int number = -c - ESC_REF;
1991 previous = code;
1992 *code++ = OP_REF;
1993 *code++ = number >> 8;
1994 *code++ = number & 255;
1995 }
1996 else
1997 {
1998 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
1999 *code++ = -c;
2000 }
2001 continue;
2002 }
2003
2004 /* Data character: reset and fall through */
2005
2006 ptr = tempptr;
2007 c = '\\';
2008
2009 /* Handle a run of data characters until a metacharacter is encountered.
2010 The first character is guaranteed not to be whitespace or # when the
2011 extended flag is set. */
2012
2013 NORMAL_CHAR:
2014 default:
2015 previous = code;
2016 *code = OP_CHARS;
/* Step over the opcode and the length byte; the length is filled in through
previous[1] once the run is complete. */
2017 code += 2;
2018 length = 0;
2019
2020 do
2021 {
2022 if ((options & PCRE_EXTENDED) != 0)
2023 {
2024 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2025 if (c == '#')
2026 {
2027 /* The space before the ; is to avoid a warning on a silly compiler
2028 on the Macintosh. */
2029 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2030 if (c == 0) break;
2031 continue;
2032 }
2033 }
2034
2035 /* Backslash may introduce a data char or a metacharacter. Escaped items
2036 are checked for validity in the pre-compiling pass. Stop the string
2037 before a metaitem. */
2038
2039 if (c == '\\')
2040 {
2041 tempptr = ptr;
2042 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
2043 if (c < 0) { ptr = tempptr; break; }
2044
2045 /* If a character is > 127 in UTF-8 mode, we have to turn it into
2046 two or more characters in the UTF-8 encoding. */
2047
2048 #ifdef SUPPORT_UTF8
2049 if (c > 127 && (options & PCRE_UTF8) != 0)
2050 {
2051 uschar buffer[8];
2052 int len = ord2utf8(c, buffer);
2053 for (c = 0; c < len; c++) *code++ = buffer[c];
2054 length += len;
2055 continue;
2056 }
2057 #endif
2058 }
2059
2060 /* Ordinary character or single-char escape */
2061
2062 *code++ = c;
2063 length++;
2064 }
2065
2066 /* This "while" is the end of the "do" above. */
2067
2068 while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
2069
2070 /* Update the last character and the count of literals */
2071
/* With two or more characters in the string, the one before the last becomes
the value to fall back to; with only one, the fallback is whatever *reqchar
held before this string. */
2072 prevreqchar = (length > 1)? code[-2] : *reqchar;
2073 *reqchar = code[-1];
2074 *countlits += length;
2075
2076 /* Compute the length and set it in the data vector, and advance to
2077 the next state. */
2078
2079 previous[1] = length;
/* Unless the run stopped because MAXLIT was reached, ptr is on the character
that ended it, so step back to let the main loop's ptr++ land on it again.
When MAXLIT was reached, the ++ptr in the loop condition was not evaluated. */
2080 if (length < MAXLIT) ptr--;
2081 break;
2082 }
2083 } /* end of big loop */
2084
2085 /* Control never reaches here by falling through, only by a goto for all the
2086 error states. Pass back the position in the pattern so that it can be displayed
2087 to the user for diagnosing the error. */
2088
2089 FAILED:
2090 *ptrptr = ptr;
2091 return FALSE;
2092 }
2093
2094
2095
2096
2097 /*************************************************
2098 * Compile sequence of alternatives *
2099 *************************************************/
2100
2101 /* On entry, ptr is pointing past the bracket character, but on return
2102 it points to the closing bracket, or vertical bar, or end of string.
2103 The code variable is pointing at the byte into which the BRA operator has been
2104 stored. If the ims options are changed at the start (for a (?ims: group) or
2105 during any branch, we need to insert an OP_OPT item at the start of every
2106 following branch to ensure they get set correctly at run time, and also pass
2107 the new options into every subsequent branch compile.
2108
2109 Arguments:
2110 options the option bits
2111 optchanged new ims options to set as if (?ims) were at the start, or -1
2112 for no change
2113 brackets -> int containing the number of extracting brackets used
2114 codeptr -> the address of the current code pointer
2115 ptrptr -> the address of the current pattern pointer
2116 errorptr -> pointer to error message
2117 lookbehind TRUE if this is a lookbehind assertion
2118 skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER)
2119 reqchar -> place to put the last required character, or a negative number
2120 countlits -> place to put the shortest literal count of any branch
2121 cd points to the data block with tables pointers
2122
2123 Returns: TRUE on success
2124 */
2125
2126 static BOOL
2127 compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
2128 const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes,
2129 int *reqchar, int *countlits, compile_data *cd)
2130 {
2131 const uschar *ptr = *ptrptr;
2132 uschar *code = *codeptr;
2133 uschar *last_branch = code;
2134 uschar *start_bracket = code;
2135 uschar *reverse_count = NULL;
2136 int oldoptions = options & PCRE_IMS;
2137 int branchreqchar, branchcountlits;
2138
2139 *reqchar = -1;
2140 *countlits = INT_MAX;
2141 code += 3 + skipbytes;
2142
2143 /* Loop for each alternative branch */
2144
2145 for (;;)
2146 {
2147 int length;
2148
2149 /* Handle change of options */
2150
2151 if (optchanged >= 0)
2152 {
2153 *code++ = OP_OPT;
2154 *code++ = optchanged;
2155 options = (options & ~PCRE_IMS) | optchanged;
2156 }
2157
2158 /* Set up dummy OP_REVERSE if lookbehind assertion */
2159
2160 if (lookbehind)
2161 {
2162 *code++ = OP_REVERSE;
2163 reverse_count = code;
2164 *code++ = 0;
2165 *code++ = 0;
2166 }
2167
2168 /* Now compile the branch */
2169
2170 if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged,
2171 &branchreqchar, &branchcountlits, cd))
2172 {
2173 *ptrptr = ptr;
2174 return FALSE;
2175 }
2176
2177 /* Fill in the length of the last branch */
2178
2179 length = code - last_branch;
2180 last_branch[1] = length >> 8;
2181 last_branch[2] = length & 255;
2182
2183 /* Save the last required character if all branches have the same; a current
2184 value of -1 means unset, while -2 means "previous branch had no last required
2185 char". */
2186
2187 if (*reqchar != -2)
2188 {
2189 if (branchreqchar >= 0)
2190 {
2191 if (*reqchar == -1) *reqchar = branchreqchar;
2192 else if (*reqchar != branchreqchar) *reqchar = -2;
2193 }
2194 else *reqchar = -2;
2195 }
2196
2197 /* Keep the shortest literal count */
2198
2199 if (branchcountlits < *countlits) *countlits = branchcountlits;
2200 DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits));
2201
2202 /* If lookbehind, check that this branch matches a fixed-length string,
2203 and put the length into the OP_REVERSE item. Temporarily mark the end of
2204 the branch with OP_END. */
2205
2206 if (lookbehind)
2207 {
2208 *code = OP_END;
2209 length = find_fixedlength(last_branch, options);
2210 DPRINTF(("fixed length = %d\n", length));
2211 if (length < 0)
2212 {
2213 *errorptr = ERR25;
2214 *ptrptr = ptr;
2215 return FALSE;
2216 }
2217 reverse_count[0] = (length >> 8);
2218 reverse_count[1] = length & 255;
2219 }
2220
2221 /* Reached end of expression, either ')' or end of pattern. Insert a
2222 terminating ket and the length of the whole bracketed item, and return,
2223 leaving the pointer at the terminating char. If any of the ims options
2224 were changed inside the group, compile a resetting op-code following. */
2225
2226 if (*ptr != '|')
2227 {
2228 length = code - start_bracket;
2229 *code++ = OP_KET;
2230 *code++ = length >> 8;
2231 *code++ = length & 255;
2232 if (optchanged >= 0)
2233 {
2234 *code++ = OP_OPT;
2235 *code++ = oldoptions;
2236 }
2237 *codeptr = code;
2238 *ptrptr = ptr;
2239 return TRUE;
2240 }
2241
2242 /* Another branch follows; insert an "or" node and advance the pointer. */
2243
2244 *code = OP_ALT;
2245 last_branch = code;
2246 code += 3;
2247 ptr++;
2248 }
2249 /* Control never reaches here */
2250 }
2251
2252
2253
2254
2255 /*************************************************
2256 * Find first significant op code *
2257 *************************************************/
2258
2259 /* This is called by several functions that scan a compiled expression looking
2260 for a fixed first character, or an anchoring op code etc. It skips over things
2261 that do not influence this. For one application, a change of caseless option is
2262 important.
2263
2264 Arguments:
2265 code pointer to the start of the group
2266 options pointer to external options
2267 optbit the option bit whose changing is significant, or
2268 zero if none are
2269 optstop TRUE to return on option change, otherwise change the options
2270 value and continue
2271
2272 Returns: pointer to the first significant opcode
2273 */
2274
2275 static const uschar*
2276 first_significant_code(const uschar *code, int *options, int optbit,
2277 BOOL optstop)
2278 {
2279 for (;;)
2280 {
2281 switch ((int)*code)
2282 {
2283 case OP_OPT:
2284 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
2285 {
2286 if (optstop) return code;
2287 *options = (int)code[1];
2288 }
2289 code += 2;
2290 break;
2291
2292 case OP_CREF:
2293 case OP_BRANUMBER:
2294 code += 3;
2295 break;
2296
2297 case OP_WORD_BOUNDARY:
2298 case OP_NOT_WORD_BOUNDARY:
2299 code++;
2300 break;
2301
2302 case OP_ASSERT_NOT:
2303 case OP_ASSERTBACK:
2304 case OP_ASSERTBACK_NOT:
2305 do code += (code[1] << 8) + code[2]; while (*code == OP_ALT);
2306 code += 3;
2307 break;
2308
2309 default:
2310 return code;
2311 }
2312 }
2313 /* Control never reaches here */
2314 }
2315
2316
2317
2318
2319 /*************************************************
2320 * Check for anchored expression *
2321 *************************************************/
2322
2323 /* Try to find out if this is an anchored regular expression. Consider each
2324 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2325 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2326 it's anchored. However, if this is a multiline pattern, then only OP_SOD
2327 counts, since OP_CIRC can match in the middle.
2328
2329 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
2330 because that will try the rest of the pattern at all possible matching points,
2331 so there is no point trying them again.
2332
2333 Arguments:
2334 code points to start of expression (the bracket)
2335 options points to the options setting
2336
2337 Returns: TRUE or FALSE
2338 */
2339
2340 static BOOL
2341 is_anchored(register const uschar *code, int *options)
2342 {
2343 do {
2344 const uschar *scode = first_significant_code(code + 3, options,
2345 PCRE_MULTILINE, FALSE);
2346 register int op = *scode;
2347 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2348 { if (!is_anchored(scode, options)) return FALSE; }
2349 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
2350 (*options & PCRE_DOTALL) != 0)
2351 { if (scode[1] != OP_ANY) return FALSE; }
2352 else if (op != OP_SOD &&
2353 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
2354 return FALSE;
2355 code += (code[1] << 8) + code[2];
2356 }
2357 while (*code == OP_ALT);
2358 return TRUE;
2359 }
2360
2361
2362
2363 /*************************************************
2364 * Check for starting with ^ or .* *
2365 *************************************************/
2366
2367 /* This is called to find out if every branch starts with ^ or .* so that
2368 "first char" processing can be done to speed things up in multiline
2369 matching and for non-DOTALL patterns that start with .* (which must start at
2370 the beginning or after \n).
2371
2372 Argument: points to start of expression (the bracket)
2373 Returns: TRUE or FALSE
2374 */
2375
2376 static BOOL
2377 is_startline(const uschar *code)
2378 {
2379 do {
2380 const uschar *scode = first_significant_code(code + 3, NULL, 0, FALSE);
2381 register int op = *scode;
2382 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2383 { if (!is_startline(scode)) return FALSE; }
2384 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2385 { if (scode[1] != OP_ANY) return FALSE; }
2386 else if (op != OP_CIRC) return FALSE;
2387 code += (code[1] << 8) + code[2];
2388 }
2389 while (*code == OP_ALT);
2390 return TRUE;
2391 }
2392
2393
2394
2395 /*************************************************
2396 * Check for fixed first char *
2397 *************************************************/
2398
2399 /* Try to find out if there is a fixed first character. This is called for
2400 unanchored expressions, as it speeds up their processing quite considerably.
2401 Consider each alternative branch. If they all start with the same char, or with
2402 a bracket all of whose alternatives start with the same char (recurse ad lib),
2403 then we return that char, otherwise -1.
2404
2405 Arguments:
2406 code points to start of expression (the bracket)
2407 options pointer to the options (used to check casing changes)
2408
2409 Returns: -1 or the fixed first char
2410 */
2411
2412 static int
2413 find_firstchar(const uschar *code, int *options)
2414 {
2415 register int c = -1;
2416 do {
2417 int d;
2418 const uschar *scode = first_significant_code(code + 3, options,
2419 PCRE_CASELESS, TRUE);
2420 register int op = *scode;
2421
2422 if (op >= OP_BRA) op = OP_BRA;
2423
2424 switch(op)
2425 {
2426 default:
2427 return -1;
2428
2429 case OP_BRA:
2430 case OP_ASSERT:
2431 case OP_ONCE:
2432 case OP_COND:
2433 if ((d = find_firstchar(scode, options)) < 0) return -1;
2434 if (c < 0) c = d; else if (c != d) return -1;
2435 break;
2436
2437 case OP_EXACT: /* Fall through */
2438 scode++;
2439
2440 case OP_CHARS: /* Fall through */
2441 scode++;
2442
2443 case OP_PLUS:
2444 case OP_MINPLUS:
2445 if (c < 0) c = scode[1]; else if (c != scode[1]) return -1;
2446 break;
2447 }
2448
2449 code += (code[1] << 8) + code[2];
2450 }
2451 while (*code == OP_ALT);
2452 return c;
2453 }
2454
2455
2456
2457
2458
2459 /*************************************************
2460 * Compile a Regular Expression *
2461 *************************************************/
2462
2463 /* This function takes a string and returns a pointer to a block of store
2464 holding a compiled version of the expression.
2465
2466 Arguments:
2467 pattern the regular expression
2468 options various option bits
2469 errorptr pointer to pointer to error text
2470 erroroffset ptr offset in pattern where error was detected
2471 tables pointer to character tables or NULL
2472
2473 Returns: pointer to compiled data block, or NULL on error,
2474 with errorptr and erroroffset set
2475 */
2476
2477 pcre *
2478 pcre_compile(const char *pattern, int options, const char **errorptr,
2479 int *erroroffset, const unsigned char *tables)
2480 {
2481 real_pcre *re;
2482 int length = 3; /* For initial BRA plus length */
2483 int runlength;
2484 int c, reqchar, countlits;
2485 int bracount = 0;
2486 int top_backref = 0;
2487 int branch_extra = 0;
2488 int branch_newextra;
2489 unsigned int brastackptr = 0;
2490 size_t size;
2491 uschar *code;
2492 const uschar *ptr;
2493 compile_data compile_block;
2494 int brastack[BRASTACK_SIZE];
2495 uschar bralenstack[BRASTACK_SIZE];
2496
2497 #ifdef DEBUG
2498 uschar *code_base, *code_end;
2499 #endif
2500
2501 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
2502
2503 #ifndef SUPPORT_UTF8
2504 if ((options & PCRE_UTF8) != 0)
2505 {
2506 *errorptr = ERR32;
2507 return NULL;
2508 }
2509 #endif
2510
2511 /* We can't pass back an error message if errorptr is NULL; I guess the best we
2512 can do is just return NULL. */
2513
2514 if (errorptr == NULL) return NULL;
2515 *errorptr = NULL;
2516
2517 /* However, we can give a message for this error */
2518
2519 if (erroroffset == NULL)
2520 {
2521 *errorptr = ERR16;
2522 return NULL;
2523 }
2524 *erroroffset = 0;
2525
2526 if ((options & ~PUBLIC_OPTIONS) != 0)
2527 {
2528 *errorptr = ERR17;
2529 return NULL;
2530 }
2531
2532 /* Set up pointers to the individual character tables */
2533
2534 if (tables == NULL) tables = pcre_default_tables;
2535 compile_block.lcc = tables + lcc_offset;
2536 compile_block.fcc = tables + fcc_offset;
2537 compile_block.cbits = tables + cbits_offset;
2538 compile_block.ctypes = tables + ctypes_offset;
2539
2540 /* Reflect pattern for debugging output */
2541
2542 DPRINTF(("------------------------------------------------------------------\n"));
2543 DPRINTF(("%s\n", pattern));
2544
2545 /* The first thing to do is to make a pass over the pattern to compute the
2546 amount of store required to hold the compiled code. This does not have to be
2547 perfect as long as errors are overestimates. At the same time we can detect any
2548 internal flag settings. Make an attempt to correct for any counted white space
2549 if an "extended" flag setting appears late in the pattern. We can't be so
2550 clever for #-comments. */
2551
2552 ptr = (const uschar *)(pattern - 1);
2553 while ((c = *(++ptr)) != 0)
2554 {
2555 int min, max;
2556 int class_charcount;
2557 int bracket_length;
2558
2559 if ((options & PCRE_EXTENDED) != 0)
2560 {
2561 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2562 if (c == '#')
2563 {
2564 /* The space before the ; is to avoid a warning on a silly compiler
2565 on the Macintosh. */
2566 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2567 continue;
2568 }
2569 }
2570
2571 switch(c)
2572 {
2573 /* A backslashed item may be an escaped "normal" character or a
2574 character type. For a "normal" character, put the pointers and
2575 character back so that tests for whitespace etc. in the input
2576 are done correctly. */
2577
2578 case '\\':
2579 {
2580 const uschar *save_ptr = ptr;
2581 c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
2582 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2583 if (c >= 0)
2584 {
2585 ptr = save_ptr;
2586 c = '\\';
2587 goto NORMAL_CHAR;
2588 }
2589 }
2590 length++;
2591
2592 /* A back reference needs an additional 2 bytes, plus either one or 5
2593 bytes for a repeat. We also need to keep the value of the highest
2594 back reference. */
2595
2596 if (c <= -ESC_REF)
2597 {
2598 int refnum = -c - ESC_REF;
2599 if (refnum > top_backref) top_backref = refnum;
2600 length += 2; /* For single back reference */
2601 if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2602 {
2603 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2604 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2605 if ((min == 0 && (max == 1 || max == -1)) ||
2606 (min == 1 && max == -1))
2607 length++;
2608 else length += 5;
2609 if (ptr[1] == '?') ptr++;
2610 }
2611 }
2612 continue;
2613
2614 case '^':
2615 case '.':
2616 case '$':
2617 case '*': /* These repeats won't be after brackets; */
2618 case '+': /* those are handled separately */
2619 case '?':
2620 length++;
2621 continue;
2622
2623 /* This covers the cases of repeats after a single char, metachar, class,
2624 or back reference. */
2625
2626 case '{':
2627 if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
2628 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
2629 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2630 if ((min == 0 && (max == 1 || max == -1)) ||
2631 (min == 1 && max == -1))
2632 length++;
2633 else
2634 {
2635 length--; /* Uncount the original char or metachar */
2636 if (min == 1) length++; else if (min > 0) length += 4;
2637 if (max > 0) length += 4; else length += 2;
2638 }
2639 if (ptr[1] == '?') ptr++;
2640 continue;
2641
2642 /* An alternation contains an offset to the next branch or ket. If any ims
2643 options changed in the previous branch(es), and/or if we are in a
2644 lookbehind assertion, extra space will be needed at the start of the
2645 branch. This is handled by branch_extra. */
2646
2647 case '|':
2648 length += 3 + branch_extra;
2649 continue;
2650
2651 /* A character class uses 33 characters. Don't worry about character types
2652 that aren't allowed in classes - they'll get picked up during the compile.
2653 A character class that contains only one character uses 2 or 3 bytes,
2654 depending on whether it is negated or not. Notice this where we can. */
2655
2656 case '[':
2657 class_charcount = 0;
2658 if (*(++ptr) == '^') ptr++;
2659 do
2660 {
2661 if (*ptr == '\\')
2662 {
2663 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
2664 &compile_block);
2665 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2666 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2667 }
2668 else class_charcount++;
2669 ptr++;
2670 }
2671 while (*ptr != 0 && *ptr != ']');
2672
2673 /* Repeats for negated single chars are handled by the general code */
2674
2675 if (class_charcount == 1) length += 3; else
2676 {
2677 length += 33;
2678
2679 /* A repeat needs either 1 or 5 bytes. */
2680
2681 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2682 {
2683 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2684 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2685 if ((min == 0 && (max == 1 || max == -1)) ||
2686 (min == 1 && max == -1))
2687 length++;
2688 else length += 5;
2689 if (ptr[1] == '?') ptr++;
2690 }
2691 }
2692 continue;
2693
2694 /* Brackets may be genuine groups or special things */
2695
2696 case '(':
2697 branch_newextra = 0;
2698 bracket_length = 3;
2699
2700 /* Handle special forms of bracket, which all start (? */
2701
2702 if (ptr[1] == '?')
2703 {
2704 int set, unset;
2705 int *optset;
2706
2707 switch (c = ptr[2])
2708 {
2709 /* Skip over comments entirely */
2710 case '#':
2711 ptr += 3;
2712 while (*ptr != 0 && *ptr != ')') ptr++;
2713 if (*ptr == 0)
2714 {
2715 *errorptr = ERR18;
2716 goto PCRE_ERROR_RETURN;
2717 }
2718 continue;
2719
2720 /* Non-referencing groups and lookaheads just move the pointer on, and
2721 then behave like a non-special bracket, except that they don't increment
2722 the count of extracting brackets. Ditto for the "once only" bracket,
2723 which is in Perl from version 5.005. */
2724
2725 case ':':
2726 case '=':
2727 case '!':
2728 case '>':
2729 ptr += 2;
2730 break;
2731
2732 /* A recursive call to the regex is an extension, to provide the
2733 facility which can be obtained by $(?p{perl-code}) in Perl 5.6. */
2734
2735 case 'R':
2736 if (ptr[3] != ')')
2737 {
2738 *errorptr = ERR29;
2739 goto PCRE_ERROR_RETURN;
2740 }
2741 ptr += 3;
2742 length += 1;
2743 break;
2744
2745 /* Lookbehinds are in Perl from version 5.005 */
2746
2747 case '<':
2748 if (ptr[3] == '=' || ptr[3] == '!')
2749 {
2750 ptr += 3;
2751 branch_newextra = 3;
2752 length += 3; /* For the first branch */
2753 break;
2754 }
2755 *errorptr = ERR24;
2756 goto PCRE_ERROR_RETURN;
2757
2758 /* Conditionals are in Perl from version 5.005. The bracket must either
2759 be followed by a number (for bracket reference) or by an assertion
2760 group. */
2761
2762 case '(':
2763 if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
2764 {
2765 ptr += 4;
2766 length += 3;
2767 while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
2768 if (*ptr != ')')
2769 {
2770 *errorptr = ERR26;
2771 goto PCRE_ERROR_RETURN;
2772 }
2773 }
2774 else /* An assertion must follow */
2775 {
2776 ptr++; /* Can treat like ':' as far as spacing is concerned */
2777 if (ptr[2] != '?' ||
2778 (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
2779 {
2780 ptr += 2; /* To get right offset in message */
2781 *errorptr = ERR28;
2782 goto PCRE_ERROR_RETURN;
2783 }
2784 }
2785 break;
2786
2787 /* Else loop checking valid options until ) is met. Anything else is an
2788 error. If we are without any brackets, i.e. at top level, the settings
2789 act as if specified in the options, so massage the options immediately.
2790 This is for backward compatibility with Perl 5.004. */
2791
2792 default:
2793 set = unset = 0;
2794 optset = &set;
2795 ptr += 2;
2796
2797 for (;; ptr++)
2798 {
2799 c = *ptr;
2800 switch (c)
2801 {
2802 case 'i':
2803 *optset |= PCRE_CASELESS;
2804 continue;
2805
2806 case 'm':
2807 *optset |= PCRE_MULTILINE;
2808 continue;
2809
2810 case 's':
2811 *optset |= PCRE_DOTALL;
2812 continue;
2813
2814 case 'x':
2815 *optset |= PCRE_EXTENDED;
2816 continue;
2817
2818 case 'X':
2819 *optset |= PCRE_EXTRA;
2820 continue;
2821
2822 case 'U':
2823 *optset |= PCRE_UNGREEDY;
2824 continue;
2825
2826 case '-':
2827 optset = &unset;
2828 continue;
2829
2830 /* A termination by ')' indicates an options-setting-only item;
2831 this is global at top level; otherwise nothing is done here and
2832 it is handled during the compiling process on a per-bracket-group
2833 basis. */
2834
2835 case ')':
2836 if (brastackptr == 0)
2837 {
2838 options = (options | set) & (~unset);
2839 set = unset = 0; /* To save length */
2840 }
2841 /* Fall through */
2842
2843 /* A termination by ':' indicates the start of a nested group with
2844 the given options set. This is again handled at compile time, but
2845 we must allow for compiled space if any of the ims options are
2846 set. We also have to allow for resetting space at the end of
2847 the group, which is why 4 is added to the length and not just 2.
2848 If there are several changes of options within the same group, this
2849 will lead to an over-estimate on the length, but this shouldn't
2850 matter very much. We also have to allow for resetting options at
2851 the start of any alternations, which we do by setting
2852 branch_newextra to 2. Finally, we record whether the case-dependent
2853 flag ever changes within the regex. This is used by the "required
2854 character" code. */
2855
2856 case ':':
2857 if (((set|unset) & PCRE_IMS) != 0)
2858 {
2859 length += 4;
2860 branch_newextra = 2;
2861 if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
2862 }
2863 goto END_OPTIONS;
2864
2865 /* Unrecognized option character */
2866
2867 default:
2868 *errorptr = ERR12;
2869 goto PCRE_ERROR_RETURN;
2870 }
2871 }
2872
2873 /* If we hit a closing bracket, that's it - this is a freestanding
2874 option-setting. We need to ensure that branch_extra is updated if
2875 necessary. The only values branch_newextra can have here are 0 or 2.
2876 If the value is 2, then branch_extra must either be 2 or 5, depending
2877 on whether this is a lookbehind group or not. */
2878
2879 END_OPTIONS:
2880 if (c == ')')
2881 {
2882 if (branch_newextra == 2 && (branch_extra == 0 || branch_extra == 3))
2883 branch_extra += branch_newextra;
2884 continue;
2885 }
2886
2887 /* If options were terminated by ':' control comes here. Fall through
2888 to handle the group below. */
2889 }
2890 }
2891
2892 /* Extracting brackets must be counted so we can process escapes in a
2893 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
2894 need an additional 3 bytes of store per extracting bracket. */
2895
2896 else
2897 {
2898 bracount++;
2899 if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
2900 }
2901
2902 /* Save length for computing whole length at end if there's a repeat that
2903 requires duplication of the group. Also save the current value of
2904 branch_extra, and start the new group with the new value. If non-zero, this
2905 will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
2906
2907 if (brastackptr >= sizeof(brastack)/sizeof(int))
2908 {
2909 *errorptr = ERR19;
2910 goto PCRE_ERROR_RETURN;
2911 }
2912
2913 bralenstack[brastackptr] = branch_extra;
2914 branch_extra = branch_newextra;
2915
2916 brastack[brastackptr++] = length;
2917 length += bracket_length;
2918 continue;
2919
2920 /* Handle ket. Look for subsequent max/min; for certain sets of values we
2921 have to replicate this bracket up to that many times. If brastackptr is
2922 0 this is an unmatched bracket which will generate an error, but take care
2923 not to try to access brastack[-1] when computing the length and restoring
2924 the branch_extra value. */
2925
2926 case ')':
2927 length += 3;
2928 {
2929 int minval = 1;
2930 int maxval = 1;
2931 int duplength;
2932
2933 if (brastackptr > 0)
2934 {
2935 duplength = length - brastack[--brastackptr];
2936 branch_extra = bralenstack[brastackptr];
2937 }
2938 else duplength = 0;
2939
2940 /* Leave ptr at the final char; for read_repeat_counts this happens
2941 automatically; for the others we need an increment. */
2942
2943 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
2944 {
2945 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr,
2946 &compile_block);
2947 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2948 }
2949 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2950 else if (c == '+') { maxval = -1; ptr++; }
2951 else if (c == '?') { minval = 0; ptr++; }
2952
2953 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2954 group, and if the maximum is greater than zero, we have to replicate
2955 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2956 bracket set - hence the 7. */
2957
2958 if (minval == 0)
2959 {
2960 length++;
2961 if (maxval > 0) length += (maxval - 1) * (duplength + 7);
2962 }
2963
2964 /* When the minimum is greater than zero, we have to replicate up to
2965 minval-1 times, with no additions required in the copies. Then, if
2966 there is a limited maximum we have to replicate up to maxval-1 times
2967 allowing for a BRAZERO item before each optional copy and nesting
2968 brackets for all but one of the optional copies. */
2969
2970 else
2971 {
2972 length += (minval - 1) * duplength;
2973 if (maxval > minval) /* Need this test as maxval=-1 means no limit */
2974 length += (maxval - minval) * (duplength + 7) - 6;
2975 }
2976 }
2977 continue;
2978
2979 /* Non-special character. For a run of such characters the length required
2980 is the number of characters + 2, except that the maximum run length is 255.
2981 We won't get a skipped space or a non-data escape or the start of a #
2982 comment as the first character, so the length can't be zero. */
2983
2984 NORMAL_CHAR:
2985 default:
2986 length += 2;
2987 runlength = 0;
2988 do
2989 {
2990 if ((options & PCRE_EXTENDED) != 0)
2991 {
2992 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2993 if (c == '#')
2994 {
2995 /* The space before the ; is to avoid a warning on a silly compiler
2996 on the Macintosh. */
2997 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2998 continue;
2999 }
3000 }
3001
3002 /* Backslash may introduce a data char or a metacharacter; stop the
3003 string before the latter. */
3004
3005 if (c == '\\')
3006 {
3007 const uschar *saveptr = ptr;
3008 c = check_escape(&ptr, errorptr, bracount, options, FALSE,
3009 &compile_block);
3010 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3011 if (c < 0) { ptr = saveptr; break; }
3012
3013 #ifdef SUPPORT_UTF8
3014 if (c > 127 && (options & PCRE_UTF8) != 0)
3015 {
3016 int i;
3017 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
3018 if (c <= utf8_table1[i]) break;
3019 runlength += i;
3020 }
3021 #endif
3022 }
3023
3024 /* Ordinary character or single-char escape */
3025
3026 runlength++;
3027 }
3028
3029 /* This "while" is the end of the "do" above. */
3030
3031 while (runlength < MAXLIT &&
3032 (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
3033
3034 ptr--;
3035 length += runlength;
3036 continue;
3037 }
3038 }
3039
3040 length += 4; /* For final KET and END */
3041
3042 if (length > 65539)
3043 {
3044 *errorptr = ERR20;
3045 return NULL;
3046 }
3047
3048 /* Compute the size of data block needed and get it, either from malloc or
3049 externally provided function. We specify "code[0]" in the offsetof() expression
3050 rather than just "code", because it has been reported that one broken compiler
3051 fails on "code" because it is also an independent variable. It should make no
3052 difference to the value of the offsetof(). */
3053
3054 size = length + offsetof(real_pcre, code[0]);
3055 re = (real_pcre *)(pcre_malloc)(size);
3056
3057 if (re == NULL)
3058 {
3059 *errorptr = ERR21;
3060 return NULL;
3061 }
3062
3063 /* Put in the magic number, and save the size, options, and table pointer */
3064
3065 re->magic_number = MAGIC_NUMBER;
3066 re->size = size;
3067 re->options = options;
3068 re->tables = tables;
3069
3070 /* Set up a starting, non-extracting bracket, then compile the expression. On
3071 error, *errorptr will be set non-NULL, so we don't need to look at the result
3072 of the function here. */
3073
3074 ptr = (const uschar *)pattern;
3075 code = re->code;
3076 *code = OP_BRA;
3077 bracount = 0;
3078 (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, 0,
3079 &reqchar, &countlits, &compile_block);
3080 re->top_bracket = bracount;
3081 re->top_backref = top_backref;
3082
3083 /* If not reached end of pattern on success, there's an excess bracket. */
3084
3085 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
3086
3087 /* Fill in the terminating state and check for disastrous overflow, but
3088 if debugging, leave the test till after things are printed out. */
3089
3090 *code++ = OP_END;
3091
3092 #ifndef DEBUG
3093 if (code - re->code > length) *errorptr = ERR23;
3094 #endif
3095
3096 /* Give an error if there's back reference to a non-existent capturing
3097 subpattern. */
3098
3099 if (top_backref > re->top_bracket) *errorptr = ERR15;
3100
3101 /* Failed to compile */
3102
3103 if (*errorptr != NULL)
3104 {
3105 (pcre_free)(re);
3106 PCRE_ERROR_RETURN:
3107 *erroroffset = ptr - (const uschar *)pattern;
3108 return NULL;
3109 }
3110
3111 /* If the anchored option was not passed, set flag if we can determine that the
3112 pattern is anchored by virtue of ^ characters or \A or anything else (such as
3113 starting with .* when DOTALL is set).
3114
3115 Otherwise, see if we can determine what the first character has to be, because
3116 that speeds up unanchored matches no end. If not, see if we can set the
3117 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
2118 start with ^, and also when all branches start with .* for non-DOTALL matches.
3119 */
3120
3121 if ((options & PCRE_ANCHORED) == 0)
3122 {
3123 int temp_options = options;
3124 if (is_anchored(re->code, &temp_options))
3125 re->options |= PCRE_ANCHORED;
3126 else
3127 {
3128 int ch = find_firstchar(re->code, &temp_options);
3129 if (ch >= 0)
3130 {
3131 re->first_char = ch;
3132 re->options |= PCRE_FIRSTSET;
3133 }
3134 else if (is_startline(re->code))
3135 re->options |= PCRE_STARTLINE;
3136 }
3137 }
3138
3139 /* Save the last required character if there are at least two literal
3140 characters on all paths, or if there is no first character setting. */
3141
3142 if (reqchar >= 0 && (countlits > 1 || (re->options & PCRE_FIRSTSET) == 0))
3143 {
3144 re->req_char = reqchar;
3145 re->options |= PCRE_REQCHSET;
3146 }
3147
3148 /* Print out the compiled data for debugging */
3149
3150 #ifdef DEBUG
3151
3152 printf("Length = %d top_bracket = %d top_backref = %d\n",
3153 length, re->top_bracket, re->top_backref);
3154
3155 if (re->options != 0)
3156 {
3157 printf("%s%s%s%s%s%s%s%s%s\n",
3158 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
3159 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
3160 ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
3161 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
3162 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
3163 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
3164 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
3165 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
3166 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
3167 }
3168
3169 if ((re->options & PCRE_FIRSTSET) != 0)
3170 {
3171 if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
3172 else printf("First char = \\x%02x\n", re->first_char);
3173 }
3174
3175 if ((re->options & PCRE_REQCHSET) != 0)
3176 {
3177 if (isprint(re->req_char)) printf("Req char = %c\n", re->req_char);
3178 else printf("Req char = \\x%02x\n", re->req_char);
3179 }
3180
3181 code_end = code;
3182 code_base = code = re->code;
3183
3184 while (code < code_end)
3185 {
3186 int charlength;
3187
3188 printf("%3d ", code - code_base);
3189
3190 if (*code >= OP_BRA)
3191 {
3192 if (*code - OP_BRA > EXTRACT_BASIC_MAX)
3193 printf("%3d Bra extra", (code[1] << 8) + code[2]);
3194 else
3195 printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
3196 code += 2;
3197 }
3198
3199 else switch(*code)
3200 {
3201 case OP_OPT:
3202 printf(" %.2x %s", code[1], OP_names[*code]);
3203 code++;
3204 break;
3205
3206 case OP_CHARS:
3207 charlength = *(++code);
3208 printf("%3d ", charlength);
3209 while (charlength-- > 0)
3210 if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
3211 break;
3212
3213 case OP_KETRMAX:
3214 case OP_KETRMIN:
3215 case OP_ALT:
3216 case OP_KET:
3217 case OP_ASSERT:
3218 case OP_ASSERT_NOT:
3219 case OP_ASSERTBACK:
3220 case OP_ASSERTBACK_NOT:
3221 case OP_ONCE:
3222 case OP_REVERSE:
3223 case OP_BRANUMBER:
3224 case OP_COND:
3225 case OP_CREF:
3226 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
3227 code += 2;
3228 break;
3229
3230 case OP_STAR:
3231 case OP_MINSTAR:
3232 case OP_PLUS:
3233 case OP_MINPLUS:
3234 case OP_QUERY:
3235 case OP_MINQUERY:
3236 case OP_TYPESTAR:
3237 case OP_TYPEMINSTAR:
3238 case OP_TYPEPLUS:
3239 case OP_TYPEMINPLUS:
3240 case OP_TYPEQUERY:
3241 case OP_TYPEMINQUERY:
3242 if (*code >= OP_TYPESTAR)
3243 printf(" %s", OP_names[code[1]]);
3244 else if (isprint(c = code[1])) printf(" %c", c);
3245 else printf(" \\x%02x", c);
3246 printf("%s", OP_names[*code++]);
3247 break;
3248
3249 case OP_EXACT:
3250 case OP_UPTO:
3251 case OP_MINUPTO:
3252 if (isprint(c = code[3])) printf(" %c{", c);
3253 else printf(" \\x%02x{", c);
3254 if (*code != OP_EXACT) printf("0,");
3255 printf("%d}", (code[1] << 8) + code[2]);
3256 if (*code == OP_MINUPTO) printf("?");
3257 code += 3;
3258 break;
3259
3260 case OP_TYPEEXACT:
3261 case OP_TYPEUPTO:
3262 case OP_TYPEMINUPTO:
3263 printf(" %s{", OP_names[code[3]]);
3264 if (*code != OP_TYPEEXACT) printf(",");
3265 printf("%d}", (code[1] << 8) + code[2]);
3266 if (*code == OP_TYPEMINUPTO) printf("?");
3267 code += 3;
3268 break;
3269
3270 case OP_NOT:
3271 if (isprint(c = *(++code))) printf(" [^%c]", c);
3272 else printf(" [^\\x%02x]", c);
3273 break;
3274
3275 case OP_NOTSTAR:
3276 case OP_NOTMINSTAR:
3277 case OP_NOTPLUS:
3278 case OP_NOTMINPLUS:
3279 case OP_NOTQUERY:
3280 case OP_NOTMINQUERY:
3281 if (isprint(c = code[1])) printf(" [^%c]", c);
3282 else printf(" [^\\x%02x]", c);
3283 printf("%s", OP_names[*code++]);
3284 break;
3285
3286 case OP_NOTEXACT:
3287 case OP_NOTUPTO:
3288 case OP_NOTMINUPTO:
3289 if (isprint(c = code[3])) printf(" [^%c]{", c);
3290 else printf(" [^\\x%02x]{", c);
3291 if (*code != OP_NOTEXACT) printf(",");
3292 printf("%d}", (code[1] << 8) + code[2]);
3293 if (*code == OP_NOTMINUPTO) printf("?");
3294 code += 3;
3295 break;
3296
3297 case OP_REF:
3298 printf(" \\%d", (code[1] << 8) | code[2]);
3299 code += 3;
3300 goto CLASS_REF_REPEAT;
3301
3302 case OP_CLASS:
3303 {
3304 int i, min, max;
3305 code++;
3306 printf(" [");
3307
3308 for (i = 0; i < 256; i++)
3309 {
3310 if ((code[i/8] & (1 << (i&7))) != 0)
3311 {
3312 int j;
3313 for (j = i+1; j < 256; j++)
3314 if ((code[j/8] & (1 << (j&7))) == 0) break;
3315 if (i == '-' || i == ']') printf("\\");
3316 if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
3317 if (--j > i)
3318 {
3319 printf("-");
3320 if (j == '-' || j == ']') printf("\\");
3321 if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
3322 }
3323 i = j;
3324 }
3325 }
3326 printf("]");
3327 code += 32;
3328
3329 CLASS_REF_REPEAT:
3330
3331 switch(*code)
3332 {
3333 case OP_CRSTAR:
3334 case OP_CRMINSTAR:
3335 case OP_CRPLUS:
3336 case OP_CRMINPLUS:
3337 case OP_CRQUERY:
3338 case OP_CRMINQUERY:
3339 printf("%s", OP_names[*code]);
3340 break;
3341
3342 case OP_CRRANGE:
3343 case OP_CRMINRANGE:
3344 min = (code[1] << 8) + code[2];
3345 max = (code[3] << 8) + code[4];
3346 if (max == 0) printf("{%d,}", min);
3347 else printf("{%d,%d}", min, max);
3348 if (*code == OP_CRMINRANGE) printf("?");
3349 code += 4;
3350 break;
3351
3352 default:
3353 code--;
3354 }
3355 }
3356 break;
3357
3358 /* Anything else is just a one-node item */
3359
3360 default:
3361 printf(" %s", OP_names[*code]);
3362 break;
3363 }
3364
3365 code++;
3366 printf("\n");
3367 }
3368 printf("------------------------------------------------------------------\n");
3369
3370 /* This check is done here in the debugging case so that the code that
3371 was compiled can be seen. */
3372
3373 if (code - re->code > length)
3374 {
3375 *errorptr = ERR23;
3376 (pcre_free)(re);
3377 *erroroffset = ptr - (uschar *)pattern;
3378 return NULL;
3379 }
3380 #endif
3381
3382 return (pcre *)re;
3383 }
3384
3385
3386
3387 /*************************************************
3388 * Match a back-reference *
3389 *************************************************/
3390
3391 /* If a back reference hasn't been set, the length that is passed is greater
3392 than the number of characters left in the string, so the match fails.
3393
3394 Arguments:
3395 offset index into the offset vector
3396 eptr points into the subject
3397 length length to be matched
3398 md points to match data block
3399 ims the ims flags
3400
3401 Returns: TRUE if matched
3402 */
3403
3404 static BOOL
3405 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
3406 unsigned long int ims)
3407 {
3408 const uschar *p = md->start_subject + md->offset_vector[offset];
3409
3410 #ifdef DEBUG
3411 if (eptr >= md->end_subject)
3412 printf("matching subject <null>");
3413 else
3414 {
3415 printf("matching subject ");
3416 pchars(eptr, length, TRUE, md);
3417 }
3418 printf(" against backref ");
3419 pchars(p, length, FALSE, md);
3420 printf("\n");
3421 #endif
3422
3423 /* Always fail if not enough characters left */
3424
3425 if (length > md->end_subject - eptr) return FALSE;
3426
3427 /* Separate the caselesss case for speed */
3428
3429 if ((ims & PCRE_CASELESS) != 0)
3430 {
3431 while (length-- > 0)
3432 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
3433 }
3434 else
3435 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
3436
3437 return TRUE;
3438 }
3439
3440
3441
3442 /*************************************************
3443 * Match from current position *
3444 *************************************************/
3445
3446 /* On entry ecode points to the first opcode, and eptr to the first character
3447 in the subject string, while eptrb holds the value of eptr at the start of the
3448 last bracketed group - used for breaking infinite loops matching zero-length
3449 strings.
3450
3451 Arguments:
3452 eptr pointer in subject
3453 ecode position in code
3454 offset_top current top pointer
3455 md pointer to "static" info for the match
3456 ims current /i, /m, and /s options
3457 eptrb pointer to chain of blocks containing eptr at start of
3458 brackets - for testing for empty matches
3459 flags can contain
3460 match_condassert - this is an assertion condition
3461 match_isgroup - this is the start of a bracketed group
3462
3463 Returns: TRUE if matched
3464 */
3465
3466 static BOOL
3467 match(register const uschar *eptr, register const uschar *ecode,
3468 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
3469 int flags)
3470 {
3471 unsigned long int original_ims = ims; /* Save for resetting on ')' */
3472 eptrblock newptrb;
3473
3474 /* At the start of a bracketed group, add the current subject pointer to the
3475 stack of such pointers, to be re-instated at the end of the group when we hit
3476 the closing ket. When match() is called in other circumstances, we don't add to
3477 the stack. */
3478
3479 if ((flags & match_isgroup) != 0)
3480 {
3481 newptrb.prev = eptrb;
3482 newptrb.saved_eptr = eptr;
3483 eptrb = &newptrb;
3484 }
3485
3486 /* Now start processing the operations. */
3487
3488 for (;;)
3489 {
3490 int op = (int)*ecode;
3491 int min, max, ctype;
3492 register int i;
3493 register int c;
3494 BOOL minimize = FALSE;
3495
3496 /* Opening capturing bracket. If there is space in the offset vector, save
3497 the current subject position in the working slot at the top of the vector. We
3498 mustn't change the current values of the data slot, because they may be set
3499 from a previous iteration of this group, and be referred to by a reference
3500 inside the group.
3501
3502 If the bracket fails to match, we need to restore this value and also the
3503 values of the final offsets, in case they were set by a previous iteration of
3504 the same bracket.
3505
3506 If there isn't enough space in the offset vector, treat this as if it were a
3507 non-capturing bracket. Don't worry about setting the flag for the error case
3508 here; that is handled in the code for KET. */
3509
3510 if (op > OP_BRA)
3511 {
3512 int offset;
3513 int number = op - OP_BRA;
3514
3515 /* For extended extraction brackets (large number), we have to fish out the
3516 number from a dummy opcode at the start. */
3517
3518 if (number > EXTRACT_BASIC_MAX) number = (ecode[4] << 8) | ecode[5];
3519 offset = number << 1;
3520
3521 #ifdef DEBUG
3522 printf("start bracket %d subject=", number);
3523 pchars(eptr, 16, TRUE, md);
3524 printf("\n");
3525 #endif
3526
3527 if (offset < md->offset_max)
3528 {
3529 int save_offset1 = md->offset_vector[offset];
3530 int save_offset2 = md->offset_vector[offset+1];
3531 int save_offset3 = md->offset_vector[md->offset_end - number];
3532
3533 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
3534 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
3535
3536 do
3537 {
3538 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3539 return TRUE;
3540 ecode += (ecode[1] << 8) + ecode[2];
3541 }
3542 while (*ecode == OP_ALT);
3543
3544 DPRINTF(("bracket %d failed\n", number));
3545
3546 md->offset_vector[offset] = save_offset1;
3547 md->offset_vector[offset+1] = save_offset2;
3548 md->offset_vector[md->offset_end - number] = save_offset3;
3549
3550 return FALSE;
3551 }
3552
3553 /* Insufficient room for saving captured contents */
3554
3555 else op = OP_BRA;
3556 }
3557
3558 /* Other types of node can be handled by a switch */
3559
3560 switch(op)
3561 {
3562 case OP_BRA: /* Non-capturing bracket: optimized */
3563 DPRINTF(("start bracket 0\n"));
3564 do
3565 {
3566 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3567 return TRUE;
3568 ecode += (ecode[1] << 8) + ecode[2];
3569 }
3570 while (*ecode == OP_ALT);
3571 DPRINTF(("bracket 0 failed\n"));
3572 return FALSE;
3573
3574 /* Conditional group: compilation checked that there are no more than
3575 two branches. If the condition is false, skipping the first branch takes us
3576 past the end if there is only one branch, but that's OK because that is
3577 exactly what going to the ket would do. */
3578
3579 case OP_COND:
3580 if (ecode[3] == OP_CREF) /* Condition is extraction test */
3581 {
3582 int offset = (ecode[4] << 9) | (ecode[5] << 1); /* Doubled ref number */
3583 return match(eptr,
3584 ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
3585 6 : 3 + (ecode[1] << 8) + ecode[2]),
3586 offset_top, md, ims, eptrb, match_isgroup);
3587 }
3588
3589 /* The condition is an assertion. Call match() to evaluate it - including
3590 match_condassert in the flags argument causes it to stop at the end of an assertion. */
3591
3592 else
3593 {
3594 if (match(eptr, ecode+3, offset_top, md, ims, NULL,
3595 match_condassert | match_isgroup))
3596 {
3597 ecode += 3 + (ecode[4] << 8) + ecode[5];
3598 while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
3599 }
3600 else ecode += (ecode[1] << 8) + ecode[2];
3601 return match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup);
3602 }
3603 /* Control never reaches here */
3604
3605 /* Skip over conditional reference or large extraction number data if
3606 encountered. */
3607
3608 case OP_CREF:
3609 case OP_BRANUMBER:
3610 ecode += 3;
3611 break;
3612
3613 /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
3614 an empty string - recursion will then try other alternatives, if any. */
3615
3616 case OP_END:
3617 if (md->notempty && eptr == md->start_match) return FALSE;
3618 md->end_match_ptr = eptr; /* Record where we ended */
3619 md->end_offset_top = offset_top; /* and how many extracts were taken */
3620 return TRUE;
3621
3622 /* Change option settings */
3623
3624 case OP_OPT:
3625 ims = ecode[1];
3626 ecode += 2;
3627 DPRINTF(("ims set to %02lx\n", ims));
3628 break;
3629
3630 /* Assertion brackets. Check the alternative branches in turn - the
3631 matching won't pass the KET for an assertion. If any one branch matches,
3632 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
3633 start of each branch to move the current point backwards, so the code at
3634 this level is identical to the lookahead case. */
3635
3636 case OP_ASSERT:
3637 case OP_ASSERTBACK:
3638 do
3639 {
3640 if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup)) break;
3641 ecode += (ecode[1] << 8) + ecode[2];
3642 }
3643 while (*ecode == OP_ALT);
3644 if (*ecode == OP_KET) return FALSE;
3645
3646 /* If checking an assertion for a condition, return TRUE. */
3647
3648 if ((flags & match_condassert) != 0) return TRUE;
3649
3650 /* Continue from after the assertion, updating the offsets high water
3651 mark, since extracts may have been taken during the assertion. */
3652
3653 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3654 ecode += 3;
3655 offset_top = md->end_offset_top;
3656 continue;
3657
3658 /* Negative assertion: all branches must fail to match */
3659
3660 case OP_ASSERT_NOT:
3661 case OP_ASSERTBACK_NOT:
3662 do
3663 {
3664 if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup))
3665 return FALSE;
3666 ecode += (ecode[1] << 8) + ecode[2];
3667 }
3668 while (*ecode == OP_ALT);
3669
3670 if ((flags & match_condassert) != 0) return TRUE;
3671
3672 ecode += 3;
3673 continue;
3674
3675 /* Move the subject pointer back. This occurs only at the start of
3676 each branch of a lookbehind assertion. If we are too close to the start to
3677 move back, this match function fails. When working with UTF-8 we move
3678 back a number of characters, not bytes. */
3679
3680 case OP_REVERSE:
3681 #ifdef SUPPORT_UTF8
3682 c = (ecode[1] << 8) + ecode[2];
3683 for (i = 0; i < c; i++)
3684 {
3685 eptr--;
3686 BACKCHAR(eptr)
3687 }
3688 #else
3689 eptr -= (ecode[1] << 8) + ecode[2];
3690 #endif
3691
3692 if (eptr < md->start_subject) return FALSE;
3693 ecode += 3;
3694 break;
3695
3696 /* Recursion matches the current regex, nested. If there are any capturing
3697 brackets started but not finished, we have to save their starting points
3698 and reinstate them after the recursion. However, we don't know how many
3699 such there are (offset_top records the completed total) so we just have
3700 to save all the potential data. There may be up to 99 such values, which
3701 is a bit large to put on the stack, but using malloc for small numbers
3702 seems expensive. As a compromise, the stack is used when there are fewer
3703 than 16 values to store; otherwise malloc is used. A problem is what to do
3704 if the malloc fails ... there is no way of returning to the top level with
3705 an error. Save the top 15 values on the stack, and accept that the rest
3706 may be wrong. */
3707
3708 case OP_RECURSE:
3709 {
3710 BOOL rc;
3711 int *save;
3712 int stacksave[15];
3713
3714 c = md->offset_max;
3715
3716 if (c < 16) save = stacksave; else
3717 {
3718 save = (int *)(pcre_malloc)((c+1) * sizeof(int));
3719 if (save == NULL)
3720 {
3721 save = stacksave;
3722 c = 15;
3723 }
3724 }
3725
3726 for (i = 1; i <= c; i++)
3727 save[i] = md->offset_vector[md->offset_end - i];
3728 rc = match(eptr, md->start_pattern, offset_top, md, ims, eptrb,
3729 match_isgroup);
3730 for (i = 1; i <= c; i++)
3731 md->offset_vector[md->offset_end - i] = save[i];
3732 if (save != stacksave) (pcre_free)(save);
3733 if (!rc) return FALSE;
3734
3735 /* In case the recursion has set more capturing values, save the final
3736 number, then move along the subject till after the recursive match,
3737 and advance one byte in the pattern code. */
3738
3739 offset_top = md->end_offset_top;
3740 eptr = md->end_match_ptr;
3741 ecode++;
3742 }
3743 break;
3744
3745 /* "Once" brackets are like assertion brackets except that after a match,
3746 the point in the subject string is not moved back. Thus there can never be
3747 a move back into the brackets. Check the alternative branches in turn - the
3748 matching won't pass the KET for this kind of subpattern. If any one branch
3749 matches, we carry on as at the end of a normal bracket, leaving the subject
3750 pointer. */
3751
3752 case OP_ONCE:
3753 {
3754 const uschar *prev = ecode;
3755 const uschar *saved_eptr = eptr;
3756
3757 do
3758 {
3759 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3760 break;
3761 ecode += (ecode[1] << 8) + ecode[2];
3762 }
3763 while (*ecode == OP_ALT);
3764
3765 /* If hit the end of the group (which could be repeated), fail */
3766
3767 if (*ecode != OP_ONCE && *ecode != OP_ALT) return FALSE;
3768
3769 /* Continue as from after the assertion, updating the offsets high water
3770 mark, since extracts may have been taken. */
3771
3772 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3773
3774 offset_top = md->end_offset_top;
3775 eptr = md->end_match_ptr;
3776
3777 /* For a non-repeating ket, just continue at this level. This also
3778 happens for a repeating ket if no characters were matched in the group.
3779 This is the forcible breaking of infinite loops as implemented in Perl
3780 5.005. If there is an options reset, it will get obeyed in the normal
3781 course of events. */
3782
3783 if (*ecode == OP_KET || eptr == saved_eptr)
3784 {
3785 ecode += 3;
3786 break;
3787 }
3788
3789 /* The repeating kets try the rest of the pattern or restart from the
3790 preceding bracket, in the appropriate order. We need to reset any options
3791 that changed within the bracket before re-running it, so check the next
3792 opcode. */
3793
3794 if (ecode[3] == OP_OPT)
3795 {
3796 ims = (ims & ~PCRE_IMS) | ecode[4];
3797 DPRINTF(("ims set to %02lx at group repeat\n", ims));
3798 }
3799
3800 if (*ecode == OP_KETRMIN)
3801 {
3802 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3803 match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3804 return TRUE;
3805 }
3806 else /* OP_KETRMAX */
3807 {
3808 if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3809 match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3810 }
3811 }
3812 return FALSE;
3813
3814 /* An alternation is the end of a branch; scan along to find the end of the
3815 bracketed group and go to there. */
3816
3817 case OP_ALT:
3818 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3819 break;
3820
3821 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3822 that it may occur zero times. It may repeat infinitely, or not at all -
3823 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3824 repeat limits are compiled as a number of copies, with the optional ones
3825 preceded by BRAZERO or BRAMINZERO. */
3826
3827 case OP_BRAZERO:
3828 {
3829 const uschar *next = ecode+1;
3830 if (match(eptr, next, offset_top, md, ims, eptrb, match_isgroup))
3831 return TRUE;
3832 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3833 ecode = next + 3;
3834 }
3835 break;
3836
3837 case OP_BRAMINZERO:
3838 {
3839 const uschar *next = ecode+1;
3840 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3841 if (match(eptr, next+3, offset_top, md, ims, eptrb, match_isgroup))
3842 return TRUE;
3843 ecode++;
3844 }
3845 break;
3846
3847 /* End of a group, repeated or non-repeating. If we are at the end of
3848 an assertion "group", stop matching and return TRUE, but record the
3849 current high water mark for use by positive assertions. Do this also
3850 for the "once" (non-backing-up) groups. */
3851
3852 case OP_KET:
3853 case OP_KETRMIN:
3854 case OP_KETRMAX:
3855 {
3856 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3857 const uschar *saved_eptr = eptrb->saved_eptr;
3858
3859 eptrb = eptrb->prev; /* Back up the stack of bracket start pointers */
3860
3861 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
3862 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
3863 *prev == OP_ONCE)
3864 {
3865 md->end_match_ptr = eptr; /* For ONCE */
3866 md->end_offset_top = offset_top;
3867 return TRUE;
3868 }
3869
3870 /* In all other cases except a conditional group we have to check the
3871 group number back at the start and if necessary complete handling an
3872 extraction by setting the offsets and bumping the high water mark. */
3873
3874 if (*prev != OP_COND)
3875 {
3876 int offset;
3877 int number = *prev - OP_BRA;
3878
3879 /* For extended extraction brackets (large number), we have to fish out
3880 the number from a dummy opcode at the start. */
3881
3882 if (number > EXTRACT_BASIC_MAX) number = (prev[4] << 8) | prev[5];
3883 offset = number << 1;
3884
3885 #ifdef DEBUG
3886 printf("end bracket %d", number);
3887 printf("\n");
3888 #endif
3889
3890 if (number > 0)
3891 {
3892 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
3893 {
3894 md->offset_vector[offset] =
3895 md->offset_vector[md->offset_end - number];
3896 md->offset_vector[offset+1] = eptr - md->start_subject;
3897 if (offset_top <= offset) offset_top = offset + 2;
3898 }
3899 }
3900 }
3901
3902 /* Reset the value of the ims flags, in case they got changed during
3903 the group. */
3904
3905 ims = original_ims;
3906 DPRINTF(("ims reset to %02lx\n", ims));
3907
3908 /* For a non-repeating ket, just continue at this level. This also
3909 happens for a repeating ket if no characters were matched in the group.
3910 This is the forcible breaking of infinite loops as implemented in Perl
3911 5.005. If there is an options reset, it will get obeyed in the normal
3912 course of events. */
3913
3914 if (*ecode == OP_KET || eptr == saved_eptr)
3915 {
3916 ecode += 3;
3917 break;
3918 }
3919
3920 /* The repeating kets try the rest of the pattern or restart from the
3921 preceding bracket, in the appropriate order. */
3922
3923 if (*ecode == OP_KETRMIN)
3924 {
3925 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3926 match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3927 return TRUE;
3928 }
3929 else /* OP_KETRMAX */
3930 {
3931 if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3932 match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3933 }
3934 }
3935 return FALSE;
3936
3937 /* Start of subject unless notbol, or after internal newline if multiline */
3938
3939 case OP_CIRC:
3940 if (md->notbol && eptr == md->start_subject) return FALSE;
3941 if ((ims & PCRE_MULTILINE) != 0)
3942 {
3943 if (eptr != md->start_subject && eptr[-1] != NEWLINE) return FALSE;
3944 ecode++;
3945 break;
3946 }
3947 /* ... else fall through */
3948
3949 /* Start of subject assertion */
3950
3951 case OP_SOD:
3952 if (eptr != md->start_subject) return FALSE;
3953 ecode++;
3954 break;
3955
3956 /* Assert before internal newline if multiline, or before a terminating
3957 newline unless endonly is set, else end of subject unless noteol is set. */
3958
3959 case OP_DOLL:
3960 if ((ims & PCRE_MULTILINE) != 0)
3961 {
3962 if (eptr < md->end_subject) { if (*eptr != NEWLINE) return FALSE; }
3963 else { if (md->noteol) return FALSE; }
3964 ecode++;
3965 break;
3966 }
3967 else
3968 {
3969 if (md->noteol) return FALSE;
3970 if (!md->endonly)
3971 {
3972 if (eptr < md->end_subject - 1 ||
3973 (eptr == md->end_subject - 1 && *eptr != NEWLINE)) return FALSE;
3974
3975 ecode++;
3976 break;
3977 }
3978 }
3979 /* ... else fall through */
3980
3981 /* End of subject assertion (\z) */
3982
3983 case OP_EOD:
3984 if (eptr < md->end_subject) return FALSE;
3985 ecode++;
3986 break;
3987
3988 /* End of subject or ending \n assertion (\Z) */
3989
3990 case OP_EODN:
3991 if (eptr < md->end_subject - 1 ||
3992 (eptr == md->end_subject - 1 && *eptr != NEWLINE)) return FALSE;
3993 ecode++;
3994 break;
3995
3996 /* Word boundary assertions */
3997
3998 case OP_NOT_WORD_BOUNDARY:
3999 case OP_WORD_BOUNDARY:
4000 {
4001 BOOL prev_is_word = (eptr != md->start_subject) &&
4002 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
4003 BOOL cur_is_word = (eptr < md->end_subject) &&
4004 ((md->ctypes[*eptr] & ctype_word) != 0);
4005 if ((*ecode++ == OP_WORD_BOUNDARY)?
4006 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
4007 return FALSE;
4008 }
4009 break;
4010
4011 /* Match a single character type; inline for speed */
4012
4013 case OP_ANY:
4014 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
4015 return FALSE;
4016 if (eptr++ >= md->end_subject) return FALSE;
4017 #ifdef SUPPORT_UTF8
4018 if (md->utf8)
4019 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4020 #endif
4021 ecode++;
4022 break;
4023
4024 case OP_NOT_DIGIT:
4025 if (eptr >= md->end_subject ||
4026 (md->ctypes[*eptr++] & ctype_digit) != 0)
4027 return FALSE;
4028 ecode++;
4029 break;
4030
4031 case OP_DIGIT:
4032 if (eptr >= md->end_subject ||
4033 (md->ctypes[*eptr++] & ctype_digit) == 0)
4034 return FALSE;
4035 ecode++;
4036 break;
4037
4038 case OP_NOT_WHITESPACE:
4039 if (eptr >= md->end_subject ||
4040 (md->ctypes[*eptr++] & ctype_space) != 0)
4041 return FALSE;
4042 ecode++;
4043 break;
4044
4045 case OP_WHITESPACE:
4046 if (eptr >= md->end_subject ||
4047 (md->ctypes[*eptr++] & ctype_space) == 0)
4048 return FALSE;
4049 ecode++;
4050 break;
4051
4052 case OP_NOT_WORDCHAR:
4053 if (eptr >= md->end_subject ||
4054 (md->ctypes[*eptr++] & ctype_word) != 0)
4055 return FALSE;
4056 ecode++;
4057 break;
4058
4059 case OP_WORDCHAR:
4060 if (eptr >= md->end_subject ||
4061 (md->ctypes[*eptr++] & ctype_word) == 0)
4062 return FALSE;
4063 ecode++;
4064 break;
4065
4066 /* Match a back reference, possibly repeatedly. Look past the end of the
4067 item to see if there is repeat information following. The code is similar
4068 to that for character classes, but repeated for efficiency. Then obey
4069 similar code to character type repeats - written out again for speed.
4070 However, if the referenced string is the empty string, always treat
4071 it as matched, any number of times (otherwise there could be infinite
4072 loops). */
4073
4074 case OP_REF:
4075 {
4076 int length;
4077 int offset = (ecode[1] << 9) | (ecode[2] << 1); /* Doubled ref number */
4078 ecode += 3; /* Advance past item */
4079
4080 /* If the reference is unset, set the length to be longer than the amount
4081 of subject left; this ensures that every attempt at a match fails. We
4082 can't just fail here, because of the possibility of quantifiers with zero
4083 minima. */
4084
4085 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
4086 md->end_subject - eptr + 1 :
4087 md->offset_vector[offset+1] - md->offset_vector[offset];
4088
4089 /* Set up for repetition, or handle the non-repeated case */
4090
4091 switch (*ecode)
4092 {
4093 case OP_CRSTAR:
4094 case OP_CRMINSTAR:
4095 case OP_CRPLUS:
4096 case OP_CRMINPLUS:
4097 case OP_CRQUERY:
4098 case OP_CRMINQUERY:
4099 c = *ecode++ - OP_CRSTAR;
4100 minimize = (c & 1) != 0;
4101 min = rep_min[c]; /* Pick up values from tables; */
4102 max = rep_max[c]; /* zero for max => infinity */
4103 if (max == 0) max = INT_MAX;
4104 break;
4105
4106 case OP_CRRANGE:
4107 case OP_CRMINRANGE:
4108 minimize = (*ecode == OP_CRMINRANGE);
4109 min = (ecode[1] << 8) + ecode[2];
4110 max = (ecode[3] << 8) + ecode[4];
4111 if (max == 0) max = INT_MAX;
4112 ecode += 5;
4113 break;
4114
4115 default: /* No repeat follows */
4116 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
4117 eptr += length;
4118 continue; /* With the main loop */
4119 }
4120
4121 /* If the length of the reference is zero, just continue with the
4122 main loop. */
4123
4124 if (length == 0) continue;
4125
4126 /* First, ensure the minimum number of matches are present. We get back
4127 the length of the reference string explicitly rather than passing the
4128 address of eptr, so that eptr can be a register variable. */
4129
4130 for (i = 1; i <= min; i++)
4131 {
4132 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
4133 eptr += length;
4134 }
4135
4136 /* If min = max, continue at the same level without recursion.
4137 They are not both allowed to be zero. */
4138
4139 if (min == max) continue;
4140
4141 /* If minimizing, keep trying and advancing the pointer */
4142
4143 if (minimize)
4144 {
4145 for (i = min;; i++)
4146 {
4147 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4148 return TRUE;
4149 if (i >= max || !match_ref(offset, eptr, length, md, ims))
4150 return FALSE;
4151 eptr += length;
4152 }
4153 /* Control never gets here */
4154 }
4155
4156 /* If maximizing, find the longest string and work backwards */
4157
4158 else
4159 {
4160 const uschar *pp = eptr;
4161 for (i = min; i < max; i++)
4162 {
4163 if (!match_ref(offset, eptr, length, md, ims)) break;
4164 eptr += length;
4165 }
4166 while (eptr >= pp)
4167 {
4168 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4169 return TRUE;
4170 eptr -= length;
4171 }
4172 return FALSE;
4173 }
4174 }
4175 /* Control never gets here */
4176
4177
4178
4179 /* Match a character class, possibly repeatedly. Look past the end of the
4180 item to see if there is repeat information following. Then obey similar
4181 code to character type repeats - written out again for speed. */
4182
4183 case OP_CLASS:
4184 {
4185 const uschar *data = ecode + 1; /* Save for matching */
4186 ecode += 33; /* Advance past the item */
4187
4188 switch (*ecode)
4189 {
4190 case OP_CRSTAR:
4191 case OP_CRMINSTAR:
4192 case OP_CRPLUS:
4193 case OP_CRMINPLUS:
4194 case OP_CRQUERY:
4195 case OP_CRMINQUERY:
4196 c = *ecode++ - OP_CRSTAR;
4197 minimize = (c & 1) != 0;
4198 min = rep_min[c]; /* Pick up values from tables; */
4199 max = rep_max[c]; /* zero for max => infinity */
4200 if (max == 0) max = INT_MAX;
4201 break;
4202
4203 case OP_CRRANGE:
4204 case OP_CRMINRANGE:
4205 minimize = (*ecode == OP_CRMINRANGE);
4206 min = (ecode[1] << 8) + ecode[2];
4207 max = (ecode[3] << 8) + ecode[4];
4208 if (max == 0) max = INT_MAX;
4209 ecode += 5;
4210 break;
4211
4212 default: /* No repeat follows */
4213 min = max = 1;
4214 break;
4215 }
4216
4217 /* First, ensure the minimum number of matches are present. */
4218
4219 for (i = 1; i <= min; i++)
4220 {
4221 if (eptr >= md->end_subject) return FALSE;
4222 GETCHARINC(c, eptr) /* Get character; increment eptr */
4223
4224 #ifdef SUPPORT_UTF8
4225 /* We do not yet support class members > 255 */
4226 if (c > 255) return FALSE;
4227 #endif
4228
4229 if ((data[c/8] & (1 << (c&7))) != 0) continue;
4230 return FALSE;
4231 }
4232
4233 /* If max == min we can continue with the main loop without the
4234 need to recurse. */
4235
4236 if (min == max) continue;
4237
4238 /* If minimizing, keep testing the rest of the expression and advancing
4239 the pointer while it matches the class. */
4240
4241 if (minimize)
4242 {
4243 for (i = min;; i++)
4244 {
4245 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4246 return TRUE;
4247 if (i >= max || eptr >= md->end_subject) return FALSE;
4248 GETCHARINC(c, eptr) /* Get character; increment eptr */
4249
4250 #ifdef SUPPORT_UTF8
4251 /* We do not yet support class members > 255 */
4252 if (c > 255) return FALSE;
4253 #endif
4254 if ((data[c/8] & (1 << (c&7))) != 0) continue;
4255 return FALSE;
4256 }
4257 /* Control never gets here */
4258 }
4259
4260 /* If maximizing, find the longest possible run, then work backwards. */
4261
4262 else
4263 {
4264 const uschar *pp = eptr;
4265 int len = 1;
4266 for (i = min; i < max; i++)
4267 {
4268 if (eptr >= md->end_subject) break;
4269 GETCHARLEN(c, eptr, len) /* Get character, set length if UTF-8 */
4270
4271 #ifdef SUPPORT_UTF8
4272 /* We do not yet support class members > 255 */
4273 if (c > 255) break;
4274 #endif
4275 if ((data[c/8] & (1 << (c&7))) == 0) break;
4276 eptr += len;
4277 }
4278
4279 while (eptr >= pp)
4280 {
4281 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4282 return TRUE;
4283
4284 #ifdef SUPPORT_UTF8
4285 BACKCHAR(eptr)
4286 #endif
4287 }
4288 return FALSE;
4289 }
4290 }
4291 /* Control never gets here */
4292
4293 /* Match a run of characters */
4294
4295 case OP_CHARS:
4296 {
4297 register int length = ecode[1];
4298 ecode += 2;
4299
4300 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4301 if (eptr >= md->end_subject)
4302 printf("matching subject <null> against pattern ");
4303 else
4304 {
4305 printf("matching subject ");
4306 pchars(eptr, length, TRUE, md);
4307 printf(" against pattern ");
4308 }
4309 pchars(ecode, length, FALSE, md);
4310 printf("\n");
4311 #endif
4312
4313 if (length > md->end_subject - eptr) return FALSE;
4314 if ((ims & PCRE_CASELESS) != 0)
4315 {
4316 while (length-- > 0)
4317 if (md->lcc[*ecode++] != md->lcc[*eptr++])
4318 return FALSE;
4319 }
4320 else
4321 {
4322 while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
4323 }
4324 }
4325 break;
4326
4327 /* Match a single character repeatedly; different opcodes share code. */
4328
4329 case OP_EXACT:
4330 min = max = (ecode[1] << 8) + ecode[2];
4331 ecode += 3;
4332 goto REPEATCHAR;
4333
4334 case OP_UPTO:
4335 case OP_MINUPTO:
4336 min = 0;
4337 max = (ecode[1] << 8) + ecode[2];
4338 minimize = *ecode == OP_MINUPTO;
4339 ecode += 3;
4340 goto REPEATCHAR;
4341
4342 case OP_STAR:
4343 case OP_MINSTAR:
4344 case OP_PLUS:
4345 case OP_MINPLUS:
4346 case OP_QUERY:
4347 case OP_MINQUERY:
4348 c = *ecode++ - OP_STAR;
4349 minimize = (c & 1) != 0;
4350 min = rep_min[c]; /* Pick up values from tables; */
4351 max = rep_max[c]; /* zero for max => infinity */
4352 if (max == 0) max = INT_MAX;
4353
4354 /* Common code for all repeated single-character matches. We can give
4355 up quickly if there are fewer than the minimum number of characters left in
4356 the subject. */
4357
4358 REPEATCHAR:
4359 if (min > md->end_subject - eptr) return FALSE;
4360 c = *ecode++;
4361
4362 /* The code is duplicated for the caseless and caseful cases, for speed,
4363 since matching characters is likely to be quite common. First, ensure the
4364 minimum number of matches are present. If min = max, continue at the same
4365 level without recursing. Otherwise, if minimizing, keep trying the rest of
4366 the expression and advancing one matching character if failing, up to the
4367 maximum. Alternatively, if maximizing, find the maximum number of
4368 characters and work backwards. */
4369
4370 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
4371 max, eptr));
4372
4373 if ((ims & PCRE_CASELESS) != 0)
4374 {
4375 c = md->lcc[c];
4376 for (i = 1; i <= min; i++)
4377 if (c != md->lcc[*eptr++]) return FALSE;
4378 if (min == max) continue;
4379 if (minimize)
4380 {
4381 for (i = min;; i++)
4382 {
4383 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4384 return TRUE;
4385 if (i >= max || eptr >= md->end_subject ||
4386 c != md->lcc[*eptr++])
4387 return FALSE;
4388 }
4389 /* Control never gets here */
4390 }
4391 else
4392 {
4393 const uschar *pp = eptr;
4394 for (i = min; i < max; i++)
4395 {
4396 if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
4397 eptr++;
4398 }
4399 while (eptr >= pp)
4400 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4401 return TRUE;
4402 return FALSE;
4403 }
4404 /* Control never gets here */
4405 }
4406
4407 /* Caseful comparisons */
4408
4409 else
4410 {
4411 for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
4412 if (min == max) continue;
4413 if (minimize)
4414 {
4415 for (i = min;; i++)
4416 {
4417 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4418 return TRUE;
4419 if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
4420 }
4421 /* Control never gets here */
4422 }
4423 else
4424 {
4425 const uschar *pp = eptr;
4426 for (i = min; i < max; i++)
4427 {
4428 if (eptr >= md->end_subject || c != *eptr) break;
4429 eptr++;
4430 }
4431 while (eptr >= pp)
4432 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4433 return TRUE;
4434 return FALSE;
4435 }
4436 }
4437 /* Control never gets here */
4438
4439 /* Match a negated single character */
4440
4441 case OP_NOT:
4442 if (eptr >= md->end_subject) return FALSE;
4443 ecode++;
4444 if ((ims & PCRE_CASELESS) != 0)
4445 {
4446 if (md->lcc[*ecode++] == md->lcc[*eptr++]) return FALSE;
4447 }
4448 else
4449 {
4450 if (*ecode++ == *eptr++) return FALSE;
4451 }
4452 break;
4453
4454 /* Match a negated single character repeatedly. This is almost a repeat of
4455 the code for a repeated single character, but I haven't found a nice way of
4456 commoning these up that doesn't require a test of the positive/negative
4457 option for each character match. Maybe that wouldn't add very much to the
4458 time taken, but character matching *is* what this is all about... */
4459
4460 case OP_NOTEXACT:
4461 min = max = (ecode[1] << 8) + ecode[2];
4462 ecode += 3;
4463 goto REPEATNOTCHAR;
4464
4465 case OP_NOTUPTO:
4466 case OP_NOTMINUPTO:
4467 min = 0;
4468 max = (ecode[1] << 8) + ecode[2];
4469 minimize = *ecode == OP_NOTMINUPTO;
4470 ecode += 3;
4471 goto REPEATNOTCHAR;
4472
4473 case OP_NOTSTAR:
4474 case OP_NOTMINSTAR:
4475 case OP_NOTPLUS:
4476 case OP_NOTMINPLUS:
4477 case OP_NOTQUERY:
4478 case OP_NOTMINQUERY:
4479 c = *ecode++ - OP_NOTSTAR;
4480 minimize = (c & 1) != 0;
4481 min = rep_min[c]; /* Pick up values from tables; */
4482 max = rep_max[c]; /* zero for max => infinity */
4483 if (max == 0) max = INT_MAX;
4484
4485 /* Common code for all repeated single-character matches. We can give
4486 up quickly if there are fewer than the minimum number of characters left in
4487 the subject. */
4488
4489 REPEATNOTCHAR:
4490 if (min > md->end_subject - eptr) return FALSE;
4491 c = *ecode++;
4492
4493 /* The code is duplicated for the caseless and caseful cases, for speed,
4494 since matching characters is likely to be quite common. First, ensure the
4495 minimum number of matches are present. If min = max, continue at the same
4496 level without recursing. Otherwise, if minimizing, keep trying the rest of
4497 the expression and advancing one matching character if failing, up to the
4498 maximum. Alternatively, if maximizing, find the maximum number of
4499 characters and work backwards. */
4500
4501 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
4502 max, eptr));
4503
4504 if ((ims & PCRE_CASELESS) != 0)
4505 {
4506 c = md->lcc[c];
4507 for (i = 1; i <= min; i++)
4508 if (c == md->lcc[*eptr++]) return FALSE;
4509 if (min == max) continue;
4510 if (minimize)
4511 {
4512 for (i = min;; i++)
4513 {
4514 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4515 return TRUE;
4516 if (i >= max || eptr >= md->end_subject ||
4517 c == md->lcc[*eptr++])
4518 return FALSE;
4519 }
4520 /* Control never gets here */
4521 }
4522 else
4523 {
4524 const uschar *pp = eptr;
4525 for (i = min; i < max; i++)
4526 {
4527 if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
4528 eptr++;
4529 }
4530 while (eptr >= pp)
4531 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4532 return TRUE;
4533 return FALSE;
4534 }
4535 /* Control never gets here */
4536 }
4537
4538 /* Caseful comparisons */
4539
4540 else
4541 {
4542 for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
4543 if (min == max) continue;
4544 if (minimize)
4545 {
4546 for (i = min;; i++)
4547 {
4548 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4549 return TRUE;
4550 if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
4551 }
4552 /* Control never gets here */
4553 }
4554 else
4555 {
4556 const uschar *pp = eptr;
4557 for (i = min; i < max; i++)
4558 {
4559 if (eptr >= md->end_subject || c == *eptr) break;
4560 eptr++;
4561 }
4562 while (eptr >= pp)
4563 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4564 return TRUE;
4565 return FALSE;
4566 }
4567 }
4568 /* Control never gets here */
4569
4570 /* Match a single character type repeatedly; several different opcodes
4571 share code. This is very similar to the code for single characters, but we
4572 repeat it in the interests of efficiency. */
4573
4574 case OP_TYPEEXACT:
4575 min = max = (ecode[1] << 8) + ecode[2];
4576 minimize = TRUE;
4577 ecode += 3;
4578 goto REPEATTYPE;
4579
4580 case OP_TYPEUPTO:
4581 case OP_TYPEMINUPTO:
4582 min = 0;
4583 max = (ecode[1] << 8) + ecode[2];
4584 minimize = *ecode == OP_TYPEMINUPTO;
4585 ecode += 3;
4586 goto REPEATTYPE;
4587
4588 case OP_TYPESTAR:
4589 case OP_TYPEMINSTAR:
4590 case OP_TYPEPLUS:
4591 case OP_TYPEMINPLUS:
4592 case OP_TYPEQUERY:
4593 case OP_TYPEMINQUERY:
4594 c = *ecode++ - OP_TYPESTAR;
4595 minimize = (c & 1) != 0;
4596 min = rep_min[c]; /* Pick up values from tables; */
4597 max = rep_max[c]; /* zero for max => infinity */
4598 if (max == 0) max = INT_MAX;
4599
4600 /* Common code for all repeated single character type matches */
4601
4602 REPEATTYPE:
4603 ctype = *ecode++; /* Code for the character type */
4604
4605 /* First, ensure the minimum number of matches are present. Use inline
4606 code for maximizing the speed, and do the type test once at the start
4607 (i.e. keep it out of the loop). Also we can test that there are at least
4608 the minimum number of bytes before we start, except when doing '.' in
4609 UTF8 mode. Leave the test in in all cases; in the special case we have
4610 to test after each character. */
4611
4612 if (min > md->end_subject - eptr) return FALSE;
4613 if (min > 0) switch(ctype)
4614 {
4615 case OP_ANY:
4616 #ifdef SUPPORT_UTF8
4617 if (md->utf8)
4618 {
4619 for (i = 1; i <= min; i++)
4620 {
4621 if (eptr >= md->end_subject ||
4622 (*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0))
4623 return FALSE;
4624 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4625 }
4626 break;
4627 }
4628 #endif
4629 /* Non-UTF8 can be faster */
4630 if ((ims & PCRE_DOTALL) == 0)
4631 { for (i = 1; i <= min; i++) if (*eptr++ == NEWLINE) return FALSE; }
4632 else eptr += min;
4633 break;
4634
4635 case OP_NOT_DIGIT:
4636 for (i = 1; i <= min; i++)
4637 if ((md->ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
4638 break;
4639
4640 case OP_DIGIT:
4641 for (i = 1; i <= min; i++)
4642 if ((md->ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
4643 break;
4644
4645 case OP_NOT_WHITESPACE:
4646 for (i = 1; i <= min; i++)
4647 if ((md->ctypes[*eptr++] & ctype_space) != 0) return FALSE;
4648 break;
4649
4650 case OP_WHITESPACE:
4651 for (i = 1; i <= min; i++)
4652 if ((md->ctypes[*eptr++] & ctype_space) == 0) return FALSE;
4653 break;
4654
4655 case OP_NOT_WORDCHAR:
4656 for (i = 1; i <= min; i++)
4657 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4658 return FALSE;
4659 break;
4660
4661 case OP_WORDCHAR:
4662 for (i = 1; i <= min; i++)
4663 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4664 return FALSE;
4665 break;
4666 }
4667
4668 /* If min = max, continue at the same level without recursing */
4669
4670 if (min == max) continue;
4671
4672 /* If minimizing, we have to test the rest of the pattern before each
4673 subsequent match. */
4674
4675 if (minimize)
4676 {
4677 for (i = min;; i++)
4678 {
4679 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0)) return TRUE;
4680 if (i >= max || eptr >= md->end_subject) return FALSE;
4681
4682 c = *eptr++;
4683 switch(ctype)
4684 {
4685 case OP_ANY:
4686 if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) return FALSE;
4687 #ifdef SUPPORT_UTF8
4688 if (md->utf8)
4689 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4690 #endif
4691 break;
4692
4693 case OP_NOT_DIGIT:
4694 if ((md->ctypes[c] & ctype_digit) != 0) return FALSE;
4695 break;
4696
4697 case OP_DIGIT:
4698 if ((md->ctypes[c] & ctype_digit) == 0) return FALSE;
4699 break;
4700
4701 case OP_NOT_WHITESPACE:
4702 if ((md->ctypes[c] & ctype_space) != 0) return FALSE;
4703 break;
4704
4705 case OP_WHITESPACE:
4706 if ((md->ctypes[c] & ctype_space) == 0) return FALSE;
4707 break;
4708
4709 case OP_NOT_WORDCHAR:
4710 if ((md->ctypes[c] & ctype_word) != 0) return FALSE;
4711 break;
4712
4713 case OP_WORDCHAR:
4714 if ((md->ctypes[c] & ctype_word) == 0) return FALSE;
4715 break;
4716 }
4717 }
4718 /* Control never gets here */
4719 }
4720
4721 /* If maximizing it is worth using inline code for speed, doing the type
4722 test once at the start (i.e. keep it out of the loop). */
4723
4724 else
4725 {
4726 const uschar *pp = eptr;
4727 switch(ctype)
4728 {
4729 case OP_ANY:
4730
4731 /* Special code is required for UTF8, but when the maximum is unlimited
4732 we don't need it. */
4733
4734 #ifdef SUPPORT_UTF8
4735 if (md->utf8 && max < INT_MAX)
4736 {
4737 if ((ims & PCRE_DOTALL) == 0)
4738 {
4739 for (i = min; i < max; i++)
4740 {
4741 if (eptr >= md->end_subject || *eptr++ == NEWLINE) break;
4742 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4743 }
4744 }
4745 else
4746 {
4747 for (i = min; i < max; i++)
4748 {
4749 eptr++;
4750 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4751 }
4752 }
4753 break;
4754 }
4755 #endif
4756 /* Non-UTF8 can be faster */
4757 if ((ims & PCRE_DOTALL) == 0)
4758 {
4759 for (i = min; i < max; i++)
4760 {
4761 if (eptr >= md->end_subject || *eptr == NEWLINE) break;
4762 eptr++;
4763 }
4764 }
4765 else
4766 {
4767 c = max - min;
4768 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
4769 eptr += c;
4770 }
4771 break;
4772
4773 case OP_NOT_DIGIT:
4774 for (i = min; i < max; i++)
4775 {
4776 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4777 break;
4778 eptr++;
4779 }
4780 break;
4781
4782 case OP_DIGIT:
4783 for (i = min; i < max; i++)
4784 {
4785 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4786 break;
4787 eptr++;
4788 }
4789 break;
4790
4791 case OP_NOT_WHITESPACE:
4792 for (i = min; i < max; i++)
4793 {
4794 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4795 break;
4796 eptr++;
4797 }
4798 break;
4799
4800 case OP_WHITESPACE:
4801 for (i = min; i < max; i++)
4802 {
4803 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4804 break;
4805 eptr++;
4806 }
4807 break;
4808
4809 case OP_NOT_WORDCHAR:
4810 for (i = min; i < max; i++)
4811 {
4812 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4813 break;
4814 eptr++;
4815 }
4816 break;
4817
4818 case OP_WORDCHAR:
4819 for (i = min; i < max; i++)
4820 {
4821 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4822 break;
4823 eptr++;
4824 }
4825 break;
4826 }
4827
4828 while (eptr >= pp)
4829 {
4830 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4831 return TRUE;
4832 #ifdef SUPPORT_UTF8
4833 if (md->utf8)
4834 while (eptr > pp && (*eptr & 0xc0) == 0x80) eptr--;
4835 #endif
4836 }
4837 return FALSE;
4838 }
4839 /* Control never gets here */
4840
4841 /* There's been some horrible disaster. */
4842
4843 default:
4844 DPRINTF(("Unknown opcode %d\n", *ecode));
4845 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
4846 return FALSE;
4847 }
4848
4849 /* Do not stick any code in here without much thought; it is assumed
4850 that "continue" in the code above comes out to here to repeat the main
4851 loop. */
4852
4853 } /* End of main loop */
4854 /* Control never reaches here */
4855 }
4856
4857
4858
4859
4860 /*************************************************
4861 * Execute a Regular Expression *
4862 *************************************************/
4863
4864 /* This function applies a compiled re to a subject string and picks out
4865 portions of the string if it matches. Two elements in the vector are set for
4866 each substring: the offsets to the start and end of the substring.
4867
4868 Arguments:
4869 external_re points to the compiled expression
4870 external_extra points to "hints" from pcre_study() or is NULL
4871 subject points to the subject string
4872 length length of subject string (may contain binary zeros)
4873 start_offset where to start in the subject string
4874 options option bits
4875 offsets points to a vector of ints to be filled in with offsets
4876 offsetcount the number of elements in the vector
4877
4878 Returns: > 0 => success; value is the number of elements filled in
4879 = 0 => success, but offsets is not big enough
4880 -1 => failed to match
4881 < -1 => some kind of unexpected problem
4882 */
4883
4884 int
4885 pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
4886 const char *subject, int length, int start_offset, int options, int *offsets,
4887 int offsetcount)
4888 {
4889 int resetcount, ocount;
4890 int first_char = -1;
4891 int req_char = -1;
4892 int req_char2 = -1;
4893 unsigned long int ims = 0;
4894 match_data match_block;
4895 const uschar *start_bits = NULL;
4896 const uschar *start_match = (const uschar *)subject + start_offset;
4897 const uschar *end_subject;
4898 const uschar *req_char_ptr = start_match - 1;
4899 const real_pcre *re = (const real_pcre *)external_re;
4900 const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
4901 BOOL using_temporary_offsets = FALSE;
4902 BOOL anchored;
4903 BOOL startline;
4904
4905 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4906
4907 if (re == NULL || subject == NULL ||
4908 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4909 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4910
4911 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4912 startline = (re->options & PCRE_STARTLINE) != 0;
4913
4914 match_block.start_pattern = re->code;
4915 match_block.start_subject = (const uschar *)subject;
4916 match_block.end_subject = match_block.start_subject + length;
4917 end_subject = match_block.end_subject;
4918
4919 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4920 match_block.utf8 = (re->options & PCRE_UTF8) != 0;
4921
4922 match_block.notbol = (options & PCRE_NOTBOL) != 0;
4923 match_block.noteol = (options & PCRE_NOTEOL) != 0;
4924 match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
4925
4926 match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
4927
4928 match_block.lcc = re->tables + lcc_offset;
4929 match_block.ctypes = re->tables + ctypes_offset;
4930
4931 /* The ims options can vary during the matching as a result of the presence
4932 of (?ims) items in the pattern. They are kept in a local variable so that
4933 restoring at the exit of a group is easy. */
4934
4935 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4936
4937 /* If the expression has got more back references than the offsets supplied can
4938 hold, we get a temporary bit of working store to use during the matching.
4939 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4940 of 3. */
4941
4942 ocount = offsetcount - (offsetcount % 3);
4943
4944 if (re->top_backref > 0 && re->top_backref >= ocount/3)
4945 {
4946 ocount = re->top_backref * 3 + 3;
4947 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4948 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4949 using_temporary_offsets = TRUE;
4950 DPRINTF(("Got memory to hold back references\n"));
4951 }
4952 else match_block.offset_vector = offsets;
4953
4954 match_block.offset_end = ocount;
4955 match_block.offset_max = (2*ocount)/3;
4956 match_block.offset_overflow = FALSE;
4957
4958 /* Compute the minimum number of offsets that we need to reset each time. Doing
4959 this makes a huge difference to execution time when there aren't many brackets
4960 in the pattern. */
4961
4962 resetcount = 2 + re->top_bracket * 2;
4963 if (resetcount > offsetcount) resetcount = ocount;
4964
4965 /* Reset the working variable associated with each extraction. These should
4966 never be used unless previously set, but they get saved and restored, and so we
4967 initialize them to avoid reading uninitialized locations. */
4968
4969 if (match_block.offset_vector != NULL)
4970 {
4971 register int *iptr = match_block.offset_vector + ocount;
4972 register int *iend = iptr - resetcount/2 + 1;
4973 while (--iptr >= iend) *iptr = -1;
4974 }
4975
4976 /* Set up the first character to match, if available. The first_char value is
4977 never set for an anchored regular expression, but the anchoring may be forced
4978 at run time, so we have to test for anchoring. The first char may be unset for
4979 an unanchored pattern, of course. If there's no first char and the pattern was
4980 studied, there may be a bitmap of possible first characters. */
4981
4982 if (!anchored)
4983 {
4984 if ((re->options & PCRE_FIRSTSET) != 0)
4985 {
4986 first_char = re->first_char;
4987 if ((ims & PCRE_CASELESS) != 0) first_char = match_block.lcc[first_char];
4988 }
4989 else
4990 if (!startline && extra != NULL &&
4991 (extra->options & PCRE_STUDY_MAPPED) != 0)
4992 start_bits = extra->start_bits;
4993 }
4994
4995 /* For anchored or unanchored matches, there may be a "last known required
4996 character" set. If the PCRE_CASELESS is set, implying that the match starts
4997 caselessly, or if there are any changes of this flag within the regex, set up
4998 both cases of the character. Otherwise set the two values the same, which will
4999 avoid duplicate testing (which takes significant time). This covers the vast
5000 majority of cases. It will be suboptimal when the case flag changes in a regex
5001 and the required character in fact is caseful. */
5002
5003 if ((re->options & PCRE_REQCHSET) != 0)
5004 {
5005 req_char = re->req_char;
5006 req_char2 = ((re->options & (PCRE_CASELESS | PCRE_ICHANGED)) != 0)?
5007 (re->tables + fcc_offset)[req_char] : req_char;
5008 }
5009
5010 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
5011 the loop runs just once. */
5012
5013 do
5014 {
5015 int rc;
5016 register int *iptr = match_block.offset_vector;
5017 register int *iend = iptr + resetcount;
5018
5019 /* Reset the maximum number of extractions we might see. */
5020
5021 while (iptr < iend) *iptr++ = -1;
5022
5023 /* Advance to a unique first char if possible */
5024
5025 if (first_char >= 0)
5026 {
5027 if ((ims & PCRE_CASELESS) != 0)
5028 while (start_match < end_subject &&
5029 match_block.lcc[*start_match] != first_char)
5030 start_match++;
5031 else
5032 while (start_match < end_subject && *start_match != first_char)
5033 start_match++;
5034 }
5035
5036 /* Or to just after \n for a multiline match if possible */
5037
5038 else if (startline)
5039 {
5040 if (start_match > match_block.start_subject + start_offset)
5041 {
5042 while (start_match < end_subject && start_match[-1] != NEWLINE)
5043 start_match++;
5044 }
5045 }
5046
5047 /* Or to a non-unique first char after study */
5048
5049 else if (start_bits != NULL)
5050 {
5051 while (start_match < end_subject)
5052 {
5053 register int c = *start_match;
5054 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
5055 }
5056 }
5057
5058 #ifdef DEBUG /* Sigh. Some compilers never learn. */
5059 printf(">>>> Match against: ");
5060 pchars(start_match, end_subject - start_match, TRUE, &match_block);
5061 printf("\n");
5062 #endif
5063
5064 /* If req_char is set, we know that that character must appear in the subject
5065 for the match to succeed. If the first character is set, req_char must be
5066 later in the subject; otherwise the test starts at the match point. This
5067 optimization can save a huge amount of backtracking in patterns with nested
5068 unlimited repeats that aren't going to match. We don't know what the state of
5069 case matching may be when this character is hit, so test for it in both its
5070 cases if necessary. However, the different cased versions will not be set up
5071 unless PCRE_CASELESS was given or the casing state changes within the regex.
5072 Writing separate code makes it go faster, as does using an autoincrement and
5073 backing off on a match. */
5074
5075 if (req_char >= 0)
5076 {
5077 register const uschar *p = start_match + ((first_char >= 0)? 1 : 0);
5078
5079 /* We don't need to repeat the search if we haven't yet reached the
5080 place we found it at last time. */
5081
5082 if (p > req_char_ptr)
5083 {
5084 /* Do a single test if no case difference is set up */
5085
5086 if (req_char == req_char2)
5087 {
5088 while (p < end_subject)
5089 {
5090 if (*p++ == req_char) { p--; break; }
5091 }
5092 }
5093
5094 /* Otherwise test for either case */
5095
5096 else
5097 {
5098 while (p < end_subject)
5099 {
5100 register int pp = *p++;
5101 if (pp == req_char || pp == req_char2) { p--; break; }
5102 }
5103 }
5104
5105 /* If we can't find the required character, break the matching loop */
5106
5107 if (p >= end_subject) break;
5108
5109 /* If we have found the required character, save the point where we
5110 found it, so that we don't search again next time round the loop if
5111 the start hasn't passed this character yet. */
5112
5113 req_char_ptr = p;
5114 }
5115 }
5116
5117 /* When a match occurs, substrings will be set for all internal extractions;
5118 we just need to set up the whole thing as substring 0 before returning. If
5119 there were too many extractions, set the return code to zero. In the case
5120 where we had to get some local store to hold offsets for backreferences, copy
5121 those back references that we can. In this case there need not be overflow
5122 if certain parts of the pattern were not used. */
5123
5124 match_block.start_match = start_match;
5125 if (!match(start_match, re->code, 2, &match_block, ims, NULL, match_isgroup))
5126 continue;
5127
5128 /* Copy the offset information from temporary store if necessary */
5129
5130 if (using_temporary_offsets)
5131 {
5132 if (offsetcount >= 4)
5133 {
5134 memcpy(offsets + 2, match_block.offset_vector + 2,
5135 (offsetcount - 2) * sizeof(int));
5136 DPRINTF(("Copied offsets from temporary memory\n"));
5137 }
5138 if (match_block.end_offset_top > offsetcount)
5139 match_block.offset_overflow = TRUE;
5140
5141 DPRINTF(("Freeing temporary memory\n"));
5142 (pcre_free)(match_block.offset_vector);
5143 }
5144
5145 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
5146
5147 if (match_block.offset_end < 2) rc = 0; else
5148 {
5149 offsets[0] = start_match - match_block.start_subject;
5150 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
5151 }
5152
5153 DPRINTF((">>>> returning %d\n", rc));
5154 return rc;
5155 }
5156
5157 /* This "while" is the end of the "do" above */
5158
5159 while (!anchored &&
5160 match_block.errorcode == PCRE_ERROR_NOMATCH &&
5161 start_match++ < end_subject);
5162
5163 if (using_temporary_offsets)
5164 {
5165 DPRINTF(("Freeing temporary memory\n"));
5166 (pcre_free)(match_block.offset_vector);
5167 }
5168
5169 DPRINTF((">>>> returning %d\n", match_block.errorcode));
5170
5171 return match_block.errorcode;
5172 }
5173
5174 /* End of pcre.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12