/[pcre]/code/tags/pcre-3.0/pcre.c
ViewVC logotype

Contents of /code/tags/pcre-3.0/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 44 - (show annotations) (download)
Sat Feb 24 21:39:23 2007 UTC (7 years, 7 months ago) by nigel
File MIME type: text/plain
File size: 140886 byte(s)
Tag code/trunk as code/tags/pcre-3.0.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /*
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
9
10 Written by: Philip Hazel <ph10@cam.ac.uk>
11
12 Copyright (c) 1997-2000 University of Cambridge
13
14 -----------------------------------------------------------------------------
15 Permission is granted to anyone to use this software for any purpose on any
16 computer system, and to redistribute it freely, subject to the following
17 restrictions:
18
19 1. This software is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22
23 2. The origin of this software must not be misrepresented, either by
24 explicit claim or by omission.
25
26 3. Altered versions must be plainly marked as such, and must not be
27 misrepresented as being the original software.
28
29 4. If PCRE is embedded in any software that is released under the GNU
30 General Purpose Licence (GPL), then the terms of that licence shall
31 supersede any condition above with which it is incompatible.
32 -----------------------------------------------------------------------------
33 */
34
35
36 /* Define DEBUG to get debugging output on stdout. */
37
38 /* #define DEBUG */
39
40 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
41 inline, and there are *still* stupid compilers about that don't like indented
42 pre-processor statements. I suppose it's only been 10 years... */
43
44 #ifdef DEBUG
45 #define DPRINTF(p) printf p
46 #else
47 #define DPRINTF(p) /*nothing*/
48 #endif
49
50 /* Include the internals header, which itself includes Standard C headers plus
51 the external pcre header. */
52
53 #include "internal.h"
54
55
56 /* Allow compilation as C++ source code, should anybody want to do that. */
57
58 #ifdef __cplusplus
59 #define class pcre_class
60 #endif
61
62
63 /* Number of items on the nested bracket stacks at compile time. This should
64 not be set greater than 200. */
65
66 #define BRASTACK_SIZE 200
67
68
69 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
70
71 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
72 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
73
74 /* Text forms of OP_ values and things, for debugging (not all used) */
75
76 #ifdef DEBUG
77 static const char *OP_names[] = {
78 "End", "\\A", "\\B", "\\b", "\\D", "\\d",
79 "\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",
80 "Opt", "^", "$", "Any", "chars", "not",
81 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
82 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
83 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
84 "*", "*?", "+", "+?", "?", "??", "{", "{",
85 "class", "Ref", "Recurse",
86 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
87 "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
88 "Brazero", "Braminzero", "Bra"
89 };
90 #endif
91
92 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
93 are simple data values; negative values are for special things like \d and so
94 on. Zero means further processing is needed (for things like \x), or the escape
95 is invalid. */
96
97 static const short int escapes[] = {
98 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
99 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
100 '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
101 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
102 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
103 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
104 '`', 7, -ESC_b, 0, -ESC_d, 27, '\f', 0, /* ` - g */
105 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
106 0, 0, '\r', -ESC_s, '\t', 0, 0, -ESC_w, /* p - w */
107 0, 0, -ESC_z /* x - z */
108 };
109
110 /* Tables of names of POSIX character classes and their lengths. The list is
111 terminated by a zero length entry. The first three must be alpha, upper, lower,
112 as this is assumed for handling case independence. */
113
114 static const char *posix_names[] = {
115 "alpha", "lower", "upper",
116 "alnum", "ascii", "cntrl", "digit", "graph",
117 "print", "punct", "space", "word", "xdigit" };
118
119 static const uschar posix_name_lengths[] = {
120 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
121
122 /* Table of class bit maps for each POSIX class; up to three may be combined
123 to form the class. */
124
125 static const int posix_class_maps[] = {
126 cbit_lower, cbit_upper, -1, /* alpha */
127 cbit_lower, -1, -1, /* lower */
128 cbit_upper, -1, -1, /* upper */
129 cbit_digit, cbit_lower, cbit_upper, /* alnum */
130 cbit_print, cbit_cntrl, -1, /* ascii */
131 cbit_cntrl, -1, -1, /* cntrl */
132 cbit_digit, -1, -1, /* digit */
133 cbit_graph, -1, -1, /* graph */
134 cbit_print, -1, -1, /* print */
135 cbit_punct, -1, -1, /* punct */
136 cbit_space, -1, -1, /* space */
137 cbit_word, -1, -1, /* word */
138 cbit_xdigit,-1, -1 /* xdigit */
139 };
140
141
142 /* Definition to allow mutual recursion */
143
144 static BOOL
145 compile_regex(int, int, int *, uschar **, const uschar **, const char **,
146 BOOL, int, int *, int *, compile_data *);
147
148
149
150 /*************************************************
151 * Global variables *
152 *************************************************/
153
154 /* PCRE is thread-clean and doesn't use any global variables in the normal
155 sense. However, it calls memory allocation and free functions via the two
156 indirections below, which are can be changed by the caller, but are shared
157 between all threads. */
158
159 void *(*pcre_malloc)(size_t) = malloc;
160 void (*pcre_free)(void *) = free;
161
162
163
164
165 /*************************************************
166 * Default character tables *
167 *************************************************/
168
169 /* A default set of character tables is included in the PCRE binary. Its source
170 is built by the maketables auxiliary program, which uses the default C ctypes
171 functions, and put in the file chartables.c. These tables are used by PCRE
172 whenever the caller of pcre_compile() does not provide an alternate set of
173 tables. */
174
175 #include "chartables.c"
176
177
178
179 /*************************************************
180 * Return version string *
181 *************************************************/
182
183 #define STRING(a) # a
184 #define XSTRING(s) STRING(s)
185
186 const char *
187 pcre_version(void)
188 {
189 return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
190 }
191
192
193
194
195 /*************************************************
196 * (Obsolete) Return info about compiled pattern *
197 *************************************************/
198
199 /* This is the original "info" function. It picks potentially useful data out
200 of the private structure, but its interface was too rigid. It remains for
201 backwards compatibility. The public options are passed back in an int - though
202 the re->options field has been expanded to a long int, all the public options
203 at the low end of it, and so even on 16-bit systems this will still be OK.
204 Therefore, I haven't changed the API for pcre_info().
205
206 Arguments:
207 external_re points to compiled code
208 optptr where to pass back the options
209 first_char where to pass back the first character,
210 or -1 if multiline and all branches start ^,
211 or -2 otherwise
212
213 Returns: number of capturing subpatterns
214 or negative values on error
215 */
216
217 int
218 pcre_info(const pcre *external_re, int *optptr, int *first_char)
219 {
220 const real_pcre *re = (const real_pcre *)external_re;
221 if (re == NULL) return PCRE_ERROR_NULL;
222 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
223 if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
224 if (first_char != NULL)
225 *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
226 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
227 return re->top_bracket;
228 }
229
230
231
232 /*************************************************
233 * Return info about compiled pattern *
234 *************************************************/
235
236 /* This is a newer "info" function which has an extensible interface so
237 that additional items can be added compatibly.
238
239 Arguments:
240 external_re points to compiled code
241 external_study points to study data, or NULL
242 what what information is required
243 where where to put the information
244
245 Returns: 0 if data returned, negative on error
246 */
247
248 int
249 pcre_fullinfo(const pcre *external_re, const pcre_extra *study_data, int what,
250 void *where)
251 {
252 const real_pcre *re = (const real_pcre *)external_re;
253 const real_pcre_extra *study = (const real_pcre_extra *)study_data;
254
255 if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
256 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
257
258 switch (what)
259 {
260 case PCRE_INFO_OPTIONS:
261 *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
262 break;
263
264 case PCRE_INFO_SIZE:
265 *((size_t *)where) = re->size;
266 break;
267
268 case PCRE_INFO_CAPTURECOUNT:
269 *((int *)where) = re->top_bracket;
270 break;
271
272 case PCRE_INFO_BACKREFMAX:
273 *((int *)where) = re->top_backref;
274 break;
275
276 case PCRE_INFO_FIRSTCHAR:
277 *((int *)where) =
278 ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
279 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
280 break;
281
282 case PCRE_INFO_FIRSTTABLE:
283 *((const uschar **)where) =
284 (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
285 study->start_bits : NULL;
286 break;
287
288 case PCRE_INFO_LASTLITERAL:
289 *((int *)where) =
290 ((re->options & PCRE_REQCHSET) != 0)? re->req_char : -1;
291 break;
292
293 default: return PCRE_ERROR_BADOPTION;
294 }
295
296 return 0;
297 }
298
299
300
301 #ifdef DEBUG
302 /*************************************************
303 * Debugging function to print chars *
304 *************************************************/
305
306 /* Print a sequence of chars in printable format, stopping at the end of the
307 subject if the requested.
308
309 Arguments:
310 p points to characters
311 length number to print
312 is_subject TRUE if printing from within md->start_subject
313 md pointer to matching data block, if is_subject is TRUE
314
315 Returns: nothing
316 */
317
318 static void
319 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
320 {
321 int c;
322 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
323 while (length-- > 0)
324 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
325 }
326 #endif
327
328
329
330
331 /*************************************************
332 * Handle escapes *
333 *************************************************/
334
335 /* This function is called when a \ has been encountered. It either returns a
336 positive value for a simple escape such as \n, or a negative value which
337 encodes one of the more complicated things such as \d. On entry, ptr is
338 pointing at the \. On exit, it is on the final character of the escape
339 sequence.
340
341 Arguments:
342 ptrptr points to the pattern position pointer
343 errorptr points to the pointer to the error message
344 bracount number of previous extracting brackets
345 options the options bits
346 isclass TRUE if inside a character class
347 cd pointer to char tables block
348
349 Returns: zero or positive => a data character
350 negative => a special escape sequence
351 on error, errorptr is set
352 */
353
354 static int
355 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
356 int options, BOOL isclass, compile_data *cd)
357 {
358 const uschar *ptr = *ptrptr;
359 int c, i;
360
361 c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
362 if (c == 0) *errorptr = ERR1;
363
364 /* Digits or letters may have special meaning; all others are literals. */
365
366 else if (c < '0' || c > 'z') {}
367
368 /* Do an initial lookup in a table. A non-zero result is something that can be
369 returned immediately. Otherwise further processing may be required. */
370
371 else if ((i = escapes[c - '0']) != 0) c = i;
372
373 /* Escapes that need further processing, or are illegal. */
374
375 else
376 {
377 const uschar *oldptr;
378 switch (c)
379 {
380 /* The handling of escape sequences consisting of a string of digits
381 starting with one that is not zero is not straightforward. By experiment,
382 the way Perl works seems to be as follows:
383
384 Outside a character class, the digits are read as a decimal number. If the
385 number is less than 10, or if there are that many previous extracting
386 left brackets, then it is a back reference. Otherwise, up to three octal
387 digits are read to form an escaped byte. Thus \123 is likely to be octal
388 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
389 value is greater than 377, the least significant 8 bits are taken. Inside a
390 character class, \ followed by a digit is always an octal number. */
391
392 case '1': case '2': case '3': case '4': case '5':
393 case '6': case '7': case '8': case '9':
394
395 if (!isclass)
396 {
397 oldptr = ptr;
398 c -= '0';
399 while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
400 c = c * 10 + *(++ptr) - '0';
401 if (c < 10 || c <= bracount)
402 {
403 c = -(ESC_REF + c);
404 break;
405 }
406 ptr = oldptr; /* Put the pointer back and fall through */
407 }
408
409 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
410 generates a binary zero byte and treats the digit as a following literal.
411 Thus we have to pull back the pointer by one. */
412
413 if ((c = *ptr) >= '8')
414 {
415 ptr--;
416 c = 0;
417 break;
418 }
419
420 /* \0 always starts an octal number, but we may drop through to here with a
421 larger first octal digit */
422
423 case '0':
424 c -= '0';
425 while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
426 ptr[1] != '8' && ptr[1] != '9')
427 c = c * 8 + *(++ptr) - '0';
428 break;
429
430 /* Special escapes not starting with a digit are straightforward */
431
432 case 'x':
433 c = 0;
434 while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
435 {
436 ptr++;
437 c = c * 16 + cd->lcc[*ptr] -
438 (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
439 }
440 break;
441
442 case 'c':
443 c = *(++ptr);
444 if (c == 0)
445 {
446 *errorptr = ERR2;
447 return 0;
448 }
449
450 /* A letter is upper-cased; then the 0x40 bit is flipped */
451
452 if (c >= 'a' && c <= 'z') c = cd->fcc[c];
453 c ^= 0x40;
454 break;
455
456 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
457 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
458 for Perl compatibility, it is a literal. This code looks a bit odd, but
459 there used to be some cases other than the default, and there may be again
460 in future, so I haven't "optimized" it. */
461
462 default:
463 if ((options & PCRE_EXTRA) != 0) switch(c)
464 {
465 default:
466 *errorptr = ERR3;
467 break;
468 }
469 break;
470 }
471 }
472
473 *ptrptr = ptr;
474 return c;
475 }
476
477
478
479 /*************************************************
480 * Check for counted repeat *
481 *************************************************/
482
483 /* This function is called when a '{' is encountered in a place where it might
484 start a quantifier. It looks ahead to see if it really is a quantifier or not.
485 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
486 where the ddds are digits.
487
488 Arguments:
489 p pointer to the first char after '{'
490 cd pointer to char tables block
491
492 Returns: TRUE or FALSE
493 */
494
495 static BOOL
496 is_counted_repeat(const uschar *p, compile_data *cd)
497 {
498 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
499 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
500 if (*p == '}') return TRUE;
501
502 if (*p++ != ',') return FALSE;
503 if (*p == '}') return TRUE;
504
505 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
506 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
507 return (*p == '}');
508 }
509
510
511
512 /*************************************************
513 * Read repeat counts *
514 *************************************************/
515
516 /* Read an item of the form {n,m} and return the values. This is called only
517 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
518 so the syntax is guaranteed to be correct, but we need to check the values.
519
520 Arguments:
521 p pointer to first char after '{'
522 minp pointer to int for min
523 maxp pointer to int for max
524 returned as -1 if no max
525 errorptr points to pointer to error message
526 cd pointer to character tables clock
527
528 Returns: pointer to '}' on success;
529 current ptr on error, with errorptr set
530 */
531
532 static const uschar *
533 read_repeat_counts(const uschar *p, int *minp, int *maxp,
534 const char **errorptr, compile_data *cd)
535 {
536 int min = 0;
537 int max = -1;
538
539 while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
540
541 if (*p == '}') max = min; else
542 {
543 if (*(++p) != '}')
544 {
545 max = 0;
546 while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
547 if (max < min)
548 {
549 *errorptr = ERR4;
550 return p;
551 }
552 }
553 }
554
555 /* Do paranoid checks, then fill in the required variables, and pass back the
556 pointer to the terminating '}'. */
557
558 if (min > 65535 || max > 65535)
559 *errorptr = ERR5;
560 else
561 {
562 *minp = min;
563 *maxp = max;
564 }
565 return p;
566 }
567
568
569
570 /*************************************************
571 * Find the fixed length of a pattern *
572 *************************************************/
573
574 /* Scan a pattern and compute the fixed length of subject that will match it,
575 if the length is fixed. This is needed for dealing with backward assertions.
576
577 Arguments:
578 code points to the start of the pattern (the bracket)
579
580 Returns: the fixed length, or -1 if there is no fixed length
581 */
582
583 static int
584 find_fixedlength(uschar *code)
585 {
586 int length = -1;
587
588 register int branchlength = 0;
589 register uschar *cc = code + 3;
590
591 /* Scan along the opcodes for this branch. If we get to the end of the
592 branch, check the length against that of the other branches. */
593
594 for (;;)
595 {
596 int d;
597 register int op = *cc;
598 if (op >= OP_BRA) op = OP_BRA;
599
600 switch (op)
601 {
602 case OP_BRA:
603 case OP_ONCE:
604 case OP_COND:
605 d = find_fixedlength(cc);
606 if (d < 0) return -1;
607 branchlength += d;
608 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
609 cc += 3;
610 break;
611
612 /* Reached end of a branch; if it's a ket it is the end of a nested
613 call. If it's ALT it is an alternation in a nested call. If it is
614 END it's the end of the outer call. All can be handled by the same code. */
615
616 case OP_ALT:
617 case OP_KET:
618 case OP_KETRMAX:
619 case OP_KETRMIN:
620 case OP_END:
621 if (length < 0) length = branchlength;
622 else if (length != branchlength) return -1;
623 if (*cc != OP_ALT) return length;
624 cc += 3;
625 branchlength = 0;
626 break;
627
628 /* Skip over assertive subpatterns */
629
630 case OP_ASSERT:
631 case OP_ASSERT_NOT:
632 case OP_ASSERTBACK:
633 case OP_ASSERTBACK_NOT:
634 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
635 cc += 3;
636 break;
637
638 /* Skip over things that don't match chars */
639
640 case OP_REVERSE:
641 cc++;
642 /* Fall through */
643
644 case OP_CREF:
645 case OP_OPT:
646 cc++;
647 /* Fall through */
648
649 case OP_SOD:
650 case OP_EOD:
651 case OP_EODN:
652 case OP_CIRC:
653 case OP_DOLL:
654 case OP_NOT_WORD_BOUNDARY:
655 case OP_WORD_BOUNDARY:
656 cc++;
657 break;
658
659 /* Handle char strings */
660
661 case OP_CHARS:
662 branchlength += *(++cc);
663 cc += *cc + 1;
664 break;
665
666 /* Handle exact repetitions */
667
668 case OP_EXACT:
669 case OP_TYPEEXACT:
670 branchlength += (cc[1] << 8) + cc[2];
671 cc += 4;
672 break;
673
674 /* Handle single-char matchers */
675
676 case OP_NOT_DIGIT:
677 case OP_DIGIT:
678 case OP_NOT_WHITESPACE:
679 case OP_WHITESPACE:
680 case OP_NOT_WORDCHAR:
681 case OP_WORDCHAR:
682 case OP_ANY:
683 branchlength++;
684 cc++;
685 break;
686
687
688 /* Check a class for variable quantification */
689
690 case OP_CLASS:
691 cc += (*cc == OP_REF)? 2 : 33;
692
693 switch (*cc)
694 {
695 case OP_CRSTAR:
696 case OP_CRMINSTAR:
697 case OP_CRQUERY:
698 case OP_CRMINQUERY:
699 return -1;
700
701 case OP_CRRANGE:
702 case OP_CRMINRANGE:
703 if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;
704 branchlength += (cc[1] << 8) + cc[2];
705 cc += 5;
706 break;
707
708 default:
709 branchlength++;
710 }
711 break;
712
713 /* Anything else is variable length */
714
715 default:
716 return -1;
717 }
718 }
719 /* Control never gets here */
720 }
721
722
723
724
725 /*************************************************
726 * Check for POSIX class syntax *
727 *************************************************/
728
729 /* This function is called when the sequence "[:" or "[." or "[=" is
730 encountered in a character class. It checks whether this is followed by an
731 optional ^ and then a sequence of letters, terminated by a matching ":]" or
732 ".]" or "=]".
733
734 Argument:
735 ptr pointer to the initial [
736 endptr where to return the end pointer
737 cd pointer to compile data
738
739 Returns: TRUE or FALSE
740 */
741
742 static BOOL
743 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
744 {
745 int terminator; /* Don't combine these lines; the Solaris cc */
746 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
747 if (*(++ptr) == '^') ptr++;
748 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
749 if (*ptr == terminator && ptr[1] == ']')
750 {
751 *endptr = ptr;
752 return TRUE;
753 }
754 return FALSE;
755 }
756
757
758
759
760 /*************************************************
761 * Check POSIX class name *
762 *************************************************/
763
764 /* This function is called to check the name given in a POSIX-style class entry
765 such as [:alnum:].
766
767 Arguments:
768 ptr points to the first letter
769 len the length of the name
770
771 Returns: a value representing the name, or -1 if unknown
772 */
773
774 static int
775 check_posix_name(const uschar *ptr, int len)
776 {
777 register int yield = 0;
778 while (posix_name_lengths[yield] != 0)
779 {
780 if (len == posix_name_lengths[yield] &&
781 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
782 yield++;
783 }
784 return -1;
785 }
786
787
788
789
790 /*************************************************
791 * Compile one branch *
792 *************************************************/
793
794 /* Scan the pattern, compiling it into the code vector.
795
796 Arguments:
797 options the option bits
798 brackets points to number of brackets used
799 code points to the pointer to the current code point
800 ptrptr points to the current pattern pointer
801 errorptr points to pointer to error message
802 optchanged set to the value of the last OP_OPT item compiled
803 reqchar set to the last literal character required, else -1
804 countlits set to count of mandatory literal characters
805 cd contains pointers to tables
806
807 Returns: TRUE on success
808 FALSE, with *errorptr set on error
809 */
810
811 static BOOL
812 compile_branch(int options, int *brackets, uschar **codeptr,
813 const uschar **ptrptr, const char **errorptr, int *optchanged,
814 int *reqchar, int *countlits, compile_data *cd)
815 {
816 int repeat_type, op_type;
817 int repeat_min, repeat_max;
818 int bravalue, length;
819 int greedy_default, greedy_non_default;
820 int prevreqchar;
821 int condcount = 0;
822 int subcountlits = 0;
823 register int c;
824 register uschar *code = *codeptr;
825 uschar *tempcode;
826 const uschar *ptr = *ptrptr;
827 const uschar *tempptr;
828 uschar *previous = NULL;
829 uschar class[32];
830
831 /* Set up the default and non-default settings for greediness */
832
833 greedy_default = ((options & PCRE_UNGREEDY) != 0);
834 greedy_non_default = greedy_default ^ 1;
835
836 /* Initialize no required char, and count of literals */
837
838 *reqchar = prevreqchar = -1;
839 *countlits = 0;
840
841 /* Switch on next character until the end of the branch */
842
843 for (;; ptr++)
844 {
845 BOOL negate_class;
846 int class_charcount;
847 int class_lastchar;
848 int newoptions;
849 int condref;
850 int subreqchar;
851
852 c = *ptr;
853 if ((options & PCRE_EXTENDED) != 0)
854 {
855 if ((cd->ctypes[c] & ctype_space) != 0) continue;
856 if (c == '#')
857 {
858 while ((c = *(++ptr)) != 0 && c != '\n');
859 continue;
860 }
861 }
862
863 switch(c)
864 {
865 /* The branch terminates at end of string, |, or ). */
866
867 case 0:
868 case '|':
869 case ')':
870 *codeptr = code;
871 *ptrptr = ptr;
872 return TRUE;
873
874 /* Handle single-character metacharacters */
875
876 case '^':
877 previous = NULL;
878 *code++ = OP_CIRC;
879 break;
880
881 case '$':
882 previous = NULL;
883 *code++ = OP_DOLL;
884 break;
885
886 case '.':
887 previous = code;
888 *code++ = OP_ANY;
889 break;
890
891 /* Character classes. These always build a 32-byte bitmap of the permitted
892 characters, except in the special case where there is only one character.
893 For negated classes, we build the map as usual, then invert it at the end.
894 */
895
896 case '[':
897 previous = code;
898 *code++ = OP_CLASS;
899
900 /* If the first character is '^', set the negation flag and skip it. */
901
902 if ((c = *(++ptr)) == '^')
903 {
904 negate_class = TRUE;
905 c = *(++ptr);
906 }
907 else negate_class = FALSE;
908
909 /* Keep a count of chars so that we can optimize the case of just a single
910 character. */
911
912 class_charcount = 0;
913 class_lastchar = -1;
914
915 /* Initialize the 32-char bit map to all zeros. We have to build the
916 map in a temporary bit of store, in case the class contains only 1
917 character, because in that case the compiled code doesn't use the
918 bit map. */
919
920 memset(class, 0, 32 * sizeof(uschar));
921
922 /* Process characters until ] is reached. By writing this as a "do" it
923 means that an initial ] is taken as a data character. */
924
925 do
926 {
927 if (c == 0)
928 {
929 *errorptr = ERR6;
930 goto FAILED;
931 }
932
933 /* Handle POSIX class names. Perl allows a negation extension of the
934 form [:^name]. A square bracket that doesn't match the syntax is
935 treated as a literal. We also recognize the POSIX constructions
936 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
937 5.6 does. */
938
939 if (c == '[' &&
940 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
941 check_posix_syntax(ptr, &tempptr, cd))
942 {
943 BOOL local_negate = FALSE;
944 int posix_class, i;
945 register const uschar *cbits = cd->cbits;
946
947 if (ptr[1] != ':')
948 {
949 *errorptr = ERR31;
950 goto FAILED;
951 }
952
953 ptr += 2;
954 if (*ptr == '^')
955 {
956 local_negate = TRUE;
957 ptr++;
958 }
959
960 posix_class = check_posix_name(ptr, tempptr - ptr);
961 if (posix_class < 0)
962 {
963 *errorptr = ERR30;
964 goto FAILED;
965 }
966
967 /* If matching is caseless, upper and lower are converted to
968 alpha. This relies on the fact that the class table starts with
969 alpha, lower, upper as the first 3 entries. */
970
971 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
972 posix_class = 0;
973
974 /* Or into the map we are building up to 3 of the static class
975 tables, or their negations. */
976
977 posix_class *= 3;
978 for (i = 0; i < 3; i++)
979 {
980 int taboffset = posix_class_maps[posix_class + i];
981 if (taboffset < 0) break;
982 if (local_negate)
983 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
984 else
985 for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
986 }
987
988 ptr = tempptr + 1;
989 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
990 continue;
991 }
992
993 /* Backslash may introduce a single character, or it may introduce one
994 of the specials, which just set a flag. Escaped items are checked for
995 validity in the pre-compiling pass. The sequence \b is a special case.
996 Inside a class (and only there) it is treated as backspace. Elsewhere
997 it marks a word boundary. Other escapes have preset maps ready to
998 or into the one we are building. We assume they have more than one
999 character in them, so set class_count bigger than one. */
1000
1001 if (c == '\\')
1002 {
1003 c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1004 if (-c == ESC_b) c = '\b';
1005 else if (c < 0)
1006 {
1007 register const uschar *cbits = cd->cbits;
1008 class_charcount = 10;
1009 switch (-c)
1010 {
1011 case ESC_d:
1012 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
1013 continue;
1014
1015 case ESC_D:
1016 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
1017 continue;
1018
1019 case ESC_w:
1020 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
1021 continue;
1022
1023 case ESC_W:
1024 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
1025 continue;
1026
1027 case ESC_s:
1028 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
1029 continue;
1030
1031 case ESC_S:
1032 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
1033 continue;
1034
1035 default:
1036 *errorptr = ERR7;
1037 goto FAILED;
1038 }
1039 }
1040 /* Fall through if single character */
1041 }
1042
1043 /* A single character may be followed by '-' to form a range. However,
1044 Perl does not permit ']' to be the end of the range. A '-' character
1045 here is treated as a literal. */
1046
1047 if (ptr[1] == '-' && ptr[2] != ']')
1048 {
1049 int d;
1050 ptr += 2;
1051 d = *ptr;
1052
1053 if (d == 0)
1054 {
1055 *errorptr = ERR6;
1056 goto FAILED;
1057 }
1058
1059 /* The second part of a range can be a single-character escape, but
1060 not any of the other escapes. */
1061
1062 if (d == '\\')
1063 {
1064 d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1065 if (d < 0)
1066 {
1067 if (d == -ESC_b) d = '\b'; else
1068 {
1069 *errorptr = ERR7;
1070 goto FAILED;
1071 }
1072 }
1073 }
1074
1075 if (d < c)
1076 {
1077 *errorptr = ERR8;
1078 goto FAILED;
1079 }
1080
1081 for (; c <= d; c++)
1082 {
1083 class[c/8] |= (1 << (c&7));
1084 if ((options & PCRE_CASELESS) != 0)
1085 {
1086 int uc = cd->fcc[c]; /* flip case */
1087 class[uc/8] |= (1 << (uc&7));
1088 }
1089 class_charcount++; /* in case a one-char range */
1090 class_lastchar = c;
1091 }
1092 continue; /* Go get the next char in the class */
1093 }
1094
1095 /* Handle a lone single character - we can get here for a normal
1096 non-escape char, or after \ that introduces a single character. */
1097
1098 class [c/8] |= (1 << (c&7));
1099 if ((options & PCRE_CASELESS) != 0)
1100 {
1101 c = cd->fcc[c]; /* flip case */
1102 class[c/8] |= (1 << (c&7));
1103 }
1104 class_charcount++;
1105 class_lastchar = c;
1106 }
1107
1108 /* Loop until ']' reached; the check for end of string happens inside the
1109 loop. This "while" is the end of the "do" above. */
1110
1111 while ((c = *(++ptr)) != ']');
1112
1113 /* If class_charcount is 1 and class_lastchar is not negative, we saw
1114 precisely one character. This doesn't need the whole 32-byte bit map.
1115 We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
1116 it's negative. */
1117
1118 if (class_charcount == 1 && class_lastchar >= 0)
1119 {
1120 if (negate_class)
1121 {
1122 code[-1] = OP_NOT;
1123 }
1124 else
1125 {
1126 code[-1] = OP_CHARS;
1127 *code++ = 1;
1128 }
1129 *code++ = class_lastchar;
1130 }
1131
1132 /* Otherwise, negate the 32-byte map if necessary, and copy it into
1133 the code vector. */
1134
1135 else
1136 {
1137 if (negate_class)
1138 for (c = 0; c < 32; c++) code[c] = ~class[c];
1139 else
1140 memcpy(code, class, 32);
1141 code += 32;
1142 }
1143 break;
1144
1145 /* Various kinds of repeat */
1146
1147 case '{':
1148 if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
1149 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
1150 if (*errorptr != NULL) goto FAILED;
1151 goto REPEAT;
1152
1153 case '*':
1154 repeat_min = 0;
1155 repeat_max = -1;
1156 goto REPEAT;
1157
1158 case '+':
1159 repeat_min = 1;
1160 repeat_max = -1;
1161 goto REPEAT;
1162
1163 case '?':
1164 repeat_min = 0;
1165 repeat_max = 1;
1166
1167 REPEAT:
1168 if (previous == NULL)
1169 {
1170 *errorptr = ERR9;
1171 goto FAILED;
1172 }
1173
1174 /* If the next character is '?' this is a minimizing repeat, by default,
1175 but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
1176 next character. */
1177
1178 if (ptr[1] == '?')
1179 { repeat_type = greedy_non_default; ptr++; }
1180 else repeat_type = greedy_default;
1181
1182 /* If previous was a string of characters, chop off the last one and use it
1183 as the subject of the repeat. If there was only one character, we can
1184 abolish the previous item altogether. A repeat with a zero minimum wipes
1185 out any reqchar setting, backing up to the previous value. We must also
1186 adjust the countlits value. */
1187
1188 if (*previous == OP_CHARS)
1189 {
1190 int len = previous[1];
1191
1192 if (repeat_min == 0) *reqchar = prevreqchar;
1193 *countlits += repeat_min - 1;
1194
1195 if (len == 1)
1196 {
1197 c = previous[2];
1198 code = previous;
1199 }
1200 else
1201 {
1202 c = previous[len+1];
1203 previous[1]--;
1204 code--;
1205 }
1206 op_type = 0; /* Use single-char op codes */
1207 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
1208 }
1209
1210 /* If previous was a single negated character ([^a] or similar), we use
1211 one of the special opcodes, replacing it. The code is shared with single-
1212 character repeats by adding a suitable offset into repeat_type. */
1213
1214 else if ((int)*previous == OP_NOT)
1215 {
1216 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
1217 c = previous[1];
1218 code = previous;
1219 goto OUTPUT_SINGLE_REPEAT;
1220 }
1221
1222 /* If previous was a character type match (\d or similar), abolish it and
1223 create a suitable repeat item. The code is shared with single-character
1224 repeats by adding a suitable offset into repeat_type. */
1225
1226 else if ((int)*previous < OP_EODN || *previous == OP_ANY)
1227 {
1228 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
1229 c = *previous;
1230 code = previous;
1231
1232 OUTPUT_SINGLE_REPEAT:
1233
1234 /* If the maximum is zero then the minimum must also be zero; Perl allows
1235 this case, so we do too - by simply omitting the item altogether. */
1236
1237 if (repeat_max == 0) goto END_REPEAT;
1238
1239 /* Combine the op_type with the repeat_type */
1240
1241 repeat_type += op_type;
1242
1243 /* A minimum of zero is handled either as the special case * or ?, or as
1244 an UPTO, with the maximum given. */
1245
1246 if (repeat_min == 0)
1247 {
1248 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1249 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1250 else
1251 {
1252 *code++ = OP_UPTO + repeat_type;
1253 *code++ = repeat_max >> 8;
1254 *code++ = (repeat_max & 255);
1255 }
1256 }
1257
1258 /* The case {1,} is handled as the special case + */
1259
1260 else if (repeat_min == 1 && repeat_max == -1)
1261 *code++ = OP_PLUS + repeat_type;
1262
1263 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1264 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1265
1266 else
1267 {
1268 if (repeat_min != 1)
1269 {
1270 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1271 *code++ = repeat_min >> 8;
1272 *code++ = (repeat_min & 255);
1273 }
1274
1275 /* If the mininum is 1 and the previous item was a character string,
1276 we either have to put back the item that got cancelled if the string
1277 length was 1, or add the character back onto the end of a longer
1278 string. For a character type nothing need be done; it will just get
1279 put back naturally. Note that the final character is always going to
1280 get added below. */
1281
1282 else if (*previous == OP_CHARS)
1283 {
1284 if (code == previous) code += 2; else previous[1]++;
1285 }
1286
1287 /* For a single negated character we also have to put back the
1288 item that got cancelled. */
1289
1290 else if (*previous == OP_NOT) code++;
1291
1292 /* If the maximum is unlimited, insert an OP_STAR. */
1293
1294 if (repeat_max < 0)
1295 {
1296 *code++ = c;
1297 *code++ = OP_STAR + repeat_type;
1298 }
1299
1300 /* Else insert an UPTO if the max is greater than the min. */
1301
1302 else if (repeat_max != repeat_min)
1303 {
1304 *code++ = c;
1305 repeat_max -= repeat_min;
1306 *code++ = OP_UPTO + repeat_type;
1307 *code++ = repeat_max >> 8;
1308 *code++ = (repeat_max & 255);
1309 }
1310 }
1311
1312 /* The character or character type itself comes last in all cases. */
1313
1314 *code++ = c;
1315 }
1316
1317 /* If previous was a character class or a back reference, we put the repeat
1318 stuff after it, but just skip the item if the repeat was {0,0}. */
1319
1320 else if (*previous == OP_CLASS || *previous == OP_REF)
1321 {
1322 if (repeat_max == 0)
1323 {
1324 code = previous;
1325 goto END_REPEAT;
1326 }
1327 if (repeat_min == 0 && repeat_max == -1)
1328 *code++ = OP_CRSTAR + repeat_type;
1329 else if (repeat_min == 1 && repeat_max == -1)
1330 *code++ = OP_CRPLUS + repeat_type;
1331 else if (repeat_min == 0 && repeat_max == 1)
1332 *code++ = OP_CRQUERY + repeat_type;
1333 else
1334 {
1335 *code++ = OP_CRRANGE + repeat_type;
1336 *code++ = repeat_min >> 8;
1337 *code++ = repeat_min & 255;
1338 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1339 *code++ = repeat_max >> 8;
1340 *code++ = repeat_max & 255;
1341 }
1342 }
1343
1344 /* If previous was a bracket group, we may have to replicate it in certain
1345 cases. */
1346
1347 else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
1348 (int)*previous == OP_COND)
1349 {
1350 register int i;
1351 int ketoffset = 0;
1352 int len = code - previous;
1353 uschar *bralink = NULL;
1354
1355 /* If the maximum repeat count is unlimited, find the end of the bracket
1356 by scanning through from the start, and compute the offset back to it
1357 from the current code pointer. There may be an OP_OPT setting following
1358 the final KET, so we can't find the end just by going back from the code
1359 pointer. */
1360
1361 if (repeat_max == -1)
1362 {
1363 register uschar *ket = previous;
1364 do ket += (ket[1] << 8) + ket[2]; while (*ket != OP_KET);
1365 ketoffset = code - ket;
1366 }
1367
1368 /* The case of a zero minimum is special because of the need to stick
1369 OP_BRAZERO in front of it, and because the group appears once in the
1370 data, whereas in other cases it appears the minimum number of times. For
1371 this reason, it is simplest to treat this case separately, as otherwise
1372 the code gets far too mess. There are several special subcases when the
1373 minimum is zero. */
1374
1375 if (repeat_min == 0)
1376 {
1377 /* If we set up a required char from the bracket, we must back off
1378 to the previous value and reset the countlits value too. */
1379
1380 if (subcountlits > 0)
1381 {
1382 *reqchar = prevreqchar;
1383 *countlits -= subcountlits;
1384 }
1385
1386 /* If the maximum is also zero, we just omit the group from the output
1387 altogether. */
1388
1389 if (repeat_max == 0)
1390 {
1391 code = previous;
1392 goto END_REPEAT;
1393 }
1394
1395 /* If the maximum is 1 or unlimited, we just have to stick in the
1396 BRAZERO and do no more at this point. */
1397
1398 if (repeat_max <= 1)
1399 {
1400 memmove(previous+1, previous, len);
1401 code++;
1402 *previous++ = OP_BRAZERO + repeat_type;
1403 }
1404
1405 /* If the maximum is greater than 1 and limited, we have to replicate
1406 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1407 The first one has to be handled carefully because it's the original
1408 copy, which has to be moved up. The remainder can be handled by code
1409 that is common with the non-zero minimum case below. We just have to
1410 adjust the value or repeat_max, since one less copy is required. */
1411
1412 else
1413 {
1414 int offset;
1415 memmove(previous+4, previous, len);
1416 code += 4;
1417 *previous++ = OP_BRAZERO + repeat_type;
1418 *previous++ = OP_BRA;
1419
1420 /* We chain together the bracket offset fields that have to be
1421 filled in later when the ends of the brackets are reached. */
1422
1423 offset = (bralink == NULL)? 0 : previous - bralink;
1424 bralink = previous;
1425 *previous++ = offset >> 8;
1426 *previous++ = offset & 255;
1427 }
1428
1429 repeat_max--;
1430 }
1431
1432 /* If the minimum is greater than zero, replicate the group as many
1433 times as necessary, and adjust the maximum to the number of subsequent
1434 copies that we need. */
1435
1436 else
1437 {
1438 for (i = 1; i < repeat_min; i++)
1439 {
1440 memcpy(code, previous, len);
1441 code += len;
1442 }
1443 if (repeat_max > 0) repeat_max -= repeat_min;
1444 }
1445
1446 /* This code is common to both the zero and non-zero minimum cases. If
1447 the maximum is limited, it replicates the group in a nested fashion,
1448 remembering the bracket starts on a stack. In the case of a zero minimum,
1449 the first one was set up above. In all cases the repeat_max now specifies
1450 the number of additional copies needed. */
1451
1452 if (repeat_max >= 0)
1453 {
1454 for (i = repeat_max - 1; i >= 0; i--)
1455 {
1456 *code++ = OP_BRAZERO + repeat_type;
1457
1458 /* All but the final copy start a new nesting, maintaining the
1459 chain of brackets outstanding. */
1460
1461 if (i != 0)
1462 {
1463 int offset;
1464 *code++ = OP_BRA;
1465 offset = (bralink == NULL)? 0 : code - bralink;
1466 bralink = code;
1467 *code++ = offset >> 8;
1468 *code++ = offset & 255;
1469 }
1470
1471 memcpy(code, previous, len);
1472 code += len;
1473 }
1474
1475 /* Now chain through the pending brackets, and fill in their length
1476 fields (which are holding the chain links pro tem). */
1477
1478 while (bralink != NULL)
1479 {
1480 int oldlinkoffset;
1481 int offset = code - bralink + 1;
1482 uschar *bra = code - offset;
1483 oldlinkoffset = (bra[1] << 8) + bra[2];
1484 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
1485 *code++ = OP_KET;
1486 *code++ = bra[1] = offset >> 8;
1487 *code++ = bra[2] = (offset & 255);
1488 }
1489 }
1490
1491 /* If the maximum is unlimited, set a repeater in the final copy. We
1492 can't just offset backwards from the current code point, because we
1493 don't know if there's been an options resetting after the ket. The
1494 correct offset was computed above. */
1495
1496 else code[-ketoffset] = OP_KETRMAX + repeat_type;
1497 }
1498
1499 /* Else there's some kind of shambles */
1500
1501 else
1502 {
1503 *errorptr = ERR11;
1504 goto FAILED;
1505 }
1506
1507 /* In all case we no longer have a previous item. */
1508
1509 END_REPEAT:
1510 previous = NULL;
1511 break;
1512
1513
1514 /* Start of nested bracket sub-expression, or comment or lookahead or
1515 lookbehind or option setting or condition. First deal with special things
1516 that can come after a bracket; all are introduced by ?, and the appearance
1517 of any of them means that this is not a referencing group. They were
1518 checked for validity in the first pass over the string, so we don't have to
1519 check for syntax errors here. */
1520
1521 case '(':
1522 newoptions = options;
1523 condref = -1;
1524
1525 if (*(++ptr) == '?')
1526 {
1527 int set, unset;
1528 int *optset;
1529
1530 switch (*(++ptr))
1531 {
1532 case '#': /* Comment; skip to ket */
1533 ptr++;
1534 while (*ptr != ')') ptr++;
1535 continue;
1536
1537 case ':': /* Non-extracting bracket */
1538 bravalue = OP_BRA;
1539 ptr++;
1540 break;
1541
1542 case '(':
1543 bravalue = OP_COND; /* Conditional group */
1544 if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
1545 {
1546 condref = *ptr - '0';
1547 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
1548 ptr++;
1549 }
1550 else ptr--;
1551 break;
1552
1553 case '=': /* Positive lookahead */
1554 bravalue = OP_ASSERT;
1555 ptr++;
1556 break;
1557
1558 case '!': /* Negative lookahead */
1559 bravalue = OP_ASSERT_NOT;
1560 ptr++;
1561 break;
1562
1563 case '<': /* Lookbehinds */
1564 switch (*(++ptr))
1565 {
1566 case '=': /* Positive lookbehind */
1567 bravalue = OP_ASSERTBACK;
1568 ptr++;
1569 break;
1570
1571 case '!': /* Negative lookbehind */
1572 bravalue = OP_ASSERTBACK_NOT;
1573 ptr++;
1574 break;
1575
1576 default: /* Syntax error */
1577 *errorptr = ERR24;
1578 goto FAILED;
1579 }
1580 break;
1581
1582 case '>': /* One-time brackets */
1583 bravalue = OP_ONCE;
1584 ptr++;
1585 break;
1586
1587 case 'R': /* Pattern recursion */
1588 *code++ = OP_RECURSE;
1589 ptr++;
1590 continue;
1591
1592 default: /* Option setting */
1593 set = unset = 0;
1594 optset = &set;
1595
1596 while (*ptr != ')' && *ptr != ':')
1597 {
1598 switch (*ptr++)
1599 {
1600 case '-': optset = &unset; break;
1601
1602 case 'i': *optset |= PCRE_CASELESS; break;
1603 case 'm': *optset |= PCRE_MULTILINE; break;
1604 case 's': *optset |= PCRE_DOTALL; break;
1605 case 'x': *optset |= PCRE_EXTENDED; break;
1606 case 'U': *optset |= PCRE_UNGREEDY; break;
1607 case 'X': *optset |= PCRE_EXTRA; break;
1608
1609 default:
1610 *errorptr = ERR12;
1611 goto FAILED;
1612 }
1613 }
1614
1615 /* Set up the changed option bits, but don't change anything yet. */
1616
1617 newoptions = (options | set) & (~unset);
1618
1619 /* If the options ended with ')' this is not the start of a nested
1620 group with option changes, so the options change at this level. At top
1621 level there is nothing else to be done (the options will in fact have
1622 been set from the start of compiling as a result of the first pass) but
1623 at an inner level we must compile code to change the ims options if
1624 necessary, and pass the new setting back so that it can be put at the
1625 start of any following branches, and when this group ends, a resetting
1626 item can be compiled. */
1627
1628 if (*ptr == ')')
1629 {
1630 if ((options & PCRE_INGROUP) != 0 &&
1631 (options & PCRE_IMS) != (newoptions & PCRE_IMS))
1632 {
1633 *code++ = OP_OPT;
1634 *code++ = *optchanged = newoptions & PCRE_IMS;
1635 }
1636 options = newoptions; /* Change options at this level */
1637 previous = NULL; /* This item can't be repeated */
1638 continue; /* It is complete */
1639 }
1640
1641 /* If the options ended with ':' we are heading into a nested group
1642 with possible change of options. Such groups are non-capturing and are
1643 not assertions of any kind. All we need to do is skip over the ':';
1644 the newoptions value is handled below. */
1645
1646 bravalue = OP_BRA;
1647 ptr++;
1648 }
1649 }
1650
1651 /* Else we have a referencing group; adjust the opcode. */
1652
1653 else
1654 {
1655 if (++(*brackets) > EXTRACT_MAX)
1656 {
1657 *errorptr = ERR13;
1658 goto FAILED;
1659 }
1660 bravalue = OP_BRA + *brackets;
1661 }
1662
1663 /* Process nested bracketed re. Assertions may not be repeated, but other
1664 kinds can be. We copy code into a non-register variable in order to be able
1665 to pass its address because some compilers complain otherwise. Pass in a
1666 new setting for the ims options if they have changed. */
1667
1668 previous = (bravalue >= OP_ONCE)? code : NULL;
1669 *code = bravalue;
1670 tempcode = code;
1671
1672 if (!compile_regex(
1673 options | PCRE_INGROUP, /* Set for all nested groups */
1674 ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
1675 newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
1676 brackets, /* Bracket level */
1677 &tempcode, /* Where to put code (updated) */
1678 &ptr, /* Input pointer (updated) */
1679 errorptr, /* Where to put an error message */
1680 (bravalue == OP_ASSERTBACK ||
1681 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1682 condref, /* Condition reference number */
1683 &subreqchar, /* For possible last char */
1684 &subcountlits, /* For literal count */
1685 cd)) /* Tables block */
1686 goto FAILED;
1687
1688 /* At the end of compiling, code is still pointing to the start of the
1689 group, while tempcode has been updated to point past the end of the group
1690 and any option resetting that may follow it. The pattern pointer (ptr)
1691 is on the bracket. */
1692
1693 /* If this is a conditional bracket, check that there are no more than
1694 two branches in the group. */
1695
1696 if (bravalue == OP_COND)
1697 {
1698 uschar *tc = code;
1699 condcount = 0;
1700
1701 do {
1702 condcount++;
1703 tc += (tc[1] << 8) | tc[2];
1704 }
1705 while (*tc != OP_KET);
1706
1707 if (condcount > 2)
1708 {
1709 *errorptr = ERR27;
1710 goto FAILED;
1711 }
1712 }
1713
1714 /* Handle updating of the required character. If the subpattern didn't
1715 set one, leave it as it was. Otherwise, update it for normal brackets of
1716 all kinds, forward assertions, and conditions with two branches. Don't
1717 update the literal count for forward assertions, however. If the bracket
1718 is followed by a quantifier with zero repeat, we have to back off. Hence
1719 the definition of prevreqchar and subcountlits outside the main loop so
1720 that they can be accessed for the back off. */
1721
1722 if (subreqchar > 0 &&
1723 (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT ||
1724 (bravalue == OP_COND && condcount == 2)))
1725 {
1726 prevreqchar = *reqchar;
1727 *reqchar = subreqchar;
1728 if (bravalue != OP_ASSERT) *countlits += subcountlits;
1729 }
1730
1731 /* Now update the main code pointer to the end of the group. */
1732
1733 code = tempcode;
1734
1735 /* Error if hit end of pattern */
1736
1737 if (*ptr != ')')
1738 {
1739 *errorptr = ERR14;
1740 goto FAILED;
1741 }
1742 break;
1743
1744 /* Check \ for being a real metacharacter; if not, fall through and handle
1745 it as a data character at the start of a string. Escape items are checked
1746 for validity in the pre-compiling pass. */
1747
1748 case '\\':
1749 tempptr = ptr;
1750 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1751
1752 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1753 are arranged to be the negation of the corresponding OP_values. For the
1754 back references, the values are ESC_REF plus the reference number. Only
1755 back references and those types that consume a character may be repeated.
1756 We can test for values between ESC_b and ESC_Z for the latter; this may
1757 have to change if any new ones are ever created. */
1758
1759 if (c < 0)
1760 {
1761 if (-c >= ESC_REF)
1762 {
1763 previous = code;
1764 *code++ = OP_REF;
1765 *code++ = -c - ESC_REF;
1766 }
1767 else
1768 {
1769 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
1770 *code++ = -c;
1771 }
1772 continue;
1773 }
1774
1775 /* Data character: reset and fall through */
1776
1777 ptr = tempptr;
1778 c = '\\';
1779
1780 /* Handle a run of data characters until a metacharacter is encountered.
1781 The first character is guaranteed not to be whitespace or # when the
1782 extended flag is set. */
1783
1784 NORMAL_CHAR:
1785 default:
1786 previous = code;
1787 *code = OP_CHARS;
1788 code += 2;
1789 length = 0;
1790
1791 do
1792 {
1793 if ((options & PCRE_EXTENDED) != 0)
1794 {
1795 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1796 if (c == '#')
1797 {
1798 while ((c = *(++ptr)) != 0 && c != '\n');
1799 if (c == 0) break;
1800 continue;
1801 }
1802 }
1803
1804 /* Backslash may introduce a data char or a metacharacter. Escaped items
1805 are checked for validity in the pre-compiling pass. Stop the string
1806 before a metaitem. */
1807
1808 if (c == '\\')
1809 {
1810 tempptr = ptr;
1811 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1812 if (c < 0) { ptr = tempptr; break; }
1813 }
1814
1815 /* Ordinary character or single-char escape */
1816
1817 *code++ = c;
1818 length++;
1819 }
1820
1821 /* This "while" is the end of the "do" above. */
1822
1823 while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
1824
1825 /* Update the last character and the count of literals */
1826
1827 prevreqchar = (length > 1)? code[-2] : *reqchar;
1828 *reqchar = code[-1];
1829 *countlits += length;
1830
1831 /* Compute the length and set it in the data vector, and advance to
1832 the next state. */
1833
1834 previous[1] = length;
1835 if (length < 255) ptr--;
1836 break;
1837 }
1838 } /* end of big loop */
1839
1840 /* Control never reaches here by falling through, only by a goto for all the
1841 error states. Pass back the position in the pattern so that it can be displayed
1842 to the user for diagnosing the error. */
1843
1844 FAILED:
1845 *ptrptr = ptr;
1846 return FALSE;
1847 }
1848
1849
1850
1851
1852 /*************************************************
1853 * Compile sequence of alternatives *
1854 *************************************************/
1855
1856 /* On entry, ptr is pointing past the bracket character, but on return
1857 it points to the closing bracket, or vertical bar, or end of string.
1858 The code variable is pointing at the byte into which the BRA operator has been
1859 stored. If the ims options are changed at the start (for a (?ims: group) or
1860 during any branch, we need to insert an OP_OPT item at the start of every
1861 following branch to ensure they get set correctly at run time, and also pass
1862 the new options into every subsequent branch compile.
1863
1864 Argument:
1865 options the option bits
1866 optchanged new ims options to set as if (?ims) were at the start, or -1
1867 for no change
1868 brackets -> int containing the number of extracting brackets used
1869 codeptr -> the address of the current code pointer
1870 ptrptr -> the address of the current pattern pointer
1871 errorptr -> pointer to error message
1872 lookbehind TRUE if this is a lookbehind assertion
1873 condref > 0 for OPT_CREF setting at start of conditional group
1874 reqchar -> place to put the last required character, or a negative number
1875 countlits -> place to put the shortest literal count of any branch
1876 cd points to the data block with tables pointers
1877
1878 Returns: TRUE on success
1879 */
1880
1881 static BOOL
1882 compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
1883 const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,
1884 int *reqchar, int *countlits, compile_data *cd)
1885 {
1886 const uschar *ptr = *ptrptr;
1887 uschar *code = *codeptr;
1888 uschar *last_branch = code;
1889 uschar *start_bracket = code;
1890 uschar *reverse_count = NULL;
1891 int oldoptions = options & PCRE_IMS;
1892 int branchreqchar, branchcountlits;
1893
1894 *reqchar = -1;
1895 *countlits = INT_MAX;
1896 code += 3;
1897
1898 /* At the start of a reference-based conditional group, insert the reference
1899 number as an OP_CREF item. */
1900
1901 if (condref > 0)
1902 {
1903 *code++ = OP_CREF;
1904 *code++ = condref;
1905 }
1906
1907 /* Loop for each alternative branch */
1908
1909 for (;;)
1910 {
1911 int length;
1912
1913 /* Handle change of options */
1914
1915 if (optchanged >= 0)
1916 {
1917 *code++ = OP_OPT;
1918 *code++ = optchanged;
1919 options = (options & ~PCRE_IMS) | optchanged;
1920 }
1921
1922 /* Set up dummy OP_REVERSE if lookbehind assertion */
1923
1924 if (lookbehind)
1925 {
1926 *code++ = OP_REVERSE;
1927 reverse_count = code;
1928 *code++ = 0;
1929 *code++ = 0;
1930 }
1931
1932 /* Now compile the branch */
1933
1934 if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged,
1935 &branchreqchar, &branchcountlits, cd))
1936 {
1937 *ptrptr = ptr;
1938 return FALSE;
1939 }
1940
1941 /* Fill in the length of the last branch */
1942
1943 length = code - last_branch;
1944 last_branch[1] = length >> 8;
1945 last_branch[2] = length & 255;
1946
1947 /* Save the last required character if all branches have the same; a current
1948 value of -1 means unset, while -2 means "previous branch had no last required
1949 char". */
1950
1951 if (*reqchar != -2)
1952 {
1953 if (branchreqchar >= 0)
1954 {
1955 if (*reqchar == -1) *reqchar = branchreqchar;
1956 else if (*reqchar != branchreqchar) *reqchar = -2;
1957 }
1958 else *reqchar = -2;
1959 }
1960
1961 /* Keep the shortest literal count */
1962
1963 if (branchcountlits < *countlits) *countlits = branchcountlits;
1964 DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits));
1965
1966 /* If lookbehind, check that this branch matches a fixed-length string,
1967 and put the length into the OP_REVERSE item. Temporarily mark the end of
1968 the branch with OP_END. */
1969
1970 if (lookbehind)
1971 {
1972 *code = OP_END;
1973 length = find_fixedlength(last_branch);
1974 DPRINTF(("fixed length = %d\n", length));
1975 if (length < 0)
1976 {
1977 *errorptr = ERR25;
1978 *ptrptr = ptr;
1979 return FALSE;
1980 }
1981 reverse_count[0] = (length >> 8);
1982 reverse_count[1] = length & 255;
1983 }
1984
1985 /* Reached end of expression, either ')' or end of pattern. Insert a
1986 terminating ket and the length of the whole bracketed item, and return,
1987 leaving the pointer at the terminating char. If any of the ims options
1988 were changed inside the group, compile a resetting op-code following. */
1989
1990 if (*ptr != '|')
1991 {
1992 length = code - start_bracket;
1993 *code++ = OP_KET;
1994 *code++ = length >> 8;
1995 *code++ = length & 255;
1996 if (optchanged >= 0)
1997 {
1998 *code++ = OP_OPT;
1999 *code++ = oldoptions;
2000 }
2001 *codeptr = code;
2002 *ptrptr = ptr;
2003 return TRUE;
2004 }
2005
2006 /* Another branch follows; insert an "or" node and advance the pointer. */
2007
2008 *code = OP_ALT;
2009 last_branch = code;
2010 code += 3;
2011 ptr++;
2012 }
2013 /* Control never reaches here */
2014 }
2015
2016
2017
2018
2019 /*************************************************
2020 * Find first significant op code *
2021 *************************************************/
2022
2023 /* This is called by several functions that scan a compiled expression looking
2024 for a fixed first character, or an anchoring op code etc. It skips over things
2025 that do not influence this. For one application, a change of caseless option is
2026 important.
2027
2028 Arguments:
2029 code pointer to the start of the group
2030 options pointer to external options
2031 optbit the option bit whose changing is significant, or
2032 zero if none are
2033 optstop TRUE to return on option change, otherwise change the options
2034 value and continue
2035
2036 Returns: pointer to the first significant opcode
2037 */
2038
2039 static const uschar*
2040 first_significant_code(const uschar *code, int *options, int optbit,
2041 BOOL optstop)
2042 {
2043 for (;;)
2044 {
2045 switch ((int)*code)
2046 {
2047 case OP_OPT:
2048 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
2049 {
2050 if (optstop) return code;
2051 *options = (int)code[1];
2052 }
2053 code += 2;
2054 break;
2055
2056 case OP_CREF:
2057 code += 2;
2058 break;
2059
2060 case OP_WORD_BOUNDARY:
2061 case OP_NOT_WORD_BOUNDARY:
2062 code++;
2063 break;
2064
2065 case OP_ASSERT_NOT:
2066 case OP_ASSERTBACK:
2067 case OP_ASSERTBACK_NOT:
2068 do code += (code[1] << 8) + code[2]; while (*code == OP_ALT);
2069 code += 3;
2070 break;
2071
2072 default:
2073 return code;
2074 }
2075 }
2076 /* Control never reaches here */
2077 }
2078
2079
2080
2081
2082 /*************************************************
2083 * Check for anchored expression *
2084 *************************************************/
2085
2086 /* Try to find out if this is an anchored regular expression. Consider each
2087 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2088 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2089 it's anchored. However, if this is a multiline pattern, then only OP_SOD
2090 counts, since OP_CIRC can match in the middle.
2091
2092 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
2093 because that will try the rest of the pattern at all possible matching points,
2094 so there is no point trying them again.
2095
2096 Arguments:
2097 code points to start of expression (the bracket)
2098 options points to the options setting
2099
2100 Returns: TRUE or FALSE
2101 */
2102
2103 static BOOL
2104 is_anchored(register const uschar *code, int *options)
2105 {
2106 do {
2107 const uschar *scode = first_significant_code(code + 3, options,
2108 PCRE_MULTILINE, FALSE);
2109 register int op = *scode;
2110 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2111 { if (!is_anchored(scode, options)) return FALSE; }
2112 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
2113 (*options & PCRE_DOTALL) != 0)
2114 { if (scode[1] != OP_ANY) return FALSE; }
2115 else if (op != OP_SOD &&
2116 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
2117 return FALSE;
2118 code += (code[1] << 8) + code[2];
2119 }
2120 while (*code == OP_ALT);
2121 return TRUE;
2122 }
2123
2124
2125
2126 /*************************************************
2127 * Check for starting with ^ or .* *
2128 *************************************************/
2129
2130 /* This is called to find out if every branch starts with ^ or .* so that
2131 "first char" processing can be done to speed things up in multiline
2132 matching and for non-DOTALL patterns that start with .* (which must start at
2133 the beginning or after \n).
2134
2135 Argument: points to start of expression (the bracket)
2136 Returns: TRUE or FALSE
2137 */
2138
2139 static BOOL
2140 is_startline(const uschar *code)
2141 {
2142 do {
2143 const uschar *scode = first_significant_code(code + 3, NULL, 0, FALSE);
2144 register int op = *scode;
2145 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2146 { if (!is_startline(scode)) return FALSE; }
2147 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2148 { if (scode[1] != OP_ANY) return FALSE; }
2149 else if (op != OP_CIRC) return FALSE;
2150 code += (code[1] << 8) + code[2];
2151 }
2152 while (*code == OP_ALT);
2153 return TRUE;
2154 }
2155
2156
2157
2158 /*************************************************
2159 * Check for fixed first char *
2160 *************************************************/
2161
2162 /* Try to find out if there is a fixed first character. This is called for
2163 unanchored expressions, as it speeds up their processing quite considerably.
2164 Consider each alternative branch. If they all start with the same char, or with
2165 a bracket all of whose alternatives start with the same char (recurse ad lib),
2166 then we return that char, otherwise -1.
2167
2168 Arguments:
2169 code points to start of expression (the bracket)
2170 options pointer to the options (used to check casing changes)
2171
2172 Returns: -1 or the fixed first char
2173 */
2174
2175 static int
2176 find_firstchar(const uschar *code, int *options)
2177 {
2178 register int c = -1;
2179 do {
2180 int d;
2181 const uschar *scode = first_significant_code(code + 3, options,
2182 PCRE_CASELESS, TRUE);
2183 register int op = *scode;
2184
2185 if (op >= OP_BRA) op = OP_BRA;
2186
2187 switch(op)
2188 {
2189 default:
2190 return -1;
2191
2192 case OP_BRA:
2193 case OP_ASSERT:
2194 case OP_ONCE:
2195 case OP_COND:
2196 if ((d = find_firstchar(scode, options)) < 0) return -1;
2197 if (c < 0) c = d; else if (c != d) return -1;
2198 break;
2199
2200 case OP_EXACT: /* Fall through */
2201 scode++;
2202
2203 case OP_CHARS: /* Fall through */
2204 scode++;
2205
2206 case OP_PLUS:
2207 case OP_MINPLUS:
2208 if (c < 0) c = scode[1]; else if (c != scode[1]) return -1;
2209 break;
2210 }
2211
2212 code += (code[1] << 8) + code[2];
2213 }
2214 while (*code == OP_ALT);
2215 return c;
2216 }
2217
2218
2219
2220
2221
2222 /*************************************************
2223 * Compile a Regular Expression *
2224 *************************************************/
2225
2226 /* This function takes a string and returns a pointer to a block of store
2227 holding a compiled version of the expression.
2228
2229 Arguments:
2230 pattern the regular expression
2231 options various option bits
2232 errorptr pointer to pointer to error text
2233 erroroffset ptr offset in pattern where error was detected
2234 tables pointer to character tables or NULL
2235
2236 Returns: pointer to compiled data block, or NULL on error,
2237 with errorptr and erroroffset set
2238 */
2239
2240 pcre *
2241 pcre_compile(const char *pattern, int options, const char **errorptr,
2242 int *erroroffset, const unsigned char *tables)
2243 {
2244 real_pcre *re;
2245 int length = 3; /* For initial BRA plus length */
2246 int runlength;
2247 int c, reqchar, countlits;
2248 int bracount = 0;
2249 int top_backref = 0;
2250 int branch_extra = 0;
2251 int branch_newextra;
2252 unsigned int brastackptr = 0;
2253 size_t size;
2254 uschar *code;
2255 const uschar *ptr;
2256 compile_data compile_block;
2257 int brastack[BRASTACK_SIZE];
2258 uschar bralenstack[BRASTACK_SIZE];
2259
2260 #ifdef DEBUG
2261 uschar *code_base, *code_end;
2262 #endif
2263
2264 /* We can't pass back an error message if errorptr is NULL; I guess the best we
2265 can do is just return NULL. */
2266
2267 if (errorptr == NULL) return NULL;
2268 *errorptr = NULL;
2269
2270 /* However, we can give a message for this error */
2271
2272 if (erroroffset == NULL)
2273 {
2274 *errorptr = ERR16;
2275 return NULL;
2276 }
2277 *erroroffset = 0;
2278
2279 if ((options & ~PUBLIC_OPTIONS) != 0)
2280 {
2281 *errorptr = ERR17;
2282 return NULL;
2283 }
2284
2285 /* Set up pointers to the individual character tables */
2286
2287 if (tables == NULL) tables = pcre_default_tables;
2288 compile_block.lcc = tables + lcc_offset;
2289 compile_block.fcc = tables + fcc_offset;
2290 compile_block.cbits = tables + cbits_offset;
2291 compile_block.ctypes = tables + ctypes_offset;
2292
2293 /* Reflect pattern for debugging output */
2294
2295 DPRINTF(("------------------------------------------------------------------\n"));
2296 DPRINTF(("%s\n", pattern));
2297
2298 /* The first thing to do is to make a pass over the pattern to compute the
2299 amount of store required to hold the compiled code. This does not have to be
2300 perfect as long as errors are overestimates. At the same time we can detect any
2301 internal flag settings. Make an attempt to correct for any counted white space
2302 if an "extended" flag setting appears late in the pattern. We can't be so
2303 clever for #-comments. */
2304
2305 ptr = (const uschar *)(pattern - 1);
2306 while ((c = *(++ptr)) != 0)
2307 {
2308 int min, max;
2309 int class_charcount;
2310
2311 if ((options & PCRE_EXTENDED) != 0)
2312 {
2313 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2314 if (c == '#')
2315 {
2316 while ((c = *(++ptr)) != 0 && c != '\n');
2317 continue;
2318 }
2319 }
2320
2321 switch(c)
2322 {
2323 /* A backslashed item may be an escaped "normal" character or a
2324 character type. For a "normal" character, put the pointers and
2325 character back so that tests for whitespace etc. in the input
2326 are done correctly. */
2327
2328 case '\\':
2329 {
2330 const uschar *save_ptr = ptr;
2331 c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
2332 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2333 if (c >= 0)
2334 {
2335 ptr = save_ptr;
2336 c = '\\';
2337 goto NORMAL_CHAR;
2338 }
2339 }
2340 length++;
2341
2342 /* A back reference needs an additional char, plus either one or 5
2343 bytes for a repeat. We also need to keep the value of the highest
2344 back reference. */
2345
2346 if (c <= -ESC_REF)
2347 {
2348 int refnum = -c - ESC_REF;
2349 if (refnum > top_backref) top_backref = refnum;
2350 length++; /* For single back reference */
2351 if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2352 {
2353 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2354 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2355 if ((min == 0 && (max == 1 || max == -1)) ||
2356 (min == 1 && max == -1))
2357 length++;
2358 else length += 5;
2359 if (ptr[1] == '?') ptr++;
2360 }
2361 }
2362 continue;
2363
2364 case '^':
2365 case '.':
2366 case '$':
2367 case '*': /* These repeats won't be after brackets; */
2368 case '+': /* those are handled separately */
2369 case '?':
2370 length++;
2371 continue;
2372
2373 /* This covers the cases of repeats after a single char, metachar, class,
2374 or back reference. */
2375
2376 case '{':
2377 if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
2378 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
2379 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2380 if ((min == 0 && (max == 1 || max == -1)) ||
2381 (min == 1 && max == -1))
2382 length++;
2383 else
2384 {
2385 length--; /* Uncount the original char or metachar */
2386 if (min == 1) length++; else if (min > 0) length += 4;
2387 if (max > 0) length += 4; else length += 2;
2388 }
2389 if (ptr[1] == '?') ptr++;
2390 continue;
2391
2392 /* An alternation contains an offset to the next branch or ket. If any ims
2393 options changed in the previous branch(es), and/or if we are in a
2394 lookbehind assertion, extra space will be needed at the start of the
2395 branch. This is handled by branch_extra. */
2396
2397 case '|':
2398 length += 3 + branch_extra;
2399 continue;
2400
2401 /* A character class uses 33 characters. Don't worry about character types
2402 that aren't allowed in classes - they'll get picked up during the compile.
2403 A character class that contains only one character uses 2 or 3 bytes,
2404 depending on whether it is negated or not. Notice this where we can. */
2405
2406 case '[':
2407 class_charcount = 0;
2408 if (*(++ptr) == '^') ptr++;
2409 do
2410 {
2411 if (*ptr == '\\')
2412 {
2413 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
2414 &compile_block);
2415 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2416 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2417 }
2418 else class_charcount++;
2419 ptr++;
2420 }
2421 while (*ptr != 0 && *ptr != ']');
2422
2423 /* Repeats for negated single chars are handled by the general code */
2424
2425 if (class_charcount == 1) length += 3; else
2426 {
2427 length += 33;
2428
2429 /* A repeat needs either 1 or 5 bytes. */
2430
2431 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2432 {
2433 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2434 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2435 if ((min == 0 && (max == 1 || max == -1)) ||
2436 (min == 1 && max == -1))
2437 length++;
2438 else length += 5;
2439 if (ptr[1] == '?') ptr++;
2440 }
2441 }
2442 continue;
2443
2444 /* Brackets may be genuine groups or special things */
2445
2446 case '(':
2447 branch_newextra = 0;
2448
2449 /* Handle special forms of bracket, which all start (? */
2450
2451 if (ptr[1] == '?')
2452 {
2453 int set, unset;
2454 int *optset;
2455
2456 switch (c = ptr[2])
2457 {
2458 /* Skip over comments entirely */
2459 case '#':
2460 ptr += 3;
2461 while (*ptr != 0 && *ptr != ')') ptr++;
2462 if (*ptr == 0)
2463 {
2464 *errorptr = ERR18;
2465 goto PCRE_ERROR_RETURN;
2466 }
2467 continue;
2468
2469 /* Non-referencing groups and lookaheads just move the pointer on, and
2470 then behave like a non-special bracket, except that they don't increment
2471 the count of extracting brackets. Ditto for the "once only" bracket,
2472 which is in Perl from version 5.005. */
2473
2474 case ':':
2475 case '=':
2476 case '!':
2477 case '>':
2478 ptr += 2;
2479 break;
2480
2481 /* A recursive call to the regex is an extension, to provide the
2482 facility which can be obtained by $(?p{perl-code}) in Perl 5.6. */
2483
2484 case 'R':
2485 if (ptr[3] != ')')
2486 {
2487 *errorptr = ERR29;
2488 goto PCRE_ERROR_RETURN;
2489 }
2490 ptr += 3;
2491 length += 1;
2492 break;
2493
2494 /* Lookbehinds are in Perl from version 5.005 */
2495
2496 case '<':
2497 if (ptr[3] == '=' || ptr[3] == '!')
2498 {
2499 ptr += 3;
2500 branch_newextra = 3;
2501 length += 3; /* For the first branch */
2502 break;
2503 }
2504 *errorptr = ERR24;
2505 goto PCRE_ERROR_RETURN;
2506
2507 /* Conditionals are in Perl from version 5.005. The bracket must either
2508 be followed by a number (for bracket reference) or by an assertion
2509 group. */
2510
2511 case '(':
2512 if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
2513 {
2514 ptr += 4;
2515 length += 2;
2516 while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
2517 if (*ptr != ')')
2518 {
2519 *errorptr = ERR26;
2520 goto PCRE_ERROR_RETURN;
2521 }
2522 }
2523 else /* An assertion must follow */
2524 {
2525 ptr++; /* Can treat like ':' as far as spacing is concerned */
2526
2527 if (ptr[2] != '?' || strchr("=!<", ptr[3]) == NULL)
2528 {
2529 ptr += 2; /* To get right offset in message */
2530 *errorptr = ERR28;
2531 goto PCRE_ERROR_RETURN;
2532 }
2533 }
2534 break;
2535
2536 /* Else loop checking valid options until ) is met. Anything else is an
2537 error. If we are without any brackets, i.e. at top level, the settings
2538 act as if specified in the options, so massage the options immediately.
2539 This is for backward compatibility with Perl 5.004. */
2540
2541 default:
2542 set = unset = 0;
2543 optset = &set;
2544 ptr += 2;
2545
2546 for (;; ptr++)
2547 {
2548 c = *ptr;
2549 switch (c)
2550 {
2551 case 'i':
2552 *optset |= PCRE_CASELESS;
2553 continue;
2554
2555 case 'm':
2556 *optset |= PCRE_MULTILINE;
2557 continue;
2558
2559 case 's':
2560 *optset |= PCRE_DOTALL;
2561 continue;
2562
2563 case 'x':
2564 *optset |= PCRE_EXTENDED;
2565 continue;
2566
2567 case 'X':
2568 *optset |= PCRE_EXTRA;
2569 continue;
2570
2571 case 'U':
2572 *optset |= PCRE_UNGREEDY;
2573 continue;
2574
2575 case '-':
2576 optset = &unset;
2577 continue;
2578
2579 /* A termination by ')' indicates an options-setting-only item;
2580 this is global at top level; otherwise nothing is done here and
2581 it is handled during the compiling process on a per-bracket-group
2582 basis. */
2583
2584 case ')':
2585 if (brastackptr == 0)
2586 {
2587 options = (options | set) & (~unset);
2588 set = unset = 0; /* To save length */
2589 }
2590 /* Fall through */
2591
2592 /* A termination by ':' indicates the start of a nested group with
2593 the given options set. This is again handled at compile time, but
2594 we must allow for compiled space if any of the ims options are
2595 set. We also have to allow for resetting space at the end of
2596 the group, which is why 4 is added to the length and not just 2.
2597 If there are several changes of options within the same group, this
2598 will lead to an over-estimate on the length, but this shouldn't
2599 matter very much. We also have to allow for resetting options at
2600 the start of any alternations, which we do by setting
2601 branch_newextra to 2. Finally, we record whether the case-dependent
2602 flag ever changes within the regex. This is used by the "required
2603 character" code. */
2604
2605 case ':':
2606 if (((set|unset) & PCRE_IMS) != 0)
2607 {
2608 length += 4;
2609 branch_newextra = 2;
2610 if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
2611 }
2612 goto END_OPTIONS;
2613
2614 /* Unrecognized option character */
2615
2616 default:
2617 *errorptr = ERR12;
2618 goto PCRE_ERROR_RETURN;
2619 }
2620 }
2621
2622 /* If we hit a closing bracket, that's it - this is a freestanding
2623 option-setting. We need to ensure that branch_extra is updated if
2624 necessary. The only values branch_newextra can have here are 0 or 2.
2625 If the value is 2, then branch_extra must either be 2 or 5, depending
2626 on whether this is a lookbehind group or not. */
2627
2628 END_OPTIONS:
2629 if (c == ')')
2630 {
2631 if (branch_newextra == 2 && (branch_extra == 0 || branch_extra == 3))
2632 branch_extra += branch_newextra;
2633 continue;
2634 }
2635
2636 /* If options were terminated by ':' control comes here. Fall through
2637 to handle the group below. */
2638 }
2639 }
2640
2641 /* Extracting brackets must be counted so we can process escapes in a
2642 Perlish way. */
2643
2644 else bracount++;
2645
2646 /* Non-special forms of bracket. Save length for computing whole length
2647 at end if there's a repeat that requires duplication of the group. Also
2648 save the current value of branch_extra, and start the new group with
2649 the new value. If non-zero, this will either be 2 for a (?imsx: group, or 3
2650 for a lookbehind assertion. */
2651
2652 if (brastackptr >= sizeof(brastack)/sizeof(int))
2653 {
2654 *errorptr = ERR19;
2655 goto PCRE_ERROR_RETURN;
2656 }
2657
2658 bralenstack[brastackptr] = branch_extra;
2659 branch_extra = branch_newextra;
2660
2661 brastack[brastackptr++] = length;
2662 length += 3;
2663 continue;
2664
2665 /* Handle ket. Look for subsequent max/min; for certain sets of values we
2666 have to replicate this bracket up to that many times. If brastackptr is
2667 0 this is an unmatched bracket which will generate an error, but take care
2668 not to try to access brastack[-1] when computing the length and restoring
2669 the branch_extra value. */
2670
2671 case ')':
2672 length += 3;
2673 {
2674 int minval = 1;
2675 int maxval = 1;
2676 int duplength;
2677
2678 if (brastackptr > 0)
2679 {
2680 duplength = length - brastack[--brastackptr];
2681 branch_extra = bralenstack[brastackptr];
2682 }
2683 else duplength = 0;
2684
2685 /* Leave ptr at the final char; for read_repeat_counts this happens
2686 automatically; for the others we need an increment. */
2687
2688 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
2689 {
2690 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr,
2691 &compile_block);
2692 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2693 }
2694 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2695 else if (c == '+') { maxval = -1; ptr++; }
2696 else if (c == '?') { minval = 0; ptr++; }
2697
2698 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2699 group, and if the maximum is greater than zero, we have to replicate
2700 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2701 bracket set - hence the 7. */
2702
2703 if (minval == 0)
2704 {
2705 length++;
2706 if (maxval > 0) length += (maxval - 1) * (duplength + 7);
2707 }
2708
2709 /* When the minimum is greater than zero, 1 we have to replicate up to
2710 minval-1 times, with no additions required in the copies. Then, if
2711 there is a limited maximum we have to replicate up to maxval-1 times
2712 allowing for a BRAZERO item before each optional copy and nesting
2713 brackets for all but one of the optional copies. */
2714
2715 else
2716 {
2717 length += (minval - 1) * duplength;
2718 if (maxval > minval) /* Need this test as maxval=-1 means no limit */
2719 length += (maxval - minval) * (duplength + 7) - 6;
2720 }
2721 }
2722 continue;
2723
2724 /* Non-special character. For a run of such characters the length required
2725 is the number of characters + 2, except that the maximum run length is 255.
2726 We won't get a skipped space or a non-data escape or the start of a #
2727 comment as the first character, so the length can't be zero. */
2728
2729 NORMAL_CHAR:
2730 default:
2731 length += 2;
2732 runlength = 0;
2733 do
2734 {
2735 if ((options & PCRE_EXTENDED) != 0)
2736 {
2737 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2738 if (c == '#')
2739 {
2740 while ((c = *(++ptr)) != 0 && c != '\n');
2741 continue;
2742 }
2743 }
2744
2745 /* Backslash may introduce a data char or a metacharacter; stop the
2746 string before the latter. */
2747
2748 if (c == '\\')
2749 {
2750 const uschar *saveptr = ptr;
2751 c = check_escape(&ptr, errorptr, bracount, options, FALSE,
2752 &compile_block);
2753 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2754 if (c < 0) { ptr = saveptr; break; }
2755 }
2756
2757 /* Ordinary character or single-char escape */
2758
2759 runlength++;
2760 }
2761
2762 /* This "while" is the end of the "do" above. */
2763
2764 while (runlength < 255 &&
2765 (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
2766
2767 ptr--;
2768 length += runlength;
2769 continue;
2770 }
2771 }
2772
2773 length += 4; /* For final KET and END */
2774
2775 if (length > 65539)
2776 {
2777 *errorptr = ERR20;
2778 return NULL;
2779 }
2780
2781 /* Compute the size of data block needed and get it, either from malloc or
2782 externally provided function. We specify "code[0]" in the offsetof() expression
2783 rather than just "code", because it has been reported that one broken compiler
2784 fails on "code" because it is also an independent variable. It should make no
2785 difference to the value of the offsetof(). */
2786
2787 size = length + offsetof(real_pcre, code[0]);
2788 re = (real_pcre *)(pcre_malloc)(size);
2789
2790 if (re == NULL)
2791 {
2792 *errorptr = ERR21;
2793 return NULL;
2794 }
2795
2796 /* Put in the magic number, and save the size, options, and table pointer */
2797
2798 re->magic_number = MAGIC_NUMBER;
2799 re->size = size;
2800 re->options = options;
2801 re->tables = tables;
2802
2803 /* Set up a starting, non-extracting bracket, then compile the expression. On
2804 error, *errorptr will be set non-NULL, so we don't need to look at the result
2805 of the function here. */
2806
2807 ptr = (const uschar *)pattern;
2808 code = re->code;
2809 *code = OP_BRA;
2810 bracount = 0;
2811 (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,
2812 &reqchar, &countlits, &compile_block);
2813 re->top_bracket = bracount;
2814 re->top_backref = top_backref;
2815
2816 /* If not reached end of pattern on success, there's an excess bracket. */
2817
2818 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
2819
2820 /* Fill in the terminating state and check for disastrous overflow, but
2821 if debugging, leave the test till after things are printed out. */
2822
2823 *code++ = OP_END;
2824
2825 #ifndef DEBUG
2826 if (code - re->code > length) *errorptr = ERR23;
2827 #endif
2828
2829 /* Give an error if there's back reference to a non-existent capturing
2830 subpattern. */
2831
2832 if (top_backref > re->top_bracket) *errorptr = ERR15;
2833
2834 /* Failed to compile */
2835
2836 if (*errorptr != NULL)
2837 {
2838 (pcre_free)(re);
2839 PCRE_ERROR_RETURN:
2840 *erroroffset = ptr - (const uschar *)pattern;
2841 return NULL;
2842 }
2843
2844 /* If the anchored option was not passed, set flag if we can determine that the
2845 pattern is anchored by virtue of ^ characters or \A or anything else (such as
2846 starting with .* when DOTALL is set).
2847
2848 Otherwise, see if we can determine what the first character has to be, because
2849 that speeds up unanchored matches no end. If not, see if we can set the
2850 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
2851 start with ^. and also when all branches start with .* for non-DOTALL matches.
2852 */
2853
2854 if ((options & PCRE_ANCHORED) == 0)
2855 {
2856 int temp_options = options;
2857 if (is_anchored(re->code, &temp_options))
2858 re->options |= PCRE_ANCHORED;
2859 else
2860 {
2861 int ch = find_firstchar(re->code, &temp_options);
2862 if (ch >= 0)
2863 {
2864 re->first_char = ch;
2865 re->options |= PCRE_FIRSTSET;
2866 }
2867 else if (is_startline(re->code))
2868 re->options |= PCRE_STARTLINE;
2869 }
2870 }
2871
2872 /* Save the last required character if there are at least two literal
2873 characters on all paths, or if there is no first character setting. */
2874
2875 if (reqchar >= 0 && (countlits > 1 || (re->options & PCRE_FIRSTSET) == 0))
2876 {
2877 re->req_char = reqchar;
2878 re->options |= PCRE_REQCHSET;
2879 }
2880
2881 /* Print out the compiled data for debugging */
2882
2883 #ifdef DEBUG
2884
2885 printf("Length = %d top_bracket = %d top_backref = %d\n",
2886 length, re->top_bracket, re->top_backref);
2887
2888 if (re->options != 0)
2889 {
2890 printf("%s%s%s%s%s%s%s%s%s\n",
2891 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2892 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2893 ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
2894 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2895 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2896 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
2897 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
2898 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
2899 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
2900 }
2901
2902 if ((re->options & PCRE_FIRSTSET) != 0)
2903 {
2904 if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
2905 else printf("First char = \\x%02x\n", re->first_char);
2906 }
2907
2908 if ((re->options & PCRE_REQCHSET) != 0)
2909 {
2910 if (isprint(re->req_char)) printf("Req char = %c\n", re->req_char);
2911 else printf("Req char = \\x%02x\n", re->req_char);
2912 }
2913
2914 code_end = code;
2915 code_base = code = re->code;
2916
2917 while (code < code_end)
2918 {
2919 int charlength;
2920
2921 printf("%3d ", code - code_base);
2922
2923 if (*code >= OP_BRA)
2924 {
2925 printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
2926 code += 2;
2927 }
2928
2929 else switch(*code)
2930 {
2931 case OP_OPT:
2932 printf(" %.2x %s", code[1], OP_names[*code]);
2933 code++;
2934 break;
2935
2936 case OP_COND:
2937 printf("%3d Cond", (code[1] << 8) + code[2]);
2938 code += 2;
2939 break;
2940
2941 case OP_CREF:
2942 printf(" %.2d %s", code[1], OP_names[*code]);
2943 code++;
2944 break;
2945
2946 case OP_CHARS:
2947 charlength = *(++code);
2948 printf("%3d ", charlength);
2949 while (charlength-- > 0)
2950 if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2951 break;
2952
2953 case OP_KETRMAX:
2954 case OP_KETRMIN:
2955 case OP_ALT:
2956 case OP_KET:
2957 case OP_ASSERT:
2958 case OP_ASSERT_NOT:
2959 case OP_ASSERTBACK:
2960 case OP_ASSERTBACK_NOT:
2961 case OP_ONCE:
2962 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2963 code += 2;
2964 break;
2965
2966 case OP_REVERSE:
2967 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2968 code += 2;
2969 break;
2970
2971 case OP_STAR:
2972 case OP_MINSTAR:
2973 case OP_PLUS:
2974 case OP_MINPLUS:
2975 case OP_QUERY:
2976 case OP_MINQUERY:
2977 case OP_TYPESTAR:
2978 case OP_TYPEMINSTAR:
2979 case OP_TYPEPLUS:
2980 case OP_TYPEMINPLUS:
2981 case OP_TYPEQUERY:
2982 case OP_TYPEMINQUERY:
2983 if (*code >= OP_TYPESTAR)
2984 printf(" %s", OP_names[code[1]]);
2985 else if (isprint(c = code[1])) printf(" %c", c);
2986 else printf(" \\x%02x", c);
2987 printf("%s", OP_names[*code++]);
2988 break;
2989
2990 case OP_EXACT:
2991 case OP_UPTO:
2992 case OP_MINUPTO:
2993 if (isprint(c = code[3])) printf(" %c{", c);
2994 else printf(" \\x%02x{", c);
2995 if (*code != OP_EXACT) printf("0,");
2996 printf("%d}", (code[1] << 8) + code[2]);
2997 if (*code == OP_MINUPTO) printf("?");
2998 code += 3;
2999 break;
3000
3001 case OP_TYPEEXACT:
3002 case OP_TYPEUPTO:
3003 case OP_TYPEMINUPTO:
3004 printf(" %s{", OP_names[code[3]]);
3005 if (*code != OP_TYPEEXACT) printf(",");
3006 printf("%d}", (code[1] << 8) + code[2]);
3007 if (*code == OP_TYPEMINUPTO) printf("?");
3008 code += 3;
3009 break;
3010
3011 case OP_NOT:
3012 if (isprint(c = *(++code))) printf(" [^%c]", c);
3013 else printf(" [^\\x%02x]", c);
3014 break;
3015
3016 case OP_NOTSTAR:
3017 case OP_NOTMINSTAR:
3018 case OP_NOTPLUS:
3019 case OP_NOTMINPLUS:
3020 case OP_NOTQUERY:
3021 case OP_NOTMINQUERY:
3022 if (isprint(c = code[1])) printf(" [^%c]", c);
3023 else printf(" [^\\x%02x]", c);
3024 printf("%s", OP_names[*code++]);
3025 break;
3026
3027 case OP_NOTEXACT:
3028 case OP_NOTUPTO:
3029 case OP_NOTMINUPTO:
3030 if (isprint(c = code[3])) printf(" [^%c]{", c);
3031 else printf(" [^\\x%02x]{", c);
3032 if (*code != OP_NOTEXACT) printf(",");
3033 printf("%d}", (code[1] << 8) + code[2]);
3034 if (*code == OP_NOTMINUPTO) printf("?");
3035 code += 3;
3036 break;
3037
3038 case OP_REF:
3039 printf(" \\%d", *(++code));
3040 code ++;
3041 goto CLASS_REF_REPEAT;
3042
3043 case OP_CLASS:
3044 {
3045 int i, min, max;
3046 code++;
3047 printf(" [");
3048
3049 for (i = 0; i < 256; i++)
3050 {
3051 if ((code[i/8] & (1 << (i&7))) != 0)
3052 {
3053 int j;
3054 for (j = i+1; j < 256; j++)
3055 if ((code[j/8] & (1 << (j&7))) == 0) break;
3056 if (i == '-' || i == ']') printf("\\");
3057 if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
3058 if (--j > i)
3059 {
3060 printf("-");
3061 if (j == '-' || j == ']') printf("\\");
3062 if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
3063 }
3064 i = j;
3065 }
3066 }
3067 printf("]");
3068 code += 32;
3069
3070 CLASS_REF_REPEAT:
3071
3072 switch(*code)
3073 {
3074 case OP_CRSTAR:
3075 case OP_CRMINSTAR:
3076 case OP_CRPLUS:
3077 case OP_CRMINPLUS:
3078 case OP_CRQUERY:
3079 case OP_CRMINQUERY:
3080 printf("%s", OP_names[*code]);
3081 break;
3082
3083 case OP_CRRANGE:
3084 case OP_CRMINRANGE:
3085 min = (code[1] << 8) + code[2];
3086 max = (code[3] << 8) + code[4];
3087 if (max == 0) printf("{%d,}", min);
3088 else printf("{%d,%d}", min, max);
3089 if (*code == OP_CRMINRANGE) printf("?");
3090 code += 4;
3091 break;
3092
3093 default:
3094 code--;
3095 }
3096 }
3097 break;
3098
3099 /* Anything else is just a one-node item */
3100
3101 default:
3102 printf(" %s", OP_names[*code]);
3103 break;
3104 }
3105
3106 code++;
3107 printf("\n");
3108 }
3109 printf("------------------------------------------------------------------\n");
3110
3111 /* This check is done here in the debugging case so that the code that
3112 was compiled can be seen. */
3113
3114 if (code - re->code > length)
3115 {
3116 *errorptr = ERR23;
3117 (pcre_free)(re);
3118 *erroroffset = ptr - (uschar *)pattern;
3119 return NULL;
3120 }
3121 #endif
3122
3123 return (pcre *)re;
3124 }
3125
3126
3127
3128 /*************************************************
3129 * Match a back-reference *
3130 *************************************************/
3131
3132 /* If a back reference hasn't been set, the length that is passed is greater
3133 than the number of characters left in the string, so the match fails.
3134
3135 Arguments:
3136 offset index into the offset vector
3137 eptr points into the subject
3138 length length to be matched
3139 md points to match data block
3140 ims the ims flags
3141
3142 Returns: TRUE if matched
3143 */
3144
3145 static BOOL
3146 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
3147 unsigned long int ims)
3148 {
3149 const uschar *p = md->start_subject + md->offset_vector[offset];
3150
3151 #ifdef DEBUG
3152 if (eptr >= md->end_subject)
3153 printf("matching subject <null>");
3154 else
3155 {
3156 printf("matching subject ");
3157 pchars(eptr, length, TRUE, md);
3158 }
3159 printf(" against backref ");
3160 pchars(p, length, FALSE, md);
3161 printf("\n");
3162 #endif
3163
3164 /* Always fail if not enough characters left */
3165
3166 if (length > md->end_subject - eptr) return FALSE;
3167
3168 /* Separate the caselesss case for speed */
3169
3170 if ((ims & PCRE_CASELESS) != 0)
3171 {
3172 while (length-- > 0)
3173 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
3174 }
3175 else
3176 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
3177
3178 return TRUE;
3179 }
3180
3181
3182
3183 /*************************************************
3184 * Match from current position *
3185 *************************************************/
3186
3187 /* On entry ecode points to the first opcode, and eptr to the first character
3188 in the subject string, while eptrb holds the value of eptr at the start of the
3189 last bracketed group - used for breaking infinite loops matching zero-length
3190 strings.
3191
3192 Arguments:
3193 eptr pointer in subject
3194 ecode position in code
3195 offset_top current top pointer
3196 md pointer to "static" info for the match
3197 ims current /i, /m, and /s options
3198 condassert TRUE if called to check a condition assertion
3199 eptrb eptr at start of last bracket
3200
3201 Returns: TRUE if matched
3202 */
3203
3204 static BOOL
3205 match(register const uschar *eptr, register const uschar *ecode,
3206 int offset_top, match_data *md, unsigned long int ims, BOOL condassert,
3207 const uschar *eptrb)
3208 {
3209 unsigned long int original_ims = ims; /* Save for resetting on ')' */
3210
3211 for (;;)
3212 {
3213 int op = (int)*ecode;
3214 int min, max, ctype;
3215 register int i;
3216 register int c;
3217 BOOL minimize = FALSE;
3218
3219 /* Opening capturing bracket. If there is space in the offset vector, save
3220 the current subject position in the working slot at the top of the vector. We
3221 mustn't change the current values of the data slot, because they may be set
3222 from a previous iteration of this group, and be referred to by a reference
3223 inside the group.
3224
3225 If the bracket fails to match, we need to restore this value and also the
3226 values of the final offsets, in case they were set by a previous iteration of
3227 the same bracket.
3228
3229 If there isn't enough space in the offset vector, treat this as if it were a
3230 non-capturing bracket. Don't worry about setting the flag for the error case
3231 here; that is handled in the code for KET. */
3232
3233 if (op > OP_BRA)
3234 {
3235 int number = op - OP_BRA;
3236 int offset = number << 1;
3237
3238 #ifdef DEBUG
3239 printf("start bracket %d subject=", number);
3240 pchars(eptr, 16, TRUE, md);
3241 printf("\n");
3242 #endif
3243
3244 if (offset < md->offset_max)
3245 {
3246 int save_offset1 = md->offset_vector[offset];
3247 int save_offset2 = md->offset_vector[offset+1];
3248 int save_offset3 = md->offset_vector[md->offset_end - number];
3249
3250 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
3251 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
3252
3253 do
3254 {
3255 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3256 ecode += (ecode[1] << 8) + ecode[2];
3257 }
3258 while (*ecode == OP_ALT);
3259
3260 DPRINTF(("bracket %d failed\n", number));
3261
3262 md->offset_vector[offset] = save_offset1;
3263 md->offset_vector[offset+1] = save_offset2;
3264 md->offset_vector[md->offset_end - number] = save_offset3;
3265 return FALSE;
3266 }
3267
3268 /* Insufficient room for saving captured contents */
3269
3270 else op = OP_BRA;
3271 }
3272
3273 /* Other types of node can be handled by a switch */
3274
3275 switch(op)
3276 {
3277 case OP_BRA: /* Non-capturing bracket: optimized */
3278 DPRINTF(("start bracket 0\n"));
3279 do
3280 {
3281 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3282 ecode += (ecode[1] << 8) + ecode[2];
3283 }
3284 while (*ecode == OP_ALT);
3285 DPRINTF(("bracket 0 failed\n"));
3286 return FALSE;
3287
3288 /* Conditional group: compilation checked that there are no more than
3289 two branches. If the condition is false, skipping the first branch takes us
3290 past the end if there is only one branch, but that's OK because that is
3291 exactly what going to the ket would do. */
3292
3293 case OP_COND:
3294 if (ecode[3] == OP_CREF) /* Condition is extraction test */
3295 {
3296 int offset = ecode[4] << 1; /* Doubled reference number */
3297 return match(eptr,
3298 ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
3299 5 : 3 + (ecode[1] << 8) + ecode[2]),
3300 offset_top, md, ims, FALSE, eptr);
3301 }
3302
3303 /* The condition is an assertion. Call match() to evaluate it - setting
3304 the final argument TRUE causes it to stop at the end of an assertion. */
3305
3306 else
3307 {
3308 if (match(eptr, ecode+3, offset_top, md, ims, TRUE, NULL))
3309 {
3310 ecode += 3 + (ecode[4] << 8) + ecode[5];
3311 while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
3312 }
3313 else ecode += (ecode[1] << 8) + ecode[2];
3314 return match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr);
3315 }
3316 /* Control never reaches here */
3317
3318 /* Skip over conditional reference data if encountered (should not be) */
3319
3320 case OP_CREF:
3321 ecode += 2;
3322 break;
3323
3324 /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
3325 an empty string - recursion will then try other alternatives, if any. */
3326
3327 case OP_END:
3328 if (md->notempty && eptr == md->start_match) return FALSE;
3329 md->end_match_ptr = eptr; /* Record where we ended */
3330 md->end_offset_top = offset_top; /* and how many extracts were taken */
3331 return TRUE;
3332
3333 /* Change option settings */
3334
3335 case OP_OPT:
3336 ims = ecode[1];
3337 ecode += 2;
3338 DPRINTF(("ims set to %02lx\n", ims));
3339 break;
3340
3341 /* Assertion brackets. Check the alternative branches in turn - the
3342 matching won't pass the KET for an assertion. If any one branch matches,
3343 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
3344 start of each branch to move the current point backwards, so the code at
3345 this level is identical to the lookahead case. */
3346
3347 case OP_ASSERT:
3348 case OP_ASSERTBACK:
3349 do
3350 {
3351 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) break;
3352 ecode += (ecode[1] << 8) + ecode[2];
3353 }
3354 while (*ecode == OP_ALT);
3355 if (*ecode == OP_KET) return FALSE;
3356
3357 /* If checking an assertion for a condition, return TRUE. */
3358
3359 if (condassert) return TRUE;
3360
3361 /* Continue from after the assertion, updating the offsets high water
3362 mark, since extracts may have been taken during the assertion. */
3363
3364 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3365 ecode += 3;
3366 offset_top = md->end_offset_top;
3367 continue;
3368
3369 /* Negative assertion: all branches must fail to match */
3370
3371 case OP_ASSERT_NOT:
3372 case OP_ASSERTBACK_NOT:
3373 do
3374 {
3375 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) return FALSE;
3376 ecode += (ecode[1] << 8) + ecode[2];
3377 }
3378 while (*ecode == OP_ALT);
3379
3380 if (condassert) return TRUE;
3381 ecode += 3;
3382 continue;
3383
3384 /* Move the subject pointer back. This occurs only at the start of
3385 each branch of a lookbehind assertion. If we are too close to the start to
3386 move back, this match function fails. */
3387
3388 case OP_REVERSE:
3389 eptr -= (ecode[1] << 8) + ecode[2];
3390 if (eptr < md->start_subject) return FALSE;
3391 ecode += 3;
3392 break;
3393
3394 /* Recursion matches the current regex, nested. If there are any capturing
3395 brackets started but not finished, we have to save their starting points
3396 and reinstate them after the recursion. However, we don't know how many
3397 such there are (offset_top records the completed total) so we just have
3398 to save all the potential data. There may be up to 99 such values, which
3399 is a bit large to put on the stack, but using malloc for small numbers
3400 seems expensive. As a compromise, the stack is used when there are fewer
3401 than 16 values to store; otherwise malloc is used. A problem is what to do
3402 if the malloc fails ... there is no way of returning to the top level with
3403 an error. Save the top 15 values on the stack, and accept that the rest
3404 may be wrong. */
3405
3406 case OP_RECURSE:
3407 {
3408 BOOL rc;
3409 int *save;
3410 int stacksave[15];
3411
3412 c = md->offset_max;
3413
3414 if (c < 16) save = stacksave; else
3415 {
3416 save = (int *)(pcre_malloc)((c+1) * sizeof(int));
3417 if (save == NULL)
3418 {
3419 save = stacksave;
3420 c = 15;
3421 }
3422 }
3423
3424 for (i = 1; i <= c; i++)
3425 save[i] = md->offset_vector[md->offset_end - i];
3426 rc = match(eptr, md->start_pattern, offset_top, md, ims, FALSE, eptrb);
3427 for (i = 1; i <= c; i++)
3428 md->offset_vector[md->offset_end - i] = save[i];
3429 if (save != stacksave) (pcre_free)(save);
3430 if (!rc) return FALSE;
3431
3432 /* In case the recursion has set more capturing values, save the final
3433 number, then move along the subject till after the recursive match,
3434 and advance one byte in the pattern code. */
3435
3436 offset_top = md->end_offset_top;
3437 eptr = md->end_match_ptr;
3438 ecode++;
3439 }
3440 break;
3441
3442 /* "Once" brackets are like assertion brackets except that after a match,
3443 the point in the subject string is not moved back. Thus there can never be
3444 a move back into the brackets. Check the alternative branches in turn - the
3445 matching won't pass the KET for this kind of subpattern. If any one branch
3446 matches, we carry on as at the end of a normal bracket, leaving the subject
3447 pointer. */
3448
3449 case OP_ONCE:
3450 {
3451 const uschar *prev = ecode;
3452
3453 do
3454 {
3455 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) break;
3456 ecode += (ecode[1] << 8) + ecode[2];
3457 }
3458 while (*ecode == OP_ALT);
3459
3460 /* If hit the end of the group (which could be repeated), fail */
3461
3462 if (*ecode != OP_ONCE && *ecode != OP_ALT) return FALSE;
3463
3464 /* Continue as from after the assertion, updating the offsets high water
3465 mark, since extracts may have been taken. */
3466
3467 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3468
3469 offset_top = md->end_offset_top;
3470 eptr = md->end_match_ptr;
3471
3472 /* For a non-repeating ket, just continue at this level. This also
3473 happens for a repeating ket if no characters were matched in the group.
3474 This is the forcible breaking of infinite loops as implemented in Perl
3475 5.005. If there is an options reset, it will get obeyed in the normal
3476 course of events. */
3477
3478 if (*ecode == OP_KET || eptr == eptrb)
3479 {
3480 ecode += 3;
3481 break;
3482 }
3483
3484 /* The repeating kets try the rest of the pattern or restart from the
3485 preceding bracket, in the appropriate order. We need to reset any options
3486 that changed within the bracket before re-running it, so check the next
3487 opcode. */
3488
3489 if (ecode[3] == OP_OPT)
3490 {
3491 ims = (ims & ~PCRE_IMS) | ecode[4];
3492 DPRINTF(("ims set to %02lx at group repeat\n", ims));
3493 }
3494
3495 if (*ecode == OP_KETRMIN)
3496 {
3497 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||
3498 match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;
3499 }
3500 else /* OP_KETRMAX */
3501 {
3502 if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||
3503 match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3504 }
3505 }
3506 return FALSE;
3507
3508 /* An alternation is the end of a branch; scan along to find the end of the
3509 bracketed group and go to there. */
3510
3511 case OP_ALT:
3512 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3513 break;
3514
3515 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3516 that it may occur zero times. It may repeat infinitely, or not at all -
3517 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3518 repeat limits are compiled as a number of copies, with the optional ones
3519 preceded by BRAZERO or BRAMINZERO. */
3520
3521 case OP_BRAZERO:
3522 {
3523 const uschar *next = ecode+1;
3524 if (match(eptr, next, offset_top, md, ims, FALSE, eptr)) return TRUE;
3525 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3526 ecode = next + 3;
3527 }
3528 break;
3529
3530 case OP_BRAMINZERO:
3531 {
3532 const uschar *next = ecode+1;
3533 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3534 if (match(eptr, next+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3535 ecode++;
3536 }
3537 break;
3538
3539 /* End of a group, repeated or non-repeating. If we are at the end of
3540 an assertion "group", stop matching and return TRUE, but record the
3541 current high water mark for use by positive assertions. Do this also
3542 for the "once" (not-backup up) groups. */
3543
3544 case OP_KET:
3545 case OP_KETRMIN:
3546 case OP_KETRMAX:
3547 {
3548 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3549
3550 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
3551 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
3552 *prev == OP_ONCE)
3553 {
3554 md->end_match_ptr = eptr; /* For ONCE */
3555 md->end_offset_top = offset_top;
3556 return TRUE;
3557 }
3558
3559 /* In all other cases except a conditional group we have to check the
3560 group number back at the start and if necessary complete handling an
3561 extraction by setting the offsets and bumping the high water mark. */
3562
3563 if (*prev != OP_COND)
3564 {
3565 int number = *prev - OP_BRA;
3566 int offset = number << 1;
3567
3568 DPRINTF(("end bracket %d\n", number));
3569
3570 if (number > 0)
3571 {
3572 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
3573 {
3574 md->offset_vector[offset] =
3575 md->offset_vector[md->offset_end - number];
3576 md->offset_vector[offset+1] = eptr - md->start_subject;
3577 if (offset_top <= offset) offset_top = offset + 2;
3578 }
3579 }
3580 }
3581
3582 /* Reset the value of the ims flags, in case they got changed during
3583 the group. */
3584
3585 ims = original_ims;
3586 DPRINTF(("ims reset to %02lx\n", ims));
3587
3588 /* For a non-repeating ket, just continue at this level. This also
3589 happens for a repeating ket if no characters were matched in the group.
3590 This is the forcible breaking of infinite loops as implemented in Perl
3591 5.005. If there is an options reset, it will get obeyed in the normal
3592 course of events. */
3593
3594 if (*ecode == OP_KET || eptr == eptrb)
3595 {
3596 ecode += 3;
3597 break;
3598 }
3599
3600 /* The repeating kets try the rest of the pattern or restart from the
3601 preceding bracket, in the appropriate order. */
3602
3603 if (*ecode == OP_KETRMIN)
3604 {
3605 if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||
3606 match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;
3607 }
3608 else /* OP_KETRMAX */
3609 {
3610 if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||
3611 match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3612 }
3613 }
3614 return FALSE;
3615
3616 /* Start of subject unless notbol, or after internal newline if multiline */
3617
3618 case OP_CIRC:
3619 if (md->notbol && eptr == md->start_subject) return FALSE;
3620 if ((ims & PCRE_MULTILINE) != 0)
3621 {
3622 if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;
3623 ecode++;
3624 break;
3625 }
3626 /* ... else fall through */
3627
3628 /* Start of subject assertion */
3629
3630 case OP_SOD:
3631 if (eptr != md->start_subject) return FALSE;
3632 ecode++;
3633 break;
3634
3635 /* Assert before internal newline if multiline, or before a terminating
3636 newline unless endonly is set, else end of subject unless noteol is set. */
3637
3638 case OP_DOLL:
3639 if ((ims & PCRE_MULTILINE) != 0)
3640 {
3641 if (eptr < md->end_subject) { if (*eptr != '\n') return FALSE; }
3642 else { if (md->noteol) return FALSE; }
3643 ecode++;
3644 break;
3645 }
3646 else
3647 {
3648 if (md->noteol) return FALSE;
3649 if (!md->endonly)
3650 {
3651 if (eptr < md->end_subject - 1 ||
3652 (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3653
3654 ecode++;
3655 break;
3656 }
3657 }
3658 /* ... else fall through */
3659
3660 /* End of subject assertion (\z) */
3661
3662 case OP_EOD:
3663 if (eptr < md->end_subject) return FALSE;
3664 ecode++;
3665 break;
3666
3667 /* End of subject or ending \n assertion (\Z) */
3668
3669 case OP_EODN:
3670 if (eptr < md->end_subject - 1 ||
3671 (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3672 ecode++;
3673 break;
3674
3675 /* Word boundary assertions */
3676
3677 case OP_NOT_WORD_BOUNDARY:
3678 case OP_WORD_BOUNDARY:
3679 {
3680 BOOL prev_is_word = (eptr != md->start_subject) &&
3681 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
3682 BOOL cur_is_word = (eptr < md->end_subject) &&
3683 ((md->ctypes[*eptr] & ctype_word) != 0);
3684 if ((*ecode++ == OP_WORD_BOUNDARY)?
3685 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3686 return FALSE;
3687 }
3688 break;
3689
3690 /* Match a single character type; inline for speed */
3691
3692 case OP_ANY:
3693 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == '\n')
3694 return FALSE;
3695 if (eptr++ >= md->end_subject) return FALSE;
3696 ecode++;
3697 break;
3698
3699 case OP_NOT_DIGIT:
3700 if (eptr >= md->end_subject ||
3701 (md->ctypes[*eptr++] & ctype_digit) != 0)
3702 return FALSE;
3703 ecode++;
3704 break;
3705
3706 case OP_DIGIT:
3707 if (eptr >= md->end_subject ||
3708 (md->ctypes[*eptr++] & ctype_digit) == 0)
3709 return FALSE;
3710 ecode++;
3711 break;
3712
3713 case OP_NOT_WHITESPACE:
3714 if (eptr >= md->end_subject ||
3715 (md->ctypes[*eptr++] & ctype_space) != 0)
3716 return FALSE;
3717 ecode++;
3718 break;
3719
3720 case OP_WHITESPACE:
3721 if (eptr >= md->end_subject ||
3722 (md->ctypes[*eptr++] & ctype_space) == 0)
3723 return FALSE;
3724 ecode++;
3725 break;
3726
3727 case OP_NOT_WORDCHAR:
3728 if (eptr >= md->end_subject ||
3729 (md->ctypes[*eptr++] & ctype_word) != 0)
3730 return FALSE;
3731 ecode++;
3732 break;
3733
3734 case OP_WORDCHAR:
3735 if (eptr >= md->end_subject ||
3736 (md->ctypes[*eptr++] & ctype_word) == 0)
3737 return FALSE;
3738 ecode++;
3739 break;
3740
3741 /* Match a back reference, possibly repeatedly. Look past the end of the
3742 item to see if there is repeat information following. The code is similar
3743 to that for character classes, but repeated for efficiency. Then obey
3744 similar code to character type repeats - written out again for speed.
3745 However, if the referenced string is the empty string, always treat
3746 it as matched, any number of times (otherwise there could be infinite
3747 loops). */
3748
3749 case OP_REF:
3750 {
3751 int length;
3752 int offset = ecode[1] << 1; /* Doubled reference number */
3753 ecode += 2; /* Advance past the item */
3754
3755 /* If the reference is unset, set the length to be longer than the amount
3756 of subject left; this ensures that every attempt at a match fails. We
3757 can't just fail here, because of the possibility of quantifiers with zero
3758 minima. */
3759
3760 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
3761 md->end_subject - eptr + 1 :
3762 md->offset_vector[offset+1] - md->offset_vector[offset];
3763
3764 /* Set up for repetition, or handle the non-repeated case */
3765
3766 switch (*ecode)
3767 {
3768 case OP_CRSTAR:
3769 case OP_CRMINSTAR:
3770 case OP_CRPLUS:
3771 case OP_CRMINPLUS:
3772 case OP_CRQUERY:
3773 case OP_CRMINQUERY:
3774 c = *ecode++ - OP_CRSTAR;
3775 minimize = (c & 1) != 0;
3776 min = rep_min[c]; /* Pick up values from tables; */
3777 max = rep_max[c]; /* zero for max => infinity */
3778 if (max == 0) max = INT_MAX;
3779 break;
3780
3781 case OP_CRRANGE:
3782 case OP_CRMINRANGE:
3783 minimize = (*ecode == OP_CRMINRANGE);
3784 min = (ecode[1] << 8) + ecode[2];
3785 max = (ecode[3] << 8) + ecode[4];
3786 if (max == 0) max = INT_MAX;
3787 ecode += 5;
3788 break;
3789
3790 default: /* No repeat follows */
3791 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
3792 eptr += length;
3793 continue; /* With the main loop */
3794 }
3795
3796 /* If the length of the reference is zero, just continue with the
3797 main loop. */
3798
3799 if (length == 0) continue;
3800
3801 /* First, ensure the minimum number of matches are present. We get back
3802 the length of the reference string explicitly rather than passing the
3803 address of eptr, so that eptr can be a register variable. */
3804
3805 for (i = 1; i <= min; i++)
3806 {
3807 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
3808 eptr += length;
3809 }
3810
3811 /* If min = max, continue at the same level without recursion.
3812 They are not both allowed to be zero. */
3813
3814 if (min == max) continue;
3815
3816 /* If minimizing, keep trying and advancing the pointer */
3817
3818 if (minimize)
3819 {
3820 for (i = min;; i++)
3821 {
3822 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3823 return TRUE;
3824 if (i >= max || !match_ref(offset, eptr, length, md, ims))
3825 return FALSE;
3826 eptr += length;
3827 }
3828 /* Control never gets here */
3829 }
3830
3831 /* If maximizing, find the longest string and work backwards */
3832
3833 else
3834 {
3835 const uschar *pp = eptr;
3836 for (i = min; i < max; i++)
3837 {
3838 if (!match_ref(offset, eptr, length, md, ims)) break;
3839 eptr += length;
3840 }
3841 while (eptr >= pp)
3842 {
3843 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3844 return TRUE;
3845 eptr -= length;
3846 }
3847 return FALSE;
3848 }
3849 }
3850 /* Control never gets here */
3851
3852
3853
3854 /* Match a character class, possibly repeatedly. Look past the end of the
3855 item to see if there is repeat information following. Then obey similar
3856 code to character type repeats - written out again for speed. */
3857
3858 case OP_CLASS:
3859 {
3860 const uschar *data = ecode + 1; /* Save for matching */
3861 ecode += 33; /* Advance past the item */
3862
3863 switch (*ecode)
3864 {
3865 case OP_CRSTAR:
3866 case OP_CRMINSTAR:
3867 case OP_CRPLUS:
3868 case OP_CRMINPLUS:
3869 case OP_CRQUERY:
3870 case OP_CRMINQUERY:
3871 c = *ecode++ - OP_CRSTAR;
3872 minimize = (c & 1) != 0;
3873 min = rep_min[c]; /* Pick up values from tables; */
3874 max = rep_max[c]; /* zero for max => infinity */
3875 if (max == 0) max = INT_MAX;
3876 break;
3877
3878 case OP_CRRANGE:
3879 case OP_CRMINRANGE:
3880 minimize = (*ecode == OP_CRMINRANGE);
3881 min = (ecode[1] << 8) + ecode[2];
3882 max = (ecode[3] << 8) + ecode[4];
3883 if (max == 0) max = INT_MAX;
3884 ecode += 5;
3885 break;
3886
3887 default: /* No repeat follows */
3888 min = max = 1;
3889 break;
3890 }
3891
3892 /* First, ensure the minimum number of matches are present. */
3893
3894 for (i = 1; i <= min; i++)
3895 {
3896 if (eptr >= md->end_subject) return FALSE;
3897 c = *eptr++;
3898 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3899 return FALSE;
3900 }
3901
3902 /* If max == min we can continue with the main loop without the
3903 need to recurse. */
3904
3905 if (min == max) continue;
3906
3907 /* If minimizing, keep testing the rest of the expression and advancing
3908 the pointer while it matches the class. */
3909
3910 if (minimize)
3911 {
3912 for (i = min;; i++)
3913 {
3914 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3915 return TRUE;
3916 if (i >= max || eptr >= md->end_subject) return FALSE;
3917 c = *eptr++;
3918 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3919 return FALSE;
3920 }
3921 /* Control never gets here */
3922 }
3923
3924 /* If maximizing, find the longest possible run, then work backwards. */
3925
3926 else
3927 {
3928 const uschar *pp = eptr;
3929 for (i = min; i < max; eptr++, i++)
3930 {
3931 if (eptr >= md->end_subject) break;
3932 c = *eptr;
3933 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3934 break;
3935 }
3936
3937 while (eptr >= pp)
3938 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3939 return TRUE;
3940 return FALSE;
3941 }
3942 }
3943 /* Control never gets here */
3944
3945 /* Match a run of characters */
3946
3947 case OP_CHARS:
3948 {
3949 register int length = ecode[1];
3950 ecode += 2;
3951
3952 #ifdef DEBUG /* Sigh. Some compilers never learn. */
3953 if (eptr >= md->end_subject)
3954 printf("matching subject <null> against pattern ");
3955 else
3956 {
3957 printf("matching subject ");
3958 pchars(eptr, length, TRUE, md);
3959 printf(" against pattern ");
3960 }
3961 pchars(ecode, length, FALSE, md);
3962 printf("\n");
3963 #endif
3964
3965 if (length > md->end_subject - eptr) return FALSE;
3966 if ((ims & PCRE_CASELESS) != 0)
3967 {
3968 while (length-- > 0)
3969 if (md->lcc[*ecode++] != md->lcc[*eptr++])
3970 return FALSE;
3971 }
3972 else
3973 {
3974 while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
3975 }
3976 }
3977 break;
3978
3979 /* Match a single character repeatedly; different opcodes share code. */
3980
3981 case OP_EXACT:
3982 min = max = (ecode[1] << 8) + ecode[2];
3983 ecode += 3;
3984 goto REPEATCHAR;
3985
3986 case OP_UPTO:
3987 case OP_MINUPTO:
3988 min = 0;
3989 max = (ecode[1] << 8) + ecode[2];
3990 minimize = *ecode == OP_MINUPTO;
3991 ecode += 3;
3992 goto REPEATCHAR;
3993
3994 case OP_STAR:
3995 case OP_MINSTAR:
3996 case OP_PLUS:
3997 case OP_MINPLUS:
3998 case OP_QUERY:
3999 case OP_MINQUERY:
4000 c = *ecode++ - OP_STAR;
4001 minimize = (c & 1) != 0;
4002 min = rep_min[c]; /* Pick up values from tables; */
4003 max = rep_max[c]; /* zero for max => infinity */
4004 if (max == 0) max = INT_MAX;
4005
4006 /* Common code for all repeated single-character matches. We can give
4007 up quickly if there are fewer than the minimum number of characters left in
4008 the subject. */
4009
4010 REPEATCHAR:
4011 if (min > md->end_subject - eptr) return FALSE;
4012 c = *ecode++;
4013
4014 /* The code is duplicated for the caseless and caseful cases, for speed,
4015 since matching characters is likely to be quite common. First, ensure the
4016 minimum number of matches are present. If min = max, continue at the same
4017 level without recursing. Otherwise, if minimizing, keep trying the rest of
4018 the expression and advancing one matching character if failing, up to the
4019 maximum. Alternatively, if maximizing, find the maximum number of
4020 characters and work backwards. */
4021
4022 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
4023 max, eptr));
4024
4025 if ((ims & PCRE_CASELESS) != 0)
4026 {
4027 c = md->lcc[c];
4028 for (i = 1; i <= min; i++)
4029 if (c != md->lcc[*eptr++]) return FALSE;
4030 if (min == max) continue;
4031 if (minimize)
4032 {
4033 for (i = min;; i++)
4034 {
4035 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
4036 return TRUE;
4037 if (i >= max || eptr >= md->end_subject ||
4038 c != md->lcc[*eptr++])
4039 return FALSE;
4040 }
4041 /* Control never gets here */
4042 }
4043 else
4044 {
4045 const uschar *pp = eptr;
4046 for (i = min; i < max; i++)
4047 {
4048 if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
4049 eptr++;
4050 }
4051 while (eptr >= pp)
4052 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
4053 return TRUE;
4054 return FALSE;
4055 }
4056 /* Control never gets here */
4057 }
4058
4059 /* Caseful comparisons */
4060
4061 else
4062 {
4063 for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
4064 if (min == max) continue;
4065 if (minimize)
4066 {
4067 for (i = min;; i++)
4068 {
4069 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
4070 return TRUE;
4071 if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
4072 }
4073 /* Control never gets here */
4074 }
4075 else
4076 {
4077 const uschar *pp = eptr;
4078 for (i = min; i < max; i++)
4079 {
4080 if (eptr >= md->end_subject || c != *eptr) break;
4081 eptr++;
4082 }
4083 while (eptr >= pp)
4084 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
4085 return TRUE;
4086 return FALSE;
4087 }
4088 }
4089 /* Control never gets here */
4090
4091 /* Match a negated single character */
4092
4093 case OP_NOT:
4094 if (eptr >= md->end_subject) return FALSE;
4095 ecode++;
4096 if ((ims & PCRE_CASELESS) != 0)
4097 {
4098 if (md->lcc[*ecode++] == md->lcc[*eptr++]) return FALSE;
4099 }
4100 else
4101 {
4102 if (*ecode++ == *eptr++) return FALSE;
4103 }
4104 break;
4105
4106 /* Match a negated single character repeatedly. This is almost a repeat of
4107 the code for a repeated single character, but I haven't found a nice way of
4108 commoning these up that doesn't require a test of the positive/negative
4109 option for each character match. Maybe that wouldn't add very much to the
4110 time taken, but character matching *is* what this is all about... */
4111
4112 case OP_NOTEXACT:
4113 min = max = (ecode[1] << 8) + ecode[2];
4114 ecode += 3;
4115 goto REPEATNOTCHAR;
4116
4117 case OP_NOTUPTO:
4118 case OP_NOTMINUPTO:
4119 min = 0;
4120 max = (ecode[1] << 8) + ecode[2];
4121 minimize = *ecode == OP_NOTMINUPTO;
4122 ecode += 3;
4123 goto REPEATNOTCHAR;
4124
4125 case OP_NOTSTAR:
4126 case OP_NOTMINSTAR:
4127 case OP_NOTPLUS:
4128 case OP_NOTMINPLUS:
4129 case OP_NOTQUERY:
4130 case OP_NOTMINQUERY:
4131 c = *ecode++ - OP_NOTSTAR;
4132 minimize = (c & 1) != 0;
4133 min = rep_min[c]; /* Pick up values from tables; */
4134 max = rep_max[c]; /* zero for max => infinity */
4135 if (max == 0) max = INT_MAX;
4136
4137 /* Common code for all repeated single-character matches. We can give
4138 up quickly if there are fewer than the minimum number of characters left in
4139 the subject. */
4140
4141 REPEATNOTCHAR:
4142 if (min > md->end_subject - eptr) return FALSE;
4143 c = *ecode++;
4144
4145 /* The code is duplicated for the caseless and caseful cases, for speed,
4146 since matching characters is likely to be quite common. First, ensure the
4147 minimum number of matches are present. If min = max, continue at the same
4148 level without recursing. Otherwise, if minimizing, keep trying the rest of
4149 the expression and advancing one matching character if failing, up to the
4150 maximum. Alternatively, if maximizing, find the maximum number of
4151 characters and work backwards. */
4152
4153 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
4154 max, eptr));
4155
4156 if ((ims & PCRE_CASELESS) != 0)
4157 {
4158 c = md->lcc[c];
4159 for (i = 1; i <= min; i++)
4160 if (c == md->lcc[*eptr++]) return FALSE;
4161 if (min == max) continue;
4162 if (minimize)
4163 {
4164 for (i = min;; i++)
4165 {
4166 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
4167 return TRUE;
4168 if (i >= max || eptr >= md->end_subject ||
4169 c == md->lcc[*eptr++])
4170 return FALSE;
4171 }
4172 /* Control never gets here */
4173 }
4174 else
4175 {
4176 const uschar *pp = eptr;
4177 for (i = min; i < max; i++)
4178 {
4179 if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
4180 eptr++;
4181 }
4182 while (eptr >= pp)
4183 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
4184 return TRUE;
4185 return FALSE;
4186 }
4187 /* Control never gets here */
4188 }
4189
4190 /* Caseful comparisons */
4191
4192 else
4193 {
4194 for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
4195 if (min == max) continue;
4196 if (minimize)
4197 {
4198 for (i = min;; i++)
4199 {
4200 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
4201 return TRUE;
4202 if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
4203 }
4204 /* Control never gets here */
4205 }
4206 else
4207 {
4208 const uschar *pp = eptr;
4209 for (i = min; i < max; i++)
4210 {
4211 if (eptr >= md->end_subject || c == *eptr) break;
4212 eptr++;
4213 }
4214 while (eptr >= pp)
4215 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
4216 return TRUE;
4217 return FALSE;
4218 }
4219 }
4220 /* Control never gets here */
4221
4222 /* Match a single character type repeatedly; several different opcodes
4223 share code. This is very similar to the code for single characters, but we
4224 repeat it in the interests of efficiency. */
4225
4226 case OP_TYPEEXACT:
4227 min = max = (ecode[1] << 8) + ecode[2];
4228 minimize = TRUE;
4229 ecode += 3;
4230 goto REPEATTYPE;
4231
4232 case OP_TYPEUPTO:
4233 case OP_TYPEMINUPTO:
4234 min = 0;
4235 max = (ecode[1] << 8) + ecode[2];
4236 minimize = *ecode == OP_TYPEMINUPTO;
4237 ecode += 3;
4238 goto REPEATTYPE;
4239
4240 case OP_TYPESTAR:
4241 case OP_TYPEMINSTAR:
4242 case OP_TYPEPLUS:
4243 case OP_TYPEMINPLUS:
4244 case OP_TYPEQUERY:
4245 case OP_TYPEMINQUERY:
4246 c = *ecode++ - OP_TYPESTAR;
4247 minimize = (c & 1) != 0;
4248 min = rep_min[c]; /* Pick up values from tables; */
4249 max = rep_max[c]; /* zero for max => infinity */
4250 if (max == 0) max = INT_MAX;
4251
4252 /* Common code for all repeated single character type matches */
4253
4254 REPEATTYPE:
4255 ctype = *ecode++; /* Code for the character type */
4256
4257 /* First, ensure the minimum number of matches are present. Use inline
4258 code for maximizing the speed, and do the type test once at the start
4259 (i.e. keep it out of the loop). Also test that there are at least the
4260 minimum number of characters before we start. */
4261
4262 if (min > md->end_subject - eptr) return FALSE;
4263 if (min > 0) switch(ctype)
4264 {
4265 case OP_ANY:
4266 if ((ims & PCRE_DOTALL) == 0)
4267 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }
4268 else eptr += min;
4269 break;
4270
4271 case OP_NOT_DIGIT:
4272 for (i = 1; i <= min; i++)
4273 if ((md->ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
4274 break;
4275
4276 case OP_DIGIT:
4277 for (i = 1; i <= min; i++)
4278 if ((md->ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
4279 break;
4280
4281 case OP_NOT_WHITESPACE:
4282 for (i = 1; i <= min; i++)
4283 if ((md->ctypes[*eptr++] & ctype_space) != 0) return FALSE;
4284 break;
4285
4286 case OP_WHITESPACE:
4287 for (i = 1; i <= min; i++)
4288 if ((md->ctypes[*eptr++] & ctype_space) == 0) return FALSE;
4289 break;
4290
4291 case OP_NOT_WORDCHAR:
4292 for (i = 1; i <= min; i++)
4293 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4294 return FALSE;
4295 break;
4296
4297 case OP_WORDCHAR:
4298 for (i = 1; i <= min; i++)
4299 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4300 return FALSE;
4301 break;
4302 }
4303
4304 /* If min = max, continue at the same level without recursing */
4305
4306 if (min == max) continue;
4307
4308 /* If minimizing, we have to test the rest of the pattern before each
4309 subsequent match. */
4310
4311 if (minimize)
4312 {
4313 for (i = min;; i++)
4314 {
4315 if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb)) return TRUE;
4316 if (i >= max || eptr >= md->end_subject) return FALSE;
4317
4318 c = *eptr++;
4319 switch(ctype)
4320 {
4321 case OP_ANY:
4322 if ((ims & PCRE_DOTALL) == 0 && c == '\n') return FALSE;
4323 break;
4324
4325 case OP_NOT_DIGIT:
4326 if ((md->ctypes[c] & ctype_digit) != 0) return FALSE;
4327 break;
4328
4329 case OP_DIGIT:
4330 if ((md->ctypes[c] & ctype_digit) == 0) return FALSE;
4331 break;
4332
4333 case OP_NOT_WHITESPACE:
4334 if ((md->ctypes[c] & ctype_space) != 0) return FALSE;
4335 break;
4336
4337 case OP_WHITESPACE:
4338 if ((md->ctypes[c] & ctype_space) == 0) return FALSE;
4339 break;
4340
4341 case OP_NOT_WORDCHAR:
4342 if ((md->ctypes[c] & ctype_word) != 0) return FALSE;
4343 break;
4344
4345 case OP_WORDCHAR:
4346 if ((md->ctypes[c] & ctype_word) == 0) return FALSE;
4347 break;
4348 }
4349 }
4350 /* Control never gets here */
4351 }
4352
4353 /* If maximizing it is worth using inline code for speed, doing the type
4354 test once at the start (i.e. keep it out of the loop). */
4355
4356 else
4357 {
4358 const uschar *pp = eptr;
4359 switch(ctype)
4360 {
4361 case OP_ANY:
4362 if ((ims & PCRE_DOTALL) == 0)
4363 {
4364 for (i = min; i < max; i++)
4365 {
4366 if (eptr >= md->end_subject || *eptr == '\n') break;
4367 eptr++;
4368 }
4369 }
4370 else
4371 {
4372 c = max - min;
4373 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
4374 eptr += c;
4375 }
4376 break;
4377
4378 case OP_NOT_DIGIT:
4379 for (i = min; i < max; i++)
4380 {
4381 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4382 break;
4383 eptr++;
4384 }
4385 break;
4386
4387 case OP_DIGIT:
4388 for (i = min; i < max; i++)
4389 {
4390 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4391 break;
4392 eptr++;
4393 }
4394 break;
4395
4396 case OP_NOT_WHITESPACE:
4397 for (i = min; i < max; i++)
4398 {
4399 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4400 break;
4401 eptr++;
4402 }
4403 break;
4404
4405 case OP_WHITESPACE:
4406 for (i = min; i < max; i++)
4407 {
4408 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4409 break;
4410 eptr++;
4411 }
4412 break;
4413
4414 case OP_NOT_WORDCHAR:
4415 for (i = min; i < max; i++)
4416 {
4417 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4418 break;
4419 eptr++;
4420 }
4421 break;
4422
4423 case OP_WORDCHAR:
4424 for (i = min; i < max; i++)
4425 {
4426 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4427 break;
4428 eptr++;
4429 }
4430 break;
4431 }
4432
4433 while (eptr >= pp)
4434 if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
4435 return TRUE;
4436 return FALSE;
4437 }
4438 /* Control never gets here */
4439
4440 /* There's been some horrible disaster. */
4441
4442 default:
4443 DPRINTF(("Unknown opcode %d\n", *ecode));
4444 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
4445 return FALSE;
4446 }
4447
4448 /* Do not stick any code in here without much thought; it is assumed
4449 that "continue" in the code above comes out to here to repeat the main
4450 loop. */
4451
4452 } /* End of main loop */
4453 /* Control never reaches here */
4454 }
4455
4456
4457
4458
4459 /*************************************************
4460 * Execute a Regular Expression *
4461 *************************************************/
4462
4463 /* This function applies a compiled re to a subject string and picks out
4464 portions of the string if it matches. Two elements in the vector are set for
4465 each substring: the offsets to the start and end of the substring.
4466
4467 Arguments:
4468 external_re points to the compiled expression
4469 external_extra points to "hints" from pcre_study() or is NULL
4470 subject points to the subject string
4471 length length of subject string (may contain binary zeros)
4472 start_offset where to start in the subject string
4473 options option bits
4474 offsets points to a vector of ints to be filled in with offsets
4475 offsetcount the number of elements in the vector
4476
4477 Returns: > 0 => success; value is the number of elements filled in
4478 = 0 => success, but offsets is not big enough
4479 -1 => failed to match
4480 < -1 => some kind of unexpected problem
4481 */
4482
4483 int
4484 pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
4485 const char *subject, int length, int start_offset, int options, int *offsets,
4486 int offsetcount)
4487 {
4488 int resetcount, ocount;
4489 int first_char = -1;
4490 int req_char = -1;
4491 int req_char2 = -1;
4492 unsigned long int ims = 0;
4493 match_data match_block;
4494 const uschar *start_bits = NULL;
4495 const uschar *start_match = (const uschar *)subject + start_offset;
4496 const uschar *end_subject;
4497 const uschar *req_char_ptr = start_match - 1;
4498 const real_pcre *re = (const real_pcre *)external_re;
4499 const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
4500 BOOL using_temporary_offsets = FALSE;
4501 BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4502 BOOL startline = (re->options & PCRE_STARTLINE) != 0;
4503
4504 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4505
4506 if (re == NULL || subject == NULL ||
4507 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4508 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4509
4510 match_block.start_pattern = re->code;
4511 match_block.start_subject = (const uschar *)subject;
4512 match_block.end_subject = match_block.start_subject + length;
4513 end_subject = match_block.end_subject;
4514
4515 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4516
4517 match_block.notbol = (options & PCRE_NOTBOL) != 0;
4518 match_block.noteol = (options & PCRE_NOTEOL) != 0;
4519 match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
4520
4521 match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
4522
4523 match_block.lcc = re->tables + lcc_offset;
4524 match_block.ctypes = re->tables + ctypes_offset;
4525
4526 /* The ims options can vary during the matching as a result of the presence
4527 of (?ims) items in the pattern. They are kept in a local variable so that
4528 restoring at the exit of a group is easy. */
4529
4530 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4531
4532 /* If the expression has got more back references than the offsets supplied can
4533 hold, we get a temporary bit of working store to use during the matching.
4534 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4535 of 3. */
4536
4537 ocount = offsetcount - (offsetcount % 3);
4538
4539 if (re->top_backref > 0 && re->top_backref >= ocount/3)
4540 {
4541 ocount = re->top_backref * 3 + 3;
4542 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4543 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4544 using_temporary_offsets = TRUE;
4545 DPRINTF(("Got memory to hold back references\n"));
4546 }
4547 else match_block.offset_vector = offsets;
4548
4549 match_block.offset_end = ocount;
4550 match_block.offset_max = (2*ocount)/3;
4551 match_block.offset_overflow = FALSE;
4552
4553 /* Compute the minimum number of offsets that we need to reset each time. Doing
4554 this makes a huge difference to execution time when there aren't many brackets
4555 in the pattern. */
4556
4557 resetcount = 2 + re->top_bracket * 2;
4558 if (resetcount > offsetcount) resetcount = ocount;
4559
4560 /* Reset the working variable associated with each extraction. These should
4561 never be used unless previously set, but they get saved and restored, and so we
4562 initialize them to avoid reading uninitialized locations. */
4563
4564 if (match_block.offset_vector != NULL)
4565 {
4566 register int *iptr = match_block.offset_vector + ocount;
4567 register int *iend = iptr - resetcount/2 + 1;
4568 while (--iptr >= iend) *iptr = -1;
4569 }
4570
4571 /* Set up the first character to match, if available. The first_char value is
4572 never set for an anchored regular expression, but the anchoring may be forced
4573 at run time, so we have to test for anchoring. The first char may be unset for
4574 an unanchored pattern, of course. If there's no first char and the pattern was
4575 studied, there may be a bitmap of possible first characters. */
4576
4577 if (!anchored)
4578 {
4579 if ((re->options & PCRE_FIRSTSET) != 0)
4580 {
4581 first_char = re->first_char;
4582 if ((ims & PCRE_CASELESS) != 0) first_char = match_block.lcc[first_char];
4583 }
4584 else
4585 if (!startline && extra != NULL &&
4586 (extra->options & PCRE_STUDY_MAPPED) != 0)
4587 start_bits = extra->start_bits;
4588 }
4589
4590 /* For anchored or unanchored matches, there may be a "last known required
4591 character" set. If the PCRE_CASELESS is set, implying that the match starts
4592 caselessly, or if there are any changes of this flag within the regex, set up
4593 both cases of the character. Otherwise set the two values the same, which will
4594 avoid duplicate testing (which takes significant time). This covers the vast
4595 majority of cases. It will be suboptimal when the case flag changes in a regex
4596 and the required character in fact is caseful. */
4597
4598 if ((re->options & PCRE_REQCHSET) != 0)
4599 {
4600 req_char = re->req_char;
4601 req_char2 = ((re->options & (PCRE_CASELESS | PCRE_ICHANGED)) != 0)?
4602 (re->tables + fcc_offset)[req_char] : req_char;
4603 }
4604
4605 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
4606 the loop runs just once. */
4607
4608 do
4609 {
4610 int rc;
4611 register int *iptr = match_block.offset_vector;
4612 register int *iend = iptr + resetcount;
4613
4614 /* Reset the maximum number of extractions we might see. */
4615
4616 while (iptr < iend) *iptr++ = -1;
4617
4618 /* Advance to a unique first char if possible */
4619
4620 if (first_char >= 0)
4621 {
4622 if ((ims & PCRE_CASELESS) != 0)
4623 while (start_match < end_subject &&
4624 match_block.lcc[*start_match] != first_char)
4625 start_match++;
4626 else
4627 while (start_match < end_subject && *start_match != first_char)
4628 start_match++;
4629 }
4630
4631 /* Or to just after \n for a multiline match if possible */
4632
4633 else if (startline)
4634 {
4635 if (start_match > match_block.start_subject + start_offset)
4636 {
4637 while (start_match < end_subject && start_match[-1] != '\n')
4638 start_match++;
4639 }
4640 }
4641
4642 /* Or to a non-unique first char after study */
4643
4644 else if (start_bits != NULL)
4645 {
4646 while (start_match < end_subject)
4647 {
4648 register int c = *start_match;
4649 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
4650 }
4651 }
4652
4653 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4654 printf(">>>> Match against: ");
4655 pchars(start_match, end_subject - start_match, TRUE, &match_block);
4656 printf("\n");
4657 #endif
4658
4659 /* If req_char is set, we know that that character must appear in the subject
4660 for the match to succeed. If the first character is set, req_char must be
4661 later in the subject; otherwise the test starts at the match point. This
4662 optimization can save a huge amount of backtracking in patterns with nested
4663 unlimited repeats that aren't going to match. We don't know what the state of
4664 case matching may be when this character is hit, so test for it in both its
4665 cases if necessary. However, the different cased versions will not be set up
4666 unless PCRE_CASELESS was given or the casing state changes within the regex.
4667 Writing separate code makes it go faster, as does using an autoincrement and
4668 backing off on a match. */
4669
4670 if (req_char >= 0)
4671 {
4672 register const uschar *p = start_match + ((first_char >= 0)? 1 : 0);
4673
4674 /* We don't need to repeat the search if we haven't yet reached the
4675 place we found it at last time. */
4676
4677 if (p > req_char_ptr)
4678 {
4679 /* Do a single test if no case difference is set up */
4680
4681 if (req_char == req_char2)
4682 {
4683 while (p < end_subject)
4684 {
4685 if (*p++ == req_char) { p--; break; }
4686 }
4687 }
4688
4689 /* Otherwise test for either case */
4690
4691 else
4692 {
4693 while (p < end_subject)
4694 {
4695 register int pp = *p++;
4696 if (pp == req_char || pp == req_char2) { p--; break; }
4697 }
4698 }
4699
4700 /* If we can't find the required character, break the matching loop */
4701
4702 if (p >= end_subject) break;
4703
4704 /* If we have found the required character, save the point where we
4705 found it, so that we don't search again next time round the loop if
4706 the start hasn't passed this character yet. */
4707
4708 req_char_ptr = p;
4709 }
4710 }
4711
4712 /* When a match occurs, substrings will be set for all internal extractions;
4713 we just need to set up the whole thing as substring 0 before returning. If
4714 there were too many extractions, set the return code to zero. In the case
4715 where we had to get some local store to hold offsets for backreferences, copy
4716 those back references that we can. In this case there need not be overflow
4717 if certain parts of the pattern were not used. */
4718
4719 match_block.start_match = start_match;
4720 if (!match(start_match, re->code, 2, &match_block, ims, FALSE, start_match))
4721 continue;
4722
4723 /* Copy the offset information from temporary store if necessary */
4724
4725 if (using_temporary_offsets)
4726 {
4727 if (offsetcount >= 4)
4728 {
4729 memcpy(offsets + 2, match_block.offset_vector + 2,
4730 (offsetcount - 2) * sizeof(int));
4731 DPRINTF(("Copied offsets from temporary memory\n"));
4732 }
4733 if (match_block.end_offset_top > offsetcount)
4734 match_block.offset_overflow = TRUE;
4735
4736 DPRINTF(("Freeing temporary memory\n"));
4737 (pcre_free)(match_block.offset_vector);
4738 }
4739
4740 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
4741
4742 if (match_block.offset_end < 2) rc = 0; else
4743 {
4744 offsets[0] = start_match - match_block.start_subject;
4745 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
4746 }
4747
4748 DPRINTF((">>>> returning %d\n", rc));
4749 return rc;
4750 }
4751
4752 /* This "while" is the end of the "do" above */
4753
4754 while (!anchored &&
4755 match_block.errorcode == PCRE_ERROR_NOMATCH &&
4756 start_match++ < end_subject);
4757
4758 if (using_temporary_offsets)
4759 {
4760 DPRINTF(("Freeing temporary memory\n"));
4761 (pcre_free)(match_block.offset_vector);
4762 }
4763
4764 DPRINTF((">>>> returning %d\n", match_block.errorcode));
4765
4766 return match_block.errorcode;
4767 }
4768
4769 /* End of pcre.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12