/[pcre]/code/trunk/pcre.c
ViewVC logotype

Contents of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 59 - (hide annotations) (download)
Sat Feb 24 21:39:54 2007 UTC (7 years, 8 months ago) by nigel
File MIME type: text/plain
File size: 153180 byte(s)
Load pcre-3.8 into code/trunk.

1 nigel 3 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /*
6     This is a library of functions to support regular expressions whose syntax
7     and semantics are as close as possible to those of the Perl 5 language. See
8     the file Tech.Notes for some information on the internals.
9    
10     Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12 nigel 53 Copyright (c) 1997-2001 University of Cambridge
13 nigel 3
14     -----------------------------------------------------------------------------
15     Permission is granted to anyone to use this software for any purpose on any
16     computer system, and to redistribute it freely, subject to the following
17     restrictions:
18    
19     1. This software is distributed in the hope that it will be useful,
20     but WITHOUT ANY WARRANTY; without even the implied warranty of
21     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22    
23     2. The origin of this software must not be misrepresented, either by
24     explicit claim or by omission.
25    
26     3. Altered versions must be plainly marked as such, and must not be
27     misrepresented as being the original software.
28 nigel 29
29     4. If PCRE is embedded in any software that is released under the GNU
30     General Purpose Licence (GPL), then the terms of that licence shall
31     supersede any condition above with which it is incompatible.
32 nigel 3 -----------------------------------------------------------------------------
33     */
34    
35    
36     /* Define DEBUG to get debugging output on stdout. */
37    
38     /* #define DEBUG */
39    
40 nigel 23 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
41     inline, and there are *still* stupid compilers about that don't like indented
42     pre-processor statements. I suppose it's only been 10 years... */
43 nigel 3
#if defined(DEBUG)
#define DPRINTF(p) printf p     /* p is a complete, parenthesized argument list */
#else
#define DPRINTF(p)              /* debugging off: the call disappears */
#endif
49    
50 nigel 3 /* Include the internals header, which itself includes Standard C headers plus
51     the external pcre header. */
52    
53     #include "internal.h"
54    
55    
56 nigel 15 /* Allow compilation as C++ source code, should anybody want to do that. */
57    
#if defined(__cplusplus)
#define class pcre_class    /* rename the C++ keyword to an ordinary identifier */
#endif
61    
62    
63 nigel 53 /* Maximum number of items on the nested bracket stacks at compile time. This
64     applies to the nesting of all kinds of parentheses. It does not limit
65     un-nested, non-capturing parentheses. This number can be made bigger if
66     necessary - it is used to dimension one int and one unsigned char vector at
67     compile time. */
68 nigel 23
#define BRASTACK_SIZE 200


/* Byte length of a literal character string above which nothing more can be
added to it. The limit is lower when UTF-8 characters may be encountered. */

#if defined(SUPPORT_UTF8)
#define MAXLIT 250
#else
#define MAXLIT 255
#endif
80    
81    
82 nigel 3 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
83    
/* The two tables are indexed in parallel. NOTE(review): judging by the order
of the repeat names in OP_names the six slots are presumably *, *?, +, +?, ?
and ?? - confirm against the OP_ definitions. */

static const char rep_min[] = {
  0, 0,
  1, 1,
  0, 0
};

static const char rep_max[] = {
  0, 0,
  0, 0,
  1, 1
};
86 nigel 3
87 nigel 13 /* Text forms of OP_ values and things, for debugging (not all used) */
88 nigel 3
#if defined(DEBUG)
static const char *OP_names[] = {
  "End",
  "\\A", "\\B", "\\b", "\\D", "\\d", "\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",
  "Opt", "^", "$", "Any", "chars", "not",
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
  "*", "*?", "+", "+?", "?", "??", "{", "{",
  "class", "Ref", "Recurse",
  "Alt", "Ket", "KetRmax", "KetRmin",
  "Assert", "Assert not", "AssertB", "AssertB not",
  "Reverse", "Once", "Cond", "Cref",
  "Brazero", "Braminzero", "Branumber", "Bra"
};
#endif
104    
105     /* Table for handling escaped characters in the range '0'-'z'. Positive returns
106     are simple data values; negative values are for special things like \d and so
107     on. Zero means further processing is needed (for things like \x), or the escape
108     is invalid. */
109    
110 nigel 15 static const short int escapes[] = {
111 nigel 3 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
112     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
113     '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
114     0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
115     0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
116     0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
117 nigel 53 '`', 7, -ESC_b, 0, -ESC_d, ESC_E, ESC_F, 0, /* ` - g */
118     0, 0, 0, 0, 0, 0, ESC_N, 0, /* h - o */
119     0, 0, ESC_R, -ESC_s, ESC_T, 0, 0, -ESC_w, /* p - w */
120 nigel 23 0, 0, -ESC_z /* x - z */
121 nigel 3 };
122    
/* Tables of names of POSIX character classes and their lengths. The list is
terminated by a zero length entry. The first three must be alpha, lower, upper,
as this is assumed for handling case independence. */
126    
127     static const char *posix_names[] = {
128     "alpha", "lower", "upper",
129     "alnum", "ascii", "cntrl", "digit", "graph",
130     "print", "punct", "space", "word", "xdigit" };
131    
132     static const uschar posix_name_lengths[] = {
133     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
134    
135     /* Table of class bit maps for each POSIX class; up to three may be combined
136     to form the class. */
137    
138     static const int posix_class_maps[] = {
139     cbit_lower, cbit_upper, -1, /* alpha */
140     cbit_lower, -1, -1, /* lower */
141     cbit_upper, -1, -1, /* upper */
142     cbit_digit, cbit_lower, cbit_upper, /* alnum */
143     cbit_print, cbit_cntrl, -1, /* ascii */
144     cbit_cntrl, -1, -1, /* cntrl */
145     cbit_digit, -1, -1, /* digit */
146     cbit_graph, -1, -1, /* graph */
147     cbit_print, -1, -1, /* print */
148     cbit_punct, -1, -1, /* punct */
149     cbit_space, -1, -1, /* space */
150     cbit_word, -1, -1, /* word */
151     cbit_xdigit,-1, -1 /* xdigit */
152     };
153    
154    
155 nigel 3 /* Definition to allow mutual recursion */
156    
157 nigel 13 static BOOL
158 nigel 23 compile_regex(int, int, int *, uschar **, const uschar **, const char **,
159 nigel 37 BOOL, int, int *, int *, compile_data *);
160 nigel 3
161 nigel 47 /* Structure for building a chain of data that actually lives on the
162     stack, for holding the values of the subject pointer at the start of each
163     subpattern, so as to detect when an empty string has been matched by a
164     subpattern - to break infinite loops. */
165 nigel 3
166 nigel 47 typedef struct eptrblock {
167     struct eptrblock *prev;
168     const uschar *saved_eptr;
169     } eptrblock;
170 nigel 3
171 nigel 47 /* Flag bits for the match() function */
172    
173     #define match_condassert 0x01 /* Called to check a condition assertion */
174     #define match_isgroup 0x02 /* Set if start of bracketed group */
175    
176    
177    
178 nigel 3 /*************************************************
179     * Global variables *
180     *************************************************/
181    
/* PCRE is thread-clean and doesn't use any global variables in the normal
sense. However, it calls memory allocation and free functions via the two
indirections below, which can be changed by the caller, but are shared
between all threads. */
186    
void *(*pcre_malloc)(size_t) = malloc;    /* allocation hook; starts as malloc */
void  (*pcre_free)(void *)   = free;      /* release hook; starts as free */
189    
190    
191    
192 nigel 49 /*************************************************
193     * Macros and tables for character handling *
194     *************************************************/
195 nigel 3
196 nigel 49 /* When UTF-8 encoding is being used, a character is no longer just a single
197     byte. The macros for character handling generate simple sequences when used in
198     byte-mode, and more complicated ones for UTF-8 characters. */
199    
#ifdef SUPPORT_UTF8

/* UTF-8 capable versions. All of them expand to bare statements (not a single
expression), and the first two refer to a variable called md that must be in
scope wherever they are used. */

/* Get the next UTF-8 character, advancing the pointer */

#define GETCHARINC(c, eptr) \
  c = *eptr++; \
  if (md->utf8 && (c & 0xc0) == 0xc0) \
    { \
    int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int s = 6*a; \
    c = (c & utf8_table3[a]) << s; \
    for (; a > 0; a--) \
      { \
      s -= 6; \
      c |= (*eptr++ & 0x3f) << s; \
      } \
    }

/* Get the next UTF-8 character, not advancing the pointer, setting length */

#define GETCHARLEN(c, eptr, len) \
  c = *eptr; \
  len = 1; \
  if (md->utf8 && (c & 0xc0) == 0xc0) \
    { \
    int i; \
    int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int s = 6*a; \
    c = (c & utf8_table3[a]) << s; \
    len += a; \
    for (i = 1; i <= a; i++) \
      { \
      s -= 6; \
      c |= (eptr[i] & 0x3f) << s; \
      } \
    }

/* If the pointer is not at the start of a character, move it back until
it is. */

#define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;

#else   /* not SUPPORT_UTF8 */

/* Byte-mode versions: a character is exactly one byte, so there is no length
to set and never anything to back up over. */

#define GETCHARINC(c, eptr) c = *eptr++;
#define GETCHARLEN(c, eptr, len) c = *eptr;
#define BACKCHAR(eptr)

#endif
248    
249    
250    
251 nigel 3 /*************************************************
252 nigel 25 * Default character tables *
253     *************************************************/
254    
255     /* A default set of character tables is included in the PCRE binary. Its source
256     is built by the maketables auxiliary program, which uses the default C ctypes
257     functions, and put in the file chartables.c. These tables are used by PCRE
258     whenever the caller of pcre_compile() does not provide an alternate set of
259     tables. */
260    
261     #include "chartables.c"
262    
263    
264    
265 nigel 49 #ifdef SUPPORT_UTF8
266 nigel 25 /*************************************************
267 nigel 49 * Tables for UTF-8 support *
268     *************************************************/
269    
270     /* These are the breakpoints for different numbers of bytes in a UTF-8
271     character. */
272    
273     static int utf8_table1[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
274    
275     /* These are the indicator bits and the mask for the data bits to set in the
276     first byte of a character, indexed by the number of additional bytes. */
277    
278     static int utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
279     static int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
280    
281     /* Table of the number of extra characters, indexed by the first character
282     masked with 0x3f. The highest number for a valid UTF-8 character is in fact
283     0x3d. */
284    
285     static uschar utf8_table4[] = {
286     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
287     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
288     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
289     3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
290    
291    
292     /*************************************************
293     * Convert character value to UTF-8 *
294     *************************************************/
295    
296     /* This function takes an integer value in the range 0 - 0x7fffffff
297     and encodes it as a UTF-8 character in 0 to 6 bytes.
298    
299     Arguments:
300     cvalue the character value
301     buffer pointer to buffer for result - at least 6 bytes long
302    
303     Returns: number of characters placed in the buffer
304     */
305    
306     static int
307     ord2utf8(int cvalue, uschar *buffer)
308     {
309     register int i, j;
310     for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
311     if (cvalue <= utf8_table1[i]) break;
312 nigel 59 buffer += i;
313     for (j = i; j > 0; j--)
314     {
315     *buffer-- = 0x80 | (cvalue & 0x3f);
316     cvalue >>= 6;
317     }
318     *buffer = utf8_table2[i] | cvalue;
319 nigel 49 return i + 1;
320     }
321     #endif
322    
323    
324    
325     /*************************************************
326 nigel 3 * Return version string *
327     *************************************************/
328    
/* Two-level stringification, so that the version macros are expanded before
they are turned into string literals. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* Returns a pointer to a constant string of the form "major.minor date". */

const char *
pcre_version(void)
{
  static const char version[] =
    XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
  return version;
}
337    
338    
339    
340    
341     /*************************************************
342 nigel 43 * (Obsolete) Return info about compiled pattern *
343 nigel 3 *************************************************/
344    
345 nigel 43 /* This is the original "info" function. It picks potentially useful data out
346     of the private structure, but its interface was too rigid. It remains for
347     backwards compatibility. The public options are passed back in an int - though
348     the re->options field has been expanded to a long int, all the public options
349 nigel 37 at the low end of it, and so even on 16-bit systems this will still be OK.
350     Therefore, I haven't changed the API for pcre_info().
351 nigel 3
352     Arguments:
353     external_re points to compiled code
354     optptr where to pass back the options
355     first_char where to pass back the first character,
356     or -1 if multiline and all branches start ^,
357     or -2 otherwise
358    
359 nigel 43 Returns: number of capturing subpatterns
360 nigel 3 or negative values on error
361     */
362    
363     int
364     pcre_info(const pcre *external_re, int *optptr, int *first_char)
365     {
366 nigel 7 const real_pcre *re = (const real_pcre *)external_re;
367 nigel 3 if (re == NULL) return PCRE_ERROR_NULL;
368     if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
369 nigel 37 if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
370 nigel 3 if (first_char != NULL)
371     *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
372     ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
373     return re->top_bracket;
374     }
375    
376    
377    
378 nigel 43 /*************************************************
379     * Return info about compiled pattern *
380     *************************************************/
381 nigel 3
382 nigel 43 /* This is a newer "info" function which has an extensible interface so
383     that additional items can be added compatibly.
384    
385     Arguments:
386     external_re points to compiled code
387     external_study points to study data, or NULL
388     what what information is required
389     where where to put the information
390    
391     Returns: 0 if data returned, negative on error
392     */
393    
394     int
395     pcre_fullinfo(const pcre *external_re, const pcre_extra *study_data, int what,
396     void *where)
397     {
398     const real_pcre *re = (const real_pcre *)external_re;
399     const real_pcre_extra *study = (const real_pcre_extra *)study_data;
400    
401     if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
402     if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
403    
404     switch (what)
405     {
406     case PCRE_INFO_OPTIONS:
407     *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
408     break;
409    
410     case PCRE_INFO_SIZE:
411     *((size_t *)where) = re->size;
412     break;
413    
414     case PCRE_INFO_CAPTURECOUNT:
415     *((int *)where) = re->top_bracket;
416     break;
417    
418     case PCRE_INFO_BACKREFMAX:
419     *((int *)where) = re->top_backref;
420     break;
421    
422     case PCRE_INFO_FIRSTCHAR:
423     *((int *)where) =
424     ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
425     ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
426     break;
427    
428     case PCRE_INFO_FIRSTTABLE:
429     *((const uschar **)where) =
430     (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
431     study->start_bits : NULL;
432     break;
433    
434     case PCRE_INFO_LASTLITERAL:
435     *((int *)where) =
436     ((re->options & PCRE_REQCHSET) != 0)? re->req_char : -1;
437     break;
438    
439     default: return PCRE_ERROR_BADOPTION;
440     }
441    
442     return 0;
443     }
444    
445    
446    
447 nigel 3 #ifdef DEBUG
448     /*************************************************
449     * Debugging function to print chars *
450     *************************************************/
451    
452     /* Print a sequence of chars in printable format, stopping at the end of the
453     subject if the requested.
454    
455     Arguments:
456     p points to characters
457     length number to print
458     is_subject TRUE if printing from within md->start_subject
459     md pointer to matching data block, if is_subject is TRUE
460    
461     Returns: nothing
462     */
463    
464 nigel 9 static void
465     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
466 nigel 3 {
467     int c;
468     if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
469     while (length-- > 0)
470     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
471     }
472     #endif
473    
474    
475    
476    
477     /*************************************************
478     * Handle escapes *
479     *************************************************/
480    
481     /* This function is called when a \ has been encountered. It either returns a
482     positive value for a simple escape such as \n, or a negative value which
483 nigel 49 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
484     a positive value greater than 255 may be returned. On entry, ptr is pointing at
485     the \. On exit, it is on the final character of the escape sequence.
486 nigel 3
487     Arguments:
488     ptrptr points to the pattern position pointer
489     errorptr points to the pointer to the error message
490     bracount number of previous extracting brackets
491     options the options bits
492     isclass TRUE if inside a character class
493 nigel 25 cd pointer to char tables block
494 nigel 3
495     Returns: zero or positive => a data character
496     negative => a special escape sequence
497     on error, errorptr is set
498     */
499    
500     static int
501 nigel 7 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
502 nigel 25 int options, BOOL isclass, compile_data *cd)
503 nigel 3 {
504 nigel 7 const uschar *ptr = *ptrptr;
505 nigel 43 int c, i;
506 nigel 3
507 nigel 49 /* If backslash is at the end of the pattern, it's an error. */
508    
509     c = *(++ptr);
510 nigel 3 if (c == 0) *errorptr = ERR1;
511    
512     /* Digits or letters may have special meaning; all others are literals. */
513    
514     else if (c < '0' || c > 'z') {}
515    
516     /* Do an initial lookup in a table. A non-zero result is something that can be
517     returned immediately. Otherwise further processing may be required. */
518    
519     else if ((i = escapes[c - '0']) != 0) c = i;
520    
521     /* Escapes that need further processing, or are illegal. */
522    
523     else
524     {
525 nigel 7 const uschar *oldptr;
526 nigel 3 switch (c)
527     {
528     /* The handling of escape sequences consisting of a string of digits
529     starting with one that is not zero is not straightforward. By experiment,
530     the way Perl works seems to be as follows:
531    
532     Outside a character class, the digits are read as a decimal number. If the
533     number is less than 10, or if there are that many previous extracting
534     left brackets, then it is a back reference. Otherwise, up to three octal
535     digits are read to form an escaped byte. Thus \123 is likely to be octal
536     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
537     value is greater than 377, the least significant 8 bits are taken. Inside a
538     character class, \ followed by a digit is always an octal number. */
539    
540     case '1': case '2': case '3': case '4': case '5':
541     case '6': case '7': case '8': case '9':
542    
543     if (!isclass)
544     {
545     oldptr = ptr;
546     c -= '0';
547 nigel 25 while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
548 nigel 3 c = c * 10 + *(++ptr) - '0';
549     if (c < 10 || c <= bracount)
550     {
551     c = -(ESC_REF + c);
552     break;
553     }
554     ptr = oldptr; /* Put the pointer back and fall through */
555     }
556    
557     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
558     generates a binary zero byte and treats the digit as a following literal.
559     Thus we have to pull back the pointer by one. */
560    
561     if ((c = *ptr) >= '8')
562     {
563     ptr--;
564     c = 0;
565     break;
566     }
567    
568     /* \0 always starts an octal number, but we may drop through to here with a
569 nigel 49 larger first octal digit. */
570 nigel 3
571     case '0':
572     c -= '0';
573 nigel 25 while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
574 nigel 3 ptr[1] != '8' && ptr[1] != '9')
575     c = c * 8 + *(++ptr) - '0';
576 nigel 49 c &= 255; /* Take least significant 8 bits */
577 nigel 3 break;
578    
579 nigel 49 /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
580     which can be greater than 0xff, but only if the ddd are hex digits. */
581 nigel 3
582     case 'x':
583 nigel 49 #ifdef SUPPORT_UTF8
584     if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
585     {
586     const uschar *pt = ptr + 2;
587     register int count = 0;
588     c = 0;
589     while ((cd->ctypes[*pt] & ctype_xdigit) != 0)
590     {
591     count++;
592     c = c * 16 + cd->lcc[*pt] -
593     (((cd->ctypes[*pt] & ctype_digit) != 0)? '0' : 'W');
594     pt++;
595     }
596     if (*pt == '}')
597     {
598     if (c < 0 || count > 8) *errorptr = ERR34;
599     ptr = pt;
600     break;
601     }
602     /* If the sequence of hex digits does not end with '}', then we don't
603     recognize this construct; fall through to the normal \x handling. */
604     }
605     #endif
606    
607     /* Read just a single hex char */
608    
609 nigel 3 c = 0;
610 nigel 25 while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
611 nigel 3 {
612     ptr++;
613 nigel 25 c = c * 16 + cd->lcc[*ptr] -
614     (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
615 nigel 3 }
616     break;
617    
618 nigel 49 /* Other special escapes not starting with a digit are straightforward */
619    
620 nigel 3 case 'c':
621     c = *(++ptr);
622     if (c == 0)
623     {
624     *errorptr = ERR2;
625     return 0;
626     }
627    
628     /* A letter is upper-cased; then the 0x40 bit is flipped */
629    
630 nigel 25 if (c >= 'a' && c <= 'z') c = cd->fcc[c];
631 nigel 3 c ^= 0x40;
632     break;
633    
634     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
635     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
636 nigel 25 for Perl compatibility, it is a literal. This code looks a bit odd, but
637     there used to be some cases other than the default, and there may be again
638     in future, so I haven't "optimized" it. */
639 nigel 3
640     default:
641     if ((options & PCRE_EXTRA) != 0) switch(c)
642     {
643     default:
644     *errorptr = ERR3;
645     break;
646     }
647     break;
648     }
649     }
650    
651     *ptrptr = ptr;
652     return c;
653     }
654    
655    
656    
657     /*************************************************
658     * Check for counted repeat *
659     *************************************************/
660    
661     /* This function is called when a '{' is encountered in a place where it might
662     start a quantifier. It looks ahead to see if it really is a quantifier or not.
663     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
664     where the ddds are digits.
665    
666     Arguments:
667     p pointer to the first char after '{'
668 nigel 25 cd pointer to char tables block
669 nigel 3
670     Returns: TRUE or FALSE
671     */
672    
673     static BOOL
674 nigel 25 is_counted_repeat(const uschar *p, compile_data *cd)
675 nigel 3 {
676 nigel 25 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
677     while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
678 nigel 3 if (*p == '}') return TRUE;
679    
680     if (*p++ != ',') return FALSE;
681     if (*p == '}') return TRUE;
682    
683 nigel 25 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
684     while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
685 nigel 3 return (*p == '}');
686     }
687    
688    
689    
690     /*************************************************
691     * Read repeat counts *
692     *************************************************/
693    
694     /* Read an item of the form {n,m} and return the values. This is called only
695     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
696     so the syntax is guaranteed to be correct, but we need to check the values.
697    
698     Arguments:
699     p pointer to first char after '{'
700     minp pointer to int for min
701     maxp pointer to int for max
702     returned as -1 if no max
703     errorptr points to pointer to error message
704 nigel 25 cd pointer to character tables clock
705 nigel 3
706     Returns: pointer to '}' on success;
707     current ptr on error, with errorptr set
708     */
709    
710 nigel 7 static const uschar *
711 nigel 25 read_repeat_counts(const uschar *p, int *minp, int *maxp,
712     const char **errorptr, compile_data *cd)
713 nigel 3 {
714     int min = 0;
715     int max = -1;
716    
717 nigel 25 while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
718 nigel 3
719     if (*p == '}') max = min; else
720     {
721     if (*(++p) != '}')
722     {
723     max = 0;
724 nigel 25 while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
725 nigel 3 if (max < min)
726     {
727     *errorptr = ERR4;
728     return p;
729     }
730     }
731     }
732    
733     /* Do paranoid checks, then fill in the required variables, and pass back the
734     pointer to the terminating '}'. */
735    
736     if (min > 65535 || max > 65535)
737     *errorptr = ERR5;
738     else
739     {
740     *minp = min;
741     *maxp = max;
742     }
743     return p;
744     }
745    
746    
747    
748     /*************************************************
749 nigel 23 * Find the fixed length of a pattern *
750     *************************************************/
751    
752     /* Scan a pattern and compute the fixed length of subject that will match it,
753     if the length is fixed. This is needed for dealing with backward assertions.
754    
755     Arguments:
756     code points to the start of the pattern (the bracket)
757 nigel 49 options the compiling options
758 nigel 23
759     Returns: the fixed length, or -1 if there is no fixed length
760     */
761    
762     static int
763 nigel 49 find_fixedlength(uschar *code, int options)
764 nigel 23 {
765     int length = -1;
766    
767     register int branchlength = 0;
768     register uschar *cc = code + 3;
769    
770     /* Scan along the opcodes for this branch. If we get to the end of the
771     branch, check the length against that of the other branches. */
772    
773     for (;;)
774     {
775     int d;
776     register int op = *cc;
777     if (op >= OP_BRA) op = OP_BRA;
778    
779     switch (op)
780     {
781     case OP_BRA:
782     case OP_ONCE:
783     case OP_COND:
784 nigel 49 d = find_fixedlength(cc, options);
785 nigel 23 if (d < 0) return -1;
786     branchlength += d;
787     do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
788     cc += 3;
789     break;
790    
791     /* Reached end of a branch; if it's a ket it is the end of a nested
792     call. If it's ALT it is an alternation in a nested call. If it is
793     END it's the end of the outer call. All can be handled by the same code. */
794    
795     case OP_ALT:
796     case OP_KET:
797     case OP_KETRMAX:
798     case OP_KETRMIN:
799     case OP_END:
800     if (length < 0) length = branchlength;
801     else if (length != branchlength) return -1;
802     if (*cc != OP_ALT) return length;
803     cc += 3;
804     branchlength = 0;
805     break;
806    
807     /* Skip over assertive subpatterns */
808    
809     case OP_ASSERT:
810     case OP_ASSERT_NOT:
811     case OP_ASSERTBACK:
812     case OP_ASSERTBACK_NOT:
813     do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
814     cc += 3;
815     break;
816    
817     /* Skip over things that don't match chars */
818    
819     case OP_REVERSE:
820 nigel 53 case OP_BRANUMBER:
821     case OP_CREF:
822 nigel 23 cc++;
823 nigel 37 /* Fall through */
824 nigel 23
825     case OP_OPT:
826     cc++;
827     /* Fall through */
828    
829     case OP_SOD:
830     case OP_EOD:
831     case OP_EODN:
832     case OP_CIRC:
833     case OP_DOLL:
834     case OP_NOT_WORD_BOUNDARY:
835     case OP_WORD_BOUNDARY:
836     cc++;
837     break;
838    
839 nigel 49 /* Handle char strings. In UTF-8 mode we must count characters, not bytes.
840     This requires a scan of the string, unfortunately. We assume valid UTF-8
841     strings, so all we do is reduce the length by one for byte whose bits are
842     10xxxxxx. */
843 nigel 23
844     case OP_CHARS:
845     branchlength += *(++cc);
846 nigel 49 #ifdef SUPPORT_UTF8
847     for (d = 1; d <= *cc; d++)
848     if ((cc[d] & 0xc0) == 0x80) branchlength--;
849     #endif
850 nigel 23 cc += *cc + 1;
851     break;
852    
853     /* Handle exact repetitions */
854    
855     case OP_EXACT:
856     case OP_TYPEEXACT:
857     branchlength += (cc[1] << 8) + cc[2];
858     cc += 4;
859     break;
860    
861     /* Handle single-char matchers */
862    
863     case OP_NOT_DIGIT:
864     case OP_DIGIT:
865     case OP_NOT_WHITESPACE:
866     case OP_WHITESPACE:
867     case OP_NOT_WORDCHAR:
868     case OP_WORDCHAR:
869     case OP_ANY:
870     branchlength++;
871     cc++;
872     break;
873    
874    
875     /* Check a class for variable quantification */
876    
877     case OP_CLASS:
878 nigel 53 cc += 33;
879 nigel 23
880     switch (*cc)
881     {
882     case OP_CRSTAR:
883     case OP_CRMINSTAR:
884     case OP_CRQUERY:
885     case OP_CRMINQUERY:
886     return -1;
887    
888     case OP_CRRANGE:
889     case OP_CRMINRANGE:
890     if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;
891     branchlength += (cc[1] << 8) + cc[2];
892     cc += 5;
893     break;
894    
895     default:
896     branchlength++;
897     }
898     break;
899    
900     /* Anything else is variable length */
901    
902     default:
903     return -1;
904     }
905     }
906     /* Control never gets here */
907     }
908    
909    
910    
911    
912     /*************************************************
913 nigel 43 * Check for POSIX class syntax *
914     *************************************************/
915    
916     /* This function is called when the sequence "[:" or "[." or "[=" is
917     encountered in a character class. It checks whether this is followed by an
918     optional ^ and then a sequence of letters, terminated by a matching ":]" or
919     ".]" or "=]".
920    
921     Arguments:
922     ptr pointer to the initial [
923     endptr where to return the end pointer
924     cd pointer to compile data
925    
926     Returns: TRUE or FALSE
927     */
928    
929     static BOOL
930     check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
931     {
932     int terminator; /* Don't combine these lines; the Solaris cc */
933     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
934     if (*(++ptr) == '^') ptr++;
935     while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
936     if (*ptr == terminator && ptr[1] == ']')
937     {
938     *endptr = ptr;
939     return TRUE;
940     }
941     return FALSE;
942     }
943    
944    
945    
946    
947     /*************************************************
948     * Check POSIX class name *
949     *************************************************/
950    
951     /* This function is called to check the name given in a POSIX-style class entry
952     such as [:alnum:].
953    
954     Arguments:
955     ptr points to the first letter
956     len the length of the name
957    
958     Returns: a value representing the name, or -1 if unknown
959     */
960    
961     static int
962     check_posix_name(const uschar *ptr, int len)
963     {
964     register int yield = 0;
965     while (posix_name_lengths[yield] != 0)
966     {
967     if (len == posix_name_lengths[yield] &&
968     strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
969     yield++;
970     }
971     return -1;
972     }
973    
974    
975    
976    
977     /*************************************************
978 nigel 3 * Compile one branch *
979     *************************************************/
980    
981     /* Scan the pattern, compiling it into the code vector.
982    
983     Arguments:
984 nigel 25 options the option bits
985 nigel 53 brackets points to number of extracting brackets used
986 nigel 25 code points to the pointer to the current code point
987     ptrptr points to the current pattern pointer
988     errorptr points to pointer to error message
989     optchanged set to the value of the last OP_OPT item compiled
990 nigel 37 reqchar set to the last literal character required, else -1
991     countlits set to count of mandatory literal characters
992 nigel 25 cd contains pointers to tables
993 nigel 3
994 nigel 25 Returns: TRUE on success
995     FALSE, with *errorptr set on error
996 nigel 3 */
997    
998     static BOOL
999 nigel 7 compile_branch(int options, int *brackets, uschar **codeptr,
1000 nigel 25 const uschar **ptrptr, const char **errorptr, int *optchanged,
1001 nigel 37 int *reqchar, int *countlits, compile_data *cd)
1002 nigel 3 {
1003     int repeat_type, op_type;
1004     int repeat_min, repeat_max;
1005     int bravalue, length;
1006 nigel 19 int greedy_default, greedy_non_default;
1007 nigel 37 int prevreqchar;
1008     int condcount = 0;
1009     int subcountlits = 0;
1010 nigel 3 register int c;
1011     register uschar *code = *codeptr;
1012 nigel 23 uschar *tempcode;
1013 nigel 7 const uschar *ptr = *ptrptr;
1014 nigel 23 const uschar *tempptr;
1015 nigel 3 uschar *previous = NULL;
1016     uschar class[32];
1017    
1018 nigel 19 /* Set up the default and non-default settings for greediness */
1019    
1020     greedy_default = ((options & PCRE_UNGREEDY) != 0);
1021     greedy_non_default = greedy_default ^ 1;
1022    
1023 nigel 37 /* Initialize no required char, and count of literals */
1024    
1025     *reqchar = prevreqchar = -1;
1026     *countlits = 0;
1027    
1028 nigel 3 /* Switch on next character until the end of the branch */
1029    
1030     for (;; ptr++)
1031     {
1032     BOOL negate_class;
1033 nigel 23 int class_charcount;
1034     int class_lastchar;
1035     int newoptions;
1036 nigel 53 int skipbytes;
1037 nigel 37 int subreqchar;
1038 nigel 3
1039     c = *ptr;
1040     if ((options & PCRE_EXTENDED) != 0)
1041     {
1042 nigel 25 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1043 nigel 3 if (c == '#')
1044     {
1045 nigel 47 /* The space before the ; is to avoid a warning on a silly compiler
1046     on the Macintosh. */
1047 nigel 53 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
1048 nigel 3 continue;
1049     }
1050     }
1051    
1052     switch(c)
1053     {
1054     /* The branch terminates at end of string, |, or ). */
1055    
1056     case 0:
1057     case '|':
1058     case ')':
1059     *codeptr = code;
1060     *ptrptr = ptr;
1061     return TRUE;
1062    
1063     /* Handle single-character metacharacters */
1064    
1065     case '^':
1066     previous = NULL;
1067     *code++ = OP_CIRC;
1068     break;
1069    
1070     case '$':
1071     previous = NULL;
1072     *code++ = OP_DOLL;
1073     break;
1074    
1075     case '.':
1076     previous = code;
1077     *code++ = OP_ANY;
1078     break;
1079    
1080     /* Character classes. These always build a 32-byte bitmap of the permitted
1081     characters, except in the special case where there is only one character.
1082     For negated classes, we build the map as usual, then invert it at the end.
1083     */
1084    
1085     case '[':
1086     previous = code;
1087 nigel 23 *code++ = OP_CLASS;
1088 nigel 3
1089 nigel 23 /* If the first character is '^', set the negation flag and skip it. */
1090 nigel 3
1091     if ((c = *(++ptr)) == '^')
1092     {
1093     negate_class = TRUE;
1094     c = *(++ptr);
1095     }
1096 nigel 23 else negate_class = FALSE;
1097 nigel 3
1098     /* Keep a count of chars so that we can optimize the case of just a single
1099     character. */
1100    
1101     class_charcount = 0;
1102     class_lastchar = -1;
1103    
1104     /* Initialize the 32-char bit map to all zeros. We have to build the
1105     map in a temporary bit of store, in case the class contains only 1
1106     character, because in that case the compiled code doesn't use the
1107     bit map. */
1108    
1109     memset(class, 0, 32 * sizeof(uschar));
1110    
1111     /* Process characters until ] is reached. By writing this as a "do" it
1112     means that an initial ] is taken as a data character. */
1113    
1114     do
1115     {
1116     if (c == 0)
1117     {
1118     *errorptr = ERR6;
1119     goto FAILED;
1120     }
1121    
1122 nigel 43 /* Handle POSIX class names. Perl allows a negation extension of the
1123     form [:^name]. A square bracket that doesn't match the syntax is
1124     treated as a literal. We also recognize the POSIX constructions
1125     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1126     5.6 does. */
1127    
1128     if (c == '[' &&
1129     (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1130     check_posix_syntax(ptr, &tempptr, cd))
1131     {
1132     BOOL local_negate = FALSE;
1133     int posix_class, i;
1134     register const uschar *cbits = cd->cbits;
1135    
1136     if (ptr[1] != ':')
1137     {
1138     *errorptr = ERR31;
1139     goto FAILED;
1140     }
1141    
1142     ptr += 2;
1143     if (*ptr == '^')
1144     {
1145     local_negate = TRUE;
1146     ptr++;
1147     }
1148    
1149     posix_class = check_posix_name(ptr, tempptr - ptr);
1150     if (posix_class < 0)
1151     {
1152     *errorptr = ERR30;
1153     goto FAILED;
1154     }
1155    
1156     /* If matching is caseless, upper and lower are converted to
1157     alpha. This relies on the fact that the class table starts with
1158     alpha, lower, upper as the first 3 entries. */
1159    
1160     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1161     posix_class = 0;
1162    
1163     /* Or into the map we are building up to 3 of the static class
1164     tables, or their negations. */
1165    
1166     posix_class *= 3;
1167     for (i = 0; i < 3; i++)
1168     {
1169     int taboffset = posix_class_maps[posix_class + i];
1170     if (taboffset < 0) break;
1171     if (local_negate)
1172     for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
1173     else
1174     for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
1175     }
1176    
1177     ptr = tempptr + 1;
1178     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
1179     continue;
1180     }
1181    
1182 nigel 3 /* Backslash may introduce a single character, or it may introduce one
1183     of the specials, which just set a flag. Escaped items are checked for
1184     validity in the pre-compiling pass. The sequence \b is a special case.
1185 nigel 7 Inside a class (and only there) it is treated as backspace. Elsewhere
1186 nigel 3 it marks a word boundary. Other escapes have preset maps ready to
1187     or into the one we are building. We assume they have more than one
1188     character in them, so set class_charcount bigger than one. */
1189    
1190     if (c == '\\')
1191     {
1192 nigel 25 c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1193 nigel 3 if (-c == ESC_b) c = '\b';
1194     else if (c < 0)
1195     {
1196 nigel 25 register const uschar *cbits = cd->cbits;
1197 nigel 3 class_charcount = 10;
1198     switch (-c)
1199     {
1200     case ESC_d:
1201 nigel 25 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
1202 nigel 3 continue;
1203    
1204     case ESC_D:
1205 nigel 25 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
1206 nigel 3 continue;
1207    
1208     case ESC_w:
1209 nigel 43 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
1210 nigel 3 continue;
1211    
1212     case ESC_W:
1213 nigel 43 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
1214 nigel 3 continue;
1215    
1216     case ESC_s:
1217 nigel 25 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
1218 nigel 3 continue;
1219    
1220     case ESC_S:
1221 nigel 25 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
1222 nigel 3 continue;
1223    
1224     default:
1225     *errorptr = ERR7;
1226     goto FAILED;
1227     }
1228     }
1229 nigel 49
1230     /* Fall through if single character, but don't at present allow
1231     chars > 255 in UTF-8 mode. */
1232    
1233     #ifdef SUPPORT_UTF8
1234     if (c > 255)
1235     {
1236     *errorptr = ERR33;
1237     goto FAILED;
1238     }
1239     #endif
1240 nigel 3 }
1241    
1242     /* A single character may be followed by '-' to form a range. However,
1243     Perl does not permit ']' to be the end of the range. A '-' character
1244     here is treated as a literal. */
1245    
1246     if (ptr[1] == '-' && ptr[2] != ']')
1247     {
1248     int d;
1249     ptr += 2;
1250     d = *ptr;
1251    
1252     if (d == 0)
1253     {
1254     *errorptr = ERR6;
1255     goto FAILED;
1256     }
1257    
1258     /* The second part of a range can be a single-character escape, but
1259 nigel 49 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1260     in such circumstances. */
1261 nigel 3
1262     if (d == '\\')
1263     {
1264 nigel 49 const uschar *oldptr = ptr;
1265 nigel 25 d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1266 nigel 49
1267     #ifdef SUPPORT_UTF8
1268     if (d > 255)
1269     {
1270     *errorptr = ERR33;
1271     goto FAILED;
1272     }
1273     #endif
1274     /* \b is backspace; any other special means the '-' was literal */
1275    
1276 nigel 3 if (d < 0)
1277     {
1278     if (d == -ESC_b) d = '\b'; else
1279     {
1280 nigel 49 ptr = oldptr - 2;
1281     goto SINGLE_CHARACTER; /* A few lines below */
1282 nigel 3 }
1283     }
1284     }
1285    
1286     if (d < c)
1287     {
1288     *errorptr = ERR8;
1289     goto FAILED;
1290     }
1291    
1292     for (; c <= d; c++)
1293     {
1294     class[c/8] |= (1 << (c&7));
1295     if ((options & PCRE_CASELESS) != 0)
1296     {
1297 nigel 25 int uc = cd->fcc[c]; /* flip case */
1298 nigel 3 class[uc/8] |= (1 << (uc&7));
1299     }
1300     class_charcount++; /* in case a one-char range */
1301     class_lastchar = c;
1302     }
1303     continue; /* Go get the next char in the class */
1304     }
1305    
1306     /* Handle a lone single character - we can get here for a normal
1307     non-escape char, or after \ that introduces a single character. */
1308    
1309 nigel 49 SINGLE_CHARACTER:
1310    
1311 nigel 3 class [c/8] |= (1 << (c&7));
1312     if ((options & PCRE_CASELESS) != 0)
1313     {
1314 nigel 25 c = cd->fcc[c]; /* flip case */
1315 nigel 3 class[c/8] |= (1 << (c&7));
1316     }
1317     class_charcount++;
1318     class_lastchar = c;
1319     }
1320    
1321     /* Loop until ']' reached; the check for end of string happens inside the
1322     loop. This "while" is the end of the "do" above. */
1323    
1324     while ((c = *(++ptr)) != ']');
1325    
1326     /* If class_charcount is 1 and class_lastchar is not negative, we saw
1327     precisely one character. This doesn't need the whole 32-byte bit map.
1328     We turn it into a 1-character OP_CHARS if it's positive, or OP_NOT if
1329     it's negative. */
1330    
1331     if (class_charcount == 1 && class_lastchar >= 0)
1332     {
1333     if (negate_class)
1334     {
1335     code[-1] = OP_NOT;
1336     }
1337     else
1338     {
1339     code[-1] = OP_CHARS;
1340     *code++ = 1;
1341     }
1342     *code++ = class_lastchar;
1343     }
1344    
1345     /* Otherwise, negate the 32-byte map if necessary, and copy it into
1346     the code vector. */
1347    
1348     else
1349     {
1350     if (negate_class)
1351     for (c = 0; c < 32; c++) code[c] = ~class[c];
1352     else
1353     memcpy(code, class, 32);
1354     code += 32;
1355     }
1356     break;
1357    
1358     /* Various kinds of repeat */
1359    
1360     case '{':
1361 nigel 25 if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
1362     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
1363 nigel 3 if (*errorptr != NULL) goto FAILED;
1364     goto REPEAT;
1365    
1366     case '*':
1367     repeat_min = 0;
1368     repeat_max = -1;
1369     goto REPEAT;
1370    
1371     case '+':
1372     repeat_min = 1;
1373     repeat_max = -1;
1374     goto REPEAT;
1375    
1376     case '?':
1377     repeat_min = 0;
1378     repeat_max = 1;
1379    
1380     REPEAT:
1381     if (previous == NULL)
1382     {
1383     *errorptr = ERR9;
1384     goto FAILED;
1385     }
1386    
1387 nigel 19 /* If the next character is '?' this is a minimizing repeat, by default,
1388     but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
1389 nigel 3 next character. */
1390    
1391 nigel 19 if (ptr[1] == '?')
1392     { repeat_type = greedy_non_default; ptr++; }
1393     else repeat_type = greedy_default;
1394 nigel 3
1395     /* If previous was a string of characters, chop off the last one and use it
1396     as the subject of the repeat. If there was only one character, we can
1397 nigel 37 abolish the previous item altogether. A repeat with a zero minimum wipes
1398     out any reqchar setting, backing up to the previous value. We must also
1399     adjust the countlits value. */
1400 nigel 3
1401 nigel 37 if (*previous == OP_CHARS)
1402 nigel 3 {
1403     int len = previous[1];
1404 nigel 37
1405     if (repeat_min == 0) *reqchar = prevreqchar;
1406     *countlits += repeat_min - 1;
1407    
1408 nigel 3 if (len == 1)
1409     {
1410     c = previous[2];
1411     code = previous;
1412     }
1413     else
1414     {
1415     c = previous[len+1];
1416     previous[1]--;
1417     code--;
1418     }
1419     op_type = 0; /* Use single-char op codes */
1420     goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
1421     }
1422    
1423     /* If previous was a single negated character ([^a] or similar), we use
1424     one of the special opcodes, replacing it. The code is shared with single-
1425     character repeats by adding a suitable offset into repeat_type. */
1426    
1427     else if ((int)*previous == OP_NOT)
1428     {
1429     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
1430     c = previous[1];
1431     code = previous;
1432     goto OUTPUT_SINGLE_REPEAT;
1433     }
1434    
1435     /* If previous was a character type match (\d or similar), abolish it and
1436     create a suitable repeat item. The code is shared with single-character
1437     repeats by adding a suitable offset into repeat_type. */
1438    
1439 nigel 23 else if ((int)*previous < OP_EODN || *previous == OP_ANY)
1440 nigel 3 {
1441     op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
1442     c = *previous;
1443     code = previous;
1444    
1445     OUTPUT_SINGLE_REPEAT:
1446    
1447 nigel 37 /* If the maximum is zero then the minimum must also be zero; Perl allows
1448     this case, so we do too - by simply omitting the item altogether. */
1449    
1450     if (repeat_max == 0) goto END_REPEAT;
1451    
1452     /* Combine the op_type with the repeat_type */
1453    
1454     repeat_type += op_type;
1455    
1456 nigel 3 /* A minimum of zero is handled either as the special case * or ?, or as
1457     an UPTO, with the maximum given. */
1458    
1459     if (repeat_min == 0)
1460     {
1461     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1462     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1463     else
1464     {
1465     *code++ = OP_UPTO + repeat_type;
1466     *code++ = repeat_max >> 8;
1467     *code++ = (repeat_max & 255);
1468     }
1469     }
1470    
1471     /* The case {1,} is handled as the special case + */
1472    
1473     else if (repeat_min == 1 && repeat_max == -1)
1474     *code++ = OP_PLUS + repeat_type;
1475    
1476     /* The case {n,n} is just an EXACT, while the general case {n,m} is
1477     handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1478    
1479     else
1480     {
1481     if (repeat_min != 1)
1482     {
1483     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1484     *code++ = repeat_min >> 8;
1485     *code++ = (repeat_min & 255);
1486     }
1487    
1488     /* If the minimum is 1 and the previous item was a character string,
1489     we either have to put back the item that got cancelled if the string
1490     length was 1, or add the character back onto the end of a longer
1491 nigel 21 string. For a character type nothing need be done; it will just get
1492     put back naturally. Note that the final character is always going to
1493     get added below. */
1494 nigel 3
1495     else if (*previous == OP_CHARS)
1496     {
1497     if (code == previous) code += 2; else previous[1]++;
1498     }
1499    
1500 nigel 21 /* For a single negated character we also have to put back the
1501     item that got cancelled. */
1502    
1503     else if (*previous == OP_NOT) code++;
1504    
1505 nigel 9 /* If the maximum is unlimited, insert an OP_STAR. */
1506 nigel 3
1507 nigel 9 if (repeat_max < 0)
1508 nigel 3 {
1509     *code++ = c;
1510 nigel 9 *code++ = OP_STAR + repeat_type;
1511     }
1512    
1513     /* Else insert an UPTO if the max is greater than the min. */
1514    
1515     else if (repeat_max != repeat_min)
1516     {
1517     *code++ = c;
1518 nigel 3 repeat_max -= repeat_min;
1519     *code++ = OP_UPTO + repeat_type;
1520     *code++ = repeat_max >> 8;
1521     *code++ = (repeat_max & 255);
1522     }
1523     }
1524    
1525     /* The character or character type itself comes last in all cases. */
1526    
1527     *code++ = c;
1528     }
1529    
1530     /* If previous was a character class or a back reference, we put the repeat
1531 nigel 37 stuff after it, but just skip the item if the repeat was {0,0}. */
1532 nigel 3
1533 nigel 23 else if (*previous == OP_CLASS || *previous == OP_REF)
1534 nigel 3 {
1535 nigel 37 if (repeat_max == 0)
1536     {
1537     code = previous;
1538     goto END_REPEAT;
1539     }
1540 nigel 3 if (repeat_min == 0 && repeat_max == -1)
1541     *code++ = OP_CRSTAR + repeat_type;
1542     else if (repeat_min == 1 && repeat_max == -1)
1543     *code++ = OP_CRPLUS + repeat_type;
1544     else if (repeat_min == 0 && repeat_max == 1)
1545     *code++ = OP_CRQUERY + repeat_type;
1546     else
1547     {
1548     *code++ = OP_CRRANGE + repeat_type;
1549     *code++ = repeat_min >> 8;
1550     *code++ = repeat_min & 255;
1551     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1552     *code++ = repeat_max >> 8;
1553     *code++ = repeat_max & 255;
1554     }
1555     }
1556    
1557     /* If previous was a bracket group, we may have to replicate it in certain
1558 nigel 23 cases. */
1559 nigel 3
1560 nigel 23 else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
1561     (int)*previous == OP_COND)
1562 nigel 3 {
1563 nigel 31 register int i;
1564     int ketoffset = 0;
1565 nigel 9 int len = code - previous;
1566 nigel 31 uschar *bralink = NULL;
1567 nigel 3
1568 nigel 23 /* If the maximum repeat count is unlimited, find the end of the bracket
1569     by scanning through from the start, and compute the offset back to it
1570     from the current code pointer. There may be an OP_OPT setting following
1571     the final KET, so we can't find the end just by going back from the code
1572     pointer. */
1573    
1574     if (repeat_max == -1)
1575 nigel 3 {
1576 nigel 23 register uschar *ket = previous;
1577     do ket += (ket[1] << 8) + ket[2]; while (*ket != OP_KET);
1578     ketoffset = code - ket;
1579 nigel 3 }
1580    
1581 nigel 31 /* The case of a zero minimum is special because of the need to stick
1582     OP_BRAZERO in front of it, and because the group appears once in the
1583     data, whereas in other cases it appears the minimum number of times. For
1584     this reason, it is simplest to treat this case separately, as otherwise
1585 nigel 53 the code gets far too messy. There are several special subcases when the
1586 nigel 31 minimum is zero. */
1587    
1588     if (repeat_min == 0)
1589     {
1590 nigel 37 /* If we set up a required char from the bracket, we must back off
1591     to the previous value and reset the countlits value too. */
1592    
1593     if (subcountlits > 0)
1594     {
1595     *reqchar = prevreqchar;
1596     *countlits -= subcountlits;
1597     }
1598    
1599 nigel 31 /* If the maximum is also zero, we just omit the group from the output
1600     altogether. */
1601    
1602     if (repeat_max == 0)
1603     {
1604     code = previous;
1605 nigel 37 goto END_REPEAT;
1606 nigel 31 }
1607    
1608     /* If the maximum is 1 or unlimited, we just have to stick in the
1609     BRAZERO and do no more at this point. */
1610    
1611     if (repeat_max <= 1)
1612     {
1613     memmove(previous+1, previous, len);
1614     code++;
1615     *previous++ = OP_BRAZERO + repeat_type;
1616     }
1617    
1618     /* If the maximum is greater than 1 and limited, we have to replicate
1619     in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1620     The first one has to be handled carefully because it's the original
1621     copy, which has to be moved up. The remainder can be handled by code
1622     that is common with the non-zero minimum case below. We just have to
1623     adjust the value of repeat_max, since one less copy is required. */
1624    
1625     else
1626     {
1627     int offset;
1628     memmove(previous+4, previous, len);
1629     code += 4;
1630     *previous++ = OP_BRAZERO + repeat_type;
1631     *previous++ = OP_BRA;
1632    
1633     /* We chain together the bracket offset fields that have to be
1634     filled in later when the ends of the brackets are reached. */
1635    
1636     offset = (bralink == NULL)? 0 : previous - bralink;
1637     bralink = previous;
1638     *previous++ = offset >> 8;
1639     *previous++ = offset & 255;
1640     }
1641    
1642     repeat_max--;
1643     }
1644    
1645     /* If the minimum is greater than zero, replicate the group as many
1646     times as necessary, and adjust the maximum to the number of subsequent
1647     copies that we need. */
1648    
1649     else
1650     {
1651     for (i = 1; i < repeat_min; i++)
1652     {
1653     memcpy(code, previous, len);
1654     code += len;
1655     }
1656     if (repeat_max > 0) repeat_max -= repeat_min;
1657     }
1658    
1659     /* This code is common to both the zero and non-zero minimum cases. If
1660     the maximum is limited, it replicates the group in a nested fashion,
1661     remembering the bracket starts on a stack. In the case of a zero minimum,
1662     the first one was set up above. In all cases the repeat_max now specifies
1663     the number of additional copies needed. */
1664    
1665     if (repeat_max >= 0)
1666     {
1667     for (i = repeat_max - 1; i >= 0; i--)
1668     {
1669     *code++ = OP_BRAZERO + repeat_type;
1670    
1671     /* All but the final copy start a new nesting, maintaining the
1672     chain of brackets outstanding. */
1673    
1674     if (i != 0)
1675     {
1676     int offset;
1677     *code++ = OP_BRA;
1678     offset = (bralink == NULL)? 0 : code - bralink;
1679     bralink = code;
1680     *code++ = offset >> 8;
1681     *code++ = offset & 255;
1682     }
1683    
1684     memcpy(code, previous, len);
1685     code += len;
1686     }
1687    
1688     /* Now chain through the pending brackets, and fill in their length
1689     fields (which are holding the chain links pro tem). */
1690    
1691     while (bralink != NULL)
1692     {
1693     int oldlinkoffset;
1694     int offset = code - bralink + 1;
1695     uschar *bra = code - offset;
1696     oldlinkoffset = (bra[1] << 8) + bra[2];
1697     bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
1698     *code++ = OP_KET;
1699     *code++ = bra[1] = offset >> 8;
1700     *code++ = bra[2] = (offset & 255);
1701     }
1702     }
1703    
1704     /* If the maximum is unlimited, set a repeater in the final copy. We
1705     can't just offset backwards from the current code point, because we
1706     don't know if there's been an options resetting after the ket. The
1707     correct offset was computed above. */
1708    
1709     else code[-ketoffset] = OP_KETRMAX + repeat_type;
1710 nigel 3 }
1711    
1712     /* Else there's some kind of shambles */
1713    
1714     else
1715     {
1716     *errorptr = ERR11;
1717     goto FAILED;
1718     }
1719    
1720     /* In all cases we no longer have a previous item. */
1721    
1722 nigel 37 END_REPEAT:
1723 nigel 3 previous = NULL;
1724     break;
1725    
1726    
1727 nigel 23 /* Start of nested bracket sub-expression, or comment or lookahead or
1728     lookbehind or option setting or condition. First deal with special things
1729     that can come after a bracket; all are introduced by ?, and the appearance
1730     of any of them means that this is not a referencing group. They were
1731     checked for validity in the first pass over the string, so we don't have to
1732     check for syntax errors here. */
1733 nigel 3
1734     case '(':
1735 nigel 23 newoptions = options;
1736 nigel 53 skipbytes = 0;
1737 nigel 23
1738 nigel 3 if (*(++ptr) == '?')
1739     {
1740 nigel 23 int set, unset;
1741     int *optset;
1742 nigel 3
1743     switch (*(++ptr))
1744     {
1745 nigel 23 case '#': /* Comment; skip to ket */
1746 nigel 3 ptr++;
1747     while (*ptr != ')') ptr++;
1748     continue;
1749    
1750     case ':': /* Non-extracting bracket */
1751 nigel 23 bravalue = OP_BRA;
1752 nigel 3 ptr++;
1753     break;
1754    
1755 nigel 23 case '(':
1756     bravalue = OP_COND; /* Conditional group */
1757 nigel 25 if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
1758 nigel 23 {
1759 nigel 53 int condref = *ptr - '0';
1760 nigel 23 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
1761 nigel 51 if (condref == 0)
1762     {
1763     *errorptr = ERR35;
1764     goto FAILED;
1765     }
1766 nigel 23 ptr++;
1767 nigel 53 code[3] = OP_CREF;
1768     code[4] = condref >> 8;
1769     code[5] = condref & 255;
1770     skipbytes = 3;
1771 nigel 23 }
1772     else ptr--;
1773     break;
1774    
1775     case '=': /* Positive lookahead */
1776 nigel 3 bravalue = OP_ASSERT;
1777     ptr++;
1778     break;
1779    
1780 nigel 23 case '!': /* Negative lookahead */
1781 nigel 3 bravalue = OP_ASSERT_NOT;
1782     ptr++;
1783     break;
1784    
1785 nigel 23 case '<': /* Lookbehinds */
1786     switch (*(++ptr))
1787 nigel 3 {
1788 nigel 23 case '=': /* Positive lookbehind */
1789     bravalue = OP_ASSERTBACK;
1790 nigel 3 ptr++;
1791     break;
1792 nigel 23
1793     case '!': /* Negative lookbehind */
1794     bravalue = OP_ASSERTBACK_NOT;
1795     ptr++;
1796     break;
1797    
1798     default: /* Syntax error */
1799     *errorptr = ERR24;
1800     goto FAILED;
1801 nigel 3 }
1802 nigel 23 break;
1803 nigel 3
1804 nigel 23 case '>': /* One-time brackets */
1805     bravalue = OP_ONCE;
1806     ptr++;
1807     break;
1808    
1809 nigel 43 case 'R': /* Pattern recursion */
1810     *code++ = OP_RECURSE;
1811     ptr++;
1812     continue;
1813    
1814 nigel 23 default: /* Option setting */
1815     set = unset = 0;
1816     optset = &set;
1817    
1818     while (*ptr != ')' && *ptr != ':')
1819     {
1820     switch (*ptr++)
1821     {
1822     case '-': optset = &unset; break;
1823    
1824     case 'i': *optset |= PCRE_CASELESS; break;
1825     case 'm': *optset |= PCRE_MULTILINE; break;
1826     case 's': *optset |= PCRE_DOTALL; break;
1827     case 'x': *optset |= PCRE_EXTENDED; break;
1828     case 'U': *optset |= PCRE_UNGREEDY; break;
1829     case 'X': *optset |= PCRE_EXTRA; break;
1830    
1831     default:
1832     *errorptr = ERR12;
1833     goto FAILED;
1834     }
1835     }
1836    
1837     /* Set up the changed option bits, but don't change anything yet. */
1838    
1839     newoptions = (options | set) & (~unset);
1840    
1841     /* If the options ended with ')' this is not the start of a nested
1842     group with option changes, so the options change at this level. At top
1843     level there is nothing else to be done (the options will in fact have
1844     been set from the start of compiling as a result of the first pass) but
1845     at an inner level we must compile code to change the ims options if
1846     necessary, and pass the new setting back so that it can be put at the
1847     start of any following branches, and when this group ends, a resetting
1848     item can be compiled. */
1849    
1850     if (*ptr == ')')
1851     {
1852     if ((options & PCRE_INGROUP) != 0 &&
1853     (options & PCRE_IMS) != (newoptions & PCRE_IMS))
1854     {
1855     *code++ = OP_OPT;
1856     *code++ = *optchanged = newoptions & PCRE_IMS;
1857     }
1858     options = newoptions; /* Change options at this level */
1859     previous = NULL; /* This item can't be repeated */
1860     continue; /* It is complete */
1861     }
1862    
1863     /* If the options ended with ':' we are heading into a nested group
1864     with possible change of options. Such groups are non-capturing and are
1865     not assertions of any kind. All we need to do is skip over the ':';
1866     the newoptions value is handled below. */
1867    
1868     bravalue = OP_BRA;
1869     ptr++;
1870 nigel 3 }
1871     }
1872    
1873 nigel 53 /* Else we have a referencing group; adjust the opcode. If the bracket
1874     number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
1875     arrange for the true number to follow later, in an OP_BRANUMBER item. */
1876 nigel 3
1877     else
1878     {
1879 nigel 53 if (++(*brackets) > EXTRACT_BASIC_MAX)
1880 nigel 3 {
1881 nigel 53 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
1882     code[3] = OP_BRANUMBER;
1883     code[4] = *brackets >> 8;
1884     code[5] = *brackets & 255;
1885     skipbytes = 3;
1886 nigel 3 }
1887 nigel 53 else bravalue = OP_BRA + *brackets;
1888 nigel 3 }
1889    
1890 nigel 23 /* Process nested bracketed re. Assertions may not be repeated, but other
1891     kinds can be. We copy code into a non-register variable in order to be able
1892     to pass its address because some compilers complain otherwise. Pass in a
1893     new setting for the ims options if they have changed. */
1894 nigel 3
1895 nigel 23 previous = (bravalue >= OP_ONCE)? code : NULL;
1896 nigel 3 *code = bravalue;
1897 nigel 23 tempcode = code;
1898    
1899     if (!compile_regex(
1900     options | PCRE_INGROUP, /* Set for all nested groups */
1901     ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
1902     newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
1903 nigel 53 brackets, /* Extracting bracket count */
1904 nigel 23 &tempcode, /* Where to put code (updated) */
1905     &ptr, /* Input pointer (updated) */
1906     errorptr, /* Where to put an error message */
1907     (bravalue == OP_ASSERTBACK ||
1908     bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1909 nigel 53 skipbytes, /* Skip over OP_COND/OP_BRANUMBER */
1910 nigel 37 &subreqchar, /* For possible last char */
1911     &subcountlits, /* For literal count */
1912 nigel 25 cd)) /* Tables block */
1913 nigel 23 goto FAILED;
1914    
1915     /* At the end of compiling, code is still pointing to the start of the
1916     group, while tempcode has been updated to point past the end of the group
1917     and any option resetting that may follow it. The pattern pointer (ptr)
1918     is on the bracket. */
1919    
1920     /* If this is a conditional bracket, check that there are no more than
1921     two branches in the group. */
1922    
1923 nigel 53 else if (bravalue == OP_COND)
1924 nigel 3 {
1925 nigel 23 uschar *tc = code;
1926 nigel 37 condcount = 0;
1927 nigel 23
1928     do {
1929 nigel 37 condcount++;
1930 nigel 23 tc += (tc[1] << 8) | tc[2];
1931     }
1932     while (*tc != OP_KET);
1933    
1934 nigel 37 if (condcount > 2)
1935 nigel 23 {
1936     *errorptr = ERR27;
1937 nigel 3 goto FAILED;
1938 nigel 23 }
1939 nigel 3 }
1940    
1941 nigel 37 /* Handle updating of the required character. If the subpattern didn't
1942     set one, leave it as it was. Otherwise, update it for normal brackets of
1943     all kinds, forward assertions, and conditions with two branches. Don't
1944     update the literal count for forward assertions, however. If the bracket
1945     is followed by a quantifier with zero repeat, we have to back off. Hence
1946     the definition of prevreqchar and subcountlits outside the main loop so
1947     that they can be accessed for the back off. */
1948    
1949     if (subreqchar > 0 &&
1950     (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT ||
1951     (bravalue == OP_COND && condcount == 2)))
1952     {
1953     prevreqchar = *reqchar;
1954     *reqchar = subreqchar;
1955     if (bravalue != OP_ASSERT) *countlits += subcountlits;
1956     }
1957    
1958 nigel 23 /* Now update the main code pointer to the end of the group. */
1959    
1960     code = tempcode;
1961    
1962     /* Error if hit end of pattern */
1963    
1964 nigel 3 if (*ptr != ')')
1965     {
1966     *errorptr = ERR14;
1967     goto FAILED;
1968     }
1969     break;
1970    
1971     /* Check \ for being a real metacharacter; if not, fall through and handle
1972     it as a data character at the start of a string. Escape items are checked
1973     for validity in the pre-compiling pass. */
1974    
1975     case '\\':
1976 nigel 23 tempptr = ptr;
1977 nigel 25 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1978 nigel 3
1979     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1980     are arranged to be the negation of the corresponding OP_values. For the
1981     back references, the values are ESC_REF plus the reference number. Only
1982     back references and those types that consume a character may be repeated.
1983     We can test for values between ESC_b and ESC_Z for the latter; this may
1984     have to change if any new ones are ever created. */
1985    
1986     if (c < 0)
1987     {
1988     if (-c >= ESC_REF)
1989     {
1990 nigel 53 int number = -c - ESC_REF;
1991 nigel 3 previous = code;
1992     *code++ = OP_REF;
1993 nigel 53 *code++ = number >> 8;
1994     *code++ = number & 255;
1995 nigel 3 }
1996     else
1997     {
1998 nigel 23 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
1999 nigel 3 *code++ = -c;
2000     }
2001     continue;
2002     }
2003    
2004 nigel 7 /* Data character: reset and fall through */
2005 nigel 3
2006 nigel 23 ptr = tempptr;
2007 nigel 3 c = '\\';
2008    
2009     /* Handle a run of data characters until a metacharacter is encountered.
2010     The first character is guaranteed not to be whitespace or # when the
2011     extended flag is set. */
2012    
2013     NORMAL_CHAR:
2014     default:
2015     previous = code;
2016     *code = OP_CHARS;
2017     code += 2;
2018     length = 0;
2019    
2020     do
2021     {
2022     if ((options & PCRE_EXTENDED) != 0)
2023     {
2024 nigel 25 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2025 nigel 3 if (c == '#')
2026     {
2027 nigel 47 /* The space before the ; is to avoid a warning on a silly compiler
2028     on the Macintosh. */
2029 nigel 53 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2030 nigel 3 if (c == 0) break;
2031     continue;
2032     }
2033     }
2034    
2035     /* Backslash may introduce a data char or a metacharacter. Escaped items
2036     are checked for validity in the pre-compiling pass. Stop the string
2037     before a metaitem. */
2038    
2039     if (c == '\\')
2040     {
2041 nigel 23 tempptr = ptr;
2042 nigel 25 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
2043 nigel 23 if (c < 0) { ptr = tempptr; break; }
2044 nigel 49
2045     /* If a character is > 127 in UTF-8 mode, we have to turn it into
2046     two or more characters in the UTF-8 encoding. */
2047    
2048     #ifdef SUPPORT_UTF8
2049     if (c > 127 && (options & PCRE_UTF8) != 0)
2050     {
2051     uschar buffer[8];
2052     int len = ord2utf8(c, buffer);
2053     for (c = 0; c < len; c++) *code++ = buffer[c];
2054     length += len;
2055     continue;
2056     }
2057     #endif
2058 nigel 3 }
2059    
2060     /* Ordinary character or single-char escape */
2061    
2062     *code++ = c;
2063     length++;
2064     }
2065    
2066     /* This "while" is the end of the "do" above. */
2067    
2068 nigel 49 while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
2069 nigel 3
2070 nigel 37 /* Update the last character and the count of literals */
2071    
2072     prevreqchar = (length > 1)? code[-2] : *reqchar;
2073     *reqchar = code[-1];
2074     *countlits += length;
2075    
2076 nigel 3 /* Compute the length and set it in the data vector, and advance to
2077     the next state. */
2078    
2079     previous[1] = length;
2080 nigel 49 if (length < MAXLIT) ptr--;
2081 nigel 3 break;
2082     }
2083     } /* end of big loop */
2084    
2085     /* Control never reaches here by falling through, only by a goto for all the
2086     error states. Pass back the position in the pattern so that it can be displayed
2087     to the user for diagnosing the error. */
2088    
2089     FAILED:
2090     *ptrptr = ptr;
2091     return FALSE;
2092     }
2093    
2094    
2095    
2096    
2097     /*************************************************
2098     * Compile sequence of alternatives *
2099     *************************************************/
2100    
2101     /* On entry, ptr is pointing past the bracket character, but on return
2102     it points to the closing bracket, or vertical bar, or end of string.
2103     The code variable is pointing at the byte into which the BRA operator has been
2104 nigel 23 stored. If the ims options are changed at the start (for a (?ims: group) or
2105     during any branch, we need to insert an OP_OPT item at the start of every
2106     following branch to ensure they get set correctly at run time, and also pass
2107     the new options into every subsequent branch compile.
2108 nigel 3
2109     Argument:
2110 nigel 23 options the option bits
2111     optchanged new ims options to set as if (?ims) were at the start, or -1
2112     for no change
2113     brackets -> int containing the number of extracting brackets used
2114     codeptr -> the address of the current code pointer
2115     ptrptr -> the address of the current pattern pointer
2116     errorptr -> pointer to error message
2117     lookbehind TRUE if this is a lookbehind assertion
2118 nigel 53 skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER)
2119 nigel 37 reqchar -> place to put the last required character, or a negative number
2120     countlits -> place to put the shortest literal count of any branch
2121 nigel 25 cd points to the data block with tables pointers
2122 nigel 3
2123 nigel 23 Returns: TRUE on success
2124 nigel 3 */
2125    
2126     static BOOL
2127 nigel 23 compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
2128 nigel 53 const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes,
2129 nigel 37 int *reqchar, int *countlits, compile_data *cd)
2130 nigel 3 {
2131 nigel 7 const uschar *ptr = *ptrptr;
2132 nigel 3 uschar *code = *codeptr;
2133 nigel 23 uschar *last_branch = code;
2134 nigel 3 uschar *start_bracket = code;
2135 nigel 23 uschar *reverse_count = NULL;
2136     int oldoptions = options & PCRE_IMS;
2137 nigel 37 int branchreqchar, branchcountlits;
2138 nigel 3
2139 nigel 37 *reqchar = -1;
2140     *countlits = INT_MAX;
2141 nigel 53 code += 3 + skipbytes;
2142 nigel 23
2143     /* Loop for each alternative branch */
2144    
2145 nigel 3 for (;;)
2146     {
2147     int length;
2148    
2149 nigel 23 /* Handle change of options */
2150    
2151     if (optchanged >= 0)
2152 nigel 3 {
2153 nigel 23 *code++ = OP_OPT;
2154     *code++ = optchanged;
2155     options = (options & ~PCRE_IMS) | optchanged;
2156     }
2157    
2158     /* Set up dummy OP_REVERSE if lookbehind assertion */
2159    
2160     if (lookbehind)
2161     {
2162     *code++ = OP_REVERSE;
2163     reverse_count = code;
2164     *code++ = 0;
2165     *code++ = 0;
2166     }
2167    
2168     /* Now compile the branch */
2169    
2170 nigel 37 if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged,
2171     &branchreqchar, &branchcountlits, cd))
2172 nigel 23 {
2173 nigel 3 *ptrptr = ptr;
2174     return FALSE;
2175     }
2176    
2177     /* Fill in the length of the last branch */
2178    
2179     length = code - last_branch;
2180     last_branch[1] = length >> 8;
2181     last_branch[2] = length & 255;
2182    
2183 nigel 37 /* Save the last required character if all branches have the same; a current
2184     value of -1 means unset, while -2 means "previous branch had no last required
2185     char". */
2186    
2187     if (*reqchar != -2)
2188     {
2189     if (branchreqchar >= 0)
2190     {
2191     if (*reqchar == -1) *reqchar = branchreqchar;
2192     else if (*reqchar != branchreqchar) *reqchar = -2;
2193     }
2194     else *reqchar = -2;
2195     }
2196    
2197     /* Keep the shortest literal count */
2198    
2199     if (branchcountlits < *countlits) *countlits = branchcountlits;
2200     DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits));
2201    
2202 nigel 23 /* If lookbehind, check that this branch matches a fixed-length string,
2203     and put the length into the OP_REVERSE item. Temporarily mark the end of
2204     the branch with OP_END. */
2205    
2206     if (lookbehind)
2207     {
2208     *code = OP_END;
2209 nigel 49 length = find_fixedlength(last_branch, options);
2210 nigel 23 DPRINTF(("fixed length = %d\n", length));
2211     if (length < 0)
2212     {
2213     *errorptr = ERR25;
2214     *ptrptr = ptr;
2215     return FALSE;
2216     }
2217     reverse_count[0] = (length >> 8);
2218     reverse_count[1] = length & 255;
2219     }
2220    
2221 nigel 3 /* Reached end of expression, either ')' or end of pattern. Insert a
2222     terminating ket and the length of the whole bracketed item, and return,
2223 nigel 23 leaving the pointer at the terminating char. If any of the ims options
2224     were changed inside the group, compile a resetting op-code following. */
2225 nigel 3
2226     if (*ptr != '|')
2227     {
2228     length = code - start_bracket;
2229     *code++ = OP_KET;
2230     *code++ = length >> 8;
2231     *code++ = length & 255;
2232 nigel 23 if (optchanged >= 0)
2233     {
2234     *code++ = OP_OPT;
2235     *code++ = oldoptions;
2236     }
2237 nigel 3 *codeptr = code;
2238     *ptrptr = ptr;
2239     return TRUE;
2240     }
2241    
2242     /* Another branch follows; insert an "or" node and advance the pointer. */
2243    
2244     *code = OP_ALT;
2245 nigel 23 last_branch = code;
2246     code += 3;
2247 nigel 3 ptr++;
2248     }
2249     /* Control never reaches here */
2250     }
2251    
2252    
2253    
2254 nigel 23
2255 nigel 3 /*************************************************
2256 nigel 23 * Find first significant op code *
2257     *************************************************/
2258    
2259     /* This is called by several functions that scan a compiled expression looking
2260     for a fixed first character, or an anchoring op code etc. It skips over things
2261     that do not influence this. For one application, a change of caseless option is
2262     important.
2263    
2264     Arguments:
2265     code pointer to the start of the group
2266     options pointer to external options
2267     optbit the option bit whose changing is significant, or
2268     zero if none are
2269     optstop TRUE to return on option change, otherwise change the options
2270     value and continue
2271    
2272     Returns: pointer to the first significant opcode
2273     */
2274    
2275     static const uschar*
2276     first_significant_code(const uschar *code, int *options, int optbit,
2277     BOOL optstop)
2278     {
2279     for (;;)
2280     {
2281     switch ((int)*code)
2282     {
2283     case OP_OPT:
2284     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
2285     {
2286     if (optstop) return code;
2287     *options = (int)code[1];
2288     }
2289     code += 2;
2290     break;
2291    
2292     case OP_CREF:
2293 nigel 53 case OP_BRANUMBER:
2294     code += 3;
2295 nigel 23 break;
2296    
2297 nigel 35 case OP_WORD_BOUNDARY:
2298     case OP_NOT_WORD_BOUNDARY:
2299     code++;
2300     break;
2301    
2302 nigel 23 case OP_ASSERT_NOT:
2303     case OP_ASSERTBACK:
2304     case OP_ASSERTBACK_NOT:
2305     do code += (code[1] << 8) + code[2]; while (*code == OP_ALT);
2306     code += 3;
2307     break;
2308    
2309     default:
2310     return code;
2311     }
2312     }
2313     /* Control never reaches here */
2314     }
2315    
2316    
2317    
2318    
2319     /*************************************************
2320 nigel 3 * Check for anchored expression *
2321     *************************************************/
2322    
2323     /* Try to find out if this is an anchored regular expression. Consider each
2324     alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2325     all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2326     it's anchored. However, if this is a multiline pattern, then only OP_SOD
2327     counts, since OP_CIRC can match in the middle.
2328    
2329 nigel 33 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
2330     because that will try the rest of the pattern at all possible matching points,
2331     so there is no point trying them again.
2332 nigel 3
2333 nigel 23 Arguments:
2334     code points to start of expression (the bracket)
2335     options points to the options setting
2336    
2337     Returns: TRUE or FALSE
2338 nigel 3 */
2339    
2340     static BOOL
2341 nigel 23 is_anchored(register const uschar *code, int *options)
2342 nigel 3 {
2343     do {
2344 nigel 23 const uschar *scode = first_significant_code(code + 3, options,
2345     PCRE_MULTILINE, FALSE);
2346     register int op = *scode;
2347     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2348     { if (!is_anchored(scode, options)) return FALSE; }
2349 nigel 33 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
2350     (*options & PCRE_DOTALL) != 0)
2351 nigel 23 { if (scode[1] != OP_ANY) return FALSE; }
2352     else if (op != OP_SOD &&
2353     ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
2354     return FALSE;
2355 nigel 3 code += (code[1] << 8) + code[2];
2356     }
2357     while (*code == OP_ALT);
2358     return TRUE;
2359     }
2360    
2361    
2362    
2363     /*************************************************
2364 nigel 33 * Check for starting with ^ or .* *
2365 nigel 3 *************************************************/
2366    
2367 nigel 33 /* This is called to find out if every branch starts with ^ or .* so that
2368     "first char" processing can be done to speed things up in multiline
2369     matching and for non-DOTALL patterns that start with .* (which must start at
2370     the beginning or after \n).
2371 nigel 3
2372     Argument: points to start of expression (the bracket)
2373     Returns: TRUE or FALSE
2374     */
2375    
2376     static BOOL
2377 nigel 7 is_startline(const uschar *code)
2378 nigel 3 {
2379     do {
2380 nigel 23 const uschar *scode = first_significant_code(code + 3, NULL, 0, FALSE);
2381     register int op = *scode;
2382     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2383     { if (!is_startline(scode)) return FALSE; }
2384 nigel 33 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2385     { if (scode[1] != OP_ANY) return FALSE; }
2386 nigel 23 else if (op != OP_CIRC) return FALSE;
2387 nigel 3 code += (code[1] << 8) + code[2];
2388     }
2389     while (*code == OP_ALT);
2390     return TRUE;
2391     }
2392    
2393    
2394    
2395     /*************************************************
2396     * Check for fixed first char *
2397     *************************************************/
2398    
2399     /* Try to find out if there is a fixed first character. This is called for
2400     unanchored expressions, as it speeds up their processing quite considerably.
2401     Consider each alternative branch. If they all start with the same char, or with
2402     a bracket all of whose alternatives start with the same char (recurse ad lib),
2403     then we return that char, otherwise -1.
2404    
2405 nigel 23 Arguments:
2406     code points to start of expression (the bracket)
2407     options pointer to the options (used to check casing changes)
2408    
2409     Returns: -1 or the fixed first char
2410 nigel 3 */
2411    
2412     static int
2413 nigel 23 find_firstchar(const uschar *code, int *options)
2414 nigel 3 {
2415     register int c = -1;
2416 nigel 23 do {
2417     int d;
2418     const uschar *scode = first_significant_code(code + 3, options,
2419     PCRE_CASELESS, TRUE);
2420     register int op = *scode;
2421 nigel 3
2422 nigel 23 if (op >= OP_BRA) op = OP_BRA;
2423 nigel 3
2424 nigel 23 switch(op)
2425     {
2426     default:
2427     return -1;
2428 nigel 3
2429 nigel 23 case OP_BRA:
2430     case OP_ASSERT:
2431     case OP_ONCE:
2432     case OP_COND:
2433     if ((d = find_firstchar(scode, options)) < 0) return -1;
2434     if (c < 0) c = d; else if (c != d) return -1;
2435     break;
2436 nigel 3
2437 nigel 23 case OP_EXACT: /* Fall through */
2438     scode++;
2439 nigel 3
2440 nigel 23 case OP_CHARS: /* Fall through */
2441     scode++;
2442    
2443     case OP_PLUS:
2444     case OP_MINPLUS:
2445     if (c < 0) c = scode[1]; else if (c != scode[1]) return -1;
2446     break;
2447     }
2448    
2449     code += (code[1] << 8) + code[2];
2450     }
2451 nigel 3 while (*code == OP_ALT);
2452     return c;
2453     }
2454    
2455    
2456    
2457 nigel 23
2458    
2459 nigel 3 /*************************************************
2460     * Compile a Regular Expression *
2461     *************************************************/
2462    
2463     /* This function takes a string and returns a pointer to a block of store
2464     holding a compiled version of the expression.
2465    
2466     Arguments:
2467     pattern the regular expression
2468     options various option bits
2469     errorptr pointer to pointer to error text
2470     erroroffset ptr offset in pattern where error was detected
2471 nigel 25 tables pointer to character tables or NULL
2472 nigel 3
2473     Returns: pointer to compiled data block, or NULL on error,
2474     with errorptr and erroroffset set
2475     */
2476    
2477     pcre *
2478 nigel 7 pcre_compile(const char *pattern, int options, const char **errorptr,
2479 nigel 25 int *erroroffset, const unsigned char *tables)
2480 nigel 3 {
2481     real_pcre *re;
2482     int length = 3; /* For initial BRA plus length */
2483     int runlength;
2484 nigel 43 int c, reqchar, countlits;
2485 nigel 3 int bracount = 0;
2486     int top_backref = 0;
2487 nigel 23 int branch_extra = 0;
2488     int branch_newextra;
2489 nigel 7 unsigned int brastackptr = 0;
2490 nigel 43 size_t size;
2491 nigel 7 uschar *code;
2492     const uschar *ptr;
2493 nigel 25 compile_data compile_block;
2494 nigel 23 int brastack[BRASTACK_SIZE];
2495     uschar bralenstack[BRASTACK_SIZE];
2496 nigel 3
2497     #ifdef DEBUG
2498     uschar *code_base, *code_end;
2499     #endif
2500    
2501 nigel 49 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
2502    
2503     #ifndef SUPPORT_UTF8
2504     if ((options & PCRE_UTF8) != 0)
2505     {
2506     *errorptr = ERR32;
2507     return NULL;
2508     }
2509     #endif
2510    
2511 nigel 3 /* We can't pass back an error message if errorptr is NULL; I guess the best we
2512     can do is just return NULL. */
2513    
2514     if (errorptr == NULL) return NULL;
2515     *errorptr = NULL;
2516    
2517     /* However, we can give a message for this error */
2518    
2519     if (erroroffset == NULL)
2520     {
2521     *errorptr = ERR16;
2522     return NULL;
2523     }
2524     *erroroffset = 0;
2525    
2526     if ((options & ~PUBLIC_OPTIONS) != 0)
2527     {
2528     *errorptr = ERR17;
2529     return NULL;
2530     }
2531    
2532 nigel 25 /* Set up pointers to the individual character tables */
2533    
2534     if (tables == NULL) tables = pcre_default_tables;
2535     compile_block.lcc = tables + lcc_offset;
2536     compile_block.fcc = tables + fcc_offset;
2537     compile_block.cbits = tables + cbits_offset;
2538     compile_block.ctypes = tables + ctypes_offset;
2539    
2540     /* Reflect pattern for debugging output */
2541    
2542 nigel 9 DPRINTF(("------------------------------------------------------------------\n"));
2543     DPRINTF(("%s\n", pattern));
2544 nigel 3
2545     /* The first thing to do is to make a pass over the pattern to compute the
2546     amount of store required to hold the compiled code. This does not have to be
2547     perfect as long as errors are overestimates. At the same time we can detect any
2548     internal flag settings. Make an attempt to correct for any counted white space
2549     if an "extended" flag setting appears late in the pattern. We can't be so
2550     clever for #-comments. */
2551    
2552 nigel 7 ptr = (const uschar *)(pattern - 1);
2553 nigel 3 while ((c = *(++ptr)) != 0)
2554     {
2555     int min, max;
2556     int class_charcount;
2557 nigel 53 int bracket_length;
2558 nigel 3
2559 nigel 23 if ((options & PCRE_EXTENDED) != 0)
2560 nigel 3 {
2561 nigel 25 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2562 nigel 23 if (c == '#')
2563     {
2564 nigel 47 /* The space before the ; is to avoid a warning on a silly compiler
2565     on the Macintosh. */
2566 nigel 53 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2567 nigel 23 continue;
2568     }
2569 nigel 3 }
2570    
2571     switch(c)
2572     {
2573     /* A backslashed item may be an escaped "normal" character or a
2574     character type. For a "normal" character, put the pointers and
2575     character back so that tests for whitespace etc. in the input
2576     are done correctly. */
2577    
2578     case '\\':
2579     {
2580 nigel 7 const uschar *save_ptr = ptr;
2581 nigel 25 c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
2582 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2583     if (c >= 0)
2584     {
2585     ptr = save_ptr;
2586     c = '\\';
2587     goto NORMAL_CHAR;
2588     }
2589     }
2590     length++;
2591    
2592 nigel 53 /* A back reference needs an additional 2 bytes, plus either one or 5
2593 nigel 3 bytes for a repeat. We also need to keep the value of the highest
2594     back reference. */
2595    
2596     if (c <= -ESC_REF)
2597     {
2598     int refnum = -c - ESC_REF;
2599     if (refnum > top_backref) top_backref = refnum;
2600 nigel 53 length += 2; /* For single back reference */
2601 nigel 25 if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2602 nigel 3 {
2603 nigel 25 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2604 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2605     if ((min == 0 && (max == 1 || max == -1)) ||
2606     (min == 1 && max == -1))
2607     length++;
2608     else length += 5;
2609     if (ptr[1] == '?') ptr++;
2610     }
2611     }
2612     continue;
2613    
2614     case '^':
2615     case '.':
2616     case '$':
2617     case '*': /* These repeats won't be after brackets; */
2618     case '+': /* those are handled separately */
2619     case '?':
2620     length++;
2621     continue;
2622    
2623     /* This covers the cases of repeats after a single char, metachar, class,
2624     or back reference. */
2625    
2626     case '{':
2627 nigel 25 if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
2628     ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
2629 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2630     if ((min == 0 && (max == 1 || max == -1)) ||
2631     (min == 1 && max == -1))
2632     length++;
2633     else
2634     {
2635     length--; /* Uncount the original char or metachar */
2636     if (min == 1) length++; else if (min > 0) length += 4;
2637     if (max > 0) length += 4; else length += 2;
2638     }
2639     if (ptr[1] == '?') ptr++;
2640     continue;
2641    
2642 nigel 23 /* An alternation contains an offset to the next branch or ket. If any ims
2643     options changed in the previous branch(es), and/or if we are in a
2644     lookbehind assertion, extra space will be needed at the start of the
2645     branch. This is handled by branch_extra. */
2646    
2647 nigel 3 case '|':
2648 nigel 23 length += 3 + branch_extra;
2649 nigel 3 continue;
2650    
2651     /* A character class uses 33 characters. Don't worry about character types
2652     that aren't allowed in classes - they'll get picked up during the compile.
2653     A character class that contains only one character uses 2 or 3 bytes,
2654     depending on whether it is negated or not. Notice this where we can. */
2655    
2656     case '[':
2657     class_charcount = 0;
2658     if (*(++ptr) == '^') ptr++;
2659     do
2660     {
2661     if (*ptr == '\\')
2662     {
2663 nigel 25 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
2664     &compile_block);
2665 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2666 nigel 9 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2667 nigel 3 }
2668     else class_charcount++;
2669     ptr++;
2670     }
2671     while (*ptr != 0 && *ptr != ']');
2672    
2673     /* Repeats for negated single chars are handled by the general code */
2674    
2675     if (class_charcount == 1) length += 3; else
2676     {
2677     length += 33;
2678    
2679     /* A repeat needs either 1 or 5 bytes. */
2680    
2681 nigel 25 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2682 nigel 3 {
2683 nigel 25 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2684 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2685     if ((min == 0 && (max == 1 || max == -1)) ||
2686     (min == 1 && max == -1))
2687     length++;
2688     else length += 5;
2689     if (ptr[1] == '?') ptr++;
2690     }
2691     }
2692     continue;
2693    
2694     /* Brackets may be genuine groups or special things */
2695    
2696     case '(':
2697 nigel 23 branch_newextra = 0;
2698 nigel 53 bracket_length = 3;
2699 nigel 3
2700     /* Handle special forms of bracket, which all start (? */
2701    
2702 nigel 23 if (ptr[1] == '?')
2703 nigel 3 {
2704 nigel 23 int set, unset;
2705     int *optset;
2706    
2707     switch (c = ptr[2])
2708 nigel 3 {
2709 nigel 23 /* Skip over comments entirely */
2710     case '#':
2711     ptr += 3;
2712     while (*ptr != 0 && *ptr != ')') ptr++;
2713     if (*ptr == 0)
2714     {
2715     *errorptr = ERR18;
2716     goto PCRE_ERROR_RETURN;
2717     }
2718     continue;
2719 nigel 3
2720 nigel 23 /* Non-referencing groups and lookaheads just move the pointer on, and
2721     then behave like a non-special bracket, except that they don't increment
2722     the count of extracting brackets. Ditto for the "once only" bracket,
2723     which is in Perl from version 5.005. */
2724 nigel 3
2725 nigel 23 case ':':
2726     case '=':
2727     case '!':
2728     case '>':
2729 nigel 3 ptr += 2;
2730     break;
2731    
2732 nigel 43 /* A recursive call to the regex is an extension, to provide the
2733     facility which can be obtained by $(?p{perl-code}) in Perl 5.6. */
2734    
2735     case 'R':
2736     if (ptr[3] != ')')
2737     {
2738     *errorptr = ERR29;
2739     goto PCRE_ERROR_RETURN;
2740     }
2741     ptr += 3;
2742     length += 1;
2743     break;
2744    
2745 nigel 23 /* Lookbehinds are in Perl from version 5.005 */
2746 nigel 3
2747 nigel 23 case '<':
2748     if (ptr[3] == '=' || ptr[3] == '!')
2749 nigel 3 {
2750 nigel 23 ptr += 3;
2751     branch_newextra = 3;
2752     length += 3; /* For the first branch */
2753     break;
2754 nigel 3 }
2755 nigel 23 *errorptr = ERR24;
2756     goto PCRE_ERROR_RETURN;
2757    
2758     /* Conditionals are in Perl from version 5.005. The bracket must either
2759     be followed by a number (for bracket reference) or by an assertion
2760     group. */
2761    
2762     case '(':
2763 nigel 25 if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
2764 nigel 3 {
2765 nigel 23 ptr += 4;
2766 nigel 53 length += 3;
2767 nigel 25 while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
2768 nigel 23 if (*ptr != ')')
2769     {
2770     *errorptr = ERR26;
2771     goto PCRE_ERROR_RETURN;
2772     }
2773 nigel 3 }
2774 nigel 23 else /* An assertion must follow */
2775 nigel 3 {
2776 nigel 23 ptr++; /* Can treat like ':' as far as spacing is concerned */
2777 nigel 47 if (ptr[2] != '?' ||
2778     (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
2779 nigel 23 {
2780     ptr += 2; /* To get right offset in message */
2781     *errorptr = ERR28;
2782     goto PCRE_ERROR_RETURN;
2783     }
2784 nigel 3 }
2785 nigel 23 break;
2786    
2787     /* Else loop checking valid options until ) is met. Anything else is an
2788     error. If we are without any brackets, i.e. at top level, the settings
2789     act as if specified in the options, so massage the options immediately.
2790     This is for backward compatibility with Perl 5.004. */
2791    
2792     default:
2793     set = unset = 0;
2794     optset = &set;
2795     ptr += 2;
2796    
2797     for (;; ptr++)
2798 nigel 3 {
2799 nigel 23 c = *ptr;
2800     switch (c)
2801     {
2802     case 'i':
2803     *optset |= PCRE_CASELESS;
2804     continue;
2805    
2806     case 'm':
2807     *optset |= PCRE_MULTILINE;
2808     continue;
2809    
2810     case 's':
2811     *optset |= PCRE_DOTALL;
2812     continue;
2813    
2814     case 'x':
2815     *optset |= PCRE_EXTENDED;
2816     continue;
2817    
2818     case 'X':
2819     *optset |= PCRE_EXTRA;
2820     continue;
2821    
2822     case 'U':
2823     *optset |= PCRE_UNGREEDY;
2824     continue;
2825    
2826     case '-':
2827     optset = &unset;
2828     continue;
2829    
2830     /* A termination by ')' indicates an options-setting-only item;
2831     this is global at top level; otherwise nothing is done here and
2832     it is handled during the compiling process on a per-bracket-group
2833     basis. */
2834    
2835     case ')':
2836     if (brastackptr == 0)
2837     {
2838     options = (options | set) & (~unset);
2839     set = unset = 0; /* To save length */
2840     }
2841     /* Fall through */
2842    
2843     /* A termination by ':' indicates the start of a nested group with
2844     the given options set. This is again handled at compile time, but
2845     we must allow for compiled space if any of the ims options are
2846     set. We also have to allow for resetting space at the end of
2847     the group, which is why 4 is added to the length and not just 2.
2848     If there are several changes of options within the same group, this
2849     will lead to an over-estimate on the length, but this shouldn't
2850     matter very much. We also have to allow for resetting options at
2851     the start of any alternations, which we do by setting
2852 nigel 37 branch_newextra to 2. Finally, we record whether the case-dependent
2853     flag ever changes within the regex. This is used by the "required
2854     character" code. */
2855 nigel 23
2856     case ':':
2857     if (((set|unset) & PCRE_IMS) != 0)
2858     {
2859     length += 4;
2860     branch_newextra = 2;
2861 nigel 37 if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
2862 nigel 23 }
2863     goto END_OPTIONS;
2864    
2865     /* Unrecognized option character */
2866    
2867     default:
2868     *errorptr = ERR12;
2869     goto PCRE_ERROR_RETURN;
2870     }
2871 nigel 3 }
2872 nigel 23
2873     /* If we hit a closing bracket, that's it - this is a freestanding
2874     option-setting. We need to ensure that branch_extra is updated if
2875     necessary. The only values branch_newextra can have here are 0 or 2.
2876     If the value is 2, then branch_extra must either be 2 or 5, depending
2877     on whether this is a lookbehind group or not. */
2878    
2879     END_OPTIONS:
2880     if (c == ')')
2881 nigel 19 {
2882 nigel 23 if (branch_newextra == 2 && (branch_extra == 0 || branch_extra == 3))
2883     branch_extra += branch_newextra;
2884 nigel 19 continue;
2885     }
2886 nigel 3
2887 nigel 23 /* If options were terminated by ':' control comes here. Fall through
2888     to handle the group below. */
2889 nigel 3 }
2890     }
2891    
2892     /* Extracting brackets must be counted so we can process escapes in a
2893 nigel 53 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
2894     need an additional 3 bytes of store per extracting bracket. */
2895 nigel 3
2896 nigel 53 else
2897     {
2898     bracount++;
2899     if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
2900     }
2901 nigel 3
2902 nigel 53 /* Save length for computing whole length at end if there's a repeat that
2903     requires duplication of the group. Also save the current value of
2904     branch_extra, and start the new group with the new value. If non-zero, this
2905     will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
2906 nigel 3
2907     if (brastackptr >= sizeof(brastack)/sizeof(int))
2908     {
2909     *errorptr = ERR19;
2910     goto PCRE_ERROR_RETURN;
2911     }
2912    
2913 nigel 23 bralenstack[brastackptr] = branch_extra;
2914     branch_extra = branch_newextra;
2915    
2916 nigel 3 brastack[brastackptr++] = length;
2917 nigel 53 length += bracket_length;
2918 nigel 3 continue;
2919    
2920     /* Handle ket. Look for subsequent max/min; for certain sets of values we
2921 nigel 9 have to replicate this bracket up to that many times. If brastackptr is
2922     0 this is an unmatched bracket which will generate an error, but take care
2923 nigel 23 not to try to access brastack[-1] when computing the length and restoring
2924     the branch_extra value. */
2925 nigel 3
2926     case ')':
2927     length += 3;
2928     {
2929 nigel 9 int minval = 1;
2930     int maxval = 1;
2931 nigel 23 int duplength;
2932 nigel 3
2933 nigel 23 if (brastackptr > 0)
2934     {
2935     duplength = length - brastack[--brastackptr];
2936     branch_extra = bralenstack[brastackptr];
2937     }
2938     else duplength = 0;
2939    
2940 nigel 3 /* Leave ptr at the final char; for read_repeat_counts this happens
2941     automatically; for the others we need an increment. */
2942    
2943 nigel 25 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
2944 nigel 3 {
2945 nigel 25 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr,
2946     &compile_block);
2947 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2948     }
2949 nigel 9 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2950     else if (c == '+') { maxval = -1; ptr++; }
2951     else if (c == '?') { minval = 0; ptr++; }
2952 nigel 3
2953 nigel 31 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2954     group, and if the maximum is greater than zero, we have to replicate
2955     maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2956     bracket set - hence the 7. */
2957 nigel 3
2958 nigel 31 if (minval == 0)
2959     {
2960     length++;
2961     if (maxval > 0) length += (maxval - 1) * (duplength + 7);
2962     }
2963    
2964     /* When the minimum is greater than zero, 1 we have to replicate up to
2965     minval-1 times, with no additions required in the copies. Then, if
2966     there is a limited maximum we have to replicate up to maxval-1 times
2967     allowing for a BRAZERO item before each optional copy and nesting
2968     brackets for all but one of the optional copies. */
2969    
2970     else
2971     {
2972     length += (minval - 1) * duplength;
2973     if (maxval > minval) /* Need this test as maxval=-1 means no limit */
2974     length += (maxval - minval) * (duplength + 7) - 6;
2975     }
2976 nigel 3 }
2977     continue;
2978    
2979     /* Non-special character. For a run of such characters the length required
2980     is the number of characters + 2, except that the maximum run length is 255.
2981     We won't get a skipped space or a non-data escape or the start of a #
2982     comment as the first character, so the length can't be zero. */
2983    
2984     NORMAL_CHAR:
2985     default:
2986     length += 2;
2987     runlength = 0;
2988     do
2989     {
2990 nigel 23 if ((options & PCRE_EXTENDED) != 0)
2991 nigel 3 {
2992 nigel 25 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2993 nigel 23 if (c == '#')
2994     {
2995 nigel 47 /* The space before the ; is to avoid a warning on a silly compiler
2996     on the Macintosh. */
2997 nigel 53 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2998 nigel 23 continue;
2999     }
3000 nigel 3 }
3001    
3002     /* Backslash may introduce a data char or a metacharacter; stop the
3003     string before the latter. */
3004    
3005     if (c == '\\')
3006     {
3007 nigel 7 const uschar *saveptr = ptr;
3008 nigel 25 c = check_escape(&ptr, errorptr, bracount, options, FALSE,
3009     &compile_block);
3010 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3011     if (c < 0) { ptr = saveptr; break; }
3012 nigel 49
3013     #ifdef SUPPORT_UTF8
3014     if (c > 127 && (options & PCRE_UTF8) != 0)
3015     {
3016     int i;
3017     for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
3018     if (c <= utf8_table1[i]) break;
3019     runlength += i;
3020     }
3021     #endif
3022 nigel 3 }
3023    
3024     /* Ordinary character or single-char escape */
3025    
3026     runlength++;
3027     }
3028    
3029     /* This "while" is the end of the "do" above. */
3030    
3031 nigel 49 while (runlength < MAXLIT &&
3032 nigel 25 (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
3033 nigel 3
3034     ptr--;
3035     length += runlength;
3036     continue;
3037     }
3038     }
3039    
3040     length += 4; /* For final KET and END */
3041    
3042     if (length > 65539)
3043     {
3044     *errorptr = ERR20;
3045     return NULL;
3046     }
3047    
3048     /* Compute the size of data block needed and get it, either from malloc or
3049 nigel 9 externally provided function. We specify "code[0]" in the offsetof() expression
3050     rather than just "code", because it has been reported that one broken compiler
3051     fails on "code" because it is also an independent variable. It should make no
3052     difference to the value of the offsetof(). */
3053 nigel 3
3054 nigel 9 size = length + offsetof(real_pcre, code[0]);
3055 nigel 3 re = (real_pcre *)(pcre_malloc)(size);
3056    
3057     if (re == NULL)
3058     {
3059     *errorptr = ERR21;
3060     return NULL;
3061     }
3062    
3063 nigel 43 /* Put in the magic number, and save the size, options, and table pointer */
3064 nigel 9
3065 nigel 3 re->magic_number = MAGIC_NUMBER;
3066 nigel 43 re->size = size;
3067 nigel 3 re->options = options;
3068 nigel 25 re->tables = tables;
3069 nigel 3
3070     /* Set up a starting, non-extracting bracket, then compile the expression. On
3071     error, *errorptr will be set non-NULL, so we don't need to look at the result
3072     of the function here. */
3073    
3074 nigel 7 ptr = (const uschar *)pattern;
3075 nigel 3 code = re->code;
3076     *code = OP_BRA;
3077     bracount = 0;
3078 nigel 53 (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, 0,
3079 nigel 37 &reqchar, &countlits, &compile_block);
3080 nigel 3 re->top_bracket = bracount;
3081     re->top_backref = top_backref;
3082    
3083     /* If not reached end of pattern on success, there's an excess bracket. */
3084    
3085     if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
3086    
3087     /* Fill in the terminating state and check for disastrous overflow, but
3088     if debugging, leave the test till after things are printed out. */
3089    
3090     *code++ = OP_END;
3091    
3092     #ifndef DEBUG
3093     if (code - re->code > length) *errorptr = ERR23;
3094     #endif
3095    
3096 nigel 23 /* Give an error if there's back reference to a non-existent capturing
3097     subpattern. */
3098    
3099     if (top_backref > re->top_bracket) *errorptr = ERR15;
3100    
3101 nigel 3 /* Failed to compile */
3102    
3103     if (*errorptr != NULL)
3104     {
3105     (pcre_free)(re);
3106     PCRE_ERROR_RETURN:
3107 nigel 7 *erroroffset = ptr - (const uschar *)pattern;
3108 nigel 3 return NULL;
3109     }
3110    
3111 nigel 33 /* If the anchored option was not passed, set flag if we can determine that the
3112     pattern is anchored by virtue of ^ characters or \A or anything else (such as
3113     starting with .* when DOTALL is set).
3114 nigel 3
3115 nigel 33 Otherwise, see if we can determine what the first character has to be, because
3116     that speeds up unanchored matches no end. If not, see if we can set the
3117     PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
3118     start with ^. and also when all branches start with .* for non-DOTALL matches.
3119     */
3120    
3121 nigel 3 if ((options & PCRE_ANCHORED) == 0)
3122     {
3123 nigel 23 int temp_options = options;
3124     if (is_anchored(re->code, &temp_options))
3125 nigel 3 re->options |= PCRE_ANCHORED;
3126     else
3127     {
3128 nigel 23 int ch = find_firstchar(re->code, &temp_options);
3129 nigel 9 if (ch >= 0)
3130 nigel 3 {
3131 nigel 9 re->first_char = ch;
3132 nigel 3 re->options |= PCRE_FIRSTSET;
3133     }
3134     else if (is_startline(re->code))
3135     re->options |= PCRE_STARTLINE;
3136     }
3137     }
3138    
3139 nigel 37 /* Save the last required character if there are at least two literal
3140     characters on all paths, or if there is no first character setting. */
3141    
3142     if (reqchar >= 0 && (countlits > 1 || (re->options & PCRE_FIRSTSET) == 0))
3143     {
3144     re->req_char = reqchar;
3145     re->options |= PCRE_REQCHSET;
3146     }
3147    
3148 nigel 3 /* Print out the compiled data for debugging */
3149    
3150     #ifdef DEBUG
3151    
3152 nigel 23 printf("Length = %d top_bracket = %d top_backref = %d\n",
3153 nigel 3 length, re->top_bracket, re->top_backref);
3154    
3155     if (re->options != 0)
3156     {
3157 nigel 37 printf("%s%s%s%s%s%s%s%s%s\n",
3158 nigel 3 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
3159     ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
3160 nigel 37 ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
3161 nigel 3 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
3162     ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
3163     ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
3164     ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
3165 nigel 19 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
3166     ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
3167 nigel 3 }
3168    
3169     if ((re->options & PCRE_FIRSTSET) != 0)
3170     {
3171     if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
3172     else printf("First char = \\x%02x\n", re->first_char);
3173     }
3174    
3175 nigel 37 if ((re->options & PCRE_REQCHSET) != 0)
3176     {
3177     if (isprint(re->req_char)) printf("Req char = %c\n", re->req_char);
3178     else printf("Req char = \\x%02x\n", re->req_char);
3179     }
3180    
3181 nigel 3 code_end = code;
3182     code_base = code = re->code;
3183    
3184     while (code < code_end)
3185     {
3186     int charlength;
3187    
3188     printf("%3d ", code - code_base);
3189    
3190     if (*code >= OP_BRA)
3191     {
3192 nigel 53 if (*code - OP_BRA > EXTRACT_BASIC_MAX)
3193     printf("%3d Bra extra", (code[1] << 8) + code[2]);
3194     else
3195     printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
3196 nigel 3 code += 2;
3197     }
3198    
3199     else switch(*code)
3200     {
3201 nigel 23 case OP_OPT:
3202     printf(" %.2x %s", code[1], OP_names[*code]);
3203     code++;
3204     break;
3205    
3206 nigel 3 case OP_CHARS:
3207     charlength = *(++code);
3208     printf("%3d ", charlength);
3209     while (charlength-- > 0)
3210     if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
3211     break;
3212    
3213     case OP_KETRMAX:
3214     case OP_KETRMIN:
3215     case OP_ALT:
3216     case OP_KET:
3217     case OP_ASSERT:
3218     case OP_ASSERT_NOT:
3219 nigel 23 case OP_ASSERTBACK:
3220     case OP_ASSERTBACK_NOT:
3221 nigel 3 case OP_ONCE:
3222 nigel 23 case OP_REVERSE:
3223 nigel 53 case OP_BRANUMBER:
3224     case OP_COND:
3225     case OP_CREF:
3226 nigel 23 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
3227     code += 2;
3228     break;
3229    
3230 nigel 3 case OP_STAR:
3231     case OP_MINSTAR:
3232     case OP_PLUS:
3233     case OP_MINPLUS:
3234     case OP_QUERY:
3235     case OP_MINQUERY:
3236     case OP_TYPESTAR:
3237     case OP_TYPEMINSTAR:
3238     case OP_TYPEPLUS:
3239     case OP_TYPEMINPLUS:
3240     case OP_TYPEQUERY:
3241     case OP_TYPEMINQUERY:
3242     if (*code >= OP_TYPESTAR)
3243     printf(" %s", OP_names[code[1]]);
3244     else if (isprint(c = code[1])) printf(" %c", c);
3245     else printf(" \\x%02x", c);
3246     printf("%s", OP_names[*code++]);
3247     break;
3248    
3249     case OP_EXACT:
3250     case OP_UPTO:
3251     case OP_MINUPTO:
3252     if (isprint(c = code[3])) printf(" %c{", c);
3253     else printf(" \\x%02x{", c);
3254 nigel 11 if (*code != OP_EXACT) printf("0,");
3255 nigel 3 printf("%d}", (code[1] << 8) + code[2]);
3256     if (*code == OP_MINUPTO) printf("?");
3257     code += 3;
3258     break;
3259    
3260     case OP_TYPEEXACT:
3261     case OP_TYPEUPTO:
3262     case OP_TYPEMINUPTO:
3263     printf(" %s{", OP_names[code[3]]);
3264     if (*code != OP_TYPEEXACT) printf(",");
3265     printf("%d}", (code[1] << 8) + code[2]);
3266     if (*code == OP_TYPEMINUPTO) printf("?");
3267     code += 3;
3268     break;
3269    
3270     case OP_NOT:
3271     if (isprint(c = *(++code))) printf(" [^%c]", c);
3272     else printf(" [^\\x%02x]", c);
3273     break;
3274    
3275     case OP_NOTSTAR:
3276     case OP_NOTMINSTAR:
3277     case OP_NOTPLUS:
3278     case OP_NOTMINPLUS:
3279     case OP_NOTQUERY:
3280     case OP_NOTMINQUERY:
3281     if (isprint(c = code[1])) printf(" [^%c]", c);
3282     else printf(" [^\\x%02x]", c);
3283     printf("%s", OP_names[*code++]);
3284     break;
3285    
3286     case OP_NOTEXACT:
3287     case OP_NOTUPTO:
3288     case OP_NOTMINUPTO:
3289     if (isprint(c = code[3])) printf(" [^%c]{", c);
3290     else printf(" [^\\x%02x]{", c);
3291     if (*code != OP_NOTEXACT) printf(",");
3292     printf("%d}", (code[1] << 8) + code[2]);
3293     if (*code == OP_NOTMINUPTO) printf("?");
3294     code += 3;
3295     break;
3296    
3297     case OP_REF:
3298 nigel 53 printf(" \\%d", (code[1] << 8) | code[2]);
3299     code += 3;
3300 nigel 9 goto CLASS_REF_REPEAT;
3301 nigel 3
3302     case OP_CLASS:
3303     {
3304     int i, min, max;
3305 nigel 23 code++;
3306     printf(" [");
3307 nigel 3
3308     for (i = 0; i < 256; i++)
3309     {
3310     if ((code[i/8] & (1 << (i&7))) != 0)
3311     {
3312     int j;
3313     for (j = i+1; j < 256; j++)
3314     if ((code[j/8] & (1 << (j&7))) == 0) break;
3315     if (i == '-' || i == ']') printf("\\");
3316     if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
3317     if (--j > i)
3318     {
3319     printf("-");
3320     if (j == '-' || j == ']') printf("\\");
3321     if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
3322     }
3323     i = j;
3324     }
3325     }
3326     printf("]");
3327     code += 32;
3328    
3329 nigel 9 CLASS_REF_REPEAT:
3330    
3331 nigel 3 switch(*code)
3332     {
3333     case OP_CRSTAR:
3334     case OP_CRMINSTAR:
3335     case OP_CRPLUS:
3336     case OP_CRMINPLUS:
3337     case OP_CRQUERY:
3338     case OP_CRMINQUERY:
3339     printf("%s", OP_names[*code]);
3340     break;
3341    
3342     case OP_CRRANGE:
3343     case OP_CRMINRANGE:
3344     min = (code[1] << 8) + code[2];
3345     max = (code[3] << 8) + code[4];
3346     if (max == 0) printf("{%d,}", min);
3347     else printf("{%d,%d}", min, max);
3348     if (*code == OP_CRMINRANGE) printf("?");
3349     code += 4;
3350     break;
3351    
3352     default:
3353     code--;
3354     }
3355     }
3356     break;
3357    
3358     /* Anything else is just a one-node item */
3359    
3360     default:
3361     printf(" %s", OP_names[*code]);
3362     break;
3363     }
3364    
3365     code++;
3366     printf("\n");
3367     }
3368     printf("------------------------------------------------------------------\n");
3369    
3370     /* This check is done here in the debugging case so that the code that
3371     was compiled can be seen. */
3372    
3373     if (code - re->code > length)
3374     {
3375     *errorptr = ERR23;
3376     (pcre_free)(re);
3377     *erroroffset = ptr - (uschar *)pattern;
3378     return NULL;
3379     }
3380     #endif
3381    
3382     return (pcre *)re;
3383     }
3384    
3385    
3386    
3387     /*************************************************
3388     * Match a back-reference *
3389     *************************************************/
3390    
3391 nigel 23 /* If a back reference hasn't been set, the length that is passed is greater
3392     than the number of characters left in the string, so the match fails.
3393 nigel 3
3394     Arguments:
3395 nigel 23 offset index into the offset vector
3396 nigel 3 eptr points into the subject
3397     length length to be matched
3398     md points to match data block
3399 nigel 23 ims the ims flags
3400 nigel 3
3401     Returns: TRUE if matched
3402     */
3403    
3404     static BOOL
3405 nigel 23 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
3406 nigel 37 unsigned long int ims)
3407 nigel 3 {
3408 nigel 23 const uschar *p = md->start_subject + md->offset_vector[offset];
3409 nigel 3
3410     #ifdef DEBUG
3411     if (eptr >= md->end_subject)
3412     printf("matching subject <null>");
3413     else
3414     {
3415     printf("matching subject ");
3416     pchars(eptr, length, TRUE, md);
3417     }
3418     printf(" against backref ");
3419     pchars(p, length, FALSE, md);
3420     printf("\n");
3421     #endif
3422    
3423     /* Always fail if not enough characters left */
3424    
3425 nigel 23 if (length > md->end_subject - eptr) return FALSE;
3426 nigel 3
3427     /* Separate the caselesss case for speed */
3428    
3429 nigel 23 if ((ims & PCRE_CASELESS) != 0)
3430 nigel 25 {
3431     while (length-- > 0)
3432     if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
3433     }
3434 nigel 3 else
3435     { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
3436    
3437     return TRUE;
3438     }
3439    
3440    
3441    
3442     /*************************************************
3443     * Match from current position *
3444     *************************************************/
3445    
3446 nigel 23 /* On entry ecode points to the first opcode, and eptr to the first character
3447     in the subject string, while eptrb holds the value of eptr at the start of the
3448     last bracketed group - used for breaking infinite loops matching zero-length
3449     strings.
3450 nigel 3
3451     Arguments:
3452     eptr pointer in subject
3453     ecode position in code
3454     offset_top current top pointer
3455     md pointer to "static" info for the match
3456 nigel 23 ims current /i, /m, and /s options
3457 nigel 47 eptrb pointer to chain of blocks containing eptr at start of
3458     brackets - for testing for empty matches
3459     flags can contain
3460     match_condassert - this is an assertion condition
3461     match_isgroup - this is the start of a bracketed group
3462 nigel 3
3463     Returns: TRUE if matched
3464     */
3465    
3466     static BOOL
3467 nigel 23 match(register const uschar *eptr, register const uschar *ecode,
3468 nigel 47 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
3469     int flags)
3470 nigel 3 {
3471 nigel 37 unsigned long int original_ims = ims; /* Save for resetting on ')' */
3472 nigel 47 eptrblock newptrb;
3473 nigel 23
3474 nigel 47 /* At the start of a bracketed group, add the current subject pointer to the
3475     stack of such pointers, to be re-instated at the end of the group when we hit
3476     the closing ket. When match() is called in other circumstances, we don't add to
3477     the stack. */
3478    
3479     if ((flags & match_isgroup) != 0)
3480     {
3481     newptrb.prev = eptrb;
3482     newptrb.saved_eptr = eptr;
3483     eptrb = &newptrb;
3484     }
3485    
3486     /* Now start processing the operations. */
3487    
3488 nigel 3 for (;;)
3489     {
3490 nigel 23 int op = (int)*ecode;
3491 nigel 3 int min, max, ctype;
3492     register int i;
3493     register int c;
3494 nigel 7 BOOL minimize = FALSE;
3495 nigel 3
3496 nigel 23 /* Opening capturing bracket. If there is space in the offset vector, save
3497     the current subject position in the working slot at the top of the vector. We
3498     mustn't change the current values of the data slot, because they may be set
3499     from a previous iteration of this group, and be referred to by a reference
3500     inside the group.
3501 nigel 3
3502 nigel 23 If the bracket fails to match, we need to restore this value and also the
3503     values of the final offsets, in case they were set by a previous iteration of
3504     the same bracket.
3505    
3506     If there isn't enough space in the offset vector, treat this as if it were a
3507     non-capturing bracket. Don't worry about setting the flag for the error case
3508     here; that is handled in the code for KET. */
3509    
3510     if (op > OP_BRA)
3511 nigel 3 {
3512 nigel 53 int offset;
3513 nigel 23 int number = op - OP_BRA;
3514 nigel 3
3515 nigel 53 /* For extended extraction brackets (large number), we have to fish out the
3516     number from a dummy opcode at the start. */
3517    
3518     if (number > EXTRACT_BASIC_MAX) number = (ecode[4] << 8) | ecode[5];
3519     offset = number << 1;
3520    
3521 nigel 31 #ifdef DEBUG
3522     printf("start bracket %d subject=", number);
3523     pchars(eptr, 16, TRUE, md);
3524     printf("\n");
3525     #endif
3526 nigel 3
3527 nigel 23 if (offset < md->offset_max)
3528 nigel 3 {
3529 nigel 23 int save_offset1 = md->offset_vector[offset];
3530     int save_offset2 = md->offset_vector[offset+1];
3531     int save_offset3 = md->offset_vector[md->offset_end - number];
3532 nigel 3
3533 nigel 23 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
3534     md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
3535    
3536     do
3537     {
3538 nigel 47 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3539     return TRUE;
3540 nigel 23 ecode += (ecode[1] << 8) + ecode[2];
3541     }
3542     while (*ecode == OP_ALT);
3543    
3544     DPRINTF(("bracket %d failed\n", number));
3545    
3546     md->offset_vector[offset] = save_offset1;
3547     md->offset_vector[offset+1] = save_offset2;
3548     md->offset_vector[md->offset_end - number] = save_offset3;
3549 nigel 53
3550 nigel 23 return FALSE;
3551 nigel 3 }
3552    
3553 nigel 23 /* Insufficient room for saving captured contents */
3554 nigel 3
3555 nigel 23 else op = OP_BRA;
3556     }
3557    
3558     /* Other types of node can be handled by a switch */
3559    
3560     switch(op)
3561     {
3562     case OP_BRA: /* Non-capturing bracket: optimized */
3563     DPRINTF(("start bracket 0\n"));
3564 nigel 3 do
3565     {
3566 nigel 47 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3567     return TRUE;
3568 nigel 3 ecode += (ecode[1] << 8) + ecode[2];
3569     }
3570     while (*ecode == OP_ALT);
3571 nigel 23 DPRINTF(("bracket 0 failed\n"));
3572     return FALSE;
3573 nigel 3
3574 nigel 23 /* Conditional group: compilation checked that there are no more than
3575     two branches. If the condition is false, skipping the first branch takes us
3576     past the end if there is only one branch, but that's OK because that is
3577     exactly what going to the ket would do. */
3578 nigel 3
3579 nigel 23 case OP_COND:
3580     if (ecode[3] == OP_CREF) /* Condition is extraction test */
3581 nigel 3 {
3582 nigel 53 int offset = (ecode[4] << 9) | (ecode[5] << 1); /* Doubled ref number */
3583 nigel 23 return match(eptr,
3584     ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
3585 nigel 53 6 : 3 + (ecode[1] << 8) + ecode[2]),
3586 nigel 47 offset_top, md, ims, eptrb, match_isgroup);
3587 nigel 3 }
3588    
3589 nigel 23 /* The condition is an assertion. Call match() to evaluate it - setting
3590     the final argument TRUE causes it to stop at the end of an assertion. */
3591 nigel 3
3592 nigel 23 else
3593     {
3594 nigel 47 if (match(eptr, ecode+3, offset_top, md, ims, NULL,
3595     match_condassert | match_isgroup))
3596 nigel 23 {
3597     ecode += 3 + (ecode[4] << 8) + ecode[5];
3598     while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
3599     }
3600     else ecode += (ecode[1] << 8) + ecode[2];
3601 nigel 47 return match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup);
3602 nigel 23 }
3603     /* Control never reaches here */
3604 nigel 3
3605 nigel 53 /* Skip over conditional reference or large extraction number data if
3606     encountered. */
3607 nigel 23
3608     case OP_CREF:
3609 nigel 53 case OP_BRANUMBER:
3610     ecode += 3;
3611 nigel 23 break;
3612    
3613 nigel 37 /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
3614     an empty string - recursion will then try other alternatives, if any. */
3615 nigel 23
3616 nigel 3 case OP_END:
3617 nigel 37 if (md->notempty && eptr == md->start_match) return FALSE;
3618 nigel 3 md->end_match_ptr = eptr; /* Record where we ended */
3619     md->end_offset_top = offset_top; /* and how many extracts were taken */
3620     return TRUE;
3621    
3622 nigel 23 /* Change option settings */
3623 nigel 3
3624 nigel 23 case OP_OPT:
3625     ims = ecode[1];
3626     ecode += 2;
3627 nigel 39 DPRINTF(("ims set to %02lx\n", ims));
3628 nigel 23 break;
3629 nigel 3
3630     /* Assertion brackets. Check the alternative branches in turn - the
3631     matching won't pass the KET for an assertion. If any one branch matches,
3632 nigel 23 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
3633     start of each branch to move the current point backwards, so the code at
3634     this level is identical to the lookahead case. */
3635 nigel 3
3636     case OP_ASSERT:
3637 nigel 23 case OP_ASSERTBACK:
3638 nigel 3 do
3639     {
3640 nigel 47 if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup)) break;
3641 nigel 3 ecode += (ecode[1] << 8) + ecode[2];
3642     }
3643     while (*ecode == OP_ALT);
3644     if (*ecode == OP_KET) return FALSE;
3645    
3646 nigel 23 /* If checking an assertion for a condition, return TRUE. */
3647    
3648 nigel 47 if ((flags & match_condassert) != 0) return TRUE;
3649 nigel 23
3650 nigel 3 /* Continue from after the assertion, updating the offsets high water
3651     mark, since extracts may have been taken during the assertion. */
3652    
3653     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3654     ecode += 3;
3655     offset_top = md->end_offset_top;
3656     continue;
3657    
3658     /* Negative assertion: all branches must fail to match */
3659    
3660     case OP_ASSERT_NOT:
3661 nigel 23 case OP_ASSERTBACK_NOT:
3662 nigel 3 do
3663     {
3664 nigel 47 if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup))
3665     return FALSE;
3666 nigel 3 ecode += (ecode[1] << 8) + ecode[2];
3667     }
3668     while (*ecode == OP_ALT);
3669 nigel 23
3670 nigel 47 if ((flags & match_condassert) != 0) return TRUE;
3671    
3672 nigel 3 ecode += 3;
3673     continue;
3674    
3675 nigel 23 /* Move the subject pointer back. This occurs only at the start of
3676     each branch of a lookbehind assertion. If we are too close to the start to
3677 nigel 49 move back, this match function fails. When working with UTF-8 we move
3678     back a number of characters, not bytes. */
3679 nigel 23
3680     case OP_REVERSE:
3681 nigel 49 #ifdef SUPPORT_UTF8
3682     c = (ecode[1] << 8) + ecode[2];
3683     for (i = 0; i < c; i++)
3684     {
3685     eptr--;
3686     BACKCHAR(eptr)
3687     }
3688     #else
3689 nigel 23 eptr -= (ecode[1] << 8) + ecode[2];
3690 nigel 49 #endif
3691    
3692 nigel 23 if (eptr < md->start_subject) return FALSE;
3693     ecode += 3;
3694     break;
3695    
3696 nigel 43 /* Recursion matches the current regex, nested. If there are any capturing
3697     brackets started but not finished, we have to save their starting points
3698     and reinstate them after the recursion. However, we don't know how many
3699     such there are (offset_top records the completed total) so we just have
3700     to save all the potential data. There may be up to 99 such values, which
3701     is a bit large to put on the stack, but using malloc for small numbers
3702     seems expensive. As a compromise, the stack is used when there are fewer
3703     than 16 values to store; otherwise malloc is used. A problem is what to do
3704     if the malloc fails ... there is no way of returning to the top level with
3705     an error. Save the top 15 values on the stack, and accept that the rest
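3706     may be wrong. NOTE(review): save[] is indexed from 1 to c inclusive, so holding 15 values needs a 16-element stacksave - confirm the array size below. */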
3707 nigel 23
3708 nigel 43 case OP_RECURSE:
3709     {
3710     BOOL rc;
3711     int *save;
3712     int stacksave[15];
3713    
3714     c = md->offset_max;
3715    
3716     if (c < 16) save = stacksave; else
3717     {
3718     save = (int *)(pcre_malloc)((c+1) * sizeof(int));
3719     if (save == NULL)
3720     {
3721     save = stacksave;
3722     c = 15;
3723     }
3724     }
3725    
3726     for (i = 1; i <= c; i++)
3727     save[i] = md->offset_vector[md->offset_end - i];
3728 nigel 47 rc = match(eptr, md->start_pattern, offset_top, md, ims, eptrb,
3729     match_isgroup);
3730 nigel 43 for (i = 1; i <= c; i++)
3731     md->offset_vector[md->offset_end - i] = save[i];
3732     if (save != stacksave) (pcre_free)(save);
3733     if (!rc) return FALSE;
3734    
3735     /* In case the recursion has set more capturing values, save the final
3736     number, then move along the subject till after the recursive match,
3737     and advance one byte in the pattern code. */
3738    
3739     offset_top = md->end_offset_top;
3740     eptr = md->end_match_ptr;
3741     ecode++;
3742     }
3743     break;
3744    
3745 nigel 3 /* "Once" brackets are like assertion brackets except that after a match,
3746     the point in the subject string is not moved back. Thus there can never be
3747 nigel 5 a move back into the brackets. Check the alternative branches in turn - the
3748 nigel 3 matching won't pass the KET for this kind of subpattern. If any one branch
3749 nigel 23 matches, we carry on as at the end of a normal bracket, leaving the subject
3750     pointer. */
3751 nigel 3
3752     case OP_ONCE:
3753     {
3754 nigel 23 const uschar *prev = ecode;
3755 nigel 47 const uschar *saved_eptr = eptr;
3756 nigel 3
3757 nigel 23 do
3758     {
3759 nigel 47 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3760     break;
3761 nigel 23 ecode += (ecode[1] << 8) + ecode[2];
3762     }
3763     while (*ecode == OP_ALT);
3764 nigel 3
3765 nigel 23 /* If we hit the end of the group (which could be repeated), fail */
3766 nigel 3
3767 nigel 23 if (*ecode != OP_ONCE && *ecode != OP_ALT) return FALSE;
3768    
3769     /* Continue as from after the group, updating the offsets high water
3770     mark, since extracts may have been taken. */
3771    
3772     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3773    
3774     offset_top = md->end_offset_top;
3775     eptr = md->end_match_ptr;
3776    
3777     /* For a non-repeating ket, just continue at this level. This also
3778     happens for a repeating ket if no characters were matched in the group.
3779     This is the forcible breaking of infinite loops as implemented in Perl
3780     5.005. If there is an options reset, it will get obeyed in the normal
3781     course of events. */
3782    
3783 nigel 47 if (*ecode == OP_KET || eptr == saved_eptr)
3784 nigel 23 {
3785     ecode += 3;
3786     break;
3787     }
3788    
3789     /* The repeating kets try the rest of the pattern or restart from the
3790     preceding bracket, in the appropriate order. We need to reset any options
3791     that changed within the bracket before re-running it, so check the next
3792     opcode. */
3793    
3794     if (ecode[3] == OP_OPT)
3795     {
3796     ims = (ims & ~PCRE_IMS) | ecode[4];
3797 nigel 39 DPRINTF(("ims set to %02lx at group repeat\n", ims));
3798 nigel 23 }
3799    
3800     if (*ecode == OP_KETRMIN)
3801     {
3802 nigel 47 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3803     match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3804     return TRUE;
3805 nigel 23 }
3806     else /* OP_KETRMAX */
3807     {
3808 nigel 47 if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3809     match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3810 nigel 23 }
3811     }
3812     return FALSE;
3813    
3814 nigel 3 /* An alternation is the end of a branch; scan along to find the end of the
3815     bracketed group and go to there. */
3816    
3817     case OP_ALT:
3818     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3819     break;
3820    
3821     /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3822     that it may occur zero times. It may repeat infinitely, or not at all -
3823     i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3824     repeat limits are compiled as a number of copies, with the optional ones
3825     preceded by BRAZERO or BRAMINZERO. */
3826    
3827     case OP_BRAZERO:
3828     {
3829 nigel 7 const uschar *next = ecode+1;
3830 nigel 47 if (match(eptr, next, offset_top, md, ims, eptrb, match_isgroup))
3831     return TRUE;
3832 nigel 3 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3833     ecode = next + 3;
3834     }
3835     break;
3836    
3837     case OP_BRAMINZERO:
3838     {
3839 nigel 7 const uschar *next = ecode+1;
3840 nigel 3 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3841 nigel 47 if (match(eptr, next+3, offset_top, md, ims, eptrb, match_isgroup))
3842     return TRUE;
3843 nigel 3 ecode++;
3844     }
3845 nigel 23 break;
3846 nigel 3
3847     /* End of a group, repeated or non-repeating. If we are at the end of
3848     an assertion "group", stop matching and return TRUE, but record the
3849 nigel 23 current high water mark for use by positive assertions. Do this also
3850     for the "once" (non-backing-up) groups. */
3851 nigel 3
3852     case OP_KET:
3853     case OP_KETRMIN:
3854     case OP_KETRMAX:
3855     {
3856 nigel 7 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3857 nigel 47 const uschar *saved_eptr = eptrb->saved_eptr;
3858 nigel 3
3859 nigel 47 eptrb = eptrb->prev; /* Back up the stack of bracket start pointers */
3860    
3861 nigel 23 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
3862     *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
3863     *prev == OP_ONCE)
3864 nigel 3 {
3865     md->end_match_ptr = eptr; /* For ONCE */
3866     md->end_offset_top = offset_top;
3867     return TRUE;
3868     }
3869    
3870 nigel 23 /* In all other cases except a conditional group we have to check the
3871     group number back at the start and if necessary complete handling an
3872     extraction by setting the offsets and bumping the high water mark. */
3873 nigel 3
3874 nigel 23 if (*prev != OP_COND)
3875     {
3876 nigel 53 int offset;
3877 nigel 23 int number = *prev - OP_BRA;
3878 nigel 3
3879 nigel 53 /* For extended extraction brackets (large number), we have to fish out
3880     the number from a dummy opcode at the start. */
3881    
3882     if (number > EXTRACT_BASIC_MAX) number = (prev[4] << 8) | prev[5];
3883     offset = number << 1;
3884    
3885 nigel 47 #ifdef DEBUG
3886     printf("end bracket %d", number);
3887     printf("\n");
3888     #endif
3889 nigel 3
3890 nigel 23 if (number > 0)
3891 nigel 3 {
3892 nigel 23 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
3893     {
3894     md->offset_vector[offset] =
3895     md->offset_vector[md->offset_end - number];
3896     md->offset_vector[offset+1] = eptr - md->start_subject;
3897     if (offset_top <= offset) offset_top = offset + 2;
3898     }
3899 nigel 3 }
3900     }
3901    
3902 nigel 23 /* Reset the value of the ims flags, in case they got changed during
3903     the group. */
3904 nigel 3
3905 nigel 23 ims = original_ims;
3906 nigel 39 DPRINTF(("ims reset to %02lx\n", ims));
3907 nigel 23
3908     /* For a non-repeating ket, just continue at this level. This also
3909     happens for a repeating ket if no characters were matched in the group.
3910     This is the forcible breaking of infinite loops as implemented in Perl
3911     5.005. If there is an options reset, it will get obeyed in the normal
3912     course of events. */
3913    
3914 nigel 47 if (*ecode == OP_KET || eptr == saved_eptr)
3915 nigel 3 {
3916     ecode += 3;
3917     break;
3918     }
3919    
3920     /* The repeating kets try the rest of the pattern or restart from the
3921     preceding bracket, in the appropriate order. */
3922    
3923     if (*ecode == OP_KETRMIN)
3924     {
3925 nigel 47 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3926     match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3927     return TRUE;
3928 nigel 3 }
3929     else /* OP_KETRMAX */
3930     {
3931 nigel 47 if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3932     match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3933 nigel 3 }
3934     }
3935     return FALSE;
3936    
3937     /* Start of subject unless notbol, or after internal newline if multiline */
3938    
3939     case OP_CIRC:
3940     if (md->notbol && eptr == md->start_subject) return FALSE;
3941 nigel 23 if ((ims & PCRE_MULTILINE) != 0)
3942 nigel 3 {
3943 nigel 53 if (eptr != md->start_subject && eptr[-1] != NEWLINE) return FALSE;
3944 nigel 3 ecode++;
3945     break;
3946     }
3947     /* ... else fall through */
3948    
3949     /* Start of subject assertion */
3950    
3951     case OP_SOD:
3952     if (eptr != md->start_subject) return FALSE;
3953     ecode++;
3954     break;
3955    
3956 nigel 23 /* Assert before internal newline if multiline, or before a terminating
3957     newline unless endonly is set, else end of subject unless noteol is set. */
3958 nigel 3
3959     case OP_DOLL:
3960 nigel 23 if ((ims & PCRE_MULTILINE) != 0)
3961 nigel 3 {
3962 nigel 53 if (eptr < md->end_subject) { if (*eptr != NEWLINE) return FALSE; }
3963 nigel 23 else { if (md->noteol) return FALSE; }
3964 nigel 3 ecode++;
3965     break;
3966     }
3967 nigel 23 else
3968 nigel 3 {
3969 nigel 23 if (md->noteol) return FALSE;
3970     if (!md->endonly)
3971     {
3972     if (eptr < md->end_subject - 1 ||
3973 nigel 53 (eptr == md->end_subject - 1 && *eptr != NEWLINE)) return FALSE;
3974 nigel 23
3975     ecode++;
3976     break;
3977     }
3978 nigel 3 }
3979     /* ... else fall through */
3980    
3981 nigel 23 /* End of subject assertion (\z) */
3982 nigel 3
3983     case OP_EOD:
3984     if (eptr < md->end_subject) return FALSE;
3985     ecode++;
3986     break;
3987    
3988 nigel 23 /* End of subject or ending \n assertion (\Z) */
3989    
3990     case OP_EODN:
3991     if (eptr < md->end_subject - 1 ||
3992 nigel 53 (eptr == md->end_subject - 1 && *eptr != NEWLINE)) return FALSE;
3993 nigel 23 ecode++;
3994     break;
3995    
3996 nigel 3 /* Word boundary assertions */
3997    
3998     case OP_NOT_WORD_BOUNDARY:
3999     case OP_WORD_BOUNDARY:
4000     {
4001     BOOL prev_is_word = (eptr != md->start_subject) &&
4002 nigel 25 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
4003 nigel 3 BOOL cur_is_word = (eptr < md->end_subject) &&
4004 nigel 25 ((md->ctypes[*eptr] & ctype_word) != 0);
4005 nigel 3 if ((*ecode++ == OP_WORD_BOUNDARY)?
4006     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
4007     return FALSE;
4008     }
4009     break;
4010    
4011     /* Match a single character type; inline for speed */
4012    
4013     case OP_ANY:
4014 nigel 53 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
4015 nigel 23 return FALSE;
4016 nigel 3 if (eptr++ >= md->end_subject) return FALSE;
4017 nigel 49 #ifdef SUPPORT_UTF8
4018     if (md->utf8)
4019     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4020     #endif
4021 nigel 3 ecode++;
4022     break;
4023    
4024     case OP_NOT_DIGIT:
4025 nigel 25 if (eptr >= md->end_subject ||
4026     (md->ctypes[*eptr++] & ctype_digit) != 0)
4027 nigel 3 return FALSE;
4028     ecode++;
4029     break;
4030    
4031     case OP_DIGIT:
4032 nigel 25 if (eptr >= md->end_subject ||
4033     (md->ctypes[*eptr++] & ctype_digit) == 0)
4034 nigel 3 return FALSE;
4035     ecode++;
4036     break;
4037    
4038     case OP_NOT_WHITESPACE:
4039 nigel 25 if (eptr >= md->end_subject ||
4040     (md->ctypes[*eptr++] & ctype_space) != 0)
4041 nigel 3 return FALSE;
4042     ecode++;
4043     break;
4044    
4045     case OP_WHITESPACE:
4046 nigel 25 if (eptr >= md->end_subject ||
4047     (md->ctypes[*eptr++] & ctype_space) == 0)
4048 nigel 3 return FALSE;
4049     ecode++;
4050     break;
4051    
4052     case OP_NOT_WORDCHAR:
4053 nigel 25 if (eptr >= md->end_subject ||
4054     (md->ctypes[*eptr++] & ctype_word) != 0)
4055 nigel 3 return FALSE;
4056     ecode++;
4057     break;
4058    
4059     case OP_WORDCHAR:
4060 nigel 25 if (eptr >= md->end_subject ||
4061     (md->ctypes[*eptr++] & ctype_word) == 0)
4062 nigel 3 return FALSE;
4063     ecode++;
4064     break;
4065    
4066     /* Match a back reference, possibly repeatedly. Look past the end of the
4067     item to see if there is repeat information following. The code is similar
4068     to that for character classes, but repeated for efficiency. Then obey
4069     similar code to character type repeats - written out again for speed.
4070     However, if the referenced string is the empty string, always treat
4071     it as matched, any number of times (otherwise there could be infinite
4072     loops). */
4073    
4074     case OP_REF:
4075     {
4076     int length;
4077 nigel 53 int offset = (ecode[1] << 9) | (ecode[2] << 1); /* Doubled ref number */
4078     ecode += 3; /* Advance past item */
4079 nigel 3
4080 nigel 23 /* If the reference is unset, set the length to be longer than the amount
4081     of subject left; this ensures that every attempt at a match fails. We
4082     can't just fail here, because of the possibility of quantifiers with zero
4083     minima. */
4084 nigel 3
4085 nigel 23 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
4086     md->end_subject - eptr + 1 :
4087     md->offset_vector[offset+1] - md->offset_vector[offset];
4088 nigel 3
4089 nigel 23 /* Set up for repetition, or handle the non-repeated case */
4090    
4091 nigel 3 switch (*ecode)
4092     {
4093     case OP_CRSTAR:
4094     case OP_CRMINSTAR:
4095     case OP_CRPLUS:
4096     case OP_CRMINPLUS:
4097     case OP_CRQUERY:
4098     case OP_CRMINQUERY:
4099     c = *ecode++ - OP_CRSTAR;
4100     minimize = (c & 1) != 0;
4101     min = rep_min[c]; /* Pick up values from tables; */
4102     max = rep_max[c]; /* zero for max => infinity */
4103     if (max == 0) max = INT_MAX;
4104     break;
4105    
4106     case OP_CRRANGE:
4107     case OP_CRMINRANGE:
4108     minimize = (*ecode == OP_CRMINRANGE);
4109     min = (ecode[1] << 8) + ecode[2];
4110     max = (ecode[3] << 8) + ecode[4];
4111     if (max == 0) max = INT_MAX;
4112     ecode += 5;
4113     break;
4114    
4115     default: /* No repeat follows */
4116 nigel 23 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
4117 nigel 3 eptr += length;
4118     continue; /* With the main loop */
4119     }
4120    
4121     /* If the length of the reference is zero, just continue with the
4122     main loop. */
4123    
4124     if (length == 0) continue;
4125    
4126     /* First, ensure the minimum number of matches are present. We get back
4127     the length of the reference string explicitly rather than passing the
4128     address of eptr, so that eptr can be a register variable. */
4129    
4130     for (i = 1; i <= min; i++)
4131     {
4132 nigel 23 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
4133 nigel 3 eptr += length;
4134     }
4135    
4136     /* If min = max, continue at the same level without recursion.
4137     They are not both allowed to be zero. */
4138    
4139     if (min == max) continue;
4140    
4141     /* If minimizing, keep trying and advancing the pointer */
4142    
4143     if (minimize)
4144     {
4145     for (i = min;; i++)
4146     {
4147 nigel 47 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4148 nigel 23 return TRUE;
4149     if (i >= max || !match_ref(offset, eptr, length, md, ims))
4150 nigel 3 return FALSE;
4151     eptr += length;
4152     }
4153     /* Control never gets here */