/[pcre]/code/trunk/pcre.c
ViewVC logotype

Contents of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 69 - (hide annotations) (download)
Sat Feb 24 21:40:18 2007 UTC (7 years, 6 months ago) by nigel
File MIME type: text/plain
File size: 231769 byte(s)
Load pcre-4.3 into code/trunk.

1 nigel 3 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /*
6     This is a library of functions to support regular expressions whose syntax
7     and semantics are as close as possible to those of the Perl 5 language. See
8     the file Tech.Notes for some information on the internals.
9    
10     Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12 nigel 63 Copyright (c) 1997-2003 University of Cambridge
13 nigel 3
14     -----------------------------------------------------------------------------
15     Permission is granted to anyone to use this software for any purpose on any
16     computer system, and to redistribute it freely, subject to the following
17     restrictions:
18    
19     1. This software is distributed in the hope that it will be useful,
20     but WITHOUT ANY WARRANTY; without even the implied warranty of
21     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22    
23     2. The origin of this software must not be misrepresented, either by
24     explicit claim or by omission.
25    
26     3. Altered versions must be plainly marked as such, and must not be
27     misrepresented as being the original software.
28 nigel 29
29     4. If PCRE is embedded in any software that is released under the GNU
30     General Purpose Licence (GPL), then the terms of that licence shall
31     supersede any condition above with which it is incompatible.
32 nigel 3 -----------------------------------------------------------------------------
33     */
34    
35     /* Define DEBUG to get debugging output on stdout. */
36    
37     /* #define DEBUG */
38    
39 nigel 23 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
40     inline, and there are *still* stupid compilers about that don't like indented
41     pre-processor statements. I suppose it's only been 10 years... */
42 nigel 3
43 nigel 9 #ifdef DEBUG
44     #define DPRINTF(p) printf p
45     #else
46     #define DPRINTF(p) /*nothing*/
47     #endif
48    
49 nigel 3 /* Include the internals header, which itself includes Standard C headers plus
50     the external pcre header. */
51    
52     #include "internal.h"
53    
54    
55 nigel 15 /* Allow compilation as C++ source code, should anybody want to do that. */
56    
57     #ifdef __cplusplus
58     #define class pcre_class
59     #endif
60    
61    
62 nigel 53 /* Maximum number of items on the nested bracket stacks at compile time. This
63     applies to the nesting of all kinds of parentheses. It does not limit
64     un-nested, non-capturing parentheses. This number can be made bigger if
65     necessary - it is used to dimension one int and one unsigned char vector at
66     compile time. */
67 nigel 23
68     #define BRASTACK_SIZE 200
69    
70    
71 nigel 63 /* Maximum number of ints of offset to save on the stack for recursive calls.
72     If the offset vector is bigger, malloc is used. This should be a multiple of 3,
73     because the offset vector is always a multiple of 3 long. */
74    
75     #define REC_STACK_SAVE_MAX 30
76    
77    
78 nigel 49 /* The number of bytes in a literal character string above which we can't add
79 nigel 63 any more is set at 250 in order to allow for UTF-8 characters. (In theory it
80     could be 255 when UTF-8 support is excluded, but that means that some of the
81     test output would be different, which just complicates things.) */
82 nigel 49
83     #define MAXLIT 250
84    
85    
86 nigel 65 /* The maximum remaining length of subject we are prepared to search for a
87     req_byte match. */
88    
89     #define REQ_BYTE_MAX 1000
90    
91    
92 nigel 63 /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
93     the definition is next to the definition of the opcodes in internal.h. */
94    
95     static uschar OP_lengths[] = { OP_LENGTHS };
96    
97 nigel 3 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
98    
99 nigel 15 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
100     static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
101 nigel 3
102     /* Table for handling escaped characters in the range '0'-'z'. Positive returns
103     are simple data values; negative values are for special things like \d and so
104     on. Zero means further processing is needed (for things like \x), or the escape
105     is invalid. */
106    
107 nigel 15 static const short int escapes[] = {
108 nigel 3 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
109     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
110 nigel 63 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
111 nigel 3 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
112 nigel 63 0, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
113 nigel 3 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
114 nigel 63 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
115     0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
116 nigel 69 0, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
117 nigel 23 0, 0, -ESC_z /* x - z */
118 nigel 3 };
119    
120 nigel 43 /* Tables of names of POSIX character classes and their lengths. The list is
121     terminated by a zero length entry. The first three must be alpha, lower, upper,
122     as this is assumed for handling case independence. */
123    
124     static const char *posix_names[] = {
125     "alpha", "lower", "upper",
126 nigel 63 "alnum", "ascii", "blank", "cntrl", "digit", "graph",
127 nigel 43 "print", "punct", "space", "word", "xdigit" };
128    
129     static const uschar posix_name_lengths[] = {
130 nigel 63 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
131 nigel 43
132     /* Table of class bit maps for each POSIX class; up to three may be combined
133 nigel 63 to form the class. The table for [:blank:] is dynamically modified to remove
134     the vertical space characters. */
135 nigel 43
136     static const int posix_class_maps[] = {
137     cbit_lower, cbit_upper, -1, /* alpha */
138     cbit_lower, -1, -1, /* lower */
139     cbit_upper, -1, -1, /* upper */
140     cbit_digit, cbit_lower, cbit_upper, /* alnum */
141     cbit_print, cbit_cntrl, -1, /* ascii */
142 nigel 63 cbit_space, -1, -1, /* blank - a GNU extension */
143 nigel 43 cbit_cntrl, -1, -1, /* cntrl */
144     cbit_digit, -1, -1, /* digit */
145     cbit_graph, -1, -1, /* graph */
146     cbit_print, -1, -1, /* print */
147     cbit_punct, -1, -1, /* punct */
148     cbit_space, -1, -1, /* space */
149 nigel 63 cbit_word, -1, -1, /* word - a Perl extension */
150 nigel 43 cbit_xdigit,-1, -1 /* xdigit */
151     };
152    
153 nigel 69 /* Table to identify ASCII digits and hex digits. This is used when compiling
154     patterns. Note that the tables in chartables are dependent on the locale, and
155     may mark arbitrary characters as digits - but the PCRE compiling code expects
156     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
157     a private table here. It costs 256 bytes, but it is a lot faster than doing
158     character value tests (at least in some simple cases I timed), and in some
159     applications one wants PCRE to compile efficiently as well as match
160     efficiently.
161 nigel 43
162 nigel 69 For convenience, we use the same bit definitions as in chartables:
163    
164     0x04 decimal digit
165     0x08 hexadecimal digit
166    
167     Then we can use ctype_digit and ctype_xdigit in the code. */
168    
169     static const unsigned char digitab[] =
170     {
171     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
172     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
173     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
174     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
175     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
176     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
177     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
178     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
179     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
180     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
181     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
182     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
183     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
184     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
185     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
186     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
187     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
188     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
189     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
190     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
191     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
192     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
193     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
194     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
195     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
196     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
197     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
198     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
199     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
200     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
201     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
202     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
203    
204 nigel 3 /* Definition to allow mutual recursion */
205    
206 nigel 13 static BOOL
207 nigel 23 compile_regex(int, int, int *, uschar **, const uschar **, const char **,
208 nigel 63 BOOL, int, int *, int *, branch_chain *, compile_data *);
209 nigel 3
210 nigel 47 /* Structure for building a chain of data that actually lives on the
211     stack, for holding the values of the subject pointer at the start of each
212     subpattern, so as to detect when an empty string has been matched by a
213     subpattern - to break infinite loops. */
214 nigel 3
215 nigel 47 typedef struct eptrblock {
216     struct eptrblock *prev;
217     const uschar *saved_eptr;
218     } eptrblock;
219 nigel 3
220 nigel 47 /* Flag bits for the match() function */
221    
222     #define match_condassert 0x01 /* Called to check a condition assertion */
223     #define match_isgroup 0x02 /* Set if start of bracketed group */
224    
225 nigel 63 /* Non-error returns from the match() function. Error returns are externally
226     defined PCRE_ERROR_xxx codes, which are all negative. */
227 nigel 47
228 nigel 63 #define MATCH_MATCH 1
229     #define MATCH_NOMATCH 0
230 nigel 47
231 nigel 63
232    
233 nigel 3 /*************************************************
234     * Global variables *
235     *************************************************/
236    
237     /* PCRE is thread-clean and doesn't use any global variables in the normal
238     sense. However, it calls memory allocation and free functions via the two
239 nigel 63 indirections below, and it can optionally do callouts. These values can be
240     changed by the caller, but are shared between all threads. However, when
241     compiling for Virtual Pascal, things are done differently (see pcre.in). */
242 nigel 3
243 nigel 63 #ifndef VPCOMPAT
244 nigel 3 void *(*pcre_malloc)(size_t) = malloc;
245     void (*pcre_free)(void *) = free;
246 nigel 63 int (*pcre_callout)(pcre_callout_block *) = NULL;
247     #endif
248 nigel 3
249    
250 nigel 49 /*************************************************
251     * Macros and tables for character handling *
252     *************************************************/
253 nigel 3
254 nigel 49 /* When UTF-8 encoding is being used, a character is no longer just a single
255     byte. The macros for character handling generate simple sequences when used in
256     byte-mode, and more complicated ones for UTF-8 characters. */
257    
258     #ifndef SUPPORT_UTF8
259 nigel 63 #define GETCHAR(c, eptr) c = *eptr;
260 nigel 49 #define GETCHARINC(c, eptr) c = *eptr++;
261 nigel 63 #define GETCHARINCTEST(c, eptr) c = *eptr++;
262 nigel 49 #define GETCHARLEN(c, eptr, len) c = *eptr;
263     #define BACKCHAR(eptr)
264    
265     #else /* SUPPORT_UTF8 */
266    
267 nigel 63 /* Get the next UTF-8 character, not advancing the pointer. This is called when
268     we know we are in UTF-8 mode. */
269 nigel 49
270 nigel 63 #define GETCHAR(c, eptr) \
271     c = *eptr; \
272     if ((c & 0xc0) == 0xc0) \
273     { \
274 nigel 67 int gcii; \
275     int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
276     int gcss = 6*gcaa; \
277     c = (c & utf8_table3[gcaa]) << gcss; \
278     for (gcii = 1; gcii <= gcaa; gcii++) \
279 nigel 63 { \
280 nigel 67 gcss -= 6; \
281     c |= (eptr[gcii] & 0x3f) << gcss; \
282 nigel 63 } \
283     }
284    
285     /* Get the next UTF-8 character, advancing the pointer. This is called when we
286     know we are in UTF-8 mode. */
287    
288 nigel 49 #define GETCHARINC(c, eptr) \
289     c = *eptr++; \
290 nigel 63 if ((c & 0xc0) == 0xc0) \
291     { \
292 nigel 67 int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
293     int gcss = 6*gcaa; \
294     c = (c & utf8_table3[gcaa]) << gcss; \
295     while (gcaa-- > 0) \
296 nigel 63 { \
297 nigel 67 gcss -= 6; \
298     c |= (*eptr++ & 0x3f) << gcss; \
299 nigel 63 } \
300     }
301    
302     /* Get the next character, testing for UTF-8 mode, and advancing the pointer */
303    
304     #define GETCHARINCTEST(c, eptr) \
305     c = *eptr++; \
306 nigel 49 if (md->utf8 && (c & 0xc0) == 0xc0) \
307     { \
308 nigel 67 int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
309     int gcss = 6*gcaa; \
310     c = (c & utf8_table3[gcaa]) << gcss; \
311     while (gcaa-- > 0) \
312 nigel 49 { \
313 nigel 67 gcss -= 6; \
314     c |= (*eptr++ & 0x3f) << gcss; \
315 nigel 49 } \
316     }
317    
318 nigel 63 /* Get the next UTF-8 character, not advancing the pointer, incrementing length
319     if there are extra bytes. This is called when we know we are in UTF-8 mode. */
320 nigel 49
321     #define GETCHARLEN(c, eptr, len) \
322     c = *eptr; \
323 nigel 63 if ((c & 0xc0) == 0xc0) \
324 nigel 49 { \
325 nigel 67 int gcii; \
326     int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
327     int gcss = 6*gcaa; \
328     c = (c & utf8_table3[gcaa]) << gcss; \
329     for (gcii = 1; gcii <= gcaa; gcii++) \
330 nigel 49 { \
331 nigel 67 gcss -= 6; \
332     c |= (eptr[gcii] & 0x3f) << gcss; \
333 nigel 49 } \
334 nigel 67 len += gcaa; \
335 nigel 49 }
336    
337     /* If the pointer is not at the start of a character, move it back until
338 nigel 63 it is. Called only in UTF-8 mode. */
339 nigel 49
340     #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
341    
342     #endif
343    
344    
345    
346 nigel 3 /*************************************************
347 nigel 25 * Default character tables *
348     *************************************************/
349    
350     /* A default set of character tables is included in the PCRE binary. Its source
351     is built by the maketables auxiliary program, which uses the default C ctypes
352     functions, and put in the file chartables.c. These tables are used by PCRE
353     whenever the caller of pcre_compile() does not provide an alternate set of
354     tables. */
355    
356     #include "chartables.c"
357    
358    
359    
360 nigel 49 #ifdef SUPPORT_UTF8
361 nigel 25 /*************************************************
362 nigel 49 * Tables for UTF-8 support *
363     *************************************************/
364    
365     /* These are the breakpoints for different numbers of bytes in a UTF-8
366     character. */
367    
368 nigel 69 static const int utf8_table1[] =
369     { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
370 nigel 49
371     /* These are the indicator bits and the mask for the data bits to set in the
372     first byte of a character, indexed by the number of additional bytes. */
373    
374 nigel 69 static const int utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
375     static const int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
376 nigel 49
377     /* Table of the number of extra characters, indexed by the first character
378     masked with 0x3f. The highest number for a valid UTF-8 character is in fact
379     0x3d. */
380    
381 nigel 69 static const uschar utf8_table4[] = {
382 nigel 49 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
383     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
384     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
385     3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
386    
387    
388     /*************************************************
389     * Convert character value to UTF-8 *
390     *************************************************/
391    
392     /* This function takes an integer value in the range 0 - 0x7fffffff
393     and encodes it as a UTF-8 character in 1 to 6 bytes.
394    
395     Arguments:
396     cvalue the character value
397     buffer pointer to buffer for result - at least 6 bytes long
398    
399     Returns: number of bytes placed in the buffer
400     */
401    
402     static int
403     ord2utf8(int cvalue, uschar *buffer)
404     {
405     register int i, j;
406     for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
407     if (cvalue <= utf8_table1[i]) break;
408 nigel 59 buffer += i;
409     for (j = i; j > 0; j--)
410     {
411     *buffer-- = 0x80 | (cvalue & 0x3f);
412     cvalue >>= 6;
413     }
414     *buffer = utf8_table2[i] | cvalue;
415 nigel 49 return i + 1;
416     }
417     #endif
418    
419    
420    
421     /*************************************************
422 nigel 63 * Print compiled regex *
423     *************************************************/
424    
425     /* The code for doing this is held in a separate file that is also included in
426     pcretest.c. It defines a function called print_internals(). */
427    
428     #ifdef DEBUG
429     #include "printint.c"
430     #endif
431    
432    
433    
434     /*************************************************
435 nigel 3 * Return version string *
436     *************************************************/
437    
438 nigel 39 #define STRING(a) # a
439     #define XSTRING(s) STRING(s)
440    
441 nigel 7 const char *
442 nigel 3 pcre_version(void)
443     {
444 nigel 39 return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
445 nigel 3 }
446    
447    
448    
449    
450     /*************************************************
451 nigel 43 * (Obsolete) Return info about compiled pattern *
452 nigel 3 *************************************************/
453    
454 nigel 43 /* This is the original "info" function. It picks potentially useful data out
455     of the private structure, but its interface was too rigid. It remains for
456     backwards compatibility. The public options are passed back in an int - though
457     the re->options field has been expanded to a long int, all the public options
458 nigel 37 are at the low end of it, and so even on 16-bit systems this will still be OK.
459     Therefore, I haven't changed the API for pcre_info().
460 nigel 3
461     Arguments:
462     external_re points to compiled code
463     optptr where to pass back the options
464 nigel 63 first_byte where to pass back the first character,
465 nigel 3 or -1 if multiline and all branches start ^,
466     or -2 otherwise
467    
468 nigel 43 Returns: number of capturing subpatterns
469 nigel 3 or negative values on error
470     */
471    
472     int
473 nigel 63 pcre_info(const pcre *external_re, int *optptr, int *first_byte)
474 nigel 3 {
475 nigel 7 const real_pcre *re = (const real_pcre *)external_re;
476 nigel 3 if (re == NULL) return PCRE_ERROR_NULL;
477     if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
478 nigel 37 if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
479 nigel 63 if (first_byte != NULL)
480     *first_byte = ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
481 nigel 3 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
482     return re->top_bracket;
483     }
484    
485    
486    
487 nigel 43 /*************************************************
488     * Return info about compiled pattern *
489     *************************************************/
490 nigel 3
491 nigel 43 /* This is a newer "info" function which has an extensible interface so
492     that additional items can be added compatibly.
493    
494     Arguments:
495     external_re points to compiled code
496 nigel 63 extra_data points to extra data, or NULL
497 nigel 43 what what information is required
498     where where to put the information
499    
500     Returns: 0 if data returned, negative on error
501     */
502    
503     int
504 nigel 63 pcre_fullinfo(const pcre *external_re, const pcre_extra *extra_data, int what,
505 nigel 43 void *where)
506     {
507     const real_pcre *re = (const real_pcre *)external_re;
508 nigel 63 const pcre_study_data *study = NULL;
509 nigel 43
510     if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
511     if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
512    
513 nigel 63 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0)
514     study = extra_data->study_data;
515    
516 nigel 43 switch (what)
517     {
518     case PCRE_INFO_OPTIONS:
519     *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
520     break;
521    
522     case PCRE_INFO_SIZE:
523     *((size_t *)where) = re->size;
524     break;
525    
526 nigel 63 case PCRE_INFO_STUDYSIZE:
527     *((size_t *)where) = (study == NULL)? 0 : study->size;
528     break;
529    
530 nigel 43 case PCRE_INFO_CAPTURECOUNT:
531     *((int *)where) = re->top_bracket;
532     break;
533    
534     case PCRE_INFO_BACKREFMAX:
535     *((int *)where) = re->top_backref;
536     break;
537    
538 nigel 63 case PCRE_INFO_FIRSTBYTE:
539 nigel 43 *((int *)where) =
540 nigel 63 ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
541 nigel 43 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
542     break;
543    
544     case PCRE_INFO_FIRSTTABLE:
545     *((const uschar **)where) =
546     (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
547     study->start_bits : NULL;
548     break;
549    
550     case PCRE_INFO_LASTLITERAL:
551     *((int *)where) =
552 nigel 63 ((re->options & PCRE_REQCHSET) != 0)? re->req_byte : -1;
553 nigel 43 break;
554    
555 nigel 63 case PCRE_INFO_NAMEENTRYSIZE:
556     *((int *)where) = re->name_entry_size;
557     break;
558    
559     case PCRE_INFO_NAMECOUNT:
560     *((int *)where) = re->name_count;
561     break;
562    
563     case PCRE_INFO_NAMETABLE:
564     *((const uschar **)where) = (const uschar *)re + sizeof(real_pcre);
565     break;
566    
567 nigel 43 default: return PCRE_ERROR_BADOPTION;
568     }
569    
570     return 0;
571     }
572    
573    
574    
575 nigel 63 /*************************************************
576     * Return info about what features are configured *
577     *************************************************/
578    
579     /* This is a function which has an extensible interface so that additional items
580     can be added compatibly.
581    
582     Arguments:
583     what what information is required
584     where where to put the information
585    
586     Returns: 0 if data returned, negative on error
587     */
588    
589     int
590     pcre_config(int what, void *where)
591     {
592     switch (what)
593     {
594     case PCRE_CONFIG_UTF8:
595     #ifdef SUPPORT_UTF8
596     *((int *)where) = 1;
597     #else
598     *((int *)where) = 0;
599     #endif
600     break;
601    
602     case PCRE_CONFIG_NEWLINE:
603     *((int *)where) = NEWLINE;
604     break;
605    
606     case PCRE_CONFIG_LINK_SIZE:
607     *((int *)where) = LINK_SIZE;
608     break;
609    
610     case PCRE_CONFIG_POSIX_MALLOC_THRESHOLD:
611     *((int *)where) = POSIX_MALLOC_THRESHOLD;
612     break;
613    
614     case PCRE_CONFIG_MATCH_LIMIT:
615     *((unsigned int *)where) = MATCH_LIMIT;
616     break;
617    
618     default: return PCRE_ERROR_BADOPTION;
619     }
620    
621     return 0;
622     }
623    
624    
625    
626 nigel 3 #ifdef DEBUG
627     /*************************************************
628     * Debugging function to print chars *
629     *************************************************/
630    
631     /* Print a sequence of chars in printable format, stopping at the end of the
632     subject if requested.
633    
634     Arguments:
635     p points to characters
636     length number to print
637     is_subject TRUE if printing from within md->start_subject
638     md pointer to matching data block, if is_subject is TRUE
639    
640     Returns: nothing
641     */
642    
643 nigel 9 static void
644     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
645 nigel 3 {
646     int c;
647     if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
648     while (length-- > 0)
649     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
650     }
651     #endif
652    
653    
654    
655    
656     /*************************************************
657     * Handle escapes *
658     *************************************************/
659    
660     /* This function is called when a \ has been encountered. It either returns a
661     positive value for a simple escape such as \n, or a negative value which
662 nigel 49 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
663     a positive value greater than 255 may be returned. On entry, ptr is pointing at
664     the \. On exit, it is on the final character of the escape sequence.
665 nigel 3
666     Arguments:
667     ptrptr points to the pattern position pointer
668     errorptr points to the pointer to the error message
669     bracount number of previous extracting brackets
670     options the options bits
671     isclass TRUE if inside a character class
672 nigel 25 cd pointer to char tables block
673 nigel 3
674     Returns: zero or positive => a data character
675     negative => a special escape sequence
676     on error, errorptr is set
677     */
678    
679     static int
680 nigel 7 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
681 nigel 25 int options, BOOL isclass, compile_data *cd)
682 nigel 3 {
683 nigel 7 const uschar *ptr = *ptrptr;
684 nigel 43 int c, i;
685 nigel 3
686 nigel 49 /* If backslash is at the end of the pattern, it's an error. */
687    
688     c = *(++ptr);
689 nigel 3 if (c == 0) *errorptr = ERR1;
690    
691     /* Digits or letters may have special meaning; all others are literals. */
692    
693     else if (c < '0' || c > 'z') {}
694    
695     /* Do an initial lookup in a table. A non-zero result is something that can be
696     returned immediately. Otherwise further processing may be required. */
697    
698     else if ((i = escapes[c - '0']) != 0) c = i;
699    
700     /* Escapes that need further processing, or are illegal. */
701    
702     else
703     {
704 nigel 7 const uschar *oldptr;
705 nigel 3 switch (c)
706     {
707 nigel 63 /* A number of Perl escapes are not handled by PCRE. We give an explicit
708     error. */
709    
710     case 'l':
711     case 'L':
712     case 'N':
713     case 'p':
714     case 'P':
715     case 'u':
716     case 'U':
717     case 'X':
718     *errorptr = ERR37;
719     break;
720    
721 nigel 3 /* The handling of escape sequences consisting of a string of digits
722     starting with one that is not zero is not straightforward. By experiment,
723     the way Perl works seems to be as follows:
724    
725     Outside a character class, the digits are read as a decimal number. If the
726     number is less than 10, or if there are that many previous extracting
727     left brackets, then it is a back reference. Otherwise, up to three octal
728     digits are read to form an escaped byte. Thus \123 is likely to be octal
729     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
730     value is greater than 377, the least significant 8 bits are taken. Inside a
731     character class, \ followed by a digit is always an octal number. */
732    
733     case '1': case '2': case '3': case '4': case '5':
734     case '6': case '7': case '8': case '9':
735    
736     if (!isclass)
737     {
738     oldptr = ptr;
739     c -= '0';
740 nigel 69 while ((digitab[ptr[1]] & ctype_digit) != 0)
741 nigel 3 c = c * 10 + *(++ptr) - '0';
742     if (c < 10 || c <= bracount)
743     {
744     c = -(ESC_REF + c);
745     break;
746     }
747     ptr = oldptr; /* Put the pointer back and fall through */
748     }
749    
750     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
751     generates a binary zero byte and treats the digit as a following literal.
752     Thus we have to pull back the pointer by one. */
753    
754     if ((c = *ptr) >= '8')
755     {
756     ptr--;
757     c = 0;
758     break;
759     }
760    
761     /* \0 always starts an octal number, but we may drop through to here with a
762 nigel 49 larger first octal digit. */
763 nigel 3
764     case '0':
765     c -= '0';
766 nigel 69 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
767 nigel 3 c = c * 8 + *(++ptr) - '0';
768 nigel 49 c &= 255; /* Take least significant 8 bits */
769 nigel 3 break;
770    
771 nigel 49 /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
772     which can be greater than 0xff, but only if the ddd are hex digits. */
773 nigel 3
774     case 'x':
775 nigel 49 #ifdef SUPPORT_UTF8
776     if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
777     {
778     const uschar *pt = ptr + 2;
779     register int count = 0;
780     c = 0;
781 nigel 69 while ((digitab[*pt] & ctype_xdigit) != 0)
782 nigel 49 {
783 nigel 69 int cc = *pt++;
784     if (cc >= 'a') cc -= 32; /* Convert to upper case */
785 nigel 49 count++;
786 nigel 69 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
787 nigel 49 }
788     if (*pt == '}')
789     {
790     if (c < 0 || count > 8) *errorptr = ERR34;
791     ptr = pt;
792     break;
793     }
794     /* If the sequence of hex digits does not end with '}', then we don't
795     recognize this construct; fall through to the normal \x handling. */
796     }
797     #endif
798    
799     /* Read just a single hex char */
800    
801 nigel 3 c = 0;
802 nigel 69 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
803 nigel 3 {
804 nigel 69 int cc = *(++ptr);
805     if (cc >= 'a') cc -= 32; /* Convert to upper case */
806     c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
807 nigel 3 }
808     break;
809    
810 nigel 49 /* Other special escapes not starting with a digit are straightforward */
811    
812 nigel 3 case 'c':
813     c = *(++ptr);
814     if (c == 0)
815     {
816     *errorptr = ERR2;
817     return 0;
818     }
819    
820 nigel 69 /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
821     is ASCII-specific, but then the whole concept of \cx is ASCII-specific. */
822 nigel 3
823 nigel 69 if (c >= 'a' && c <= 'z') c -= 32;
824 nigel 3 c ^= 0x40;
825     break;
826    
827     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
828     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
829 nigel 25 for Perl compatibility, it is a literal. This code looks a bit odd, but
830     there used to be some cases other than the default, and there may be again
831     in future, so I haven't "optimized" it. */
832 nigel 3
833     default:
834     if ((options & PCRE_EXTRA) != 0) switch(c)
835     {
836     default:
837     *errorptr = ERR3;
838     break;
839     }
840     break;
841     }
842     }
843    
844     *ptrptr = ptr;
845     return c;
846     }
847    
848    
849    
850     /*************************************************
851     * Check for counted repeat *
852     *************************************************/
853    
854     /* This function is called when a '{' is encountered in a place where it might
855     start a quantifier. It looks ahead to see if it really is a quantifier or not.
856     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
857     where the ddds are digits.
858    
859     Arguments:
860     p pointer to the first char after '{'
861 nigel 25 cd pointer to char tables block
862 nigel 3
863     Returns: TRUE or FALSE
864     */
865    
866     static BOOL
867 nigel 25 is_counted_repeat(const uschar *p, compile_data *cd)
868 nigel 3 {
869 nigel 69 if ((digitab[*p++] && ctype_digit) == 0) return FALSE;
870     while ((digitab[*p] & ctype_digit) != 0) p++;
871 nigel 3 if (*p == '}') return TRUE;
872    
873     if (*p++ != ',') return FALSE;
874     if (*p == '}') return TRUE;
875    
876 nigel 69 if ((digitab[*p++] && ctype_digit) == 0) return FALSE;
877     while ((digitab[*p] & ctype_digit) != 0) p++;
878    
879 nigel 3 return (*p == '}');
880     }
881    
882    
883    
884     /*************************************************
885     * Read repeat counts *
886     *************************************************/
887    
888     /* Read an item of the form {n,m} and return the values. This is called only
889     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
890     so the syntax is guaranteed to be correct, but we need to check the values.
891    
892     Arguments:
893     p pointer to first char after '{'
894     minp pointer to int for min
895     maxp pointer to int for max
896     returned as -1 if no max
897     errorptr points to pointer to error message
898 nigel 25 cd pointer to character tables clock
899 nigel 3
900     Returns: pointer to '}' on success;
901     current ptr on error, with errorptr set
902     */
903    
904 nigel 7 static const uschar *
905 nigel 25 read_repeat_counts(const uschar *p, int *minp, int *maxp,
906     const char **errorptr, compile_data *cd)
907 nigel 3 {
908     int min = 0;
909     int max = -1;
910    
911 nigel 69 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
912 nigel 3
913     if (*p == '}') max = min; else
914     {
915     if (*(++p) != '}')
916     {
917     max = 0;
918 nigel 69 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
919 nigel 3 if (max < min)
920     {
921     *errorptr = ERR4;
922     return p;
923     }
924     }
925     }
926    
927     /* Do paranoid checks, then fill in the required variables, and pass back the
928     pointer to the terminating '}'. */
929    
930     if (min > 65535 || max > 65535)
931     *errorptr = ERR5;
932     else
933     {
934     *minp = min;
935     *maxp = max;
936     }
937     return p;
938     }
939    
940    
941    
942     /*************************************************
943 nigel 63 * Find first significant op code *
944     *************************************************/
945    
946     /* This is called by several functions that scan a compiled expression looking
947     for a fixed first character, or an anchoring op code etc. It skips over things
948     that do not influence this. For some calls, a change of option is important.
949    
950     Arguments:
951     code pointer to the start of the group
952     options pointer to external options
953     optbit the option bit whose changing is significant, or
954     zero if none are
955    
956     Returns: pointer to the first significant opcode
957     */
958    
959     static const uschar*
960     first_significant_code(const uschar *code, int *options, int optbit)
961     {
962     for (;;)
963     {
964     switch ((int)*code)
965     {
966     case OP_OPT:
967     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
968     *options = (int)code[1];
969     code += 2;
970     break;
971    
972     case OP_ASSERT_NOT:
973     case OP_ASSERTBACK:
974     case OP_ASSERTBACK_NOT:
975     do code += GET(code, 1); while (*code == OP_ALT);
976     /* Fall through */
977    
978     case OP_CALLOUT:
979     case OP_CREF:
980     case OP_BRANUMBER:
981     case OP_WORD_BOUNDARY:
982     case OP_NOT_WORD_BOUNDARY:
983     code += OP_lengths[*code];
984     break;
985    
986     default:
987     return code;
988     }
989     }
990     /* Control never reaches here */
991     }
992    
993    
994    
995    
996     /*************************************************
997 nigel 23 * Find the fixed length of a pattern *
998     *************************************************/
999    
1000     /* Scan a pattern and compute the fixed length of subject that will match it,
1001     if the length is fixed. This is needed for dealing with backward assertions.
1002 nigel 63 In UTF8 mode, the result is in characters rather than bytes.
1003 nigel 23
1004     Arguments:
1005     code points to the start of the pattern (the bracket)
1006 nigel 49 options the compiling options
1007 nigel 23
1008 nigel 63 Returns: the fixed length, or -1 if there is no fixed length,
1009     or -2 if \C was encountered
1010 nigel 23 */
1011    
1012     static int
1013 nigel 49 find_fixedlength(uschar *code, int options)
1014 nigel 23 {
1015     int length = -1;
1016    
1017     register int branchlength = 0;
1018 nigel 63 register uschar *cc = code + 1 + LINK_SIZE;
1019 nigel 23
1020     /* Scan along the opcodes for this branch. If we get to the end of the
1021     branch, check the length against that of the other branches. */
1022    
1023     for (;;)
1024     {
1025     int d;
1026     register int op = *cc;
1027     if (op >= OP_BRA) op = OP_BRA;
1028    
1029     switch (op)
1030     {
1031     case OP_BRA:
1032     case OP_ONCE:
1033     case OP_COND:
1034 nigel 49 d = find_fixedlength(cc, options);
1035 nigel 63 if (d < 0) return d;
1036 nigel 23 branchlength += d;
1037 nigel 63 do cc += GET(cc, 1); while (*cc == OP_ALT);
1038     cc += 1 + LINK_SIZE;
1039 nigel 23 break;
1040    
1041     /* Reached end of a branch; if it's a ket it is the end of a nested
1042     call. If it's ALT it is an alternation in a nested call. If it is
1043     END it's the end of the outer call. All can be handled by the same code. */
1044    
1045     case OP_ALT:
1046     case OP_KET:
1047     case OP_KETRMAX:
1048     case OP_KETRMIN:
1049     case OP_END:
1050     if (length < 0) length = branchlength;
1051     else if (length != branchlength) return -1;
1052     if (*cc != OP_ALT) return length;
1053 nigel 63 cc += 1 + LINK_SIZE;
1054 nigel 23 branchlength = 0;
1055     break;
1056    
1057     /* Skip over assertive subpatterns */
1058    
1059     case OP_ASSERT:
1060     case OP_ASSERT_NOT:
1061     case OP_ASSERTBACK:
1062     case OP_ASSERTBACK_NOT:
1063 nigel 63 do cc += GET(cc, 1); while (*cc == OP_ALT);
1064     /* Fall through */
1065 nigel 23
1066     /* Skip over things that don't match chars */
1067    
1068     case OP_REVERSE:
1069 nigel 53 case OP_BRANUMBER:
1070     case OP_CREF:
1071 nigel 23 case OP_OPT:
1072 nigel 63 case OP_CALLOUT:
1073 nigel 23 case OP_SOD:
1074 nigel 63 case OP_SOM:
1075 nigel 23 case OP_EOD:
1076     case OP_EODN:
1077     case OP_CIRC:
1078     case OP_DOLL:
1079     case OP_NOT_WORD_BOUNDARY:
1080     case OP_WORD_BOUNDARY:
1081 nigel 63 cc += OP_lengths[*cc];
1082 nigel 23 break;
1083    
1084 nigel 49 /* Handle char strings. In UTF-8 mode we must count characters, not bytes.
1085     This requires a scan of the string, unfortunately. We assume valid UTF-8
1086 nigel 63 strings, so all we do is reduce the length by one for every byte whose bits
1087     are 10xxxxxx. */
1088 nigel 23
1089     case OP_CHARS:
1090     branchlength += *(++cc);
1091 nigel 49 #ifdef SUPPORT_UTF8
1092 nigel 63 if ((options & PCRE_UTF8) != 0)
1093     for (d = 1; d <= *cc; d++)
1094     if ((cc[d] & 0xc0) == 0x80) branchlength--;
1095 nigel 49 #endif
1096 nigel 23 cc += *cc + 1;
1097     break;
1098    
1099 nigel 63 /* Handle exact repetitions. The count is already in characters, but we
1100     need to skip over a multibyte character in UTF8 mode. */
1101 nigel 23
1102     case OP_EXACT:
1103 nigel 63 branchlength += GET2(cc,1);
1104     cc += 4;
1105     #ifdef SUPPORT_UTF8
1106     if ((options & PCRE_UTF8) != 0)
1107     {
1108     while((*cc & 0x80) == 0x80) cc++;
1109     }
1110     #endif
1111     break;
1112    
1113 nigel 23 case OP_TYPEEXACT:
1114 nigel 63 branchlength += GET2(cc,1);
1115 nigel 23 cc += 4;
1116     break;
1117    
1118     /* Handle single-char matchers */
1119    
1120     case OP_NOT_DIGIT:
1121     case OP_DIGIT:
1122     case OP_NOT_WHITESPACE:
1123     case OP_WHITESPACE:
1124     case OP_NOT_WORDCHAR:
1125     case OP_WORDCHAR:
1126     case OP_ANY:
1127     branchlength++;
1128     cc++;
1129     break;
1130    
1131 nigel 63 /* The single-byte matcher isn't allowed */
1132 nigel 23
1133 nigel 63 case OP_ANYBYTE:
1134     return -2;
1135    
1136 nigel 23 /* Check a class for variable quantification */
1137    
1138 nigel 63 #ifdef SUPPORT_UTF8
1139     case OP_XCLASS:
1140     cc += GET(cc, 1) - 33;
1141     /* Fall through */
1142     #endif
1143    
1144 nigel 23 case OP_CLASS:
1145 nigel 63 case OP_NCLASS:
1146 nigel 53 cc += 33;
1147 nigel 23
1148     switch (*cc)
1149     {
1150     case OP_CRSTAR:
1151     case OP_CRMINSTAR:
1152     case OP_CRQUERY:
1153     case OP_CRMINQUERY:
1154     return -1;
1155    
1156     case OP_CRRANGE:
1157     case OP_CRMINRANGE:
1158 nigel 63 if (GET2(cc,1) != GET2(cc,3)) return -1;
1159     branchlength += GET2(cc,1);
1160 nigel 23 cc += 5;
1161     break;
1162    
1163     default:
1164     branchlength++;
1165     }
1166     break;
1167    
1168     /* Anything else is variable length */
1169    
1170     default:
1171     return -1;
1172     }
1173     }
1174     /* Control never gets here */
1175     }
1176    
1177    
1178    
1179    
1180     /*************************************************
1181 nigel 63 * Scan compiled regex for numbered bracket *
1182     *************************************************/
1183    
1184     /* This little function scans through a compiled pattern until it finds a
1185     capturing bracket with the given number.
1186    
1187     Arguments:
1188     code points to start of expression
1189     utf8 TRUE in UTF-8 mode
1190     number the required bracket number
1191    
1192     Returns: pointer to the opcode for the bracket, or NULL if not found
1193     */
1194    
1195     static const uschar *
1196     find_bracket(const uschar *code, BOOL utf8, int number)
1197     {
1198 nigel 65 #ifndef SUPPORT_UTF8
1199     utf8 = utf8; /* Stop pedantic compilers complaining */
1200     #endif
1201    
1202 nigel 63 for (;;)
1203     {
1204     register int c = *code;
1205     if (c == OP_END) return NULL;
1206     else if (c == OP_CHARS) code += code[1] + OP_lengths[c];
1207     else if (c > OP_BRA)
1208     {
1209     int n = c - OP_BRA;
1210     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1211     if (n == number) return (uschar *)code;
1212     code += OP_lengths[OP_BRA];
1213     }
1214     else
1215     {
1216     code += OP_lengths[c];
1217    
1218     /* In UTF-8 mode, opcodes that are followed by a character may be followed
1219     by a multi-byte character. The length in the table is a minimum, so we have
1220     to scan along to skip the extra characters. All opcodes are less than 128,
1221     so we can use relatively efficient code. */
1222    
1223     #ifdef SUPPORT_UTF8
1224     if (utf8) switch(c)
1225     {
1226     case OP_EXACT:
1227     case OP_UPTO:
1228     case OP_MINUPTO:
1229     case OP_STAR:
1230     case OP_MINSTAR:
1231     case OP_PLUS:
1232     case OP_MINPLUS:
1233     case OP_QUERY:
1234     case OP_MINQUERY:
1235     while ((*code & 0xc0) == 0x80) code++;
1236     break;
1237     }
1238     #endif
1239     }
1240     }
1241     }
1242    
1243    
1244    
1245     /*************************************************
1246     * Scan compiled branch for non-emptiness *
1247     *************************************************/
1248    
1249     /* This function scans through a branch of a compiled pattern to see whether it
1250     can match the empty string or not. It is called only from could_be_empty()
1251     below. Note that first_significant_code() skips over assertions. If we hit an
1252     unclosed bracket, we return "empty" - this means we've struck an inner bracket
1253     whose current branch will already have been scanned.
1254    
1255     Arguments:
1256     code points to start of search
1257     endcode points to where to stop
1258     utf8 TRUE if in UTF8 mode
1259    
1260     Returns: TRUE if what is matched could be empty
1261     */
1262    
1263     static BOOL
1264     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1265     {
1266     register int c;
1267     for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0);
1268     code < endcode;
1269     code = first_significant_code(code + OP_lengths[c], NULL, 0))
1270     {
1271     const uschar *ccode;
1272    
1273     c = *code;
1274    
1275     if (c >= OP_BRA)
1276     {
1277     BOOL empty_branch;
1278     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1279    
1280     /* Scan a closed bracket */
1281    
1282     empty_branch = FALSE;
1283     do
1284     {
1285     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1286     empty_branch = TRUE;
1287     code += GET(code, 1);
1288     }
1289     while (*code == OP_ALT);
1290     if (!empty_branch) return FALSE; /* All branches are non-empty */
1291     code += 1 + LINK_SIZE;
1292     c = *code;
1293     }
1294    
1295     else switch (c)
1296     {
1297     /* Check for quantifiers after a class */
1298    
1299     #ifdef SUPPORT_UTF8
1300     case OP_XCLASS:
1301     ccode = code + GET(code, 1);
1302     goto CHECK_CLASS_REPEAT;
1303     #endif
1304    
1305     case OP_CLASS:
1306     case OP_NCLASS:
1307     ccode = code + 33;
1308    
1309     #ifdef SUPPORT_UTF8
1310     CHECK_CLASS_REPEAT:
1311     #endif
1312    
1313     switch (*ccode)
1314     {
1315     case OP_CRSTAR: /* These could be empty; continue */
1316     case OP_CRMINSTAR:
1317     case OP_CRQUERY:
1318     case OP_CRMINQUERY:
1319     break;
1320    
1321     default: /* Non-repeat => class must match */
1322     case OP_CRPLUS: /* These repeats aren't empty */
1323     case OP_CRMINPLUS:
1324     return FALSE;
1325    
1326     case OP_CRRANGE:
1327     case OP_CRMINRANGE:
1328     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1329     break;
1330     }
1331     break;
1332    
1333     /* Opcodes that must match a character */
1334    
1335     case OP_NOT_DIGIT:
1336     case OP_DIGIT:
1337     case OP_NOT_WHITESPACE:
1338     case OP_WHITESPACE:
1339     case OP_NOT_WORDCHAR:
1340     case OP_WORDCHAR:
1341     case OP_ANY:
1342     case OP_ANYBYTE:
1343     case OP_CHARS:
1344     case OP_NOT:
1345     case OP_PLUS:
1346     case OP_MINPLUS:
1347     case OP_EXACT:
1348     case OP_NOTPLUS:
1349     case OP_NOTMINPLUS:
1350     case OP_NOTEXACT:
1351     case OP_TYPEPLUS:
1352     case OP_TYPEMINPLUS:
1353     case OP_TYPEEXACT:
1354     return FALSE;
1355    
1356     /* End of branch */
1357    
1358     case OP_KET:
1359     case OP_KETRMAX:
1360     case OP_KETRMIN:
1361     case OP_ALT:
1362     return TRUE;
1363    
1364     /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be
1365     followed by a multibyte character */
1366    
1367     #ifdef SUPPORT_UTF8
1368     case OP_STAR:
1369     case OP_MINSTAR:
1370     case OP_QUERY:
1371     case OP_MINQUERY:
1372     case OP_UPTO:
1373     case OP_MINUPTO:
1374     if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1375     break;
1376     #endif
1377     }
1378     }
1379    
1380     return TRUE;
1381     }
1382    
1383    
1384    
1385     /*************************************************
1386     * Scan compiled regex for non-emptiness *
1387     *************************************************/
1388    
1389     /* This function is called to check for left recursive calls. We want to check
1390     the current branch of the current pattern to see if it could match the empty
1391     string. If it could, we must look outwards for branches at other levels,
1392     stopping when we pass beyond the bracket which is the subject of the recursion.
1393    
1394     Arguments:
1395     code points to start of the recursion
1396     endcode points to where to stop (current RECURSE item)
1397     bcptr points to the chain of current (unclosed) branch starts
1398     utf8 TRUE if in UTF-8 mode
1399    
1400     Returns: TRUE if what is matched could be empty
1401     */
1402    
1403     static BOOL
1404     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1405     BOOL utf8)
1406     {
1407     while (bcptr != NULL && bcptr->current >= code)
1408     {
1409     if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1410     bcptr = bcptr->outer;
1411     }
1412     return TRUE;
1413     }
1414    
1415    
1416    
1417     /*************************************************
1418 nigel 43 * Check for POSIX class syntax *
1419     *************************************************/
1420    
1421     /* This function is called when the sequence "[:" or "[." or "[=" is
1422     encountered in a character class. It checks whether this is followed by an
1423     optional ^ and then a sequence of letters, terminated by a matching ":]" or
1424     ".]" or "=]".
1425    
1426     Argument:
1427     ptr pointer to the initial [
1428     endptr where to return the end pointer
1429     cd pointer to compile data
1430    
1431     Returns: TRUE or FALSE
1432     */
1433    
1434     static BOOL
1435     check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1436     {
1437     int terminator; /* Don't combine these lines; the Solaris cc */
1438     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1439     if (*(++ptr) == '^') ptr++;
1440     while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1441     if (*ptr == terminator && ptr[1] == ']')
1442     {
1443     *endptr = ptr;
1444     return TRUE;
1445     }
1446     return FALSE;
1447     }
1448    
1449    
1450    
1451    
1452     /*************************************************
1453     * Check POSIX class name *
1454     *************************************************/
1455    
1456     /* This function is called to check the name given in a POSIX-style class entry
1457     such as [:alnum:].
1458    
1459     Arguments:
1460     ptr points to the first letter
1461     len the length of the name
1462    
1463     Returns: a value representing the name, or -1 if unknown
1464     */
1465    
1466     static int
1467     check_posix_name(const uschar *ptr, int len)
1468     {
1469     register int yield = 0;
1470     while (posix_name_lengths[yield] != 0)
1471     {
1472     if (len == posix_name_lengths[yield] &&
1473     strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1474     yield++;
1475     }
1476     return -1;
1477     }
1478    
1479    
1480    
1481    
1482     /*************************************************
1483 nigel 3 * Compile one branch *
1484     *************************************************/
1485    
1486 nigel 63 /* Scan the pattern, compiling it into the code vector. If the options are
1487     changed during the branch, the pointer is used to change the external options
1488     bits.
1489 nigel 3
1490     Arguments:
1491 nigel 63 optionsptr pointer to the option bits
1492     brackets points to number of extracting brackets used
1493     code points to the pointer to the current code point
1494     ptrptr points to the current pattern pointer
1495     errorptr points to pointer to error message
1496     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
1497     reqbyteptr set to the last literal character required, else < 0
1498     bcptr points to current branch chain
1499     cd contains pointers to tables etc.
1500 nigel 3
1501 nigel 63 Returns: TRUE on success
1502     FALSE, with *errorptr set on error
1503 nigel 3 */
1504    
1505     static BOOL
1506 nigel 63 compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
1507     const uschar **ptrptr, const char **errorptr, int *firstbyteptr,
1508     int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
1509 nigel 3 {
1510     int repeat_type, op_type;
1511 nigel 63 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
1512     int bravalue = 0;
1513     int length;
1514 nigel 19 int greedy_default, greedy_non_default;
1515 nigel 63 int firstbyte, reqbyte;
1516     int zeroreqbyte, zerofirstbyte;
1517 nigel 65 int req_caseopt, reqvary, tempreqvary;
1518 nigel 37 int condcount = 0;
1519 nigel 63 int options = *optionsptr;
1520 nigel 3 register int c;
1521     register uschar *code = *codeptr;
1522 nigel 23 uschar *tempcode;
1523 nigel 63 BOOL inescq = FALSE;
1524     BOOL groupsetfirstbyte = FALSE;
1525 nigel 7 const uschar *ptr = *ptrptr;
1526 nigel 23 const uschar *tempptr;
1527 nigel 3 uschar *previous = NULL;
1528     uschar class[32];
1529    
1530 nigel 63 #ifdef SUPPORT_UTF8
1531     BOOL class_utf8;
1532     BOOL utf8 = (options & PCRE_UTF8) != 0;
1533     uschar *class_utf8data;
1534     uschar utf8_char[6];
1535     #else
1536     BOOL utf8 = FALSE;
1537     #endif
1538    
1539 nigel 19 /* Set up the default and non-default settings for greediness */
1540    
1541     greedy_default = ((options & PCRE_UNGREEDY) != 0);
1542     greedy_non_default = greedy_default ^ 1;
1543    
1544 nigel 63 /* Initialize no first char, no required char. REQ_UNSET means "no char
1545     matching encountered yet". It gets changed to REQ_NONE if we hit something that
1546     matches a non-fixed char first char; reqbyte just remains unset if we never
1547     find one.
1548 nigel 37
1549 nigel 63 When we hit a repeat whose minimum is zero, we may have to adjust these values
1550     to take the zero repeat into account. This is implemented by setting them to
1551     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
1552     item types that can be repeated set these backoff variables appropriately. */
1553 nigel 37
1554 nigel 63 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
1555    
1556     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
1557     according to the current setting of the caseless flag. REQ_CASELESS is a bit
1558     value > 255. It is added into the firstbyte or reqbyte variables to record the
1559     case status of the value. */
1560    
1561     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
1562    
1563 nigel 3 /* Switch on next character until the end of the branch */
1564    
1565     for (;; ptr++)
1566     {
1567     BOOL negate_class;
1568 nigel 63 BOOL possessive_quantifier;
1569 nigel 23 int class_charcount;
1570     int class_lastchar;
1571     int newoptions;
1572 nigel 63 int recno;
1573 nigel 53 int skipbytes;
1574 nigel 63 int subreqbyte;
1575     int subfirstbyte;
1576 nigel 3
1577     c = *ptr;
1578 nigel 63 if (inescq && c != 0) goto NORMAL_CHAR;
1579    
1580 nigel 3 if ((options & PCRE_EXTENDED) != 0)
1581     {
1582 nigel 25 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1583 nigel 3 if (c == '#')
1584     {
1585 nigel 47 /* The space before the ; is to avoid a warning on a silly compiler
1586     on the Macintosh. */
1587 nigel 53 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
1588 nigel 63 if (c != 0) continue; /* Else fall through to handle end of string */
1589 nigel 3 }
1590     }
1591    
1592     switch(c)
1593     {
1594     /* The branch terminates at end of string, |, or ). */
1595    
1596     case 0:
1597     case '|':
1598     case ')':
1599 nigel 63 *firstbyteptr = firstbyte;
1600     *reqbyteptr = reqbyte;
1601 nigel 3 *codeptr = code;
1602     *ptrptr = ptr;
1603     return TRUE;
1604    
1605 nigel 63 /* Handle single-character metacharacters. In multiline mode, ^ disables
1606     the setting of any following char as a first character. */
1607 nigel 3
1608     case '^':
1609 nigel 63 if ((options & PCRE_MULTILINE) != 0)
1610     {
1611     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1612     }
1613 nigel 3 previous = NULL;
1614     *code++ = OP_CIRC;
1615     break;
1616    
1617     case '$':
1618     previous = NULL;
1619     *code++ = OP_DOLL;
1620     break;
1621    
1622 nigel 63 /* There can never be a first char if '.' is first, whatever happens about
1623     repeats. The value of reqbyte doesn't change either. */
1624    
1625 nigel 3 case '.':
1626 nigel 63 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1627     zerofirstbyte = firstbyte;
1628     zeroreqbyte = reqbyte;
1629 nigel 3 previous = code;
1630     *code++ = OP_ANY;
1631     break;
1632    
1633 nigel 63 /* Character classes. If the included characters are all < 255 in value, we
1634     build a 32-byte bitmap of the permitted characters, except in the special
1635     case where there is only one such character. For negated classes, we build
1636     the map as usual, then invert it at the end. However, we use a different
1637     opcode so that data characters > 255 can be handled correctly.
1638    
1639     If the class contains characters outside the 0-255 range, a different
1640     opcode is compiled. It may optionally have a bit map for characters < 256,
1641     but those above are explicitly listed afterwards. A flag byte tells
1642     whether the bitmap is present, and whether this is a negated class or not.
1643 nigel 3 */
1644    
1645     case '[':
1646     previous = code;
1647    
1648 nigel 63 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
1649     they are encountered at the top level, so we'll do that too. */
1650    
1651     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1652     check_posix_syntax(ptr, &tempptr, cd))
1653     {
1654     *errorptr = (ptr[1] == ':')? ERR13 : ERR31;
1655     goto FAILED;
1656     }
1657    
1658 nigel 23 /* If the first character is '^', set the negation flag and skip it. */
1659 nigel 3
1660     if ((c = *(++ptr)) == '^')
1661     {
1662     negate_class = TRUE;
1663     c = *(++ptr);
1664     }
1665 nigel 63 else
1666     {
1667     negate_class = FALSE;
1668     }
1669 nigel 3
1670 nigel 63 /* Keep a count of chars with values < 256 so that we can optimize the case
1671     of just a single character (as long as it's < 256). For higher valued UTF-8
1672     characters, we don't yet do any optimization. */
1673 nigel 3
1674     class_charcount = 0;
1675     class_lastchar = -1;
1676    
1677 nigel 63 #ifdef SUPPORT_UTF8
1678     class_utf8 = FALSE; /* No chars >= 256 */
1679     class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
1680     #endif
1681    
1682 nigel 3 /* Initialize the 32-char bit map to all zeros. We have to build the
1683     map in a temporary bit of store, in case the class contains only 1
1684 nigel 63 character (< 256), because in that case the compiled code doesn't use the
1685 nigel 3 bit map. */
1686    
1687     memset(class, 0, 32 * sizeof(uschar));
1688    
1689     /* Process characters until ] is reached. By writing this as a "do" it
1690 nigel 63 means that an initial ] is taken as a data character. The first pass
1691     through the regex checked the overall syntax, so we don't need to be very
1692     strict here. At the start of the loop, c contains the first byte of the
1693     character. */
1694 nigel 3
1695     do
1696     {
1697 nigel 63 #ifdef SUPPORT_UTF8
1698 nigel 67 if (utf8 && c > 127)
1699     { /* Braces are required because the */
1700     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
1701     }
1702 nigel 63 #endif
1703    
1704     /* Inside \Q...\E everything is literal except \E */
1705    
1706     if (inescq)
1707 nigel 3 {
1708 nigel 63 if (c == '\\' && ptr[1] == 'E')
1709     {
1710     inescq = FALSE;
1711     ptr++;
1712     continue;
1713     }
1714     else goto LONE_SINGLE_CHARACTER;
1715 nigel 3 }
1716    
1717 nigel 43 /* Handle POSIX class names. Perl allows a negation extension of the
1718 nigel 63 form [:^name:]. A square bracket that doesn't match the syntax is
1719 nigel 43 treated as a literal. We also recognize the POSIX constructions
1720     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1721 nigel 63 5.6 and 5.8 do. */
1722 nigel 43
1723     if (c == '[' &&
1724     (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1725     check_posix_syntax(ptr, &tempptr, cd))
1726     {
1727     BOOL local_negate = FALSE;
1728     int posix_class, i;
1729     register const uschar *cbits = cd->cbits;
1730    
1731     if (ptr[1] != ':')
1732     {
1733     *errorptr = ERR31;
1734     goto FAILED;
1735     }
1736    
1737     ptr += 2;
1738     if (*ptr == '^')
1739     {
1740     local_negate = TRUE;
1741     ptr++;
1742     }
1743    
1744     posix_class = check_posix_name(ptr, tempptr - ptr);
1745     if (posix_class < 0)
1746     {
1747     *errorptr = ERR30;
1748     goto FAILED;
1749     }
1750    
1751     /* If matching is caseless, upper and lower are converted to
1752     alpha. This relies on the fact that the class table starts with
1753     alpha, lower, upper as the first 3 entries. */
1754    
1755     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1756     posix_class = 0;
1757    
1758     /* Or into the map we are building up to 3 of the static class
1759 nigel 63 tables, or their negations. The [:blank:] class sets up the same
1760     chars as the [:space:] class (all white space). We remove the vertical
1761     white space chars afterwards. */
1762 nigel 43
1763     posix_class *= 3;
1764     for (i = 0; i < 3; i++)
1765     {
1766 nigel 65 BOOL isblank = strncmp((char *)ptr, "blank", 5) == 0;
1767 nigel 43 int taboffset = posix_class_maps[posix_class + i];
1768     if (taboffset < 0) break;
1769     if (local_negate)
1770 nigel 63 {
1771 nigel 43 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
1772 nigel 63 if (isblank) class[1] |= 0x3c;
1773     }
1774 nigel 43 else
1775 nigel 63 {
1776 nigel 43 for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
1777 nigel 63 if (isblank) class[1] &= ~0x3c;
1778     }
1779 nigel 43 }
1780    
1781     ptr = tempptr + 1;
1782     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
1783 nigel 63 continue; /* End of POSIX syntax handling */
1784 nigel 43 }
1785    
1786 nigel 3 /* Backslash may introduce a single character, or it may introduce one
1787     of the specials, which just set a flag. Escaped items are checked for
1788     validity in the pre-compiling pass. The sequence \b is a special case.
1789 nigel 7 Inside a class (and only there) it is treated as backspace. Elsewhere
1790 nigel 3 it marks a word boundary. Other escapes have preset maps ready to
1791     or into the one we are building. We assume they have more than one
1792 nigel 63 character in them, so set class_charcount bigger than one. */
1793 nigel 3
1794     if (c == '\\')
1795     {
1796 nigel 25 c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1797 nigel 63 if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
1798    
1799     if (-c == ESC_Q) /* Handle start of quoted string */
1800     {
1801     if (ptr[1] == '\\' && ptr[2] == 'E')
1802     {
1803     ptr += 2; /* avoid empty string */
1804     }
1805     else inescq = TRUE;
1806     continue;
1807     }
1808    
1809 nigel 3 else if (c < 0)
1810     {
1811 nigel 25 register const uschar *cbits = cd->cbits;
1812 nigel 63 class_charcount = 10; /* Greater than 1 is what matters */
1813 nigel 3 switch (-c)
1814     {
1815     case ESC_d:
1816 nigel 25 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
1817 nigel 3 continue;
1818    
1819     case ESC_D:
1820 nigel 25 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
1821 nigel 3 continue;
1822    
1823     case ESC_w:
1824 nigel 43 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
1825 nigel 3 continue;
1826    
1827     case ESC_W:
1828 nigel 43 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
1829 nigel 3 continue;
1830    
1831     case ESC_s:
1832 nigel 25 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
1833 nigel 63 class[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
1834 nigel 3 continue;
1835    
1836     case ESC_S:
1837 nigel 25 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
1838 nigel 63 class[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
1839 nigel 3 continue;
1840    
1841 nigel 63 /* Unrecognized escapes are faulted if PCRE is running in its
1842     strict mode. By default, for compatibility with Perl, they are
1843     treated as literals. */
1844    
1845 nigel 3 default:
1846 nigel 63 if ((options & PCRE_EXTRA) != 0)
1847     {
1848     *errorptr = ERR7;
1849     goto FAILED;
1850     }
1851     c = *ptr; /* The final character */
1852 nigel 3 }
1853     }
1854 nigel 49
1855 nigel 63 /* Fall through if we have a single character (c >= 0). This may be
1856     > 255 in UTF-8 mode. */
1857 nigel 49
1858 nigel 63 } /* End of backslash handling */
1859 nigel 3
1860     /* A single character may be followed by '-' to form a range. However,
1861     Perl does not permit ']' to be the end of the range. A '-' character
1862     here is treated as a literal. */
1863    
1864     if (ptr[1] == '-' && ptr[2] != ']')
1865     {
1866     int d;
1867     ptr += 2;
1868    
1869 nigel 63 #ifdef SUPPORT_UTF8
1870     if (utf8)
1871     { /* Braces are required because the */
1872     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
1873 nigel 3 }
1874 nigel 63 else
1875     #endif
1876     d = *ptr;
1877 nigel 3
1878     /* The second part of a range can be a single-character escape, but
1879 nigel 49 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1880     in such circumstances. */
1881 nigel 3
1882     if (d == '\\')
1883     {
1884 nigel 49 const uschar *oldptr = ptr;
1885 nigel 25 d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1886 nigel 49
1887     /* \b is backspace; any other special means the '-' was literal */
1888    
1889 nigel 3 if (d < 0)
1890     {
1891     if (d == -ESC_b) d = '\b'; else
1892     {
1893 nigel 49 ptr = oldptr - 2;
1894 nigel 63 goto LONE_SINGLE_CHARACTER; /* A few lines below */
1895 nigel 3 }
1896     }
1897     }
1898    
1899 nigel 63 /* Check that the two values are in the correct order */
1900    
1901 nigel 3 if (d < c)
1902     {
1903     *errorptr = ERR8;
1904     goto FAILED;
1905     }
1906    
1907 nigel 63 /* If d is greater than 255, we can't just use the bit map, so set up
1908     for the UTF-8 supporting class type. If we are not caseless, we can
1909     just set up a single range. If we are caseless, the characters < 256
1910     are handled with a bitmap, in order to get the case-insensitive
1911     handling. */
1912    
1913     #ifdef SUPPORT_UTF8
1914     if (d > 255)
1915     {
1916     class_utf8 = TRUE;
1917     *class_utf8data++ = XCL_RANGE;
1918     if ((options & PCRE_CASELESS) == 0)
1919     {
1920     class_utf8data += ord2utf8(c, class_utf8data);
1921     class_utf8data += ord2utf8(d, class_utf8data);
1922     continue; /* Go get the next char in the class */
1923     }
1924     class_utf8data += ord2utf8(256, class_utf8data);
1925     class_utf8data += ord2utf8(d, class_utf8data);
1926     d = 255;
1927     /* Fall through */
1928     }
1929     #endif
1930     /* We use the bit map if the range is entirely < 256, or if part of it
1931     is < 256 and matching is caseless. */
1932    
1933 nigel 3 for (; c <= d; c++)
1934     {
1935     class[c/8] |= (1 << (c&7));
1936     if ((options & PCRE_CASELESS) != 0)
1937     {
1938 nigel 25 int uc = cd->fcc[c]; /* flip case */
1939 nigel 3 class[uc/8] |= (1 << (uc&7));
1940     }
1941     class_charcount++; /* in case a one-char range */
1942     class_lastchar = c;
1943     }
1944 nigel 63
1945 nigel 3 continue; /* Go get the next char in the class */
1946     }
1947    
1948     /* Handle a lone single character - we can get here for a normal
1949     non-escape char, or after \ that introduces a single character. */
1950    
1951 nigel 63 LONE_SINGLE_CHARACTER:
1952 nigel 49
1953 nigel 63 /* Handle a multibyte character */
1954    
1955     #ifdef SUPPORT_UTF8
1956     if (utf8 && c > 255)
1957 nigel 3 {
1958 nigel 63 class_utf8 = TRUE;
1959     *class_utf8data++ = XCL_SINGLE;
1960     class_utf8data += ord2utf8(c, class_utf8data);
1961 nigel 3 }
1962 nigel 63 else
1963     #endif
1964     /* Handle a single-byte character */
1965     {
1966     class [c/8] |= (1 << (c&7));
1967     if ((options & PCRE_CASELESS) != 0)
1968     {
1969     c = cd->fcc[c]; /* flip case */
1970     class[c/8] |= (1 << (c&7));
1971     }
1972     class_charcount++;
1973     class_lastchar = c;
1974     }
1975 nigel 3 }
1976    
1977     /* Loop until ']' reached; the check for end of string happens inside the
1978     loop. This "while" is the end of the "do" above. */
1979    
1980 nigel 63 while ((c = *(++ptr)) != ']' || inescq);
1981 nigel 3
1982 nigel 63 /* If class_charcount is 1, we saw precisely one character with a value <
1983     256. In UTF-8 mode, we can optimize if there were no characters >= 256 and
1984     the one character is < 128. In non-UTF-8 mode we can always optimize.
1985 nigel 3
1986 nigel 63 The optimization throws away the bit map. We turn the item into a
1987     1-character OP_CHARS if it's positive, or OP_NOT if it's negative. Note
1988     that OP_NOT does not support multibyte characters. In the positive case, it
1989     can cause firstbyte to be set. Otherwise, there can be no first char if
1990     this item is first, whatever repeat count may follow. In the case of
1991     reqbyte, save the previous value for reinstating. */
1992    
1993     #ifdef SUPPORT_UTF8
1994 nigel 67 if (class_charcount == 1 &&
1995     (!utf8 ||
1996     (!class_utf8 && class_lastchar < 128)))
1997 nigel 63 #else
1998     if (class_charcount == 1)
1999     #endif
2000 nigel 3 {
2001 nigel 63 zeroreqbyte = reqbyte;
2002 nigel 3 if (negate_class)
2003     {
2004 nigel 63 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2005     zerofirstbyte = firstbyte;
2006     *code++ = OP_NOT;
2007 nigel 3 }
2008     else
2009     {
2010 nigel 63 if (firstbyte == REQ_UNSET)
2011     {
2012     zerofirstbyte = REQ_NONE;
2013     firstbyte = class_lastchar | req_caseopt;
2014     }
2015     else
2016     {
2017     zerofirstbyte = firstbyte;
2018 nigel 65 reqbyte = class_lastchar | req_caseopt | cd->req_varyopt;
2019 nigel 63 }
2020     *code++ = OP_CHARS;
2021 nigel 3 *code++ = 1;
2022     }
2023     *code++ = class_lastchar;
2024 nigel 63 break; /* End of class handling */
2025     } /* End of 1-byte optimization */
2026    
2027     /* Otherwise, if this is the first thing in the branch, there can be no
2028     first char setting, whatever the repeat count. Any reqbyte setting must
2029     remain unchanged after any kind of repeat. */
2030    
2031     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2032     zerofirstbyte = firstbyte;
2033     zeroreqbyte = reqbyte;
2034    
2035     /* If there are characters with values > 255, we have to compile an
2036     extended class, with its own opcode. If there are no characters < 256,
2037     we can omit the bitmap. */
2038    
2039     #ifdef SUPPORT_UTF8
2040     if (class_utf8)
2041     {
2042     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2043     *code++ = OP_XCLASS;
2044     code += LINK_SIZE;
2045     *code = negate_class? XCL_NOT : 0;
2046    
2047     /* If the map is required, install it, and move on to the end of
2048     the extra data */
2049    
2050     if (class_charcount > 0)
2051     {
2052     *code++ |= XCL_MAP;
2053     memcpy(code, class, 32);
2054     code = class_utf8data;
2055     }
2056    
2057     /* If the map is not required, slide down the extra data. */
2058    
2059     else
2060     {
2061     int len = class_utf8data - (code + 33);
2062     memmove(code + 1, code + 33, len);
2063     code += len + 1;
2064     }
2065    
2066     /* Now fill in the complete length of the item */
2067    
2068     PUT(previous, 1, code - previous);
2069     break; /* End of class handling */
2070 nigel 3 }
2071 nigel 63 #endif
2072 nigel 3
2073 nigel 63 /* If there are no characters > 255, negate the 32-byte map if necessary,
2074     and copy it into the code vector. If this is the first thing in the branch,
2075     there can be no first char setting, whatever the repeat count. Any reqbyte
2076     setting must remain unchanged after any kind of repeat. */
2077 nigel 3
2078 nigel 63 if (negate_class)
2079     {
2080     *code++ = OP_NCLASS;
2081     for (c = 0; c < 32; c++) code[c] = ~class[c];
2082     }
2083 nigel 3 else
2084     {
2085 nigel 63 *code++ = OP_CLASS;
2086     memcpy(code, class, 32);
2087 nigel 3 }
2088 nigel 63 code += 32;
2089 nigel 3 break;
2090    
2091     /* Various kinds of repeat */
2092    
2093     case '{':
2094 nigel 25 if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
2095     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
2096 nigel 3 if (*errorptr != NULL) goto FAILED;
2097     goto REPEAT;
2098    
2099     case '*':
2100     repeat_min = 0;
2101     repeat_max = -1;
2102     goto REPEAT;
2103    
2104     case '+':
2105     repeat_min = 1;
2106     repeat_max = -1;
2107     goto REPEAT;
2108    
2109     case '?':
2110     repeat_min = 0;
2111     repeat_max = 1;
2112    
2113     REPEAT:
2114     if (previous == NULL)
2115     {
2116     *errorptr = ERR9;
2117     goto FAILED;
2118     }
2119    
2120 nigel 63 if (repeat_min == 0)
2121     {
2122 nigel 65 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2123     reqbyte = zeroreqbyte; /* Ditto */
2124 nigel 63 }
2125 nigel 3
2126 nigel 65 /* Remember whether this is a variable length repeat */
2127    
2128     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2129    
2130 nigel 63 op_type = 0; /* Default single-char op codes */
2131     possessive_quantifier = FALSE; /* Default not possessive quantifier */
2132    
2133     /* Save start of previous item, in case we have to move it up to make space
2134     for an inserted OP_ONCE for the additional '+' extension. */
2135    
2136     tempcode = previous;
2137    
2138     /* If the next character is '+', we have a possessive quantifier. This
2139     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2140     If the next character is '?' this is a minimizing repeat, by default,
2141     but if PCRE_UNGREEDY is set, it works the other way round. We change the
2142     repeat type to the non-default. */
2143    
2144     if (ptr[1] == '+')
2145     {
2146     repeat_type = 0; /* Force greedy */
2147     possessive_quantifier = TRUE;
2148     ptr++;
2149     }
2150     else if (ptr[1] == '?')
2151     {
2152     repeat_type = greedy_non_default;
2153     ptr++;
2154     }
2155 nigel 19 else repeat_type = greedy_default;
2156 nigel 3
2157 nigel 63 /* If previous was a recursion, we need to wrap it inside brackets so that
2158     it can be replicated if necessary. */
2159    
2160     if (*previous == OP_RECURSE)
2161     {
2162     memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2163     code += 1 + LINK_SIZE;
2164     *previous = OP_BRA;
2165     PUT(previous, 1, code - previous);
2166     *code = OP_KET;
2167     PUT(code, 1, code - previous);
2168     code += 1 + LINK_SIZE;
2169     }
2170    
2171 nigel 3 /* If previous was a string of characters, chop off the last one and use it
2172     as the subject of the repeat. If there was only one character, we can
2173 nigel 63 abolish the previous item altogether. If a one-char item has a minimum of
2174     more than one, ensure that it is set in reqbyte - it might not be if a
2175     sequence such as x{3} is the first thing in a branch because the x will
2176     have gone into firstbyte instead. */
2177 nigel 3
2178 nigel 37 if (*previous == OP_CHARS)
2179 nigel 3 {
2180 nigel 63 /* Deal with UTF-8 characters that take up more than one byte. It's
2181     easier to write this out separately than try to macrify it. Use c to
2182     hold the length of the character in bytes, plus 0x80 to flag that it's a
2183     length rather than a small character. */
2184 nigel 37
2185 nigel 63 #ifdef SUPPORT_UTF8
2186     if (utf8 && (code[-1] & 0x80) != 0)
2187 nigel 3 {
2188 nigel 63 uschar *lastchar = code - 1;
2189     while((*lastchar & 0xc0) == 0x80) lastchar--;
2190     c = code - lastchar; /* Length of UTF-8 character */
2191     memcpy(utf8_char, lastchar, c); /* Save the char */
2192     if (lastchar == previous + 2) /* There was only one character */
2193     {
2194     code = previous; /* Abolish the previous item */
2195     }
2196     else
2197     {
2198     previous[1] -= c; /* Adjust length of previous */
2199     code = lastchar; /* Lost char off the end */
2200     tempcode = code; /* Adjust position to be moved for '+' */
2201     }
2202     c |= 0x80; /* Flag c as a length */
2203 nigel 3 }
2204     else
2205 nigel 63 #endif
2206    
2207     /* Handle the case of a single byte - either with no UTF8 support, or
2208     with UTF-8 disabled, or for a UTF-8 character < 128. */
2209    
2210 nigel 3 {
2211 nigel 63 c = *(--code);
2212     if (code == previous + 2) /* There was only one character */
2213     {
2214     code = previous; /* Abolish the previous item */
2215 nigel 65 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2216 nigel 63 }
2217     else
2218     {
2219     previous[1]--; /* adjust length */
2220     tempcode = code; /* Adjust position to be moved for '+' */
2221     }
2222 nigel 3 }
2223 nigel 63
2224 nigel 3 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
2225     }
2226    
2227     /* If previous was a single negated character ([^a] or similar), we use
2228     one of the special opcodes, replacing it. The code is shared with single-
2229 nigel 63 character repeats by setting op_type to add a suitable offset into
2230     repeat_type. OP_NOT is currently used only for single-byte chars. */
2231 nigel 3
2232 nigel 63 else if (*previous == OP_NOT)
2233 nigel 3 {
2234     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
2235     c = previous[1];
2236     code = previous;
2237     goto OUTPUT_SINGLE_REPEAT;
2238     }
2239    
2240     /* If previous was a character type match (\d or similar), abolish it and
2241     create a suitable repeat item. The code is shared with single-character
2242 nigel 63 repeats by setting op_type to add a suitable offset into repeat_type. */
2243 nigel 3
2244 nigel 63 else if (*previous < OP_EODN)
2245 nigel 3 {
2246     op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
2247     c = *previous;
2248     code = previous;
2249    
2250     OUTPUT_SINGLE_REPEAT:
2251    
2252 nigel 37 /* If the maximum is zero then the minimum must also be zero; Perl allows
2253     this case, so we do too - by simply omitting the item altogether. */
2254    
2255     if (repeat_max == 0) goto END_REPEAT;
2256    
2257     /* Combine the op_type with the repeat_type */
2258    
2259     repeat_type += op_type;
2260    
2261 nigel 3 /* A minimum of zero is handled either as the special case * or ?, or as
2262     an UPTO, with the maximum given. */
2263    
2264     if (repeat_min == 0)
2265     {
2266     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
2267     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
2268     else
2269     {
2270     *code++ = OP_UPTO + repeat_type;
2271 nigel 63 PUT2INC(code, 0, repeat_max);
2272 nigel 3 }
2273     }
2274    
2275     /* The case {1,} is handled as the special case + */
2276    
2277     else if (repeat_min == 1 && repeat_max == -1)
2278     *code++ = OP_PLUS + repeat_type;
2279    
2280     /* The case {n,n} is just an EXACT, while the general case {n,m} is
2281     handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
2282    
2283     else
2284     {
2285     if (repeat_min != 1)
2286     {
2287     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
2288 nigel 63 PUT2INC(code, 0, repeat_min);
2289 nigel 3 }
2290    
2291     /* If the minimum is 1 and the previous item was a character string,
2292     we either have to put back the item that got cancelled if the string
2293     length was 1, or add the character back onto the end of a longer
2294 nigel 21 string. For a character type nothing need be done; it will just get
2295     put back naturally. Note that the final character is always going to
2296 nigel 63 get added below, so we leave code ready for its insertion. */
2297 nigel 3
2298     else if (*previous == OP_CHARS)
2299     {
2300 nigel 63 if (code == previous) code += 2; else
2301    
2302     /* In UTF-8 mode, a multibyte char has its length in c, with the 0x80
2303     bit set as a flag. The length will always be between 2 and 6. */
2304    
2305     #ifdef SUPPORT_UTF8
2306     if (utf8 && c >= 128) previous[1] += c & 7; else
2307     #endif
2308     previous[1]++;
2309 nigel 3 }
2310    
2311 nigel 21 /* For a single negated character we also have to put back the
2312 nigel 63 item that got cancelled. At present this applies only to single byte
2313     characters in any mode. */
2314 nigel 21
2315     else if (*previous == OP_NOT) code++;
2316    
2317 nigel 63 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
2318     we have to insert the character for the previous code. In UTF-8 mode,
2319     long characters have their length in c, with the 0x80 bit as a flag. */
2320 nigel 3
2321 nigel 9 if (repeat_max < 0)
2322 nigel 3 {
2323 nigel 63 #ifdef SUPPORT_UTF8
2324     if (utf8 && c >= 128)
2325     {
2326     memcpy(code, utf8_char, c & 7);
2327     code += c & 7;
2328     }
2329     else
2330     #endif
2331 nigel 3 *code++ = c;
2332 nigel 9 *code++ = OP_STAR + repeat_type;
2333     }
2334    
2335 nigel 63 /* Else insert an UPTO if the max is greater than the min, again
2336     preceded by the character, for the previously inserted code. */
2337 nigel 9
2338     else if (repeat_max != repeat_min)
2339     {
2340 nigel 63 #ifdef SUPPORT_UTF8
2341     if (utf8 && c >= 128)
2342     {
2343     memcpy(code, utf8_char, c & 7);
2344     code += c & 7;
2345     }
2346     else
2347     #endif
2348 nigel 9 *code++ = c;
2349 nigel 3 repeat_max -= repeat_min;
2350     *code++ = OP_UPTO + repeat_type;
2351 nigel 63 PUT2INC(code, 0, repeat_max);
2352 nigel 3 }
2353     }
2354    
2355     /* The character or character type itself comes last in all cases. */
2356    
2357 nigel 63 #ifdef SUPPORT_UTF8
2358     if (utf8 && c >= 128)
2359     {
2360     memcpy(code, utf8_char, c & 7);
2361     code += c & 7;
2362     }
2363     else
2364     #endif
2365    
2366 nigel 3 *code++ = c;
2367     }
2368    
2369     /* If previous was a character class or a back reference, we put the repeat
2370 nigel 37 stuff after it, but just skip the item if the repeat was {0,0}. */
2371 nigel 3
2372 nigel 63 else if (*previous == OP_CLASS ||
2373     *previous == OP_NCLASS ||
2374     #ifdef SUPPORT_UTF8
2375     *previous == OP_XCLASS ||
2376     #endif
2377     *previous == OP_REF)
2378 nigel 3 {
2379 nigel 37 if (repeat_max == 0)
2380     {
2381     code = previous;
2382     goto END_REPEAT;
2383     }
2384 nigel 3 if (repeat_min == 0 && repeat_max == -1)
2385     *code++ = OP_CRSTAR + repeat_type;
2386     else if (repeat_min == 1 && repeat_max == -1)
2387     *code++ = OP_CRPLUS + repeat_type;
2388     else if (repeat_min == 0 && repeat_max == 1)
2389     *code++ = OP_CRQUERY + repeat_type;
2390     else
2391     {
2392     *code++ = OP_CRRANGE + repeat_type;
2393 nigel 63 PUT2INC(code, 0, repeat_min);
2394 nigel 3 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
2395 nigel 63 PUT2INC(code, 0, repeat_max);
2396 nigel 3 }
2397     }
2398    
2399     /* If previous was a bracket group, we may have to replicate it in certain
2400 nigel 23 cases. */
2401 nigel 3
2402 nigel 63 else if (*previous >= OP_BRA || *previous == OP_ONCE ||
2403     *previous == OP_COND)
2404 nigel 3 {
2405 nigel 31 register int i;
2406     int ketoffset = 0;
2407 nigel 9 int len = code - previous;
2408 nigel 31 uschar *bralink = NULL;
2409 nigel 3
2410 nigel 23 /* If the maximum repeat count is unlimited, find the end of the bracket
2411     by scanning through from the start, and compute the offset back to it
2412     from the current code pointer. There may be an OP_OPT setting following
2413     the final KET, so we can't find the end just by going back from the code
2414     pointer. */
2415    
2416     if (repeat_max == -1)
2417 nigel 3 {
2418 nigel 23 register uschar *ket = previous;
2419 nigel 63 do ket += GET(ket, 1); while (*ket != OP_KET);
2420 nigel 23 ketoffset = code - ket;
2421 nigel 3 }
2422    
2423 nigel 31 /* The case of a zero minimum is special because of the need to stick
2424     OP_BRAZERO in front of it, and because the group appears once in the
2425     data, whereas in other cases it appears the minimum number of times. For
2426     this reason, it is simplest to treat this case separately, as otherwise
2427 nigel 53 the code gets far too messy. There are several special subcases when the
2428 nigel 31 minimum is zero. */
2429    
2430     if (repeat_min == 0)
2431     {
2432     /* If the maximum is also zero, we just omit the group from the output
2433     altogether. */
2434    
2435     if (repeat_max == 0)
2436     {
2437     code = previous;
2438 nigel 37 goto END_REPEAT;
2439 nigel 31 }
2440    
2441     /* If the maximum is 1 or unlimited, we just have to stick in the
2442     BRAZERO and do no more at this point. */
2443    
2444     if (repeat_max <= 1)
2445     {
2446     memmove(previous+1, previous, len);
2447     code++;
2448     *previous++ = OP_BRAZERO + repeat_type;
2449     }
2450    
2451     /* If the maximum is greater than 1 and limited, we have to replicate
2452     in a nested fashion, sticking OP_BRAZERO before each set of brackets.
2453     The first one has to be handled carefully because it's the original
2454     copy, which has to be moved up. The remainder can be handled by code
2455     that is common with the non-zero minimum case below. We just have to
2456     adjust the value of repeat_max, since one less copy is required. */
2457    
2458     else
2459     {
2460     int offset;
2461 nigel 63 memmove(previous + 2 + LINK_SIZE, previous, len);
2462     code += 2 + LINK_SIZE;
2463 nigel 31 *previous++ = OP_BRAZERO + repeat_type;
2464     *previous++ = OP_BRA;
2465    
2466     /* We chain together the bracket offset fields that have to be
2467     filled in later when the ends of the brackets are reached. */
2468    
2469     offset = (bralink == NULL)? 0 : previous - bralink;
2470     bralink = previous;
2471 nigel 63 PUTINC(previous, 0, offset);
2472 nigel 31 }
2473    
2474     repeat_max--;
2475     }
2476    
2477     /* If the minimum is greater than zero, replicate the group as many
2478     times as necessary, and adjust the maximum to the number of subsequent
2479 nigel 63 copies that we need. If we set a first char from the group, and didn't
2480     set a required char, copy the latter from the former. */
2481 nigel 31
2482     else
2483     {
2484 nigel 63 if (repeat_min > 1)
2485 nigel 31 {
2486 nigel 63 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
2487     for (i = 1; i < repeat_min; i++)
2488     {
2489     memcpy(code, previous, len);
2490     code += len;
2491     }
2492 nigel 31 }
2493     if (repeat_max > 0) repeat_max -= repeat_min;
2494     }
2495    
2496     /* This code is common to both the zero and non-zero minimum cases. If
2497     the maximum is limited, it replicates the group in a nested fashion,
2498     remembering the bracket starts on a stack. In the case of a zero minimum,
2499     the first one was set up above. In all cases the repeat_max now specifies
2500     the number of additional copies needed. */
2501    
2502     if (repeat_max >= 0)
2503     {
2504     for (i = repeat_max - 1; i >= 0; i--)
2505     {
2506     *code++ = OP_BRAZERO + repeat_type;
2507    
2508     /* All but the final copy start a new nesting, maintaining the
2509     chain of brackets outstanding. */
2510    
2511     if (i != 0)
2512     {
2513     int offset;
2514     *code++ = OP_BRA;
2515     offset = (bralink == NULL)? 0 : code - bralink;
2516     bralink = code;
2517 nigel 63 PUTINC(code, 0, offset);
2518 nigel 31 }
2519    
2520     memcpy(code, previous, len);
2521     code += len;
2522     }
2523    
2524     /* Now chain through the pending brackets, and fill in their length
2525     fields (which are holding the chain links pro tem). */
2526    
2527     while (bralink != NULL)
2528     {
2529     int oldlinkoffset;
2530     int offset = code - bralink + 1;
2531     uschar *bra = code - offset;
2532 nigel 63 oldlinkoffset = GET(bra, 1);
2533 nigel 31 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
2534     *code++ = OP_KET;
2535 nigel 63 PUTINC(code, 0, offset);
2536     PUT(bra, 1, offset);
2537 nigel 31 }
2538     }
2539    
2540     /* If the maximum is unlimited, set a repeater in the final copy. We
2541     can't just offset backwards from the current code point, because we
2542     don't know if there's been an options resetting after the ket. The
2543     correct offset was computed above. */
2544    
2545     else code[-ketoffset] = OP_KETRMAX + repeat_type;
2546 nigel 3 }
2547    
2548     /* Else there's some kind of shambles */
2549    
2550     else
2551     {
2552     *errorptr = ERR11;
2553     goto FAILED;
2554     }
2555    
2556 nigel 63 /* If the character following a repeat is '+', we wrap the entire repeated
2557     item inside OP_ONCE brackets. This is just syntactic sugar, taken from
2558     Sun's Java package. The repeated item starts at tempcode, not at previous,
2559     which might be the first part of a string whose (former) last char we
2560     repeated. However, we don't support '+' after a greediness '?'. */
2561    
2562     if (possessive_quantifier)
2563     {
2564     int len = code - tempcode;
2565     memmove(tempcode + 1+LINK_SIZE, tempcode, len);
2566     code += 1 + LINK_SIZE;
2567     len += 1 + LINK_SIZE;
2568     tempcode[0] = OP_ONCE;
2569     *code++ = OP_KET;
2570     PUTINC(code, 0, len);
2571     PUT(tempcode, 1, len);
2572     }
2573    
2574 nigel 65 /* In all cases we no longer have a previous item. We also set the
2575     "follows varying string" flag for subsequently encountered reqbytes if
2576     it isn't already set and we have just passed a varying length item. */
2577 nigel 3
2578 nigel 37 END_REPEAT:
2579 nigel 3 previous = NULL;
2580 nigel 65 cd->req_varyopt |= reqvary;
2581 nigel 3 break;
2582    
2583    
2584 nigel 23 /* Start of nested bracket sub-expression, or comment or lookahead or
2585     lookbehind or option setting or condition. First deal with special things
2586     that can come after a bracket; all are introduced by ?, and the appearance
2587     of any of them means that this is not a referencing group. They were
2588     checked for validity in the first pass over the string, so we don't have to
2589     check for syntax errors here. */
2590 nigel 3
2591     case '(':
2592 nigel 23 newoptions = options;
2593 nigel 53 skipbytes = 0;
2594 nigel 23
2595 nigel 3 if (*(++ptr) == '?')
2596     {
2597 nigel 23 int set, unset;
2598     int *optset;
2599 nigel 3
2600     switch (*(++ptr))
2601     {
2602 nigel 23 case '#': /* Comment; skip to ket */
2603 nigel 3 ptr++;
2604     while (*ptr != ')') ptr++;
2605     continue;
2606    
2607     case ':': /* Non-extracting bracket */
2608 nigel 23 bravalue = OP_BRA;
2609 nigel 3 ptr++;
2610     break;
2611    
2612 nigel 23 case '(':
2613     bravalue = OP_COND; /* Conditional group */
2614 nigel 63
2615     /* Condition to test for recursion */
2616    
2617     if (ptr[1] == 'R')
2618 nigel 23 {
2619 nigel 63 code[1+LINK_SIZE] = OP_CREF;
2620     PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
2621     skipbytes = 3;
2622     ptr += 3;
2623     }
2624    
2625 nigel 69 /* Condition to test for a numbered subpattern match. We know that
2626     if a digit follows ( then there will just be digits until ) because
2627     the syntax was checked in the first pass. */
2628 nigel 63
2629 nigel 69 else if ((digitab[ptr[1]] && ctype_digit) != 0)
2630 nigel 63 {
2631 nigel 65 int condref; /* Don't amalgamate; some compilers */
2632     condref = *(++ptr) - '0'; /* grumble at autoincrement in declaration */
2633 nigel 23 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
2634 nigel 51 if (condref == 0)
2635     {
2636     *errorptr = ERR35;
2637     goto FAILED;
2638     }
2639 nigel 23 ptr++;
2640 nigel 63 code[1+LINK_SIZE] = OP_CREF;
2641     PUT2(code, 2+LINK_SIZE, condref);
2642 nigel 53 skipbytes = 3;
2643 nigel 23 }
2644 nigel 63 /* For conditions that are assertions, we just fall through, having
2645     set bravalue above. */
2646 nigel 23 break;
2647    
2648     case '=': /* Positive lookahead */
2649 nigel 3 bravalue = OP_ASSERT;
2650     ptr++;
2651     break;
2652    
2653 nigel 23 case '!': /* Negative lookahead */
2654 nigel 3 bravalue = OP_ASSERT_NOT;
2655     ptr++;
2656     break;
2657    
2658 nigel 23 case '<': /* Lookbehinds */
2659     switch (*(++ptr))
2660 nigel 3 {
2661 nigel 23 case '=': /* Positive lookbehind */
2662     bravalue = OP_ASSERTBACK;
2663 nigel 3 ptr++;
2664     break;
2665 nigel 23
2666     case '!': /* Negative lookbehind */
2667     bravalue = OP_ASSERTBACK_NOT;
2668     ptr++;
2669     break;
2670 nigel 3 }
2671 nigel 23 break;
2672 nigel 3
2673 nigel 23 case '>': /* One-time brackets */
2674     bravalue = OP_ONCE;
2675     ptr++;
2676     break;
2677    
2678 nigel 63 case 'C': /* Callout - may be followed by digits */
2679     *code++ = OP_CALLOUT;
2680     {
2681     int n = 0;
2682 nigel 69 while ((digitab[*(++ptr)] & ctype_digit) != 0)
2683 nigel 63 n = n * 10 + *ptr - '0';
2684     if (n > 255)
2685     {
2686     *errorptr = ERR38;
2687     goto FAILED;
2688     }
2689     *code++ = n;
2690     }
2691     previous = NULL;
2692     continue;
2693    
2694     case 'P': /* Named subpattern handling */
2695     if (*(++ptr) == '<') /* Definition */
2696     {
2697     int i, namelen;
2698     uschar *slot = cd->name_table;
2699 nigel 65 const uschar *name; /* Don't amalgamate; some compilers */
2700     name = ++ptr; /* grumble at autoincrement in declaration */
2701 nigel 63
2702     while (*ptr++ != '>');
2703     namelen = ptr - name - 1;
2704    
2705     for (i = 0; i < cd->names_found; i++)
2706     {
2707 nigel 67 int crc = memcmp(name, slot+2, namelen);
2708     if (crc == 0)
2709 nigel 63 {
2710 nigel 65 if (slot[2+namelen] == 0)
2711     {
2712     *errorptr = ERR43;
2713     goto FAILED;
2714     }
2715 nigel 67 crc = -1; /* Current name is substring */
2716 nigel 63 }
2717 nigel 67 if (crc < 0)
2718 nigel 63 {
2719     memmove(slot + cd->name_entry_size, slot,
2720     (cd->names_found - i) * cd->name_entry_size);
2721     break;
2722     }
2723     slot += cd->name_entry_size;
2724     }
2725    
2726     PUT2(slot, 0, *brackets + 1);
2727     memcpy(slot + 2, name, namelen);
2728     slot[2+namelen] = 0;
2729     cd->names_found++;
2730     goto NUMBERED_GROUP;
2731     }
2732    
2733     if (*ptr == '=' || *ptr == '>') /* Reference or recursion */
2734     {
2735     int i, namelen;
2736     int type = *ptr++;
2737     const uschar *name = ptr;
2738     uschar *slot = cd->name_table;
2739    
2740     while (*ptr != ')') ptr++;
2741     namelen = ptr - name;
2742    
2743     for (i = 0; i < cd->names_found; i++)
2744     {
2745 nigel 65 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
2746 nigel 63 slot += cd->name_entry_size;
2747     }
2748     if (i >= cd->names_found)
2749     {
2750     *errorptr = ERR15;
2751     goto FAILED;
2752     }
2753    
2754     recno = GET2(slot, 0);
2755    
2756     if (type == '>') goto HANDLE_RECURSION; /* A few lines below */
2757    
2758     /* Back reference */
2759    
2760     previous = code;
2761     *code++ = OP_REF;
2762     PUT2INC(code, 0, recno);
2763     cd->backref_map |= (recno < 32)? (1 << recno) : 1;
2764     if (recno > cd->top_backref) cd->top_backref = recno;
2765     continue;
2766     }
2767    
2768     /* Should never happen */
2769     break;
2770    
2771 nigel 43 case 'R': /* Pattern recursion */
2772 nigel 63 ptr++; /* Same as (?0) */
2773     /* Fall through */
2774    
2775     /* Recursion or "subroutine" call */
2776    
2777     case '0': case '1': case '2': case '3': case '4':
2778     case '5': case '6': case '7': case '8': case '9':
2779     {
2780     const uschar *called;
2781     recno = 0;
2782 nigel 69 while((digitab[*ptr] & ctype_digit) != 0)
2783 nigel 63 recno = recno * 10 + *ptr++ - '0';
2784    
2785     /* Come here from code above that handles a named recursion */
2786    
2787     HANDLE_RECURSION:
2788    
2789     previous = code;
2790    
2791     /* Find the bracket that is being referenced. Temporarily end the
2792     regex in case it doesn't exist. */
2793    
2794     *code = OP_END;
2795     called = (recno == 0)?
2796     cd->start_code : find_bracket(cd->start_code, utf8, recno);
2797    
2798     if (called == NULL)
2799     {
2800     *errorptr = ERR15;
2801     goto FAILED;
2802     }
2803    
2804     /* If the subpattern is still open, this is a recursive call. We
2805     check to see if this is a left recursion that could loop for ever,
2806     and diagnose that case. */
2807    
2808     if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
2809     {
2810     *errorptr = ERR40;
2811     goto FAILED;
2812     }
2813    
2814     /* Insert the recursion/subroutine item */
2815    
2816     *code = OP_RECURSE;
2817     PUT(code, 1, called - cd->start_code);
2818     code += 1 + LINK_SIZE;
2819     }
2820 nigel 43 continue;
2821    
2822 nigel 63 /* Character after (? not specially recognized */
2823    
2824 nigel 23 default: /* Option setting */
2825     set = unset = 0;
2826     optset = &set;
2827    
2828     while (*ptr != ')' && *ptr != ':')
2829     {
2830     switch (*ptr++)
2831     {
2832     case '-': optset = &unset; break;
2833    
2834     case 'i': *optset |= PCRE_CASELESS; break;
2835     case 'm': *optset |= PCRE_MULTILINE; break;
2836     case 's': *optset |= PCRE_DOTALL; break;
2837     case 'x': *optset |= PCRE_EXTENDED; break;
2838     case 'U': *optset |= PCRE_UNGREEDY; break;
2839     case 'X': *optset |= PCRE_EXTRA; break;
2840     }
2841     }
2842    
2843     /* Set up the changed option bits, but don't change anything yet. */
2844    
2845     newoptions = (options | set) & (~unset);
2846    
2847     /* If the options ended with ')' this is not the start of a nested
2848 nigel 63 group with option changes, so the options change at this level. Compile
2849     code to change the ims options if this setting actually changes any of
2850     them. We also pass the new setting back so that it can be put at the
2851     start of any following branches, and when this group ends (if we are in
2852     a group), a resetting item can be compiled.
2853 nigel 23
2854 nigel 63 Note that if this item is right at the start of the pattern, the
2855     options will have been abstracted and made global, so there will be no
2856     change to compile. */
2857    
2858 nigel 23 if (*ptr == ')')
2859     {
2860 nigel 63 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
2861 nigel 23 {
2862     *code++ = OP_OPT;
2863 nigel 63 *code++ = newoptions & PCRE_IMS;
2864 nigel 23 }
2865 nigel 63
2866     /* Change options at this level, and pass them back for use
2867     in subsequent branches. Reset the greedy defaults and the case
2868     value for firstbyte and reqbyte. */
2869    
2870     *optionsptr = options = newoptions;
2871     greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
2872     greedy_non_default = greedy_default ^ 1;
2873     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2874    
2875 nigel 23 previous = NULL; /* This item can't be repeated */
2876     continue; /* It is complete */
2877     }
2878    
2879     /* If the options ended with ':' we are heading into a nested group
2880     with possible change of options. Such groups are non-capturing and are
2881     not assertions of any kind. All we need to do is skip over the ':';
2882     the newoptions value is handled below. */
2883    
2884     bravalue = OP_BRA;
2885     ptr++;
2886 nigel 3 }
2887     }
2888    
2889 nigel 63 /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
2890     non-capturing and behave like (?:...) brackets */
2891    
2892     else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
2893     {
2894     bravalue = OP_BRA;
2895     }
2896    
2897 nigel 53 /* Else we have a referencing group; adjust the opcode. If the bracket
2898     number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
2899     arrange for the true number to follow later, in an OP_BRANUMBER item. */
2900 nigel 3
2901     else
2902     {
2903 nigel 63 NUMBERED_GROUP:
2904 nigel 53 if (++(*brackets) > EXTRACT_BASIC_MAX)
2905 nigel 3 {
2906 nigel 53 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
2907 nigel 63 code[1+LINK_SIZE] = OP_BRANUMBER;
2908     PUT2(code, 2+LINK_SIZE, *brackets);
2909 nigel 53 skipbytes = 3;
2910 nigel 3 }
2911 nigel 53 else bravalue = OP_BRA + *brackets;
2912 nigel 3 }
2913    
2914 nigel 23 /* Process nested bracketed re. Assertions may not be repeated, but other
2915     kinds can be. We copy code into a non-register variable in order to be able
2916     to pass its address because some compilers complain otherwise. Pass in a
2917     new setting for the ims options if they have changed. */
2918 nigel 3
2919 nigel 23 previous = (bravalue >= OP_ONCE)? code : NULL;
2920 nigel 3 *code = bravalue;
2921 nigel 23 tempcode = code;
2922 nigel 65 tempreqvary = cd->req_varyopt; /* Save value before bracket */
2923 nigel 23
2924     if (!compile_regex(
2925 nigel 63 newoptions, /* The complete new option state */
2926     options & PCRE_IMS, /* The previous ims option state */
2927 nigel 53 brackets, /* Extracting bracket count */
2928 nigel 23 &tempcode, /* Where to put code (updated) */
2929     &ptr, /* Input pointer (updated) */
2930     errorptr, /* Where to put an error message */
2931     (bravalue == OP_ASSERTBACK ||
2932     bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
2933 nigel 53 skipbytes, /* Skip over OP_COND/OP_BRANUMBER */
2934 nigel 63 &subfirstbyte, /* For possible first char */
2935     &subreqbyte, /* For possible last char */
2936     bcptr, /* Current branch chain */
2937 nigel 25 cd)) /* Tables block */
2938 nigel 23 goto FAILED;
2939    
2940     /* At the end of compiling, code is still pointing to the start of the
2941     group, while tempcode has been updated to point past the end of the group
2942     and any option resetting that may follow it. The pattern pointer (ptr)
2943     is on the bracket. */
2944    
2945     /* If this is a conditional bracket, check that there are no more than
2946     two branches in the group. */
2947    
2948 nigel 53 else if (bravalue == OP_COND)
2949 nigel 3 {
2950 nigel 23 uschar *tc = code;
2951 nigel 37 condcount = 0;
2952 nigel 23
2953     do {
2954 nigel 37 condcount++;
2955 nigel 63 tc += GET(tc,1);
2956 nigel 23 }
2957     while (*tc != OP_KET);
2958    
2959 nigel 37 if (condcount > 2)
2960 nigel 23 {
2961     *errorptr = ERR27;
2962 nigel 3 goto FAILED;
2963 nigel 23 }
2964 nigel 63
2965     /* If there is just one branch, we must not make use of its firstbyte or
2966     reqbyte, because this is equivalent to an empty second branch. */
2967    
2968     if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
2969 nigel 3 }
2970    
2971 nigel 63 /* Handle updating of the required and first characters. Update for normal
2972     brackets of all kinds, and conditions with two branches (see code above).
2973     If the bracket is followed by a quantifier with zero repeat, we have to
2974     back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
2975     main loop so that they can be accessed for the back off. */
2976 nigel 37
2977 nigel 63 zeroreqbyte = reqbyte;
2978     zerofirstbyte = firstbyte;
2979     groupsetfirstbyte = FALSE;
2980    
2981     if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
2982 nigel 37 {
2983 nigel 63 /* If we have not yet set a firstbyte in this branch, take it from the
2984     subpattern, remembering that it was set here so that a repeat of more
2985     than one can replicate it as reqbyte if necessary. If the subpattern has
2986     no firstbyte, set "none" for the whole branch. In both cases, a zero
2987     repeat forces firstbyte to "none". */
2988    
2989     if (firstbyte == REQ_UNSET)
2990     {
2991     if (subfirstbyte >= 0)
2992     {
2993     firstbyte = subfirstbyte;
2994     groupsetfirstbyte = TRUE;
2995     }
2996     else firstbyte = REQ_NONE;
2997     zerofirstbyte = REQ_NONE;
2998     }
2999    
3000     /* If firstbyte was previously set, convert the subpattern's firstbyte
3001 nigel 65 into reqbyte if there wasn't one, using the vary flag that was in
3002     existence beforehand. */
3003 nigel 63
3004 nigel 65 else if (subfirstbyte >= 0 && subreqbyte < 0)
3005     subreqbyte = subfirstbyte | tempreqvary;
3006 nigel 63
3007 nigel 65 /* If the subpattern set a required byte (or set a first byte that isn't
3008     really the first byte - see above), set it. */
3009 nigel 63
3010     if (subreqbyte >= 0) reqbyte = subreqbyte;
3011 nigel 37 }
3012    
3013 nigel 63 /* For a forward assertion, we take the reqbyte, if set. This can be
3014     helpful if the pattern that follows the assertion doesn't set a different
3015     char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
3016     for an assertion, however because it leads to incorrect effect for patterns
3017     such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
3018     of a firstbyte. This is overcome by a scan at the end if there's no
3019     firstbyte, looking for an asserted first char. */
3020    
3021     else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
3022    
3023 nigel 23 /* Now update the main code pointer to the end of the group. */
3024    
3025     code = tempcode;
3026    
3027     /* Error if hit end of pattern */
3028    
3029 nigel 3 if (*ptr != ')')
3030     {
3031     *errorptr = ERR14;
3032     goto FAILED;
3033     }
3034     break;
3035    
3036     /* Check \ for being a real metacharacter; if not, fall through and handle
3037     it as a data character at the start of a string. Escape items are checked
3038     for validity in the pre-compiling pass. */
3039    
3040     case '\\':
3041 nigel 23 tempptr = ptr;
3042 nigel 25 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
3043 nigel 3
3044     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
3045     are arranged to be the negation of the corresponding OP_values. For the
3046     back references, the values are ESC_REF plus the reference number. Only
3047     back references and those types that consume a character may be repeated.
3048     We can test for values between ESC_b and ESC_Z for the latter; this may
3049     have to change if any new ones are ever created. */
3050    
3051     if (c < 0)
3052     {
3053 nigel 63 if (-c == ESC_Q) /* Handle start of quoted string */
3054     {
3055     if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
3056     else inescq = TRUE;
3057     continue;
3058     }
3059    
3060     /* For metasequences that actually match a character, we disable the
3061     setting of a first character if it hasn't already been set. */
3062    
3063     if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
3064     firstbyte = REQ_NONE;
3065    
3066     /* Set values to reset to if this is followed by a zero repeat. */
3067    
3068     zerofirstbyte = firstbyte;
3069     zeroreqbyte = reqbyte;
3070    
3071     /* Back references are handled specially */
3072    
3073 nigel 3 if (-c >= ESC_REF)
3074     {
3075 nigel 53 int number = -c - ESC_REF;
3076 nigel 3 previous = code;
3077     *code++ = OP_REF;
3078 nigel 63 PUT2INC(code, 0, number);
3079 nigel 3 }
3080     else
3081     {
3082 nigel 23 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
3083 nigel 3 *code++ = -c;
3084     }
3085     continue;
3086     }
3087    
3088 nigel 7 /* Data character: reset and fall through */
3089 nigel 3
3090 nigel 23 ptr = tempptr;
3091 nigel 3 c = '\\';
3092    
3093     /* Handle a run of data characters until a metacharacter is encountered.
3094     The first character is guaranteed not to be whitespace or # when the
3095     extended flag is set. */
3096    
3097     NORMAL_CHAR:
3098     default:
3099     previous = code;
3100     *code = OP_CHARS;
3101     code += 2;
3102     length = 0;
3103    
3104     do
3105     {
3106 nigel 63 /* If in \Q...\E, check for the end; if not, we always have a literal */
3107    
3108     if (inescq)
3109     {
3110     if (c == '\\' && ptr[1] == 'E')
3111     {
3112     inescq = FALSE;
3113     ptr++;
3114     }
3115     else
3116     {
3117     *code++ = c;
3118     length++;
3119     }
3120     continue;
3121     }
3122    
3123     /* Skip white space and comments for /x patterns */
3124    
3125 nigel 3 if ((options & PCRE_EXTENDED) != 0)
3126     {
3127 nigel 25 if ((cd->ctypes[c] & ctype_space) != 0) continue;
3128 nigel 3 if (c == '#')
3129     {
3130 nigel 47 /* The space before the ; is to avoid a warning on a silly compiler
3131     on the Macintosh. */
3132 nigel 53 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
3133 nigel 3 if (c == 0) break;
3134     continue;
3135     }
3136     }
3137    
3138     /* Backslash may introduce a data char or a metacharacter. Escaped items
3139     are checked for validity in the pre-compiling pass. Stop the string
3140     before a metaitem. */
3141    
3142     if (c == '\\')
3143     {
3144 nigel 23 tempptr = ptr;
3145 nigel 25 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
3146 nigel 23 if (c < 0) { ptr = tempptr; break; }
3147 nigel 49
3148     /* If a character is > 127 in UTF-8 mode, we have to turn it into
3149     two or more characters in the UTF-8 encoding. */
3150    
3151     #ifdef SUPPORT_UTF8
3152 nigel 63 if (utf8 && c > 127)
3153 nigel 49 {
3154     uschar buffer[8];
3155     int len = ord2utf8(c, buffer);
3156     for (c = 0; c < len; c++) *code++ = buffer[c];
3157     length += len;
3158     continue;
3159     }
3160     #endif
3161 nigel 3 }
3162    
3163     /* Ordinary character or single-char escape */
3164    
3165     *code++ = c;
3166     length++;
3167     }
3168    
3169     /* This "while" is the end of the "do" above. */
3170    
3171 nigel 49 while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
3172 nigel 3
3173 nigel 63 /* Update the first and last requirements. These are always bytes, even in
3174     UTF-8 mode. However, there is a special case to be considered when there
3175     are only one or two characters. Because this gets messy in UTF-8 mode, the
3176     code is kept separate. When we get here "length" contains the number of
3177     bytes. */
3178 nigel 37
3179 nigel 63 #ifdef SUPPORT_UTF8
3180     if (utf8 && length > 1)
3181     {
3182     uschar *t = previous + 3; /* After this code, t */
3183     while (t < code && (*t & 0xc0) == 0x80) t++; /* follows the 1st char */
3184 nigel 37
3185 nigel 63 /* Handle the case when there is only one multibyte character. It must
3186     have at least two bytes because of the "length > 1" test above. */
3187 nigel 3
3188 nigel 63 if (t == code)
3189     {
3190     /* If no previous first byte, set it from this character, but revert to
3191     none on a zero repeat. */
3192    
3193     if (firstbyte == REQ_UNSET)
3194     {
3195     zerofirstbyte = REQ_NONE;
3196     firstbyte = previous[2];
3197     }
3198    
3199     /* Otherwise, leave the first byte value alone, and don't change it on
3200     a zero repeat */
3201    
3202     else zerofirstbyte = firstbyte;
3203    
3204     /* In both cases, a zero repeat resets the previous required byte */
3205    
3206     zeroreqbyte = reqbyte;
3207     }
3208    
3209     /* Handle the case when there is more than one character. These may be
3210     single-byte or multibyte characters */
3211    
3212     else
3213     {
3214 nigel 67 t = code - 1; /* After this code, t is at the */
3215 nigel 63 while ((*t & 0xc0) == 0x80) t--; /* start of the last character */
3216    
3217     /* If no previous first byte, set it from the first character, and
3218     retain it on a zero repeat (of the last character). The required byte
3219     is reset on a zero repeat, either to the byte before the last
3220     character, unless this is the first byte of the string. In that case,
3221     it reverts to its previous value. */
3222    
3223     if (firstbyte == REQ_UNSET)
3224     {
3225     zerofirstbyte = firstbyte = previous[2] | req_caseopt;
3226 nigel 65 zeroreqbyte = (t - 1 == previous + 2)?
3227     reqbyte : t[-1] | req_caseopt | cd->req_varyopt;
3228 nigel 63 }
3229    
3230     /* If there was a previous first byte, leave it alone, and don't change
3231     it on a zero repeat. The required byte is reset on a zero repeat to the
3232     byte before the last character. */
3233    
3234     else
3235     {
3236     zerofirstbyte = firstbyte;
3237 nigel 65 zeroreqbyte = t[-1] | req_caseopt | cd->req_varyopt;
3238 nigel 63 }
3239     }
3240    
3241     /* In all cases (we know length > 1), the new required byte is the last
3242     byte of the string. */
3243    
3244 nigel 65 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3245 nigel 63 }
3246    
3247     else /* End of UTF-8 coding */
3248     #endif
3249    
3250     /* This is the code for non-UTF-8 operation, either without UTF-8 support,
3251     or when UTF-8 is not enabled. */
3252    
3253     {
3254     /* firstbyte was not previously set; take it from this string */
3255    
3256     if (firstbyte == REQ_UNSET)
3257     {
3258     if (length == 1)
3259     {
3260     zerofirstbyte = REQ_NONE;
3261     firstbyte = previous[2] | req_caseopt;
3262     zeroreqbyte = reqbyte;
3263     }
3264     else
3265     {
3266     zerofirstbyte = firstbyte = previous[2] | req_caseopt;
3267 nigel 65 zeroreqbyte = (length > 2)?
3268     (code[-2] | req_caseopt | cd->req_varyopt) : reqbyte;
3269     reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3270 nigel 63 }
3271     }
3272    
3273     /* firstbyte was previously set */
3274    
3275     else
3276     {
3277     zerofirstbyte = firstbyte;
3278 nigel 65 zeroreqbyte = (length == 1)? reqbyte :
3279     code[-2] | req_caseopt | cd->req_varyopt;
3280     reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3281 nigel 63 }
3282     }
3283    
3284     /* Set the length in the data vector, and advance to the next state. */
3285    
3286 nigel 3 previous[1] = length;
3287 nigel 49 if (length < MAXLIT) ptr--;
3288 nigel 3 break;
3289     }
3290     } /* end of big loop */
3291    
3292     /* Control never reaches here by falling through, only by a goto for all the
3293     error states. Pass back the position in the pattern so that it can be displayed
3294     to the user for diagnosing the error. */
3295    
3296     FAILED:
3297     *ptrptr = ptr;
3298     return FALSE;
3299     }
3300    
3301    
3302    
3303    
3304     /*************************************************
3305     * Compile sequence of alternatives *
3306     *************************************************/
3307    
3308     /* On entry, ptr is pointing past the bracket character, but on return
3309     it points to the closing bracket, or vertical bar, or end of string.
3310     The code variable is pointing at the byte into which the BRA operator has been
3311 nigel 23 stored. If the ims options are changed at the start (for a (?ims: group) or
3312     during any branch, we need to insert an OP_OPT item at the start of every
3313     following branch to ensure they get set correctly at run time, and also pass
3314     the new options into every subsequent branch compile.
3315 nigel 3
3316     Argument:
3317 nigel 63 options option bits, including any changes for this subpattern
3318     oldims previous settings of ims option bits
3319     brackets -> int containing the number of extracting brackets used
3320     codeptr -> the address of the current code pointer
3321     ptrptr -> the address of the current pattern pointer
3322     errorptr -> pointer to error message
3323     lookbehind TRUE if this is a lookbehind assertion
3324     skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER)
3325     firstbyteptr place to put the first required character, or a negative number
3326     reqbyteptr place to put the last required character, or a negative number
3327     bcptr pointer to the chain of currently open branches
3328     cd points to the data block with tables pointers etc.
3329 nigel 3
3330 nigel 23 Returns: TRUE on success
3331 nigel 3 */
3332    
3333     static BOOL
3334 nigel 63 compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3335 nigel 53 const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes,
3336 nigel 63 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
3337 nigel 3 {
3338 nigel 7 const uschar *ptr = *ptrptr;
3339 nigel 3 uschar *code = *codeptr;
3340 nigel 23 uschar *last_branch = code;
3341 nigel 3 uschar *start_bracket = code;
3342 nigel 23 uschar *reverse_count = NULL;
3343 nigel 63 int firstbyte, reqbyte;
3344     int branchfirstbyte, branchreqbyte;
3345     branch_chain bc;
3346 nigel 3
3347 nigel 63 bc.outer = bcptr;
3348     bc.current = code;
3349 nigel 23
3350 nigel 63 firstbyte = reqbyte = REQ_UNSET;
3351    
3352     /* Offset is set zero to mark that this bracket is still open */
3353    
3354     PUT(code, 1, 0);
3355     code += 1 + LINK_SIZE + skipbytes;
3356    
3357 nigel 23 /* Loop for each alternative branch */
3358    
3359 nigel 3 for (;;)
3360     {
3361 nigel 63 /* Handle a change of ims options at the start of the branch */
3362 nigel 3
3363 nigel 63 if ((options & PCRE_IMS) != oldims)
3364 nigel 3 {
3365 nigel 23 *code++ = OP_OPT;
3366 nigel 63 *code++ = options & PCRE_IMS;
3367 nigel 23 }
3368    
3369     /* Set up dummy OP_REVERSE if lookbehind assertion */
3370    
3371     if (lookbehind)
3372     {
3373     *code++ = OP_REVERSE;
3374     reverse_count = code;
3375 nigel 63 PUTINC(code, 0, 0);
3376 nigel 23 }
3377    
3378     /* Now compile the branch */
3379    
3380 nigel 63 if (!compile_branch(&options, brackets, &code, &ptr, errorptr,
3381     &branchfirstbyte, &branchreqbyte, &bc, cd))
3382 nigel 23 {
3383 nigel 3 *ptrptr = ptr;
3384     return FALSE;
3385     }
3386    
3387 nigel 63 /* If this is the first branch, the firstbyte and reqbyte values for the
3388     branch become the values for the regex. */
3389 nigel 3
3390 nigel 63 if (*last_branch != OP_ALT)
3391     {
3392     firstbyte = branchfirstbyte;
3393     reqbyte = branchreqbyte;
3394     }
3395 nigel 3
3396 nigel 63 /* If this is not the first branch, the first char and reqbyte have to
3397 nigel 65 match the values from all the previous branches, except that if the previous
3398     value for reqbyte didn't have REQ_VARY set, it can still match, and we set
3399     REQ_VARY for the regex. */
3400 nigel 37
3401 nigel 63 else
3402 nigel 37 {
3403 nigel 63 /* If we previously had a firstbyte, but it doesn't match the new branch,
3404     we have to abandon the firstbyte for the regex, but if there was previously
3405     no reqbyte, it takes on the value of the old firstbyte. */
3406    
3407     if (firstbyte >= 0 && firstbyte != branchfirstbyte)
3408 nigel 37 {
3409 nigel 63 if (reqbyte < 0) reqbyte = firstbyte;
3410     firstbyte = REQ_NONE;
3411 nigel 37 }
3412    
3413 nigel 63 /* If we (now or from before) have no firstbyte, a firstbyte from the
3414     branch becomes a reqbyte if there isn't a branch reqbyte. */
3415 nigel 37
3416 nigel 63 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
3417     branchreqbyte = branchfirstbyte;
3418 nigel 37
3419 nigel 63 /* Now ensure that the reqbytes match */
3420    
3421 nigel 65 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
3422     reqbyte = REQ_NONE;
3423     else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
3424 nigel 63 }
3425    
3426 nigel 23 /* If lookbehind, check that this branch matches a fixed-length string,
3427     and put the length into the OP_REVERSE item. Temporarily mark the end of
3428     the branch with OP_END. */
3429    
3430     if (lookbehind)
3431     {
3432 nigel 63 int length;
3433 nigel 23 *code = OP_END;
3434 nigel 49 length = find_fixedlength(last_branch, options);
3435 nigel 23 DPRINTF(("fixed length = %d\n", length));
3436     if (length < 0)
3437     {
3438 nigel 63 *errorptr = (length == -2)? ERR36 : ERR25;
3439 nigel 23 *ptrptr = ptr;
3440     return FALSE;
3441     }
3442 nigel 63 PUT(reverse_count, 0, length);
3443 nigel 23 }
3444    
3445 nigel 63 /* Reached end of expression, either ')' or end of pattern. Go back through
3446     the alternative branches and reverse the chain of offsets, with the field in
3447     the BRA item now becoming an offset to the first alternative. If there are
3448     no alternatives, it points to the end of the group. The length in the
3449     terminating ket is always the length of the whole bracketed item. If any of
3450     the ims options were changed inside the group, compile a resetting op-code
3451     following, except at the very end of the pattern. Return leaving the pointer
3452     at the terminating char. */
3453 nigel 3
3454     if (*ptr != '|')
3455     {
3456 nigel 63 int length = code - last_branch;
3457     do
3458 nigel 23 {
3459 nigel 63 int prev_length = GET(last_branch, 1);
3460     PUT(last_branch, 1, length);
3461     length = prev_length;
3462     last_branch -= length;
3463 nigel 23 }
3464 nigel 63 while (length > 0);
3465 nigel 3
3466 nigel 63 /* Fill in the ket */
3467 nigel 3
3468 nigel 63 *code = OP_KET;
3469     PUT(code, 1, code - start_bracket);
3470     code += 1 + LINK_SIZE;
3471 nigel 3
3472 nigel 63 /* Resetting option if needed */
3473 nigel 3
3474 nigel 63 if ((options & PCRE_IMS) != oldims && *ptr == ')')
3475 nigel 23 {
3476 nigel 63 *code++ = OP_OPT;
3477     *code++ = oldims;
3478 nigel 23 }
3479    
3480 nigel 63 /* Set values to pass back */
3481 nigel 23
3482 nigel 63 *codeptr = code;
3483     *ptrptr = ptr;
3484     *firstbyteptr = firstbyte;
3485     *reqbyteptr = reqbyte;
3486     return TRUE;
3487     }
3488 nigel 35
3489 nigel 63 /* Another branch follows; insert an "or" node. Its length field points back
3490     to the previous branch while the bracket remains open. At the end the chain
3491     is reversed. It's done like this so that the start of the bracket has a
3492     zero offset until it is closed, making it possible to detect recursion. */
3493 nigel 23
3494 nigel 63 *code = OP_ALT;
3495     PUT(code, 1, code - last_branch);
3496     bc.current = last_branch = code;
3497     code += 1 + LINK_SIZE;
3498     ptr++;
3499 nigel 23 }
3500     /* Control never reaches here */
3501     }
3502    
3503    
3504    
3505    
3506     /*************************************************
3507 nigel 3 * Check for anchored expression *
3508     *************************************************/
3509    
3510     /* Try to find out if this is an anchored regular expression. Consider each
3511     alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
3512     all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
3513     it's anchored. However, if this is a multiline pattern, then only OP_SOD
3514     counts, since OP_CIRC can match in the middle.
3515    
3516 nigel 63 We can also consider a regex to be anchored if OP_SOM starts all its branches.
3517     This is the code for \G, which means "match at start of match position, taking
3518     into account the match offset".
3519    
3520 nigel 33 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
3521     because that will try the rest of the pattern at all possible matching points,
3522 nigel 63 so there is no point trying again.... er ....
3523 nigel 3
3524 nigel 63 .... except when the .* appears inside capturing parentheses, and there is a
3525     subsequent back reference to those parentheses. We haven't enough information
3526     to catch that case precisely.
3527    
3528     At first, the best we could do was to detect when .* was in capturing brackets
3529     and the highest back reference was greater than or equal to that level.
3530     However, by keeping a bitmap of the first 31 back references, we can catch some
3531     of the more common cases more precisely.
3532    
3533 nigel 23 Arguments:
3534 nigel 63 code points to start of expression (the bracket)
3535     options points to the options setting
3536     bracket_map a bitmap of which brackets we are inside while testing; this
3537     handles up to substring 31; after that we just have to take
3538     the less precise approach
3539     backref_map the back reference bitmap
3540 nigel 23
3541     Returns: TRUE or FALSE
3542 nigel 3 */
3543    
3544     static BOOL
3545 nigel 63 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
3546     unsigned int backref_map)
3547 nigel 3 {
3548     do {
3549 nigel 63 const uschar *scode =
3550     first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE);
3551 nigel 23 register int op = *scode;
3552 nigel 63
3553     /* Capturing brackets */
3554    
3555     if (op > OP_BRA)
3556     {
3557     int new_map;
3558     op -= OP_BRA;
3559     if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3560     new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3561     if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
3562     }
3563    
3564     /* Other brackets */
3565    
3566     else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3567     {
3568     if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
3569     }
3570    
3571     /* .* is not anchored unless DOTALL is set and it isn't in brackets that
3572     are or may be referenced. */
3573    
3574 nigel 33 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
3575     (*options & PCRE_DOTALL) != 0)
3576 nigel 63 {
3577     if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3578     }
3579    
3580     /* Check for explicit anchoring */
3581    
3582     else if (op != OP_SOD && op != OP_SOM &&
3583 nigel 23 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
3584     return FALSE;
3585 nigel 63 code += GET(code, 1);
3586 nigel 3 }
3587 nigel 63 while (*code == OP_ALT); /* Loop for each alternative */
3588 nigel 3 return TRUE;
3589     }
3590    
3591    
3592    
3593     /*************************************************
3594 nigel 33 * Check for starting with ^ or .* *
3595 nigel 3 *************************************************/
3596    
3597 nigel 33 /* This is called to find out if every branch starts with ^ or .* so that
3598     "first char" processing can be done to speed things up in multiline
3599     matching and for non-DOTALL patterns that start with .* (which must start at
3600 nigel 63 the beginning or after \n). As in the case of is_anchored() (see above), we
3601     have to take account of back references to capturing brackets that contain .*
3602     because in that case we can't make the assumption.
3603 nigel 3
3604 nigel 63 Arguments:
3605     code points to start of expression (the bracket)
3606     bracket_map a bitmap of which brackets we are inside while testing; this
3607     handles up to substring 31; after that we just have to take
3608     the less precise approach
3609     backref_map the back reference bitmap
3610    
3611     Returns: TRUE or FALSE
3612 nigel 3 */
3613    
3614     static BOOL
3615 nigel 63 is_startline(const uschar *code, unsigned int bracket_map,
3616     unsigned int backref_map)
3617 nigel 3 {
3618     do {
3619 nigel 63 const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0);
3620 nigel 23 register int op = *scode;
3621 nigel 63
3622     /* Capturing brackets */
3623    
3624     if (op > OP_BRA)
3625     {
3626     int new_map;
3627     op -= OP_BRA;
3628     if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3629     new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3630     if (!is_startline(scode, new_map, backref_map)) return FALSE;
3631     }
3632    
3633     /* Other brackets */
3634    
3635     else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3636     { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
3637    
3638     /* .* is not anchored unless DOTALL is set and it isn't in brackets that
3639     may be referenced. */
3640    
3641 nigel 33 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
3642 nigel 63 {
3643     if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3644     }
3645    
3646     /* Check for explicit circumflex */
3647    
3648 nigel 23 else if (op != OP_CIRC) return FALSE;
3649 nigel 63 code += GET(code, 1);
3650 nigel 3 }
3651 nigel 63 while (*code == OP_ALT); /* Loop for each alternative */
3652 nigel 3 return TRUE;
3653     }
3654    
3655    
3656    
3657     /*************************************************
3658 nigel 63 * Check for asserted fixed first char *
3659 nigel 3 *************************************************/
3660    
3661 nigel 63 /* During compilation, the "first char" settings from forward assertions are
3662     discarded, because they can cause conflicts with actual literals that follow.
3663     However, if we end up without a first char setting for an unanchored pattern,
3664     it is worth scanning the regex to see if there is an initial asserted first
3665     char. If all branches start with the same asserted char, or with a bracket all
3666     of whose alternatives start with the same asserted char (recurse ad lib), then
3667     we return that char, otherwise -1.
3668 nigel 3
3669 nigel 23 Arguments:
3670     code points to start of expression (the bracket)
3671     options pointer to the options (used to check casing changes)
3672 nigel 63 inassert TRUE if in an assertion
3673 nigel 23
3674     Returns: -1 or the fixed first char
3675 nigel 3 */
3676    
3677     static int
3678 nigel 63 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
3679 nigel 3 {
3680     register int c = -1;
3681 nigel 23 do {
3682     int d;
3683 nigel 63 const uschar *scode =
3684     first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS);
3685 nigel 23 register int op = *scode;
3686 nigel 3
3687 nigel 23 if (op >= OP_BRA) op = OP_BRA;
3688 nigel 3
3689 nigel 23 switch(op)
3690     {
3691     default:
3692     return -1;
3693 nigel 3
3694 nigel 23 case OP_BRA:
3695     case OP_ASSERT:
3696     case OP_ONCE:
3697     case OP_COND:
3698 nigel 63 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
3699     return -1;
3700 nigel 23 if (c < 0) c = d; else if (c != d) return -1;
3701     break;
3702 nigel 3
3703 nigel 23 case OP_EXACT: /* Fall through */
3704     scode++;
3705 nigel 3
3706 nigel 23 case OP_CHARS: /* Fall through */
3707     scode++;
3708    
3709     case OP_PLUS:
3710     case OP_MINPLUS:
3711 nigel 63 if (!inassert) return -1;
3712     if (c < 0)
3713     {
3714     c = scode[1];
3715     if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
3716     }
3717     else if (c != scode[1]) return -1;
3718 nigel 23 break;
3719     }
3720    
3721 nigel 63 code += GET(code, 1);
3722 nigel 23 }
3723 nigel 3 while (*code == OP_ALT);
3724     return c;
3725     }
3726    
3727    
3728    
3729 nigel 23
3730 nigel 3 /*************************************************
3731     * Compile a Regular Expression *
3732     *************************************************/
3733    
3734     /* This function takes a string and returns a pointer to a block of store
3735     holding a compiled version of the expression.
3736    
3737     Arguments:
3738     pattern the regular expression
3739     options various option bits
3740     errorptr pointer to pointer to error text
3741     erroroffset ptr offset in pattern where error was detected
3742 nigel 25 tables pointer to character tables or NULL
3743 nigel 3
3744     Returns: pointer to compiled data block, or NULL on error,
3745     with errorptr and erroroffset set
3746     */
3747    
3748     pcre *
3749 nigel 7 pcre_compile(const char *pattern, int options, const char **errorptr,
3750 nigel 25 int *erroroffset, const unsigned char *tables)
3751 nigel 3 {
3752     real_pcre *re;
3753 nigel 63 int length = 1 + LINK_SIZE; /* For initial BRA plus length */
3754 nigel 3 int runlength;
3755 nigel 63 int c, firstbyte, reqbyte;
3756 nigel 3 int bracount = 0;
3757 nigel 23 int branch_extra = 0;
3758     int branch_newextra;
3759 nigel 63 int item_count = -1;
3760     int name_count = 0;
3761     int max_name_size = 0;
3762     #ifdef SUPPORT_UTF8
3763     int lastcharlength = 0;
3764     BOOL utf8;
3765     BOOL class_utf8;
3766     #endif
3767     BOOL inescq = FALSE;
3768 nigel 7 unsigned int brastackptr = 0;
3769 nigel 43 size_t size;
3770 nigel 7 uschar *code;
3771 nigel 63 const uschar *codestart;
3772 nigel 7 const uschar *ptr;
3773 nigel 25 compile_data compile_block;
3774 nigel 23 int brastack[BRASTACK_SIZE];
3775     uschar bralenstack[BRASTACK_SIZE];
3776 nigel 3
3777     /* We can't pass back an error message if errorptr is NULL; I guess the best we
3778     can do is just return NULL. */
3779    
3780     if (errorptr == NULL) return NULL;
3781     *errorptr = NULL;
3782    
3783     /* However, we can give a message for this error */
3784    
3785     if (erroroffset == NULL)
3786     {
3787     *errorptr = ERR16;
3788     return NULL;
3789     }
3790     *erroroffset = 0;
3791    
3792 nigel 63 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
3793    
3794     #ifdef SUPPORT_UTF8
3795     utf8 = (options & PCRE_UTF8) != 0;
3796     #else
3797     if ((options & PCRE_UTF8) != 0)
3798     {
3799     *errorptr = ERR32;
3800     return NULL;
3801     }
3802     #endif
3803    
3804 nigel 3 if ((options & ~PUBLIC_OPTIONS) != 0)
3805     {
3806     *errorptr = ERR17;
3807     return NULL;
3808     }
3809    
3810 nigel 25 /* Set up pointers to the individual character tables */
3811    
3812     if (tables == NULL) tables = pcre_default_tables;
3813     compile_block.lcc = tables + lcc_offset;
3814     compile_block.fcc = tables + fcc_offset;
3815     compile_block.cbits = tables + cbits_offset;
3816     compile_block.ctypes = tables + ctypes_offset;
3817    
3818 nigel 63 /* Maximum back reference and backref bitmap. This is updated for numeric
3819     references during the first pass, but for named references during the actual
3820     compile pass. The bitmap records up to 31 back references to help in deciding
3821     whether (.*) can be treated as anchored or not. */
3822    
3823     compile_block.top_backref = 0;
3824     compile_block.backref_map = 0;
3825    
3826 nigel 25 /* Reflect pattern for debugging output */
3827    
3828 nigel 9 DPRINTF(("------------------------------------------------------------------\n"));
3829     DPRINTF(("%s\n", pattern));
3830 nigel 3
3831     /* The first thing to do is to make a pass over the pattern to compute the
3832     amount of store required to hold the compiled code. This does not have to be
3833     perfect as long as errors are overestimates. At the same time we can detect any
3834 nigel 63 flag settings right at the start, and extract them. Make an attempt to correct
3835     for any counted white space if an "extended" flag setting appears late in the
3836     pattern. We can't be so clever for #-comments. */
3837 nigel 3
3838 nigel 7 ptr = (const uschar *)(pattern - 1);
3839 nigel 3 while ((c = *(++ptr)) != 0)
3840     {
3841     int min, max;
3842 nigel 63 int class_optcount;
3843 nigel 53 int bracket_length;
3844 nigel 63 int duplength;
3845 nigel 3
3846 nigel 63 /* If we are inside a \Q...\E sequence, all chars are literal */
3847    
3848     if (inescq) goto NORMAL_CHAR;
3849    
3850     /* Otherwise, first check for ignored whitespace and comments */
3851    
3852 nigel 23 if ((options & PCRE_EXTENDED) != 0)
3853 nigel 3 {
3854 nigel 25 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
3855 nigel 23 if (c == '#')
3856     {
3857 nigel 47 /* The space before the ; is to avoid a warning on a silly compiler
3858     on the Macintosh. */
3859 nigel 53 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
3860 nigel 63 if (c == 0) break;
3861 nigel 23 continue;
3862     }
3863 nigel 3 }
3864    
3865 nigel 63 item_count++; /* Is zero for the first non-comment item */
3866    
3867 nigel 3 switch(c)
3868     {
3869     /* A backslashed item may be an escaped "normal" character or a
3870     character type. For a "normal" character, put the pointers and
3871     character back so that tests for whitespace etc. in the input
3872     are done correctly. */
3873    
3874     case '\\':
3875     {
3876 nigel 7 const uschar *save_ptr = ptr;
3877 nigel 25 c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
3878 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3879     if (c >= 0)
3880     {
3881     ptr = save_ptr;
3882     c = '\\';
3883     goto NORMAL_CHAR;
3884     }
3885     }
3886 nigel 63
3887     /* If \Q, enter "literal" mode */
3888    
3889     if (-c == ESC_Q)
3890     {
3891     inescq = TRUE;
3892     continue;
3893     }
3894    
3895     /* Other escapes need one byte, and are of length one for repeats */
3896    
3897 nigel 3 length++;
3898 nigel 63 #ifdef SUPPORT_UTF8
3899     lastcharlength = 1;
3900     #endif
3901 nigel 3
3902 nigel 53 /* A back reference needs an additional 2 bytes, plus either one or 5
3903 nigel 3 bytes for a repeat. We also need to keep the value of the highest
3904     back reference. */
3905    
3906     if (c <= -ESC_REF)
3907     {
3908     int refnum = -c - ESC_REF;
3909 nigel 63 compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
3910     if (refnum > compile_block.top_backref)
3911     compile_block.top_backref = refnum;
3912 nigel 53 length += 2; /* For single back reference */
3913 nigel 25 if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
3914 nigel 3 {
3915 nigel 25 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
3916 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3917     if ((min == 0 && (max == 1 || max == -1)) ||
3918     (min == 1 && max == -1))
3919     length++;
3920     else length += 5;
3921     if (ptr[1] == '?') ptr++;
3922     }
3923     }
3924     continue;
3925    
3926 nigel 63 case '^': /* Single-byte metacharacters */
3927 nigel 3 case '.':
3928     case '$':
3929     length++;
3930 nigel 63 #ifdef SUPPORT_UTF8
3931     lastcharlength = 1;
3932     #endif
3933 nigel 3 continue;
3934    
3935 nigel 63 case '*': /* These repeats won't be after brackets; */
3936     case '+': /* those are handled separately */
3937     case '?':
3938     length++;
3939     goto POSESSIVE; /* A few lines below */
3940 nigel 3
3941 nigel 63 /* This covers the cases of braced repeats after a single char, metachar,
3942     class, or back reference. */
3943    
3944 nigel 3 case '{':
3945 nigel 25 if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
3946     ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
3947 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3948 nigel 63
3949     /* These special cases just insert one extra opcode */
3950    
3951 nigel 3 if ((min == 0 && (max == 1 || max == -1)) ||
3952     (min == 1 && max == -1))
3953     length++;
3954 nigel 63
3955     /* These cases might insert additional copies of a preceding character. */
3956    
3957 nigel 3 else
3958     {
3959 nigel 63 #ifdef SUPPORT_UTF8
3960     /* In UTF-8 mode, we should find the length in lastcharlength */
3961     if (utf8)
3962     {
3963     if (min != 1)
3964     {
3965     length -= lastcharlength; /* Uncount the original char or metachar */
3966     if (min > 0) length += 3 + lastcharlength;
3967     }
3968     length += lastcharlength + ((max > 0)? 3 : 1);
3969     }
3970     else
3971     #endif
3972    
3973     /* Not UTF-8 mode: all characters are one byte */
3974     {
3975     if (min != 1)
3976     {
3977     length--; /* Uncount the original char or metachar */
3978     if (min > 0) length += 4;
3979     }
3980    
3981     length += (max > 0)? 4 : 2;
3982     }
3983 nigel 3 }
3984 nigel 63
3985     if (ptr[1] == '?') ptr++; /* Needs no extra length */
3986    
3987     POSESSIVE: /* Test for possessive quantifier */
3988     if (ptr[1] == '+')
3989     {
3990     ptr++;
3991     length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */
3992     }
3993 nigel 3 continue;
3994    
3995 nigel 23 /* An alternation contains an offset to the next branch or ket. If any ims
3996     options changed in the previous branch(es), and/or if we are in a
3997     lookbehind assertion, extra space will be needed at the start of the
3998     branch. This is handled by branch_extra. */
3999    
4000 nigel 3 case '|':
4001 nigel 63 length += 1 + LINK_SIZE + branch_extra;
4002 nigel 3 continue;
4003    
4004 nigel 63 /* A character class uses 33 characters provided that all the character
4005     values are less than 256. Otherwise, it uses a bit map for low valued
4006     characters, and individual items for others. Don't worry about character
4007     types that aren't allowed in classes - they'll get picked up during the
4008     compile. A character class that contains only one single-byte character
4009     uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
4010     where we can. (In UTF-8 mode we can do this only for chars < 128.) */
4011 nigel 3
4012     case '[':
4013 nigel 63 class_optcount = 0;
4014    
4015     #ifdef SUPPORT_UTF8
4016     class_utf8 = FALSE;
4017     #endif
4018    
4019 nigel 3 if (*(++ptr) == '^') ptr++;
4020 nigel 63
4021     /* Written as a "do" so that an initial ']' is taken as data */
4022    
4023     if (*ptr != 0) do
4024 nigel 3 {
4025 nigel 63 /* Inside \Q...\E everything is literal except \E */
4026    
4027     if (inescq)
4028     {
4029     if (*ptr != '\\' || ptr[1] != 'E') goto NON_SPECIAL_CHARACTER;
4030     inescq = FALSE;
4031     ptr += 1;
4032     continue;
4033     }
4034    
4035     /* Outside \Q...\E, check for escapes */
4036    
4037 nigel 3 if (*ptr == '\\')
4038     {
4039 nigel 63 #ifdef SUPPORT_UTF8
4040     int prevchar = ptr[-1];
4041     #endif
4042 nigel 25 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
4043     &compile_block);
4044 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4045 nigel 63
4046     /* \b is backspace inside a class */
4047    
4048     if (-ch == ESC_b) ch = '\b';
4049    
4050     /* \Q enters quoting mode */
4051    
4052     if (-ch == ESC_Q)
4053     {
4054     inescq = TRUE;
4055     continue;
4056     }
4057    
4058     /* Handle escapes that turn into characters */
4059    
4060     if (ch >= 0)
4061     {
4062     #ifdef SUPPORT_UTF8
4063     if (utf8)
4064     {
4065     if (ch > 127) class_optcount = 10; /* Ensure > 1 */
4066     if (ch > 255)
4067     {
4068     uschar buffer[6];
4069     if (!class_utf8)
4070     {
4071     class_utf8 = TRUE;
4072     length += LINK_SIZE + 1 + 1;
4073     }
4074     length += 1 + ord2utf8(ch, buffer);
4075    
4076     /* If this wide character is preceded by '-', add an extra 2 to
4077     the length in case the previous character was < 128, because in
4078     this case the whole range will be put into the list. */
4079    
4080     if (prevchar == '-') length += 2;
4081     }
4082     }
4083     #endif
4084     class_optcount++; /* for possible optimization */
4085     }
4086     else class_optcount = 10; /* \d, \s etc; make sure > 1 */
4087 nigel 3 }
4088 nigel 63
4089     /* Check the syntax for POSIX stuff. The bits we actually handle are
4090     checked during the real compile phase. */
4091    
4092     else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))
4093     {
4094     ptr++;
4095     class_optcount = 10; /* Make sure > 1 */
4096     }
4097    
4098     /* Anything else just increments the possible optimization count. If
4099     there are wide characters, we are going to have to use an XCLASS. */
4100    
4101     else
4102     {
4103     NON_SPECIAL_CHARACTER:
4104     class_optcount++;
4105    
4106     #ifdef SUPPORT_UTF8