/[pcre]/code/trunk/pcre.c
ViewVC logotype

Contents of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 71 - (hide annotations) (download)
Sat Feb 24 21:40:24 2007 UTC (6 years, 3 months ago) by nigel
File MIME type: text/plain
File size: 233449 byte(s)
Load pcre-4.4 into code/trunk.

1 nigel 3 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /*
6     This is a library of functions to support regular expressions whose syntax
7     and semantics are as close as possible to those of the Perl 5 language. See
8     the file Tech.Notes for some information on the internals.
9    
10     Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12 nigel 63 Copyright (c) 1997-2003 University of Cambridge
13 nigel 3
14     -----------------------------------------------------------------------------
15     Permission is granted to anyone to use this software for any purpose on any
16     computer system, and to redistribute it freely, subject to the following
17     restrictions:
18    
19     1. This software is distributed in the hope that it will be useful,
20     but WITHOUT ANY WARRANTY; without even the implied warranty of
21     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22    
23     2. The origin of this software must not be misrepresented, either by
24     explicit claim or by omission.
25    
26     3. Altered versions must be plainly marked as such, and must not be
27     misrepresented as being the original software.
28 nigel 29
29     4. If PCRE is embedded in any software that is released under the GNU
30     General Purpose Licence (GPL), then the terms of that licence shall
31     supersede any condition above with which it is incompatible.
32 nigel 3 -----------------------------------------------------------------------------
33     */
34    
35     /* Define DEBUG to get debugging output on stdout. */
36    
37     /* #define DEBUG */
38    
39 nigel 23 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
40     inline, and there are *still* stupid compilers about that don't like indented
41     pre-processor statements. I suppose it's only been 10 years... */
42 nigel 3
43 nigel 9 #ifdef DEBUG
44     #define DPRINTF(p) printf p
45     #else
46     #define DPRINTF(p) /*nothing*/
47     #endif
48    
/* Include <limits.h> explicitly (INT_MAX is used when reading escape
sequences), then the internals header, which itself includes Standard C headers
plus the external pcre header. */

#include <limits.h>
#include "internal.h"
53    
54    
55 nigel 15 /* Allow compilation as C++ source code, should anybody want to do that. */
56    
57     #ifdef __cplusplus
58     #define class pcre_class
59     #endif
60    
61    
62 nigel 53 /* Maximum number of items on the nested bracket stacks at compile time. This
63     applies to the nesting of all kinds of parentheses. It does not limit
64     un-nested, non-capturing parentheses. This number can be made bigger if
65     necessary - it is used to dimension one int and one unsigned char vector at
66     compile time. */
67 nigel 23
68     #define BRASTACK_SIZE 200
69    
70    
71 nigel 63 /* Maximum number of ints of offset to save on the stack for recursive calls.
72     If the offset vector is bigger, malloc is used. This should be a multiple of 3,
73     because the offset vector is always a multiple of 3 long. */
74    
75     #define REC_STACK_SAVE_MAX 30
76    
77    
78 nigel 49 /* The number of bytes in a literal character string above which we can't add
79 nigel 63 any more is set at 250 in order to allow for UTF-8 characters. (In theory it
80     could be 255 when UTF-8 support is excluded, but that means that some of the
81     test output would be different, which just complicates things.) */
82 nigel 49
83     #define MAXLIT 250
84    
85    
86 nigel 65 /* The maximum remaining length of subject we are prepared to search for a
87     req_byte match. */
88    
89     #define REQ_BYTE_MAX 1000
90    
91    
92 nigel 63 /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
93     the definition is next to the definition of the opcodes in internal.h. */
94    
95     static uschar OP_lengths[] = { OP_LENGTHS };
96    
97 nigel 3 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
98    
99 nigel 15 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
100     static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
101 nigel 3
102     /* Table for handling escaped characters in the range '0'-'z'. Positive returns
103     are simple data values; negative values are for special things like \d and so
104     on. Zero means further processing is needed (for things like \x), or the escape
105     is invalid. */
106    
107 nigel 15 static const short int escapes[] = {
108 nigel 3 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
109     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
110 nigel 63 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
111 nigel 3 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
112 nigel 63 0, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
113 nigel 3 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
114 nigel 63 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
115     0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
116 nigel 69 0, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
117 nigel 23 0, 0, -ESC_z /* x - z */
118 nigel 3 };
119    
/* Tables of names of POSIX character classes and their lengths. The list is
terminated by a zero length entry. The first three must be alpha, lower, upper
(in that order, matching posix_class_maps below), as this is assumed for
handling case independence. */
123    
124     static const char *posix_names[] = {
125     "alpha", "lower", "upper",
126 nigel 63 "alnum", "ascii", "blank", "cntrl", "digit", "graph",
127 nigel 43 "print", "punct", "space", "word", "xdigit" };
128    
129     static const uschar posix_name_lengths[] = {
130 nigel 63 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
131 nigel 43
132     /* Table of class bit maps for each POSIX class; up to three may be combined
133 nigel 63 to form the class. The table for [:blank:] is dynamically modified to remove
134     the vertical space characters. */
135 nigel 43
136     static const int posix_class_maps[] = {
137     cbit_lower, cbit_upper, -1, /* alpha */
138     cbit_lower, -1, -1, /* lower */
139     cbit_upper, -1, -1, /* upper */
140     cbit_digit, cbit_lower, cbit_upper, /* alnum */
141     cbit_print, cbit_cntrl, -1, /* ascii */
142 nigel 63 cbit_space, -1, -1, /* blank - a GNU extension */
143 nigel 43 cbit_cntrl, -1, -1, /* cntrl */
144     cbit_digit, -1, -1, /* digit */
145     cbit_graph, -1, -1, /* graph */
146     cbit_print, -1, -1, /* print */
147     cbit_punct, -1, -1, /* punct */
148     cbit_space, -1, -1, /* space */
149 nigel 63 cbit_word, -1, -1, /* word - a Perl extension */
150 nigel 43 cbit_xdigit,-1, -1 /* xdigit */
151     };
152    
153 nigel 69 /* Table to identify ASCII digits and hex digits. This is used when compiling
154     patterns. Note that the tables in chartables are dependent on the locale, and
155     may mark arbitrary characters as digits - but the PCRE compiling code expects
156     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
157     a private table here. It costs 256 bytes, but it is a lot faster than doing
158     character value tests (at least in some simple cases I timed), and in some
159     applications one wants PCRE to compile efficiently as well as match
160     efficiently.
161 nigel 43
162 nigel 69 For convenience, we use the same bit definitions as in chartables:
163    
164     0x04 decimal digit
165     0x08 hexadecimal digit
166    
167     Then we can use ctype_digit and ctype_xdigit in the code. */
168    
169     static const unsigned char digitab[] =
170     {
171     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
172     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
173     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
174     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
175     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
176     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
177     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
178     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
179     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
180     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
181     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
182     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
183     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
184     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
185     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
186     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
187     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
188     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
189     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
190     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
191     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
192     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
193     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
194     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
195     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
196     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
197     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
198     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
199     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
200     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
201     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
202     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
203    
204 nigel 3 /* Definition to allow mutual recursion */
205    
206 nigel 13 static BOOL
207 nigel 23 compile_regex(int, int, int *, uschar **, const uschar **, const char **,
208 nigel 63 BOOL, int, int *, int *, branch_chain *, compile_data *);
209 nigel 3
210 nigel 47 /* Structure for building a chain of data that actually lives on the
211     stack, for holding the values of the subject pointer at the start of each
212     subpattern, so as to detect when an empty string has been matched by a
213     subpattern - to break infinite loops. */
214 nigel 3
215 nigel 47 typedef struct eptrblock {
216     struct eptrblock *prev;
217     const uschar *saved_eptr;
218     } eptrblock;
219 nigel 3
220 nigel 47 /* Flag bits for the match() function */
221    
222     #define match_condassert 0x01 /* Called to check a condition assertion */
223     #define match_isgroup 0x02 /* Set if start of bracketed group */
224    
225 nigel 63 /* Non-error returns from the match() function. Error returns are externally
226     defined PCRE_ERROR_xxx codes, which are all negative. */
227 nigel 47
228 nigel 63 #define MATCH_MATCH 1
229     #define MATCH_NOMATCH 0
230 nigel 47
231 nigel 63
232    
233 nigel 3 /*************************************************
234     * Global variables *
235     *************************************************/
236    
237     /* PCRE is thread-clean and doesn't use any global variables in the normal
238     sense. However, it calls memory allocation and free functions via the two
239 nigel 63 indirections below, and it can optionally do callouts. These values can be
240     changed by the caller, but are shared between all threads. However, when
241     compiling for Virtual Pascal, things are done differently (see pcre.in). */
242 nigel 3
243 nigel 63 #ifndef VPCOMPAT
244 nigel 71 #ifdef __cplusplus
245     extern "C" void *(*pcre_malloc)(size_t) = malloc;
246     extern "C" void (*pcre_free)(void *) = free;
247     extern "C" int (*pcre_callout)(pcre_callout_block *) = NULL;
248     #else
249 nigel 3 void *(*pcre_malloc)(size_t) = malloc;
250     void (*pcre_free)(void *) = free;
251 nigel 63 int (*pcre_callout)(pcre_callout_block *) = NULL;
252     #endif
253 nigel 71 #endif
254 nigel 3
255    
256 nigel 49 /*************************************************
257     * Macros and tables for character handling *
258     *************************************************/
259 nigel 3
260 nigel 49 /* When UTF-8 encoding is being used, a character is no longer just a single
261     byte. The macros for character handling generate simple sequences when used in
262     byte-mode, and more complicated ones for UTF-8 characters. */
263    
264     #ifndef SUPPORT_UTF8
265 nigel 63 #define GETCHAR(c, eptr) c = *eptr;
266 nigel 49 #define GETCHARINC(c, eptr) c = *eptr++;
267 nigel 63 #define GETCHARINCTEST(c, eptr) c = *eptr++;
268 nigel 49 #define GETCHARLEN(c, eptr, len) c = *eptr;
269     #define BACKCHAR(eptr)
270    
271     #else /* SUPPORT_UTF8 */
272    
273 nigel 63 /* Get the next UTF-8 character, not advancing the pointer. This is called when
274     we know we are in UTF-8 mode. */
275 nigel 49
276 nigel 63 #define GETCHAR(c, eptr) \
277     c = *eptr; \
278     if ((c & 0xc0) == 0xc0) \
279     { \
280 nigel 67 int gcii; \
281     int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
282     int gcss = 6*gcaa; \
283     c = (c & utf8_table3[gcaa]) << gcss; \
284     for (gcii = 1; gcii <= gcaa; gcii++) \
285 nigel 63 { \
286 nigel 67 gcss -= 6; \
287     c |= (eptr[gcii] & 0x3f) << gcss; \
288 nigel 63 } \
289     }
290    
291     /* Get the next UTF-8 character, advancing the pointer. This is called when we
292     know we are in UTF-8 mode. */
293    
294 nigel 49 #define GETCHARINC(c, eptr) \
295     c = *eptr++; \
296 nigel 63 if ((c & 0xc0) == 0xc0) \
297     { \
298 nigel 67 int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
299     int gcss = 6*gcaa; \
300     c = (c & utf8_table3[gcaa]) << gcss; \
301     while (gcaa-- > 0) \
302 nigel 63 { \
303 nigel 67 gcss -= 6; \
304     c |= (*eptr++ & 0x3f) << gcss; \
305 nigel 63 } \
306     }
307    
308     /* Get the next character, testing for UTF-8 mode, and advancing the pointer */
309    
310     #define GETCHARINCTEST(c, eptr) \
311     c = *eptr++; \
312 nigel 49 if (md->utf8 && (c & 0xc0) == 0xc0) \
313     { \
314 nigel 67 int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
315     int gcss = 6*gcaa; \
316     c = (c & utf8_table3[gcaa]) << gcss; \
317     while (gcaa-- > 0) \
318 nigel 49 { \
319 nigel 67 gcss -= 6; \
320     c |= (*eptr++ & 0x3f) << gcss; \
321 nigel 49 } \
322     }
323    
324 nigel 63 /* Get the next UTF-8 character, not advancing the pointer, incrementing length
325     if there are extra bytes. This is called when we know we are in UTF-8 mode. */
326 nigel 49
327     #define GETCHARLEN(c, eptr, len) \
328     c = *eptr; \
329 nigel 63 if ((c & 0xc0) == 0xc0) \
330 nigel 49 { \
331 nigel 67 int gcii; \
332     int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
333     int gcss = 6*gcaa; \
334     c = (c & utf8_table3[gcaa]) << gcss; \
335     for (gcii = 1; gcii <= gcaa; gcii++) \
336 nigel 49 { \
337 nigel 67 gcss -= 6; \
338     c |= (eptr[gcii] & 0x3f) << gcss; \
339 nigel 49 } \
340 nigel 67 len += gcaa; \
341 nigel 49 }
342    
343     /* If the pointer is not at the start of a character, move it back until
344 nigel 63 it is. Called only in UTF-8 mode. */
345 nigel 49
346     #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
347    
348     #endif
349    
350    
351    
352 nigel 3 /*************************************************
353 nigel 25 * Default character tables *
354     *************************************************/
355    
356     /* A default set of character tables is included in the PCRE binary. Its source
357     is built by the maketables auxiliary program, which uses the default C ctypes
358     functions, and put in the file chartables.c. These tables are used by PCRE
359     whenever the caller of pcre_compile() does not provide an alternate set of
360     tables. */
361    
362     #include "chartables.c"
363    
364    
365    
366 nigel 49 #ifdef SUPPORT_UTF8
367 nigel 25 /*************************************************
368 nigel 49 * Tables for UTF-8 support *
369     *************************************************/
370    
371     /* These are the breakpoints for different numbers of bytes in a UTF-8
372     character. */
373    
374 nigel 69 static const int utf8_table1[] =
375     { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
376 nigel 49
377     /* These are the indicator bits and the mask for the data bits to set in the
378     first byte of a character, indexed by the number of additional bytes. */
379    
380 nigel 69 static const int utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
381     static const int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
382 nigel 49
/* Table of the number of extra bytes, indexed by the first byte masked with
0x3f. The highest number for a valid UTF-8 character is in fact 0x3d. */
386    
387 nigel 69 static const uschar utf8_table4[] = {
388 nigel 49 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
389     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
390     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
391     3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
392    
393    
394     /*************************************************
395     * Convert character value to UTF-8 *
396     *************************************************/
397    
/* This function takes an integer value in the range 0 - 0x7fffffff
and encodes it as a UTF-8 character in 1 to 6 bytes.

Arguments:
  cvalue     the character value
  buffer     pointer to buffer for result - at least 6 bytes long

Returns:     number of bytes placed in the buffer
*/
407    
408     static int
409     ord2utf8(int cvalue, uschar *buffer)
410     {
411     register int i, j;
412     for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
413     if (cvalue <= utf8_table1[i]) break;
414 nigel 59 buffer += i;
415     for (j = i; j > 0; j--)
416     {
417     *buffer-- = 0x80 | (cvalue & 0x3f);
418     cvalue >>= 6;
419     }
420     *buffer = utf8_table2[i] | cvalue;
421 nigel 49 return i + 1;
422     }
423     #endif
424    
425    
426    
427     /*************************************************
428 nigel 63 * Print compiled regex *
429     *************************************************/
430    
431     /* The code for doing this is held in a separate file that is also included in
432     pcretest.c. It defines a function called print_internals(). */
433    
434     #ifdef DEBUG
435     #include "printint.c"
436     #endif
437    
438    
439    
440     /*************************************************
441 nigel 3 * Return version string *
442     *************************************************/
443    
/* Two-level stringification: XSTRING() expands its argument before STRING()
turns the result into a string literal. */

#define STRING(a)  #a
#define XSTRING(s) STRING(s)

/* Returns a pointer to a static string of the form "<major>.<minor> <date>",
assembled at compile time from PCRE_MAJOR, PCRE_MINOR and PCRE_DATE. */

const char *
pcre_version(void)
{
static const char version_text[] =
  XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
return version_text;
}
452    
453    
454    
455    
456     /*************************************************
457 nigel 43 * (Obsolete) Return info about compiled pattern *
458 nigel 3 *************************************************/
459    
/* This is the original "info" function. It picks potentially useful data out
of the private structure, but its interface was too rigid. It remains for
backwards compatibility. The public options are passed back in an int - though
the re->options field has been expanded to a long int, all the public options
are at the low end of it, and so even on 16-bit systems this will still be OK.
Therefore, I haven't changed the API for pcre_info().

Arguments:
  external_re   points to compiled code
  optptr        where to pass back the options
  first_byte    where to pass back the first character,
                or -1 if multiline and all branches start ^,
                or -2 otherwise

Returns:        number of capturing subpatterns
                or negative values on error
*/
477    
478     int
479 nigel 63 pcre_info(const pcre *external_re, int *optptr, int *first_byte)
480 nigel 3 {
481 nigel 7 const real_pcre *re = (const real_pcre *)external_re;
482 nigel 3 if (re == NULL) return PCRE_ERROR_NULL;
483     if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
484 nigel 37 if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
485 nigel 63 if (first_byte != NULL)
486     *first_byte = ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
487 nigel 3 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
488     return re->top_bracket;
489     }
490    
491    
492    
493 nigel 43 /*************************************************
494     * Return info about compiled pattern *
495     *************************************************/
496 nigel 3
/* This is a newer "info" function which has an extensible interface so
that additional items can be added compatibly.

Arguments:
  external_re      points to compiled code
  extra_data       points to extra data, or NULL
  what             what information is required
  where            where to put the information

Returns:           0 if data returned, negative on error
*/
508    
509     int
510 nigel 63 pcre_fullinfo(const pcre *external_re, const pcre_extra *extra_data, int what,
511 nigel 43 void *where)
512     {
513     const real_pcre *re = (const real_pcre *)external_re;
514 nigel 63 const pcre_study_data *study = NULL;
515 nigel 43
516     if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
517     if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
518    
519 nigel 63 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0)
520 nigel 71 study = (const pcre_study_data *)extra_data->study_data;
521 nigel 63
522 nigel 43 switch (what)
523     {
524     case PCRE_INFO_OPTIONS:
525     *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
526     break;
527    
528     case PCRE_INFO_SIZE:
529     *((size_t *)where) = re->size;
530     break;
531    
532 nigel 63 case PCRE_INFO_STUDYSIZE:
533     *((size_t *)where) = (study == NULL)? 0 : study->size;
534     break;
535    
536 nigel 43 case PCRE_INFO_CAPTURECOUNT:
537     *((int *)where) = re->top_bracket;
538     break;
539    
540     case PCRE_INFO_BACKREFMAX:
541     *((int *)where) = re->top_backref;
542     break;
543    
544 nigel 63 case PCRE_INFO_FIRSTBYTE:
545 nigel 43 *((int *)where) =
546 nigel 63 ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
547 nigel 43 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
548     break;
549    
550     case PCRE_INFO_FIRSTTABLE:
551     *((const uschar **)where) =
552     (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
553     study->start_bits : NULL;
554     break;
555    
556     case PCRE_INFO_LASTLITERAL:
557     *((int *)where) =
558 nigel 63 ((re->options & PCRE_REQCHSET) != 0)? re->req_byte : -1;
559 nigel 43 break;
560    
561 nigel 63 case PCRE_INFO_NAMEENTRYSIZE:
562     *((int *)where) = re->name_entry_size;
563     break;
564    
565     case PCRE_INFO_NAMECOUNT:
566     *((int *)where) = re->name_count;
567     break;
568    
569     case PCRE_INFO_NAMETABLE:
570     *((const uschar **)where) = (const uschar *)re + sizeof(real_pcre);
571     break;
572    
573 nigel 43 default: return PCRE_ERROR_BADOPTION;
574     }
575    
576     return 0;
577     }
578    
579    
580    
581 nigel 63 /*************************************************
582     * Return info about what features are configured *
583     *************************************************/
584    
/* This is a function which has an extensible interface so that additional
items can be added compatibly.

Arguments:
  what             what information is required
  where            where to put the information

Returns:           0 if data returned, negative on error
*/
594    
595     int
596     pcre_config(int what, void *where)
597     {
598     switch (what)
599     {
600     case PCRE_CONFIG_UTF8:
601 nigel 71 #ifdef SUPPORT_UTF8
602 nigel 63 *((int *)where) = 1;
603 nigel 71 #else
604 nigel 63 *((int *)where) = 0;
605 nigel 71 #endif
606 nigel 63 break;
607    
608     case PCRE_CONFIG_NEWLINE:
609     *((int *)where) = NEWLINE;
610     break;
611    
612     case PCRE_CONFIG_LINK_SIZE:
613     *((int *)where) = LINK_SIZE;
614     break;
615    
616     case PCRE_CONFIG_POSIX_MALLOC_THRESHOLD:
617     *((int *)where) = POSIX_MALLOC_THRESHOLD;
618     break;
619    
620     case PCRE_CONFIG_MATCH_LIMIT:
621     *((unsigned int *)where) = MATCH_LIMIT;
622     break;
623    
624     default: return PCRE_ERROR_BADOPTION;
625     }
626    
627     return 0;
628     }
629    
630    
631    
632 nigel 3 #ifdef DEBUG
633     /*************************************************
634     * Debugging function to print chars *
635     *************************************************/
636    
/* Print a sequence of chars in printable format, stopping at the end of the
subject if requested.

Arguments:
  p           points to characters
  length      number to print
  is_subject  TRUE if printing from within md->start_subject
  md          pointer to matching data block, if is_subject is TRUE

Returns:     nothing
*/
648    
649 nigel 9 static void
650     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
651 nigel 3 {
652     int c;
653     if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
654     while (length-- > 0)
655     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
656     }
657     #endif
658    
659    
660    
661    
662     /*************************************************
663     * Handle escapes *
664     *************************************************/
665    
666     /* This function is called when a \ has been encountered. It either returns a
667     positive value for a simple escape such as \n, or a negative value which
668 nigel 49 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
669     a positive value greater than 255 may be returned. On entry, ptr is pointing at
670     the \. On exit, it is on the final character of the escape sequence.
671 nigel 3
672     Arguments:
673     ptrptr points to the pattern position pointer
674     errorptr points to the pointer to the error message
675     bracount number of previous extracting brackets
676     options the options bits
677     isclass TRUE if inside a character class
678    
679     Returns: zero or positive => a data character
680     negative => a special escape sequence
681     on error, errorptr is set
682     */
683    
684     static int
685 nigel 7 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
686 nigel 71 int options, BOOL isclass)
687 nigel 3 {
688 nigel 7 const uschar *ptr = *ptrptr;
689 nigel 43 int c, i;
690 nigel 3
691 nigel 49 /* If backslash is at the end of the pattern, it's an error. */
692    
693     c = *(++ptr);
694 nigel 3 if (c == 0) *errorptr = ERR1;
695    
696     /* Digits or letters may have special meaning; all others are literals. */
697    
698     else if (c < '0' || c > 'z') {}
699    
700     /* Do an initial lookup in a table. A non-zero result is something that can be
701     returned immediately. Otherwise further processing may be required. */
702    
703     else if ((i = escapes[c - '0']) != 0) c = i;
704    
705     /* Escapes that need further processing, or are illegal. */
706    
707     else
708     {
709 nigel 7 const uschar *oldptr;
710 nigel 3 switch (c)
711     {
712 nigel 63 /* A number of Perl escapes are not handled by PCRE. We give an explicit
713     error. */
714    
715     case 'l':
716     case 'L':
717     case 'N':
718     case 'p':
719     case 'P':
720     case 'u':
721     case 'U':
722     case 'X':
723     *errorptr = ERR37;
724     break;
725    
726 nigel 3 /* The handling of escape sequences consisting of a string of digits
727     starting with one that is not zero is not straightforward. By experiment,
728     the way Perl works seems to be as follows:
729    
730     Outside a character class, the digits are read as a decimal number. If the
731     number is less than 10, or if there are that many previous extracting
732     left brackets, then it is a back reference. Otherwise, up to three octal
733     digits are read to form an escaped byte. Thus \123 is likely to be octal
734     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
735     value is greater than 377, the least significant 8 bits are taken. Inside a
736     character class, \ followed by a digit is always an octal number. */
737    
738     case '1': case '2': case '3': case '4': case '5':
739     case '6': case '7': case '8': case '9':
740    
741     if (!isclass)
742     {
743     oldptr = ptr;
744     c -= '0';
745 nigel 69 while ((digitab[ptr[1]] & ctype_digit) != 0)
746 nigel 3 c = c * 10 + *(++ptr) - '0';
747     if (c < 10 || c <= bracount)
748     {
749     c = -(ESC_REF + c);
750     break;
751     }
752     ptr = oldptr; /* Put the pointer back and fall through */
753     }
754    
755     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
756     generates a binary zero byte and treats the digit as a following literal.
757     Thus we have to pull back the pointer by one. */
758    
759     if ((c = *ptr) >= '8')
760     {
761     ptr--;
762     c = 0;
763     break;
764     }
765    
766     /* \0 always starts an octal number, but we may drop through to here with a
767 nigel 49 larger first octal digit. */
768 nigel 3
769     case '0':
770     c -= '0';
771 nigel 69 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
772 nigel 3 c = c * 8 + *(++ptr) - '0';
773 nigel 49 c &= 255; /* Take least significant 8 bits */
774 nigel 3 break;
775    
776 nigel 49 /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
777     which can be greater than 0xff, but only if the ddd are hex digits. */
778 nigel 3
779     case 'x':
780 nigel 49 #ifdef SUPPORT_UTF8
781     if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
782     {
783     const uschar *pt = ptr + 2;
784     register int count = 0;
785     c = 0;
786 nigel 69 while ((digitab[*pt] & ctype_xdigit) != 0)
787 nigel 49 {
788 nigel 69 int cc = *pt++;
789     if (cc >= 'a') cc -= 32; /* Convert to upper case */
790 nigel 49 count++;
791 nigel 69 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
792 nigel 49 }
793     if (*pt == '}')
794     {
795     if (c < 0 || count > 8) *errorptr = ERR34;
796     ptr = pt;
797     break;
798     }
799     /* If the sequence of hex digits does not end with '}', then we don't
800     recognize this construct; fall through to the normal \x handling. */
801     }
802     #endif
803    
804     /* Read just a single hex char */
805    
806 nigel 3 c = 0;
807 nigel 69 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
808 nigel 3 {
809 nigel 71 int cc; /* Some compilers don't like ++ */
810     cc = *(++ptr); /* in initializers */
811 nigel 69 if (cc >= 'a') cc -= 32; /* Convert to upper case */
812     c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
813 nigel 3 }
814     break;
815    
816 nigel 49 /* Other special escapes not starting with a digit are straightforward */
817    
818 nigel 3 case 'c':
819     c = *(++ptr);
820     if (c == 0)
821     {
822     *errorptr = ERR2;
823     return 0;
824     }
825    
826 nigel 69 /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
827     is ASCII-specific, but then the whole concept of \cx is ASCII-specific. */
828 nigel 3
829 nigel 69 if (c >= 'a' && c <= 'z') c -= 32;
830 nigel 3 c ^= 0x40;
831     break;
832    
833     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
834     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
835 nigel 25 for Perl compatibility, it is a literal. This code looks a bit odd, but
836     there used to be some cases other than the default, and there may be again
837     in future, so I haven't "optimized" it. */
838 nigel 3
839     default:
840     if ((options & PCRE_EXTRA) != 0) switch(c)
841     {
842     default:
843     *errorptr = ERR3;
844     break;
845     }
846     break;
847     }
848     }
849    
850     *ptrptr = ptr;
851     return c;
852     }
853    
854    
855    
856     /*************************************************
857     * Check for counted repeat *
858     *************************************************/
859    
860     /* This function is called when a '{' is encountered in a place where it might
861     start a quantifier. It looks ahead to see if it really is a quantifier or not.
862     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
863     where the ddds are digits.
864    
865     Arguments:
866     p pointer to the first char after '{'
867    
868     Returns: TRUE or FALSE
869     */
870    
871     static BOOL
872 nigel 71 is_counted_repeat(const uschar *p)
873 nigel 3 {
874 nigel 69 if ((digitab[*p++] && ctype_digit) == 0) return FALSE;
875     while ((digitab[*p] & ctype_digit) != 0) p++;
876 nigel 3 if (*p == '}') return TRUE;
877    
878     if (*p++ != ',') return FALSE;
879     if (*p == '}') return TRUE;
880    
881 nigel 69 if ((digitab[*p++] && ctype_digit) == 0) return FALSE;
882     while ((digitab[*p] & ctype_digit) != 0) p++;
883    
884 nigel 3 return (*p == '}');
885     }
886    
887    
888    
889     /*************************************************
890     * Read repeat counts *
891     *************************************************/
892    
893     /* Read an item of the form {n,m} and return the values. This is called only
894     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
895     so the syntax is guaranteed to be correct, but we need to check the values.
896    
897     Arguments:
898     p pointer to first char after '{'
899     minp pointer to int for min
900     maxp pointer to int for max
901     returned as -1 if no max
902     errorptr points to pointer to error message
903    
904     Returns: pointer to '}' on success;
905     current ptr on error, with errorptr set
906     */
907    
908 nigel 7 static const uschar *
909 nigel 71 read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
910 nigel 3 {
911     int min = 0;
912     int max = -1;
913    
914 nigel 69 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
915 nigel 3
916     if (*p == '}') max = min; else
917     {
918     if (*(++p) != '}')
919     {
920     max = 0;
921 nigel 69 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
922 nigel 3 if (max < min)
923     {
924     *errorptr = ERR4;
925     return p;
926     }
927     }
928     }
929    
930     /* Do paranoid checks, then fill in the required variables, and pass back the
931     pointer to the terminating '}'. */
932    
933     if (min > 65535 || max > 65535)
934     *errorptr = ERR5;
935     else
936     {
937     *minp = min;
938     *maxp = max;
939     }
940     return p;
941     }
942    
943    
944    
945     /*************************************************
946 nigel 63 * Find first significant op code *
947     *************************************************/
948    
949     /* This is called by several functions that scan a compiled expression looking
950     for a fixed first character, or an anchoring op code etc. It skips over things
951     that do not influence this. For some calls, a change of option is important.
952    
953     Arguments:
954     code pointer to the start of the group
955     options pointer to external options
956     optbit the option bit whose changing is significant, or
957     zero if none are
958    
959     Returns: pointer to the first significant opcode
960     */
961    
962     static const uschar*
963     first_significant_code(const uschar *code, int *options, int optbit)
964     {
965     for (;;)
966     {
967     switch ((int)*code)
968     {
969     case OP_OPT:
970     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
971     *options = (int)code[1];
972     code += 2;
973     break;
974    
975     case OP_ASSERT_NOT:
976     case OP_ASSERTBACK:
977     case OP_ASSERTBACK_NOT:
978     do code += GET(code, 1); while (*code == OP_ALT);
979     /* Fall through */
980    
981     case OP_CALLOUT:
982     case OP_CREF:
983     case OP_BRANUMBER:
984     case OP_WORD_BOUNDARY:
985     case OP_NOT_WORD_BOUNDARY:
986     code += OP_lengths[*code];
987     break;
988    
989     default:
990     return code;
991     }
992     }
993     /* Control never reaches here */
994     }
995    
996    
997    
998    
999     /*************************************************
1000 nigel 23 * Find the fixed length of a pattern *
1001     *************************************************/
1002    
1003     /* Scan a pattern and compute the fixed length of subject that will match it,
1004     if the length is fixed. This is needed for dealing with backward assertions.
1005 nigel 63 In UTF8 mode, the result is in characters rather than bytes.
1006 nigel 23
1007     Arguments:
1008     code points to the start of the pattern (the bracket)
1009 nigel 49 options the compiling options
1010 nigel 23
1011 nigel 63 Returns: the fixed length, or -1 if there is no fixed length,
1012     or -2 if \C was encountered
1013 nigel 23 */
1014    
1015     static int
1016 nigel 49 find_fixedlength(uschar *code, int options)
1017 nigel 23 {
1018     int length = -1;
1019    
1020     register int branchlength = 0;
1021 nigel 63 register uschar *cc = code + 1 + LINK_SIZE;
1022 nigel 23
1023     /* Scan along the opcodes for this branch. If we get to the end of the
1024     branch, check the length against that of the other branches. */
1025    
1026     for (;;)
1027     {
1028     int d;
1029     register int op = *cc;
1030     if (op >= OP_BRA) op = OP_BRA;
1031    
1032     switch (op)
1033     {
1034     case OP_BRA:
1035     case OP_ONCE:
1036     case OP_COND:
1037 nigel 49 d = find_fixedlength(cc, options);
1038 nigel 63 if (d < 0) return d;
1039 nigel 23 branchlength += d;
1040 nigel 63 do cc += GET(cc, 1); while (*cc == OP_ALT);
1041     cc += 1 + LINK_SIZE;
1042 nigel 23 break;
1043    
1044     /* Reached end of a branch; if it's a ket it is the end of a nested
1045     call. If it's ALT it is an alternation in a nested call. If it is
1046     END it's the end of the outer call. All can be handled by the same code. */
1047    
1048     case OP_ALT:
1049     case OP_KET:
1050     case OP_KETRMAX:
1051     case OP_KETRMIN:
1052     case OP_END:
1053     if (length < 0) length = branchlength;
1054     else if (length != branchlength) return -1;
1055     if (*cc != OP_ALT) return length;
1056 nigel 63 cc += 1 + LINK_SIZE;
1057 nigel 23 branchlength = 0;
1058     break;
1059    
1060     /* Skip over assertive subpatterns */
1061    
1062     case OP_ASSERT:
1063     case OP_ASSERT_NOT:
1064     case OP_ASSERTBACK:
1065     case OP_ASSERTBACK_NOT:
1066 nigel 63 do cc += GET(cc, 1); while (*cc == OP_ALT);
1067     /* Fall through */
1068 nigel 23
1069     /* Skip over things that don't match chars */
1070    
1071     case OP_REVERSE:
1072 nigel 53 case OP_BRANUMBER:
1073     case OP_CREF:
1074 nigel 23 case OP_OPT:
1075 nigel 63 case OP_CALLOUT:
1076 nigel 23 case OP_SOD:
1077 nigel 63 case OP_SOM:
1078 nigel 23 case OP_EOD:
1079     case OP_EODN:
1080     case OP_CIRC:
1081     case OP_DOLL:
1082     case OP_NOT_WORD_BOUNDARY:
1083     case OP_WORD_BOUNDARY:
1084 nigel 63 cc += OP_lengths[*cc];
1085 nigel 23 break;
1086    
1087 nigel 49 /* Handle char strings. In UTF-8 mode we must count characters, not bytes.
1088     This requires a scan of the string, unfortunately. We assume valid UTF-8
1089 nigel 63 strings, so all we do is reduce the length by one for every byte whose bits
1090     are 10xxxxxx. */
1091 nigel 23
1092     case OP_CHARS:
1093     branchlength += *(++cc);
1094 nigel 49 #ifdef SUPPORT_UTF8
1095 nigel 63 if ((options & PCRE_UTF8) != 0)
1096     for (d = 1; d <= *cc; d++)
1097     if ((cc[d] & 0xc0) == 0x80) branchlength--;
1098 nigel 49 #endif
1099 nigel 23 cc += *cc + 1;
1100     break;
1101    
1102 nigel 63 /* Handle exact repetitions. The count is already in characters, but we
1103     need to skip over a multibyte character in UTF8 mode. */
1104 nigel 23
1105     case OP_EXACT:
1106 nigel 63 branchlength += GET2(cc,1);
1107     cc += 4;
1108     #ifdef SUPPORT_UTF8
1109     if ((options & PCRE_UTF8) != 0)
1110     {
1111     while((*cc & 0x80) == 0x80) cc++;
1112     }
1113     #endif
1114     break;
1115    
1116 nigel 23 case OP_TYPEEXACT:
1117 nigel 63 branchlength += GET2(cc,1);
1118 nigel 23 cc += 4;
1119     break;
1120    
1121     /* Handle single-char matchers */
1122    
1123     case OP_NOT_DIGIT:
1124     case OP_DIGIT:
1125     case OP_NOT_WHITESPACE:
1126     case OP_WHITESPACE:
1127     case OP_NOT_WORDCHAR:
1128     case OP_WORDCHAR:
1129     case OP_ANY:
1130     branchlength++;
1131     cc++;
1132     break;
1133    
1134 nigel 63 /* The single-byte matcher isn't allowed */
1135 nigel 23
1136 nigel 63 case OP_ANYBYTE:
1137     return -2;
1138    
1139 nigel 23 /* Check a class for variable quantification */
1140    
1141 nigel 63 #ifdef SUPPORT_UTF8
1142     case OP_XCLASS:
1143     cc += GET(cc, 1) - 33;
1144     /* Fall through */
1145     #endif
1146    
1147 nigel 23 case OP_CLASS:
1148 nigel 63 case OP_NCLASS:
1149 nigel 53 cc += 33;
1150 nigel 23
1151     switch (*cc)
1152     {
1153     case OP_CRSTAR:
1154     case OP_CRMINSTAR:
1155     case OP_CRQUERY:
1156     case OP_CRMINQUERY:
1157     return -1;
1158    
1159     case OP_CRRANGE:
1160     case OP_CRMINRANGE:
1161 nigel 63 if (GET2(cc,1) != GET2(cc,3)) return -1;
1162     branchlength += GET2(cc,1);
1163 nigel 23 cc += 5;
1164     break;
1165    
1166     default:
1167     branchlength++;
1168     }
1169     break;
1170    
1171     /* Anything else is variable length */
1172    
1173     default:
1174     return -1;
1175     }
1176     }
1177     /* Control never gets here */
1178     }
1179    
1180    
1181    
1182    
1183     /*************************************************
1184 nigel 63 * Scan compiled regex for numbered bracket *
1185     *************************************************/
1186    
1187     /* This little function scans through a compiled pattern until it finds a
1188     capturing bracket with the given number.
1189    
1190     Arguments:
1191     code points to start of expression
1192     utf8 TRUE in UTF-8 mode
1193     number the required bracket number
1194    
1195     Returns: pointer to the opcode for the bracket, or NULL if not found
1196     */
1197    
1198     static const uschar *
1199     find_bracket(const uschar *code, BOOL utf8, int number)
1200     {
1201 nigel 65 #ifndef SUPPORT_UTF8
1202     utf8 = utf8; /* Stop pedantic compilers complaining */
1203     #endif
1204    
1205 nigel 63 for (;;)
1206     {
1207     register int c = *code;
1208     if (c == OP_END) return NULL;
1209     else if (c == OP_CHARS) code += code[1] + OP_lengths[c];
1210     else if (c > OP_BRA)
1211     {
1212     int n = c - OP_BRA;
1213     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1214     if (n == number) return (uschar *)code;
1215     code += OP_lengths[OP_BRA];
1216     }
1217     else
1218     {
1219     code += OP_lengths[c];
1220    
1221     /* In UTF-8 mode, opcodes that are followed by a character may be followed
1222     by a multi-byte character. The length in the table is a minimum, so we have
1223     to scan along to skip the extra characters. All opcodes are less than 128,
1224     so we can use relatively efficient code. */
1225    
1226     #ifdef SUPPORT_UTF8
1227     if (utf8) switch(c)
1228     {
1229     case OP_EXACT:
1230     case OP_UPTO:
1231     case OP_MINUPTO:
1232     case OP_STAR:
1233     case OP_MINSTAR:
1234     case OP_PLUS:
1235     case OP_MINPLUS:
1236     case OP_QUERY:
1237     case OP_MINQUERY:
1238     while ((*code & 0xc0) == 0x80) code++;
1239     break;
1240     }
1241     #endif
1242     }
1243     }
1244     }
1245    
1246    
1247    
1248     /*************************************************
1249     * Scan compiled branch for non-emptiness *
1250     *************************************************/
1251    
1252     /* This function scans through a branch of a compiled pattern to see whether it
1253     can match the empty string or not. It is called only from could_be_empty()
1254     below. Note that first_significant_code() skips over assertions. If we hit an
1255     unclosed bracket, we return "empty" - this means we've struck an inner bracket
1256     whose current branch will already have been scanned.
1257    
1258     Arguments:
1259     code points to start of search
1260     endcode points to where to stop
1261     utf8 TRUE if in UTF8 mode
1262    
1263     Returns: TRUE if what is matched could be empty
1264     */
1265    
1266     static BOOL
1267     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1268     {
1269     register int c;
1270     for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0);
1271     code < endcode;
1272     code = first_significant_code(code + OP_lengths[c], NULL, 0))
1273     {
1274     const uschar *ccode;
1275    
1276     c = *code;
1277    
1278     if (c >= OP_BRA)
1279     {
1280     BOOL empty_branch;
1281     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1282    
1283     /* Scan a closed bracket */
1284    
1285     empty_branch = FALSE;
1286     do
1287     {
1288     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1289     empty_branch = TRUE;
1290     code += GET(code, 1);
1291     }
1292     while (*code == OP_ALT);
1293     if (!empty_branch) return FALSE; /* All branches are non-empty */
1294     code += 1 + LINK_SIZE;
1295     c = *code;
1296     }
1297    
1298     else switch (c)
1299     {
1300     /* Check for quantifiers after a class */
1301    
1302     #ifdef SUPPORT_UTF8
1303     case OP_XCLASS:
1304     ccode = code + GET(code, 1);
1305     goto CHECK_CLASS_REPEAT;
1306     #endif
1307    
1308     case OP_CLASS:
1309     case OP_NCLASS:
1310     ccode = code + 33;
1311    
1312     #ifdef SUPPORT_UTF8
1313     CHECK_CLASS_REPEAT:
1314     #endif
1315    
1316     switch (*ccode)
1317     {
1318     case OP_CRSTAR: /* These could be empty; continue */
1319     case OP_CRMINSTAR:
1320     case OP_CRQUERY:
1321     case OP_CRMINQUERY:
1322     break;
1323    
1324     default: /* Non-repeat => class must match */
1325     case OP_CRPLUS: /* These repeats aren't empty */
1326     case OP_CRMINPLUS:
1327     return FALSE;
1328    
1329     case OP_CRRANGE:
1330     case OP_CRMINRANGE:
1331     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1332     break;
1333     }
1334     break;
1335    
1336     /* Opcodes that must match a character */
1337    
1338     case OP_NOT_DIGIT:
1339     case OP_DIGIT:
1340     case OP_NOT_WHITESPACE:
1341     case OP_WHITESPACE:
1342     case OP_NOT_WORDCHAR:
1343     case OP_WORDCHAR:
1344     case OP_ANY:
1345     case OP_ANYBYTE:
1346     case OP_CHARS:
1347     case OP_NOT:
1348     case OP_PLUS:
1349     case OP_MINPLUS:
1350     case OP_EXACT:
1351     case OP_NOTPLUS:
1352     case OP_NOTMINPLUS:
1353     case OP_NOTEXACT:
1354     case OP_TYPEPLUS:
1355     case OP_TYPEMINPLUS:
1356     case OP_TYPEEXACT:
1357     return FALSE;
1358    
1359     /* End of branch */
1360    
1361     case OP_KET:
1362     case OP_KETRMAX:
1363     case OP_KETRMIN:
1364     case OP_ALT:
1365     return TRUE;
1366    
1367     /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be
1368     followed by a multibyte character */
1369    
1370     #ifdef SUPPORT_UTF8
1371     case OP_STAR:
1372     case OP_MINSTAR:
1373     case OP_QUERY:
1374     case OP_MINQUERY:
1375     case OP_UPTO:
1376     case OP_MINUPTO:
1377     if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1378     break;
1379     #endif
1380     }
1381     }
1382    
1383     return TRUE;
1384     }
1385    
1386    
1387    
1388     /*************************************************
1389     * Scan compiled regex for non-emptiness *
1390     *************************************************/
1391    
1392     /* This function is called to check for left recursive calls. We want to check
1393     the current branch of the current pattern to see if it could match the empty
1394     string. If it could, we must look outwards for branches at other levels,
1395     stopping when we pass beyond the bracket which is the subject of the recursion.
1396    
1397     Arguments:
1398     code points to start of the recursion
1399     endcode points to where to stop (current RECURSE item)
1400     bcptr points to the chain of current (unclosed) branch starts
1401     utf8 TRUE if in UTF-8 mode
1402    
1403     Returns: TRUE if what is matched could be empty
1404     */
1405    
1406     static BOOL
1407     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1408     BOOL utf8)
1409     {
1410     while (bcptr != NULL && bcptr->current >= code)
1411     {
1412     if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1413     bcptr = bcptr->outer;
1414     }
1415     return TRUE;
1416     }
1417    
1418    
1419    
1420     /*************************************************
1421 nigel 43 * Check for POSIX class syntax *
1422     *************************************************/
1423    
1424     /* This function is called when the sequence "[:" or "[." or "[=" is
1425     encountered in a character class. It checks whether this is followed by an
1426     optional ^ and then a sequence of letters, terminated by a matching ":]" or
1427     ".]" or "=]".
1428    
1429     Argument:
1430     ptr pointer to the initial [
1431     endptr where to return the end pointer
1432     cd pointer to compile data
1433    
1434     Returns: TRUE or FALSE
1435     */
1436    
1437     static BOOL
1438     check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1439     {
1440     int terminator; /* Don't combine these lines; the Solaris cc */
1441     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1442     if (*(++ptr) == '^') ptr++;
1443     while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1444     if (*ptr == terminator && ptr[1] == ']')
1445     {
1446     *endptr = ptr;
1447     return TRUE;
1448     }
1449     return FALSE;
1450     }
1451    
1452    
1453    
1454    
1455     /*************************************************
1456     * Check POSIX class name *
1457     *************************************************/
1458    
1459     /* This function is called to check the name given in a POSIX-style class entry
1460     such as [:alnum:].
1461    
1462     Arguments:
1463     ptr points to the first letter
1464     len the length of the name
1465    
1466     Returns: a value representing the name, or -1 if unknown
1467     */
1468    
1469     static int
1470     check_posix_name(const uschar *ptr, int len)
1471     {
1472     register int yield = 0;
1473     while (posix_name_lengths[yield] != 0)
1474     {
1475     if (len == posix_name_lengths[yield] &&
1476     strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1477     yield++;
1478     }
1479     return -1;
1480     }
1481    
1482    
1483    
1484    
1485     /*************************************************
1486 nigel 3 * Compile one branch *
1487     *************************************************/
1488    
1489 nigel 63 /* Scan the pattern, compiling it into the code vector. If the options are
1490     changed during the branch, the pointer is used to change the external options
1491     bits.
1492 nigel 3
1493     Arguments:
1494 nigel 63 optionsptr pointer to the option bits
1495     brackets points to number of extracting brackets used
1496     code points to the pointer to the current code point
1497     ptrptr points to the current pattern pointer
1498     errorptr points to pointer to error message
1499     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
1500     reqbyteptr set to the last literal character required, else < 0
1501     bcptr points to current branch chain
1502     cd contains pointers to tables etc.
1503 nigel 3
1504 nigel 63 Returns: TRUE on success
1505     FALSE, with *errorptr set on error
1506 nigel 3 */
1507    
1508     static BOOL
1509 nigel 63 compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
1510     const uschar **ptrptr, const char **errorptr, int *firstbyteptr,
1511     int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
1512 nigel 3 {
1513     int repeat_type, op_type;
1514 nigel 63 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
1515     int bravalue = 0;
1516     int length;
1517 nigel 19 int greedy_default, greedy_non_default;
1518 nigel 63 int firstbyte, reqbyte;
1519     int zeroreqbyte, zerofirstbyte;
1520 nigel 65 int req_caseopt, reqvary, tempreqvary;
1521 nigel 37 int condcount = 0;
1522 nigel 63 int options = *optionsptr;
1523 nigel 3 register int c;
1524     register uschar *code = *codeptr;
1525 nigel 23 uschar *tempcode;
1526 nigel 63 BOOL inescq = FALSE;
1527     BOOL groupsetfirstbyte = FALSE;
1528 nigel 7 const uschar *ptr = *ptrptr;
1529 nigel 23 const uschar *tempptr;
1530 nigel 3 uschar *previous = NULL;
1531     uschar class[32];
1532    
1533 nigel 63 #ifdef SUPPORT_UTF8
1534     BOOL class_utf8;
1535     BOOL utf8 = (options & PCRE_UTF8) != 0;
1536     uschar *class_utf8data;
1537     uschar utf8_char[6];
1538     #else
1539     BOOL utf8 = FALSE;
1540     #endif
1541    
1542 nigel 19 /* Set up the default and non-default settings for greediness */
1543    
1544     greedy_default = ((options & PCRE_UNGREEDY) != 0);
1545     greedy_non_default = greedy_default ^ 1;
1546    
1547 nigel 63 /* Initialize no first char, no required char. REQ_UNSET means "no char
1548     matching encountered yet". It gets changed to REQ_NONE if we hit something that
1549     matches a non-fixed char first char; reqbyte just remains unset if we never
1550     find one.
1551 nigel 37
1552 nigel 63 When we hit a repeat whose minimum is zero, we may have to adjust these values
1553     to take the zero repeat into account. This is implemented by setting them to
1554     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
1555     item types that can be repeated set these backoff variables appropriately. */
1556 nigel 37
1557 nigel 63 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
1558    
1559     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
1560     according to the current setting of the caseless flag. REQ_CASELESS is a bit
1561     value > 255. It is added into the firstbyte or reqbyte variables to record the
1562     case status of the value. */
1563    
1564     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
1565    
1566 nigel 3 /* Switch on next character until the end of the branch */
1567    
1568     for (;; ptr++)
1569     {
1570     BOOL negate_class;
1571 nigel 63 BOOL possessive_quantifier;
1572 nigel 23 int class_charcount;
1573     int class_lastchar;
1574     int newoptions;
1575 nigel 63 int recno;
1576 nigel 53 int skipbytes;
1577 nigel 63 int subreqbyte;
1578     int subfirstbyte;
1579 nigel 3
1580     c = *ptr;
1581 nigel 63 if (inescq && c != 0) goto NORMAL_CHAR;
1582    
1583 nigel 3 if ((options & PCRE_EXTENDED) != 0)
1584     {
1585 nigel 25 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1586 nigel 3 if (c == '#')
1587     {
1588 nigel 47 /* The space before the ; is to avoid a warning on a silly compiler
1589     on the Macintosh. */
1590 nigel 53 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
1591 nigel 63 if (c != 0) continue; /* Else fall through to handle end of string */
1592 nigel 3 }
1593     }
1594    
1595     switch(c)
1596     {
1597     /* The branch terminates at end of string, |, or ). */
1598    
1599     case 0:
1600     case '|':
1601     case ')':
1602 nigel 63 *firstbyteptr = firstbyte;
1603     *reqbyteptr = reqbyte;
1604 nigel 3 *codeptr = code;
1605     *ptrptr = ptr;
1606     return TRUE;
1607    
1608 nigel 63 /* Handle single-character metacharacters. In multiline mode, ^ disables
1609     the setting of any following char as a first character. */
1610 nigel 3
1611     case '^':
1612 nigel 63 if ((options & PCRE_MULTILINE) != 0)
1613     {
1614     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1615     }
1616 nigel 3 previous = NULL;
1617     *code++ = OP_CIRC;
1618     break;
1619    
1620     case '$':
1621     previous = NULL;
1622     *code++ = OP_DOLL;
1623     break;
1624    
1625 nigel 63 /* There can never be a first char if '.' is first, whatever happens about
1626     repeats. The value of reqbyte doesn't change either. */
1627    
1628 nigel 3 case '.':
1629 nigel 63 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1630     zerofirstbyte = firstbyte;
1631     zeroreqbyte = reqbyte;
1632 nigel 3 previous = code;
1633     *code++ = OP_ANY;
1634     break;
1635    
1636 nigel 63 /* Character classes. If the included characters are all < 255 in value, we
1637     build a 32-byte bitmap of the permitted characters, except in the special
1638     case where there is only one such character. For negated classes, we build
1639     the map as usual, then invert it at the end. However, we use a different
1640     opcode so that data characters > 255 can be handled correctly.
1641    
1642     If the class contains characters outside the 0-255 range, a different
1643     opcode is compiled. It may optionally have a bit map for characters < 256,
1644     but those above are explicitly listed afterwards. A flag byte tells
1645     whether the bitmap is present, and whether this is a negated class or not.
1646 nigel 3 */
1647    
1648     case '[':
1649     previous = code;
1650    
1651 nigel 63 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
1652     they are encountered at the top level, so we'll do that too. */
1653    
1654     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1655     check_posix_syntax(ptr, &tempptr, cd))
1656     {
1657     *errorptr = (ptr[1] == ':')? ERR13 : ERR31;
1658     goto FAILED;
1659     }
1660    
1661 nigel 23 /* If the first character is '^', set the negation flag and skip it. */
1662 nigel 3
1663     if ((c = *(++ptr)) == '^')
1664     {
1665     negate_class = TRUE;
1666     c = *(++ptr);
1667     }
1668 nigel 63 else
1669     {
1670     negate_class = FALSE;
1671     }
1672 nigel 3
1673 nigel 63 /* Keep a count of chars with values < 256 so that we can optimize the case
1674     of just a single character (as long as it's < 256). For higher valued UTF-8
1675     characters, we don't yet do any optimization. */
1676 nigel 3
1677     class_charcount = 0;
1678     class_lastchar = -1;
1679    
1680 nigel 63 #ifdef SUPPORT_UTF8
1681     class_utf8 = FALSE; /* No chars >= 256 */
1682     class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
1683     #endif
1684    
1685 nigel 3 /* Initialize the 32-char bit map to all zeros. We have to build the
1686     map in a temporary bit of store, in case the class contains only 1
1687 nigel 63 character (< 256), because in that case the compiled code doesn't use the
1688 nigel 3 bit map. */
1689    
1690     memset(class, 0, 32 * sizeof(uschar));
1691    
1692     /* Process characters until ] is reached. By writing this as a "do" it
1693 nigel 63 means that an initial ] is taken as a data character. The first pass
1694     through the regex checked the overall syntax, so we don't need to be very
1695     strict here. At the start of the loop, c contains the first byte of the
1696     character. */
1697 nigel 3
1698     do
1699     {
1700 nigel 63 #ifdef SUPPORT_UTF8
1701 nigel 67 if (utf8 && c > 127)
1702     { /* Braces are required because the */
1703     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
1704     }
1705 nigel 63 #endif
1706    
1707     /* Inside \Q...\E everything is literal except \E */
1708    
1709     if (inescq)
1710 nigel 3 {
1711 nigel 63 if (c == '\\' && ptr[1] == 'E')
1712     {
1713     inescq = FALSE;
1714     ptr++;
1715     continue;
1716     }
1717     else goto LONE_SINGLE_CHARACTER;
1718 nigel 3 }
1719    
1720 nigel 43 /* Handle POSIX class names. Perl allows a negation extension of the
1721 nigel 63 form [:^name:]. A square bracket that doesn't match the syntax is
1722 nigel 43 treated as a literal. We also recognize the POSIX constructions
1723     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1724 nigel 63 5.6 and 5.8 do. */
1725 nigel 43
1726     if (c == '[' &&
1727     (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1728     check_posix_syntax(ptr, &tempptr, cd))
1729     {
1730     BOOL local_negate = FALSE;
1731     int posix_class, i;
1732     register const uschar *cbits = cd->cbits;
1733    
1734     if (ptr[1] != ':')
1735     {
1736     *errorptr = ERR31;
1737     goto FAILED;
1738     }
1739    
1740     ptr += 2;
1741     if (*ptr == '^')
1742     {
1743     local_negate = TRUE;
1744     ptr++;
1745     }
1746    
1747     posix_class = check_posix_name(ptr, tempptr - ptr);
1748     if (posix_class < 0)
1749     {
1750     *errorptr = ERR30;
1751     goto FAILED;
1752     }
1753    
1754     /* If matching is caseless, upper and lower are converted to
1755     alpha. This relies on the fact that the class table starts with
1756     alpha, lower, upper as the first 3 entries. */
1757    
1758     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1759     posix_class = 0;
1760    
1761     /* Or into the map we are building up to 3 of the static class
1762 nigel 63 tables, or their negations. The [:blank:] class sets up the same
1763     chars as the [:space:] class (all white space). We remove the vertical
1764     white space chars afterwards. */
1765 nigel 43
1766     posix_class *= 3;
1767     for (i = 0; i < 3; i++)
1768     {
1769 nigel 65 BOOL isblank = strncmp((char *)ptr, "blank", 5) == 0;
1770 nigel 43 int taboffset = posix_class_maps[posix_class + i];
1771     if (taboffset < 0) break;
1772     if (local_negate)
1773 nigel 63 {
1774 nigel 43 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
1775 nigel 63 if (isblank) class[1] |= 0x3c;
1776     }
1777 nigel 43 else
1778 nigel 63 {
1779 nigel 43 for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
1780 nigel 63 if (isblank) class[1] &= ~0x3c;
1781     }
1782 nigel 43 }
1783    
1784     ptr = tempptr + 1;
1785     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
1786 nigel 63 continue; /* End of POSIX syntax handling */
1787 nigel 43 }
1788    
1789 nigel 3 /* Backslash may introduce a single character, or it may introduce one
1790     of the specials, which just set a flag. Escaped items are checked for
1791     validity in the pre-compiling pass. The sequence \b is a special case.
1792 nigel 7 Inside a class (and only there) it is treated as backspace. Elsewhere
1793 nigel 3 it marks a word boundary. Other escapes have preset maps ready to
1794     or into the one we are building. We assume they have more than one
1795 nigel 63 character in them, so set class_charcount bigger than one. */
1796 nigel 3
1797     if (c == '\\')
1798     {
1799 nigel 71 c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
1800 nigel 63 if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
1801    
1802     if (-c == ESC_Q) /* Handle start of quoted string */
1803     {
1804     if (ptr[1] == '\\' && ptr[2] == 'E')
1805     {
1806     ptr += 2; /* avoid empty string */
1807     }
1808     else inescq = TRUE;
1809     continue;
1810     }
1811    
1812 nigel 3 else if (c < 0)
1813     {
1814 nigel 25 register const uschar *cbits = cd->cbits;
1815 nigel 63 class_charcount = 10; /* Greater than 1 is what matters */
1816 nigel 3 switch (-c)
1817     {
1818     case ESC_d:
1819 nigel 25 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
1820 nigel 3 continue;
1821    
1822     case ESC_D:
1823 nigel 25 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
1824 nigel 3 continue;
1825    
1826     case ESC_w:
1827 nigel 43 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
1828 nigel 3 continue;
1829    
1830     case ESC_W:
1831 nigel 43 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
1832 nigel 3 continue;
1833    
1834     case ESC_s:
1835 nigel 25 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
1836 nigel 63 class[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
1837 nigel 3 continue;
1838    
1839     case ESC_S:
1840 nigel 25 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
1841 nigel 63 class[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
1842 nigel 3 continue;
1843    
1844 nigel 63 /* Unrecognized escapes are faulted if PCRE is running in its
1845     strict mode. By default, for compatibility with Perl, they are
1846     treated as literals. */
1847    
1848 nigel 3 default:
1849 nigel 63 if ((options & PCRE_EXTRA) != 0)
1850     {
1851     *errorptr = ERR7;
1852     goto FAILED;
1853     }
1854     c = *ptr; /* The final character */
1855 nigel 3 }
1856     }
1857 nigel 49
1858 nigel 63 /* Fall through if we have a single character (c >= 0). This may be
1859     > 256 in UTF-8 mode. */
1860 nigel 49
1861 nigel 63 } /* End of backslash handling */
1862 nigel 3
1863     /* A single character may be followed by '-' to form a range. However,
1864     Perl does not permit ']' to be the end of the range. A '-' character
1865     here is treated as a literal. */
1866    
1867     if (ptr[1] == '-' && ptr[2] != ']')
1868     {
1869     int d;
1870     ptr += 2;
1871    
1872 nigel 63 #ifdef SUPPORT_UTF8
1873     if (utf8)
1874     { /* Braces are required because the */
1875     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
1876 nigel 3 }
1877 nigel 63 else
1878     #endif
1879     d = *ptr;
1880 nigel 3
1881     /* The second part of a range can be a single-character escape, but
1882 nigel 49 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1883     in such circumstances. */
1884 nigel 3
1885     if (d == '\\')
1886     {
1887 nigel 49 const uschar *oldptr = ptr;
1888 nigel 71 d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
1889 nigel 49
1890     /* \b is backspace; any other special means the '-' was literal */
1891    
1892 nigel 3 if (d < 0)
1893     {
1894     if (d == -ESC_b) d = '\b'; else
1895     {
1896 nigel 49 ptr = oldptr - 2;
1897 nigel 63 goto LONE_SINGLE_CHARACTER; /* A few lines below */
1898 nigel 3 }
1899     }
1900     }
1901    
1902 nigel 63 /* Check that the two values are in the correct order */
1903    
1904 nigel 3 if (d < c)
1905     {
1906     *errorptr = ERR8;
1907     goto FAILED;
1908     }
1909    
1910 nigel 63 /* If d is greater than 255, we can't just use the bit map, so set up
1911     for the UTF-8 supporting class type. If we are not caseless, we can
1912     just set up a single range. If we are caseless, the characters < 256
1913     are handled with a bitmap, in order to get the case-insensitive
1914     handling. */
1915    
1916     #ifdef SUPPORT_UTF8
1917     if (d > 255)
1918     {
1919     class_utf8 = TRUE;
1920     *class_utf8data++ = XCL_RANGE;
1921     if ((options & PCRE_CASELESS) == 0)
1922     {
1923     class_utf8data += ord2utf8(c, class_utf8data);
1924     class_utf8data += ord2utf8(d, class_utf8data);
1925     continue; /* Go get the next char in the class */
1926     }
1927     class_utf8data += ord2utf8(256, class_utf8data);
1928     class_utf8data += ord2utf8(d, class_utf8data);
1929     d = 255;
1930     /* Fall through */
1931     }
1932     #endif
1933     /* We use the bit map if the range is entirely < 256, or if part of it
1934     is < 256 and matching is caseless. */
1935    
1936 nigel 3 for (; c <= d; c++)
1937     {
1938     class[c/8] |= (1 << (c&7));
1939     if ((options & PCRE_CASELESS) != 0)
1940     {
1941 nigel 25 int uc = cd->fcc[c]; /* flip case */
1942 nigel 3 class[uc/8] |= (1 << (uc&7));
1943     }
1944     class_charcount++; /* in case a one-char range */
1945     class_lastchar = c;
1946     }
1947 nigel 63
1948 nigel 3 continue; /* Go get the next char in the class */
1949     }
1950    
1951     /* Handle a lone single character - we can get here for a normal
1952     non-escape char, or after \ that introduces a single character. */
1953    
1954 nigel 63 LONE_SINGLE_CHARACTER:
1955 nigel 49
1956 nigel 63 /* Handle a multibyte character */
1957    
1958     #ifdef SUPPORT_UTF8
1959     if (utf8 && c > 255)
1960 nigel 3 {
1961 nigel 63 class_utf8 = TRUE;
1962     *class_utf8data++ = XCL_SINGLE;
1963     class_utf8data += ord2utf8(c, class_utf8data);
1964 nigel 3 }
1965 nigel 63 else
1966     #endif
1967     /* Handle a single-byte character */
1968     {
1969     class [c/8] |= (1 << (c&7));
1970     if ((options & PCRE_CASELESS) != 0)
1971     {
1972     c = cd->fcc[c]; /* flip case */
1973     class[c/8] |= (1 << (c&7));
1974     }
1975     class_charcount++;
1976     class_lastchar = c;
1977     }
1978 nigel 3 }
1979    
1980     /* Loop until ']' reached; the check for end of string happens inside the
1981     loop. This "while" is the end of the "do" above. */
1982    
1983 nigel 63 while ((c = *(++ptr)) != ']' || inescq);
1984 nigel 3
1985 nigel 63 /* If class_charcount is 1, we saw precisely one character with a value <
1986     256. In UTF-8 mode, we can optimize if there were no characters >= 256 and
1987     the one character is < 128. In non-UTF-8 mode we can always optimize.
1988 nigel 3
1989 nigel 63 The optimization throws away the bit map. We turn the item into a
1990     1-character OP_CHARS if it's positive, or OP_NOT if it's negative. Note
1991     that OP_NOT does not support multibyte characters. In the positive case, it
1992     can cause firstbyte to be set. Otherwise, there can be no first char if
1993     this item is first, whatever repeat count may follow. In the case of
1994     reqbyte, save the previous value for reinstating. */
1995    
1996     #ifdef SUPPORT_UTF8
1997 nigel 67 if (class_charcount == 1 &&
1998     (!utf8 ||
1999     (!class_utf8 && class_lastchar < 128)))
2000 nigel 63 #else
2001     if (class_charcount == 1)
2002     #endif
2003 nigel 3 {
2004 nigel 63 zeroreqbyte = reqbyte;
2005 nigel 3 if (negate_class)
2006     {
2007 nigel 63 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2008     zerofirstbyte = firstbyte;
2009     *code++ = OP_NOT;
2010 nigel 3 }
2011     else
2012     {
2013 nigel 63 if (firstbyte == REQ_UNSET)
2014     {
2015     zerofirstbyte = REQ_NONE;
2016     firstbyte = class_lastchar | req_caseopt;
2017     }
2018     else
2019     {
2020     zerofirstbyte = firstbyte;
2021 nigel 65 reqbyte = class_lastchar | req_caseopt | cd->req_varyopt;
2022 nigel 63 }
2023     *code++ = OP_CHARS;
2024 nigel 3 *code++ = 1;
2025     }
2026     *code++ = class_lastchar;
2027 nigel 63 break; /* End of class handling */
2028     } /* End of 1-byte optimization */
2029    
2030     /* Otherwise, if this is the first thing in the branch, there can be no
2031     first char setting, whatever the repeat count. Any reqbyte setting must
2032     remain unchanged after any kind of repeat. */
2033    
2034     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2035     zerofirstbyte = firstbyte;
2036     zeroreqbyte = reqbyte;
2037    
2038     /* If there are characters with values > 255, we have to compile an
2039     extended class, with its own opcode. If there are no characters < 256,
2040     we can omit the bitmap. */
2041    
2042     #ifdef SUPPORT_UTF8
2043     if (class_utf8)
2044     {
2045     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2046     *code++ = OP_XCLASS;
2047     code += LINK_SIZE;
2048     *code = negate_class? XCL_NOT : 0;
2049    
2050     /* If the map is required, install it, and move on to the end of
2051     the extra data */
2052    
2053     if (class_charcount > 0)
2054     {
2055     *code++ |= XCL_MAP;
2056     memcpy(code, class, 32);
2057     code = class_utf8data;
2058     }
2059    
2060     /* If the map is not required, slide down the extra data. */
2061    
2062     else
2063     {
2064     int len = class_utf8data - (code + 33);
2065     memmove(code + 1, code + 33, len);
2066     code += len + 1;
2067     }
2068    
2069     /* Now fill in the complete length of the item */
2070    
2071     PUT(previous, 1, code - previous);
2072     break; /* End of class handling */
2073 nigel 3 }
2074 nigel 63 #endif
2075 nigel 3
2076 nigel 63 /* If there are no characters > 255, negate the 32-byte map if necessary,
2077     and copy it into the code vector. If this is the first thing in the branch,
2078     there can be no first char setting, whatever the repeat count. Any reqbyte
2079     setting must remain unchanged after any kind of repeat. */
2080 nigel 3
2081 nigel 63 if (negate_class)
2082     {
2083     *code++ = OP_NCLASS;
2084     for (c = 0; c < 32; c++) code[c] = ~class[c];
2085     }
2086 nigel 3 else
2087     {
2088 nigel 63 *code++ = OP_CLASS;
2089     memcpy(code, class, 32);
2090 nigel 3 }
2091 nigel 63 code += 32;
2092 nigel 3 break;
2093    
2094     /* Various kinds of repeat */
2095    
2096     case '{':
2097 nigel 71 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
2098     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
2099 nigel 3 if (*errorptr != NULL) goto FAILED;
2100     goto REPEAT;
2101    
2102     case '*':
2103     repeat_min = 0;
2104     repeat_max = -1;
2105     goto REPEAT;
2106    
2107     case '+':
2108     repeat_min = 1;
2109     repeat_max = -1;
2110     goto REPEAT;
2111    
2112     case '?':
2113     repeat_min = 0;
2114     repeat_max = 1;
2115    
2116     REPEAT:
2117     if (previous == NULL)
2118     {
2119     *errorptr = ERR9;
2120     goto FAILED;
2121     }
2122    
2123 nigel 63 if (repeat_min == 0)
2124     {
2125 nigel 65 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2126     reqbyte = zeroreqbyte; /* Ditto */
2127 nigel 63 }
2128 nigel 3
2129 nigel 65 /* Remember whether this is a variable length repeat */
2130    
2131     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2132    
2133 nigel 63 op_type = 0; /* Default single-char op codes */
2134     possessive_quantifier = FALSE; /* Default not possessive quantifier */
2135    
2136     /* Save start of previous item, in case we have to move it up to make space
2137     for an inserted OP_ONCE for the additional '+' extension. */
2138    
2139     tempcode = previous;
2140    
2141     /* If the next character is '+', we have a possessive quantifier. This
2142     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2143     If the next character is '?' this is a minimizing repeat, by default,
2144     but if PCRE_UNGREEDY is set, it works the other way round. We change the
2145     repeat type to the non-default. */
2146    
2147     if (ptr[1] == '+')
2148     {
2149     repeat_type = 0; /* Force greedy */
2150     possessive_quantifier = TRUE;
2151     ptr++;
2152     }
2153     else if (ptr[1] == '?')
2154     {
2155     repeat_type = greedy_non_default;
2156     ptr++;
2157     }
2158 nigel 19 else repeat_type = greedy_default;
2159 nigel 3
2160 nigel 63 /* If previous was a recursion, we need to wrap it inside brackets so that
2161     it can be replicated if necessary. */
2162    
2163     if (*previous == OP_RECURSE)
2164     {
2165     memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2166     code += 1 + LINK_SIZE;
2167     *previous = OP_BRA;
2168     PUT(previous, 1, code - previous);
2169     *code = OP_KET;
2170     PUT(code, 1, code - previous);
2171     code += 1 + LINK_SIZE;
2172     }
2173    
2174 nigel 3 /* If previous was a string of characters, chop off the last one and use it
2175     as the subject of the repeat. If there was only one character, we can
2176 nigel 63 abolish the previous item altogether. If a one-char item has a minimum of
2177     more than one, ensure that it is set in reqbyte - it might not be if a
2178     sequence such as x{3} is the first thing in a branch because the x will
2179     have gone into firstbyte instead. */
2180 nigel 3
2181 nigel 37 if (*previous == OP_CHARS)
2182 nigel 3 {
2183 nigel 63 /* Deal with UTF-8 characters that take up more than one byte. It's
2184     easier to write this out separately than try to macrify it. Use c to
2185     hold the length of the character in bytes, plus 0x80 to flag that it's a
2186     length rather than a small character. */
2187 nigel 37
2188 nigel 63 #ifdef SUPPORT_UTF8
2189     if (utf8 && (code[-1] & 0x80) != 0)
2190 nigel 3 {
2191 nigel 63 uschar *lastchar = code - 1;
2192     while((*lastchar & 0xc0) == 0x80) lastchar--;
2193     c = code - lastchar; /* Length of UTF-8 character */
2194     memcpy(utf8_char, lastchar, c); /* Save the char */
2195     if (lastchar == previous + 2) /* There was only one character */
2196     {
2197     code = previous; /* Abolish the previous item */
2198     }
2199     else
2200     {
2201     previous[1] -= c; /* Adjust length of previous */
2202     code = lastchar; /* Lost char off the end */
2203     tempcode = code; /* Adjust position to be moved for '+' */
2204     }
2205     c |= 0x80; /* Flag c as a length */
2206 nigel 3 }
2207     else
2208 nigel 63 #endif
2209    
2210     /* Handle the case of a single byte - either with no UTF8 support, or
2211     with UTF-8 disabled, or for a UTF-8 character < 128. */
2212    
2213 nigel 3 {
2214 nigel 63 c = *(--code);
2215     if (code == previous + 2) /* There was only one character */
2216     {
2217     code = previous; /* Abolish the previous item */
2218 nigel 65 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2219 nigel 63 }
2220     else
2221     {
2222     previous[1]--; /* adjust length */
2223     tempcode = code; /* Adjust position to be moved for '+' */
2224     }
2225 nigel 3 }
2226 nigel 63
2227 nigel 3 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
2228     }
2229    
2230     /* If previous was a single negated character ([^a] or similar), we use
2231     one of the special opcodes, replacing it. The code is shared with single-
2232 nigel 63 character repeats by setting op_type to add a suitable offset into
2233     repeat_type. OP_NOT is currently used only for single-byte chars. */
2234 nigel 3
2235 nigel 63 else if (*previous == OP_NOT)
2236 nigel 3 {
2237     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
2238     c = previous[1];
2239     code = previous;
2240     goto OUTPUT_SINGLE_REPEAT;
2241     }
2242    
2243     /* If previous was a character type match (\d or similar), abolish it and
2244     create a suitable repeat item. The code is shared with single-character
2245 nigel 63 repeats by setting op_type to add a suitable offset into repeat_type. */
2246 nigel 3
2247 nigel 63 else if (*previous < OP_EODN)
2248 nigel 3 {
2249     op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
2250     c = *previous;
2251     code = previous;
2252    
2253     OUTPUT_SINGLE_REPEAT:
2254    
2255 nigel 37 /* If the maximum is zero then the minimum must also be zero; Perl allows
2256     this case, so we do too - by simply omitting the item altogether. */
2257    
2258     if (repeat_max == 0) goto END_REPEAT;
2259    
2260     /* Combine the op_type with the repeat_type */
2261    
2262     repeat_type += op_type;
2263    
2264 nigel 3 /* A minimum of zero is handled either as the special case * or ?, or as
2265     an UPTO, with the maximum given. */
2266    
2267     if (repeat_min == 0)
2268     {
2269     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
2270     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
2271     else
2272     {
2273     *code++ = OP_UPTO + repeat_type;
2274 nigel 63 PUT2INC(code, 0, repeat_max);
2275 nigel 3 }
2276     }
2277    
2278     /* The case {1,} is handled as the special case + */
2279    
2280     else if (repeat_min == 1 && repeat_max == -1)
2281     *code++ = OP_PLUS + repeat_type;
2282    
2283     /* The case {n,n} is just an EXACT, while the general case {n,m} is
2284     handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
2285    
2286     else
2287     {
2288     if (repeat_min != 1)
2289     {
2290     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
2291 nigel 63 PUT2INC(code, 0, repeat_min);
2292 nigel 3 }
2293    
2294     /* If the minimum is 1 and the previous item was a character string,
2295     we either have to put back the item that got cancelled if the string
2296     length was 1, or add the character back onto the end of a longer
2297 nigel 21 string. For a character type nothing need be done; it will just get
2298     put back naturally. Note that the final character is always going to
2299 nigel 63 get added below, so we leave code ready for its insertion. */
2300 nigel 3
2301     else if (*previous == OP_CHARS)
2302     {
2303 nigel 63 if (code == previous) code += 2; else
2304    
2305     /* In UTF-8 mode, a multibyte char has its length in c, with the 0x80
2306     bit set as a flag. The length will always be between 2 and 6. */
2307    
2308     #ifdef SUPPORT_UTF8
2309     if (utf8 && c >= 128) previous[1] += c & 7; else
2310     #endif
2311     previous[1]++;
2312 nigel 3 }
2313    
2314 nigel 21 /* For a single negated character we also have to put back the
2315 nigel 63 item that got cancelled. At present this applies only to single byte
2316     characters in any mode. */
2317 nigel 21
2318     else if (*previous == OP_NOT) code++;
2319    
2320 nigel 63 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
2321     we have to insert the character for the previous code. In UTF-8 mode,
2322     long characters have their length in c, with the 0x80 bit as a flag. */
2323 nigel 3
2324 nigel 9 if (repeat_max < 0)
2325 nigel 3 {
2326 nigel 63 #ifdef SUPPORT_UTF8
2327     if (utf8 && c >= 128)
2328     {
2329     memcpy(code, utf8_char, c & 7);
2330     code += c & 7;
2331     }
2332     else
2333     #endif
2334 nigel 3 *code++ = c;
2335 nigel 9 *code++ = OP_STAR + repeat_type;
2336     }
2337    
2338 nigel 63 /* Else insert an UPTO if the max is greater than the min, again
2339     preceded by the character, for the previously inserted code. */
2340 nigel 9
2341     else if (repeat_max != repeat_min)
2342     {
2343 nigel 63 #ifdef SUPPORT_UTF8
2344     if (utf8 && c >= 128)
2345     {
2346     memcpy(code, utf8_char, c & 7);
2347     code += c & 7;
2348     }
2349     else
2350     #endif
2351 nigel 9 *code++ = c;
2352 nigel 3 repeat_max -= repeat_min;
2353     *code++ = OP_UPTO + repeat_type;
2354 nigel 63 PUT2INC(code, 0, repeat_max);
2355 nigel 3 }
2356     }
2357    
2358     /* The character or character type itself comes last in all cases. */
2359    
2360 nigel 63 #ifdef SUPPORT_UTF8
2361     if (utf8 && c >= 128)
2362     {
2363     memcpy(code, utf8_char, c & 7);
2364     code += c & 7;
2365     }
2366     else
2367     #endif
2368    
2369 nigel 3 *code++ = c;
2370     }
2371    
2372     /* If previous was a character class or a back reference, we put the repeat
2373 nigel 37 stuff after it, but just skip the item if the repeat was {0,0}. */
2374 nigel 3
2375 nigel 63 else if (*previous == OP_CLASS ||
2376     *previous == OP_NCLASS ||
2377     #ifdef SUPPORT_UTF8
2378     *previous == OP_XCLASS ||
2379     #endif
2380     *previous == OP_REF)
2381 nigel 3 {
2382 nigel 37 if (repeat_max == 0)
2383     {
2384     code = previous;
2385     goto END_REPEAT;
2386     }
2387 nigel 3 if (repeat_min == 0 && repeat_max == -1)
2388     *code++ = OP_CRSTAR + repeat_type;
2389     else if (repeat_min == 1 && repeat_max == -1)
2390     *code++ = OP_CRPLUS + repeat_type;
2391     else if (repeat_min == 0 && repeat_max == 1)
2392     *code++ = OP_CRQUERY + repeat_type;
2393     else
2394     {
2395     *code++ = OP_CRRANGE + repeat_type;
2396 nigel 63 PUT2INC(code, 0, repeat_min);
2397 nigel 3 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
2398 nigel 63 PUT2INC(code, 0, repeat_max);
2399 nigel 3 }
2400     }
2401    
2402     /* If previous was a bracket group, we may have to replicate it in certain
2403 nigel 23 cases. */
2404 nigel 3
2405 nigel 63 else if (*previous >= OP_BRA || *previous == OP_ONCE ||
2406     *previous == OP_COND)
2407 nigel 3 {
2408 nigel 31 register int i;
2409     int ketoffset = 0;
2410 nigel 9 int len = code - previous;
2411 nigel 31 uschar *bralink = NULL;
2412 nigel 3
2413 nigel 23 /* If the maximum repeat count is unlimited, find the end of the bracket
2414     by scanning through from the start, and compute the offset back to it
2415     from the current code pointer. There may be an OP_OPT setting following
2416     the final KET, so we can't find the end just by going back from the code
2417     pointer. */
2418    
2419     if (repeat_max == -1)
2420 nigel 3 {
2421 nigel 23 register uschar *ket = previous;
2422 nigel 63 do ket += GET(ket, 1); while (*ket != OP_KET);
2423 nigel 23 ketoffset = code - ket;
2424 nigel 3 }
2425    
2426 nigel 31 /* The case of a zero minimum is special because of the need to stick
2427     OP_BRAZERO in front of it, and because the group appears once in the
2428     data, whereas in other cases it appears the minimum number of times. For
2429     this reason, it is simplest to treat this case separately, as otherwise
2430 nigel 53 the code gets far too messy. There are several special subcases when the
2431 nigel 31 minimum is zero. */
2432    
2433     if (repeat_min == 0)
2434     {
2435     /* If the maximum is also zero, we just omit the group from the output
2436     altogether. */
2437    
2438     if (repeat_max == 0)
2439     {
2440     code = previous;
2441 nigel 37 goto END_REPEAT;
2442 nigel 31 }
2443    
2444     /* If the maximum is 1 or unlimited, we just have to stick in the
2445     BRAZERO and do no more at this point. */
2446    
2447     if (repeat_max <= 1)
2448     {
2449     memmove(previous+1, previous, len);
2450     code++;
2451     *previous++ = OP_BRAZERO + repeat_type;
2452     }
2453    
2454     /* If the maximum is greater than 1 and limited, we have to replicate
2455     in a nested fashion, sticking OP_BRAZERO before each set of brackets.
2456     The first one has to be handled carefully because it's the original
2457     copy, which has to be moved up. The remainder can be handled by code
2458     that is common with the non-zero minimum case below. We just have to
2459     adjust the value of repeat_max, since one less copy is required. */
2460    
2461     else
2462     {
2463     int offset;
2464 nigel 63 memmove(previous + 2 + LINK_SIZE, previous, len);
2465     code += 2 + LINK_SIZE;
2466 nigel 31 *previous++ = OP_BRAZERO + repeat_type;
2467     *previous++ = OP_BRA;
2468    
2469     /* We chain together the bracket offset fields that have to be
2470     filled in later when the ends of the brackets are reached. */
2471    
2472     offset = (bralink == NULL)? 0 : previous - bralink;
2473     bralink = previous;
2474 nigel 63 PUTINC(previous, 0, offset);
2475 nigel 31 }
2476    
2477     repeat_max--;
2478     }
2479    
2480     /* If the minimum is greater than zero, replicate the group as many
2481     times as necessary, and adjust the maximum to the number of subsequent
2482 nigel 63 copies that we need. If we set a first char from the group, and didn't
2483     set a required char, copy the latter from the former. */
2484 nigel 31
2485     else
2486     {
2487 nigel 63 if (repeat_min > 1)
2488 nigel 31 {
2489 nigel 63 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
2490     for (i = 1; i < repeat_min; i++)
2491     {
2492     memcpy(code, previous, len);
2493     code += len;
2494     }
2495 nigel 31 }
2496     if (repeat_max > 0) repeat_max -= repeat_min;
2497     }
2498    
2499     /* This code is common to both the zero and non-zero minimum cases. If
2500     the maximum is limited, it replicates the group in a nested fashion,
2501     remembering the bracket starts on a stack. In the case of a zero minimum,
2502     the first one was set up above. In all cases the repeat_max now specifies
2503     the number of additional copies needed. */
2504    
2505     if (repeat_max >= 0)
2506     {
2507     for (i = repeat_max - 1; i >= 0; i--)
2508     {
2509     *code++ = OP_BRAZERO + repeat_type;
2510    
2511     /* All but the final copy start a new nesting, maintaining the
2512     chain of brackets outstanding. */
2513    
2514     if (i != 0)
2515     {
2516     int offset;
2517     *code++ = OP_BRA;
2518     offset = (bralink == NULL)? 0 : code - bralink;
2519     bralink = code;
2520 nigel 63 PUTINC(code, 0, offset);
2521 nigel 31 }
2522    
2523     memcpy(code, previous, len);
2524     code += len;
2525     }
2526    
2527     /* Now chain through the pending brackets, and fill in their length
2528     fields (which are holding the chain links pro tem). */
2529    
2530     while (bralink != NULL)
2531     {
2532     int oldlinkoffset;
2533     int offset = code - bralink + 1;
2534     uschar *bra = code - offset;
2535 nigel 63 oldlinkoffset = GET(bra, 1);
2536 nigel 31 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
2537     *code++ = OP_KET;
2538 nigel 63 PUTINC(code, 0, offset);
2539     PUT(bra, 1, offset);
2540 nigel 31 }
2541     }
2542    
2543     /* If the maximum is unlimited, set a repeater in the final copy. We
2544     can't just offset backwards from the current code point, because we
2545     don't know if there's been an options resetting after the ket. The
2546     correct offset was computed above. */
2547    
2548     else code[-ketoffset] = OP_KETRMAX + repeat_type;
2549 nigel 3 }
2550    
2551     /* Else there's some kind of shambles */
2552    
2553     else
2554     {
2555     *errorptr = ERR11;
2556     goto FAILED;
2557     }
2558    
2559 nigel 63 /* If the character following a repeat is '+', we wrap the entire repeated
2560     item inside OP_ONCE brackets. This is just syntactic sugar, taken from
2561     Sun's Java package. The repeated item starts at tempcode, not at previous,
2562     which might be the first part of a string whose (former) last char we
2563     repeated. However, we don't support '+' after a greediness '?'. */
2564    
2565     if (possessive_quantifier)
2566     {
2567     int len = code - tempcode;
2568     memmove(tempcode + 1+LINK_SIZE, tempcode, len);
2569     code += 1 + LINK_SIZE;
2570     len += 1 + LINK_SIZE;
2571     tempcode[0] = OP_ONCE;
2572     *code++ = OP_KET;
2573     PUTINC(code, 0, len);
2574     PUT(tempcode, 1, len);
2575     }
2576    
2577 nigel 65 /* In all cases we no longer have a previous item. We also set the
2578     "follows varying string" flag for subsequently encountered reqbytes if
2579     it isn't already set and we have just passed a varying length item. */
2580 nigel 3
2581 nigel 37 END_REPEAT:
2582 nigel 3 previous = NULL;
2583 nigel 65 cd->req_varyopt |= reqvary;
2584 nigel 3 break;
2585    
2586    
2587 nigel 23 /* Start of nested bracket sub-expression, or comment or lookahead or
2588     lookbehind or option setting or condition. First deal with special things
2589     that can come after a bracket; all are introduced by ?, and the appearance
2590     of any of them means that this is not a referencing group. They were
2591     checked for validity in the first pass over the string, so we don't have to
2592     check for syntax errors here. */
2593 nigel 3
2594     case '(':
2595 nigel 23 newoptions = options;
2596 nigel 53 skipbytes = 0;
2597 nigel 23
2598 nigel 3 if (*(++ptr) == '?')
2599     {
2600 nigel 23 int set, unset;
2601     int *optset;
2602 nigel 3
2603     switch (*(++ptr))
2604     {
2605 nigel 23 case '#': /* Comment; skip to ket */
2606 nigel 3 ptr++;
2607     while (*ptr != ')') ptr++;
2608     continue;
2609    
2610     case ':': /* Non-extracting bracket */
2611 nigel 23 bravalue = OP_BRA;
2612 nigel 3 ptr++;
2613     break;
2614    
2615 nigel 23 case '(':
2616     bravalue = OP_COND; /* Conditional group */
2617 nigel 63
2618     /* Condition to test for recursion */
2619    
2620     if (ptr[1] == 'R')
2621 nigel 23 {
2622 nigel 63 code[1+LINK_SIZE] = OP_CREF;
2623     PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
2624     skipbytes = 3;
2625     ptr += 3;
2626     }
2627    
2628 nigel 69 /* Condition to test for a numbered subpattern match. We know that
2629     if a digit follows ( then there will just be digits until ) because
2630     the syntax was checked in the first pass. */
2631 nigel 63
2632 nigel 69 else if ((digitab[ptr[1]] && ctype_digit) != 0)
2633 nigel 63 {
2634 nigel 65 int condref; /* Don't amalgamate; some compilers */
2635     condref = *(++ptr) - '0'; /* grumble at autoincrement in declaration */
2636 nigel 23 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
2637 nigel 51 if (condref == 0)
2638     {
2639     *errorptr = ERR35;
2640     goto FAILED;
2641     }
2642 nigel 23 ptr++;
2643 nigel 63 code[1+LINK_SIZE] = OP_CREF;
2644     PUT2(code, 2+LINK_SIZE, condref);
2645 nigel 53 skipbytes = 3;
2646 nigel 23 }
2647 nigel 63 /* For conditions that are assertions, we just fall through, having
2648     set bravalue above. */
2649 nigel 23 break;
2650    
2651     case '=': /* Positive lookahead */
2652 nigel 3 bravalue = OP_ASSERT;
2653     ptr++;
2654     break;
2655    
2656 nigel 23 case '!': /* Negative lookahead */
2657 nigel 3 bravalue = OP_ASSERT_NOT;
2658     ptr++;
2659     break;
2660    
2661 nigel 23 case '<': /* Lookbehinds */
2662     switch (*(++ptr))
2663 nigel 3 {
2664 nigel 23 case '=': /* Positive lookbehind */
2665     bravalue = OP_ASSERTBACK;
2666 nigel 3 ptr++;
2667     break;
2668 nigel 23
2669     case '!': /* Negative lookbehind */
2670     bravalue = OP_ASSERTBACK_NOT;
2671     ptr++;
2672     break;
2673 nigel 3 }
2674 nigel 23 break;
2675 nigel 3
2676 nigel 23 case '>': /* One-time brackets */
2677     bravalue = OP_ONCE;
2678     ptr++;
2679     break;
2680    
2681 nigel 63 case 'C': /* Callout - may be followed by digits */
2682     *code++ = OP_CALLOUT;
2683     {
2684     int n = 0;
2685 nigel 69 while ((digitab[*(++ptr)] & ctype_digit) != 0)
2686 nigel 63 n = n * 10 + *ptr - '0';
2687     if (n > 255)
2688     {
2689     *errorptr = ERR38;
2690     goto FAILED;
2691     }
2692     *code++ = n;
2693     }
2694     previous = NULL;
2695     continue;
2696    
2697     case 'P': /* Named subpattern handling */
2698     if (*(++ptr) == '<') /* Definition */
2699     {
2700     int i, namelen;
2701     uschar *slot = cd->name_table;
2702 nigel 65 const uschar *name; /* Don't amalgamate; some compilers */
2703     name = ++ptr; /* grumble at autoincrement in declaration */
2704 nigel 63
2705     while (*ptr++ != '>');
2706     namelen = ptr - name - 1;
2707    
2708     for (i = 0; i < cd->names_found; i++)
2709     {
2710 nigel 67 int crc = memcmp(name, slot+2, namelen);
2711     if (crc == 0)
2712 nigel 63 {
2713 nigel 65 if (slot[2+namelen] == 0)
2714     {
2715     *errorptr = ERR43;
2716     goto FAILED;
2717     }
2718 nigel 67 crc = -1; /* Current name is substring */
2719 nigel 63 }
2720 nigel 67 if (crc < 0)
2721 nigel 63 {
2722     memmove(slot + cd->name_entry_size, slot,
2723     (cd->names_found - i) * cd->name_entry_size);
2724     break;
2725     }
2726     slot += cd->name_entry_size;
2727     }
2728    
2729     PUT2(slot, 0, *brackets + 1);
2730     memcpy(slot + 2, name, namelen);
2731     slot[2+namelen] = 0;
2732     cd->names_found++;
2733     goto NUMBERED_GROUP;
2734     }
2735    
2736     if (*ptr == '=' || *ptr == '>') /* Reference or recursion */
2737     {
2738     int i, namelen;
2739     int type = *ptr++;
2740     const uschar *name = ptr;
2741     uschar *slot = cd->name_table;
2742    
2743     while (*ptr != ')') ptr++;
2744     namelen = ptr - name;
2745    
2746     for (i = 0; i < cd->names_found; i++)
2747     {
2748 nigel 65 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
2749 nigel 63 slot += cd->name_entry_size;
2750     }
2751     if (i >= cd->names_found)
2752     {
2753     *errorptr = ERR15;
2754     goto FAILED;
2755     }
2756    
2757     recno = GET2(slot, 0);
2758    
2759     if (type == '>') goto HANDLE_RECURSION; /* A few lines below */
2760    
2761     /* Back reference */
2762    
2763     previous = code;
2764     *code++ = OP_REF;
2765     PUT2INC(code, 0, recno);
2766     cd->backref_map |= (recno < 32)? (1 << recno) : 1;
2767     if (recno > cd->top_backref) cd->top_backref = recno;
2768     continue;
2769     }
2770    
2771     /* Should never happen */
2772     break;
2773    
2774 nigel 43 case 'R': /* Pattern recursion */
2775 nigel 63 ptr++; /* Same as (?0) */
2776     /* Fall through */
2777    
2778     /* Recursion or "subroutine" call */
2779    
2780     case '0': case '1': case '2': case '3': case '4':
2781     case '5': case '6': case '7': case '8': case '9':
2782     {
2783     const uschar *called;
2784     recno = 0;
2785 nigel 69 while((digitab[*ptr] & ctype_digit) != 0)
2786 nigel 63 recno = recno * 10 + *ptr++ - '0';
2787    
2788     /* Come here from code above that handles a named recursion */
2789    
2790     HANDLE_RECURSION:
2791    
2792     previous = code;
2793    
2794     /* Find the bracket that is being referenced. Temporarily end the
2795     regex in case it doesn't exist. */
2796    
2797     *code = OP_END;
2798     called = (recno == 0)?
2799     cd->start_code : find_bracket(cd->start_code, utf8, recno);
2800    
2801     if (called == NULL)
2802     {
2803     *errorptr = ERR15;
2804     goto FAILED;
2805     }
2806    
2807     /* If the subpattern is still open, this is a recursive call. We
2808     check to see if this is a left recursion that could loop for ever,
2809     and diagnose that case. */
2810    
2811     if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
2812     {
2813     *errorptr = ERR40;
2814     goto FAILED;
2815     }
2816    
2817     /* Insert the recursion/subroutine item */
2818    
2819     *code = OP_RECURSE;
2820     PUT(code, 1, called - cd->start_code);
2821     code += 1 + LINK_SIZE;
2822     }
2823 nigel 43 continue;
2824    
2825 nigel 63 /* Character after (? not specially recognized */
2826    
2827 nigel 23 default: /* Option setting */
2828     set = unset = 0;
2829     optset = &set;
2830    
2831     while (*ptr != ')' && *ptr != ':')
2832     {
2833     switch (*ptr++)
2834     {
2835     case '-': optset = &unset; break;
2836    
2837     case 'i': *optset |= PCRE_CASELESS; break;
2838     case 'm': *optset |= PCRE_MULTILINE; break;
2839     case 's': *optset |= PCRE_DOTALL; break;
2840     case 'x': *optset |= PCRE_EXTENDED; break;
2841     case 'U': *optset |= PCRE_UNGREEDY; break;
2842     case 'X': *optset |= PCRE_EXTRA; break;
2843     }
2844     }
2845    
2846     /* Set up the changed option bits, but don't change anything yet. */
2847    
2848     newoptions = (options | set) & (~unset);
2849    
2850     /* If the options ended with ')' this is not the start of a nested
2851 nigel 63 group with option changes, so the options change at this level. Compile
2852     code to change the ims options if this setting actually changes any of
2853     them. We also pass the new setting back so that it can be put at the
2854     start of any following branches, and when this group ends (if we are in
2855     a group), a resetting item can be compiled.
2856 nigel 23
2857 nigel 63 Note that if this item is right at the start of the pattern, the
2858     options will have been abstracted and made global, so there will be no
2859     change to compile. */
2860    
2861 nigel 23 if (*ptr == ')')
2862     {
2863 nigel 63 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
2864 nigel 23 {
2865     *code++ = OP_OPT;
2866 nigel 63 *code++ = newoptions & PCRE_IMS;
2867 nigel 23 }
2868 nigel 63
2869     /* Change options at this level, and pass them back for use
2870     in subsequent branches. Reset the greedy defaults and the case
2871     value for firstbyte and reqbyte. */
2872    
2873     *optionsptr = options = newoptions;
2874     greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
2875     greedy_non_default = greedy_default ^ 1;
2876     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2877    
2878 nigel 23 previous = NULL; /* This item can't be repeated */
2879     continue; /* It is complete */
2880     }
2881    
2882     /* If the options ended with ':' we are heading into a nested group
2883     with possible change of options. Such groups are non-capturing and are
2884     not assertions of any kind. All we need to do is skip over the ':';
2885     the newoptions value is handled below. */
2886    
2887     bravalue = OP_BRA;
2888     ptr++;
2889 nigel 3 }
2890     }
2891    
2892 nigel 63 /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
2893     non-capturing and behave like (?:...) brackets */
2894    
2895     else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
2896     {
2897     bravalue = OP_BRA;
2898     }
2899    
2900 nigel 53 /* Else we have a referencing group; adjust the opcode. If the bracket
2901     number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
2902     arrange for the true number to follow later, in an OP_BRANUMBER item. */
2903 nigel 3
2904     else
2905     {
2906 nigel 63 NUMBERED_GROUP:
2907 nigel 53 if (++(*brackets) > EXTRACT_BASIC_MAX)
2908 nigel 3 {
2909 nigel 53 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
2910 nigel 63 code[1+LINK_SIZE] = OP_BRANUMBER;
2911     PUT2(code, 2+LINK_SIZE, *brackets);
2912 nigel 53 skipbytes = 3;
2913 nigel 3 }
2914 nigel 53 else bravalue = OP_BRA + *brackets;
2915 nigel 3 }
2916    
2917 nigel 23 /* Process nested bracketed re. Assertions may not be repeated, but other
2918     kinds can be. We copy code into a non-register variable in order to be able
2919     to pass its address because some compilers complain otherwise. Pass in a
2920     new setting for the ims options if they have changed. */
2921 nigel 3
2922 nigel 23 previous = (bravalue >= OP_ONCE)? code : NULL;
2923 nigel 3 *code = bravalue;
2924 nigel 23 tempcode = code;
2925 nigel 65 tempreqvary = cd->req_varyopt; /* Save value before bracket */
2926 nigel 23
2927     if (!compile_regex(
2928 nigel 63 newoptions, /* The complete new option state */
2929     options & PCRE_IMS, /* The previous ims option state */
2930 nigel 53 brackets, /* Extracting bracket count */
2931 nigel 23 &tempcode, /* Where to put code (updated) */
2932     &ptr, /* Input pointer (updated) */
2933     errorptr, /* Where to put an error message */
2934     (bravalue == OP_ASSERTBACK ||
2935     bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
2936 nigel 53 skipbytes, /* Skip over OP_COND/OP_BRANUMBER */
2937 nigel 63 &subfirstbyte, /* For possible first char */
2938     &subreqbyte, /* For possible last char */
2939     bcptr, /* Current branch chain */
2940 nigel 25 cd)) /* Tables block */
2941 nigel 23 goto FAILED;
2942    
2943     /* At the end of compiling, code is still pointing to the start of the
2944     group, while tempcode has been updated to point past the end of the group
2945     and any option resetting that may follow it. The pattern pointer (ptr)
2946     is on the bracket. */
2947    
2948     /* If this is a conditional bracket, check that there are no more than
2949     two branches in the group. */
2950    
2951 nigel 53 else if (bravalue == OP_COND)
2952 nigel 3 {
2953 nigel 23 uschar *tc = code;
2954 nigel 37 condcount = 0;
2955 nigel 23
2956     do {
2957 nigel 37 condcount++;
2958 nigel 63 tc += GET(tc,1);
2959 nigel 23 }
2960     while (*tc != OP_KET);
2961    
2962 nigel 37 if (condcount > 2)
2963 nigel 23 {
2964     *errorptr = ERR27;
2965 nigel 3 goto FAILED;
2966 nigel 23 }
2967 nigel 63
2968     /* If there is just one branch, we must not make use of its firstbyte or
2969     reqbyte, because this is equivalent to an empty second branch. */
2970    
2971     if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
2972 nigel 3 }
2973    
2974 nigel 63 /* Handle updating of the required and first characters. Update for normal
2975     brackets of all kinds, and conditions with two branches (see code above).
2976     If the bracket is followed by a quantifier with zero repeat, we have to
2977     back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
2978     main loop so that they can be accessed for the back off. */
2979 nigel 37
2980 nigel 63 zeroreqbyte = reqbyte;
2981     zerofirstbyte = firstbyte;
2982     groupsetfirstbyte = FALSE;
2983    
2984     if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
2985 nigel 37 {
2986 nigel 63 /* If we have not yet set a firstbyte in this branch, take it from the
2987     subpattern, remembering that it was set here so that a repeat of more
2988     than one can replicate it as reqbyte if necessary. If the subpattern has
2989     no firstbyte, set "none" for the whole branch. In both cases, a zero
2990     repeat forces firstbyte to "none". */
2991    
2992     if (firstbyte == REQ_UNSET)
2993     {
2994     if (subfirstbyte >= 0)
2995     {
2996     firstbyte = subfirstbyte;
2997     groupsetfirstbyte = TRUE;
2998     }
2999     else firstbyte = REQ_NONE;
3000     zerofirstbyte = REQ_NONE;
3001     }
3002    
3003     /* If firstbyte was previously set, convert the subpattern's firstbyte
3004 nigel 65 into reqbyte if there wasn't one, using the vary flag that was in
3005     existence beforehand. */
3006 nigel 63
3007 nigel 65 else if (subfirstbyte >= 0 && subreqbyte < 0)
3008     subreqbyte = subfirstbyte | tempreqvary;
3009 nigel 63
3010 nigel 65 /* If the subpattern set a required byte (or set a first byte that isn't
3011     really the first byte - see above), set it. */
3012 nigel 63
3013     if (subreqbyte >= 0) reqbyte = subreqbyte;
3014 nigel 37 }
3015    
3016 nigel 63 /* For a forward assertion, we take the reqbyte, if set. This can be
3017     helpful if the pattern that follows the assertion doesn't set a different
3018     char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
3019     for an assertion, however because it leads to incorrect effect for patterns
3020     such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
3021     of a firstbyte. This is overcome by a scan at the end if there's no
3022     firstbyte, looking for an asserted first char. */
3023    
3024     else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
3025    
3026 nigel 23 /* Now update the main code pointer to the end of the group. */
3027    
3028     code = tempcode;
3029    
3030     /* Error if hit end of pattern */
3031    
3032 nigel 3 if (*ptr != ')')
3033     {
3034     *errorptr = ERR14;
3035     goto FAILED;
3036     }
3037     break;
3038    
3039     /* Check \ for being a real metacharacter; if not, fall through and handle
3040     it as a data character at the start of a string. Escape items are checked
3041     for validity in the pre-compiling pass. */
3042    
3043     case '\\':
3044 nigel 23 tempptr = ptr;
3045 nigel 71 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
3046 nigel 3
3047     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
3048     are arranged to be the negation of the corresponding OP_values. For the
3049     back references, the values are ESC_REF plus the reference number. Only
3050     back references and those types that consume a character may be repeated.
3051     We can test for values between ESC_b and ESC_Z for the latter; this may
3052     have to change if any new ones are ever created. */
3053    
3054     if (c < 0)
3055     {
3056 nigel 63 if (-c == ESC_Q) /* Handle start of quoted string */
3057     {
3058     if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
3059     else inescq = TRUE;
3060     continue;
3061     }
3062    
3063     /* For metasequences that actually match a character, we disable the
3064     setting of a first character if it hasn't already been set. */
3065    
3066     if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
3067     firstbyte = REQ_NONE;
3068    
3069     /* Set values to reset to if this is followed by a zero repeat. */
3070    
3071     zerofirstbyte = firstbyte;
3072     zeroreqbyte = reqbyte;
3073    
3074     /* Back references are handled specially */
3075    
3076 nigel 3 if (-c >= ESC_REF)
3077     {
3078 nigel 53 int number = -c - ESC_REF;
3079 nigel 3 previous = code;
3080     *code++ = OP_REF;
3081 nigel 63 PUT2INC(code, 0, number);
3082 nigel 3 }
3083     else
3084     {
3085 nigel 23 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
3086 nigel 3 *code++ = -c;
3087     }
3088     continue;
3089     }
3090    
3091 nigel 7 /* Data character: reset and fall through */
3092 nigel 3
3093 nigel 23 ptr = tempptr;
3094 nigel 3 c = '\\';
3095    
3096     /* Handle a run of data characters until a metacharacter is encountered.
3097     The first character is guaranteed not to be whitespace or # when the
3098     extended flag is set. */
3099    
3100     NORMAL_CHAR:
3101     default:
3102     previous = code;
3103     *code = OP_CHARS;
3104     code += 2;
3105     length = 0;
3106    
3107     do
3108     {
3109 nigel 63 /* If in \Q...\E, check for the end; if not, we always have a literal */
3110    
3111     if (inescq)
3112     {
3113     if (c == '\\' && ptr[1] == 'E')
3114     {
3115     inescq = FALSE;
3116     ptr++;
3117     }
3118     else
3119     {
3120     *code++ = c;
3121     length++;
3122     }
3123     continue;
3124     }
3125    
3126     /* Skip white space and comments for /x patterns */
3127    
3128 nigel 3 if ((options & PCRE_EXTENDED) != 0)
3129     {
3130 nigel 25 if ((cd->ctypes[c] & ctype_space) != 0) continue;
3131 nigel 3 if (c == '#')
3132     {
3133 nigel 47 /* The space before the ; is to avoid a warning on a silly compiler
3134     on the Macintosh. */
3135 nigel 53 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
3136 nigel 3 if (c == 0) break;
3137     continue;
3138     }
3139     }
3140    
3141     /* Backslash may introduce a data char or a metacharacter. Escaped items
3142     are checked for validity in the pre-compiling pass. Stop the string
3143     before a metaitem. */
3144    
3145     if (c == '\\')
3146     {
3147 nigel 23 tempptr = ptr;
3148 nigel 71 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
3149 nigel 23 if (c < 0) { ptr = tempptr; break; }
3150 nigel 49
3151     /* If a character is > 127 in UTF-8 mode, we have to turn it into
3152     two or more characters in the UTF-8 encoding. */
3153    
3154     #ifdef SUPPORT_UTF8
3155 nigel 63 if (utf8 && c > 127)
3156 nigel 49 {
3157     uschar buffer[8];
3158     int len = ord2utf8(c, buffer);
3159     for (c = 0; c < len; c++) *code++ = buffer[c];
3160     length += len;
3161     continue;
3162     }
3163     #endif
3164 nigel 3 }
3165    
3166     /* Ordinary character or single-char escape */
3167    
3168     *code++ = c;
3169     length++;
3170     }
3171    
3172     /* This "while" is the end of the "do" above. */
3173    
3174 nigel 49 while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
3175 nigel 3
3176 nigel 63 /* Update the first and last requirements. These are always bytes, even in
3177     UTF-8 mode. However, there is a special case to be considered when there
3178     are only one or two characters. Because this gets messy in UTF-8 mode, the
3179     code is kept separate. When we get here "length" contains the number of
3180     bytes. */
3181 nigel 37
3182 nigel 63 #ifdef SUPPORT_UTF8
3183     if (utf8 && length > 1)
3184     {
3185     uschar *t = previous + 3; /* After this code, t */
3186     while (t < code && (*t & 0xc0) == 0x80) t++; /* follows the 1st char */
3187 nigel 37
3188 nigel 63 /* Handle the case when there is only one multibyte character. It must
3189     have at least two bytes because of the "length > 1" test above. */
3190 nigel 3
3191 nigel 63 if (t == code)
3192     {
3193     /* If no previous first byte, set it from this character, but revert to
3194     none on a zero repeat. */
3195    
3196     if (firstbyte == REQ_UNSET)
3197     {
3198     zerofirstbyte = REQ_NONE;
3199     firstbyte = previous[2];
3200     }
3201    
3202     /* Otherwise, leave the first byte value alone, and don't change it on
3203     a zero repeat */
3204    
3205     else zerofirstbyte = firstbyte;
3206    
3207     /* In both cases, a zero repeat resets the previous required byte */
3208    
3209     zeroreqbyte = reqbyte;
3210     }
3211    
3212     /* Handle the case when there is more than one character. These may be
3213     single-byte or multibyte characters */
3214    
3215     else
3216     {
3217 nigel 67 t = code - 1; /* After this code, t is at the */
3218 nigel 63 while ((*t & 0xc0) == 0x80) t--; /* start of the last character */
3219    
3220     /* If no previous first byte, set it from the first character, and
3221     retain it on a zero repeat (of the last character). The required byte
3222     is reset on a zero repeat, either to the byte before the last
3223     character, unless this is the first byte of the string. In that case,
3224     it reverts to its previous value. */
3225    
3226     if (firstbyte == REQ_UNSET)
3227     {
3228     zerofirstbyte = firstbyte = previous[2] | req_caseopt;
3229 nigel 65 zeroreqbyte = (t - 1 == previous + 2)?
3230     reqbyte : t[-1] | req_caseopt | cd->req_varyopt;
3231 nigel 63 }
3232    
3233     /* If there was a previous first byte, leave it alone, and don't change
3234     it on a zero repeat. The required byte is reset on a zero repeat to the
3235     byte before the last character. */
3236    
3237     else
3238     {
3239     zerofirstbyte = firstbyte;
3240 nigel 65 zeroreqbyte = t[-1] | req_caseopt | cd->req_varyopt;
3241 nigel 63 }
3242     }
3243    
3244     /* In all cases (we know length > 1), the new required byte is the last
3245     byte of the string. */
3246    
3247 nigel 65 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3248 nigel 63 }
3249    
3250     else /* End of UTF-8 coding */
3251     #endif
3252    
3253     /* This is the code for non-UTF-8 operation, either without UTF-8 support,
3254     or when UTF-8 is not enabled. */
3255    
3256     {
3257     /* firstbyte was not previously set; take it from this string */
3258    
3259     if (firstbyte == REQ_UNSET)
3260     {
3261     if (length == 1)
3262     {
3263     zerofirstbyte = REQ_NONE;
3264     firstbyte = previous[2] | req_caseopt;
3265     zeroreqbyte = reqbyte;
3266     }
3267     else
3268     {
3269     zerofirstbyte = firstbyte = previous[2] | req_caseopt;
3270 nigel 65 zeroreqbyte = (length > 2)?
3271     (code[-2] | req_caseopt | cd->req_varyopt) : reqbyte;
3272     reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3273 nigel 63 }
3274     }
3275    
3276     /* firstbyte was previously set */
3277    
3278     else
3279     {
3280     zerofirstbyte = firstbyte;
3281 nigel 65 zeroreqbyte = (length == 1)? reqbyte :
3282     code[-2] | req_caseopt | cd->req_varyopt;
3283     reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3284 nigel 63 }
3285     }
3286    
3287     /* Set the length in the data vector, and advance to the next state. */
3288    
3289 nigel 3 previous[1] = length;
3290 nigel 49 if (length < MAXLIT) ptr--;
3291 nigel 3 break;
3292     }
3293     } /* end of big loop */
3294    
3295     /* Control never reaches here by falling through, only by a goto for all the
3296     error states. Pass back the position in the pattern so that it can be displayed
3297     to the user for diagnosing the error. */
3298    
3299     FAILED:
3300     *ptrptr = ptr;
3301     return FALSE;
3302     }
3303    
3304    
3305    
3306    
3307     /*************************************************
3308     * Compile sequence of alternatives *
3309     *************************************************/
3310    
3311     /* On entry, ptr is pointing past the bracket character, but on return
3312     it points to the closing bracket, or vertical bar, or end of string.
3313     The code variable is pointing at the byte into which the BRA operator has been
3314 nigel 23 stored. If the ims options are changed at the start (for a (?ims: group) or
3315     during any branch, we need to insert an OP_OPT item at the start of every
3316     following branch to ensure they get set correctly at run time, and also pass
3317     the new options into every subsequent branch compile.
3318 nigel 3
3319     Argument:
3320 nigel 63 options option bits, including any changes for this subpattern
3321     oldims previous settings of ims option bits
3322     brackets -> int containing the number of extracting brackets used
3323     codeptr -> the address of the current code pointer
3324     ptrptr -> the address of the current pattern pointer
3325     errorptr -> pointer to error message
3326     lookbehind TRUE if this is a lookbehind assertion
3327     skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER)
3328     firstbyteptr place to put the first required character, or a negative number
3329     reqbyteptr place to put the last required character, or a negative number
3330     bcptr pointer to the chain of currently open branches
3331     cd points to the data block with tables pointers etc.
3332 nigel 3
3333 nigel 23 Returns: TRUE on success
3334 nigel 3 */
3335    
3336     static BOOL
3337 nigel 63 compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3338 nigel 53 const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes,
3339 nigel 63 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
3340 nigel 3 {
3341 nigel 7 const uschar *ptr = *ptrptr;
3342 nigel 3 uschar *code = *codeptr;
3343 nigel 23 uschar *last_branch = code;
3344 nigel 3 uschar *start_bracket = code;
3345 nigel 23 uschar *reverse_count = NULL;
3346 nigel 63 int firstbyte, reqbyte;
3347     int branchfirstbyte, branchreqbyte;
3348     branch_chain bc;
3349 nigel 3
3350 nigel 63 bc.outer = bcptr;
3351     bc.current = code;
3352 nigel 23
3353 nigel 63 firstbyte = reqbyte = REQ_UNSET;
3354    
3355     /* Offset is set zero to mark that this bracket is still open */
3356    
3357     PUT(code, 1, 0);
3358     code += 1 + LINK_SIZE + skipbytes;
3359    
3360 nigel 23 /* Loop for each alternative branch */
3361    
3362 nigel 3 for (;;)
3363     {
3364 nigel 63 /* Handle a change of ims options at the start of the branch */
3365 nigel 3
3366 nigel 63 if ((options & PCRE_IMS) != oldims)
3367 nigel 3 {
3368 nigel 23 *code++ = OP_OPT;
3369 nigel 63 *code++ = options & PCRE_IMS;
3370 nigel 23 }
3371    
3372     /* Set up dummy OP_REVERSE if lookbehind assertion */
3373    
3374     if (lookbehind)
3375     {
3376     *code++ = OP_REVERSE;
3377     reverse_count = code;
3378 nigel 63 PUTINC(code, 0, 0);
3379 nigel 23 }
3380    
3381     /* Now compile the branch */
3382    
3383 nigel 63 if (!compile_branch(&options, brackets, &code, &ptr, errorptr,
3384     &branchfirstbyte, &branchreqbyte, &bc, cd))
3385 nigel 23 {
3386 nigel 3 *ptrptr = ptr;
3387     return FALSE;
3388     }
3389    
3390 nigel 63 /* If this is the first branch, the firstbyte and reqbyte values for the
3391     branch become the values for the regex. */
3392 nigel 3
3393 nigel 63 if (*last_branch != OP_ALT)
3394     {
3395     firstbyte = branchfirstbyte;
3396     reqbyte = branchreqbyte;
3397     }
3398 nigel 3
3399 nigel 63 /* If this is not the first branch, the first char and reqbyte have to
3400 nigel 65 match the values from all the previous branches, except that if the previous
3401     value for reqbyte didn't have REQ_VARY set, it can still match, and we set
3402     REQ_VARY for the regex. */
3403 nigel 37
3404 nigel 63 else
3405 nigel 37 {
3406 nigel 63 /* If we previously had a firstbyte, but it doesn't match the new branch,
3407     we have to abandon the firstbyte for the regex, but if there was previously
3408     no reqbyte, it takes on the value of the old firstbyte. */
3409    
3410     if (firstbyte >= 0 && firstbyte != branchfirstbyte)
3411 nigel 37 {
3412 nigel 63 if (reqbyte < 0) reqbyte = firstbyte;
3413     firstbyte = REQ_NONE;
3414 nigel 37 }
3415    
3416 nigel 63 /* If we (now or from before) have no firstbyte, a firstbyte from the
3417     branch becomes a reqbyte if there isn't a branch reqbyte. */
3418 nigel 37
3419 nigel 63 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
3420     branchreqbyte = branchfirstbyte;
3421 nigel 37
3422 nigel 63 /* Now ensure that the reqbytes match */
3423    
3424 nigel 65 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
3425     reqbyte = REQ_NONE;
3426     else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
3427 nigel 63 }
3428    
3429 nigel 23 /* If lookbehind, check that this branch matches a fixed-length string,
3430     and put the length into the OP_REVERSE item. Temporarily mark the end of
3431     the branch with OP_END. */
3432    
3433     if (lookbehind)
3434     {
3435 nigel 63 int length;
3436 nigel 23 *code = OP_END;
3437 nigel 49 length = find_fixedlength(last_branch, options);
3438 nigel 23 DPRINTF(("fixed length = %d\n", length));
3439     if (length < 0)
3440     {
3441 nigel 63 *errorptr = (length == -2)? ERR36 : ERR25;
3442 nigel 23 *ptrptr = ptr;
3443     return FALSE;
3444     }
3445 nigel 63 PUT(reverse_count, 0, length);
3446 nigel 23 }
3447    
3448 nigel 63 /* Reached end of expression, either ')' or end of pattern. Go back through
3449     the alternative branches and reverse the chain of offsets, with the field in
3450     the BRA item now becoming an offset to the first alternative. If there are
3451     no alternatives, it points to the end of the group. The length in the
3452     terminating ket is always the length of the whole bracketed item. If any of
3453     the ims options were changed inside the group, compile a resetting op-code
3454     following, except at the very end of the pattern. Return leaving the pointer
3455     at the terminating char. */
3456 nigel 3
3457     if (*ptr != '|')
3458     {
3459 nigel 63 int length = code - last_branch;
3460     do
3461 nigel 23 {
3462 nigel 63 int prev_length = GET(last_branch, 1);
3463     PUT(last_branch, 1, length);
3464     length = prev_length;
3465     last_branch -= length;
3466 nigel 23 }
3467 nigel 63 while (length > 0);
3468 nigel 3
3469 nigel 63 /* Fill in the ket */
3470 nigel 3
3471 nigel 63 *code = OP_KET;
3472     PUT(code, 1, code - start_bracket);
3473     code += 1 + LINK_SIZE;
3474 nigel 3
3475 nigel 63 /* Resetting option if needed */
3476 nigel 3
3477 nigel 63 if ((options & PCRE_IMS) != oldims && *ptr == ')')
3478 nigel 23 {
3479 nigel 63 *code++ = OP_OPT;
3480     *code++ = oldims;
3481 nigel 23 }
3482    
3483 nigel 63 /* Set values to pass back */
3484 nigel 23
3485 nigel 63 *codeptr = code;
3486     *ptrptr = ptr;
3487     *firstbyteptr = firstbyte;
3488     *reqbyteptr = reqbyte;
3489     return TRUE;
3490     }
3491 nigel 35
3492 nigel 63 /* Another branch follows; insert an "or" node. Its length field points back
3493     to the previous branch while the bracket remains open. At the end the chain
3494     is reversed. It's done like this so that the start of the bracket has a
3495     zero offset until it is closed, making it possible to detect recursion. */
3496 nigel 23
3497 nigel 63 *code = OP_ALT;
3498     PUT(code, 1, code - last_branch);
3499     bc.current = last_branch = code;
3500     code += 1 + LINK_SIZE;
3501     ptr++;
3502 nigel 23 }
3503     /* Control never reaches here */
3504     }
3505    
3506    
3507    
3508    
3509     /*************************************************
3510 nigel 3 * Check for anchored expression *
3511     *************************************************/
3512    
3513     /* Try to find out if this is an anchored regular expression. Consider each
3514     alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
3515     all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
3516     it's anchored. However, if this is a multiline pattern, then only OP_SOD
3517     counts, since OP_CIRC can match in the middle.
3518    
3519 nigel 63 We can also consider a regex to be anchored if OP_SOM starts all its branches.
3520     This is the code for \G, which means "match at start of match position, taking
3521     into account the match offset".
3522    
3523 nigel 33 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
3524     because that will try the rest of the pattern at all possible matching points,
3525 nigel 63 so there is no point trying again.... er ....
3526 nigel 3
3527 nigel 63 .... except when the .* appears inside capturing parentheses, and there is a
3528     subsequent back reference to those parentheses. We haven't enough information
3529     to catch that case precisely.
3530    
3531     At first, the best we could do was to detect when .* was in capturing brackets
3532     and the highest back reference was greater than or equal to that level.
3533     However, by keeping a bitmap of the first 31 back references, we can catch some
3534     of the more common cases more precisely.
3535    
3536 nigel 23 Arguments:
3537 nigel 63 code points to start of expression (the bracket)
3538     options points to the options setting
3539     bracket_map a bitmap of which brackets we are inside while testing; this
3540     handles up to substring 31; after that we just have to take
3541     the less precise approach
3542     backref_map the back reference bitmap
3543 nigel 23
3544     Returns: TRUE or FALSE
3545 nigel 3 */
3546    
3547     static BOOL
3548 nigel 63 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
3549     unsigned int backref_map)
3550 nigel 3 {
3551     do {
3552 nigel 63 const uschar *scode =
3553     first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE);
3554 nigel 23 register int op = *scode;
3555 nigel 63
3556     /* Capturing brackets */
3557    
3558     if (op > OP_BRA)
3559     {
3560     int new_map;
3561     op -= OP_BRA;
3562     if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3563     new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3564     if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
3565     }
3566    
3567     /* Other brackets */
3568    
3569     else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3570     {
3571     if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
3572     }
3573    
3574     /* .* is not anchored unless DOTALL is set and it isn't in brackets that
3575     are or may be referenced. */
3576    
3577 nigel 33 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
3578     (*options & PCRE_DOTALL) != 0)
3579 nigel 63 {
3580     if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3581     }
3582    
3583     /* Check for explicit anchoring */
3584    
3585     else if (op != OP_SOD && op != OP_SOM &&
3586 nigel 23 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
3587     return FALSE;
3588 nigel 63 code += GET(code, 1);
3589 nigel 3 }
3590 nigel 63 while (*code == OP_ALT); /* Loop for each alternative */
3591 nigel 3 return TRUE;
3592     }
3593    
3594    
3595    
3596     /*************************************************
3597 nigel 33 * Check for starting with ^ or .* *
3598 nigel 3 *************************************************/
3599    
3600 nigel 33 /* This is called to find out if every branch starts with ^ or .* so that
3601     "first char" processing can be done to speed things up in multiline
3602     matching and for non-DOTALL patterns that start with .* (which must start at
3603 nigel 63 the beginning or after \n). As in the case of is_anchored() (see above), we
3604     have to take account of back references to capturing brackets that contain .*
3605     because in that case we can't make the assumption.
3606 nigel 3
3607 nigel 63 Arguments:
3608     code points to start of expression (the bracket)
3609     bracket_map a bitmap of which brackets we are inside while testing; this
3610     handles up to substring 31; after that we just have to take
3611     the less precise approach
3612     backref_map the back reference bitmap
3613    
3614     Returns: TRUE or FALSE
3615 nigel 3 */
3616    
3617     static BOOL
3618 nigel 63 is_startline(const uschar *code, unsigned int bracket_map,
3619     unsigned int backref_map)
3620 nigel 3 {
3621     do {
3622 nigel 63 const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0);
3623 nigel 23 register int op = *scode;
3624 nigel 63
3625     /* Capturing brackets */
3626    
3627     if (op > OP_BRA)
3628     {
3629     int new_map;
3630     op -= OP_BRA;
3631     if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3632     new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3633     if (!is_startline(scode, new_map, backref_map)) return FALSE;
3634     }
3635    
3636     /* Other brackets */
3637    
3638     else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3639     { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
3640    
3641     /* .* is not anchored unless DOTALL is set and it isn't in brackets that
3642     may be referenced. */
3643    
3644 nigel 33 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
3645 nigel 63 {
3646     if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3647     }
3648    
3649     /* Check for explicit circumflex */
3650    
3651 nigel 23 else if (op != OP_CIRC) return FALSE;
3652 nigel 63 code += GET(code, 1);
3653 nigel 3 }
3654 nigel 63 while (*code == OP_ALT); /* Loop for each alternative */
3655 nigel 3 return TRUE;
3656     }
3657    
3658    
3659    
3660     /*************************************************
3661 nigel 63 * Check for asserted fixed first char *
3662 nigel 3 *************************************************/
3663    
3664 nigel 63 /* During compilation, the "first char" settings from forward assertions are
3665     discarded, because they can cause conflicts with actual literals that follow.
3666     However, if we end up without a first char setting for an unanchored pattern,
3667     it is worth scanning the regex to see if there is an initial asserted first
3668     char. If all branches start with the same asserted char, or with a bracket all
3669     of whose alternatives start with the same asserted char (recurse ad lib), then
3670     we return that char, otherwise -1.
3671 nigel 3
3672 nigel 23 Arguments:
3673     code points to start of expression (the bracket)
3674     options pointer to the options (used to check casing changes)
3675 nigel 63 inassert TRUE if in an assertion
3676 nigel 23
3677     Returns: -1 or the fixed first char
3678 nigel 3 */
3679    
3680     static int
3681 nigel 63 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
3682 nigel 3 {
3683     register int c = -1;
3684 nigel 23 do {
3685     int d;
3686 nigel 63 const uschar *scode =
3687     first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS);
3688 nigel 23 register int op = *scode;
3689 nigel 3
3690 nigel 23 if (op >= OP_BRA) op = OP_BRA;
3691 nigel 3
3692 nigel 23 switch(op)
3693     {
3694     default:
3695     return -1;
3696 nigel 3
3697 nigel 23 case OP_BRA:
3698     case OP_ASSERT:
3699     case OP_ONCE:
3700     case OP_COND:
3701 nigel 63 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
3702     return -1;
3703 nigel 23 if (c < 0) c = d; else if (c != d) return -1;
3704     break;
3705 nigel 3
3706 nigel 23 case OP_EXACT: /* Fall through */
3707     scode++;
3708 nigel 3
3709 nigel 23 case OP_CHARS: /* Fall through */
3710     scode++;
3711    
3712     case OP_PLUS:
3713     case OP_MINPLUS:
3714 nigel 63 if (!inassert) return -1;
3715     if (c < 0)
3716     {
3717     c = scode[1];
3718     if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
3719     }
3720     else if (c != scode[1]) return -1;
3721 nigel 23 break;
3722     }
3723    
3724 nigel 63 code += GET(code, 1);
3725 nigel 23 }
3726 nigel 3 while (*code == OP_ALT);
3727     return c;
3728     }
3729    
3730    
3731    
3732 nigel 23
3733 nigel 71 #ifdef SUPPORT_UTF8
3734 nigel 3 /*************************************************
3735 nigel 71 * Validate a UTF-8 string *
3736     *************************************************/
3737    
3738     /* This function is called (optionally) at the start of compile or match, to
3739     validate that a supposed UTF-8 string is actually valid. The early check means
3740     that subsequent code can assume it is dealing with a valid string. The check
3741     can be turned off for maximum performance, but then consequences of supplying
3742     an invalid string are then undefined.
3743    
3744     Arguments:
3745     string points to the string
3746     length length of string, or -1 if the string is zero-terminated
3747    
3748     Returns: < 0 if the string is a valid UTF-8 string
3749     >= 0 otherwise; the value is the offset of the bad byte
3750     */
3751    
3752     static int
3753     valid_utf8(const uschar *string, int length)
3754     {
3755     register const uschar *p;
3756    
3757     if (length < 0)
3758     {
3759     for (p = string; *p != 0; p++);
3760     length = p - string;
3761     }
3762    
3763     for (p = string; length-- > 0; p++)
3764     {
3765     int ab;
3766     if (*p < 128) continue;
3767     if ((*p & 0xc0) != 0xc0) return p - string;
3768     ab = utf8_table4[*p & 0x3f]; /* Number of additional bytes */
3769     if (length < ab) return p - string;
3770     while (ab-- > 0)
3771     {
3772     if ((*(++p) & 0xc0) != 0x80) return p - string;
3773     length--;
3774     }
3775     }
3776    
3777     return -1;
3778     }
3779     #endif
3780    
3781    
3782    
3783     /*************************************************
3784 nigel 3 * Compile a Regular Expression *
3785     *************************************************/
3786    
3787     /* This function takes a string and returns a pointer to a block of store
3788     holding a compiled version of the expression.
3789    
3790     Arguments:
3791     pattern the regular expression
3792     options various option bits
3793     errorptr pointer to pointer to error text
3794     erroroffset ptr offset in pattern where error was detected
3795 nigel 25 tables pointer to character tables or NULL
3796 nigel 3
3797     Returns: pointer to compiled data block, or NULL on error,
3798     with errorptr and erroroffset set
3799     */
3800    
3801     pcre *
3802 nigel 7 pcre_compile(const char *pattern, int options, const char **errorptr,
3803 nigel 25 int *erroroffset, const unsigned char *tables)
3804 nigel 3 {
3805     real_pcre *re;
3806 nigel 63 int length = 1 + LINK_SIZE; /* For initial BRA plus length */
3807 nigel 3 int runlength;
3808 nigel 63 int c, firstbyte, reqbyte;
3809 nigel 3 int bracount = 0;
3810 nigel 23 int branch_extra = 0;
3811     int branch_newextra;
3812 nigel 63 int item_count = -1;
3813     int name_count = 0;
3814     int max_name_size = 0;
3815     #ifdef SUPPORT_UTF8
3816     int lastcharlength = 0;
3817     BOOL utf8;
3818     BOOL class_utf8;
3819     #endif
3820     BOOL inescq = FALSE;
3821 nigel 7 unsigned int brastackptr = 0;
3822 nigel 43 size_t size;
3823 nigel 7 uschar *code;
3824 nigel 63 const uschar *codestart;
3825 nigel 7 const uschar *ptr;
3826 nigel 25 compile_data compile_block;
3827 nigel 23 int brastack[BRASTACK_SIZE];
3828     uschar bralenstack[BRASTACK_SIZE];
3829 nigel 3
3830     /* We can't pass back an error message if errorptr is NULL; I guess the best we
3831     can do is just return NULL. */
3832    
3833     if (errorptr == NULL) return NULL;
3834     *errorptr = NULL;
3835    
3836     /* However, we can give a message for this error */
3837    
3838     if (erroroffset == NULL)
3839     {
3840     *errorptr = ERR16;
3841     return NULL;
3842     }
3843     *erroroffset = 0;
3844    
3845 nigel 63 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
3846    
3847     #ifdef SUPPORT_UTF8
3848     utf8 = (options & PCRE_UTF8) != 0;
3849 nigel 71 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
3850     (*erroroffset = valid_utf8((uschar *)pattern, -1)) >= 0)
3851     {
3852     *errorptr = ERR44;
3853     return NULL;
3854     }
3855 nigel 63 #else
3856     if ((options & PCRE_UTF8) != 0)
3857     {
3858     *errorptr = ERR32;
3859     return NULL;
3860     }
3861     #endif
3862    
3863 nigel 3 if ((options & ~PUBLIC_OPTIONS) != 0)
3864     {
3865     *errorptr = ERR17;
3866     return NULL;
3867     }
3868    
3869 nigel 25 /* Set up pointers to the individual character tables */
3870    
3871     if (tables == NULL) tables = pcre_default_tables;
3872     compile_block.lcc = tables + lcc_offset;
3873     compile_block.fcc = tables + fcc_offset;
3874     compile_block.cbits = tables + cbits_offset;
3875     compile_block.ctypes = tables + ctypes_offset;
3876    
3877 nigel 63 /* Maximum back reference and backref bitmap. This is updated for numeric
3878     references during the first pass, but for named references during the actual
3879     compile pass. The bitmap records up to 31 back references to help in deciding
3880     whether (.*) can be treated as anchored or not. */
3881    
3882     compile_block.top_backref = 0;
3883     compile_block.backref_map = 0;
3884    
3885 nigel 25 /* Reflect pattern for debugging output */
3886    
3887 nigel 9 DPRINTF(("------------------------------------------------------------------\n"));
3888     DPRINTF(("%s\n", pattern));
3889 nigel 3
3890     /* The first thing to do is to make a pass over the pattern to compute the
3891     amount of store required to hold the compiled code. This does not have to be
3892     perfect as long as errors are overestimates. At the same time we can detect any
3893 nigel 63 flag settings right at the start, and extract them. Make an attempt to correct
3894     for any counted white space if an "extended" flag setting appears late in the
3895     pattern. We can't be so clever for #-comments. */
3896 nigel 3
3897 nigel 7 ptr = (const uschar *)(pattern - 1);
3898 nigel 3 while ((c = *(++ptr)) != 0)
3899     {
3900     int min, max;
3901 nigel 63 int class_optcount;
3902 nigel 53 int bracket_length;
3903 nigel 63 int duplength;
3904 nigel 3
3905 nigel 63 /* If we are inside a \Q...\E sequence, all chars are literal */
3906    
3907     if (inescq) goto NORMAL_CHAR;
3908    
3909     /* Otherwise, first check for ignored whitespace and comments */
3910    
3911 nigel 23 if ((options & PCRE_EXTENDED) != 0)
3912 nigel 3 {
3913 nigel 25 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
3914 nigel 23 if (c == '#')
3915     {
3916 nigel 47 /* The space before the ; is to avoid a warning on a silly compiler
3917     on the Macintosh. */
3918 nigel 53 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
3919 nigel 63 if (c == 0) break;
3920 nigel 23 continue;
3921     }
3922 nigel 3 }
3923    
3924 nigel 63 item_count++; /* Is zero for the first non-comment item */
3925    
3926 nigel 3 switch(c)
3927     {
3928     /* A backslashed item may be an escaped "normal" character or a
3929     character type. For a "normal" character, put the pointers and
3930     character back so that tests for whitespace etc. in the input
3931     are done correctly. */
3932    
3933     case '\\':
3934     {
3935 nigel 7 const uschar *save_ptr = ptr;
3936 nigel 71 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
3937 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3938     if (c >= 0)
3939     {
3940     ptr = save_ptr;
3941     c = '\\';
3942     goto NORMAL_CHAR;
3943     }
3944     }
3945 nigel 63
3946     /* If \Q, enter "literal" mode */
3947    
3948     if (-c == ESC_Q)
3949     {
3950     inescq = TRUE;
3951     continue;
3952     }
3953    
3954     /* Other escapes need one byte, and are of length one for repeats */
3955    
3956 nigel 3 length++;
3957 nigel 63 #ifdef SUPPORT_UTF8
3958     lastcharlength = 1;
3959     #endif
3960 nigel 3
3961 nigel 53 /* A back reference needs an additional 2 bytes, plus either one or 5
3962 nigel 3 bytes for a repeat. We also need to keep the value of the highest
3963     back reference. */
3964    
3965     if (c <= -ESC_REF)
3966     {
3967     int refnum = -c - ESC_REF;
3968 nigel 63 compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
3969     if (refnum > compile_block.top_backref)
3970     compile_block.top_backref = refnum;
3971 nigel 53 length += 2; /* For single back reference */
3972 nigel 71 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
3973 nigel 3 {
3974 nigel 71 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
3975 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3976     if ((min == 0 && (max == 1 || max == -1)) ||
3977     (min == 1 && max == -1))
3978     length++;
3979     else length += 5;
3980     if (ptr[1] == '?') ptr++;
3981     }
3982     }
3983     continue;
3984    
3985 nigel 63 case '^': /* Single-byte metacharacters */
3986 nigel 3 case '.':
3987     case '$':
3988     length++;
3989 nigel 63 #ifdef SUPPORT_UTF8
3990     lastcharlength = 1;
3991     #endif
3992 nigel 3 continue;
3993    
3994 nigel 63 case '*': /* These repeats won't be after brackets; */
3995     case '+': /* those are handled separately */
3996     case '?':
3997     length++;
3998     goto POSESSIVE; /* A few lines below */
3999 nigel 3
4000 nigel 63 /* This covers the cases of braced repeats after a single char, metachar,
4001     class, or back reference. */
4002    
4003 nigel 3 case '{':
4004 nigel 71 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
4005     ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
4006 nigel 3 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4007 nigel 63
4008     /* These special cases just insert one extra opcode */
4009    
4010 nigel 3 if ((min == 0 && (max == 1 || max == -1)) ||
4011     (min == 1 && max == -1))
4012     length++;
4013 nigel 63
4014     /* These cases might insert additional copies of a preceding character. */
4015    
4016 nigel 3 else
4017     {
4018 nigel 63 #ifdef SUPPORT_UTF8
4019     /* In UTF-8 mode, we should find the length in lastcharlength */
4020     if (utf8)
4021     {
4022     if (min != 1)
4023     {
4024     length -= lastcharlength; /* Uncount the original char or metachar */
4025     if (min > 0) length += 3 + lastcharlength;
4026     }
4027     length += lastcharlength + ((max > 0)? 3 : 1);
4028     }
4029     else
4030     #endif
4031    
4032     /* Not UTF-8 mode: all characters are one byte */
4033     {
4034     if (min != 1)
4035     {
4036     length--; /* Uncount the original char or metachar */
4037     if (min > 0) length += 4;
4038     }
4039    
4040     length += (max > 0)? 4 : 2;
4041     }
4042 nigel 3 }
4043 nigel 63
4044     if (ptr[1] == '?') ptr++; /* Needs no extra length */
4045    
4046     POSESSIVE: /* Test for possessive quantifier */
4047     if (ptr[1] == '+')
4048     {
4049     ptr++;
4050     length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */
4051     }
4052 nigel 3 continue;
4053    
4054 nigel 23 /* An alternation contains an offset to the next branch or ket. If any ims
4055     options changed in the previous branch(es), and/or if we are in a
4056     lookbehind assertion, extra space will be needed at the start of the
4057     branch. This is handled by branch_extra. */
4058