/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 79 - (hide annotations) (download)
Sat Feb 24 21:40:52 2007 UTC (7 years, 5 months ago) by nigel
File MIME type: text/plain
File size: 159122 byte(s)
Load pcre-6.1 into code/trunk.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9     Copyright (c) 1997-2005 University of Cambridge
10    
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45     #include "pcre_internal.h"
46    
47    
48     /*************************************************
49     * Code parameters and static tables *
50     *************************************************/
51    
/* Upper limit on the number of items held on the nested bracket stacks while
compiling. The limit covers the nesting of every kind of parenthesis; it does
not restrict non-capturing parentheses that are not nested. The value is used
to dimension one int vector and one unsigned char vector at compile time, and
can be raised if deeper nesting is required. */

#define BRASTACK_SIZE 200
59    
60    
61     /* Table for handling escaped characters in the range '0'-'z'. Positive returns
62     are simple data values; negative values are for special things like \d and so
63     on. Zero means further processing is needed (for things like \x), or the escape
64     is invalid. */
65    
66     #if !EBCDIC /* This is the "normal" table for ASCII systems */
67     static const short int escapes[] = {
68     0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
69     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
70     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
71     0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
72     -ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
73     -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
74     '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
75     0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
76     -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
77     0, 0, -ESC_z /* x - z */
78     };
79    
80     #else /* This is the "abnormal" table for EBCDIC systems */
81     static const short int escapes[] = {
82     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
83     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
84     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
85     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
86     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
87     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
88     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
89     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
90     /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
91     /* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p,
92     /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
93     /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
94     /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
95     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
96     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
97     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
98     /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
99     /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
100     /* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0,
101     /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
102     /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
103     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
104     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
105     };
106     #endif
107    
108    
/* Names of the POSIX character classes. A companion table holds the length of
each name and ends with a zero-length entry that terminates the list. The
first three entries must be alpha, lower, and upper, because this is assumed
when handling case independence. */

static const char *const posix_names[] = {
  "alpha",
  "lower",
  "upper",
  "alnum",
  "ascii",
  "blank",
  "cntrl",
  "digit",
  "graph",
  "print",
  "punct",
  "space",
  "word",
  "xdigit"
};
117    
118     static const uschar posix_name_lengths[] = {
119     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
120    
121     /* Table of class bit maps for each POSIX class; up to three may be combined
122     to form the class. The table for [:blank:] is dynamically modified to remove
123     the vertical space characters. */
124    
125     static const int posix_class_maps[] = {
126     cbit_lower, cbit_upper, -1, /* alpha */
127     cbit_lower, -1, -1, /* lower */
128     cbit_upper, -1, -1, /* upper */
129     cbit_digit, cbit_lower, cbit_upper, /* alnum */
130     cbit_print, cbit_cntrl, -1, /* ascii */
131     cbit_space, -1, -1, /* blank - a GNU extension */
132     cbit_cntrl, -1, -1, /* cntrl */
133     cbit_digit, -1, -1, /* digit */
134     cbit_graph, -1, -1, /* graph */
135     cbit_print, -1, -1, /* print */
136     cbit_punct, -1, -1, /* punct */
137     cbit_space, -1, -1, /* space */
138     cbit_word, -1, -1, /* word - a Perl extension */
139     cbit_xdigit,-1, -1 /* xdigit */
140     };
141    
142    
/* Texts of the compile-time error messages, indexed by error number. The
element type is "char *" because the strings are handed to the outside world. */

static const char *error_texts[] = {
  /*  0 */ "no error",
  /*  1 */ "\\ at end of pattern",
  /*  2 */ "\\c at end of pattern",
  /*  3 */ "unrecognized character follows \\",
  /*  4 */ "numbers out of order in {} quantifier",
  /*  5 */ "number too big in {} quantifier",
  /*  6 */ "missing terminating ] for character class",
  /*  7 */ "invalid escape sequence in character class",
  /*  8 */ "range out of order in character class",
  /*  9 */ "nothing to repeat",
  /* 10 */ "operand of unlimited repeat could match the empty string",
  /* 11 */ "internal error: unexpected repeat",
  /* 12 */ "unrecognized character after (?",
  /* 13 */ "POSIX named classes are supported only within a class",
  /* 14 */ "missing )",
  /* 15 */ "reference to non-existent subpattern",
  /* 16 */ "erroffset passed as NULL",
  /* 17 */ "unknown option bit(s) set",
  /* 18 */ "missing ) after comment",
  /* 19 */ "parentheses nested too deeply",
  /* 20 */ "regular expression too large",
  /* 21 */ "failed to get memory",
  /* 22 */ "unmatched parentheses",
  /* 23 */ "internal error: code overflow",
  /* 24 */ "unrecognized character after (?<",
  /* 25 */ "lookbehind assertion is not fixed length",
  /* 26 */ "malformed number after (?(",
  /* 27 */ "conditional group contains more than two branches",
  /* 28 */ "assertion expected after (?(",
  /* 29 */ "(?R or (?digits must be followed by )",
  /* 30 */ "unknown POSIX class name",
  /* 31 */ "POSIX collating elements are not supported",
  /* 32 */ "this version of PCRE is not compiled with PCRE_UTF8 support",
  /* 33 */ "spare error",
  /* 34 */ "character value in \\x{...} sequence is too large",
  /* 35 */ "invalid condition (?(0)",
  /* 36 */ "\\C not allowed in lookbehind assertion",
  /* 37 */ "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
  /* 38 */ "number after (?C is > 255",
  /* 39 */ "closing ) for (?C expected",
  /* 40 */ "recursive call could loop indefinitely",
  /* 41 */ "unrecognized character after (?P",
  /* 42 */ "syntax error after (?P",
  /* 43 */ "two named groups have the same name",
  /* 44 */ "invalid UTF-8 string",
  /* 45 */ "support for \\P, \\p, and \\X has not been compiled",
  /* 46 */ "malformed \\P or \\p sequence",
  /* 47 */ "unknown property name after \\P or \\p"
};
205    
206    
/* Private table that identifies decimal and hexadecimal digits while a
pattern is being compiled. The tables in chartables depend on the locale and
may mark arbitrary characters as digits, whereas the compiling code expects
only 0-9 to be decimal digits and only 0-9, a-f, and A-F to be hexadecimal
digits. The table costs 256 bytes; it is kept because a lookup was found to be
a lot faster than character value tests in some simple timed cases, and some
applications need PCRE to compile efficiently as well as match efficiently.

The bit definitions are the same as in chartables:

  0x04   decimal digit
  0x08   hexadecimal digit

so ctype_digit and ctype_xdigit can be used to test the entries. */

#if !EBCDIC    /* The "normal" case, for ASCII systems */
static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x00 - 0x07 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x08 - 0x0f */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x10 - 0x17 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x18 - 0x1f */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x20 - 0x27   space - ' */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x28 - 0x2f   ( - / */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,   /* 0x30 - 0x37   0 - 7 */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x38 - 0x3f   8 - ? */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /* 0x40 - 0x47   @ - G */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x48 - 0x4f   H - O */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x50 - 0x57   P - W */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x58 - 0x5f   X - _ */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /* 0x60 - 0x67   ` - g */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x68 - 0x6f   h - o */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x70 - 0x77   p - w */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x78 - 0x7f   x - 127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x80 - 0x87 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x88 - 0x8f */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x90 - 0x97 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x98 - 0x9f */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xa0 - 0xa7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xa8 - 0xaf */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xb0 - 0xb7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xb8 - 0xbf */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xc0 - 0xc7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xc8 - 0xcf */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xd0 - 0xd7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xd8 - 0xdf */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xe0 - 0xe7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xe8 - 0xef */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xf0 - 0xf7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00    /* 0xf8 - 0xff */
  };

#else          /* The "abnormal" case, for EBCDIC systems */
static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x00 - 0x07 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x08 - 0x0f */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x10 - 0x17 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x18 - 0x1f */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x20 - 0x27 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x28 - 0x2f */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x30 - 0x37 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x38 - 0x3f */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x40 - 0x47 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x48 - 0x4f */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x50 - 0x57 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x58 - 0x5f */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x60 - 0x67 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x68 - 0x6f */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x70 - 0x77 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x78 - 0x7f */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /* 0x80 - 0x87   128 - g */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x88 - 0x8f   h - 143 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x90 - 0x97   144 - p */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x98 - 0x9f   q - 159 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xa0 - 0xa7   160 - x */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xa8 - 0xaf   y - 175 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xb0 - 0xb7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xb8 - 0xbf */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /* 0xc0 - 0xc7   { - G */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xc8 - 0xcf   H - 207 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xd0 - 0xd7   } - P */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xd8 - 0xdf   Q - 223 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xe0 - 0xe7   \ - X */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xe8 - 0xef   Y - 239 */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,   /* 0xf0 - 0xf7   0 - 7 */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00    /* 0xf8 - 0xff   8 - 255 */
  };

/* Partial duplicate of the chartable data for EBCDIC systems. */

static const unsigned char ebcdic_chartab[] =
  {
  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00,   /* 0x00 - 0x07 */
  0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00,   /* 0x08 - 0x0f */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,   /* 0x10 - 0x17 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x18 - 0x1f */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,   /* 0x20 - 0x27 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x28 - 0x2f */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x30 - 0x37 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x38 - 0x3f */
  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x40 - 0x47 */
  0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80,   /* 0x48 - 0x4f */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x50 - 0x57 */
  0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00,   /* 0x58 - 0x5f */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x60 - 0x67 */
  0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80,   /* 0x68 - 0x6f */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x70 - 0x77 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x78 - 0x7f */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,   /* 0x80 - 0x87   128 - g */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x88 - 0x8f   h - 143 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,   /* 0x90 - 0x97   144 - p */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x98 - 0x9f   q - 159 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,   /* 0xa0 - 0xa7   160 - x */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xa8 - 0xaf   y - 175 */
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xb0 - 0xb7 */
  0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00,   /* 0xb8 - 0xbf */
  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,   /* 0xc0 - 0xc7   { - G */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xc8 - 0xcf   H - 207 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,   /* 0xd0 - 0xd7   } - P */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xd8 - 0xdf   Q - 223 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,   /* 0xe0 - 0xe7   \ - X */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xe8 - 0xef   Y - 239 */
  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,   /* 0xf0 - 0xf7   0 - 7 */
  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00    /* 0xf8 - 0xff   8 - 255 */
  };
#endif
329    
330    
331     /* Definition to allow mutual recursion */
332    
333     static BOOL
334     compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,
335     int *, int *, branch_chain *, compile_data *);
336    
337    
338    
339     /*************************************************
340     * Handle escapes *
341     *************************************************/
342    
343     /* This function is called when a \ has been encountered. It either returns a
344     positive value for a simple escape such as \n, or a negative value which
345     encodes one of the more complicated things such as \d. When UTF-8 is enabled,
346     a positive value greater than 255 may be returned. On entry, ptr is pointing at
347     the \. On exit, it is on the final character of the escape sequence.
348    
349     Arguments:
350     ptrptr points to the pattern position pointer
351     errorcodeptr points to the errorcode variable
352     bracount number of previous extracting brackets
353     options the options bits
354     isclass TRUE if inside a character class
355    
356     Returns: zero or positive => a data character
357     negative => a special escape sequence
358     on error, errorptr is set
359     */
360    
361     static int
362     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
363     int options, BOOL isclass)
364     {
365     const uschar *ptr = *ptrptr;
366     int c, i;
367    
368     /* If backslash is at the end of the pattern, it's an error. */
369    
370     c = *(++ptr);
371     if (c == 0) *errorcodeptr = ERR1;
372    
373     /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
374     a table. A non-zero result is something that can be returned immediately.
375     Otherwise further processing may be required. */
376    
377     #if !EBCDIC /* ASCII coding */
378     else if (c < '0' || c > 'z') {} /* Not alphameric */
379     else if ((i = escapes[c - '0']) != 0) c = i;
380    
381     #else /* EBCDIC coding */
382     else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
383     else if ((i = escapes[c - 0x48]) != 0) c = i;
384     #endif
385    
386     /* Escapes that need further processing, or are illegal. */
387    
388     else
389     {
390     const uschar *oldptr;
391     switch (c)
392     {
393     /* A number of Perl escapes are not handled by PCRE. We give an explicit
394     error. */
395    
396     case 'l':
397     case 'L':
398     case 'N':
399     case 'u':
400     case 'U':
401     *errorcodeptr = ERR37;
402     break;
403    
404     /* The handling of escape sequences consisting of a string of digits
405     starting with one that is not zero is not straightforward. By experiment,
406     the way Perl works seems to be as follows:
407    
408     Outside a character class, the digits are read as a decimal number. If the
409     number is less than 10, or if there are that many previous extracting
410     left brackets, then it is a back reference. Otherwise, up to three octal
411     digits are read to form an escaped byte. Thus \123 is likely to be octal
412     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
413     value is greater than 377, the least significant 8 bits are taken. Inside a
414     character class, \ followed by a digit is always an octal number. */
415    
416     case '1': case '2': case '3': case '4': case '5':
417     case '6': case '7': case '8': case '9':
418    
419     if (!isclass)
420     {
421     oldptr = ptr;
422     c -= '0';
423     while ((digitab[ptr[1]] & ctype_digit) != 0)
424     c = c * 10 + *(++ptr) - '0';
425     if (c < 10 || c <= bracount)
426     {
427     c = -(ESC_REF + c);
428     break;
429     }
430     ptr = oldptr; /* Put the pointer back and fall through */
431     }
432    
433     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
434     generates a binary zero byte and treats the digit as a following literal.
435     Thus we have to pull back the pointer by one. */
436    
437     if ((c = *ptr) >= '8')
438     {
439     ptr--;
440     c = 0;
441     break;
442     }
443    
444     /* \0 always starts an octal number, but we may drop through to here with a
445     larger first octal digit. */
446    
447     case '0':
448     c -= '0';
449     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
450     c = c * 8 + *(++ptr) - '0';
451     c &= 255; /* Take least significant 8 bits */
452     break;
453    
454     /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
455     which can be greater than 0xff, but only if the ddd are hex digits. */
456    
457     case 'x':
458     #ifdef SUPPORT_UTF8
459     if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
460     {
461     const uschar *pt = ptr + 2;
462     register int count = 0;
463     c = 0;
464     while ((digitab[*pt] & ctype_xdigit) != 0)
465     {
466     int cc = *pt++;
467     count++;
468     #if !EBCDIC /* ASCII coding */
469     if (cc >= 'a') cc -= 32; /* Convert to upper case */
470     c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
471     #else /* EBCDIC coding */
472     if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
473     c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
474     #endif
475     }
476     if (*pt == '}')
477     {
478     if (c < 0 || count > 8) *errorcodeptr = ERR34;
479     ptr = pt;
480     break;
481     }
482     /* If the sequence of hex digits does not end with '}', then we don't
483     recognize this construct; fall through to the normal \x handling. */
484     }
485     #endif
486    
487     /* Read just a single hex char */
488    
489     c = 0;
490     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
491     {
492     int cc; /* Some compilers don't like ++ */
493     cc = *(++ptr); /* in initializers */
494     #if !EBCDIC /* ASCII coding */
495     if (cc >= 'a') cc -= 32; /* Convert to upper case */
496     c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
497     #else /* EBCDIC coding */
498     if (cc <= 'z') cc += 64; /* Convert to upper case */
499     c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
500     #endif
501     }
502     break;
503    
504     /* Other special escapes not starting with a digit are straightforward */
505    
506     case 'c':
507     c = *(++ptr);
508     if (c == 0)
509     {
510     *errorcodeptr = ERR2;
511     return 0;
512     }
513    
514     /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
515     is ASCII-specific, but then the whole concept of \cx is ASCII-specific.
516     (However, an EBCDIC equivalent has now been added.) */
517    
518     #if !EBCDIC /* ASCII coding */
519     if (c >= 'a' && c <= 'z') c -= 32;
520     c ^= 0x40;
521     #else /* EBCDIC coding */
522     if (c >= 'a' && c <= 'z') c += 64;
523     c ^= 0xC0;
524     #endif
525     break;
526    
527     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
528     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
529     for Perl compatibility, it is a literal. This code looks a bit odd, but
530     there used to be some cases other than the default, and there may be again
531     in future, so I haven't "optimized" it. */
532    
533     default:
534     if ((options & PCRE_EXTRA) != 0) switch(c)
535     {
536     default:
537     *errorcodeptr = ERR3;
538     break;
539     }
540     break;
541     }
542     }
543    
544     *ptrptr = ptr;
545     return c;
546     }
547    
548    
549    
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* Decode the property name that follows \P or \p. This function exists only
when PCRE is compiled with support for Unicode properties. On entry, the
pattern pointer addresses the P or p; on exit it addresses the final character
of the escape sequence (or the point where scanning stopped, on error).

The name is either a single character, or up to two characters in {},
optionally preceded by ^ for negation. It is looked up in _pcre_utt by binary
chop.

Arguments:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE;
                   it is left untouched if the pattern ends straight after
                   the P or p
  errorcodeptr   points to the error code variable; set to ERR46 for a
                   malformed sequence and ERR47 for an unknown name

Returns:         value from _pcre_utt for the name, or -1 on error
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *errorcodeptr)
{
  const uschar *ptr = *ptrptr;
  char name[4];
  int failure = ERR46;        /* Assume "malformed" until shown otherwise */
  int ch, len, lo, hi;

  ch = *(++ptr);
  if (ch == 0) goto FAILED;

  *negptr = FALSE;

  if (ch != '{')
  {
    /* Just one following character */
    name[0] = ch;
    name[1] = 0;
  }
  else
  {
    if (ptr[1] == '^')
    {
      *negptr = TRUE;
      ptr++;
    }

    /* Collect characters up to the closing brace, looking at no more than
    three of them. */

    len = 0;
    while (len < 3)
    {
      ch = *(++ptr);
      if (ch == 0) goto FAILED;
      if (ch == '}') break;
      name[len++] = ch;
    }

    /* No closing brace yet: the name is too long. Try to distinguish the
    error cases by looking for a brace further on. */

    if (ch != '}')
    {
      do ptr++; while (*ptr != 0 && *ptr != '}');
      if (*ptr == '}') failure = ERR47;
      goto FAILED;
    }

    name[len] = 0;
  }

  *ptrptr = ptr;

  /* Search for a recognized property name using binary chop */

  lo = 0;
  hi = _pcre_utt_size;

  while (lo < hi)
  {
    int mid = (lo + hi)/2;
    int cmp = strcmp(name, _pcre_utt[mid].name);
    if (cmp == 0) return _pcre_utt[mid].value;
    if (cmp > 0) lo = mid + 1; else hi = mid;
  }

  failure = ERR47;            /* Well-formed, but not a known name */

FAILED:
  *errorcodeptr = failure;
  *ptrptr = ptr;
  return -1;
}
#endif
639    
640    
641    
642    
643     /*************************************************
644     * Check for counted repeat *
645     *************************************************/
646    
647     /* This function is called when a '{' is encountered in a place where it might
648     start a quantifier. It looks ahead to see if it really is a quantifier or not.
649     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
650     where the ddds are digits.
651    
652     Arguments:
653     p pointer to the first char after '{'
654    
655     Returns: TRUE or FALSE
656     */
657    
658     static BOOL
659     is_counted_repeat(const uschar *p)
660     {
661     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
662     while ((digitab[*p] & ctype_digit) != 0) p++;
663     if (*p == '}') return TRUE;
664    
665     if (*p++ != ',') return FALSE;
666     if (*p == '}') return TRUE;
667    
668     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
669     while ((digitab[*p] & ctype_digit) != 0) p++;
670    
671     return (*p == '}');
672     }
673    
674    
675    
676     /*************************************************
677     * Read repeat counts *
678     *************************************************/
679    
680     /* Read an item of the form {n,m} and return the values. This is called only
681     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
682     so the syntax is guaranteed to be correct, but we need to check the values.
683    
684     Arguments:
685     p pointer to first char after '{'
686     minp pointer to int for min
687     maxp pointer to int for max
688     returned as -1 if no max
689     errorcodeptr points to error code variable
690    
691     Returns: pointer to '}' on success;
692     current ptr on error, with errorcodeptr set non-zero
693     */
694    
695     static const uschar *
696     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
697     {
698     int min = 0;
699     int max = -1;
700    
701     while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
702    
703     if (*p == '}') max = min; else
704     {
705     if (*(++p) != '}')
706     {
707     max = 0;
708     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
709     if (max < min)
710     {
711     *errorcodeptr = ERR4;
712     return p;
713     }
714     }
715     }
716    
717     /* Do paranoid checks, then fill in the required variables, and pass back the
718     pointer to the terminating '}'. */
719    
720     if (min > 65535 || max > 65535)
721     *errorcodeptr = ERR5;
722     else
723     {
724     *minp = min;
725     *maxp = max;
726     }
727     return p;
728     }
729    
730    
731    
732     /*************************************************
733     * Find first significant op code *
734     *************************************************/
735    
736     /* This is called by several functions that scan a compiled expression looking
737     for a fixed first character, or an anchoring op code etc. It skips over things
738     that do not influence this. For some calls, a change of option is important.
739     For some calls, it makes sense to skip negative forward and all backward
740     assertions, and also the \b assertion; for others it does not.
741    
742     Arguments:
743     code pointer to the start of the group
744     options pointer to external options
745     optbit the option bit whose changing is significant, or
746     zero if none are
747     skipassert TRUE if certain assertions are to be skipped
748    
749     Returns: pointer to the first significant opcode
750     */
751    
752     static const uschar*
753     first_significant_code(const uschar *code, int *options, int optbit,
754     BOOL skipassert)
755     {
756     for (;;)
757     {
758     switch ((int)*code)
759     {
760     case OP_OPT:
761     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
762     *options = (int)code[1];
763     code += 2;
764     break;
765    
766     case OP_ASSERT_NOT:
767     case OP_ASSERTBACK:
768     case OP_ASSERTBACK_NOT:
769     if (!skipassert) return code;
770     do code += GET(code, 1); while (*code == OP_ALT);
771     code += _pcre_OP_lengths[*code];
772     break;
773    
774     case OP_WORD_BOUNDARY:
775     case OP_NOT_WORD_BOUNDARY:
776     if (!skipassert) return code;
777     /* Fall through */
778    
779     case OP_CALLOUT:
780     case OP_CREF:
781     case OP_BRANUMBER:
782     code += _pcre_OP_lengths[*code];
783     break;
784    
785     default:
786     return code;
787     }
788     }
789     /* Control never reaches here */
790     }
791    
792    
793    
794    
795     /*************************************************
796     * Find the fixed length of a pattern *
797     *************************************************/
798    
799     /* Scan a pattern and compute the fixed length of subject that will match it,
800     if the length is fixed. This is needed for dealing with backward assertions.
801     In UTF8 mode, the result is in characters rather than bytes.
802    
803     Arguments:
804     code points to the start of the pattern (the bracket)
805     options the compiling options
806    
807     Returns: the fixed length, or -1 if there is no fixed length,
808     or -2 if \C was encountered
809     */
810    
811     static int
812     find_fixedlength(uschar *code, int options)
813     {
814     int length = -1;
815    
816     register int branchlength = 0;
817     register uschar *cc = code + 1 + LINK_SIZE;
818    
819     /* Scan along the opcodes for this branch. If we get to the end of the
820     branch, check the length against that of the other branches. */
821    
822     for (;;)
823     {
824     int d;
825     register int op = *cc;
826     if (op >= OP_BRA) op = OP_BRA;
827    
828     switch (op)
829     {
830     case OP_BRA:
831     case OP_ONCE:
832     case OP_COND:
833     d = find_fixedlength(cc, options);
834     if (d < 0) return d;
835     branchlength += d;
836     do cc += GET(cc, 1); while (*cc == OP_ALT);
837     cc += 1 + LINK_SIZE;
838     break;
839    
840     /* Reached end of a branch; if it's a ket it is the end of a nested
841     call. If it's ALT it is an alternation in a nested call. If it is
842     END it's the end of the outer call. All can be handled by the same code. */
843    
844     case OP_ALT:
845     case OP_KET:
846     case OP_KETRMAX:
847     case OP_KETRMIN:
848     case OP_END:
849     if (length < 0) length = branchlength;
850     else if (length != branchlength) return -1;
851     if (*cc != OP_ALT) return length;
852     cc += 1 + LINK_SIZE;
853     branchlength = 0;
854     break;
855    
856     /* Skip over assertive subpatterns */
857    
858     case OP_ASSERT:
859     case OP_ASSERT_NOT:
860     case OP_ASSERTBACK:
861     case OP_ASSERTBACK_NOT:
862     do cc += GET(cc, 1); while (*cc == OP_ALT);
863     /* Fall through */
864    
865     /* Skip over things that don't match chars */
866    
867     case OP_REVERSE:
868     case OP_BRANUMBER:
869     case OP_CREF:
870     case OP_OPT:
871     case OP_CALLOUT:
872     case OP_SOD:
873     case OP_SOM:
874     case OP_EOD:
875     case OP_EODN:
876     case OP_CIRC:
877     case OP_DOLL:
878     case OP_NOT_WORD_BOUNDARY:
879     case OP_WORD_BOUNDARY:
880     cc += _pcre_OP_lengths[*cc];
881     break;
882    
883     /* Handle literal characters */
884    
885     case OP_CHAR:
886     case OP_CHARNC:
887     branchlength++;
888     cc += 2;
889     #ifdef SUPPORT_UTF8
890     if ((options & PCRE_UTF8) != 0)
891     {
892     while ((*cc & 0xc0) == 0x80) cc++;
893     }
894     #endif
895     break;
896    
897     /* Handle exact repetitions. The count is already in characters, but we
898     need to skip over a multibyte character in UTF8 mode. */
899    
900     case OP_EXACT:
901     branchlength += GET2(cc,1);
902     cc += 4;
903     #ifdef SUPPORT_UTF8
904     if ((options & PCRE_UTF8) != 0)
905     {
906     while((*cc & 0x80) == 0x80) cc++;
907     }
908     #endif
909     break;
910    
911     case OP_TYPEEXACT:
912     branchlength += GET2(cc,1);
913     cc += 4;
914     break;
915    
916     /* Handle single-char matchers */
917    
918     case OP_PROP:
919     case OP_NOTPROP:
920     cc++;
921     /* Fall through */
922    
923     case OP_NOT_DIGIT:
924     case OP_DIGIT:
925     case OP_NOT_WHITESPACE:
926     case OP_WHITESPACE:
927     case OP_NOT_WORDCHAR:
928     case OP_WORDCHAR:
929     case OP_ANY:
930     branchlength++;
931     cc++;
932     break;
933    
934     /* The single-byte matcher isn't allowed */
935    
936     case OP_ANYBYTE:
937     return -2;
938    
939     /* Check a class for variable quantification */
940    
941     #ifdef SUPPORT_UTF8
942     case OP_XCLASS:
943     cc += GET(cc, 1) - 33;
944     /* Fall through */
945     #endif
946    
947     case OP_CLASS:
948     case OP_NCLASS:
949     cc += 33;
950    
951     switch (*cc)
952     {
953     case OP_CRSTAR:
954     case OP_CRMINSTAR:
955     case OP_CRQUERY:
956     case OP_CRMINQUERY:
957     return -1;
958    
959     case OP_CRRANGE:
960     case OP_CRMINRANGE:
961     if (GET2(cc,1) != GET2(cc,3)) return -1;
962     branchlength += GET2(cc,1);
963     cc += 5;
964     break;
965    
966     default:
967     branchlength++;
968     }
969     break;
970    
971     /* Anything else is variable length */
972    
973     default:
974     return -1;
975     }
976     }
977     /* Control never gets here */
978     }
979    
980    
981    
982    
983     /*************************************************
984     * Scan compiled regex for numbered bracket *
985     *************************************************/
986    
987     /* This little function scans through a compiled pattern until it finds a
988     capturing bracket with the given number.
989    
990     Arguments:
991     code points to start of expression
992     utf8 TRUE in UTF-8 mode
993     number the required bracket number
994    
995     Returns: pointer to the opcode for the bracket, or NULL if not found
996     */
997    
998     static const uschar *
999     find_bracket(const uschar *code, BOOL utf8, int number)
1000     {
1001     #ifndef SUPPORT_UTF8
1002     utf8 = utf8; /* Stop pedantic compilers complaining */
1003     #endif
1004    
1005     for (;;)
1006     {
1007     register int c = *code;
1008     if (c == OP_END) return NULL;
1009     else if (c > OP_BRA)
1010     {
1011     int n = c - OP_BRA;
1012     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1013     if (n == number) return (uschar *)code;
1014     code += _pcre_OP_lengths[OP_BRA];
1015     }
1016     else
1017     {
1018     code += _pcre_OP_lengths[c];
1019    
1020     #ifdef SUPPORT_UTF8
1021    
1022     /* In UTF-8 mode, opcodes that are followed by a character may be followed
1023     by a multi-byte character. The length in the table is a minimum, so we have
1024     to scan along to skip the extra bytes. All opcodes are less than 128, so we
1025     can use relatively efficient code. */
1026    
1027     if (utf8) switch(c)
1028     {
1029     case OP_CHAR:
1030     case OP_CHARNC:
1031     case OP_EXACT:
1032     case OP_UPTO:
1033     case OP_MINUPTO:
1034     case OP_STAR:
1035     case OP_MINSTAR:
1036     case OP_PLUS:
1037     case OP_MINPLUS:
1038     case OP_QUERY:
1039     case OP_MINQUERY:
1040     while ((*code & 0xc0) == 0x80) code++;
1041     break;
1042    
1043     /* XCLASS is used for classes that cannot be represented just by a bit
1044     map. This includes negated single high-valued characters. The length in
1045     the table is zero; the actual length is stored in the compiled code. */
1046    
1047     case OP_XCLASS:
1048     code += GET(code, 1) + 1;
1049     break;
1050     }
1051     #endif
1052     }
1053     }
1054     }
1055    
1056    
1057    
1058     /*************************************************
1059     * Scan compiled regex for recursion reference *
1060     *************************************************/
1061    
1062     /* This little function scans through a compiled pattern until it finds an
1063     instance of OP_RECURSE.
1064    
1065     Arguments:
1066     code points to start of expression
1067     utf8 TRUE in UTF-8 mode
1068    
1069     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1070     */
1071    
1072     static const uschar *
1073     find_recurse(const uschar *code, BOOL utf8)
1074     {
1075     #ifndef SUPPORT_UTF8
1076     utf8 = utf8; /* Stop pedantic compilers complaining */
1077     #endif
1078    
1079     for (;;)
1080     {
1081     register int c = *code;
1082     if (c == OP_END) return NULL;
1083     else if (c == OP_RECURSE) return code;
1084     else if (c > OP_BRA)
1085     {
1086     code += _pcre_OP_lengths[OP_BRA];
1087     }
1088     else
1089     {
1090     code += _pcre_OP_lengths[c];
1091    
1092     #ifdef SUPPORT_UTF8
1093    
1094     /* In UTF-8 mode, opcodes that are followed by a character may be followed
1095     by a multi-byte character. The length in the table is a minimum, so we have
1096     to scan along to skip the extra bytes. All opcodes are less than 128, so we
1097     can use relatively efficient code. */
1098    
1099     if (utf8) switch(c)
1100     {
1101     case OP_CHAR:
1102     case OP_CHARNC:
1103     case OP_EXACT:
1104     case OP_UPTO:
1105     case OP_MINUPTO:
1106     case OP_STAR:
1107     case OP_MINSTAR:
1108     case OP_PLUS:
1109     case OP_MINPLUS:
1110     case OP_QUERY:
1111     case OP_MINQUERY:
1112     while ((*code & 0xc0) == 0x80) code++;
1113     break;
1114    
1115     /* XCLASS is used for classes that cannot be represented just by a bit
1116     map. This includes negated single high-valued characters. The length in
1117     the table is zero; the actual length is stored in the compiled code. */
1118    
1119     case OP_XCLASS:
1120     code += GET(code, 1) + 1;
1121     break;
1122     }
1123     #endif
1124     }
1125     }
1126     }
1127    
1128    
1129    
1130     /*************************************************
1131     * Scan compiled branch for non-emptiness *
1132     *************************************************/
1133    
1134     /* This function scans through a branch of a compiled pattern to see whether it
1135     can match the empty string or not. It is called only from could_be_empty()
1136     below. Note that first_significant_code() skips over assertions. If we hit an
1137     unclosed bracket, we return "empty" - this means we've struck an inner bracket
1138     whose current branch will already have been scanned.
1139    
1140     Arguments:
1141     code points to start of search
1142     endcode points to where to stop
1143     utf8 TRUE if in UTF8 mode
1144    
1145     Returns: TRUE if what is matched could be empty
1146     */
1147    
1148     static BOOL
1149     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1150     {
1151     register int c;
1152     for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);
1153     code < endcode;
1154     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1155     {
1156     const uschar *ccode;
1157    
1158     c = *code;
1159    
1160     if (c >= OP_BRA)
1161     {
1162     BOOL empty_branch;
1163     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1164    
1165     /* Scan a closed bracket */
1166    
1167     empty_branch = FALSE;
1168     do
1169     {
1170     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1171     empty_branch = TRUE;
1172     code += GET(code, 1);
1173     }
1174     while (*code == OP_ALT);
1175     if (!empty_branch) return FALSE; /* All branches are non-empty */
1176     code += 1 + LINK_SIZE;
1177     c = *code;
1178     }
1179    
1180     else switch (c)
1181     {
1182     /* Check for quantifiers after a class */
1183    
1184     #ifdef SUPPORT_UTF8
1185     case OP_XCLASS:
1186     ccode = code + GET(code, 1);
1187     goto CHECK_CLASS_REPEAT;
1188     #endif
1189    
1190     case OP_CLASS:
1191     case OP_NCLASS:
1192     ccode = code + 33;
1193    
1194     #ifdef SUPPORT_UTF8
1195     CHECK_CLASS_REPEAT:
1196     #endif
1197    
1198     switch (*ccode)
1199     {
1200     case OP_CRSTAR: /* These could be empty; continue */
1201     case OP_CRMINSTAR:
1202     case OP_CRQUERY:
1203     case OP_CRMINQUERY:
1204     break;
1205    
1206     default: /* Non-repeat => class must match */
1207     case OP_CRPLUS: /* These repeats aren't empty */
1208     case OP_CRMINPLUS:
1209     return FALSE;
1210    
1211     case OP_CRRANGE:
1212     case OP_CRMINRANGE:
1213     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1214     break;
1215     }
1216     break;
1217    
1218     /* Opcodes that must match a character */
1219    
1220     case OP_PROP:
1221     case OP_NOTPROP:
1222     case OP_EXTUNI:
1223     case OP_NOT_DIGIT:
1224     case OP_DIGIT:
1225     case OP_NOT_WHITESPACE:
1226     case OP_WHITESPACE:
1227     case OP_NOT_WORDCHAR:
1228     case OP_WORDCHAR:
1229     case OP_ANY:
1230     case OP_ANYBYTE:
1231     case OP_CHAR:
1232     case OP_CHARNC:
1233     case OP_NOT:
1234     case OP_PLUS:
1235     case OP_MINPLUS:
1236     case OP_EXACT:
1237     case OP_NOTPLUS:
1238     case OP_NOTMINPLUS:
1239     case OP_NOTEXACT:
1240     case OP_TYPEPLUS:
1241     case OP_TYPEMINPLUS:
1242     case OP_TYPEEXACT:
1243     return FALSE;
1244    
1245     /* End of branch */
1246    
1247     case OP_KET:
1248     case OP_KETRMAX:
1249     case OP_KETRMIN:
1250     case OP_ALT:
1251     return TRUE;
1252    
1253     /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be
1254     followed by a multibyte character */
1255    
1256     #ifdef SUPPORT_UTF8
1257     case OP_STAR:
1258     case OP_MINSTAR:
1259     case OP_QUERY:
1260     case OP_MINQUERY:
1261     case OP_UPTO:
1262     case OP_MINUPTO:
1263     if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1264     break;
1265     #endif
1266     }
1267     }
1268    
1269     return TRUE;
1270     }
1271    
1272    
1273    
1274     /*************************************************
1275     * Scan compiled regex for non-emptiness *
1276     *************************************************/
1277    
1278     /* This function is called to check for left recursive calls. We want to check
1279     the current branch of the current pattern to see if it could match the empty
1280     string. If it could, we must look outwards for branches at other levels,
1281     stopping when we pass beyond the bracket which is the subject of the recursion.
1282    
1283     Arguments:
1284     code points to start of the recursion
1285     endcode points to where to stop (current RECURSE item)
1286     bcptr points to the chain of current (unclosed) branch starts
1287     utf8 TRUE if in UTF-8 mode
1288    
1289     Returns: TRUE if what is matched could be empty
1290     */
1291    
1292     static BOOL
1293     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1294     BOOL utf8)
1295     {
1296     while (bcptr != NULL && bcptr->current >= code)
1297     {
1298     if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1299     bcptr = bcptr->outer;
1300     }
1301     return TRUE;
1302     }
1303    
1304    
1305    
1306     /*************************************************
1307     * Check for POSIX class syntax *
1308     *************************************************/
1309    
1310     /* This function is called when the sequence "[:" or "[." or "[=" is
1311     encountered in a character class. It checks whether this is followed by an
1312     optional ^ and then a sequence of letters, terminated by a matching ":]" or
1313     ".]" or "=]".
1314    
1315     Argument:
1316     ptr pointer to the initial [
1317     endptr where to return the end pointer
1318     cd pointer to compile data
1319    
1320     Returns: TRUE or FALSE
1321     */
1322    
1323     static BOOL
1324     check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1325     {
1326     int terminator; /* Don't combine these lines; the Solaris cc */
1327     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1328     if (*(++ptr) == '^') ptr++;
1329     while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1330     if (*ptr == terminator && ptr[1] == ']')
1331     {
1332     *endptr = ptr;
1333     return TRUE;
1334     }
1335     return FALSE;
1336     }
1337    
1338    
1339    
1340    
1341     /*************************************************
1342     * Check POSIX class name *
1343     *************************************************/
1344    
1345     /* This function is called to check the name given in a POSIX-style class entry
1346     such as [:alnum:].
1347    
1348     Arguments:
1349     ptr points to the first letter
1350     len the length of the name
1351    
1352     Returns: a value representing the name, or -1 if unknown
1353     */
1354    
1355     static int
1356     check_posix_name(const uschar *ptr, int len)
1357     {
1358     register int yield = 0;
1359     while (posix_name_lengths[yield] != 0)
1360     {
1361     if (len == posix_name_lengths[yield] &&
1362     strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1363     yield++;
1364     }
1365     return -1;
1366     }
1367    
1368    
1369     /*************************************************
1370     * Adjust OP_RECURSE items in repeated group *
1371     *************************************************/
1372    
1373     /* OP_RECURSE items contain an offset from the start of the regex to the group
1374     that is referenced. This means that groups can be replicated for fixed
1375     repetition simply by copying (because the recursion is allowed to refer to
1376     earlier groups that are outside the current group). However, when a group is
1377     optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1378     it, after it has been compiled. This means that any OP_RECURSE items within it
1379     that refer to the group itself or any contained groups have to have their
1380     offsets adjusted. That is the job of this function. Before it is called, the
1381     partially compiled regex must be temporarily terminated with OP_END.
1382    
1383     Arguments:
1384     group points to the start of the group
1385     adjust the amount by which the group is to be moved
1386     utf8 TRUE in UTF-8 mode
1387     cd contains pointers to tables etc.
1388    
1389     Returns: nothing
1390     */
1391    
1392     static void
1393     adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)
1394     {
1395     uschar *ptr = group;
1396     while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1397     {
1398     int offset = GET(ptr, 1);
1399     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1400     ptr += 1 + LINK_SIZE;
1401     }
1402     }
1403    
1404    
1405    
1406     /*************************************************
1407     * Insert an automatic callout point *
1408     *************************************************/
1409    
1410     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1411     callout points before each pattern item.
1412    
1413     Arguments:
1414     code current code pointer
1415     ptr current pattern pointer
1416     cd pointers to tables etc
1417    
1418     Returns: new code pointer
1419     */
1420    
1421     static uschar *
1422     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1423     {
1424     *code++ = OP_CALLOUT;
1425     *code++ = 255;
1426     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1427     PUT(code, LINK_SIZE, 0); /* Default length */
1428     return code + 2*LINK_SIZE;
1429     }
1430    
1431    
1432    
1433     /*************************************************
1434     * Complete a callout item *
1435     *************************************************/
1436    
1437     /* A callout item contains the length of the next item in the pattern, which
1438     we can't fill in till after we have reached the relevant point. This is used
1439     for both automatic and manual callouts.
1440    
1441     Arguments:
1442     previous_callout points to previous callout item
1443     ptr current pattern pointer
1444     cd pointers to tables etc
1445    
1446     Returns: nothing
1447     */
1448    
1449     static void
1450     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1451     {
1452     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1453     PUT(previous_callout, 2 + LINK_SIZE, length);
1454     }
1455    
1456    
1457    
1458     #ifdef SUPPORT_UCP
1459     /*************************************************
1460     * Get othercase range *
1461     *************************************************/
1462    
1463     /* This function is passed the start and end of a class range, in UTF-8 mode
1464     with UCP support. It searches up the characters, looking for internal ranges of
1465     characters in the "other" case. Each call returns the next one, updating the
1466     start address.
1467    
1468     Arguments:
1469     cptr points to starting character value; updated
1470     d end value
1471     ocptr where to put start of othercase range
1472     odptr where to put end of othercase range
1473    
1474     Yield: TRUE when range returned; FALSE when no more
1475     */
1476    
1477     static BOOL
1478     get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)
1479     {
1480     int c, chartype, othercase, next;
1481    
1482     for (c = *cptr; c <= d; c++)
1483     {
1484     if (_pcre_ucp_findchar(c, &chartype, &othercase) == ucp_L && othercase != 0)
1485     break;
1486     }
1487    
1488     if (c > d) return FALSE;
1489    
1490     *ocptr = othercase;
1491     next = othercase + 1;
1492    
1493     for (++c; c <= d; c++)
1494     {
1495     if (_pcre_ucp_findchar(c, &chartype, &othercase) != ucp_L ||
1496     othercase != next)
1497     break;
1498     next++;
1499     }
1500    
1501     *odptr = next - 1;
1502     *cptr = c;
1503    
1504     return TRUE;
1505     }
1506     #endif /* SUPPORT_UCP */
1507    
1508    
1509     /*************************************************
1510     * Compile one branch *
1511     *************************************************/
1512    
1513     /* Scan the pattern, compiling it into the code vector. If the options are
1514     changed during the branch, the pointer is used to change the external options
1515     bits.
1516    
1517     Arguments:
1518     optionsptr pointer to the option bits
1519     brackets points to number of extracting brackets used
1520     codeptr points to the pointer to the current code point
1521     ptrptr points to the current pattern pointer
1522     errorcodeptr points to error code variable
1523     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
1524     reqbyteptr set to the last literal character required, else < 0
1525     bcptr points to current branch chain
1526     cd contains pointers to tables etc.
1527    
1528     Returns: TRUE on success
1529     FALSE, with *errorcodeptr set non-zero on error
1530     */
1531    
1532     static BOOL
1533     compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
1534     const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,
1535     int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
1536     {
1537     int repeat_type, op_type;
1538     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
1539     int bravalue = 0;
1540     int greedy_default, greedy_non_default;
1541     int firstbyte, reqbyte;
1542     int zeroreqbyte, zerofirstbyte;
1543     int req_caseopt, reqvary, tempreqvary;
1544     int condcount = 0;
1545     int options = *optionsptr;
1546     int after_manual_callout = 0;
1547     register int c;
1548     register uschar *code = *codeptr;
1549     uschar *tempcode;
1550     BOOL inescq = FALSE;
1551     BOOL groupsetfirstbyte = FALSE;
1552     const uschar *ptr = *ptrptr;
1553     const uschar *tempptr;
1554     uschar *previous = NULL;
1555     uschar *previous_callout = NULL;
1556     uschar classbits[32];
1557    
1558     #ifdef SUPPORT_UTF8
1559     BOOL class_utf8;
1560     BOOL utf8 = (options & PCRE_UTF8) != 0;
1561     uschar *class_utf8data;
1562     uschar utf8_char[6];
1563     #else
1564     BOOL utf8 = FALSE;
1565     #endif
1566    
1567     /* Set up the default and non-default settings for greediness */
1568    
1569     greedy_default = ((options & PCRE_UNGREEDY) != 0);
1570     greedy_non_default = greedy_default ^ 1;
1571    
1572     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
1573     matching encountered yet". It gets changed to REQ_NONE if we hit something that
1574     matches a non-fixed char first char; reqbyte just remains unset if we never
1575     find one.
1576    
1577     When we hit a repeat whose minimum is zero, we may have to adjust these values
1578     to take the zero repeat into account. This is implemented by setting them to
1579     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
1580     item types that can be repeated set these backoff variables appropriately. */
1581    
1582     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
1583    
1584     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
1585     according to the current setting of the caseless flag. REQ_CASELESS is a bit
1586     value > 255. It is added into the firstbyte or reqbyte variables to record the
1587     case status of the value. This is used only for ASCII characters. */
1588    
1589     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
1590    
1591     /* Switch on next character until the end of the branch */
1592    
1593     for (;; ptr++)
1594     {
1595     BOOL negate_class;
1596     BOOL possessive_quantifier;
1597     BOOL is_quantifier;
1598     int class_charcount;
1599     int class_lastchar;
1600     int newoptions;
1601     int recno;
1602     int skipbytes;
1603     int subreqbyte;
1604     int subfirstbyte;
1605     int mclength;
1606     uschar mcbuffer[8];
1607    
1608     /* Next byte in the pattern */
1609    
1610     c = *ptr;
1611    
1612     /* If in \Q...\E, check for the end; if not, we have a literal */
1613    
1614     if (inescq && c != 0)
1615     {
1616     if (c == '\\' && ptr[1] == 'E')
1617     {
1618     inescq = FALSE;
1619     ptr++;
1620     continue;
1621     }
1622     else
1623     {
1624     if (previous_callout != NULL)
1625     {
1626     complete_callout(previous_callout, ptr, cd);
1627     previous_callout = NULL;
1628     }
1629     if ((options & PCRE_AUTO_CALLOUT) != 0)
1630     {
1631     previous_callout = code;
1632     code = auto_callout(code, ptr, cd);
1633     }
1634     goto NORMAL_CHAR;
1635     }
1636     }
1637    
1638     /* Fill in length of a previous callout, except when the next thing is
1639     a quantifier. */
1640    
1641     is_quantifier = c == '*' || c == '+' || c == '?' ||
1642     (c == '{' && is_counted_repeat(ptr+1));
1643    
1644     if (!is_quantifier && previous_callout != NULL &&
1645     after_manual_callout-- <= 0)
1646     {
1647     complete_callout(previous_callout, ptr, cd);
1648     previous_callout = NULL;
1649     }
1650    
1651     /* In extended mode, skip white space and comments */
1652    
1653     if ((options & PCRE_EXTENDED) != 0)
1654     {
1655     if ((cd->ctypes[c] & ctype_space) != 0) continue;
1656     if (c == '#')
1657     {
1658     /* The space before the ; is to avoid a warning on a silly compiler
1659     on the Macintosh. */
1660     while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
1661     if (c != 0) continue; /* Else fall through to handle end of string */
1662     }
1663     }
1664    
1665     /* No auto callout for quantifiers. */
1666    
1667     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
1668     {
1669     previous_callout = code;
1670     code = auto_callout(code, ptr, cd);
1671     }
1672    
1673     switch(c)
1674     {
1675     /* The branch terminates at end of string, |, or ). */
1676    
1677     case 0:
1678     case '|':
1679     case ')':
1680     *firstbyteptr = firstbyte;
1681     *reqbyteptr = reqbyte;
1682     *codeptr = code;
1683     *ptrptr = ptr;
1684     return TRUE;
1685    
1686     /* Handle single-character metacharacters. In multiline mode, ^ disables
1687     the setting of any following char as a first character. */
1688    
1689     case '^':
1690     if ((options & PCRE_MULTILINE) != 0)
1691     {
1692     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1693     }
1694     previous = NULL;
1695     *code++ = OP_CIRC;
1696     break;
1697    
1698     case '$':
1699     previous = NULL;
1700     *code++ = OP_DOLL;
1701     break;
1702    
1703     /* There can never be a first char if '.' is first, whatever happens about
1704     repeats. The value of reqbyte doesn't change either. */
1705    
1706     case '.':
1707     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1708     zerofirstbyte = firstbyte;
1709     zeroreqbyte = reqbyte;
1710     previous = code;
1711     *code++ = OP_ANY;
1712     break;
1713    
1714     /* Character classes. If the included characters are all < 255 in value, we
1715     build a 32-byte bitmap of the permitted characters, except in the special
1716     case where there is only one such character. For negated classes, we build
1717     the map as usual, then invert it at the end. However, we use a different
1718     opcode so that data characters > 255 can be handled correctly.
1719    
1720     If the class contains characters outside the 0-255 range, a different
1721     opcode is compiled. It may optionally have a bit map for characters < 256,
1722     but those above are explicitly listed afterwards. A flag byte tells
1723     whether the bitmap is present, and whether this is a negated class or not.
1724     */
1725    
1726     case '[':
1727     previous = code;
1728    
1729     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
1730     they are encountered at the top level, so we'll do that too. */
1731    
1732     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1733     check_posix_syntax(ptr, &tempptr, cd))
1734     {
1735     *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
1736     goto FAILED;
1737     }
1738    
1739     /* If the first character is '^', set the negation flag and skip it. */
1740    
1741     if ((c = *(++ptr)) == '^')
1742     {
1743     negate_class = TRUE;
1744     c = *(++ptr);
1745     }
1746     else
1747     {
1748     negate_class = FALSE;
1749     }
1750    
1751     /* Keep a count of chars with values < 256 so that we can optimize the case
1752     of just a single character (as long as it's < 256). For higher valued UTF-8
1753     characters, we don't yet do any optimization. */
1754    
1755     class_charcount = 0;
1756     class_lastchar = -1;
1757    
1758     #ifdef SUPPORT_UTF8
1759     class_utf8 = FALSE; /* No chars >= 256 */
1760     class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
1761     #endif
1762    
1763     /* Initialize the 32-char bit map to all zeros. We have to build the
1764     map in a temporary bit of store, in case the class contains only 1
1765     character (< 256), because in that case the compiled code doesn't use the
1766     bit map. */
1767    
1768     memset(classbits, 0, 32 * sizeof(uschar));
1769    
1770     /* Process characters until ] is reached. By writing this as a "do" it
1771     means that an initial ] is taken as a data character. The first pass
1772     through the regex checked the overall syntax, so we don't need to be very
1773     strict here. At the start of the loop, c contains the first byte of the
1774     character. */
1775    
1776     do
1777     {
1778     #ifdef SUPPORT_UTF8
1779     if (utf8 && c > 127)
1780     { /* Braces are required because the */
1781     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
1782     }
1783     #endif
1784    
1785     /* Inside \Q...\E everything is literal except \E */
1786    
1787     if (inescq)
1788     {
1789     if (c == '\\' && ptr[1] == 'E')
1790     {
1791     inescq = FALSE;
1792     ptr++;
1793     continue;
1794     }
1795     else goto LONE_SINGLE_CHARACTER;
1796     }
1797    
1798     /* Handle POSIX class names. Perl allows a negation extension of the
1799     form [:^name:]. A square bracket that doesn't match the syntax is
1800     treated as a literal. We also recognize the POSIX constructions
1801     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1802     5.6 and 5.8 do. */
1803    
1804     if (c == '[' &&
1805     (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1806     check_posix_syntax(ptr, &tempptr, cd))
1807     {
1808     BOOL local_negate = FALSE;
1809     int posix_class, i;
1810     register const uschar *cbits = cd->cbits;
1811    
1812     if (ptr[1] != ':')
1813     {
1814     *errorcodeptr = ERR31;
1815     goto FAILED;
1816     }
1817    
1818     ptr += 2;
1819     if (*ptr == '^')
1820     {
1821     local_negate = TRUE;
1822     ptr++;
1823     }
1824    
1825     posix_class = check_posix_name(ptr, tempptr - ptr);
1826     if (posix_class < 0)
1827     {
1828     *errorcodeptr = ERR30;
1829     goto FAILED;
1830     }
1831    
1832     /* If matching is caseless, upper and lower are converted to
1833     alpha. This relies on the fact that the class table starts with
1834     alpha, lower, upper as the first 3 entries. */
1835    
1836     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1837     posix_class = 0;
1838    
1839     /* Or into the map we are building up to 3 of the static class
1840     tables, or their negations. The [:blank:] class sets up the same
1841     chars as the [:space:] class (all white space). We remove the vertical
1842     white space chars afterwards. */
1843    
1844     posix_class *= 3;
1845     for (i = 0; i < 3; i++)
1846     {
1847     BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;
1848     int taboffset = posix_class_maps[posix_class + i];
1849     if (taboffset < 0) break;
1850     if (local_negate)
1851     {
1852     if (i == 0)
1853     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];
1854     else
1855     for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];
1856     if (blankclass) classbits[1] |= 0x3c;
1857     }
1858     else
1859     {
1860     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];
1861     if (blankclass) classbits[1] &= ~0x3c;
1862     }
1863     }
1864    
1865     ptr = tempptr + 1;
1866     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
1867     continue; /* End of POSIX syntax handling */
1868     }
1869    
1870     /* Backslash may introduce a single character, or it may introduce one
1871     of the specials, which just set a flag. Escaped items are checked for
1872     validity in the pre-compiling pass. The sequence \b is a special case.
1873     Inside a class (and only there) it is treated as backspace. Elsewhere
1874     it marks a word boundary. Other escapes have preset maps ready to
1875     or into the one we are building. We assume they have more than one
1876     character in them, so set class_charcount bigger than one. */
1877    
1878     if (c == '\\')
1879     {
1880     c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);
1881    
1882     if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
1883     else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
1884     else if (-c == ESC_Q) /* Handle start of quoted string */
1885     {
1886     if (ptr[1] == '\\' && ptr[2] == 'E')
1887     {
1888     ptr += 2; /* avoid empty string */
1889     }
1890     else inescq = TRUE;
1891     continue;
1892     }
1893    
1894     if (c < 0)
1895     {
1896     register const uschar *cbits = cd->cbits;
1897     class_charcount += 2; /* Greater than 1 is what matters */
1898     switch (-c)
1899     {
1900     case ESC_d:
1901     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
1902     continue;
1903    
1904     case ESC_D:
1905     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
1906     continue;
1907    
1908     case ESC_w:
1909     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
1910     continue;
1911    
1912     case ESC_W:
1913     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
1914     continue;
1915    
1916     case ESC_s:
1917     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
1918     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
1919     continue;
1920    
1921     case ESC_S:
1922     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
1923     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
1924     continue;
1925    
1926     #ifdef SUPPORT_UCP
1927     case ESC_p:
1928     case ESC_P:
1929     {
1930     BOOL negated;
1931     int property = get_ucp(&ptr, &negated, errorcodeptr);
1932     if (property < 0) goto FAILED;
1933     class_utf8 = TRUE;
1934     *class_utf8data++ = ((-c == ESC_p) != negated)?
1935     XCL_PROP : XCL_NOTPROP;
1936     *class_utf8data++ = property;
1937     class_charcount -= 2; /* Not a < 256 character */
1938     }
1939     continue;
1940     #endif
1941    
1942     /* Unrecognized escapes are faulted if PCRE is running in its
1943     strict mode. By default, for compatibility with Perl, they are
1944     treated as literals. */
1945    
1946     default:
1947     if ((options & PCRE_EXTRA) != 0)
1948     {
1949     *errorcodeptr = ERR7;
1950     goto FAILED;
1951     }
1952     c = *ptr; /* The final character */
1953     class_charcount -= 2; /* Undo the default count from above */
1954     }
1955     }
1956    
1957     /* Fall through if we have a single character (c >= 0). This may be
1958     > 256 in UTF-8 mode. */
1959    
1960     } /* End of backslash handling */
1961    
1962     /* A single character may be followed by '-' to form a range. However,
1963     Perl does not permit ']' to be the end of the range. A '-' character
1964     here is treated as a literal. */
1965    
1966     if (ptr[1] == '-' && ptr[2] != ']')
1967     {
1968     int d;
1969     ptr += 2;
1970    
1971     #ifdef SUPPORT_UTF8
1972     if (utf8)
1973     { /* Braces are required because the */
1974     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
1975     }
1976     else
1977     #endif
1978     d = *ptr; /* Not UTF-8 mode */
1979    
1980     /* The second part of a range can be a single-character escape, but
1981     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1982     in such circumstances. */
1983    
1984     if (d == '\\')
1985     {
1986     const uschar *oldptr = ptr;
1987     d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);
1988    
1989     /* \b is backspace; \X is literal X; any other special means the '-'
1990     was literal */
1991    
1992     if (d < 0)
1993     {
1994     if (d == -ESC_b) d = '\b';
1995     else if (d == -ESC_X) d = 'X'; else
1996     {
1997     ptr = oldptr - 2;
1998     goto LONE_SINGLE_CHARACTER; /* A few lines below */
1999     }
2000     }
2001     }
2002    
2003     /* The check that the two values are in the correct order happens in
2004     the pre-pass. Optimize one-character ranges */
2005    
2006     if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2007    
2008     /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2009     matching, we have to use an XCLASS with extra data items. Caseless
2010     matching for characters > 127 is available only if UCP support is
2011     available. */
2012    
2013     #ifdef SUPPORT_UTF8
2014     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2015     {
2016     class_utf8 = TRUE;
2017    
2018     /* With UCP support, we can find the other case equivalents of
2019     the relevant characters. There may be several ranges. Optimize how
2020     they fit with the basic range. */
2021    
2022     #ifdef SUPPORT_UCP
2023     if ((options & PCRE_CASELESS) != 0)
2024     {
2025     int occ, ocd;
2026     int cc = c;
2027     int origd = d;
2028     while (get_othercase_range(&cc, origd, &occ, &ocd))
2029     {
2030     if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
2031    
2032     if (occ < c && ocd >= c - 1) /* Extend the basic range */
2033     { /* if there is overlap, */
2034     c = occ; /* noting that if occ < c */
2035     continue; /* we can't have ocd > d */
2036     } /* because a subrange is */
2037     if (ocd > d && occ <= d + 1) /* always shorter than */
2038     { /* the basic range. */
2039     d = ocd;
2040     continue;
2041     }
2042    
2043     if (occ == ocd)
2044     {
2045     *class_utf8data++ = XCL_SINGLE;
2046     }
2047     else
2048     {
2049     *class_utf8data++ = XCL_RANGE;
2050     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2051     }
2052     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2053     }
2054     }
2055     #endif /* SUPPORT_UCP */
2056    
2057     /* Now record the original range, possibly modified for UCP caseless
2058     overlapping ranges. */
2059    
2060     *class_utf8data++ = XCL_RANGE;
2061     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2062     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2063    
2064     /* With UCP support, we are done. Without UCP support, there is no
2065     caseless matching for UTF-8 characters > 127; we can use the bit map
2066     for the smaller ones. */
2067    
2068     #ifdef SUPPORT_UCP
2069     continue; /* With next character in the class */
2070     #else
2071     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2072    
2073     /* Adjust upper limit and fall through to set up the map */
2074    
2075     d = 127;
2076    
2077     #endif /* SUPPORT_UCP */
2078     }
2079     #endif /* SUPPORT_UTF8 */
2080    
2081     /* We use the bit map for all cases when not in UTF-8 mode; else
2082     ranges that lie entirely within 0-127 when there is UCP support; else
2083     for partial ranges without UCP support. */
2084    
2085     for (; c <= d; c++)
2086     {
2087     classbits[c/8] |= (1 << (c&7));
2088     if ((options & PCRE_CASELESS) != 0)
2089     {
2090     int uc = cd->fcc[c]; /* flip case */
2091     classbits[uc/8] |= (1 << (uc&7));
2092     }
2093     class_charcount++; /* in case a one-char range */
2094     class_lastchar = c;
2095     }
2096    
2097     continue; /* Go get the next char in the class */
2098     }
2099    
2100     /* Handle a lone single character - we can get here for a normal
2101     non-escape char, or after \ that introduces a single character or for an
2102     apparent range that isn't. */
2103    
2104     LONE_SINGLE_CHARACTER:
2105    
2106     /* Handle a character that cannot go in the bit map */
2107    
2108     #ifdef SUPPORT_UTF8
2109     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2110     {
2111     class_utf8 = TRUE;
2112     *class_utf8data++ = XCL_SINGLE;
2113     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2114    
2115     #ifdef SUPPORT_UCP
2116     if ((options & PCRE_CASELESS) != 0)
2117     {
2118     int chartype;
2119     int othercase;
2120     if (_pcre_ucp_findchar(c, &chartype, &othercase) >= 0 &&
2121     othercase > 0)
2122     {
2123     *class_utf8data++ = XCL_SINGLE;
2124     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
2125     }
2126     }
2127     #endif /* SUPPORT_UCP */
2128    
2129     }
2130     else
2131     #endif /* SUPPORT_UTF8 */
2132    
2133     /* Handle a single-byte character */
2134     {
2135     classbits[c/8] |= (1 << (c&7));
2136     if ((options & PCRE_CASELESS) != 0)
2137     {
2138     c = cd->fcc[c]; /* flip case */
2139     classbits[c/8] |= (1 << (c&7));
2140     }
2141     class_charcount++;
2142     class_lastchar = c;
2143     }
2144     }
2145    
2146     /* Loop until ']' reached; the check for end of string happens inside the
2147     loop. This "while" is the end of the "do" above. */
2148    
2149     while ((c = *(++ptr)) != ']' || inescq);
2150    
2151     /* If class_charcount is 1, we saw precisely one character whose value is
2152     less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2153     can optimize the negative case only if there were no characters >= 128
2154     because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2155     single-bytes only. This is an historical hangover. Maybe one day we can
2156     tidy these opcodes to handle multi-byte characters.
2157    
2158     The optimization throws away the bit map. We turn the item into a
2159     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2160     that OP_NOT does not support multibyte characters. In the positive case, it
2161     can cause firstbyte to be set. Otherwise, there can be no first char if
2162     this item is first, whatever repeat count may follow. In the case of
2163     reqbyte, save the previous value for reinstating. */
2164    
2165     #ifdef SUPPORT_UTF8
2166     if (class_charcount == 1 &&
2167     (!utf8 ||
2168     (!class_utf8 && (!negate_class || class_lastchar < 128))))
2169    
2170     #else
2171     if (class_charcount == 1)
2172     #endif
2173     {
2174     zeroreqbyte = reqbyte;
2175    
2176     /* The OP_NOT opcode works on one-byte characters only. */
2177    
2178     if (negate_class)
2179     {
2180     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2181     zerofirstbyte = firstbyte;
2182     *code++ = OP_NOT;
2183     *code++ = class_lastchar;
2184     break;
2185     }
2186    
2187     /* For a single, positive character, get the value into mcbuffer, and
2188     then we can handle this with the normal one-character code. */
2189    
2190     #ifdef SUPPORT_UTF8
2191     if (utf8 && class_lastchar > 127)
2192     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
2193     else
2194     #endif
2195     {
2196     mcbuffer[0] = class_lastchar;
2197     mclength = 1;
2198     }
2199     goto ONE_CHAR;
2200     } /* End of 1-char optimization */
2201    
2202     /* The general case - not the one-char optimization. If this is the first
2203     thing in the branch, there can be no first char setting, whatever the
2204     repeat count. Any reqbyte setting must remain unchanged after any kind of
2205     repeat. */
2206    
2207     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2208     zerofirstbyte = firstbyte;
2209     zeroreqbyte = reqbyte;
2210    
2211     /* If there are characters with values > 255, we have to compile an
2212     extended class, with its own opcode. If there are no characters < 256,
2213     we can omit the bitmap. */
2214    
2215     #ifdef SUPPORT_UTF8
2216     if (class_utf8)
2217     {
2218     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2219     *code++ = OP_XCLASS;
2220     code += LINK_SIZE;
2221     *code = negate_class? XCL_NOT : 0;
2222    
2223     /* If the map is required, install it, and move on to the end of
2224     the extra data */
2225    
2226     if (class_charcount > 0)
2227     {
2228     *code++ |= XCL_MAP;
2229     memcpy(code, classbits, 32);
2230     code = class_utf8data;
2231     }
2232    
2233     /* If the map is not required, slide down the extra data. */
2234    
2235     else
2236     {
2237     int len = class_utf8data - (code + 33);
2238     memmove(code + 1, code + 33, len);
2239     code += len + 1;
2240     }
2241    
2242     /* Now fill in the complete length of the item */
2243    
2244     PUT(previous, 1, code - previous);
2245     break; /* End of class handling */
2246     }
2247     #endif
2248    
2249     /* If there are no characters > 255, negate the 32-byte map if necessary,
2250     and copy it into the code vector. If this is the first thing in the branch,
2251     there can be no first char setting, whatever the repeat count. Any reqbyte
2252     setting must remain unchanged after any kind of repeat. */
2253    
2254     if (negate_class)
2255     {
2256     *code++ = OP_NCLASS;
2257     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2258     }
2259     else
2260     {
2261     *code++ = OP_CLASS;
2262     memcpy(code, classbits, 32);
2263     }
2264     code += 32;
2265     break;
2266    
2267     /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2268     has been tested above. */
2269    
2270     case '{':
2271     if (!is_quantifier) goto NORMAL_CHAR;
2272     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
2273     if (*errorcodeptr != 0) goto FAILED;
2274     goto REPEAT;
2275    
2276     case '*':
2277     repeat_min = 0;
2278     repeat_max = -1;
2279     goto REPEAT;
2280    
2281     case '+':
2282     repeat_min = 1;
2283     repeat_max = -1;
2284     goto REPEAT;
2285    
2286     case '?':
2287     repeat_min = 0;
2288     repeat_max = 1;
2289    
2290     REPEAT:
2291     if (previous == NULL)
2292     {
2293     *errorcodeptr = ERR9;
2294     goto FAILED;
2295     }
2296    
2297     if (repeat_min == 0)
2298     {
2299     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2300     reqbyte = zeroreqbyte; /* Ditto */
2301     }
2302    
2303     /* Remember whether this is a variable length repeat */
2304    
2305     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2306    
2307     op_type = 0; /* Default single-char op codes */
2308     possessive_quantifier = FALSE; /* Default not possessive quantifier */
2309    
2310     /* Save start of previous item, in case we have to move it up to make space
2311     for an inserted OP_ONCE for the additional '+' extension. */
2312    
2313     tempcode = previous;
2314    
2315     /* If the next character is '+', we have a possessive quantifier. This
2316     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2317     If the next character is '?' this is a minimizing repeat, by default,
2318     but if PCRE_UNGREEDY is set, it works the other way round. We change the
2319     repeat type to the non-default. */
2320    
2321     if (ptr[1] == '+')
2322     {
2323     repeat_type = 0; /* Force greedy */
2324     possessive_quantifier = TRUE;
2325     ptr++;
2326     }
2327     else if (ptr[1] == '?')
2328     {
2329     repeat_type = greedy_non_default;
2330     ptr++;
2331     }
2332     else repeat_type = greedy_default;
2333    
2334     /* If previous was a recursion, we need to wrap it inside brackets so that
2335     it can be replicated if necessary. */
2336    
2337     if (*previous == OP_RECURSE)
2338     {
2339     memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2340     code += 1 + LINK_SIZE;
2341     *previous = OP_BRA;
2342     PUT(previous, 1, code - previous);
2343     *code = OP_KET;
2344     PUT(code, 1, code - previous);
2345     code += 1 + LINK_SIZE;
2346     }
2347    
2348     /* If previous was a character match, abolish the item and generate a
2349     repeat item instead. If a char item has a minimum of more than one, ensure
2350     that it is set in reqbyte - it might not be if a sequence such as x{3} is
2351     the first thing in a branch because the x will have gone into firstbyte
2352     instead. */
2353    
2354     if (*previous == OP_CHAR || *previous == OP_CHARNC)
2355     {
2356     /* Deal with UTF-8 characters that take up more than one byte. It's
2357     easier to write this out separately than try to macrify it. Use c to
2358     hold the length of the character in bytes, plus 0x80 to flag that it's a
2359     length rather than a small character. */
2360    
2361     #ifdef SUPPORT_UTF8
2362     if (utf8 && (code[-1] & 0x80) != 0)
2363     {
2364     uschar *lastchar = code - 1;
2365     while((*lastchar & 0xc0) == 0x80) lastchar--;
2366     c = code - lastchar; /* Length of UTF-8 character */
2367     memcpy(utf8_char, lastchar, c); /* Save the char */
2368     c |= 0x80; /* Flag c as a length */
2369     }
2370     else
2371     #endif
2372    
2373     /* Handle the case of a single byte - either with no UTF8 support, or
2374     with UTF-8 disabled, or for a UTF-8 character < 128. */
2375    
2376     {
2377     c = code[-1];
2378     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2379     }
2380    
2381     goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
2382     }
2383    
2384     /* If previous was a single negated character ([^a] or similar), we use
2385     one of the special opcodes, replacing it. The code is shared with single-
2386     character repeats by setting op_type to add a suitable offset into
2387     repeat_type. OP_NOT is currently used only for single-byte chars. */
2388    
2389     else if (*previous == OP_NOT)
2390     {
2391     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
2392     c = previous[1];
2393     goto OUTPUT_SINGLE_REPEAT;
2394     }
2395    
2396     /* If previous was a character type match (\d or similar), abolish it and
2397     create a suitable repeat item. The code is shared with single-character
2398     repeats by setting op_type to add a suitable offset into repeat_type. Note
2399     that the Unicode property types will be present only when SUPPORT_UCP is
2400     defined, but we don't wrap the little bits of code here because it just
2401     makes it horribly messy. */
2402    
2403     else if (*previous < OP_EODN)
2404     {
2405     uschar *oldcode;
2406     int prop_type;
2407     op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
2408     c = *previous;
2409    
2410     OUTPUT_SINGLE_REPEAT:
2411     prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?
2412     previous[1] : -1;
2413    
2414     oldcode = code;
2415     code = previous; /* Usually overwrite previous item */
2416    
2417     /* If the maximum is zero then the minimum must also be zero; Perl allows
2418     this case, so we do too - by simply omitting the item altogether. */
2419    
2420     if (repeat_max == 0) goto END_REPEAT;
2421    
2422     /* All real repeats make it impossible to handle partial matching (maybe
2423     one day we will be able to remove this restriction). */
2424    
2425     if (repeat_max != 1) cd->nopartial = TRUE;
2426    
2427     /* Combine the op_type with the repeat_type */
2428    
2429     repeat_type += op_type;
2430    
2431     /* A minimum of zero is handled either as the special case * or ?, or as
2432     an UPTO, with the maximum given. */
2433    
2434     if (repeat_min == 0)
2435     {
2436     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
2437     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
2438     else
2439     {
2440     *code++ = OP_UPTO + repeat_type;
2441     PUT2INC(code, 0, repeat_max);
2442     }
2443     }
2444    
2445     /* A repeat minimum of 1 is optimized into some special cases. If the
2446     maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
2447     left in place and, if the maximum is greater than 1, we use OP_UPTO with
2448     one less than the maximum. */
2449    
2450     else if (repeat_min == 1)
2451     {
2452     if (repeat_max == -1)
2453     *code++ = OP_PLUS + repeat_type;
2454     else
2455     {
2456     code = oldcode; /* leave previous item in place */
2457     if (repeat_max == 1) goto END_REPEAT;
2458     *code++ = OP_UPTO + repeat_type;
2459     PUT2INC(code, 0, repeat_max - 1);
2460     }
2461     }
2462    
2463     /* The case {n,n} is just an EXACT, while the general case {n,m} is
2464     handled as an EXACT followed by an UPTO. */
2465    
2466     else
2467     {
2468     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
2469     PUT2INC(code, 0, repeat_min);
2470    
2471     /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
2472     we have to insert the character for the previous code. For a repeated
2473     Unicode property match, there is an extra byte that defines the
2474     required property. In UTF-8 mode, long characters have their length in
2475     c, with the 0x80 bit as a flag. */
2476    
2477     if (repeat_max < 0)
2478     {
2479     #ifdef SUPPORT_UTF8
2480     if (utf8 && c >= 128)
2481     {
2482     memcpy(code, utf8_char, c & 7);
2483     code += c & 7;
2484     }
2485     else
2486     #endif
2487     {
2488     *code++ = c;
2489     if (prop_type >= 0) *code++ = prop_type;
2490     }
2491     *code++ = OP_STAR + repeat_type;
2492     }
2493    
2494     /* Else insert an UPTO if the max is greater than the min, again
2495     preceded by the character, for the previously inserted code. */
2496    
2497     else if (repeat_max != repeat_min)
2498     {
2499     #ifdef SUPPORT_UTF8
2500     if (utf8 && c >= 128)
2501     {
2502     memcpy(code, utf8_char, c & 7);
2503     code += c & 7;
2504     }
2505     else
2506     #endif
2507     *code++ = c;
2508     if (prop_type >= 0) *code++ = prop_type;
2509     repeat_max -= repeat_min;
2510     *code++ = OP_UPTO + repeat_type;
2511     PUT2INC(code, 0, repeat_max);
2512     }
2513     }
2514    
2515     /* The character or character type itself comes last in all cases. */
2516    
2517     #ifdef SUPPORT_UTF8
2518     if (utf8 && c >= 128)
2519     {
2520     memcpy(code, utf8_char, c & 7);
2521     code += c & 7;
2522     }
2523     else
2524     #endif
2525     *code++ = c;
2526    
2527     /* For a repeated Unicode property match, there is an extra byte that
2528     defines the required property. */
2529    
2530     #ifdef SUPPORT_UCP
2531     if (prop_type >= 0) *code++ = prop_type;
2532     #endif
2533     }
2534    
2535     /* If previous was a character class or a back reference, we put the repeat
2536     stuff after it, but just skip the item if the repeat was {0,0}. */
2537    
2538     else if (*previous == OP_CLASS ||
2539     *previous == OP_NCLASS ||
2540     #ifdef SUPPORT_UTF8
2541     *previous == OP_XCLASS ||
2542     #endif
2543     *previous == OP_REF)
2544     {
2545     if (repeat_max == 0)
2546     {
2547     code = previous;
2548     goto END_REPEAT;
2549     }
2550    
2551     /* All real repeats make it impossible to handle partial matching (maybe
2552     one day we will be able to remove this restriction). */
2553    
2554     if (repeat_max != 1) cd->nopartial = TRUE;
2555    
2556     if (repeat_min == 0 && repeat_max == -1)
2557     *code++ = OP_CRSTAR + repeat_type;
2558     else if (repeat_min == 1 && repeat_max == -1)
2559     *code++ = OP_CRPLUS + repeat_type;
2560     else if (repeat_min == 0 && repeat_max == 1)
2561     *code++ = OP_CRQUERY + repeat_type;
2562     else
2563     {
2564     *code++ = OP_CRRANGE + repeat_type;
2565     PUT2INC(code, 0, repeat_min);
2566     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
2567     PUT2INC(code, 0, repeat_max);
2568     }
2569     }
2570    
2571     /* If previous was a bracket group, we may have to replicate it in certain
2572     cases. */
2573    
2574     else if (*previous >= OP_BRA || *previous == OP_ONCE ||
2575     *previous == OP_COND)
2576     {
2577     register int i;
2578     int ketoffset = 0;
2579     int len = code - previous;
2580     uschar *bralink = NULL;
2581    
2582     /* If the maximum repeat count is unlimited, find the end of the bracket
2583     by scanning through from the start, and compute the offset back to it
2584     from the current code pointer. There may be an OP_OPT setting following
2585     the final KET, so we can't find the end just by going back from the code
2586     pointer. */
2587    
2588     if (repeat_max == -1)
2589     {
2590     register uschar *ket = previous;
2591     do ket += GET(ket, 1); while (*ket != OP_KET);
2592     ketoffset = code - ket;
2593     }
2594    
2595     /* The case of a zero minimum is special because of the need to stick
2596     OP_BRAZERO in front of it, and because the group appears once in the
2597     data, whereas in other cases it appears the minimum number of times. For
2598     this reason, it is simplest to treat this case separately, as otherwise
2599     the code gets far too messy. There are several special subcases when the
2600     minimum is zero. */
2601    
2602     if (repeat_min == 0)
2603     {
2604     /* If the maximum is also zero, we just omit the group from the output
2605     altogether. */
2606    
2607     if (repeat_max == 0)
2608     {
2609     code = previous;
2610     goto END_REPEAT;
2611     }
2612    
2613     /* If the maximum is 1 or unlimited, we just have to stick in the
2614     BRAZERO and do no more at this point. However, we do need to adjust
2615     any OP_RECURSE calls inside the group that refer to the group itself or
2616     any internal group, because the offset is from the start of the whole
2617     regex. Temporarily terminate the pattern while doing this. */
2618    
2619     if (repeat_max <= 1)
2620     {
2621     *code = OP_END;
2622     adjust_recurse(previous, 1, utf8, cd);
2623     memmove(previous+1, previous, len);
2624     code++;
2625     *previous++ = OP_BRAZERO + repeat_type;
2626     }
2627    
2628     /* If the maximum is greater than 1 and limited, we have to replicate
2629     in a nested fashion, sticking OP_BRAZERO before each set of brackets.
2630     The first one has to be handled carefully because it's the original
2631     copy, which has to be moved up. The remainder can be handled by code
2632     that is common with the non-zero minimum case below. We have to
2633     adjust the value of repeat_max, since one less copy is required. Once
2634     again, we may have to adjust any OP_RECURSE calls inside the group. */
2635    
2636     else
2637     {
2638     int offset;
2639     *code = OP_END;
2640     adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);
2641     memmove(previous + 2 + LINK_SIZE, previous, len);
2642     code += 2 + LINK_SIZE;
2643     *previous++ = OP_BRAZERO + repeat_type;
2644     *previous++ = OP_BRA;
2645    
2646     /* We chain together the bracket offset fields that have to be
2647     filled in later when the ends of the brackets are reached. */
2648    
2649     offset = (bralink == NULL)? 0 : previous - bralink;
2650     bralink = previous;
2651     PUTINC(previous, 0, offset);
2652     }
2653    
2654     repeat_max--;
2655     }
2656    
2657     /* If the minimum is greater than zero, replicate the group as many
2658     times as necessary, and adjust the maximum to the number of subsequent
2659     copies that we need. If we set a first char from the group, and didn't
2660     set a required char, copy the latter from the former. */
2661    
2662     else
2663     {
2664     if (repeat_min > 1)
2665     {
2666     if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
2667     for (i = 1; i < repeat_min; i++)
2668     {
2669     memcpy(code, previous, len);
2670     code += len;
2671     }
2672     }
2673     if (repeat_max > 0) repeat_max -= repeat_min;
2674     }
2675    
2676     /* This code is common to both the zero and non-zero minimum cases. If
2677     the maximum is limited, it replicates the group in a nested fashion,
2678     remembering the bracket starts on a stack. In the case of a zero minimum,
2679     the first one was set up above. In all cases the repeat_max now specifies
2680     the number of additional copies needed. */
2681    
2682     if (repeat_max >= 0)
2683     {
2684     for (i = repeat_max - 1; i >= 0; i--)
2685     {
2686     *code++ = OP_BRAZERO + repeat_type;
2687    
2688     /* All but the final copy start a new nesting, maintaining the
2689     chain of brackets outstanding. */
2690    
2691     if (i != 0)
2692     {
2693     int offset;
2694     *code++ = OP_BRA;
2695     offset = (bralink == NULL)? 0 : code - bralink;
2696     bralink = code;
2697     PUTINC(code, 0, offset);
2698     }
2699    
2700     memcpy(code, previous, len);
2701     code += len;
2702     }
2703    
2704     /* Now chain through the pending brackets, and fill in their length
2705     fields (which are holding the chain links pro tem). */
2706    
2707     while (bralink != NULL)
2708     {
2709     int oldlinkoffset;
2710     int offset = code - bralink + 1;
2711     uschar *bra = code - offset;
2712     oldlinkoffset = GET(bra, 1);
2713     bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
2714     *code++ = OP_KET;
2715     PUTINC(code, 0, offset);
2716     PUT(bra, 1, offset);
2717     }
2718     }
2719    
2720     /* If the maximum is unlimited, set a repeater in the final copy. We
2721     can't just offset backwards from the current code point, because we
2722     don't know if there's been an options resetting after the ket. The
2723     correct offset was computed above. */
2724    
2725     else code[-ketoffset] = OP_KETRMAX + repeat_type;
2726     }
2727    
2728     /* Else there's some kind of shambles */
2729    
2730     else
2731     {
2732     *errorcodeptr = ERR11;
2733     goto FAILED;
2734     }
2735    
2736     /* If the character following a repeat is '+', we wrap the entire repeated
2737     item inside OP_ONCE brackets. This is just syntactic sugar, taken from
2738     Sun's Java package. The repeated item starts at tempcode, not at previous,
2739     which might be the first part of a string whose (former) last char we
2740     repeated. However, we don't support '+' after a greediness '?'. */
2741    
2742     if (possessive_quantifier)
2743     {
2744     int len = code - tempcode;
2745     memmove(tempcode + 1+LINK_SIZE, tempcode, len);
2746     code += 1 + LINK_SIZE;
2747     len += 1 + LINK_SIZE;
2748     tempcode[0] = OP_ONCE;
2749     *code++ = OP_KET;
2750     PUTINC(code, 0, len);
2751     PUT(tempcode, 1, len);
2752     }
2753    
2754     /* In all cases we no longer have a previous item. We also set the
2755     "follows varying string" flag for subsequently encountered reqbytes if
2756     it isn't already set and we have just passed a varying length item. */
2757    
2758     END_REPEAT:
2759     previous = NULL;
2760     cd->req_varyopt |= reqvary;
2761     break;
2762    
2763    
2764     /* Start of nested bracket sub-expression, or comment or lookahead or
2765     lookbehind or option setting or condition. First deal with special things
2766     that can come after a bracket; all are introduced by ?, and the appearance
2767     of any of them means that this is not a referencing group. They were
2768     checked for validity in the first pass over the string, so we don't have to
2769     check for syntax errors here. */
2770    
2771     case '(':
2772     newoptions = options;
2773     skipbytes = 0;
2774    
2775     if (*(++ptr) == '?')
2776     {
2777     int set, unset;
2778     int *optset;
2779    
2780     switch (*(++ptr))
2781     {
2782     case '#': /* Comment; skip to ket */
2783     ptr++;
2784     while (*ptr != ')') ptr++;
2785     continue;
2786    
2787     case ':': /* Non-extracting bracket */
2788     bravalue = OP_BRA;
2789     ptr++;
2790     break;
2791    
2792     case '(':
2793     bravalue = OP_COND; /* Conditional group */
2794    
2795     /* Condition to test for recursion */
2796    
2797     if (ptr[1] == 'R')
2798     {
2799     code[1+LINK_SIZE] = OP_CREF;
2800     PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
2801     skipbytes = 3;
2802     ptr += 3;
2803     }
2804    
2805     /* Condition to test for a numbered subpattern match. We know that
2806     if a digit follows ( then there will just be digits until ) because
2807     the syntax was checked in the first pass. */
2808    
2809     else if ((digitab[ptr[1]] && ctype_digit) != 0)
2810     {
2811     int condref; /* Don't amalgamate; some compilers */
2812     condref = *(++ptr) - '0'; /* grumble at autoincrement in declaration */
2813     while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
2814     if (condref == 0)
2815     {
2816     *errorcodeptr = ERR35;
2817     goto FAILED;
2818     }
2819     ptr++;
2820     code[1+LINK_SIZE] = OP_CREF;
2821     PUT2(code, 2+LINK_SIZE, condref);
2822     skipbytes = 3;
2823     }
2824     /* For conditions that are assertions, we just fall through, having
2825     set bravalue above. */
2826     break;
2827    
2828     case '=': /* Positive lookahead */
2829     bravalue = OP_ASSERT;
2830     ptr++;
2831     break;
2832    
2833     case '!': /* Negative lookahead */
2834     bravalue = OP_ASSERT_NOT;
2835     ptr++;
2836     break;
2837    
2838     case '<': /* Lookbehinds */
2839     switch (*(++ptr))
2840     {
2841     case '=': /* Positive lookbehind */
2842     bravalue = OP_ASSERTBACK;
2843     ptr++;
2844     break;
2845    
2846     case '!': /* Negative lookbehind */
2847     bravalue = OP_ASSERTBACK_NOT;
2848     ptr++;
2849     break;
2850     }
2851     break;
2852    
2853     case '>': /* One-time brackets */
2854     bravalue = OP_ONCE;
2855     ptr++;
2856     break;
2857    
2858     case 'C': /* Callout - may be followed by digits; */
2859     previous_callout = code; /* Save for later completion */
2860     after_manual_callout = 1; /* Skip one item before completing */
2861     *code++ = OP_CALLOUT; /* Already checked that the terminating */
2862     { /* closing parenthesis is present. */
2863     int n = 0;
2864     while ((digitab[*(++ptr)] & ctype_digit) != 0)
2865     n = n * 10 + *ptr - '0';
2866     if (n > 255)
2867     {
2868     *errorcodeptr = ERR38;
2869     goto FAILED;
2870     }
2871     *code++ = n;
2872     PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
2873     PUT(code, LINK_SIZE, 0); /* Default length */
2874     code += 2 * LINK_SIZE;
2875     }
2876     previous = NULL;
2877     continue;
2878    
2879     case 'P': /* Named subpattern handling */
2880     if (*(++ptr) == '<') /* Definition */
2881     {
2882     int i, namelen;
2883     uschar *slot = cd->name_table;
2884     const uschar *name; /* Don't amalgamate; some compilers */
2885     name = ++ptr; /* grumble at autoincrement in declaration */
2886    
2887     while (*ptr++ != '>');
2888     namelen = ptr - name - 1;
2889    
2890     for (i = 0; i < cd->names_found; i++)
2891     {
2892     int crc = memcmp(name, slot+2, namelen);
2893     if (crc == 0)
2894     {
2895     if (slot[2+namelen] == 0)
2896     {
2897     *errorcodeptr = ERR43;
2898     goto FAILED;
2899     }
2900     crc = -1; /* Current name is substring */
2901     }
2902     if (crc < 0)
2903     {
2904     memmove(slot + cd->name_entry_size, slot,
2905     (cd->names_found - i) * cd->name_entry_size);
2906     break;
2907     }
2908     slot += cd->name_entry_size;
2909     }
2910    
2911     PUT2(slot, 0, *brackets + 1);
2912     memcpy(slot + 2, name, namelen);
2913     slot[2+namelen] = 0;
2914     cd->names_found++;
2915     goto NUMBERED_GROUP;
2916     }
2917    
2918     if (*ptr == '=' || *ptr == '>') /* Reference or recursion */
2919     {
2920     int i, namelen;
2921     int type = *ptr++;
2922     const uschar *name = ptr;
2923     uschar *slot = cd->name_table;
2924    
2925     while (*ptr != ')') ptr++;
2926     namelen = ptr - name;
2927    
2928     for (i = 0; i < cd->names_found; i++)
2929     {
2930     if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
2931     slot += cd->name_entry_size;
2932     }
2933     if (i >= cd->names_found)
2934     {
2935     *errorcodeptr = ERR15;
2936     goto FAILED;
2937     }
2938    
2939     recno = GET2(slot, 0);
2940    
2941     if (type == '>') goto HANDLE_RECURSION; /* A few lines below */
2942    
2943     /* Back reference */
2944    
2945     previous = code;
2946     *code++ = OP_REF;
2947     PUT2INC(code, 0, recno);
2948     cd->backref_map |= (recno < 32)? (1 << recno) : 1;
2949     if (recno > cd->top_backref) cd->top_backref = recno;
2950     continue;
2951     }
2952    
2953     /* Should never happen */
2954     break;
2955    
2956     case 'R': /* Pattern recursion */
2957     ptr++; /* Same as (?0) */
2958     /* Fall through */
2959    
2960     /* Recursion or "subroutine" call */
2961    
2962     case '0': case '1': case '2': case '3': case '4':
2963     case '5': case '6': case '7': case '8': case '9':
2964     {
2965     const uschar *called;
2966     recno = 0;
2967     while((digitab[*ptr] & ctype_digit) != 0)
2968     recno = recno * 10 + *ptr++ - '0';
2969    
2970     /* Come here from code above that handles a named recursion */
2971    
2972     HANDLE_RECURSION:
2973    
2974     previous = code;
2975    
2976     /* Find the bracket that is being referenced. Temporarily end the
2977     regex in case it doesn't exist. */
2978    
2979     *code = OP_END;
2980     called = (recno == 0)?
2981     cd->start_code : find_bracket(cd->start_code, utf8, recno);
2982    
2983     if (called == NULL)
2984     {
2985     *errorcodeptr = ERR15;
2986     goto FAILED;
2987     }
2988    
2989     /* If the subpattern is still open, this is a recursive call. We
2990     check to see if this is a left recursion that could loop for ever,
2991     and diagnose that case. */
2992    
2993     if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
2994     {
2995     *errorcodeptr = ERR40;
2996     goto FAILED;
2997     }
2998    
2999     /* Insert the recursion/subroutine item */
3000    
3001     *code = OP_RECURSE;
3002     PUT(code, 1, called - cd->start_code);
3003     code += 1 + LINK_SIZE;
3004     }
3005     continue;
3006    
3007     /* Character after (? not specially recognized */
3008    
3009     default: /* Option setting */
3010     set = unset = 0;
3011     optset = &set;
3012    
3013     while (*ptr != ')' && *ptr != ':')
3014     {
3015     switch (*ptr++)
3016     {
3017     case '-': optset = &unset; break;
3018    
3019     case 'i': *optset |= PCRE_CASELESS; break;
3020     case 'm': *optset |= PCRE_MULTILINE; break;
3021     case 's': *optset |= PCRE_DOTALL; break;
3022     case 'x': *optset |= PCRE_EXTENDED; break;
3023     case 'U': *optset |= PCRE_UNGREEDY; break;
3024     case 'X': *optset |= PCRE_EXTRA; break;
3025     }
3026     }
3027    
3028     /* Set up the changed option bits, but don't change anything yet. */
3029    
3030     newoptions = (options | set) & (~unset);
3031    
3032     /* If the options ended with ')' this is not the start of a nested
3033     group with option changes, so the options change at this level. Compile
3034     code to change the ims options if this setting actually changes any of
3035     them. We also pass the new setting back so that it can be put at the
3036     start of any following branches, and when this group ends (if we are in
3037     a group), a resetting item can be compiled.
3038    
3039     Note that if this item is right at the start of the pattern, the
3040     options will have been abstracted and made global, so there will be no
3041     change to compile. */
3042    
3043     if (*ptr == ')')
3044     {
3045     if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
3046     {
3047     *code++ = OP_OPT;
3048     *code++ = newoptions & PCRE_IMS;
3049     }
3050    
3051     /* Change options at this level, and pass them back for use
3052     in subsequent branches. Reset the greedy defaults and the case
3053     value for firstbyte and reqbyte. */
3054    
3055     *optionsptr = options = newoptions;
3056     greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
3057     greedy_non_default = greedy_default ^ 1;
3058     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3059    
3060     previous = NULL; /* This item can't be repeated */
3061     continue; /* It is complete */
3062     }
3063    
3064     /* If the options ended with ':' we are heading into a nested group
3065     with possible change of options. Such groups are non-capturing and are
3066     not assertions of any kind. All we need to do is skip over the ':';
3067     the newoptions value is handled below. */
3068    
3069     bravalue = OP_BRA;
3070     ptr++;
3071     }
3072     }
3073    
3074     /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
3075     non-capturing and behave like (?:...) brackets */
3076    
3077     else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
3078     {
3079     bravalue = OP_BRA;
3080     }
3081    
3082     /* Else we have a referencing group; adjust the opcode. If the bracket
3083     number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
3084     arrange for the true number to follow later, in an OP_BRANUMBER item. */
3085    
3086     else
3087     {
3088     NUMBERED_GROUP:
3089     if (++(*brackets) > EXTRACT_BASIC_MAX)
3090     {
3091     bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
3092     code[1+LINK_SIZE] = OP_BRANUMBER;
3093     PUT2(code, 2+LINK_SIZE, *brackets);
3094     skipbytes = 3;
3095     }
3096     else bravalue = OP_BRA + *brackets;
3097     }
3098    
3099     /* Process nested bracketed re. Assertions may not be repeated, but other
3100     kinds can be. We copy code into a non-register variable in order to be able
3101     to pass its address because some compilers complain otherwise. Pass in a
3102     new setting for the ims options if they have changed. */
3103    
3104     previous = (bravalue >= OP_ONCE)? code : NULL;
3105     *code = bravalue;
3106     tempcode = code;
3107     tempreqvary = cd->req_varyopt; /* Save value before bracket */
3108    
3109     if (!compile_regex(
3110     newoptions, /* The complete new option state */
3111     options & PCRE_IMS, /* The previous ims option state */
3112     brackets, /* Extracting bracket count */
3113     &tempcode, /* Where to put code (updated) */
3114     &ptr, /* Input pointer (updated) */
3115     errorcodeptr, /* Where to put an error message */
3116     (bravalue == OP_ASSERTBACK ||
3117     bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
3118     skipbytes, /* Skip over OP_COND/OP_BRANUMBER */
3119     &subfirstbyte, /* For possible first char */
3120     &subreqbyte, /* For possible last char */
3121     bcptr, /* Current branch chain */
3122     cd)) /* Tables block */
3123     goto FAILED;
3124    
3125     /* At the end of compiling, code is still pointing to the start of the
3126     group, while tempcode has been updated to point past the end of the group
3127     and any option resetting that may follow it. The pattern pointer (ptr)
3128     is on the bracket. */
3129    
3130     /* If this is a conditional bracket, check that there are no more than
3131     two branches in the group. */
3132    
3133     else if (bravalue == OP_COND)
3134     {
3135     uschar *tc = code;
3136     condcount = 0;
3137    
3138     do {
3139     condcount++;
3140     tc += GET(tc,1);
3141     }
3142     while (*tc != OP_KET);
3143    
3144     if (condcount > 2)
3145     {
3146     *errorcodeptr = ERR27;
3147     goto FAILED;
3148     }
3149    
3150     /* If there is just one branch, we must not make use of its firstbyte or
3151     reqbyte, because this is equivalent to an empty second branch. */
3152    
3153     if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
3154     }
3155    
3156     /* Handle updating of the required and first characters. Update for normal
3157     brackets of all kinds, and conditions with two branches (see code above).
3158     If the bracket is followed by a quantifier with zero repeat, we have to
3159     back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
3160     main loop so that they can be accessed for the back off. */
3161    
3162     zeroreqbyte = reqbyte;
3163     zerofirstbyte = firstbyte;
3164     groupsetfirstbyte = FALSE;
3165    
3166     if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
3167     {
3168     /* If we have not yet set a firstbyte in this branch, take it from the
3169     subpattern, remembering that it was set here so that a repeat of more
3170     than one can replicate it as reqbyte if necessary. If the subpattern has
3171     no firstbyte, set "none" for the whole branch. In both cases, a zero
3172     repeat forces firstbyte to "none". */
3173    
3174     if (firstbyte == REQ_UNSET)
3175     {
3176     if (subfirstbyte >= 0)
3177     {
3178     firstbyte = subfirstbyte;
3179     groupsetfirstbyte = TRUE;
3180     }
3181     else firstbyte = REQ_NONE;
3182     zerofirstbyte = REQ_NONE;
3183     }
3184    
3185     /* If firstbyte was previously set, convert the subpattern's firstbyte
3186     into reqbyte if there wasn't one, using the vary flag that was in
3187     existence beforehand. */
3188    
3189     else if (subfirstbyte >= 0 && subreqbyte < 0)
3190     subreqbyte = subfirstbyte | tempreqvary;
3191    
3192     /* If the subpattern set a required byte (or set a first byte that isn't
3193     really the first byte - see above), set it. */
3194    
3195     if (subreqbyte >= 0) reqbyte = subreqbyte;
3196     }
3197    
3198     /* For a forward assertion, we take the reqbyte, if set. This can be
3199     helpful if the pattern that follows the assertion doesn't set a different
3200     char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
3201     for an assertion, however because it leads to incorrect effect for patterns
3202     such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
3203     of a firstbyte. This is overcome by a scan at the end if there's no
3204     firstbyte, looking for an asserted first char. */
3205    
3206     else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
3207    
3208     /* Now update the main code pointer to the end of the group. */
3209    
3210     code = tempcode;
3211    
3212     /* Error if hit end of pattern */
3213    
3214     if (*ptr != ')')
3215     {
3216     *errorcodeptr = ERR14;
3217     goto FAILED;
3218     }
3219     break;
3220    
3221     /* Check \ for being a real metacharacter; if not, fall through and handle
3222     it as a data character at the start of a string. Escape items are checked
3223     for validity in the pre-compiling pass. */
3224    
3225     case '\\':
3226     tempptr = ptr;
3227     c = check_escape(&ptr, errorcodeptr, *brackets, options, FALSE);
3228    
3229     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
3230     are arranged to be the negation of the corresponding OP_values. For the
3231     back references, the values are ESC_REF plus the reference number. Only
3232     back references and those types that consume a character may be repeated.
3233     We can test for values between ESC_b and ESC_Z for the latter; this may
3234     have to change if any new ones are ever created. */
3235    
3236     if (c < 0)
3237     {
3238     if (-c == ESC_Q) /* Handle start of quoted string */
3239     {
3240     if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
3241     else inescq = TRUE;
3242     continue;
3243     }
3244    
3245     /* For metasequences that actually match a character, we disable the
3246     setting of a first character if it hasn't already been set. */
3247    
3248     if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
3249     firstbyte = REQ_NONE;
3250    
3251     /* Set values to reset to if this is followed by a zero repeat. */
3252    
3253     zerofirstbyte = firstbyte;
3254     zeroreqbyte = reqbyte;
3255    
3256     /* Back references are handled specially */
3257    
3258     if (-c >= ESC_REF)
3259     {
3260     int number = -c - ESC_REF;
3261     previous = code;
3262     *code++ = OP_REF;
3263     PUT2INC(code, 0, number);
3264     }
3265    
3266     /* So are Unicode property matches, if supported. We know that get_ucp
3267     won't fail because it was tested in the pre-pass. */
3268    
3269     #ifdef SUPPORT_UCP
3270     else if (-c == ESC_P || -c == ESC_p)
3271     {
3272     BOOL negated;
3273     int value = get_ucp(&ptr, &negated, errorcodeptr);
3274     previous = code;
3275     *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
3276     *code++ = value;
3277     }
3278     #endif
3279    
3280     /* For the rest, we can obtain the OP value by negating the escape
3281     value */
3282    
3283     else
3284     {
3285     previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
3286     *code++ = -c;
3287     }
3288     continue;
3289     }
3290    
3291     /* We have a data character whose value is in c. In UTF-8 mode it may have
3292     a value > 127. We set its representation in the length/buffer, and then
3293     handle it as a data character. */
3294    
3295     #ifdef SUPPORT_UTF8
3296     if (utf8 && c > 127)
3297     mclength = _pcre_ord2utf8(c, mcbuffer);
3298     else
3299     #endif
3300    
3301     {
3302     mcbuffer[0] = c;
3303     mclength = 1;
3304     }
3305    
3306     goto ONE_CHAR;
3307    
3308     /* Handle a literal character. It is guaranteed not to be whitespace or #
3309     when the extended flag is set. If we are in UTF-8 mode, it may be a
3310     multi-byte literal character. */
3311    
3312     default:
3313     NORMAL_CHAR:
3314     mclength = 1;
3315     mcbuffer[0] = c;
3316    
3317     #ifdef SUPPORT_UTF8
3318     if (utf8 && (c & 0xc0) == 0xc0)
3319     {
3320     while ((ptr[1] & 0xc0) == 0x80)
3321     mcbuffer[mclength++] = *(++ptr);
3322     }
3323     #endif
3324    
3325     /* At this point we have the character's bytes in mcbuffer, and the length
3326     in mclength. When not in UTF-8 mode, the length is always 1. */
3327    
3328     ONE_CHAR:
3329     previous = code;
3330     *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
3331     for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
3332    
3333     /* Set the first and required bytes appropriately. If no previous first
3334     byte, set it from this character, but revert to none on a zero repeat.
3335     Otherwise, leave the firstbyte value alone, and don't change it on a zero
3336     repeat. */
3337    
3338     if (firstbyte == REQ_UNSET)
3339     {
3340     zerofirstbyte = REQ_NONE;
3341     zeroreqbyte = reqbyte;
3342    
3343     /* If the character is more than one byte long, we can set firstbyte
3344     only if it is not to be matched caselessly. */
3345    
3346     if (mclength == 1 || req_caseopt == 0)
3347     {
3348     firstbyte = mcbuffer[0] | req_caseopt;
3349     if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
3350     }
3351     else firstbyte = reqbyte = REQ_NONE;
3352     }
3353    
3354     /* firstbyte was previously set; we can set reqbyte only if the length is
3355     1 or the matching is caseful. */
3356    
3357     else
3358     {
3359     zerofirstbyte = firstbyte;
3360     zeroreqbyte = reqbyte;
3361     if (mclength == 1 || req_caseopt == 0)
3362     reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3363     }
3364    
3365     break; /* End of literal character handling */
3366     }
3367     } /* end of big loop */
3368    
3369     /* Control never reaches here by falling through, only by a goto for all the
3370     error states. Pass back the position in the pattern so that it can be displayed
3371     to the user for diagnosing the error. */
3372    
3373     FAILED:
3374     *ptrptr = ptr;
3375     return FALSE;
3376     }
3377    
3378    
3379    
3380    
3381     /*************************************************
3382     * Compile sequence of alternatives *
3383     *************************************************/
3384    
3385     /* On entry, ptr is pointing past the bracket character, but on return
3386     it points to the closing bracket, or vertical bar, or end of string.
3387     The code variable is pointing at the byte into which the BRA operator has been
3388     stored. If the ims options are changed at the start (for a (?ims: group) or
3389     during any branch, we need to insert an OP_OPT item at the start of every
3390     following branch to ensure they get set correctly at run time, and also pass
3391     the new options into every subsequent branch compile.
3392    
3393     Argument:
3394     options option bits, including any changes for this subpattern
3395     oldims previous settings of ims option bits
3396     brackets -> int containing the number of extracting brackets used
3397     codeptr -> the address of the current code pointer
3398     ptrptr -> the address of the current pattern pointer
3399     errorcodeptr -> pointer to error code variable
3400     lookbehind TRUE if this is a lookbehind assertion
3401     skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER)
3402     firstbyteptr place to put the first required character, or a negative number
3403     reqbyteptr place to put the last required character, or a negative number
3404     bcptr pointer to the chain of currently open branches
3405     cd points to the data block with tables pointers etc.
3406    
3407     Returns: TRUE on success
3408     */
3409    
3410     static BOOL
3411     compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3412     const uschar **ptrptr, int *errorcodeptr, BOOL lookbehind, int skipbytes,
3413     int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
3414     {
3415     const uschar *ptr = *ptrptr;
3416     uschar *code = *codeptr;
3417     uschar *last_branch = code;
3418     uschar *start_bracket = code;
3419     uschar *reverse_count = NULL;
3420     int firstbyte, reqbyte;
3421     int branchfirstbyte, branchreqbyte;
3422     branch_chain bc;
3423    
3424     bc.outer = bcptr;
3425     bc.current = code;
3426    
3427     firstbyte = reqbyte = REQ_UNSET;
3428    
3429     /* Offset is set zero to mark that this bracket is still open */
3430    
3431     PUT(code, 1, 0);
3432     code += 1 + LINK_SIZE + skipbytes;
3433    
3434     /* Loop for each alternative branch */
3435    
3436     for (;;)
3437     {
3438     /* Handle a change of ims options at the start of the branch */
3439    
3440     if ((options & PCRE_IMS) != oldims)
3441     {
3442     *code++ = OP_OPT;
3443     *code++ = options & PCRE_IMS;
3444     }
3445    
3446     /* Set up dummy OP_REVERSE if lookbehind assertion */
3447    
3448     if (lookbehind)
3449     {
3450     *code++ = OP_REVERSE;
3451     reverse_count = code;
3452     PUTINC(code, 0, 0);
3453     }
3454    
3455     /* Now compile the branch */
3456    
3457     if (!compile_branch(&options, brackets, &code, &ptr, errorcodeptr,
3458     &branchfirstbyte, &branchreqbyte, &bc, cd))
3459     {
3460     *ptrptr = ptr;
3461     return FALSE;
3462     }
3463    
3464     /* If this is the first branch, the firstbyte and reqbyte values for the
3465     branch become the values for the regex. */
3466    
3467     if (*last_branch != OP_ALT)
3468     {
3469     firstbyte = branchfirstbyte;
3470     reqbyte = branchreqbyte;
3471     }
3472    
3473     /* If this is not the first branch, the first char and reqbyte have to
3474     match the values from all the previous branches, except that if the previous
3475     value for reqbyte didn't have REQ_VARY set, it can still match, and we set
3476     REQ_VARY for the regex. */
3477    
3478     else
3479     {
3480     /* If we previously had a firstbyte, but it doesn't match the new branch,
3481     we have to abandon the firstbyte for the regex, but if there was previously
3482     no reqbyte, it takes on the value of the old firstbyte. */
3483    
3484     if (firstbyte >= 0 && firstbyte != branchfirstbyte)
3485     {
3486     if (reqbyte < 0) reqbyte = firstbyte;
3487     firstbyte = REQ_NONE;
3488     }
3489    
3490     /* If we (now or from before) have no firstbyte, a firstbyte from the
3491     branch becomes a reqbyte if there isn't a branch reqbyte. */
3492    
3493     if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
3494     branchreqbyte = branchfirstbyte;
3495    
3496     /* Now ensure that the reqbytes match */
3497    
3498     if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
3499     reqbyte = REQ_NONE;
3500     else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
3501     }
3502    
3503     /* If lookbehind, check that this branch matches a fixed-length string,
3504     and put the length into the OP_REVERSE item. Temporarily mark the end of
3505     the branch with OP_END. */
3506    
3507     if (lookbehind)
3508     {
3509     int length;
3510     *code = OP_END;
3511     length = find_fixedlength(last_branch, options);
3512     DPRINTF(("fixed length = %d\n", length));
3513     if (length < 0)
3514     {
3515     *errorcodeptr = (length == -2)? ERR36 : ERR25;
3516     *ptrptr = ptr;
3517     return FALSE;
3518     }
3519     PUT(reverse_count, 0, length);
3520     }
3521    
3522     /* Reached end of expression, either ')' or end of pattern. Go back through
3523     the alternative branches and reverse the chain of offsets, with the field in
3524     the BRA item now becoming an offset to the first alternative. If there are
3525     no alternatives, it points to the end of the group. The length in the
3526     terminating ket is always the length of the whole bracketed item. If any of
3527     the ims options were changed inside the group, compile a resetting op-code
3528     following, except at the very end of the pattern. Return leaving the pointer
3529     at the terminating char. */
3530    
3531     if (*ptr != '|')
3532     {
3533     int length = code - last_branch;
3534     do
3535     {
3536     int prev_length = GET(last_branch, 1);
3537     PUT(last_branch, 1, length);
3538     length = prev_length;
3539     last_branch -= length;
3540     }
3541     while (length > 0);
3542    
3543     /* Fill in the ket */
3544    
3545     *code = OP_KET;
3546     PUT(code, 1, code - start_bracket);
3547     code += 1 + LINK_SIZE;
3548    
3549     /* Resetting option if needed */
3550    
3551     if ((options & PCRE_IMS) != oldims && *ptr == ')')
3552     {
3553     *code++ = OP_OPT;
3554     *code++ = oldims;
3555     }
3556    
3557     /* Set values to pass back */
3558    
3559     *codeptr = code;
3560     *ptrptr = ptr;
3561     *firstbyteptr = firstbyte;
3562     *reqbyteptr = reqbyte;
3563     return TRUE;
3564     }
3565    
3566     /* Another branch follows; insert an "or" node. Its length field points back
3567     to the previous branch while the bracket remains open. At the end the chain
3568     is reversed. It's done like this so that the start of the bracket has a
3569     zero offset until it is closed, making it possible to detect recursion. */
3570    
3571     *code = OP_ALT;
3572     PUT(code, 1, code - last_branch);
3573     bc.current = last_branch = code;
3574     code += 1 + LINK_SIZE;
3575     ptr++;
3576     }
3577     /* Control never reaches here */
3578     }
3579    
3580    
3581    
3582    
3583     /*************************************************
3584     * Check for anchored expression *
3585     *************************************************/
3586    
3587     /* Try to find out if this is an anchored regular expression. Consider each
3588     alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
3589     all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
3590     it's anchored. However, if this is a multiline pattern, then only OP_SOD
3591     counts, since OP_CIRC can match in the middle.
3592    
3593     We can also consider a regex to be anchored if OP_SOM starts all its branches.
3594     This is the code for \G, which means "match at start of match position, taking
3595     into account the match offset".
3596    
3597     A branch is also implicitly anchored if it starts with .* and DOTALL is set,
3598     because that will try the rest of the pattern at all possible matching points,
3599     so there is no point trying again.... er ....
3600    
3601     .... except when the .* appears inside capturing parentheses, and there is a
3602     subsequent back reference to those parentheses. We haven't enough information
3603     to catch that case precisely.
3604    
3605     At first, the best we could do was to detect when .* was in capturing brackets
3606     and the highest back reference was greater than or equal to that level.
3607     However, by keeping a bitmap of the first 31 back references, we can catch some
3608     of the more common cases more precisely.
3609    
3610     Arguments:
3611     code points to start of expression (the bracket)
3612     options points to the options setting
3613     bracket_map a bitmap of which brackets we are inside while testing; this
3614     handles up to substring 31; after that we just have to take
3615     the less precise approach
3616     backref_map the back reference bitmap
3617    
3618     Returns: TRUE or FALSE
3619     */
3620    
3621     static BOOL
3622     is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
3623     unsigned int backref_map)
3624     {
3625     do {
3626     const uschar *scode =
3627     first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE);
3628     register int op = *scode;
3629    
3630     /* Capturing brackets */
3631    
3632     if (op > OP_BRA)
3633     {
3634     int new_map;
3635     op -= OP_BRA;
3636     if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3637     new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3638     if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
3639     }
3640    
3641     /* Other brackets */
3642    
3643     else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3644     {
3645     if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
3646     }
3647    
3648     /* .* is not anchored unless DOTALL is set and it isn't in brackets that
3649     are or may be referenced. */
3650    
3651     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
3652     (*options & PCRE_DOTALL) != 0)
3653     {
3654     if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3655     }
3656    
3657     /* Check for explicit anchoring */
3658    
3659     else if (op != OP_SOD && op != OP_SOM &&
3660     ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
3661     return FALSE;
3662     code += GET(code, 1);
3663     }
3664     while (*code == OP_ALT); /* Loop for each alternative */
3665     return TRUE;
3666     }
3667    
3668    
3669    
3670     /*************************************************
3671     * Check for starting with ^ or .* *
3672     *************************************************/
3673    
3674     /* This is called to find out if every branch starts with ^ or .* so that
3675     "first char" processing can be done to speed things up in multiline
3676     matching and for non-DOTALL patterns that start with .* (which must start at
3677     the beginning or after \n). As in the case of is_anchored() (see above), we
3678     have to take account of back references to capturing brackets that contain .*
3679     because in that case we can't make the assumption.
3680    
3681     Arguments:
3682     code points to start of expression (the bracket)
3683     bracket_map a bitmap of which brackets we are inside while testing; this
3684     handles up to substring 31; after that we just have to take
3685     the less precise approach
3686     backref_map the back reference bitmap
3687    
3688     Returns: TRUE or FALSE
3689     */
3690    
3691     static BOOL
3692     is_startline(const uschar *code, unsigned int bracket_map,
3693     unsigned int backref_map)
3694     {
3695     do {
3696     const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0,
3697     FALSE);
3698     register int op = *scode;
3699    
3700     /* Capturing brackets */
3701    
3702     if (op > OP_BRA)
3703     {
3704     int new_map;
3705     op -= OP_BRA;
3706     if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3707     new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3708     if (!is_startline(scode, new_map, backref_map)) return FALSE;
3709     }
3710    
3711     /* Other brackets */
3712    
3713     else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3714     { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
3715    
3716     /* .* means "start at start or after \n" if it isn't in brackets that
3717     may be referenced. */
3718    
3719     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
3720     {
3721     if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3722     }
3723    
3724     /* Check for explicit circumflex */
3725    
3726     else if (op != OP_CIRC) return FALSE;
3727    
3728     /* Move on to the next alternative */
3729    
3730     code += GET(code, 1);
3731     }
3732     while (*code == OP_ALT); /* Loop for each alternative */
3733     return TRUE;
3734     }
3735    
3736    
3737    
3738     /*************************************************
3739     * Check for asserted fixed first char *
3740     *************************************************/
3741    
3742     /* During compilation, the "first char" settings from forward assertions are
3743     discarded, because they can cause conflicts with actual literals that follow.
3744     However, if we end up without a first char setting for an unanchored pattern,
3745     it is worth scanning the regex to see if there is an initial asserted first
3746     char. If all branches start with the same asserted char, or with a bracket all
3747     of whose alternatives start with the same asserted char (recurse ad lib), then
3748     we return that char, otherwise -1.
3749    
3750     Arguments:
3751     code points to start of expression (the bracket)
3752     options pointer to the options (used to check casing changes)
3753     inassert TRUE if in an assertion
3754    
3755     Returns: -1 or the fixed first char
3756     */
3757    
3758     static int
3759     find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
3760     {
3761     register int c = -1;
3762     do {
3763     int d;
3764     const uschar *scode =
3765     first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
3766     register int op = *scode;
3767    
3768     if (op >= OP_BRA) op = OP_BRA;
3769    
3770     switch(op)
3771     {
3772     default:
3773     return -1;
3774    
3775     case OP_BRA:
3776     case OP_ASSERT:
3777     case OP_ONCE:
3778     case OP_COND:
3779     if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
3780     return -1;
3781     if (c < 0) c = d; else if (c != d) return -1;
3782     break;
3783    
3784     case OP_EXACT: /* Fall through */
3785     scode += 2;
3786    
3787     case OP_CHAR:
3788     case OP_CHARNC:
3789     case OP_PLUS:
3790     case OP_MINPLUS:
3791     if (!inassert) return -1;
3792     if (c < 0)
3793     {
3794     c = scode[1];
3795     if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
3796     }
3797     else if (c != scode[1]) return -1;
3798     break;
3799     }
3800    
3801     code += GET(code, 1);
3802     }
3803     while (*code == OP_ALT);
3804     return c;
3805     }
3806    
3807    
3808    
3809     /*************************************************
3810     * Compile a Regular Expression *
3811     *************************************************/
3812    
3813     /* This function takes a string and returns a pointer to a block of store
3814     holding a compiled version of the expression. The original API for this
3815     function had no error code return variable; it is retained for backwards
3816     compatibility. The new function is given a new name.
3817    
3818     Arguments:
3819     pattern the regular expression
3820     options various option bits
3821     errorcodeptr pointer to error code variable (pcre_compile2() only)
3822     can be NULL if you don't want a code value
3823     errorptr pointer to pointer to error text
3824     erroroffset ptr offset in pattern where error was detected
3825     tables pointer to character tables or NULL
3826    
3827     Returns: pointer to compiled data block, or NULL on error,
3828     with errorptr and erroroffset set
3829     */
3830    
3831     EXPORT pcre *
3832     pcre_compile(const char *pattern, int options, const char **errorptr,
3833     int *erroroffset, const unsigned char *tables)
3834     {
3835     return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
3836     }
3837    
3838    
3839     EXPORT pcre *
3840     pcre_compile2(const char *pattern, int options, int *errorcodeptr,
3841     const char **errorptr, int *erroroffset, const unsigned char *tables)
3842     {
3843     real_pcre *re;
3844     int length = 1 + LINK_SIZE; /* For initial BRA plus length */
3845     int c, firstbyte, reqbyte;
3846     int bracount = 0;
3847     int branch_extra = 0;
3848     int branch_newextra;
3849     int item_count = -1;
3850     int name_count = 0;
3851     int max_name_size = 0;
3852     int lastitemlength = 0;
3853     int errorcode = 0;
3854     #ifdef SUPPORT_UTF8
3855     BOOL utf8;
3856     BOOL class_utf8;
3857     #endif
3858     BOOL inescq = FALSE;
3859     unsigned int brastackptr = 0;
3860     size_t size;
3861     uschar *code;
3862     const uschar *codestart;
3863     const uschar *ptr;
3864     compile_data compile_block;
3865     int brastack[BRASTACK_SIZE];
3866     uschar bralenstack[BRASTACK_SIZE];
3867    
3868     /* We can't pass back an error message if errorptr is NULL; I guess the best we
3869     can do is just return NULL, but we can set a code value if there is a code
3870     pointer. */
3871    
3872     if (errorptr == NULL)
3873     {
3874     if (errorcodeptr != NULL) *errorcodeptr = 99;
3875     return NULL;
3876     }
3877    
3878     *errorptr = NULL;
3879     if (errorcodeptr != NULL) *errorcodeptr = ERR0;
3880    
3881     /* However, we can give a message for this error */
3882    
3883     if (erroroffset == NULL)
3884     {
3885     errorcode = ERR16;
3886     goto PCRE_EARLY_ERROR_RETURN;
3887     }
3888    
3889     *erroroffset = 0;
3890    
3891     /* Can't support UTF8 unless PCRE has been compiled to include the code. */
3892    
3893     #ifdef SUPPORT_UTF8
3894     utf8 = (options & PCRE_UTF8) != 0;
3895     if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
3896     (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
3897     {
3898     errorcode = ERR44;
3899     goto PCRE_EARLY_ERROR_RETURN;
3900     }
3901     #else
3902     if ((options & PCRE_UTF8) != 0)
3903     {
3904     errorcode = ERR32;
3905     goto PCRE_EARLY_ERROR_RETURN;
3906     }
3907     #endif
3908    
3909     if ((options & ~PUBLIC_OPTIONS) != 0)
3910     {
3911     errorcode = ERR17;
3912     goto PCRE_EARLY_ERROR_RETURN;
3913     }
3914    
3915     /* Set up pointers to the individual character tables */
3916    
3917     if (tables == NULL) tables = _pcre_default_tables;
3918     compile_block.lcc = tables + lcc_offset;
3919     compile_block.fcc = tables + fcc_offset;
3920     compile_block.cbits = tables + cbits_offset;
3921     compile_block.ctypes = tables + ctypes_offset;
3922    
3923     /* Maximum back reference and backref bitmap. This is updated for numeric
3924     references during the first pass, but for named references during the actual
3925     compile pass. The bitmap records up to 31 back references to help in deciding
3926     whether (.*) can be treated as anchored or not. */
3927    
3928     compile_block.top_backref = 0;
3929     compile_block.backref_map = 0;
3930    
3931     /* Reflect pattern for debugging output */
3932    
3933     DPRINTF(("------------------------------------------------------------------\n"));
3934     DPRINTF(("%s\n", pattern));
3935    
3936     /* The first thing to do is to make a pass over the pattern to compute the
3937     amount of store required to hold the compiled code. This does not have to be
3938     perfect as long as errors are overestimates. At the same time we can detect any
3939     flag settings right at the start, and extract them. Make an attempt to correct
3940     for any counted white space if an "extended" flag setting appears late in the
3941     pattern. We can't be so clever for #-comments. */
3942    
3943     ptr = (const uschar *)(pattern - 1);
3944     while ((c = *(++ptr)) != 0)
3945     {
3946     int min, max;
3947     int class_optcount;
3948     int bracket_length;
3949     int duplength;
3950    
3951     /* If we are inside a \Q...\E sequence, all chars are literal */
3952    
3953     if (inescq)
3954     {
3955     if ((options & PCRE_AUTO_CALLOUT) != 0) length += 2 + 2*LINK_SIZE;
3956     goto NORMAL_CHAR;
3957     }
3958    
3959     /* Otherwise, first check for ignored whitespace and comments */
3960    
3961     if ((options & PCRE_EXTENDED) != 0)
3962     {
3963     if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
3964     if (c == '#')
3965     {
3966     /* The space before the ; is to avoid a warning on a silly compiler
3967     on the Macintosh. */
3968     while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
3969     if (c == 0) break;
3970     continue;
3971     }
3972     }
3973    
3974     item_count++; /* Is zero for the first non-comment item */
3975    
3976     /* Allow space for auto callout before every item except quantifiers. */
3977    
3978     if ((options & PCRE_AUTO_CALLOUT) != 0 &&
3979     c != '*' && c != '+' && c != '?' &&
3980     (c != '{' || !is_counted_repeat(ptr + 1)))
3981     length += 2 + 2*LINK_SIZE;
3982    
3983     switch(c)
3984     {
3985     /* A backslashed item may be an escaped data character or it may be a
3986     character type. */
3987    
3988     case '\\':
3989     c = check_escape(&ptr, &errorcode, bracount, options, FALSE);
3990     if (errorcode != 0) goto PCRE_ERROR_RETURN;
3991    
3992     lastitemlength = 1; /* Default length of last item for repeats */
3993    
3994     if (c >= 0) /* Data character */
3995     {
3996     length += 2; /* For a one-byte character */
3997    
3998     #ifdef SUPPORT_UTF8
3999     if (utf8 && c > 127)
4000     {
4001     int i;
4002     for (i = 0; i < _pcre_utf8_table1_size; i++)
4003     if (c <= _pcre_utf8_table1[i]) break;
4004     length += i;
4005     lastitemlength += i;
4006     }
4007     #endif
4008    
4009     continue;
4010     }
4011    
4012     /* If \Q, enter "literal" mode */
4013    
4014     if (-c == ESC_Q)
4015     {
4016     inescq = TRUE;
4017     continue;
4018     }
4019    
4020     /* \X is supported only if Unicode property support is compiled */
4021    
4022     #ifndef SUPPORT_UCP
4023     if (-c == ESC_X)
4024     {
4025     errorcode = ERR45;
4026     goto PCRE_ERROR_RETURN;
4027     }
4028     #endif
4029    
4030     /* \P and \p are for Unicode properties, but only when the support has
4031     been compiled. Each item needs 2 bytes. */
4032    
4033     else if (-c == ESC_P || -c == ESC_p)
4034     {
4035     #ifdef SUPPORT_UCP
4036     BOOL negated;
4037     length += 2;
4038     lastitemlength = 2;
4039     if (get_ucp(&ptr, &negated, &errorcode) < 0) goto PCRE_ERROR_RETURN;
4040     continue;
4041     #else
4042     errorcode = ERR45;
4043     goto PCRE_ERROR_RETURN;
4044     #endif
4045     }
4046    
4047     /* Other escapes need one byte */
4048    
4049     length++;
4050    
4051     /* A back reference needs an additional 2 bytes, plus either one or 5
4052     bytes for a repeat. We also need to keep the value of the highest
4053     back reference. */
4054    
4055     if (c <= -ESC_REF)
4056     {
4057     int refnum = -c - ESC_REF;
4058     compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
4059     if (refnum > compile_block.top_backref)
4060     compile_block.top_backref = refnum;
4061     length += 2; /* For single back reference */
4062     if (ptr[1] == '{' && is_counted_repeat(ptr+2))
4063     {
4064     ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
4065     if (errorcode != 0) goto PCRE_ERROR_RETURN;
4066     if ((min == 0 && (max == 1 || max == -1)) ||
4067     (min == 1 && max == -1))
4068     length++;
4069     else length += 5;
4070     if (ptr[1] == '?') ptr++;
4071     }
4072     }
4073     continue;
4074    
4075     case '^': /* Single-byte metacharacters */
4076     case '.':
4077     case '$':
4078     length++;
4079     lastitemlength = 1;
4080     continue;
4081    
4082     case '*': /* These repeats won't be after brackets; */
4083     case '+': /* those are handled separately */
4084     case '?':
4085     length++;
4086     goto POSESSIVE; /* A few lines below */
4087    
4088     /* This covers the cases of braced repeats after a single char, metachar,
4089     class, or back reference. */
4090    
4091     case '{':
4092     if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
4093     ptr = read_repeat_counts(ptr+1, &min, &max, &errorcode);
4094     if (errorcode != 0) goto PCRE_ERROR_RETURN;
4095    
4096     /* These special cases just insert one extra opcode */
4097    
4098     if ((min == 0 && (max == 1 || max == -1)) ||
4099     (min == 1 && max == -1))
4100     length++;
4101    
4102     /* These cases might insert additional copies of a preceding character. */
4103    
4104     else
4105     {
4106     if (min != 1)
4107     {
4108     length -= lastitemlength; /* Uncount the original char or metachar */
4109     if (min > 0) length += 3 + lastitemlength;
4110     }
4111     length += lastitemlength + ((max > 0)? 3 : 1);
4112     }
4113    
4114     if (ptr[1] == '?') ptr++; /* Needs no extra length */
4115    
4116     POSESSIVE: /* Test for possessive quantifier */
4117     if (ptr[1] == '+')
4118     {
4119     ptr++;
4120     length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */
4121     }
4122     continue;
4123    
4124     /* An alternation contains an offset to the next branch or ket. If any ims
4125     options changed in the previous branch(es), and/or if we are in a
4126     lookbehind assertion, extra space will be needed at the start of the
4127     branch. This is handled by branch_extra. */
4128    
4129     case '|':
4130     length += 1 + LINK_SIZE + branch_extra;
4131     continue;
4132    
4133     /* A character class uses 33 characters provided that all the character
4134     values are less than 256. Otherwise, it uses a bit map for low valued
4135     characters, and individual items for others. Don't worry about character
4136     types that aren't allowed in classes - they'll get picked up during the
4137     compile. A character class that contains only one single-byte character
4138     uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
4139     where we can. (In UTF-8 mode we can do this only for chars < 128.) */
4140    
4141     case '[':
4142     if (*(++ptr) == '^')
4143     {
4144     class_optcount = 10; /* Greater than one */
4145     ptr++;
4146     }
4147     else class_optcount = 0;
4148    
4149     #ifdef SUPPORT_UTF8
4150     class_utf8 = FALSE;
4151     #endif
4152    
4153     /* Written as a "do" so that an initial ']' is taken as data */
4154    
4155     if (*ptr != 0) do
4156     {
4157     /* Inside \Q...\E everything is literal except \E */
4158    
4159     if (inescq)
4160     {
4161     if (*ptr != '\\' || ptr[1] != 'E') goto GET_ONE_CHARACTER;
4162     inescq = FALSE;
4163     ptr += 1;
4164     continue;
4165     }
4166    
4167     /* Outside \Q...\E, check for escapes */
4168    
4169     if (*ptr == '\\')
4170     {
4171     c = check_escape(&ptr, &errorcode, bracount, options, TRUE);
4172     if (errorcode != 0) goto PCRE_ERROR_RETURN;
4173    
4174     /* \b is backspace inside a class; \X is literal */
4175    
4176     if (-c == ESC_b) c = '\b';
4177     else if (-c == ESC_X) c = 'X';
4178    
4179     /* \Q enters quoting mode */
4180    
4181     else if (-c == ESC_Q)
4182     {
4183     inescq = TRUE;
4184     continue;
4185     }
4186    
4187     /* Handle escapes that turn into characters */
4188    
4189     if (c >= 0) goto NON_SPECIAL_CHARACTER;
4190    
4191     /* Escapes that are meta-things. The normal ones just affect the
4192     bit map, but Unicode properties require an XCLASS extended item. */
4193    
4194     else
4195     {
4196     class_optcount = 10; /* \d, \s etc; make sure > 1 */
4197     #ifdef SUPPORT_UTF8
4198     if (-c == ESC_p || -c == ESC_P)
4199     {
4200     if (!class_utf8)
4201     {
4202     class_utf8 = TRUE;
4203     length += LINK_SIZE + 2;
4204     }
4205     length += 2;
4206     }
4207     #endif
4208     }
4209     }
4210    
4211     /* Check the syntax for POSIX stuff. The bits we actually handle are
4212     checked during the real compile phase. */
4213    
4214     else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))
4215     {
4216     ptr++;
4217     class_optcount = 10; /* Make sure > 1 */
4218     }
4219    
4220     /* Anything else increments the possible optimization count. We have to
4221     detect ranges here so that we can compute the number of extra ranges for
4222     caseless wide characters when UCP support is available. If there are wide
4223     characters, we are going to have to use an XCLASS, even for single
4224     characters. */
4225    
4226     else
4227     {
4228     int d;
4229    
4230     GET_ONE_CHARACTER:
4231    
4232     #ifdef SUPPORT_UTF8
4233     if (utf8)
4234     {
4235     int extra = 0;
4236     GETCHARLEN(c, ptr, extra);
4237     ptr += extra;
4238     }
4239     else c = *ptr;
4240     #else
4241     c = *ptr;
4242     #endif
4243    
4244     /* Come here from handling \ above when it escapes to a char value */
4245    
4246     NON_SPECIAL_CHARACTER:
4247     class_optcount++;
4248    
4249     d = -1;
4250     if (ptr[1] == '-')
4251     {
4252     uschar const *hyptr = ptr++;
4253     if (ptr[1] == '\\')
4254     {
4255     ptr++;
4256     d = check_escape(&ptr, &errorcode, bracount, options, TRUE);
4257     if (errorcode != 0) goto PCRE_ERROR_RETURN;
4258     if (-d == ESC_b) d = '\b'; /* backspace */
4259     else if (-d == ESC_X) d = 'X'; /* literal X in a class */
4260     }
4261     else if (ptr[1] != 0 && ptr[1] != ']')
4262     {
4263     ptr++;
4264     #ifdef SUPPORT_UTF8
4265     if (utf8)
4266     {
4267     int extra = 0;
4268     GETCHARLEN(d, ptr, extra);
4269     ptr += extra;
4270     }
4271     else
4272     #endif
4273     d = *ptr;
4274     }
4275     if (d < 0) ptr = hyptr; /* go back to hyphen as data */
4276     }
4277    
4278     /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
4279     127 for caseless matching, we will need to use an XCLASS. */
4280    
4281     if (d >= 0)
4282     {
4283     class_optcount = 10; /* Ensure > 1 */
4284     if (d < c)
4285     {
4286     errorcode = ERR8;
4287     goto PCRE_ERROR_RETURN;
4288     }
4289    
4290     #ifdef SUPPORT_UTF8
4291     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4292     {
4293     uschar buffer[6];
4294     if (!class_utf8) /* Allow for XCLASS overhead */
4295     {
4296     class_utf8 = TRUE;
4297     length += LINK_SIZE + 2;
4298     }
4299    
4300     #ifdef SUPPORT_UCP
4301     /* If we have UCP support, find out how many extra ranges are
4302     needed to map the other case of characters within this range. We
4303     have to mimic the range optimization here, because extending the
4304     range upwards might push d over a boundary that makes it use
4305     another byte in the UTF-8 representation. */
4306    
4307     if ((options & PCRE_CASELESS) != 0)
4308     {
4309     int occ, ocd;
4310     int cc = c;
4311     int origd = d;
4312     while (get_othercase_range(&cc, origd, &occ, &ocd))
4313     {
4314     if (occ >= c && ocd <= d) continue; /* Skip embedded */
4315    
4316     if (occ < c && ocd >= c - 1) /* Extend the basic range */
4317     { /* if there is overlap, */
4318     c = occ; /* noting that if occ < c */
4319     continue; /* we can't have ocd > d */
4320     } /* because a subrange is */
4321     if (ocd > d && occ <= d + 1) /* always shorter than */
4322     { /* the basic range. */
4323     d = ocd;
4324     continue;
4325     }
4326    
4327     /* An extra item is needed */
4328    
4329     length += 1 + _pcre_ord2utf8(occ, buffer) +
4330     ((occ == ocd)? 0 : _pcre_ord2utf8(ocd, buffer));
4331     }
4332     }
4333     #endif /* SUPPORT_UCP */
4334