/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 81 - (hide annotations) (download)
Sat Feb 24 21:40:59 2007 UTC (7 years, 8 months ago) by nigel
File MIME type: text/plain
File size: 159806 byte(s)
Load pcre-6.2 into code/trunk.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9     Copyright (c) 1997-2005 University of Cambridge
10    
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45     #include "pcre_internal.h"
46    
47    
48     /*************************************************
49     * Code parameters and static tables *
50     *************************************************/
51    
52     /* Maximum number of items on the nested bracket stacks at compile time. This
53     applies to the nesting of all kinds of parentheses. It does not limit
54     un-nested, non-capturing parentheses. This number can be made bigger if
55     necessary - it is used to dimension one int and one unsigned char vector at
56     compile time. The value is a number of stack entries, not a size in bytes. */
57    
58     #define BRASTACK_SIZE 200
59    
60    
61     /* Table for handling escaped characters in the range '0'-'z'. Positive returns
62     are simple data values; negative values are for special things like \d and so
63     on. Zero means further processing is needed (for things like \x), or the escape
64     is invalid.
       check_escape() is the reader of this table in this part of the file: it
       indexes the ASCII version with (c - '0') and the EBCDIC version with
       (c - 0x48), so the first entry of each table corresponds to that base
       character code. In the EBCDIC version the leading comment on each row is the
       hexadecimal code of the row's first entry (rows are eight entries wide). */
65    
66     #if !EBCDIC /* This is the "normal" table for ASCII systems */
67     static const short int escapes[] = {
68     0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
69     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
70     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
71     0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
72     -ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
73     -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
74     '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
75     0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
76     -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
77     0, 0, -ESC_z /* x - z */
78     };
79    
80     #else /* This is the "abnormal" table for EBCDIC systems */
81     static const short int escapes[] = {
82     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
83     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
84     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
85     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
86     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
87     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
88     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
89     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
90     /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
91     /* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p,
92     /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
93     /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
94     /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
95     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
96     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
97     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
98     /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
99     /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
100     /* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0,
101     /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
102     /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
103     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
104     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
105     };
106     #endif
107    
108    
109     /* Tables of names of POSIX character classes and their lengths. The list is
110     terminated by a zero length entry. The first three must be alpha, lower, upper,
111     as this is assumed for handling case independence.
        The two tables are parallel: posix_name_lengths[n] is the length of
        posix_names[n]. Only the lengths table carries the terminating zero entry,
        so it has one more element than the names table. */
112    
113     static const char *const posix_names[] = {
114     "alpha", "lower", "upper",
115     "alnum", "ascii", "blank", "cntrl", "digit", "graph",
116     "print", "punct", "space", "word", "xdigit" };
117    
118     static const uschar posix_name_lengths[] = {
119     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
120    
121     /* Table of class bit maps for each POSIX class; up to three may be combined
122     to form the class. The table for [:blank:] is dynamically modified to remove
123     the vertical space characters.
        Layout: three consecutive entries per class, with the rows in the same order
        as the names in posix_names. Each entry is a cbit_xxx value; -1 fills the
        slots of a row that are not needed (NOTE(review): the exact handling of -1
        is in code outside this part of the file - confirm there). */
124    
125     static const int posix_class_maps[] = {
126     cbit_lower, cbit_upper, -1, /* alpha */
127     cbit_lower, -1, -1, /* lower */
128     cbit_upper, -1, -1, /* upper */
129     cbit_digit, cbit_lower, cbit_upper, /* alnum */
130     cbit_print, cbit_cntrl, -1, /* ascii */
131     cbit_space, -1, -1, /* blank - a GNU extension */
132     cbit_cntrl, -1, -1, /* cntrl */
133     cbit_digit, -1, -1, /* digit */
134     cbit_graph, -1, -1, /* graph */
135     cbit_print, -1, -1, /* print */
136     cbit_punct, -1, -1, /* punct */
137     cbit_space, -1, -1, /* space */
138     cbit_word, -1, -1, /* word - a Perl extension */
139     cbit_xdigit,-1, -1 /* xdigit */
140     };
141    
142    
143     /* The texts of compile-time error messages. These are "char *" because they
144     are passed to the outside world.
        The position of a message in the table is its error number: the ERRn code
        set by the functions in this file selects entry n (for example ERR5 is
        "number too big in {} quantifier" and ERR46 is "malformed \P or \p
        sequence"). The numeric comments inside the table give the index of the
        entry that follows them. Messages must therefore only ever be appended, or
        replaced in place, never inserted or removed. */
145    
146     static const char *error_texts[] = {
147     "no error",
148     "\\ at end of pattern",
149     "\\c at end of pattern",
150     "unrecognized character follows \\",
151     "numbers out of order in {} quantifier",
152     /* 5 */
153     "number too big in {} quantifier",
154     "missing terminating ] for character class",
155     "invalid escape sequence in character class",
156     "range out of order in character class",
157     "nothing to repeat",
158     /* 10 */
159     "operand of unlimited repeat could match the empty string",
160     "internal error: unexpected repeat",
161     "unrecognized character after (?",
162     "POSIX named classes are supported only within a class",
163     "missing )",
164     /* 15 */
165     "reference to non-existent subpattern",
166     "erroffset passed as NULL",
167     "unknown option bit(s) set",
168     "missing ) after comment",
169     "parentheses nested too deeply",
170     /* 20 */
171     "regular expression too large",
172     "failed to get memory",
173     "unmatched parentheses",
174     "internal error: code overflow",
175     "unrecognized character after (?<",
176     /* 25 */
177     "lookbehind assertion is not fixed length",
178     "malformed number after (?(",
179     "conditional group contains more than two branches",
180     "assertion expected after (?(",
181     "(?R or (?digits must be followed by )",
182     /* 30 */
183     "unknown POSIX class name",
184     "POSIX collating elements are not supported",
185     "this version of PCRE is not compiled with PCRE_UTF8 support",
186     "spare error",
187     "character value in \\x{...} sequence is too large",
188     /* 35 */
189     "invalid condition (?(0)",
190     "\\C not allowed in lookbehind assertion",
191     "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
192     "number after (?C is > 255",
193     "closing ) for (?C expected",
194     /* 40 */
195     "recursive call could loop indefinitely",
196     "unrecognized character after (?P",
197     "syntax error after (?P",
198     "two named groups have the same name",
199     "invalid UTF-8 string",
200     /* 45 */
201     "support for \\P, \\p, and \\X has not been compiled",
202     "malformed \\P or \\p sequence",
203     "unknown property name after \\P or \\p"
204     };
205    
206    
207     /* Table to identify digits and hex digits. This is used when compiling
208     patterns. Note that the tables in chartables are dependent on the locale, and
209     may mark arbitrary characters as digits - but the PCRE compiling code expects
210     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
211     a private table here. It costs 256 bytes, but it is a lot faster than doing
212     character value tests (at least in some simple cases I timed), and in some
213     applications one wants PCRE to compile efficiently as well as match
214     efficiently.
215    
216     For convenience, we use the same bit definitions as in chartables:
217    
218     0x04 decimal digit
219     0x08 hexadecimal digit
220    
221     Then we can use ctype_digit and ctype_xdigit in the code.
        
        Each table has 256 entries and is indexed directly by the character code.
        Decimal digits carry both bits (0x0c); the letters a-f and A-F carry only the
        hexadecimal bit (0x08). The EBCDIC build additionally defines ebcdic_chartab,
        a locale-independent partial copy of the character-type table;
        check_escape() tests its 0x0E bits to decide whether an escaped character is
        alphameric. */
222    
223     #if !EBCDIC /* This is the "normal" case, for ASCII systems */
224     static const unsigned char digitab[] =
225     {
226     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
227     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
228     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
229     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
230     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
231     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
232     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
233     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
234     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
235     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
236     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
237     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
238     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
239     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
240     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
241     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
242     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
243     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
244     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
245     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
246     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
247     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
248     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
249     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
250     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
251     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
252     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
253     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
254     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
255     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
256     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
257     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
258    
259     #else /* This is the "abnormal" case, for EBCDIC systems */
260     static const unsigned char digitab[] =
261     {
262     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
263     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
264     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
265     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
266     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
267     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
268     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
269     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
270     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
271     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
272     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
273     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- */
274     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
275     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
276     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
277     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
278     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
279     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
280     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
281     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
282     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
283     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
284     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
285     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
286     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
287     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
288     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
289     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
290     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
291     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
292     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
293     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
294    
295     static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
296     0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
297     0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
298     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
299     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
300     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
301     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
302     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
303     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
304     0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
305     0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
306     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
307     0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- */
308     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
309     0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
310     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
311     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
312     0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
313     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
314     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
315     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
316     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
317     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
318     0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
319     0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
320     0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
321     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
322     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
323     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
324     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
325     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
326     0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
327     0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
328     #endif
329    
330    
331     /* Definition to allow mutual recursion. This is only a prototype: it lets
        functions that appear before the body of compile_regex() call it. The
        parameters are unnamed here; NOTE(review): see the function definition
        (outside this part of the file) for the meaning of each argument. */
332    
333     static BOOL
334     compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,
335     int *, int *, branch_chain *, compile_data *);
336    
337    
338    
339     /*************************************************
340     * Handle escapes *
341     *************************************************/
342    
343     /* This function is called when a \ has been encountered. It either returns a
344     positive value for a simple escape such as \n, or a negative value which
345     encodes one of the more complicated things such as \d. When UTF-8 is enabled,
346     a positive value greater than 255 may be returned. On entry, ptr is pointing at
347     the \. On exit, it is on the final character of the escape sequence.
348    
349     Arguments:
350     ptrptr points to the pattern position pointer
351     errorcodeptr points to the errorcode variable
352     bracount number of previous extracting brackets
353     options the options bits
354     isclass TRUE if inside a character class
355    
356     Returns: zero or positive => a data character
357     negative => a special escape sequence
358     on error, errorptr is set
359     */
360    
361     static int
362     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
363     int options, BOOL isclass)
364     {
365     const uschar *ptr = *ptrptr;
366     int c, i;
367    
368     /* If backslash is at the end of the pattern, it's an error. */
369    
370     c = *(++ptr);
371     if (c == 0) *errorcodeptr = ERR1;
372    
373     /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
374     a table. A non-zero result is something that can be returned immediately.
375     Otherwise further processing may be required. */
376    
377     #if !EBCDIC /* ASCII coding */
378     else if (c < '0' || c > 'z') {} /* Not alphameric */
379     else if ((i = escapes[c - '0']) != 0) c = i;
380    
381     #else /* EBCDIC coding */
382     else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
383     else if ((i = escapes[c - 0x48]) != 0) c = i;
384     #endif
385    
386     /* Escapes that need further processing, or are illegal. */
387    
388     else
389     {
390     const uschar *oldptr;
391     switch (c)
392     {
393     /* A number of Perl escapes are not handled by PCRE. We give an explicit
394     error. */
395    
396     case 'l':
397     case 'L':
398     case 'N':
399     case 'u':
400     case 'U':
401     *errorcodeptr = ERR37;
402     break;
403    
404     /* The handling of escape sequences consisting of a string of digits
405     starting with one that is not zero is not straightforward. By experiment,
406     the way Perl works seems to be as follows:
407    
408     Outside a character class, the digits are read as a decimal number. If the
409     number is less than 10, or if there are that many previous extracting
410     left brackets, then it is a back reference. Otherwise, up to three octal
411     digits are read to form an escaped byte. Thus \123 is likely to be octal
412     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
413     value is greater than 377, the least significant 8 bits are taken. Inside a
414     character class, \ followed by a digit is always an octal number. */
415    
416     case '1': case '2': case '3': case '4': case '5':
417     case '6': case '7': case '8': case '9':
418    
419     if (!isclass)
420     {
421     oldptr = ptr;
422     c -= '0';
423     while ((digitab[ptr[1]] & ctype_digit) != 0)
424     c = c * 10 + *(++ptr) - '0';
425     if (c < 10 || c <= bracount)
426     {
427     c = -(ESC_REF + c);
428     break;
429     }
430     ptr = oldptr; /* Put the pointer back and fall through */
431     }
432    
433     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
434     generates a binary zero byte and treats the digit as a following literal.
435     Thus we have to pull back the pointer by one. */
436    
437     if ((c = *ptr) >= '8')
438     {
439     ptr--;
440     c = 0;
441     break;
442     }
443    
444     /* \0 always starts an octal number, but we may drop through to here with a
445     larger first octal digit. */
446    
447     case '0':
448     c -= '0';
449     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
450     c = c * 8 + *(++ptr) - '0';
451     c &= 255; /* Take least significant 8 bits */
452     break;
453    
454     /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
455     which can be greater than 0xff, but only if the ddd are hex digits. */
456    
457     case 'x':
458     #ifdef SUPPORT_UTF8
459     if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
460     {
461     const uschar *pt = ptr + 2;
462     register int count = 0;
463     c = 0;
464     while ((digitab[*pt] & ctype_xdigit) != 0)
465     {
466     int cc = *pt++;
467     count++;
468     #if !EBCDIC /* ASCII coding */
469     if (cc >= 'a') cc -= 32; /* Convert to upper case */
470     c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
471     #else /* EBCDIC coding */
472     if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
473     c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
474     #endif
475     }
476     if (*pt == '}')
477     {
478     if (c < 0 || count > 8) *errorcodeptr = ERR34;
479     ptr = pt;
480     break;
481     }
482     /* If the sequence of hex digits does not end with '}', then we don't
483     recognize this construct; fall through to the normal \x handling. */
484     }
485     #endif
486    
487     /* Read just a single hex char */
488    
489     c = 0;
490     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
491     {
492     int cc; /* Some compilers don't like ++ */
493     cc = *(++ptr); /* in initializers */
494     #if !EBCDIC /* ASCII coding */
495     if (cc >= 'a') cc -= 32; /* Convert to upper case */
496     c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
497     #else /* EBCDIC coding */
498     if (cc <= 'z') cc += 64; /* Convert to upper case */
499     c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
500     #endif
501     }
502     break;
503    
504     /* Other special escapes not starting with a digit are straightforward */
505    
506     case 'c':
507     c = *(++ptr);
508     if (c == 0)
509     {
510     *errorcodeptr = ERR2;
511     return 0;
512     }
513    
514     /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
515     is ASCII-specific, but then the whole concept of \cx is ASCII-specific.
516     (However, an EBCDIC equivalent has now been added.) */
517    
518     #if !EBCDIC /* ASCII coding */
519     if (c >= 'a' && c <= 'z') c -= 32;
520     c ^= 0x40;
521     #else /* EBCDIC coding */
522     if (c >= 'a' && c <= 'z') c += 64;
523     c ^= 0xC0;
524     #endif
525     break;
526    
527     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
528     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
529     for Perl compatibility, it is a literal. This code looks a bit odd, but
530     there used to be some cases other than the default, and there may be again
531     in future, so I haven't "optimized" it. */
532    
533     default:
534     if ((options & PCRE_EXTRA) != 0) switch(c)
535     {
536     default:
537     *errorcodeptr = ERR3;
538     break;
539     }
540     break;
541     }
542     }
543    
544     *ptrptr = ptr;
545     return c;
546     }
547    
548    
549    
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* Parse the property name that follows \P or \p and look it up. This function
is compiled only when PCRE is built with support for Unicode properties. On
entry, *ptrptr is pointing at the P or p. On exit, it is pointing at the final
character of the escape sequence (or at the place where parsing stopped, if
there was an error).

The name is either the single character after the P or p, or up to two
characters enclosed in {}, optionally preceded by ^ for negation.

Arguments:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE;
                   it is left untouched if the pattern ends straight after
                   the P or p
  errorcodeptr   points to the error code variable; set to ERR46 for a
                   malformed sequence and ERR47 for an unknown name

Returns:         value from ucp_type_table, or -1 for an invalid type
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *errorcodeptr)
{
const uschar *ptr = *ptrptr;
char name[4];
int ch, len, lo, hi;

ch = *(++ptr);
if (ch == 0) goto ERROR_RETURN;

*negptr = FALSE;

/* The simple form: exactly one character names the property. */

if (ch != '{')
  {
  name[0] = ch;
  name[1] = 0;
  }

/* The braced form. At most three characters are examined after the optional
circumflex; if none of them is the closing brace, the name is too long. */

else
  {
  if (ptr[1] == '^')
    {
    *negptr = TRUE;
    ptr++;
    }

  len = 0;
  do
    {
    ch = *(++ptr);
    if (ch == 0) goto ERROR_RETURN;
    if (ch == '}') break;
    name[len++] = ch;
    }
  while (len <= 2);

  /* No closing brace yet. Look ahead for one in order to tell an over-long
  (unknown) name apart from an unterminated (malformed) sequence. */

  if (ch != '}')
    {
    do ptr++; while (*ptr != 0 && *ptr != '}');
    if (*ptr == '}') goto UNKNOWN_RETURN;
    goto ERROR_RETURN;
    }

  name[len] = 0;
  }

*ptrptr = ptr;

/* Binary chop search of the property name table */

lo = 0;
hi = _pcre_utt_size;

while (lo < hi)
  {
  int mid = (lo + hi)/2;
  int cmp = strcmp(name, _pcre_utt[mid].name);
  if (cmp == 0) return _pcre_utt[mid].value;
  if (cmp > 0) lo = mid + 1; else hi = mid;
  }

/* The name is well formed but is not in the table */

UNKNOWN_RETURN:
*errorcodeptr = ERR47;
*ptrptr = ptr;
return -1;

/* The sequence itself is malformed */

ERROR_RETURN:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
639    
640    
641    
642    
643     /*************************************************
644     * Check for counted repeat *
645     *************************************************/
646    
647     /* This function is called when a '{' is encountered in a place where it might
648     start a quantifier. It looks ahead to see if it really is a quantifier or not.
649     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
650     where the ddds are digits.
651    
652     Arguments:
653     p pointer to the first char after '{'
654    
655     Returns: TRUE or FALSE
656     */
657    
658     static BOOL
659     is_counted_repeat(const uschar *p)
660     {
661     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
662     while ((digitab[*p] & ctype_digit) != 0) p++;
663     if (*p == '}') return TRUE;
664    
665     if (*p++ != ',') return FALSE;
666     if (*p == '}') return TRUE;
667    
668     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
669     while ((digitab[*p] & ctype_digit) != 0) p++;
670    
671     return (*p == '}');
672     }
673    
674    
675    
676     /*************************************************
677     * Read repeat counts *
678     *************************************************/
679    
680     /* Read an item of the form {n,m} and return the values. This is called only
681     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
682     so the syntax is guaranteed to be correct, but we need to check the values.
683    
684     Arguments:
685     p pointer to first char after '{'
686     minp pointer to int for min
687     maxp pointer to int for max
688     returned as -1 if no max
689     errorcodeptr points to error code variable
690    
691     Returns: pointer to '}' on success;
692     current ptr on error, with errorcodeptr set non-zero
693     */
694    
695     static const uschar *
696     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
697     {
698     int min = 0;
699     int max = -1;
700    
701 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
702     an integer overflow. */
703    
704 nigel 77 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
705 nigel 81 if (min < 0 || min > 65535)
706     {
707     *errorcodeptr = ERR5;
708     return p;
709     }
710 nigel 77
711 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
712     Also, max must not be less than min. */
713    
714 nigel 77 if (*p == '}') max = min; else
715     {
716     if (*(++p) != '}')
717     {
718     max = 0;
719     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
720 nigel 81 if (max < 0 || max > 65535)
721     {
722     *errorcodeptr = ERR5;
723     return p;
724     }
725 nigel 77 if (max < min)
726     {
727     *errorcodeptr = ERR4;
728     return p;
729     }
730     }
731     }
732    
733 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
734     '}'. */
735 nigel 77
736 nigel 81 *minp = min;
737     *maxp = max;
738 nigel 77 return p;
739     }
740    
741    
742    
743     /*************************************************
744     * Find first significant op code *
745     *************************************************/
746    
747     /* This is called by several functions that scan a compiled expression looking
748     for a fixed first character, or an anchoring op code etc. It skips over things
749     that do not influence this. For some calls, a change of option is important.
750     For some calls, it makes sense to skip negative forward and all backward
751     assertions, and also the \b assertion; for others it does not.
752    
753     Arguments:
754     code pointer to the start of the group
755     options pointer to external options
756     optbit the option bit whose changing is significant, or
757     zero if none are
758     skipassert TRUE if certain assertions are to be skipped
759    
760     Returns: pointer to the first significant opcode
761     */
762    
763     static const uschar*
764     first_significant_code(const uschar *code, int *options, int optbit,
765     BOOL skipassert)
766     {
767     for (;;)
768     {
769     switch ((int)*code)
770     {
771     case OP_OPT:
772     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
773     *options = (int)code[1];
774     code += 2;
775     break;
776    
777     case OP_ASSERT_NOT:
778     case OP_ASSERTBACK:
779     case OP_ASSERTBACK_NOT:
780     if (!skipassert) return code;
781     do code += GET(code, 1); while (*code == OP_ALT);
782     code += _pcre_OP_lengths[*code];
783     break;
784    
785     case OP_WORD_BOUNDARY:
786     case OP_NOT_WORD_BOUNDARY:
787     if (!skipassert) return code;
788     /* Fall through */
789    
790     case OP_CALLOUT:
791     case OP_CREF:
792     case OP_BRANUMBER:
793     code += _pcre_OP_lengths[*code];
794     break;
795    
796     default:
797     return code;
798     }
799     }
800     /* Control never reaches here */
801     }
802    
803    
804    
805    
806     /*************************************************
807     * Find the fixed length of a pattern *
808     *************************************************/
809    
810     /* Scan a pattern and compute the fixed length of subject that will match it,
811     if the length is fixed. This is needed for dealing with backward assertions.
812     In UTF8 mode, the result is in characters rather than bytes.
813    
814     Arguments:
815     code points to the start of the pattern (the bracket)
816     options the compiling options
817    
818     Returns: the fixed length, or -1 if there is no fixed length,
819     or -2 if \C was encountered
820     */
821    
822     static int
823     find_fixedlength(uschar *code, int options)
824     {
825     int length = -1;
826    
827     register int branchlength = 0;
828     register uschar *cc = code + 1 + LINK_SIZE;
829    
830     /* Scan along the opcodes for this branch. If we get to the end of the
831     branch, check the length against that of the other branches. */
832    
833     for (;;)
834     {
835     int d;
836     register int op = *cc;
837     if (op >= OP_BRA) op = OP_BRA;
838    
839     switch (op)
840     {
841     case OP_BRA:
842     case OP_ONCE:
843     case OP_COND:
844     d = find_fixedlength(cc, options);
845     if (d < 0) return d;
846     branchlength += d;
847     do cc += GET(cc, 1); while (*cc == OP_ALT);
848     cc += 1 + LINK_SIZE;
849     break;
850    
851     /* Reached end of a branch; if it's a ket it is the end of a nested
852     call. If it's ALT it is an alternation in a nested call. If it is
853     END it's the end of the outer call. All can be handled by the same code. */
854    
855     case OP_ALT:
856     case OP_KET:
857     case OP_KETRMAX:
858     case OP_KETRMIN:
859     case OP_END:
860     if (length < 0) length = branchlength;
861     else if (length != branchlength) return -1;
862     if (*cc != OP_ALT) return length;
863     cc += 1 + LINK_SIZE;
864     branchlength = 0;
865     break;
866    
867     /* Skip over assertive subpatterns */
868    
869     case OP_ASSERT:
870     case OP_ASSERT_NOT:
871     case OP_ASSERTBACK:
872     case OP_ASSERTBACK_NOT:
873     do cc += GET(cc, 1); while (*cc == OP_ALT);
874     /* Fall through */
875    
876     /* Skip over things that don't match chars */
877    
878     case OP_REVERSE:
879     case OP_BRANUMBER:
880     case OP_CREF:
881     case OP_OPT:
882     case OP_CALLOUT:
883     case OP_SOD:
884     case OP_SOM:
885     case OP_EOD:
886     case OP_EODN:
887     case OP_CIRC:
888     case OP_DOLL:
889     case OP_NOT_WORD_BOUNDARY:
890     case OP_WORD_BOUNDARY:
891     cc += _pcre_OP_lengths[*cc];
892     break;
893    
894     /* Handle literal characters */
895    
896     case OP_CHAR:
897     case OP_CHARNC:
898     branchlength++;
899     cc += 2;
900     #ifdef SUPPORT_UTF8
901     if ((options & PCRE_UTF8) != 0)
902     {
903     while ((*cc & 0xc0) == 0x80) cc++;
904     }
905     #endif
906     break;
907    
908     /* Handle exact repetitions. The count is already in characters, but we
909     need to skip over a multibyte character in UTF8 mode. */
910    
911     case OP_EXACT:
912     branchlength += GET2(cc,1);
913     cc += 4;
914     #ifdef SUPPORT_UTF8
915     if ((options & PCRE_UTF8) != 0)
916     {
917     while((*cc & 0x80) == 0x80) cc++;
918     }
919     #endif
920     break;
921    
922     case OP_TYPEEXACT:
923     branchlength += GET2(cc,1);
924     cc += 4;
925     break;
926    
927     /* Handle single-char matchers */
928    
929     case OP_PROP:
930     case OP_NOTPROP:
931     cc++;
932     /* Fall through */
933    
934     case OP_NOT_DIGIT:
935     case OP_DIGIT:
936     case OP_NOT_WHITESPACE:
937     case OP_WHITESPACE:
938     case OP_NOT_WORDCHAR:
939     case OP_WORDCHAR:
940     case OP_ANY:
941     branchlength++;
942     cc++;
943     break;
944    
945     /* The single-byte matcher isn't allowed */
946    
947     case OP_ANYBYTE:
948     return -2;
949    
950     /* Check a class for variable quantification */
951    
952     #ifdef SUPPORT_UTF8
953     case OP_XCLASS:
954     cc += GET(cc, 1) - 33;
955     /* Fall through */
956     #endif
957    
958     case OP_CLASS:
959     case OP_NCLASS:
960     cc += 33;
961    
962     switch (*cc)
963     {
964     case OP_CRSTAR:
965     case OP_CRMINSTAR:
966     case OP_CRQUERY:
967     case OP_CRMINQUERY:
968     return -1;
969    
970     case OP_CRRANGE:
971     case OP_CRMINRANGE:
972     if (GET2(cc,1) != GET2(cc,3)) return -1;
973     branchlength += GET2(cc,1);
974     cc += 5;
975     break;
976    
977     default:
978     branchlength++;
979     }
980     break;
981    
982     /* Anything else is variable length */
983    
984     default:
985     return -1;
986     }
987     }
988     /* Control never gets here */
989     }
990    
991    
992    
993    
994     /*************************************************
995     * Scan compiled regex for numbered bracket *
996     *************************************************/
997    
998     /* This little function scans through a compiled pattern until it finds a
999     capturing bracket with the given number.
1000    
1001     Arguments:
1002     code points to start of expression
1003     utf8 TRUE in UTF-8 mode
1004     number the required bracket number
1005    
1006     Returns: pointer to the opcode for the bracket, or NULL if not found
1007     */
1008    
1009     static const uschar *
1010     find_bracket(const uschar *code, BOOL utf8, int number)
1011     {
1012     #ifndef SUPPORT_UTF8
1013     utf8 = utf8; /* Stop pedantic compilers complaining */
1014     #endif
1015    
1016     for (;;)
1017     {
1018     register int c = *code;
1019     if (c == OP_END) return NULL;
1020     else if (c > OP_BRA)
1021     {
1022     int n = c - OP_BRA;
1023     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1024     if (n == number) return (uschar *)code;
1025     code += _pcre_OP_lengths[OP_BRA];
1026     }
1027     else
1028     {
1029     code += _pcre_OP_lengths[c];
1030    
1031     #ifdef SUPPORT_UTF8
1032    
1033     /* In UTF-8 mode, opcodes that are followed by a character may be followed
1034     by a multi-byte character. The length in the table is a minimum, so we have
1035     to scan along to skip the extra bytes. All opcodes are less than 128, so we
1036     can use relatively efficient code. */
1037    
1038     if (utf8) switch(c)
1039     {
1040     case OP_CHAR:
1041     case OP_CHARNC:
1042     case OP_EXACT:
1043     case OP_UPTO:
1044     case OP_MINUPTO:
1045     case OP_STAR:
1046     case OP_MINSTAR:
1047     case OP_PLUS:
1048     case OP_MINPLUS:
1049     case OP_QUERY:
1050     case OP_MINQUERY:
1051     while ((*code & 0xc0) == 0x80) code++;
1052     break;
1053    
1054     /* XCLASS is used for classes that cannot be represented just by a bit
1055     map. This includes negated single high-valued characters. The length in
1056     the table is zero; the actual length is stored in the compiled code. */
1057    
1058     case OP_XCLASS:
1059     code += GET(code, 1) + 1;
1060     break;
1061     }
1062     #endif
1063     }
1064     }
1065     }
1066    
1067    
1068    
1069     /*************************************************
1070     * Scan compiled regex for recursion reference *
1071     *************************************************/
1072    
1073     /* This little function scans through a compiled pattern until it finds an
1074     instance of OP_RECURSE.
1075    
1076     Arguments:
1077     code points to start of expression
1078     utf8 TRUE in UTF-8 mode
1079    
1080     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1081     */
1082    
1083     static const uschar *
1084     find_recurse(const uschar *code, BOOL utf8)
1085     {
1086     #ifndef SUPPORT_UTF8
1087     utf8 = utf8; /* Stop pedantic compilers complaining */
1088     #endif
1089    
1090     for (;;)
1091     {
1092     register int c = *code;
1093     if (c == OP_END) return NULL;
1094     else if (c == OP_RECURSE) return code;
1095     else if (c > OP_BRA)
1096     {
1097     code += _pcre_OP_lengths[OP_BRA];
1098     }
1099     else
1100     {
1101     code += _pcre_OP_lengths[c];
1102    
1103     #ifdef SUPPORT_UTF8
1104    
1105     /* In UTF-8 mode, opcodes that are followed by a character may be followed
1106     by a multi-byte character. The length in the table is a minimum, so we have
1107     to scan along to skip the extra bytes. All opcodes are less than 128, so we
1108     can use relatively efficient code. */
1109    
1110     if (utf8) switch(c)
1111     {
1112     case OP_CHAR:
1113     case OP_CHARNC:
1114     case OP_EXACT:
1115     case OP_UPTO:
1116     case OP_MINUPTO:
1117     case OP_STAR:
1118     case OP_MINSTAR:
1119     case OP_PLUS:
1120     case OP_MINPLUS:
1121     case OP_QUERY:
1122     case OP_MINQUERY:
1123     while ((*code & 0xc0) == 0x80) code++;
1124     break;
1125    
1126     /* XCLASS is used for classes that cannot be represented just by a bit
1127     map. This includes negated single high-valued characters. The length in
1128     the table is zero; the actual length is stored in the compiled code. */
1129    
1130     case OP_XCLASS:
1131     code += GET(code, 1) + 1;
1132     break;
1133     }
1134     #endif
1135     }
1136     }
1137     }
1138    
1139    
1140    
1141     /*************************************************
1142     * Scan compiled branch for non-emptiness *
1143     *************************************************/
1144    
1145     /* This function scans through a branch of a compiled pattern to see whether it
1146     can match the empty string or not. It is called only from could_be_empty()
1147     below. Note that first_significant_code() skips over assertions. If we hit an
1148     unclosed bracket, we return "empty" - this means we've struck an inner bracket
1149     whose current branch will already have been scanned.
1150    
1151     Arguments:
1152     code points to start of search
1153     endcode points to where to stop
1154     utf8 TRUE if in UTF8 mode
1155    
1156     Returns: TRUE if what is matched could be empty
1157     */
1158    
1159     static BOOL
1160     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1161     {
1162     register int c;
1163     for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);
1164     code < endcode;
1165     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1166     {
1167     const uschar *ccode;
1168    
1169     c = *code;
1170    
1171     if (c >= OP_BRA)
1172     {
1173     BOOL empty_branch;
1174     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1175    
1176     /* Scan a closed bracket */
1177    
1178     empty_branch = FALSE;
1179     do
1180     {
1181     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1182     empty_branch = TRUE;
1183     code += GET(code, 1);
1184     }
1185     while (*code == OP_ALT);
1186     if (!empty_branch) return FALSE; /* All branches are non-empty */
1187     code += 1 + LINK_SIZE;
1188     c = *code;
1189     }
1190    
1191     else switch (c)
1192     {
1193     /* Check for quantifiers after a class */
1194    
1195     #ifdef SUPPORT_UTF8
1196     case OP_XCLASS:
1197     ccode = code + GET(code, 1);
1198     goto CHECK_CLASS_REPEAT;
1199     #endif
1200    
1201     case OP_CLASS:
1202     case OP_NCLASS:
1203     ccode = code + 33;
1204    
1205     #ifdef SUPPORT_UTF8
1206     CHECK_CLASS_REPEAT:
1207     #endif
1208    
1209     switch (*ccode)
1210     {
1211     case OP_CRSTAR: /* These could be empty; continue */
1212     case OP_CRMINSTAR:
1213     case OP_CRQUERY:
1214     case OP_CRMINQUERY:
1215     break;
1216    
1217     default: /* Non-repeat => class must match */
1218     case OP_CRPLUS: /* These repeats aren't empty */
1219     case OP_CRMINPLUS:
1220     return FALSE;
1221    
1222     case OP_CRRANGE:
1223     case OP_CRMINRANGE:
1224     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1225     break;
1226     }
1227     break;
1228    
1229     /* Opcodes that must match a character */
1230    
1231     case OP_PROP:
1232     case OP_NOTPROP:
1233     case OP_EXTUNI:
1234     case OP_NOT_DIGIT:
1235     case OP_DIGIT:
1236     case OP_NOT_WHITESPACE:
1237     case OP_WHITESPACE:
1238     case OP_NOT_WORDCHAR:
1239     case OP_WORDCHAR:
1240     case OP_ANY:
1241     case OP_ANYBYTE:
1242     case OP_CHAR:
1243     case OP_CHARNC:
1244     case OP_NOT:
1245     case OP_PLUS:
1246     case OP_MINPLUS:
1247     case OP_EXACT:
1248     case OP_NOTPLUS:
1249     case OP_NOTMINPLUS:
1250     case OP_NOTEXACT:
1251     case OP_TYPEPLUS:
1252     case OP_TYPEMINPLUS:
1253     case OP_TYPEEXACT:
1254     return FALSE;
1255    
1256     /* End of branch */
1257    
1258     case OP_KET:
1259     case OP_KETRMAX:
1260     case OP_KETRMIN:
1261     case OP_ALT:
1262     return TRUE;
1263    
1264     /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be
1265     followed by a multibyte character */
1266    
1267     #ifdef SUPPORT_UTF8
1268     case OP_STAR:
1269     case OP_MINSTAR:
1270     case OP_QUERY:
1271     case OP_MINQUERY:
1272     case OP_UPTO:
1273     case OP_MINUPTO:
1274     if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1275     break;
1276     #endif
1277     }
1278     }
1279    
1280     return TRUE;
1281     }
1282    
1283    
1284    
1285     /*************************************************
1286     * Scan compiled regex for non-emptiness *
1287     *************************************************/
1288    
1289     /* This function is called to check for left recursive calls. We want to check
1290     the current branch of the current pattern to see if it could match the empty
1291     string. If it could, we must look outwards for branches at other levels,
1292     stopping when we pass beyond the bracket which is the subject of the recursion.
1293    
1294     Arguments:
1295     code points to start of the recursion
1296     endcode points to where to stop (current RECURSE item)
1297     bcptr points to the chain of current (unclosed) branch starts
1298     utf8 TRUE if in UTF-8 mode
1299    
1300     Returns: TRUE if what is matched could be empty
1301     */
1302    
1303     static BOOL
1304     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1305     BOOL utf8)
1306     {
1307     while (bcptr != NULL && bcptr->current >= code)
1308     {
1309     if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1310     bcptr = bcptr->outer;
1311     }
1312     return TRUE;
1313     }
1314    
1315    
1316    
1317     /*************************************************
1318     * Check for POSIX class syntax *
1319     *************************************************/
1320    
1321     /* This function is called when the sequence "[:" or "[." or "[=" is
1322     encountered in a character class. It checks whether this is followed by an
1323     optional ^ and then a sequence of letters, terminated by a matching ":]" or
1324     ".]" or "=]".
1325    
1326     Argument:
1327     ptr pointer to the initial [
1328     endptr where to return the end pointer
1329     cd pointer to compile data
1330    
1331     Returns: TRUE or FALSE
1332     */
1333    
1334     static BOOL
1335     check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1336     {
1337     int terminator; /* Don't combine these lines; the Solaris cc */
1338     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1339     if (*(++ptr) == '^') ptr++;
1340     while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1341     if (*ptr == terminator && ptr[1] == ']')
1342     {
1343     *endptr = ptr;
1344     return TRUE;
1345     }
1346     return FALSE;
1347     }
1348    
1349    
1350    
1351    
1352     /*************************************************
1353     * Check POSIX class name *
1354     *************************************************/
1355    
1356     /* This function is called to check the name given in a POSIX-style class entry
1357     such as [:alnum:].
1358    
1359     Arguments:
1360     ptr points to the first letter
1361     len the length of the name
1362    
1363     Returns: a value representing the name, or -1 if unknown
1364     */
1365    
1366     static int
1367     check_posix_name(const uschar *ptr, int len)
1368     {
1369     register int yield = 0;
1370     while (posix_name_lengths[yield] != 0)
1371     {
1372     if (len == posix_name_lengths[yield] &&
1373     strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1374     yield++;
1375     }
1376     return -1;
1377     }
1378    
1379    
1380     /*************************************************
1381     * Adjust OP_RECURSE items in repeated group *
1382     *************************************************/
1383    
1384     /* OP_RECURSE items contain an offset from the start of the regex to the group
1385     that is referenced. This means that groups can be replicated for fixed
1386     repetition simply by copying (because the recursion is allowed to refer to
1387     earlier groups that are outside the current group). However, when a group is
1388     optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1389     it, after it has been compiled. This means that any OP_RECURSE items within it
1390     that refer to the group itself or any contained groups have to have their
1391     offsets adjusted. That is the job of this function. Before it is called, the
1392     partially compiled regex must be temporarily terminated with OP_END.
1393    
1394     Arguments:
1395     group points to the start of the group
1396     adjust the amount by which the group is to be moved
1397     utf8 TRUE in UTF-8 mode
1398     cd contains pointers to tables etc.
1399    
1400     Returns: nothing
1401     */
1402    
1403     static void
1404     adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)
1405     {
1406     uschar *ptr = group;
1407     while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1408     {
1409     int offset = GET(ptr, 1);
1410     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1411     ptr += 1 + LINK_SIZE;
1412     }
1413     }
1414    
1415    
1416    
1417     /*************************************************
1418     * Insert an automatic callout point *
1419     *************************************************/
1420    
1421     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1422     callout points before each pattern item.
1423    
1424     Arguments:
1425     code current code pointer
1426     ptr current pattern pointer
1427     cd pointers to tables etc
1428    
1429     Returns: new code pointer
1430     */
1431    
1432     static uschar *
1433     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1434     {
1435     *code++ = OP_CALLOUT;
1436     *code++ = 255;
1437     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1438     PUT(code, LINK_SIZE, 0); /* Default length */
1439     return code + 2*LINK_SIZE;
1440     }
1441    
1442    
1443    
1444     /*************************************************
1445     * Complete a callout item *
1446     *************************************************/
1447    
1448     /* A callout item contains the length of the next item in the pattern, which
1449     we can't fill in till after we have reached the relevant point. This is used
1450     for both automatic and manual callouts.
1451    
1452     Arguments:
1453     previous_callout points to previous callout item
1454     ptr current pattern pointer
1455     cd pointers to tables etc
1456    
1457     Returns: nothing
1458     */
1459    
1460     static void
1461     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1462     {
1463     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1464     PUT(previous_callout, 2 + LINK_SIZE, length);
1465     }
1466    
1467    
1468    
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* This function is passed the start and end of a class range, in UTF-8 mode
with UCP support. It searches up the characters, looking for internal ranges of
characters in the "other" case. Each call returns the next one, updating the
start address.

A range starts at the first character for which the property lookup yields
ucp_L together with a non-zero other case. It is then extended for as long as
each following character also yields ucp_L and its other case is exactly one
more than that of the character before it.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)
{
int c, chartype, othercase, last;

/* Find the first character that has an other case */

c = *cptr;
while (c <= d &&
       !(_pcre_ucp_findchar(c, &chartype, &othercase) == ucp_L &&
         othercase != 0))
  c++;

if (c > d) return FALSE;

/* Extend the range while the other-case values stay consecutive */

*ocptr = othercase;
last = othercase;
c++;

while (c <= d &&
       _pcre_ucp_findchar(c, &chartype, &othercase) == ucp_L &&
       othercase == last + 1)
  {
  last++;
  c++;
  }

*odptr = last;
*cptr = c;

return TRUE;
}
#endif  /* SUPPORT_UCP */
1518    
1519    
1520     /*************************************************
1521     * Compile one branch *
1522     *************************************************/
1523    
1524     /* Scan the pattern, compiling it into the code vector. If the options are
1525     changed during the branch, the pointer is used to change the external options
1526     bits.
1527    
1528     Arguments:
1529     optionsptr pointer to the option bits
1530     brackets points to number of extracting brackets used
1531     codeptr points to the pointer to the current code point
1532     ptrptr points to the current pattern pointer
1533     errorcodeptr points to error code variable
1534     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
1535     reqbyteptr set to the last literal character required, else < 0
1536     bcptr points to current branch chain
1537     cd contains pointers to tables etc.
1538    
1539     Returns: TRUE on success
1540     FALSE, with *errorcodeptr set non-zero on error
1541     */
1542    
1543     static BOOL
1544     compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
1545     const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,
1546     int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
1547     {
1548     int repeat_type, op_type;
1549     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
1550     int bravalue = 0;
1551     int greedy_default, greedy_non_default;
1552     int firstbyte, reqbyte;
1553     int zeroreqbyte, zerofirstbyte;
1554     int req_caseopt, reqvary, tempreqvary;
1555     int condcount = 0;
1556     int options = *optionsptr;
1557     int after_manual_callout = 0;
1558     register int c;
1559     register uschar *code = *codeptr;
1560     uschar *tempcode;
1561     BOOL inescq = FALSE;
1562     BOOL groupsetfirstbyte = FALSE;
1563     const uschar *ptr = *ptrptr;
1564     const uschar *tempptr;
1565     uschar *previous = NULL;
1566     uschar *previous_callout = NULL;
1567     uschar classbits[32];
1568    
1569     #ifdef SUPPORT_UTF8
1570     BOOL class_utf8;
1571     BOOL utf8 = (options & PCRE_UTF8) != 0;
1572     uschar *class_utf8data;
1573     uschar utf8_char[6];
1574     #else
1575     BOOL utf8 = FALSE;
1576     #endif
1577    
1578     /* Set up the default and non-default settings for greediness */
1579    
1580     greedy_default = ((options & PCRE_UNGREEDY) != 0);
1581     greedy_non_default = greedy_default ^ 1;
1582    
1583     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
1584     matching encountered yet". It gets changed to REQ_NONE if we hit something that
1585     matches a non-fixed char first char; reqbyte just remains unset if we never
1586     find one.
1587    
1588     When we hit a repeat whose minimum is zero, we may have to adjust these values
1589     to take the zero repeat into account. This is implemented by setting them to
1590     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
1591     item types that can be repeated set these backoff variables appropriately. */
1592    
1593     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
1594    
1595     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
1596     according to the current setting of the caseless flag. REQ_CASELESS is a bit
1597     value > 255. It is added into the firstbyte or reqbyte variables to record the
1598     case status of the value. This is used only for ASCII characters. */
1599    
1600     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
1601    
1602     /* Switch on next character until the end of the branch */
1603    
1604     for (;; ptr++)
1605     {
1606     BOOL negate_class;
1607     BOOL possessive_quantifier;
1608     BOOL is_quantifier;
1609     int class_charcount;
1610     int class_lastchar;
1611     int newoptions;
1612     int recno;
1613     int skipbytes;
1614     int subreqbyte;
1615     int subfirstbyte;
1616     int mclength;
1617     uschar mcbuffer[8];
1618    
1619     /* Next byte in the pattern */
1620    
1621     c = *ptr;
1622    
1623     /* If in \Q...\E, check for the end; if not, we have a literal */
1624    
1625     if (inescq && c != 0)
1626     {
1627     if (c == '\\' && ptr[1] == 'E')
1628     {
1629     inescq = FALSE;
1630     ptr++;
1631     continue;
1632     }
1633     else
1634     {
1635     if (previous_callout != NULL)
1636     {
1637     complete_callout(previous_callout, ptr, cd);
1638     previous_callout = NULL;
1639     }
1640     if ((options & PCRE_AUTO_CALLOUT) != 0)
1641     {
1642     previous_callout = code;
1643     code = auto_callout(code, ptr, cd);
1644     }
1645     goto NORMAL_CHAR;
1646     }
1647     }
1648    
1649     /* Fill in length of a previous callout, except when the next thing is
1650     a quantifier. */
1651    
1652     is_quantifier = c == '*' || c == '+' || c == '?' ||
1653     (c == '{' && is_counted_repeat(ptr+1));
1654    
1655     if (!is_quantifier && previous_callout != NULL &&
1656     after_manual_callout-- <= 0)
1657     {
1658     complete_callout(previous_callout, ptr, cd);
1659     previous_callout = NULL;
1660     }
1661    
1662     /* In extended mode, skip white space and comments */
1663    
1664     if ((options & PCRE_EXTENDED) != 0)
1665     {
1666     if ((cd->ctypes[c] & ctype_space) != 0) continue;
1667     if (c == '#')
1668     {
1669     /* The space before the ; is to avoid a warning on a silly compiler
1670     on the Macintosh. */
1671     while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
1672     if (c != 0) continue; /* Else fall through to handle end of string */
1673     }
1674     }
1675    
1676     /* No auto callout for quantifiers. */
1677    
1678     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
1679     {
1680     previous_callout = code;
1681     code = auto_callout(code, ptr, cd);
1682     }
1683    
1684     switch(c)
1685     {
1686     /* The branch terminates at end of string, |, or ). */
1687    
1688     case 0:
1689     case '|':
1690     case ')':
1691     *firstbyteptr = firstbyte;
1692     *reqbyteptr = reqbyte;
1693     *codeptr = code;
1694     *ptrptr = ptr;
1695     return TRUE;
1696    
1697     /* Handle single-character metacharacters. In multiline mode, ^ disables
1698     the setting of any following char as a first character. */
1699    
1700     case '^':
1701     if ((options & PCRE_MULTILINE) != 0)
1702     {
1703     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1704     }
1705     previous = NULL;
1706     *code++ = OP_CIRC;
1707     break;
1708    
1709     case '$':
1710     previous = NULL;
1711     *code++ = OP_DOLL;
1712     break;
1713    
1714     /* There can never be a first char if '.' is first, whatever happens about
1715     repeats. The value of reqbyte doesn't change either. */
1716    
1717     case '.':
1718     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1719     zerofirstbyte = firstbyte;
1720     zeroreqbyte = reqbyte;
1721     previous = code;
1722     *code++ = OP_ANY;
1723     break;
1724    
1725     /* Character classes. If the included characters are all < 255 in value, we
1726     build a 32-byte bitmap of the permitted characters, except in the special
1727     case where there is only one such character. For negated classes, we build
1728     the map as usual, then invert it at the end. However, we use a different
1729     opcode so that data characters > 255 can be handled correctly.
1730    
1731     If the class contains characters outside the 0-255 range, a different
1732     opcode is compiled. It may optionally have a bit map for characters < 256,
1733     but those above are explicitly listed afterwards. A flag byte tells
1734     whether the bitmap is present, and whether this is a negated class or not.
1735     */
1736    
1737     case '[':
1738     previous = code;
1739    
1740     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
1741     they are encountered at the top level, so we'll do that too. */
1742    
1743     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1744     check_posix_syntax(ptr, &tempptr, cd))
1745     {
1746     *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
1747     goto FAILED;
1748     }
1749    
1750     /* If the first character is '^', set the negation flag and skip it. */
1751    
1752     if ((c = *(++ptr)) == '^')
1753     {
1754     negate_class = TRUE;
1755     c = *(++ptr);
1756     }
1757     else
1758     {
1759     negate_class = FALSE;
1760     }
1761    
1762     /* Keep a count of chars with values < 256 so that we can optimize the case
1763     of just a single character (as long as it's < 256). For higher valued UTF-8
1764     characters, we don't yet do any optimization. */
1765    
1766     class_charcount = 0;
1767     class_lastchar = -1;
1768    
1769     #ifdef SUPPORT_UTF8
1770     class_utf8 = FALSE; /* No chars >= 256 */
1771     class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
1772     #endif
1773    
1774     /* Initialize the 32-char bit map to all zeros. We have to build the
1775     map in a temporary bit of store, in case the class contains only 1
1776     character (< 256), because in that case the compiled code doesn't use the
1777     bit map. */
1778    
1779     memset(classbits, 0, 32 * sizeof(uschar));
1780    
1781     /* Process characters until ] is reached. By writing this as a "do" it
1782     means that an initial ] is taken as a data character. The first pass
1783     through the regex checked the overall syntax, so we don't need to be very
1784     strict here. At the start of the loop, c contains the first byte of the
1785     character. */
1786    
1787     do
1788     {
1789     #ifdef SUPPORT_UTF8
1790     if (utf8 && c > 127)
1791     { /* Braces are required because the */
1792     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
1793     }
1794     #endif
1795    
1796     /* Inside \Q...\E everything is literal except \E */
1797    
1798     if (inescq)
1799     {
1800     if (c == '\\' && ptr[1] == 'E')
1801     {
1802     inescq = FALSE;
1803     ptr++;
1804     continue;
1805     }
1806     else goto LONE_SINGLE_CHARACTER;
1807     }
1808    
1809     /* Handle POSIX class names. Perl allows a negation extension of the
1810     form [:^name:]. A square bracket that doesn't match the syntax is
1811     treated as a literal. We also recognize the POSIX constructions
1812     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1813     5.6 and 5.8 do. */
1814    
1815     if (c == '[' &&
1816     (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1817     check_posix_syntax(ptr, &tempptr, cd))
1818     {
1819     BOOL local_negate = FALSE;
1820     int posix_class, i;
1821     register const uschar *cbits = cd->cbits;
1822    
1823     if (ptr[1] != ':')
1824     {
1825     *errorcodeptr = ERR31;
1826     goto FAILED;
1827     }
1828    
1829     ptr += 2;
1830     if (*ptr == '^')
1831     {
1832     local_negate = TRUE;
1833     ptr++;
1834     }
1835    
1836     posix_class = check_posix_name(ptr, tempptr - ptr);
1837     if (posix_class < 0)
1838     {
1839     *errorcodeptr = ERR30;
1840     goto FAILED;
1841     }
1842    
1843     /* If matching is caseless, upper and lower are converted to
1844     alpha. This relies on the fact that the class table starts with
1845     alpha, lower, upper as the first 3 entries. */
1846    
1847     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1848     posix_class = 0;
1849    
1850     /* Or into the map we are building up to 3 of the static class
1851     tables, or their negations. The [:blank:] class sets up the same
1852     chars as the [:space:] class (all white space). We remove the vertical
1853     white space chars afterwards. */
1854    
1855     posix_class *= 3;
1856     for (i = 0; i < 3; i++)
1857     {
1858     BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;
1859     int taboffset = posix_class_maps[posix_class + i];
1860     if (taboffset < 0) break;
1861     if (local_negate)
1862     {
1863     if (i == 0)
1864     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];
1865     else
1866     for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];
1867     if (blankclass) classbits[1] |= 0x3c;
1868     }
1869     else
1870     {
1871     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];
1872     if (blankclass) classbits[1] &= ~0x3c;
1873     }
1874     }
1875    
1876     ptr = tempptr + 1;
1877     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
1878     continue; /* End of POSIX syntax handling */
1879     }
1880    
1881     /* Backslash may introduce a single character, or it may introduce one
1882     of the specials, which just set a flag. Escaped items are checked for
1883     validity in the pre-compiling pass. The sequence \b is a special case.
1884     Inside a class (and only there) it is treated as backspace. Elsewhere
1885     it marks a word boundary. Other escapes have preset maps ready to
1886     or into the one we are building. We assume they have more than one
1887     character in them, so set class_charcount bigger than one. */
1888    
1889     if (c == '\\')
1890     {
1891     c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);
1892    
1893     if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
1894     else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
1895     else if (-c == ESC_Q) /* Handle start of quoted string */
1896     {
1897     if (ptr[1] == '\\' && ptr[2] == 'E')
1898     {
1899     ptr += 2; /* avoid empty string */
1900     }
1901     else inescq = TRUE;
1902     continue;
1903     }
1904    
1905     if (c < 0)
1906     {
1907     register const uschar *cbits = cd->cbits;
1908     class_charcount += 2; /* Greater than 1 is what matters */
1909     switch (-c)
1910     {
1911     case ESC_d:
1912     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
1913     continue;
1914    
1915     case ESC_D:
1916     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
1917     continue;
1918    
1919     case ESC_w:
1920     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
1921     continue;
1922    
1923     case ESC_W:
1924     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
1925     continue;
1926    
1927     case ESC_s:
1928     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
1929     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
1930     continue;
1931    
1932     case ESC_S:
1933     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
1934     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
1935     continue;
1936    
1937     #ifdef SUPPORT_UCP
1938     case ESC_p:
1939     case ESC_P:
1940     {
1941     BOOL negated;
1942     int property = get_ucp(&ptr, &negated, errorcodeptr);
1943     if (property < 0) goto FAILED;
1944     class_utf8 = TRUE;
1945     *class_utf8data++ = ((-c == ESC_p) != negated)?
1946     XCL_PROP : XCL_NOTPROP;
1947     *class_utf8data++ = property;
1948     class_charcount -= 2; /* Not a < 256 character */
1949     }
1950     continue;
1951     #endif
1952    
1953     /* Unrecognized escapes are faulted if PCRE is running in its
1954     strict mode. By default, for compatibility with Perl, they are
1955     treated as literals. */
1956    
1957     default:
1958     if ((options & PCRE_EXTRA) != 0)
1959     {
1960     *errorcodeptr = ERR7;
1961     goto FAILED;
1962     }
1963     c = *ptr; /* The final character */
1964     class_charcount -= 2; /* Undo the default count from above */
1965     }
1966     }
1967    
1968     /* Fall through if we have a single character (c >= 0). This may be
1969     > 256 in UTF-8 mode. */
1970    
1971     } /* End of backslash handling */
1972    
1973     /* A single character may be followed by '-' to form a range. However,
1974     Perl does not permit ']' to be the end of the range. A '-' character
1975     here is treated as a literal. */
1976    
1977     if (ptr[1] == '-' && ptr[2] != ']')
1978     {
1979     int d;
1980     ptr += 2;
1981    
1982     #ifdef SUPPORT_UTF8
1983     if (utf8)
1984     { /* Braces are required because the */
1985     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
1986     }
1987     else
1988     #endif
1989     d = *ptr; /* Not UTF-8 mode */
1990    
1991     /* The second part of a range can be a single-character escape, but
1992     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1993     in such circumstances. */
1994    
1995     if (d == '\\')
1996     {
1997     const uschar *oldptr = ptr;
1998     d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);
1999    
2000     /* \b is backspace; \X is literal X; any other special means the '-'
2001     was literal */
2002    
2003     if (d < 0)
2004     {
2005     if (d == -ESC_b) d = '\b';
2006     else if (d == -ESC_X) d = 'X'; else
2007     {
2008     ptr = oldptr - 2;
2009     goto LONE_SINGLE_CHARACTER; /* A few lines below */
2010     }
2011     }
2012     }
2013    
2014     /* The check that the two values are in the correct order happens in
2015     the pre-pass. Optimize one-character ranges */
2016    
2017     if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2018    
2019     /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2020     matching, we have to use an XCLASS with extra data items. Caseless
2021     matching for characters > 127 is available only if UCP support is
2022     available. */
2023    
2024     #ifdef SUPPORT_UTF8
2025     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2026     {
2027     class_utf8 = TRUE;
2028    
2029     /* With UCP support, we can find the other case equivalents of
2030     the relevant characters. There may be several ranges. Optimize how
2031     they fit with the basic range. */
2032    
2033     #ifdef SUPPORT_UCP
2034     if ((options & PCRE_CASELESS) != 0)
2035     {
2036     int occ, ocd;
2037     int cc = c;
2038     int origd = d;
2039     while (get_othercase_range(&cc, origd, &occ, &ocd))
2040     {
2041     if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
2042    
2043     if (occ < c && ocd >= c - 1) /* Extend the basic range */
2044     { /* if there is overlap, */
2045     c = occ; /* noting that if occ < c */
2046     continue; /* we can't have ocd > d */
2047     } /* because a subrange is */
2048     if (ocd > d && occ <= d + 1) /* always shorter than */
2049     { /* the basic range. */
2050     d = ocd;
2051     continue;
2052     }
2053    
2054     if (occ == ocd)
2055     {
2056     *class_utf8data++ = XCL_SINGLE;
2057     }
2058     else
2059     {
2060     *class_utf8data++ = XCL_RANGE;
2061     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2062     }
2063     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2064     }
2065     }
2066     #endif /* SUPPORT_UCP */
2067    
2068     /* Now record the original range, possibly modified for UCP caseless
2069     overlapping ranges. */
2070    
2071     *class_utf8data++ = XCL_RANGE;
2072     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2073     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2074    
2075     /* With UCP support, we are done. Without UCP support, there is no
2076     caseless matching for UTF-8 characters > 127; we can use the bit map
2077     for the smaller ones. */
2078    
2079     #ifdef SUPPORT_UCP
2080     continue; /* With next character in the class */
2081     #else
2082     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2083    
2084     /* Adjust upper limit and fall through to set up the map */
2085    
2086     d = 127;
2087    
2088     #endif /* SUPPORT_UCP */
2089     }
2090     #endif /* SUPPORT_UTF8 */
2091    
2092     /* We use the bit map for all cases when not in UTF-8 mode; else
2093     ranges that lie entirely within 0-127 when there is UCP support; else
2094     for partial ranges without UCP support. */
2095    
2096     for (; c <= d; c++)
2097     {
2098     classbits[c/8] |= (1 << (c&7));
2099     if ((options & PCRE_CASELESS) != 0)
2100     {
2101     int uc = cd->fcc[c]; /* flip case */
2102     classbits[uc/8] |= (1 << (uc&7));
2103     }
2104     class_charcount++; /* in case a one-char range */
2105     class_lastchar = c;
2106     }
2107    
2108     continue; /* Go get the next char in the class */
2109     }
2110    
2111     /* Handle a lone single character - we can get here for a normal
2112     non-escape char, or after \ that introduces a single character or for an
2113     apparent range that isn't. */
2114    
2115     LONE_SINGLE_CHARACTER:
2116    
2117     /* Handle a character that cannot go in the bit map */
2118    
2119     #ifdef SUPPORT_UTF8
2120     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2121     {
2122     class_utf8 = TRUE;
2123     *class_utf8data++ = XCL_SINGLE;
2124     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2125    
2126     #ifdef SUPPORT_UCP
2127     if ((options & PCRE_CASELESS) != 0)
2128     {
2129     int chartype;
2130     int othercase;
2131     if (_pcre_ucp_findchar(c, &chartype, &othercase) >= 0 &&
2132     othercase > 0)
2133     {
2134     *class_utf8data++ = XCL_SINGLE;
2135     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
2136     }
2137     }
2138     #endif /* SUPPORT_UCP */
2139    
2140     }
2141     else
2142     #endif /* SUPPORT_UTF8 */
2143    
2144     /* Handle a single-byte character */
2145     {
2146     classbits[c/8] |= (1 << (c&7));
2147     if ((options & PCRE_CASELESS) != 0)
2148     {
2149     c = cd->fcc[c]; /* flip case */
2150     classbits[c/8] |= (1 << (c&7));
2151     }
2152     class_charcount++;
2153     class_lastchar = c;
2154     }
2155     }
2156    
2157     /* Loop until ']' reached; the check for end of string happens inside the
2158     loop. This "while" is the end of the "do" above. */
2159    
2160     while ((c = *(++ptr)) != ']' || inescq);
2161    
2162     /* If class_charcount is 1, we saw precisely one character whose value is
2163     less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2164     can optimize the negative case only if there were no characters >= 128
2165     because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2166     single-bytes only. This is an historical hangover. Maybe one day we can
2167     tidy these opcodes to handle multi-byte characters.
2168    
2169     The optimization throws away the bit map. We turn the item into a
2170     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2171     that OP_NOT does not support multibyte characters. In the positive case, it
2172     can cause firstbyte to be set. Otherwise, there can be no first char if
2173     this item is first, whatever repeat count may follow. In the case of
2174     reqbyte, save the previous value for reinstating. */
2175    
2176     #ifdef SUPPORT_UTF8
2177     if (class_charcount == 1 &&
2178     (!utf8 ||
2179     (!class_utf8 && (!negate_class || class_lastchar < 128))))
2180    
2181     #else
2182     if (class_charcount == 1)
2183     #endif
2184     {
2185     zeroreqbyte = reqbyte;
2186    
2187     /* The OP_NOT opcode works on one-byte characters only. */
2188    
2189     if (negate_class)
2190     {
2191     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2192     zerofirstbyte = firstbyte;
2193     *code++ = OP_NOT;
2194     *code++ = class_lastchar;
2195     break;
2196     }
2197    
2198     /* For a single, positive character, get the value into mcbuffer, and
2199     then we can handle this with the normal one-character code. */
2200    
2201     #ifdef SUPPORT_UTF8
2202     if (utf8 && class_lastchar > 127)
2203     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
2204     else
2205     #endif
2206     {
2207     mcbuffer[0] = class_lastchar;
2208     mclength = 1;
2209     }
2210     goto ONE_CHAR;
2211     } /* End of 1-char optimization */
2212    
2213     /* The general case - not the one-char optimization. If this is the first
2214     thing in the branch, there can be no first char setting, whatever the
2215     repeat count. Any reqbyte setting must remain unchanged after any kind of
2216     repeat. */
2217    
2218     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2219     zerofirstbyte = firstbyte;
2220     zeroreqbyte = reqbyte;
2221    
2222     /* If there are characters with values > 255, we have to compile an
2223     extended class, with its own opcode. If there are no characters < 256,
2224     we can omit the bitmap. */
2225    
2226     #ifdef SUPPORT_UTF8
2227     if (class_utf8)
2228     {
2229     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2230     *code++ = OP_XCLASS;
2231     code += LINK_SIZE;
2232     *code = negate_class? XCL_NOT : 0;
2233    
2234     /* If the map is required, install it, and move on to the end of
2235     the extra data */
2236    
2237     if (class_charcount > 0)
2238     {
2239     *code++ |= XCL_MAP;
2240     memcpy(code, classbits, 32);
2241     code = class_utf8data;
2242     }
2243    
2244     /* If the map is not required, slide down the extra data. */
2245    
2246     else
2247     {
2248     int len = class_utf8data - (code + 33);
2249     memmove(code + 1, code + 33, len);
2250     code += len + 1;
2251     }
2252    
2253     /* Now fill in the complete length of the item */
2254    
2255     PUT(previous, 1, code - previous);
2256     break; /* End of class handling */
2257     }
2258     #endif
2259    
2260     /* If there are no characters > 255, negate the 32-byte map if necessary,
2261     and copy it into the code vector. If this is the first thing in the branch,
2262     there can be no first char setting, whatever the repeat count. Any reqbyte
2263     setting must remain unchanged after any kind of repeat. */
2264    
2265     if (negate_class)
2266     {
2267     *code++ = OP_NCLASS;
2268     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2269     }
2270     else
2271     {
2272     *code++ = OP_CLASS;
2273     memcpy(code, classbits, 32);
2274     }
2275     code += 32;
2276     break;
2277    
2278     /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2279     has been tested above. */
2280    
2281     case '{':
2282     if (!is_quantifier) goto NORMAL_CHAR;
2283     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
2284     if (*errorcodeptr != 0) goto FAILED;
2285     goto REPEAT;
2286    
2287     case '*':
2288     repeat_min = 0;
2289     repeat_max = -1;
2290     goto REPEAT;
2291    
2292     case '+':
2293     repeat_min = 1;
2294     repeat_max = -1;
2295     goto REPEAT;
2296    
2297     case '?':
2298     repeat_min = 0;
2299     repeat_max = 1;
2300    
2301     REPEAT:
2302     if (previous == NULL)
2303     {
2304     *errorcodeptr = ERR9;
2305     goto FAILED;
2306     }
2307    
2308     if (repeat_min == 0)
2309     {
2310     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2311     reqbyte = zeroreqbyte; /* Ditto */
2312     }
2313    
2314     /* Remember whether this is a variable length repeat */
2315    
2316     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2317    
2318     op_type = 0; /* Default single-char op codes */
2319     possessive_quantifier = FALSE; /* Default not possessive quantifier */
2320    
2321     /* Save start of previous item, in case we have to move it up to make space
2322     for an inserted OP_ONCE for the additional '+' extension. */
2323    
2324     tempcode = previous;
2325    
2326     /* If the next character is '+', we have a possessive quantifier. This
2327     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2328     If the next character is '?' this is a minimizing repeat, by default,
2329     but if PCRE_UNGREEDY is set, it works the other way round. We change the
2330     repeat type to the non-default. */
2331    
2332     if (ptr[1] == '+')
2333     {
2334     repeat_type = 0; /* Force greedy */
2335     possessive_quantifier = TRUE;
2336     ptr++;
2337     }
2338     else if (ptr[1] == '?')
2339     {
2340     repeat_type = greedy_non_default;
2341     ptr++;
2342     }
2343     else repeat_type = greedy_default;
2344    
2345     /* If previous was a recursion, we need to wrap it inside brackets so that
2346     it can be replicated if necessary. */
2347    
2348     if (*previous == OP_RECURSE)
2349     {
2350     memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2351     code += 1 + LINK_SIZE;
2352     *previous = OP_BRA;
2353     PUT(previous, 1, code - previous);
2354     *code = OP_KET;
2355     PUT(code, 1, code - previous);
2356     code += 1 + LINK_SIZE;
2357     }
2358    
2359     /* If previous was a character match, abolish the item and generate a
2360     repeat item instead. If a char item has a minimum of more than one, ensure
2361     that it is set in reqbyte - it might not be if a sequence such as x{3} is
2362     the first thing in a branch because the x will have gone into firstbyte
2363     instead. */
2364    
2365     if (*previous == OP_CHAR || *previous == OP_CHARNC)
2366     {
2367     /* Deal with UTF-8 characters that take up more than one byte. It's
2368     easier to write this out separately than try to macrify it. Use c to
2369     hold the length of the character in bytes, plus 0x80 to flag that it's a
2370     length rather than a small character. */
2371    
2372     #ifdef SUPPORT_UTF8
2373     if (utf8 && (code[-1] & 0x80) != 0)
2374     {
2375     uschar *lastchar = code - 1;
2376     while((*lastchar & 0xc0) == 0x80) lastchar--;
2377     c = code - lastchar; /* Length of UTF-8 character */
2378     memcpy(utf8_char, lastchar, c); /* Save the char */
2379     c |= 0x80; /* Flag c as a length */
2380     }
2381     else
2382     #endif
2383    
2384     /* Handle the case of a single byte - either with no UTF8 support, or
2385     with UTF-8 disabled, or for a UTF-8 character < 128. */
2386    
2387     {
2388     c = code[-1];
2389     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2390     }
2391    
2392     goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
2393     }
2394    
2395     /* If previous was a single negated character ([^a] or similar), we use
2396     one of the special opcodes, replacing it. The code is shared with single-
2397     character repeats by setting op_type to add a suitable offset into
2398     repeat_type. OP_NOT is currently used only for single-byte chars. */
2399    
2400     else if (*previous == OP_NOT)
2401     {
2402     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
2403     c = previous[1];
2404     goto OUTPUT_SINGLE_REPEAT;
2405     }
2406    
2407     /* If previous was a character type match (\d or similar), abolish it and
2408     create a suitable repeat item. The code is shared with single-character
2409     repeats by setting op_type to add a suitable offset into repeat_type. Note
2410     that the Unicode property types will be present only when SUPPORT_UCP is
2411     defined, but we don't wrap the little bits of code here because it just
2412     makes it horribly messy. */
2413    
2414     else if (*previous < OP_EODN)
2415     {
2416     uschar *oldcode;
2417     int prop_type;
2418     op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
2419     c = *previous;
2420    
2421     OUTPUT_SINGLE_REPEAT:
2422     prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?
2423     previous[1] : -1;
2424    
2425     oldcode = code;
2426     code = previous; /* Usually overwrite previous item */
2427    
2428     /* If the maximum is zero then the minimum must also be zero; Perl allows
2429     this case, so we do too - by simply omitting the item altogether. */
2430    
2431     if (repeat_max == 0) goto END_REPEAT;
2432    
2433     /* All real repeats make it impossible to handle partial matching (maybe
2434     one day we will be able to remove this restriction). */
2435    
2436     if (repeat_max != 1) cd->nopartial = TRUE;
2437    
2438     /* Combine the op_type with the repeat_type */
2439    
2440     repeat_type += op_type;
2441    
2442     /* A minimum of zero is handled either as the special case * or ?, or as
2443     an UPTO, with the maximum given. */
2444    
2445     if (repeat_min == 0)
2446     {
2447     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
2448     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
2449     else
2450     {
2451     *code++ = OP_UPTO + repeat_type;
2452     PUT2INC(code, 0, repeat_max);
2453     }
2454     }
2455    
2456     /* A repeat minimum of 1 is optimized into some special cases. If the
2457     maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
2458     left in place and, if the maximum is greater than 1, we use OP_UPTO with
2459     one less than the maximum. */
2460    
2461     else if (repeat_min == 1)
2462     {
2463     if (repeat_max == -1)
2464     *code++ = OP_PLUS + repeat_type;
2465     else
2466     {
2467     code = oldcode; /* leave previous item in place */
2468     if (repeat_max == 1) goto END_REPEAT;
2469     *code++ = OP_UPTO + repeat_type;
2470     PUT2INC(code, 0, repeat_max - 1);
2471     }
2472     }
2473    
2474     /* The case {n,n} is just an EXACT, while the general case {n,m} is
2475     handled as an EXACT followed by an UPTO. */
2476    
2477     else
2478     {
2479     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
2480     PUT2INC(code, 0, repeat_min);
2481    
2482     /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
2483     we have to insert the character for the previous code. For a repeated
2484     Unicode property match, there is an extra byte that defines the
2485     required property. In UTF-8 mode, long characters have their length in
2486     c, with the 0x80 bit as a flag. */
2487    
2488     if (repeat_max < 0)
2489     {
2490     #ifdef SUPPORT_UTF8
2491     if (utf8 && c >= 128)
2492     {
2493     memcpy(code, utf8_char, c & 7);
2494     code += c & 7;
2495     }
2496     else
2497     #endif
2498     {
2499     *code++ = c;
2500     if (prop_type >= 0) *code++ = prop_type;
2501     }
2502     *code++ = OP_STAR + repeat_type;
2503     }
2504    
2505     /* Else insert an UPTO if the max is greater than the min, again
2506     preceded by the character, for the previously inserted code. */
2507    
2508     else if (repeat_max != repeat_min)
2509     {
2510     #ifdef SUPPORT_UTF8
2511     if (utf8 && c >= 128)
2512     {
2513     memcpy(code, utf8_char, c & 7);
2514     code += c & 7;
2515     }
2516     else
2517     #endif
2518     *code++ = c;
2519     if (prop_type >= 0) *code++ = prop_type;
2520     repeat_max -= repeat_min;
2521     *code++ = OP_UPTO + repeat_type;
2522     PUT2INC(code, 0, repeat_max);
2523     }
2524     }
2525    
2526     /* The character or character type itself comes last in all cases. */
2527    
2528     #ifdef SUPPORT_UTF8
2529     if (utf8 && c >= 128)
2530     {
2531     memcpy(code, utf8_char, c & 7);
2532     code += c & 7;
2533     }
2534     else
2535     #endif
2536     *code++ = c;
2537    
2538     /* For a repeated Unicode property match, there is an extra byte that
2539     defines the required property. */
2540    
2541     #ifdef SUPPORT_UCP
2542     if (prop_type >= 0) *code++ = prop_type;
2543     #endif
2544     }
2545    
2546     /* If previous was a character class or a back reference, we put the repeat
2547     stuff after it, but just skip the item if the repeat was {0,0}. */
2548    
2549     else if (*previous == OP_CLASS ||
2550     *previous == OP_NCLASS ||
2551     #ifdef SUPPORT_UTF8
2552     *previous == OP_XCLASS ||
2553     #endif
2554     *previous == OP_REF)
2555     {
2556     if (repeat_max == 0)
2557     {
2558     code = previous;
2559     goto END_REPEAT;
2560     }
2561    
2562     /* All real repeats make it impossible to handle partial matching (maybe
2563     one day we will be able to remove this restriction). */
2564    
2565     if (repeat_max != 1) cd->nopartial = TRUE;
2566    
2567     if (repeat_min == 0 && repeat_max == -1)
2568     *code++ = OP_CRSTAR + repeat_type;
2569     else if (repeat_min == 1 && repeat_max == -1)
2570     *code++ = OP_CRPLUS + repeat_type;
2571     else if (repeat_min == 0 && repeat_max == 1)
2572     *code++ = OP_CRQUERY + repeat_type;
2573     else
2574     {
2575     *code++ = OP_CRRANGE + repeat_type;
2576     PUT2INC(code, 0, repeat_min);
2577     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
2578     PUT2INC(code, 0, repeat_max);
2579     }
2580     }
2581    
2582     /* If previous was a bracket group, we may have to replicate it in certain
2583     cases. */
2584    
2585     else if (*previous >= OP_BRA || *previous == OP_ONCE ||
2586     *previous == OP_COND)
2587     {
2588     register int i;
2589     int ketoffset = 0;
2590     int len = code - previous;
2591     uschar *bralink = NULL;
2592    
2593     /* If the maximum repeat count is unlimited, find the end of the bracket
2594     by scanning through from the start, and compute the offset back to it
2595     from the current code pointer. There may be an OP_OPT setting following
2596     the final KET, so we can't find the end just by going back from the code
2597     pointer. */
2598    
2599     if (repeat_max == -1)
2600     {
2601     register uschar *ket = previous;
2602     do ket += GET(ket, 1); while (*ket != OP_KET);
2603     ketoffset = code - ket;
2604     }
2605    
2606     /* The case of a zero minimum is special because of the need to stick
2607     OP_BRAZERO in front of it, and because the group appears once in the
2608     data, whereas in other cases it appears the minimum number of times. For
2609     this reason, it is simplest to treat this case separately, as otherwise
2610     the code gets far too messy. There are several special subcases when the
2611     minimum is zero. */
2612    
2613     if (repeat_min == 0)
2614     {
2615     /* If the maximum is also zero, we just omit the group from the output
2616     altogether. */
2617    
2618     if (repeat_max == 0)
2619     {
2620     code = previous;
2621     goto END_REPEAT;
2622     }
2623    
2624     /* If the maximum is 1 or unlimited, we just have to stick in the
2625     BRAZERO and do no more at this point. However, we do need to adjust
2626     any OP_RECURSE calls inside the group that refer to the group itself or
2627     any internal group, because the offset is from the start of the whole
2628     regex. Temporarily terminate the pattern while doing this. */
2629    
2630     if (repeat_max <= 1)
2631     {
2632     *code = OP_END;
2633     adjust_recurse(previous, 1, utf8, cd);
2634     memmove(previous+1, previous, len);
2635     code++;
2636     *previous++ = OP_BRAZERO + repeat_type;
2637     }
2638    
2639     /* If the maximum is greater than 1 and limited, we have to replicate
2640     in a nested fashion, sticking OP_BRAZERO before each set of brackets.
2641     The first one has to be handled carefully because it's the original
2642     copy, which has to be moved up. The remainder can be handled by code
2643     that is common with the non-zero minimum case below. We have to
2644     adjust the value of repeat_max, since one less copy is required. Once
2645     again, we may have to adjust any OP_RECURSE calls inside the group. */
2646    
2647     else
2648     {
2649     int offset;
2650     *code = OP_END;
2651     adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);
2652     memmove(previous + 2 + LINK_SIZE, previous, len);
2653     code += 2 + LINK_SIZE;
2654     *previous++ = OP_BRAZERO + repeat_type;
2655     *previous++ = OP_BRA;
2656    
2657     /* We chain together the bracket offset fields that have to be
2658     filled in later when the ends of the brackets are reached. */
2659    
2660     offset = (bralink == NULL)? 0 : previous - bralink;
2661     bralink = previous;
2662     PUTINC(previous, 0, offset);
2663     }
2664    
2665     repeat_max--;
2666     }
2667    
2668     /* If the minimum is greater than zero, replicate the group as many
2669     times as necessary, and adjust the maximum to the number of subsequent
2670     copies that we need. If we set a first char from the group, and didn't
2671     set a required char, copy the latter from the former. */
2672    
2673     else
2674     {
2675     if (repeat_min > 1)
2676     {
2677     if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
2678     for (i = 1; i < repeat_min; i++)
2679     {
2680     memcpy(code, previous, len);
2681     code += len;
2682     }
2683     }
2684     if (repeat_max > 0) repeat_max -= repeat_min;
2685     }
2686    
2687     /* This code is common to both the zero and non-zero minimum cases. If
2688     the maximum is limited, it replicates the group in a nested fashion,
2689     remembering the bracket starts on a stack. In the case of a zero minimum,
2690     the first one was set up above. In all cases the repeat_max now specifies
2691     the number of additional copies needed. */
2692    
2693     if (repeat_max >= 0)
2694     {
2695     for (i = repeat_max - 1; i >= 0; i--)
2696     {
2697     *code++ = OP_BRAZERO + repeat_type;
2698    
2699     /* All but the final copy start a new nesting, maintaining the
2700     chain of brackets outstanding. */
2701    
2702     if (i != 0)
2703     {
2704     int offset;
2705     *code++ = OP_BRA;
2706     offset = (bralink == NULL)? 0 : code - bralink;
2707     bralink = code;
2708     PUTINC(code, 0, offset);
2709     }
2710    
2711     memcpy(code, previous, len);
2712     code += len;
2713     }
2714    
2715     /* Now chain through the pending brackets, and fill in their length
2716     fields (which are holding the chain links pro tem). */
2717    
2718     while (bralink != NULL)
2719     {
2720     int oldlinkoffset;
2721     int offset = code - bralink + 1;
2722     uschar *bra = code - offset;
2723     oldlinkoffset = GET(bra, 1);
2724     bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
2725     *code++ = OP_KET;
2726     PUTINC(code, 0, offset);
2727     PUT(bra, 1, offset);
2728     }
2729     }
2730    
2731     /* If the maximum is unlimited, set a repeater in the final copy. We
2732     can't just offset backwards from the current code point, because we
2733     don't know if there's been an options resetting after the ket. The
2734     correct offset was computed above. */
2735    
2736     else code[-ketoffset] = OP_KETRMAX + repeat_type;
2737     }
2738    
2739     /* Else there's some kind of shambles */
2740    
2741     else
2742     {
2743     *errorcodeptr = ERR11;
2744     goto FAILED;
2745     }
2746    
2747     /* If the character following a repeat is '+', we wrap the entire repeated
2748     item inside OP_ONCE brackets. This is just syntactic sugar, taken from
2749     Sun's Java package. The repeated item starts at tempcode, not at previous,
2750     which might be the first part of a string whose (former) last char we
2751     repeated. However, we don't support '+' after a greediness '?'. */
2752    
2753     if (possessive_quantifier)
2754     {
2755     int len = code - tempcode;
2756     memmove(tempcode + 1+LINK_SIZE, tempcode, len);
2757     code += 1 + LINK_SIZE;
2758     len += 1 + LINK_SIZE;
2759     tempcode[0] = OP_ONCE;
2760     *code++ = OP_KET;
2761     PUTINC(code, 0, len);
2762     PUT(tempcode, 1, len);
2763     }
2764    
2765     /* In all cases we no longer have a previous item. We also set the
2766     "follows varying string" flag for subsequently encountered reqbytes if
2767     it isn't already set and we have just passed a varying length item. */
2768    
2769     END_REPEAT:
2770     previous = NULL;
2771     cd->req_varyopt |= reqvary;
2772     break;
2773    
2774    
2775     /* Start of nested bracket sub-expression, or comment or lookahead or
2776     lookbehind or option setting or condition. First deal with special things
2777     that can come after a bracket; all are introduced by ?, and the appearance
2778     of any of them means that this is not a referencing group. They were
2779     checked for validity in the first pass over the string, so we don't have to
2780     check for syntax errors here. */
2781    
2782     case '(':
2783     newoptions = options;
2784     skipbytes = 0;
2785    
2786     if (*(++ptr) == '?')
2787     {
2788     int set, unset;
2789     int *optset;
2790    
2791     switch (*(++ptr))
2792     {
2793     case '#': /* Comment; skip to ket */
2794     ptr++;
2795     while (*ptr != ')') ptr++;
2796     continue;
2797    
2798     case ':': /* Non-extracting bracket */
2799     bravalue = OP_BRA;
2800     ptr++;
2801     break;
2802    
2803     case '(':
2804     bravalue = OP_COND; /* Conditional group */
2805    
2806     /* Condition to test for recursion */
2807    
2808     if (ptr[1] == 'R')
2809     {
2810     code[1+LINK_SIZE] = OP_CREF;
2811     PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
2812     skipbytes = 3;
2813     ptr += 3;
2814     }
2815    
2816     /* Condition to test for a numbered subpattern match. We know that
2817     if a digit follows ( then there will just be digits until ) because
2818     the syntax was checked in the first pass. */
2819    
2820     else if ((digitab[ptr[1]] && ctype_digit) != 0)
2821     {
2822     int condref; /* Don't amalgamate; some compilers */
2823     condref = *(++ptr) - '0'; /* grumble at autoincrement in declaration */
2824     while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
2825     if (condref == 0)
2826     {
2827     *errorcodeptr = ERR35;
2828     goto FAILED;
2829     }
2830     ptr++;
2831     code[1+LINK_SIZE] = OP_CREF;
2832     PUT2(code, 2+LINK_SIZE, condref);
2833     skipbytes = 3;
2834     }
2835     /* For conditions that are assertions, we just fall through, having
2836     set bravalue above. */
2837     break;
2838    
2839     case '=': /* Positive lookahead */
2840     bravalue = OP_ASSERT;
2841     ptr++;
2842     break;
2843    
2844     case '!': /* Negative lookahead */
2845     bravalue = OP_ASSERT_NOT;
2846     ptr++;
2847     break;
2848    
2849     case '<': /* Lookbehinds */
2850     switch (*(++ptr))
2851     {
2852     case '=': /* Positive lookbehind */
2853     bravalue = OP_ASSERTBACK;
2854     ptr++;
2855     break;
2856    
2857     case '!': /* Negative lookbehind */
2858     bravalue = OP_ASSERTBACK_NOT;
2859     ptr++;
2860     break;
2861     }
2862     break;
2863    
2864     case '>': /* One-time brackets */
2865     bravalue = OP_ONCE;
2866     ptr++;
2867     break;
2868    
2869     case 'C': /* Callout - may be followed by digits; */
2870     previous_callout = code; /* Save for later completion */
2871     after_manual_callout = 1; /* Skip one item before completing */
2872     *code++ = OP_CALLOUT; /* Already checked that the terminating */
2873     { /* closing parenthesis is present. */
2874     int n = 0;
2875     while ((digitab[*(++ptr)] & ctype_digit) != 0)
2876     n = n * 10 + *ptr - '0';
2877     if (n > 255)
2878     {
2879     *errorcodeptr = ERR38;
2880     goto FAILED;
2881     }
2882     *code++ = n;
2883     PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
2884     PUT(code, LINK_SIZE, 0); /* Default length */
2885     code += 2 * LINK_SIZE;
2886     }
2887     previous = NULL;
2888     continue;
2889    
2890     case 'P': /* Named subpattern handling */
2891     if (*(++ptr) == '<') /* Definition */
2892     {
2893     int i, namelen;
2894     uschar *slot = cd->name_table;
2895     const uschar *name; /* Don't amalgamate; some compilers */
2896     name = ++ptr; /* grumble at autoincrement in declaration */
2897    
2898     while (*ptr++ != '>');
2899     namelen = ptr - name - 1;
2900    
2901     for (i = 0; i < cd->names_found; i++)
2902     {
2903     int crc = memcmp(name, slot+2, namelen);
2904     if (crc == 0)
2905     {
2906     if (slot[2+namelen] == 0)
2907     {
2908     *errorcodeptr = ERR43;
2909     goto FAILED;
2910     }
2911     crc = -1; /* Current name is substring */
2912     }
2913     if (crc < 0)
2914     {
2915     memmove(slot + cd->name_entry_size, slot,
2916     (cd->names_found - i) * cd->name_entry_size);
2917     break;
2918     }
2919     slot += cd->name_entry_size;
2920     }
2921    
2922     PUT2(slot, 0, *brackets + 1);
2923     memcpy(slot + 2, name, namelen);
2924     slot[2+namelen] = 0;
2925     cd->names_found++;
2926     goto NUMBERED_GROUP;
2927     }
2928    
2929     if (*ptr == '=' || *ptr == '>') /* Reference or recursion */
2930     {
2931     int i, namelen;
2932     int type = *ptr++;
2933     const uschar *name = ptr;
2934     uschar *slot = cd->name_table;
2935    
2936     while (*ptr != ')') ptr++;
2937     namelen = ptr - name;
2938    
2939     for (i = 0; i < cd->names_found; i++)
2940     {
2941     if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
2942     slot += cd->name_entry_size;
2943     }
2944     if (i >= cd->names_found)
2945     {
2946     *errorcodeptr = ERR15;
2947     goto FAILED;
2948     }
2949    
2950     recno = GET2(slot, 0);
2951    
2952     if (type == '>') goto HANDLE_RECURSION; /* A few lines below */
2953    
2954     /* Back reference */
2955    
2956     previous = code;
2957     *code++ = OP_REF;
2958     PUT2INC(code, 0, recno);
2959     cd->backref_map |= (recno < 32)? (1 << recno) : 1;
2960     if (recno > cd->top_backref) cd->top_backref = recno;
2961     continue;
2962     }
2963    
2964     /* Should never happen */
2965     break;
2966    
2967     case 'R': /* Pattern recursion */
2968     ptr++; /* Same as (?0) */
2969     /* Fall through */
2970    
2971     /* Recursion or "subroutine" call */
2972    
2973     case '0': case '1': case '2': case '3': case '4':
2974     case '5': case '6': case '7': case '8': case '9':
2975     {
2976     const uschar *called;
2977     recno = 0;
2978     while((digitab[*ptr] & ctype_digit) != 0)
2979     recno = recno * 10 + *ptr++ - '0';
2980    
2981     /* Come here from code above that handles a named recursion */
2982    
2983     HANDLE_RECURSION:
2984    
2985     previous = code;
2986    
2987     /* Find the bracket that is being referenced. Temporarily end the
2988     regex in case it doesn't exist. */
2989    
2990     *code = OP_END;
2991     called = (recno == 0)?
2992     cd->start_code : find_bracket(cd->start_code, utf8, recno);
2993    
2994     if (called == NULL)
2995     {
2996     *errorcodeptr = ERR15;
2997     goto FAILED;
2998     }
2999    
3000     /* If the subpattern is still open, this is a recursive call. We
3001     check to see if this is a left recursion that could loop for ever,
3002     and diagnose that case. */
3003    
3004     if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
3005     {
3006     *errorcodeptr = ERR40;
3007     goto FAILED;
3008     }
3009    
3010     /* Insert the recursion/subroutine item */
3011    
3012     *code = OP_RECURSE;
3013     PUT(code, 1, called - cd->start_code);
3014     code += 1 + LINK_SIZE;
3015     }
3016     continue;
3017    
3018     /* Character after (? not specially recognized */
3019    
3020     default: /* Option setting */
3021     set = unset = 0;
3022     optset = &set;
3023    
3024     while (*ptr != ')' && *ptr != ':')
3025     {
3026     switch (*ptr++)
3027     {
3028     case '-': optset = &unset; break;
3029    
3030     case 'i': *optset |= PCRE_CASELESS; break;
3031     case 'm': *optset |= PCRE_MULTILINE; break;
3032     case 's': *optset |= PCRE_DOTALL; break;
3033     case 'x': *optset |= PCRE_EXTENDED; break;
3034     case 'U': *optset |= PCRE_UNGREEDY; break;
3035     case 'X': *optset |= PCRE_EXTRA; break;
3036     }
3037     }
3038    
3039     /* Set up the changed option bits, but don't change anything yet. */
3040    
3041     newoptions = (options | set) & (~unset);
3042    
3043     /* If the options ended with ')' this is not the start of a nested
3044     group with option changes, so the options change at this level. Compile
3045     code to change the ims options if this setting actually changes any of
3046     them. We also pass the new setting back so that it can be put at the
3047     start of any following branches, and when this group ends (if we are in
3048     a group), a resetting item can be compiled.
3049    
3050     Note that if this item is right at the start of the pattern, the
3051     options will have been abstracted and made global, so there will be no
3052     change to compile. */
3053    
3054     if (*ptr == ')')
3055     {
3056     if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
3057     {
3058     *code++ = OP_OPT;
3059     *code++ = newoptions & PCRE_IMS;
3060     }
3061    
3062     /* Change options at this level, and pass them back for use
3063     in subsequent branches. Reset the greedy defaults and the case
3064     value for firstbyte and reqbyte. */
3065    
3066     *optionsptr = options = newoptions;
3067     greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
3068     greedy_non_default = greedy_default ^ 1;
3069     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3070    
3071     previous = NULL; /* This item can't be repeated */
3072     continue; /* It is complete */
3073     }
3074    
3075     /* If the options ended with ':' we are heading into a nested group
3076     with possible change of options. Such groups are non-capturing and are
3077     not assertions of any kind. All we need to do is skip over the ':';
3078     the newoptions value is handled below. */
3079    
3080     bravalue = OP_BRA;
3081     ptr++;
3082     }
3083     }
3084    
3085     /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
3086     non-capturing and behave like (?:...) brackets */
3087    
3088     else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
3089     {
3090     bravalue = OP_BRA;
3091     }
3092    
3093     /* Else we have a referencing group; adjust the opcode. If the bracket
3094     number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
3095     arrange for the true number to follow later, in an OP_BRANUMBER item. */
3096    
3097     else
3098     {
3099     NUMBERED_GROUP:
3100     if (++(*brackets) > EXTRACT_BASIC_MAX)
3101     {
3102     bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
3103     code[1+LINK_SIZE] = OP_BRANUMBER;
3104     PUT2(code, 2+LINK_SIZE, *brackets);
3105     skipbytes = 3;
3106     }
3107     else bravalue = OP_BRA + *brackets;
3108     }
3109    
3110     /* Process nested bracketed re. Assertions may not be repeated, but other
3111     kinds can be. We copy code into a non-register variable in order to be able
3112     to pass its address because some compilers complain otherwise. Pass in a
3113     new setting for the ims options if they have changed. */
3114    
3115     previous = (bravalue >= OP_ONCE)? code : NULL;
3116     *code = bravalue;
3117     tempcode = code;
3118     tempreqvary = cd->req_varyopt; /* Save value before bracket */
3119    
3120     if (!compile_regex(
3121     newoptions, /* The complete new option state */
3122     options & PCRE_IMS, /* The previous ims option state */
3123     brackets, /* Extracting bracket count */
3124     &tempcode, /* Where to put code (updated) */
3125     &ptr, /* Input pointer (updated) */
3126     errorcodeptr, /* Where to put an error message */
3127     (bravalue == OP_ASSERTBACK ||
3128     bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
3129     skipbytes, /* Skip over OP_COND/OP_BRANUMBER */
3130     &subfirstbyte, /* For possible first char */
3131     &subreqbyte, /* For possible last char */
3132     bcptr, /* Current branch chain */
3133     cd)) /* Tables block */
3134     goto FAILED;
3135    
3136     /* At the end of compiling, code is still pointing to the start of the
3137     group, while tempcode has been updated to point past the end of the group
3138     and any option resetting that may follow it. The pattern pointer (ptr)
3139     is on the bracket. */
3140    
3141     /* If this is a conditional bracket, check that there are no more than
3142     two branches in the group. */
3143    
3144     else if (bravalue == OP_COND)
3145     {
3146     uschar *tc = code;
3147     condcount = 0;
3148    
3149     do {
3150     condcount++;
3151     tc += GET(tc,1);
3152     }
3153     while (*tc != OP_KET);
3154    
3155     if (condcount > 2)
3156     {
3157     *errorcodeptr = ERR27;
3158     goto FAILED;
3159     }
3160    
3161     /* If there is just one branch, we must not make use of its firstbyte or
3162     reqbyte, because this is equivalent to an empty second branch. */
3163    
3164     if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
3165     }
3166    
3167     /* Handle updating of the required and first characters. Update for normal
3168     brackets of all kinds, and conditions with two branches (see code above).
3169     If the bracket is followed by a quantifier with zero repeat, we have to
3170     back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
3171     main loop so that they can be accessed for the back off. */
3172    
3173     zeroreqbyte = reqbyte;
3174     zerofirstbyte = firstbyte;
3175     groupsetfirstbyte = FALSE;
3176    
3177     if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
3178     {
3179     /* If we have not yet set a firstbyte in this branch, take it from the
3180     subpattern, remembering that it was set here so that a repeat of more
3181     than one can replicate it as reqbyte if necessary. If the subpattern has
3182     no firstbyte, set "none" for the whole branch. In both cases, a zero
3183     repeat forces firstbyte to "none". */
3184    
3185     if (firstbyte == REQ_UNSET)
3186     {
3187     if (subfirstbyte >= 0)
3188     {
3189     firstbyte = subfirstbyte;
3190     groupsetfirstbyte = TRUE;
3191     }
3192     else firstbyte = REQ_NONE;
3193     zerofirstbyte = REQ_NONE;
3194     }
3195    
3196     /* If firstbyte was previously set, convert the subpattern's firstbyte
3197     into reqbyte if there wasn't one, using the vary flag that was in
3198     existence beforehand. */
3199    
3200     else if (subfirstbyte >= 0 && subreqbyte < 0)
3201     subreqbyte = subfirstbyte | tempreqvary;
3202    
3203     /* If the subpattern set a required byte (or set a first byte that isn't
3204     really the first byte - see above), set it. */
3205    
3206     if (subreqbyte >= 0) reqbyte = subreqbyte;
3207     }
3208    
3209     /* For a forward assertion, we take the reqbyte, if set. This can be
3210     helpful if the pattern that follows the assertion doesn't set a different
3211     char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
3212     for an assertion, however because it leads to incorrect effect for patterns
3213     such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
3214     of a firstbyte. This is overcome by a scan at the end if there's no
3215     firstbyte, looking for an asserted first char. */
3216    
3217     else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
3218    
3219     /* Now update the main code pointer to the end of the group. */
3220    
3221     code = tempcode;
3222    
3223     /* Error if hit end of pattern */
3224    
3225     if (*ptr != ')')
3226     {
3227     *errorcodeptr = ERR14;
3228     goto FAILED;
3229     }
3230     break;
3231    
3232     /* Check \ for being a real metacharacter; if not, fall through and handle
3233     it as a data character at the start of a string. Escape items are checked
3234     for validity in the pre-compiling pass. */
3235    
3236     case '\\':
3237     tempptr = ptr;
3238     c = check_escape(&ptr, errorcodeptr, *brackets, options, FALSE);
3239    
3240     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
3241     are arranged to be the negation of the corresponding OP_values. For the
3242     back references, the values are ESC_REF plus the reference number. Only
3243     back references and those types that consume a character may be repeated.
3244     We can test for values between ESC_b and ESC_Z for the latter; this may
3245     have to change if any new ones are ever created. */
3246    
3247     if (c < 0)
3248     {
3249     if (-c == ESC_Q) /* Handle start of quoted string */
3250     {
3251     if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
3252     else inescq = TRUE;
3253     continue;
3254     }
3255    
3256     /* For metasequences that actually match a character, we disable the
3257     setting of a first character if it hasn't already been set. */
3258    
3259     if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
3260     firstbyte = REQ_NONE;
3261    
3262     /* Set values to reset to if this is followed by a zero repeat. */
3263    
3264     zerofirstbyte = firstbyte;
3265     zeroreqbyte = reqbyte;
3266    
3267     /* Back references are handled specially */
3268    
3269     if (-c >= ESC_REF)
3270     {
3271     int number = -c - ESC_REF;
3272     previous = code;
3273     *code++ = OP_REF;
3274     PUT2INC(code, 0, number);
3275     }
3276    
3277     /* So are Unicode property matches, if supported. We know that get_ucp
3278     won't fail because it was tested in the pre-pass. */
3279    
3280     #ifdef SUPPORT_UCP
3281     else if (-c == ESC_P || -c == ESC_p)
3282     {
3283     BOOL negated;
3284     int value = get_ucp(&ptr, &negated, errorcodeptr);
3285     previous = code;
3286     *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
3287     *code++ = value;
3288     }
3289     #endif
3290    
3291     /* For the rest, we can obtain the OP value by negating the escape
3292     value */
3293    
3294     else
3295     {
3296     previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
3297     *code++ = -c;
3298     }
3299     continue;
3300     }
3301    
3302     /* We have a data character whose value is in c. In UTF-8 mode it may have
3303     a value > 127. We set its representation in the length/buffer, and then
3304     handle it as a data character. */
3305    
3306     #ifdef SUPPORT_UTF8
3307     if (utf8 && c > 127)
3308     mclength = _pcre_ord2utf8(c, mcbuffer);
3309     else
3310     #endif
3311    
3312     {
3313     mcbuffer[0] = c;
3314     mclength = 1;
3315     }
3316    
3317     goto ONE_CHAR;
3318    
3319     /* Handle a literal character. It is guaranteed not to be whitespace or #
3320     when the extended flag is set. If we are in UTF-8 mode, it may be a
3321     multi-byte literal character. */
3322    
3323     default:
3324     NORMAL_CHAR:
3325     mclength = 1;
3326     mcbuffer[0] = c;
3327    
3328     #ifdef SUPPORT_UTF8
3329     if (utf8 && (c & 0xc0) == 0xc0)
3330     {
3331     while ((ptr[1] & 0xc0) == 0x80)
3332     mcbuffer[mclength++] = *(++ptr);
3333     }
3334     #endif
3335    
3336     /* At this point we have the character's bytes in mcbuffer, and the length
3337     in mclength. When not in UTF-8 mode, the length is always 1. */
3338    
3339     ONE_CHAR:
3340     previous = code;
3341     *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
3342     for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
3343    
3344     /* Set the first and required bytes appropriately. If no previous first
3345     byte, set it from this character, but revert to none on a zero repeat.
3346     Otherwise, leave the firstbyte value alone, and don't change it on a zero
3347     repeat. */
3348    
3349     if (firstbyte == REQ_UNSET)
3350     {
3351     zerofirstbyte = REQ_NONE;
3352     zeroreqbyte = reqbyte;
3353    
3354     /* If the character is more than one byte long, we can set firstbyte
3355     only if it is not to be matched caselessly. */
3356    
3357     if (mclength == 1 || req_caseopt == 0)
3358     {
3359     firstbyte = mcbuffer[0] | req_caseopt;
3360     if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
3361     }
3362     else firstbyte = reqbyte = REQ_NONE;
3363     }
3364    
3365     /* firstbyte was previously set; we can set reqbyte only if the length is
3366     1 or the matching is caseful. */
3367    
3368     else
3369     {
3370     zerofirstbyte = firstbyte;
3371     zeroreqbyte = reqbyte;
3372     if (mclength == 1 || req_caseopt == 0)
3373     reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3374     }
3375    
3376     break; /* End of literal character handling */
3377     }
3378     } /* end of big loop */
3379    
3380     /* Control never reaches here by falling through, only by a goto for all the
3381     error states. Pass back the position in the pattern so that it can be displayed
3382     to the user for diagnosing the error. */
3383    
3384     FAILED:
3385     *ptrptr = ptr;
3386     return FALSE;
3387     }
3388    
3389    
3390    
3391    
3392     /*************************************************
3393     * Compile sequence of alternatives *
3394     *************************************************/
3395    
3396     /* On entry, ptr is pointing past the bracket character, but on return
3397     it points to the closing bracket, or vertical bar, or end of string.
3398     The code variable is pointing at the byte into which the BRA operator has been
3399     stored. If the ims options are changed at the start (for a (?ims: group) or
3400     during any branch, we need to insert an OP_OPT item at the start of every
3401     following branch to ensure they get set correctly at run time, and also pass
3402     the new options into every subsequent branch compile.
3403    
3404     Argument:
3405     options option bits, including any changes for this subpattern
3406     oldims previous settings of ims option bits
3407     brackets -> int containing the number of extracting brackets used
3408     codeptr -> the address of the current code pointer
3409     ptrptr -> the address of the current pattern pointer
3410     errorcodeptr -> pointer to error code variable
3411     lookbehind TRUE if this is a lookbehind assertion
3412     skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER)
3413     firstbyteptr place to put the first required character, or a negative number
3414     reqbyteptr place to put the last required character, or a negative number
3415     bcptr pointer to the chain of currently open branches
3416     cd points to the data block with tables pointers etc.
3417    
3418     Returns: TRUE on success
3419     */
3420    
3421     static BOOL
3422     compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3423     const uschar **ptrptr, int *errorcodeptr, BOOL lookbehind, int skipbytes,
3424     int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
3425     {
3426     const uschar *ptr = *ptrptr;
3427     uschar *code = *codeptr;
3428     uschar *last_branch = code;
3429     uschar *start_bracket = code;
3430     uschar *reverse_count = NULL;
3431     int firstbyte, reqbyte;
3432     int branchfirstbyte, branchreqbyte;
3433     branch_chain bc;
3434    
3435     bc.outer = bcptr;
3436     bc.current = code;
3437    
3438     firstbyte = reqbyte = REQ_UNSET;
3439    
3440     /* Offset is set zero to mark that this bracket is still open */
3441    
3442     PUT(code, 1, 0);
3443     code += 1 + LINK_SIZE + skipbytes;
3444    
3445     /* Loop for each alternative branch */
3446    
3447     for (;;)
3448     {
3449     /* Handle a change of ims options at the start of the branch */
3450    
3451     if ((options & PCRE_IMS) != oldims)
3452     {
3453     *code++ = OP_OPT;
3454     *code++ = options & PCRE_IMS;
3455     }
3456    
3457     /* Set up dummy OP_REVERSE if lookbehind assertion */
3458    
3459     if (lookbehind)
3460     {
3461     *code++ = OP_REVERSE;
3462     reverse_count = code;
3463     PUTINC(code, 0, 0);
3464     }
3465    
3466     /* Now compile the branch */
3467    
3468     if (!compile_branch(&options, brackets, &code, &ptr, errorcodeptr,
3469     &branchfirstbyte, &branchreqbyte, &bc, cd))
3470     {
3471     *ptrptr = ptr;
3472     return FALSE;
3473     }
3474    
3475     /* If this is the first branch, the firstbyte and reqbyte values for the
3476     branch become the values for the regex. */
3477    
3478     if (*last_branch != OP_ALT)
3479     {
3480     firstbyte = branchfirstbyte;
3481     reqbyte = branchreqbyte;
3482     }
3483    
3484     /* If this is not the first branch, the first char and reqbyte have to
3485     match the values from all the previous branches, except that if the previous
3486     value for reqbyte didn't have REQ_VARY set, it can still match, and we set
3487     REQ_VARY for the regex. */
3488    
3489     else
3490     {
3491     /* If we previously had a firstbyte, but it doesn't match the new branch,
3492     we have to abandon the firstbyte for the regex, but if there was previously
3493     no reqbyte, it takes on the value of the old firstbyte. */
3494    
3495     if (firstbyte >= 0 && firstbyte != branchfirstbyte)
3496     {
3497     if (reqbyte < 0) reqbyte = firstbyte;
3498     firstbyte = REQ_NONE;
3499     }
3500    
3501     /* If we (now or from before) have no firstbyte, a firstbyte from the
3502     branch becomes a reqbyte if there isn't a branch reqbyte. */
3503    
3504     if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
3505     branchreqbyte = branchfirstbyte;
3506    
3507     /* Now ensure that the reqbytes match */
3508    
3509     if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
3510     reqbyte = REQ_NONE;
3511     else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
3512     }
3513    
3514     /* If lookbehind, check that this branch matches a fixed-length string,
3515     and put the length into the OP_REVERSE item. Temporarily mark the end of
3516     the branch with OP_END. */
3517    
3518     if (lookbehind)
3519     {
3520     int length;
3521     *code = OP_END;
3522     length = find_fixedlength(last_branch, options);
3523     DPRINTF(("fixed length = %d\n", length));
3524     if (length < 0)
3525     {
3526     *errorcodeptr = (length == -2)? ERR36 : ERR25;
3527     *ptrptr = ptr;
3528     return FALSE;
3529     }
3530     PUT(reverse_count, 0, length);
3531     }
3532    
3533     /* Reached end of expression, either ')' or end of pattern. Go back through
3534     the alternative branches and reverse the chain of offsets, with the field in
3535     the BRA item now becoming an offset to the first alternative. If there are
3536     no alternatives, it points to the end of the group. The length in the
3537     terminating ket is always the length of the whole bracketed item. If any of
3538     the ims options were changed inside the group, compile a resetting op-code
3539     following, except at the very end of the pattern. Return leaving the pointer
3540     at the terminating char. */
3541    
3542     if (*ptr != '|')
3543     {
3544     int length = code - last_branch;
3545     do
3546     {
3547     int prev_length = GET(last_branch, 1);
3548     PUT(last_branch, 1, length);
3549     length = prev_length;
3550     last_branch -= length;
3551     }
3552     while (length > 0);
3553    
3554     /* Fill in the ket */
3555    
3556     *code = OP_KET;
3557     PUT(code, 1, code - start_bracket);
3558     code += 1 + LINK_SIZE;
3559    
3560     /* Resetting option if needed */
3561    
3562     if ((options & PCRE_IMS) != oldims && *ptr == ')')
3563     {
3564     *code++ = OP_OPT;
3565     *code++ = oldims;
3566     }
3567    
3568     /* Set values to pass back */
3569    
3570     *codeptr = code;
3571     *ptrptr = ptr;
3572     *firstbyteptr = firstbyte;
3573     *reqbyteptr = reqbyte;
3574     return TRUE;
3575     }
3576    
3577     /* Another branch follows; insert an "or" node. Its length field points back
3578     to the previous branch while the bracket remains open. At the end the chain
3579     is reversed. It's done like this so that the start of the bracket has a
3580     zero offset until it is closed, making it possible to detect recursion. */
3581    
3582     *code = OP_ALT;
3583     PUT(code, 1, code - last_branch);
3584     bc.current = last_branch = code;
3585     code += 1 + LINK_SIZE;
3586     ptr++;
3587     }
3588     /* Control never reaches here */
3589     }
3590    
3591    
3592    
3593    
3594     /*************************************************
3595     * Check for anchored expression *
3596     *************************************************/
3597    
3598     /* Try to find out if this is an anchored regular expression. Consider each
3599     alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
3600     all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
3601     it's anchored. However, if this is a multiline pattern, then only OP_SOD
3602     counts, since OP_CIRC can match in the middle.
3603    
3604     We can also consider a regex to be anchored if OP_SOM starts all its branches.
3605     This is the code for \G, which means "match at start of match position, taking
3606     into account the match offset".
3607    
3608     A branch is also implicitly anchored if it starts with .* and DOTALL is set,
3609     because that will try the rest of the pattern at all possible matching points,
3610     so there is no point trying again.... er ....
3611    
3612     .... except when the .* appears inside capturing parentheses, and there is a
3613     subsequent back reference to those parentheses. We haven't enough information
3614     to catch that case precisely.
3615    
3616     At first, the best we could do was to detect when .* was in capturing brackets
3617     and the highest back reference was greater than or equal to that level.
3618     However, by keeping a bitmap of the first 31 back references, we can catch some
3619     of the more common cases more precisely.
3620    
3621     Arguments:
3622     code points to start of expression (the bracket)
3623     options points to the options setting
3624     bracket_map a bitmap of which brackets we are inside while testing; this
3625     handles up to substring 31; after that we just have to take
3626     the less precise approach
3627     backref_map the back reference bitmap
3628    
3629     Returns: TRUE or FALSE
3630     */
3631    
3632     static BOOL
3633     is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
3634     unsigned int backref_map)
3635     {
3636     do {
3637     const uschar *scode =
3638     first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE);
3639     register int op = *scode;
3640    
3641     /* Capturing brackets */
3642    
3643     if (op > OP_BRA)
3644     {
3645     int new_map;
3646     op -= OP_BRA;
3647     if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3648     new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3649     if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
3650     }
3651    
3652     /* Other brackets */
3653    
3654     else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3655     {
3656     if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
3657     }
3658    
3659     /* .* is not anchored unless DOTALL is set and it isn't in brackets that
3660     are or may be referenced. */
3661    
3662     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
3663     (*options & PCRE_DOTALL) != 0)
3664     {
3665     if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3666     }
3667    
3668     /* Check for explicit anchoring */
3669    
3670     else if (op != OP_SOD && op != OP_SOM &&
3671     ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
3672     return FALSE;
3673     code += GET(code, 1);
3674     }
3675     while (*code == OP_ALT); /* Loop for each alternative */
3676     return TRUE;
3677     }
3678    
3679    
3680    
3681     /*************************************************
3682     * Check for starting with ^ or .* *
3683     *************************************************/
3684    
3685     /* This is called to find out if every branch starts with ^ or .* so that
3686     "first char" processing can be done to speed things up in multiline
3687     matching and for non-DOTALL patterns that start with .* (which must start at
3688     the beginning or after \n). As in the case of is_anchored() (see above), we
3689     have to take account of back references to capturing brackets that contain .*
3690     because in that case we can't make the assumption.
3691    
3692     Arguments:
3693     code points to start of expression (the bracket)
3694     bracket_map a bitmap of which brackets we are inside while testing; this
3695     handles up to substring 31; after that we just have to take
3696     the less precise approach
3697     backref_map the back reference bitmap
3698    
3699     Returns: TRUE or FALSE
3700     */
3701    
3702     static BOOL
3703     is_startline(const uschar *code, unsigned int bracket_map,
3704     unsigned int backref_map)
3705     {
3706     do {
3707     const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0,
3708     FALSE);
3709     register int op = *scode;
3710    
3711     /* Capturing brackets */
3712    
3713     if (op > OP_BRA)
3714     {
3715     int new_map;
3716     op -= OP_BRA;
3717     if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3718     new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3719     if (!is_startline(scode, new_map, backref_map)) return FALSE;
3720     }
3721    
3722     /* Other brackets */
3723    
3724     else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3725     { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
3726    
3727     /* .* means "start at start or after \n" if it isn't in brackets that
3728     may be referenced. */
3729    
3730     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
3731     {
3732     if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3733     }
3734    
3735     /* Check for explicit circumflex */
3736    
3737     else if (op != OP_CIRC) return FALSE;
3738    
3739     /* Move on to the next alternative */
3740    
3741     code += GET(code, 1);
3742     }
3743     while (*code == OP_ALT); /* Loop for each alternative */
3744     return TRUE;
3745     }
3746    
3747    
3748    
3749     /*************************************************
3750     * Check for asserted fixed first char *
3751     *************************************************/
3752    
3753     /* During compilation, the "first char" settings from forward assertions are
3754     discarded, because they can cause conflicts with actual literals that follow.
3755     However, if we end up without a first char setting for an unanchored pattern,
3756     it is worth scanning the regex to see if there is an initial asserted first
3757     char. If all branches start with the same asserted char, or with a bracket all
3758     of whose alternatives start with the same asserted char (recurse ad lib), then
3759     we return that char, otherwise -1.
3760    
3761     Arguments:
3762     code points to start of expression (the bracket)
3763     options pointer to the options (used to check casing changes)
3764     inassert TRUE if in an assertion
3765    
3766     Returns: -1 or the fixed first char
3767     */
3768    
3769     static int
3770     find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
3771     {
3772     register int c = -1;
3773     do {
3774     int d;
3775     const uschar *scode =
3776     first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
3777     register int op = *scode;
3778    
3779     if (op >= OP_BRA) op = OP_BRA;
3780    
3781     switch(op)
3782     {
3783     default:
3784     return -1;
3785    
3786     case OP_BRA:
3787     case OP_ASSERT:
3788     case OP_ONCE:
3789     case OP_COND:
3790     if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
3791     return -1;
3792     if (c < 0) c = d; else if (c != d) return -1;
3793     break;
3794    
3795     case OP_EXACT: /* Fall through */
3796     scode += 2;
3797    
3798     case OP_CHAR:
3799     case OP_CHARNC:
3800     case OP_PLUS:
3801     case OP_MINPLUS:
3802     if (!inassert) return -1;
3803     if (c < 0)
3804     {
3805     c = scode[1];
3806     if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
3807     }
3808     else if (c != scode[1]) return -1;
3809     break;
3810     }
3811    
3812     code += GET(code, 1);
3813     }
3814     while (*code == OP_ALT);
3815     return c;
3816     }
3817    
3818    
3819    
3820     /*************************************************
3821     * Compile a Regular Expression *
3822     *************************************************/
3823    
3824     /* This function takes a string and returns a pointer to a block of store
3825     holding a compiled version of the expression. The original API for this
3826     function had no error code return variable; it is retained for backwards
3827     compatibility. The new function is given a new name.
3828    
3829     Arguments:
3830     pattern the regular expression
3831     options various option bits
3832     errorcodeptr pointer to error code variable (pcre_compile2() only)
3833     can be NULL if you don't want a code value
3834     errorptr pointer to pointer to error text
3835     erroroffset ptr offset in pattern where error was detected
3836     tables pointer to character tables or NULL
3837    
3838     Returns: pointer to compiled data block, or NULL on error,
3839     with errorptr and erroroffset set
3840     */
3841    
3842     EXPORT pcre *
3843     pcre_compile(const char *pattern, int options, const char **errorptr,
3844     int *erroroffset, const unsigned char *tables)
3845     {
3846     return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
3847     }
3848    
3849    
3850     EXPORT pcre *
3851     pcre_compile2(const char *pattern, int options, int *errorcodeptr,
3852     const char **errorptr, int *erroroffset, const unsigned char *tables)
3853     {
3854     real_pcre *re;
3855     int length = 1 + LINK_SIZE; /* For initial BRA plus length */
3856     int c, firstbyte, reqbyte;
3857     int bracount = 0;
3858     int branch_extra = 0;
3859     int branch_newextra;
3860     int item_count = -1;
3861     int name_count = 0;
3862     int max_name_size = 0;
3863     int lastitemlength = 0;
3864     int errorcode = 0;
3865     #ifdef SUPPORT_UTF8
3866     BOOL utf8;
3867     BOOL class_utf8;
3868     #endif
3869     BOOL inescq = FALSE;
3870 nigel 81 BOOL capturing;
3871 nigel 77 unsigned int brastackptr = 0;
3872     size_t size;
3873     uschar *code;
3874     const uschar *codestart;
3875     const uschar *ptr;
3876     compile_data compile_block;
3877     int brastack[BRASTACK_SIZE];
3878     uschar bralenstack[BRASTACK_SIZE];
3879    
3880     /* We can't pass back an error message if errorptr is NULL; I guess the best we
3881     can do is just return NULL, but we can set a code value if there is a code
3882     pointer. */
3883    
3884     if (errorptr == NULL)
3885     {
3886     if (errorcodeptr != NULL) *errorcodeptr = 99;
3887     return NULL;
3888     }
3889    
3890     *errorptr = NULL;
3891     if (errorcodeptr != NULL) *errorcodeptr = ERR0;
3892    
3893     /* However, we can give a message for this error */
3894    
3895     if (erroroffset == NULL)
3896     {
3897     errorcode = ERR16;
3898     goto PCRE_EARLY_ERROR_RETURN;
3899     }
3900    
3901     *erroroffset = 0;
3902    
3903     /* Can't support UTF8 unless PCRE has been compiled to include the code. */
3904    
3905     #ifdef SUPPORT_UTF8
3906     utf8 = (options & PCRE_UTF8) != 0;
3907     if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
3908     (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
3909     {
3910     errorcode = ERR44;
3911     goto PCRE_EARLY_ERROR_RETURN;
3912     }
3913     #else
3914     if ((options & PCRE_UTF8) != 0)
3915     {
3916     errorcode = ERR32;
3917     goto PCRE_EARLY_ERROR_RETURN;
3918     }
3919     #endif
3920    
3921     if ((options & ~PUBLIC_OPTIONS) != 0)
3922     {
3923     errorcode = ERR17;
3924     goto PCRE_EARLY_ERROR_RETURN;
3925     }
3926    
3927     /* Set up pointers to the individual character tables */
3928    
3929     if (tables == NULL) tables = _pcre_default_tables;
3930     compile_block.lcc = tables + lcc_offset;
3931     compile_block.fcc = tables + fcc_offset;
3932     compile_block.cbits = tables + cbits_offset;
3933     compile_block.ctypes = tables + ctypes_offset;
3934    
3935     /* Maximum back reference and backref bitmap. This is updated for numeric
3936     references during the first pass, but for named references during the actual
3937     compile pass. The bitmap records up to 31 back references to help in deciding
3938     whether (.*) can be treated as anchored or not. */
3939    
3940     compile_block.top_backref = 0;
3941     compile_block.backref_map = 0;
3942    
3943     /* Reflect pattern for debugging output */
3944    
3945     DPRINTF(("------------------------------------------------------------------\n"));
3946     DPRINTF(("%s\n", pattern));
3947    
3948     /* The first thing to do is to make a pass over the pattern to compute the
3949     amount of store required to hold the compiled code. This does not have to be
3950     perfect as long as errors are overestimates. At the same time we can detect any
3951     flag settings right at the start, and extract them. Make an attempt to correct
3952     for any counted white space if an "extended" flag setting appears late in the
3953     pattern. We can't be so clever for #-comments. */
3954    
3955     ptr = (const uschar *)(pattern - 1);
3956     while ((c = *(++ptr)) != 0)
3957     {
3958     int min, max;
3959     int class_optcount;
3960     int bracket_length;
3961     int duplength;
3962    
3963     /* If we are inside a \Q...\E sequence, all chars are literal */
3964    
3965     if (inescq)
3966     {
3967     if ((options & PCRE_AUTO_CALLOUT) != 0) length += 2 + 2*LINK_SIZE;
3968     goto NORMAL_CHAR;
3969     }
3970    
3971     /* Otherwise, first check for ignored whitespace and comments */
3972    
3973     if ((options & PCRE_EXTENDED) != 0)
3974     {
3975     if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
3976     if (c == '#')
3977     {
3978     /* The space before the ; is to avoid a warning on a silly compiler
3979     on the Macintosh. */
3980     while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
3981     if (c == 0) break;
3982     continue;
3983     }
3984     }
3985    
3986     item_count++; /* Is zero for the first non-comment item */
3987    
3988     /* Allow space for auto callout before every item except quantifiers. */
3989    
3990     if ((options & PCRE_AUTO_CALLOUT) != 0 &&
3991     c != '*' && c != '+' && c != '?' &&
3992     (c != '{' || !is_counted_repeat(ptr + 1)))
3993     length += 2 + 2*LINK_SIZE;
3994    
3995     switch(c)
3996     {
3997     /* A backslashed item may be an escaped data character or it may be a
3998     character type. */
3999    
4000     case '\\':
4001     c = check_escape(&ptr, &errorcode, bracount, options, FALSE);
4002     if (errorcode != 0) goto PCRE_ERROR_RETURN;
4003    
4004     lastitemlength = 1; /* Default length of last item for repeats */
4005    
4006     if (c >= 0) /* Data character */
4007     {
4008     length += 2; /* For a one-byte character */
4009    
4010     #ifdef SUPPORT_UTF8
4011     if (utf8 && c > 127)
4012     {
4013     int i;
4014     for (i = 0; i < _pcre_utf8_table1_size; i++)
4015     if (c <= _pcre_utf8_table1[i]) break;
4016     length += i;
4017     lastitemlength += i;
4018     }
4019     #endif
4020    
4021     continue;
4022     }
4023    
4024     /* If \Q, enter "literal" mode */
4025    
4026     if (-c == ESC_Q)
4027     {
4028     inescq = TRUE;
4029     continue;
4030     }
4031    
4032     /* \X is supported only if Unicode property support is compiled */
4033    
4034     #ifndef SUPPORT_UCP
4035     if (-c == ESC_X)
4036     {
4037     errorcode = ERR45;
4038     goto PCRE_ERROR_RETURN;
4039     }
4040     #endif
4041    
4042     /* \P and \p are for Unicode properties, but only when the support has
4043     been compiled. Each item needs 2 bytes. */
4044    
4045     else if (-c == ESC_P || -c == ESC_p)
4046     {
4047     #ifdef SUPPORT_UCP
4048     BOOL negated;
4049     length += 2;
4050     lastitemlength = 2;
4051     if (get_ucp(&ptr, &negated, &errorcode) < 0) goto PCRE_ERROR_RETURN;
4052     continue;
4053     #else
4054     errorcode = ERR45;
4055     goto PCRE_ERROR_RETURN;
4056     #endif
4057     }
4058    
4059     /* Other escapes need one byte */
4060    
4061     length++;
4062    
4063     /* A back reference needs an additional 2 bytes, plus either one or 5
4064     bytes for a repeat. We also need to keep the value of the highest
4065     back reference. */
4066    
4067     if (c <= -ESC_REF)
4068     {
4069     int refnum = -c - ESC_REF;
4070     compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
4071     if (refnum > compile_block.top_backref)
4072     compile_block.top_backref = refnum;
4073     length += 2; /* For single back reference */
4074     if (ptr[1] == '{' && is_counted_repeat(ptr+2))
4075     {
4076     ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
4077     if (errorcode != 0) goto PCRE_ERROR_RETURN;
4078     if ((min == 0 && (max == 1 || max == -1)) ||
4079     (min == 1 && max == -1))
4080     length++;
4081     else length += 5;
4082     if (ptr[1] == '?') ptr++;
4083     }
4084     }
4085     continue;
4086    
4087     case '^': /* Single-byte metacharacters */
4088     case '.':
4089     case '$':
4090     length++;
4091     lastitemlength = 1;
4092     continue;
4093    
4094     case '*': /* These repeats won't be after brackets; */
4095     case '+': /* those are handled separately */
4096     case '?':
4097     length++;
4098     goto POSESSIVE; /* A few lines below */
4099    
4100     /* This covers the cases of braced repeats after a single char, metachar,
4101     class, or back reference. */
4102    
4103     case '{':
4104     if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
4105     ptr = read_repeat_counts(ptr+1, &min, &max, &errorcode);
4106     if (errorcode != 0) goto PCRE_ERROR_RETURN;
4107    
4108     /* These special cases just insert one extra opcode */
4109    
4110     if ((min == 0 && (max == 1 || max == -1)) ||
4111     (min == 1 && max == -1))
4112     length++;
4113    
4114     /* These cases might insert additional copies of a preceding character. */
4115    
4116     else
4117     {
4118     if (min != 1)
4119     {
4120     length -= lastitemlength; /* Uncount the original char or metachar */
4121     if (min > 0) length += 3 + lastitemlength;
4122     }
4123     length += lastitemlength + ((max > 0)? 3 : 1);
4124     }
4125    
4126     if (ptr[1] == '?') ptr++; /* Needs no extra length */
4127    
4128     POSESSIVE: /* Test for possessive quantifier */
4129     if (ptr[1] == '+')
4130     {
4131     ptr++;
4132     length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */
4133     }
4134     continue;
4135    
4136     /* An alternation contains an offset to the next branch or ket. If any ims
4137     options changed in the previous branch(es), and/or if we are in a
4138     lookbehind assertion, extra space will be needed at the start of the
4139     branch. This is handled by branch_extra. */
4140    
4141     case '|':
4142     length += 1 + LINK_SIZE + branch_extra;
4143     continue;
4144    
4145     /* A character class uses 33 characters provided that all the character
4146     values are less than 256. Otherwise, it uses a bit map for low valued
4147     characters, and individual items for others. Don't worry about character
4148     types that aren't allowed in classes - they'll get picked up during the
4149     compile. A character class that contains only one single-byte character
4150     uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
4151     where we can. (In UTF-8 mode we can do this only for chars < 128.) */
4152    
4153     case '[':
4154     if (*(++ptr) == '^')
4155     {
4156     class_optcount = 10; /* Greater than one */
4157     ptr++;
4158     }
4159     else class_optcount = 0;
4160    
4161     #ifdef SUPPORT_UTF8
4162     class_utf8 = FALSE;
4163     #endif
4164    
4165     /* Written as a "do" so that an initial ']' is taken as data */
4166    
4167     if (*ptr != 0) do
4168     {
4169     /* Inside \Q...\E everything is literal except \E */
4170    
4171     if (inescq)
4172     {
4173     if (*ptr != '\\' || ptr[1] != 'E') goto GET_ONE_CHARACTER;
4174     inescq = FALSE;
4175     ptr += 1;
4176     continue;
4177     }
4178    
4179     /* Outside \Q...\E, check for escapes */
4180    
4181     if (*ptr == '\\')
4182     {
4183     c = check_escape(&ptr, &errorcode, bracount, options, TRUE);
4184     if (errorcode != 0) goto PCRE_ERROR_RETURN;
4185    
4186     /* \b is backspace inside a class; \X is literal */
4187    
4188     if (-c == ESC_b) c = '\b';
4189     else if (-c == ESC_X) c = 'X';
4190    
4191     /* \Q enters quoting mode */
4192    
4193     else if (-c == ESC_Q)
4194     {
4195     inescq = TRUE;
4196     continue;
4197     }
4198    
4199     /* Handle escapes that turn into characters */
4200    
4201     if (c >= 0) goto NON_SPECIAL_CHARACTER;
4202    
4203     /* Escapes that are meta-things. The normal ones just affect the
4204     bit map, but Unicode properties require an XCLASS extended item. */
4205    
4206     else
4207     {
4208     class_optcount = 10; /* \d, \s etc; make sure > 1 */
4209     #ifdef SUPPORT_UTF8
4210     if (-c == ESC_p || -c == ESC_P)
4211     {
4212     if (!class_utf8)
4213     {
4214     class_utf8 = TRUE;
4215     length += LINK_SIZE + 2;
4216     }
4217     length += 2;
4218     }
4219     #endif
4220     }
4221     }
4222    
4223     /* Check the syntax for POSIX stuff. The bits we actually handle are
4224     checked during the real compile phase. */
4225    
4226     else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))
4227     {
4228     ptr++;
4229     class_optcount = 10; /* Make sure > 1 */
4230     }
4231    
4232     /* Anything else increments the possible optimization count. We have to
4233     detect ranges here so that we can compute the number of extra ranges for
4234     caseless wide characters when UCP support is available. If there are wide
4235     characters, we are going to have to use an XCLASS, even for single
4236     characters. */
4237    
4238     else
4239     {
4240     int d;
4241    
4242     GET_ONE_CHARACTER:
4243    
4244     #ifdef SUPPORT_UTF8
4245     if (utf8)
4246     {
4247     int extra = 0;
4248     GETCHARLEN(c, ptr, extra);
4249     ptr += extra;
4250     }
4251     else c = *ptr;
4252     #else
4253     c = *ptr;
4254     #endif
4255    
4256     /* Come here from handling \ above when it escapes to a char value */
4257    
4258     NON_SPECIAL_CHARACTER:
4259     class_optcount++;
4260    
4261     d = -1;
4262     if (ptr[1] == '-')
4263     {
4264     uschar const *hyptr = ptr++;
4265     if (ptr[1] == '\\')
4266     {
4267     ptr++;
4268     d = check_escape(&ptr, &errorcode, bracount, options, TRUE);
4269     if (errorcode != 0) goto PCRE_ERROR_RETURN;
4270     if (-d == ESC_b) d = '\b'; /* backspace */
4271     else if (-d == ESC_X) d = 'X'; /* literal X in a class */
4272     }
4273     else if (ptr[1] != 0 && ptr[1] != ']')
4274     {
4275     ptr++;
4276     #ifdef SUPPORT_UTF8
4277     if (utf8)
4278     {
4279     int extra = 0;
4280     GETCHARLEN(d, ptr, extra);
4281     ptr += extra;
4282     }
4283     else
4284     #endif
4285     d = *ptr;
4286     }
4287     if (d < 0) ptr = hyptr; /* go back to hyphen as data */
4288     }
4289    
4290     /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
4291     127 for caseless matching, we will need to use an XCLASS. */
4292    
4293     if (d >= 0)
4294     {
4295     class_optcount = 10; /* Ensure > 1 */
4296     if (d < c)
4297     {
4298     errorcode = ERR8;
4299     goto PCRE_ERROR_RETURN;
4300     }
4301    
4302     #ifdef SUPPORT_UTF8
4303     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4304     {
4305     uschar buffer[6];
4306     if (!class_utf8) /* Allow for XCLASS overhead */
4307     {
4308     class_utf8 = TRUE;
4309     length += LINK_SIZE + 2;
4310     }
4311    
4312     #ifdef SUPPORT_UCP
4313     /* If we have UCP support, find out how many extra ranges are
4314     needed to map the other case of characters within this range. We
4315     have to mimic the range optimization here, because extending the
4316     range upwards might push d over a boundary that makes is use
4317     another byte in the UTF-8 representation. */
4318    
4319     if ((options & PCRE_CASELESS) != 0)
4320     {
4321     int occ, ocd;
4322     int cc = c;
4323     int origd = d;
4324     while (get_othercase_range(&cc, origd, &occ, &ocd))
4325     {
4326     if (occ >= c && ocd <= d) continue; /* Skip embedded */
4327    
4328     if (occ < c && ocd >= c - 1) /* Extend the basic range */
4329     { /* if there is overlap, */
4330     c = occ; /* noting that if occ < c */