/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 85 - (hide annotations) (download)
Sat Feb 24 21:41:13 2007 UTC (7 years, 5 months ago) by nigel
File MIME type: text/plain
File size: 160094 byte(s)
Load pcre-6.4 into code/trunk.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9     Copyright (c) 1997-2005 University of Cambridge
10    
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45     #include "pcre_internal.h"
46    
47    
48 nigel 85 /* When DEBUG is defined, we need the pcre_printint() function, which is also
49     used by pcretest. DEBUG is not defined when building a production library. */
50    
51     #ifdef DEBUG
52     #include "pcre_printint.src"
53     #endif
54    
55    
56    
57 nigel 77 /*************************************************
58     * Code parameters and static tables *
59     *************************************************/
60    
61     /* Maximum number of items on the nested bracket stacks at compile time. This
62     applies to the nesting of all kinds of parentheses. It does not limit
63     un-nested, non-capturing parentheses. This number can be made bigger if
64     necessary - it is used to dimension one int and one unsigned char vector at
65     compile time. */
66    
67     #define BRASTACK_SIZE 200
68    
69    
70     /* Table for handling escaped characters in the range '0'-'z'. Positive returns
71     are simple data values; negative values are for special things like \d and so
72     on. Zero means further processing is needed (for things like \x), or the escape
73     is invalid. */
74    
75     #if !EBCDIC /* This is the "normal" table for ASCII systems */
76     static const short int escapes[] = {
77     0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
78     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
79     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
80     0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
81     -ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
82     -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
83     '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
84     0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
85     -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
86     0, 0, -ESC_z /* x - z */
87     };
88    
89     #else /* This is the "abnormal" table for EBCDIC systems */
90     static const short int escapes[] = {
91     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
92     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
93     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
94     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
95     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
96     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
97     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
98     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
99     /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
100     /* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p,
101     /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
102     /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
103     /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
104     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
105     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
106     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
107     /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
108     /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
109     /* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0,
110     /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
111     /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
112     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
113     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
114     };
115     #endif
116    
117    
118     /* Tables of names of POSIX character classes and their lengths. The list is
119     terminated by a zero length entry. The first three must be alpha, lower, upper,
120     as this is assumed for handling case independence. */
121    
122     static const char *const posix_names[] = {
123     "alpha", "lower", "upper",
124     "alnum", "ascii", "blank", "cntrl", "digit", "graph",
125     "print", "punct", "space", "word", "xdigit" };
126    
127     static const uschar posix_name_lengths[] = {
128     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
129    
130     /* Table of class bit maps for each POSIX class; up to three may be combined
131     to form the class. The table for [:blank:] is dynamically modified to remove
132     the vertical space characters. */
133    
134     static const int posix_class_maps[] = {
135     cbit_lower, cbit_upper, -1, /* alpha */
136     cbit_lower, -1, -1, /* lower */
137     cbit_upper, -1, -1, /* upper */
138     cbit_digit, cbit_lower, cbit_upper, /* alnum */
139     cbit_print, cbit_cntrl, -1, /* ascii */
140     cbit_space, -1, -1, /* blank - a GNU extension */
141     cbit_cntrl, -1, -1, /* cntrl */
142     cbit_digit, -1, -1, /* digit */
143     cbit_graph, -1, -1, /* graph */
144     cbit_print, -1, -1, /* print */
145     cbit_punct, -1, -1, /* punct */
146     cbit_space, -1, -1, /* space */
147     cbit_word, -1, -1, /* word - a Perl extension */
148     cbit_xdigit,-1, -1 /* xdigit */
149     };
150    
151    
152     /* The texts of compile-time error messages. These are "char *" because they
153     are passed to the outside world. */
154    
155     static const char *error_texts[] = {
156     "no error",
157     "\\ at end of pattern",
158     "\\c at end of pattern",
159     "unrecognized character follows \\",
160     "numbers out of order in {} quantifier",
161     /* 5 */
162     "number too big in {} quantifier",
163     "missing terminating ] for character class",
164     "invalid escape sequence in character class",
165     "range out of order in character class",
166     "nothing to repeat",
167     /* 10 */
168     "operand of unlimited repeat could match the empty string",
169     "internal error: unexpected repeat",
170     "unrecognized character after (?",
171     "POSIX named classes are supported only within a class",
172     "missing )",
173     /* 15 */
174     "reference to non-existent subpattern",
175     "erroffset passed as NULL",
176     "unknown option bit(s) set",
177     "missing ) after comment",
178     "parentheses nested too deeply",
179     /* 20 */
180     "regular expression too large",
181     "failed to get memory",
182     "unmatched parentheses",
183     "internal error: code overflow",
184     "unrecognized character after (?<",
185     /* 25 */
186     "lookbehind assertion is not fixed length",
187     "malformed number after (?(",
188     "conditional group contains more than two branches",
189     "assertion expected after (?(",
190     "(?R or (?digits must be followed by )",
191     /* 30 */
192     "unknown POSIX class name",
193     "POSIX collating elements are not supported",
194     "this version of PCRE is not compiled with PCRE_UTF8 support",
195     "spare error",
196     "character value in \\x{...} sequence is too large",
197     /* 35 */
198     "invalid condition (?(0)",
199     "\\C not allowed in lookbehind assertion",
200     "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
201     "number after (?C is > 255",
202     "closing ) for (?C expected",
203     /* 40 */
204     "recursive call could loop indefinitely",
205     "unrecognized character after (?P",
206     "syntax error after (?P",
207     "two named groups have the same name",
208     "invalid UTF-8 string",
209     /* 45 */
210     "support for \\P, \\p, and \\X has not been compiled",
211     "malformed \\P or \\p sequence",
212     "unknown property name after \\P or \\p"
213     };
214    
215    
216     /* Table to identify digits and hex digits. This is used when compiling
217     patterns. Note that the tables in chartables are dependent on the locale, and
218     may mark arbitrary characters as digits - but the PCRE compiling code expects
219     to handle only 0-9, a-f, and A-F as digits when compiling. That is why we have
220     a private table here. It costs 256 bytes, but it is a lot faster than doing
221     character value tests (at least in some simple cases I timed), and in some
222     applications one wants PCRE to compile efficiently as well as match
223     efficiently.
224    
225     For convenience, we use the same bit definitions as in chartables:
226    
227     0x04 decimal digit
228     0x08 hexadecimal digit
229    
230     Then we can use ctype_digit and ctype_xdigit in the code. */
231    
232     #if !EBCDIC /* This is the "normal" case, for ASCII systems */
233     static const unsigned char digitab[] =
234     {
235     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
236     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
237     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
238     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
239     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
240     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
241     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
242     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
243     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
244     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
245     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
246     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
247     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
248     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
249     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
250     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
251     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
252     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
253     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
254     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
255     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
256     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
257     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
258     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
259     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
260     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
261     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
262     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
263     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
264     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
265     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
266     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
267    
268     #else /* This is the "abnormal" case, for EBCDIC systems */
269     static const unsigned char digitab[] =
270     {
271     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
272     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
273     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
274     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
275     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
276     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
277     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
278     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
279     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
280     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
281     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
282     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- */
283     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
284     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
285     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
286     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
287     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
288     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
289     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
290     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
291     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
292     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
293     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
294     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
295     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
296     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
297     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
298     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
299     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
300     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
301     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
302     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
303    
304     static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
305     0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
306     0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
307     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
308     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
309     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
310     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
311     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
312     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
313     0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
314     0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
315     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
316     0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- */
317     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
318     0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
319     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
320     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
321     0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
322     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
323     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
324     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
325     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
326     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
327     0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
328     0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
329     0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
330     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
331     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
332     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
333     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
334     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
335     0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
336     0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
337     #endif
338    
339    
340     /* Definition to allow mutual recursion */
341    
342     static BOOL
343     compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,
344     int *, int *, branch_chain *, compile_data *);
345    
346    
347    
348     /*************************************************
349     * Handle escapes *
350     *************************************************/
351    
352     /* This function is called when a \ has been encountered. It either returns a
353     positive value for a simple escape such as \n, or a negative value which
354     encodes one of the more complicated things such as \d. When UTF-8 is enabled,
355     a positive value greater than 255 may be returned. On entry, ptr is pointing at
356     the \. On exit, it is on the final character of the escape sequence.
357    
358     Arguments:
359     ptrptr points to the pattern position pointer
360     errorcodeptr points to the errorcode variable
361     bracount number of previous extracting brackets
362     options the options bits
363     isclass TRUE if inside a character class
364    
365     Returns: zero or positive => a data character
366     negative => a special escape sequence
367     on error, errorptr is set
368     */
369    
370     static int
371     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
372     int options, BOOL isclass)
373     {
374     const uschar *ptr = *ptrptr;
375     int c, i;
376    
377     /* If backslash is at the end of the pattern, it's an error. */
378    
379     c = *(++ptr);
380     if (c == 0) *errorcodeptr = ERR1;
381    
382     /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
383     a table. A non-zero result is something that can be returned immediately.
384     Otherwise further processing may be required. */
385    
386     #if !EBCDIC /* ASCII coding */
387     else if (c < '0' || c > 'z') {} /* Not alphameric */
388     else if ((i = escapes[c - '0']) != 0) c = i;
389    
390     #else /* EBCDIC coding */
391     else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
392     else if ((i = escapes[c - 0x48]) != 0) c = i;
393     #endif
394    
395     /* Escapes that need further processing, or are illegal. */
396    
397     else
398     {
399     const uschar *oldptr;
400     switch (c)
401     {
402     /* A number of Perl escapes are not handled by PCRE. We give an explicit
403     error. */
404    
405     case 'l':
406     case 'L':
407     case 'N':
408     case 'u':
409     case 'U':
410     *errorcodeptr = ERR37;
411     break;
412    
413     /* The handling of escape sequences consisting of a string of digits
414     starting with one that is not zero is not straightforward. By experiment,
415     the way Perl works seems to be as follows:
416    
417     Outside a character class, the digits are read as a decimal number. If the
418     number is less than 10, or if there are that many previous extracting
419     left brackets, then it is a back reference. Otherwise, up to three octal
420     digits are read to form an escaped byte. Thus \123 is likely to be octal
421     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
422     value is greater than 377, the least significant 8 bits are taken. Inside a
423     character class, \ followed by a digit is always an octal number. */
424    
425     case '1': case '2': case '3': case '4': case '5':
426     case '6': case '7': case '8': case '9':
427    
428     if (!isclass)
429     {
430     oldptr = ptr;
431     c -= '0';
432     while ((digitab[ptr[1]] & ctype_digit) != 0)
433     c = c * 10 + *(++ptr) - '0';
434     if (c < 10 || c <= bracount)
435     {
436     c = -(ESC_REF + c);
437     break;
438     }
439     ptr = oldptr; /* Put the pointer back and fall through */
440     }
441    
442     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
443     generates a binary zero byte and treats the digit as a following literal.
444     Thus we have to pull back the pointer by one. */
445    
446     if ((c = *ptr) >= '8')
447     {
448     ptr--;
449     c = 0;
450     break;
451     }
452    
453     /* \0 always starts an octal number, but we may drop through to here with a
454     larger first octal digit. */
455    
456     case '0':
457     c -= '0';
458     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
459     c = c * 8 + *(++ptr) - '0';
460     c &= 255; /* Take least significant 8 bits */
461     break;
462    
463     /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
464     which can be greater than 0xff, but only if the ddd are hex digits. */
465    
466     case 'x':
467     #ifdef SUPPORT_UTF8
468     if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
469     {
470     const uschar *pt = ptr + 2;
471     register int count = 0;
472     c = 0;
473     while ((digitab[*pt] & ctype_xdigit) != 0)
474     {
475     int cc = *pt++;
476     count++;
477     #if !EBCDIC /* ASCII coding */
478     if (cc >= 'a') cc -= 32; /* Convert to upper case */
479     c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
480     #else /* EBCDIC coding */
481     if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
482     c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
483     #endif
484     }
485     if (*pt == '}')
486     {
487     if (c < 0 || count > 8) *errorcodeptr = ERR34;
488     ptr = pt;
489     break;
490     }
491     /* If the sequence of hex digits does not end with '}', then we don't
492     recognize this construct; fall through to the normal \x handling. */
493     }
494     #endif
495    
496     /* Read just a single hex char */
497    
498     c = 0;
499     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
500     {
501     int cc; /* Some compilers don't like ++ */
502     cc = *(++ptr); /* in initializers */
503     #if !EBCDIC /* ASCII coding */
504     if (cc >= 'a') cc -= 32; /* Convert to upper case */
505     c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
506     #else /* EBCDIC coding */
507     if (cc <= 'z') cc += 64; /* Convert to upper case */
508     c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
509     #endif
510     }
511     break;
512    
513     /* Other special escapes not starting with a digit are straightforward */
514    
515     case 'c':
516     c = *(++ptr);
517     if (c == 0)
518     {
519     *errorcodeptr = ERR2;
520     return 0;
521     }
522    
523     /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
524     is ASCII-specific, but then the whole concept of \cx is ASCII-specific.
525     (However, an EBCDIC equivalent has now been added.) */
526    
527     #if !EBCDIC /* ASCII coding */
528     if (c >= 'a' && c <= 'z') c -= 32;
529     c ^= 0x40;
530     #else /* EBCDIC coding */
531     if (c >= 'a' && c <= 'z') c += 64;
532     c ^= 0xC0;
533     #endif
534     break;
535    
536     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
537     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
538     for Perl compatibility, it is a literal. This code looks a bit odd, but
539     there used to be some cases other than the default, and there may be again
540     in future, so I haven't "optimized" it. */
541    
542     default:
543     if ((options & PCRE_EXTRA) != 0) switch(c)
544     {
545     default:
546     *errorcodeptr = ERR3;
547     break;
548     }
549     break;
550     }
551     }
552    
553     *ptrptr = ptr;
554     return c;
555     }
556    
557    
558    
559     #ifdef SUPPORT_UCP
560     /*************************************************
561     * Handle \P and \p *
562     *************************************************/
563    
564     /* This function is called after \P or \p has been encountered, provided that
565     PCRE is compiled with support for Unicode properties. On entry, ptrptr is
566     pointing at the P or p. On exit, it is pointing at the final character of the
567     escape sequence.
568    
569     Argument:
570     ptrptr points to the pattern position pointer
571     negptr points to a boolean that is set TRUE for negation else FALSE
572     errorcodeptr points to the error code variable
573    
574     Returns: value from ucp_type_table, or -1 for an invalid type
575     */
576    
577     static int
578     get_ucp(const uschar **ptrptr, BOOL *negptr, int *errorcodeptr)
579     {
580     int c, i, bot, top;
581     const uschar *ptr = *ptrptr;
582     char name[4];
583    
584     c = *(++ptr);
585     if (c == 0) goto ERROR_RETURN;
586    
587     *negptr = FALSE;
588    
589     /* \P or \p can be followed by a one- or two-character name in {}, optionally
590     preceded by ^ for negation. */
591    
592     if (c == '{')
593     {
594     if (ptr[1] == '^')
595     {
596     *negptr = TRUE;
597     ptr++;
598     }
599     for (i = 0; i <= 2; i++)
600     {
601     c = *(++ptr);
602     if (c == 0) goto ERROR_RETURN;
603     if (c == '}') break;
604     name[i] = c;
605     }
606     if (c !='}') /* Try to distinguish error cases */
607     {
608     while (*(++ptr) != 0 && *ptr != '}');
609     if (*ptr == '}') goto UNKNOWN_RETURN; else goto ERROR_RETURN;
610     }
611     name[i] = 0;
612     }
613    
614     /* Otherwise there is just one following character */
615    
616     else
617     {
618     name[0] = c;
619     name[1] = 0;
620     }
621    
622     *ptrptr = ptr;
623    
624     /* Search for a recognized property name using binary chop */
625    
626     bot = 0;
627     top = _pcre_utt_size;
628    
629     while (bot < top)
630     {
631     i = (bot + top)/2;
632     c = strcmp(name, _pcre_utt[i].name);
633     if (c == 0) return _pcre_utt[i].value;
634     if (c > 0) bot = i + 1; else top = i;
635     }
636    
637     UNKNOWN_RETURN:
638     *errorcodeptr = ERR47;
639     *ptrptr = ptr;
640     return -1;
641    
642     ERROR_RETURN:
643     *errorcodeptr = ERR46;
644     *ptrptr = ptr;
645     return -1;
646     }
647     #endif
648    
649    
650    
651    
652     /*************************************************
653     * Check for counted repeat *
654     *************************************************/
655    
656     /* This function is called when a '{' is encountered in a place where it might
657     start a quantifier. It looks ahead to see if it really is a quantifier or not.
658     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
659     where the ddds are digits.
660    
661     Arguments:
662     p pointer to the first char after '{'
663    
664     Returns: TRUE or FALSE
665     */
666    
667     static BOOL
668     is_counted_repeat(const uschar *p)
669     {
670     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
671     while ((digitab[*p] & ctype_digit) != 0) p++;
672     if (*p == '}') return TRUE;
673    
674     if (*p++ != ',') return FALSE;
675     if (*p == '}') return TRUE;
676    
677     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
678     while ((digitab[*p] & ctype_digit) != 0) p++;
679    
680     return (*p == '}');
681     }
682    
683    
684    
685     /*************************************************
686     * Read repeat counts *
687     *************************************************/
688    
689     /* Read an item of the form {n,m} and return the values. This is called only
690     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
691     so the syntax is guaranteed to be correct, but we need to check the values.
692    
693     Arguments:
694     p pointer to first char after '{'
695     minp pointer to int for min
696     maxp pointer to int for max
697     returned as -1 if no max
698     errorcodeptr points to error code variable
699    
700     Returns: pointer to '}' on success;
701     current ptr on error, with errorcodeptr set non-zero
702     */
703    
704     static const uschar *
705     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
706     {
707     int min = 0;
708     int max = -1;
709    
710 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
711     an integer overflow. */
712    
713 nigel 77 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
714 nigel 81 if (min < 0 || min > 65535)
715     {
716     *errorcodeptr = ERR5;
717     return p;
718     }
719 nigel 77
720 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
721     Also, max must not be less than min. */
722    
723 nigel 77 if (*p == '}') max = min; else
724     {
725     if (*(++p) != '}')
726     {
727     max = 0;
728     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
729 nigel 81 if (max < 0 || max > 65535)
730     {
731     *errorcodeptr = ERR5;
732     return p;
733     }
734 nigel 77 if (max < min)
735     {
736     *errorcodeptr = ERR4;
737     return p;
738     }
739     }
740     }
741    
742 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
743     '}'. */
744 nigel 77
745 nigel 81 *minp = min;
746     *maxp = max;
747 nigel 77 return p;
748     }
749    
750    
751    
752     /*************************************************
753     * Find first significant op code *
754     *************************************************/
755    
756     /* This is called by several functions that scan a compiled expression looking
757     for a fixed first character, or an anchoring op code etc. It skips over things
758     that do not influence this. For some calls, a change of option is important.
759     For some calls, it makes sense to skip negative forward and all backward
760     assertions, and also the \b assertion; for others it does not.
761    
762     Arguments:
763     code pointer to the start of the group
764     options pointer to external options
765     optbit the option bit whose changing is significant, or
766     zero if none are
767     skipassert TRUE if certain assertions are to be skipped
768    
769     Returns: pointer to the first significant opcode
770     */
771    
772     static const uschar*
773     first_significant_code(const uschar *code, int *options, int optbit,
774     BOOL skipassert)
775     {
776     for (;;)
777     {
778     switch ((int)*code)
779     {
780     case OP_OPT:
781     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
782     *options = (int)code[1];
783     code += 2;
784     break;
785    
786     case OP_ASSERT_NOT:
787     case OP_ASSERTBACK:
788     case OP_ASSERTBACK_NOT:
789     if (!skipassert) return code;
790     do code += GET(code, 1); while (*code == OP_ALT);
791     code += _pcre_OP_lengths[*code];
792     break;
793    
794     case OP_WORD_BOUNDARY:
795     case OP_NOT_WORD_BOUNDARY:
796     if (!skipassert) return code;
797     /* Fall through */
798    
799     case OP_CALLOUT:
800     case OP_CREF:
801     case OP_BRANUMBER:
802     code += _pcre_OP_lengths[*code];
803     break;
804    
805     default:
806     return code;
807     }
808     }
809     /* Control never reaches here */
810     }
811    
812    
813    
814    
815     /*************************************************
816     * Find the fixed length of a pattern *
817     *************************************************/
818    
819     /* Scan a pattern and compute the fixed length of subject that will match it,
820     if the length is fixed. This is needed for dealing with backward assertions.
821     In UTF8 mode, the result is in characters rather than bytes.
822    
823     Arguments:
824     code points to the start of the pattern (the bracket)
825     options the compiling options
826    
827     Returns: the fixed length, or -1 if there is no fixed length,
828     or -2 if \C was encountered
829     */
830    
831     static int
832     find_fixedlength(uschar *code, int options)
833     {
834     int length = -1;
835    
836     register int branchlength = 0;
837     register uschar *cc = code + 1 + LINK_SIZE;
838    
839     /* Scan along the opcodes for this branch. If we get to the end of the
840     branch, check the length against that of the other branches. */
841    
842     for (;;)
843     {
844     int d;
845     register int op = *cc;
846     if (op >= OP_BRA) op = OP_BRA;
847    
848     switch (op)
849     {
850     case OP_BRA:
851     case OP_ONCE:
852     case OP_COND:
853     d = find_fixedlength(cc, options);
854     if (d < 0) return d;
855     branchlength += d;
856     do cc += GET(cc, 1); while (*cc == OP_ALT);
857     cc += 1 + LINK_SIZE;
858     break;
859    
860     /* Reached end of a branch; if it's a ket it is the end of a nested
861     call. If it's ALT it is an alternation in a nested call. If it is
862     END it's the end of the outer call. All can be handled by the same code. */
863    
864     case OP_ALT:
865     case OP_KET:
866     case OP_KETRMAX:
867     case OP_KETRMIN:
868     case OP_END:
869     if (length < 0) length = branchlength;
870     else if (length != branchlength) return -1;
871     if (*cc != OP_ALT) return length;
872     cc += 1 + LINK_SIZE;
873     branchlength = 0;
874     break;
875    
876     /* Skip over assertive subpatterns */
877    
878     case OP_ASSERT:
879     case OP_ASSERT_NOT:
880     case OP_ASSERTBACK:
881     case OP_ASSERTBACK_NOT:
882     do cc += GET(cc, 1); while (*cc == OP_ALT);
883     /* Fall through */
884    
885     /* Skip over things that don't match chars */
886    
887     case OP_REVERSE:
888     case OP_BRANUMBER:
889     case OP_CREF:
890     case OP_OPT:
891     case OP_CALLOUT:
892     case OP_SOD:
893     case OP_SOM:
894     case OP_EOD:
895     case OP_EODN:
896     case OP_CIRC:
897     case OP_DOLL:
898     case OP_NOT_WORD_BOUNDARY:
899     case OP_WORD_BOUNDARY:
900     cc += _pcre_OP_lengths[*cc];
901     break;
902    
903     /* Handle literal characters */
904    
905     case OP_CHAR:
906     case OP_CHARNC:
907     branchlength++;
908     cc += 2;
909     #ifdef SUPPORT_UTF8
910     if ((options & PCRE_UTF8) != 0)
911     {
912     while ((*cc & 0xc0) == 0x80) cc++;
913     }
914     #endif
915     break;
916    
917     /* Handle exact repetitions. The count is already in characters, but we
918     need to skip over a multibyte character in UTF8 mode. */
919    
920     case OP_EXACT:
921     branchlength += GET2(cc,1);
922     cc += 4;
923     #ifdef SUPPORT_UTF8
924     if ((options & PCRE_UTF8) != 0)
925     {
926     while((*cc & 0x80) == 0x80) cc++;
927     }
928     #endif
929     break;
930    
931     case OP_TYPEEXACT:
932     branchlength += GET2(cc,1);
933     cc += 4;
934     break;
935    
936     /* Handle single-char matchers */
937    
938     case OP_PROP:
939     case OP_NOTPROP:
940     cc++;
941     /* Fall through */
942    
943     case OP_NOT_DIGIT:
944     case OP_DIGIT:
945     case OP_NOT_WHITESPACE:
946     case OP_WHITESPACE:
947     case OP_NOT_WORDCHAR:
948     case OP_WORDCHAR:
949     case OP_ANY:
950     branchlength++;
951     cc++;
952     break;
953    
954     /* The single-byte matcher isn't allowed */
955    
956     case OP_ANYBYTE:
957     return -2;
958    
959     /* Check a class for variable quantification */
960    
961     #ifdef SUPPORT_UTF8
962     case OP_XCLASS:
963     cc += GET(cc, 1) - 33;
964     /* Fall through */
965     #endif
966    
967     case OP_CLASS:
968     case OP_NCLASS:
969     cc += 33;
970    
971     switch (*cc)
972     {
973     case OP_CRSTAR:
974     case OP_CRMINSTAR:
975     case OP_CRQUERY:
976     case OP_CRMINQUERY:
977     return -1;
978    
979     case OP_CRRANGE:
980     case OP_CRMINRANGE:
981     if (GET2(cc,1) != GET2(cc,3)) return -1;
982     branchlength += GET2(cc,1);
983     cc += 5;
984     break;
985    
986     default:
987     branchlength++;
988     }
989     break;
990    
991     /* Anything else is variable length */
992    
993     default:
994     return -1;
995     }
996     }
997     /* Control never gets here */
998     }
999    
1000    
1001    
1002    
1003     /*************************************************
1004     * Scan compiled regex for numbered bracket *
1005     *************************************************/
1006    
1007     /* This little function scans through a compiled pattern until it finds a
1008     capturing bracket with the given number.
1009    
1010     Arguments:
1011     code points to start of expression
1012     utf8 TRUE in UTF-8 mode
1013     number the required bracket number
1014    
1015     Returns: pointer to the opcode for the bracket, or NULL if not found
1016     */
1017    
1018     static const uschar *
1019     find_bracket(const uschar *code, BOOL utf8, int number)
1020     {
1021     #ifndef SUPPORT_UTF8
1022     utf8 = utf8; /* Stop pedantic compilers complaining */
1023     #endif
1024    
1025     for (;;)
1026     {
1027     register int c = *code;
1028     if (c == OP_END) return NULL;
1029     else if (c > OP_BRA)
1030     {
1031     int n = c - OP_BRA;
1032     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1033     if (n == number) return (uschar *)code;
1034     code += _pcre_OP_lengths[OP_BRA];
1035     }
1036     else
1037     {
1038     code += _pcre_OP_lengths[c];
1039    
1040     #ifdef SUPPORT_UTF8
1041    
1042     /* In UTF-8 mode, opcodes that are followed by a character may be followed
1043     by a multi-byte character. The length in the table is a minimum, so we have
1044     to scan along to skip the extra bytes. All opcodes are less than 128, so we
1045     can use relatively efficient code. */
1046    
1047     if (utf8) switch(c)
1048     {
1049     case OP_CHAR:
1050     case OP_CHARNC:
1051     case OP_EXACT:
1052     case OP_UPTO:
1053     case OP_MINUPTO:
1054     case OP_STAR:
1055     case OP_MINSTAR:
1056     case OP_PLUS:
1057     case OP_MINPLUS:
1058     case OP_QUERY:
1059     case OP_MINQUERY:
1060     while ((*code & 0xc0) == 0x80) code++;
1061     break;
1062    
1063     /* XCLASS is used for classes that cannot be represented just by a bit
1064     map. This includes negated single high-valued characters. The length in
1065     the table is zero; the actual length is stored in the compiled code. */
1066    
1067     case OP_XCLASS:
1068     code += GET(code, 1) + 1;
1069     break;
1070     }
1071     #endif
1072     }
1073     }
1074     }
1075    
1076    
1077    
1078     /*************************************************
1079     * Scan compiled regex for recursion reference *
1080     *************************************************/
1081    
1082     /* This little function scans through a compiled pattern until it finds an
1083     instance of OP_RECURSE.
1084    
1085     Arguments:
1086     code points to start of expression
1087     utf8 TRUE in UTF-8 mode
1088    
1089     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1090     */
1091    
1092     static const uschar *
1093     find_recurse(const uschar *code, BOOL utf8)
1094     {
1095     #ifndef SUPPORT_UTF8
1096     utf8 = utf8; /* Stop pedantic compilers complaining */
1097     #endif
1098    
1099     for (;;)
1100     {
1101     register int c = *code;
1102     if (c == OP_END) return NULL;
1103     else if (c == OP_RECURSE) return code;
1104     else if (c > OP_BRA)
1105     {
1106     code += _pcre_OP_lengths[OP_BRA];
1107     }
1108     else
1109     {
1110     code += _pcre_OP_lengths[c];
1111    
1112     #ifdef SUPPORT_UTF8
1113    
1114     /* In UTF-8 mode, opcodes that are followed by a character may be followed
1115     by a multi-byte character. The length in the table is a minimum, so we have
1116     to scan along to skip the extra bytes. All opcodes are less than 128, so we
1117     can use relatively efficient code. */
1118    
1119     if (utf8) switch(c)
1120     {
1121     case OP_CHAR:
1122     case OP_CHARNC:
1123     case OP_EXACT:
1124     case OP_UPTO:
1125     case OP_MINUPTO:
1126     case OP_STAR:
1127     case OP_MINSTAR:
1128     case OP_PLUS:
1129     case OP_MINPLUS:
1130     case OP_QUERY:
1131     case OP_MINQUERY:
1132     while ((*code & 0xc0) == 0x80) code++;
1133     break;
1134    
1135     /* XCLASS is used for classes that cannot be represented just by a bit
1136     map. This includes negated single high-valued characters. The length in
1137     the table is zero; the actual length is stored in the compiled code. */
1138    
1139     case OP_XCLASS:
1140     code += GET(code, 1) + 1;
1141     break;
1142     }
1143     #endif
1144     }
1145     }
1146     }
1147    
1148    
1149    
1150     /*************************************************
1151     * Scan compiled branch for non-emptiness *
1152     *************************************************/
1153    
1154     /* This function scans through a branch of a compiled pattern to see whether it
1155     can match the empty string or not. It is called only from could_be_empty()
1156     below. Note that first_significant_code() skips over assertions. If we hit an
1157     unclosed bracket, we return "empty" - this means we've struck an inner bracket
1158     whose current branch will already have been scanned.
1159    
1160     Arguments:
1161     code points to start of search
1162     endcode points to where to stop
1163     utf8 TRUE if in UTF8 mode
1164    
1165     Returns: TRUE if what is matched could be empty
1166     */
1167    
1168     static BOOL
1169     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1170     {
1171     register int c;
1172     for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);
1173     code < endcode;
1174     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1175     {
1176     const uschar *ccode;
1177    
1178     c = *code;
1179    
1180     if (c >= OP_BRA)
1181     {
1182     BOOL empty_branch;
1183     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1184    
1185     /* Scan a closed bracket */
1186    
1187     empty_branch = FALSE;
1188     do
1189     {
1190     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1191     empty_branch = TRUE;
1192     code += GET(code, 1);
1193     }
1194     while (*code == OP_ALT);
1195     if (!empty_branch) return FALSE; /* All branches are non-empty */
1196     code += 1 + LINK_SIZE;
1197     c = *code;
1198     }
1199    
1200     else switch (c)
1201     {
1202     /* Check for quantifiers after a class */
1203    
1204     #ifdef SUPPORT_UTF8
1205     case OP_XCLASS:
1206     ccode = code + GET(code, 1);
1207     goto CHECK_CLASS_REPEAT;
1208     #endif
1209    
1210     case OP_CLASS:
1211     case OP_NCLASS:
1212     ccode = code + 33;
1213    
1214     #ifdef SUPPORT_UTF8
1215     CHECK_CLASS_REPEAT:
1216     #endif
1217    
1218     switch (*ccode)
1219     {
1220     case OP_CRSTAR: /* These could be empty; continue */
1221     case OP_CRMINSTAR:
1222     case OP_CRQUERY:
1223     case OP_CRMINQUERY:
1224     break;
1225    
1226     default: /* Non-repeat => class must match */
1227     case OP_CRPLUS: /* These repeats aren't empty */
1228     case OP_CRMINPLUS:
1229     return FALSE;
1230    
1231     case OP_CRRANGE:
1232     case OP_CRMINRANGE:
1233     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1234     break;
1235     }
1236     break;
1237    
1238     /* Opcodes that must match a character */
1239    
1240     case OP_PROP:
1241     case OP_NOTPROP:
1242     case OP_EXTUNI:
1243     case OP_NOT_DIGIT:
1244     case OP_DIGIT:
1245     case OP_NOT_WHITESPACE:
1246     case OP_WHITESPACE:
1247     case OP_NOT_WORDCHAR:
1248     case OP_WORDCHAR:
1249     case OP_ANY:
1250     case OP_ANYBYTE:
1251     case OP_CHAR:
1252     case OP_CHARNC:
1253     case OP_NOT:
1254     case OP_PLUS:
1255     case OP_MINPLUS:
1256     case OP_EXACT:
1257     case OP_NOTPLUS:
1258     case OP_NOTMINPLUS:
1259     case OP_NOTEXACT:
1260     case OP_TYPEPLUS:
1261     case OP_TYPEMINPLUS:
1262     case OP_TYPEEXACT:
1263     return FALSE;
1264    
1265     /* End of branch */
1266    
1267     case OP_KET:
1268     case OP_KETRMAX:
1269     case OP_KETRMIN:
1270     case OP_ALT:
1271     return TRUE;
1272    
1273     /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be
1274     followed by a multibyte character */
1275    
1276     #ifdef SUPPORT_UTF8
1277     case OP_STAR:
1278     case OP_MINSTAR:
1279     case OP_QUERY:
1280     case OP_MINQUERY:
1281     case OP_UPTO:
1282     case OP_MINUPTO:
1283     if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1284     break;
1285     #endif
1286     }
1287     }
1288    
1289     return TRUE;
1290     }
1291    
1292    
1293    
1294     /*************************************************
1295     * Scan compiled regex for non-emptiness *
1296     *************************************************/
1297    
1298     /* This function is called to check for left recursive calls. We want to check
1299     the current branch of the current pattern to see if it could match the empty
1300     string. If it could, we must look outwards for branches at other levels,
1301     stopping when we pass beyond the bracket which is the subject of the recursion.
1302    
1303     Arguments:
1304     code points to start of the recursion
1305     endcode points to where to stop (current RECURSE item)
1306     bcptr points to the chain of current (unclosed) branch starts
1307     utf8 TRUE if in UTF-8 mode
1308    
1309     Returns: TRUE if what is matched could be empty
1310     */
1311    
1312     static BOOL
1313     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1314     BOOL utf8)
1315     {
1316     while (bcptr != NULL && bcptr->current >= code)
1317     {
1318     if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1319     bcptr = bcptr->outer;
1320     }
1321     return TRUE;
1322     }
1323    
1324    
1325    
1326     /*************************************************
1327     * Check for POSIX class syntax *
1328     *************************************************/
1329    
1330     /* This function is called when the sequence "[:" or "[." or "[=" is
1331     encountered in a character class. It checks whether this is followed by an
1332     optional ^ and then a sequence of letters, terminated by a matching ":]" or
1333     ".]" or "=]".
1334    
1335     Argument:
1336     ptr pointer to the initial [
1337     endptr where to return the end pointer
1338     cd pointer to compile data
1339    
1340     Returns: TRUE or FALSE
1341     */
1342    
1343     static BOOL
1344     check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1345     {
1346     int terminator; /* Don't combine these lines; the Solaris cc */
1347     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1348     if (*(++ptr) == '^') ptr++;
1349     while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1350     if (*ptr == terminator && ptr[1] == ']')
1351     {
1352     *endptr = ptr;
1353     return TRUE;
1354     }
1355     return FALSE;
1356     }
1357    
1358    
1359    
1360    
1361     /*************************************************
1362     * Check POSIX class name *
1363     *************************************************/
1364    
1365     /* This function is called to check the name given in a POSIX-style class entry
1366     such as [:alnum:].
1367    
1368     Arguments:
1369     ptr points to the first letter
1370     len the length of the name
1371    
1372     Returns: a value representing the name, or -1 if unknown
1373     */
1374    
1375     static int
1376     check_posix_name(const uschar *ptr, int len)
1377     {
1378     register int yield = 0;
1379     while (posix_name_lengths[yield] != 0)
1380     {
1381     if (len == posix_name_lengths[yield] &&
1382     strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1383     yield++;
1384     }
1385     return -1;
1386     }
1387    
1388    
1389     /*************************************************
1390     * Adjust OP_RECURSE items in repeated group *
1391     *************************************************/
1392    
1393     /* OP_RECURSE items contain an offset from the start of the regex to the group
1394     that is referenced. This means that groups can be replicated for fixed
1395     repetition simply by copying (because the recursion is allowed to refer to
1396     earlier groups that are outside the current group). However, when a group is
1397     optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1398     it, after it has been compiled. This means that any OP_RECURSE items within it
1399     that refer to the group itself or any contained groups have to have their
1400     offsets adjusted. That is the job of this function. Before it is called, the
1401     partially compiled regex must be temporarily terminated with OP_END.
1402    
1403     Arguments:
1404     group points to the start of the group
1405     adjust the amount by which the group is to be moved
1406     utf8 TRUE in UTF-8 mode
1407     cd contains pointers to tables etc.
1408    
1409     Returns: nothing
1410     */
1411    
1412     static void
1413     adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)
1414     {
1415     uschar *ptr = group;
1416     while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1417     {
1418     int offset = GET(ptr, 1);
1419     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1420     ptr += 1 + LINK_SIZE;
1421     }
1422     }
1423    
1424    
1425    
1426     /*************************************************
1427     * Insert an automatic callout point *
1428     *************************************************/
1429    
1430     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1431     callout points before each pattern item.
1432    
1433     Arguments:
1434     code current code pointer
1435     ptr current pattern pointer
1436     cd pointers to tables etc
1437    
1438     Returns: new code pointer
1439     */
1440    
1441     static uschar *
1442     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1443     {
1444     *code++ = OP_CALLOUT;
1445     *code++ = 255;
1446     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1447     PUT(code, LINK_SIZE, 0); /* Default length */
1448     return code + 2*LINK_SIZE;
1449     }
1450    
1451    
1452    
1453     /*************************************************
1454     * Complete a callout item *
1455     *************************************************/
1456    
1457     /* A callout item contains the length of the next item in the pattern, which
1458     we can't fill in till after we have reached the relevant point. This is used
1459     for both automatic and manual callouts.
1460    
1461     Arguments:
1462     previous_callout points to previous callout item
1463     ptr current pattern pointer
1464     cd pointers to tables etc
1465    
1466     Returns: nothing
1467     */
1468    
1469     static void
1470     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1471     {
1472     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1473     PUT(previous_callout, 2 + LINK_SIZE, length);
1474     }
1475    
1476    
1477    
1478     #ifdef SUPPORT_UCP
1479     /*************************************************
1480     * Get othercase range *
1481     *************************************************/
1482    
1483     /* This function is passed the start and end of a class range, in UTF-8 mode
1484     with UCP support. It searches up the characters, looking for internal ranges of
1485     characters in the "other" case. Each call returns the next one, updating the
1486     start address.
1487    
1488     Arguments:
1489     cptr points to starting character value; updated
1490     d end value
1491     ocptr where to put start of othercase range
1492     odptr where to put end of othercase range
1493    
1494     Yield: TRUE when range returned; FALSE when no more
1495     */
1496    
1497     static BOOL
1498     get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)
1499     {
1500     int c, chartype, othercase, next;
1501    
1502     for (c = *cptr; c <= d; c++)
1503     {
1504     if (_pcre_ucp_findchar(c, &chartype, &othercase) == ucp_L && othercase != 0)
1505     break;
1506     }
1507    
1508     if (c > d) return FALSE;
1509    
1510     *ocptr = othercase;
1511     next = othercase + 1;
1512    
1513     for (++c; c <= d; c++)
1514     {
1515     if (_pcre_ucp_findchar(c, &chartype, &othercase) != ucp_L ||
1516     othercase != next)
1517     break;
1518     next++;
1519     }
1520    
1521     *odptr = next - 1;
1522     *cptr = c;
1523    
1524     return TRUE;
1525     }
1526     #endif /* SUPPORT_UCP */
1527    
1528    
1529     /*************************************************
1530     * Compile one branch *
1531     *************************************************/
1532    
1533     /* Scan the pattern, compiling it into the code vector. If the options are
1534     changed during the branch, the pointer is used to change the external options
1535     bits.
1536    
1537     Arguments:
1538     optionsptr pointer to the option bits
1539     brackets points to number of extracting brackets used
1540     codeptr points to the pointer to the current code point
1541     ptrptr points to the current pattern pointer
1542     errorcodeptr points to error code variable
1543     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
1544     reqbyteptr set to the last literal character required, else < 0
1545     bcptr points to current branch chain
1546     cd contains pointers to tables etc.
1547    
1548     Returns: TRUE on success
1549     FALSE, with *errorcodeptr set non-zero on error
1550     */
1551    
1552     static BOOL
1553     compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
1554     const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,
1555     int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
1556     {
1557     int repeat_type, op_type;
1558     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
1559     int bravalue = 0;
1560     int greedy_default, greedy_non_default;
1561     int firstbyte, reqbyte;
1562     int zeroreqbyte, zerofirstbyte;
1563     int req_caseopt, reqvary, tempreqvary;
1564     int condcount = 0;
1565     int options = *optionsptr;
1566     int after_manual_callout = 0;
1567     register int c;
1568     register uschar *code = *codeptr;
1569     uschar *tempcode;
1570     BOOL inescq = FALSE;
1571     BOOL groupsetfirstbyte = FALSE;
1572     const uschar *ptr = *ptrptr;
1573     const uschar *tempptr;
1574     uschar *previous = NULL;
1575     uschar *previous_callout = NULL;
1576     uschar classbits[32];
1577    
1578     #ifdef SUPPORT_UTF8
1579     BOOL class_utf8;
1580     BOOL utf8 = (options & PCRE_UTF8) != 0;
1581     uschar *class_utf8data;
1582     uschar utf8_char[6];
1583     #else
1584     BOOL utf8 = FALSE;
1585     #endif
1586    
1587     /* Set up the default and non-default settings for greediness */
1588    
1589     greedy_default = ((options & PCRE_UNGREEDY) != 0);
1590     greedy_non_default = greedy_default ^ 1;
1591    
1592     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
1593     matching encountered yet". It gets changed to REQ_NONE if we hit something that
1594     matches a non-fixed char first char; reqbyte just remains unset if we never
1595     find one.
1596    
1597     When we hit a repeat whose minimum is zero, we may have to adjust these values
1598     to take the zero repeat into account. This is implemented by setting them to
1599     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
1600     item types that can be repeated set these backoff variables appropriately. */
1601    
1602     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
1603    
1604     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
1605     according to the current setting of the caseless flag. REQ_CASELESS is a bit
1606     value > 255. It is added into the firstbyte or reqbyte variables to record the
1607     case status of the value. This is used only for ASCII characters. */
1608    
1609     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
1610    
1611     /* Switch on next character until the end of the branch */
1612    
1613     for (;; ptr++)
1614     {
1615     BOOL negate_class;
1616     BOOL possessive_quantifier;
1617     BOOL is_quantifier;
1618     int class_charcount;
1619     int class_lastchar;
1620     int newoptions;
1621     int recno;
1622     int skipbytes;
1623     int subreqbyte;
1624     int subfirstbyte;
1625     int mclength;
1626     uschar mcbuffer[8];
1627    
1628     /* Next byte in the pattern */
1629    
1630     c = *ptr;
1631    
1632     /* If in \Q...\E, check for the end; if not, we have a literal */
1633    
1634     if (inescq && c != 0)
1635     {
1636     if (c == '\\' && ptr[1] == 'E')
1637     {
1638     inescq = FALSE;
1639     ptr++;
1640     continue;
1641     }
1642     else
1643     {
1644     if (previous_callout != NULL)
1645     {
1646     complete_callout(previous_callout, ptr, cd);
1647     previous_callout = NULL;
1648     }
1649     if ((options & PCRE_AUTO_CALLOUT) != 0)
1650     {
1651     previous_callout = code;
1652     code = auto_callout(code, ptr, cd);
1653     }
1654     goto NORMAL_CHAR;
1655     }
1656     }
1657    
1658     /* Fill in length of a previous callout, except when the next thing is
1659     a quantifier. */
1660    
1661     is_quantifier = c == '*' || c == '+' || c == '?' ||
1662     (c == '{' && is_counted_repeat(ptr+1));
1663    
1664     if (!is_quantifier && previous_callout != NULL &&
1665     after_manual_callout-- <= 0)
1666     {
1667     complete_callout(previous_callout, ptr, cd);
1668     previous_callout = NULL;
1669     }
1670    
1671     /* In extended mode, skip white space and comments */
1672    
1673     if ((options & PCRE_EXTENDED) != 0)
1674     {
1675     if ((cd->ctypes[c] & ctype_space) != 0) continue;
1676     if (c == '#')
1677     {
1678     /* The space before the ; is to avoid a warning on a silly compiler
1679     on the Macintosh. */
1680     while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
1681     if (c != 0) continue; /* Else fall through to handle end of string */
1682     }
1683     }
1684    
1685     /* No auto callout for quantifiers. */
1686    
1687     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
1688     {
1689     previous_callout = code;
1690     code = auto_callout(code, ptr, cd);
1691     }
1692    
1693     switch(c)
1694     {
1695     /* The branch terminates at end of string, |, or ). */
1696    
1697     case 0:
1698     case '|':
1699     case ')':
1700     *firstbyteptr = firstbyte;
1701     *reqbyteptr = reqbyte;
1702     *codeptr = code;
1703     *ptrptr = ptr;
1704     return TRUE;
1705    
1706     /* Handle single-character metacharacters. In multiline mode, ^ disables
1707     the setting of any following char as a first character. */
1708    
1709     case '^':
1710     if ((options & PCRE_MULTILINE) != 0)
1711     {
1712     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1713     }
1714     previous = NULL;
1715     *code++ = OP_CIRC;
1716     break;
1717    
1718     case '$':
1719     previous = NULL;
1720     *code++ = OP_DOLL;
1721     break;
1722    
1723     /* There can never be a first char if '.' is first, whatever happens about
1724     repeats. The value of reqbyte doesn't change either. */
1725    
1726     case '.':
1727     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1728     zerofirstbyte = firstbyte;
1729     zeroreqbyte = reqbyte;
1730     previous = code;
1731     *code++ = OP_ANY;
1732     break;
1733    
1734     /* Character classes. If the included characters are all < 255 in value, we
1735     build a 32-byte bitmap of the permitted characters, except in the special
1736     case where there is only one such character. For negated classes, we build
1737     the map as usual, then invert it at the end. However, we use a different
1738     opcode so that data characters > 255 can be handled correctly.
1739    
1740     If the class contains characters outside the 0-255 range, a different
1741     opcode is compiled. It may optionally have a bit map for characters < 256,
1742     but those above are explicitly listed afterwards. A flag byte tells
1743     whether the bitmap is present, and whether this is a negated class or not.
1744     */
1745    
1746     case '[':
1747     previous = code;
1748    
1749     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
1750     they are encountered at the top level, so we'll do that too. */
1751    
1752     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1753     check_posix_syntax(ptr, &tempptr, cd))
1754     {
1755     *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
1756     goto FAILED;
1757     }
1758    
1759     /* If the first character is '^', set the negation flag and skip it. */
1760    
1761     if ((c = *(++ptr)) == '^')
1762     {
1763     negate_class = TRUE;
1764     c = *(++ptr);
1765     }
1766     else
1767     {
1768     negate_class = FALSE;
1769     }
1770    
1771     /* Keep a count of chars with values < 256 so that we can optimize the case
1772     of just a single character (as long as it's < 256). For higher valued UTF-8
1773     characters, we don't yet do any optimization. */
1774    
1775     class_charcount = 0;
1776     class_lastchar = -1;
1777    
1778     #ifdef SUPPORT_UTF8
1779     class_utf8 = FALSE; /* No chars >= 256 */
1780     class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
1781     #endif
1782    
1783     /* Initialize the 32-char bit map to all zeros. We have to build the
1784     map in a temporary bit of store, in case the class contains only 1
1785     character (< 256), because in that case the compiled code doesn't use the
1786     bit map. */
1787    
1788     memset(classbits, 0, 32 * sizeof(uschar));
1789    
1790     /* Process characters until ] is reached. By writing this as a "do" it
1791     means that an initial ] is taken as a data character. The first pass
1792     through the regex checked the overall syntax, so we don't need to be very
1793     strict here. At the start of the loop, c contains the first byte of the
1794     character. */
1795    
1796     do
1797     {
1798     #ifdef SUPPORT_UTF8
1799     if (utf8 && c > 127)
1800     { /* Braces are required because the */
1801     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
1802     }
1803     #endif
1804    
1805     /* Inside \Q...\E everything is literal except \E */
1806    
1807     if (inescq)
1808     {
1809     if (c == '\\' && ptr[1] == 'E')
1810     {
1811     inescq = FALSE;
1812     ptr++;
1813     continue;
1814     }
1815     else goto LONE_SINGLE_CHARACTER;
1816     }
1817    
1818     /* Handle POSIX class names. Perl allows a negation extension of the
1819     form [:^name:]. A square bracket that doesn't match the syntax is
1820     treated as a literal. We also recognize the POSIX constructions
1821     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1822     5.6 and 5.8 do. */
1823    
1824     if (c == '[' &&
1825     (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1826     check_posix_syntax(ptr, &tempptr, cd))
1827     {
1828     BOOL local_negate = FALSE;
1829     int posix_class, i;
1830     register const uschar *cbits = cd->cbits;
1831    
1832     if (ptr[1] != ':')
1833     {
1834     *errorcodeptr = ERR31;
1835     goto FAILED;
1836     }
1837    
1838     ptr += 2;
1839     if (*ptr == '^')
1840     {
1841     local_negate = TRUE;
1842     ptr++;
1843     }
1844    
1845     posix_class = check_posix_name(ptr, tempptr - ptr);
1846     if (posix_class < 0)
1847     {
1848     *errorcodeptr = ERR30;
1849     goto FAILED;
1850     }
1851    
1852     /* If matching is caseless, upper and lower are converted to
1853     alpha. This relies on the fact that the class table starts with
1854     alpha, lower, upper as the first 3 entries. */
1855    
1856     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1857     posix_class = 0;
1858    
1859     /* Or into the map we are building up to 3 of the static class
1860     tables, or their negations. The [:blank:] class sets up the same
1861     chars as the [:space:] class (all white space). We remove the vertical
1862     white space chars afterwards. */
1863    
1864     posix_class *= 3;
1865     for (i = 0; i < 3; i++)
1866     {
1867     BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;
1868     int taboffset = posix_class_maps[posix_class + i];
1869     if (taboffset < 0) break;
1870     if (local_negate)
1871     {
1872     if (i == 0)
1873     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];
1874     else
1875     for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];
1876     if (blankclass) classbits[1] |= 0x3c;
1877     }
1878     else
1879     {
1880     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];
1881     if (blankclass) classbits[1] &= ~0x3c;
1882     }
1883     }
1884    
1885     ptr = tempptr + 1;
1886     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
1887     continue; /* End of POSIX syntax handling */
1888     }
1889    
1890     /* Backslash may introduce a single character, or it may introduce one
1891     of the specials, which just set a flag. Escaped items are checked for
1892     validity in the pre-compiling pass. The sequence \b is a special case.
1893     Inside a class (and only there) it is treated as backspace. Elsewhere
1894     it marks a word boundary. Other escapes have preset maps ready to
1895     or into the one we are building. We assume they have more than one
1896     character in them, so set class_charcount bigger than one. */
1897    
1898     if (c == '\\')
1899     {
1900     c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);
1901    
1902     if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
1903     else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
1904     else if (-c == ESC_Q) /* Handle start of quoted string */
1905     {
1906     if (ptr[1] == '\\' && ptr[2] == 'E')
1907     {
1908     ptr += 2; /* avoid empty string */
1909     }
1910     else inescq = TRUE;
1911     continue;
1912     }
1913    
1914     if (c < 0)
1915     {
1916     register const uschar *cbits = cd->cbits;
1917     class_charcount += 2; /* Greater than 1 is what matters */
1918     switch (-c)
1919     {
1920     case ESC_d:
1921     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
1922     continue;
1923    
1924     case ESC_D:
1925     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
1926     continue;
1927    
1928     case ESC_w:
1929     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
1930     continue;
1931    
1932     case ESC_W:
1933     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
1934     continue;
1935    
1936     case ESC_s:
1937     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
1938     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
1939     continue;
1940    
1941     case ESC_S:
1942     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
1943     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
1944     continue;
1945    
1946     #ifdef SUPPORT_UCP
1947     case ESC_p:
1948     case ESC_P:
1949     {
1950     BOOL negated;
1951     int property = get_ucp(&ptr, &negated, errorcodeptr);
1952     if (property < 0) goto FAILED;
1953     class_utf8 = TRUE;
1954     *class_utf8data++ = ((-c == ESC_p) != negated)?
1955     XCL_PROP : XCL_NOTPROP;
1956     *class_utf8data++ = property;
1957     class_charcount -= 2; /* Not a < 256 character */
1958     }
1959     continue;
1960     #endif
1961    
1962     /* Unrecognized escapes are faulted if PCRE is running in its
1963     strict mode. By default, for compatibility with Perl, they are
1964     treated as literals. */
1965    
1966     default:
1967     if ((options & PCRE_EXTRA) != 0)
1968     {
1969     *errorcodeptr = ERR7;
1970     goto FAILED;
1971     }
1972     c = *ptr; /* The final character */
1973     class_charcount -= 2; /* Undo the default count from above */
1974     }
1975     }
1976    
1977     /* Fall through if we have a single character (c >= 0). This may be
1978     > 256 in UTF-8 mode. */
1979    
1980     } /* End of backslash handling */
1981    
1982     /* A single character may be followed by '-' to form a range. However,
1983     Perl does not permit ']' to be the end of the range. A '-' character
1984     here is treated as a literal. */
1985    
1986     if (ptr[1] == '-' && ptr[2] != ']')
1987     {
1988     int d;
1989     ptr += 2;
1990    
1991     #ifdef SUPPORT_UTF8
1992     if (utf8)
1993     { /* Braces are required because the */
1994     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
1995     }
1996     else
1997     #endif
1998     d = *ptr; /* Not UTF-8 mode */
1999    
2000     /* The second part of a range can be a single-character escape, but
2001     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2002     in such circumstances. */
2003    
2004     if (d == '\\')
2005     {
2006     const uschar *oldptr = ptr;
2007     d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);
2008    
2009     /* \b is backspace; \X is literal X; any other special means the '-'
2010     was literal */
2011    
2012     if (d < 0)
2013     {
2014     if (d == -ESC_b) d = '\b';
2015     else if (d == -ESC_X) d = 'X'; else
2016     {
2017     ptr = oldptr - 2;
2018     goto LONE_SINGLE_CHARACTER; /* A few lines below */
2019     }
2020     }
2021     }
2022    
2023     /* The check that the two values are in the correct order happens in
2024     the pre-pass. Optimize one-character ranges */
2025    
2026     if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2027    
2028     /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2029     matching, we have to use an XCLASS with extra data items. Caseless
2030     matching for characters > 127 is available only if UCP support is
2031     available. */
2032    
2033     #ifdef SUPPORT_UTF8
2034     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2035     {
2036     class_utf8 = TRUE;
2037    
2038     /* With UCP support, we can find the other case equivalents of
2039     the relevant characters. There may be several ranges. Optimize how
2040     they fit with the basic range. */
2041    
2042     #ifdef SUPPORT_UCP
2043     if ((options & PCRE_CASELESS) != 0)
2044     {
2045     int occ, ocd;
2046     int cc = c;
2047     int origd = d;
2048     while (get_othercase_range(&cc, origd, &occ, &ocd))
2049     {
2050     if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
2051    
2052     if (occ < c && ocd >= c - 1) /* Extend the basic range */
2053     { /* if there is overlap, */
2054     c = occ; /* noting that if occ < c */
2055     continue; /* we can't have ocd > d */
2056     } /* because a subrange is */
2057     if (ocd > d && occ <= d + 1) /* always shorter than */
2058     { /* the basic range. */
2059     d = ocd;
2060     continue;
2061     }
2062    
2063     if (occ == ocd)
2064     {
2065     *class_utf8data++ = XCL_SINGLE;
2066     }
2067     else
2068     {
2069     *class_utf8data++ = XCL_RANGE;
2070     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2071     }
2072     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2073     }
2074     }
2075     #endif /* SUPPORT_UCP */
2076    
2077     /* Now record the original range, possibly modified for UCP caseless
2078     overlapping ranges. */
2079    
2080     *class_utf8data++ = XCL_RANGE;
2081     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2082     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2083    
2084     /* With UCP support, we are done. Without UCP support, there is no
2085     caseless matching for UTF-8 characters > 127; we can use the bit map
2086     for the smaller ones. */
2087    
2088     #ifdef SUPPORT_UCP
2089     continue; /* With next character in the class */
2090     #else
2091     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2092    
2093     /* Adjust upper limit and fall through to set up the map */
2094    
2095     d = 127;
2096    
2097     #endif /* SUPPORT_UCP */
2098     }
2099     #endif /* SUPPORT_UTF8 */
2100    
2101     /* We use the bit map for all cases when not in UTF-8 mode; else
2102     ranges that lie entirely within 0-127 when there is UCP support; else
2103     for partial ranges without UCP support. */
2104    
2105     for (; c <= d; c++)
2106     {
2107     classbits[c/8] |= (1 << (c&7));
2108     if ((options & PCRE_CASELESS) != 0)
2109     {
2110     int uc = cd->fcc[c]; /* flip case */
2111     classbits[uc/8] |= (1 << (uc&7));
2112     }
2113     class_charcount++; /* in case a one-char range */
2114     class_lastchar = c;
2115     }
2116    
2117     continue; /* Go get the next char in the class */
2118     }
2119    
2120     /* Handle a lone single character - we can get here for a normal
2121     non-escape char, or after \ that introduces a single character or for an
2122     apparent range that isn't. */
2123    
2124     LONE_SINGLE_CHARACTER:
2125    
2126     /* Handle a character that cannot go in the bit map */
2127    
2128     #ifdef SUPPORT_UTF8
2129     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2130     {
2131     class_utf8 = TRUE;
2132     *class_utf8data++ = XCL_SINGLE;
2133     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2134    
2135     #ifdef SUPPORT_UCP
2136     if ((options & PCRE_CASELESS) != 0)
2137     {
2138     int chartype;
2139     int othercase;
2140     if (_pcre_ucp_findchar(c, &chartype, &othercase) >= 0 &&
2141     othercase > 0)
2142     {
2143     *class_utf8data++ = XCL_SINGLE;
2144     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
2145     }
2146     }
2147     #endif /* SUPPORT_UCP */
2148    
2149     }
2150     else
2151     #endif /* SUPPORT_UTF8 */
2152    
2153     /* Handle a single-byte character */
2154     {
2155     classbits[c/8] |= (1 << (c&7));
2156     if ((options & PCRE_CASELESS) != 0)
2157     {
2158     c = cd->fcc[c]; /* flip case */
2159     classbits[c/8] |= (1 << (c&7));
2160     }
2161     class_charcount++;
2162     class_lastchar = c;
2163     }
2164     }
2165    
2166     /* Loop until ']' reached; the check for end of string happens inside the
2167     loop. This "while" is the end of the "do" above. */
2168    
2169     while ((c = *(++ptr)) != ']' || inescq);
2170    
2171     /* If class_charcount is 1, we saw precisely one character whose value is
2172     less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2173     can optimize the negative case only if there were no characters >= 128
2174     because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2175     single-bytes only. This is an historical hangover. Maybe one day we can
2176     tidy these opcodes to handle multi-byte characters.
2177    
2178     The optimization throws away the bit map. We turn the item into a
2179     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2180     that OP_NOT does not support multibyte characters. In the positive case, it
2181     can cause firstbyte to be set. Otherwise, there can be no first char if
2182     this item is first, whatever repeat count may follow. In the case of
2183     reqbyte, save the previous value for reinstating. */
2184    
2185     #ifdef SUPPORT_UTF8
2186     if (class_charcount == 1 &&
2187     (!utf8 ||
2188     (!class_utf8 && (!negate_class || class_lastchar < 128))))
2189    
2190     #else
2191     if (class_charcount == 1)
2192     #endif
2193     {
2194     zeroreqbyte = reqbyte;
2195    
2196     /* The OP_NOT opcode works on one-byte characters only. */
2197    
2198     if (negate_class)
2199     {
2200     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2201     zerofirstbyte = firstbyte;
2202     *code++ = OP_NOT;
2203     *code++ = class_lastchar;
2204     break;
2205     }
2206    
2207     /* For a single, positive character, get the value into mcbuffer, and
2208     then we can handle this with the normal one-character code. */
2209    
2210     #ifdef SUPPORT_UTF8
2211     if (utf8 && class_lastchar > 127)
2212     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
2213     else
2214     #endif
2215     {
2216     mcbuffer[0] = class_lastchar;
2217     mclength = 1;
2218     }
2219     goto ONE_CHAR;
2220     } /* End of 1-char optimization */
2221    
2222     /* The general case - not the one-char optimization. If this is the first
2223     thing in the branch, there can be no first char setting, whatever the
2224     repeat count. Any reqbyte setting must remain unchanged after any kind of
2225     repeat. */
2226    
2227     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2228     zerofirstbyte = firstbyte;
2229     zeroreqbyte = reqbyte;
2230    
2231     /* If there are characters with values > 255, we have to compile an
2232     extended class, with its own opcode. If there are no characters < 256,
2233     we can omit the bitmap. */
2234    
2235     #ifdef SUPPORT_UTF8
2236     if (class_utf8)
2237     {
2238     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2239     *code++ = OP_XCLASS;
2240     code += LINK_SIZE;
2241     *code = negate_class? XCL_NOT : 0;
2242    
2243     /* If the map is required, install it, and move on to the end of
2244     the extra data */
2245    
2246     if (class_charcount > 0)
2247     {
2248     *code++ |= XCL_MAP;
2249     memcpy(code, classbits, 32);
2250     code = class_utf8data;
2251     }
2252    
2253     /* If the map is not required, slide down the extra data. */
2254    
2255     else
2256     {
2257     int len = class_utf8data - (code + 33);
2258     memmove(code + 1, code + 33, len);
2259     code += len + 1;
2260     }
2261    
2262     /* Now fill in the complete length of the item */
2263    
2264     PUT(previous, 1, code - previous);
2265     break; /* End of class handling */
2266     }
2267     #endif
2268    
2269     /* If there are no characters > 255, negate the 32-byte map if necessary,
2270     and copy it into the code vector. If this is the first thing in the branch,
2271     there can be no first char setting, whatever the repeat count. Any reqbyte
2272     setting must remain unchanged after any kind of repeat. */
2273    
2274     if (negate_class)
2275     {
2276     *code++ = OP_NCLASS;
2277     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2278     }
2279     else
2280     {
2281     *code++ = OP_CLASS;
2282     memcpy(code, classbits, 32);
2283     }
2284     code += 32;
2285     break;
2286    
2287     /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2288     has been tested above. */
2289    
2290     case '{':
2291     if (!is_quantifier) goto NORMAL_CHAR;
2292     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
2293     if (*errorcodeptr != 0) goto FAILED;
2294     goto REPEAT;
2295    
2296     case '*':
2297     repeat_min = 0;
2298     repeat_max = -1;
2299     goto REPEAT;
2300    
2301     case '+':
2302     repeat_min = 1;
2303     repeat_max = -1;
2304     goto REPEAT;
2305    
2306     case '?':
2307     repeat_min = 0;
2308     repeat_max = 1;
2309    
2310     REPEAT:
2311     if (previous == NULL)
2312     {
2313     *errorcodeptr = ERR9;
2314     goto FAILED;
2315     }
2316    
2317     if (repeat_min == 0)
2318     {
2319     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2320     reqbyte = zeroreqbyte; /* Ditto */
2321     }
2322    
2323     /* Remember whether this is a variable length repeat */
2324    
2325     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2326    
2327     op_type = 0; /* Default single-char op codes */
2328     possessive_quantifier = FALSE; /* Default not possessive quantifier */
2329    
2330     /* Save start of previous item, in case we have to move it up to make space
2331     for an inserted OP_ONCE for the additional '+' extension. */
2332    
2333     tempcode = previous;
2334    
2335     /* If the next character is '+', we have a possessive quantifier. This
2336     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2337     If the next character is '?' this is a minimizing repeat, by default,
2338     but if PCRE_UNGREEDY is set, it works the other way round. We change the
2339     repeat type to the non-default. */
2340    
2341     if (ptr[1] == '+')
2342     {
2343     repeat_type = 0; /* Force greedy */
2344     possessive_quantifier = TRUE;
2345     ptr++;
2346     }
2347     else if (ptr[1] == '?')
2348     {
2349     repeat_type = greedy_non_default;
2350     ptr++;
2351     }
2352     else repeat_type = greedy_default;
2353    
2354     /* If previous was a recursion, we need to wrap it inside brackets so that
2355     it can be replicated if necessary. */
2356    
2357     if (*previous == OP_RECURSE)
2358     {
2359     memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2360     code += 1 + LINK_SIZE;
2361     *previous = OP_BRA;
2362     PUT(previous, 1, code - previous);
2363     *code = OP_KET;
2364     PUT(code, 1, code - previous);
2365     code += 1 + LINK_SIZE;
2366     }
2367    
2368     /* If previous was a character match, abolish the item and generate a
2369     repeat item instead. If a char item has a minimum of more than one, ensure
2370     that it is set in reqbyte - it might not be if a sequence such as x{3} is
2371     the first thing in a branch because the x will have gone into firstbyte
2372     instead. */
2373    
2374     if (*previous == OP_CHAR || *previous == OP_CHARNC)
2375     {
2376     /* Deal with UTF-8 characters that take up more than one byte. It's
2377     easier to write this out separately than try to macrify it. Use c to
2378     hold the length of the character in bytes, plus 0x80 to flag that it's a
2379     length rather than a small character. */
2380    
2381     #ifdef SUPPORT_UTF8
2382     if (utf8 && (code[-1] & 0x80) != 0)
2383     {
2384     uschar *lastchar = code - 1;
2385     while((*lastchar & 0xc0) == 0x80) lastchar--;
2386     c = code - lastchar; /* Length of UTF-8 character */
2387     memcpy(utf8_char, lastchar, c); /* Save the char */
2388     c |= 0x80; /* Flag c as a length */
2389     }
2390     else
2391     #endif
2392    
2393     /* Handle the case of a single byte - either with no UTF8 support, or
2394     with UTF-8 disabled, or for a UTF-8 character < 128. */
2395    
2396     {
2397     c = code[-1];
2398     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2399     }
2400    
2401     goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
2402     }
2403    
2404     /* If previous was a single negated character ([^a] or similar), we use
2405     one of the special opcodes, replacing it. The code is shared with single-
2406     character repeats by setting op_type to add a suitable offset into
2407     repeat_type. OP_NOT is currently used only for single-byte chars. */
2408    
2409     else if (*previous == OP_NOT)
2410     {
2411     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
2412     c = previous[1];
2413     goto OUTPUT_SINGLE_REPEAT;
2414     }
2415    
2416     /* If previous was a character type match (\d or similar), abolish it and
2417     create a suitable repeat item. The code is shared with single-character
2418     repeats by setting op_type to add a suitable offset into repeat_type. Note
2419     that the Unicode property types will be present only when SUPPORT_UCP is
2420     defined, but we don't wrap the little bits of code here because it just
2421     makes it horribly messy. */
2422    
2423     else if (*previous < OP_EODN)
2424     {
2425     uschar *oldcode;
2426     int prop_type;
2427     op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
2428     c = *previous;
2429    
2430     OUTPUT_SINGLE_REPEAT:
2431     prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?
2432     previous[1] : -1;
2433    
2434     oldcode = code;
2435     code = previous; /* Usually overwrite previous item */
2436    
2437     /* If the maximum is zero then the minimum must also be zero; Perl allows
2438     this case, so we do too - by simply omitting the item altogether. */
2439    
2440     if (repeat_max == 0) goto END_REPEAT;
2441    
2442     /* All real repeats make it impossible to handle partial matching (maybe
2443     one day we will be able to remove this restriction). */
2444    
2445     if (repeat_max != 1) cd->nopartial = TRUE;
2446    
2447     /* Combine the op_type with the repeat_type */
2448    
2449     repeat_type += op_type;
2450    
2451     /* A minimum of zero is handled either as the special case * or ?, or as
2452     an UPTO, with the maximum given. */
2453    
2454     if (repeat_min == 0)
2455     {
2456     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
2457     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
2458     else
2459     {
2460     *code++ = OP_UPTO + repeat_type;
2461     PUT2INC(code, 0, repeat_max);
2462     }
2463     }
2464    
2465     /* A repeat minimum of 1 is optimized into some special cases. If the
2466     maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
2467     left in place and, if the maximum is greater than 1, we use OP_UPTO with
2468     one less than the maximum. */
2469    
2470     else if (repeat_min == 1)
2471     {
2472     if (repeat_max == -1)
2473     *code++ = OP_PLUS + repeat_type;
2474     else
2475     {
2476     code = oldcode; /* leave previous item in place */
2477     if (repeat_max == 1) goto END_REPEAT;
2478     *code++ = OP_UPTO + repeat_type;
2479     PUT2INC(code, 0, repeat_max - 1);
2480     }
2481     }
2482    
2483     /* The case {n,n} is just an EXACT, while the general case {n,m} is
2484     handled as an EXACT followed by an UPTO. */
2485    
2486     else
2487     {
2488     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
2489     PUT2INC(code, 0, repeat_min);
2490    
2491     /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
2492     we have to insert the character for the previous code. For a repeated
2493     Unicode property match, there is an extra byte that defines the
2494     required property. In UTF-8 mode, long characters have their length in
2495     c, with the 0x80 bit as a flag. */
2496    
2497     if (repeat_max < 0)
2498     {
2499     #ifdef SUPPORT_UTF8
2500     if (utf8 && c >= 128)
2501     {
2502     memcpy(code, utf8_char, c & 7);
2503     code += c & 7;
2504     }
2505     else
2506     #endif
2507     {
2508     *code++ = c;
2509     if (prop_type >= 0) *code++ = prop_type;
2510     }
2511     *code++ = OP_STAR + repeat_type;
2512     }
2513    
2514     /* Else insert an UPTO if the max is greater than the min, again
2515     preceded by the character, for the previously inserted code. */
2516    
2517     else if (repeat_max != repeat_min)
2518     {
2519     #ifdef SUPPORT_UTF8
2520     if (utf8 && c >= 128)
2521     {
2522     memcpy(code, utf8_char, c & 7);
2523     code += c & 7;
2524     }
2525     else
2526     #endif
2527     *code++ = c;
2528     if (prop_type >= 0) *code++ = prop_type;
2529     repeat_max -= repeat_min;
2530     *code++ = OP_UPTO + repeat_type;
2531     PUT2INC(code, 0, repeat_max);
2532     }
2533     }
2534    
2535     /* The character or character type itself comes last in all cases. */
2536    
2537     #ifdef SUPPORT_UTF8
2538     if (utf8 && c >= 128)
2539     {
2540     memcpy(code, utf8_char, c & 7);
2541     code += c & 7;
2542     }
2543     else
2544     #endif
2545     *code++ = c;
2546    
2547     /* For a repeated Unicode property match, there is an extra byte that
2548     defines the required property. */
2549    
2550     #ifdef SUPPORT_UCP
2551     if (prop_type >= 0) *code++ = prop_type;
2552     #endif
2553     }
2554    
2555     /* If previous was a character class or a back reference, we put the repeat
2556     stuff after it, but just skip the item if the repeat was {0,0}. */
2557    
2558     else if (*previous == OP_CLASS ||
2559     *previous == OP_NCLASS ||
2560     #ifdef SUPPORT_UTF8
2561     *previous == OP_XCLASS ||
2562     #endif
2563     *previous == OP_REF)
2564     {
2565     if (repeat_max == 0)
2566     {
2567     code = previous;
2568     goto END_REPEAT;
2569     }
2570    
2571     /* All real repeats make it impossible to handle partial matching (maybe
2572     one day we will be able to remove this restriction). */
2573    
2574     if (repeat_max != 1) cd->nopartial = TRUE;
2575    
2576     if (repeat_min == 0 && repeat_max == -1)
2577     *code++ = OP_CRSTAR + repeat_type;
2578     else if (repeat_min == 1 && repeat_max == -1)
2579     *code++ = OP_CRPLUS + repeat_type;
2580     else if (repeat_min == 0 && repeat_max == 1)
2581     *code++ = OP_CRQUERY + repeat_type;
2582     else
2583     {
2584     *code++ = OP_CRRANGE + repeat_type;
2585     PUT2INC(code, 0, repeat_min);
2586     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
2587     PUT2INC(code, 0, repeat_max);
2588     }
2589     }
2590    
2591     /* If previous was a bracket group, we may have to replicate it in certain
2592     cases. */
2593    
2594     else if (*previous >= OP_BRA || *previous == OP_ONCE ||
2595     *previous == OP_COND)
2596     {
2597     register int i;
2598     int ketoffset = 0;
2599     int len = code - previous;
2600     uschar *bralink = NULL;
2601    
2602     /* If the maximum repeat count is unlimited, find the end of the bracket
2603     by scanning through from the start, and compute the offset back to it
2604     from the current code pointer. There may be an OP_OPT setting following
2605     the final KET, so we can't find the end just by going back from the code
2606     pointer. */
2607    
2608     if (repeat_max == -1)
2609     {
2610     register uschar *ket = previous;
2611     do ket += GET(ket, 1); while (*ket != OP_KET);
2612     ketoffset = code - ket;
2613     }
2614    
2615     /* The case of a zero minimum is special because of the need to stick
2616     OP_BRAZERO in front of it, and because the group appears once in the
2617     data, whereas in other cases it appears the minimum number of times. For
2618     this reason, it is simplest to treat this case separately, as otherwise
2619     the code gets far too messy. There are several special subcases when the
2620     minimum is zero. */
2621    
2622     if (repeat_min == 0)
2623     {
2624     /* If the maximum is also zero, we just omit the group from the output
2625     altogether. */
2626    
2627     if (repeat_max == 0)
2628     {
2629     code = previous;
2630     goto END_REPEAT;
2631     }
2632    
2633     /* If the maximum is 1 or unlimited, we just have to stick in the
2634     BRAZERO and do no more at this point. However, we do need to adjust
2635     any OP_RECURSE calls inside the group that refer to the group itself or
2636     any internal group, because the offset is from the start of the whole
2637     regex. Temporarily terminate the pattern while doing this. */
2638    
2639     if (repeat_max <= 1)
2640     {
2641     *code = OP_END;
2642     adjust_recurse(previous, 1, utf8, cd);
2643     memmove(previous+1, previous, len);
2644     code++;
2645     *previous++ = OP_BRAZERO + repeat_type;
2646     }
2647    
2648     /* If the maximum is greater than 1 and limited, we have to replicate
2649     in a nested fashion, sticking OP_BRAZERO before each set of brackets.
2650     The first one has to be handled carefully because it's the original
2651     copy, which has to be moved up. The remainder can be handled by code
2652     that is common with the non-zero minimum case below. We have to
2653     adjust the value of repeat_max, since one less copy is required. Once
2654     again, we may have to adjust any OP_RECURSE calls inside the group. */
2655    
2656     else
2657     {
2658     int offset;
2659     *code = OP_END;
2660     adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);
2661     memmove(previous + 2 + LINK_SIZE, previous, len);
2662     code += 2 + LINK_SIZE;
2663     *previous++ = OP_BRAZERO + repeat_type;
2664     *previous++ = OP_BRA;
2665    
2666     /* We chain together the bracket offset fields that have to be
2667     filled in later when the ends of the brackets are reached. */
2668    
2669     offset = (bralink == NULL)? 0 : previous - bralink;
2670     bralink = previous;
2671     PUTINC(previous, 0, offset);
2672     }
2673    
2674     repeat_max--;
2675     }
2676    
2677     /* If the minimum is greater than zero, replicate the group as many
2678     times as necessary, and adjust the maximum to the number of subsequent
2679     copies that we need. If we set a first char from the group, and didn't
2680     set a required char, copy the latter from the former. */
2681    
2682     else
2683     {
2684     if (repeat_min > 1)
2685     {
2686     if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
2687     for (i = 1; i < repeat_min; i++)
2688     {
2689     memcpy(code, previous, len);
2690     code += len;
2691     }
2692     }
2693     if (repeat_max > 0) repeat_max -= repeat_min;
2694     }
2695    
2696     /* This code is common to both the zero and non-zero minimum cases. If
2697     the maximum is limited, it replicates the group in a nested fashion,
2698     remembering the bracket starts on a stack. In the case of a zero minimum,
2699     the first one was set up above. In all cases the repeat_max now specifies
2700     the number of additional copies needed. */
2701    
2702     if (repeat_max >= 0)
2703     {
2704     for (i = repeat_max - 1; i >= 0; i--)
2705     {
2706     *code++ = OP_BRAZERO + repeat_type;
2707    
2708     /* All but the final copy start a new nesting, maintaining the
2709     chain of brackets outstanding. */
2710    
2711     if (i != 0)
2712     {
2713     int offset;
2714     *code++ = OP_BRA;
2715     offset = (bralink == NULL)? 0 : code - bralink;
2716     bralink = code;
2717     PUTINC(code, 0, offset);
2718     }
2719    
2720     memcpy(code, previous, len);
2721     code += len;
2722     }
2723    
2724     /* Now chain through the pending brackets, and fill in their length
2725     fields (which are holding the chain links pro tem). */
2726    
2727     while (bralink != NULL)
2728     {
2729     int oldlinkoffset;
2730     int offset = code - bralink + 1;
2731     uschar *bra = code - offset;
2732     oldlinkoffset = GET(bra, 1);
2733     bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
2734     *code++ = OP_KET;
2735     PUTINC(code, 0, offset);
2736     PUT(bra, 1, offset);
2737     }
2738     }
2739    
2740     /* If the maximum is unlimited, set a repeater in the final copy. We
2741     can't just offset backwards from the current code point, because we
2742     don't know if there's been an options resetting after the ket. The
2743     correct offset was computed above. */
2744    
2745     else code[-ketoffset] = OP_KETRMAX + repeat_type;
2746     }
2747    
2748     /* Else there's some kind of shambles */
2749    
2750     else
2751     {
2752     *errorcodeptr = ERR11;
2753     goto FAILED;
2754     }
2755    
2756     /* If the character following a repeat is '+', we wrap the entire repeated
2757     item inside OP_ONCE brackets. This is just syntactic sugar, taken from
2758     Sun's Java package. The repeated item starts at tempcode, not at previous,
2759     which might be the first part of a string whose (former) last char we
2760     repeated. However, we don't support '+' after a greediness '?'. */
2761    
2762     if (possessive_quantifier)
2763     {
2764     int len = code - tempcode;
2765     memmove(tempcode + 1+LINK_SIZE, tempcode, len);
2766     code += 1 + LINK_SIZE;
2767     len += 1 + LINK_SIZE;
2768     tempcode[0] = OP_ONCE;
2769     *code++ = OP_KET;
2770     PUTINC(code, 0, len);
2771     PUT(tempcode, 1, len);
2772     }
2773    
2774     /* In all cases we no longer have a previous item. We also set the
2775     "follows varying string" flag for subsequently encountered reqbytes if
2776     it isn't already set and we have just passed a varying length item. */
2777    
2778     END_REPEAT:
2779     previous = NULL;
2780     cd->req_varyopt |= reqvary;
2781     break;
2782    
2783    
2784     /* Start of nested bracket sub-expression, or comment or lookahead or
2785     lookbehind or option setting or condition. First deal with special things
2786     that can come after a bracket; all are introduced by ?, and the appearance
2787     of any of them means that this is not a referencing group. They were
2788     checked for validity in the first pass over the string, so we don't have to
2789     check for syntax errors here. */
2790    
2791     case '(':
2792     newoptions = options;
2793     skipbytes = 0;
2794    
2795     if (*(++ptr) == '?')
2796     {
2797     int set, unset;
2798     int *optset;
2799    
2800     switch (*(++ptr))
2801     {
2802     case '#': /* Comment; skip to ket */
2803     ptr++;
2804     while (*ptr != ')') ptr++;
2805     continue;
2806    
2807     case ':': /* Non-extracting bracket */
2808     bravalue = OP_BRA;
2809     ptr++;
2810     break;
2811    
2812     case '(':
2813     bravalue = OP_COND; /* Conditional group */
2814    
2815     /* Condition to test for recursion */
2816    
2817     if (ptr[1] == 'R')
2818     {
2819     code[1+LINK_SIZE] = OP_CREF;
2820     PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
2821     skipbytes = 3;
2822     ptr += 3;
2823     }
2824    
2825     /* Condition to test for a numbered subpattern match. We know that
2826     if a digit follows ( then there will just be digits until ) because
2827     the syntax was checked in the first pass. */
2828    
2829     else if ((digitab[ptr[1]] && ctype_digit) != 0)
2830     {
2831     int condref; /* Don't amalgamate; some compilers */
2832     condref = *(++ptr) - '0'; /* grumble at autoincrement in declaration */
2833     while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
2834     if (condref == 0)
2835     {
2836     *errorcodeptr = ERR35;
2837     goto FAILED;
2838     }
2839     ptr++;
2840     code[1+LINK_SIZE] = OP_CREF;
2841     PUT2(code, 2+LINK_SIZE, condref);
2842     skipbytes = 3;
2843     }
2844     /* For conditions that are assertions, we just fall through, having
2845     set bravalue above. */
2846     break;
2847    
2848     case '=': /* Positive lookahead */
2849     bravalue = OP_ASSERT;
2850     ptr++;
2851     break;
2852    
2853     case '!': /* Negative lookahead */
2854     bravalue = OP_ASSERT_NOT;
2855     ptr++;
2856     break;
2857    
2858     case '<': /* Lookbehinds */
2859     switch (*(++ptr))
2860     {
2861     case '=': /* Positive lookbehind */
2862     bravalue = OP_ASSERTBACK;
2863     ptr++;
2864     break;
2865    
2866     case '!': /* Negative lookbehind */
2867     bravalue = OP_ASSERTBACK_NOT;
2868     ptr++;
2869     break;
2870     }
2871     break;
2872    
2873     case '>': /* One-time brackets */
2874     bravalue = OP_ONCE;
2875     ptr++;
2876     break;
2877    
2878     case 'C': /* Callout - may be followed by digits; */
2879     previous_callout = code; /* Save for later completion */
2880     after_manual_callout = 1; /* Skip one item before completing */
2881     *code++ = OP_CALLOUT; /* Already checked that the terminating */
2882     { /* closing parenthesis is present. */
2883     int n = 0;
2884     while ((digitab[*(++ptr)] & ctype_digit) != 0)
2885     n = n * 10 + *ptr - '0';
2886     if (n > 255)
2887     {
2888     *errorcodeptr = ERR38;
2889     goto FAILED;
2890     }
2891     *code++ = n;
2892     PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
2893     PUT(code, LINK_SIZE, 0); /* Default length */
2894     code += 2 * LINK_SIZE;
2895     }
2896     previous = NULL;
2897     continue;
2898    
2899     case 'P': /* Named subpattern handling */
2900     if (*(++ptr) == '<') /* Definition */
2901     {
2902     int i, namelen;
2903     uschar *slot = cd->name_table;
2904     const uschar *name; /* Don't amalgamate; some compilers */
2905     name = ++ptr; /* grumble at autoincrement in declaration */
2906    
2907     while (*ptr++ != '>');
2908     namelen = ptr - name - 1;
2909    
2910     for (i = 0; i < cd->names_found; i++)
2911     {
2912     int crc = memcmp(name, slot+2, namelen);
2913     if (crc == 0)
2914     {
2915     if (slot[2+namelen] == 0)
2916     {
2917     *errorcodeptr = ERR43;
2918     goto FAILED;
2919     }
2920     crc = -1; /* Current name is substring */
2921     }
2922     if (crc < 0)
2923     {
2924     memmove(slot + cd->name_entry_size, slot,
2925     (cd->names_found - i) * cd->name_entry_size);
2926     break;
2927     }
2928     slot += cd->name_entry_size;
2929     }
2930    
2931     PUT2(slot, 0, *brackets + 1);
2932     memcpy(slot + 2, name, namelen);
2933     slot[2+namelen] = 0;
2934     cd->names_found++;
2935     goto NUMBERED_GROUP;
2936     }
2937    
2938     if (*ptr == '=' || *ptr == '>') /* Reference or recursion */
2939     {
2940     int i, namelen;
2941     int type = *ptr++;
2942     const uschar *name = ptr;
2943     uschar *slot = cd->name_table;
2944    
2945     while (*ptr != ')') ptr++;
2946     namelen = ptr - name;
2947    
2948     for (i = 0; i < cd->names_found; i++)
2949     {
2950     if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
2951     slot += cd->name_entry_size;
2952     }
2953     if (i >= cd->names_found)
2954     {
2955     *errorcodeptr = ERR15;
2956     goto FAILED;
2957     }
2958    
2959     recno = GET2(slot, 0);
2960    
2961     if (type == '>') goto HANDLE_RECURSION; /* A few lines below */
2962    
2963     /* Back reference */
2964    
2965     previous = code;
2966     *code++ = OP_REF;
2967     PUT2INC(code, 0, recno);
2968     cd->backref_map |= (recno < 32)? (1 << recno) : 1;
2969     if (recno > cd->top_backref) cd->top_backref = recno;
2970     continue;
2971     }
2972    
2973     /* Should never happen */
2974     break;
2975    
2976     case 'R': /* Pattern recursion */
2977     ptr++; /* Same as (?0) */
2978     /* Fall through */
2979    
2980     /* Recursion or "subroutine" call */
2981    
2982     case '0': case '1': case '2': case '3': case '4':
2983     case '5': case '6': case '7': case '8': case '9':
2984     {
2985     const uschar *called;
2986     recno = 0;
2987     while((digitab[*ptr] & ctype_digit) != 0)
2988     recno = recno * 10 + *ptr++ - '0';
2989    
2990     /* Come here from code above that handles a named recursion */
2991    
2992     HANDLE_RECURSION:
2993    
2994     previous = code;
2995    
2996     /* Find the bracket that is being referenced. Temporarily end the
2997     regex in case it doesn't exist. */
2998    
2999     *code = OP_END;
3000     called = (recno == 0)?
3001     cd->start_code : find_bracket(cd->start_code, utf8, recno);
3002    
3003     if (called == NULL)
3004     {
3005     *errorcodeptr = ERR15;
3006     goto FAILED;
3007     }
3008    
3009     /* If the subpattern is still open, this is a recursive call. We
3010     check to see if this is a left recursion that could loop for ever,
3011     and diagnose that case. */
3012    
3013     if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
3014     {
3015     *errorcodeptr = ERR40;
3016     goto FAILED;
3017     }
3018    
3019     /* Insert the recursion/subroutine item */
3020    
3021     *code = OP_RECURSE;
3022     PUT(code, 1, called - cd->start_code);
3023     code += 1 + LINK_SIZE;
3024     }
3025     continue;
3026    
3027     /* Character after (? not specially recognized */
3028    
3029     default: /* Option setting */
3030     set = unset = 0;
3031     optset = &set;
3032    
3033     while (*ptr != ')' && *ptr != ':')
3034     {
3035     switch (*ptr++)
3036     {
3037     case '-': optset = &unset; break;
3038    
3039     case 'i': *optset |= PCRE_CASELESS; break;
3040     case 'm': *optset |= PCRE_MULTILINE; break;
3041     case 's': *optset |= PCRE_DOTALL; break;
3042     case 'x': *optset |= PCRE_EXTENDED; break;
3043     case 'U': *optset |= PCRE_UNGREEDY; break;
3044     case 'X': *optset |= PCRE_EXTRA; break;
3045     }
3046     }
3047    
3048     /* Set up the changed option bits, but don't change anything yet. */
3049    
3050     newoptions = (options | set) & (~unset);
3051    
3052     /* If the options ended with ')' this is not the start of a nested
3053     group with option changes, so the options change at this level. Compile
3054     code to change the ims options if this setting actually changes any of
3055     them. We also pass the new setting back so that it can be put at the
3056     start of any following branches, and when this group ends (if we are in
3057     a group), a resetting item can be compiled.
3058    
3059     Note that if this item is right at the start of the pattern, the
3060     options will have been abstracted and made global, so there will be no
3061     change to compile. */
3062    
3063     if (*ptr == ')')
3064     {
3065     if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
3066     {
3067     *code++ = OP_OPT;
3068     *code++ = newoptions & PCRE_IMS;
3069     }
3070    
3071     /* Change options at this level, and pass them back for use
3072     in subsequent branches. Reset the greedy defaults and the case
3073     value for firstbyte and reqbyte. */
3074    
3075     *optionsptr = options = newoptions;
3076     greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
3077     greedy_non_default = greedy_default ^ 1;
3078     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3079    
3080     previous = NULL; /* This item can't be repeated */
3081     continue; /* It is complete */
3082     }
3083    
3084     /* If the options ended with ':' we are heading into a nested group
3085     with possible change of options. Such groups are non-capturing and are
3086     not assertions of any kind. All we need to do is skip over the ':';
3087     the newoptions value is handled below. */
3088    
3089     bravalue = OP_BRA;
3090     ptr++;
3091     }
3092     }
3093    
3094     /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
3095     non-capturing and behave like (?:...) brackets */
3096    
3097     else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
3098     {
3099     bravalue = OP_BRA;
3100     }
3101    
3102     /* Else we have a referencing group; adjust the opcode. If the bracket
3103     number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
3104     arrange for the true number to follow later, in an OP_BRANUMBER item. */
3105    
3106     else
3107     {
3108     NUMBERED_GROUP:
3109     if (++(*brackets) > EXTRACT_BASIC_MAX)
3110     {
3111     bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
3112     code[1+LINK_SIZE] = OP_BRANUMBER;
3113     PUT2(code, 2+LINK_SIZE, *brackets);
3114     skipbytes = 3;
3115     }
3116     else bravalue = OP_BRA + *brackets;
3117     }
3118    
3119     /* Process nested bracketed re. Assertions may not be repeated, but other
3120     kinds can be. We copy code into a non-register variable in order to be able
3121     to pass its address because some compilers complain otherwise. Pass in a
3122     new setting for the ims options if they have changed. */
3123    
3124     previous = (bravalue >= OP_ONCE)? code : NULL;
3125     *code = bravalue;
3126     tempcode = code;
3127     tempreqvary = cd->req_varyopt; /* Save value before bracket */
3128    
3129     if (!compile_regex(
3130     newoptions, /* The complete new option state */
3131     options & PCRE_IMS, /* The previous ims option state */
3132     brackets, /* Extracting bracket count */
3133     &tempcode, /* Where to put code (updated) */
3134     &ptr, /* Input pointer (updated) */
3135     errorcodeptr, /* Where to put an error message */
3136     (bravalue == OP_ASSERTBACK ||
3137     bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
3138     skipbytes, /* Skip over OP_COND/OP_BRANUMBER */
3139     &subfirstbyte, /* For possible first char */
3140     &subreqbyte, /* For possible last char */
3141     bcptr, /* Current branch chain */
3142     cd)) /* Tables block */
3143     goto FAILED;
3144    
3145     /* At the end of compiling, code is still pointing to the start of the
3146     group, while tempcode has been updated to point past the end of the group
3147     and any option resetting that may follow it. The pattern pointer (ptr)
3148     is on the bracket. */
3149    
3150     /* If this is a conditional bracket, check that there are no more than
3151     two branches in the group. */
3152    
3153     else if (bravalue == OP_COND)
3154     {
3155     uschar *tc = code;
3156     condcount = 0;
3157    
3158     do {
3159     condcount++;
3160     tc += GET(tc,1);
3161     }
3162     while (*tc != OP_KET);
3163    
3164     if (condcount > 2)
3165     {
3166     *errorcodeptr = ERR27;
3167     goto FAILED;
3168     }
3169    
3170     /* If there is just one branch, we must not make use of its firstbyte or
3171     reqbyte, because this is equivalent to an empty second branch. */
3172    
3173     if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
3174     }
3175    
3176     /* Handle updating of the required and first characters. Update for normal
3177     brackets of all kinds, and conditions with two branches (see code above).
3178     If the bracket is followed by a quantifier with zero repeat, we have to
3179     back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
3180     main loop so that they can be accessed for the back off. */
3181    
3182     zeroreqbyte = reqbyte;
3183     zerofirstbyte = firstbyte;
3184     groupsetfirstbyte = FALSE;
3185    
3186     if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
3187     {
3188     /* If we have not yet set a firstbyte in this branch, take it from the
3189     subpattern, remembering that it was set here so that a repeat of more
3190     than one can replicate it as reqbyte if necessary. If the subpattern has
3191     no firstbyte, set "none" for the whole branch. In both cases, a zero
3192     repeat forces firstbyte to "none". */
3193    
3194     if (firstbyte == REQ_UNSET)
3195     {
3196     if (subfirstbyte >= 0)
3197     {
3198     firstbyte = subfirstbyte;
3199     groupsetfirstbyte = TRUE;
3200     }
3201     else firstbyte = REQ_NONE;
3202     zerofirstbyte = REQ_NONE;
3203     }
3204    
3205     /* If firstbyte was previously set, convert the subpattern's firstbyte
3206     into reqbyte if there wasn't one, using the vary flag that was in
3207     existence beforehand. */
3208    
3209     else if (subfirstbyte >= 0 && subreqbyte < 0)
3210     subreqbyte = subfirstbyte | tempreqvary;
3211    
3212     /* If the subpattern set a required byte (or set a first byte that isn't
3213     really the first byte - see above), set it. */
3214    
3215     if (subreqbyte >= 0) reqbyte = subreqbyte;
3216     }
3217    
3218     /* For a forward assertion, we take the reqbyte, if set. This can be
3219     helpful if the pattern that follows the assertion doesn't set a different
3220     char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
3221     for an assertion, however because it leads to incorrect effect for patterns
3222     such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
3223     of a firstbyte. This is overcome by a scan at the end if there's no
3224     firstbyte, looking for an asserted first char. */
3225    
3226     else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
3227    
3228     /* Now update the main code pointer to the end of the group. */
3229    
3230     code = tempcode;
3231    
3232     /* Error if hit end of pattern */
3233    
3234     if (*ptr != ')')
3235     {
3236     *errorcodeptr = ERR14;
3237     goto FAILED;
3238     }
3239     break;
3240    
3241     /* Check \ for being a real metacharacter; if not, fall through and handle
3242     it as a data character at the start of a string. Escape items are checked
3243     for validity in the pre-compiling pass. */
3244    
3245     case '\\':
3246     tempptr = ptr;
3247     c = check_escape(&ptr, errorcodeptr, *brackets, options, FALSE);
3248    
3249     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
3250     are arranged to be the negation of the corresponding OP_values. For the
3251     back references, the values are ESC_REF plus the reference number. Only
3252     back references and those types that consume a character may be repeated.
3253     We can test for values between ESC_b and ESC_Z for the latter; this may
3254     have to change if any new ones are ever created. */
3255    
3256     if (c < 0)
3257     {
3258     if (-c == ESC_Q) /* Handle start of quoted string */
3259     {
3260     if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
3261     else inescq = TRUE;
3262     continue;
3263     }
3264    
3265     /* For metasequences that actually match a character, we disable the
3266     setting of a first character if it hasn't already been set. */
3267    
3268     if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
3269     firstbyte = REQ_NONE;
3270    
3271     /* Set values to reset to if this is followed by a zero repeat. */
3272    
3273     zerofirstbyte = firstbyte;
3274     zeroreqbyte = reqbyte;
3275    
3276     /* Back references are handled specially */
3277    
3278     if (-c >= ESC_REF)
3279     {
3280     int number = -c - ESC_REF;
3281     previous = code;
3282     *code++ = OP_REF;
3283     PUT2INC(code, 0, number);
3284     }
3285    
3286     /* So are Unicode property matches, if supported. We know that get_ucp
3287     won't fail because it was tested in the pre-pass. */
3288    
3289     #ifdef SUPPORT_UCP
3290     else if (-c == ESC_P || -c == ESC_p)
3291     {
3292     BOOL negated;
3293     int value = get_ucp(&ptr, &negated, errorcodeptr);
3294     previous = code;
3295     *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
3296     *code++ = value;
3297     }
3298     #endif
3299    
3300     /* For the rest, we can obtain the OP value by negating the escape
3301     value */
3302    
3303     else
3304     {
3305     previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
3306     *code++ = -c;
3307     }
3308     continue;
3309     }
3310    
3311     /* We have a data character whose value is in c. In UTF-8 mode it may have
3312     a value > 127. We set its representation in the length/buffer, and then
3313     handle it as a data character. */
3314    
3315     #ifdef SUPPORT_UTF8
3316     if (utf8 && c > 127)
3317     mclength = _pcre_ord2utf8(c, mcbuffer);
3318     else
3319     #endif
3320    
3321     {
3322     mcbuffer[0] = c;
3323     mclength = 1;
3324     }
3325    
3326     goto ONE_CHAR;
3327    
3328     /* Handle a literal character. It is guaranteed not to be whitespace or #
3329     when the extended flag is set. If we are in UTF-8 mode, it may be a
3330     multi-byte literal character. */
3331    
3332     default:
3333     NORMAL_CHAR:
3334     mclength = 1;
3335     mcbuffer[0] = c;
3336    
3337     #ifdef SUPPORT_UTF8
3338     if (utf8 && (c & 0xc0) == 0xc0)
3339     {
3340     while ((ptr[1] & 0xc0) == 0x80)
3341     mcbuffer[mclength++] = *(++ptr);
3342     }
3343     #endif
3344    
3345     /* At this point we have the character's bytes in mcbuffer, and the length
3346     in mclength. When not in UTF-8 mode, the length is always 1. */
3347    
3348     ONE_CHAR:
3349     previous = code;
3350     *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
3351     for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
3352    
3353     /* Set the first and required bytes appropriately. If no previous first
3354     byte, set it from this character, but revert to none on a zero repeat.
3355     Otherwise, leave the firstbyte value alone, and don't change it on a zero
3356     repeat. */
3357    
3358     if (firstbyte == REQ_UNSET)
3359     {
3360     zerofirstbyte = REQ_NONE;
3361     zeroreqbyte = reqbyte;
3362    
3363     /* If the character is more than one byte long, we can set firstbyte
3364     only if it is not to be matched caselessly. */
3365    
3366     if (mclength == 1 || req_caseopt == 0)
3367     {
3368     firstbyte = mcbuffer[0] | req_caseopt;
3369     if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
3370     }
3371     else firstbyte = reqbyte = REQ_NONE;
3372     }
3373    
3374     /* firstbyte was previously set; we can set reqbyte only if the length is
3375     1 or the matching is caseful. */
3376    
3377     else
3378     {
3379     zerofirstbyte = firstbyte;
3380     zeroreqbyte = reqbyte;
3381     if (mclength == 1 || req_caseopt == 0)
3382     reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3383     }
3384    
3385     break; /* End of literal character handling */
3386     }
3387     } /* end of big loop */
3388    
3389     /* Control never reaches here by falling through, only by a goto for all the
3390     error states. Pass back the position in the pattern so that it can be displayed
3391     to the user for diagnosing the error. */
3392    
3393     FAILED:
3394     *ptrptr = ptr;
3395     return FALSE;
3396     }
3397    
3398    
3399    
3400    
3401     /*************************************************
3402     * Compile sequence of alternatives *
3403     *************************************************/
3404    
3405     /* On entry, ptr is pointing past the bracket character, but on return
3406     it points to the closing bracket, or vertical bar, or end of string.
3407     The code variable is pointing at the byte into which the BRA operator has been
3408     stored. If the ims options are changed at the start (for a (?ims: group) or
3409     during any branch, we need to insert an OP_OPT item at the start of every
3410     following branch to ensure they get set correctly at run time, and also pass
3411     the new options into every subsequent branch compile.
3412    
3413     Argument:
3414     options option bits, including any changes for this subpattern
3415     oldims previous settings of ims option bits
3416     brackets -> int containing the number of extracting brackets used
3417     codeptr -> the address of the current code pointer
3418     ptrptr -> the address of the current pattern pointer
3419     errorcodeptr -> pointer to error code variable
3420     lookbehind TRUE if this is a lookbehind assertion
3421     skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER)
3422     firstbyteptr place to put the first required character, or a negative number
3423     reqbyteptr place to put the last required character, or a negative number
3424     bcptr pointer to the chain of currently open branches
3425     cd points to the data block with tables pointers etc.
3426    
3427     Returns: TRUE on success
3428     */
3429    
3430     static BOOL
3431     compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3432     const uschar **ptrptr, int *errorcodeptr, BOOL lookbehind, int skipbytes,
3433     int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
3434     {
3435     const uschar *ptr = *ptrptr;
3436     uschar *code = *codeptr;
3437     uschar *last_branch = code;
3438     uschar *start_bracket = code;
3439     uschar *reverse_count = NULL;
3440     int firstbyte, reqbyte;
3441     int branchfirstbyte, branchreqbyte;
3442     branch_chain bc;
3443    
3444     bc.outer = bcptr;
3445     bc.current = code;
3446    
3447     firstbyte = reqbyte = REQ_UNSET;
3448    
3449     /* Offset is set zero to mark that this bracket is still open */
3450    
3451     PUT(code, 1, 0);
3452     code += 1 + LINK_SIZE + skipbytes;
3453    
3454     /* Loop for each alternative branch */
3455    
3456     for (;;)
3457     {
3458     /* Handle a change of ims options at the start of the branch */
3459    
3460     if ((options & PCRE_IMS) != oldims)
3461     {
3462     *code++ = OP_OPT;
3463     *code++ = options & PCRE_IMS;
3464     }
3465    
3466     /* Set up dummy OP_REVERSE if lookbehind assertion */
3467    
3468     if (lookbehind)
3469     {
3470     *code++ = OP_REVERSE;
3471     reverse_count = code;
3472     PUTINC(code, 0, 0);
3473     }
3474    
3475     /* Now compile the branch */
3476    
3477     if (!compile_branch(&options, brackets, &code, &ptr, errorcodeptr,
3478     &branchfirstbyte, &branchreqbyte, &bc, cd))
3479     {
3480     *ptrptr = ptr;
3481     return FALSE;
3482     }
3483    
3484     /* If this is the first branch, the firstbyte and reqbyte values for the
3485     branch become the values for the regex. */
3486    
3487     if (*last_branch != OP_ALT)
3488     {
3489     firstbyte = branchfirstbyte;
3490     reqbyte = branchreqbyte;
3491     }
3492    
3493     /* If this is not the first branch, the first char and reqbyte have to
3494     match the values from all the previous branches, except that if the previous
3495     value for reqbyte didn't have REQ_VARY set, it can still match, and we set
3496     REQ_VARY for the regex. */
3497    
3498     else
3499     {
3500     /* If we previously had a firstbyte, but it doesn't match the new branch,
3501     we have to abandon the firstbyte for the regex, but if there was previously
3502     no reqbyte, it takes on the value of the old firstbyte. */
3503    
3504     if (firstbyte >= 0 && firstbyte != branchfirstbyte)
3505     {
3506     if (reqbyte < 0) reqbyte = firstbyte;
3507     firstbyte = REQ_NONE;
3508     }
3509    
3510     /* If we (now or from before) have no firstbyte, a firstbyte from the
3511     branch becomes a reqbyte if there isn't a branch reqbyte. */
3512    
3513     if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
3514     branchreqbyte = branchfirstbyte;
3515    
3516     /* Now ensure that the reqbytes match */
3517    
3518     if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
3519     reqbyte = REQ_NONE;
3520     else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
3521     }
3522    
3523     /* If lookbehind, check that this branch matches a fixed-length string,
3524     and put the length into the OP_REVERSE item. Temporarily mark the end of
3525     the branch with OP_END. */
3526    
3527     if (lookbehind)
3528     {
3529     int length;
3530     *code = OP_END;
3531     length = find_fixedlength(last_branch, options);
3532     DPRINTF(("fixed length = %d\n", length));
3533     if (length < 0)
3534     {
3535     *errorcodeptr = (length == -2)? ERR36 : ERR25;
3536     *ptrptr = ptr;
3537     return FALSE;
3538     }
3539     PUT(reverse_count, 0, length);
3540     }
3541    
3542     /* Reached end of expression, either ')' or end of pattern. Go back through
3543     the alternative branches and reverse the chain of offsets, with the field in
3544     the BRA item now becoming an offset to the first alternative. If there are
3545     no alternatives, it points to the end of the group. The length in the
3546     terminating ket is always the length of the whole bracketed item. If any of
3547     the ims options were changed inside the group, compile a resetting op-code
3548     following, except at the very end of the pattern. Return leaving the pointer
3549     at the terminating char. */
3550    
3551     if (*ptr != '|')
3552     {
3553     int length = code - last_branch;
3554     do
3555     {
3556     int prev_length = GET(last_branch, 1);
3557     PUT(last_branch, 1, length);
3558     length = prev_length;
3559     last_branch -= length;
3560     }
3561     while (length > 0);
3562    
3563     /* Fill in the ket */
3564    
3565     *code = OP_KET;
3566     PUT(code, 1, code - start_bracket);
3567     code += 1 + LINK_SIZE;
3568    
3569     /* Resetting option if needed */
3570    
3571     if ((options & PCRE_IMS) != oldims && *ptr == ')')
3572     {
3573     *code++ = OP_OPT;
3574     *code++ = oldims;
3575     }
3576    
3577     /* Set values to pass back */
3578    
3579     *codeptr = code;
3580     *ptrptr = ptr;
3581     *firstbyteptr = firstbyte;
3582     *reqbyteptr = reqbyte;
3583     return TRUE;
3584     }
3585    
3586     /* Another branch follows; insert an "or" node. Its length field points back
3587     to the previous branch while the bracket remains open. At the end the chain
3588     is reversed. It's done like this so that the start of the bracket has a
3589     zero offset until it is closed, making it possible to detect recursion. */
3590    
3591     *code = OP_ALT;
3592     PUT(code, 1, code - last_branch);
3593     bc.current = last_branch = code;
3594     code += 1 + LINK_SIZE;
3595     ptr++;
3596     }
3597     /* Control never reaches here */
3598     }
3599    
3600    
3601    
3602    
3603     /*************************************************
3604     * Check for anchored expression *
3605     *************************************************/
3606    
3607     /* Try to find out if this is an anchored regular expression. Consider each
3608     alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
3609     all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
3610     it's anchored. However, if this is a multiline pattern, then only OP_SOD
3611     counts, since OP_CIRC can match in the middle.
3612    
3613     We can also consider a regex to be anchored if OP_SOM starts all its branches.
3614     This is the code for \G, which means "match at start of match position, taking
3615     into account the match offset".
3616    
3617     A branch is also implicitly anchored if it starts with .* and DOTALL is set,
3618     because that will try the rest of the pattern at all possible matching points,
3619     so there is no point trying again.... er ....
3620    
3621     .... except when the .* appears inside capturing parentheses, and there is a
3622     subsequent back reference to those parentheses. We haven't enough information
3623     to catch that case precisely.
3624    
3625     At first, the best we could do was to detect when .* was in capturing brackets
3626     and the highest back reference was greater than or equal to that level.
3627     However, by keeping a bitmap of the first 31 back references, we can catch some
3628     of the more common cases more precisely.
3629    
3630     Arguments:
3631     code points to start of expression (the bracket)
3632     options points to the options setting
3633     bracket_map a bitmap of which brackets we are inside while testing; this
3634     handles up to substring 31; after that we just have to take
3635     the less precise approach
3636     backref_map the back reference bitmap
3637    
3638     Returns: TRUE or FALSE
3639     */
3640    
3641     static BOOL
3642     is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
3643     unsigned int backref_map)
3644     {
3645     do {
3646     const uschar *scode =
3647     first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE);
3648     register int op = *scode;
3649    
3650     /* Capturing brackets */
3651    
3652     if (op > OP_BRA)
3653     {
3654     int new_map;
3655     op -= OP_BRA;
3656     if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3657     new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3658     if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
3659     }
3660    
3661     /* Other brackets */
3662    
3663     else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3664     {
3665     if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
3666     }
3667    
3668     /* .* is not anchored unless DOTALL is set and it isn't in brackets that
3669     are or may be referenced. */
3670    
3671     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
3672     (*options & PCRE_DOTALL) != 0)
3673     {
3674     if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3675     }
3676    
3677     /* Check for explicit anchoring */
3678    
3679     else if (op != OP_SOD && op != OP_SOM &&
3680     ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
3681     return FALSE;
3682     code += GET(code, 1);
3683     }
3684     while (*code == OP_ALT); /* Loop for each alternative */
3685     return TRUE;
3686     }
3687    
3688    
3689    
3690     /*************************************************
3691     * Check for starting with ^ or .* *
3692     *************************************************/
3693    
3694     /* This is called to find out if every branch starts with ^ or .* so that
3695     "first char" processing can be done to speed things up in multiline
3696     matching and for non-DOTALL patterns that start with .* (which must start at
3697     the beginning or after \n). As in the case of is_anchored() (see above), we
3698     have to take account of back references to capturing brackets that contain .*
3699     because in that case we can't make the assumption.
3700    
3701     Arguments:
3702     code points to start of expression (the bracket)
3703     bracket_map a bitmap of which brackets we are inside while testing; this
3704     handles up to substring 31; after that we just have to take
3705     the less precise approach
3706     backref_map the back reference bitmap
3707    
3708     Returns: TRUE or FALSE
3709     */
3710    
3711     static BOOL
3712     is_startline(const uschar *code, unsigned int bracket_map,
3713     unsigned int backref_map)
3714     {
3715     do {
3716     const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0,
3717     FALSE);
3718     register int op = *scode;
3719    
3720     /* Capturing brackets */
3721    
3722     if (op > OP_BRA)
3723     {
3724     int new_map;
3725     op -= OP_BRA;
3726     if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3727     new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3728     if (!is_startline(scode, new_map, backref_map)) return FALSE;
3729     }
3730    
3731     /* Other brackets */
3732    
3733     else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3734     { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
3735    
3736     /* .* means "start at start or after \n" if it isn't in brackets that
3737     may be referenced. */
3738    
3739     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
3740     {
3741     if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3742     }
3743    
3744     /* Check for explicit circumflex */
3745    
3746     else if (op != OP_CIRC) return FALSE;
3747    
3748     /* Move on to the next alternative */
3749    
3750     code += GET(code, 1);
3751     }
3752     while (*code == OP_ALT); /* Loop for each alternative */
3753     return TRUE;
3754     }
3755    
3756    
3757    
3758     /*************************************************
3759     * Check for asserted fixed first char *
3760     *************************************************/
3761    
3762     /* During compilation, the "first char" settings from forward assertions are
3763     discarded, because they can cause conflicts with actual literals that follow.
3764     However, if we end up without a first char setting for an unanchored pattern,
3765     it is worth scanning the regex to see if there is an initial asserted first
3766     char. If all branches start with the same asserted char, or with a bracket all
3767     of whose alternatives start with the same asserted char (recurse ad lib), then
3768     we return that char, otherwise -1.
3769    
3770     Arguments:
3771     code points to start of expression (the bracket)
3772     options pointer to the options (used to check casing changes)
3773     inassert TRUE if in an assertion
3774    
3775     Returns: -1 or the fixed first char
3776     */
3777    
3778     static int
3779     find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
3780     {
3781     register int c = -1;
3782     do {
3783     int d;
3784     const uschar *scode =
3785     first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
3786     register int op = *scode;
3787    
3788     if (op >= OP_BRA) op = OP_BRA;
3789    
3790     switch(op)
3791     {
3792     default:
3793     return -1;
3794    
3795     case OP_BRA:
3796     case OP_ASSERT:
3797     case OP_ONCE:
3798     case OP_COND:
3799     if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
3800     return -1;
3801     if (c < 0) c = d; else if (c != d) return -1;
3802     break;
3803    
3804     case OP_EXACT: /* Fall through */
3805     scode += 2;
3806    
3807     case OP_CHAR:
3808     case OP_CHARNC:
3809     case OP_PLUS:
3810     case OP_MINPLUS:
3811     if (!inassert) return -1;
3812     if (c < 0)
3813     {
3814     c = scode[1];
3815     if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
3816     }
3817     else if (c != scode[1]) return -1;
3818     break;
3819     }
3820    
3821     code += GET(code, 1);
3822     }
3823     while (*code == OP_ALT);
3824     return c;
3825     }
3826    
3827    
3828    
3829     /*************************************************
3830     * Compile a Regular Expression *
3831     *************************************************/
3832    
3833     /* This function takes a string and returns a pointer to a block of store
3834     holding a compiled version of the expression. The original API for this
3835     function had no error code return variable; it is retained for backwards
3836     compatibility. The new function is given a new name.
3837    
3838     Arguments:
3839     pattern the regular expression
3840     options various option bits
3841     errorcodeptr pointer to error code variable (pcre_compile2() only)
3842     can be NULL if you don't want a code value
3843     errorptr pointer to pointer to error text
3844     erroroffset ptr offset in pattern where error was detected
3845     tables pointer to character tables or NULL
3846    
3847     Returns: pointer to compiled data block, or NULL on error,
3848     with errorptr and erroroffset set
3849     */
3850    
3851 nigel 85 PCRE_EXPORT pcre *
3852 nigel 77 pcre_compile(const char *pattern, int options, const char **errorptr,
3853     int *erroroffset, const unsigned char *tables)
3854     {
3855     return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
3856     }
3857    
3858    
3859 nigel 85 PCRE_EXPORT pcre *
3860 nigel 77 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
3861     const char **errorptr, int *erroroffset, const unsigned char *tables)
3862     {
3863     real_pcre *re;
3864     int length = 1 + LINK_SIZE; /* For initial BRA plus length */
3865     int c, firstbyte, reqbyte;
3866     int bracount = 0;
3867     int branch_extra = 0;
3868     int branch_newextra;
3869     int item_count = -1;
3870     int name_count = 0;
3871     int max_name_size = 0;
3872     int lastitemlength = 0;
3873     int errorcode = 0;
3874     #ifdef SUPPORT_UTF8
3875     BOOL utf8;
3876     BOOL class_utf8;
3877     #endif
3878     BOOL inescq = FALSE;
3879 nigel 81 BOOL capturing;
3880 nigel 77 unsigned int brastackptr = 0;
3881     size_t size;
3882     uschar *code;
3883     const uschar *codestart;
3884     const uschar *ptr;
3885     compile_data compile_block;
3886     int brastack[BRASTACK_SIZE];
3887     uschar bralenstack[BRASTACK_SIZE];
3888    
3889     /* We can't pass back an error message if errorptr is NULL; I guess the best we
3890     can do is just return NULL, but we can set a code value if there is a code
3891     pointer. */
3892    
3893     if (errorptr == NULL)
3894     {
3895     if (errorcodeptr != NULL) *errorcodeptr = 99;
3896     return NULL;
3897     }
3898    
3899     *errorptr = NULL;
3900     if (errorcodeptr != NULL) *errorcodeptr = ERR0;
3901    
3902     /* However, we can give a message for this error */
3903    
3904     if (erroroffset == NULL)
3905     {
3906     errorcode = ERR16;
3907     goto PCRE_EARLY_ERROR_RETURN;
3908     }
3909    
3910     *erroroffset = 0;
3911    
3912     /* Can't support UTF8 unless PCRE has been compiled to include the code. */
3913    
3914     #ifdef SUPPORT_UTF8
3915     utf8 = (options & PCRE_UTF8) != 0;
3916     if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
3917     (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
3918     {
3919     errorcode = ERR44;
3920     goto PCRE_EARLY_ERROR_RETURN;
3921     }
3922     #else
3923     if ((options & PCRE_UTF8) != 0)
3924     {
3925     errorcode = ERR32;
3926     goto PCRE_EARLY_ERROR_RETURN;
3927     }
3928     #endif
3929    
3930     if ((options & ~PUBLIC_OPTIONS) != 0)
3931     {
3932     errorcode = ERR17;
3933     goto PCRE_EARLY_ERROR_RETURN;
3934     }
3935    
3936     /* Set up pointers to the individual character tables */
3937    
3938     if (tables == NULL) tables = _pcre_default_tables;
3939     compile_block.lcc = tables + lcc_offset;
3940     compile_block.fcc = tables + fcc_offset;
3941     compile_block.cbits = tables + cbits_offset;
3942     compile_block.ctypes = tables + ctypes_offset;
3943    
3944     /* Maximum back reference and backref bitmap. This is updated for numeric
3945     references during the first pass, but for named references during the actual
3946     compile pass. The bitmap records up to 31 back references to help in deciding
3947     whether (.*) can be treated as anchored or not. */
3948    
3949     compile_block.top_backref = 0;
3950     compile_block.backref_map = 0;
3951    
3952     /* Reflect pattern for debugging output */
3953    
3954     DPRINTF(("------------------------------------------------------------------\n"));
3955     DPRINTF(("%s\n", pattern));
3956    
3957     /* The first thing to do is to make a pass over the pattern to compute the
3958     amount of store required to hold the compiled code. This does not have to be
3959     perfect as long as errors are overestimates. At the same time we can detect any
3960     flag settings right at the start, and extract them. Make an attempt to correct
3961     for any counted white space if an "extended" flag setting appears late in the
3962     pattern. We can't be so clever for #-comments. */
3963    
3964     ptr = (const uschar *)(pattern - 1);
3965     while ((c = *(++ptr)) != 0)
3966     {
3967     int min, max;
3968     int class_optcount;
3969     int bracket_length;
3970     int duplength;
3971    
3972     /* If we are inside a \Q...\E sequence, all chars are literal */
3973    
3974     if (inescq)
3975     {
3976     if ((options & PCRE_AUTO_CALLOUT) != 0) length += 2 + 2*LINK_SIZE;
3977     goto NORMAL_CHAR;
3978     }
3979    
3980     /* Otherwise, first check for ignored whitespace and comments */
3981    
3982     if ((options & PCRE_EXTENDED) != 0)
3983     {
3984     if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
3985     if (c == '#')
3986     {
3987     /* The space before the ; is to avoid a warning on a silly compiler
3988     on the Macintosh. */
3989     while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
3990     if (c == 0) break;
3991     continue;
3992     }
3993     }
3994    
3995     item_count++; /* Is zero for the first non-comment item */
3996    
3997     /* Allow space for auto callout before every item except quantifiers. */
3998    
3999     if ((options & PCRE_AUTO_CALLOUT) != 0 &&
4000     c != '*' && c != '+' && c != '?' &&
4001     (c != '{' || !is_counted_repeat(ptr + 1)))
4002     length += 2 + 2*LINK_SIZE;
4003    
4004     switch(c)
4005     {
4006     /* A backslashed item may be an escaped data character or it may be a
4007     character type. */
4008    
4009     case '\\':
4010     c = check_escape(&ptr, &errorcode, bracount, options, FALSE);
4011     if (errorcode != 0) goto PCRE_ERROR_RETURN;
4012    
4013     lastitemlength = 1; /* Default length of last item for repeats */
4014    
4015     if (c >= 0) /* Data character */
4016     {
4017     length += 2; /* For a one-byte character */
4018    
4019     #ifdef SUPPORT_UTF8
4020     if (utf8 && c > 127)
4021     {
4022     int i;
4023     for (i = 0; i < _pcre_utf8_table1_size; i++)
4024     if (c <= _pcre_utf8_table1[i]) break;
4025     length += i;
4026     lastitemlength += i;
4027     }
4028     #endif
4029    
4030     continue;
4031     }
4032    
4033     /* If \Q, enter "literal" mode */
4034    
4035     if (-c == ESC_Q)
4036     {
4037     inescq = TRUE;
4038     continue;
4039     }
4040    
4041     /* \X is supported only if Unicode property support is compiled */
4042    
4043     #ifndef SUPPORT_UCP
4044     if (-c == ESC_X)
4045     {
4046     errorcode = ERR45;
4047     goto PCRE_ERROR_RETURN;
4048     }
4049     #endif
4050    
4051     /* \P and \p are for Unicode properties, but only when the support has
4052     been compiled. Each item needs 2 bytes. */
4053    
4054     else if (-c == ESC_P || -c == ESC_p)
4055     {
4056     #ifdef SUPPORT_UCP
4057     BOOL negated;
4058     length += 2;
4059     lastitemlength = 2;
4060     if (get_ucp(&ptr, &negated, &errorcode) < 0) goto PCRE_ERROR_RETURN;
4061     continue;
4062     #else
4063     errorcode = ERR45;
4064     goto PCRE_ERROR_RETURN;
4065     #endif
4066     }
4067    
4068     /* Other escapes need one byte */
4069    
4070     length++;
4071    
4072     /* A back reference needs an additional 2 bytes, plus either one or 5
4073     bytes for a repeat. We also need to keep the value of the highest
4074     back reference. */
4075    
4076     if (c <= -ESC_REF)
4077     {
4078     int refnum = -c - ESC_REF;
4079     compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
4080     if (refnum > compile_block.top_backref)
4081     compile_block.top_backref = refnum;
4082     length += 2; /* For single back reference */
4083     if (ptr[1] == '{' && is_counted_repeat(ptr+2))
4084     {
4085     ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
4086     if (errorcode != 0) goto PCRE_ERROR_RETURN;
4087     if ((min == 0 && (max == 1 || max == -1)) ||
4088     (min == 1 && max == -1))
4089     length++;
4090     else length += 5;
4091     if (ptr[1] == '?') ptr++;
4092     }
4093     }
4094     continue;
4095    
4096     case '^': /* Single-byte metacharacters */
4097     case '.':
4098     case '$':
4099     length++;
4100     lastitemlength = 1;
4101     continue;
4102    
4103     case '*': /* These repeats won't be after brackets; */
4104     case '+': /* those are handled separately */
4105     case '?':
4106     length++;
4107     goto POSESSIVE; /* A few lines below */
4108    
4109     /* This covers the cases of braced repeats after a single char, metachar,
4110     class, or back reference. */
4111    
4112     case '{':
4113     if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
4114     ptr = read_repeat_counts(ptr+1, &min, &max, &errorcode);
4115     if (errorcode != 0) goto PCRE_ERROR_RETURN;
4116    
4117     /* These special cases just insert one extra opcode */
4118    
4119     if ((min == 0 && (max == 1 || max == -1)) ||
4120     (min == 1 && max == -1))
4121     length++;
4122    
4123     /* These cases might insert additional copies of a preceding character. */
4124    
4125     else
4126     {
4127     if (min != 1)
4128     {
4129     length -= lastitemlength; /* Uncount the original char or metachar */
4130     if (min > 0) length += 3 + lastitemlength;
4131     }
4132     length += lastitemlength + ((max > 0)? 3 : 1);
4133     }
4134    
4135     if (ptr[1] == '?') ptr++; /* Needs no extra length */
4136    
4137     POSESSIVE: /* Test for possessive quantifier */
4138     if (ptr[1] == '+')
4139     {
4140     ptr++;
4141     length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */
4142     }
4143     continue;
4144    
4145     /* An alternation contains an offset to the next branch or ket. If any ims
4146     options changed in the previous branch(es), and/or if we are in a
4147     lookbehind assertion, extra space will be needed at the start of the
4148     branch. This is handled by branch_extra. */
4149    
4150     case '|':
4151     length += 1 + LINK_SIZE + branch_extra;
4152     continue;
4153    
4154     /* A character class uses 33 characters provided that all the character
4155     values are less than 256. Otherwise, it uses a bit map for low valued
4156     characters, and individual items for others. Don't worry about character
4157     types that aren't allowed in classes - they'll get picked up during the
4158     compile. A character class that contains only one single-byte character
4159     uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
4160     where we can. (In UTF-8 mode we can do this only for chars < 128.) */
4161    
4162     case '[':
4163     if (*(++ptr) == '^')
4164     {
4165     class_optcount = 10; /* Greater than one */
4166     ptr++;
4167     }
4168     else class_optcount = 0;
4169    
4170     #ifdef SUPPORT_UTF8
4171     class_utf8 = FALSE;
4172     #endif
4173    
4174     /* Written as a "do" so that an initial ']' is taken as data */
4175    
4176     if (*ptr != 0) do
4177     {
4178     /* Inside \Q...\E everything is literal except \E */
4179    
4180     if (inescq)
4181     {
4182     if (*ptr != '\\' || ptr[1] != 'E') goto GET_ONE_CHARACTER;
4183     inescq = FALSE;
4184     ptr += 1;
4185     continue;
4186     }
4187    
4188     /* Outside \Q...\E, check for escapes */
4189    
4190     if (*ptr == '\\')
4191     {
4192     c = check_escape(&ptr, &errorcode, bracount, options, TRUE);
4193     if (errorcode != 0) goto PCRE_ERROR_RETURN;
4194    
4195     /* \b is backspace inside a class; \X is literal */
4196    
4197     if (-c == ESC_b) c = '\b';
4198     else if (-c == ESC_X) c = 'X';
4199    
4200     /* \Q enters quoting mode */
4201    
4202     else if (-c == ESC_Q)
4203     {
4204     inescq = TRUE;
4205     continue;
4206     }
4207    
4208     /* Handle escapes that turn into characters */
4209    
4210     if (c >= 0) goto NON_SPECIAL_CHARACTER;
4211    
4212     /* Escapes that are meta-things. The normal ones just affect the
4213     bit map, but Unicode properties require an XCLASS extended item. */
4214    
4215     else
4216     {
4217     class_optcount = 10; /* \d, \s etc; make sure > 1 */
4218     #ifdef SUPPORT_UTF8
4219     if (-c == ESC_p || -c == ESC_P)
4220     {
4221     if (!class_utf8)
4222     {
4223     class_utf8 = TRUE;
4224     length += LINK_SIZE + 2;
4225     }
4226     length += 2;
4227     }
4228     #endif
4229     }
4230     }
4231    
4232     /* Check the syntax for POSIX stuff. The bits we actually handle are
4233     checked during the real compile phase. */
4234    
4235     else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))
4236     {
4237     ptr++;
4238     class_optcount = 10; /* Make sure > 1 */
4239     }
4240    
4241     /* Anything else increments the possible optimization count. We have to
4242     detect ranges here so that we can compute the number of extra ranges for
4243     caseless wide characters when UCP support is available. If there are wide
4244     characters, we are going to have to use an XCLASS, even for single
4245     characters. */
4246    
4247     else
4248     {
4249     int d;
4250    
4251     GET_ONE_CHARACTER:
4252    
4253     #ifdef SUPPORT_UTF8
4254     if (utf8)
4255     {
4256     int extra = 0;
4257     GETCHARLEN(c, ptr, extra);
4258     ptr += extra;
4259     }
4260     else c = *ptr;
4261     #else
4262     c = *ptr;
4263     #endif
4264    
4265     /* Come here from handling \ above when it escapes to a char value */
4266    
4267     NON_SPECIAL_CHARACTER:
4268     class_optcount++;
4269    
4270     d = -1;
4271     if (ptr[1] == '-')
4272     {
4273     uschar const *hyptr = ptr++;
4274     if (ptr[1] == '\\')
4275     {
4276     ptr++;
4277     d = check_escape(&ptr, &errorcode, bracount, options, TRUE);
4278     if (errorcode != 0) goto PCRE_ERROR_RETURN;
4279     if (-d == ESC_b) d = '\b'; /* backspace */
4280     else if (-d == ESC_X) d = 'X'; /* literal X in a class */
4281     }
4282     else if (ptr[1] != 0 && ptr[1] != ']')
4283     {
4284     ptr++;
4285     #ifdef SUPPORT_UTF8
4286     if (utf8)
4287     {
4288     int extra = 0;
4289     GETCHARLEN(d, ptr, extra);
4290     ptr += extra;
4291     }
4292     else
4293     #endif
4294     d = *ptr;
4295     }
4296     if (d < 0) ptr = hyptr; /* go back to hyphen as data */
4297     }
4298    
4299     /* If d >= 0 we have a range. In UTF-8 mode, if