/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 87 - (hide annotations) (download)
Sat Feb 24 21:41:21 2007 UTC (7 years, 9 months ago) by nigel
File MIME type: text/plain
File size: 162063 byte(s)
Load pcre-6.5 into code/trunk.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 nigel 87 Copyright (c) 1997-2006 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45     #include "pcre_internal.h"
46    
47    
48 nigel 85 /* When DEBUG is defined, we need the pcre_printint() function, which is also
49     used by pcretest. DEBUG is not defined when building a production library. */
50    
51     #ifdef DEBUG
52     #include "pcre_printint.src"
53     #endif
54    
55    
56    
57 nigel 77 /*************************************************
58     * Code parameters and static tables *
59     *************************************************/
60    
61     /* Maximum number of items on the nested bracket stacks at compile time. This
62     applies to the nesting of all kinds of parentheses. It does not limit
63     un-nested, non-capturing parentheses. This number can be made bigger if
64     necessary - it is used to dimension one int and one unsigned char vector at
65     compile time. */
66    
67     #define BRASTACK_SIZE 200
68    
69    
70     /* Table for handling escaped characters in the range '0'-'z'. Positive returns
71     are simple data values; negative values are for special things like \d and so
72     on. Zero means further processing is needed (for things like \x), or the escape
73     is invalid. */
74    
/* Escape lookup table, in an ASCII and an EBCDIC flavour. check_escape()
indexes the ASCII table with (c - '0') for c in '0'..'z', and the EBCDIC table
with (c - 0x48). A positive entry is a plain data value that is returned as is
(for example 7 for \a), a negative entry is -ESC_xxx for a special escape, and
0 means that check_escape() has to examine the character itself. */
75     #if !EBCDIC /* This is the "normal" table for ASCII systems */
76     static const short int escapes[] = {
77     0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
78     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
79     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
80     0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
81     -ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
82     -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
83     '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
84     0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
85     -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
86     0, 0, -ESC_z /* x - z */
87     };
88    
89     #else /* This is the "abnormal" table for EBCDIC systems */
/* The leading comment on each row is the hexadecimal code of its first entry;
the first row starts at 0x48, which is the offset check_escape() subtracts. */
90     static const short int escapes[] = {
91     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
92     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
93     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
94     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
95     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
96     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
97     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
98     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
99     /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
100     /* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p,
101     /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
102     /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
103     /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
104     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
105     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
106     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
107     /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
108     /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
109     /* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0,
110     /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
111     /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
112     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
113     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
114     };
115     #endif
116    
117    
118     /* Tables of names of POSIX character classes and their lengths. The list is
119 nigel 87 terminated by a zero length entry. The first three must be alpha, lower, upper,
120 nigel 77 as this is assumed for handling case independence. */
121    
/* Names of the POSIX character classes. There are 14 names; the parallel
length table below has 15 entries, the last being the 0 terminator. */
122     static const char *const posix_names[] = {
123     "alpha", "lower", "upper",
124     "alnum", "ascii", "blank", "cntrl", "digit", "graph",
125     "print", "punct", "space", "word", "xdigit" };
126    
/* posix_name_lengths[i] is the length of posix_names[i]; the trailing 0 marks
the end of the list. */
127     static const uschar posix_name_lengths[] = {
128     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
129    
130 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
131     base map, with an optional addition or removal of another map. Then, for some
132     classes, there is some additional tweaking: for [:blank:] the vertical space
133     characters are removed, and for [:alpha:] and [:alnum:] the underscore
134     character is removed. The triples in the table consist of the base map offset,
135     second map offset or -1 if no second map, and a non-negative value for map
136     addition or a negative value for map subtraction (if there are two maps). The
137     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
138     remove vertical space characters, 2 => remove underscore. */
139 nigel 77
/* Three ints per POSIX class, in the same order as the names in posix_names[]:
the base bit map offset, the second bit map offset (or -1 if there is none),
and the add/subtract-plus-tweak code. For example, the [:alpha:] row reads
"word map, minus digit map, then remove underscore". */
140     static const int posix_class_maps[] = {
141 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
142     cbit_lower, -1, 0, /* lower */
143     cbit_upper, -1, 0, /* upper */
144     cbit_word, -1, 2, /* alnum - word without underscore */
145     cbit_print, cbit_cntrl, 0, /* ascii */
146     cbit_space, -1, 1, /* blank - a GNU extension */
147     cbit_cntrl, -1, 0, /* cntrl */
148     cbit_digit, -1, 0, /* digit */
149     cbit_graph, -1, 0, /* graph */
150     cbit_print, -1, 0, /* print */
151     cbit_punct, -1, 0, /* punct */
152     cbit_space, -1, 0, /* space */
153     cbit_word, -1, 0, /* word - a Perl extension */
154     cbit_xdigit,-1, 0 /* xdigit */
155 nigel 77 };
156    
157    
158     /* The texts of compile-time error messages. These are "char *" because they
159     are passed to the outside world. */
160    
/* Compile-time error message texts. The numeric comments mark every fifth
index. The uses of ERR1, ERR2, ERR3, ERR4, ERR5, ERR34, ERR37, ERR46 and ERR47
in this file all match the text at the same index, so entry n is evidently the
message for ERRn. NOTE(review): the code that indexes this table is not in view
here - confirm before renumbering or inserting entries. */
161     static const char *error_texts[] = {
162     "no error",
163     "\\ at end of pattern",
164     "\\c at end of pattern",
165     "unrecognized character follows \\",
166     "numbers out of order in {} quantifier",
167     /* 5 */
168     "number too big in {} quantifier",
169     "missing terminating ] for character class",
170     "invalid escape sequence in character class",
171     "range out of order in character class",
172     "nothing to repeat",
173     /* 10 */
174     "operand of unlimited repeat could match the empty string",
175     "internal error: unexpected repeat",
176     "unrecognized character after (?",
177     "POSIX named classes are supported only within a class",
178     "missing )",
179     /* 15 */
180     "reference to non-existent subpattern",
181     "erroffset passed as NULL",
182     "unknown option bit(s) set",
183     "missing ) after comment",
184     "parentheses nested too deeply",
185     /* 20 */
186     "regular expression too large",
187     "failed to get memory",
188     "unmatched parentheses",
189     "internal error: code overflow",
190     "unrecognized character after (?<",
191     /* 25 */
192     "lookbehind assertion is not fixed length",
193     "malformed number after (?(",
194     "conditional group contains more than two branches",
195     "assertion expected after (?(",
196     "(?R or (?digits must be followed by )",
197     /* 30 */
198     "unknown POSIX class name",
199     "POSIX collating elements are not supported",
200     "this version of PCRE is not compiled with PCRE_UTF8 support",
201     "spare error",
202     "character value in \\x{...} sequence is too large",
203     /* 35 */
204     "invalid condition (?(0)",
205     "\\C not allowed in lookbehind assertion",
206     "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
207     "number after (?C is > 255",
208     "closing ) for (?C expected",
209     /* 40 */
210     "recursive call could loop indefinitely",
211     "unrecognized character after (?P",
212     "syntax error after (?P",
213     "two named groups have the same name",
214     "invalid UTF-8 string",
215     /* 45 */
216     "support for \\P, \\p, and \\X has not been compiled",
217     "malformed \\P or \\p sequence",
218     "unknown property name after \\P or \\p"
219     };
220    
221    
222     /* Table to identify digits and hex digits. This is used when compiling
223     patterns. Note that the tables in chartables are dependent on the locale, and
224     may mark arbitrary characters as digits - but the PCRE compiling code expects
225     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
226     a private table here. It costs 256 bytes, but it is a lot faster than doing
227     character value tests (at least in some simple cases I timed), and in some
228     applications one wants PCRE to compile efficiently as well as match
229     efficiently.
230    
231     For convenience, we use the same bit definitions as in chartables:
232    
233     0x04 decimal digit
234     0x08 hexadecimal digit
235    
236     Then we can use ctype_digit and ctype_xdigit in the code. */
237    
/* Locale-independent digit table, in an ASCII and an EBCDIC flavour. digitab[]
has one entry per byte value 0-255: 0x04 marks a decimal digit and 0x08 a
hexadecimal digit, so '0'-'9' carry 0x0c and a-f / A-F carry 0x08. The code in
this file tests it with ctype_digit and ctype_xdigit. The EBCDIC build also
defines ebcdic_chartab[], which check_escape() tests with the mask 0x0E to
decide whether a character is alphameric. */
238     #if !EBCDIC /* This is the "normal" case, for ASCII systems */
239     static const unsigned char digitab[] =
240     {
241     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
242     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
243     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
244     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
245     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
246     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
247     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
248     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
249     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
250     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
251     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
252     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
253     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
254     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
255     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
256     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
257     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
258     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
259     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
260     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
261     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
262     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
263     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
264     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
265     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
266     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
267     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
268     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
269     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
270     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
271     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
272     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
273    
274     #else /* This is the "abnormal" case, for EBCDIC systems */
275     static const unsigned char digitab[] =
276     {
277     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
278     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
279     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
280     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
281     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
282     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
283     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
284     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
285     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
286     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
287     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
288     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- */
289     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
290     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
291     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
292     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
293     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
294     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
295     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
296     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
297     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
298     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
299     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
300     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
301     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
302     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
303     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
304     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
305     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
306     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
307     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
308     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
309    
310     static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
311     0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
312     0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
313     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
314     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
315     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
316     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
317     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
318     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
319     0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
320     0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
321     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
322     0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- */
323     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
324     0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
325     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
326     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
327     0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
328     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
329     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
330     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
331     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
332     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
333     0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
334     0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
335     0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
336     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
337     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
338     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
339     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
340     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
341     0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
342     0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
343     #endif
344    
345    
346     /* Definition to allow mutual recursion */
347    
/* Forward declaration only. The parameter names are omitted here, so refer to
the definition of compile_regex() for the meaning of each argument. */
348     static BOOL
349     compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,
350     int *, int *, branch_chain *, compile_data *);
351    
352    
353    
354     /*************************************************
355     * Handle escapes *
356     *************************************************/
357    
358     /* This function is called when a \ has been encountered. It either returns a
359     positive value for a simple escape such as \n, or a negative value which
360     encodes one of the more complicated things such as \d. When UTF-8 is enabled,
361     a positive value greater than 255 may be returned. On entry, ptr is pointing at
362     the \. On exit, it is on the final character of the escape sequence.
363    
364     Arguments:
365     ptrptr points to the pattern position pointer
366     errorcodeptr points to the errorcode variable
367     bracount number of previous extracting brackets
368     options the options bits
369     isclass TRUE if inside a character class
370    
371     Returns: zero or positive => a data character
372     negative => a special escape sequence
373     on error, errorptr is set
374     */
375    
376     static int
377     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
378     int options, BOOL isclass)
379     {
380 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
381     const uschar *ptr = *ptrptr + 1;
382 nigel 77 int c, i;
383    
384 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
385     ptr--; /* Set pointer back to the last byte */
386    
387 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
388    
389     if (c == 0) *errorcodeptr = ERR1;
390    
391     /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
392     a table. A non-zero result is something that can be returned immediately.
393     Otherwise further processing may be required. */
394    
395     #if !EBCDIC /* ASCII coding */
396     else if (c < '0' || c > 'z') {} /* Not alphameric */
397     else if ((i = escapes[c - '0']) != 0) c = i;
398    
399     #else /* EBCDIC coding */
400     else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
401     else if ((i = escapes[c - 0x48]) != 0) c = i;
402     #endif
403    
404     /* Escapes that need further processing, or are illegal. */
405    
406     else
407     {
408     const uschar *oldptr;
409     switch (c)
410     {
411     /* A number of Perl escapes are not handled by PCRE. We give an explicit
412     error. */
413    
414     case 'l':
415     case 'L':
416     case 'N':
417     case 'u':
418     case 'U':
419     *errorcodeptr = ERR37;
420     break;
421    
422     /* The handling of escape sequences consisting of a string of digits
423     starting with one that is not zero is not straightforward. By experiment,
424     the way Perl works seems to be as follows:
425    
426     Outside a character class, the digits are read as a decimal number. If the
427     number is less than 10, or if there are that many previous extracting
428     left brackets, then it is a back reference. Otherwise, up to three octal
429     digits are read to form an escaped byte. Thus \123 is likely to be octal
430     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
431     value is greater than 377, the least significant 8 bits are taken. Inside a
432     character class, \ followed by a digit is always an octal number. */
433    
434     case '1': case '2': case '3': case '4': case '5':
435     case '6': case '7': case '8': case '9':
436    
437     if (!isclass)
438     {
439     oldptr = ptr;
440     c -= '0';
441     while ((digitab[ptr[1]] & ctype_digit) != 0)
442     c = c * 10 + *(++ptr) - '0';
443     if (c < 10 || c <= bracount)
444     {
445     c = -(ESC_REF + c);
446     break;
447     }
448     ptr = oldptr; /* Put the pointer back and fall through */
449     }
450    
451     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
452     generates a binary zero byte and treats the digit as a following literal.
453     Thus we have to pull back the pointer by one. */
454    
455     if ((c = *ptr) >= '8')
456     {
457     ptr--;
458     c = 0;
459     break;
460     }
461    
462     /* \0 always starts an octal number, but we may drop through to here with a
463     larger first octal digit. */
464    
465     case '0':
466     c -= '0';
467     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
468     c = c * 8 + *(++ptr) - '0';
469     c &= 255; /* Take least significant 8 bits */
470     break;
471    
472 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
473     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
474     treated as a data character. */
475 nigel 77
476     case 'x':
477 nigel 87 if (ptr[1] == '{')
478 nigel 77 {
479     const uschar *pt = ptr + 2;
480 nigel 87 int count = 0;
481    
482 nigel 77 c = 0;
483     while ((digitab[*pt] & ctype_xdigit) != 0)
484     {
485 nigel 87 register int cc = *pt++;
486     if (c == 0 && cc == '0') continue; /* Leading zeroes */
487 nigel 77 count++;
488 nigel 87
489 nigel 77 #if !EBCDIC /* ASCII coding */
490     if (cc >= 'a') cc -= 32; /* Convert to upper case */
491 nigel 87 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
492 nigel 77 #else /* EBCDIC coding */
493     if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
494 nigel 87 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
495 nigel 77 #endif
496     }
497 nigel 87
498 nigel 77 if (*pt == '}')
499     {
500 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
501 nigel 77 ptr = pt;
502     break;
503     }
504 nigel 87
505 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
506     recognize this construct; fall through to the normal \x handling. */
507     }
508    
509 nigel 87 /* Read just a single-byte hex-defined char */
510 nigel 77
511     c = 0;
512     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
513     {
514     int cc; /* Some compilers don't like ++ */
515     cc = *(++ptr); /* in initializers */
516     #if !EBCDIC /* ASCII coding */
517     if (cc >= 'a') cc -= 32; /* Convert to upper case */
518     c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
519     #else /* EBCDIC coding */
520     if (cc <= 'z') cc += 64; /* Convert to upper case */
521     c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
522     #endif
523     }
524     break;
525    
526     /* Other special escapes not starting with a digit are straightforward */
527    
528     case 'c':
529     c = *(++ptr);
530     if (c == 0)
531     {
532     *errorcodeptr = ERR2;
533     return 0;
534     }
535    
536     /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
537     is ASCII-specific, but then the whole concept of \cx is ASCII-specific.
538     (However, an EBCDIC equivalent has now been added.) */
539    
540     #if !EBCDIC /* ASCII coding */
541     if (c >= 'a' && c <= 'z') c -= 32;
542     c ^= 0x40;
543     #else /* EBCDIC coding */
544     if (c >= 'a' && c <= 'z') c += 64;
545     c ^= 0xC0;
546     #endif
547     break;
548    
549     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
550     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
551     for Perl compatibility, it is a literal. This code looks a bit odd, but
552     there used to be some cases other than the default, and there may be again
553     in future, so I haven't "optimized" it. */
554    
555     default:
556     if ((options & PCRE_EXTRA) != 0) switch(c)
557     {
558     default:
559     *errorcodeptr = ERR3;
560     break;
561     }
562     break;
563     }
564     }
565    
566     *ptrptr = ptr;
567     return c;
568     }
569    
570    
571    
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* Parse the property name that follows \P or \p and look it up in the table
of known Unicode property names. This function exists only when PCRE is built
with SUPPORT_UCP. On entry, *ptrptr points at the P or p; on exit it points at
the last character of the escape sequence (or at the character where parsing
stopped, if the sequence is malformed).

The name is either a single character, or a string in braces that may start
with ^ to request negation. A braced name must be shorter than the local name
buffer; a longer one is reported as malformed.

Arguments:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
  dptr           points to an int that is set to the detailed property value
  errorcodeptr   points to the error code variable

Returns:         type value from the property table, or -1 for an invalid
                 type, in which case *errorcodeptr is set to ERR46 (malformed
                 sequence) or ERR47 (unknown name)
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
const uschar *ptr = *ptrptr;
char name[32];
int ch, len, lo, hi;

ch = *(++ptr);
if (ch == 0) goto MALFORMED;

*negptr = FALSE;

/* The simple form: exactly one character names the property. */

if (ch != '{')
  {
  name[0] = ch;
  name[1] = 0;
  }

/* The braced form: an optional ^ and then the name, up to the closing brace.
Running out of pattern or out of buffer space is an error. */

else
  {
  if (ptr[1] == '^')
    {
    *negptr = TRUE;
    ptr++;
    }

  len = 0;
  for (;;)
    {
    if (len >= (int)sizeof(name) - 1) goto MALFORMED;
    ch = *(++ptr);
    if (ch == 0) goto MALFORMED;
    if (ch == '}') break;
    name[len++] = ch;
    }
  name[len] = 0;
  }

*ptrptr = ptr;

/* Binary chop over the name table, which is searched using strcmp()
ordering. */

for (lo = 0, hi = _pcre_utt_size; lo < hi; )
  {
  int mid = (lo + hi) >> 1;
  int cmp = strcmp(name, _pcre_utt[mid].name);
  if (cmp == 0)
    {
    *dptr = _pcre_utt[mid].value;
    return _pcre_utt[mid].type;
    }
  if (cmp > 0) lo = mid + 1; else hi = mid;
  }

/* A well-formed name that is not in the table */

*errorcodeptr = ERR47;
return -1;

/* A badly formed sequence */

MALFORMED:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
661    
662    
663    
664    
665     /*************************************************
666     * Check for counted repeat *
667     *************************************************/
668    
669     /* This function is called when a '{' is encountered in a place where it might
670     start a quantifier. It looks ahead to see if it really is a quantifier or not.
671     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
672     where the ddds are digits.
673    
674     Arguments:
675     p pointer to the first char after '{'
676    
677     Returns: TRUE or FALSE
678     */
679    
680     static BOOL
681     is_counted_repeat(const uschar *p)
682     {
683     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
684     while ((digitab[*p] & ctype_digit) != 0) p++;
685     if (*p == '}') return TRUE;
686    
687     if (*p++ != ',') return FALSE;
688     if (*p == '}') return TRUE;
689    
690     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
691     while ((digitab[*p] & ctype_digit) != 0) p++;
692    
693     return (*p == '}');
694     }
695    
696    
697    
698     /*************************************************
699     * Read repeat counts *
700     *************************************************/
701    
702     /* Read an item of the form {n,m} and return the values. This is called only
703     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
704     so the syntax is guaranteed to be correct, but we need to check the values.
705    
706     Arguments:
707     p pointer to first char after '{'
708     minp pointer to int for min
709     maxp pointer to int for max
710     returned as -1 if no max
711     errorcodeptr points to error code variable
712    
713     Returns: pointer to '}' on success;
714     current ptr on error, with errorcodeptr set non-zero
715     */
716    
717     static const uschar *
718     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
719     {
720     int min = 0;
721     int max = -1;
722    
723 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
724     an integer overflow. */
725    
726 nigel 77 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
727 nigel 81 if (min < 0 || min > 65535)
728     {
729     *errorcodeptr = ERR5;
730     return p;
731     }
732 nigel 77
733 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
734     Also, max must not be less than min. */
735    
736 nigel 77 if (*p == '}') max = min; else
737     {
738     if (*(++p) != '}')
739     {
740     max = 0;
741     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
742 nigel 81 if (max < 0 || max > 65535)
743     {
744     *errorcodeptr = ERR5;
745     return p;
746     }
747 nigel 77 if (max < min)
748     {
749     *errorcodeptr = ERR4;
750     return p;
751     }
752     }
753     }
754    
755 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
756     '}'. */
757 nigel 77
758 nigel 81 *minp = min;
759     *maxp = max;
760 nigel 77 return p;
761     }
762    
763    
764    
765     /*************************************************
766     * Find first significant op code *
767     *************************************************/
768    
769     /* This is called by several functions that scan a compiled expression looking
770     for a fixed first character, or an anchoring op code etc. It skips over things
771     that do not influence this. For some calls, a change of option is important.
772     For some calls, it makes sense to skip negative forward and all backward
773     assertions, and also the \b assertion; for others it does not.
774    
775     Arguments:
776     code pointer to the start of the group
777     options pointer to external options
778     optbit the option bit whose changing is significant, or
779     zero if none are
780     skipassert TRUE if certain assertions are to be skipped
781    
782     Returns: pointer to the first significant opcode
783     */
784    
785     static const uschar*
786     first_significant_code(const uschar *code, int *options, int optbit,
787     BOOL skipassert)
788     {
789     for (;;)
790     {
791     switch ((int)*code)
792     {
793     case OP_OPT:
794     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
795     *options = (int)code[1];
796     code += 2;
797     break;
798    
799     case OP_ASSERT_NOT:
800     case OP_ASSERTBACK:
801     case OP_ASSERTBACK_NOT:
802     if (!skipassert) return code;
803     do code += GET(code, 1); while (*code == OP_ALT);
804     code += _pcre_OP_lengths[*code];
805     break;
806    
807     case OP_WORD_BOUNDARY:
808     case OP_NOT_WORD_BOUNDARY:
809     if (!skipassert) return code;
810     /* Fall through */
811    
812     case OP_CALLOUT:
813     case OP_CREF:
814     case OP_BRANUMBER:
815     code += _pcre_OP_lengths[*code];
816     break;
817    
818     default:
819     return code;
820     }
821     }
822     /* Control never reaches here */
823     }
824    
825    
826    
827    
828     /*************************************************
829     * Find the fixed length of a pattern *
830     *************************************************/
831    
832     /* Scan a pattern and compute the fixed length of subject that will match it,
833     if the length is fixed. This is needed for dealing with backward assertions.
834     In UTF8 mode, the result is in characters rather than bytes.
835    
836     Arguments:
837     code points to the start of the pattern (the bracket)
838     options the compiling options
839    
840     Returns: the fixed length, or -1 if there is no fixed length,
841     or -2 if \C was encountered
842     */
843    
844     static int
845     find_fixedlength(uschar *code, int options)
846     {
847     int length = -1;
848    
849     register int branchlength = 0;
850     register uschar *cc = code + 1 + LINK_SIZE;
851    
852     /* Scan along the opcodes for this branch. If we get to the end of the
853     branch, check the length against that of the other branches. */
854    
855     for (;;)
856     {
857     int d;
858     register int op = *cc;
859     if (op >= OP_BRA) op = OP_BRA;
860    
861     switch (op)
862     {
863     case OP_BRA:
864     case OP_ONCE:
865     case OP_COND:
866     d = find_fixedlength(cc, options);
867     if (d < 0) return d;
868     branchlength += d;
869     do cc += GET(cc, 1); while (*cc == OP_ALT);
870     cc += 1 + LINK_SIZE;
871     break;
872    
873     /* Reached end of a branch; if it's a ket it is the end of a nested
874     call. If it's ALT it is an alternation in a nested call. If it is
875     END it's the end of the outer call. All can be handled by the same code. */
876    
877     case OP_ALT:
878     case OP_KET:
879     case OP_KETRMAX:
880     case OP_KETRMIN:
881     case OP_END:
882     if (length < 0) length = branchlength;
883     else if (length != branchlength) return -1;
884     if (*cc != OP_ALT) return length;
885     cc += 1 + LINK_SIZE;
886     branchlength = 0;
887     break;
888    
889     /* Skip over assertive subpatterns */
890    
891     case OP_ASSERT:
892     case OP_ASSERT_NOT:
893     case OP_ASSERTBACK:
894     case OP_ASSERTBACK_NOT:
895     do cc += GET(cc, 1); while (*cc == OP_ALT);
896     /* Fall through */
897    
898     /* Skip over things that don't match chars */
899    
900     case OP_REVERSE:
901     case OP_BRANUMBER:
902     case OP_CREF:
903     case OP_OPT:
904     case OP_CALLOUT:
905     case OP_SOD:
906     case OP_SOM:
907     case OP_EOD:
908     case OP_EODN:
909     case OP_CIRC:
910     case OP_DOLL:
911     case OP_NOT_WORD_BOUNDARY:
912     case OP_WORD_BOUNDARY:
913     cc += _pcre_OP_lengths[*cc];
914     break;
915    
916     /* Handle literal characters */
917    
918     case OP_CHAR:
919     case OP_CHARNC:
920     branchlength++;
921     cc += 2;
922     #ifdef SUPPORT_UTF8
923     if ((options & PCRE_UTF8) != 0)
924     {
925     while ((*cc & 0xc0) == 0x80) cc++;
926     }
927     #endif
928     break;
929    
930     /* Handle exact repetitions. The count is already in characters, but we
931     need to skip over a multibyte character in UTF8 mode. */
932    
933     case OP_EXACT:
934     branchlength += GET2(cc,1);
935     cc += 4;
936     #ifdef SUPPORT_UTF8
937     if ((options & PCRE_UTF8) != 0)
938     {
939     while((*cc & 0x80) == 0x80) cc++;
940     }
941     #endif
942     break;
943    
944     case OP_TYPEEXACT:
945     branchlength += GET2(cc,1);
946     cc += 4;
947     break;
948    
949     /* Handle single-char matchers */
950    
951     case OP_PROP:
952     case OP_NOTPROP:
953 nigel 87 cc += 2;
954 nigel 77 /* Fall through */
955    
956     case OP_NOT_DIGIT:
957     case OP_DIGIT:
958     case OP_NOT_WHITESPACE:
959     case OP_WHITESPACE:
960     case OP_NOT_WORDCHAR:
961     case OP_WORDCHAR:
962     case OP_ANY:
963     branchlength++;
964     cc++;
965     break;
966    
967     /* The single-byte matcher isn't allowed */
968    
969     case OP_ANYBYTE:
970     return -2;
971    
972     /* Check a class for variable quantification */
973    
974     #ifdef SUPPORT_UTF8
975     case OP_XCLASS:
976     cc += GET(cc, 1) - 33;
977     /* Fall through */
978     #endif
979    
980     case OP_CLASS:
981     case OP_NCLASS:
982     cc += 33;
983    
984     switch (*cc)
985     {
986     case OP_CRSTAR:
987     case OP_CRMINSTAR:
988     case OP_CRQUERY:
989     case OP_CRMINQUERY:
990     return -1;
991    
992     case OP_CRRANGE:
993     case OP_CRMINRANGE:
994     if (GET2(cc,1) != GET2(cc,3)) return -1;
995     branchlength += GET2(cc,1);
996     cc += 5;
997     break;
998    
999     default:
1000     branchlength++;
1001     }
1002     break;
1003    
1004     /* Anything else is variable length */
1005    
1006     default:
1007     return -1;
1008     }
1009     }
1010     /* Control never gets here */
1011     }
1012    
1013    
1014    
1015    
1016     /*************************************************
1017     * Scan compiled regex for numbered bracket *
1018     *************************************************/
1019    
1020     /* This little function scans through a compiled pattern until it finds a
1021     capturing bracket with the given number.
1022    
1023     Arguments:
1024     code points to start of expression
1025     utf8 TRUE in UTF-8 mode
1026     number the required bracket number
1027    
1028     Returns: pointer to the opcode for the bracket, or NULL if not found
1029     */
1030    
1031     static const uschar *
1032     find_bracket(const uschar *code, BOOL utf8, int number)
1033     {
1034     #ifndef SUPPORT_UTF8
1035     utf8 = utf8; /* Stop pedantic compilers complaining */
1036     #endif
1037    
1038     for (;;)
1039     {
1040     register int c = *code;
1041     if (c == OP_END) return NULL;
1042     else if (c > OP_BRA)
1043     {
1044     int n = c - OP_BRA;
1045     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1046     if (n == number) return (uschar *)code;
1047     code += _pcre_OP_lengths[OP_BRA];
1048     }
1049     else
1050     {
1051     code += _pcre_OP_lengths[c];
1052    
1053     #ifdef SUPPORT_UTF8
1054    
1055     /* In UTF-8 mode, opcodes that are followed by a character may be followed
1056     by a multi-byte character. The length in the table is a minimum, so we have
1057     to scan along to skip the extra bytes. All opcodes are less than 128, so we
1058     can use relatively efficient code. */
1059    
1060     if (utf8) switch(c)
1061     {
1062     case OP_CHAR:
1063     case OP_CHARNC:
1064     case OP_EXACT:
1065     case OP_UPTO:
1066     case OP_MINUPTO:
1067     case OP_STAR:
1068     case OP_MINSTAR:
1069     case OP_PLUS:
1070     case OP_MINPLUS:
1071     case OP_QUERY:
1072     case OP_MINQUERY:
1073     while ((*code & 0xc0) == 0x80) code++;
1074     break;
1075    
1076     /* XCLASS is used for classes that cannot be represented just by a bit
1077     map. This includes negated single high-valued characters. The length in
1078     the table is zero; the actual length is stored in the compiled code. */
1079    
1080     case OP_XCLASS:
1081     code += GET(code, 1) + 1;
1082     break;
1083     }
1084     #endif
1085     }
1086     }
1087     }
1088    
1089    
1090    
1091     /*************************************************
1092     * Scan compiled regex for recursion reference *
1093     *************************************************/
1094    
1095     /* This little function scans through a compiled pattern until it finds an
1096     instance of OP_RECURSE.
1097    
1098     Arguments:
1099     code points to start of expression
1100     utf8 TRUE in UTF-8 mode
1101    
1102     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1103     */
1104    
1105     static const uschar *
1106     find_recurse(const uschar *code, BOOL utf8)
1107     {
1108     #ifndef SUPPORT_UTF8
1109     utf8 = utf8; /* Stop pedantic compilers complaining */
1110     #endif
1111    
1112     for (;;)
1113     {
1114     register int c = *code;
1115     if (c == OP_END) return NULL;
1116     else if (c == OP_RECURSE) return code;
1117     else if (c > OP_BRA)
1118     {
1119     code += _pcre_OP_lengths[OP_BRA];
1120     }
1121     else
1122     {
1123     code += _pcre_OP_lengths[c];
1124    
1125     #ifdef SUPPORT_UTF8
1126    
1127     /* In UTF-8 mode, opcodes that are followed by a character may be followed
1128     by a multi-byte character. The length in the table is a minimum, so we have
1129     to scan along to skip the extra bytes. All opcodes are less than 128, so we
1130     can use relatively efficient code. */
1131    
1132     if (utf8) switch(c)
1133     {
1134     case OP_CHAR:
1135     case OP_CHARNC:
1136     case OP_EXACT:
1137     case OP_UPTO:
1138     case OP_MINUPTO:
1139     case OP_STAR:
1140     case OP_MINSTAR:
1141     case OP_PLUS:
1142     case OP_MINPLUS:
1143     case OP_QUERY:
1144     case OP_MINQUERY:
1145     while ((*code & 0xc0) == 0x80) code++;
1146     break;
1147    
1148     /* XCLASS is used for classes that cannot be represented just by a bit
1149     map. This includes negated single high-valued characters. The length in
1150     the table is zero; the actual length is stored in the compiled code. */
1151    
1152     case OP_XCLASS:
1153     code += GET(code, 1) + 1;
1154     break;
1155     }
1156     #endif
1157     }
1158     }
1159     }
1160    
1161    
1162    
1163     /*************************************************
1164     * Scan compiled branch for non-emptiness *
1165     *************************************************/
1166    
1167     /* This function scans through a branch of a compiled pattern to see whether it
1168     can match the empty string or not. It is called only from could_be_empty()
1169     below. Note that first_significant_code() skips over assertions. If we hit an
1170     unclosed bracket, we return "empty" - this means we've struck an inner bracket
1171     whose current branch will already have been scanned.
1172    
1173     Arguments:
1174     code points to start of search
1175     endcode points to where to stop
1176     utf8 TRUE if in UTF8 mode
1177    
1178     Returns: TRUE if what is matched could be empty
1179     */
1180    
1181     static BOOL
1182     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1183     {
1184     register int c;
1185     for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);
1186     code < endcode;
1187     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1188     {
1189     const uschar *ccode;
1190    
1191     c = *code;
1192    
1193     if (c >= OP_BRA)
1194     {
1195     BOOL empty_branch;
1196     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1197    
1198     /* Scan a closed bracket */
1199    
1200     empty_branch = FALSE;
1201     do
1202     {
1203     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1204     empty_branch = TRUE;
1205     code += GET(code, 1);
1206     }
1207     while (*code == OP_ALT);
1208     if (!empty_branch) return FALSE; /* All branches are non-empty */
1209     code += 1 + LINK_SIZE;
1210     c = *code;
1211     }
1212    
1213     else switch (c)
1214     {
1215     /* Check for quantifiers after a class */
1216    
1217     #ifdef SUPPORT_UTF8
1218     case OP_XCLASS:
1219     ccode = code + GET(code, 1);
1220     goto CHECK_CLASS_REPEAT;
1221     #endif
1222    
1223     case OP_CLASS:
1224     case OP_NCLASS:
1225     ccode = code + 33;
1226    
1227     #ifdef SUPPORT_UTF8
1228     CHECK_CLASS_REPEAT:
1229     #endif
1230    
1231     switch (*ccode)
1232     {
1233     case OP_CRSTAR: /* These could be empty; continue */
1234     case OP_CRMINSTAR:
1235     case OP_CRQUERY:
1236     case OP_CRMINQUERY:
1237     break;
1238    
1239     default: /* Non-repeat => class must match */
1240     case OP_CRPLUS: /* These repeats aren't empty */
1241     case OP_CRMINPLUS:
1242     return FALSE;
1243    
1244     case OP_CRRANGE:
1245     case OP_CRMINRANGE:
1246     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1247     break;
1248     }
1249     break;
1250    
1251     /* Opcodes that must match a character */
1252    
1253     case OP_PROP:
1254     case OP_NOTPROP:
1255     case OP_EXTUNI:
1256     case OP_NOT_DIGIT:
1257     case OP_DIGIT:
1258     case OP_NOT_WHITESPACE:
1259     case OP_WHITESPACE:
1260     case OP_NOT_WORDCHAR:
1261     case OP_WORDCHAR:
1262     case OP_ANY:
1263     case OP_ANYBYTE:
1264     case OP_CHAR:
1265     case OP_CHARNC:
1266     case OP_NOT:
1267     case OP_PLUS:
1268     case OP_MINPLUS:
1269     case OP_EXACT:
1270     case OP_NOTPLUS:
1271     case OP_NOTMINPLUS:
1272     case OP_NOTEXACT:
1273     case OP_TYPEPLUS:
1274     case OP_TYPEMINPLUS:
1275     case OP_TYPEEXACT:
1276     return FALSE;
1277    
1278     /* End of branch */
1279    
1280     case OP_KET:
1281     case OP_KETRMAX:
1282     case OP_KETRMIN:
1283     case OP_ALT:
1284     return TRUE;
1285    
1286     /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be
1287     followed by a multibyte character */
1288    
1289     #ifdef SUPPORT_UTF8
1290     case OP_STAR:
1291     case OP_MINSTAR:
1292     case OP_QUERY:
1293     case OP_MINQUERY:
1294     case OP_UPTO:
1295     case OP_MINUPTO:
1296     if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1297     break;
1298     #endif
1299     }
1300     }
1301    
1302     return TRUE;
1303     }
1304    
1305    
1306    
1307     /*************************************************
1308     * Scan compiled regex for non-emptiness *
1309     *************************************************/
1310    
1311     /* This function is called to check for left recursive calls. We want to check
1312     the current branch of the current pattern to see if it could match the empty
1313     string. If it could, we must look outwards for branches at other levels,
1314     stopping when we pass beyond the bracket which is the subject of the recursion.
1315    
1316     Arguments:
1317     code points to start of the recursion
1318     endcode points to where to stop (current RECURSE item)
1319     bcptr points to the chain of current (unclosed) branch starts
1320     utf8 TRUE if in UTF-8 mode
1321    
1322     Returns: TRUE if what is matched could be empty
1323     */
1324    
1325     static BOOL
1326     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1327     BOOL utf8)
1328     {
1329     while (bcptr != NULL && bcptr->current >= code)
1330     {
1331     if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1332     bcptr = bcptr->outer;
1333     }
1334     return TRUE;
1335     }
1336    
1337    
1338    
1339     /*************************************************
1340     * Check for POSIX class syntax *
1341     *************************************************/
1342    
1343     /* This function is called when the sequence "[:" or "[." or "[=" is
1344     encountered in a character class. It checks whether this is followed by an
1345     optional ^ and then a sequence of letters, terminated by a matching ":]" or
1346     ".]" or "=]".
1347    
1348     Argument:
1349     ptr pointer to the initial [
1350     endptr where to return the end pointer
1351     cd pointer to compile data
1352    
1353     Returns: TRUE or FALSE
1354     */
1355    
1356     static BOOL
1357     check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1358     {
1359     int terminator; /* Don't combine these lines; the Solaris cc */
1360     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1361     if (*(++ptr) == '^') ptr++;
1362     while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1363     if (*ptr == terminator && ptr[1] == ']')
1364     {
1365     *endptr = ptr;
1366     return TRUE;
1367     }
1368     return FALSE;
1369     }
1370    
1371    
1372    
1373    
1374     /*************************************************
1375     * Check POSIX class name *
1376     *************************************************/
1377    
1378     /* This function is called to check the name given in a POSIX-style class entry
1379     such as [:alnum:].
1380    
1381     Arguments:
1382     ptr points to the first letter
1383     len the length of the name
1384    
1385     Returns: a value representing the name, or -1 if unknown
1386     */
1387    
1388     static int
1389     check_posix_name(const uschar *ptr, int len)
1390     {
1391     register int yield = 0;
1392     while (posix_name_lengths[yield] != 0)
1393     {
1394     if (len == posix_name_lengths[yield] &&
1395     strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1396     yield++;
1397     }
1398     return -1;
1399     }
1400    
1401    
1402     /*************************************************
1403     * Adjust OP_RECURSE items in repeated group *
1404     *************************************************/
1405    
1406     /* OP_RECURSE items contain an offset from the start of the regex to the group
1407     that is referenced. This means that groups can be replicated for fixed
1408     repetition simply by copying (because the recursion is allowed to refer to
1409     earlier groups that are outside the current group). However, when a group is
1410     optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1411     it, after it has been compiled. This means that any OP_RECURSE items within it
1412     that refer to the group itself or any contained groups have to have their
1413     offsets adjusted. That is the job of this function. Before it is called, the
1414     partially compiled regex must be temporarily terminated with OP_END.
1415    
1416     Arguments:
1417     group points to the start of the group
1418     adjust the amount by which the group is to be moved
1419     utf8 TRUE in UTF-8 mode
1420     cd contains pointers to tables etc.
1421    
1422     Returns: nothing
1423     */
1424    
1425     static void
1426     adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)
1427     {
1428     uschar *ptr = group;
1429     while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1430     {
1431     int offset = GET(ptr, 1);
1432     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1433     ptr += 1 + LINK_SIZE;
1434     }
1435     }
1436    
1437    
1438    
1439     /*************************************************
1440     * Insert an automatic callout point *
1441     *************************************************/
1442    
1443     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1444     callout points before each pattern item.
1445    
1446     Arguments:
1447     code current code pointer
1448     ptr current pattern pointer
1449     cd pointers to tables etc
1450    
1451     Returns: new code pointer
1452     */
1453    
1454     static uschar *
1455     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1456     {
1457     *code++ = OP_CALLOUT;
1458     *code++ = 255;
1459     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1460     PUT(code, LINK_SIZE, 0); /* Default length */
1461     return code + 2*LINK_SIZE;
1462     }
1463    
1464    
1465    
1466     /*************************************************
1467     * Complete a callout item *
1468     *************************************************/
1469    
1470     /* A callout item contains the length of the next item in the pattern, which
1471     we can't fill in till after we have reached the relevant point. This is used
1472     for both automatic and manual callouts.
1473    
1474     Arguments:
1475     previous_callout points to previous callout item
1476     ptr current pattern pointer
1477     cd pointers to tables etc
1478    
1479     Returns: nothing
1480     */
1481    
1482     static void
1483     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1484     {
1485     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1486     PUT(previous_callout, 2 + LINK_SIZE, length);
1487     }
1488    
1489    
1490    
1491     #ifdef SUPPORT_UCP
1492     /*************************************************
1493     * Get othercase range *
1494     *************************************************/
1495    
1496     /* This function is passed the start and end of a class range, in UTF-8 mode
1497     with UCP support. It searches up the characters, looking for internal ranges of
1498     characters in the "other" case. Each call returns the next one, updating the
1499     start address.
1500    
1501     Arguments:
1502     cptr points to starting character value; updated
1503     d end value
1504     ocptr where to put start of othercase range
1505     odptr where to put end of othercase range
1506    
1507     Yield: TRUE when range returned; FALSE when no more
1508     */
1509    
1510     static BOOL
1511     get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)
1512     {
1513 nigel 87 int c, othercase, next;
1514 nigel 77
1515     for (c = *cptr; c <= d; c++)
1516 nigel 87 { if ((othercase = _pcre_ucp_othercase(c)) >= 0) break; }
1517 nigel 77
1518     if (c > d) return FALSE;
1519    
1520     *ocptr = othercase;
1521     next = othercase + 1;
1522    
1523     for (++c; c <= d; c++)
1524     {
1525 nigel 87 if (_pcre_ucp_othercase(c) != next) break;
1526 nigel 77 next++;
1527     }
1528    
1529     *odptr = next - 1;
1530     *cptr = c;
1531    
1532     return TRUE;
1533     }
1534     #endif /* SUPPORT_UCP */
1535    
1536    
1537     /*************************************************
1538     * Compile one branch *
1539     *************************************************/
1540    
1541     /* Scan the pattern, compiling it into the code vector. If the options are
1542     changed during the branch, the pointer is used to change the external options
1543     bits.
1544    
1545     Arguments:
1546     optionsptr pointer to the option bits
1547     brackets points to number of extracting brackets used
1548     codeptr points to the pointer to the current code point
1549     ptrptr points to the current pattern pointer
1550     errorcodeptr points to error code variable
1551     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
1552     reqbyteptr set to the last literal character required, else < 0
1553     bcptr points to current branch chain
1554     cd contains pointers to tables etc.
1555    
1556     Returns: TRUE on success
1557     FALSE, with *errorcodeptr set non-zero on error
1558     */
1559    
1560     static BOOL
1561     compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
1562     const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,
1563     int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
1564     {
1565     int repeat_type, op_type;
1566     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
1567     int bravalue = 0;
1568     int greedy_default, greedy_non_default;
1569     int firstbyte, reqbyte;
1570     int zeroreqbyte, zerofirstbyte;
1571     int req_caseopt, reqvary, tempreqvary;
1572     int condcount = 0;
1573     int options = *optionsptr;
1574     int after_manual_callout = 0;
1575     register int c;
1576     register uschar *code = *codeptr;
1577     uschar *tempcode;
1578     BOOL inescq = FALSE;
1579     BOOL groupsetfirstbyte = FALSE;
1580     const uschar *ptr = *ptrptr;
1581     const uschar *tempptr;
1582     uschar *previous = NULL;
1583     uschar *previous_callout = NULL;
1584     uschar classbits[32];
1585    
1586     #ifdef SUPPORT_UTF8
1587     BOOL class_utf8;
1588     BOOL utf8 = (options & PCRE_UTF8) != 0;
1589     uschar *class_utf8data;
1590     uschar utf8_char[6];
1591     #else
1592     BOOL utf8 = FALSE;
1593     #endif
1594    
1595     /* Set up the default and non-default settings for greediness */
1596    
1597     greedy_default = ((options & PCRE_UNGREEDY) != 0);
1598     greedy_non_default = greedy_default ^ 1;
1599    
1600     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
1601     matching encountered yet". It gets changed to REQ_NONE if we hit something that
1602     matches a non-fixed char first char; reqbyte just remains unset if we never
1603     find one.
1604    
1605     When we hit a repeat whose minimum is zero, we may have to adjust these values
1606     to take the zero repeat into account. This is implemented by setting them to
1607     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
1608     item types that can be repeated set these backoff variables appropriately. */
1609    
1610     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
1611    
1612     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
1613     according to the current setting of the caseless flag. REQ_CASELESS is a bit
1614     value > 255. It is added into the firstbyte or reqbyte variables to record the
1615     case status of the value. This is used only for ASCII characters. */
1616    
1617     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
1618    
1619     /* Switch on next character until the end of the branch */
1620    
1621     for (;; ptr++)
1622     {
1623     BOOL negate_class;
1624     BOOL possessive_quantifier;
1625     BOOL is_quantifier;
1626     int class_charcount;
1627     int class_lastchar;
1628     int newoptions;
1629     int recno;
1630     int skipbytes;
1631     int subreqbyte;
1632     int subfirstbyte;
1633     int mclength;
1634     uschar mcbuffer[8];
1635    
1636     /* Next byte in the pattern */
1637    
1638     c = *ptr;
1639    
1640     /* If in \Q...\E, check for the end; if not, we have a literal */
1641    
1642     if (inescq && c != 0)
1643     {
1644     if (c == '\\' && ptr[1] == 'E')
1645     {
1646     inescq = FALSE;
1647     ptr++;
1648     continue;
1649     }
1650     else
1651     {
1652     if (previous_callout != NULL)
1653     {
1654     complete_callout(previous_callout, ptr, cd);
1655     previous_callout = NULL;
1656     }
1657     if ((options & PCRE_AUTO_CALLOUT) != 0)
1658     {
1659     previous_callout = code;
1660     code = auto_callout(code, ptr, cd);
1661     }
1662     goto NORMAL_CHAR;
1663     }
1664     }
1665    
1666     /* Fill in length of a previous callout, except when the next thing is
1667     a quantifier. */
1668    
1669     is_quantifier = c == '*' || c == '+' || c == '?' ||
1670     (c == '{' && is_counted_repeat(ptr+1));
1671    
1672     if (!is_quantifier && previous_callout != NULL &&
1673     after_manual_callout-- <= 0)
1674     {
1675     complete_callout(previous_callout, ptr, cd);
1676     previous_callout = NULL;
1677     }
1678    
1679     /* In extended mode, skip white space and comments */
1680    
1681     if ((options & PCRE_EXTENDED) != 0)
1682     {
1683     if ((cd->ctypes[c] & ctype_space) != 0) continue;
1684     if (c == '#')
1685     {
1686     /* The space before the ; is to avoid a warning on a silly compiler
1687     on the Macintosh. */
1688     while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
1689     if (c != 0) continue; /* Else fall through to handle end of string */
1690     }
1691     }
1692    
1693     /* No auto callout for quantifiers. */
1694    
1695     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
1696     {
1697     previous_callout = code;
1698     code = auto_callout(code, ptr, cd);
1699     }
1700    
1701     switch(c)
1702     {
1703     /* The branch terminates at end of string, |, or ). */
1704    
1705     case 0:
1706     case '|':
1707     case ')':
1708     *firstbyteptr = firstbyte;
1709     *reqbyteptr = reqbyte;
1710     *codeptr = code;
1711     *ptrptr = ptr;
1712     return TRUE;
1713    
1714     /* Handle single-character metacharacters. In multiline mode, ^ disables
1715     the setting of any following char as a first character. */
1716    
1717     case '^':
1718     if ((options & PCRE_MULTILINE) != 0)
1719     {
1720     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1721     }
1722     previous = NULL;
1723     *code++ = OP_CIRC;
1724     break;
1725    
1726     case '$':
1727     previous = NULL;
1728     *code++ = OP_DOLL;
1729     break;
1730    
1731     /* There can never be a first char if '.' is first, whatever happens about
1732     repeats. The value of reqbyte doesn't change either. */
1733    
1734     case '.':
1735     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1736     zerofirstbyte = firstbyte;
1737     zeroreqbyte = reqbyte;
1738     previous = code;
1739     *code++ = OP_ANY;
1740     break;
1741    
1742 nigel 87 /* Character classes. If the included characters are all < 256, we build a
1743     32-byte bitmap of the permitted characters, except in the special case
1744     where there is only one such character. For negated classes, we build the
1745     map as usual, then invert it at the end. However, we use a different opcode
1746     so that data characters > 255 can be handled correctly.
1747 nigel 77
1748     If the class contains characters outside the 0-255 range, a different
1749     opcode is compiled. It may optionally have a bit map for characters < 256,
1750     but those above are explicitly listed afterwards. A flag byte tells
1751     whether the bitmap is present, and whether this is a negated class or not.
1752     */
1753    
1754     case '[':
1755     previous = code;
1756    
1757     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
1758     they are encountered at the top level, so we'll do that too. */
1759    
1760     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1761     check_posix_syntax(ptr, &tempptr, cd))
1762     {
1763     *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
1764     goto FAILED;
1765     }
1766    
1767     /* If the first character is '^', set the negation flag and skip it. */
1768    
1769     if ((c = *(++ptr)) == '^')
1770     {
1771     negate_class = TRUE;
1772     c = *(++ptr);
1773     }
1774     else
1775     {
1776     negate_class = FALSE;
1777     }
1778    
1779     /* Keep a count of chars with values < 256 so that we can optimize the case
1780     of just a single character (as long as it's < 256). For higher valued UTF-8
1781     characters, we don't yet do any optimization. */
1782    
1783     class_charcount = 0;
1784     class_lastchar = -1;
1785    
1786     #ifdef SUPPORT_UTF8
1787     class_utf8 = FALSE; /* No chars >= 256 */
1788     class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
1789     #endif
1790    
1791     /* Initialize the 32-char bit map to all zeros. We have to build the
1792     map in a temporary bit of store, in case the class contains only 1
1793     character (< 256), because in that case the compiled code doesn't use the
1794     bit map. */
1795    
1796     memset(classbits, 0, 32 * sizeof(uschar));
1797    
1798     /* Process characters until ] is reached. By writing this as a "do" it
1799     means that an initial ] is taken as a data character. The first pass
1800     through the regex checked the overall syntax, so we don't need to be very
1801     strict here. At the start of the loop, c contains the first byte of the
1802     character. */
1803    
1804     do
1805     {
1806     #ifdef SUPPORT_UTF8
1807     if (utf8 && c > 127)
1808     { /* Braces are required because the */
1809     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
1810     }
1811     #endif
1812    
1813     /* Inside \Q...\E everything is literal except \E */
1814    
1815     if (inescq)
1816     {
1817     if (c == '\\' && ptr[1] == 'E')
1818     {
1819     inescq = FALSE;
1820     ptr++;
1821     continue;
1822     }
1823     else goto LONE_SINGLE_CHARACTER;
1824     }
1825    
1826     /* Handle POSIX class names. Perl allows a negation extension of the
1827     form [:^name:]. A square bracket that doesn't match the syntax is
1828     treated as a literal. We also recognize the POSIX constructions
1829     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1830     5.6 and 5.8 do. */
1831    
1832     if (c == '[' &&
1833     (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1834     check_posix_syntax(ptr, &tempptr, cd))
1835     {
1836     BOOL local_negate = FALSE;
1837 nigel 87 int posix_class, taboffset, tabopt;
1838 nigel 77 register const uschar *cbits = cd->cbits;
1839 nigel 87 uschar pbits[32];
1840 nigel 77
1841     if (ptr[1] != ':')
1842     {
1843     *errorcodeptr = ERR31;
1844     goto FAILED;
1845     }
1846    
1847     ptr += 2;
1848     if (*ptr == '^')
1849     {
1850     local_negate = TRUE;
1851     ptr++;
1852     }
1853    
1854     posix_class = check_posix_name(ptr, tempptr - ptr);
1855     if (posix_class < 0)
1856     {
1857     *errorcodeptr = ERR30;
1858     goto FAILED;
1859     }
1860    
1861     /* If matching is caseless, upper and lower are converted to
1862     alpha. This relies on the fact that the class table starts with
1863     alpha, lower, upper as the first 3 entries. */
1864    
1865     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1866     posix_class = 0;
1867    
1868 nigel 87 /* We build the bit map for the POSIX class in a chunk of local store
1869     because we may be adding and subtracting from it, and we don't want to
1870     subtract bits that may be in the main map already. At the end we or the
1871     result into the bit map that is being built. */
1872 nigel 77
1873     posix_class *= 3;
1874 nigel 87
1875     /* Copy in the first table (always present) */
1876    
1877     memcpy(pbits, cbits + posix_class_maps[posix_class],
1878     32 * sizeof(uschar));
1879    
1880     /* If there is a second table, add or remove it as required. */
1881    
1882     taboffset = posix_class_maps[posix_class + 1];
1883     tabopt = posix_class_maps[posix_class + 2];
1884    
1885     if (taboffset >= 0)
1886 nigel 77 {
1887 nigel 87 if (tabopt >= 0)
1888     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
1889 nigel 77 else
1890 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
1891 nigel 77 }
1892    
1893 nigel 87 /* Now see if we need to remove any special characters. An option
1894     value of 1 removes vertical space and 2 removes underscore. */
1895    
1896     if (tabopt < 0) tabopt = -tabopt;
1897     if (tabopt == 1) pbits[1] &= ~0x3c;
1898     else if (tabopt == 2) pbits[11] &= 0x7f;
1899    
1900     /* Add the POSIX table or its complement into the main table that is
1901     being built and we are done. */
1902    
1903     if (local_negate)
1904     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
1905     else
1906     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
1907    
1908 nigel 77 ptr = tempptr + 1;
1909     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
1910     continue; /* End of POSIX syntax handling */
1911     }
1912    
1913     /* Backslash may introduce a single character, or it may introduce one
1914     of the specials, which just set a flag. Escaped items are checked for
1915     validity in the pre-compiling pass. The sequence \b is a special case.
1916     Inside a class (and only there) it is treated as backspace. Elsewhere
1917     it marks a word boundary. Other escapes have preset maps ready to
1918     or into the one we are building. We assume they have more than one
1919     character in them, so set class_charcount bigger than one. */
1920    
1921     if (c == '\\')
1922     {
1923     c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);
1924    
1925     if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
1926     else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
1927     else if (-c == ESC_Q) /* Handle start of quoted string */
1928     {
1929     if (ptr[1] == '\\' && ptr[2] == 'E')
1930     {
1931     ptr += 2; /* avoid empty string */
1932     }
1933     else inescq = TRUE;
1934     continue;
1935     }
1936    
1937     if (c < 0)
1938     {
1939     register const uschar *cbits = cd->cbits;
1940     class_charcount += 2; /* Greater than 1 is what matters */
1941     switch (-c)
1942     {
1943     case ESC_d:
1944     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
1945     continue;
1946    
1947     case ESC_D:
1948     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
1949     continue;
1950    
1951     case ESC_w:
1952     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
1953     continue;
1954    
1955     case ESC_W:
1956     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
1957     continue;
1958    
1959     case ESC_s:
1960     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
1961     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
1962     continue;
1963    
1964     case ESC_S:
1965     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
1966     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
1967     continue;
1968    
1969     #ifdef SUPPORT_UCP
1970     case ESC_p:
1971     case ESC_P:
1972     {
1973     BOOL negated;
1974 nigel 87 int pdata;
1975     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
1976     if (ptype < 0) goto FAILED;
1977 nigel 77 class_utf8 = TRUE;
1978     *class_utf8data++ = ((-c == ESC_p) != negated)?
1979     XCL_PROP : XCL_NOTPROP;
1980 nigel 87 *class_utf8data++ = ptype;
1981     *class_utf8data++ = pdata;
1982 nigel 77 class_charcount -= 2; /* Not a < 256 character */
1983     }
1984     continue;
1985     #endif
1986    
1987     /* Unrecognized escapes are faulted if PCRE is running in its
1988     strict mode. By default, for compatibility with Perl, they are
1989     treated as literals. */
1990    
1991     default:
1992     if ((options & PCRE_EXTRA) != 0)
1993     {
1994     *errorcodeptr = ERR7;
1995     goto FAILED;
1996     }
1997     c = *ptr; /* The final character */
1998     class_charcount -= 2; /* Undo the default count from above */
1999     }
2000     }
2001    
2002     /* Fall through if we have a single character (c >= 0). This may be
2003     > 256 in UTF-8 mode. */
2004    
2005     } /* End of backslash handling */
2006    
2007     /* A single character may be followed by '-' to form a range. However,
2008     Perl does not permit ']' to be the end of the range. A '-' character
2009     here is treated as a literal. */
2010    
2011     if (ptr[1] == '-' && ptr[2] != ']')
2012     {
2013     int d;
2014     ptr += 2;
2015    
2016     #ifdef SUPPORT_UTF8
2017     if (utf8)
2018     { /* Braces are required because the */
2019     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2020     }
2021     else
2022     #endif
2023     d = *ptr; /* Not UTF-8 mode */
2024    
2025     /* The second part of a range can be a single-character escape, but
2026     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2027     in such circumstances. */
2028    
2029     if (d == '\\')
2030     {
2031     const uschar *oldptr = ptr;
2032     d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);
2033    
2034     /* \b is backspace; \X is literal X; any other special means the '-'
2035     was literal */
2036    
2037     if (d < 0)
2038     {
2039     if (d == -ESC_b) d = '\b';
2040     else if (d == -ESC_X) d = 'X'; else
2041     {
2042     ptr = oldptr - 2;
2043     goto LONE_SINGLE_CHARACTER; /* A few lines below */
2044     }
2045     }
2046     }
2047    
2048     /* The check that the two values are in the correct order happens in
2049     the pre-pass. Optimize one-character ranges */
2050    
2051     if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2052    
2053     /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2054     matching, we have to use an XCLASS with extra data items. Caseless
2055     matching for characters > 127 is available only if UCP support is
2056     available. */
2057    
2058     #ifdef SUPPORT_UTF8
2059     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2060     {
2061     class_utf8 = TRUE;
2062    
2063     /* With UCP support, we can find the other case equivalents of
2064     the relevant characters. There may be several ranges. Optimize how
2065     they fit with the basic range. */
2066    
2067     #ifdef SUPPORT_UCP
2068     if ((options & PCRE_CASELESS) != 0)
2069     {
2070     int occ, ocd;
2071     int cc = c;
2072     int origd = d;
2073     while (get_othercase_range(&cc, origd, &occ, &ocd))
2074     {
2075     if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
2076    
2077     if (occ < c && ocd >= c - 1) /* Extend the basic range */
2078     { /* if there is overlap, */
2079     c = occ; /* noting that if occ < c */
2080     continue; /* we can't have ocd > d */
2081     } /* because a subrange is */
2082     if (ocd > d && occ <= d + 1) /* always shorter than */
2083     { /* the basic range. */
2084     d = ocd;
2085     continue;
2086     }
2087    
2088     if (occ == ocd)
2089     {
2090     *class_utf8data++ = XCL_SINGLE;
2091     }
2092     else
2093     {
2094     *class_utf8data++ = XCL_RANGE;
2095     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2096     }
2097     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2098     }
2099     }
2100     #endif /* SUPPORT_UCP */
2101    
2102     /* Now record the original range, possibly modified for UCP caseless
2103     overlapping ranges. */
2104    
2105     *class_utf8data++ = XCL_RANGE;
2106     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2107     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2108    
2109     /* With UCP support, we are done. Without UCP support, there is no
2110     caseless matching for UTF-8 characters > 127; we can use the bit map
2111     for the smaller ones. */
2112    
2113     #ifdef SUPPORT_UCP
2114     continue; /* With next character in the class */
2115     #else
2116     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2117    
2118     /* Adjust upper limit and fall through to set up the map */
2119    
2120     d = 127;
2121    
2122     #endif /* SUPPORT_UCP */
2123     }
2124     #endif /* SUPPORT_UTF8 */
2125    
2126     /* We use the bit map for all cases when not in UTF-8 mode; else
2127     ranges that lie entirely within 0-127 when there is UCP support; else
2128     for partial ranges without UCP support. */
2129    
2130     for (; c <= d; c++)
2131     {
2132     classbits[c/8] |= (1 << (c&7));
2133     if ((options & PCRE_CASELESS) != 0)
2134     {
2135     int uc = cd->fcc[c]; /* flip case */
2136     classbits[uc/8] |= (1 << (uc&7));
2137     }
2138     class_charcount++; /* in case a one-char range */
2139     class_lastchar = c;
2140     }
2141    
2142     continue; /* Go get the next char in the class */
2143     }
2144    
2145     /* Handle a lone single character - we can get here for a normal
2146     non-escape char, or after \ that introduces a single character or for an
2147     apparent range that isn't. */
2148    
2149     LONE_SINGLE_CHARACTER:
2150    
2151     /* Handle a character that cannot go in the bit map */
2152    
2153     #ifdef SUPPORT_UTF8
2154     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2155     {
2156     class_utf8 = TRUE;
2157     *class_utf8data++ = XCL_SINGLE;
2158     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2159    
2160     #ifdef SUPPORT_UCP
2161     if ((options & PCRE_CASELESS) != 0)
2162     {
2163     int othercase;
2164 nigel 87 if ((othercase = _pcre_ucp_othercase(c)) >= 0)
2165 nigel 77 {
2166     *class_utf8data++ = XCL_SINGLE;
2167     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
2168     }
2169     }
2170     #endif /* SUPPORT_UCP */
2171    
2172     }
2173     else
2174     #endif /* SUPPORT_UTF8 */
2175    
2176     /* Handle a single-byte character */
2177     {
2178     classbits[c/8] |= (1 << (c&7));
2179     if ((options & PCRE_CASELESS) != 0)
2180     {
2181     c = cd->fcc[c]; /* flip case */
2182     classbits[c/8] |= (1 << (c&7));
2183     }
2184     class_charcount++;
2185     class_lastchar = c;
2186     }
2187     }
2188    
2189     /* Loop until ']' reached; the check for end of string happens inside the
2190     loop. This "while" is the end of the "do" above. */
2191    
2192     while ((c = *(++ptr)) != ']' || inescq);
2193    
2194     /* If class_charcount is 1, we saw precisely one character whose value is
2195     less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2196     can optimize the negative case only if there were no characters >= 128
2197     because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2198     single-bytes only. This is an historical hangover. Maybe one day we can
2199     tidy these opcodes to handle multi-byte characters.
2200    
2201     The optimization throws away the bit map. We turn the item into a
2202     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2203     that OP_NOT does not support multibyte characters. In the positive case, it
2204     can cause firstbyte to be set. Otherwise, there can be no first char if
2205     this item is first, whatever repeat count may follow. In the case of
2206     reqbyte, save the previous value for reinstating. */
2207    
2208     #ifdef SUPPORT_UTF8
2209     if (class_charcount == 1 &&
2210     (!utf8 ||
2211     (!class_utf8 && (!negate_class || class_lastchar < 128))))
2212    
2213     #else
2214     if (class_charcount == 1)
2215     #endif
2216     {
2217     zeroreqbyte = reqbyte;
2218    
2219     /* The OP_NOT opcode works on one-byte characters only. */
2220    
2221     if (negate_class)
2222     {
2223     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2224     zerofirstbyte = firstbyte;
2225     *code++ = OP_NOT;
2226     *code++ = class_lastchar;
2227     break;
2228     }
2229    
2230     /* For a single, positive character, get the value into mcbuffer, and
2231     then we can handle this with the normal one-character code. */
2232    
2233     #ifdef SUPPORT_UTF8
2234     if (utf8 && class_lastchar > 127)
2235     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
2236     else
2237     #endif
2238     {
2239     mcbuffer[0] = class_lastchar;
2240     mclength = 1;
2241     }
2242     goto ONE_CHAR;
2243     } /* End of 1-char optimization */
2244    
2245     /* The general case - not the one-char optimization. If this is the first
2246     thing in the branch, there can be no first char setting, whatever the
2247     repeat count. Any reqbyte setting must remain unchanged after any kind of
2248     repeat. */
2249    
2250     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2251     zerofirstbyte = firstbyte;
2252     zeroreqbyte = reqbyte;
2253    
2254     /* If there are characters with values > 255, we have to compile an
2255     extended class, with its own opcode. If there are no characters < 256,
2256     we can omit the bitmap. */
2257    
2258     #ifdef SUPPORT_UTF8
2259     if (class_utf8)
2260     {
2261     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2262     *code++ = OP_XCLASS;
2263     code += LINK_SIZE;
2264     *code = negate_class? XCL_NOT : 0;
2265    
2266     /* If the map is required, install it, and move on to the end of
2267     the extra data */
2268    
2269     if (class_charcount > 0)
2270     {
2271     *code++ |= XCL_MAP;
2272     memcpy(code, classbits, 32);
2273     code = class_utf8data;
2274     }
2275    
2276     /* If the map is not required, slide down the extra data. */
2277    
2278     else
2279     {
2280     int len = class_utf8data - (code + 33);
2281     memmove(code + 1, code + 33, len);
2282     code += len + 1;
2283     }
2284    
2285     /* Now fill in the complete length of the item */
2286    
2287     PUT(previous, 1, code - previous);
2288     break; /* End of class handling */
2289     }
2290     #endif
2291    
2292     /* If there are no characters > 255, negate the 32-byte map if necessary,
2293     and copy it into the code vector. If this is the first thing in the branch,
2294     there can be no first char setting, whatever the repeat count. Any reqbyte
2295     setting must remain unchanged after any kind of repeat. */
2296    
2297     if (negate_class)
2298     {
2299     *code++ = OP_NCLASS;
2300     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2301     }
2302     else
2303     {
2304     *code++ = OP_CLASS;
2305     memcpy(code, classbits, 32);
2306     }
2307     code += 32;
2308     break;
2309    
2310     /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2311     has been tested above. */
2312    
2313     case '{':
2314     if (!is_quantifier) goto NORMAL_CHAR;
2315     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
2316     if (*errorcodeptr != 0) goto FAILED;
2317     goto REPEAT;
2318    
2319     case '*':
2320     repeat_min = 0;
2321     repeat_max = -1;
2322     goto REPEAT;
2323    
2324     case '+':
2325     repeat_min = 1;
2326     repeat_max = -1;
2327     goto REPEAT;
2328    
2329     case '?':
2330     repeat_min = 0;
2331     repeat_max = 1;
2332    
2333     REPEAT:
2334     if (previous == NULL)
2335     {
2336     *errorcodeptr = ERR9;
2337     goto FAILED;
2338     }
2339    
2340     if (repeat_min == 0)
2341     {
2342     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2343     reqbyte = zeroreqbyte; /* Ditto */
2344     }
2345    
2346     /* Remember whether this is a variable length repeat */
2347    
2348     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2349    
2350     op_type = 0; /* Default single-char op codes */
2351     possessive_quantifier = FALSE; /* Default not possessive quantifier */
2352    
2353     /* Save start of previous item, in case we have to move it up to make space
2354     for an inserted OP_ONCE for the additional '+' extension. */
2355    
2356     tempcode = previous;
2357    
2358     /* If the next character is '+', we have a possessive quantifier. This
2359     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2360     If the next character is '?' this is a minimizing repeat, by default,
2361     but if PCRE_UNGREEDY is set, it works the other way round. We change the
2362     repeat type to the non-default. */
2363    
2364     if (ptr[1] == '+')
2365     {
2366     repeat_type = 0; /* Force greedy */
2367     possessive_quantifier = TRUE;
2368     ptr++;
2369     }
2370     else if (ptr[1] == '?')
2371     {
2372     repeat_type = greedy_non_default;
2373     ptr++;
2374     }
2375     else repeat_type = greedy_default;
2376    
2377     /* If previous was a recursion, we need to wrap it inside brackets so that
2378     it can be replicated if necessary. */
2379    
2380     if (*previous == OP_RECURSE)
2381     {
2382     memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2383     code += 1 + LINK_SIZE;
2384     *previous = OP_BRA;
2385     PUT(previous, 1, code - previous);
2386     *code = OP_KET;
2387     PUT(code, 1, code - previous);
2388     code += 1 + LINK_SIZE;
2389     }
2390    
2391     /* If previous was a character match, abolish the item and generate a
2392     repeat item instead. If a char item has a minimum of more than one, ensure
2393     that it is set in reqbyte - it might not be if a sequence such as x{3} is
2394     the first thing in a branch because the x will have gone into firstbyte
2395     instead. */
2396    
2397     if (*previous == OP_CHAR || *previous == OP_CHARNC)
2398     {
2399     /* Deal with UTF-8 characters that take up more than one byte. It's
2400     easier to write this out separately than try to macrify it. Use c to
2401     hold the length of the character in bytes, plus 0x80 to flag that it's a
2402     length rather than a small character. */
2403    
2404     #ifdef SUPPORT_UTF8
2405     if (utf8 && (code[-1] & 0x80) != 0)
2406     {
2407     uschar *lastchar = code - 1;
2408     while((*lastchar & 0xc0) == 0x80) lastchar--;
2409     c = code - lastchar; /* Length of UTF-8 character */
2410     memcpy(utf8_char, lastchar, c); /* Save the char */
2411     c |= 0x80; /* Flag c as a length */
2412     }
2413     else
2414     #endif
2415    
2416     /* Handle the case of a single byte - either with no UTF8 support, or
2417     with UTF-8 disabled, or for a UTF-8 character < 128. */
2418    
2419     {
2420     c = code[-1];
2421     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2422     }
2423    
2424     goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
2425     }
2426    
2427     /* If previous was a single negated character ([^a] or similar), we use
2428     one of the special opcodes, replacing it. The code is shared with single-
2429     character repeats by setting op_type to add a suitable offset into
2430     repeat_type. OP_NOT is currently used only for single-byte chars. */
2431    
2432     else if (*previous == OP_NOT)
2433     {
2434     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
2435     c = previous[1];
2436     goto OUTPUT_SINGLE_REPEAT;
2437     }
2438    
2439     /* If previous was a character type match (\d or similar), abolish it and
2440     create a suitable repeat item. The code is shared with single-character
2441     repeats by setting op_type to add a suitable offset into repeat_type. Note
2442     that the Unicode property types will be present only when SUPPORT_UCP is
2443     defined, but we don't wrap the little bits of code here because it just
2444     makes it horribly messy. */
2445    
2446     else if (*previous < OP_EODN)
2447     {
2448     uschar *oldcode;
2449 nigel 87 int prop_type, prop_value;
2450 nigel 77 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
2451     c = *previous;
2452    
2453     OUTPUT_SINGLE_REPEAT:
2454 nigel 87 if (*previous == OP_PROP || *previous == OP_NOTPROP)
2455     {
2456     prop_type = previous[1];
2457     prop_value = previous[2];
2458     }
2459     else prop_type = prop_value = -1;
2460 nigel 77
2461     oldcode = code;
2462     code = previous; /* Usually overwrite previous item */
2463    
2464     /* If the maximum is zero then the minimum must also be zero; Perl allows
2465     this case, so we do too - by simply omitting the item altogether. */
2466    
2467     if (repeat_max == 0) goto END_REPEAT;
2468    
2469     /* All real repeats make it impossible to handle partial matching (maybe
2470     one day we will be able to remove this restriction). */
2471    
2472     if (repeat_max != 1) cd->nopartial = TRUE;
2473    
2474     /* Combine the op_type with the repeat_type */
2475    
2476     repeat_type += op_type;
2477    
2478     /* A minimum of zero is handled either as the special case * or ?, or as
2479     an UPTO, with the maximum given. */
2480    
2481     if (repeat_min == 0)
2482     {
2483     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
2484     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
2485     else
2486     {
2487     *code++ = OP_UPTO + repeat_type;
2488     PUT2INC(code, 0, repeat_max);
2489     }
2490     }
2491    
2492     /* A repeat minimum of 1 is optimized into some special cases. If the
2493     maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
2494     left in place and, if the maximum is greater than 1, we use OP_UPTO with
2495     one less than the maximum. */
2496    
2497     else if (repeat_min == 1)
2498     {
2499     if (repeat_max == -1)
2500     *code++ = OP_PLUS + repeat_type;
2501     else
2502     {
2503     code = oldcode; /* leave previous item in place */
2504     if (repeat_max == 1) goto END_REPEAT;
2505     *code++ = OP_UPTO + repeat_type;
2506     PUT2INC(code, 0, repeat_max - 1);
2507     }
2508     }
2509    
2510     /* The case {n,n} is just an EXACT, while the general case {n,m} is
2511     handled as an EXACT followed by an UPTO. */
2512    
2513     else
2514     {
2515     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
2516     PUT2INC(code, 0, repeat_min);
2517    
2518     /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
2519     we have to insert the character for the previous code. For a repeated
2520 nigel 87 Unicode property match, there are two extra bytes that define the
2521 nigel 77 required property. In UTF-8 mode, long characters have their length in
2522     c, with the 0x80 bit as a flag. */
2523    
2524     if (repeat_max < 0)
2525     {
2526     #ifdef SUPPORT_UTF8
2527     if (utf8 && c >= 128)
2528     {
2529     memcpy(code, utf8_char, c & 7);
2530     code += c & 7;
2531     }
2532     else
2533     #endif
2534     {
2535     *code++ = c;
2536 nigel 87 if (prop_type >= 0)
2537     {
2538     *code++ = prop_type;
2539     *code++ = prop_value;
2540     }
2541 nigel 77 }
2542     *code++ = OP_STAR + repeat_type;
2543     }
2544    
2545     /* Else insert an UPTO if the max is greater than the min, again
2546     preceded by the character, for the previously inserted code. */
2547    
2548     else if (repeat_max != repeat_min)
2549     {
2550     #ifdef SUPPORT_UTF8
2551     if (utf8 && c >= 128)
2552     {
2553     memcpy(code, utf8_char, c & 7);
2554     code += c & 7;
2555     }
2556     else
2557     #endif
2558     *code++ = c;
2559 nigel 87 if (prop_type >= 0)
2560     {
2561     *code++ = prop_type;
2562     *code++ = prop_value;
2563     }
2564 nigel 77 repeat_max -= repeat_min;
2565     *code++ = OP_UPTO + repeat_type;
2566     PUT2INC(code, 0, repeat_max);
2567     }
2568     }
2569    
2570     /* The character or character type itself comes last in all cases. */
2571    
2572     #ifdef SUPPORT_UTF8
2573     if (utf8 && c >= 128)
2574     {
2575     memcpy(code, utf8_char, c & 7);
2576     code += c & 7;
2577     }
2578     else
2579     #endif
2580     *code++ = c;
2581    
2582 nigel 87 /* For a repeated Unicode property match, there are two extra bytes that
2583     define the required property. */
2584 nigel 77
2585     #ifdef SUPPORT_UCP
2586 nigel 87 if (prop_type >= 0)
2587     {
2588     *code++ = prop_type;
2589     *code++ = prop_value;
2590     }
2591 nigel 77 #endif
2592     }
2593    
2594     /* If previous was a character class or a back reference, we put the repeat
2595     stuff after it, but just skip the item if the repeat was {0,0}. */
2596    
2597     else if (*previous == OP_CLASS ||
2598     *previous == OP_NCLASS ||
2599     #ifdef SUPPORT_UTF8
2600     *previous == OP_XCLASS ||
2601     #endif
2602     *previous == OP_REF)
2603     {
2604     if (repeat_max == 0)
2605     {
2606     code = previous;
2607     goto END_REPEAT;
2608     }
2609    
2610     /* All real repeats make it impossible to handle partial matching (maybe
2611     one day we will be able to remove this restriction). */
2612    
2613     if (repeat_max != 1) cd->nopartial = TRUE;
2614    
2615     if (repeat_min == 0 && repeat_max == -1)
2616     *code++ = OP_CRSTAR + repeat_type;
2617     else if (repeat_min == 1 && repeat_max == -1)
2618     *code++ = OP_CRPLUS + repeat_type;
2619     else if (repeat_min == 0 && repeat_max == 1)
2620     *code++ = OP_CRQUERY + repeat_type;
2621     else
2622     {
2623     *code++ = OP_CRRANGE + repeat_type;
2624     PUT2INC(code, 0, repeat_min);
2625     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
2626     PUT2INC(code, 0, repeat_max);
2627     }
2628     }
2629    
2630     /* If previous was a bracket group, we may have to replicate it in certain
2631     cases. */
2632    
2633     else if (*previous >= OP_BRA || *previous == OP_ONCE ||
2634     *previous == OP_COND)
2635     {
2636     register int i;
2637     int ketoffset = 0;
2638     int len = code - previous;
2639     uschar *bralink = NULL;
2640    
2641     /* If the maximum repeat count is unlimited, find the end of the bracket
2642     by scanning through from the start, and compute the offset back to it
2643     from the current code pointer. There may be an OP_OPT setting following
2644     the final KET, so we can't find the end just by going back from the code
2645     pointer. */
2646    
2647     if (repeat_max == -1)
2648     {
2649     register uschar *ket = previous;
2650     do ket += GET(ket, 1); while (*ket != OP_KET);
2651     ketoffset = code - ket;
2652     }
2653    
2654     /* The case of a zero minimum is special because of the need to stick
2655     OP_BRAZERO in front of it, and because the group appears once in the
2656     data, whereas in other cases it appears the minimum number of times. For
2657     this reason, it is simplest to treat this case separately, as otherwise
2658     the code gets far too messy. There are several special subcases when the
2659     minimum is zero. */
2660    
2661     if (repeat_min == 0)
2662     {
2663     /* If the maximum is also zero, we just omit the group from the output
2664     altogether. */
2665    
2666     if (repeat_max == 0)
2667     {
2668     code = previous;
2669     goto END_REPEAT;
2670     }
2671    
2672     /* If the maximum is 1 or unlimited, we just have to stick in the
2673     BRAZERO and do no more at this point. However, we do need to adjust
2674     any OP_RECURSE calls inside the group that refer to the group itself or
2675     any internal group, because the offset is from the start of the whole
2676     regex. Temporarily terminate the pattern while doing this. */
2677    
2678     if (repeat_max <= 1)
2679     {
2680     *code = OP_END;
2681     adjust_recurse(previous, 1, utf8, cd);
2682     memmove(previous+1, previous, len);
2683     code++;
2684     *previous++ = OP_BRAZERO + repeat_type;
2685     }
2686    
2687     /* If the maximum is greater than 1 and limited, we have to replicate
2688     in a nested fashion, sticking OP_BRAZERO before each set of brackets.
2689     The first one has to be handled carefully because it's the original
2690     copy, which has to be moved up. The remainder can be handled by code
2691     that is common with the non-zero minimum case below. We have to
2692     adjust the value of repeat_max, since one less copy is required. Once
2693     again, we may have to adjust any OP_RECURSE calls inside the group. */
2694    
2695     else
2696     {
2697     int offset;
2698     *code = OP_END;
2699     adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);
2700     memmove(previous + 2 + LINK_SIZE, previous, len);
2701     code += 2 + LINK_SIZE;
2702     *previous++ = OP_BRAZERO + repeat_type;
2703     *previous++ = OP_BRA;
2704    
2705     /* We chain together the bracket offset fields that have to be
2706     filled in later when the ends of the brackets are reached. */
2707    
2708     offset = (bralink == NULL)? 0 : previous - bralink;
2709     bralink = previous;
2710     PUTINC(previous, 0, offset);
2711     }
2712    
2713     repeat_max--;
2714     }
2715    
2716     /* If the minimum is greater than zero, replicate the group as many
2717     times as necessary, and adjust the maximum to the number of subsequent
2718     copies that we need. If we set a first char from the group, and didn't
2719     set a required char, copy the latter from the former. */
2720    
2721     else
2722     {
2723     if (repeat_min > 1)
2724     {
2725     if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
2726     for (i = 1; i < repeat_min; i++)
2727     {
2728     memcpy(code, previous, len);
2729     code += len;
2730     }
2731     }
2732     if (repeat_max > 0) repeat_max -= repeat_min;
2733     }
2734    
2735     /* This code is common to both the zero and non-zero minimum cases. If
2736     the maximum is limited, it replicates the group in a nested fashion,
2737     remembering the bracket starts on a stack. In the case of a zero minimum,
2738     the first one was set up above. In all cases the repeat_max now specifies
2739     the number of additional copies needed. */
2740    
2741     if (repeat_max >= 0)
2742     {
2743     for (i = repeat_max - 1; i >= 0; i--)
2744     {
2745     *code++ = OP_BRAZERO + repeat_type;
2746    
2747     /* All but the final copy start a new nesting, maintaining the
2748     chain of brackets outstanding. */
2749    
2750     if (i != 0)
2751     {
2752     int offset;
2753     *code++ = OP_BRA;
2754     offset = (bralink == NULL)? 0 : code - bralink;
2755     bralink = code;
2756     PUTINC(code, 0, offset);
2757     }
2758    
2759     memcpy(code, previous, len);
2760     code += len;
2761     }
2762    
2763     /* Now chain through the pending brackets, and fill in their length
2764     fields (which are holding the chain links pro tem). */
2765    
2766     while (bralink != NULL)
2767     {
2768     int oldlinkoffset;
2769     int offset = code - bralink + 1;
2770     uschar *bra = code - offset;
2771     oldlinkoffset = GET(bra, 1);
2772     bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
2773     *code++ = OP_KET;
2774     PUTINC(code, 0, offset);
2775     PUT(bra, 1, offset);
2776     }
2777     }
2778    
2779     /* If the maximum is unlimited, set a repeater in the final copy. We
2780     can't just offset backwards from the current code point, because we
2781     don't know if there's been an options resetting after the ket. The
2782     correct offset was computed above. */
2783    
2784     else code[-ketoffset] = OP_KETRMAX + repeat_type;
2785     }
2786    
2787     /* Else there's some kind of shambles */
2788    
2789     else
2790     {
2791     *errorcodeptr = ERR11;
2792     goto FAILED;
2793     }
2794    
2795     /* If the character following a repeat is '+', we wrap the entire repeated
2796     item inside OP_ONCE brackets. This is just syntactic sugar, taken from
2797     Sun's Java package. The repeated item starts at tempcode, not at previous,
2798     which might be the first part of a string whose (former) last char we
2799     repeated. However, we don't support '+' after a greediness '?'. */
2800    
2801     if (possessive_quantifier)
2802     {
2803     int len = code - tempcode;
2804     memmove(tempcode + 1+LINK_SIZE, tempcode, len);
2805     code += 1 + LINK_SIZE;
2806     len += 1 + LINK_SIZE;
2807     tempcode[0] = OP_ONCE;
2808     *code++ = OP_KET;
2809     PUTINC(code, 0, len);
2810     PUT(tempcode, 1, len);
2811     }
2812    
2813     /* In all cases we no longer have a previous item. We also set the
2814     "follows varying string" flag for subsequently encountered reqbytes if
2815     it isn't already set and we have just passed a varying length item. */
2816    
2817     END_REPEAT:
2818     previous = NULL;
2819     cd->req_varyopt |= reqvary;
2820     break;
2821    
2822    
2823     /* Start of nested bracket sub-expression, or comment or lookahead or
2824     lookbehind or option setting or condition. First deal with special things
2825     that can come after a bracket; all are introduced by ?, and the appearance
2826     of any of them means that this is not a referencing group. They were
2827     checked for validity in the first pass over the string, so we don't have to
2828     check for syntax errors here. */
2829    
2830     case '(':
2831     newoptions = options;
2832     skipbytes = 0;
2833    
2834     if (*(++ptr) == '?')
2835     {
2836     int set, unset;
2837     int *optset;
2838    
2839     switch (*(++ptr))
2840     {
2841     case '#': /* Comment; skip to ket */
2842     ptr++;
2843     while (*ptr != ')') ptr++;
2844     continue;
2845    
2846     case ':': /* Non-extracting bracket */
2847     bravalue = OP_BRA;
2848     ptr++;
2849     break;
2850    
2851     case '(':
2852     bravalue = OP_COND; /* Conditional group */
2853    
2854     /* Condition to test for recursion */
2855    
2856     if (ptr[1] == 'R')
2857     {
2858     code[1+LINK_SIZE] = OP_CREF;
2859     PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
2860     skipbytes = 3;
2861     ptr += 3;
2862     }
2863    
2864     /* Condition to test for a numbered subpattern match. We know that
2865     if a digit follows ( then there will just be digits until ) because
2866     the syntax was checked in the first pass. */
2867    
2868     else if ((digitab[ptr[1]] && ctype_digit) != 0)
2869     {
2870     int condref; /* Don't amalgamate; some compilers */
2871     condref = *(++ptr) - '0'; /* grumble at autoincrement in declaration */
2872     while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
2873     if (condref == 0)
2874     {
2875     *errorcodeptr = ERR35;
2876     goto FAILED;
2877     }
2878     ptr++;
2879     code[1+LINK_SIZE] = OP_CREF;
2880     PUT2(code, 2+LINK_SIZE, condref);
2881     skipbytes = 3;
2882     }
2883     /* For conditions that are assertions, we just fall through, having
2884     set bravalue above. */
2885     break;
2886    
2887     case '=': /* Positive lookahead */
2888     bravalue = OP_ASSERT;
2889     ptr++;
2890     break;
2891    
2892     case '!': /* Negative lookahead */
2893     bravalue = OP_ASSERT_NOT;
2894     ptr++;
2895     break;
2896    
2897     case '<': /* Lookbehinds */
2898     switch (*(++ptr))
2899     {
2900     case '=': /* Positive lookbehind */
2901     bravalue = OP_ASSERTBACK;
2902     ptr++;
2903     break;
2904    
2905     case '!': /* Negative lookbehind */
2906     bravalue = OP_ASSERTBACK_NOT;
2907     ptr++;
2908     break;
2909     }
2910     break;
2911    
2912     case '>': /* One-time brackets */
2913     bravalue = OP_ONCE;
2914     ptr++;
2915     break;
2916    
2917     case 'C': /* Callout - may be followed by digits; */
2918     previous_callout = code; /* Save for later completion */
2919     after_manual_callout = 1; /* Skip one item before completing */
2920     *code++ = OP_CALLOUT; /* Already checked that the terminating */
2921     { /* closing parenthesis is present. */
2922     int n = 0;
2923     while ((digitab[*(++ptr)] & ctype_digit) != 0)
2924     n = n * 10 + *ptr - '0';
2925     if (n > 255)
2926     {
2927     *errorcodeptr = ERR38;
2928     goto FAILED;
2929     }
2930     *code++ = n;
2931     PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
2932     PUT(code, LINK_SIZE, 0); /* Default length */
2933     code += 2 * LINK_SIZE;
2934     }
2935     previous = NULL;
2936     continue;
2937    
2938     case 'P': /* Named subpattern handling */
2939     if (*(++ptr) == '<') /* Definition */
2940     {
2941     int i, namelen;
2942     uschar *slot = cd->name_table;
2943     const uschar *name; /* Don't amalgamate; some compilers */
2944     name = ++ptr; /* grumble at autoincrement in declaration */
2945    
2946     while (*ptr++ != '>');
2947     namelen = ptr - name - 1;
2948    
2949     for (i = 0; i < cd->names_found; i++)
2950     {
2951     int crc = memcmp(name, slot+2, namelen);
2952     if (crc == 0)
2953     {
2954     if (slot[2+namelen] == 0)
2955     {
2956     *errorcodeptr = ERR43;
2957     goto FAILED;
2958     }
2959     crc = -1; /* Current name is substring */
2960     }
2961     if (crc < 0)
2962     {
2963     memmove(slot + cd->name_entry_size, slot,
2964     (cd->names_found - i) * cd->name_entry_size);
2965     break;
2966     }
2967     slot += cd->name_entry_size;
2968     }
2969    
2970     PUT2(slot, 0, *brackets + 1);
2971     memcpy(slot + 2, name, namelen);
2972     slot[2+namelen] = 0;
2973     cd->names_found++;
2974     goto NUMBERED_GROUP;
2975     }
2976    
2977     if (*ptr == '=' || *ptr == '>') /* Reference or recursion */
2978     {
2979     int i, namelen;
2980     int type = *ptr++;
2981     const uschar *name = ptr;
2982     uschar *slot = cd->name_table;
2983    
2984     while (*ptr != ')') ptr++;
2985     namelen = ptr - name;
2986    
2987     for (i = 0; i < cd->names_found; i++)
2988     {
2989     if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
2990     slot += cd->name_entry_size;
2991     }
2992     if (i >= cd->names_found)
2993     {
2994     *errorcodeptr = ERR15;
2995     goto FAILED;
2996     }
2997    
2998     recno = GET2(slot, 0);
2999    
3000     if (type == '>') goto HANDLE_RECURSION; /* A few lines below */
3001    
3002     /* Back reference */
3003    
3004     previous = code;
3005     *code++ = OP_REF;
3006     PUT2INC(code, 0, recno);
3007     cd->backref_map |= (recno < 32)? (1 << recno) : 1;
3008     if (recno > cd->top_backref) cd->top_backref = recno;
3009     continue;
3010     }
3011    
3012     /* Should never happen */
3013     break;
3014    
3015     case 'R': /* Pattern recursion */
3016     ptr++; /* Same as (?0) */
3017     /* Fall through */
3018    
3019     /* Recursion or "subroutine" call */
3020    
3021     case '0': case '1': case '2': case '3': case '4':
3022     case '5': case '6': case '7': case '8': case '9':
3023     {
3024     const uschar *called;
3025     recno = 0;
3026     while((digitab[*ptr] & ctype_digit) != 0)
3027     recno = recno * 10 + *ptr++ - '0';
3028    
3029     /* Come here from code above that handles a named recursion */
3030    
3031     HANDLE_RECURSION:
3032    
3033     previous = code;
3034    
3035     /* Find the bracket that is being referenced. Temporarily end the
3036     regex in case it doesn't exist. */
3037    
3038     *code = OP_END;
3039     called = (recno == 0)?
3040     cd->start_code : find_bracket(cd->start_code, utf8, recno);
3041    
3042     if (called == NULL)
3043     {
3044     *errorcodeptr = ERR15;
3045     goto FAILED;
3046     }
3047    
3048     /* If the subpattern is still open, this is a recursive call. We
3049     check to see if this is a left recursion that could loop for ever,
3050     and diagnose that case. */
3051    
3052     if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
3053     {
3054     *errorcodeptr = ERR40;
3055     goto FAILED;
3056     }
3057    
3058 nigel 87 /* Insert the recursion/subroutine item, automatically wrapped inside
3059     "once" brackets. */
3060 nigel 77
3061 nigel 87 *code = OP_ONCE;
3062     PUT(code, 1, 2 + 2*LINK_SIZE);
3063     code += 1 + LINK_SIZE;
3064    
3065 nigel 77 *code = OP_RECURSE;
3066     PUT(code, 1, called - cd->start_code);
3067     code += 1 + LINK_SIZE;
3068 nigel 87
3069     *code = OP_KET;
3070     PUT(code, 1, 2 + 2*LINK_SIZE);
3071     code += 1 + LINK_SIZE;
3072 nigel 77 }
3073     continue;
3074    
3075     /* Character after (? not specially recognized */
3076    
3077     default: /* Option setting */
3078     set = unset = 0;
3079     optset = &set;
3080    
3081     while (*ptr != ')' && *ptr != ':')
3082     {
3083     switch (*ptr++)
3084     {
3085     case '-': optset = &unset; break;
3086    
3087     case 'i': *optset |= PCRE_CASELESS; break;
3088     case 'm': *optset |= PCRE_MULTILINE; break;
3089     case 's': *optset |= PCRE_DOTALL; break;
3090     case 'x': *optset |= PCRE_EXTENDED; break;
3091     case 'U': *optset |= PCRE_UNGREEDY; break;
3092     case 'X': *optset |= PCRE_EXTRA; break;
3093     }
3094     }
3095    
3096     /* Set up the changed option bits, but don't change anything yet. */
3097    
3098     newoptions = (options | set) & (~unset);
3099    
3100     /* If the options ended with ')' this is not the start of a nested
3101     group with option changes, so the options change at this level. Compile
3102     code to change the ims options if this setting actually changes any of
3103     them. We also pass the new setting back so that it can be put at the
3104     start of any following branches, and when this group ends (if we are in
3105     a group), a resetting item can be compiled.
3106    
3107     Note that if this item is right at the start of the pattern, the
3108     options will have been abstracted and made global, so there will be no
3109     change to compile. */
3110    
3111     if (*ptr == ')')
3112     {
3113     if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
3114     {
3115     *code++ = OP_OPT;
3116     *code++ = newoptions & PCRE_IMS;
3117     }
3118    
3119     /* Change options at this level, and pass them back for use
3120     in subsequent branches. Reset the greedy defaults and the case
3121     value for firstbyte and reqbyte. */
3122    
3123     *optionsptr = options = newoptions;
3124     greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
3125     greedy_non_default = greedy_default ^ 1;
3126     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3127    
3128     previous = NULL; /* This item can't be repeated */
3129     continue; /* It is complete */
3130     }
3131    
3132     /* If the options ended with ':' we are heading into a nested group
3133     with possible change of options. Such groups are non-capturing and are
3134     not assertions of any kind. All we need to do is skip over the ':';
3135     the newoptions value is handled below. */
3136    
3137     bravalue = OP_BRA;
3138     ptr++;
3139     }
3140     }
3141    
3142     /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
3143     non-capturing and behave like (?:...) brackets */
3144    
3145     else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
3146     {
3147     bravalue = OP_BRA;
3148     }
3149    
3150     /* Else we have a referencing group; adjust the opcode. If the bracket
3151     number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
3152     arrange for the true number to follow later, in an OP_BRANUMBER item. */
3153    
3154     else
3155     {
3156     NUMBERED_GROUP:
3157     if (++(*brackets) > EXTRACT_BASIC_MAX)
3158     {
3159     bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
3160     code[1+LINK_SIZE] = OP_BRANUMBER;
3161     PUT2(code, 2+LINK_SIZE, *brackets);
3162     skipbytes = 3;
3163     }
3164     else bravalue = OP_BRA + *brackets;
3165     }
3166    
3167     /* Process nested bracketed re. Assertions may not be repeated, but other
3168     kinds can be. We copy code into a non-register variable in order to be able
3169     to pass its address because some compilers complain otherwise. Pass in a
3170     new setting for the ims options if they have changed. */
3171    
3172     previous = (bravalue >= OP_ONCE)? code : NULL;
3173     *code = bravalue;
3174     tempcode = code;
3175     tempreqvary = cd->req_varyopt; /* Save value before bracket */
3176    
3177     if (!compile_regex(
3178     newoptions, /* The complete new option state */
3179     options & PCRE_IMS, /* The previous ims option state */
3180     brackets, /* Extracting bracket count */
3181     &tempcode, /* Where to put code (updated) */
3182     &ptr, /* Input pointer (updated) */
3183     errorcodeptr, /* Where to put an error message */
3184     (bravalue == OP_ASSERTBACK ||
3185     bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
3186     skipbytes, /* Skip over OP_COND/OP_BRANUMBER */
3187     &subfirstbyte, /* For possible first char */
3188     &subreqbyte, /* For possible last char */
3189     bcptr, /* Current branch chain */
3190     cd)) /* Tables block */
3191     goto FAILED;
3192    
3193     /* At the end of compiling, code is still pointing to the start of the
3194     group, while tempcode has been updated to point past the end of the group
3195     and any option resetting that may follow it. The pattern pointer (ptr)
3196     is on the bracket. */
3197    
3198     /* If this is a conditional bracket, check that there are no more than
3199     two branches in the group. */
3200    
3201     else if (bravalue == OP_COND)
3202     {
3203     uschar *tc = code;
3204     condcount = 0;
3205    
3206     do {
3207     condcount++;
3208     tc += GET(tc,1);
3209     }
3210     while (*tc != OP_KET);
3211    
3212     if (condcount > 2)
3213     {
3214     *errorcodeptr = ERR27;
3215     goto FAILED;
3216     }
3217    
3218     /* If there is just one branch, we must not make use of its firstbyte or
3219     reqbyte, because this is equivalent to an empty second branch. */
3220    
3221     if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
3222     }
3223    
3224     /* Handle updating of the required and first characters. Update for normal
3225     brackets of all kinds, and conditions with two branches (see code above).
3226     If the bracket is followed by a quantifier with zero repeat, we have to
3227     back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
3228     main loop so that they can be accessed for the back off. */
3229    
3230     zeroreqbyte = reqbyte;
3231     zerofirstbyte = firstbyte;
3232     groupsetfirstbyte = FALSE;
3233    
3234     if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
3235     {
3236     /* If we have not yet set a firstbyte in this branch, take it from the
3237     subpattern, remembering that it was set here so that a repeat of more
3238     than one can replicate it as reqbyte if necessary. If the subpattern has
3239     no firstbyte, set "none" for the whole branch. In both cases, a zero
3240     repeat forces firstbyte to "none". */
3241    
3242     if (firstbyte == REQ_UNSET)
3243     {
3244     if (subfirstbyte >= 0)
3245     {
3246     firstbyte = subfirstbyte;
3247     groupsetfirstbyte = TRUE;
3248     }
3249     else firstbyte = REQ_NONE;
3250     zerofirstbyte = REQ_NONE;
3251     }
3252    
3253     /* If firstbyte was previously set, convert the subpattern's firstbyte
3254     into reqbyte if there wasn't one, using the vary flag that was in
3255     existence beforehand. */
3256    
3257     else if (subfirstbyte >= 0 && subreqbyte < 0)
3258     subreqbyte = subfirstbyte | tempreqvary;
3259    
3260     /* If the subpattern set a required byte (or set a first byte that isn't
3261     really the first byte - see above), set it. */
3262    
3263     if (subreqbyte >= 0) reqbyte = subreqbyte;
3264     }
3265    
3266     /* For a forward assertion, we take the reqbyte, if set. This can be
3267     helpful if the pattern that follows the assertion doesn't set a different
3268     char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
3269     for an assertion, however because it leads to incorrect effect for patterns
3270     such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
3271     of a firstbyte. This is overcome by a scan at the end if there's no
3272     firstbyte, looking for an asserted first char. */
3273    
3274     else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
3275    
3276     /* Now update the main code pointer to the end of the group. */
3277    
3278     code = tempcode;
3279    
3280     /* Error if hit end of pattern */
3281    
3282     if (*ptr != ')')
3283     {
3284     *errorcodeptr = ERR14;
3285     goto FAILED;
3286     }
3287     break;
3288    
3289     /* Check \ for being a real metacharacter; if not, fall through and handle
3290     it as a data character at the start of a string. Escape items are checked
3291     for validity in the pre-compiling pass. */
3292    
3293     case '\\':
3294     tempptr = ptr;
3295     c = check_escape(&ptr, errorcodeptr, *brackets, options, FALSE);
3296    
3297     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
3298     are arranged to be the negation of the corresponding OP_values. For the
3299     back references, the values are ESC_REF plus the reference number. Only
3300     back references and those types that consume a character may be repeated.
3301     We can test for values between ESC_b and ESC_Z for the latter; this may
3302     have to change if any new ones are ever created. */
3303    
3304     if (c < 0)
3305     {
3306     if (-c == ESC_Q) /* Handle start of quoted string */
3307     {
3308     if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
3309     else inescq = TRUE;
3310     continue;
3311     }
3312    
3313     /* For metasequences that actually match a character, we disable the
3314     setting of a first character if it hasn't already been set. */
3315    
3316     if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
3317     firstbyte = REQ_NONE;
3318    
3319     /* Set values to reset to if this is followed by a zero repeat. */
3320    
3321     zerofirstbyte = firstbyte;
3322     zeroreqbyte = reqbyte;
3323    
3324     /* Back references are handled specially */
3325    
3326     if (-c >= ESC_REF)
3327     {
3328     int number = -c - ESC_REF;
3329     previous = code;
3330     *code++ = OP_REF;
3331     PUT2INC(code, 0, number);
3332     }
3333    
3334     /* So are Unicode property matches, if supported. We know that get_ucp
3335     won't fail because it was tested in the pre-pass. */
3336    
3337     #ifdef SUPPORT_UCP
3338     else if (-c == ESC_P || -c == ESC_p)
3339     {
3340     BOOL negated;
3341 nigel 87 int pdata;
3342     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3343 nigel 77 previous = code;
3344     *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
3345 nigel 87 *code++ = ptype;
3346     *code++ = pdata;
3347 nigel 77 }
3348     #endif
3349    
3350     /* For the rest, we can obtain the OP value by negating the escape
3351     value */
3352    
3353     else
3354     {
3355     previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
3356     *code++ = -c;
3357     }
3358     continue;
3359     }
3360    
3361     /* We have a data character whose value is in c. In UTF-8 mode it may have
3362     a value > 127. We set its representation in the length/buffer, and then
3363     handle it as a data character. */
3364    
3365     #ifdef SUPPORT_UTF8
3366     if (utf8 && c > 127)
3367     mclength = _pcre_ord2utf8(c, mcbuffer);
3368     else
3369     #endif
3370    
3371     {
3372     mcbuffer[0] = c;
3373     mclength = 1;
3374     }
3375    
3376     goto ONE_CHAR;
3377    
3378     /* Handle a literal character. It is guaranteed not to be whitespace or #
3379     when the extended flag is set. If we are in UTF-8 mode, it may be a
3380     multi-byte literal character. */
3381    
3382     default:
3383     NORMAL_CHAR:
3384     mclength = 1;
3385     mcbuffer[0] = c;
3386    
3387     #ifdef SUPPORT_UTF8
3388     if (utf8 && (c & 0xc0) == 0xc0)
3389     {
3390     while ((ptr[1] & 0xc0) == 0x80)
3391     mcbuffer[mclength++] = *(++ptr);
3392     }
3393     #endif
3394    
3395     /* At this point we have the character's bytes in mcbuffer, and the length
3396     in mclength. When not in UTF-8 mode, the length is always 1. */
3397    
3398     ONE_CHAR:
3399     previous = code;
3400     *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
3401     for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
3402    
3403     /* Set the first and required bytes appropriately. If no previous first
3404     byte, set it from this character, but revert to none on a zero repeat.
3405     Otherwise, leave the firstbyte value alone, and don't change it on a zero
3406     repeat. */
3407    
3408     if (firstbyte == REQ_UNSET)
3409     {
3410     zerofirstbyte = REQ_NONE;
3411     zeroreqbyte = reqbyte;
3412    
3413     /* If the character is more than one byte long, we can set firstbyte
3414     only if it is not to be matched caselessly. */
3415    
3416     if (mclength == 1 || req_caseopt == 0)
3417     {
3418     firstbyte = mcbuffer[0] | req_caseopt;
3419     if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
3420     }
3421     else firstbyte = reqbyte = REQ_NONE;
3422     }
3423    
3424     /* firstbyte was previously set; we can set reqbyte only if the length is
3425     1 or the matching is caseful. */
3426    
3427     else
3428     {
3429     zerofirstbyte = firstbyte;
3430     zeroreqbyte = reqbyte;
3431     if (mclength == 1 || req_caseopt == 0)
3432     reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3433     }
3434    
3435     break; /* End of literal character handling */
3436     }
3437     } /* end of big loop */
3438    
3439     /* Control never reaches here by falling through, only by a goto for all the
3440     error states. Pass back the position in the pattern so that it can be displayed
3441     to the user for diagnosing the error. */
3442    
3443     FAILED:
3444     *ptrptr = ptr;
3445     return FALSE;
3446     }
3447    
3448    
3449    
3450    
3451     /*************************************************
3452     * Compile sequence of alternatives *
3453     *************************************************/
3454    
3455     /* On entry, ptr is pointing past the bracket character, but on return
3456     it points to the closing bracket, or vertical bar, or end of string.
3457     The code variable is pointing at the byte into which the BRA operator has been
3458     stored. If the ims options are changed at the start (for a (?ims: group) or
3459     during any branch, we need to insert an OP_OPT item at the start of every
3460     following branch to ensure they get set correctly at run time, and also pass
3461     the new options into every subsequent branch compile.
3462    
3463     Argument:
3464     options option bits, including any changes for this subpattern
3465     oldims previous settings of ims option bits
3466     brackets -> int containing the number of extracting brackets used
3467     codeptr -> the address of the current code pointer
3468     ptrptr -> the address of the current pattern pointer
3469     errorcodeptr -> pointer to error code variable
3470     lookbehind TRUE if this is a lookbehind assertion
3471     skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER)
3472     firstbyteptr place to put the first required character, or a negative number
3473     reqbyteptr place to put the last required character, or a negative number
3474     bcptr pointer to the chain of currently open branches
3475     cd points to the data block with tables pointers etc.
3476    
3477     Returns: TRUE on success
3478     */
3479    
3480     static BOOL
3481     compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3482     const uschar **ptrptr, int *errorcodeptr, BOOL lookbehind, int skipbytes,
3483     int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
3484     {
3485     const uschar *ptr = *ptrptr;
3486     uschar *code = *codeptr;
3487     uschar *last_branch = code;
3488     uschar *start_bracket = code;
3489     uschar *reverse_count = NULL;
3490     int firstbyte, reqbyte;
3491     int branchfirstbyte, branchreqbyte;
3492     branch_chain bc;
3493    
3494     bc.outer = bcptr;
3495     bc.current = code;
3496    
3497     firstbyte = reqbyte = REQ_UNSET;
3498    
3499     /* Offset is set zero to mark that this bracket is still open */
3500    
3501     PUT(code, 1, 0);
3502     code += 1 + LINK_SIZE + skipbytes;
3503    
3504     /* Loop for each alternative branch */
3505    
3506     for (;;)
3507     {
3508     /* Handle a change of ims options at the start of the branch */
3509    
3510     if ((options & PCRE_IMS) != oldims)
3511     {
3512     *code++ = OP_OPT;
3513     *code++ = options & PCRE_IMS;
3514     }
3515    
3516     /* Set up dummy OP_REVERSE if lookbehind assertion */
3517    
3518     if (lookbehind)
3519     {
3520     *code++ = OP_REVERSE;
3521     reverse_count = code;
3522     PUTINC(code, 0, 0);
3523     }
3524    
3525     /* Now compile the branch */
3526    
3527     if (!compile_branch(&options, brackets, &code, &ptr, errorcodeptr,
3528     &branchfirstbyte, &branchreqbyte, &bc, cd))
3529     {
3530     *ptrptr = ptr;
3531     return FALSE;
3532     }
3533    
3534     /* If this is the first branch, the firstbyte and reqbyte values for the
3535     branch become the values for the regex. */
3536    
3537     if (*last_branch != OP_ALT)
3538     {
3539     firstbyte = branchfirstbyte;
3540     reqbyte = branchreqbyte;
3541     }
3542    
3543     /* If this is not the first branch, the first char and reqbyte have to
3544     match the values from all the previous branches, except that if the previous
3545     value for reqbyte didn't have REQ_VARY set, it can still match, and we set
3546     REQ_VARY for the regex. */
3547    
3548     else
3549     {
3550     /* If we previously had a firstbyte, but it doesn't match the new branch,
3551     we have to abandon the firstbyte for the regex, but if there was previously
3552     no reqbyte, it takes on the value of the old firstbyte. */
3553    
3554     if (firstbyte >= 0 && firstbyte != branchfirstbyte)
3555     {
3556     if (reqbyte < 0) reqbyte = firstbyte;
3557     firstbyte = REQ_NONE;
3558     }
3559    
3560     /* If we (now or from before) have no firstbyte, a firstbyte from the
3561     branch becomes a reqbyte if there isn't a branch reqbyte. */
3562    
3563     if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
3564     branchreqbyte = branchfirstbyte;
3565    
3566     /* Now ensure that the reqbytes match */
3567    
3568     if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
3569     reqbyte = REQ_NONE;
3570     else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
3571     }
3572    
3573     /* If lookbehind, check that this branch matches a fixed-length string,
3574     and put the length into the OP_REVERSE item. Temporarily mark the end of
3575     the branch with OP_END. */
3576    
3577     if (lookbehind)
3578     {
3579     int length;
3580     *code = OP_END;
3581     length = find_fixedlength(last_branch, options);
3582     DPRINTF(("fixed length = %d\n", length));
3583     if (length < 0)
3584     {
3585     *errorcodeptr = (length == -2)? ERR36 : ERR25;
3586     *ptrptr = ptr;
3587     return FALSE;
3588     }
3589     PUT(reverse_count, 0, length);
3590     }
3591    
3592     /* Reached end of expression, either ')' or end of pattern. Go back through
3593     the alternative branches and reverse the chain of offsets, with the field in
3594     the BRA item now becoming an offset to the first alternative. If there are
3595     no alternatives, it points to the end of the group. The length in the
3596     terminating ket is always the length of the whole bracketed item. If any of
3597     the ims options were changed inside the group, compile a resetting op-code
3598     following, except at the very end of the pattern. Return leaving the pointer
3599     at the terminating char. */
3600    
3601     if (*ptr != '|')
3602     {
3603     int length = code - last_branch;
3604     do
3605     {
3606     int prev_length = GET(last_branch, 1);
3607     PUT(last_branch, 1, length);
3608     length = prev_length;
3609     last_branch -= length;
3610     }
3611     while (length > 0);
3612    
3613     /* Fill in the ket */
3614    
3615     *code = OP_KET;
3616     PUT(code, 1, code - start_bracket);
3617     code += 1 + LINK_SIZE;
3618    
3619     /* Resetting option if needed */
3620    
3621     if ((options & PCRE_IMS) != oldims && *ptr == ')')
3622     {
3623     *code++ = OP_OPT;
3624     *code++ = oldims;
3625     }
3626    
3627     /* Set values to pass back */
3628    
3629     *codeptr = code;
3630     *ptrptr = ptr;
3631     *firstbyteptr = firstbyte;
3632     *reqbyteptr = reqbyte;
3633     return TRUE;
3634     }
3635    
3636     /* Another branch follows; insert an "or" node. Its length field points back
3637     to the previous branch while the bracket remains open. At the end the chain
3638     is reversed. It's done like this so that the start of the bracket has a
3639     zero offset until it is closed, making it possible to detect recursion. */
3640    
3641     *code = OP_ALT;
3642     PUT(code, 1, code - last_branch);
3643     bc.current = last_branch = code;
3644     code += 1 + LINK_SIZE;
3645     ptr++;
3646     }
3647     /* Control never reaches here */
3648     }
3649    
3650    
3651    
3652    
3653     /*************************************************
3654     * Check for anchored expression *
3655     *************************************************/
3656    
3657     /* Try to find out if this is an anchored regular expression. Consider each
3658     alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
3659     all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
3660     it's anchored. However, if this is a multiline pattern, then only OP_SOD
3661     counts, since OP_CIRC can match in the middle.
3662    
3663     We can also consider a regex to be anchored if OP_SOM starts all its branches.
3664     This is the code for \G, which means "match at start of match position, taking
3665     into account the match offset".
3666    
3667     A branch is also implicitly anchored if it starts with .* and DOTALL is set,
3668     because that will try the rest of the pattern at all possible matching points,
3669     so there is no point trying again.... er ....
3670    
3671     .... except when the .* appears inside capturing parentheses, and there is a
3672     subsequent back reference to those parentheses. We haven't enough information
3673     to catch that case precisely.
3674    
3675     At first, the best we could do was to detect when .* was in capturing brackets
3676     and the highest back reference was greater than or equal to that level.
3677     However, by keeping a bitmap of the first 31 back references, we can catch some
3678     of the more common cases more precisely.
3679    
3680     Arguments:
3681     code points to start of expression (the bracket)
3682     options points to the options setting
3683     bracket_map a bitmap of which brackets we are inside while testing; this
3684     handles up to substring 31; after that we just have to take
3685     the less precise approach
3686     backref_map the back reference bitmap
3687    
3688     Returns: TRUE or FALSE
3689     */
3690    
3691     static BOOL
3692     is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
3693     unsigned int backref_map)
3694     {
3695     do {
3696     const uschar *scode =
3697     first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE);
3698     register int op = *scode;
3699    
3700     /* Capturing brackets */
3701    
3702     if (op > OP_BRA)
3703     {
3704     int new_map;
3705     op -= OP_BRA;
3706     if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3707     new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3708     if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
3709     }
3710    
3711     /* Other brackets */
3712    
3713     else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3714     {
3715     if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
3716     }
3717    
3718     /* .* is not anchored unless DOTALL is set and it isn't in brackets that
3719     are or may be referenced. */
3720    
3721     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
3722     (*options & PCRE_DOTALL) != 0)
3723     {
3724     if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3725     }
3726    
3727     /* Check for explicit anchoring */
3728    
3729     else if (op != OP_SOD && op != OP_SOM &&
3730     ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
3731     return FALSE;
3732     code += GET(code, 1);
3733     }
3734     while (*code == OP_ALT); /* Loop for each alternative */
3735     return TRUE;
3736     }
3737    
3738    
3739    
3740     /*************************************************
3741     * Check for starting with ^ or .* *
3742     *************************************************/
3743    
3744     /* This is called to find out if every branch starts with ^ or .* so that
3745     "first char" processing can be done to speed things up in multiline
3746     matching and for non-DOTALL patterns that start with .* (which must start at
3747     the beginning or after \n). As in the case of is_anchored() (see above), we
3748     have to take account of back references to capturing brackets that contain .*
3749     because in that case we can't make the assumption.
3750    
3751     Arguments:
3752     code points to start of expression (the bracket)
3753     bracket_map a bitmap of which brackets we are inside while testing; this
3754     handles up to substring 31; after that we just have to take
3755     the less precise approach
3756     backref_map the back reference bitmap
3757    
3758     Returns: TRUE or FALSE
3759     */
3760    
3761     static BOOL
3762     is_startline(const uschar *code, unsigned int bracket_map,
3763     unsigned int backref_map)
3764     {
3765     do {
3766     const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0,
3767     FALSE);
3768     register int op = *scode;
3769    
3770     /* Capturing brackets */
3771    
3772     if (op > OP_BRA)
3773     {
3774     int new_map;
3775     op -= OP_BRA;
3776     if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3777     new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3778     if (!is_startline(scode, new_map, backref_map)) return FALSE;
3779     }
3780    
3781     /* Other brackets */
3782    
3783     else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3784     { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
3785    
3786     /* .* means "start at start or after \n" if it isn't in brackets that
3787     may be referenced. */
3788    
3789     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
3790     {
3791     if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3792     }
3793    
3794     /* Check for explicit circumflex */
3795    
3796     else if (op != OP_CIRC) return FALSE;
3797    
3798     /* Move on to the next alternative */
3799    
3800     code += GET(code, 1);
3801     }
3802     while (*code == OP_ALT); /* Loop for each alternative */
3803     return TRUE;
3804     }
3805    
3806    
3807    
3808     /*************************************************
3809     * Check for asserted fixed first char *
3810     *************************************************/
3811    
3812     /* During compilation, the "first char" settings from forward assertions are
3813     discarded, because they can cause conflicts with actual literals that follow.
3814     However, if we end up without a first char setting for an unanchored pattern,
3815     it is worth scanning the regex to see if there is an initial asserted first
3816     char. If all branches start with the same asserted char, or with a bracket all
3817     of whose alternatives start with the same asserted char (recurse ad lib), then
3818     we return that char, otherwise -1.
3819    
3820     Arguments:
3821     code points to start of expression (the bracket)
3822     options pointer to the options (used to check casing changes)
3823     inassert TRUE if in an assertion
3824    
3825     Returns: -1 or the fixed first char
3826     */
3827    
3828     static int
3829     find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
3830     {
3831     register int c = -1;
3832     do {
3833     int d;
3834     const uschar *scode =
3835     first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
3836     register int op = *scode;
3837    
3838     if (op >= OP_BRA) op = OP_BRA;
3839    
3840     switch(op)
3841     {
3842     default:
3843     return -1;
3844    
3845     case OP_BRA:
3846     case OP_ASSERT:
3847     case OP_ONCE:
3848     case OP_COND:
3849     if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
3850     return -1;
3851     if (c < 0) c = d; else if (c != d) return -1;
3852     break;
3853    
3854     case OP_EXACT: /* Fall through */
3855     scode += 2;
3856    
3857     case OP_CHAR:
3858     case OP_CHARNC:
3859     case OP_PLUS:
3860     case OP_MINPLUS:
3861     if (!inassert) return -1;
3862     if (c < 0)
3863     {
3864     c = scode[1];
3865     if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
3866     }
3867     else if (c != scode[1]) return -1;
3868     break;
3869     }
3870    
3871     code += GET(code, 1);
3872     }
3873     while (*code == OP_ALT);
3874     return c;
3875     }
3876    
3877    
3878    
3879     /*************************************************
3880     * Compile a Regular Expression *
3881     *************************************************/
3882    
3883     /* This function takes a string and returns a pointer to a block of store
3884     holding a compiled version of the expression. The original API for this
3885     function had no error code return variable; it is retained for backwards
3886     compatibility. The new function is given a new name.
3887    
3888     Arguments:
3889     pattern the regular expression
3890     options various option bits
3891     errorcodeptr pointer to error code variable (pcre_compile2() only)
3892     can be NULL if you don't want a code value
3893     errorptr pointer to pointer to error text
3894     erroroffset ptr offset in pattern where error was detected
3895     tables pointer to character tables or NULL
3896    
3897     Returns: pointer to compiled data block, or NULL on error,
3898     with errorptr and erroroffset set
3899     */
3900    
3901 nigel 87 PCRE_DATA_SCOPE pcre *
3902 nigel 77 pcre_compile(const char *pattern, int options, const char **errorptr,
3903     int *erroroffset, const unsigned char *tables)
3904     {
3905     return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
3906     }
3907    
3908    
3909 nigel 87 PCRE_DATA_SCOPE pcre *
3910 nigel 77 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
3911     const char **errorptr, int *erroroffset, const unsigned char *tables)
3912     {
3913     real_pcre *re;
3914     int length = 1 + LINK_SIZE; /* For initial BRA plus length */
3915     int c, firstbyte, reqbyte;
3916     int bracount = 0;
3917     int branch_extra = 0;
3918     int branch_newextra;
3919     int item_count = -1;
3920     int name_count = 0;
3921     int max_name_size = 0;
3922     int lastitemlength = 0;
3923     int errorcode = 0;
3924     #ifdef SUPPORT_UTF8
3925     BOOL utf8;
3926     BOOL class_utf8;
3927     #endif
3928     BOOL inescq = FALSE;
3929 nigel 81 BOOL capturing;
3930 nigel 77 unsigned int brastackptr = 0;
3931     size_t size;
3932     uschar *code;
3933     const uschar *codestart;
3934     const uschar *ptr;
3935     compile_data compile_block;
3936     int brastack[BRASTACK_SIZE];
3937     uschar bralenstack[BRASTACK_SIZE];
3938    
3939     /* We can't pass back an error message if errorptr is NULL; I guess the best we
3940     can do is just return NULL, but we can set a code value if there is a code
3941     pointer. */
3942    
3943     if (errorptr == NULL)
3944     {
3945     if (errorcodeptr != NULL) *errorcodeptr = 99;
3946     return NULL;
3947     }
3948    
3949     *errorptr = NULL;
3950     if (errorcodeptr != NULL) *errorcodeptr = ERR0;
3951    
3952     /* However, we can give a message for this error */
3953    
3954     if (erroroffset == NULL)
3955     {
3956     errorcode = ERR16;
3957     goto PCRE_EARLY_ERROR_RETURN;
3958     }
3959    
3960     *erroroffset = 0;
3961    
3962     /* Can't support UTF8 unless PCRE has been compiled to include the code. */
3963    
3964     #ifdef SUPPORT_UTF8
3965     utf8 = (options & PCRE_UTF8) != 0;
3966     if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
3967     (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
3968     {
3969     errorcode = ERR44;
3970     goto PCRE_EARLY_ERROR_RETURN;
3971     }
3972     #else
3973     if ((options & PCRE_UTF8) != 0)
3974     {
3975     errorcode = ERR32;
3976     goto PCRE_EARLY_ERROR_RETURN;
3977     }
3978     #endif
3979    
3980     if ((options & ~PUBLIC_OPTIONS) != 0)
3981     {
3982     errorcode = ERR17;
3983     goto PCRE_EARLY_ERROR_RETURN;
3984     }
3985    
3986     /* Set up pointers to the individual character tables */
3987    
3988     if (tables == NULL) tables = _pcre_default_tables;
3989     compile_block.lcc = tables + lcc_offset;
3990     compile_block.fcc = tables + fcc_offset;
3991     compile_block.cbits = tables + cbits_offset;
3992     compile_block.ctypes = tables + ctypes_offset;
3993    
3994     /* Maximum back reference and backref bitmap. This is updated for numeric
3995     references during the first pass, but for named references during the actual
3996     compile pass. The bitmap records up to 31 back references to help in deciding
3997     whether (.*) can be treated as anchored or not. */
3998    
3999     compile_block.top_backref = 0;
4000     compile_block.backref_map = 0;
4001    
4002     /* Reflect pattern for debugging output */
4003    
4004     DPRINTF(("------------------------------------------------------------------\n"));
4005     DPRINTF(("%s\n", pattern));
4006    
4007     /* The first thing to do is to make a pass over the pattern to compute the
4008     amount of store required to hold the compiled code. This does not have to be
4009     perfect as long as errors are overestimates. At the same time we can detect any
4010     flag settings right at the start, and extract them. Make an attempt to correct
4011     for any counted white space if an "extended" flag setting appears late in the
4012     pattern. We can't be so clever for #-comments. */
4013    
4014     ptr = (const uschar *)(pattern - 1);
4015     while ((c = *(++ptr)) != 0)
4016     {
4017     int min, max;
4018     int class_optcount;
4019     int bracket_length;
4020     int duplength;
4021    
4022     /* If we are inside a \Q...\E sequence, all chars are literal */
4023    
4024     if (inescq)
4025     {
4026     if ((options & PCRE_AUTO_CALLOUT) != 0) length += 2 + 2*LINK_SIZE;
4027     goto NORMAL_CHAR;
4028     }
4029    
4030     /* Otherwise, first check for ignored whitespace and comments */
4031    
4032     if ((options & PCRE_EXTENDED) != 0)
4033     {
4034     if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
4035     if (c == '#')
4036     {
4037     /* The space before the ; is to avoid a warning on a silly compiler
4038     on the Macintosh. */
4039     while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
4040     if (c == 0) break;
4041     continue;
4042     }
4043     }
4044    
4045     item_count++; /* Is zero for the first non-comment item */
4046    
4047     /* Allow space for auto callout before every item except quantifiers. */
4048    
4049     if ((options & PCRE_AUTO_CALLOUT) != 0 &&
4050     c != '*' && c != '+' && c != '?' &&
4051     (c != '{' || !is_counted_repeat(ptr + 1)))
4052     length += 2 + 2*LINK_SIZE;
4053    
4054     switch(c)
4055     {
4056     /* A backslashed item may be an escaped data character or it may be a
4057     character type. */
4058    
4059     case '\\':
4060     c = check_escape(&ptr, &errorcode, bracount, options, FALSE);
4061     if (errorcode != 0) goto PCRE_ERROR_RETURN;
4062    
4063     lastitemlength = 1; /* Default length of last item for repeats */
4064    
4065     if (c >= 0) /* Data character */
4066     {
4067     length += 2; /* For a one-byte character */
4068    
4069     #ifdef SUPPORT_UTF8
4070     if (utf8 && c > 127)
4071     {
4072     int i;
4073     for (i = 0; i < _pcre_utf8_table1_size; i++)
4074     if (c <= _pcre_utf8_table1[i]) break;
4075     length += i;
4076     lastitemlength += i;
4077     }
4078     #endif
4079    
4080     continue;
4081     }
4082    
4083     /* If \Q, enter "literal" mode */
4084    
4085     if (-c == ESC_Q)
4086     {
4087     inescq = TRUE;
4088     continue;
4089     }
4090    
4091     /* \X is supported only if Unicode property support is compiled */
4092    
4093     #ifndef SUPPORT_UCP
4094     if (-c == ESC_X)
4095     {
4096     errorcode = ERR45;
4097     goto PCRE_ERROR_RETURN;
4098     }
4099     #endif
4100    
4101     /* \P and \p are for Unicode properties, but only when the support has
4102 nigel 87 been compiled. Each item needs 3 bytes. */
4103 nigel 77
4104     else if (-c == ESC_P || -c == ESC_p)
4105     {
4106     #ifdef SUPPORT_UCP
4107     BOOL negated;
4108 nigel 87 BOOL pdata;
4109     length += 3;
4110     lastitemlength = 3;
4111     if (get_ucp(&ptr, &negated, &pdata, &errorcode) < 0)
4112     goto PCRE_ERROR_RETURN;
4113 nigel 77 continue;
4114     #else
4115     errorcode = ERR45;
4116     goto PCRE_ERROR_RETURN;
4117     #endif
4118     }
4119    
4120     /* Other escapes need one byte */
4121    
4122     length++;
4123    
4124     /* A back reference needs an additional 2 bytes, plus either one or 5
4125     bytes for a repeat. We also need to keep the value of the highest
4126     back reference. */
4127    
4128     if (c <= -ESC_REF)
4129     {
4130     int refnum = -c - ESC_REF;
4131     compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
4132     if (refnum > compile_block.top_backref)
4133     compile_block.top_backref = refnum;
4134     length += 2; /* For single back reference */
4135     if (ptr[1] == '{' && is_counted_repeat(ptr+2))
4136     {
4137     ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
4138     if (errorcode != 0) goto PCRE_ERROR_RETURN;
4139     if ((min == 0 && (max == 1 || max == -1)) ||
4140     (min == 1 && max == -1))
4141     length++;
4142     else length += 5;
4143     if (ptr[1] == '?') ptr++;
4144     }
4145     }
4146     continue;
4147    
4148     case '^': /* Single-byte metacharacters */
4149     case '.':
4150     case '$':
4151     length++;
4152     lastitemlength = 1;
4153     continue;
4154    
4155     case '*': /* These repeats won't be after brackets; */
4156     case '+': /* those are handled separately */
4157     case '?':
4158     length++;
4159     goto POSESSIVE; /* A few lines below */
4160    
4161     /* This covers the cases of braced repeats after a single char, metachar,
4162     class, or back reference. */
4163    
4164     case '{':
4165     if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
4166     ptr = read_repeat_counts(ptr+1, &min, &max, &errorcode);
4167     if (errorcode != 0) goto PCRE_ERROR_RETURN;
4168    
4169     /* These special cases just insert one extra opcode */
4170    
4171     if ((min == 0 && (max == 1 || max == -1)) ||
4172     (min == 1 && max == -1))
4173     length++;
4174    
4175     /* These cases might insert additional copies of a preceding character. */
4176    
4177     else
4178     {
4179     if (min != 1)
4180     {
4181     length -= lastitemlength; /* Uncount the original char or metachar */
4182     if (min > 0) length += 3 + lastitemlength;
4183     }
4184     length += lastitemlength + ((max > 0)? 3 : 1);
4185     }
4186    
4187     if (ptr[1] == '?') ptr++; /* Needs no extra length */
4188    
4189     POSESSIVE: /* Test for possessive quantifier */
4190     if (ptr[1] == '+')
4191     {
4192     ptr++;
4193     length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */
4194     }
4195     continue;
4196    
4197     /* An alternation contains an offset to the next branch or ket. If any ims
4198     options changed in the previous branch(es), and/or if we are in a
4199     lookbehind assertion, extra space will be needed at the start of the
4200     branch. This is handled by branch_extra. */
4201    
4202     case '|':
4203     length += 1 + LINK_SIZE + branch_extra;
4204     continue;
4205    
4206     /* A character class uses 33 characters provided that all the character
4207     values are less than 256. Otherwise, it uses a bit map for low valued
4208     characters, and individual items for others. Don't worry about character
4209     types that aren't allowed in classes - they'll get picked up during the
4210     compile. A character class that contains only one single-byte character
4211     uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
4212     where we can. (In UTF-8 mode we can do this only for chars < 128.) */
4213    
4214     case '[':
4215     if (*(++ptr) == '^')
4216     {
4217     class_optcount = 10; /* Greater than one */
4218     ptr++;
4219     }
4220     else class_optcount = 0;
4221    
4222     #ifdef SUPPORT_UTF8
4223     class_utf8 = FALSE;
4224     #endif
4225    
4226     /* Written as a "do" so that an initial ']' is taken as data */
4227    
4228     if (*ptr != 0) do
4229     {
4230     /* Inside \Q...\E everything is literal except \E */
4231    
4232     if (inescq)
4233     {
4234     if (*ptr != '\\' || ptr[1] != 'E') goto GET_ONE_CHARACTER;
4235     inescq = FALSE;
4236     ptr += 1;
4237     continue;
4238     }
4239    
4240     /* Outside \Q...\E, check for escapes */
4241    
4242     if (*ptr == '\\')
4243     {
4244     c = check_escape(&ptr, &errorcode, bracount, options, TRUE);
4245     if (errorcode != 0) goto PCRE_ERROR_RETURN;
4246    
4247     /* \b is backspace inside a class; \X is literal */
4248    
4249     if (-c == ESC_b) c = '\b';
4250     else if (-c == ESC_X) c = 'X';
4251    
4252     /* \Q enters quoting mode */
4253    
4254     else if (-c == ESC_Q)
4255     {
4256     inescq = TRUE;
4257     continue;
4258     }
4259    
4260     /* Handle escapes that turn into characters */
4261    
4262     if (c >= 0) goto NON_SPECIAL_CHARACTER;
4263    
4264     /* Escapes that are meta-things. The normal ones just affect the
4265     bit map, but Unicode properties require an XCLASS extended item. */
4266    
4267     else
4268     {
4269     class_optcount = 10; /* \d, \s etc; make sure > 1 */
4270