/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 91 - (hide annotations) (download)
Sat Feb 24 21:41:34 2007 UTC (7 years, 9 months ago) by nigel
File MIME type: text/plain
File size: 166857 byte(s)
Load pcre-6.7 into code/trunk.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 nigel 87 Copyright (c) 1997-2006 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 nigel 91 #define NLBLOCK cd /* The block containing newline information */
46 nigel 77 #include "pcre_internal.h"
47    
48    
49 nigel 85 /* When DEBUG is defined, we need the pcre_printint() function, which is also
50     used by pcretest. DEBUG is not defined when building a production library. */
51    
52     #ifdef DEBUG
53     #include "pcre_printint.src"
54     #endif
55    
56    
57    
58 nigel 77 /*************************************************
59     * Code parameters and static tables *
60     *************************************************/
61    
62     /* Maximum number of items on the nested bracket stacks at compile time. This
63     applies to the nesting of all kinds of parentheses. It does not limit
64     un-nested, non-capturing parentheses. This number can be made bigger if
65     necessary - it is used to dimension one int and one unsigned char vector at
66     compile time. */
67    
68     #define BRASTACK_SIZE 200
69    
70    
71     /* Table for handling escaped characters in the range '0'-'z'. Positive returns
72     are simple data values; negative values are for special things like \d and so
73     on. Zero means further processing is needed (for things like \x), or the escape
74     is invalid. */
75    
76     #if !EBCDIC /* This is the "normal" table for ASCII systems */
77     static const short int escapes[] = {
78     0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
79     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
80     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
81     0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
82     -ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
83     -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
84     '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
85     0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
86     -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
87     0, 0, -ESC_z /* x - z */
88     };
89    
90     #else /* This is the "abnormal" table for EBCDIC systems */
91     static const short int escapes[] = {
92     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
93     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
94     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
95     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
96     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
97     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
98     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
99     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
100     /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
101     /* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p,
102     /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
103     /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
104     /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
105     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
106     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
107     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
108     /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
109     /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
110     /* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0,
111     /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
112     /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
113     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
114     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
115     };
116     #endif
117    
118    
/* Names of the POSIX character classes. A parallel table holds the length of
each name and ends with a zero entry that marks the end of the list. The first
three entries must be alpha, lower, upper, in that order, because the handling
of case independence assumes it. */

static const char *const posix_names[] = {
  "alpha",
  "lower",
  "upper",
  "alnum",
  "ascii",
  "blank",
  "cntrl",
  "digit",
  "graph",
  "print",
  "punct",
  "space",
  "word",
  "xdigit"
};
127    
128     static const uschar posix_name_lengths[] = {
129     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
130    
131 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
132     base map, with an optional addition or removal of another map. Then, for some
133     classes, there is some additional tweaking: for [:blank:] the vertical space
134     characters are removed, and for [:alpha:] and [:alnum:] the underscore
135     character is removed. The triples in the table consist of the base map offset,
136     second map offset or -1 if no second map, and a non-negative value for map
137     addition or a negative value for map subtraction (if there are two maps). The
138     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
139     remove vertical space characters, 2 => remove underscore. */
140 nigel 77
141     static const int posix_class_maps[] = {
142 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
143     cbit_lower, -1, 0, /* lower */
144     cbit_upper, -1, 0, /* upper */
145     cbit_word, -1, 2, /* alnum - word without underscore */
146     cbit_print, cbit_cntrl, 0, /* ascii */
147     cbit_space, -1, 1, /* blank - a GNU extension */
148     cbit_cntrl, -1, 0, /* cntrl */
149     cbit_digit, -1, 0, /* digit */
150     cbit_graph, -1, 0, /* graph */
151     cbit_print, -1, 0, /* print */
152     cbit_punct, -1, 0, /* punct */
153     cbit_space, -1, 0, /* space */
154     cbit_word, -1, 0, /* word - a Perl extension */
155     cbit_xdigit,-1, 0 /* xdigit */
156 nigel 77 };
157    
158    
/* Compile-time error messages, indexed by error number. The element type is
plain "char *" because the strings are handed to the outside world. */

static const char *error_texts[] = {
  /*  0 */ "no error",
  /*  1 */ "\\ at end of pattern",
  /*  2 */ "\\c at end of pattern",
  /*  3 */ "unrecognized character follows \\",
  /*  4 */ "numbers out of order in {} quantifier",
  /*  5 */ "number too big in {} quantifier",
  /*  6 */ "missing terminating ] for character class",
  /*  7 */ "invalid escape sequence in character class",
  /*  8 */ "range out of order in character class",
  /*  9 */ "nothing to repeat",
  /* 10 */ "operand of unlimited repeat could match the empty string",
  /* 11 */ "internal error: unexpected repeat",
  /* 12 */ "unrecognized character after (?",
  /* 13 */ "POSIX named classes are supported only within a class",
  /* 14 */ "missing )",
  /* 15 */ "reference to non-existent subpattern",
  /* 16 */ "erroffset passed as NULL",
  /* 17 */ "unknown option bit(s) set",
  /* 18 */ "missing ) after comment",
  /* 19 */ "parentheses nested too deeply",
  /* 20 */ "regular expression too large",
  /* 21 */ "failed to get memory",
  /* 22 */ "unmatched parentheses",
  /* 23 */ "internal error: code overflow",
  /* 24 */ "unrecognized character after (?<",
  /* 25 */ "lookbehind assertion is not fixed length",
  /* 26 */ "malformed number or name after (?(",
  /* 27 */ "conditional group contains more than two branches",
  /* 28 */ "assertion expected after (?(",
  /* 29 */ "(?R or (?digits must be followed by )",
  /* 30 */ "unknown POSIX class name",
  /* 31 */ "POSIX collating elements are not supported",
  /* 32 */ "this version of PCRE is not compiled with PCRE_UTF8 support",
  /* 33 */ "spare error",
  /* 34 */ "character value in \\x{...} sequence is too large",
  /* 35 */ "invalid condition (?(0)",
  /* 36 */ "\\C not allowed in lookbehind assertion",
  /* 37 */ "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
  /* 38 */ "number after (?C is > 255",
  /* 39 */ "closing ) for (?C expected",
  /* 40 */ "recursive call could loop indefinitely",
  /* 41 */ "unrecognized character after (?P",
  /* 42 */ "syntax error after (?P",
  /* 43 */ "two named subpatterns have the same name",
  /* 44 */ "invalid UTF-8 string",
  /* 45 */ "support for \\P, \\p, and \\X has not been compiled",
  /* 46 */ "malformed \\P or \\p sequence",
  /* 47 */ "unknown property name after \\P or \\p",
  /* 48 */ "subpattern name is too long (maximum 32 characters)",
  /* 49 */ "too many named subpatterns (maximum 10,000)",
  /* 50 */ "repeated subpattern is too long",
  /* 51 */ "octal value is greater than \\377 (not in UTF-8 mode)"
};
226    
227    
/* Private table that identifies decimal and hexadecimal digits while a pattern
is being compiled. The tables in chartables depend on the locale and may mark
arbitrary characters as digits, whereas the compiling code must recognize only
the fixed digit characters; hence this separate table. It costs 256 bytes but
is faster than testing character values.

The bit values are the same as those used in chartables, so ctype_digit and
ctype_xdigit can be applied to entries of this table:

  0x04  decimal digit
  0x08  hexadecimal digit

Each row below holds 16 entries; the comment gives the code of the first. */

#if !EBCDIC    /* This is the "normal" case, for ASCII systems */
static const unsigned char digitab[] =
  {
  /* 00 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 10 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 20 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 30 */ 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,  /* 0 - ? */
  /* 40 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  /* @ - O */
  /* 50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  /* P - _ */
  /* 60 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  /* ` - o */
  /* 70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  /* p - 127 */
  /* 80 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 90 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* A0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* B0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* C0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* D0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* E0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* F0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
  };

#else          /* This is the "abnormal" case, for EBCDIC systems */
static const unsigned char digitab[] =
  {
  /* 00 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 10 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 20 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 30 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 40 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 60 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 80 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  /* a - f */
  /* 90 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* A0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* B0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* C0 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  /* A - F */
  /* D0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* E0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* F0 */ 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00   /* 0 - 9 */
  };

static const unsigned char ebcdic_chartab[] =    /* chartable partial dup */
  {
  /* 00 */ 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00,
  /* 10 */ 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 20 */ 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 30 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 40 */ 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80,
  /* 50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00,
  /* 60 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80,
  /* 70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 80 */ 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 90 */ 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* A0 */ 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* B0 */ 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00,
  /* C0 */ 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* D0 */ 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* E0 */ 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* F0 */ 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00
  };
#endif
350    
351    
352     /* Definition to allow mutual recursion */
353    
354     static BOOL
355     compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,
356     int *, int *, branch_chain *, compile_data *);
357    
358    
359    
360     /*************************************************
361     * Handle escapes *
362     *************************************************/
363    
364     /* This function is called when a \ has been encountered. It either returns a
365     positive value for a simple escape such as \n, or a negative value which
366     encodes one of the more complicated things such as \d. When UTF-8 is enabled,
367     a positive value greater than 255 may be returned. On entry, ptr is pointing at
368     the \. On exit, it is on the final character of the escape sequence.
369    
370     Arguments:
371     ptrptr points to the pattern position pointer
372     errorcodeptr points to the errorcode variable
373     bracount number of previous extracting brackets
374     options the options bits
375     isclass TRUE if inside a character class
376    
377     Returns: zero or positive => a data character
378     negative => a special escape sequence
379     on error, errorptr is set
380     */
381    
382     static int
383     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
384     int options, BOOL isclass)
385     {
386 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
387     const uschar *ptr = *ptrptr + 1;
388 nigel 77 int c, i;
389    
390 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
391     ptr--; /* Set pointer back to the last byte */
392    
393 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
394    
395     if (c == 0) *errorcodeptr = ERR1;
396    
397     /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
398     a table. A non-zero result is something that can be returned immediately.
399     Otherwise further processing may be required. */
400    
401     #if !EBCDIC /* ASCII coding */
402     else if (c < '0' || c > 'z') {} /* Not alphameric */
403     else if ((i = escapes[c - '0']) != 0) c = i;
404    
405     #else /* EBCDIC coding */
406     else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
407     else if ((i = escapes[c - 0x48]) != 0) c = i;
408     #endif
409    
410     /* Escapes that need further processing, or are illegal. */
411    
412     else
413     {
414     const uschar *oldptr;
415     switch (c)
416     {
417     /* A number of Perl escapes are not handled by PCRE. We give an explicit
418     error. */
419    
420     case 'l':
421     case 'L':
422     case 'N':
423     case 'u':
424     case 'U':
425     *errorcodeptr = ERR37;
426     break;
427    
428     /* The handling of escape sequences consisting of a string of digits
429     starting with one that is not zero is not straightforward. By experiment,
430     the way Perl works seems to be as follows:
431    
432     Outside a character class, the digits are read as a decimal number. If the
433     number is less than 10, or if there are that many previous extracting
434     left brackets, then it is a back reference. Otherwise, up to three octal
435     digits are read to form an escaped byte. Thus \123 is likely to be octal
436     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
437     value is greater than 377, the least significant 8 bits are taken. Inside a
438     character class, \ followed by a digit is always an octal number. */
439    
440     case '1': case '2': case '3': case '4': case '5':
441     case '6': case '7': case '8': case '9':
442    
443     if (!isclass)
444     {
445     oldptr = ptr;
446     c -= '0';
447     while ((digitab[ptr[1]] & ctype_digit) != 0)
448     c = c * 10 + *(++ptr) - '0';
449     if (c < 10 || c <= bracount)
450     {
451     c = -(ESC_REF + c);
452     break;
453     }
454     ptr = oldptr; /* Put the pointer back and fall through */
455     }
456    
457     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
458     generates a binary zero byte and treats the digit as a following literal.
459     Thus we have to pull back the pointer by one. */
460    
461     if ((c = *ptr) >= '8')
462     {
463     ptr--;
464     c = 0;
465     break;
466     }
467    
468     /* \0 always starts an octal number, but we may drop through to here with a
469 nigel 91 larger first octal digit. The original code used just to take the least
470     significant 8 bits of octal numbers (I think this is what early Perls used
471     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
472     than 3 octal digits. */
473 nigel 77
474     case '0':
475     c -= '0';
476     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
477     c = c * 8 + *(++ptr) - '0';
478 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
479 nigel 77 break;
480    
481 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
482     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
483     treated as a data character. */
484 nigel 77
485     case 'x':
486 nigel 87 if (ptr[1] == '{')
487 nigel 77 {
488     const uschar *pt = ptr + 2;
489 nigel 87 int count = 0;
490    
491 nigel 77 c = 0;
492     while ((digitab[*pt] & ctype_xdigit) != 0)
493     {
494 nigel 87 register int cc = *pt++;
495     if (c == 0 && cc == '0') continue; /* Leading zeroes */
496 nigel 77 count++;
497 nigel 87
498 nigel 77 #if !EBCDIC /* ASCII coding */
499     if (cc >= 'a') cc -= 32; /* Convert to upper case */
500 nigel 87 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
501 nigel 77 #else /* EBCDIC coding */
502     if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
503 nigel 87 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
504 nigel 77 #endif
505     }
506 nigel 87
507 nigel 77 if (*pt == '}')
508     {
509 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
510 nigel 77 ptr = pt;
511     break;
512     }
513 nigel 87
514 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
515     recognize this construct; fall through to the normal \x handling. */
516     }
517    
518 nigel 87 /* Read just a single-byte hex-defined char */
519 nigel 77
520     c = 0;
521     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
522     {
523     int cc; /* Some compilers don't like ++ */
524     cc = *(++ptr); /* in initializers */
525     #if !EBCDIC /* ASCII coding */
526     if (cc >= 'a') cc -= 32; /* Convert to upper case */
527     c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
528     #else /* EBCDIC coding */
529     if (cc <= 'z') cc += 64; /* Convert to upper case */
530     c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
531     #endif
532     }
533     break;
534    
535     /* Other special escapes not starting with a digit are straightforward */
536    
537     case 'c':
538     c = *(++ptr);
539     if (c == 0)
540     {
541     *errorcodeptr = ERR2;
542     return 0;
543     }
544    
545     /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
546     is ASCII-specific, but then the whole concept of \cx is ASCII-specific.
547     (However, an EBCDIC equivalent has now been added.) */
548    
549     #if !EBCDIC /* ASCII coding */
550     if (c >= 'a' && c <= 'z') c -= 32;
551     c ^= 0x40;
552     #else /* EBCDIC coding */
553     if (c >= 'a' && c <= 'z') c += 64;
554     c ^= 0xC0;
555     #endif
556     break;
557    
558     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
559     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
560     for Perl compatibility, it is a literal. This code looks a bit odd, but
561     there used to be some cases other than the default, and there may be again
562     in future, so I haven't "optimized" it. */
563    
564     default:
565     if ((options & PCRE_EXTRA) != 0) switch(c)
566     {
567     default:
568     *errorcodeptr = ERR3;
569     break;
570     }
571     break;
572     }
573     }
574    
575     *ptrptr = ptr;
576     return c;
577     }
578    
579    
580    
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* Called after \P or \p has been seen, when PCRE is compiled with support for
Unicode properties. On entry *ptrptr points at the P or p. On return it points
at the last character that was examined: the final character of the escape on
success, or the place where parsing stopped on failure.

The property name is either the single character that follows, or a name in
{}, optionally preceded by ^ for negation. It is looked up in _pcre_utt by
binary chop.

Arguments:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
                   (left untouched if the pattern ends straight after P or p)
  dptr           points to an int that is set to the detailed property value
  errorcodeptr   points to the error code variable; set to ERR46 for a
                   malformed sequence, ERR47 for an unknown property name

Returns:         type value from the property table, or -1 on error
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
const uschar *ptr = *ptrptr;
char name[32];
int ch, len, lo, hi;

ch = *(++ptr);
if (ch == 0) goto MALFORMED;

*negptr = FALSE;

if (ch != '{')
  {
  /* Just one following character forms the name */
  name[0] = ch;
  name[1] = 0;
  }
else
  {
  /* A name in {}, optionally preceded by ^ for negation */
  if (ptr[1] == '^')
    {
    *negptr = TRUE;
    ptr++;
    }

  len = 0;
  for (;;)
    {
    /* No room left for another character plus the terminating zero */
    if (len >= (int)sizeof(name) - 1) goto MALFORMED;
    ch = *(++ptr);
    if (ch == 0) goto MALFORMED;
    if (ch == '}') break;
    name[len++] = ch;
    }
  name[len] = 0;
  }

*ptrptr = ptr;

/* Search for a recognized property name using binary chop */

for (lo = 0, hi = _pcre_utt_size; lo < hi; )
  {
  int mid = (lo + hi) >> 1;
  int cmp = strcmp(name, _pcre_utt[mid].name);

  if (cmp == 0)
    {
    *dptr = _pcre_utt[mid].value;
    return _pcre_utt[mid].type;
    }

  if (cmp > 0) lo = mid + 1;
  else hi = mid;
  }

/* Unknown name; *ptrptr has already been updated above */

*errorcodeptr = ERR47;
return -1;

MALFORMED:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
670    
671    
672    
673    
674     /*************************************************
675     * Check for counted repeat *
676     *************************************************/
677    
678     /* This function is called when a '{' is encountered in a place where it might
679     start a quantifier. It looks ahead to see if it really is a quantifier or not.
680     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
681     where the ddds are digits.
682    
683     Arguments:
684     p pointer to the first char after '{'
685    
686     Returns: TRUE or FALSE
687     */
688    
689     static BOOL
690     is_counted_repeat(const uschar *p)
691     {
692     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
693     while ((digitab[*p] & ctype_digit) != 0) p++;
694     if (*p == '}') return TRUE;
695    
696     if (*p++ != ',') return FALSE;
697     if (*p == '}') return TRUE;
698    
699     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
700     while ((digitab[*p] & ctype_digit) != 0) p++;
701    
702     return (*p == '}');
703     }
704    
705    
706    
707     /*************************************************
708     * Read repeat counts *
709     *************************************************/
710    
711     /* Read an item of the form {n,m} and return the values. This is called only
712     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
713     so the syntax is guaranteed to be correct, but we need to check the values.
714    
715     Arguments:
716     p pointer to first char after '{'
717     minp pointer to int for min
718     maxp pointer to int for max
719     returned as -1 if no max
720     errorcodeptr points to error code variable
721    
722     Returns: pointer to '}' on success;
723     current ptr on error, with errorcodeptr set non-zero
724     */
725    
726     static const uschar *
727     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
728     {
729     int min = 0;
730     int max = -1;
731    
732 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
733     an integer overflow. */
734    
735 nigel 77 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
736 nigel 81 if (min < 0 || min > 65535)
737     {
738     *errorcodeptr = ERR5;
739     return p;
740     }
741 nigel 77
742 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
743     Also, max must not be less than min. */
744    
745 nigel 77 if (*p == '}') max = min; else
746     {
747     if (*(++p) != '}')
748     {
749     max = 0;
750     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
751 nigel 81 if (max < 0 || max > 65535)
752     {
753     *errorcodeptr = ERR5;
754     return p;
755     }
756 nigel 77 if (max < min)
757     {
758     *errorcodeptr = ERR4;
759     return p;
760     }
761     }
762     }
763    
764 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
765     '}'. */
766 nigel 77
767 nigel 81 *minp = min;
768     *maxp = max;
769 nigel 77 return p;
770     }
771    
772    
773    
774     /*************************************************
775 nigel 91 * Find forward referenced named subpattern *
776     *************************************************/
777    
778     /* This function scans along a pattern looking for capturing subpatterns, and
779     counting them. If it finds a named pattern that matches the name it is given,
780     it returns its number. This is used for forward references to named
781     subpatterns. We know that if (?P< is encountered, the name will be terminated
782     by '>' because that is checked in the first pass.
783    
784     Arguments:
785     pointer current position in the pattern
786     count current count of capturing parens
787     name name to seek
788     namelen name length
789    
790     Returns: the number of the named subpattern, or -1 if not found
791     */
792    
793     static int
794     find_named_parens(const uschar *ptr, int count, const uschar *name, int namelen)
795     {
796     const uschar *thisname;
797     for (; *ptr != 0; ptr++)
798     {
799     if (*ptr == '\\' && ptr[1] != 0) { ptr++; continue; }
800     if (*ptr != '(') continue;
801     if (ptr[1] != '?') { count++; continue; }
802     if (ptr[2] == '(') { ptr += 2; continue; }
803     if (ptr[2] != 'P' || ptr[3] != '<') continue;
804     count++;
805     ptr += 4;
806     thisname = ptr;
807     while (*ptr != '>') ptr++;
808     if (namelen == ptr - thisname && strncmp(name, thisname, namelen) == 0)
809     return count;
810     }
811     return -1;
812     }
813    
814    
815    
816     /*************************************************
817 nigel 77 * Find first significant op code *
818     *************************************************/
819    
820     /* This is called by several functions that scan a compiled expression looking
821     for a fixed first character, or an anchoring op code etc. It skips over things
822     that do not influence this. For some calls, a change of option is important.
823     For some calls, it makes sense to skip negative forward and all backward
824     assertions, and also the \b assertion; for others it does not.
825    
826     Arguments:
827     code pointer to the start of the group
828     options pointer to external options
829     optbit the option bit whose changing is significant, or
830     zero if none are
831     skipassert TRUE if certain assertions are to be skipped
832    
833     Returns: pointer to the first significant opcode
834     */
835    
836     static const uschar*
837     first_significant_code(const uschar *code, int *options, int optbit,
838     BOOL skipassert)
839     {
840     for (;;)
841     {
842     switch ((int)*code)
843     {
844     case OP_OPT:
845     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
846     *options = (int)code[1];
847     code += 2;
848     break;
849    
850     case OP_ASSERT_NOT:
851     case OP_ASSERTBACK:
852     case OP_ASSERTBACK_NOT:
853     if (!skipassert) return code;
854     do code += GET(code, 1); while (*code == OP_ALT);
855     code += _pcre_OP_lengths[*code];
856     break;
857    
858     case OP_WORD_BOUNDARY:
859     case OP_NOT_WORD_BOUNDARY:
860     if (!skipassert) return code;
861     /* Fall through */
862    
863     case OP_CALLOUT:
864     case OP_CREF:
865     case OP_BRANUMBER:
866     code += _pcre_OP_lengths[*code];
867     break;
868    
869     default:
870     return code;
871     }
872     }
873     /* Control never reaches here */
874     }
875    
876    
877    
878    
879     /*************************************************
880     * Find the fixed length of a pattern *
881     *************************************************/
882    
883     /* Scan a pattern and compute the fixed length of subject that will match it,
884     if the length is fixed. This is needed for dealing with backward assertions.
885     In UTF8 mode, the result is in characters rather than bytes.
886    
887     Arguments:
888     code points to the start of the pattern (the bracket)
889     options the compiling options
890    
891     Returns: the fixed length, or -1 if there is no fixed length,
892     or -2 if \C was encountered
893     */
894    
895     static int
896     find_fixedlength(uschar *code, int options)
897     {
898     int length = -1;
899    
900     register int branchlength = 0;
901     register uschar *cc = code + 1 + LINK_SIZE;
902    
903     /* Scan along the opcodes for this branch. If we get to the end of the
904     branch, check the length against that of the other branches. */
905    
906     for (;;)
907     {
908     int d;
909     register int op = *cc;
910     if (op >= OP_BRA) op = OP_BRA;
911    
912     switch (op)
913     {
914     case OP_BRA:
915     case OP_ONCE:
916     case OP_COND:
917     d = find_fixedlength(cc, options);
918     if (d < 0) return d;
919     branchlength += d;
920     do cc += GET(cc, 1); while (*cc == OP_ALT);
921     cc += 1 + LINK_SIZE;
922     break;
923    
924     /* Reached end of a branch; if it's a ket it is the end of a nested
925     call. If it's ALT it is an alternation in a nested call. If it is
926     END it's the end of the outer call. All can be handled by the same code. */
927    
928     case OP_ALT:
929     case OP_KET:
930     case OP_KETRMAX:
931     case OP_KETRMIN:
932     case OP_END:
933     if (length < 0) length = branchlength;
934     else if (length != branchlength) return -1;
935     if (*cc != OP_ALT) return length;
936     cc += 1 + LINK_SIZE;
937     branchlength = 0;
938     break;
939    
940     /* Skip over assertive subpatterns */
941    
942     case OP_ASSERT:
943     case OP_ASSERT_NOT:
944     case OP_ASSERTBACK:
945     case OP_ASSERTBACK_NOT:
946     do cc += GET(cc, 1); while (*cc == OP_ALT);
947     /* Fall through */
948    
949     /* Skip over things that don't match chars */
950    
951     case OP_REVERSE:
952     case OP_BRANUMBER:
953     case OP_CREF:
954     case OP_OPT:
955     case OP_CALLOUT:
956     case OP_SOD:
957     case OP_SOM:
958     case OP_EOD:
959     case OP_EODN:
960     case OP_CIRC:
961     case OP_DOLL:
962     case OP_NOT_WORD_BOUNDARY:
963     case OP_WORD_BOUNDARY:
964     cc += _pcre_OP_lengths[*cc];
965     break;
966    
967     /* Handle literal characters */
968    
969     case OP_CHAR:
970     case OP_CHARNC:
971 nigel 91 case OP_NOT:
972 nigel 77 branchlength++;
973     cc += 2;
974     #ifdef SUPPORT_UTF8
975     if ((options & PCRE_UTF8) != 0)
976     {
977     while ((*cc & 0xc0) == 0x80) cc++;
978     }
979     #endif
980     break;
981    
982     /* Handle exact repetitions. The count is already in characters, but we
983     need to skip over a multibyte character in UTF8 mode. */
984    
985     case OP_EXACT:
986     branchlength += GET2(cc,1);
987     cc += 4;
988     #ifdef SUPPORT_UTF8
989     if ((options & PCRE_UTF8) != 0)
990     {
991     while((*cc & 0x80) == 0x80) cc++;
992     }
993     #endif
994     break;
995    
996     case OP_TYPEEXACT:
997     branchlength += GET2(cc,1);
998     cc += 4;
999     break;
1000    
1001     /* Handle single-char matchers */
1002    
1003     case OP_PROP:
1004     case OP_NOTPROP:
1005 nigel 87 cc += 2;
1006 nigel 77 /* Fall through */
1007    
1008     case OP_NOT_DIGIT:
1009     case OP_DIGIT:
1010     case OP_NOT_WHITESPACE:
1011     case OP_WHITESPACE:
1012     case OP_NOT_WORDCHAR:
1013     case OP_WORDCHAR:
1014     case OP_ANY:
1015     branchlength++;
1016     cc++;
1017     break;
1018    
1019     /* The single-byte matcher isn't allowed */
1020    
1021     case OP_ANYBYTE:
1022     return -2;
1023    
1024     /* Check a class for variable quantification */
1025    
1026     #ifdef SUPPORT_UTF8
1027     case OP_XCLASS:
1028     cc += GET(cc, 1) - 33;
1029     /* Fall through */
1030     #endif
1031    
1032     case OP_CLASS:
1033     case OP_NCLASS:
1034     cc += 33;
1035    
1036     switch (*cc)
1037     {
1038     case OP_CRSTAR:
1039     case OP_CRMINSTAR:
1040     case OP_CRQUERY:
1041     case OP_CRMINQUERY:
1042     return -1;
1043    
1044     case OP_CRRANGE:
1045     case OP_CRMINRANGE:
1046     if (GET2(cc,1) != GET2(cc,3)) return -1;
1047     branchlength += GET2(cc,1);
1048     cc += 5;
1049     break;
1050    
1051     default:
1052     branchlength++;
1053     }
1054     break;
1055    
1056     /* Anything else is variable length */
1057    
1058     default:
1059     return -1;
1060     }
1061     }
1062     /* Control never gets here */
1063     }
1064    
1065    
1066    
1067    
1068     /*************************************************
1069     * Scan compiled regex for numbered bracket *
1070     *************************************************/
1071    
1072     /* This little function scans through a compiled pattern until it finds a
1073     capturing bracket with the given number.
1074    
1075     Arguments:
1076     code points to start of expression
1077     utf8 TRUE in UTF-8 mode
1078     number the required bracket number
1079    
1080     Returns: pointer to the opcode for the bracket, or NULL if not found
1081     */
1082    
1083     static const uschar *
1084     find_bracket(const uschar *code, BOOL utf8, int number)
1085     {
1086     for (;;)
1087     {
1088     register int c = *code;
1089     if (c == OP_END) return NULL;
1090 nigel 91
1091     /* XCLASS is used for classes that cannot be represented just by a bit
1092     map. This includes negated single high-valued characters. The length in
1093     the table is zero; the actual length is stored in the compiled code. */
1094    
1095     if (c == OP_XCLASS) code += GET(code, 1);
1096    
1097     /* Handle bracketed group */
1098    
1099 nigel 77 else if (c > OP_BRA)
1100     {
1101     int n = c - OP_BRA;
1102     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1103     if (n == number) return (uschar *)code;
1104     code += _pcre_OP_lengths[OP_BRA];
1105     }
1106 nigel 91
1107     /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1108     that are followed by a character may be followed by a multi-byte character.
1109     The length in the table is a minimum, so we have to scan along to skip the
1110     extra bytes. All opcodes are less than 128, so we can use relatively
1111     efficient code. */
1112    
1113 nigel 77 else
1114     {
1115     code += _pcre_OP_lengths[c];
1116     if (utf8) switch(c)
1117     {
1118     case OP_CHAR:
1119     case OP_CHARNC:
1120     case OP_EXACT:
1121     case OP_UPTO:
1122     case OP_MINUPTO:
1123     case OP_STAR:
1124     case OP_MINSTAR:
1125     case OP_PLUS:
1126     case OP_MINPLUS:
1127     case OP_QUERY:
1128     case OP_MINQUERY:
1129     while ((*code & 0xc0) == 0x80) code++;
1130     break;
1131     }
1132     }
1133     }
1134     }
1135    
1136    
1137    
1138     /*************************************************
1139     * Scan compiled regex for recursion reference *
1140     *************************************************/
1141    
1142     /* This little function scans through a compiled pattern until it finds an
1143     instance of OP_RECURSE.
1144    
1145     Arguments:
1146     code points to start of expression
1147     utf8 TRUE in UTF-8 mode
1148    
1149     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1150     */
1151    
1152     static const uschar *
1153     find_recurse(const uschar *code, BOOL utf8)
1154     {
1155     for (;;)
1156     {
1157     register int c = *code;
1158     if (c == OP_END) return NULL;
1159 nigel 91 if (c == OP_RECURSE) return code;
1160    
1161     /* XCLASS is used for classes that cannot be represented just by a bit
1162     map. This includes negated single high-valued characters. The length in
1163     the table is zero; the actual length is stored in the compiled code. */
1164    
1165     if (c == OP_XCLASS) code += GET(code, 1);
1166    
1167     /* All bracketed groups have the same length. */
1168    
1169 nigel 77 else if (c > OP_BRA)
1170     {
1171     code += _pcre_OP_lengths[OP_BRA];
1172     }
1173 nigel 91
1174     /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1175     that are followed by a character may be followed by a multi-byte character.
1176     The length in the table is a minimum, so we have to scan along to skip the
1177     extra bytes. All opcodes are less than 128, so we can use relatively
1178     efficient code. */
1179    
1180 nigel 77 else
1181     {
1182     code += _pcre_OP_lengths[c];
1183     if (utf8) switch(c)
1184     {
1185     case OP_CHAR:
1186     case OP_CHARNC:
1187     case OP_EXACT:
1188     case OP_UPTO:
1189     case OP_MINUPTO:
1190     case OP_STAR:
1191     case OP_MINSTAR:
1192     case OP_PLUS:
1193     case OP_MINPLUS:
1194     case OP_QUERY:
1195     case OP_MINQUERY:
1196     while ((*code & 0xc0) == 0x80) code++;
1197     break;
1198     }
1199     }
1200     }
1201     }
1202    
1203    
1204    
1205     /*************************************************
1206     * Scan compiled branch for non-emptiness *
1207     *************************************************/
1208    
1209     /* This function scans through a branch of a compiled pattern to see whether it
1210     can match the empty string or not. It is called only from could_be_empty()
1211     below. Note that first_significant_code() skips over assertions. If we hit an
1212     unclosed bracket, we return "empty" - this means we've struck an inner bracket
1213     whose current branch will already have been scanned.
1214    
1215     Arguments:
1216     code points to start of search
1217     endcode points to where to stop
1218     utf8 TRUE if in UTF8 mode
1219    
1220     Returns: TRUE if what is matched could be empty
1221     */
1222    
1223     static BOOL
1224     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1225     {
1226     register int c;
1227     for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);
1228     code < endcode;
1229     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1230     {
1231     const uschar *ccode;
1232    
1233     c = *code;
1234    
1235     if (c >= OP_BRA)
1236     {
1237     BOOL empty_branch;
1238     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1239    
1240     /* Scan a closed bracket */
1241    
1242     empty_branch = FALSE;
1243     do
1244     {
1245     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1246     empty_branch = TRUE;
1247     code += GET(code, 1);
1248     }
1249     while (*code == OP_ALT);
1250     if (!empty_branch) return FALSE; /* All branches are non-empty */
1251     code += 1 + LINK_SIZE;
1252     c = *code;
1253     }
1254    
1255     else switch (c)
1256     {
1257     /* Check for quantifiers after a class */
1258    
1259     #ifdef SUPPORT_UTF8
1260     case OP_XCLASS:
1261     ccode = code + GET(code, 1);
1262     goto CHECK_CLASS_REPEAT;
1263     #endif
1264    
1265     case OP_CLASS:
1266     case OP_NCLASS:
1267     ccode = code + 33;
1268    
1269     #ifdef SUPPORT_UTF8
1270     CHECK_CLASS_REPEAT:
1271     #endif
1272    
1273     switch (*ccode)
1274     {
1275     case OP_CRSTAR: /* These could be empty; continue */
1276     case OP_CRMINSTAR:
1277     case OP_CRQUERY:
1278     case OP_CRMINQUERY:
1279     break;
1280    
1281     default: /* Non-repeat => class must match */
1282     case OP_CRPLUS: /* These repeats aren't empty */
1283     case OP_CRMINPLUS:
1284     return FALSE;
1285    
1286     case OP_CRRANGE:
1287     case OP_CRMINRANGE:
1288     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1289     break;
1290     }
1291     break;
1292    
1293     /* Opcodes that must match a character */
1294    
1295     case OP_PROP:
1296     case OP_NOTPROP:
1297     case OP_EXTUNI:
1298     case OP_NOT_DIGIT:
1299     case OP_DIGIT:
1300     case OP_NOT_WHITESPACE:
1301     case OP_WHITESPACE:
1302     case OP_NOT_WORDCHAR:
1303     case OP_WORDCHAR:
1304     case OP_ANY:
1305     case OP_ANYBYTE:
1306     case OP_CHAR:
1307     case OP_CHARNC:
1308     case OP_NOT:
1309     case OP_PLUS:
1310     case OP_MINPLUS:
1311     case OP_EXACT:
1312     case OP_NOTPLUS:
1313     case OP_NOTMINPLUS:
1314     case OP_NOTEXACT:
1315     case OP_TYPEPLUS:
1316     case OP_TYPEMINPLUS:
1317     case OP_TYPEEXACT:
1318     return FALSE;
1319    
1320     /* End of branch */
1321    
1322     case OP_KET:
1323     case OP_KETRMAX:
1324     case OP_KETRMIN:
1325     case OP_ALT:
1326     return TRUE;
1327    
1328     /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be
1329     followed by a multibyte character */
1330    
1331     #ifdef SUPPORT_UTF8
1332     case OP_STAR:
1333     case OP_MINSTAR:
1334     case OP_QUERY:
1335     case OP_MINQUERY:
1336     case OP_UPTO:
1337     case OP_MINUPTO:
1338     if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1339     break;
1340     #endif
1341     }
1342     }
1343    
1344     return TRUE;
1345     }
1346    
1347    
1348    
1349     /*************************************************
1350     * Scan compiled regex for non-emptiness *
1351     *************************************************/
1352    
1353     /* This function is called to check for left recursive calls. We want to check
1354     the current branch of the current pattern to see if it could match the empty
1355     string. If it could, we must look outwards for branches at other levels,
1356     stopping when we pass beyond the bracket which is the subject of the recursion.
1357    
1358     Arguments:
1359     code points to start of the recursion
1360     endcode points to where to stop (current RECURSE item)
1361     bcptr points to the chain of current (unclosed) branch starts
1362     utf8 TRUE if in UTF-8 mode
1363    
1364     Returns: TRUE if what is matched could be empty
1365     */
1366    
1367     static BOOL
1368     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1369     BOOL utf8)
1370     {
1371     while (bcptr != NULL && bcptr->current >= code)
1372     {
1373     if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1374     bcptr = bcptr->outer;
1375     }
1376     return TRUE;
1377     }
1378    
1379    
1380    
1381     /*************************************************
1382     * Check for POSIX class syntax *
1383     *************************************************/
1384    
1385     /* This function is called when the sequence "[:" or "[." or "[=" is
1386     encountered in a character class. It checks whether this is followed by an
1387     optional ^ and then a sequence of letters, terminated by a matching ":]" or
1388     ".]" or "=]".
1389    
1390     Argument:
1391     ptr pointer to the initial [
1392     endptr where to return the end pointer
1393     cd pointer to compile data
1394    
1395     Returns: TRUE or FALSE
1396     */
1397    
1398     static BOOL
1399     check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1400     {
1401     int terminator; /* Don't combine these lines; the Solaris cc */
1402     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1403     if (*(++ptr) == '^') ptr++;
1404     while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1405     if (*ptr == terminator && ptr[1] == ']')
1406     {
1407     *endptr = ptr;
1408     return TRUE;
1409     }
1410     return FALSE;
1411     }
1412    
1413    
1414    
1415    
1416     /*************************************************
1417     * Check POSIX class name *
1418     *************************************************/
1419    
1420     /* This function is called to check the name given in a POSIX-style class entry
1421     such as [:alnum:].
1422    
1423     Arguments:
1424     ptr points to the first letter
1425     len the length of the name
1426    
1427     Returns: a value representing the name, or -1 if unknown
1428     */
1429    
1430     static int
1431     check_posix_name(const uschar *ptr, int len)
1432     {
1433     register int yield = 0;
1434     while (posix_name_lengths[yield] != 0)
1435     {
1436     if (len == posix_name_lengths[yield] &&
1437     strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1438     yield++;
1439     }
1440     return -1;
1441     }
1442    
1443    
1444     /*************************************************
1445     * Adjust OP_RECURSE items in repeated group *
1446     *************************************************/
1447    
1448     /* OP_RECURSE items contain an offset from the start of the regex to the group
1449     that is referenced. This means that groups can be replicated for fixed
1450     repetition simply by copying (because the recursion is allowed to refer to
1451     earlier groups that are outside the current group). However, when a group is
1452     optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1453     it, after it has been compiled. This means that any OP_RECURSE items within it
1454     that refer to the group itself or any contained groups have to have their
1455     offsets adjusted. That is the job of this function. Before it is called, the
1456     partially compiled regex must be temporarily terminated with OP_END.
1457    
1458     Arguments:
1459     group points to the start of the group
1460     adjust the amount by which the group is to be moved
1461     utf8 TRUE in UTF-8 mode
1462     cd contains pointers to tables etc.
1463    
1464     Returns: nothing
1465     */
1466    
1467     static void
1468     adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)
1469     {
1470     uschar *ptr = group;
1471     while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1472     {
1473     int offset = GET(ptr, 1);
1474     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1475     ptr += 1 + LINK_SIZE;
1476     }
1477     }
1478    
1479    
1480    
1481     /*************************************************
1482     * Insert an automatic callout point *
1483     *************************************************/
1484    
1485     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1486     callout points before each pattern item.
1487    
1488     Arguments:
1489     code current code pointer
1490     ptr current pattern pointer
1491     cd pointers to tables etc
1492    
1493     Returns: new code pointer
1494     */
1495    
1496     static uschar *
1497     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1498     {
1499     *code++ = OP_CALLOUT;
1500     *code++ = 255;
1501     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1502     PUT(code, LINK_SIZE, 0); /* Default length */
1503     return code + 2*LINK_SIZE;
1504     }
1505    
1506    
1507    
1508     /*************************************************
1509     * Complete a callout item *
1510     *************************************************/
1511    
1512     /* A callout item contains the length of the next item in the pattern, which
1513     we can't fill in till after we have reached the relevant point. This is used
1514     for both automatic and manual callouts.
1515    
1516     Arguments:
1517     previous_callout points to previous callout item
1518     ptr current pattern pointer
1519     cd pointers to tables etc
1520    
1521     Returns: nothing
1522     */
1523    
1524     static void
1525     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1526     {
1527     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1528     PUT(previous_callout, 2 + LINK_SIZE, length);
1529     }
1530    
1531    
1532    
1533     #ifdef SUPPORT_UCP
1534     /*************************************************
1535     * Get othercase range *
1536     *************************************************/
1537    
1538     /* This function is passed the start and end of a class range, in UTF-8 mode
1539     with UCP support. It searches up the characters, looking for internal ranges of
1540     characters in the "other" case. Each call returns the next one, updating the
1541     start address.
1542    
1543     Arguments:
1544     cptr points to starting character value; updated
1545     d end value
1546     ocptr where to put start of othercase range
1547     odptr where to put end of othercase range
1548    
1549     Yield: TRUE when range returned; FALSE when no more
1550     */
1551    
1552     static BOOL
1553     get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)
1554     {
1555 nigel 87 int c, othercase, next;
1556 nigel 77
1557     for (c = *cptr; c <= d; c++)
1558 nigel 87 { if ((othercase = _pcre_ucp_othercase(c)) >= 0) break; }
1559 nigel 77
1560     if (c > d) return FALSE;
1561    
1562     *ocptr = othercase;
1563     next = othercase + 1;
1564    
1565     for (++c; c <= d; c++)
1566     {
1567 nigel 87 if (_pcre_ucp_othercase(c) != next) break;
1568 nigel 77 next++;
1569     }
1570    
1571     *odptr = next - 1;
1572     *cptr = c;
1573    
1574     return TRUE;
1575     }
1576     #endif /* SUPPORT_UCP */
1577    
1578    
1579     /*************************************************
1580     * Compile one branch *
1581     *************************************************/
1582    
1583     /* Scan the pattern, compiling it into the code vector. If the options are
1584     changed during the branch, the pointer is used to change the external options
1585     bits.
1586    
1587     Arguments:
1588     optionsptr pointer to the option bits
1589     brackets points to number of extracting brackets used
1590     codeptr points to the pointer to the current code point
1591     ptrptr points to the current pattern pointer
1592     errorcodeptr points to error code variable
1593     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
1594     reqbyteptr set to the last literal character required, else < 0
1595     bcptr points to current branch chain
1596     cd contains pointers to tables etc.
1597    
1598     Returns: TRUE on success
1599     FALSE, with *errorcodeptr set non-zero on error
1600     */
1601    
1602     static BOOL
1603     compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
1604     const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,
1605     int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
1606     {
1607     int repeat_type, op_type;
1608     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
1609     int bravalue = 0;
1610     int greedy_default, greedy_non_default;
1611     int firstbyte, reqbyte;
1612     int zeroreqbyte, zerofirstbyte;
1613     int req_caseopt, reqvary, tempreqvary;
1614     int options = *optionsptr;
1615     int after_manual_callout = 0;
1616     register int c;
1617     register uschar *code = *codeptr;
1618     uschar *tempcode;
1619     BOOL inescq = FALSE;
1620     BOOL groupsetfirstbyte = FALSE;
1621     const uschar *ptr = *ptrptr;
1622     const uschar *tempptr;
1623     uschar *previous = NULL;
1624     uschar *previous_callout = NULL;
1625     uschar classbits[32];
1626    
1627     #ifdef SUPPORT_UTF8
1628     BOOL class_utf8;
1629     BOOL utf8 = (options & PCRE_UTF8) != 0;
1630     uschar *class_utf8data;
1631     uschar utf8_char[6];
1632     #else
1633     BOOL utf8 = FALSE;
1634     #endif
1635    
1636     /* Set up the default and non-default settings for greediness */
1637    
1638     greedy_default = ((options & PCRE_UNGREEDY) != 0);
1639     greedy_non_default = greedy_default ^ 1;
1640    
1641     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
1642     matching encountered yet". It gets changed to REQ_NONE if we hit something that
1643     matches a non-fixed char first char; reqbyte just remains unset if we never
1644     find one.
1645    
1646     When we hit a repeat whose minimum is zero, we may have to adjust these values
1647     to take the zero repeat into account. This is implemented by setting them to
1648     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
1649     item types that can be repeated set these backoff variables appropriately. */
1650    
1651     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
1652    
1653     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
1654     according to the current setting of the caseless flag. REQ_CASELESS is a bit
1655     value > 255. It is added into the firstbyte or reqbyte variables to record the
1656     case status of the value. This is used only for ASCII characters. */
1657    
1658     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
1659    
1660     /* Switch on next character until the end of the branch */
1661    
1662     for (;; ptr++)
1663     {
1664     BOOL negate_class;
1665     BOOL possessive_quantifier;
1666     BOOL is_quantifier;
1667     int class_charcount;
1668     int class_lastchar;
1669     int newoptions;
1670     int recno;
1671     int skipbytes;
1672     int subreqbyte;
1673     int subfirstbyte;
1674     int mclength;
1675     uschar mcbuffer[8];
1676    
1677     /* Next byte in the pattern */
1678    
1679     c = *ptr;
1680    
1681     /* If in \Q...\E, check for the end; if not, we have a literal */
1682    
1683     if (inescq && c != 0)
1684     {
1685     if (c == '\\' && ptr[1] == 'E')
1686     {
1687     inescq = FALSE;
1688     ptr++;
1689     continue;
1690     }
1691     else
1692     {
1693     if (previous_callout != NULL)
1694     {
1695     complete_callout(previous_callout, ptr, cd);
1696     previous_callout = NULL;
1697     }
1698     if ((options & PCRE_AUTO_CALLOUT) != 0)
1699     {
1700     previous_callout = code;
1701     code = auto_callout(code, ptr, cd);
1702     }
1703     goto NORMAL_CHAR;
1704     }
1705     }
1706    
1707     /* Fill in length of a previous callout, except when the next thing is
1708     a quantifier. */
1709    
1710     is_quantifier = c == '*' || c == '+' || c == '?' ||
1711     (c == '{' && is_counted_repeat(ptr+1));
1712    
1713     if (!is_quantifier && previous_callout != NULL &&
1714     after_manual_callout-- <= 0)
1715     {
1716     complete_callout(previous_callout, ptr, cd);
1717     previous_callout = NULL;
1718     }
1719    
1720     /* In extended mode, skip white space and comments */
1721    
1722     if ((options & PCRE_EXTENDED) != 0)
1723     {
1724     if ((cd->ctypes[c] & ctype_space) != 0) continue;
1725     if (c == '#')
1726     {
1727 nigel 91 while (*(++ptr) != 0) if (IS_NEWLINE(ptr)) break;
1728     if (*ptr != 0)
1729     {
1730     ptr += cd->nllen - 1;
1731     continue;
1732     }
1733     /* Else fall through to handle end of string */
1734     c = 0;
1735 nigel 77 }
1736     }
1737    
1738     /* No auto callout for quantifiers. */
1739    
1740     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
1741     {
1742     previous_callout = code;
1743     code = auto_callout(code, ptr, cd);
1744     }
1745    
1746     switch(c)
1747     {
1748     /* The branch terminates at end of string, |, or ). */
1749    
1750     case 0:
1751     case '|':
1752     case ')':
1753     *firstbyteptr = firstbyte;
1754     *reqbyteptr = reqbyte;
1755     *codeptr = code;
1756     *ptrptr = ptr;
1757     return TRUE;
1758    
1759     /* Handle single-character metacharacters. In multiline mode, ^ disables
1760     the setting of any following char as a first character. */
1761    
1762     case '^':
1763     if ((options & PCRE_MULTILINE) != 0)
1764     {
1765     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1766     }
1767     previous = NULL;
1768     *code++ = OP_CIRC;
1769     break;
1770    
1771     case '$':
1772     previous = NULL;
1773     *code++ = OP_DOLL;
1774     break;
1775    
1776     /* There can never be a first char if '.' is first, whatever happens about
1777     repeats. The value of reqbyte doesn't change either. */
1778    
1779     case '.':
1780     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1781     zerofirstbyte = firstbyte;
1782     zeroreqbyte = reqbyte;
1783     previous = code;
1784     *code++ = OP_ANY;
1785     break;
1786    
1787 nigel 87 /* Character classes. If the included characters are all < 256, we build a
1788     32-byte bitmap of the permitted characters, except in the special case
1789     where there is only one such character. For negated classes, we build the
1790     map as usual, then invert it at the end. However, we use a different opcode
1791     so that data characters > 255 can be handled correctly.
1792 nigel 77
1793     If the class contains characters outside the 0-255 range, a different
1794     opcode is compiled. It may optionally have a bit map for characters < 256,
1795     but those above are explicitly listed afterwards. A flag byte tells
1796     whether the bitmap is present, and whether this is a negated class or not.
1797     */
1798    
1799     case '[':
1800     previous = code;
1801    
1802     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
1803     they are encountered at the top level, so we'll do that too. */
1804    
1805     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1806     check_posix_syntax(ptr, &tempptr, cd))
1807     {
1808     *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
1809     goto FAILED;
1810     }
1811    
1812     /* If the first character is '^', set the negation flag and skip it. */
1813    
1814     if ((c = *(++ptr)) == '^')
1815     {
1816     negate_class = TRUE;
1817     c = *(++ptr);
1818     }
1819     else
1820     {
1821     negate_class = FALSE;
1822     }
1823    
1824     /* Keep a count of chars with values < 256 so that we can optimize the case
1825     of just a single character (as long as it's < 256). For higher valued UTF-8
1826     characters, we don't yet do any optimization. */
1827    
1828     class_charcount = 0;
1829     class_lastchar = -1;
1830    
1831     #ifdef SUPPORT_UTF8
1832     class_utf8 = FALSE; /* No chars >= 256 */
1833     class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
1834     #endif
1835    
1836     /* Initialize the 32-char bit map to all zeros. We have to build the
1837     map in a temporary bit of store, in case the class contains only 1
1838     character (< 256), because in that case the compiled code doesn't use the
1839     bit map. */
1840    
1841     memset(classbits, 0, 32 * sizeof(uschar));
1842    
1843     /* Process characters until ] is reached. By writing this as a "do" it
1844     means that an initial ] is taken as a data character. The first pass
1845     through the regex checked the overall syntax, so we don't need to be very
1846     strict here. At the start of the loop, c contains the first byte of the
1847     character. */
1848    
1849     do
1850     {
1851     #ifdef SUPPORT_UTF8
1852     if (utf8 && c > 127)
1853     { /* Braces are required because the */
1854     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
1855     }
1856     #endif
1857    
1858     /* Inside \Q...\E everything is literal except \E */
1859    
1860     if (inescq)
1861     {
1862     if (c == '\\' && ptr[1] == 'E')
1863     {
1864     inescq = FALSE;
1865     ptr++;
1866     continue;
1867     }
1868     else goto LONE_SINGLE_CHARACTER;
1869     }
1870    
1871     /* Handle POSIX class names. Perl allows a negation extension of the
1872     form [:^name:]. A square bracket that doesn't match the syntax is
1873     treated as a literal. We also recognize the POSIX constructions
1874     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1875     5.6 and 5.8 do. */
1876    
1877     if (c == '[' &&
1878     (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1879     check_posix_syntax(ptr, &tempptr, cd))
1880     {
1881     BOOL local_negate = FALSE;
1882 nigel 87 int posix_class, taboffset, tabopt;
1883 nigel 77 register const uschar *cbits = cd->cbits;
1884 nigel 87 uschar pbits[32];
1885 nigel 77
1886     if (ptr[1] != ':')
1887     {
1888     *errorcodeptr = ERR31;
1889     goto FAILED;
1890     }
1891    
1892     ptr += 2;
1893     if (*ptr == '^')
1894     {
1895     local_negate = TRUE;
1896     ptr++;
1897     }
1898    
1899     posix_class = check_posix_name(ptr, tempptr - ptr);
1900     if (posix_class < 0)
1901     {
1902     *errorcodeptr = ERR30;
1903     goto FAILED;
1904     }
1905    
1906     /* If matching is caseless, upper and lower are converted to
1907     alpha. This relies on the fact that the class table starts with
1908     alpha, lower, upper as the first 3 entries. */
1909    
1910     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1911     posix_class = 0;
1912    
1913 nigel 87 /* We build the bit map for the POSIX class in a chunk of local store
1914     because we may be adding and subtracting from it, and we don't want to
1915     subtract bits that may be in the main map already. At the end we or the
1916     result into the bit map that is being built. */
1917 nigel 77
1918     posix_class *= 3;
1919 nigel 87
1920     /* Copy in the first table (always present) */
1921    
1922     memcpy(pbits, cbits + posix_class_maps[posix_class],
1923     32 * sizeof(uschar));
1924    
1925     /* If there is a second table, add or remove it as required. */
1926    
1927     taboffset = posix_class_maps[posix_class + 1];
1928     tabopt = posix_class_maps[posix_class + 2];
1929    
1930     if (taboffset >= 0)
1931 nigel 77 {
1932 nigel 87 if (tabopt >= 0)
1933     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
1934 nigel 77 else
1935 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
1936 nigel 77 }
1937    
1938 nigel 87 /* Now see if we need to remove any special characters. An option
1939     value of 1 removes vertical space and 2 removes underscore. */
1940    
1941     if (tabopt < 0) tabopt = -tabopt;
1942     if (tabopt == 1) pbits[1] &= ~0x3c;
1943     else if (tabopt == 2) pbits[11] &= 0x7f;
1944    
1945     /* Add the POSIX table or its complement into the main table that is
1946     being built and we are done. */
1947    
1948     if (local_negate)
1949     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
1950     else
1951     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
1952    
1953 nigel 77 ptr = tempptr + 1;
1954     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
1955     continue; /* End of POSIX syntax handling */
1956     }
1957    
1958     /* Backslash may introduce a single character, or it may introduce one
1959     of the specials, which just set a flag. Escaped items are checked for
1960     validity in the pre-compiling pass. The sequence \b is a special case.
1961     Inside a class (and only there) it is treated as backspace. Elsewhere
1962     it marks a word boundary. Other escapes have preset maps ready to
1963     or into the one we are building. We assume they have more than one
1964     character in them, so set class_charcount bigger than one. */
1965    
1966     if (c == '\\')
1967     {
1968     c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);
1969    
1970     if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
1971     else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
1972     else if (-c == ESC_Q) /* Handle start of quoted string */
1973     {
1974     if (ptr[1] == '\\' && ptr[2] == 'E')
1975     {
1976     ptr += 2; /* avoid empty string */
1977     }
1978     else inescq = TRUE;
1979     continue;
1980     }
1981    
1982     if (c < 0)
1983     {
1984     register const uschar *cbits = cd->cbits;
1985     class_charcount += 2; /* Greater than 1 is what matters */
1986     switch (-c)
1987     {
1988     case ESC_d:
1989     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
1990     continue;
1991    
1992     case ESC_D:
1993     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
1994     continue;
1995    
1996     case ESC_w:
1997     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
1998     continue;
1999    
2000     case ESC_W:
2001     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2002     continue;
2003    
2004     case ESC_s:
2005     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2006     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2007     continue;
2008    
2009     case ESC_S:
2010     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2011     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2012     continue;
2013    
2014     #ifdef SUPPORT_UCP
2015     case ESC_p:
2016     case ESC_P:
2017     {
2018     BOOL negated;
2019 nigel 87 int pdata;
2020     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2021     if (ptype < 0) goto FAILED;
2022 nigel 77 class_utf8 = TRUE;
2023     *class_utf8data++ = ((-c == ESC_p) != negated)?
2024     XCL_PROP : XCL_NOTPROP;
2025 nigel 87 *class_utf8data++ = ptype;
2026     *class_utf8data++ = pdata;
2027 nigel 77 class_charcount -= 2; /* Not a < 256 character */
2028     }
2029     continue;
2030     #endif
2031    
2032     /* Unrecognized escapes are faulted if PCRE is running in its
2033     strict mode. By default, for compatibility with Perl, they are
2034     treated as literals. */
2035    
2036     default:
2037     if ((options & PCRE_EXTRA) != 0)
2038     {
2039     *errorcodeptr = ERR7;
2040     goto FAILED;
2041     }
2042     c = *ptr; /* The final character */
2043     class_charcount -= 2; /* Undo the default count from above */
2044     }
2045     }
2046    
2047     /* Fall through if we have a single character (c >= 0). This may be
2048     > 256 in UTF-8 mode. */
2049    
2050     } /* End of backslash handling */
2051    
2052     /* A single character may be followed by '-' to form a range. However,
2053     Perl does not permit ']' to be the end of the range. A '-' character
2054     here is treated as a literal. */
2055    
2056     if (ptr[1] == '-' && ptr[2] != ']')
2057     {
2058     int d;
2059     ptr += 2;
2060    
2061     #ifdef SUPPORT_UTF8
2062     if (utf8)
2063     { /* Braces are required because the */
2064     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2065     }
2066     else
2067     #endif
2068     d = *ptr; /* Not UTF-8 mode */
2069    
2070     /* The second part of a range can be a single-character escape, but
2071     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2072     in such circumstances. */
2073    
2074     if (d == '\\')
2075     {
2076     const uschar *oldptr = ptr;
2077     d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);
2078    
2079     /* \b is backspace; \X is literal X; any other special means the '-'
2080     was literal */
2081    
2082     if (d < 0)
2083     {
2084     if (d == -ESC_b) d = '\b';
2085     else if (d == -ESC_X) d = 'X'; else
2086     {
2087     ptr = oldptr - 2;
2088     goto LONE_SINGLE_CHARACTER; /* A few lines below */
2089     }
2090     }
2091     }
2092    
2093     /* The check that the two values are in the correct order happens in
2094     the pre-pass. Optimize one-character ranges */
2095    
2096     if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2097    
2098     /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2099     matching, we have to use an XCLASS with extra data items. Caseless
2100     matching for characters > 127 is available only if UCP support is
2101     available. */
2102    
2103     #ifdef SUPPORT_UTF8
2104     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2105     {
2106     class_utf8 = TRUE;
2107    
2108     /* With UCP support, we can find the other case equivalents of
2109     the relevant characters. There may be several ranges. Optimize how
2110     they fit with the basic range. */
2111    
2112     #ifdef SUPPORT_UCP
2113     if ((options & PCRE_CASELESS) != 0)
2114     {
2115     int occ, ocd;
2116     int cc = c;
2117     int origd = d;
2118     while (get_othercase_range(&cc, origd, &occ, &ocd))
2119     {
2120     if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
2121    
2122     if (occ < c && ocd >= c - 1) /* Extend the basic range */
2123     { /* if there is overlap, */
2124     c = occ; /* noting that if occ < c */
2125     continue; /* we can't have ocd > d */
2126     } /* because a subrange is */
2127     if (ocd > d && occ <= d + 1) /* always shorter than */
2128     { /* the basic range. */
2129     d = ocd;
2130     continue;
2131     }
2132    
2133     if (occ == ocd)
2134     {
2135     *class_utf8data++ = XCL_SINGLE;
2136     }
2137     else
2138     {
2139     *class_utf8data++ = XCL_RANGE;
2140     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2141     }
2142     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2143     }
2144     }
2145     #endif /* SUPPORT_UCP */
2146    
2147     /* Now record the original range, possibly modified for UCP caseless
2148     overlapping ranges. */
2149    
2150     *class_utf8data++ = XCL_RANGE;
2151     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2152     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2153    
2154     /* With UCP support, we are done. Without UCP support, there is no
2155     caseless matching for UTF-8 characters > 127; we can use the bit map
2156     for the smaller ones. */
2157    
2158     #ifdef SUPPORT_UCP
2159     continue; /* With next character in the class */
2160     #else
2161     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2162    
2163     /* Adjust upper limit and fall through to set up the map */
2164    
2165     d = 127;
2166    
2167     #endif /* SUPPORT_UCP */
2168     }
2169     #endif /* SUPPORT_UTF8 */
2170    
2171     /* We use the bit map for all cases when not in UTF-8 mode; else
2172     ranges that lie entirely within 0-127 when there is UCP support; else
2173     for partial ranges without UCP support. */
2174    
2175     for (; c <= d; c++)
2176     {
2177     classbits[c/8] |= (1 << (c&7));
2178     if ((options & PCRE_CASELESS) != 0)
2179     {
2180     int uc = cd->fcc[c]; /* flip case */
2181     classbits[uc/8] |= (1 << (uc&7));
2182     }
2183     class_charcount++; /* in case a one-char range */
2184     class_lastchar = c;
2185     }
2186    
2187     continue; /* Go get the next char in the class */
2188     }
2189    
2190     /* Handle a lone single character - we can get here for a normal
2191     non-escape char, or after \ that introduces a single character or for an
2192     apparent range that isn't. */
2193    
2194     LONE_SINGLE_CHARACTER:
2195    
2196     /* Handle a character that cannot go in the bit map */
2197    
2198     #ifdef SUPPORT_UTF8
2199     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2200     {
2201     class_utf8 = TRUE;
2202     *class_utf8data++ = XCL_SINGLE;
2203     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2204    
2205     #ifdef SUPPORT_UCP
2206     if ((options & PCRE_CASELESS) != 0)
2207     {
2208     int othercase;
2209 nigel 87 if ((othercase = _pcre_ucp_othercase(c)) >= 0)
2210 nigel 77 {
2211     *class_utf8data++ = XCL_SINGLE;
2212     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
2213     }
2214     }
2215     #endif /* SUPPORT_UCP */
2216    
2217     }
2218     else
2219     #endif /* SUPPORT_UTF8 */
2220    
2221     /* Handle a single-byte character */
2222     {
2223     classbits[c/8] |= (1 << (c&7));
2224     if ((options & PCRE_CASELESS) != 0)
2225     {
2226     c = cd->fcc[c]; /* flip case */
2227     classbits[c/8] |= (1 << (c&7));
2228     }
2229     class_charcount++;
2230     class_lastchar = c;
2231     }
2232     }
2233    
2234     /* Loop until ']' reached; the check for end of string happens inside the
2235     loop. This "while" is the end of the "do" above. */
2236    
2237     while ((c = *(++ptr)) != ']' || inescq);
2238    
2239     /* If class_charcount is 1, we saw precisely one character whose value is
2240     less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2241     can optimize the negative case only if there were no characters >= 128
2242     because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2243     single-bytes only. This is an historical hangover. Maybe one day we can
2244     tidy these opcodes to handle multi-byte characters.
2245    
2246     The optimization throws away the bit map. We turn the item into a
2247     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2248     that OP_NOT does not support multibyte characters. In the positive case, it
2249     can cause firstbyte to be set. Otherwise, there can be no first char if
2250     this item is first, whatever repeat count may follow. In the case of
2251     reqbyte, save the previous value for reinstating. */
2252    
2253     #ifdef SUPPORT_UTF8
2254     if (class_charcount == 1 &&
2255     (!utf8 ||
2256     (!class_utf8 && (!negate_class || class_lastchar < 128))))
2257    
2258     #else
2259     if (class_charcount == 1)
2260     #endif
2261     {
2262     zeroreqbyte = reqbyte;
2263    
2264     /* The OP_NOT opcode works on one-byte characters only. */
2265    
2266     if (negate_class)
2267     {
2268     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2269     zerofirstbyte = firstbyte;
2270     *code++ = OP_NOT;
2271     *code++ = class_lastchar;
2272     break;
2273     }
2274    
2275     /* For a single, positive character, get the value into mcbuffer, and
2276     then we can handle this with the normal one-character code. */
2277    
2278     #ifdef SUPPORT_UTF8
2279     if (utf8 && class_lastchar > 127)
2280     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
2281     else
2282     #endif
2283     {
2284     mcbuffer[0] = class_lastchar;
2285     mclength = 1;
2286     }
2287     goto ONE_CHAR;
2288     } /* End of 1-char optimization */
2289    
2290     /* The general case - not the one-char optimization. If this is the first
2291     thing in the branch, there can be no first char setting, whatever the
2292     repeat count. Any reqbyte setting must remain unchanged after any kind of
2293     repeat. */
2294    
2295     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2296     zerofirstbyte = firstbyte;
2297     zeroreqbyte = reqbyte;
2298    
2299     /* If there are characters with values > 255, we have to compile an
2300     extended class, with its own opcode. If there are no characters < 256,
2301     we can omit the bitmap. */
2302    
2303     #ifdef SUPPORT_UTF8
2304     if (class_utf8)
2305     {
2306     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2307     *code++ = OP_XCLASS;
2308     code += LINK_SIZE;
2309     *code = negate_class? XCL_NOT : 0;
2310    
2311     /* If the map is required, install it, and move on to the end of
2312     the extra data */
2313    
2314     if (class_charcount > 0)
2315     {
2316     *code++ |= XCL_MAP;
2317     memcpy(code, classbits, 32);
2318     code = class_utf8data;
2319     }
2320    
2321     /* If the map is not required, slide down the extra data. */
2322    
2323     else
2324     {
2325     int len = class_utf8data - (code + 33);
2326     memmove(code + 1, code + 33, len);
2327     code += len + 1;
2328     }
2329    
2330     /* Now fill in the complete length of the item */
2331    
2332     PUT(previous, 1, code - previous);
2333     break; /* End of class handling */
2334     }
2335     #endif
2336    
2337     /* If there are no characters > 255, negate the 32-byte map if necessary,
2338     and copy it into the code vector. If this is the first thing in the branch,
2339     there can be no first char setting, whatever the repeat count. Any reqbyte
2340     setting must remain unchanged after any kind of repeat. */
2341    
2342     if (negate_class)
2343     {
2344     *code++ = OP_NCLASS;
2345     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2346     }
2347     else
2348     {
2349     *code++ = OP_CLASS;
2350     memcpy(code, classbits, 32);
2351     }
2352     code += 32;
2353     break;
2354    
2355     /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2356     has been tested above. */
2357    
2358     case '{':
2359     if (!is_quantifier) goto NORMAL_CHAR;
2360     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
2361     if (*errorcodeptr != 0) goto FAILED;
2362     goto REPEAT;
2363    
2364     case '*':
2365     repeat_min = 0;
2366     repeat_max = -1;
2367     goto REPEAT;
2368    
2369     case '+':
2370     repeat_min = 1;
2371     repeat_max = -1;
2372     goto REPEAT;
2373    
2374     case '?':
2375     repeat_min = 0;
2376     repeat_max = 1;
2377    
2378     REPEAT:
2379     if (previous == NULL)
2380     {
2381     *errorcodeptr = ERR9;
2382     goto FAILED;
2383     }
2384    
2385     if (repeat_min == 0)
2386     {
2387     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2388     reqbyte = zeroreqbyte; /* Ditto */
2389     }
2390    
2391     /* Remember whether this is a variable length repeat */
2392    
2393     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2394    
2395     op_type = 0; /* Default single-char op codes */
2396     possessive_quantifier = FALSE; /* Default not possessive quantifier */
2397    
2398     /* Save start of previous item, in case we have to move it up to make space
2399     for an inserted OP_ONCE for the additional '+' extension. */
2400    
2401     tempcode = previous;
2402    
2403     /* If the next character is '+', we have a possessive quantifier. This
2404     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2405     If the next character is '?' this is a minimizing repeat, by default,
2406     but if PCRE_UNGREEDY is set, it works the other way round. We change the
2407     repeat type to the non-default. */
2408    
2409     if (ptr[1] == '+')
2410     {
2411     repeat_type = 0; /* Force greedy */
2412     possessive_quantifier = TRUE;
2413     ptr++;
2414     }
2415     else if (ptr[1] == '?')
2416     {
2417     repeat_type = greedy_non_default;
2418     ptr++;
2419     }
2420     else repeat_type = greedy_default;
2421    
2422     /* If previous was a recursion, we need to wrap it inside brackets so that
2423     it can be replicated if necessary. */
2424    
2425     if (*previous == OP_RECURSE)
2426     {
2427     memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2428     code += 1 + LINK_SIZE;
2429     *previous = OP_BRA;
2430     PUT(previous, 1, code - previous);
2431     *code = OP_KET;
2432     PUT(code, 1, code - previous);
2433     code += 1 + LINK_SIZE;
2434     }
2435    
2436     /* If previous was a character match, abolish the item and generate a
2437     repeat item instead. If a char item has a minimum of more than one, ensure
2438     that it is set in reqbyte - it might not be if a sequence such as x{3} is
2439     the first thing in a branch because the x will have gone into firstbyte
2440     instead. */
2441    
2442     if (*previous == OP_CHAR || *previous == OP_CHARNC)
2443     {
2444     /* Deal with UTF-8 characters that take up more than one byte. It's
2445     easier to write this out separately than try to macrify it. Use c to
2446     hold the length of the character in bytes, plus 0x80 to flag that it's a
2447     length rather than a small character. */
2448    
2449     #ifdef SUPPORT_UTF8
2450     if (utf8 && (code[-1] & 0x80) != 0)
2451     {
2452     uschar *lastchar = code - 1;
2453     while((*lastchar & 0xc0) == 0x80) lastchar--;
2454     c = code - lastchar; /* Length of UTF-8 character */
2455     memcpy(utf8_char, lastchar, c); /* Save the char */
2456     c |= 0x80; /* Flag c as a length */
2457     }
2458     else
2459     #endif
2460    
2461     /* Handle the case of a single byte - either with no UTF8 support, or
2462     with UTF-8 disabled, or for a UTF-8 character < 128. */
2463    
2464     {
2465     c = code[-1];
2466     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2467     }
2468    
2469     goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
2470     }
2471    
2472     /* If previous was a single negated character ([^a] or similar), we use
2473     one of the special opcodes, replacing it. The code is shared with single-
2474     character repeats by setting op_type to add a suitable offset into
2475     repeat_type. OP_NOT is currently used only for single-byte chars. */
2476    
2477     else if (*previous == OP_NOT)
2478     {
2479     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
2480     c = previous[1];
2481     goto OUTPUT_SINGLE_REPEAT;
2482     }
2483    
2484     /* If previous was a character type match (\d or similar), abolish it and
2485     create a suitable repeat item. The code is shared with single-character
2486     repeats by setting op_type to add a suitable offset into repeat_type. Note
2487     that the Unicode property types will be present only when SUPPORT_UCP is
2488     defined, but we don't wrap the little bits of code here because it just
2489     makes it horribly messy. */
2490    
2491     else if (*previous < OP_EODN)
2492     {
2493     uschar *oldcode;
2494 nigel 87 int prop_type, prop_value;
2495 nigel 77 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
2496     c = *previous;
2497    
2498     OUTPUT_SINGLE_REPEAT:
2499 nigel 87 if (*previous == OP_PROP || *previous == OP_NOTPROP)
2500     {
2501     prop_type = previous[1];
2502     prop_value = previous[2];
2503     }
2504     else prop_type = prop_value = -1;
2505 nigel 77
2506     oldcode = code;
2507     code = previous; /* Usually overwrite previous item */
2508    
2509     /* If the maximum is zero then the minimum must also be zero; Perl allows
2510     this case, so we do too - by simply omitting the item altogether. */
2511    
2512     if (repeat_max == 0) goto END_REPEAT;
2513    
2514     /* All real repeats make it impossible to handle partial matching (maybe
2515     one day we will be able to remove this restriction). */
2516    
2517     if (repeat_max != 1) cd->nopartial = TRUE;
2518    
2519     /* Combine the op_type with the repeat_type */
2520    
2521     repeat_type += op_type;
2522    
2523     /* A minimum of zero is handled either as the special case * or ?, or as
2524     an UPTO, with the maximum given. */
2525    
2526     if (repeat_min == 0)
2527     {
2528     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
2529     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
2530     else
2531     {
2532     *code++ = OP_UPTO + repeat_type;
2533     PUT2INC(code, 0, repeat_max);
2534     }
2535     }
2536    
2537     /* A repeat minimum of 1 is optimized into some special cases. If the
2538     maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
2539     left in place and, if the maximum is greater than 1, we use OP_UPTO with
2540     one less than the maximum. */
2541    
2542     else if (repeat_min == 1)
2543     {
2544     if (repeat_max == -1)
2545     *code++ = OP_PLUS + repeat_type;
2546     else
2547     {
2548     code = oldcode; /* leave previous item in place */
2549     if (repeat_max == 1) goto END_REPEAT;
2550     *code++ = OP_UPTO + repeat_type;
2551     PUT2INC(code, 0, repeat_max - 1);
2552     }
2553     }
2554    
2555     /* The case {n,n} is just an EXACT, while the general case {n,m} is
2556     handled as an EXACT followed by an UPTO. */
2557    
2558     else
2559     {
2560     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
2561     PUT2INC(code, 0, repeat_min);
2562    
2563     /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
2564     we have to insert the character for the previous code. For a repeated
2565 nigel 87 Unicode property match, there are two extra bytes that define the
2566 nigel 77 required property. In UTF-8 mode, long characters have their length in
2567     c, with the 0x80 bit as a flag. */
2568    
2569     if (repeat_max < 0)
2570     {
2571     #ifdef SUPPORT_UTF8
2572     if (utf8 && c >= 128)
2573     {
2574     memcpy(code, utf8_char, c & 7);
2575     code += c & 7;
2576     }
2577     else
2578     #endif
2579     {
2580     *code++ = c;
2581 nigel 87 if (prop_type >= 0)
2582     {
2583     *code++ = prop_type;
2584     *code++ = prop_value;
2585     }
2586 nigel 77 }
2587     *code++ = OP_STAR + repeat_type;
2588     }
2589    
2590     /* Else insert an UPTO if the max is greater than the min, again
2591     preceded by the character, for the previously inserted code. */
2592    
2593     else if (repeat_max != repeat_min)
2594     {
2595     #ifdef SUPPORT_UTF8
2596     if (utf8 && c >= 128)
2597     {
2598     memcpy(code, utf8_char, c & 7);
2599     code += c & 7;
2600     }
2601     else
2602     #endif
2603     *code++ = c;
2604 nigel 87 if (prop_type >= 0)
2605     {
2606     *code++ = prop_type;
2607     *code++ = prop_value;
2608     }
2609 nigel 77 repeat_max -= repeat_min;
2610     *code++ = OP_UPTO + repeat_type;
2611     PUT2INC(code, 0, repeat_max);
2612     }
2613     }
2614    
2615     /* The character or character type itself comes last in all cases. */
2616    
2617     #ifdef SUPPORT_UTF8
2618     if (utf8 && c >= 128)
2619     {
2620     memcpy(code, utf8_char, c & 7);
2621     code += c & 7;
2622     }
2623     else
2624     #endif
2625     *code++ = c;
2626    
2627 nigel 87 /* For a repeated Unicode property match, there are two extra bytes that
2628     define the required property. */
2629 nigel 77
2630     #ifdef SUPPORT_UCP
2631 nigel 87 if (prop_type >= 0)
2632     {
2633     *code++ = prop_type;
2634     *code++ = prop_value;
2635     }
2636 nigel 77 #endif
2637     }
2638    
2639     /* If previous was a character class or a back reference, we put the repeat
2640     stuff after it, but just skip the item if the repeat was {0,0}. */
2641    
2642     else if (*previous == OP_CLASS ||
2643     *previous == OP_NCLASS ||
2644     #ifdef SUPPORT_UTF8
2645     *previous == OP_XCLASS ||
2646     #endif
2647     *previous == OP_REF)
2648     {
2649     if (repeat_max == 0)
2650     {
2651     code = previous;
2652     goto END_REPEAT;
2653     }
2654    
2655     /* All real repeats make it impossible to handle partial matching (maybe
2656     one day we will be able to remove this restriction). */
2657    
2658     if (repeat_max != 1) cd->nopartial = TRUE;
2659    
2660     if (repeat_min == 0 && repeat_max == -1)
2661     *code++ = OP_CRSTAR + repeat_type;
2662     else if (repeat_min == 1 && repeat_max == -1)
2663     *code++ = OP_CRPLUS + repeat_type;
2664     else if (repeat_min == 0 && repeat_max == 1)
2665     *code++ = OP_CRQUERY + repeat_type;
2666     else
2667     {
2668     *code++ = OP_CRRANGE + repeat_type;
2669     PUT2INC(code, 0, repeat_min);
2670     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
2671     PUT2INC(code, 0, repeat_max);
2672     }
2673     }
2674    
2675     /* If previous was a bracket group, we may have to replicate it in certain
2676     cases. */
2677    
2678     else if (*previous >= OP_BRA || *previous == OP_ONCE ||
2679     *previous == OP_COND)
2680     {
2681     register int i;
2682     int ketoffset = 0;
2683     int len = code - previous;
2684     uschar *bralink = NULL;
2685    
2686     /* If the maximum repeat count is unlimited, find the end of the bracket
2687     by scanning through from the start, and compute the offset back to it
2688     from the current code pointer. There may be an OP_OPT setting following
2689     the final KET, so we can't find the end just by going back from the code
2690     pointer. */
2691    
2692     if (repeat_max == -1)
2693     {
2694     register uschar *ket = previous;
2695     do ket += GET(ket, 1); while (*ket != OP_KET);
2696     ketoffset = code - ket;
2697     }
2698    
2699     /* The case of a zero minimum is special because of the need to stick
2700     OP_BRAZERO in front of it, and because the group appears once in the
2701     data, whereas in other cases it appears the minimum number of times. For
2702     this reason, it is simplest to treat this case separately, as otherwise
2703     the code gets far too messy. There are several special subcases when the
2704     minimum is zero. */
2705    
2706     if (repeat_min == 0)
2707     {
2708     /* If the maximum is also zero, we just omit the group from the output
2709     altogether. */
2710    
2711     if (repeat_max == 0)
2712     {
2713     code = previous;
2714     goto END_REPEAT;
2715     }
2716    
2717     /* If the maximum is 1 or unlimited, we just have to stick in the
2718     BRAZERO and do no more at this point. However, we do need to adjust
2719     any OP_RECURSE calls inside the group that refer to the group itself or
2720     any internal group, because the offset is from the start of the whole
2721     regex. Temporarily terminate the pattern while doing this. */
2722    
2723     if (repeat_max <= 1)
2724     {
2725     *code = OP_END;
2726     adjust_recurse(previous, 1, utf8, cd);
2727     memmove(previous+1, previous, len);
2728     code++;
2729     *previous++ = OP_BRAZERO + repeat_type;
2730     }
2731    
2732     /* If the maximum is greater than 1 and limited, we have to replicate
2733     in a nested fashion, sticking OP_BRAZERO before each set of brackets.
2734     The first one has to be handled carefully because it's the original
2735     copy, which has to be moved up. The remainder can be handled by code
2736     that is common with the non-zero minimum case below. We have to
2737     adjust the value of repeat_max, since one less copy is required. Once
2738     again, we may have to adjust any OP_RECURSE calls inside the group. */
2739    
2740     else
2741     {
2742     int offset;
2743     *code = OP_END;
2744     adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);
2745     memmove(previous + 2 + LINK_SIZE, previous, len);
2746     code += 2 + LINK_SIZE;
2747     *previous++ = OP_BRAZERO + repeat_type;
2748     *previous++ = OP_BRA;
2749    
2750     /* We chain together the bracket offset fields that have to be
2751     filled in later when the ends of the brackets are reached. */
2752    
2753     offset = (bralink == NULL)? 0 : previous - bralink;
2754     bralink = previous;
2755     PUTINC(previous, 0, offset);
2756     }
2757    
2758     repeat_max--;
2759     }
2760    
2761     /* If the minimum is greater than zero, replicate the group as many
2762     times as necessary, and adjust the maximum to the number of subsequent
2763     copies that we need. If we set a first char from the group, and didn't
2764     set a required char, copy the latter from the former. */
2765    
2766     else
2767     {
2768     if (repeat_min > 1)
2769     {
2770     if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
2771     for (i = 1; i < repeat_min; i++)
2772     {
2773     memcpy(code, previous, len);
2774     code += len;
2775     }
2776     }
2777     if (repeat_max > 0) repeat_max -= repeat_min;
2778     }
2779    
2780     /* This code is common to both the zero and non-zero minimum cases. If
2781     the maximum is limited, it replicates the group in a nested fashion,
2782     remembering the bracket starts on a stack. In the case of a zero minimum,
2783     the first one was set up above. In all cases the repeat_max now specifies
2784     the number of additional copies needed. */
2785    
2786     if (repeat_max >= 0)
2787     {
2788     for (i = repeat_max - 1; i >= 0; i--)
2789     {
2790     *code++ = OP_BRAZERO + repeat_type;
2791    
2792     /* All but the final copy start a new nesting, maintaining the
2793     chain of brackets outstanding. */
2794    
2795     if (i != 0)
2796     {
2797     int offset;
2798     *code++ = OP_BRA;
2799     offset = (bralink == NULL)? 0 : code - bralink;
2800     bralink = code;
2801     PUTINC(code, 0, offset);
2802     }
2803    
2804     memcpy(code, previous, len);
2805     code += len;
2806     }
2807    
2808     /* Now chain through the pending brackets, and fill in their length
2809     fields (which are holding the chain links pro tem). */
2810    
2811     while (bralink != NULL)
2812     {
2813     int oldlinkoffset;
2814     int offset = code - bralink + 1;
2815     uschar *bra = code - offset;
2816     oldlinkoffset = GET(bra, 1);
2817     bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
2818     *code++ = OP_KET;
2819     PUTINC(code, 0, offset);
2820     PUT(bra, 1, offset);
2821     }
2822     }
2823    
2824     /* If the maximum is unlimited, set a repeater in the final copy. We
2825     can't just offset backwards from the current code point, because we
2826     don't know if there's been an options resetting after the ket. The
2827     correct offset was computed above. */
2828    
2829     else code[-ketoffset] = OP_KETRMAX + repeat_type;
2830     }
2831    
2832     /* Else there's some kind of shambles */
2833    
2834     else
2835     {
2836     *errorcodeptr = ERR11;
2837     goto FAILED;
2838     }
2839    
2840     /* If the character following a repeat is '+', we wrap the entire repeated
2841     item inside OP_ONCE brackets. This is just syntactic sugar, taken from
2842     Sun's Java package. The repeated item starts at tempcode, not at previous,
2843     which might be the first part of a string whose (former) last char we
2844     repeated. However, we don't support '+' after a greediness '?'. */
2845    
2846     if (possessive_quantifier)
2847     {
2848     int len = code - tempcode;
2849     memmove(tempcode + 1+LINK_SIZE, tempcode, len);
2850     code += 1 + LINK_SIZE;
2851     len += 1 + LINK_SIZE;
2852     tempcode[0] = OP_ONCE;
2853     *code++ = OP_KET;
2854     PUTINC(code, 0, len);
2855     PUT(tempcode, 1, len);
2856     }
2857    
2858     /* In all cases we no longer have a previous item. We also set the
2859     "follows varying string" flag for subsequently encountered reqbytes if
2860     it isn't already set and we have just passed a varying length item. */
2861    
2862     END_REPEAT:
2863     previous = NULL;
2864     cd->req_varyopt |= reqvary;
2865     break;
2866    
2867    
2868     /* Start of nested bracket sub-expression, or comment or lookahead or
2869     lookbehind or option setting or condition. First deal with special things
2870     that can come after a bracket; all are introduced by ?, and the appearance
2871     of any of them means that this is not a referencing group. They were
2872     checked for validity in the first pass over the string, so we don't have to
2873     check for syntax errors here. */
2874    
2875     case '(':
2876     newoptions = options;
2877     skipbytes = 0;
2878    
2879     if (*(++ptr) == '?')
2880     {
2881     int set, unset;
2882     int *optset;
2883    
2884     switch (*(++ptr))
2885     {
2886     case '#': /* Comment; skip to ket */
2887     ptr++;
2888     while (*ptr != ')') ptr++;
2889     continue;
2890    
2891     case ':': /* Non-extracting bracket */
2892     bravalue = OP_BRA;
2893     ptr++;
2894     break;
2895    
2896     case '(':
2897     bravalue = OP_COND; /* Conditional group */
2898    
2899 nigel 91 /* A condition can be a number, referring to a numbered group, a name,
2900     referring to a named group, 'R', referring to recursion, or an
2901     assertion. There are two unfortunate ambiguities, caused by history.
2902     (a) 'R' can be the recursive thing or the name 'R', and (b) a number
2903     could be a name that consists of digits. In both cases, we look for a
2904     name first; if not found, we try the other cases. If the first
2905     character after (?( is a word character, we know the rest up to ) will
2906     also be word characters because the syntax was checked in the first
2907     pass. */
2908 nigel 77
2909 nigel 91 if ((cd->ctypes[ptr[1]] & ctype_word) != 0)
2910 nigel 77 {
2911 nigel 91 int i, namelen;
2912     int condref = 0;
2913     const uschar *name;
2914     uschar *slot = cd->name_table;
2915    
2916     /* This is needed for all successful cases. */
2917    
2918 nigel 77 skipbytes = 3;
2919    
2920 nigel 91 /* Read the name, but also get it as a number if it's all digits */
2921 nigel 77
2922 nigel 91 name = ++ptr;
2923     while (*ptr != ')')
2924 nigel 77 {
2925 nigel 91 if (condref >= 0)
2926     condref = ((digitab[*ptr] & ctype_digit) != 0)?
2927     condref * 10 + *ptr - '0' : -1;
2928     ptr++;
2929     }
2930     namelen = ptr - name;
2931     ptr++;
2932    
2933     for (i = 0; i < cd->names_found; i++)
2934     {
2935     if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
2936     slot += cd->name_entry_size;
2937     }
2938    
2939     /* Found a previous named subpattern */
2940    
2941     if (i < cd->names_found)
2942     {
2943     condref = GET2(slot, 0);
2944     code[1+LINK_SIZE] = OP_CREF;
2945     PUT2(code, 2+LINK_SIZE, condref);
2946     }
2947    
2948     /* Search the pattern for a forward reference */
2949    
2950     else if ((i = find_named_parens(ptr, *brackets, name, namelen)) > 0)
2951     {
2952     code[1+LINK_SIZE] = OP_CREF;
2953     PUT2(code, 2+LINK_SIZE, i);
2954     }
2955    
2956     /* Check for 'R' for recursion */
2957    
2958     else if (namelen == 1 && *name == 'R')
2959     {
2960     code[1+LINK_SIZE] = OP_CREF;
2961     PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
2962     }
2963    
2964     /* Check for a subpattern number */
2965    
2966     else if (condref > 0)
2967     {
2968     code[1+LINK_SIZE] = OP_CREF;
2969     PUT2(code, 2+LINK_SIZE, condref);
2970     }
2971    
2972     /* Either an unidentified subpattern, or a reference to (?(0) */
2973    
2974     else
2975     {
2976     *errorcodeptr = (condref == 0)? ERR35: ERR15;
2977 nigel 77 goto FAILED;
2978     }
2979     }
2980 nigel 91
2981 nigel 77 /* For conditions that are assertions, we just fall through, having
2982     set bravalue above. */
2983 nigel 91
2984 nigel 77 break;
2985    
2986     case '=': /* Positive lookahead */
2987     bravalue = OP_ASSERT;
2988     ptr++;
2989     break;
2990    
2991     case '!': /* Negative lookahead */
2992     bravalue = OP_ASSERT_NOT;
2993     ptr++;
2994     break;
2995    
2996     case '<': /* Lookbehinds */
2997     switch (*(++ptr))
2998     {
2999     case '=': /* Positive lookbehind */
3000     bravalue = OP_ASSERTBACK;
3001     ptr++;
3002     break;
3003    
3004     case '!': /* Negative lookbehind */
3005     bravalue = OP_ASSERTBACK_NOT;
3006     ptr++;
3007     break;
3008     }
3009     break;
3010    
3011     case '>': /* One-time brackets */
3012     bravalue = OP_ONCE;
3013     ptr++;
3014     break;
3015    
3016     case 'C': /* Callout - may be followed by digits; */
3017     previous_callout = code; /* Save for later completion */
3018     after_manual_callout = 1; /* Skip one item before completing */
3019     *code++ = OP_CALLOUT; /* Already checked that the terminating */
3020     { /* closing parenthesis is present. */
3021     int n = 0;
3022     while ((digitab[*(++ptr)] & ctype_digit) != 0)
3023     n = n * 10 + *ptr - '0';
3024     if (n > 255)
3025     {
3026     *errorcodeptr = ERR38;
3027     goto FAILED;
3028     }
3029     *code++ = n;
3030     PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
3031     PUT(code, LINK_SIZE, 0); /* Default length */
3032     code += 2 * LINK_SIZE;
3033     }
3034     previous = NULL;
3035     continue;
3036    
3037     case 'P': /* Named subpattern handling */
3038     if (*(++ptr) == '<') /* Definition */
3039     {
3040     int i, namelen;
3041     uschar *slot = cd->name_table;
3042     const uschar *name; /* Don't amalgamate; some compilers */
3043     name = ++ptr; /* grumble at autoincrement in declaration */
3044    
3045     while (*ptr++ != '>');
3046     namelen = ptr - name - 1;
3047    
3048     for (i = 0; i < cd->names_found; i++)
3049     {
3050     int crc = memcmp(name, slot+2, namelen);
3051     if (crc == 0)
3052     {
3053     if (slot[2+namelen] == 0)
3054     {
3055 nigel 91 if ((options & PCRE_DUPNAMES) == 0)
3056     {
3057     *errorcodeptr = ERR43;
3058     goto FAILED;
3059     }
3060 nigel 77 }
3061 nigel 91 else crc = -1; /* Current name is substring */
3062 nigel 77 }
3063     if (crc < 0)
3064     {
3065     memmove(slot + cd->name_entry_size, slot,
3066     (cd->names_found - i) * cd->name_entry_size);
3067     break;
3068     }
3069     slot += cd->name_entry_size;
3070     }
3071    
3072     PUT2(slot, 0, *brackets + 1);
3073     memcpy(slot + 2, name, namelen);
3074     slot[2+namelen] = 0;
3075     cd->names_found++;
3076     goto NUMBERED_GROUP;
3077     }
3078    
3079     if (*ptr == '=' || *ptr == '>') /* Reference or recursion */
3080     {
3081     int i, namelen;
3082     int type = *ptr++;
3083     const uschar *name = ptr;
3084     uschar *slot = cd->name_table;
3085    
3086     while (*ptr != ')') ptr++;
3087     namelen = ptr - name;
3088    
3089     for (i = 0; i < cd->names_found; i++)
3090     {
3091     if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3092     slot += cd->name_entry_size;
3093     }
3094 nigel 91
3095     if (i < cd->names_found) /* Back reference */
3096 nigel 77 {
3097 nigel 91 recno = GET2(slot, 0);
3098     }
3099     else if ((recno = /* Forward back reference */
3100     find_named_parens(ptr, *brackets, name, namelen)) <= 0)
3101     {
3102 nigel 77 *errorcodeptr = ERR15;
3103     goto FAILED;
3104     }
3105    
3106     if (type == '>') goto HANDLE_RECURSION; /* A few lines below */
3107    
3108     /* Back reference */
3109    
3110     previous = code;
3111     *code++ = OP_REF;
3112     PUT2INC(code, 0, recno);
3113     cd->backref_map |= (recno < 32)? (1 << recno) : 1;
3114     if (recno > cd->top_backref) cd->top_backref = recno;
3115     continue;
3116     }
3117    
3118     /* Should never happen */
3119     break;
3120    
3121     case 'R': /* Pattern recursion */
3122     ptr++; /* Same as (?0) */
3123     /* Fall through */
3124    
3125     /* Recursion or "subroutine" call */
3126    
3127     case '0': case '1': case '2': case '3': case '4':
3128     case '5': case '6': case '7': case '8': case '9':
3129     {
3130     const uschar *called;
3131     recno = 0;
3132     while((digitab[*ptr] & ctype_digit) != 0)
3133     recno = recno * 10 + *ptr++ - '0';
3134    
3135     /* Come here from code above that handles a named recursion */
3136    
3137     HANDLE_RECURSION:
3138    
3139     previous = code;
3140    
3141     /* Find the bracket that is being referenced. Temporarily end the
3142     regex in case it doesn't exist. */
3143    
3144     *code = OP_END;
3145 nigel 91 called = (recno == 0)? cd->start_code :
3146     find_bracket(cd->start_code, utf8, recno);
3147 nigel 77 if (called == NULL)
3148     {
3149     *errorcodeptr = ERR15;
3150     goto FAILED;
3151     }
3152    
3153     /* If the subpattern is still open, this is a recursive call. We
3154     check to see if this is a left recursion that could loop for ever,
3155     and diagnose that case. */
3156    
3157     if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
3158     {
3159     *errorcodeptr = ERR40;
3160     goto FAILED;
3161     }
3162    
3163 nigel 87 /* Insert the recursion/subroutine item, automatically wrapped inside
3164     "once" brackets. */
3165 nigel 77
3166 nigel 87 *code = OP_ONCE;
3167     PUT(code, 1, 2 + 2*LINK_SIZE);
3168     code += 1 + LINK_SIZE;
3169    
3170 nigel 77 *code = OP_RECURSE;
3171     PUT(code, 1, called - cd->start_code);
3172     code += 1 + LINK_SIZE;
3173 nigel 87
3174     *code = OP_KET;
3175     PUT(code, 1, 2 + 2*LINK_SIZE);
3176     code += 1 + LINK_SIZE;
3177 nigel 77 }
3178     continue;
3179    
3180     /* Character after (? not specially recognized */
3181    
3182     default: /* Option setting */
3183     set = unset = 0;
3184     optset = &set;
3185    
3186     while (*ptr != ')' && *ptr != ':')
3187     {
3188     switch (*ptr++)
3189     {
3190     case '-': optset = &unset; break;
3191    
3192     case 'i': *optset |= PCRE_CASELESS; break;
3193 nigel 91 case 'J': *optset |= PCRE_DUPNAMES; break;
3194 nigel 77 case 'm': *optset |= PCRE_MULTILINE; break;
3195     case 's': *optset |= PCRE_DOTALL; break;
3196     case 'x': *optset |= PCRE_EXTENDED; break;
3197     case 'U': *optset |= PCRE_UNGREEDY; break;
3198     case 'X': *optset |= PCRE_EXTRA; break;
3199     }
3200     }
3201    
3202     /* Set up the changed option bits, but don't change anything yet. */
3203    
3204     newoptions = (options | set) & (~unset);
3205    
3206     /* If the options ended with ')' this is not the start of a nested
3207     group with option changes, so the options change at this level. Compile
3208     code to change the ims options if this setting actually changes any of
3209     them. We also pass the new setting back so that it can be put at the
3210     start of any following branches, and when this group ends (if we are in
3211     a group), a resetting item can be compiled.
3212    
3213     Note that if this item is right at the start of the pattern, the
3214     options will have been abstracted and made global, so there will be no
3215     change to compile. */
3216    
3217     if (*ptr == ')')
3218     {
3219     if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
3220     {
3221     *code++ = OP_OPT;
3222     *code++ = newoptions & PCRE_IMS;
3223     }
3224    
3225     /* Change options at this level, and pass them back for use
3226     in subsequent branches. Reset the greedy defaults and the case
3227     value for firstbyte and reqbyte. */
3228    
3229     *optionsptr = options = newoptions;
3230     greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
3231     greedy_non_default = greedy_default ^ 1;
3232     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3233    
3234     previous = NULL; /* This item can't be repeated */
3235     continue; /* It is complete */
3236     }
3237    
3238     /* If the options ended with ':' we are heading into a nested group
3239     with possible change of options. Such groups are non-capturing and are
3240     not assertions of any kind. All we need to do is skip over the ':';
3241     the newoptions value is handled below. */
3242    
3243     bravalue = OP_BRA;
3244     ptr++;
3245     }
3246     }
3247    
3248     /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
3249     non-capturing and behave like (?:...) brackets */
3250    
3251     else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
3252     {
3253     bravalue = OP_BRA;
3254     }
3255    
3256     /* Else we have a referencing group; adjust the opcode. If the bracket
3257     number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
3258     arrange for the true number to follow later, in an OP_BRANUMBER item. */
3259    
3260     else
3261     {
3262     NUMBERED_GROUP:
3263     if (++(*brackets) > EXTRACT_BASIC_MAX)
3264     {
3265     bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
3266     code[1+LINK_SIZE] = OP_BRANUMBER;
3267     PUT2(code, 2+LINK_SIZE, *brackets);
3268     skipbytes = 3;
3269     }
3270     else bravalue = OP_BRA + *brackets;
3271     }
3272    
3273     /* Process nested bracketed re. Assertions may not be repeated, but other
3274     kinds can be. We copy code into a non-register variable in order to be able
3275     to pass its address because some compilers complain otherwise. Pass in a
3276     new setting for the ims options if they have changed. */
3277    
3278     previous = (bravalue >= OP_ONCE)? code : NULL;
3279     *code = bravalue;
3280     tempcode = code;
3281     tempreqvary = cd->req_varyopt; /* Save value before bracket */
3282    
3283     if (!compile_regex(
3284     newoptions, /* The complete new option state */
3285     options & PCRE_IMS, /* The previous ims option state */
3286     brackets, /* Extracting bracket count */
3287     &tempcode, /* Where to put code (updated) */
3288     &ptr, /* Input pointer (updated) */
3289     errorcodeptr, /* Where to put an error message */
3290     (bravalue == OP_ASSERTBACK ||
3291     bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
3292     skipbytes, /* Skip over OP_COND/OP_BRANUMBER */
3293     &subfirstbyte, /* For possible first char */
3294     &subreqbyte, /* For possible last char */
3295     bcptr, /* Current branch chain */
3296     cd)) /* Tables block */
3297     goto FAILED;
3298    
3299     /* At the end of compiling, code is still pointing to the start of the
3300     group, while tempcode has been updated to point past the end of the group
3301     and any option resetting that may follow it. The pattern pointer (ptr)
3302     is on the bracket. */
3303    
3304     /* If this is a conditional bracket, check that there are no more than
3305     two branches in the group. */
3306    
3307     else if (bravalue == OP_COND)
3308     {
3309     uschar *tc = code;
3310 nigel 91 int condcount = 0;
3311 nigel 77
3312     do {
3313     condcount++;
3314     tc += GET(tc,1);
3315     }
3316     while (*tc != OP_KET);
3317    
3318     if (condcount > 2)
3319     {
3320     *errorcodeptr = ERR27;
3321     goto FAILED;
3322     }
3323    
3324     /* If there is just one branch, we must not make use of its firstbyte or
3325     reqbyte, because this is equivalent to an empty second branch. */
3326    
3327     if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
3328     }
3329    
3330     /* Handle updating of the required and first characters. Update for normal
3331     brackets of all kinds, and conditions with two branches (see code above).
3332     If the bracket is followed by a quantifier with zero repeat, we have to
3333     back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
3334     main loop so that they can be accessed for the back off. */
3335    
3336     zeroreqbyte = reqbyte;
3337     zerofirstbyte = firstbyte;
3338     groupsetfirstbyte = FALSE;
3339    
3340     if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
3341     {
3342     /* If we have not yet set a firstbyte in this branch, take it from the
3343     subpattern, remembering that it was set here so that a repeat of more
3344     than one can replicate it as reqbyte if necessary. If the subpattern has
3345     no firstbyte, set "none" for the whole branch. In both cases, a zero
3346     repeat forces firstbyte to "none". */
3347    
3348     if (firstbyte == REQ_UNSET)
3349     {
3350     if (subfirstbyte >= 0)
3351     {
3352     firstbyte = subfirstbyte;
3353     groupsetfirstbyte = TRUE;
3354     }
3355     else firstbyte = REQ_NONE;
3356     zerofirstbyte = REQ_NONE;
3357     }
3358    
3359     /* If firstbyte was previously set, convert the subpattern's firstbyte
3360     into reqbyte if there wasn't one, using the vary flag that was in
3361     existence beforehand. */
3362    
3363     else if (subfirstbyte >= 0 && subreqbyte < 0)
3364     subreqbyte = subfirstbyte | tempreqvary;
3365    
3366     /* If the subpattern set a required byte (or set a first byte that isn't
3367     really the first byte - see above), set it. */
3368    
3369     if (subreqbyte >= 0) reqbyte = subreqbyte;
3370     }
3371    
3372     /* For a forward assertion, we take the reqbyte, if set. This can be
3373     helpful if the pattern that follows the assertion doesn't set a different
3374     char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
3375     for an assertion, however because it leads to incorrect effect for patterns
3376     such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
3377     of a firstbyte. This is overcome by a scan at the end if there's no
3378     firstbyte, looking for an asserted first char. */
3379    
3380     else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
3381    
3382     /* Now update the main code pointer to the end of the group. */
3383    
3384     code = tempcode;
3385    
3386     /* Error if hit end of pattern */
3387    
3388     if (*ptr != ')')
3389     {
3390     *errorcodeptr = ERR14;
3391     goto FAILED;
3392     }
3393     break;
3394    
3395     /* Check \ for being a real metacharacter; if not, fall through and handle
3396     it as a data character at the start of a string. Escape items are checked
3397     for validity in the pre-compiling pass. */
3398    
3399     case '\\':
3400     tempptr = ptr;
3401     c = check_escape(&ptr, errorcodeptr, *brackets, options, FALSE);
3402    
3403     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
3404     are arranged to be the negation of the corresponding OP_values. For the
3405     back references, the values are ESC_REF plus the reference number. Only
3406     back references and those types that consume a character may be repeated.
3407     We can test for values between ESC_b and ESC_Z for the latter; this may
3408     have to change if any new ones are ever created. */
3409    
3410     if (c < 0)
3411     {
3412     if (-c == ESC_Q) /* Handle start of quoted string */
3413     {
3414     if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
3415     else inescq = TRUE;
3416     continue;
3417     }
3418    
3419     /* For metasequences that actually match a character, we disable the
3420     setting of a first character if it hasn't already been set. */
3421    
3422     if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
3423     firstbyte = REQ_NONE;
3424    
3425     /* Set values to reset to if this is followed by a zero repeat. */
3426    
3427     zerofirstbyte = firstbyte;
3428     zeroreqbyte = reqbyte;
3429    
3430     /* Back references are handled specially */
3431    
3432     if (-c >= ESC_REF)
3433     {
3434     int number = -c - ESC_REF;
3435     previous = code;
3436     *code++ = OP_REF;
3437     PUT2INC(code, 0, number);
3438     }
3439    
3440     /* So are Unicode property matches, if supported. We know that get_ucp
3441     won't fail because it was tested in the pre-pass. */
3442    
3443     #ifdef SUPPORT_UCP
3444     else if (-c == ESC_P || -c == ESC_p)
3445     {
3446     BOOL negated;
3447 nigel 87 int pdata;
3448     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3449 nigel 77 previous = code;
3450     *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
3451 nigel 87 *code++ = ptype;
3452     *code++ = pdata;
3453 nigel 77 }
3454     #endif
3455    
3456     /* For the rest, we can obtain the OP value by negating the escape
3457     value */
3458    
3459     else
3460     {
3461     previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
3462     *code++ = -c;
3463     }
3464     continue;
3465     }
3466    
3467     /* We have a data character whose value is in c. In UTF-8 mode it may have
3468     a value > 127. We set its representation in the length/buffer, and then
3469     handle it as a data character. */
3470    
3471     #ifdef SUPPORT_UTF8
3472     if (utf8 && c > 127)
3473     mclength = _pcre_ord2utf8(c, mcbuffer);
3474     else
3475     #endif
3476    
3477     {
3478     mcbuffer[0] = c;
3479     mclength = 1;
3480     }
3481    
3482     goto ONE_CHAR;
3483    
3484     /* Handle a literal character. It is guaranteed not to be whitespace or #
3485     when the extended flag is set. If we are in UTF-8 mode, it may be a
3486     multi-byte literal character. */
3487    
3488     default:
3489     NORMAL_CHAR:
3490     mclength = 1;
3491     mcbuffer[0] = c;
3492    
3493     #ifdef SUPPORT_UTF8
3494     if (utf8 && (c & 0xc0) == 0xc0)
3495     {
3496     while ((ptr[1] & 0xc0) == 0x80)
3497     mcbuffer[mclength++] = *(++ptr);
3498     }
3499     #endif
3500    
3501     /* At this point we have the character's bytes in mcbuffer, and the length
3502     in mclength. When not in UTF-8 mode, the length is always 1. */
3503    
3504     ONE_CHAR:
3505     previous = code;
3506     *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
3507     for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
3508    
3509     /* Set the first and required bytes appropriately. If no previous first
3510     byte, set it from this character, but revert to none on a zero repeat.
3511     Otherwise, leave the firstbyte value alone, and don't change it on a zero
3512     repeat. */
3513    
3514     if (firstbyte == REQ_UNSET)
3515     {
3516     zerofirstbyte = REQ_NONE;
3517     zeroreqbyte = reqbyte;
3518    
3519     /* If the character is more than one byte long, we can set firstbyte
3520     only if it is not to be matched caselessly. */
3521    
3522     if (mclength == 1 || req_caseopt == 0)
3523     {
3524     firstbyte = mcbuffer[0] | req_caseopt;
3525     if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
3526     }
3527     else firstbyte = reqbyte = REQ_NONE;
3528     }
3529    
3530     /* firstbyte was previously set; we can set reqbyte only if the length is
3531     1 or the matching is caseful. */
3532    
3533     else
3534     {
3535     zerofirstbyte = firstbyte;
3536     zeroreqbyte = reqbyte;
3537     if (mclength == 1 || req_caseopt == 0)
3538     reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3539     }
3540    
3541     break; /* End of literal character handling */
3542     }
3543     } /* end of big loop */
3544    
3545     /* Control never reaches here by falling through, only by a goto for all the
3546     error states. Pass back the position in the pattern so that it can be displayed
3547     to the user for diagnosing the error. */
3548    
3549     FAILED:
3550     *ptrptr = ptr;
3551     return FALSE;
3552     }
3553    
3554    
3555    
3556    
3557     /*************************************************
3558     * Compile sequence of alternatives *
3559     *************************************************/
3560    
3561     /* On entry, ptr is pointing past the bracket character, but on return
3562     it points to the closing bracket, or vertical bar, or end of string.
3563     The code variable is pointing at the byte into which the BRA operator has been
3564     stored. If the ims options are changed at the start (for a (?ims: group) or
3565     during any branch, we need to insert an OP_OPT item at the start of every
3566     following branch to ensure they get set correctly at run time, and also pass
3567     the new options into every subsequent branch compile.
3568    
3569     Arguments:
3570     options option bits, including any changes for this subpattern
3571     oldims previous settings of ims option bits
3572     brackets -> int containing the number of extracting brackets used
3573     codeptr -> the address of the current code pointer
3574     ptrptr -> the address of the current pattern pointer
3575     errorcodeptr -> pointer to error code variable
3576     lookbehind TRUE if this is a lookbehind assertion
3577     skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER)
3578     firstbyteptr place to put the first required character, or a negative number
3579     reqbyteptr place to put the last required character, or a negative number
3580     bcptr pointer to the chain of currently open branches
3581     cd points to the data block with tables pointers etc.
3582    
3583     Returns: TRUE on success
3584     */
3585    
3586     static BOOL
3587     compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3588     const uschar **ptrptr, int *errorcodeptr, BOOL lookbehind, int skipbytes,
3589     int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
3590     {
3591     const uschar *ptr = *ptrptr;
3592     uschar *code = *codeptr;
3593     uschar *last_branch = code;
3594     uschar *start_bracket = code;
3595     uschar *reverse_count = NULL;
3596     int firstbyte, reqbyte;
3597     int branchfirstbyte, branchreqbyte;
3598     branch_chain bc;
3599    
3600     bc.outer = bcptr;
3601     bc.current = code;
3602    
3603     firstbyte = reqbyte = REQ_UNSET;
3604    
3605     /* Offset is set zero to mark that this bracket is still open */
3606    
3607     PUT(code, 1, 0);
3608     code += 1 + LINK_SIZE + skipbytes;
3609    
3610     /* Loop for each alternative branch */
3611    
3612     for (;;)
3613     {
3614     /* Handle a change of ims options at the start of the branch */
3615    
3616     if ((options & PCRE_IMS) != oldims)
3617     {
3618     *code++ = OP_OPT;
3619     *code++ = options & PCRE_IMS;
3620     }
3621    
3622     /* Set up dummy OP_REVERSE if lookbehind assertion */
3623    
3624     if (lookbehind)
3625     {
3626     *code++ = OP_REVERSE;
3627     reverse_count = code;
3628     PUTINC(code, 0, 0);
3629     }
3630    
3631     /* Now compile the branch */
3632    
3633     if (!compile_branch(&options, brackets, &code, &ptr, errorcodeptr,
3634     &branchfirstbyte, &branchreqbyte, &bc, cd))
3635     {
3636     *ptrptr = ptr;
3637     return FALSE;
3638     }
3639    
3640     /* If this is the first branch, the firstbyte and reqbyte values for the
3641     branch become the values for the regex. */
3642    
3643     if (*last_branch != OP_ALT)
3644     {
3645     firstbyte = branchfirstbyte;
3646     reqbyte = branchreqbyte;
3647     }
3648    
3649     /* If this is not the first branch, the first char and reqbyte have to
3650     match the values from all the previous branches, except that if the previous
3651     value for reqbyte didn't have REQ_VARY set, it can still match, and we set
3652     REQ_VARY for the regex. */
3653    
3654     else
3655     {
3656     /* If we previously had a firstbyte, but it doesn't match the new branch,
3657     we have to abandon the firstbyte for the regex, but if there was previously
3658     no reqbyte, it takes on the value of the old firstbyte. */
3659    
3660     if (firstbyte >= 0 && firstbyte != branchfirstbyte)
3661     {
3662     if (reqbyte < 0) reqbyte = firstbyte;
3663     firstbyte = REQ_NONE;
3664     }
3665    
3666     /* If we (now or from before) have no firstbyte, a firstbyte from the
3667     branch becomes a reqbyte if there isn't a branch reqbyte. */
3668    
3669     if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
3670     branchreqbyte = branchfirstbyte;
3671    
3672     /* Now ensure that the reqbytes match */
3673    
3674     if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
3675     reqbyte = REQ_NONE;
3676     else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
3677     }
3678    
3679     /* If lookbehind, check that this branch matches a fixed-length string,
3680     and put the length into the OP_REVERSE item. Temporarily mark the end of
3681     the branch with OP_END. */
3682    
3683     if (lookbehind)
3684     {
3685     int length;
3686     *code = OP_END;
3687     length = find_fixedlength(last_branch, options);
3688     DPRINTF(("fixed length = %d\n", length));
3689     if (length < 0)
3690     {
3691     *errorcodeptr = (length == -2)? ERR36 : ERR25;
3692     *ptrptr = ptr;
3693     return FALSE;
3694     }
3695     PUT(reverse_count, 0, length);
3696     }
3697    
3698     /* Reached end of expression, either ')' or end of pattern. Go back through
3699     the alternative branches and reverse the chain of offsets, with the field in
3700     the BRA item now becoming an offset to the first alternative. If there are
3701     no alternatives, it points to the end of the group. The length in the
3702     terminating ket is always the length of the whole bracketed item. If any of
3703     the ims options were changed inside the group, compile a resetting op-code
3704     following, except at the very end of the pattern. Return leaving the pointer
3705     at the terminating char. */
3706    
3707     if (*ptr != '|')
3708     {
3709     int length = code - last_branch;
3710     do
3711     {
3712     int prev_length = GET(last_branch, 1);
3713     PUT(last_branch, 1, length);
3714     length = prev_length;
3715     last_branch -= length;
3716     }
3717     while (length > 0);
3718    
3719     /* Fill in the ket */
3720    
3721     *code = OP_KET;
3722     PUT(code, 1, code - start_bracket);
3723     code += 1 + LINK_SIZE;
3724    
3725     /* Resetting option if needed */
3726    
3727     if ((options & PCRE_IMS) != oldims && *ptr == ')')
3728     {
3729     *code++ = OP_OPT;
3730     *code++ = oldims;
3731     }
3732    
3733     /* Set values to pass back */
3734    
3735     *codeptr = code;
3736     *ptrptr = ptr;
3737     *firstbyteptr = firstbyte;
3738     *reqbyteptr = reqbyte;
3739     return TRUE;
3740     }
3741    
3742     /* Another branch follows; insert an "or" node. Its length field points back
3743     to the previous branch while the bracket remains open. At the end the chain
3744     is reversed. It's done like this so that the start of the bracket has a
3745     zero offset until it is closed, making it possible to detect recursion. */
3746    
3747     *code = OP_ALT;
3748     PUT(code, 1, code - last_branch);
3749     bc.current = last_branch = code;
3750     code += 1 + LINK_SIZE;
3751     ptr++;
3752     }
3753     /* Control never reaches here */
3754     }
3755    
3756    
3757    
3758    
3759     /*************************************************
3760     * Check for anchored expression *
3761     *************************************************/
3762    
3763     /* Try to find out if this is an anchored regular expression. Consider each
3764     alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
3765     all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
3766     it's anchored. However, if this is a multiline pattern, then only OP_SOD
3767     counts, since OP_CIRC can match in the middle.
3768    
3769     We can also consider a regex to be anchored if OP_SOM starts all its branches.
3770     This is the code for \G, which means "match at start of match position, taking
3771     into account the match offset".
3772    
3773     A branch is also implicitly anchored if it starts with .* and DOTALL is set,
3774     because that will try the rest of the pattern at all possible matching points,
3775     so there is no point trying again.... er ....
3776    
3777     .... except when the .* appears inside capturing parentheses, and there is a
3778     subsequent back reference to those parentheses. We haven't enough information
3779     to catch that case precisely.
3780    
3781     At first, the best we could do was to detect when .* was in capturing brackets
3782     and the highest back reference was greater than or equal to that level.
3783     However, by keeping a bitmap of the first 31 back references, we can catch some
3784     of the more common cases more precisely.
3785    
3786     Arguments:
3787     code points to start of expression (the bracket)
3788     options points to the options setting
3789     bracket_map a bitmap of which brackets we are inside while testing; this
3790     handles up to substring 31; after that we just have to take
3791     the less precise approach
3792     backref_map the back reference bitmap
3793    
3794     Returns: TRUE or FALSE
3795     */
3796    
3797     static BOOL
3798     is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
3799     unsigned int backref_map)
3800     {
3801     do {
3802     const uschar *scode =
3803     first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE);
3804     register int op = *scode;
3805    
3806     /* Capturing brackets */
3807    
3808     if (op > OP_BRA)
3809     {
3810     int new_map;
3811     op -= OP_BRA;
3812     if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3813     new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3814     if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
3815     }
3816    
3817     /* Other brackets */
3818    
3819     else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3820     {
3821     if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
3822     }
3823    
3824     /* .* is not anchored unless DOTALL is set and it isn't in brackets that
3825     are or may be referenced. */
3826    
3827     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
3828     (*options & PCRE_DOTALL) != 0)
3829     {
3830     if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3831     }
3832    
3833     /* Check for explicit anchoring */
3834    
3835     else if (op != OP_SOD && op != OP_SOM &&
3836     ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
3837     return FALSE;
3838     code += GET(code, 1);
3839     }
3840     while (*code == OP_ALT); /* Loop for each alternative */
3841     return TRUE;
3842     }
3843    
3844    
3845    
3846     /*************************************************
3847     * Check for starting with ^ or .* *
3848     *************************************************/
3849    
3850     /* This is called to find out if every branch starts with ^ or .* so that
3851     "first char" processing can be done to speed things up in multiline
3852     matching and for non-DOTALL patterns that start with .* (which must start at
3853     the beginning or after \n). As in the case of is_anchored() (see above), we
3854     have to take account of back references to capturing brackets that contain .*
3855     because in that case we can't make the assumption.
3856    
3857     Arguments:
3858     code points to start of expression (the bracket)
3859     bracket_map a bitmap of which brackets we are inside while testing; this
3860     handles up to substring 31; after that we just have to take
3861     the less precise approach
3862     backref_map the back reference bitmap
3863    
3864     Returns: TRUE or FALSE
3865     */
3866    
3867     static BOOL
3868     is_startline(const uschar *code, unsigned int bracket_map,
3869     unsigned int backref_map)
3870     {
3871     do {
3872     const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0,
3873     FALSE);
3874     register int op = *scode;
3875    
3876     /* Capturing brackets */
3877    
3878     if (op > OP_BRA)
3879     {
3880     int new_map;
3881     op -= OP_BRA;
3882     if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3883     new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3884     if (!is_startline(scode, new_map, backref_map)) return FALSE;
3885     }
3886    
3887     /* Other brackets */
3888    
3889     else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3890     { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
3891    
3892     /* .* means "start at start or after \n" if it isn't in brackets that
3893     may be referenced. */
3894    
3895     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
3896     {
3897     if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3898     }
3899    
3900     /* Check for explicit circumflex */
3901    
3902     else if (op != OP_CIRC) return FALSE;
3903    
3904     /* Move on to the next alternative */
3905    
3906     code += GET(code, 1);
3907     }
3908     while (*code == OP_ALT); /* Loop for each alternative */
3909     return TRUE;
3910     }
3911    
3912    
3913    
3914     /*************************************************
3915     * Check for asserted fixed first char *
3916     *************************************************/
3917    
3918     /* During compilation, the "first char" settings from forward assertions are
3919     discarded, because they can cause conflicts with actual literals that follow.
3920     However, if we end up without a first char setting for an unanchored pattern,
3921     it is worth scanning the regex to see if there is an initial asserted first
3922     char. If all branches start with the same asserted char, or with a bracket all
3923     of whose alternatives start with the same asserted char (recurse ad lib), then
3924     we return that char, otherwise -1.
3925    
3926     Arguments:
3927     code points to start of expression (the bracket)
3928     options pointer to the options (used to check casing changes)
3929     inassert TRUE if in an assertion
3930    
3931     Returns: -1 or the fixed first char
3932     */
3933    
3934     static int
3935     find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
3936     {
3937     register int c = -1;
3938     do {
3939     int d;
3940     const uschar *scode =
3941     first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
3942     register int op = *scode;
3943    
3944     if (op >= OP_BRA) op = OP_BRA;
3945    
3946     switch(op)
3947     {
3948     default:
3949     return -1;
3950    
3951     case OP_BRA:
3952     case OP_ASSERT:
3953     case OP_ONCE:
3954     case OP_COND:
3955     if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
3956     return -1;
3957     if (c < 0) c = d; else if (c != d) return -1;
3958     break;
3959    
3960     case OP_EXACT: /* Fall through */
3961     scode += 2;
3962    
3963     case OP_CHAR:
3964     case OP_CHARNC:
3965     case OP_PLUS:
3966     case OP_MINPLUS:
3967     if (!inassert) return -1;
3968     if (c < 0)
3969     {
3970     c = scode[1];
3971     if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
3972     }
3973     else if (c != scode[1]) return -1;
3974     break;
3975     }
3976    
3977     code += GET(code, 1);
3978     }
3979     while (*code == OP_ALT);
3980     return c;
3981     }
3982    
3983    
3984    
3985     /*************************************************
3986     * Compile a Regular Expression *
3987     *************************************************/
3988    
3989     /* This function takes a string and returns a pointer to a block of store
3990     holding a compiled version of the expression. The original API for this
3991     function had no error code return variable; it is retained for backwards
3992     compatibility. The new function is given a new name.
3993    
3994     Arguments:
3995     pattern the regular expression
3996     options various option bits
3997     errorcodeptr pointer to error code variable (pcre_compile2() only)
3998     can be NULL if you don't want a code value
3999     errorptr pointer to pointer to error text
4000     erroroffset ptr offset in pattern where error was detected
4001     tables pointer to character tables or NULL
4002    
4003     Returns: pointer to compiled data block, or NULL on error,
4004     with errorptr and erroroffset set
4005     */
4006    
4007 nigel 87 PCRE_DATA_SCOPE pcre *
4008 nigel 77 pcre_compile(const char *pattern, int options, const char **errorptr,
4009     int *erroroffset, const unsigned char *tables)
4010     {
4011     return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
4012     }
4013    
4014    
4015 nigel 91
4016 nigel 87 PCRE_DATA_SCOPE pcre *
4017 nigel 77 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
4018     const char **errorptr, int *erroroffset, const unsigned char *tables)
4019     {
4020     real_pcre *re;
4021     int length = 1 + LINK_SIZE; /* For initial BRA plus length */
4022 nigel 91 int c, firstbyte, reqbyte, newline;
4023 nigel 77 int bracount = 0;
4024     int branch_extra = 0;
4025     int branch_newextra;
4026     int item_count = -1;
4027     int name_count = 0;
4028     int max_name_size = 0;
4029     int lastitemlength = 0;
4030     int errorcode = 0;
4031     #ifdef SUPPORT_UTF8
4032     BOOL utf8;
4033     BOOL class_utf8;
4034     #endif
4035     BOOL inescq = FALSE;
4036 nigel 81 BOOL capturing;
4037 nigel 77 unsigned int brastackptr = 0;
4038     size_t size;
4039     uschar *code;
4040     const uschar *codestart;
4041     const uschar *ptr;
4042     compile_data compile_block;
4043 nigel 91 compile_data *cd = &compile_block;
4044 nigel 77 int brastack[BRASTACK_SIZE];
4045     uschar bralenstack[BRASTACK_SIZE];
4046    
4047     /* We can't pass back an error message if errorptr is NULL; I guess the best we
4048     can do is just return NULL, but we can set a code value if there is a code
4049     pointer. */
4050    
4051     if (errorptr == NULL)
4052     {
4053     if (errorcodeptr != NULL) *errorcodeptr = 99;
4054     return NULL;
4055     }
4056    
4057     *errorptr = NULL;
4058     if (errorcodeptr != NULL) *errorcodeptr = ERR0;
4059    
4060     /* However, we can give a message for this error */
4061    
4062     if (erroroffset == NULL)
4063     {
4064     errorcode = ERR16;
4065     goto PCRE_EARLY_ERROR_RETURN;
4066     }
4067    
4068     *erroroffset = 0;
4069    
4070     /* Can't support UTF8 unless PCRE has been compiled to include the code. */
4071    
4072     #ifdef SUPPORT_UTF8
4073     utf8 = (options & PCRE_UTF8) != 0;
4074     if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
4075     (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
4076     {
4077     errorcode = ERR44;
4078     goto PCRE_EARLY_ERROR_RETURN;
4079     }
4080     #else
4081     if ((options & PCRE_UTF8) != 0)
4082     {
4083     errorcode = ERR32;
4084     goto PCRE_EARLY_ERROR_RETURN;
4085     }
4086     #endif
4087    
4088     if ((options & ~PUBLIC_OPTIONS) != 0)
4089     {
4090     errorcode = ERR17;
4091     goto PCRE_EARLY_ERROR_RETURN;
4092     }
4093    
4094     /* Set up pointers to the individual character tables */
4095    
4096     if (tables == NULL) tables = _pcre_default_tables;
4097 nigel 91 cd->lcc = tables + lcc_offset;
4098     cd->fcc = tables + fcc_offset;
4099     cd->cbits = tables + cbits_offset;
4100     cd->ctypes = tables + ctypes_offset;
4101 nigel 77
4102 nigel 91 /* Handle different types of newline. The two bits give four cases. The current
4103     code allows for one- or two-byte sequences. */
4104    
4105     switch (options & PCRE_NEWLINE_CRLF)
4106     {
4107     default: newline = NEWLINE; break; /* Compile-time default */
4108     case PCRE_NEWLINE_CR: newline = '\r'; break;
4109     case PCRE_NEWLINE_LF: newline = '\n'; break;
4110     case PCRE_NEWLINE_CR+
4111     PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
4112     }
4113    
4114     if (newline > 255)
4115     {
4116     cd->nllen = 2;
4117     cd->nl[0] = (newline >> 8) & 255;
4118     cd->nl[1] = newline & 255;
4119     }
4120     else
4121     {
4122     cd->nllen = 1;
4123     cd->nl[0] = newline;
4124     }
4125    
4126 nigel 77 /* Maximum back reference and backref bitmap. This is updated for numeric
4127     references during the first pass, but for named references during the actual
4128     compile pass. The bitmap records up to 31 back references to help in deciding
4129     whether (.*) can be treated as anchored or not. */
4130    
4131 nigel 91 cd->top_backref = 0;
4132     cd->backref_map = 0;
4133 nigel 77
4134     /* Reflect pattern for debugging output */
4135    
4136     DPRINTF(("------------------------------------------------------------------\n"));
4137     DPRINTF(("%s\n", pattern));
4138    
4139     /* The first thing to do is to make a pass over the pattern to compute the
4140     amount of store required to hold the compiled code. This does not have to be
4141     perfect as long as errors are overestimates. At the same time we can detect any
4142     flag settings right at the start, and extract them. Make an attempt to correct
4143     for any counted white space if an "extended" flag setting appears late in the
4144     pattern. We can't be so clever for #-comments. */
4145    
4146     ptr = (const uschar *)(pattern - 1);
4147     while ((c = *(++ptr)) != 0)
4148     {
4149     int min, max;
4150     int class_optcount;
4151     int bracket_length;
4152     int duplength;
4153    
4154     /* If we are inside a \Q...\E sequence, all chars are literal */
4155    
4156     if (inescq)
4157     {
4158     if ((options & PCRE_AUTO_CALLOUT) != 0) length += 2 + 2*LINK_SIZE;
4159     goto NORMAL_CHAR;
4160     }
4161    
4162     /* Otherwise, first check for ignored whitespace and comments */
4163    
4164     if ((options & PCRE_EXTENDED) != 0)
4165     {
4166 nigel 91 if ((cd->ctypes[c] & ctype_space) != 0) continue;
4167 nigel 77 if (c == '#')
4168     {
4169 nigel 91 while (*(++ptr) != 0) if (IS_NEWLINE(ptr)) break;
4170     if (*ptr != 0)
4171     {
4172     ptr += cd->nllen - 1;
4173     continue;
4174     }
4175     break; /* End loop at end of pattern */
4176 nigel 77 }
4177     }
4178    
4179     item_count++; /* Is zero for the first non-comment item */
4180    
4181     /* Allow space for auto callout before every item except quantifiers. */
4182    
4183     if ((options & PCRE_AUTO_CALLOUT) != 0 &&
4184     c != '*' && c != '+' && c != '?' &&
4185     (c != '{' || !is_counted_repeat(ptr + 1)))
4186     length += 2 + 2*LINK_SIZE;
4187    
4188     switch(c)
4189     {
4190     /* A backslashed item may be an escaped data character or it may be a
4191     character type. */
4192    
4193     case '\\':
4194     c = check_escape(&ptr, &errorcode, bracount, options, FALSE);
4195     if (errorcode != 0) goto PCRE_ERROR_RETURN;
4196    
4197     lastitemlength = 1; /* Default length of last item for repeats */
4198    
4199     if (c >= 0) /* Data character */
4200     {
4201     length += 2; /* For a one-byte character */
4202    
4203     #ifdef SUPPORT_UTF8
4204     if (utf8 && c > 127)
4205     {
4206     int i;
4207     for (i = 0; i < _pcre_utf8_table1_size; i++)
4208     if (c <= _pcre_utf8_table1[i]) break;
4209     length += i;
4210     lastitemlength += i;
4211     }
4212     #endif
4213    
4214     continue;
4215     }
4216    
4217     /* If \Q, enter "literal" mode */
4218    
4219     if (-c == ESC_Q)
4220     {
4221     inescq = TRUE;
4222     continue;
4223     }
4224    
4225     /* \X is supported only if Unicode property support is compiled */
4226    
4227     #ifndef SUPPORT_UCP
4228     if (-c == ESC_X)
4229     {
4230     errorcode = ERR45;
4231     goto PCRE_ERROR_RETURN;
4232     }
4233     #endif
4234    
4235     /* \P and \p are for Unicode properties, but only when the support has
4236 nigel 87 been compiled. Each item needs 3 bytes. */
4237 nigel 77
4238     else if (-c == ESC_P || -c == ESC_p)
4239     {
4240     #ifdef SUPPORT_UCP
4241     BOOL negated;
4242 nigel 87 BOOL pdata;
4243     length += 3;
4244     lastitemlength = 3;
4245     if (get_ucp(&ptr, &negated, &pdata, &errorcode) < 0)
4246     goto PCRE_ERROR_RETURN;
4247 nigel 77 continue;
4248     #else
4249     errorcode = ERR45;
4250     goto PCRE_ERROR_RETURN;
4251     #endif
4252     }
4253    
4254     /* Other escapes need one byte */
4255    
4256     length++;
4257    
4258     /* A back reference needs an additional 2 bytes, plus either one or 5
4259     bytes for a repeat. We also need to keep the value of the highest
4260     back reference. */
4261    
4262     if (c <= -ESC_REF)
4263     {
4264     int refnum = -c - ESC_REF;
4265 nigel 91 cd->backref_map |= (refnum < 32)? (1 << refnum) : 1;
4266     if (refnum > cd->top_backref)
4267     cd->top_backref = refnum;
4268 nigel 77 length += 2; /* For single back reference */
4269     if (ptr[1] == '{' && is_counted_repeat(ptr+2))
4270     {
4271     ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
4272     if (errorcode != 0) goto PCRE_ERROR_RETURN;
4273     if ((min == 0 && (max == 1 || max == -1)) ||
4274     (min == 1 && max == -1))
4275     length++;
4276     else length += 5;
4277     if (ptr[1] == '?') ptr++;
4278     }
4279     }
4280     continue;
4281