/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 175 - (hide annotations) (download)
Mon Jun 11 13:38:38 2007 UTC (7 years, 2 months ago) by ph10
File MIME type: text/plain
File size: 176642 byte(s)
Added the Perl 5.10 (?| "branch reset" feature.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 117 Copyright (c) 1997-2007 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
/* These three names are defined before pcre_internal.h is included, so that
header can see them. Each one expands to an identifier that must be in scope
wherever the macro is used. NOTE(review): presumably they are consumed by
newline- and string-handling macros in pcre_internal.h - confirm there. */

#define NLBLOCK cd             /* Block containing newline information */
#define PSSTART start_pattern  /* Field containing processed string start */
#define PSEND   end_pattern    /* Field containing processed string end */
48    
49    
#include "pcre_internal.h"
#include <limits.h>
51    
52    
53 nigel 85 /* When DEBUG is defined, we need the pcre_printint() function, which is also
54     used by pcretest. DEBUG is not defined when building a production library. */
55    
56     #ifdef DEBUG
57     #include "pcre_printint.src"
58     #endif
59    
60    
61 nigel 77 /*************************************************
62     * Code parameters and static tables *
63     *************************************************/
64    
65 nigel 93 /* This value specifies the size of stack workspace that is used during the
66     first pre-compile phase that determines how much memory is required. The regex
67     is partly compiled into this space, but the compiled parts are discarded as
68     soon as they can be, so that hopefully there will never be an overrun. The code
69     does, however, check for an overrun. The largest amount I've seen used is 218,
70     so this number is very generous.
71 nigel 77
72 nigel 93 The same workspace is used during the second, actual compile phase for
73     remembering forward references to groups so that they can be filled in at the
74     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
75     is 4 there is plenty of room. */
76 nigel 77
/* Size of the stack workspace described above; shared by the pre-compile and
the real compile phases. */

#define COMPILE_WORK_SIZE (4096)
78 nigel 77
79 nigel 93
80 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
81     are simple data values; negative values are for special things like \d and so
82     on. Zero means further processing is needed (for things like \x), or the escape
83     is invalid. */
84    
85 ph10 97 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
86 nigel 77 static const short int escapes[] = {
87     0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
88     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
89     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
90 ph10 168 0, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
91 nigel 93 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
92 nigel 77 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
93     '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
94 nigel 93 0, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
95 nigel 77 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
96     0, 0, -ESC_z /* x - z */
97     };
98    
99 ph10 97 #else /* This is the "abnormal" table for EBCDIC systems */
100 nigel 77 static const short int escapes[] = {
101     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
102     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
103     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
104     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
105     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
106     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
107     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
108     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
109     /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
110 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
111 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
112     /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
113     /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
114     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
115     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
116     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
117     /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
118     /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
119 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
120 nigel 77 /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
121     /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
122     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
123     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
124     };
125     #endif
126    
127    
128     /* Tables of names of POSIX character classes and their lengths. The list is
129 nigel 87 terminated by a zero length entry. The first three must be alpha, lower, upper,
130 nigel 77 as this is assumed for handling case independence. */
131    
/* Names of the POSIX classes. The order is significant: entry n here
corresponds to entry n of posix_name_lengths and to the n-th triple of
posix_class_maps, and the first three must stay alpha, lower, upper. */

static const char *const posix_names[] = {
  "alpha", "lower", "upper",
  "alnum", "ascii", "blank", "cntrl", "digit", "graph",
  "print", "punct", "space", "word",  "xdigit" };
136    
137     static const uschar posix_name_lengths[] = {
138     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
139    
140 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
141     base map, with an optional addition or removal of another map. Then, for some
142     classes, there is some additional tweaking: for [:blank:] the vertical space
143     characters are removed, and for [:alpha:] and [:alnum:] the underscore
144     character is removed. The triples in the table consist of the base map offset,
145     second map offset or -1 if no second map, and a non-negative value for map
146     addition or a negative value for map subtraction (if there are two maps). The
147     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
148     remove vertical space characters, 2 => remove underscore. */
149 nigel 77
150     static const int posix_class_maps[] = {
151 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
152     cbit_lower, -1, 0, /* lower */
153     cbit_upper, -1, 0, /* upper */
154     cbit_word, -1, 2, /* alnum - word without underscore */
155     cbit_print, cbit_cntrl, 0, /* ascii */
156     cbit_space, -1, 1, /* blank - a GNU extension */
157     cbit_cntrl, -1, 0, /* cntrl */
158     cbit_digit, -1, 0, /* digit */
159     cbit_graph, -1, 0, /* graph */
160     cbit_print, -1, 0, /* print */
161     cbit_punct, -1, 0, /* punct */
162     cbit_space, -1, 0, /* space */
163     cbit_word, -1, 0, /* word - a Perl extension */
164     cbit_xdigit,-1, 0 /* xdigit */
165 nigel 77 };
166    
167    
/* STRING(a) turns its argument into a string literal exactly as written.
XSTRING(s) adds one level of indirection so that s is macro-expanded first;
this is what lets the error texts below embed the values of numeric macros. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)
170    
171 nigel 77 /* The texts of compile-time error messages. These are "char *" because they
172 nigel 93 are passed to the outside world. Do not ever re-use any error number, because
173     they are documented. Always add a new error instead. Messages marked DEAD below
174     are no longer used. */
175 nigel 77
176     static const char *error_texts[] = {
177     "no error",
178     "\\ at end of pattern",
179     "\\c at end of pattern",
180     "unrecognized character follows \\",
181     "numbers out of order in {} quantifier",
182     /* 5 */
183     "number too big in {} quantifier",
184     "missing terminating ] for character class",
185     "invalid escape sequence in character class",
186     "range out of order in character class",
187     "nothing to repeat",
188     /* 10 */
189 nigel 93 "operand of unlimited repeat could match the empty string", /** DEAD **/
190 nigel 77 "internal error: unexpected repeat",
191     "unrecognized character after (?",
192     "POSIX named classes are supported only within a class",
193     "missing )",
194     /* 15 */
195     "reference to non-existent subpattern",
196     "erroffset passed as NULL",
197     "unknown option bit(s) set",
198     "missing ) after comment",
199 nigel 93 "parentheses nested too deeply", /** DEAD **/
200 nigel 77 /* 20 */
201     "regular expression too large",
202     "failed to get memory",
203     "unmatched parentheses",
204     "internal error: code overflow",
205     "unrecognized character after (?<",
206     /* 25 */
207     "lookbehind assertion is not fixed length",
208 nigel 91 "malformed number or name after (?(",
209 nigel 77 "conditional group contains more than two branches",
210     "assertion expected after (?(",
211 ph10 166 "(?R or (?[+-]digits must be followed by )",
212 nigel 77 /* 30 */
213     "unknown POSIX class name",
214     "POSIX collating elements are not supported",
215     "this version of PCRE is not compiled with PCRE_UTF8 support",
216 nigel 93 "spare error", /** DEAD **/
217 nigel 77 "character value in \\x{...} sequence is too large",
218     /* 35 */
219     "invalid condition (?(0)",
220     "\\C not allowed in lookbehind assertion",
221     "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
222     "number after (?C is > 255",
223     "closing ) for (?C expected",
224     /* 40 */
225     "recursive call could loop indefinitely",
226     "unrecognized character after (?P",
227 nigel 93 "syntax error in subpattern name (missing terminator)",
228 nigel 91 "two named subpatterns have the same name",
229 nigel 77 "invalid UTF-8 string",
230     /* 45 */
231     "support for \\P, \\p, and \\X has not been compiled",
232     "malformed \\P or \\p sequence",
233 nigel 91 "unknown property name after \\P or \\p",
234 nigel 93 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
235     "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
236 nigel 91 /* 50 */
237     "repeated subpattern is too long",
238 nigel 93 "octal value is greater than \\377 (not in UTF-8 mode)",
239     "internal error: overran compiling workspace",
240     "internal error: previously-checked referenced subpattern not found",
241     "DEFINE group contains more than one branch",
242     /* 55 */
243     "repeating a DEFINE group is not allowed",
244     "inconsistent NEWLINE options",
245 ph10 171 "\\g is not followed by a braced name or an optionally braced non-zero number",
246 ph10 172 "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number"
247 nigel 77 };
248    
249    
250     /* Table to identify digits and hex digits. This is used when compiling
251     patterns. Note that the tables in chartables are dependent on the locale, and
252     may mark arbitrary characters as digits - but the PCRE compiling code expects
253     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
254     a private table here. It costs 256 bytes, but it is a lot faster than doing
255     character value tests (at least in some simple cases I timed), and in some
256     applications one wants PCRE to compile efficiently as well as match
257     efficiently.
258    
259     For convenience, we use the same bit definitions as in chartables:
260    
261     0x04 decimal digit
262     0x08 hexadecimal digit
263    
264     Then we can use ctype_digit and ctype_xdigit in the code. */
265    
#ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */

/* Indexed by character value (0-255). 0x0c marks a decimal digit (both the
decimal and the hexadecimal bit are set); 0x08 marks a letter that is a
hexadecimal digit only. */

static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */

#else           /* This is the "abnormal" case, for EBCDIC systems */

/* Same bit meanings as the ASCII table, laid out for EBCDIC code points. The
hexadecimal value at the right of every second row is the code point at which
that row starts. */

static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */

/* Partial copy of the character-type table for EBCDIC. In this file it is
consulted with the mask 0x0E to decide whether the character after a
backslash is alphameric. */

static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
  0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
  0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
  0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
  0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
#endif
372    
373    
374     /* Definition to allow mutual recursion */
375    
376     static BOOL
377 ph10 175 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
378     int *, int *, branch_chain *, compile_data *, int *);
379 nigel 77
380    
381    
382     /*************************************************
383     * Handle escapes *
384     *************************************************/
385    
386     /* This function is called when a \ has been encountered. It either returns a
387     positive value for a simple escape such as \n, or a negative value which
388 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
389     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
390     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
391     ptr is pointing at the \. On exit, it is on the final character of the escape
392     sequence.
393 nigel 77
394     Arguments:
395     ptrptr points to the pattern position pointer
396     errorcodeptr points to the errorcode variable
397     bracount number of previous extracting brackets
398     options the options bits
399     isclass TRUE if inside a character class
400    
401     Returns: zero or positive => a data character
402     negative => a special escape sequence
403     on error, errorcodeptr is set
404     */
405    
406     static int
407     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
408     int options, BOOL isclass)
409     {
410 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
411     const uschar *ptr = *ptrptr + 1;
412 nigel 77 int c, i;
413    
414 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
415     ptr--; /* Set pointer back to the last byte */
416    
417 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
418    
419     if (c == 0) *errorcodeptr = ERR1;
420    
421     /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
422     a table. A non-zero result is something that can be returned immediately.
423     Otherwise further processing may be required. */
424    
425 ph10 97 #ifndef EBCDIC /* ASCII coding */
426 nigel 77 else if (c < '0' || c > 'z') {} /* Not alphameric */
427     else if ((i = escapes[c - '0']) != 0) c = i;
428    
429 ph10 97 #else /* EBCDIC coding */
430 nigel 77 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
431     else if ((i = escapes[c - 0x48]) != 0) c = i;
432     #endif
433    
434     /* Escapes that need further processing, or are illegal. */
435    
436     else
437     {
438     const uschar *oldptr;
439 nigel 93 BOOL braced, negated;
440    
441 nigel 77 switch (c)
442     {
443     /* A number of Perl escapes are not handled by PCRE. We give an explicit
444     error. */
445    
446     case 'l':
447     case 'L':
448     case 'N':
449     case 'u':
450     case 'U':
451     *errorcodeptr = ERR37;
452     break;
453    
454 nigel 93 /* \g must be followed by a number, either plain or braced. If positive, it
455     is an absolute backreference. If negative, it is a relative backreference.
456 ph10 172 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
457     reference to a named group. This is part of Perl's movement towards a
458     unified syntax for back references. As this is synonymous with \k{name}, we
459 ph10 171 fudge it up by pretending it really was \k. */
460 nigel 93
461     case 'g':
462     if (ptr[1] == '{')
463     {
464 ph10 171 const uschar *p;
465     for (p = ptr+2; *p != 0 && *p != '}'; p++)
466     if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
467 ph10 172 if (*p != 0 && *p != '}')
468 ph10 171 {
469     c = -ESC_k;
470     break;
471 ph10 172 }
472 nigel 93 braced = TRUE;
473     ptr++;
474     }
475     else braced = FALSE;
476    
477     if (ptr[1] == '-')
478     {
479     negated = TRUE;
480     ptr++;
481     }
482     else negated = FALSE;
483    
484     c = 0;
485     while ((digitab[ptr[1]] & ctype_digit) != 0)
486     c = c * 10 + *(++ptr) - '0';
487    
488     if (c == 0 || (braced && *(++ptr) != '}'))
489     {
490     *errorcodeptr = ERR57;
491     return 0;
492     }
493    
494     if (negated)
495     {
496     if (c > bracount)
497     {
498     *errorcodeptr = ERR15;
499     return 0;
500     }
501     c = bracount - (c - 1);
502     }
503    
504     c = -(ESC_REF + c);
505     break;
506    
507 nigel 77 /* The handling of escape sequences consisting of a string of digits
508     starting with one that is not zero is not straightforward. By experiment,
509     the way Perl works seems to be as follows:
510    
511     Outside a character class, the digits are read as a decimal number. If the
512     number is less than 10, or if there are that many previous extracting
513     left brackets, then it is a back reference. Otherwise, up to three octal
514     digits are read to form an escaped byte. Thus \123 is likely to be octal
515     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
516     value is greater than 377, the least significant 8 bits are taken. Inside a
517     character class, \ followed by a digit is always an octal number. */
518    
519     case '1': case '2': case '3': case '4': case '5':
520     case '6': case '7': case '8': case '9':
521    
522     if (!isclass)
523     {
524     oldptr = ptr;
525     c -= '0';
526     while ((digitab[ptr[1]] & ctype_digit) != 0)
527     c = c * 10 + *(++ptr) - '0';
528     if (c < 10 || c <= bracount)
529     {
530     c = -(ESC_REF + c);
531     break;
532     }
533     ptr = oldptr; /* Put the pointer back and fall through */
534     }
535    
536     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
537     generates a binary zero byte and treats the digit as a following literal.
538     Thus we have to pull back the pointer by one. */
539    
540     if ((c = *ptr) >= '8')
541     {
542     ptr--;
543     c = 0;
544     break;
545     }
546    
547     /* \0 always starts an octal number, but we may drop through to here with a
548 nigel 91 larger first octal digit. The original code used just to take the least
549     significant 8 bits of octal numbers (I think this is what early Perls used
550     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
551     than 3 octal digits. */
552 nigel 77
553     case '0':
554     c -= '0';
555     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
556     c = c * 8 + *(++ptr) - '0';
557 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
558 nigel 77 break;
559    
560 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
561     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
562     treated as a data character. */
563 nigel 77
564     case 'x':
565 nigel 87 if (ptr[1] == '{')
566 nigel 77 {
567     const uschar *pt = ptr + 2;
568 nigel 87 int count = 0;
569    
570 nigel 77 c = 0;
571     while ((digitab[*pt] & ctype_xdigit) != 0)
572     {
573 nigel 87 register int cc = *pt++;
574     if (c == 0 && cc == '0') continue; /* Leading zeroes */
575 nigel 77 count++;
576 nigel 87
577 ph10 97 #ifndef EBCDIC /* ASCII coding */
578 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
579 nigel 87 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
580 ph10 97 #else /* EBCDIC coding */
581 nigel 77 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
582 nigel 87 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
583 nigel 77 #endif
584     }
585 nigel 87
586 nigel 77 if (*pt == '}')
587     {
588 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
589 nigel 77 ptr = pt;
590     break;
591     }
592 nigel 87
593 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
594     recognize this construct; fall through to the normal \x handling. */
595     }
596    
597 nigel 87 /* Read just a single-byte hex-defined char */
598 nigel 77
599     c = 0;
600     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
601     {
602     int cc; /* Some compilers don't like ++ */
603     cc = *(++ptr); /* in initializers */
604 ph10 97 #ifndef EBCDIC /* ASCII coding */
605 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
606     c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
607 ph10 97 #else /* EBCDIC coding */
608 nigel 77 if (cc <= 'z') cc += 64; /* Convert to upper case */
609     c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
610     #endif
611     }
612     break;
613    
614 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
615     This coding is ASCII-specific, but then the whole concept of \cx is
616     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
617 nigel 77
618     case 'c':
619     c = *(++ptr);
620     if (c == 0)
621     {
622     *errorcodeptr = ERR2;
623     return 0;
624     }
625    
626 ph10 97 #ifndef EBCDIC /* ASCII coding */
627 nigel 77 if (c >= 'a' && c <= 'z') c -= 32;
628     c ^= 0x40;
629 ph10 97 #else /* EBCDIC coding */
630 nigel 77 if (c >= 'a' && c <= 'z') c += 64;
631     c ^= 0xC0;
632     #endif
633     break;
634    
635     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
636     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
637     for Perl compatibility, it is a literal. This code looks a bit odd, but
638     there used to be some cases other than the default, and there may be again
639     in future, so I haven't "optimized" it. */
640    
641     default:
642     if ((options & PCRE_EXTRA) != 0) switch(c)
643     {
644     default:
645     *errorcodeptr = ERR3;
646     break;
647     }
648     break;
649     }
650     }
651    
652     *ptrptr = ptr;
653     return c;
654     }
655    
656    
657    
658     #ifdef SUPPORT_UCP
659     /*************************************************
660     * Handle \P and \p *
661     *************************************************/
662    
663     /* This function is called after \P or \p has been encountered, provided that
664     PCRE is compiled with support for Unicode properties. On entry, ptrptr is
665     pointing at the P or p. On exit, it is pointing at the final character of the
666     escape sequence.
667    
668     Arguments:
669     ptrptr points to the pattern position pointer
670     negptr points to a boolean that is set TRUE for negation else FALSE
671 nigel 87 dptr points to an int that is set to the detailed property value
672 nigel 77 errorcodeptr points to the error code variable
673    
674 nigel 87 Returns: type value from the matching _pcre_utt entry, or -1 for an invalid type
675 nigel 77 */
676    
677     static int
678 nigel 87 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
679 nigel 77 {
680     int c, i, bot, top;
681     const uschar *ptr = *ptrptr;
682 nigel 87 char name[32];
683 nigel 77
684     c = *(++ptr);
685     if (c == 0) goto ERROR_RETURN;
686    
687     *negptr = FALSE;
688    
689 nigel 87 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
690     negation. */
691 nigel 77
692     if (c == '{')
693     {
694     if (ptr[1] == '^')
695     {
696     *negptr = TRUE;
697     ptr++;
698     }
699 nigel 87 for (i = 0; i < sizeof(name) - 1; i++)
700 nigel 77 {
701     c = *(++ptr);
702     if (c == 0) goto ERROR_RETURN;
703     if (c == '}') break;
704     name[i] = c;
705     }
706 nigel 87 if (c !='}') goto ERROR_RETURN;
707 nigel 77 name[i] = 0;
708     }
709    
710     /* Otherwise there is just one following character */
711    
712     else
713     {
714     name[0] = c;
715     name[1] = 0;
716     }
717    
718     *ptrptr = ptr;
719    
720     /* Search for a recognized property name using binary chop */
721    
722     bot = 0;
723     top = _pcre_utt_size;
724    
725     while (bot < top)
726     {
727 nigel 87 i = (bot + top) >> 1;
728 nigel 77 c = strcmp(name, _pcre_utt[i].name);
729 nigel 87 if (c == 0)
730     {
731     *dptr = _pcre_utt[i].value;
732     return _pcre_utt[i].type;
733     }
734 nigel 77 if (c > 0) bot = i + 1; else top = i;
735     }
736    
737     *errorcodeptr = ERR47;
738     *ptrptr = ptr;
739     return -1;
740    
741     ERROR_RETURN:
742     *errorcodeptr = ERR46;
743     *ptrptr = ptr;
744     return -1;
745     }
746     #endif
747    
748    
749    
750    
751     /*************************************************
752     * Check for counted repeat *
753     *************************************************/
754    
755     /* This function is called when a '{' is encountered in a place where it might
756     start a quantifier. It looks ahead to see if it really is a quantifier or not.
757     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
758     where the ddds are digits.
759    
760     Arguments:
761     p pointer to the first char after '{'
762    
763     Returns: TRUE or FALSE
764     */
765    
766     static BOOL
767     is_counted_repeat(const uschar *p)
768     {
769     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
770     while ((digitab[*p] & ctype_digit) != 0) p++;
771     if (*p == '}') return TRUE;
772    
773     if (*p++ != ',') return FALSE;
774     if (*p == '}') return TRUE;
775    
776     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
777     while ((digitab[*p] & ctype_digit) != 0) p++;
778    
779     return (*p == '}');
780     }
781    
782    
783    
784     /*************************************************
785     * Read repeat counts *
786     *************************************************/
787    
788     /* Read an item of the form {n,m} and return the values. This is called only
789     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
790     so the syntax is guaranteed to be correct, but we need to check the values.
791    
792     Arguments:
793     p pointer to first char after '{'
794     minp pointer to int for min
795     maxp pointer to int for max
796     returned as -1 if no max
797     errorcodeptr points to error code variable
798    
799     Returns: pointer to '}' on success;
800     current ptr on error, with errorcodeptr set non-zero
801     */
802    
803     static const uschar *
804     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
805     {
806     int min = 0;
807     int max = -1;
808    
809 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
810     an integer overflow. */
811    
812 nigel 77 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
813 nigel 81 if (min < 0 || min > 65535)
814     {
815     *errorcodeptr = ERR5;
816     return p;
817     }
818 nigel 77
819 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
820     Also, max must not be less than min. */
821    
822 nigel 77 if (*p == '}') max = min; else
823     {
824     if (*(++p) != '}')
825     {
826     max = 0;
827     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
828 nigel 81 if (max < 0 || max > 65535)
829     {
830     *errorcodeptr = ERR5;
831     return p;
832     }
833 nigel 77 if (max < min)
834     {
835     *errorcodeptr = ERR4;
836     return p;
837     }
838     }
839     }
840    
841 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
842     '}'. */
843 nigel 77
844 nigel 81 *minp = min;
845     *maxp = max;
846 nigel 77 return p;
847     }
848    
849    
850    
851     /*************************************************
852 nigel 93 * Find forward referenced subpattern *
853 nigel 91 *************************************************/
854    
855 nigel 93 /* This function scans along a pattern's text looking for capturing
856     subpatterns, and counting them. If it finds a named pattern that matches the
857     name it is given, it returns its number. Alternatively, if the name is NULL, it
858     returns when it reaches a given numbered subpattern. This is used for forward
859     references to subpatterns. We know that if (?P< is encountered, the name will
860     be terminated by '>' because that is checked in the first pass.
861 nigel 91
862     Arguments:
863 nigel 93 ptr current position in the pattern
864     count current count of capturing parens so far encountered
865     name name to seek, or NULL if seeking a numbered subpattern
866     lorn name length, or subpattern number if name is NULL
867     xmode TRUE if we are in /x mode
868 nigel 91
869     Returns: the number of the named subpattern, or -1 if not found
870     */
871    
872     static int
873 nigel 93 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
874     BOOL xmode)
875 nigel 91 {
876     const uschar *thisname;
877 nigel 93
878 nigel 91 for (; *ptr != 0; ptr++)
879     {
880 nigel 93 int term;
881    
882     /* Skip over backslashed characters and also entire \Q...\E */
883    
884     if (*ptr == '\\')
885     {
886     if (*(++ptr) == 0) return -1;
887     if (*ptr == 'Q') for (;;)
888     {
889     while (*(++ptr) != 0 && *ptr != '\\');
890     if (*ptr == 0) return -1;
891     if (*(++ptr) == 'E') break;
892     }
893     continue;
894     }
895    
896     /* Skip over character classes */
897    
898     if (*ptr == '[')
899     {
900     while (*(++ptr) != ']')
901     {
902     if (*ptr == '\\')
903     {
904     if (*(++ptr) == 0) return -1;
905     if (*ptr == 'Q') for (;;)
906     {
907     while (*(++ptr) != 0 && *ptr != '\\');
908     if (*ptr == 0) return -1;
909     if (*(++ptr) == 'E') break;
910     }
911     continue;
912     }
913     }
914     continue;
915     }
916    
917     /* Skip comments in /x mode */
918    
919     if (xmode && *ptr == '#')
920     {
921     while (*(++ptr) != 0 && *ptr != '\n');
922     if (*ptr == 0) return -1;
923     continue;
924     }
925    
926     /* An opening parens must now be a real metacharacter */
927    
928 nigel 91 if (*ptr != '(') continue;
929 nigel 93 if (ptr[1] != '?')
930     {
931     count++;
932     if (name == NULL && count == lorn) return count;
933     continue;
934     }
935    
936     ptr += 2;
937     if (*ptr == 'P') ptr++; /* Allow optional P */
938    
939     /* We have to disambiguate (?<! and (?<= from (?<name> */
940    
941     if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
942     *ptr != '\'')
943     continue;
944    
945 nigel 91 count++;
946 nigel 93
947     if (name == NULL && count == lorn) return count;
948     term = *ptr++;
949     if (term == '<') term = '>';
950 nigel 91 thisname = ptr;
951 nigel 93 while (*ptr != term) ptr++;
952     if (name != NULL && lorn == ptr - thisname &&
953     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
954 nigel 91 return count;
955     }
956 nigel 93
957 nigel 91 return -1;
958     }
959    
960    
961    
962     /*************************************************
963 nigel 77 * Find first significant op code *
964     *************************************************/
965    
966     /* This is called by several functions that scan a compiled expression looking
967     for a fixed first character, or an anchoring op code etc. It skips over things
968     that do not influence this. For some calls, a change of option is important.
969     For some calls, it makes sense to skip negative forward and all backward
970     assertions, and also the \b assertion; for others it does not.
971    
972     Arguments:
973     code pointer to the start of the group
974     options pointer to external options
975     optbit the option bit whose changing is significant, or
976     zero if none are
977     skipassert TRUE if certain assertions are to be skipped
978    
979     Returns: pointer to the first significant opcode
980     */
981    
982     static const uschar*
983     first_significant_code(const uschar *code, int *options, int optbit,
984     BOOL skipassert)
985     {
986     for (;;)
987     {
988     switch ((int)*code)
989     {
990     case OP_OPT:
991     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
992     *options = (int)code[1];
993     code += 2;
994     break;
995    
996     case OP_ASSERT_NOT:
997     case OP_ASSERTBACK:
998     case OP_ASSERTBACK_NOT:
999     if (!skipassert) return code;
1000     do code += GET(code, 1); while (*code == OP_ALT);
1001     code += _pcre_OP_lengths[*code];
1002     break;
1003    
1004     case OP_WORD_BOUNDARY:
1005     case OP_NOT_WORD_BOUNDARY:
1006     if (!skipassert) return code;
1007     /* Fall through */
1008    
1009     case OP_CALLOUT:
1010     case OP_CREF:
1011 nigel 93 case OP_RREF:
1012     case OP_DEF:
1013 nigel 77 code += _pcre_OP_lengths[*code];
1014     break;
1015    
1016     default:
1017     return code;
1018     }
1019     }
1020     /* Control never reaches here */
1021     }
1022    
1023    
1024    
1025    
1026     /*************************************************
1027     * Find the fixed length of a pattern *
1028     *************************************************/
1029    
1030     /* Scan a pattern and compute the fixed length of subject that will match it,
1031     if the length is fixed. This is needed for dealing with backward assertions.
1032     In UTF8 mode, the result is in characters rather than bytes.
1033    
1034     Arguments:
1035     code points to the start of the pattern (the bracket)
1036     options the compiling options
1037    
1038     Returns: the fixed length, or -1 if there is no fixed length,
1039     or -2 if \C was encountered
1040     */
1041    
1042     static int
1043     find_fixedlength(uschar *code, int options)
1044     {
1045     int length = -1;
1046    
1047     register int branchlength = 0;
1048     register uschar *cc = code + 1 + LINK_SIZE;
1049    
1050     /* Scan along the opcodes for this branch. If we get to the end of the
1051     branch, check the length against that of the other branches. */
1052    
1053     for (;;)
1054     {
1055     int d;
1056     register int op = *cc;
1057    
1058     switch (op)
1059     {
1060 nigel 93 case OP_CBRA:
1061 nigel 77 case OP_BRA:
1062     case OP_ONCE:
1063     case OP_COND:
1064 nigel 93 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1065 nigel 77 if (d < 0) return d;
1066     branchlength += d;
1067     do cc += GET(cc, 1); while (*cc == OP_ALT);
1068     cc += 1 + LINK_SIZE;
1069     break;
1070    
1071     /* Reached end of a branch; if it's a ket it is the end of a nested
1072     call. If it's ALT it is an alternation in a nested call. If it is
1073     END it's the end of the outer call. All can be handled by the same code. */
1074    
1075     case OP_ALT:
1076     case OP_KET:
1077     case OP_KETRMAX:
1078     case OP_KETRMIN:
1079     case OP_END:
1080     if (length < 0) length = branchlength;
1081     else if (length != branchlength) return -1;
1082     if (*cc != OP_ALT) return length;
1083     cc += 1 + LINK_SIZE;
1084     branchlength = 0;
1085     break;
1086    
1087     /* Skip over assertive subpatterns */
1088    
1089     case OP_ASSERT:
1090     case OP_ASSERT_NOT:
1091     case OP_ASSERTBACK:
1092     case OP_ASSERTBACK_NOT:
1093     do cc += GET(cc, 1); while (*cc == OP_ALT);
1094     /* Fall through */
1095    
1096     /* Skip over things that don't match chars */
1097    
1098     case OP_REVERSE:
1099     case OP_CREF:
1100 nigel 93 case OP_RREF:
1101     case OP_DEF:
1102 nigel 77 case OP_OPT:
1103     case OP_CALLOUT:
1104     case OP_SOD:
1105     case OP_SOM:
1106     case OP_EOD:
1107     case OP_EODN:
1108     case OP_CIRC:
1109     case OP_DOLL:
1110     case OP_NOT_WORD_BOUNDARY:
1111     case OP_WORD_BOUNDARY:
1112     cc += _pcre_OP_lengths[*cc];
1113     break;
1114    
1115     /* Handle literal characters */
1116    
1117     case OP_CHAR:
1118     case OP_CHARNC:
1119 nigel 91 case OP_NOT:
1120 nigel 77 branchlength++;
1121     cc += 2;
1122     #ifdef SUPPORT_UTF8
1123     if ((options & PCRE_UTF8) != 0)
1124     {
1125     while ((*cc & 0xc0) == 0x80) cc++;
1126     }
1127     #endif
1128     break;
1129    
1130     /* Handle exact repetitions. The count is already in characters, but we
1131     need to skip over a multibyte character in UTF8 mode. */
1132    
1133     case OP_EXACT:
1134     branchlength += GET2(cc,1);
1135     cc += 4;
1136     #ifdef SUPPORT_UTF8
1137     if ((options & PCRE_UTF8) != 0)
1138     {
1139     while((*cc & 0x80) == 0x80) cc++;
1140     }
1141     #endif
1142     break;
1143    
1144     case OP_TYPEEXACT:
1145     branchlength += GET2(cc,1);
1146     cc += 4;
1147     break;
1148    
1149     /* Handle single-char matchers */
1150    
1151     case OP_PROP:
1152     case OP_NOTPROP:
1153 nigel 87 cc += 2;
1154 nigel 77 /* Fall through */
1155    
1156     case OP_NOT_DIGIT:
1157     case OP_DIGIT:
1158     case OP_NOT_WHITESPACE:
1159     case OP_WHITESPACE:
1160     case OP_NOT_WORDCHAR:
1161     case OP_WORDCHAR:
1162     case OP_ANY:
1163     branchlength++;
1164     cc++;
1165     break;
1166    
1167     /* The single-byte matcher isn't allowed */
1168    
1169     case OP_ANYBYTE:
1170     return -2;
1171    
1172     /* Check a class for variable quantification */
1173    
1174     #ifdef SUPPORT_UTF8
1175     case OP_XCLASS:
1176     cc += GET(cc, 1) - 33;
1177     /* Fall through */
1178     #endif
1179    
1180     case OP_CLASS:
1181     case OP_NCLASS:
1182     cc += 33;
1183    
1184     switch (*cc)
1185     {
1186     case OP_CRSTAR:
1187     case OP_CRMINSTAR:
1188     case OP_CRQUERY:
1189     case OP_CRMINQUERY:
1190     return -1;
1191    
1192     case OP_CRRANGE:
1193     case OP_CRMINRANGE:
1194     if (GET2(cc,1) != GET2(cc,3)) return -1;
1195     branchlength += GET2(cc,1);
1196     cc += 5;
1197     break;
1198    
1199     default:
1200     branchlength++;
1201     }
1202     break;
1203    
1204     /* Anything else is variable length */
1205    
1206     default:
1207     return -1;
1208     }
1209     }
1210     /* Control never gets here */
1211     }
1212    
1213    
1214    
1215    
1216     /*************************************************
1217     * Scan compiled regex for numbered bracket *
1218     *************************************************/
1219    
1220     /* This little function scans through a compiled pattern until it finds a
1221     capturing bracket with the given number.
1222    
1223     Arguments:
1224     code points to start of expression
1225     utf8 TRUE in UTF-8 mode
1226     number the required bracket number
1227    
1228     Returns: pointer to the opcode for the bracket, or NULL if not found
1229     */
1230    
1231     static const uschar *
1232     find_bracket(const uschar *code, BOOL utf8, int number)
1233     {
1234     for (;;)
1235     {
1236     register int c = *code;
1237     if (c == OP_END) return NULL;
1238 nigel 91
1239     /* XCLASS is used for classes that cannot be represented just by a bit
1240     map. This includes negated single high-valued characters. The length in
1241     the table is zero; the actual length is stored in the compiled code. */
1242    
1243     if (c == OP_XCLASS) code += GET(code, 1);
1244    
1245 nigel 93 /* Handle capturing bracket */
1246 nigel 91
1247 nigel 93 else if (c == OP_CBRA)
1248 nigel 77 {
1249 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1250 nigel 77 if (n == number) return (uschar *)code;
1251 nigel 93 code += _pcre_OP_lengths[c];
1252 nigel 77 }
1253 nigel 91
1254 nigel 93 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1255     a multi-byte character. The length in the table is a minimum, so we have to
1256     arrange to skip the extra bytes. */
1257 nigel 91
1258 nigel 77 else
1259     {
1260     code += _pcre_OP_lengths[c];
1261 ph10 107 #ifdef SUPPORT_UTF8
1262 nigel 77 if (utf8) switch(c)
1263     {
1264     case OP_CHAR:
1265     case OP_CHARNC:
1266     case OP_EXACT:
1267     case OP_UPTO:
1268     case OP_MINUPTO:
1269 nigel 93 case OP_POSUPTO:
1270 nigel 77 case OP_STAR:
1271     case OP_MINSTAR:
1272 nigel 93 case OP_POSSTAR:
1273 nigel 77 case OP_PLUS:
1274     case OP_MINPLUS:
1275 nigel 93 case OP_POSPLUS:
1276 nigel 77 case OP_QUERY:
1277     case OP_MINQUERY:
1278 nigel 93 case OP_POSQUERY:
1279     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1280 nigel 77 break;
1281     }
1282 ph10 111 #endif
1283 nigel 77 }
1284     }
1285     }
1286    
1287    
1288    
1289     /*************************************************
1290     * Scan compiled regex for recursion reference *
1291     *************************************************/
1292    
1293     /* This little function scans through a compiled pattern until it finds an
1294     instance of OP_RECURSE.
1295    
1296     Arguments:
1297     code points to start of expression
1298     utf8 TRUE in UTF-8 mode
1299    
1300     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1301     */
1302    
1303     static const uschar *
1304     find_recurse(const uschar *code, BOOL utf8)
1305     {
1306     for (;;)
1307     {
1308     register int c = *code;
1309     if (c == OP_END) return NULL;
1310 nigel 91 if (c == OP_RECURSE) return code;
1311    
1312     /* XCLASS is used for classes that cannot be represented just by a bit
1313     map. This includes negated single high-valued characters. The length in
1314     the table is zero; the actual length is stored in the compiled code. */
1315    
1316     if (c == OP_XCLASS) code += GET(code, 1);
1317    
1318     /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1319     that are followed by a character may be followed by a multi-byte character.
1320 nigel 93 The length in the table is a minimum, so we have to arrange to skip the extra
1321     bytes. */
1322 nigel 91
1323 nigel 77 else
1324     {
1325     code += _pcre_OP_lengths[c];
1326 ph10 107 #ifdef SUPPORT_UTF8
1327 nigel 77 if (utf8) switch(c)
1328     {
1329     case OP_CHAR:
1330     case OP_CHARNC:
1331     case OP_EXACT:
1332     case OP_UPTO:
1333     case OP_MINUPTO:
1334 nigel 93 case OP_POSUPTO:
1335 nigel 77 case OP_STAR:
1336     case OP_MINSTAR:
1337 nigel 93 case OP_POSSTAR:
1338 nigel 77 case OP_PLUS:
1339     case OP_MINPLUS:
1340 nigel 93 case OP_POSPLUS:
1341 nigel 77 case OP_QUERY:
1342     case OP_MINQUERY:
1343 nigel 93 case OP_POSQUERY:
1344     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1345 nigel 77 break;
1346     }
1347 ph10 111 #endif
1348 nigel 77 }
1349     }
1350     }
1351    
1352    
1353    
1354     /*************************************************
1355     * Scan compiled branch for non-emptiness *
1356     *************************************************/
1357    
1358     /* This function scans through a branch of a compiled pattern to see whether it
1359 nigel 93 can match the empty string or not. It is called from could_be_empty()
1360     below and from compile_branch() when checking for an unlimited repeat of a
1361     group that can match nothing. Note that first_significant_code() skips over
1362     assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1363     struck an inner bracket whose current branch will already have been scanned.
1364 nigel 77
1365     Arguments:
1366     code points to start of search
1367     endcode points to where to stop
1368     utf8 TRUE if in UTF8 mode
1369    
1370     Returns: TRUE if what is matched could be empty
1371     */
1372    
1373     static BOOL
1374     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1375     {
1376     register int c;
1377 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1378 nigel 77 code < endcode;
1379     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1380     {
1381     const uschar *ccode;
1382    
1383     c = *code;
1384 ph10 172
1385 ph10 170 /* Groups with zero repeats can of course be empty; skip them. */
1386 nigel 77
1387 ph10 170 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1388     {
1389 ph10 172 code += _pcre_OP_lengths[c];
1390 ph10 170 do code += GET(code, 1); while (*code == OP_ALT);
1391     c = *code;
1392     continue;
1393     }
1394    
1395     /* For other groups, scan the branches. */
1396 ph10 172
1397 nigel 93 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1398 nigel 77 {
1399     BOOL empty_branch;
1400     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1401    
1402     /* Scan a closed bracket */
1403    
1404     empty_branch = FALSE;
1405     do
1406     {
1407     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1408     empty_branch = TRUE;
1409     code += GET(code, 1);
1410     }
1411     while (*code == OP_ALT);
1412     if (!empty_branch) return FALSE; /* All branches are non-empty */
1413 ph10 172 c = *code;
1414 nigel 93 continue;
1415 nigel 77 }
1416    
1417 nigel 93 /* Handle the other opcodes */
1418    
1419     switch (c)
1420 nigel 77 {
1421     /* Check for quantifiers after a class */
1422    
1423     #ifdef SUPPORT_UTF8
1424     case OP_XCLASS:
1425     ccode = code + GET(code, 1);
1426     goto CHECK_CLASS_REPEAT;
1427     #endif
1428    
1429     case OP_CLASS:
1430     case OP_NCLASS:
1431     ccode = code + 33;
1432    
1433     #ifdef SUPPORT_UTF8
1434     CHECK_CLASS_REPEAT:
1435     #endif
1436    
1437     switch (*ccode)
1438     {
1439     case OP_CRSTAR: /* These could be empty; continue */
1440     case OP_CRMINSTAR:
1441     case OP_CRQUERY:
1442     case OP_CRMINQUERY:
1443     break;
1444    
1445     default: /* Non-repeat => class must match */
1446     case OP_CRPLUS: /* These repeats aren't empty */
1447     case OP_CRMINPLUS:
1448     return FALSE;
1449    
1450     case OP_CRRANGE:
1451     case OP_CRMINRANGE:
1452     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1453     break;
1454     }
1455     break;
1456    
1457     /* Opcodes that must match a character */
1458    
1459     case OP_PROP:
1460     case OP_NOTPROP:
1461     case OP_EXTUNI:
1462     case OP_NOT_DIGIT:
1463     case OP_DIGIT:
1464     case OP_NOT_WHITESPACE:
1465     case OP_WHITESPACE:
1466     case OP_NOT_WORDCHAR:
1467     case OP_WORDCHAR:
1468     case OP_ANY:
1469     case OP_ANYBYTE:
1470     case OP_CHAR:
1471     case OP_CHARNC:
1472     case OP_NOT:
1473     case OP_PLUS:
1474     case OP_MINPLUS:
1475 nigel 93 case OP_POSPLUS:
1476 nigel 77 case OP_EXACT:
1477     case OP_NOTPLUS:
1478     case OP_NOTMINPLUS:
1479 nigel 93 case OP_NOTPOSPLUS:
1480 nigel 77 case OP_NOTEXACT:
1481     case OP_TYPEPLUS:
1482     case OP_TYPEMINPLUS:
1483 nigel 93 case OP_TYPEPOSPLUS:
1484 nigel 77 case OP_TYPEEXACT:
1485     return FALSE;
1486    
1487     /* End of branch */
1488    
1489     case OP_KET:
1490     case OP_KETRMAX:
1491     case OP_KETRMIN:
1492     case OP_ALT:
1493     return TRUE;
1494    
1495 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1496     MINUPTO, and POSUPTO may be followed by a multibyte character */
1497 nigel 77
1498     #ifdef SUPPORT_UTF8
1499     case OP_STAR:
1500     case OP_MINSTAR:
1501 nigel 93 case OP_POSSTAR:
1502 nigel 77 case OP_QUERY:
1503     case OP_MINQUERY:
1504 nigel 93 case OP_POSQUERY:
1505 nigel 77 case OP_UPTO:
1506     case OP_MINUPTO:
1507 nigel 93 case OP_POSUPTO:
1508 nigel 77 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1509     break;
1510     #endif
1511     }
1512     }
1513    
1514     return TRUE;
1515     }
1516    
1517    
1518    
1519     /*************************************************
1520     * Scan compiled regex for non-emptiness *
1521     *************************************************/
1522    
1523     /* This function is called to check for left recursive calls. We want to check
1524     the current branch of the current pattern to see if it could match the empty
1525     string. If it could, we must look outwards for branches at other levels,
1526     stopping when we pass beyond the bracket which is the subject of the recursion.
1527    
1528     Arguments:
1529     code points to start of the recursion
1530     endcode points to where to stop (current RECURSE item)
1531     bcptr points to the chain of current (unclosed) branch starts
1532     utf8 TRUE if in UTF-8 mode
1533    
1534     Returns: TRUE if what is matched could be empty
1535     */
1536    
1537     static BOOL
1538     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1539     BOOL utf8)
1540     {
1541     while (bcptr != NULL && bcptr->current >= code)
1542     {
1543     if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1544     bcptr = bcptr->outer;
1545     }
1546     return TRUE;
1547     }
1548    
1549    
1550    
1551     /*************************************************
1552     * Check for POSIX class syntax *
1553     *************************************************/
1554    
1555     /* This function is called when the sequence "[:" or "[." or "[=" is
1556     encountered in a character class. It checks whether this is followed by an
1557     optional ^ and then a sequence of letters, terminated by a matching ":]" or
1558     ".]" or "=]".
1559    
1560     Argument:
1561     ptr pointer to the initial [
1562     endptr where to return the end pointer
1563     cd pointer to compile data
1564    
1565     Returns: TRUE or FALSE
1566     */
1567    
1568     static BOOL
1569     check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1570     {
1571     int terminator; /* Don't combine these lines; the Solaris cc */
1572     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1573     if (*(++ptr) == '^') ptr++;
1574     while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1575     if (*ptr == terminator && ptr[1] == ']')
1576     {
1577     *endptr = ptr;
1578     return TRUE;
1579     }
1580     return FALSE;
1581     }
1582    
1583    
1584    
1585    
1586     /*************************************************
1587     * Check POSIX class name *
1588     *************************************************/
1589    
1590     /* This function is called to check the name given in a POSIX-style class entry
1591     such as [:alnum:].
1592    
1593     Arguments:
1594     ptr points to the first letter
1595     len the length of the name
1596    
1597     Returns: a value representing the name, or -1 if unknown
1598     */
1599    
1600     static int
1601     check_posix_name(const uschar *ptr, int len)
1602     {
1603     register int yield = 0;
1604     while (posix_name_lengths[yield] != 0)
1605     {
1606     if (len == posix_name_lengths[yield] &&
1607     strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1608     yield++;
1609     }
1610     return -1;
1611     }
1612    
1613    
1614     /*************************************************
1615     * Adjust OP_RECURSE items in repeated group *
1616     *************************************************/
1617    
1618     /* OP_RECURSE items contain an offset from the start of the regex to the group
1619     that is referenced. This means that groups can be replicated for fixed
1620     repetition simply by copying (because the recursion is allowed to refer to
1621     earlier groups that are outside the current group). However, when a group is
1622     optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1623     it, after it has been compiled. This means that any OP_RECURSE items within it
1624     that refer to the group itself or any contained groups have to have their
1625 nigel 93 offsets adjusted. That one of the jobs of this function. Before it is called,
1626     the partially compiled regex must be temporarily terminated with OP_END.
1627 nigel 77
1628 nigel 93 This function has been extended with the possibility of forward references for
1629     recursions and subroutine calls. It must also check the list of such references
1630     for the group we are dealing with. If it finds that one of the recursions in
1631     the current group is on this list, it adjusts the offset in the list, not the
1632     value in the reference (which is a group number).
1633    
1634 nigel 77 Arguments:
1635     group points to the start of the group
1636     adjust the amount by which the group is to be moved
1637     utf8 TRUE in UTF-8 mode
1638     cd contains pointers to tables etc.
1639 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
1640 nigel 77
1641     Returns: nothing
1642     */
1643    
1644     static void
1645 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1646     uschar *save_hwm)
1647 nigel 77 {
1648     uschar *ptr = group;
1649     while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1650     {
1651 nigel 93 int offset;
1652     uschar *hc;
1653    
1654     /* See if this recursion is on the forward reference list. If so, adjust the
1655     reference. */
1656    
1657     for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1658     {
1659     offset = GET(hc, 0);
1660     if (cd->start_code + offset == ptr + 1)
1661     {
1662     PUT(hc, 0, offset + adjust);
1663     break;
1664     }
1665     }
1666    
1667     /* Otherwise, adjust the recursion offset if it's after the start of this
1668     group. */
1669    
1670     if (hc >= cd->hwm)
1671     {
1672     offset = GET(ptr, 1);
1673     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1674     }
1675    
1676 nigel 77 ptr += 1 + LINK_SIZE;
1677     }
1678     }
1679    
1680    
1681    
1682     /*************************************************
1683     * Insert an automatic callout point *
1684     *************************************************/
1685    
1686     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1687     callout points before each pattern item.
1688    
1689     Arguments:
1690     code current code pointer
1691     ptr current pattern pointer
1692     cd pointers to tables etc
1693    
1694     Returns: new code pointer
1695     */
1696    
1697     static uschar *
1698     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1699     {
1700     *code++ = OP_CALLOUT;
1701     *code++ = 255;
1702     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1703     PUT(code, LINK_SIZE, 0); /* Default length */
1704     return code + 2*LINK_SIZE;
1705     }
1706    
1707    
1708    
1709     /*************************************************
1710     * Complete a callout item *
1711     *************************************************/
1712    
1713     /* A callout item contains the length of the next item in the pattern, which
1714     we can't fill in till after we have reached the relevant point. This is used
1715     for both automatic and manual callouts.
1716    
1717     Arguments:
1718     previous_callout points to previous callout item
1719     ptr current pattern pointer
1720     cd pointers to tables etc
1721    
1722     Returns: nothing
1723     */
1724    
1725     static void
1726     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1727     {
1728     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1729     PUT(previous_callout, 2 + LINK_SIZE, length);
1730     }
1731    
1732    
1733    
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* This function is given the start and end of a class range, in UTF-8 mode
with UCP support. It looks along the range for a run of characters whose
"other" case values are consecutive, and returns that run of other case values.
The start value is updated so that a further call carries on from where this
one stopped; it is left untouched when there is nothing more to return.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int c = *cptr;
unsigned int othercase, next;

/* Find the first character in what is left of the range that has another
case. */

for (;;)
  {
  if (c > d) return FALSE;
  othercase = _pcre_ucp_othercase(c);
  if (othercase != NOTACHAR) break;
  c++;
  }

*ocptr = othercase;
next = othercase + 1;

/* Extend the run for as long as each following character's other case is one
more than the previous one. */

c++;
while (c <= d && _pcre_ucp_othercase(c) == next)
  {
  next++;
  c++;
  }

*odptr = next - 1;
*cptr = c;

return TRUE;
}
#endif  /* SUPPORT_UCP */
1779    
1780    
1781 nigel 93
1782 nigel 77 /*************************************************
1783 nigel 93 * Check if auto-possessifying is possible *
1784     *************************************************/
1785    
1786     /* This function is called for unlimited repeats of certain items, to see
1787     whether the next thing could possibly match the repeated item. If not, it makes
1788     sense to automatically possessify the repeated item.
1789    
1790     Arguments:
1791     op_code the repeated op code
1792     this data for this item, depends on the opcode
1793     utf8 TRUE in UTF-8 mode
1794     utf8_char used for utf8 character bytes, NULL if not relevant
1795     ptr next character in pattern
1796     options options bits
1797     cd contains pointers to tables etc.
1798    
1799     Returns: TRUE if possessifying is wanted
1800     */
1801    
1802     static BOOL
1803     check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1804     const uschar *ptr, int options, compile_data *cd)
1805     {
1806     int next;
1807    
1808     /* Skip whitespace and comments in extended mode */
1809    
1810     if ((options & PCRE_EXTENDED) != 0)
1811     {
1812     for (;;)
1813     {
1814     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1815     if (*ptr == '#')
1816     {
1817     while (*(++ptr) != 0)
1818     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1819     }
1820     else break;
1821     }
1822     }
1823    
1824     /* If the next item is one that we can handle, get its value. A non-negative
1825     value is a character, a negative value is an escape value. */
1826    
1827     if (*ptr == '\\')
1828     {
1829     int temperrorcode = 0;
1830     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1831     if (temperrorcode != 0) return FALSE;
1832     ptr++; /* Point after the escape sequence */
1833     }
1834    
1835     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1836     {
1837     #ifdef SUPPORT_UTF8
1838     if (utf8) { GETCHARINC(next, ptr); } else
1839     #endif
1840     next = *ptr++;
1841     }
1842    
1843     else return FALSE;
1844    
1845     /* Skip whitespace and comments in extended mode */
1846    
1847     if ((options & PCRE_EXTENDED) != 0)
1848     {
1849     for (;;)
1850     {
1851     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1852     if (*ptr == '#')
1853     {
1854     while (*(++ptr) != 0)
1855     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1856     }
1857     else break;
1858     }
1859     }
1860    
1861     /* If the next thing is itself optional, we have to give up. */
1862    
1863     if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1864     return FALSE;
1865    
1866     /* Now compare the next item with the previous opcode. If the previous is a
1867     positive single character match, "item" either contains the character or, if
1868     "item" is greater than 127 in utf8 mode, the character's bytes are in
1869     utf8_char. */
1870    
1871    
1872     /* Handle cases when the next item is a character. */
1873    
1874     if (next >= 0) switch(op_code)
1875     {
1876     case OP_CHAR:
1877     #ifdef SUPPORT_UTF8
1878     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1879     #endif
1880     return item != next;
1881    
1882     /* For CHARNC (caseless character) we must check the other case. If we have
1883     Unicode property support, we can use it to test the other case of
1884     high-valued characters. */
1885    
1886     case OP_CHARNC:
1887     #ifdef SUPPORT_UTF8
1888     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1889     #endif
1890     if (item == next) return FALSE;
1891     #ifdef SUPPORT_UTF8
1892     if (utf8)
1893     {
1894     unsigned int othercase;
1895     if (next < 128) othercase = cd->fcc[next]; else
1896     #ifdef SUPPORT_UCP
1897     othercase = _pcre_ucp_othercase((unsigned int)next);
1898     #else
1899     othercase = NOTACHAR;
1900     #endif
1901     return (unsigned int)item != othercase;
1902     }
1903     else
1904     #endif /* SUPPORT_UTF8 */
1905     return (item != cd->fcc[next]); /* Non-UTF-8 mode */
1906    
1907     /* For OP_NOT, "item" must be a single-byte character. */
1908    
1909     case OP_NOT:
1910     if (next < 0) return FALSE; /* Not a character */
1911     if (item == next) return TRUE;
1912     if ((options & PCRE_CASELESS) == 0) return FALSE;
1913     #ifdef SUPPORT_UTF8
1914     if (utf8)
1915     {
1916     unsigned int othercase;
1917     if (next < 128) othercase = cd->fcc[next]; else
1918     #ifdef SUPPORT_UCP
1919     othercase = _pcre_ucp_othercase(next);
1920     #else
1921     othercase = NOTACHAR;
1922     #endif
1923     return (unsigned int)item == othercase;
1924     }
1925     else
1926     #endif /* SUPPORT_UTF8 */
1927     return (item == cd->fcc[next]); /* Non-UTF-8 mode */
1928    
1929     case OP_DIGIT:
1930     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1931    
1932     case OP_NOT_DIGIT:
1933     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1934    
1935     case OP_WHITESPACE:
1936     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1937    
1938     case OP_NOT_WHITESPACE:
1939     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1940    
1941     case OP_WORDCHAR:
1942     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1943    
1944     case OP_NOT_WORDCHAR:
1945     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1946    
1947     default:
1948     return FALSE;
1949     }
1950    
1951    
1952     /* Handle the case when the next item is \d, \s, etc. */
1953    
1954     switch(op_code)
1955     {
1956     case OP_CHAR:
1957     case OP_CHARNC:
1958     #ifdef SUPPORT_UTF8
1959     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1960     #endif
1961     switch(-next)
1962     {
1963     case ESC_d:
1964     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
1965    
1966     case ESC_D:
1967     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
1968    
1969     case ESC_s:
1970     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
1971    
1972     case ESC_S:
1973     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
1974    
1975     case ESC_w:
1976     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
1977    
1978     case ESC_W:
1979     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
1980    
1981     default:
1982     return FALSE;
1983     }
1984    
1985     case OP_DIGIT:
1986     return next == -ESC_D || next == -ESC_s || next == -ESC_W;
1987    
1988     case OP_NOT_DIGIT:
1989     return next == -ESC_d;
1990    
1991     case OP_WHITESPACE:
1992     return next == -ESC_S || next == -ESC_d || next == -ESC_w;
1993    
1994     case OP_NOT_WHITESPACE:
1995     return next == -ESC_s;
1996    
1997     case OP_WORDCHAR:
1998     return next == -ESC_W || next == -ESC_s;
1999    
2000     case OP_NOT_WORDCHAR:
2001     return next == -ESC_w || next == -ESC_d;
2002    
2003     default:
2004     return FALSE;
2005     }
2006    
2007     /* Control does not reach here */
2008     }
2009    
2010    
2011    
2012     /*************************************************
2013 nigel 77 * Compile one branch *
2014     *************************************************/
2015    
2016 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
2017 nigel 77 changed during the branch, the pointer is used to change the external options
2018 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2019     to find out the amount of memory needed, as well as during the real compile
2020     phase. The value of lengthptr distinguishes the two phases.
2021 nigel 77
2022     Arguments:
2023     optionsptr pointer to the option bits
2024     codeptr points to the pointer to the current code point
2025     ptrptr points to the current pattern pointer
2026     errorcodeptr points to error code variable
2027     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2028     reqbyteptr set to the last literal character required, else < 0
2029     bcptr points to current branch chain
2030     cd contains pointers to tables etc.
2031 nigel 93 lengthptr NULL during the real compile phase
2032     points to length accumulator during pre-compile phase
2033 nigel 77
2034     Returns: TRUE on success
2035     FALSE, with *errorcodeptr set non-zero on error
2036     */
2037    
2038     static BOOL
2039 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2040     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2041     compile_data *cd, int *lengthptr)
2042 nigel 77 {
2043     int repeat_type, op_type;
2044     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2045     int bravalue = 0;
2046     int greedy_default, greedy_non_default;
2047     int firstbyte, reqbyte;
2048     int zeroreqbyte, zerofirstbyte;
2049     int req_caseopt, reqvary, tempreqvary;
2050     int options = *optionsptr;
2051     int after_manual_callout = 0;
2052 nigel 93 int length_prevgroup = 0;
2053 nigel 77 register int c;
2054     register uschar *code = *codeptr;
2055 nigel 93 uschar *last_code = code;
2056     uschar *orig_code = code;
2057 nigel 77 uschar *tempcode;
2058     BOOL inescq = FALSE;
2059     BOOL groupsetfirstbyte = FALSE;
2060     const uschar *ptr = *ptrptr;
2061     const uschar *tempptr;
2062     uschar *previous = NULL;
2063     uschar *previous_callout = NULL;
2064 nigel 93 uschar *save_hwm = NULL;
2065 nigel 77 uschar classbits[32];
2066    
2067     #ifdef SUPPORT_UTF8
2068     BOOL class_utf8;
2069     BOOL utf8 = (options & PCRE_UTF8) != 0;
2070     uschar *class_utf8data;
2071     uschar utf8_char[6];
2072     #else
2073     BOOL utf8 = FALSE;
2074 nigel 93 uschar *utf8_char = NULL;
2075 nigel 77 #endif
2076    
2077 nigel 93 #ifdef DEBUG
2078     if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2079     #endif
2080    
2081 nigel 77 /* Set up the default and non-default settings for greediness */
2082    
2083     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2084     greedy_non_default = greedy_default ^ 1;
2085    
2086     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2087     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2088     matches a non-fixed char first char; reqbyte just remains unset if we never
2089     find one.
2090    
2091     When we hit a repeat whose minimum is zero, we may have to adjust these values
2092     to take the zero repeat into account. This is implemented by setting them to
2093     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2094     item types that can be repeated set these backoff variables appropriately. */
2095    
2096     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2097    
2098     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2099     according to the current setting of the caseless flag. REQ_CASELESS is a bit
2100     value > 255. It is added into the firstbyte or reqbyte variables to record the
2101     case status of the value. This is used only for ASCII characters. */
2102    
2103     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2104    
2105     /* Switch on next character until the end of the branch */
2106    
2107     for (;; ptr++)
2108     {
2109     BOOL negate_class;
2110     BOOL possessive_quantifier;
2111     BOOL is_quantifier;
2112 nigel 93 BOOL is_recurse;
2113 ph10 175 BOOL reset_bracount;
2114 nigel 77 int class_charcount;
2115     int class_lastchar;
2116     int newoptions;
2117     int recno;
2118 ph10 172 int refsign;
2119 nigel 77 int skipbytes;
2120     int subreqbyte;
2121     int subfirstbyte;
2122 nigel 93 int terminator;
2123 nigel 77 int mclength;
2124     uschar mcbuffer[8];
2125    
2126 nigel 93 /* Get next byte in the pattern */
2127 nigel 77
2128     c = *ptr;
2129    
2130 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
2131     previous cycle of this loop. */
2132    
2133     if (lengthptr != NULL)
2134     {
2135     #ifdef DEBUG
2136     if (code > cd->hwm) cd->hwm = code; /* High water info */
2137     #endif
2138     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2139     {
2140     *errorcodeptr = ERR52;
2141     goto FAILED;
2142     }
2143    
2144     /* There is at least one situation where code goes backwards: this is the
2145     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2146     the class is simply eliminated. However, it is created first, so we have to
2147     allow memory for it. Therefore, don't ever reduce the length at this point.
2148     */
2149    
2150     if (code < last_code) code = last_code;
2151     *lengthptr += code - last_code;
2152     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2153    
2154     /* If "previous" is set and it is not at the start of the work space, move
2155     it back to there, in order to avoid filling up the work space. Otherwise,
2156     if "previous" is NULL, reset the current code pointer to the start. */
2157    
2158     if (previous != NULL)
2159     {
2160     if (previous > orig_code)
2161     {
2162     memmove(orig_code, previous, code - previous);
2163     code -= previous - orig_code;
2164     previous = orig_code;
2165     }
2166     }
2167     else code = orig_code;
2168    
2169     /* Remember where this code item starts so we can pick up the length
2170     next time round. */
2171    
2172     last_code = code;
2173     }
2174    
2175     /* In the real compile phase, just check the workspace used by the forward
2176     reference list. */
2177    
2178     else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2179     {
2180     *errorcodeptr = ERR52;
2181     goto FAILED;
2182     }
2183    
2184 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
2185    
2186     if (inescq && c != 0)
2187     {
2188     if (c == '\\' && ptr[1] == 'E')
2189     {
2190     inescq = FALSE;
2191     ptr++;
2192     continue;
2193     }
2194     else
2195     {
2196     if (previous_callout != NULL)
2197     {
2198 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2199     complete_callout(previous_callout, ptr, cd);
2200 nigel 77 previous_callout = NULL;
2201     }
2202     if ((options & PCRE_AUTO_CALLOUT) != 0)
2203     {
2204     previous_callout = code;
2205     code = auto_callout(code, ptr, cd);
2206     }
2207     goto NORMAL_CHAR;
2208     }
2209     }
2210    
2211     /* Fill in length of a previous callout, except when the next thing is
2212     a quantifier. */
2213    
2214     is_quantifier = c == '*' || c == '+' || c == '?' ||
2215     (c == '{' && is_counted_repeat(ptr+1));
2216    
2217     if (!is_quantifier && previous_callout != NULL &&
2218     after_manual_callout-- <= 0)
2219     {
2220 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2221     complete_callout(previous_callout, ptr, cd);
2222 nigel 77 previous_callout = NULL;
2223     }
2224    
2225     /* In extended mode, skip white space and comments */
2226    
2227     if ((options & PCRE_EXTENDED) != 0)
2228     {
2229     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2230     if (c == '#')
2231     {
2232 nigel 93 while (*(++ptr) != 0)
2233 nigel 91 {
2234 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2235 nigel 91 }
2236 nigel 93 if (*ptr != 0) continue;
2237    
2238 nigel 91 /* Else fall through to handle end of string */
2239     c = 0;
2240 nigel 77 }
2241     }
2242    
2243     /* No auto callout for quantifiers. */
2244    
2245     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2246     {
2247     previous_callout = code;
2248     code = auto_callout(code, ptr, cd);
2249     }
2250    
2251     switch(c)
2252     {
2253 nigel 93 /* ===================================================================*/
2254     case 0: /* The branch terminates at string end */
2255     case '|': /* or | or ) */
2256 nigel 77 case ')':
2257     *firstbyteptr = firstbyte;
2258     *reqbyteptr = reqbyte;
2259     *codeptr = code;
2260     *ptrptr = ptr;
2261 nigel 93 if (lengthptr != NULL)
2262     {
2263     *lengthptr += code - last_code; /* To include callout length */
2264     DPRINTF((">> end branch\n"));
2265     }
2266 nigel 77 return TRUE;
2267    
2268 nigel 93
2269     /* ===================================================================*/
2270 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
2271     the setting of any following char as a first character. */
2272    
2273     case '^':
2274     if ((options & PCRE_MULTILINE) != 0)
2275     {
2276     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2277     }
2278     previous = NULL;
2279     *code++ = OP_CIRC;
2280     break;
2281    
2282     case '$':
2283     previous = NULL;
2284     *code++ = OP_DOLL;
2285     break;
2286    
2287     /* There can never be a first char if '.' is first, whatever happens about
2288     repeats. The value of reqbyte doesn't change either. */
2289    
2290     case '.':
2291     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2292     zerofirstbyte = firstbyte;
2293     zeroreqbyte = reqbyte;
2294     previous = code;
2295     *code++ = OP_ANY;
2296     break;
2297    
2298 nigel 93
2299     /* ===================================================================*/
2300 nigel 87 /* Character classes. If the included characters are all < 256, we build a
2301     32-byte bitmap of the permitted characters, except in the special case
2302     where there is only one such character. For negated classes, we build the
2303     map as usual, then invert it at the end. However, we use a different opcode
2304     so that data characters > 255 can be handled correctly.
2305 nigel 77
2306     If the class contains characters outside the 0-255 range, a different
2307     opcode is compiled. It may optionally have a bit map for characters < 256,
2308     but those above are explicitly listed afterwards. A flag byte tells
2309     whether the bitmap is present, and whether this is a negated class or not.
2310     */
2311    
2312     case '[':
2313     previous = code;
2314    
2315     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2316     they are encountered at the top level, so we'll do that too. */
2317    
2318     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2319     check_posix_syntax(ptr, &tempptr, cd))
2320     {
2321     *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2322     goto FAILED;
2323     }
2324    
2325     /* If the first character is '^', set the negation flag and skip it. */
2326    
2327     if ((c = *(++ptr)) == '^')
2328     {
2329     negate_class = TRUE;
2330     c = *(++ptr);
2331     }
2332     else
2333     {
2334     negate_class = FALSE;
2335     }
2336    
2337     /* Keep a count of chars with values < 256 so that we can optimize the case
2338 nigel 93 of just a single character (as long as it's < 256). However, For higher
2339     valued UTF-8 characters, we don't yet do any optimization. */
2340 nigel 77
2341     class_charcount = 0;
2342     class_lastchar = -1;
2343    
2344 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
2345     temporary bit of memory, in case the class contains only 1 character (less
2346     than 256), because in that case the compiled code doesn't use the bit map.
2347     */
2348    
2349     memset(classbits, 0, 32 * sizeof(uschar));
2350    
2351 nigel 77 #ifdef SUPPORT_UTF8
2352     class_utf8 = FALSE; /* No chars >= 256 */
2353 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2354 nigel 77 #endif
2355    
2356     /* Process characters until ] is reached. By writing this as a "do" it
2357 nigel 93 means that an initial ] is taken as a data character. At the start of the
2358     loop, c contains the first byte of the character. */
2359 nigel 77
2360 nigel 93 if (c != 0) do
2361 nigel 77 {
2362 nigel 93 const uschar *oldptr;
2363    
2364 nigel 77 #ifdef SUPPORT_UTF8
2365     if (utf8 && c > 127)
2366     { /* Braces are required because the */
2367     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2368     }
2369     #endif
2370    
2371     /* Inside \Q...\E everything is literal except \E */
2372    
2373     if (inescq)
2374     {
2375 nigel 93 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2376 nigel 77 {
2377 nigel 93 inescq = FALSE; /* Reset literal state */
2378     ptr++; /* Skip the 'E' */
2379     continue; /* Carry on with next */
2380 nigel 77 }
2381 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
2382 nigel 77 }
2383    
2384     /* Handle POSIX class names. Perl allows a negation extension of the
2385     form [:^name:]. A square bracket that doesn't match the syntax is
2386     treated as a literal. We also recognize the POSIX constructions
2387     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2388     5.6 and 5.8 do. */
2389    
2390     if (c == '[' &&
2391     (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2392     check_posix_syntax(ptr, &tempptr, cd))
2393     {
2394     BOOL local_negate = FALSE;
2395 nigel 87 int posix_class, taboffset, tabopt;
2396 nigel 77 register const uschar *cbits = cd->cbits;
2397 nigel 87 uschar pbits[32];
2398 nigel 77
2399     if (ptr[1] != ':')
2400     {
2401     *errorcodeptr = ERR31;
2402     goto FAILED;
2403     }
2404    
2405     ptr += 2;
2406     if (*ptr == '^')
2407     {
2408     local_negate = TRUE;
2409     ptr++;
2410     }
2411    
2412     posix_class = check_posix_name(ptr, tempptr - ptr);
2413     if (posix_class < 0)
2414     {
2415     *errorcodeptr = ERR30;
2416     goto FAILED;
2417     }
2418    
2419     /* If matching is caseless, upper and lower are converted to
2420     alpha. This relies on the fact that the class table starts with
2421     alpha, lower, upper as the first 3 entries. */
2422    
2423     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2424     posix_class = 0;
2425    
2426 nigel 87 /* We build the bit map for the POSIX class in a chunk of local store
2427     because we may be adding and subtracting from it, and we don't want to
2428     subtract bits that may be in the main map already. At the end we or the
2429     result into the bit map that is being built. */
2430 nigel 77
2431     posix_class *= 3;
2432 nigel 87
2433     /* Copy in the first table (always present) */
2434    
2435     memcpy(pbits, cbits + posix_class_maps[posix_class],
2436     32 * sizeof(uschar));
2437    
2438     /* If there is a second table, add or remove it as required. */
2439    
2440     taboffset = posix_class_maps[posix_class + 1];
2441     tabopt = posix_class_maps[posix_class + 2];
2442    
2443     if (taboffset >= 0)
2444 nigel 77 {
2445 nigel 87 if (tabopt >= 0)
2446     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2447 nigel 77 else
2448 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2449 nigel 77 }
2450    
2451 nigel 87 /* Now see if we need to remove any special characters. An option
2452     value of 1 removes vertical space and 2 removes underscore. */
2453    
2454     if (tabopt < 0) tabopt = -tabopt;
2455     if (tabopt == 1) pbits[1] &= ~0x3c;
2456     else if (tabopt == 2) pbits[11] &= 0x7f;
2457    
2458     /* Add the POSIX table or its complement into the main table that is
2459     being built and we are done. */
2460    
2461     if (local_negate)
2462     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2463     else
2464     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2465    
2466 nigel 77 ptr = tempptr + 1;
2467     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2468     continue; /* End of POSIX syntax handling */
2469     }
2470    
2471     /* Backslash may introduce a single character, or it may introduce one
2472 nigel 93 of the specials, which just set a flag. The sequence \b is a special
2473     case. Inside a class (and only there) it is treated as backspace.
2474     Elsewhere it marks a word boundary. Other escapes have preset maps ready
2475     to or into the one we are building. We assume they have more than one
2476 nigel 77 character in them, so set class_charcount bigger than one. */
2477    
2478     if (c == '\\')
2479     {
2480 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2481     if (*errorcodeptr != 0) goto FAILED;
2482 nigel 77
2483     if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2484     else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2485 nigel 93 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2486 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
2487     {
2488     if (ptr[1] == '\\' && ptr[2] == 'E')
2489     {
2490     ptr += 2; /* avoid empty string */
2491     }
2492     else inescq = TRUE;
2493     continue;
2494     }
2495    
2496     if (c < 0)
2497     {
2498     register const uschar *cbits = cd->cbits;
2499     class_charcount += 2; /* Greater than 1 is what matters */
2500 nigel 93
2501     /* Save time by not doing this in the pre-compile phase. */
2502    
2503     if (lengthptr == NULL) switch (-c)
2504 nigel 77 {
2505     case ESC_d:
2506     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2507     continue;
2508    
2509     case ESC_D:
2510     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2511     continue;
2512    
2513     case ESC_w:
2514     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2515     continue;
2516    
2517     case ESC_W:
2518     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2519     continue;
2520    
2521     case ESC_s:
2522     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2523     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2524     continue;
2525    
2526     case ESC_S:
2527     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2528     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2529     continue;
2530    
2531 nigel 93 case ESC_E: /* Perl ignores an orphan \E */
2532     continue;
2533    
2534     default: /* Not recognized; fall through */
2535     break; /* Need "default" setting to stop compiler warning. */
2536     }
2537    
2538     /* In the pre-compile phase, just do the recognition. */
2539    
2540     else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2541     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2542    
2543     /* We need to deal with \P and \p in both phases. */
2544    
2545 nigel 77 #ifdef SUPPORT_UCP
2546 nigel 93 if (-c == ESC_p || -c == ESC_P)
2547     {
2548     BOOL negated;
2549     int pdata;
2550     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2551     if (ptype < 0) goto FAILED;
2552     class_utf8 = TRUE;
2553     *class_utf8data++ = ((-c == ESC_p) != negated)?
2554     XCL_PROP : XCL_NOTPROP;
2555     *class_utf8data++ = ptype;
2556     *class_utf8data++ = pdata;
2557     class_charcount -= 2; /* Not a < 256 character */
2558 nigel 77 continue;
2559 nigel 93 }
2560 nigel 77 #endif
2561 nigel 93 /* Unrecognized escapes are faulted if PCRE is running in its
2562     strict mode. By default, for compatibility with Perl, they are
2563     treated as literals. */
2564 nigel 77
2565 nigel 93 if ((options & PCRE_EXTRA) != 0)
2566     {
2567     *errorcodeptr = ERR7;
2568     goto FAILED;
2569     }
2570 nigel 77
2571 nigel 93 class_charcount -= 2; /* Undo the default count from above */
2572     c = *ptr; /* Get the final character and fall through */
2573 nigel 77 }
2574    
2575     /* Fall through if we have a single character (c >= 0). This may be
2576 nigel 93 greater than 256 in UTF-8 mode. */
2577 nigel 77
2578     } /* End of backslash handling */
2579    
2580     /* A single character may be followed by '-' to form a range. However,
2581     Perl does not permit ']' to be the end of the range. A '-' character
2582 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
2583     entirely. The code for handling \Q and \E is messy. */
2584 nigel 77
2585 nigel 93 CHECK_RANGE:
2586     while (ptr[1] == '\\' && ptr[2] == 'E')
2587 nigel 77 {
2588 nigel 93 inescq = FALSE;
2589     ptr += 2;
2590     }
2591    
2592     oldptr = ptr;
2593    
2594     if (!inescq && ptr[1] == '-')
2595     {
2596 nigel 77 int d;
2597     ptr += 2;
2598 nigel 93 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2599 nigel 77
2600 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
2601     mode. */
2602    
2603     while (*ptr == '\\' && ptr[1] == 'Q')
2604     {
2605     ptr += 2;
2606     if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2607     inescq = TRUE;
2608     break;
2609     }
2610    
2611     if (*ptr == 0 || (!inescq && *ptr == ']'))
2612     {
2613     ptr = oldptr;
2614     goto LONE_SINGLE_CHARACTER;
2615     }
2616    
2617 nigel 77 #ifdef SUPPORT_UTF8
2618     if (utf8)
2619     { /* Braces are required because the */
2620     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2621     }
2622     else
2623     #endif
2624     d = *ptr; /* Not UTF-8 mode */
2625    
2626     /* The second part of a range can be a single-character escape, but
2627     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2628     in such circumstances. */
2629    
2630 nigel 93 if (!inescq && d == '\\')
2631 nigel 77 {
2632 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2633     if (*errorcodeptr != 0) goto FAILED;
2634 nigel 77
2635 nigel 93 /* \b is backspace; \X is literal X; \R is literal R; any other
2636     special means the '-' was literal */
2637 nigel 77
2638     if (d < 0)
2639     {
2640     if (d == -ESC_b) d = '\b';
2641 nigel 93 else if (d == -ESC_X) d = 'X';
2642     else if (d == -ESC_R) d = 'R'; else
2643 nigel 77 {
2644 nigel 93 ptr = oldptr;
2645 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2646     }
2647     }
2648     }
2649    
2650 nigel 93 /* Check that the two values are in the correct order. Optimize
2651     one-character ranges */
2652 nigel 77
2653 nigel 93 if (d < c)
2654     {
2655     *errorcodeptr = ERR8;
2656     goto FAILED;
2657     }
2658    
2659 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2660    
2661     /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2662     matching, we have to use an XCLASS with extra data items. Caseless
2663     matching for characters > 127 is available only if UCP support is
2664     available. */
2665    
2666     #ifdef SUPPORT_UTF8
2667     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2668     {
2669     class_utf8 = TRUE;
2670    
2671     /* With UCP support, we can find the other case equivalents of
2672     the relevant characters. There may be several ranges. Optimize how
2673     they fit with the basic range. */
2674    
2675     #ifdef SUPPORT_UCP
2676     if ((options & PCRE_CASELESS) != 0)
2677     {
2678 nigel 93 unsigned int occ, ocd;
2679     unsigned int cc = c;
2680     unsigned int origd = d;
2681 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
2682     {
2683     if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
2684    
2685     if (occ < c && ocd >= c - 1) /* Extend the basic range */
2686     { /* if there is overlap, */
2687     c = occ; /* noting that if occ < c */
2688     continue; /* we can't have ocd > d */
2689     } /* because a subrange is */
2690     if (ocd > d && occ <= d + 1) /* always shorter than */
2691     { /* the basic range. */
2692     d = ocd;
2693     continue;
2694     }
2695    
2696     if (occ == ocd)
2697     {
2698     *class_utf8data++ = XCL_SINGLE;
2699     }
2700     else
2701     {
2702     *class_utf8data++ = XCL_RANGE;
2703     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2704     }
2705     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2706     }
2707     }
2708     #endif /* SUPPORT_UCP */
2709    
2710     /* Now record the original range, possibly modified for UCP caseless
2711     overlapping ranges. */
2712    
2713     *class_utf8data++ = XCL_RANGE;
2714     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2715     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2716    
2717     /* With UCP support, we are done. Without UCP support, there is no
2718     caseless matching for UTF-8 characters > 127; we can use the bit map
2719     for the smaller ones. */
2720    
2721     #ifdef SUPPORT_UCP
2722     continue; /* With next character in the class */
2723     #else
2724     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2725    
2726     /* Adjust upper limit and fall through to set up the map */
2727    
2728     d = 127;
2729    
2730     #endif /* SUPPORT_UCP */
2731     }
2732     #endif /* SUPPORT_UTF8 */
2733    
2734     /* We use the bit map for all cases when not in UTF-8 mode; else
2735     ranges that lie entirely within 0-127 when there is UCP support; else
2736     for partial ranges without UCP support. */
2737    
2738 nigel 93 class_charcount += d - c + 1;
2739     class_lastchar = d;
2740    
2741     /* We can save a bit of time by skipping this in the pre-compile. */
2742    
2743     if (lengthptr == NULL) for (; c <= d; c++)
2744 nigel 77 {
2745     classbits[c/8] |= (1 << (c&7));
2746     if ((options & PCRE_CASELESS) != 0)
2747     {
2748     int uc = cd->fcc[c]; /* flip case */
2749     classbits[uc/8] |= (1 << (uc&7));
2750     }
2751     }
2752    
2753     continue; /* Go get the next char in the class */
2754     }
2755    
2756     /* Handle a lone single character - we can get here for a normal
2757     non-escape char, or after \ that introduces a single character or for an
2758     apparent range that isn't. */
2759    
2760     LONE_SINGLE_CHARACTER:
2761    
2762     /* Handle a character that cannot go in the bit map */
2763    
2764     #ifdef SUPPORT_UTF8
2765     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2766     {
2767     class_utf8 = TRUE;
2768     *class_utf8data++ = XCL_SINGLE;
2769     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2770    
2771     #ifdef SUPPORT_UCP
2772     if ((options & PCRE_CASELESS) != 0)
2773     {
2774 nigel 93 unsigned int othercase;
2775     if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
2776 nigel 77 {
2777     *class_utf8data++ = XCL_SINGLE;
2778     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
2779     }
2780     }
2781     #endif /* SUPPORT_UCP */
2782    
2783     }
2784     else
2785     #endif /* SUPPORT_UTF8 */
2786    
2787     /* Handle a single-byte character */
2788     {
2789     classbits[c/8] |= (1 << (c&7));
2790     if ((options & PCRE_CASELESS) != 0)
2791     {
2792     c = cd->fcc[c]; /* flip case */
2793     classbits[c/8] |= (1 << (c&7));
2794     }
2795     class_charcount++;
2796     class_lastchar = c;
2797     }
2798     }
2799    
2800 nigel 93 /* Loop until ']' reached. This "while" is the end of the "do" above. */
2801 nigel 77
2802 nigel 93 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
2803 nigel 77
2804 nigel 93 if (c == 0) /* Missing terminating ']' */
2805     {
2806     *errorcodeptr = ERR6;
2807     goto FAILED;
2808     }
2809    
2810 nigel 77 /* If class_charcount is 1, we saw precisely one character whose value is
2811     less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2812     can optimize the negative case only if there were no characters >= 128
2813     because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2814     single-bytes only. This is an historical hangover. Maybe one day we can
2815     tidy these opcodes to handle multi-byte characters.
2816    
2817     The optimization throws away the bit map. We turn the item into a
2818     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2819     that OP_NOT does not support multibyte characters. In the positive case, it
2820     can cause firstbyte to be set. Otherwise, there can be no first char if
2821     this item is first, whatever repeat count may follow. In the case of
2822     reqbyte, save the previous value for reinstating. */
2823    
2824     #ifdef SUPPORT_UTF8
2825     if (class_charcount == 1 &&
2826     (!utf8 ||
2827     (!class_utf8 && (!negate_class || class_lastchar < 128))))
2828    
2829     #else
2830     if (class_charcount == 1)
2831     #endif
2832     {
2833     zeroreqbyte = reqbyte;
2834    
2835     /* The OP_NOT opcode works on one-byte characters only. */
2836    
2837     if (negate_class)
2838     {
2839     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2840     zerofirstbyte = firstbyte;
2841     *code++ = OP_NOT;
2842     *code++ = class_lastchar;
2843     break;
2844     }
2845    
2846     /* For a single, positive character, get the value into mcbuffer, and
2847     then we can handle this with the normal one-character code. */
2848    
2849     #ifdef SUPPORT_UTF8
2850     if (utf8 && class_lastchar > 127)
2851     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
2852     else
2853     #endif
2854     {
2855     mcbuffer[0] = class_lastchar;
2856     mclength = 1;
2857     }
2858     goto ONE_CHAR;
2859     } /* End of 1-char optimization */
2860    
2861     /* The general case - not the one-char optimization. If this is the first
2862     thing in the branch, there can be no first char setting, whatever the
2863     repeat count. Any reqbyte setting must remain unchanged after any kind of
2864     repeat. */
2865    
2866     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2867     zerofirstbyte = firstbyte;
2868     zeroreqbyte = reqbyte;
2869    
2870     /* If there are characters with values > 255, we have to compile an
2871     extended class, with its own opcode. If there are no characters < 256,
2872 nigel 93 we can omit the bitmap in the actual compiled code. */
2873 nigel 77
2874     #ifdef SUPPORT_UTF8
2875     if (class_utf8)
2876     {
2877     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2878     *code++ = OP_XCLASS;
2879     code += LINK_SIZE;
2880     *code = negate_class? XCL_NOT : 0;
2881    
2882 nigel 93 /* If the map is required, move up the extra data to make room for it;
2883     otherwise just move the code pointer to the end of the extra data. */
2884 nigel 77
2885     if (class_charcount > 0)
2886     {
2887     *code++ |= XCL_MAP;
2888 nigel 93 memmove(code + 32, code, class_utf8data - code);
2889 nigel 77 memcpy(code, classbits, 32);
2890 nigel 93 code = class_utf8data + 32;
2891 nigel 77 }
2892 nigel 93 else code = class_utf8data;
2893 nigel 77
2894     /* Now fill in the complete length of the item */
2895    
2896     PUT(previous, 1, code - previous);
2897     break; /* End of class handling */
2898     }
2899     #endif
2900    
2901     /* If there are no characters > 255, negate the 32-byte map if necessary,
2902     and copy it into the code vector. If this is the first thing in the branch,
2903     there can be no first char setting, whatever the repeat count. Any reqbyte
2904     setting must remain unchanged after any kind of repeat. */
2905    
2906     if (negate_class)
2907     {
2908     *code++ = OP_NCLASS;
2909 nigel 93 if (lengthptr == NULL) /* Save time in the pre-compile phase */
2910     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2911 nigel 77 }
2912     else
2913     {
2914     *code++ = OP_CLASS;
2915     memcpy(code, classbits, 32);
2916     }
2917     code += 32;
2918     break;
2919    
2920 nigel 93
2921     /* ===================================================================*/
2922 nigel 77 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2923     has been tested above. */
2924    
2925     case '{':
2926     if (!is_quantifier) goto NORMAL_CHAR;
2927     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
2928     if (*errorcodeptr != 0) goto FAILED;
2929     goto REPEAT;
2930    
2931     case '*':
2932     repeat_min = 0;
2933     repeat_max = -1;
2934     goto REPEAT;
2935    
2936     case '+':
2937     repeat_min = 1;
2938     repeat_max = -1;
2939     goto REPEAT;
2940    
2941     case '?':
2942     repeat_min = 0;
2943     repeat_max = 1;
2944    
2945     REPEAT:
2946     if (previous == NULL)
2947     {
2948     *errorcodeptr = ERR9;
2949     goto FAILED;
2950     }
2951    
2952     if (repeat_min == 0)
2953     {
2954     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2955     reqbyte = zeroreqbyte; /* Ditto */
2956     }
2957    
2958     /* Remember whether this is a variable length repeat */
2959    
2960     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2961    
2962     op_type = 0; /* Default single-char op codes */
2963     possessive_quantifier = FALSE; /* Default not possessive quantifier */
2964    
2965     /* Save start of previous item, in case we have to move it up to make space
2966     for an inserted OP_ONCE for the additional '+' extension. */
2967    
2968     tempcode = previous;
2969    
2970     /* If the next character is '+', we have a possessive quantifier. This
2971     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2972     If the next character is '?' this is a minimizing repeat, by default,
2973     but if PCRE_UNGREEDY is set, it works the other way round. We change the
2974     repeat type to the non-default. */
2975    
2976     if (ptr[1] == '+')
2977     {
2978     repeat_type = 0; /* Force greedy */
2979     possessive_quantifier = TRUE;
2980     ptr++;
2981     }
2982     else if (ptr[1] == '?')
2983     {
2984     repeat_type = greedy_non_default;
2985     ptr++;
2986     }
2987     else repeat_type = greedy_default;
2988    
2989     /* If previous was a character match, abolish the item and generate a
2990     repeat item instead. If a char item has a minimum of more than one, ensure
2991     that it is set in reqbyte - it might not be if a sequence such as x{3} is
2992     the first thing in a branch because the x will have gone into firstbyte
2993     instead. */
2994    
2995     if (*previous == OP_CHAR || *previous == OP_CHARNC)
2996     {
2997     /* Deal with UTF-8 characters that take up more than one byte. It's
2998     easier to write this out separately than try to macrify it. Use c to
2999     hold the length of the character in bytes, plus 0x80 to flag that it's a
3000     length rather than a small character. */
3001    
3002     #ifdef SUPPORT_UTF8
3003     if (utf8 && (code[-1] & 0x80) != 0)
3004     {
3005     uschar *lastchar = code - 1;
3006     while((*lastchar & 0xc0) == 0x80) lastchar--;
3007     c = code - lastchar; /* Length of UTF-8 character */
3008     memcpy(utf8_char, lastchar, c); /* Save the char */
3009     c |= 0x80; /* Flag c as a length */
3010     }
3011     else
3012     #endif
3013    
3014     /* Handle the case of a single byte - either with no UTF8 support, or
3015     with UTF-8 disabled, or for a UTF-8 character < 128. */
3016    
3017     {
3018     c = code[-1];
3019     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3020     }
3021    
3022 nigel 93 /* If the repetition is unlimited, it pays to see if the next thing on
3023     the line is something that cannot possibly match this character. If so,
3024     automatically possessifying this item gains some performance in the case
3025     where the match fails. */
3026    
3027     if (!possessive_quantifier &&
3028     repeat_max < 0 &&
3029     check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3030     options, cd))
3031     {
3032     repeat_type = 0; /* Force greedy */
3033     possessive_quantifier = TRUE;
3034     }
3035    
3036 nigel 77 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3037     }
3038    
3039     /* If previous was a single negated character ([^a] or similar), we use
3040     one of the special opcodes, replacing it. The code is shared with single-
3041     character repeats by setting op_type to add a suitable offset into
3042 nigel 93 repeat_type. We can also test for auto-possessification. OP_NOT is
3043     currently used only for single-byte chars. */
3044 nigel 77
3045     else if (*previous == OP_NOT)
3046     {
3047     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3048     c = previous[1];
3049 nigel 93 if (!possessive_quantifier &&
3050     repeat_max < 0 &&
3051     check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3052     {
3053     repeat_type = 0; /* Force greedy */
3054     possessive_quantifier = TRUE;
3055     }
3056 nigel 77 goto OUTPUT_SINGLE_REPEAT;
3057     }
3058    
3059     /* If previous was a character type match (\d or similar), abolish it and
3060     create a suitable repeat item. The code is shared with single-character
3061     repeats by setting op_type to add a suitable offset into repeat_type. Note
3062     that the Unicode property types will be present only when SUPPORT_UCP is
3063     defined, but we don't wrap the little bits of code here because it just
3064     makes it horribly messy. */
3065    
3066     else if (*previous < OP_EODN)
3067     {
3068     uschar *oldcode;
3069 nigel 87 int prop_type, prop_value;
3070 nigel 77 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3071     c = *previous;
3072    
3073 nigel 93 if (!possessive_quantifier &&
3074     repeat_max < 0 &&
3075     check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3076     {
3077     repeat_type = 0; /* Force greedy */
3078     possessive_quantifier = TRUE;
3079     }
3080    
3081 nigel 77 OUTPUT_SINGLE_REPEAT:
3082 nigel 87 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3083     {
3084     prop_type = previous[1];
3085     prop_value = previous[2];
3086     }
3087     else prop_type = prop_value = -1;
3088 nigel 77
3089     oldcode = code;
3090     code = previous; /* Usually overwrite previous item */
3091    
3092     /* If the maximum is zero then the minimum must also be zero; Perl allows
3093     this case, so we do too - by simply omitting the item altogether. */
3094    
3095     if (repeat_max == 0) goto END_REPEAT;
3096    
3097     /* All real repeats make it impossible to handle partial matching (maybe
3098     one day we will be able to remove this restriction). */
3099    
3100     if (repeat_max != 1) cd->nopartial = TRUE;
3101    
3102     /* Combine the op_type with the repeat_type */
3103    
3104     repeat_type += op_type;
3105    
3106     /* A minimum of zero is handled either as the special case * or ?, or as
3107     an UPTO, with the maximum given. */
3108    
3109     if (repeat_min == 0)
3110     {
3111     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3112     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3113     else
3114     {
3115     *code++ = OP_UPTO + repeat_type;
3116     PUT2INC(code, 0, repeat_max);
3117     }
3118     }
3119    
3120     /* A repeat minimum of 1 is optimized into some special cases. If the
3121 nigel 93 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3122 nigel 77 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3123     one less than the maximum. */
3124    
3125     else if (repeat_min == 1)
3126     {
3127     if (repeat_max == -1)
3128     *code++ = OP_PLUS + repeat_type;
3129     else
3130     {
3131     code = oldcode; /* leave previous item in place */
3132     if (repeat_max == 1) goto END_REPEAT;
3133     *code++ = OP_UPTO + repeat_type;
3134     PUT2INC(code, 0, repeat_max - 1);
3135     }
3136     }
3137    
3138     /* The case {n,n} is just an EXACT, while the general case {n,m} is
3139     handled as an EXACT followed by an UPTO. */
3140    
3141     else
3142     {
3143     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3144     PUT2INC(code, 0, repeat_min);
3145    
3146     /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3147     we have to insert the character for the previous code. For a repeated
3148 nigel 87 Unicode property match, there are two extra bytes that define the
3149 nigel 77 required property. In UTF-8 mode, long characters have their length in
3150     c, with the 0x80 bit as a flag. */
3151    
3152     if (repeat_max < 0)
3153     {
3154     #ifdef SUPPORT_UTF8
3155     if (utf8 && c >= 128)
3156     {
3157     memcpy(code, utf8_char, c & 7);
3158     code += c & 7;
3159     }
3160     else
3161     #endif
3162     {
3163     *code++ = c;
3164 nigel 87 if (prop_type >= 0)
3165     {
3166     *code++ = prop_type;
3167     *code++ = prop_value;
3168     }
3169 nigel 77 }
3170     *code++ = OP_STAR + repeat_type;
3171     }
3172    
3173     /* Else insert an UPTO if the max is greater than the min, again
3174 nigel 93 preceded by the character, for the previously inserted code. If the
3175     UPTO is just for 1 instance, we can use QUERY instead. */
3176 nigel 77
3177     else if (repeat_max != repeat_min)
3178     {
3179     #ifdef SUPPORT_UTF8
3180     if (utf8 && c >= 128)
3181     {
3182     memcpy(code, utf8_char, c & 7);
3183     code += c & 7;
3184     }
3185     else
3186     #endif
3187     *code++ = c;
3188 nigel 87 if (prop_type >= 0)
3189     {
3190     *code++ = prop_type;
3191     *code++ = prop_value;
3192     }
3193 nigel 77 repeat_max -= repeat_min;
3194 nigel 93
3195     if (repeat_max == 1)
3196     {
3197     *code++ = OP_QUERY + repeat_type;
3198     }
3199     else
3200     {
3201     *code++ = OP_UPTO + repeat_type;
3202     PUT2INC(code, 0, repeat_max);
3203     }
3204 nigel 77 }
3205     }
3206    
3207     /* The character or character type itself comes last in all cases. */
3208    
3209     #ifdef SUPPORT_UTF8
3210     if (utf8 && c >= 128)
3211     {
3212     memcpy(code, utf8_char, c & 7);
3213     code += c & 7;
3214     }
3215     else
3216     #endif
3217     *code++ = c;
3218    
3219 nigel 87 /* For a repeated Unicode property match, there are two extra bytes that
3220     define the required property. */
3221 nigel 77
3222     #ifdef SUPPORT_UCP
3223 nigel 87 if (prop_type >= 0)
3224     {
3225     *code++ = prop_type;
3226     *code++ = prop_value;
3227     }
3228 nigel 77 #endif
3229     }
3230    
3231     /* If previous was a character class or a back reference, we put the repeat
3232     stuff after it, but just skip the item if the repeat was {0,0}. */
3233    
3234     else if (*previous == OP_CLASS ||
3235     *previous == OP_NCLASS ||
3236     #ifdef SUPPORT_UTF8
3237     *previous == OP_XCLASS ||
3238     #endif
3239     *previous == OP_REF)
3240     {
3241     if (repeat_max == 0)
3242     {
3243     code = previous;
3244     goto END_REPEAT;
3245     }
3246    
3247     /* All real repeats make it impossible to handle partial matching (maybe
3248     one day we will be able to remove this restriction). */
3249    
3250     if (repeat_max != 1) cd->nopartial = TRUE;
3251    
3252     if (repeat_min == 0 && repeat_max == -1)
3253     *code++ = OP_CRSTAR + repeat_type;
3254     else if (repeat_min == 1 && repeat_max == -1)
3255     *code++ = OP_CRPLUS + repeat_type;
3256     else if (repeat_min == 0 && repeat_max == 1)
3257     *code++ = OP_CRQUERY + repeat_type;
3258     else
3259     {
3260     *code++ = OP_CRRANGE + repeat_type;
3261     PUT2INC(code, 0, repeat_min);
3262     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3263     PUT2INC(code, 0, repeat_max);
3264     }
3265     }
3266    
3267     /* If previous was a bracket group, we may have to replicate it in certain
3268     cases. */
3269    
3270 nigel 93 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3271     *previous == OP_ONCE || *previous == OP_COND)
3272 nigel 77 {
3273     register int i;
3274     int ketoffset = 0;
3275     int len = code - previous;
3276     uschar *bralink = NULL;
3277    
3278 nigel 93 /* Repeating a DEFINE group is pointless */
3279    
3280     if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3281     {
3282     *errorcodeptr = ERR55;
3283     goto FAILED;
3284     }
3285    
3286     /* This is a paranoid check to stop integer overflow later on */
3287    
3288     if (len > MAX_DUPLENGTH)
3289     {
3290     *errorcodeptr = ERR50;
3291     goto FAILED;
3292     }
3293    
3294 nigel 77 /* If the maximum repeat count is unlimited, find the end of the bracket
3295     by scanning through from the start, and compute the offset back to it
3296     from the current code pointer. There may be an OP_OPT setting following
3297     the final KET, so we can't find the end just by going back from the code
3298     pointer. */
3299    
3300     if (repeat_max == -1)
3301     {
3302     register uschar *ket = previous;
3303     do ket += GET(ket, 1); while (*ket != OP_KET);
3304     ketoffset = code - ket;
3305     }
3306    
3307     /* The case of a zero minimum is special because of the need to stick
3308     OP_BRAZERO in front of it, and because the group appears once in the
3309     data, whereas in other cases it appears the minimum number of times. For
3310     this reason, it is simplest to treat this case separately, as otherwise
3311     the code gets far too messy. There are several special subcases when the
3312     minimum is zero. */
3313    
3314     if (repeat_min == 0)
3315     {
3316     /* If the maximum is also zero, we just omit the group from the output
3317     altogether. */
3318    
3319     if (repeat_max == 0)
3320     {
3321     code = previous;
3322     goto END_REPEAT;
3323     }
3324    
3325     /* If the maximum is 1 or unlimited, we just have to stick in the
3326     BRAZERO and do no more at this point. However, we do need to adjust
3327     any OP_RECURSE calls inside the group that refer to the group itself or
3328 nigel 93 any internal or forward referenced group, because the offset is from
3329     the start of the whole regex. Temporarily terminate the pattern while
3330     doing this. */
3331 nigel 77
3332     if (repeat_max <= 1)
3333     {
3334     *code = OP_END;
3335 nigel 93 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3336 nigel 77 memmove(previous+1, previous, len);
3337     code++;
3338     *previous++ = OP_BRAZERO + repeat_type;
3339     }
3340    
3341     /* If the maximum is greater than 1 and limited, we have to replicate
3342     in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3343     The first one has to be handled carefully because it's the original
3344     copy, which has to be moved up. The remainder can be handled by code
3345     that is common with the non-zero minimum case below. We have to
3346     adjust the value of repeat_max, since one less copy is required. Once
3347     again, we may have to adjust any OP_RECURSE calls inside the group. */
3348    
3349     else
3350     {
3351     int offset;
3352     *code = OP_END;
3353 nigel 93 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3354 nigel 77 memmove(previous + 2 + LINK_SIZE, previous, len);
3355     code += 2 + LINK_SIZE;
3356     *previous++ = OP_BRAZERO + repeat_type;
3357     *previous++ = OP_BRA;
3358    
3359     /* We chain together the bracket offset fields that have to be
3360     filled in later when the ends of the brackets are reached. */
3361    
3362     offset = (bralink == NULL)? 0 : previous - bralink;
3363     bralink = previous;
3364     PUTINC(previous, 0, offset);
3365     }
3366    
3367     repeat_max--;
3368     }
3369    
3370     /* If the minimum is greater than zero, replicate the group as many
3371     times as necessary, and adjust the maximum to the number of subsequent
3372     copies that we need. If we set a first char from the group, and didn't
3373 nigel 93 set a required char, copy the latter from the former. If there are any
3374     forward reference subroutine calls in the group, there will be entries on
3375     the workspace list; replicate these with an appropriate increment. */
3376 nigel 77
3377     else
3378     {
3379     if (repeat_min > 1)
3380     {
3381 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3382     just adjust the length as if we had. */
3383    
3384     if (lengthptr != NULL)
3385     *lengthptr += (repeat_min - 1)*length_prevgroup;
3386    
3387     /* This is compiling for real */
3388    
3389     else
3390 nigel 77 {
3391 nigel 93 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3392     for (i = 1; i < repeat_min; i++)
3393     {
3394     uschar *hc;
3395     uschar *this_hwm = cd->hwm;
3396     memcpy(code, previous, len);
3397     for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3398     {
3399     PUT(cd->hwm, 0, GET(hc, 0) + len);
3400     cd->hwm += LINK_SIZE;
3401     }
3402     save_hwm = this_hwm;
3403     code += len;
3404     }
3405 nigel 77 }
3406     }
3407 nigel 93
3408 nigel 77 if (repeat_max > 0) repeat_max -= repeat_min;
3409     }
3410    
3411     /* This code is common to both the zero and non-zero minimum cases. If
3412     the maximum is limited, it replicates the group in a nested fashion,
3413     remembering the bracket starts on a stack. In the case of a zero minimum,
3414     the first one was set up above. In all cases the repeat_max now specifies
3415 nigel 93 the number of additional copies needed. Again, we must remember to
3416     replicate entries on the forward reference list. */
3417 nigel 77
3418     if (repeat_max >= 0)
3419     {
3420 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3421     just adjust the length as if we had. For each repetition we must add 1
3422     to the length for BRAZERO and for all but the last repetition we must
3423     add 2 + 2*LINK_SIZE to allow for the nesting that occurs. */
3424    
3425     if (lengthptr != NULL && repeat_max > 0)
3426     *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3427     2 - 2*LINK_SIZE; /* Last one doesn't nest */
3428    
3429     /* This is compiling for real */
3430    
3431     else for (i = repeat_max - 1; i >= 0; i--)
3432 nigel 77 {
3433 nigel 93 uschar *hc;
3434     uschar *this_hwm = cd->hwm;
3435    
3436 nigel 77 *code++ = OP_BRAZERO + repeat_type;
3437    
3438     /* All but the final copy start a new nesting, maintaining the
3439     chain of brackets outstanding. */
3440    
3441     if (i != 0)
3442     {
3443     int offset;
3444     *code++ = OP_BRA;
3445     offset = (bralink == NULL)? 0 : code - bralink;
3446     bralink = code;
3447     PUTINC(code, 0, offset);
3448     }
3449    
3450     memcpy(code, previous, len);
3451 nigel 93 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3452     {
3453     PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3454     cd->hwm += LINK_SIZE;
3455     }
3456     save_hwm = this_hwm;
3457 nigel 77 code += len;
3458     }
3459    
3460     /* Now chain through the pending brackets, and fill in their length
3461     fields (which are holding the chain links pro tem). */
3462    
3463     while (bralink != NULL)
3464     {
3465     int oldlinkoffset;
3466     int offset = code - bralink + 1;
3467     uschar *bra = code - offset;
3468     oldlinkoffset = GET(bra, 1);
3469     bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3470     *code++ = OP_KET;
3471     PUTINC(code, 0, offset);
3472     PUT(bra, 1, offset);
3473     }
3474     }
3475    
3476     /* If the maximum is unlimited, set a repeater in the final copy. We
3477     can't just offset backwards from the current code point, because we
3478     don't know if there's been an options resetting after the ket. The
3479 nigel 93 correct offset was computed above.
3480 nigel 77
3481 nigel 93 Then, when we are doing the actual compile phase, check to see whether
3482     this group is a non-atomic one that could match an empty string. If so,
3483     convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3484     that runtime checking can be done. [This check is also applied to
3485     atomic groups at runtime, but in a different way.] */
3486    
3487     else
3488     {
3489     uschar *ketcode = code - ketoffset;
3490     uschar *bracode = ketcode - GET(ketcode, 1);
3491     *ketcode = OP_KETRMAX + repeat_type;
3492     if (lengthptr == NULL && *bracode != OP_ONCE)
3493     {
3494     uschar *scode = bracode;
3495     do
3496     {
3497     if (could_be_empty_branch(scode, ketcode, utf8))
3498     {
3499     *bracode += OP_SBRA - OP_BRA;
3500     break;
3501     }
3502     scode += GET(scode, 1);
3503     }
3504     while (*scode == OP_ALT);
3505     }
3506     }
3507 nigel 77 }
3508    
3509     /* Else there's some kind of shambles */
3510    
3511     else
3512     {
3513     *errorcodeptr = ERR11;
3514     goto FAILED;
3515     }
3516    
3517 nigel 93 /* If the character following a repeat is '+', or if certain optimization
3518     tests above succeeded, possessive_quantifier is TRUE. For some of the
3519     simpler opcodes, there is a special alternative opcode for this. For
3520     anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3521     The '+' notation is just syntactic sugar, taken from Sun's Java package,
3522     but the special opcodes can optimize it a bit. The repeated item starts at
3523     tempcode, not at previous, which might be the first part of a string whose
3524     (former) last char we repeated.
3525 nigel 77
3526 nigel 93 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3527     an 'upto' may follow. We skip over an 'exact' item, and then test the
3528     length of what remains before proceeding. */
3529    
3530 nigel 77 if (possessive_quantifier)
3531     {
3532 nigel 93 int len;
3533     if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3534     *tempcode == OP_NOTEXACT)
3535     tempcode += _pcre_OP_lengths[*tempcode];
3536     len = code - tempcode;
3537     if (len > 0) switch (*tempcode)
3538     {
3539     case OP_STAR: *tempcode = OP_POSSTAR; break;
3540     case OP_PLUS: *tempcode = OP_POSPLUS; break;
3541     case OP_QUERY: *tempcode = OP_POSQUERY; break;
3542     case OP_UPTO: *tempcode = OP_POSUPTO; break;
3543    
3544     case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
3545     case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
3546     case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3547     case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
3548    
3549     case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
3550     case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
3551     case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3552     case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
3553    
3554     default:
3555     memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3556     code += 1 + LINK_SIZE;
3557     len += 1 + LINK_SIZE;
3558     tempcode[0] = OP_ONCE;
3559     *code++ = OP_KET;
3560     PUTINC(code, 0, len);
3561     PUT(tempcode, 1, len);
3562     break;
3563     }
3564 nigel 77 }
3565    
3566     /* In all cases we no longer have a previous item. We also set the
3567     "follows varying string" flag for subsequently encountered reqbytes if
3568     it isn't already set and we have just passed a varying length item. */
3569    
3570     END_REPEAT:
3571     previous = NULL;
3572     cd->req_varyopt |= reqvary;
3573     break;
3574    
3575    
3576 nigel 93 /* ===================================================================*/
3577     /* Start of nested parenthesized sub-expression, or comment or lookahead or
3578     lookbehind or option setting or condition or all the other extended
3579     parenthesis forms. First deal with the specials; all are introduced by ?,
3580     and the appearance of any of them means that this is not a capturing
3581     group. */
3582 nigel 77
3583     case '(':
3584     newoptions = options;
3585     skipbytes = 0;
3586 nigel 93 bravalue = OP_CBRA;
3587     save_hwm = cd->hwm;
3588 ph10 175 reset_bracount = FALSE;
3589 nigel 77
3590     if (*(++ptr) == '?')
3591     {
3592 nigel 93 int i, set, unset, namelen;
3593 nigel 77 int *optset;
3594 nigel 93 const uschar *name;
3595     uschar *slot;
3596 nigel 77
3597     switch (*(++ptr))
3598     {
3599     case '#': /* Comment; skip to ket */
3600     ptr++;
3601 nigel 93 while (*ptr != 0 && *ptr != ')') ptr++;
3602     if (*ptr == 0)
3603     {
3604     *errorcodeptr = ERR18;
3605     goto FAILED;
3606     }
3607 nigel 77 continue;
3608    
3609 nigel 93
3610     /* ------------------------------------------------------------ */
3611 ph10 175 case '|': /* Reset capture count for each branch */
3612     reset_bracount = TRUE;
3613     /* Fall through */
3614    
3615     /* ------------------------------------------------------------ */
3616 nigel 93 case ':': /* Non-capturing bracket */
3617 nigel 77 bravalue = OP_BRA;
3618     ptr++;
3619     break;
3620    
3621 nigel 93
3622     /* ------------------------------------------------------------ */
3623 nigel 77 case '(':
3624     bravalue = OP_COND; /* Conditional group */
3625    
3626 nigel 93 /* A condition can be an assertion, a number (referring to a numbered
3627     group), a name (referring to a named group), or 'R', referring to
3628     recursion. R<digits> and R&name are also permitted for recursion tests.
3629 nigel 77
3630 nigel 93 There are several syntaxes for testing a named group: (?(name)) is used
3631     by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3632    
3633     There are two unfortunate ambiguities, caused by history. (a) 'R' can
3634     be the recursive thing or the name 'R' (and similarly for 'R' followed
3635     by digits), and (b) a number could be a name that consists of digits.
3636     In both cases, we look for a name first; if not found, we try the other
3637     cases. */
3638    
3639     /* For conditions that are assertions, check the syntax, and then exit
3640     the switch. This will take control down to where bracketed groups,
3641     including assertions, are processed. */
3642    
3643     if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3644     break;
3645    
3646     /* Most other conditions use OP_CREF (a couple change to OP_RREF
3647     below), and all need to skip 3 bytes at the start of the group. */
3648    
3649     code[1+LINK_SIZE] = OP_CREF;
3650     skipbytes = 3;
3651 ph10 172 refsign = -1;
3652 nigel 93
3653     /* Check for a test for recursion in a named group. */
3654    
3655     if (ptr[1] == 'R' && ptr[2] == '&')
3656 nigel 77 {
3657 nigel 93 terminator = -1;
3658     ptr += 2;
3659     code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
3660     }
3661 nigel 91
3662 nigel 93 /* Check for a test for a named group's having been set, using the Perl
3663     syntax (?(<name>) or (?('name') */
3664 nigel 91
3665 nigel 93 else if (ptr[1] == '<')
3666     {
3667     terminator = '>';
3668     ptr++;
3669     }
3670     else if (ptr[1] == '\'')
3671     {
3672     terminator = '\'';
3673     ptr++;
3674     }
3675 ph10 172 else
3676 ph10 167 {
3677     terminator = 0;
3678 ph10 172 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
3679     }
3680 nigel 77
3681 nigel 93 /* We now expect to read a name; anything else is an error */
3682 nigel 77
3683 nigel 93 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
3684     {
3685     ptr += 1; /* To get the right offset */
3686     *errorcodeptr = ERR28;
3687     goto FAILED;
3688     }
3689    
3690     /* Read the name, but also get it as a number if it's all digits */
3691    
3692     recno = 0;
3693     name = ++ptr;
3694     while ((cd->ctypes[*ptr] & ctype_word) != 0)
3695     {
3696     if (recno >= 0)
3697     recno = ((digitab[*ptr] & ctype_digit) != 0)?
3698     recno * 10 + *ptr - '0' : -1;
3699 nigel 91 ptr++;
3700 nigel 93 }
3701     namelen = ptr - name;
3702 nigel 91
3703 nigel 93 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
3704     {
3705     ptr--; /* Error offset */
3706     *errorcodeptr = ERR26;
3707     goto FAILED;
3708     }
3709 nigel 91
3710 nigel 93 /* Do no further checking in the pre-compile phase. */
3711 nigel 91
3712 nigel 93 if (lengthptr != NULL) break;
3713 nigel 91
3714 nigel 93 /* In the real compile we do the work of looking for the actual
3715 ph10 167 reference. If the string started with "+" or "-" we require the rest to
3716     be digits, in which case recno will be set. */
3717 ph10 172
3718 ph10 167 if (refsign > 0)
3719     {
3720     if (recno <= 0)
3721     {
3722     *errorcodeptr = ERR58;
3723     goto FAILED;
3724 ph10 172 }
3725 ph10 167 if (refsign == '-')
3726     {
3727 ph10 172 recno = cd->bracount - recno + 1;
3728 ph10 167 if (recno <= 0)
3729     {
3730     *errorcodeptr = ERR15;
3731     goto FAILED;
3732 ph10 172 }
3733 ph10 167 }
3734 ph10 172 else recno += cd->bracount;
3735 ph10 167 PUT2(code, 2+LINK_SIZE, recno);
3736     break;
3737 ph10 172 }
3738 nigel 91
3739 ph10 167 /* Otherwise (did not start with "+" or "-"), start by looking for the
3740     name. */
3741 ph10 172
3742 nigel 93 slot = cd->name_table;
3743     for (i = 0; i < cd->names_found; i++)
3744     {
3745     if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3746     slot += cd->name_entry_size;
3747     }
3748 nigel 91
3749 nigel 93 /* Found a previous named subpattern */
3750 nigel 91
3751 nigel 93 if (i < cd->names_found)
3752     {
3753     recno = GET2(slot, 0);
3754     PUT2(code, 2+LINK_SIZE, recno);
3755     }
3756 nigel 91
3757 nigel 93 /* Search the pattern for a forward reference */
3758 nigel 91
3759 nigel 93 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
3760     (options & PCRE_EXTENDED) != 0)) > 0)
3761     {
3762     PUT2(code, 2+LINK_SIZE, i);
3763     }
3764 nigel 91
3765 nigel 93 /* If terminator == 0 it means that the name followed directly after
3766     the opening parenthesis [e.g. (?(abc)...] and in this case there are
3767     some further alternatives to try. For the cases where terminator != 0
3768     [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
3769     now checked all the possibilities, so give an error. */
3770 nigel 91
3771 nigel 93 else if (terminator != 0)
3772     {
3773     *errorcodeptr = ERR15;
3774     goto FAILED;
3775     }
3776    
3777     /* Check for (?(R) for recursion. Allow digits after R to specify a
3778     specific group number. */
3779    
3780     else if (*name == 'R')
3781     {
3782     recno = 0;
3783     for (i = 1; i < namelen; i++)
3784 nigel 91 {
3785 nigel 93 if ((digitab[name[i]] & ctype_digit) == 0)
3786     {
3787     *errorcodeptr = ERR15;
3788     goto FAILED;
3789     }
3790     recno = recno * 10 + name[i] - '0';
3791 nigel 77 }
3792 nigel 93 if (recno == 0) recno = RREF_ANY;
3793     code[1+LINK_SIZE] = OP_RREF; /* Change test type */
3794     PUT2(code, 2+LINK_SIZE, recno);
3795 nigel 77 }
3796 nigel 91
3797 nigel 93 /* Similarly, check for the (?(DEFINE) "condition", which is always
3798     false. */
3799 nigel 91
3800 nigel 93 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
3801     {
3802     code[1+LINK_SIZE] = OP_DEF;
3803     skipbytes = 1;
3804     }
3805    
3806     /* Check for the "name" actually being a subpattern number. */
3807    
3808     else if (recno > 0)
3809     {
3810     PUT2(code, 2+LINK_SIZE, recno);
3811     }
3812    
3813     /* Either an unidentified subpattern, or a reference to (?(0) */
3814    
3815     else
3816     {
3817     *errorcodeptr = (recno == 0)? ERR35: ERR15;
3818     goto FAILED;
3819     }
3820 nigel 77 break;
3821    
3822 nigel 93
3823     /* ------------------------------------------------------------ */
3824 nigel 77 case '=': /* Positive lookahead */
3825     bravalue = OP_ASSERT;
3826     ptr++;
3827     break;
3828    
3829 nigel 93
3830     /* ------------------------------------------------------------ */
3831 nigel 77 case '!': /* Negative lookahead */
3832     bravalue = OP_ASSERT_NOT;
3833     ptr++;
3834     break;
3835    
3836 nigel 93
3837     /* ------------------------------------------------------------ */
3838     case '<': /* Lookbehind or named define */
3839     switch (ptr[1])
3840 nigel 77 {
3841     case '=': /* Positive lookbehind */
3842     bravalue = OP_ASSERTBACK;
3843 nigel 93 ptr += 2;
3844 nigel 77 break;
3845    
3846     case '!': /* Negative lookbehind */
3847     bravalue = OP_ASSERTBACK_NOT;
3848 nigel 93 ptr += 2;
3849 nigel 77 break;
3850 nigel 93
3851     default: /* Could be name define, else bad */
3852     if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
3853     ptr++; /* Correct offset for error */
3854     *errorcodeptr = ERR24;
3855     goto FAILED;
3856 nigel 77 }
3857     break;
3858    
3859 nigel 93
3860     /* ------------------------------------------------------------ */
3861 nigel 77 case '>': /* One-time brackets */
3862     bravalue = OP_ONCE;
3863     ptr++;
3864     break;
3865    
3866 nigel 93
3867     /* ------------------------------------------------------------ */
3868 nigel 77 case 'C': /* Callout - may be followed by digits; */
3869     previous_callout = code; /* Save for later completion */
3870     after_manual_callout = 1; /* Skip one item before completing */
3871 nigel 93 *code++ = OP_CALLOUT;
3872     {
3873 nigel 77 int n = 0;
3874     while ((digitab[*(++ptr)] & ctype_digit) != 0)
3875     n = n * 10 + *ptr - '0';
3876 nigel 93 if (*ptr != ')')
3877     {
3878     *errorcodeptr = ERR39;
3879     goto FAILED;
3880     }
3881 nigel 77 if (n > 255)
3882     {
3883     *errorcodeptr = ERR38;
3884     goto FAILED;
3885     }
3886     *code++ = n;
3887     PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
3888     PUT(code, LINK_SIZE, 0); /* Default length */
3889     code += 2 * LINK_SIZE;
3890     }
3891     previous = NULL;
3892     continue;
3893    
3894 nigel 93
3895     /* ------------------------------------------------------------ */
3896     case 'P': /* Python-style named subpattern handling */
3897     if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
3898 nigel 77 {
3899 nigel 93 is_recurse = *ptr == '>';
3900     terminator = ')';
3901     goto NAMED_REF_OR_RECURSE;
3902     }
3903     else if (*ptr != '<') /* Test for Python-style definition */
3904     {
3905     *errorcodeptr = ERR41;
3906     goto FAILED;
3907     }
3908     /* Fall through to handle (?P< as (?< is handled */
3909 nigel 77
3910    
3911 nigel 93 /* ------------------------------------------------------------ */
3912     DEFINE_NAME: /* Come here from (?< handling */
3913     case '\'':
3914     {
3915     terminator = (*ptr == '<')? '>' : '\'';
3916     name = ++ptr;
3917    
3918     while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3919     namelen = ptr - name;
3920    
3921     /* In the pre-compile phase, just do a syntax check. */
3922    
3923     if (lengthptr != NULL)
3924 nigel 77 {
3925 nigel 93 if (*ptr != terminator)
3926 nigel 77 {
3927 nigel 93 *errorcodeptr = ERR42;
3928     goto FAILED;
3929     }
3930     if (cd->names_found >= MAX_NAME_COUNT)
3931     {
3932     *errorcodeptr = ERR49;
3933     goto FAILED;
3934     }
3935     if (namelen + 3 > cd->name_entry_size)
3936     {
3937     cd->name_entry_size = namelen + 3;
3938     if (namelen > MAX_NAME_SIZE)
3939 nigel 77 {
3940 nigel 93 *errorcodeptr = ERR48;
3941     goto FAILED;
3942     }
3943     }
3944     }
3945    
3946     /* In the real compile, create the entry in the table */
3947    
3948     else
3949     {
3950     slot = cd->name_table;
3951     for (i = 0; i < cd->names_found; i++)
3952     {
3953     int crc = memcmp(name, slot+2, namelen);
3954     if (crc == 0)
3955     {
3956     if (slot[2+namelen] == 0)
3957 nigel 91 {
3958 nigel 93 if ((options & PCRE_DUPNAMES) == 0)
3959     {
3960     *errorcodeptr = ERR43;
3961     goto FAILED;
3962     }
3963 nigel 91 }
3964 nigel 93 else crc = -1; /* Current name is substring */
3965 nigel 77 }
3966 nigel 93 if (crc < 0)
3967     {
3968     memmove(slot + cd->name_entry_size, slot,
3969     (cd->names_found - i) * cd->name_entry_size);
3970     break;
3971     }
3972     slot += cd->name_entry_size;
3973 nigel 77 }
3974 nigel 93
3975     PUT2(slot, 0, cd->bracount + 1);
3976     memcpy(slot + 2, name, namelen);
3977     slot[2+namelen] = 0;
3978 nigel 77 }
3979     }
3980    
3981 nigel 93 /* In both cases, count the number of names we've encountered. */
3982    
3983     ptr++; /* Move past > or ' */
3984     cd->names_found++;
3985     goto NUMBERED_GROUP;
3986    
3987    
3988     /* ------------------------------------------------------------ */
3989     case '&': /* Perl recursion/subroutine syntax */
3990     terminator = ')';
3991     is_recurse = TRUE;
3992     /* Fall through */
3993    
3994     /* We come here from the Python syntax above that handles both
3995     references (?P=name) and recursion (?P>name), as well as falling
3996     through from the Perl recursion syntax (?&name). */
3997    
3998     NAMED_REF_OR_RECURSE:
3999     name = ++ptr;
4000     while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4001     namelen = ptr - name;
4002    
4003     /* In the pre-compile phase, do a syntax check and set a dummy
4004     reference number. */
4005    
4006     if (lengthptr != NULL)
4007 nigel 77 {
4008 nigel 93 if (*ptr != terminator)
4009     {
4010     *errorcodeptr = ERR42;
4011     goto FAILED;
4012     }
4013     if (namelen > MAX_NAME_SIZE)
4014     {
4015     *errorcodeptr = ERR48;
4016     goto FAILED;
4017     }
4018     recno = 0;
4019     }
4020 nigel 77
4021 nigel 93 /* In the real compile, seek the name in the table */
4022 nigel 77
4023 nigel 93 else
4024     {
4025     slot = cd->name_table;
4026 nigel 77 for (i = 0; i < cd->names_found; i++)
4027     {
4028     if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4029     slot += cd->name_entry_size;
4030     }
4031 nigel 91
4032     if (i < cd->names_found) /* Back reference */
4033 nigel 77 {
4034 nigel 91 recno = GET2(slot, 0);
4035     }
4036     else if ((recno = /* Forward back reference */
4037 nigel 93 find_parens(ptr, cd->bracount, name, namelen,
4038     (options & PCRE_EXTENDED) != 0)) <= 0)
4039 nigel 91 {
4040 nigel 77 *errorcodeptr = ERR15;
4041     goto FAILED;
4042     }
4043 nigel 93 }
4044 nigel 77
4045 nigel 93 /* In both phases, we can now go to the code that handles numerical
4046     recursion or backreferences. */
4047 nigel 77
4048 nigel 93 if (is_recurse) goto HANDLE_RECURSION;
4049     else goto HANDLE_REFERENCE;
4050 nigel 77
4051    
4052 nigel 93 /* ------------------------------------------------------------ */
4053     case 'R': /* Recursion */
4054 nigel 77 ptr++; /* Same as (?0) */
4055     /* Fall through */
4056    
4057    
4058 nigel 93 /* ------------------------------------------------------------ */
4059 ph10 166 case '-': case '+':
4060 nigel 93 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4061     case '5': case '6': case '7': case '8': case '9': /* subroutine */
4062 nigel 77 {
4063     const uschar *called;
4064 ph10 166
4065 ph10 167 if ((refsign = *ptr) == '+') ptr++;
4066 ph10 172 else if (refsign == '-')
4067 ph10 166 {
4068     if ((digitab[ptr[1]] & ctype_digit) == 0)
4069     goto OTHER_CHAR_AFTER_QUERY;
4070 ph10 172 ptr++;
4071     }
4072    
4073 nigel 77 recno = 0;
4074     while((digitab[*ptr] & ctype_digit) != 0)
4075     recno = recno * 10 + *ptr++ - '0';
4076 ph10 166
4077 nigel 93 if (*ptr != ')')
4078     {
4079     *errorcodeptr = ERR29;
4080     goto FAILED;
4081     }
4082 ph10 172
4083 ph10 167 if (refsign == '-')
4084 ph10 166 {
4085     if (recno == 0)
4086     {
4087     *errorcodeptr = ERR58;
4088     goto FAILED;
4089 ph10 172 }
4090     recno = cd->bracount - recno + 1;
4091 ph10 166 if (recno <= 0)
4092     {
4093     *errorcodeptr = ERR15;
4094     goto FAILED;
4095 ph10 172 }
4096 ph10 166 }
4097 ph10 167 else if (refsign == '+')
4098 ph10 166 {
4099     if (recno == 0)
4100     {
4101     *errorcodeptr = ERR58;
4102     goto FAILED;
4103 ph10 172 }
4104     recno += cd->bracount;
4105     }
4106 nigel 77
4107     /* Come here from code above that handles a named recursion */
4108    
4109     HANDLE_RECURSION:
4110    
4111     previous = code;
4112 nigel 93 called = cd->start_code;
4113 nigel 77
4114 nigel 93 /* When we are actually compiling, find the bracket that is being
4115     referenced. Temporarily end the regex in case it doesn't exist before
4116     this point. If we end up with a forward reference, first check that
4117     the bracket does occur later so we can give the error (and position)
4118     now. Then remember this forward reference in the workspace so it can
4119     be filled in at the end. */
4120 nigel 77
4121 nigel 93 if (lengthptr == NULL)
4122 nigel 77 {
4123 nigel 93 *code = OP_END;
4124     if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4125 nigel 77
4126 nigel 93 /* Forward reference */
4127 nigel 77
4128 nigel 93 if (called == NULL)
4129     {
4130     if (find_parens(ptr, cd->bracount, NULL, recno,
4131     (options & PCRE_EXTENDED) != 0) < 0)
4132     {
4133     *errorcodeptr = ERR15;
4134     goto FAILED;
4135     }
4136     called = cd->start_code + recno;
4137     PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4138     }
4139    
4140     /* If not a forward reference, and the subpattern is still open,
4141     this is a recursive call. We check to see if this is a left
4142     recursion that could loop for ever, and diagnose that case. */
4143    
4144     else if (GET(called, 1) == 0 &&
4145     could_be_empty(called, code, bcptr, utf8))
4146     {
4147     *errorcodeptr = ERR40;
4148     goto FAILED;
4149     }
4150 nigel 77 }
4151    
4152 nigel 87 /* Insert the recursion/subroutine item, automatically wrapped inside