/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 176 - (hide annotations) (download)
Mon Jun 11 13:48:37 2007 UTC (7 years, 4 months ago) by ph10
File MIME type: text/plain
File size: 176819 byte(s)
Inserted some (unsigned int) casts to kill compiler warnings.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 117 Copyright (c) 1997-2007 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 nigel 93 #define NLBLOCK cd /* Block containing newline information */
46     #define PSSTART start_pattern /* Field containing processed string start */
47     #define PSEND end_pattern /* Field containing processed string end */
48    
49    
#include "pcre_internal.h"

#include <limits.h>     /* INT_MAX, for overflow checks when reading numbers */
51    
52    
53 nigel 85 /* When DEBUG is defined, we need the pcre_printint() function, which is also
54     used by pcretest. DEBUG is not defined when building a production library. */
55    
56     #ifdef DEBUG
57     #include "pcre_printint.src"
58     #endif
59    
60    
61 nigel 77 /*************************************************
62     * Code parameters and static tables *
63     *************************************************/
64    
65 nigel 93 /* This value specifies the size of stack workspace that is used during the
66     first pre-compile phase that determines how much memory is required. The regex
67     is partly compiled into this space, but the compiled parts are discarded as
68     soon as they can be, so that hopefully there will never be an overrun. The code
69     does, however, check for an overrun. The largest amount I've seen used is 218,
70     so this number is very generous.
71 nigel 77
72 nigel 93 The same workspace is used during the second, actual compile phase for
73     remembering forward references to groups so that they can be filled in at the
74     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
75     is 4 there is plenty of room. */
76 nigel 77
77 nigel 93 #define COMPILE_WORK_SIZE (4096)
78 nigel 77
79 nigel 93
80 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
81     are simple data values; negative values are for special things like \d and so
82     on. Zero means further processing is needed (for things like \x), or the escape
83     is invalid. */
84    
85 ph10 97 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
86 nigel 77 static const short int escapes[] = {
87     0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
88     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
89     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
90 ph10 168 0, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
91 nigel 93 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
92 nigel 77 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
93     '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
94 nigel 93 0, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
95 nigel 77 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
96     0, 0, -ESC_z /* x - z */
97     };
98    
99 ph10 97 #else /* This is the "abnormal" table for EBCDIC systems */
100 nigel 77 static const short int escapes[] = {
101     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
102     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
103     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
104     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
105     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
106     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
107     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
108     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
109     /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
110 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
111 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
112     /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
113     /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
114     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
115     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
116     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
117     /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
118     /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
119 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
120 nigel 77 /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
121     /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
122     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
123     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
124     };
125     #endif
126    
127    
/* Names of the POSIX character classes. The lengths of these names are held
in a separate table that is terminated by a zero-length entry. The first three
names must be alpha, lower, upper, because this ordering is assumed when
handling case independence. */

static const char *const posix_names[] = {
  "alpha",
  "lower",
  "upper",
  "alnum",
  "ascii",
  "blank",
  "cntrl",
  "digit",
  "graph",
  "print",
  "punct",
  "space",
  "word",
  "xdigit"
};
136    
137     static const uschar posix_name_lengths[] = {
138     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
139    
140 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
141     base map, with an optional addition or removal of another map. Then, for some
142     classes, there is some additional tweaking: for [:blank:] the vertical space
143     characters are removed, and for [:alpha:] and [:alnum:] the underscore
144     character is removed. The triples in the table consist of the base map offset,
145     second map offset or -1 if no second map, and a non-negative value for map
146     addition or a negative value for map subtraction (if there are two maps). The
147     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
148     remove vertical space characters, 2 => remove underscore. */
149 nigel 77
150     static const int posix_class_maps[] = {
151 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
152     cbit_lower, -1, 0, /* lower */
153     cbit_upper, -1, 0, /* upper */
154     cbit_word, -1, 2, /* alnum - word without underscore */
155     cbit_print, cbit_cntrl, 0, /* ascii */
156     cbit_space, -1, 1, /* blank - a GNU extension */
157     cbit_cntrl, -1, 0, /* cntrl */
158     cbit_digit, -1, 0, /* digit */
159     cbit_graph, -1, 0, /* graph */
160     cbit_print, -1, 0, /* print */
161     cbit_punct, -1, 0, /* punct */
162     cbit_space, -1, 0, /* space */
163     cbit_word, -1, 0, /* word - a Perl extension */
164     cbit_xdigit,-1, 0 /* xdigit */
165 nigel 77 };
166    
167    
/* Two-level stringification, so that a macro argument is expanded before it is
turned into a string literal. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* The texts of compile-time error messages, indexed by error number. The
strings are "char *" because they are passed to the outside world. Do not ever
re-use any error number, because they are documented. Always add a new error
instead. Messages marked DEAD below are no longer used. The array itself is
const as well as the strings, so the table cannot be modified at run time. */

static const char * const error_texts[] = {
  /* 0 */
  "no error",
  "\\ at end of pattern",
  "\\c at end of pattern",
  "unrecognized character follows \\",
  "numbers out of order in {} quantifier",
  /* 5 */
  "number too big in {} quantifier",
  "missing terminating ] for character class",
  "invalid escape sequence in character class",
  "range out of order in character class",
  "nothing to repeat",
  /* 10 */
  "operand of unlimited repeat could match the empty string",  /** DEAD **/
  "internal error: unexpected repeat",
  "unrecognized character after (?",
  "POSIX named classes are supported only within a class",
  "missing )",
  /* 15 */
  "reference to non-existent subpattern",
  "erroffset passed as NULL",
  "unknown option bit(s) set",
  "missing ) after comment",
  "parentheses nested too deeply",  /** DEAD **/
  /* 20 */
  "regular expression too large",
  "failed to get memory",
  "unmatched parentheses",
  "internal error: code overflow",
  "unrecognized character after (?<",
  /* 25 */
  "lookbehind assertion is not fixed length",
  "malformed number or name after (?(",
  "conditional group contains more than two branches",
  "assertion expected after (?(",
  "(?R or (?[+-]digits must be followed by )",
  /* 30 */
  "unknown POSIX class name",
  "POSIX collating elements are not supported",
  "this version of PCRE is not compiled with PCRE_UTF8 support",
  "spare error",  /** DEAD **/
  "character value in \\x{...} sequence is too large",
  /* 35 */
  "invalid condition (?(0)",
  "\\C not allowed in lookbehind assertion",
  "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
  "number after (?C is > 255",
  "closing ) for (?C expected",
  /* 40 */
  "recursive call could loop indefinitely",
  "unrecognized character after (?P",
  "syntax error in subpattern name (missing terminator)",
  "two named subpatterns have the same name",
  "invalid UTF-8 string",
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled",
  "malformed \\P or \\p sequence",
  "unknown property name after \\P or \\p",
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
  /* 50 */
  "repeated subpattern is too long",
  "octal value is greater than \\377 (not in UTF-8 mode)",
  "internal error: overran compiling workspace",
  "internal error: previously-checked referenced subpattern not found",
  "DEFINE group contains more than one branch",
  /* 55 */
  "repeating a DEFINE group is not allowed",
  "inconsistent NEWLINE options",
  "\\g is not followed by a braced name or an optionally braced non-zero number",
  "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number"
};
248    
249    
/* Private table identifying decimal and hexadecimal digits, used while
compiling patterns. The tables in chartables depend on the locale and may mark
arbitrary characters as digits, but the compiling code must recognize only
0-9 as decimal digits and 0-9, a-f, A-F as hexadecimal digits, so it has its
own fixed table. A table lookup costs 256 bytes but was measured to be faster
than character value tests.

The bit values are the same as those used in chartables:

  0x04  decimal digit
  0x08  hexadecimal digit

so that ctype_digit and ctype_xdigit can be used with this table. Each row
below holds 16 entries; the comment gives the code of the first entry. */

#ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */

static const unsigned char digitab[] =
  {
  /* 00 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 10 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 20 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 30 */ 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 0-9 */
  /* 40 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* A-F */
  /* 50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 60 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* a-f */
  /* 70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 80 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 90 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* A0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* B0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* C0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* D0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* E0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* F0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
  };

#else           /* This is the "abnormal" case, for EBCDIC systems */

static const unsigned char digitab[] =
  {
  /* 00 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 10 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 20 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 30 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 40 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 60 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 80 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* a-f */
  /* 90 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* A0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* B0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* C0 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* A-F */
  /* D0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* E0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* F0 */ 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00  /* 0-9 */
  };

/* Partial duplicate of the character type table, for EBCDIC systems only. */

static const unsigned char ebcdic_chartab[] =
  {
  /* 00 */ 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00,
  /* 10 */ 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 20 */ 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 30 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 40 */ 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80,
  /* 50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00,
  /* 60 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80,
  /* 70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 80 */ 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 90 */ 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* A0 */ 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* B0 */ 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00,
  /* C0 */ 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* D0 */ 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* E0 */ 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* F0 */ 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00
  };

#endif
372    
373    
374     /* Definition to allow mutual recursion */
375    
376     static BOOL
377 ph10 175 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
378     int *, int *, branch_chain *, compile_data *, int *);
379 nigel 77
380    
381    
382     /*************************************************
383     * Handle escapes *
384     *************************************************/
385    
386     /* This function is called when a \ has been encountered. It either returns a
387     positive value for a simple escape such as \n, or a negative value which
388 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
389     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
390     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
391     ptr is pointing at the \. On exit, it is on the final character of the escape
392     sequence.
393 nigel 77
394     Arguments:
395     ptrptr points to the pattern position pointer
396     errorcodeptr points to the errorcode variable
397     bracount number of previous extracting brackets
398     options the options bits
399     isclass TRUE if inside a character class
400    
401     Returns: zero or positive => a data character
402     negative => a special escape sequence
403     on error, errorptr is set
404     */
405    
406     static int
407     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
408     int options, BOOL isclass)
409     {
410 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
411     const uschar *ptr = *ptrptr + 1;
412 nigel 77 int c, i;
413    
414 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
415     ptr--; /* Set pointer back to the last byte */
416    
417 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
418    
419     if (c == 0) *errorcodeptr = ERR1;
420    
421     /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
422     a table. A non-zero result is something that can be returned immediately.
423     Otherwise further processing may be required. */
424    
425 ph10 97 #ifndef EBCDIC /* ASCII coding */
426 nigel 77 else if (c < '0' || c > 'z') {} /* Not alphameric */
427     else if ((i = escapes[c - '0']) != 0) c = i;
428    
429 ph10 97 #else /* EBCDIC coding */
430 nigel 77 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
431     else if ((i = escapes[c - 0x48]) != 0) c = i;
432     #endif
433    
434     /* Escapes that need further processing, or are illegal. */
435    
436     else
437     {
438     const uschar *oldptr;
439 nigel 93 BOOL braced, negated;
440    
441 nigel 77 switch (c)
442     {
443     /* A number of Perl escapes are not handled by PCRE. We give an explicit
444     error. */
445    
446     case 'l':
447     case 'L':
448     case 'N':
449     case 'u':
450     case 'U':
451     *errorcodeptr = ERR37;
452     break;
453    
454 nigel 93 /* \g must be followed by a number, either plain or braced. If positive, it
455     is an absolute backreference. If negative, it is a relative backreference.
456 ph10 172 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
457     reference to a named group. This is part of Perl's movement towards a
458     unified syntax for back references. As this is synonymous with \k{name}, we
459 ph10 171 fudge it up by pretending it really was \k. */
460 nigel 93
461     case 'g':
462     if (ptr[1] == '{')
463     {
464 ph10 171 const uschar *p;
465     for (p = ptr+2; *p != 0 && *p != '}'; p++)
466     if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
467 ph10 172 if (*p != 0 && *p != '}')
468 ph10 171 {
469     c = -ESC_k;
470     break;
471 ph10 172 }
472 nigel 93 braced = TRUE;
473     ptr++;
474     }
475     else braced = FALSE;
476    
477     if (ptr[1] == '-')
478     {
479     negated = TRUE;
480     ptr++;
481     }
482     else negated = FALSE;
483    
484     c = 0;
485     while ((digitab[ptr[1]] & ctype_digit) != 0)
486     c = c * 10 + *(++ptr) - '0';
487    
488     if (c == 0 || (braced && *(++ptr) != '}'))
489     {
490     *errorcodeptr = ERR57;
491     return 0;
492     }
493    
494     if (negated)
495     {
496     if (c > bracount)
497     {
498     *errorcodeptr = ERR15;
499     return 0;
500     }
501     c = bracount - (c - 1);
502     }
503    
504     c = -(ESC_REF + c);
505     break;
506    
507 nigel 77 /* The handling of escape sequences consisting of a string of digits
508     starting with one that is not zero is not straightforward. By experiment,
509     the way Perl works seems to be as follows:
510    
511     Outside a character class, the digits are read as a decimal number. If the
512     number is less than 10, or if there are that many previous extracting
513     left brackets, then it is a back reference. Otherwise, up to three octal
514     digits are read to form an escaped byte. Thus \123 is likely to be octal
515     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
516     value is greater than 377, the least significant 8 bits are taken. Inside a
517     character class, \ followed by a digit is always an octal number. */
518    
519     case '1': case '2': case '3': case '4': case '5':
520     case '6': case '7': case '8': case '9':
521    
522     if (!isclass)
523     {
524     oldptr = ptr;
525     c -= '0';
526     while ((digitab[ptr[1]] & ctype_digit) != 0)
527     c = c * 10 + *(++ptr) - '0';
528     if (c < 10 || c <= bracount)
529     {
530     c = -(ESC_REF + c);
531     break;
532     }
533     ptr = oldptr; /* Put the pointer back and fall through */
534     }
535    
536     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
537     generates a binary zero byte and treats the digit as a following literal.
538     Thus we have to pull back the pointer by one. */
539    
540     if ((c = *ptr) >= '8')
541     {
542     ptr--;
543     c = 0;
544     break;
545     }
546    
547     /* \0 always starts an octal number, but we may drop through to here with a
548 nigel 91 larger first octal digit. The original code used just to take the least
549     significant 8 bits of octal numbers (I think this is what early Perls used
550     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
551     than 3 octal digits. */
552 nigel 77
553     case '0':
554     c -= '0';
555     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
556     c = c * 8 + *(++ptr) - '0';
557 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
558 nigel 77 break;
559    
560 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
561     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
562     treated as a data character. */
563 nigel 77
564     case 'x':
565 nigel 87 if (ptr[1] == '{')
566 nigel 77 {
567     const uschar *pt = ptr + 2;
568 nigel 87 int count = 0;
569    
570 nigel 77 c = 0;
571     while ((digitab[*pt] & ctype_xdigit) != 0)
572     {
573 nigel 87 register int cc = *pt++;
574     if (c == 0 && cc == '0') continue; /* Leading zeroes */
575 nigel 77 count++;
576 nigel 87
577 ph10 97 #ifndef EBCDIC /* ASCII coding */
578 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
579 nigel 87 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
580 ph10 97 #else /* EBCDIC coding */
581 nigel 77 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
582 nigel 87 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
583 nigel 77 #endif
584     }
585 nigel 87
586 nigel 77 if (*pt == '}')
587     {
588 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
589 nigel 77 ptr = pt;
590     break;
591     }
592 nigel 87
593 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
594     recognize this construct; fall through to the normal \x handling. */
595     }
596    
597 nigel 87 /* Read just a single-byte hex-defined char */
598 nigel 77
599     c = 0;
600     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
601     {
602     int cc; /* Some compilers don't like ++ */
603     cc = *(++ptr); /* in initializers */
604 ph10 97 #ifndef EBCDIC /* ASCII coding */
605 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
606     c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
607 ph10 97 #else /* EBCDIC coding */
608 nigel 77 if (cc <= 'z') cc += 64; /* Convert to upper case */
609     c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
610     #endif
611     }
612     break;
613    
614 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
615     This coding is ASCII-specific, but then the whole concept of \cx is
616     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
617 nigel 77
618     case 'c':
619     c = *(++ptr);
620     if (c == 0)
621     {
622     *errorcodeptr = ERR2;
623     return 0;
624     }
625    
626 ph10 97 #ifndef EBCDIC /* ASCII coding */
627 nigel 77 if (c >= 'a' && c <= 'z') c -= 32;
628     c ^= 0x40;
629 ph10 97 #else /* EBCDIC coding */
630 nigel 77 if (c >= 'a' && c <= 'z') c += 64;
631     c ^= 0xC0;
632     #endif
633     break;
634    
635     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
636     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
637     for Perl compatibility, it is a literal. This code looks a bit odd, but
638     there used to be some cases other than the default, and there may be again
639     in future, so I haven't "optimized" it. */
640    
641     default:
642     if ((options & PCRE_EXTRA) != 0) switch(c)
643     {
644     default:
645     *errorcodeptr = ERR3;
646     break;
647     }
648     break;
649     }
650     }
651    
652     *ptrptr = ptr;
653     return c;
654     }
655    
656    
657    
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* This function is called after \P or \p has been encountered, provided that
PCRE is compiled with support for Unicode properties. On entry, ptrptr is
pointing at the P or p. On exit, it is pointing at the final character of the
escape sequence. The property name is either a single character or a name in
braces, which may be preceded by ^ for negation; the name must fit in a local
buffer of 32 bytes, including its terminating zero.

Arguments:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE;
                   it is always set, even when an error is returned
  dptr           points to an int that is set to the detailed property value
                   when a property is recognized
  errorcodeptr   points to the error code variable; set to ERR46 for a
                   malformed sequence or ERR47 for an unknown property name

Returns:         the type field of the matching _pcre_utt entry,
                   or -1 for an invalid type
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
int c, i, bot, top;
const uschar *ptr = *ptrptr;
char name[32];

/* Give the caller a defined value on every path, including errors. */

*negptr = FALSE;

c = *(++ptr);
if (c == 0) goto ERROR_RETURN;

/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
negation. */

if (c == '{')
  {
  if (ptr[1] == '^')
    {
    *negptr = TRUE;
    ptr++;
    }

  /* Copy the name, leaving room for the terminating zero. If the closing
  brace has not been seen when the buffer is full, the name is too long. */

  for (i = 0; i < (int)sizeof(name) - 1; i++)
    {
    c = *(++ptr);
    if (c == 0) goto ERROR_RETURN;
    if (c == '}') break;
    name[i] = (char)c;
    }
  if (c != '}') goto ERROR_RETURN;
  name[i] = 0;
  }

/* Otherwise there is just one following character */

else
  {
  name[0] = (char)c;
  name[1] = 0;
  }

*ptrptr = ptr;

/* Search for a recognized property name using binary chop */

bot = 0;
top = _pcre_utt_size;

while (bot < top)
  {
  i = (bot + top) >> 1;
  c = strcmp(name, _pcre_utt[i].name);
  if (c == 0)
    {
    *dptr = _pcre_utt[i].value;
    return _pcre_utt[i].type;
    }
  if (c > 0) bot = i + 1; else top = i;
  }

/* The name is well-formed but unknown; *ptrptr has already been updated. */

*errorcodeptr = ERR47;
return -1;

ERROR_RETURN:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
747    
748    
749    
750    
751     /*************************************************
752     * Check for counted repeat *
753     *************************************************/
754    
755     /* This function is called when a '{' is encountered in a place where it might
756     start a quantifier. It looks ahead to see if it really is a quantifier or not.
757     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
758     where the ddds are digits.
759    
760     Arguments:
761     p pointer to the first char after '{'
762    
763     Returns: TRUE or FALSE
764     */
765    
766     static BOOL
767     is_counted_repeat(const uschar *p)
768     {
769     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
770     while ((digitab[*p] & ctype_digit) != 0) p++;
771     if (*p == '}') return TRUE;
772    
773     if (*p++ != ',') return FALSE;
774     if (*p == '}') return TRUE;
775    
776     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
777     while ((digitab[*p] & ctype_digit) != 0) p++;
778    
779     return (*p == '}');
780     }
781    
782    
783    
784     /*************************************************
785     * Read repeat counts *
786     *************************************************/
787    
788     /* Read an item of the form {n,m} and return the values. This is called only
789     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
790     so the syntax is guaranteed to be correct, but we need to check the values.
791    
792     Arguments:
793     p pointer to first char after '{'
794     minp pointer to int for min
795     maxp pointer to int for max
796     returned as -1 if no max
797     errorcodeptr points to error code variable
798    
799     Returns: pointer to '}' on success;
800     current ptr on error, with errorcodeptr set non-zero
801     */
802    
803     static const uschar *
804     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
805     {
806     int min = 0;
807     int max = -1;
808    
809 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
810     an integer overflow. */
811    
812 nigel 77 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
813 nigel 81 if (min < 0 || min > 65535)
814     {
815     *errorcodeptr = ERR5;
816     return p;
817     }
818 nigel 77
819 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
820     Also, max must not be less than min. */
821    
822 nigel 77 if (*p == '}') max = min; else
823     {
824     if (*(++p) != '}')
825     {
826     max = 0;
827     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
828 nigel 81 if (max < 0 || max > 65535)
829     {
830     *errorcodeptr = ERR5;
831     return p;
832     }
833 nigel 77 if (max < min)
834     {
835     *errorcodeptr = ERR4;
836     return p;
837     }
838     }
839     }
840    
841 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
842     '}'. */
843 nigel 77
844 nigel 81 *minp = min;
845     *maxp = max;
846 nigel 77 return p;
847     }
848    
849    
850    
851     /*************************************************
852 nigel 93 * Find forward referenced subpattern *
853 nigel 91 *************************************************/
854    
855 nigel 93 /* This function scans along a pattern's text looking for capturing
856     subpatterns, and counting them. If it finds a named pattern that matches the
857     name it is given, it returns its number. Alternatively, if the name is NULL, it
858     returns when it reaches a given numbered subpattern. This is used for forward
859     references to subpatterns. We know that if (?P< is encountered, the name will
860     be terminated by '>' because that is checked in the first pass.
861 nigel 91
862     Arguments:
863 nigel 93 ptr current position in the pattern
864     count current count of capturing parens so far encountered
865     name name to seek, or NULL if seeking a numbered subpattern
866     lorn name length, or subpattern number if name is NULL
867     xmode TRUE if we are in /x mode
868 nigel 91
869     Returns: the number of the named subpattern, or -1 if not found
870     */
871    
872     static int
873 nigel 93 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
874     BOOL xmode)
875 nigel 91 {
876     const uschar *thisname;
877 nigel 93
878 nigel 91 for (; *ptr != 0; ptr++)
879     {
880 nigel 93 int term;
881    
882     /* Skip over backslashed characters and also entire \Q...\E */
883    
884     if (*ptr == '\\')
885     {
886     if (*(++ptr) == 0) return -1;
887     if (*ptr == 'Q') for (;;)
888     {
889     while (*(++ptr) != 0 && *ptr != '\\');
890     if (*ptr == 0) return -1;
891     if (*(++ptr) == 'E') break;
892     }
893     continue;
894     }
895    
896     /* Skip over character classes */
897    
898     if (*ptr == '[')
899     {
900     while (*(++ptr) != ']')
901     {
902     if (*ptr == '\\')
903     {
904     if (*(++ptr) == 0) return -1;
905     if (*ptr == 'Q') for (;;)
906     {
907     while (*(++ptr) != 0 && *ptr != '\\');
908     if (*ptr == 0) return -1;
909     if (*(++ptr) == 'E') break;
910     }
911     continue;
912     }
913     }
914     continue;
915     }
916    
917     /* Skip comments in /x mode */
918    
919     if (xmode && *ptr == '#')
920     {
921     while (*(++ptr) != 0 && *ptr != '\n');
922     if (*ptr == 0) return -1;
923     continue;
924     }
925    
926     /* An opening parens must now be a real metacharacter */
927    
928 nigel 91 if (*ptr != '(') continue;
929 nigel 93 if (ptr[1] != '?')
930     {
931     count++;
932     if (name == NULL && count == lorn) return count;
933     continue;
934     }
935    
936     ptr += 2;
937     if (*ptr == 'P') ptr++; /* Allow optional P */
938    
939     /* We have to disambiguate (?<! and (?<= from (?<name> */
940    
941     if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
942     *ptr != '\'')
943     continue;
944    
945 nigel 91 count++;
946 nigel 93
947     if (name == NULL && count == lorn) return count;
948     term = *ptr++;
949     if (term == '<') term = '>';
950 nigel 91 thisname = ptr;
951 nigel 93 while (*ptr != term) ptr++;
952     if (name != NULL && lorn == ptr - thisname &&
953     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
954 nigel 91 return count;
955     }
956 nigel 93
957 nigel 91 return -1;
958     }
959    
960    
961    
962     /*************************************************
963 nigel 77 * Find first significant op code *
964     *************************************************/
965    
966     /* This is called by several functions that scan a compiled expression looking
967     for a fixed first character, or an anchoring op code etc. It skips over things
968     that do not influence this. For some calls, a change of option is important.
969     For some calls, it makes sense to skip negative forward and all backward
970     assertions, and also the \b assertion; for others it does not.
971    
972     Arguments:
973     code pointer to the start of the group
974     options pointer to external options
975     optbit the option bit whose changing is significant, or
976     zero if none are
977     skipassert TRUE if certain assertions are to be skipped
978    
979     Returns: pointer to the first significant opcode
980     */
981    
982     static const uschar*
983     first_significant_code(const uschar *code, int *options, int optbit,
984     BOOL skipassert)
985     {
986     for (;;)
987     {
988     switch ((int)*code)
989     {
990     case OP_OPT:
991     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
992     *options = (int)code[1];
993     code += 2;
994     break;
995    
996     case OP_ASSERT_NOT:
997     case OP_ASSERTBACK:
998     case OP_ASSERTBACK_NOT:
999     if (!skipassert) return code;
1000     do code += GET(code, 1); while (*code == OP_ALT);
1001     code += _pcre_OP_lengths[*code];
1002     break;
1003    
1004     case OP_WORD_BOUNDARY:
1005     case OP_NOT_WORD_BOUNDARY:
1006     if (!skipassert) return code;
1007     /* Fall through */
1008    
1009     case OP_CALLOUT:
1010     case OP_CREF:
1011 nigel 93 case OP_RREF:
1012     case OP_DEF:
1013 nigel 77 code += _pcre_OP_lengths[*code];
1014     break;
1015    
1016     default:
1017     return code;
1018     }
1019     }
1020     /* Control never reaches here */
1021     }
1022    
1023    
1024    
1025    
1026     /*************************************************
1027     * Find the fixed length of a pattern *
1028     *************************************************/
1029    
1030     /* Scan a pattern and compute the fixed length of subject that will match it,
1031     if the length is fixed. This is needed for dealing with backward assertions.
1032     In UTF8 mode, the result is in characters rather than bytes.
1033    
1034     Arguments:
1035     code points to the start of the pattern (the bracket)
1036     options the compiling options
1037    
1038     Returns: the fixed length, or -1 if there is no fixed length,
1039     or -2 if \C was encountered
1040     */
1041    
1042     static int
1043     find_fixedlength(uschar *code, int options)
1044     {
1045     int length = -1;
1046    
1047     register int branchlength = 0;
1048     register uschar *cc = code + 1 + LINK_SIZE;
1049    
1050     /* Scan along the opcodes for this branch. If we get to the end of the
1051     branch, check the length against that of the other branches. */
1052    
1053     for (;;)
1054     {
1055     int d;
1056     register int op = *cc;
1057    
1058     switch (op)
1059     {
1060 nigel 93 case OP_CBRA:
1061 nigel 77 case OP_BRA:
1062     case OP_ONCE:
1063     case OP_COND:
1064 nigel 93 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1065 nigel 77 if (d < 0) return d;
1066     branchlength += d;
1067     do cc += GET(cc, 1); while (*cc == OP_ALT);
1068     cc += 1 + LINK_SIZE;
1069     break;
1070    
1071     /* Reached end of a branch; if it's a ket it is the end of a nested
1072     call. If it's ALT it is an alternation in a nested call. If it is
1073     END it's the end of the outer call. All can be handled by the same code. */
1074    
1075     case OP_ALT:
1076     case OP_KET:
1077     case OP_KETRMAX:
1078     case OP_KETRMIN:
1079     case OP_END:
1080     if (length < 0) length = branchlength;
1081     else if (length != branchlength) return -1;
1082     if (*cc != OP_ALT) return length;
1083     cc += 1 + LINK_SIZE;
1084     branchlength = 0;
1085     break;
1086    
1087     /* Skip over assertive subpatterns */
1088    
1089     case OP_ASSERT:
1090     case OP_ASSERT_NOT:
1091     case OP_ASSERTBACK:
1092     case OP_ASSERTBACK_NOT:
1093     do cc += GET(cc, 1); while (*cc == OP_ALT);
1094     /* Fall through */
1095    
1096     /* Skip over things that don't match chars */
1097    
1098     case OP_REVERSE:
1099     case OP_CREF:
1100 nigel 93 case OP_RREF:
1101     case OP_DEF:
1102 nigel 77 case OP_OPT:
1103     case OP_CALLOUT:
1104     case OP_SOD:
1105     case OP_SOM:
1106     case OP_EOD:
1107     case OP_EODN:
1108     case OP_CIRC:
1109     case OP_DOLL:
1110     case OP_NOT_WORD_BOUNDARY:
1111     case OP_WORD_BOUNDARY:
1112     cc += _pcre_OP_lengths[*cc];
1113     break;
1114    
1115     /* Handle literal characters */
1116    
1117     case OP_CHAR:
1118     case OP_CHARNC:
1119 nigel 91 case OP_NOT:
1120 nigel 77 branchlength++;
1121     cc += 2;
1122     #ifdef SUPPORT_UTF8
1123     if ((options & PCRE_UTF8) != 0)
1124     {
1125     while ((*cc & 0xc0) == 0x80) cc++;
1126     }
1127     #endif
1128     break;
1129    
1130     /* Handle exact repetitions. The count is already in characters, but we
1131     need to skip over a multibyte character in UTF8 mode. */
1132    
1133     case OP_EXACT:
1134     branchlength += GET2(cc,1);
1135     cc += 4;
1136     #ifdef SUPPORT_UTF8
1137     if ((options & PCRE_UTF8) != 0)
1138     {
1139     while((*cc & 0x80) == 0x80) cc++;
1140     }
1141     #endif
1142     break;
1143    
1144     case OP_TYPEEXACT:
1145     branchlength += GET2(cc,1);
1146     cc += 4;
1147     break;
1148    
1149     /* Handle single-char matchers */
1150    
1151     case OP_PROP:
1152     case OP_NOTPROP:
1153 nigel 87 cc += 2;
1154 nigel 77 /* Fall through */
1155    
1156     case OP_NOT_DIGIT:
1157     case OP_DIGIT:
1158     case OP_NOT_WHITESPACE:
1159     case OP_WHITESPACE:
1160     case OP_NOT_WORDCHAR:
1161     case OP_WORDCHAR:
1162     case OP_ANY:
1163     branchlength++;
1164     cc++;
1165     break;
1166    
1167     /* The single-byte matcher isn't allowed */
1168    
1169     case OP_ANYBYTE:
1170     return -2;
1171    
1172     /* Check a class for variable quantification */
1173    
1174     #ifdef SUPPORT_UTF8
1175     case OP_XCLASS:
1176     cc += GET(cc, 1) - 33;
1177     /* Fall through */
1178     #endif
1179    
1180     case OP_CLASS:
1181     case OP_NCLASS:
1182     cc += 33;
1183    
1184     switch (*cc)
1185     {
1186     case OP_CRSTAR:
1187     case OP_CRMINSTAR:
1188     case OP_CRQUERY:
1189     case OP_CRMINQUERY:
1190     return -1;
1191    
1192     case OP_CRRANGE:
1193     case OP_CRMINRANGE:
1194     if (GET2(cc,1) != GET2(cc,3)) return -1;
1195     branchlength += GET2(cc,1);
1196     cc += 5;
1197     break;
1198    
1199     default:
1200     branchlength++;
1201     }
1202     break;
1203    
1204     /* Anything else is variable length */
1205    
1206     default:
1207     return -1;
1208     }
1209     }
1210     /* Control never gets here */
1211     }
1212    
1213    
1214    
1215    
1216     /*************************************************
1217     * Scan compiled regex for numbered bracket *
1218     *************************************************/
1219    
1220     /* This little function scans through a compiled pattern until it finds a
1221     capturing bracket with the given number.
1222    
1223     Arguments:
1224     code points to start of expression
1225     utf8 TRUE in UTF-8 mode
1226     number the required bracket number
1227    
1228     Returns: pointer to the opcode for the bracket, or NULL if not found
1229     */
1230    
1231     static const uschar *
1232     find_bracket(const uschar *code, BOOL utf8, int number)
1233     {
1234     for (;;)
1235     {
1236     register int c = *code;
1237     if (c == OP_END) return NULL;
1238 nigel 91
1239     /* XCLASS is used for classes that cannot be represented just by a bit
1240     map. This includes negated single high-valued characters. The length in
1241     the table is zero; the actual length is stored in the compiled code. */
1242    
1243     if (c == OP_XCLASS) code += GET(code, 1);
1244    
1245 nigel 93 /* Handle capturing bracket */
1246 nigel 91
1247 nigel 93 else if (c == OP_CBRA)
1248 nigel 77 {
1249 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1250 nigel 77 if (n == number) return (uschar *)code;
1251 nigel 93 code += _pcre_OP_lengths[c];
1252 nigel 77 }
1253 nigel 91
1254 nigel 93 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1255     a multi-byte character. The length in the table is a minimum, so we have to
1256     arrange to skip the extra bytes. */
1257 nigel 91
1258 nigel 77 else
1259     {
1260     code += _pcre_OP_lengths[c];
1261 ph10 107 #ifdef SUPPORT_UTF8
1262 nigel 77 if (utf8) switch(c)
1263     {
1264     case OP_CHAR:
1265     case OP_CHARNC:
1266     case OP_EXACT:
1267     case OP_UPTO:
1268     case OP_MINUPTO:
1269 nigel 93 case OP_POSUPTO:
1270 nigel 77 case OP_STAR:
1271     case OP_MINSTAR:
1272 nigel 93 case OP_POSSTAR:
1273 nigel 77 case OP_PLUS:
1274     case OP_MINPLUS:
1275 nigel 93 case OP_POSPLUS:
1276 nigel 77 case OP_QUERY:
1277     case OP_MINQUERY:
1278 nigel 93 case OP_POSQUERY:
1279     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1280 nigel 77 break;
1281     }
1282 ph10 111 #endif
1283 nigel 77 }
1284     }
1285     }
1286    
1287    
1288    
1289     /*************************************************
1290     * Scan compiled regex for recursion reference *
1291     *************************************************/
1292    
1293     /* This little function scans through a compiled pattern until it finds an
1294     instance of OP_RECURSE.
1295    
1296     Arguments:
1297     code points to start of expression
1298     utf8 TRUE in UTF-8 mode
1299    
1300     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1301     */
1302    
1303     static const uschar *
1304     find_recurse(const uschar *code, BOOL utf8)
1305     {
1306     for (;;)
1307     {
1308     register int c = *code;
1309     if (c == OP_END) return NULL;
1310 nigel 91 if (c == OP_RECURSE) return code;
1311    
1312     /* XCLASS is used for classes that cannot be represented just by a bit
1313     map. This includes negated single high-valued characters. The length in
1314     the table is zero; the actual length is stored in the compiled code. */
1315    
1316     if (c == OP_XCLASS) code += GET(code, 1);
1317    
1318     /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1319     that are followed by a character may be followed by a multi-byte character.
1320 nigel 93 The length in the table is a minimum, so we have to arrange to skip the extra
1321     bytes. */
1322 nigel 91
1323 nigel 77 else
1324     {
1325     code += _pcre_OP_lengths[c];
1326 ph10 107 #ifdef SUPPORT_UTF8
1327 nigel 77 if (utf8) switch(c)
1328     {
1329     case OP_CHAR:
1330     case OP_CHARNC:
1331     case OP_EXACT:
1332     case OP_UPTO:
1333     case OP_MINUPTO:
1334 nigel 93 case OP_POSUPTO:
1335 nigel 77 case OP_STAR:
1336     case OP_MINSTAR:
1337 nigel 93 case OP_POSSTAR:
1338 nigel 77 case OP_PLUS:
1339     case OP_MINPLUS:
1340 nigel 93 case OP_POSPLUS:
1341 nigel 77 case OP_QUERY:
1342     case OP_MINQUERY:
1343 nigel 93 case OP_POSQUERY:
1344     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1345 nigel 77 break;
1346     }
1347 ph10 111 #endif
1348 nigel 77 }
1349     }
1350     }
1351    
1352    
1353    
1354     /*************************************************
1355     * Scan compiled branch for non-emptiness *
1356     *************************************************/
1357    
1358     /* This function scans through a branch of a compiled pattern to see whether it
1359 nigel 93 can match the empty string or not. It is called from could_be_empty()
1360     below and from compile_branch() when checking for an unlimited repeat of a
1361     group that can match nothing. Note that first_significant_code() skips over
1362     assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1363     struck an inner bracket whose current branch will already have been scanned.
1364 nigel 77
1365     Arguments:
1366     code points to start of search
1367     endcode points to where to stop
1368     utf8 TRUE if in UTF8 mode
1369    
1370     Returns: TRUE if what is matched could be empty
1371     */
1372    
1373     static BOOL
1374     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1375     {
1376     register int c;
1377 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1378 nigel 77 code < endcode;
1379     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1380     {
1381     const uschar *ccode;
1382    
1383     c = *code;
1384 ph10 172
1385 ph10 170 /* Groups with zero repeats can of course be empty; skip them. */
1386 nigel 77
1387 ph10 170 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1388     {
1389 ph10 172 code += _pcre_OP_lengths[c];
1390 ph10 170 do code += GET(code, 1); while (*code == OP_ALT);
1391     c = *code;
1392     continue;
1393     }
1394    
1395     /* For other groups, scan the branches. */
1396 ph10 172
1397 nigel 93 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1398 nigel 77 {
1399     BOOL empty_branch;
1400     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1401    
1402     /* Scan a closed bracket */
1403    
1404     empty_branch = FALSE;
1405     do
1406     {
1407     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1408     empty_branch = TRUE;
1409     code += GET(code, 1);
1410     }
1411     while (*code == OP_ALT);
1412     if (!empty_branch) return FALSE; /* All branches are non-empty */
1413 ph10 172 c = *code;
1414 nigel 93 continue;
1415 nigel 77 }
1416    
1417 nigel 93 /* Handle the other opcodes */
1418    
1419     switch (c)
1420 nigel 77 {
1421     /* Check for quantifiers after a class */
1422    
1423     #ifdef SUPPORT_UTF8
1424     case OP_XCLASS:
1425     ccode = code + GET(code, 1);
1426     goto CHECK_CLASS_REPEAT;
1427     #endif
1428    
1429     case OP_CLASS:
1430     case OP_NCLASS:
1431     ccode = code + 33;
1432    
1433     #ifdef SUPPORT_UTF8
1434     CHECK_CLASS_REPEAT:
1435     #endif
1436    
1437     switch (*ccode)
1438     {
1439     case OP_CRSTAR: /* These could be empty; continue */
1440     case OP_CRMINSTAR:
1441     case OP_CRQUERY:
1442     case OP_CRMINQUERY:
1443     break;
1444    
1445     default: /* Non-repeat => class must match */
1446     case OP_CRPLUS: /* These repeats aren't empty */
1447     case OP_CRMINPLUS:
1448     return FALSE;
1449    
1450     case OP_CRRANGE:
1451     case OP_CRMINRANGE:
1452     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1453     break;
1454     }
1455     break;
1456    
1457     /* Opcodes that must match a character */
1458    
1459     case OP_PROP:
1460     case OP_NOTPROP:
1461     case OP_EXTUNI:
1462     case OP_NOT_DIGIT:
1463     case OP_DIGIT:
1464     case OP_NOT_WHITESPACE:
1465     case OP_WHITESPACE:
1466     case OP_NOT_WORDCHAR:
1467     case OP_WORDCHAR:
1468     case OP_ANY:
1469     case OP_ANYBYTE:
1470     case OP_CHAR:
1471     case OP_CHARNC:
1472     case OP_NOT:
1473     case OP_PLUS:
1474     case OP_MINPLUS:
1475 nigel 93 case OP_POSPLUS:
1476 nigel 77 case OP_EXACT:
1477     case OP_NOTPLUS:
1478     case OP_NOTMINPLUS:
1479 nigel 93 case OP_NOTPOSPLUS:
1480 nigel 77 case OP_NOTEXACT:
1481     case OP_TYPEPLUS:
1482     case OP_TYPEMINPLUS:
1483 nigel 93 case OP_TYPEPOSPLUS:
1484 nigel 77 case OP_TYPEEXACT:
1485     return FALSE;
1486    
1487     /* End of branch */
1488    
1489     case OP_KET:
1490     case OP_KETRMAX:
1491     case OP_KETRMIN:
1492     case OP_ALT:
1493     return TRUE;
1494    
1495 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1496     MINUPTO, and POSUPTO may be followed by a multibyte character */
1497 nigel 77
1498     #ifdef SUPPORT_UTF8
1499     case OP_STAR:
1500     case OP_MINSTAR:
1501 nigel 93 case OP_POSSTAR:
1502 nigel 77 case OP_QUERY:
1503     case OP_MINQUERY:
1504 nigel 93 case OP_POSQUERY:
1505 nigel 77 case OP_UPTO:
1506     case OP_MINUPTO:
1507 nigel 93 case OP_POSUPTO:
1508 nigel 77 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1509     break;
1510     #endif
1511     }
1512     }
1513    
1514     return TRUE;
1515     }
1516    
1517    
1518    
1519     /*************************************************
1520     * Scan compiled regex for non-emptiness *
1521     *************************************************/
1522    
1523     /* This function is called to check for left recursive calls. We want to check
1524     the current branch of the current pattern to see if it could match the empty
1525     string. If it could, we must look outwards for branches at other levels,
1526     stopping when we pass beyond the bracket which is the subject of the recursion.
1527    
1528     Arguments:
1529     code points to start of the recursion
1530     endcode points to where to stop (current RECURSE item)
1531     bcptr points to the chain of current (unclosed) branch starts
1532     utf8 TRUE if in UTF-8 mode
1533    
1534     Returns: TRUE if what is matched could be empty
1535     */
1536    
1537     static BOOL
1538     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1539     BOOL utf8)
1540     {
1541     while (bcptr != NULL && bcptr->current >= code)
1542     {
1543     if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1544     bcptr = bcptr->outer;
1545     }
1546     return TRUE;
1547     }
1548    
1549    
1550    
1551     /*************************************************
1552     * Check for POSIX class syntax *
1553     *************************************************/
1554    
1555     /* This function is called when the sequence "[:" or "[." or "[=" is
1556     encountered in a character class. It checks whether this is followed by an
1557     optional ^ and then a sequence of letters, terminated by a matching ":]" or
1558     ".]" or "=]".
1559    
1560     Argument:
1561     ptr pointer to the initial [
1562     endptr where to return the end pointer
1563     cd pointer to compile data
1564    
1565     Returns: TRUE or FALSE
1566     */
1567    
1568     static BOOL
1569     check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1570     {
1571     int terminator; /* Don't combine these lines; the Solaris cc */
1572     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1573     if (*(++ptr) == '^') ptr++;
1574     while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1575     if (*ptr == terminator && ptr[1] == ']')
1576     {
1577     *endptr = ptr;
1578     return TRUE;
1579     }
1580     return FALSE;
1581     }
1582    
1583    
1584    
1585    
1586     /*************************************************
1587     * Check POSIX class name *
1588     *************************************************/
1589    
1590     /* This function is called to check the name given in a POSIX-style class entry
1591     such as [:alnum:].
1592    
1593     Arguments:
1594     ptr points to the first letter
1595     len the length of the name
1596    
1597     Returns: a value representing the name, or -1 if unknown
1598     */
1599    
1600     static int
1601     check_posix_name(const uschar *ptr, int len)
1602     {
1603     register int yield = 0;
1604     while (posix_name_lengths[yield] != 0)
1605     {
1606     if (len == posix_name_lengths[yield] &&
1607     strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1608     yield++;
1609     }
1610     return -1;
1611     }
1612    
1613    
1614     /*************************************************
1615     * Adjust OP_RECURSE items in repeated group *
1616     *************************************************/
1617    
1618     /* OP_RECURSE items contain an offset from the start of the regex to the group
1619     that is referenced. This means that groups can be replicated for fixed
1620     repetition simply by copying (because the recursion is allowed to refer to
1621     earlier groups that are outside the current group). However, when a group is
1622     optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1623     it, after it has been compiled. This means that any OP_RECURSE items within it
1624     that refer to the group itself or any contained groups have to have their
1625 nigel 93 offsets adjusted. That one of the jobs of this function. Before it is called,
1626     the partially compiled regex must be temporarily terminated with OP_END.
1627 nigel 77
1628 nigel 93 This function has been extended with the possibility of forward references for
1629     recursions and subroutine calls. It must also check the list of such references
1630     for the group we are dealing with. If it finds that one of the recursions in
1631     the current group is on this list, it adjusts the offset in the list, not the
1632     value in the reference (which is a group number).
1633    
1634 nigel 77 Arguments:
1635     group points to the start of the group
1636     adjust the amount by which the group is to be moved
1637     utf8 TRUE in UTF-8 mode
1638     cd contains pointers to tables etc.
1639 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
1640 nigel 77
1641     Returns: nothing
1642     */
1643    
1644     static void
1645 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1646     uschar *save_hwm)
1647 nigel 77 {
1648     uschar *ptr = group;
1649     while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1650     {
1651 nigel 93 int offset;
1652     uschar *hc;
1653    
1654     /* See if this recursion is on the forward reference list. If so, adjust the
1655     reference. */
1656    
1657     for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1658     {
1659     offset = GET(hc, 0);
1660     if (cd->start_code + offset == ptr + 1)
1661     {
1662     PUT(hc, 0, offset + adjust);
1663     break;
1664     }
1665     }
1666    
1667     /* Otherwise, adjust the recursion offset if it's after the start of this
1668     group. */
1669    
1670     if (hc >= cd->hwm)
1671     {
1672     offset = GET(ptr, 1);
1673     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1674     }
1675    
1676 nigel 77 ptr += 1 + LINK_SIZE;
1677     }
1678     }
1679    
1680    
1681    
1682     /*************************************************
1683     * Insert an automatic callout point *
1684     *************************************************/
1685    
1686     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1687     callout points before each pattern item.
1688    
1689     Arguments:
1690     code current code pointer
1691     ptr current pattern pointer
1692     cd pointers to tables etc
1693    
1694     Returns: new code pointer
1695     */
1696    
1697     static uschar *
1698     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1699     {
1700     *code++ = OP_CALLOUT;
1701     *code++ = 255;
1702     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1703     PUT(code, LINK_SIZE, 0); /* Default length */
1704     return code + 2*LINK_SIZE;
1705     }
1706    
1707    
1708    
1709     /*************************************************
1710     * Complete a callout item *
1711     *************************************************/
1712    
1713     /* A callout item contains the length of the next item in the pattern, which
1714     we can't fill in till after we have reached the relevant point. This is used
1715     for both automatic and manual callouts.
1716    
1717     Arguments:
1718     previous_callout points to previous callout item
1719     ptr current pattern pointer
1720     cd pointers to tables etc
1721    
1722     Returns: nothing
1723     */
1724    
1725     static void
1726     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1727     {
1728     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1729     PUT(previous_callout, 2 + LINK_SIZE, length);
1730     }
1731    
1732    
1733    
1734     #ifdef SUPPORT_UCP
1735     /*************************************************
1736     * Get othercase range *
1737     *************************************************/
1738    
1739     /* This function is passed the start and end of a class range, in UTF-8 mode
1740     with UCP support. It searches up the characters, looking for internal ranges of
1741     characters in the "other" case. Each call returns the next one, updating the
1742     start address.
1743    
1744     Arguments:
1745     cptr points to starting character value; updated
1746     d end value
1747     ocptr where to put start of othercase range
1748     odptr where to put end of othercase range
1749    
1750     Yield: TRUE when range returned; FALSE when no more
1751     */
1752    
1753     static BOOL
1754 nigel 93 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1755     unsigned int *odptr)
1756 nigel 77 {
1757 nigel 93 unsigned int c, othercase, next;
1758 nigel 77
1759     for (c = *cptr; c <= d; c++)
1760 nigel 93 { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1761 nigel 77
1762     if (c > d) return FALSE;
1763    
1764     *ocptr = othercase;
1765     next = othercase + 1;
1766    
1767     for (++c; c <= d; c++)
1768     {
1769 nigel 87 if (_pcre_ucp_othercase(c) != next) break;
1770 nigel 77 next++;
1771     }
1772    
1773     *odptr = next - 1;
1774     *cptr = c;
1775    
1776     return TRUE;
1777     }
1778     #endif /* SUPPORT_UCP */
1779    
1780    
1781 nigel 93
1782 nigel 77 /*************************************************
1783 nigel 93 * Check if auto-possessifying is possible *
1784     *************************************************/
1785    
1786     /* This function is called for unlimited repeats of certain items, to see
1787     whether the next thing could possibly match the repeated item. If not, it makes
1788     sense to automatically possessify the repeated item.
1789    
1790     Arguments:
1791     op_code the repeated op code
1792     this data for this item, depends on the opcode
1793     utf8 TRUE in UTF-8 mode
1794     utf8_char used for utf8 character bytes, NULL if not relevant
1795     ptr next character in pattern
1796     options options bits
1797     cd contains pointers to tables etc.
1798    
1799     Returns: TRUE if possessifying is wanted
1800     */
1801    
1802     static BOOL
1803     check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1804     const uschar *ptr, int options, compile_data *cd)
1805     {
1806     int next;
1807    
1808     /* Skip whitespace and comments in extended mode */
1809    
1810     if ((options & PCRE_EXTENDED) != 0)
1811     {
1812     for (;;)
1813     {
1814     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1815     if (*ptr == '#')
1816     {
1817     while (*(++ptr) != 0)
1818     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1819     }
1820     else break;
1821     }
1822     }
1823    
1824     /* If the next item is one that we can handle, get its value. A non-negative
1825     value is a character, a negative value is an escape value. */
1826    
1827     if (*ptr == '\\')
1828     {
1829     int temperrorcode = 0;
1830     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1831     if (temperrorcode != 0) return FALSE;
1832     ptr++; /* Point after the escape sequence */
1833     }
1834    
1835     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1836     {
1837     #ifdef SUPPORT_UTF8
1838     if (utf8) { GETCHARINC(next, ptr); } else
1839     #endif
1840     next = *ptr++;
1841     }
1842    
1843     else return FALSE;
1844    
1845     /* Skip whitespace and comments in extended mode */
1846    
1847     if ((options & PCRE_EXTENDED) != 0)
1848     {
1849     for (;;)
1850     {
1851     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1852     if (*ptr == '#')
1853     {
1854     while (*(++ptr) != 0)
1855     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1856     }
1857     else break;
1858     }
1859     }
1860    
1861     /* If the next thing is itself optional, we have to give up. */
1862    
1863     if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1864     return FALSE;
1865    
1866     /* Now compare the next item with the previous opcode. If the previous is a
1867     positive single character match, "item" either contains the character or, if
1868     "item" is greater than 127 in utf8 mode, the character's bytes are in
1869     utf8_char. */
1870    
1871    
1872     /* Handle cases when the next item is a character. */
1873    
1874     if (next >= 0) switch(op_code)
1875     {
1876     case OP_CHAR:
1877     #ifdef SUPPORT_UTF8
1878     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1879     #endif
1880     return item != next;
1881    
1882     /* For CHARNC (caseless character) we must check the other case. If we have
1883     Unicode property support, we can use it to test the other case of
1884     high-valued characters. */
1885    
1886     case OP_CHARNC:
1887     #ifdef SUPPORT_UTF8
1888     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1889     #endif
1890     if (item == next) return FALSE;
1891     #ifdef SUPPORT_UTF8
1892     if (utf8)
1893     {
1894     unsigned int othercase;
1895     if (next < 128) othercase = cd->fcc[next]; else
1896     #ifdef SUPPORT_UCP
1897     othercase = _pcre_ucp_othercase((unsigned int)next);
1898     #else
1899     othercase = NOTACHAR;
1900     #endif
1901     return (unsigned int)item != othercase;
1902     }
1903     else
1904     #endif /* SUPPORT_UTF8 */
1905     return (item != cd->fcc[next]); /* Non-UTF-8 mode */
1906    
1907     /* For OP_NOT, "item" must be a single-byte character. */
1908    
1909     case OP_NOT:
1910     if (next < 0) return FALSE; /* Not a character */
1911     if (item == next) return TRUE;
1912     if ((options & PCRE_CASELESS) == 0) return FALSE;
1913     #ifdef SUPPORT_UTF8
1914     if (utf8)
1915     {
1916     unsigned int othercase;
1917     if (next < 128) othercase = cd->fcc[next]; else
1918     #ifdef SUPPORT_UCP
1919     othercase = _pcre_ucp_othercase(next);
1920     #else
1921     othercase = NOTACHAR;
1922     #endif
1923     return (unsigned int)item == othercase;
1924     }
1925     else
1926     #endif /* SUPPORT_UTF8 */
1927     return (item == cd->fcc[next]); /* Non-UTF-8 mode */
1928    
1929     case OP_DIGIT:
1930     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1931    
1932     case OP_NOT_DIGIT:
1933     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1934    
1935     case OP_WHITESPACE:
1936     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1937    
1938     case OP_NOT_WHITESPACE:
1939     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1940    
1941     case OP_WORDCHAR:
1942     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1943    
1944     case OP_NOT_WORDCHAR:
1945     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1946    
1947     default:
1948     return FALSE;
1949     }
1950    
1951    
1952     /* Handle the case when the next item is \d, \s, etc. */
1953    
1954     switch(op_code)
1955     {
1956     case OP_CHAR:
1957     case OP_CHARNC:
1958     #ifdef SUPPORT_UTF8
1959     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1960     #endif
1961     switch(-next)
1962     {
1963     case ESC_d:
1964     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
1965    
1966     case ESC_D:
1967     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
1968    
1969     case ESC_s:
1970     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
1971    
1972     case ESC_S:
1973     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
1974    
1975     case ESC_w:
1976     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
1977    
1978     case ESC_W:
1979     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
1980    
1981     default:
1982     return FALSE;
1983     }
1984    
1985     case OP_DIGIT:
1986     return next == -ESC_D || next == -ESC_s || next == -ESC_W;
1987    
1988     case OP_NOT_DIGIT:
1989     return next == -ESC_d;
1990    
1991     case OP_WHITESPACE:
1992     return next == -ESC_S || next == -ESC_d || next == -ESC_w;
1993    
1994     case OP_NOT_WHITESPACE:
1995     return next == -ESC_s;
1996    
1997     case OP_WORDCHAR:
1998     return next == -ESC_W || next == -ESC_s;
1999    
2000     case OP_NOT_WORDCHAR:
2001     return next == -ESC_w || next == -ESC_d;
2002    
2003     default:
2004     return FALSE;
2005     }
2006    
2007     /* Control does not reach here */
2008     }
2009    
2010    
2011    
2012     /*************************************************
2013 nigel 77 * Compile one branch *
2014     *************************************************/
2015    
2016 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
2017 nigel 77 changed during the branch, the pointer is used to change the external options
2018 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2019     to find out the amount of memory needed, as well as during the real compile
2020     phase. The value of lengthptr distinguishes the two phases.
2021 nigel 77
2022     Arguments:
2023     optionsptr pointer to the option bits
2024     codeptr points to the pointer to the current code point
2025     ptrptr points to the current pattern pointer
2026     errorcodeptr points to error code variable
2027     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2028     reqbyteptr set to the last literal character required, else < 0
2029     bcptr points to current branch chain
2030     cd contains pointers to tables etc.
2031 nigel 93 lengthptr NULL during the real compile phase
2032     points to length accumulator during pre-compile phase
2033 nigel 77
2034     Returns: TRUE on success
2035     FALSE, with *errorcodeptr set non-zero on error
2036     */
2037    
2038     static BOOL
2039 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2040     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2041     compile_data *cd, int *lengthptr)
2042 nigel 77 {
2043     int repeat_type, op_type;
2044     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2045     int bravalue = 0;
2046     int greedy_default, greedy_non_default;
2047     int firstbyte, reqbyte;
2048     int zeroreqbyte, zerofirstbyte;
2049     int req_caseopt, reqvary, tempreqvary;
2050     int options = *optionsptr;
2051     int after_manual_callout = 0;
2052 nigel 93 int length_prevgroup = 0;
2053 nigel 77 register int c;
2054     register uschar *code = *codeptr;
2055 nigel 93 uschar *last_code = code;
2056     uschar *orig_code = code;
2057 nigel 77 uschar *tempcode;
2058     BOOL inescq = FALSE;
2059     BOOL groupsetfirstbyte = FALSE;
2060     const uschar *ptr = *ptrptr;
2061     const uschar *tempptr;
2062     uschar *previous = NULL;
2063     uschar *previous_callout = NULL;
2064 nigel 93 uschar *save_hwm = NULL;
2065 nigel 77 uschar classbits[32];
2066    
2067     #ifdef SUPPORT_UTF8
2068     BOOL class_utf8;
2069     BOOL utf8 = (options & PCRE_UTF8) != 0;
2070     uschar *class_utf8data;
2071     uschar utf8_char[6];
2072     #else
2073     BOOL utf8 = FALSE;
2074 nigel 93 uschar *utf8_char = NULL;
2075 nigel 77 #endif
2076    
2077 nigel 93 #ifdef DEBUG
2078     if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2079     #endif
2080    
2081 nigel 77 /* Set up the default and non-default settings for greediness */
2082    
2083     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2084     greedy_non_default = greedy_default ^ 1;
2085    
2086     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2087     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2088     matches a non-fixed char first char; reqbyte just remains unset if we never
2089     find one.
2090    
2091     When we hit a repeat whose minimum is zero, we may have to adjust these values
2092     to take the zero repeat into account. This is implemented by setting them to
2093     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2094     item types that can be repeated set these backoff variables appropriately. */
2095    
2096     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2097    
2098     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2099     according to the current setting of the caseless flag. REQ_CASELESS is a bit
2100     value > 255. It is added into the firstbyte or reqbyte variables to record the
2101     case status of the value. This is used only for ASCII characters. */
2102    
2103     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2104    
2105     /* Switch on next character until the end of the branch */
2106    
2107     for (;; ptr++)
2108     {
2109     BOOL negate_class;
2110     BOOL possessive_quantifier;
2111     BOOL is_quantifier;
2112 nigel 93 BOOL is_recurse;
2113 ph10 175 BOOL reset_bracount;
2114 nigel 77 int class_charcount;
2115     int class_lastchar;
2116     int newoptions;
2117     int recno;
2118 ph10 172 int refsign;
2119 nigel 77 int skipbytes;
2120     int subreqbyte;
2121     int subfirstbyte;
2122 nigel 93 int terminator;
2123 nigel 77 int mclength;
2124     uschar mcbuffer[8];
2125    
2126 nigel 93 /* Get next byte in the pattern */
2127 nigel 77
2128     c = *ptr;
2129    
2130 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
2131     previous cycle of this loop. */
2132    
2133     if (lengthptr != NULL)
2134     {
2135     #ifdef DEBUG
2136     if (code > cd->hwm) cd->hwm = code; /* High water info */
2137     #endif
2138     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2139     {
2140     *errorcodeptr = ERR52;
2141     goto FAILED;
2142     }
2143    
2144     /* There is at least one situation where code goes backwards: this is the
2145     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2146     the class is simply eliminated. However, it is created first, so we have to
2147     allow memory for it. Therefore, don't ever reduce the length at this point.
2148     */
2149    
2150     if (code < last_code) code = last_code;
2151     *lengthptr += code - last_code;
2152     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2153    
2154     /* If "previous" is set and it is not at the start of the work space, move
2155     it back to there, in order to avoid filling up the work space. Otherwise,
2156     if "previous" is NULL, reset the current code pointer to the start. */
2157    
2158     if (previous != NULL)
2159     {
2160     if (previous > orig_code)
2161     {
2162     memmove(orig_code, previous, code - previous);
2163     code -= previous - orig_code;
2164     previous = orig_code;
2165     }
2166     }
2167     else code = orig_code;
2168    
2169     /* Remember where this code item starts so we can pick up the length
2170     next time round. */
2171    
2172     last_code = code;
2173     }
2174    
2175     /* In the real compile phase, just check the workspace used by the forward
2176     reference list. */
2177    
2178     else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2179     {
2180     *errorcodeptr = ERR52;
2181     goto FAILED;
2182     }
2183    
2184 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
2185    
2186     if (inescq && c != 0)
2187     {
2188     if (c == '\\' && ptr[1] == 'E')
2189     {
2190     inescq = FALSE;
2191     ptr++;
2192     continue;
2193     }
2194     else
2195     {
2196     if (previous_callout != NULL)
2197     {
2198 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2199     complete_callout(previous_callout, ptr, cd);
2200 nigel 77 previous_callout = NULL;
2201     }
2202     if ((options & PCRE_AUTO_CALLOUT) != 0)
2203     {
2204     previous_callout = code;
2205     code = auto_callout(code, ptr, cd);
2206     }
2207     goto NORMAL_CHAR;
2208     }
2209     }
2210    
2211     /* Fill in length of a previous callout, except when the next thing is
2212     a quantifier. */
2213    
2214     is_quantifier = c == '*' || c == '+' || c == '?' ||
2215     (c == '{' && is_counted_repeat(ptr+1));
2216    
2217     if (!is_quantifier && previous_callout != NULL &&
2218     after_manual_callout-- <= 0)
2219     {
2220 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2221     complete_callout(previous_callout, ptr, cd);
2222 nigel 77 previous_callout = NULL;
2223     }
2224    
2225     /* In extended mode, skip white space and comments */
2226    
2227     if ((options & PCRE_EXTENDED) != 0)
2228     {
2229     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2230     if (c == '#')
2231     {
2232 nigel 93 while (*(++ptr) != 0)
2233 nigel 91 {
2234 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2235 nigel 91 }
2236 nigel 93 if (*ptr != 0) continue;
2237    
2238 nigel 91 /* Else fall through to handle end of string */
2239     c = 0;
2240 nigel 77 }
2241     }
2242    
2243     /* No auto callout for quantifiers. */
2244    
2245     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2246     {
2247     previous_callout = code;
2248     code = auto_callout(code, ptr, cd);
2249     }
2250    
2251     switch(c)
2252     {
2253 nigel 93 /* ===================================================================*/
2254     case 0: /* The branch terminates at string end */
2255     case '|': /* or | or ) */
2256 nigel 77 case ')':
2257     *firstbyteptr = firstbyte;
2258     *reqbyteptr = reqbyte;
2259     *codeptr = code;
2260     *ptrptr = ptr;
2261 nigel 93 if (lengthptr != NULL)
2262     {
2263     *lengthptr += code - last_code; /* To include callout length */
2264     DPRINTF((">> end branch\n"));
2265     }
2266 nigel 77 return TRUE;
2267    
2268 nigel 93
2269     /* ===================================================================*/
2270 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
2271     the setting of any following char as a first character. */
2272    
2273     case '^':
2274     if ((options & PCRE_MULTILINE) != 0)
2275     {
2276     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2277     }
2278     previous = NULL;
2279     *code++ = OP_CIRC;
2280     break;
2281    
2282     case '$':
2283     previous = NULL;
2284     *code++ = OP_DOLL;
2285     break;
2286    
2287     /* There can never be a first char if '.' is first, whatever happens about
2288     repeats. The value of reqbyte doesn't change either. */
2289    
2290     case '.':
2291     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2292     zerofirstbyte = firstbyte;
2293     zeroreqbyte = reqbyte;
2294     previous = code;
2295     *code++ = OP_ANY;
2296     break;
2297    
2298 nigel 93
2299     /* ===================================================================*/
2300 nigel 87 /* Character classes. If the included characters are all < 256, we build a
2301     32-byte bitmap of the permitted characters, except in the special case
2302     where there is only one such character. For negated classes, we build the
2303     map as usual, then invert it at the end. However, we use a different opcode
2304     so that data characters > 255 can be handled correctly.
2305 nigel 77
2306     If the class contains characters outside the 0-255 range, a different
2307     opcode is compiled. It may optionally have a bit map for characters < 256,
2308     but those above are explicitly listed afterwards. A flag byte tells
2309     whether the bitmap is present, and whether this is a negated class or not.
2310     */
2311    
2312     case '[':
2313     previous = code;
2314    
2315     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2316     they are encountered at the top level, so we'll do that too. */
2317    
2318     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2319     check_posix_syntax(ptr, &tempptr, cd))
2320     {
2321     *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2322     goto FAILED;
2323     }
2324    
2325     /* If the first character is '^', set the negation flag and skip it. */
2326    
2327     if ((c = *(++ptr)) == '^')
2328     {
2329     negate_class = TRUE;
2330     c = *(++ptr);
2331     }
2332     else
2333     {
2334     negate_class = FALSE;
2335     }
2336    
2337     /* Keep a count of chars with values < 256 so that we can optimize the case
2338 nigel 93 of just a single character (as long as it's < 256). However, For higher
2339     valued UTF-8 characters, we don't yet do any optimization. */
2340 nigel 77
2341     class_charcount = 0;
2342     class_lastchar = -1;
2343    
2344 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
2345     temporary bit of memory, in case the class contains only 1 character (less
2346     than 256), because in that case the compiled code doesn't use the bit map.
2347     */
2348    
2349     memset(classbits, 0, 32 * sizeof(uschar));
2350    
2351 nigel 77 #ifdef SUPPORT_UTF8
2352     class_utf8 = FALSE; /* No chars >= 256 */
2353 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2354 nigel 77 #endif
2355    
2356     /* Process characters until ] is reached. By writing this as a "do" it
2357 nigel 93 means that an initial ] is taken as a data character. At the start of the
2358     loop, c contains the first byte of the character. */
2359 nigel 77
2360 nigel 93 if (c != 0) do
2361 nigel 77 {
2362 nigel 93 const uschar *oldptr;
2363    
2364 nigel 77 #ifdef SUPPORT_UTF8
2365     if (utf8 && c > 127)
2366     { /* Braces are required because the */
2367     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2368     }
2369     #endif
2370    
2371     /* Inside \Q...\E everything is literal except \E */
2372    
2373     if (inescq)
2374     {
2375 nigel 93 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2376 nigel 77 {
2377 nigel 93 inescq = FALSE; /* Reset literal state */
2378     ptr++; /* Skip the 'E' */
2379     continue; /* Carry on with next */
2380 nigel 77 }
2381 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
2382 nigel 77 }
2383    
2384     /* Handle POSIX class names. Perl allows a negation extension of the
2385     form [:^name:]. A square bracket that doesn't match the syntax is
2386     treated as a literal. We also recognize the POSIX constructions
2387     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2388     5.6 and 5.8 do. */
2389    
2390     if (c == '[' &&
2391     (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2392     check_posix_syntax(ptr, &tempptr, cd))
2393     {
2394     BOOL local_negate = FALSE;
2395 nigel 87 int posix_class, taboffset, tabopt;
2396 nigel 77 register const uschar *cbits = cd->cbits;
2397 nigel 87 uschar pbits[32];
2398 nigel 77
2399     if (ptr[1] != ':')
2400     {
2401     *errorcodeptr = ERR31;
2402     goto FAILED;
2403     }
2404    
2405     ptr += 2;
2406     if (*ptr == '^')
2407     {
2408     local_negate = TRUE;
2409     ptr++;
2410     }
2411    
2412     posix_class = check_posix_name(ptr, tempptr - ptr);
2413     if (posix_class < 0)
2414     {
2415     *errorcodeptr = ERR30;
2416     goto FAILED;
2417     }
2418    
2419     /* If matching is caseless, upper and lower are converted to
2420     alpha. This relies on the fact that the class table starts with
2421     alpha, lower, upper as the first 3 entries. */
2422    
2423     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2424     posix_class = 0;
2425    
2426 nigel 87 /* We build the bit map for the POSIX class in a chunk of local store
2427     because we may be adding and subtracting from it, and we don't want to
2428     subtract bits that may be in the main map already. At the end we or the
2429     result into the bit map that is being built. */
2430 nigel 77
2431     posix_class *= 3;
2432 nigel 87
2433     /* Copy in the first table (always present) */
2434    
2435     memcpy(pbits, cbits + posix_class_maps[posix_class],
2436     32 * sizeof(uschar));
2437    
2438     /* If there is a second table, add or remove it as required. */
2439    
2440     taboffset = posix_class_maps[posix_class + 1];
2441     tabopt = posix_class_maps[posix_class + 2];
2442    
2443     if (taboffset >= 0)
2444 nigel 77 {
2445 nigel 87 if (tabopt >= 0)
2446     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2447 nigel 77 else
2448 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2449 nigel 77 }
2450    
2451 nigel 87 /* Now see if we need to remove any special characters. An option
2452     value of 1 removes vertical space and 2 removes underscore. */
2453    
2454     if (tabopt < 0) tabopt = -tabopt;
2455     if (tabopt == 1) pbits[1] &= ~0x3c;
2456     else if (tabopt == 2) pbits[11] &= 0x7f;
2457    
2458     /* Add the POSIX table or its complement into the main table that is
2459     being built and we are done. */
2460    
2461     if (local_negate)
2462     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2463     else
2464     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2465    
2466 nigel 77 ptr = tempptr + 1;
2467     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2468     continue; /* End of POSIX syntax handling */
2469     }
2470    
2471     /* Backslash may introduce a single character, or it may introduce one
2472 nigel 93 of the specials, which just set a flag. The sequence \b is a special
2473     case. Inside a class (and only there) it is treated as backspace.
2474     Elsewhere it marks a word boundary. Other escapes have preset maps ready
2475     to or into the one we are building. We assume they have more than one
2476 nigel 77 character in them, so set class_charcount bigger than one. */
2477    
2478     if (c == '\\')
2479     {
2480 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2481     if (*errorcodeptr != 0) goto FAILED;
2482 nigel 77
2483     if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2484     else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2485 nigel 93 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2486 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
2487     {
2488     if (ptr[1] == '\\' && ptr[2] == 'E')
2489     {
2490     ptr += 2; /* avoid empty string */
2491     }
2492     else inescq = TRUE;
2493     continue;
2494     }
2495    
2496     if (c < 0)
2497     {
2498     register const uschar *cbits = cd->cbits;
2499     class_charcount += 2; /* Greater than 1 is what matters */
2500 nigel 93
2501     /* Save time by not doing this in the pre-compile phase. */
2502    
2503     if (lengthptr == NULL) switch (-c)
2504 nigel 77 {
2505     case ESC_d:
2506     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2507     continue;
2508    
2509     case ESC_D:
2510     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2511     continue;
2512    
2513     case ESC_w:
2514     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2515     continue;
2516    
2517     case ESC_W:
2518     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2519     continue;
2520    
2521     case ESC_s:
2522     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2523     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2524     continue;
2525    
2526     case ESC_S:
2527     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2528     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2529     continue;
2530    
2531 nigel 93 case ESC_E: /* Perl ignores an orphan \E */
2532     continue;
2533    
2534     default: /* Not recognized; fall through */
2535     break; /* Need "default" setting to stop compiler warning. */
2536     }
2537    
2538     /* In the pre-compile phase, just do the recognition. */
2539    
2540     else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2541     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2542    
2543     /* We need to deal with \P and \p in both phases. */
2544    
2545 nigel 77 #ifdef SUPPORT_UCP
2546 nigel 93 if (-c == ESC_p || -c == ESC_P)
2547     {
2548     BOOL negated;
2549     int pdata;
2550     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2551     if (ptype < 0) goto FAILED;
2552     class_utf8 = TRUE;
2553     *class_utf8data++ = ((-c == ESC_p) != negated)?
2554     XCL_PROP : XCL_NOTPROP;
2555     *class_utf8data++ = ptype;
2556     *class_utf8data++ = pdata;
2557     class_charcount -= 2; /* Not a < 256 character */
2558 nigel 77 continue;
2559 nigel 93 }
2560 nigel 77 #endif
2561 nigel 93 /* Unrecognized escapes are faulted if PCRE is running in its
2562     strict mode. By default, for compatibility with Perl, they are
2563     treated as literals. */
2564 nigel 77
2565 nigel 93 if ((options & PCRE_EXTRA) != 0)
2566     {
2567     *errorcodeptr = ERR7;
2568     goto FAILED;
2569     }
2570 nigel 77
2571 nigel 93 class_charcount -= 2; /* Undo the default count from above */
2572     c = *ptr; /* Get the final character and fall through */
2573 nigel 77 }
2574    
2575     /* Fall through if we have a single character (c >= 0). This may be
2576 nigel 93 greater than 256 in UTF-8 mode. */
2577 nigel 77
2578     } /* End of backslash handling */
2579    
2580     /* A single character may be followed by '-' to form a range. However,
2581     Perl does not permit ']' to be the end of the range. A '-' character
2582 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
2583     entirely. The code for handling \Q and \E is messy. */
2584 nigel 77
2585 nigel 93 CHECK_RANGE:
2586     while (ptr[1] == '\\' && ptr[2] == 'E')
2587 nigel 77 {
2588 nigel 93 inescq = FALSE;
2589     ptr += 2;
2590     }
2591    
2592     oldptr = ptr;
2593    
2594     if (!inescq && ptr[1] == '-')
2595     {
2596 nigel 77 int d;
2597     ptr += 2;
2598 nigel 93 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2599 nigel 77
2600 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
2601     mode. */
2602    
2603     while (*ptr == '\\' && ptr[1] == 'Q')
2604     {
2605     ptr += 2;
2606     if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2607     inescq = TRUE;
2608     break;
2609     }
2610    
2611     if (*ptr == 0 || (!inescq && *ptr == ']'))
2612     {
2613     ptr = oldptr;
2614     goto LONE_SINGLE_CHARACTER;
2615     }
2616    
2617 nigel 77 #ifdef SUPPORT_UTF8
2618     if (utf8)
2619     { /* Braces are required because the */
2620     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2621     }
2622     else
2623     #endif
2624     d = *ptr; /* Not UTF-8 mode */
2625    
2626     /* The second part of a range can be a single-character escape, but
2627     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2628     in such circumstances. */
2629    
2630 nigel 93 if (!inescq && d == '\\')
2631 nigel 77 {
2632 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2633     if (*errorcodeptr != 0) goto FAILED;
2634 nigel 77
2635 nigel 93 /* \b is backslash; \X is literal X; \R is literal R; any other
2636     special means the '-' was literal */
2637 nigel 77
2638     if (d < 0)
2639     {
2640     if (d == -ESC_b) d = '\b';
2641 nigel 93 else if (d == -ESC_X) d = 'X';
2642     else if (d == -ESC_R) d = 'R'; else
2643 nigel 77 {
2644 nigel 93 ptr = oldptr;
2645 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2646     }
2647     }
2648     }
2649    
2650 nigel 93 /* Check that the two values are in the correct order. Optimize
2651     one-character ranges */
2652 nigel 77
2653 nigel 93 if (d < c)
2654     {
2655     *errorcodeptr = ERR8;
2656     goto FAILED;
2657     }
2658    
2659 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2660    
2661     /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2662     matching, we have to use an XCLASS with extra data items. Caseless
2663     matching for characters > 127 is available only if UCP support is
2664     available. */
2665    
2666     #ifdef SUPPORT_UTF8
2667     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2668     {
2669     class_utf8 = TRUE;
2670    
2671     /* With UCP support, we can find the other case equivalents of
2672     the relevant characters. There may be several ranges. Optimize how
2673     they fit with the basic range. */
2674    
2675     #ifdef SUPPORT_UCP
2676     if ((options & PCRE_CASELESS) != 0)
2677     {
2678 nigel 93 unsigned int occ, ocd;
2679     unsigned int cc = c;
2680     unsigned int origd = d;
2681 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
2682     {
2683 ph10 176 if (occ >= (unsigned int)c &&
2684     ocd <= (unsigned int)d)
2685     continue; /* Skip embedded ranges */
2686 nigel 77
2687 ph10 176 if (occ < (unsigned int)c &&
2688     ocd >= (unsigned int)c - 1) /* Extend the basic range */
2689 nigel 77 { /* if there is overlap, */
2690     c = occ; /* noting that if occ < c */
2691     continue; /* we can't have ocd > d */
2692     } /* because a subrange is */
2693 ph10 176 if (ocd > (unsigned int)d &&
2694     occ <= (unsigned int)d + 1) /* always shorter than */
2695 nigel 77 { /* the basic range. */
2696     d = ocd;
2697     continue;
2698     }
2699    
2700     if (occ == ocd)
2701     {
2702     *class_utf8data++ = XCL_SINGLE;
2703     }
2704     else
2705     {
2706     *class_utf8data++ = XCL_RANGE;
2707     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2708     }
2709     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2710     }
2711     }
2712     #endif /* SUPPORT_UCP */
2713    
2714     /* Now record the original range, possibly modified for UCP caseless
2715     overlapping ranges. */
2716    
2717     *class_utf8data++ = XCL_RANGE;
2718     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2719     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2720    
2721     /* With UCP support, we are done. Without UCP support, there is no
2722     caseless matching for UTF-8 characters > 127; we can use the bit map
2723     for the smaller ones. */
2724    
2725     #ifdef SUPPORT_UCP
2726     continue; /* With next character in the class */
2727     #else
2728     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2729    
2730     /* Adjust upper limit and fall through to set up the map */
2731    
2732     d = 127;
2733    
2734     #endif /* SUPPORT_UCP */
2735     }
2736     #endif /* SUPPORT_UTF8 */
2737    
2738     /* We use the bit map for all cases when not in UTF-8 mode; else
2739     ranges that lie entirely within 0-127 when there is UCP support; else
2740     for partial ranges without UCP support. */
2741    
2742 nigel 93 class_charcount += d - c + 1;
2743     class_lastchar = d;
2744    
2745     /* We can save a bit of time by skipping this in the pre-compile. */
2746    
2747     if (lengthptr == NULL) for (; c <= d; c++)
2748 nigel 77 {
2749     classbits[c/8] |= (1 << (c&7));
2750     if ((options & PCRE_CASELESS) != 0)
2751     {
2752     int uc = cd->fcc[c]; /* flip case */
2753     classbits[uc/8] |= (1 << (uc&7));
2754     }
2755     }
2756    
2757     continue; /* Go get the next char in the class */
2758     }
2759    
2760     /* Handle a lone single character - we can get here for a normal
2761     non-escape char, or after \ that introduces a single character or for an
2762     apparent range that isn't. */
2763    
2764     LONE_SINGLE_CHARACTER:
2765    
2766     /* Handle a character that cannot go in the bit map */
2767    
2768     #ifdef SUPPORT_UTF8
2769     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2770     {
2771     class_utf8 = TRUE;
2772     *class_utf8data++ = XCL_SINGLE;
2773     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2774    
2775     #ifdef SUPPORT_UCP
2776     if ((options & PCRE_CASELESS) != 0)
2777     {
2778 nigel 93 unsigned int othercase;
2779     if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
2780 nigel 77 {
2781     *class_utf8data++ = XCL_SINGLE;
2782     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
2783     }
2784     }
2785     #endif /* SUPPORT_UCP */
2786    
2787     }
2788     else
2789     #endif /* SUPPORT_UTF8 */
2790    
2791     /* Handle a single-byte character */
2792     {
2793     classbits[c/8] |= (1 << (c&7));
2794     if ((options & PCRE_CASELESS) != 0)
2795     {
2796     c = cd->fcc[c]; /* flip case */
2797     classbits[c/8] |= (1 << (c&7));
2798     }
2799     class_charcount++;
2800     class_lastchar = c;
2801     }
2802     }
2803    
2804 nigel 93 /* Loop until ']' reached. This "while" is the end of the "do" above. */
2805 nigel 77
2806 nigel 93 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
2807 nigel 77
2808 nigel 93 if (c == 0) /* Missing terminating ']' */
2809     {
2810     *errorcodeptr = ERR6;
2811     goto FAILED;
2812     }
2813    
2814 nigel 77 /* If class_charcount is 1, we saw precisely one character whose value is
2815     less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2816     can optimize the negative case only if there were no characters >= 128
2817     because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2818     single-bytes only. This is an historical hangover. Maybe one day we can
2819     tidy these opcodes to handle multi-byte characters.
2820    
2821     The optimization throws away the bit map. We turn the item into a
2822     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2823     that OP_NOT does not support multibyte characters. In the positive case, it
2824     can cause firstbyte to be set. Otherwise, there can be no first char if
2825     this item is first, whatever repeat count may follow. In the case of
2826     reqbyte, save the previous value for reinstating. */
2827    
2828     #ifdef SUPPORT_UTF8
2829     if (class_charcount == 1 &&
2830     (!utf8 ||
2831     (!class_utf8 && (!negate_class || class_lastchar < 128))))
2832    
2833     #else
2834     if (class_charcount == 1)
2835     #endif
2836     {
2837     zeroreqbyte = reqbyte;
2838    
2839     /* The OP_NOT opcode works on one-byte characters only. */
2840    
2841     if (negate_class)
2842     {
2843     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2844     zerofirstbyte = firstbyte;
2845     *code++ = OP_NOT;
2846     *code++ = class_lastchar;
2847     break;
2848     }
2849    
2850     /* For a single, positive character, get the value into mcbuffer, and
2851     then we can handle this with the normal one-character code. */
2852    
2853     #ifdef SUPPORT_UTF8
2854     if (utf8 && class_lastchar > 127)
2855     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
2856     else
2857     #endif
2858     {
2859     mcbuffer[0] = class_lastchar;
2860     mclength = 1;
2861     }
2862     goto ONE_CHAR;
2863     } /* End of 1-char optimization */
2864    
2865     /* The general case - not the one-char optimization. If this is the first
2866     thing in the branch, there can be no first char setting, whatever the
2867     repeat count. Any reqbyte setting must remain unchanged after any kind of
2868     repeat. */
2869    
2870     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2871     zerofirstbyte = firstbyte;
2872     zeroreqbyte = reqbyte;
2873    
2874     /* If there are characters with values > 255, we have to compile an
2875     extended class, with its own opcode. If there are no characters < 256,
2876 nigel 93 we can omit the bitmap in the actual compiled code. */
2877 nigel 77
2878     #ifdef SUPPORT_UTF8
2879     if (class_utf8)
2880     {
2881     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2882     *code++ = OP_XCLASS;
2883     code += LINK_SIZE;
2884     *code = negate_class? XCL_NOT : 0;
2885    
2886 nigel 93 /* If the map is required, move up the extra data to make room for it;
2887     otherwise just move the code pointer to the end of the extra data. */
2888 nigel 77
2889     if (class_charcount > 0)
2890     {
2891     *code++ |= XCL_MAP;
2892 nigel 93 memmove(code + 32, code, class_utf8data - code);
2893 nigel 77 memcpy(code, classbits, 32);
2894 nigel 93 code = class_utf8data + 32;
2895 nigel 77 }
2896 nigel 93 else code = class_utf8data;
2897 nigel 77
2898     /* Now fill in the complete length of the item */
2899    
2900     PUT(previous, 1, code - previous);
2901     break; /* End of class handling */
2902     }
2903     #endif
2904    
2905     /* If there are no characters > 255, negate the 32-byte map if necessary,
2906     and copy it into the code vector. If this is the first thing in the branch,
2907     there can be no first char setting, whatever the repeat count. Any reqbyte
2908     setting must remain unchanged after any kind of repeat. */
2909    
2910     if (negate_class)
2911     {
2912     *code++ = OP_NCLASS;
2913 nigel 93 if (lengthptr == NULL) /* Save time in the pre-compile phase */
2914     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2915 nigel 77 }
2916     else
2917     {
2918     *code++ = OP_CLASS;
2919     memcpy(code, classbits, 32);
2920     }
2921     code += 32;
2922     break;
2923    
2924 nigel 93
2925     /* ===================================================================*/
2926 nigel 77 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2927     has been tested above. */
2928    
2929     case '{':
2930     if (!is_quantifier) goto NORMAL_CHAR;
2931     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
2932     if (*errorcodeptr != 0) goto FAILED;
2933     goto REPEAT;
2934    
2935     case '*':
2936     repeat_min = 0;
2937     repeat_max = -1;
2938     goto REPEAT;
2939    
2940     case '+':
2941     repeat_min = 1;
2942     repeat_max = -1;
2943     goto REPEAT;
2944    
2945     case '?':
2946     repeat_min = 0;
2947     repeat_max = 1;
2948    
2949     REPEAT:
2950     if (previous == NULL)
2951     {
2952     *errorcodeptr = ERR9;
2953     goto FAILED;
2954     }
2955    
2956     if (repeat_min == 0)
2957     {
2958     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2959     reqbyte = zeroreqbyte; /* Ditto */
2960     }
2961    
2962     /* Remember whether this is a variable length repeat */
2963    
2964     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2965    
2966     op_type = 0; /* Default single-char op codes */
2967     possessive_quantifier = FALSE; /* Default not possessive quantifier */
2968    
2969     /* Save start of previous item, in case we have to move it up to make space
2970     for an inserted OP_ONCE for the additional '+' extension. */
2971    
2972     tempcode = previous;
2973    
2974     /* If the next character is '+', we have a possessive quantifier. This
2975     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2976     If the next character is '?' this is a minimizing repeat, by default,
2977     but if PCRE_UNGREEDY is set, it works the other way round. We change the
2978     repeat type to the non-default. */
2979    
2980     if (ptr[1] == '+')
2981     {
2982     repeat_type = 0; /* Force greedy */
2983     possessive_quantifier = TRUE;
2984     ptr++;
2985     }
2986     else if (ptr[1] == '?')
2987     {
2988     repeat_type = greedy_non_default;
2989     ptr++;
2990     }
2991     else repeat_type = greedy_default;
2992    
2993     /* If previous was a character match, abolish the item and generate a
2994     repeat item instead. If a char item has a minimum of more than one, ensure
2995     that it is set in reqbyte - it might not be if a sequence such as x{3} is
2996     the first thing in a branch because the x will have gone into firstbyte
2997     instead. */
2998    
2999     if (*previous == OP_CHAR || *previous == OP_CHARNC)
3000     {
3001     /* Deal with UTF-8 characters that take up more than one byte. It's
3002     easier to write this out separately than try to macrify it. Use c to
3003     hold the length of the character in bytes, plus 0x80 to flag that it's a
3004     length rather than a small character. */
3005    
3006     #ifdef SUPPORT_UTF8
3007     if (utf8 && (code[-1] & 0x80) != 0)
3008     {
3009     uschar *lastchar = code - 1;
3010     while((*lastchar & 0xc0) == 0x80) lastchar--;
3011     c = code - lastchar; /* Length of UTF-8 character */
3012     memcpy(utf8_char, lastchar, c); /* Save the char */
3013     c |= 0x80; /* Flag c as a length */
3014     }
3015     else
3016     #endif
3017    
3018     /* Handle the case of a single byte - either with no UTF8 support, or
3019     with UTF-8 disabled, or for a UTF-8 character < 128. */
3020    
3021     {
3022     c = code[-1];
3023     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3024     }
3025    
3026 nigel 93 /* If the repetition is unlimited, it pays to see if the next thing on
3027     the line is something that cannot possibly match this character. If so,
3028     automatically possessifying this item gains some performance in the case
3029     where the match fails. */
3030    
3031     if (!possessive_quantifier &&
3032     repeat_max < 0 &&
3033     check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3034     options, cd))
3035     {
3036     repeat_type = 0; /* Force greedy */
3037     possessive_quantifier = TRUE;
3038     }
3039    
3040 nigel 77 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3041     }
3042    
3043     /* If previous was a single negated character ([^a] or similar), we use
3044     one of the special opcodes, replacing it. The code is shared with single-
3045     character repeats by setting op_type to add a suitable offset into
3046 nigel 93 repeat_type. We can also test for auto-possessification. OP_NOT is
3047     currently used only for single-byte chars. */
3048 nigel 77
3049     else if (*previous == OP_NOT)
3050     {
3051     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3052     c = previous[1];
3053 nigel 93 if (!possessive_quantifier &&
3054     repeat_max < 0 &&
3055     check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3056     {
3057     repeat_type = 0; /* Force greedy */
3058     possessive_quantifier = TRUE;
3059     }
3060 nigel 77 goto OUTPUT_SINGLE_REPEAT;
3061     }
3062    
3063     /* If previous was a character type match (\d or similar), abolish it and
3064     create a suitable repeat item. The code is shared with single-character
3065     repeats by setting op_type to add a suitable offset into repeat_type. Note
3066     that the Unicode property types will be present only when SUPPORT_UCP is
3067     defined, but we don't wrap the little bits of code here because it just
3068     makes it horribly messy. */
3069    
3070     else if (*previous < OP_EODN)
3071     {
3072     uschar *oldcode;
3073 nigel 87 int prop_type, prop_value;
3074 nigel 77 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3075     c = *previous;
3076    
3077 nigel 93 if (!possessive_quantifier &&
3078     repeat_max < 0 &&
3079     check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3080     {
3081     repeat_type = 0; /* Force greedy */
3082     possessive_quantifier = TRUE;
3083     }
3084    
3085 nigel 77 OUTPUT_SINGLE_REPEAT:
3086 nigel 87 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3087     {
3088     prop_type = previous[1];
3089     prop_value = previous[2];
3090     }
3091     else prop_type = prop_value = -1;
3092 nigel 77
3093     oldcode = code;
3094     code = previous; /* Usually overwrite previous item */
3095    
3096     /* If the maximum is zero then the minimum must also be zero; Perl allows
3097     this case, so we do too - by simply omitting the item altogether. */
3098    
3099     if (repeat_max == 0) goto END_REPEAT;
3100    
3101     /* All real repeats make it impossible to handle partial matching (maybe
3102     one day we will be able to remove this restriction). */
3103    
3104     if (repeat_max != 1) cd->nopartial = TRUE;
3105    
3106     /* Combine the op_type with the repeat_type */
3107    
3108     repeat_type += op_type;
3109    
3110     /* A minimum of zero is handled either as the special case * or ?, or as
3111     an UPTO, with the maximum given. */
3112    
3113     if (repeat_min == 0)
3114     {
3115     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3116     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3117     else
3118     {
3119     *code++ = OP_UPTO + repeat_type;
3120     PUT2INC(code, 0, repeat_max);
3121     }
3122     }
3123    
3124     /* A repeat minimum of 1 is optimized into some special cases. If the
3125 nigel 93 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3126 nigel 77 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3127     one less than the maximum. */
3128    
3129     else if (repeat_min == 1)
3130     {
3131     if (repeat_max == -1)
3132     *code++ = OP_PLUS + repeat_type;
3133     else
3134     {
3135     code = oldcode; /* leave previous item in place */
3136     if (repeat_max == 1) goto END_REPEAT;
3137     *code++ = OP_UPTO + repeat_type;
3138     PUT2INC(code, 0, repeat_max - 1);
3139     }
3140     }
3141    
3142     /* The case {n,n} is just an EXACT, while the general case {n,m} is
3143     handled as an EXACT followed by an UPTO. */
3144    
3145     else
3146     {
3147     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3148     PUT2INC(code, 0, repeat_min);
3149    
3150     /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3151     we have to insert the character for the previous code. For a repeated
3152 nigel 87 Unicode property match, there are two extra bytes that define the
3153 nigel 77 required property. In UTF-8 mode, long characters have their length in
3154     c, with the 0x80 bit as a flag. */
3155    
3156     if (repeat_max < 0)
3157     {
3158     #ifdef SUPPORT_UTF8
3159     if (utf8 && c >= 128)
3160     {
3161     memcpy(code, utf8_char, c & 7);
3162     code += c & 7;
3163     }
3164     else
3165     #endif
3166     {
3167     *code++ = c;
3168 nigel 87 if (prop_type >= 0)
3169     {
3170     *code++ = prop_type;
3171     *code++ = prop_value;
3172     }
3173 nigel 77 }
3174     *code++ = OP_STAR + repeat_type;
3175     }
3176    
3177     /* Else insert an UPTO if the max is greater than the min, again
3178 nigel 93 preceded by the character, for the previously inserted code. If the
3179     UPTO is just for 1 instance, we can use QUERY instead. */
3180 nigel 77
3181     else if (repeat_max != repeat_min)
3182     {
3183     #ifdef SUPPORT_UTF8
3184     if (utf8 && c >= 128)
3185     {
3186     memcpy(code, utf8_char, c & 7);
3187     code += c & 7;
3188     }
3189     else
3190     #endif
3191     *code++ = c;
3192 nigel 87 if (prop_type >= 0)
3193     {
3194     *code++ = prop_type;
3195     *code++ = prop_value;
3196     }
3197 nigel 77 repeat_max -= repeat_min;
3198 nigel 93
3199     if (repeat_max == 1)
3200     {
3201     *code++ = OP_QUERY + repeat_type;
3202     }
3203     else
3204     {
3205     *code++ = OP_UPTO + repeat_type;
3206     PUT2INC(code, 0, repeat_max);
3207     }
3208 nigel 77 }
3209     }
3210    
3211     /* The character or character type itself comes last in all cases. */
3212    
3213     #ifdef SUPPORT_UTF8
3214     if (utf8 && c >= 128)
3215     {
3216     memcpy(code, utf8_char, c & 7);
3217     code += c & 7;
3218     }
3219     else
3220     #endif
3221     *code++ = c;
3222    
3223 nigel 87 /* For a repeated Unicode property match, there are two extra bytes that
3224     define the required property. */
3225 nigel 77
3226     #ifdef SUPPORT_UCP
3227 nigel 87 if (prop_type >= 0)
3228     {
3229     *code++ = prop_type;
3230     *code++ = prop_value;
3231     }
3232 nigel 77 #endif
3233     }
3234    
3235     /* If previous was a character class or a back reference, we put the repeat
3236     stuff after it, but just skip the item if the repeat was {0,0}. */
3237    
3238     else if (*previous == OP_CLASS ||
3239     *previous == OP_NCLASS ||
3240     #ifdef SUPPORT_UTF8
3241     *previous == OP_XCLASS ||
3242     #endif
3243     *previous == OP_REF)
3244     {
3245     if (repeat_max == 0)
3246     {
3247     code = previous;
3248     goto END_REPEAT;
3249     }
3250    
3251     /* All real repeats make it impossible to handle partial matching (maybe
3252     one day we will be able to remove this restriction). */
3253    
3254     if (repeat_max != 1) cd->nopartial = TRUE;
3255    
3256     if (repeat_min == 0 && repeat_max == -1)
3257     *code++ = OP_CRSTAR + repeat_type;
3258     else if (repeat_min == 1 && repeat_max == -1)
3259     *code++ = OP_CRPLUS + repeat_type;
3260     else if (repeat_min == 0 && repeat_max == 1)
3261     *code++ = OP_CRQUERY + repeat_type;
3262     else
3263     {
3264     *code++ = OP_CRRANGE + repeat_type;
3265     PUT2INC(code, 0, repeat_min);
3266     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3267     PUT2INC(code, 0, repeat_max);
3268     }
3269     }
3270    
3271     /* If previous was a bracket group, we may have to replicate it in certain
3272     cases. */
3273    
3274 nigel 93 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3275     *previous == OP_ONCE || *previous == OP_COND)
3276 nigel 77 {
3277     register int i;
3278     int ketoffset = 0;
3279     int len = code - previous;
3280     uschar *bralink = NULL;
3281    
3282 nigel 93 /* Repeating a DEFINE group is pointless */
3283    
3284     if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3285     {
3286     *errorcodeptr = ERR55;
3287     goto FAILED;
3288     }
3289    
3290     /* This is a paranoid check to stop integer overflow later on */
3291    
3292     if (len > MAX_DUPLENGTH)
3293     {
3294     *errorcodeptr = ERR50;
3295     goto FAILED;
3296     }
3297    
3298 nigel 77 /* If the maximum repeat count is unlimited, find the end of the bracket
3299     by scanning through from the start, and compute the offset back to it
3300     from the current code pointer. There may be an OP_OPT setting following
3301     the final KET, so we can't find the end just by going back from the code
3302     pointer. */
3303    
3304     if (repeat_max == -1)
3305     {
3306     register uschar *ket = previous;
3307     do ket += GET(ket, 1); while (*ket != OP_KET);
3308     ketoffset = code - ket;
3309     }
3310    
3311     /* The case of a zero minimum is special because of the need to stick
3312     OP_BRAZERO in front of it, and because the group appears once in the
3313     data, whereas in other cases it appears the minimum number of times. For
3314     this reason, it is simplest to treat this case separately, as otherwise
3315     the code gets far too messy. There are several special subcases when the
3316     minimum is zero. */
3317    
3318     if (repeat_min == 0)
3319     {
3320     /* If the maximum is also zero, we just omit the group from the output
3321     altogether. */
3322    
3323     if (repeat_max == 0)
3324     {
3325     code = previous;
3326     goto END_REPEAT;
3327     }
3328    
3329     /* If the maximum is 1 or unlimited, we just have to stick in the
3330     BRAZERO and do no more at this point. However, we do need to adjust
3331     any OP_RECURSE calls inside the group that refer to the group itself or
3332 nigel 93 any internal or forward referenced group, because the offset is from
3333     the start of the whole regex. Temporarily terminate the pattern while
3334     doing this. */
3335 nigel 77
3336     if (repeat_max <= 1)
3337     {
3338     *code = OP_END;
3339 nigel 93 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3340 nigel 77 memmove(previous+1, previous, len);
3341     code++;
3342     *previous++ = OP_BRAZERO + repeat_type;
3343     }
3344    
3345     /* If the maximum is greater than 1 and limited, we have to replicate
3346     in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3347     The first one has to be handled carefully because it's the original
3348     copy, which has to be moved up. The remainder can be handled by code
3349     that is common with the non-zero minimum case below. We have to
3350     adjust the value of repeat_max, since one less copy is required. Once
3351     again, we may have to adjust any OP_RECURSE calls inside the group. */
3352    
3353     else
3354     {
3355     int offset;
3356     *code = OP_END;
3357 nigel 93 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3358 nigel 77 memmove(previous + 2 + LINK_SIZE, previous, len);
3359     code += 2 + LINK_SIZE;
3360     *previous++ = OP_BRAZERO + repeat_type;
3361     *previous++ = OP_BRA;
3362    
3363     /* We chain together the bracket offset fields that have to be
3364     filled in later when the ends of the brackets are reached. */
3365    
3366     offset = (bralink == NULL)? 0 : previous - bralink;
3367     bralink = previous;
3368     PUTINC(previous, 0, offset);
3369     }
3370    
3371     repeat_max--;
3372     }
3373    
3374     /* If the minimum is greater than zero, replicate the group as many
3375     times as necessary, and adjust the maximum to the number of subsequent
3376     copies that we need. If we set a first char from the group, and didn't
3377 nigel 93 set a required char, copy the latter from the former. If there are any
3378     forward reference subroutine calls in the group, there will be entries on
3379     the workspace list; replicate these with an appropriate increment. */
3380 nigel 77
3381     else
3382     {
3383     if (repeat_min > 1)
3384     {
3385 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3386     just adjust the length as if we had. */
3387    
3388     if (lengthptr != NULL)
3389     *lengthptr += (repeat_min - 1)*length_prevgroup;
3390    
3391     /* This is compiling for real */
3392    
3393     else
3394 nigel 77 {
3395 nigel 93 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3396     for (i = 1; i < repeat_min; i++)
3397     {
3398     uschar *hc;
3399     uschar *this_hwm = cd->hwm;
3400     memcpy(code, previous, len);
3401     for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3402     {
3403     PUT(cd->hwm, 0, GET(hc, 0) + len);
3404     cd->hwm += LINK_SIZE;
3405     }
3406     save_hwm = this_hwm;
3407     code += len;
3408     }
3409 nigel 77 }
3410     }
3411 nigel 93
3412 nigel 77 if (repeat_max > 0) repeat_max -= repeat_min;
3413     }
3414    
3415     /* This code is common to both the zero and non-zero minimum cases. If
3416     the maximum is limited, it replicates the group in a nested fashion,
3417     remembering the bracket starts on a stack. In the case of a zero minimum,
3418     the first one was set up above. In all cases the repeat_max now specifies
3419 nigel 93 the number of additional copies needed. Again, we must remember to
3420     replicate entries on the forward reference list. */
3421 nigel 77
3422     if (repeat_max >= 0)
3423     {
3424 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3425     just adjust the length as if we had. For each repetition we must add 1
3426     to the length for BRAZERO and for all but the last repetition we must
3427     add 2 + 2*LINK_SIZE to allow for the nesting that occurs. */
3428    
3429     if (lengthptr != NULL && repeat_max > 0)
3430     *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3431     2 - 2*LINK_SIZE; /* Last one doesn't nest */
3432    
3433     /* This is compiling for real */
3434    
3435     else for (i = repeat_max - 1; i >= 0; i--)
3436 nigel 77 {
3437 nigel 93 uschar *hc;
3438     uschar *this_hwm = cd->hwm;
3439    
3440 nigel 77 *code++ = OP_BRAZERO + repeat_type;
3441    
3442     /* All but the final copy start a new nesting, maintaining the
3443     chain of brackets outstanding. */
3444    
3445     if (i != 0)
3446     {
3447     int offset;
3448     *code++ = OP_BRA;
3449     offset = (bralink == NULL)? 0 : code - bralink;
3450     bralink = code;
3451     PUTINC(code, 0, offset);
3452     }
3453    
3454     memcpy(code, previous, len);
3455 nigel 93 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3456     {
3457     PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3458     cd->hwm += LINK_SIZE;
3459     }
3460     save_hwm = this_hwm;
3461 nigel 77 code += len;
3462     }
3463    
3464     /* Now chain through the pending brackets, and fill in their length
3465     fields (which are holding the chain links pro tem). */
3466    
3467     while (bralink != NULL)
3468     {
3469     int oldlinkoffset;
3470     int offset = code - bralink + 1;
3471     uschar *bra = code - offset;
3472     oldlinkoffset = GET(bra, 1);
3473     bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3474     *code++ = OP_KET;
3475     PUTINC(code, 0, offset);
3476     PUT(bra, 1, offset);
3477     }
3478     }
3479    
3480     /* If the maximum is unlimited, set a repeater in the final copy. We
3481     can't just offset backwards from the current code point, because we
3482     don't know if there's been an options resetting after the ket. The
3483 nigel 93 correct offset was computed above.
3484 nigel 77
3485 nigel 93 Then, when we are doing the actual compile phase, check to see whether
3486     this group is a non-atomic one that could match an empty string. If so,
3487     convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3488     that runtime checking can be done. [This check is also applied to
3489     atomic groups at runtime, but in a different way.] */
3490    
3491     else
3492     {
3493     uschar *ketcode = code - ketoffset;
3494     uschar *bracode = ketcode - GET(ketcode, 1);
3495     *ketcode = OP_KETRMAX + repeat_type;
3496     if (lengthptr == NULL && *bracode != OP_ONCE)
3497     {
3498     uschar *scode = bracode;
3499     do
3500     {
3501     if (could_be_empty_branch(scode, ketcode, utf8))
3502     {
3503     *bracode += OP_SBRA - OP_BRA;
3504     break;
3505     }
3506     scode += GET(scode, 1);
3507     }
3508     while (*scode == OP_ALT);
3509     }
3510     }
3511 nigel 77 }
3512    
3513     /* Else there's some kind of shambles */
3514    
3515     else
3516     {
3517     *errorcodeptr = ERR11;
3518     goto FAILED;
3519     }
3520    
3521 nigel 93 /* If the character following a repeat is '+', or if certain optimization
3522     tests above succeeded, possessive_quantifier is TRUE. For some of the
3523     simpler opcodes, there is a special alternative opcode for this. For
3524     anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3525     The '+' notation is just syntactic sugar, taken from Sun's Java package,
3526     but the special opcodes can optimize it a bit. The repeated item starts at
3527     tempcode, not at previous, which might be the first part of a string whose
3528     (former) last char we repeated.
3529 nigel 77
3530 nigel 93 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3531     an 'upto' may follow. We skip over an 'exact' item, and then test the
3532     length of what remains before proceeding. */
3533    
3534 nigel 77 if (possessive_quantifier)
3535     {
3536 nigel 93 int len;
3537     if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3538     *tempcode == OP_NOTEXACT)
3539     tempcode += _pcre_OP_lengths[*tempcode];
3540     len = code - tempcode;
3541     if (len > 0) switch (*tempcode)
3542     {
3543     case OP_STAR: *tempcode = OP_POSSTAR; break;
3544     case OP_PLUS: *tempcode = OP_POSPLUS; break;
3545     case OP_QUERY: *tempcode = OP_POSQUERY; break;
3546     case OP_UPTO: *tempcode = OP_POSUPTO; break;
3547    
3548     case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
3549     case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
3550     case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3551     case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
3552    
3553     case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
3554     case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
3555     case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3556     case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
3557    
3558     default:
3559     memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3560     code += 1 + LINK_SIZE;
3561     len += 1 + LINK_SIZE;
3562     tempcode[0] = OP_ONCE;
3563     *code++ = OP_KET;
3564     PUTINC(code, 0, len);
3565     PUT(tempcode, 1, len);
3566     break;
3567     }
3568 nigel 77 }
3569    
3570     /* In all cases we no longer have a previous item. We also set the
3571     "follows varying string" flag for subsequently encountered reqbytes if
3572     it isn't already set and we have just passed a varying length item. */
3573    
3574     END_REPEAT:
3575     previous = NULL;
3576     cd->req_varyopt |= reqvary;
3577     break;
3578    
3579    
3580 nigel 93 /* ===================================================================*/
3581     /* Start of nested parenthesized sub-expression, or comment or lookahead or
3582     lookbehind or option setting or condition or all the other extended
3583     parenthesis forms. First deal with the specials; all are introduced by ?,
3584     and the appearance of any of them means that this is not a capturing
3585     group. */
3586 nigel 77
3587     case '(':
3588     newoptions = options;
3589     skipbytes = 0;
3590 nigel 93 bravalue = OP_CBRA;
3591     save_hwm = cd->hwm;
3592 ph10 175 reset_bracount = FALSE;
3593 nigel 77
3594     if (*(++ptr) == '?')
3595     {
3596 nigel 93 int i, set, unset, namelen;
3597 nigel 77 int *optset;
3598 nigel 93 const uschar *name;
3599     uschar *slot;
3600 nigel 77
3601     switch (*(++ptr))
3602     {
3603     case '#': /* Comment; skip to ket */
3604     ptr++;
3605 nigel 93 while (*ptr != 0 && *ptr != ')') ptr++;
3606     if (*ptr == 0)
3607     {
3608     *errorcodeptr = ERR18;
3609     goto FAILED;
3610     }
3611 nigel 77 continue;
3612    
3613 nigel 93
3614     /* ------------------------------------------------------------ */
3615 ph10 175 case '|': /* Reset capture count for each branch */
3616     reset_bracount = TRUE;
3617     /* Fall through */
3618    
3619     /* ------------------------------------------------------------ */
3620 nigel 93 case ':': /* Non-capturing bracket */
3621 nigel 77 bravalue = OP_BRA;
3622     ptr++;
3623     break;
3624    
3625 nigel 93
3626     /* ------------------------------------------------------------ */
3627 nigel 77 case '(':
3628     bravalue = OP_COND; /* Conditional group */
3629    
3630 nigel 93 /* A condition can be an assertion, a number (referring to a numbered
3631     group), a name (referring to a named group), or 'R', referring to
3632     recursion. R<digits> and R&name are also permitted for recursion tests.
3633 nigel 77
3634 nigel 93 There are several syntaxes for testing a named group: (?(name)) is used
3635     by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3636    
3637     There are two unfortunate ambiguities, caused by history. (a) 'R' can
3638     be the recursive thing or the name 'R' (and similarly for 'R' followed
3639     by digits), and (b) a number could be a name that consists of digits.
3640     In both cases, we look for a name first; if not found, we try the other
3641     cases. */
3642    
3643     /* For conditions that are assertions, check the syntax, and then exit
3644     the switch. This will take control down to where bracketed groups,
3645     including assertions, are processed. */
3646    
3647     if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3648     break;
3649    
3650     /* Most other conditions use OP_CREF (a couple change to OP_RREF
3651     below), and all need to skip 3 bytes at the start of the group. */
3652    
3653     code[1+LINK_SIZE] = OP_CREF;
3654     skipbytes = 3;
3655 ph10 172 refsign = -1;
3656 nigel 93
3657     /* Check for a test for recursion in a named group. */
3658    
3659     if (ptr[1] == 'R' && ptr[2] == '&')
3660 nigel 77 {
3661 nigel 93 terminator = -1;
3662     ptr += 2;
3663     code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
3664     }
3665 nigel 91
3666 nigel 93 /* Check for a test for a named group's having been set, using the Perl
3667     syntax (?(<name>) or (?('name') */
3668 nigel 91
3669 nigel 93 else if (ptr[1] == '<')
3670     {
3671     terminator = '>';
3672     ptr++;
3673     }
3674     else if (ptr[1] == '\'')
3675     {
3676     terminator = '\'';
3677     ptr++;
3678     }
3679 ph10 172 else
3680 ph10 167 {
3681     terminator = 0;
3682 ph10 172 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
3683     }
3684 nigel 77
3685 nigel 93 /* We now expect to read a name; anything else is an error */
3686 nigel 77
3687 nigel 93 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
3688     {
3689     ptr += 1; /* To get the right offset */
3690     *errorcodeptr = ERR28;
3691     goto FAILED;
3692     }
3693    
3694     /* Read the name, but also get it as a number if it's all digits */
3695    
3696     recno = 0;
3697     name = ++ptr;
3698     while ((cd->ctypes[*ptr] & ctype_word) != 0)
3699     {
3700     if (recno >= 0)
3701     recno = ((digitab[*ptr] & ctype_digit) != 0)?
3702     recno * 10 + *ptr - '0' : -1;
3703 nigel 91 ptr++;
3704 nigel 93 }
3705     namelen = ptr - name;
3706 nigel 91
3707 nigel 93 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
3708     {
3709     ptr--; /* Error offset */
3710     *errorcodeptr = ERR26;
3711     goto FAILED;
3712     }
3713 nigel 91
3714 nigel 93 /* Do no further checking in the pre-compile phase. */
3715 nigel 91
3716 nigel 93 if (lengthptr != NULL) break;
3717 nigel 91
3718 nigel 93 /* In the real compile we do the work of looking for the actual
3719 ph10 167 reference. If the string started with "+" or "-" we require the rest to
3720     be digits, in which case recno will be set. */
3721 ph10 172
3722 ph10 167 if (refsign > 0)
3723     {
3724     if (recno <= 0)
3725     {
3726     *errorcodeptr = ERR58;
3727     goto FAILED;
3728 ph10 172 }
3729 ph10 167 if (refsign == '-')
3730     {
3731 ph10 172 recno = cd->bracount - recno + 1;
3732 ph10 167 if (recno <= 0)
3733     {
3734     *errorcodeptr = ERR15;
3735     goto FAILED;
3736 ph10 172 }
3737 ph10 167 }
3738 ph10 172 else recno += cd->bracount;
3739 ph10 167 PUT2(code, 2+LINK_SIZE, recno);
3740     break;
3741 ph10 172 }
3742 nigel 91
3743 ph10 167 /* Otherwise (did not start with "+" or "-"), start by looking for the
3744     name. */
3745 ph10 172
3746 nigel 93 slot = cd->name_table;
3747     for (i = 0; i < cd->names_found; i++)
3748     {
3749     if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3750     slot += cd->name_entry_size;
3751     }
3752 nigel 91
3753 nigel 93 /* Found a previous named subpattern */
3754 nigel 91
3755 nigel 93 if (i < cd->names_found)
3756     {
3757     recno = GET2(slot, 0);
3758     PUT2(code, 2+LINK_SIZE, recno);
3759     }
3760 nigel 91
3761 nigel 93 /* Search the pattern for a forward reference */
3762 nigel 91
3763 nigel 93 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
3764     (options & PCRE_EXTENDED) != 0)) > 0)
3765     {
3766     PUT2(code, 2+LINK_SIZE, i);
3767     }
3768 nigel 91
3769 nigel 93 /* If terminator == 0 it means that the name followed directly after
3770     the opening parenthesis [e.g. (?(abc)...] and in this case there are
3771     some further alternatives to try. For the cases where terminator != 0
3772     [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
3773     now checked all the possibilities, so give an error. */
3774 nigel 91
3775 nigel 93 else if (terminator != 0)
3776     {
3777     *errorcodeptr = ERR15;
3778     goto FAILED;
3779     }
3780    
3781     /* Check for (?(R) for recursion. Allow digits after R to specify a
3782     specific group number. */
3783    
3784     else if (*name == 'R')
3785     {
3786     recno = 0;
3787     for (i = 1; i < namelen; i++)
3788 nigel 91 {
3789 nigel 93 if ((digitab[name[i]] & ctype_digit) == 0)
3790     {
3791     *errorcodeptr = ERR15;
3792     goto FAILED;
3793     }
3794     recno = recno * 10 + name[i] - '0';
3795 nigel 77 }
3796 nigel 93 if (recno == 0) recno = RREF_ANY;
3797     code[1+LINK_SIZE] = OP_RREF; /* Change test type */
3798     PUT2(code, 2+LINK_SIZE, recno);
3799 nigel 77 }
3800 nigel 91
3801 nigel 93 /* Similarly, check for the (?(DEFINE) "condition", which is always
3802     false. */
3803 nigel 91
3804 nigel 93 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
3805     {
3806     code[1+LINK_SIZE] = OP_DEF;
3807     skipbytes = 1;
3808     }
3809    
3810     /* Check for the "name" actually being a subpattern number. */
3811    
3812     else if (recno > 0)
3813     {
3814     PUT2(code, 2+LINK_SIZE, recno);
3815     }
3816    
3817     /* Either an unidentified subpattern, or a reference to (?(0) */
3818    
3819     else
3820     {
3821     *errorcodeptr = (recno == 0)? ERR35: ERR15;
3822     goto FAILED;
3823     }
3824 nigel 77 break;
3825    
3826 nigel 93
3827     /* ------------------------------------------------------------ */
3828 nigel 77 case '=': /* Positive lookahead */
3829     bravalue = OP_ASSERT;
3830     ptr++;
3831     break;
3832    
3833 nigel 93
3834     /* ------------------------------------------------------------ */
3835 nigel 77 case '!': /* Negative lookahead */
3836     bravalue = OP_ASSERT_NOT;
3837     ptr++;
3838     break;
3839    
3840 nigel 93
3841     /* ------------------------------------------------------------ */
3842     case '<': /* Lookbehind or named define */
3843     switch (ptr[1])
3844 nigel 77 {
3845     case '=': /* Positive lookbehind */
3846     bravalue = OP_ASSERTBACK;
3847 nigel 93 ptr += 2;
3848 nigel 77 break;
3849    
3850     case '!': /* Negative lookbehind */
3851     bravalue = OP_ASSERTBACK_NOT;
3852 nigel 93 ptr += 2;
3853 nigel 77 break;
3854 nigel 93
3855     default: /* Could be name define, else bad */
3856     if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
3857     ptr++; /* Correct offset for error */
3858     *errorcodeptr = ERR24;
3859     goto FAILED;
3860 nigel 77 }
3861     break;
3862    
3863 nigel 93
3864     /* ------------------------------------------------------------ */
3865 nigel 77 case '>': /* One-time brackets */
3866     bravalue = OP_ONCE;
3867     ptr++;
3868     break;
3869    
3870 nigel 93
3871     /* ------------------------------------------------------------ */
3872 nigel 77 case 'C': /* Callout - may be followed by digits; */
3873     previous_callout = code; /* Save for later completion */
3874     after_manual_callout = 1; /* Skip one item before completing */
3875 nigel 93 *code++ = OP_CALLOUT;
3876     {
3877 nigel 77 int n = 0;
3878     while ((digitab[*(++ptr)] & ctype_digit) != 0)
3879     n = n * 10 + *ptr - '0';
3880 nigel 93 if (*ptr != ')')
3881     {
3882     *errorcodeptr = ERR39;
3883     goto FAILED;
3884     }
3885 nigel 77 if (n > 255)
3886     {
3887     *errorcodeptr = ERR38;
3888     goto FAILED;
3889     }
3890     *code++ = n;
3891     PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
3892     PUT(code, LINK_SIZE, 0); /* Default length */
3893     code += 2 * LINK_SIZE;
3894     }
3895     previous = NULL;
3896     continue;
3897    
3898 nigel 93
3899     /* ------------------------------------------------------------ */
3900     case 'P': /* Python-style named subpattern handling */
3901     if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
3902 nigel 77 {
3903 nigel 93 is_recurse = *ptr == '>';
3904     terminator = ')';
3905     goto NAMED_REF_OR_RECURSE;
3906     }
3907     else if (*ptr != '<') /* Test for Python-style definition */
3908     {
3909     *errorcodeptr = ERR41;
3910     goto FAILED;
3911     }
3912     /* Fall through to handle (?P< as (?< is handled */
3913 nigel 77
3914    
3915 nigel 93 /* ------------------------------------------------------------ */
3916     DEFINE_NAME: /* Come here from (?< handling */
3917     case '\'':
3918     {
3919     terminator = (*ptr == '<')? '>' : '\'';
3920     name = ++ptr;
3921    
3922     while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3923     namelen = ptr - name;
3924    
3925     /* In the pre-compile phase, just do a syntax check. */
3926    
3927     if (lengthptr != NULL)
3928 nigel 77 {
3929 nigel 93 if (*ptr != terminator)
3930 nigel 77 {
3931 nigel 93 *errorcodeptr = ERR42;
3932     goto FAILED;
3933     }
3934     if (cd->names_found >= MAX_NAME_COUNT)
3935     {
3936     *errorcodeptr = ERR49;
3937     goto FAILED;
3938     }
3939     if (namelen + 3 > cd->name_entry_size)
3940     {
3941     cd->name_entry_size = namelen + 3;
3942     if (namelen > MAX_NAME_SIZE)
3943 nigel 77 {
3944 nigel 93 *errorcodeptr = ERR48;
3945     goto FAILED;
3946     }
3947     }
3948     }
3949    
3950     /* In the real compile, create the entry in the table */
3951    
3952     else
3953     {
3954     slot = cd->name_table;
3955     for (i = 0; i < cd->names_found; i++)
3956     {
3957     int crc = memcmp(name, slot+2, namelen);
3958     if (crc == 0)
3959     {
3960     if (slot[2+namelen] == 0)
3961 nigel 91 {
3962 nigel 93 if ((options & PCRE_DUPNAMES) == 0)
3963     {
3964     *errorcodeptr = ERR43;
3965     goto FAILED;
3966     }
3967 nigel 91 }
3968 nigel 93 else crc = -1; /* Current name is substring */
3969 nigel 77 }
3970 nigel 93 if (crc < 0)
3971     {
3972     memmove(slot + cd->name_entry_size, slot,
3973     (cd->names_found - i) * cd->name_entry_size);
3974     break;
3975     }
3976     slot += cd->name_entry_size;
3977 nigel 77 }
3978 nigel 93
3979     PUT2(slot, 0, cd->bracount + 1);
3980     memcpy(slot + 2, name, namelen);
3981     slot[2+namelen] = 0;
3982 nigel 77 }
3983     }
3984    
3985 nigel 93 /* In both cases, count the number of names we've encountered. */
3986    
3987     ptr++; /* Move past > or ' */
3988     cd->names_found++;
3989     goto NUMBERED_GROUP;
3990    
3991    
3992     /* ------------------------------------------------------------ */
3993     case '&': /* Perl recursion/subroutine syntax */
3994     terminator = ')';
3995     is_recurse = TRUE;
3996     /* Fall through */
3997    
3998     /* We come here from the Python syntax above that handles both
3999     references (?P=name) and recursion (?P>name), as well as falling
4000     through from the Perl recursion syntax (?&name). */
4001    
4002     NAMED_REF_OR_RECURSE:
4003     name = ++ptr;
4004     while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4005     namelen = ptr - name;
4006    
4007     /* In the pre-compile phase, do a syntax check and set a dummy
4008     reference number. */
4009    
4010     if (lengthptr != NULL)
4011 nigel 77 {
4012 nigel 93 if (*ptr != terminator)
4013     {
4014     *errorcodeptr = ERR42;
4015     goto FAILED;
4016     }
4017     if (namelen > MAX_NAME_SIZE)
4018     {
4019     *errorcodeptr = ERR48;
4020     goto FAILED;
4021     }
4022     recno = 0;
4023     }
4024 nigel 77
4025 nigel 93 /* In the real compile, seek the name in the table */
4026 nigel 77
4027 nigel 93 else
4028     {
4029     slot = cd->name_table;
4030 nigel 77 for (i = 0; i < cd->names_found; i++)
4031     {
4032     if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4033     slot += cd->name_entry_size;
4034     }
4035 nigel 91
4036     if (i < cd->names_found) /* Back reference */
4037 nigel 77 {
4038 nigel 91 recno = GET2(slot, 0);
4039     }
4040     else if ((recno = /* Forward back reference */
4041 nigel 93 find_parens(ptr, cd->bracount, name, namelen,
4042     (options & PCRE_EXTENDED) != 0)) <= 0)
4043 nigel 91 {
4044 nigel 77 *errorcodeptr = ERR15;
4045     goto FAILED;
4046     }
4047 nigel 93 }
4048 nigel 77
4049 nigel 93 /* In both phases, we can now go to the code that handles numerical
4050     recursion or backreferences. */
4051 nigel 77
4052 nigel 93 if (is_recurse) goto HANDLE_RECURSION;
4053     else goto HANDLE_REFERENCE;
4054 nigel 77
4055    
4056 nigel 93 /* ------------------------------------------------------------ */
4057     case 'R': /* Recursion */
4058 nigel 77 ptr++; /* Same as (?0) */
4059     /* Fall through */
4060    
4061    
4062 nigel 93 /* ------------------------------------------------------------ */
4063 ph10 166 case '-': case '+':
4064 nigel 93 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4065     case '5': case '6': case '7': case '8': case '9': /* subroutine */
4066 nigel 77 {
4067     const uschar *called;
4068 ph10 166
4069 ph10 167 if ((refsign = *ptr) == '+') ptr++;
4070 ph10 172 else if (refsign == '-')
4071 ph10 166 {
4072     if ((digitab[ptr[1]] & ctype_digit) == 0)
4073     goto OTHER_CHAR_AFTER_QUERY;
4074 ph10 172 ptr++;
4075     }
4076    
4077 nigel 77 recno = 0;
4078     while((digitab[*ptr] & ctype_digit) != 0)
4079     recno = recno * 10 + *ptr++ - '0';
4080 ph10 166
4081 nigel 93 if (*ptr != ')')
4082     {
4083     *errorcodeptr = ERR29;
4084     goto FAILED;
4085     }
4086 ph10 172
4087 ph10 167 if (refsign == '-')
4088 ph10 166 {
4089     if (recno == 0)
4090     {
4091     *errorcodeptr = ERR58;
4092     goto FAILED;
4093 ph10 172 }
4094     recno = cd->bracount - recno + 1;
4095 ph10 166 if (recno <= 0)
4096     {
4097     *errorcodeptr = ERR15;
4098     goto FAILED;
4099 ph10 172 }
4100 ph10 166 }
4101 ph10 167 else if (refsign == '+')
4102 ph10 166 {
4103     if (recno == 0)
4104     {
4105     *errorcodeptr = ERR58;
4106     goto FAILED;
4107 ph10 172 }
4108     recno += cd->bracount;
4109     }
4110 nigel 77
4111     /* Come here from code above that handles a named recursion */
4112    
4113     HANDLE_RECURSION:
4114    
4115     previous = code;
4116 nigel 93 called = cd->start_code;
4117 nigel 77
4118 nigel 93 /* When we are actually compiling, find the bracket that is being
4119     referenced. Temporarily end the regex in case it doesn't exist before
4120     this point. If we end up with a forward reference, first check that
4121     the bracket does occur later so we can give the error (and position)
4122     now. Then remember this forward reference in the workspace so it can
4123     be filled in at the end. */
4124 nigel 77
4125 nigel 93 if (lengthptr == NULL)
4126 nigel 77 {
4127 nigel 93 *code = OP_END;
4128     if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4129 nigel 77
4130 nigel 93 /* Forward reference */
4131 nigel 77
4132 nigel 93 if (called == NULL)
4133     {
4134     if (find_parens(ptr, cd->bracount, NULL, recno,
4135     (options & PCRE_EXTENDED) != 0) < 0)
4136     {
4137     *errorcodeptr = ERR15;
4138     goto FAILED;
4139     }
4140     called = cd->start_code + recno;
4141     PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4142     }
4143    
4144     /* If not a forward reference, and the subpattern is still open,
4145     this is a recursive call. We check to see if this is a left
4146     recursion that could loop for ever, and diagnose that case. */
4147    
4148     else if (GET(called, 1) == 0 &&
4149     could_be_empty(called, code, bcptr, utf8))
4150     {
4151     *errorcodeptr = ERR40;
4152     goto FAILED;
4153     }
4154 nigel 77 }