/[pcre]/code/tags/pcre-7.7/pcre_compile.c
ViewVC logotype

Contents of /code/tags/pcre-7.7/pcre_compile.c

Parent Directory | Revision Log


Revision 172 - (hide annotations) (download)
Tue Jun 5 10:40:13 2007 UTC (7 years, 1 month ago) by ph10
Original Path: code/trunk/pcre_compile.c
File MIME type: text/plain
File size: 175665 byte(s)
Drastically reduce workspace used for alternatives in groups; also some 
trailing space removals for a test release.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 117 Copyright (c) 1997-2007 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 nigel 93 #define NLBLOCK cd /* Block containing newline information */
46     #define PSSTART start_pattern /* Field containing processed string start */
47     #define PSEND end_pattern /* Field containing processed string end */
48    
49    
50 nigel 77 #include "pcre_internal.h"
51    
52    
53 nigel 85 /* When DEBUG is defined, we need the pcre_printint() function, which is also
54     used by pcretest. DEBUG is not defined when building a production library. */
55    
56     #ifdef DEBUG
57     #include "pcre_printint.src"
58     #endif
59    
60    
61 nigel 77 /*************************************************
62     * Code parameters and static tables *
63     *************************************************/
64    
65 nigel 93 /* This value specifies the size of stack workspace that is used during the
66     first pre-compile phase that determines how much memory is required. The regex
67     is partly compiled into this space, but the compiled parts are discarded as
68     soon as they can be, so that hopefully there will never be an overrun. The code
69     does, however, check for an overrun. The largest amount I've seen used is 218,
70     so this number is very generous.
71 nigel 77
72 nigel 93 The same workspace is used during the second, actual compile phase for
73     remembering forward references to groups so that they can be filled in at the
74     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
75     is 4 there is plenty of room. */
76 nigel 77
77 nigel 93 #define COMPILE_WORK_SIZE (4096)
78 nigel 77
79 nigel 93
80 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
81     are simple data values; negative values are for special things like \d and so
82     on. Zero means further processing is needed (for things like \x), or the escape
83     is invalid. */
84    
85 ph10 97 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
86 nigel 77 static const short int escapes[] = {
87     0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
88     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
89     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
90 ph10 168 0, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
91 nigel 93 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
92 nigel 77 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
93     '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
94 nigel 93 0, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
95 nigel 77 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
96     0, 0, -ESC_z /* x - z */
97     };
98    
99 ph10 97 #else /* This is the "abnormal" table for EBCDIC systems */
100 nigel 77 static const short int escapes[] = {
101     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
102     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
103     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
104     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
105     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
106     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
107     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
108     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
109     /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
110 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
111 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
112     /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
113     /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
114     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
115     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
116     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
117     /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
118     /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
119 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
120 nigel 77 /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
121     /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
122     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
123     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
124     };
125     #endif
126    
127    
/* Names of the POSIX character classes. A parallel table, posix_name_lengths,
holds the length of each name and is terminated by a zero length entry. The
first three names must be alpha, lower, upper, as this is assumed for handling
case independence. */

static const char *const posix_names[] = {
  "alpha",    /*  0 */
  "lower",    /*  1 */
  "upper",    /*  2 */
  "alnum",    /*  3 */
  "ascii",    /*  4 */
  "blank",    /*  5 */
  "cntrl",    /*  6 */
  "digit",    /*  7 */
  "graph",    /*  8 */
  "print",    /*  9 */
  "punct",    /* 10 */
  "space",    /* 11 */
  "word",     /* 12 */
  "xdigit"    /* 13 */
};
136    
137     static const uschar posix_name_lengths[] = {
138     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
139    
140 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
141     base map, with an optional addition or removal of another map. Then, for some
142     classes, there is some additional tweaking: for [:blank:] the vertical space
143     characters are removed, and for [:alpha:] and [:alnum:] the underscore
144     character is removed. The triples in the table consist of the base map offset,
145     second map offset or -1 if no second map, and a non-negative value for map
146     addition or a negative value for map subtraction (if there are two maps). The
147     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
148     remove vertical space characters, 2 => remove underscore. */
149 nigel 77
150     static const int posix_class_maps[] = {
151 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
152     cbit_lower, -1, 0, /* lower */
153     cbit_upper, -1, 0, /* upper */
154     cbit_word, -1, 2, /* alnum - word without underscore */
155     cbit_print, cbit_cntrl, 0, /* ascii */
156     cbit_space, -1, 1, /* blank - a GNU extension */
157     cbit_cntrl, -1, 0, /* cntrl */
158     cbit_digit, -1, 0, /* digit */
159     cbit_graph, -1, 0, /* graph */
160     cbit_print, -1, 0, /* print */
161     cbit_punct, -1, 0, /* punct */
162     cbit_space, -1, 0, /* space */
163     cbit_word, -1, 0, /* word - a Perl extension */
164     cbit_xdigit,-1, 0 /* xdigit */
165 nigel 77 };
166    
167    
/* Two-level stringification, so that a macro argument is expanded before it is
turned into a string literal. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* The texts of compile-time error messages, indexed by error number. These
are "char *" because they are passed to the outside world. Do not ever re-use
any error number, because they are documented. Always add a new error instead.
Messages marked DEAD below are no longer used. Both the strings and the table
of pointers are const: nothing should ever modify them. */

static const char *const error_texts[] = {
  /* 0 */
  "no error",
  "\\ at end of pattern",
  "\\c at end of pattern",
  "unrecognized character follows \\",
  "numbers out of order in {} quantifier",
  /* 5 */
  "number too big in {} quantifier",
  "missing terminating ] for character class",
  "invalid escape sequence in character class",
  "range out of order in character class",
  "nothing to repeat",
  /* 10 */
  "operand of unlimited repeat could match the empty string",  /** DEAD **/
  "internal error: unexpected repeat",
  "unrecognized character after (?",
  "POSIX named classes are supported only within a class",
  "missing )",
  /* 15 */
  "reference to non-existent subpattern",
  "erroffset passed as NULL",
  "unknown option bit(s) set",
  "missing ) after comment",
  "parentheses nested too deeply",  /** DEAD **/
  /* 20 */
  "regular expression too large",
  "failed to get memory",
  "unmatched parentheses",
  "internal error: code overflow",
  "unrecognized character after (?<",
  /* 25 */
  "lookbehind assertion is not fixed length",
  "malformed number or name after (?(",
  "conditional group contains more than two branches",
  "assertion expected after (?(",
  "(?R or (?[+-]digits must be followed by )",
  /* 30 */
  "unknown POSIX class name",
  "POSIX collating elements are not supported",
  "this version of PCRE is not compiled with PCRE_UTF8 support",
  "spare error",  /** DEAD **/
  "character value in \\x{...} sequence is too large",
  /* 35 */
  "invalid condition (?(0)",
  "\\C not allowed in lookbehind assertion",
  "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
  "number after (?C is > 255",
  "closing ) for (?C expected",
  /* 40 */
  "recursive call could loop indefinitely",
  "unrecognized character after (?P",
  "syntax error in subpattern name (missing terminator)",
  "two named subpatterns have the same name",
  "invalid UTF-8 string",
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled",
  "malformed \\P or \\p sequence",
  "unknown property name after \\P or \\p",
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
  /* 50 */
  "repeated subpattern is too long",
  "octal value is greater than \\377 (not in UTF-8 mode)",
  "internal error: overran compiling workspace",
  "internal error: previously-checked referenced subpattern not found",
  "DEFINE group contains more than one branch",
  /* 55 */
  "repeating a DEFINE group is not allowed",
  "inconsistent NEWLINE options",
  "\\g is not followed by a braced name or an optionally braced non-zero number",
  "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number"
};
248    
249    
/* Table to identify digits and hex digits. This is used when compiling
patterns. The tables in chartables depend on the locale and may mark arbitrary
characters as digits, but the compiling code expects to handle only 0-9, a-z,
and A-Z as digits. That is why there is a private table here. It costs 256
bytes, but it is a lot faster than doing character value tests (at least in
some simple cases that were timed), and some applications want PCRE to compile
efficiently as well as match efficiently.

For convenience, the bit definitions are the same as in chartables:

  0x04   decimal digit
  0x08   hexadecimal digit

so that ctype_digit and ctype_xdigit can be used in the code. Each row below
holds sixteen entries; the comment at the start of a row is the hexadecimal
value of its first character. */

#ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
static const unsigned char digitab[] =
  {
  /* 00 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 10 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 20 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 30 */ 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,  /* 0 - 9 */
  /* 40 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  /* A - F */
  /* 50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 60 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  /* a - f */
  /* 70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 80 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 90 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* A0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* B0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* C0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* D0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* E0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* F0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
  };

#else           /* This is the "abnormal" case, for EBCDIC systems */
static const unsigned char digitab[] =
  {
  /* 00 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 10 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 20 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 30 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 40 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 60 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 80 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  /* a - f */
  /* 90 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* A0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* B0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* C0 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  /* A - F */
  /* D0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* E0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* F0 */ 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00   /* 0 - 9 */
  };

/* Partial duplicate of the chartable for EBCDIC. check_escape() masks an
entry with 0x0E to decide whether a character is alphameric. */

static const unsigned char ebcdic_chartab[] =
  {
  /* 00 */ 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00,
  /* 10 */ 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 20 */ 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 30 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 40 */ 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80,
  /* 50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00,
  /* 60 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80,
  /* 70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 80 */ 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 90 */ 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* A0 */ 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* B0 */ 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00,
  /* C0 */ 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* D0 */ 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* E0 */ 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* F0 */ 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00
  };
#endif
372    
373    
374     /* Definition to allow mutual recursion */
375    
376     static BOOL
377 nigel 93 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, int, int *,
378     int *, branch_chain *, compile_data *, int *);
379 nigel 77
380    
381    
382     /*************************************************
383     * Handle escapes *
384     *************************************************/
385    
386     /* This function is called when a \ has been encountered. It either returns a
387     positive value for a simple escape such as \n, or a negative value which
388 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
389     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
390     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
391     ptr is pointing at the \. On exit, it is on the final character of the escape
392     sequence.
393 nigel 77
394     Arguments:
395     ptrptr points to the pattern position pointer
396     errorcodeptr points to the errorcode variable
397     bracount number of previous extracting brackets
398     options the options bits
399     isclass TRUE if inside a character class
400    
401     Returns: zero or positive => a data character
402     negative => a special escape sequence
403     on error, errorptr is set
404     */
405    
406     static int
407     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
408     int options, BOOL isclass)
409     {
410 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
411     const uschar *ptr = *ptrptr + 1;
412 nigel 77 int c, i;
413    
414 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
415     ptr--; /* Set pointer back to the last byte */
416    
417 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
418    
419     if (c == 0) *errorcodeptr = ERR1;
420    
421     /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
422     a table. A non-zero result is something that can be returned immediately.
423     Otherwise further processing may be required. */
424    
425 ph10 97 #ifndef EBCDIC /* ASCII coding */
426 nigel 77 else if (c < '0' || c > 'z') {} /* Not alphameric */
427     else if ((i = escapes[c - '0']) != 0) c = i;
428    
429 ph10 97 #else /* EBCDIC coding */
430 nigel 77 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
431     else if ((i = escapes[c - 0x48]) != 0) c = i;
432     #endif
433    
434     /* Escapes that need further processing, or are illegal. */
435    
436     else
437     {
438     const uschar *oldptr;
439 nigel 93 BOOL braced, negated;
440    
441 nigel 77 switch (c)
442     {
443     /* A number of Perl escapes are not handled by PCRE. We give an explicit
444     error. */
445    
446     case 'l':
447     case 'L':
448     case 'N':
449     case 'u':
450     case 'U':
451     *errorcodeptr = ERR37;
452     break;
453    
454 nigel 93 /* \g must be followed by a number, either plain or braced. If positive, it
455     is an absolute backreference. If negative, it is a relative backreference.
456 ph10 172 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
457     reference to a named group. This is part of Perl's movement towards a
458     unified syntax for back references. As this is synonymous with \k{name}, we
459 ph10 171 fudge it up by pretending it really was \k. */
460 nigel 93
461     case 'g':
462     if (ptr[1] == '{')
463     {
464 ph10 171 const uschar *p;
465     for (p = ptr+2; *p != 0 && *p != '}'; p++)
466     if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
467 ph10 172 if (*p != 0 && *p != '}')
468 ph10 171 {
469     c = -ESC_k;
470     break;
471 ph10 172 }
472 nigel 93 braced = TRUE;
473     ptr++;
474     }
475     else braced = FALSE;
476    
477     if (ptr[1] == '-')
478     {
479     negated = TRUE;
480     ptr++;
481     }
482     else negated = FALSE;
483    
484     c = 0;
485     while ((digitab[ptr[1]] & ctype_digit) != 0)
486     c = c * 10 + *(++ptr) - '0';
487    
488     if (c == 0 || (braced && *(++ptr) != '}'))
489     {
490     *errorcodeptr = ERR57;
491     return 0;
492     }
493    
494     if (negated)
495     {
496     if (c > bracount)
497     {
498     *errorcodeptr = ERR15;
499     return 0;
500     }
501     c = bracount - (c - 1);
502     }
503    
504     c = -(ESC_REF + c);
505     break;
506    
507 nigel 77 /* The handling of escape sequences consisting of a string of digits
508     starting with one that is not zero is not straightforward. By experiment,
509     the way Perl works seems to be as follows:
510    
511     Outside a character class, the digits are read as a decimal number. If the
512     number is less than 10, or if there are that many previous extracting
513     left brackets, then it is a back reference. Otherwise, up to three octal
514     digits are read to form an escaped byte. Thus \123 is likely to be octal
515     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
516     value is greater than 377, the least significant 8 bits are taken. Inside a
517     character class, \ followed by a digit is always an octal number. */
518    
519     case '1': case '2': case '3': case '4': case '5':
520     case '6': case '7': case '8': case '9':
521    
522     if (!isclass)
523     {
524     oldptr = ptr;
525     c -= '0';
526     while ((digitab[ptr[1]] & ctype_digit) != 0)
527     c = c * 10 + *(++ptr) - '0';
528     if (c < 10 || c <= bracount)
529     {
530     c = -(ESC_REF + c);
531     break;
532     }
533     ptr = oldptr; /* Put the pointer back and fall through */
534     }
535    
536     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
537     generates a binary zero byte and treats the digit as a following literal.
538     Thus we have to pull back the pointer by one. */
539    
540     if ((c = *ptr) >= '8')
541     {
542     ptr--;
543     c = 0;
544     break;
545     }
546    
547     /* \0 always starts an octal number, but we may drop through to here with a
548 nigel 91 larger first octal digit. The original code used just to take the least
549     significant 8 bits of octal numbers (I think this is what early Perls used
550     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
551     than 3 octal digits. */
552 nigel 77
553     case '0':
554     c -= '0';
555     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
556     c = c * 8 + *(++ptr) - '0';
557 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
558 nigel 77 break;
559    
560 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
561     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
562     treated as a data character. */
563 nigel 77
564     case 'x':
565 nigel 87 if (ptr[1] == '{')
566 nigel 77 {
567     const uschar *pt = ptr + 2;
568 nigel 87 int count = 0;
569    
570 nigel 77 c = 0;
571     while ((digitab[*pt] & ctype_xdigit) != 0)
572     {
573 nigel 87 register int cc = *pt++;
574     if (c == 0 && cc == '0') continue; /* Leading zeroes */
575 nigel 77 count++;
576 nigel 87
577 ph10 97 #ifndef EBCDIC /* ASCII coding */
578 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
579 nigel 87 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
580 ph10 97 #else /* EBCDIC coding */
581 nigel 77 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
582 nigel 87 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
583 nigel 77 #endif
584     }
585 nigel 87
586 nigel 77 if (*pt == '}')
587     {
588 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
589 nigel 77 ptr = pt;
590     break;
591     }
592 nigel 87
593 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
594     recognize this construct; fall through to the normal \x handling. */
595     }
596    
597 nigel 87 /* Read just a single-byte hex-defined char */
598 nigel 77
599     c = 0;
600     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
601     {
602     int cc; /* Some compilers don't like ++ */
603     cc = *(++ptr); /* in initializers */
604 ph10 97 #ifndef EBCDIC /* ASCII coding */
605 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
606     c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
607 ph10 97 #else /* EBCDIC coding */
608 nigel 77 if (cc <= 'z') cc += 64; /* Convert to upper case */
609     c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
610     #endif
611     }
612     break;
613    
614 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
615     This coding is ASCII-specific, but then the whole concept of \cx is
616     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
617 nigel 77
618     case 'c':
619     c = *(++ptr);
620     if (c == 0)
621     {
622     *errorcodeptr = ERR2;
623     return 0;
624     }
625    
626 ph10 97 #ifndef EBCDIC /* ASCII coding */
627 nigel 77 if (c >= 'a' && c <= 'z') c -= 32;
628     c ^= 0x40;
629 ph10 97 #else /* EBCDIC coding */
630 nigel 77 if (c >= 'a' && c <= 'z') c += 64;
631     c ^= 0xC0;
632     #endif
633     break;
634    
635     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
636     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
637     for Perl compatibility, it is a literal. This code looks a bit odd, but
638     there used to be some cases other than the default, and there may be again
639     in future, so I haven't "optimized" it. */
640    
641     default:
642     if ((options & PCRE_EXTRA) != 0) switch(c)
643     {
644     default:
645     *errorcodeptr = ERR3;
646     break;
647     }
648     break;
649     }
650     }
651    
652     *ptrptr = ptr;
653     return c;
654     }
655    
656    
657    
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* This function is called after \P or \p has been encountered, provided that
PCRE is compiled with support for Unicode properties. On entry, ptrptr is
pointing at the P or p. On exit, it is pointing at the final character of the
escape sequence. The property name is either the single character that follows,
or a name in {}, optionally preceded by ^ for negation. A braced name must fit
in a 32-byte buffer including its terminating zero; a longer one is treated as
malformed.

Arguments:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE;
                   it is given a value on every path, including errors
  dptr           points to an int that is set to the detailed property value;
                   it is written only when a name is recognized
  errorcodeptr   points to the error code variable; set to ERR46 for a
                   malformed sequence and ERR47 for an unknown property name

Returns:         type value from ucp_type_table, or -1 for an invalid type
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
int c, i, bot, top;
const uschar *ptr = *ptrptr;
char name[32];

*negptr = FALSE;

c = *(++ptr);
if (c == 0) goto ERROR_RETURN;

/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
negation. */

if (c == '{')
  {
  if (ptr[1] == '^')
    {
    *negptr = TRUE;
    ptr++;
    }

  /* Copy the name, leaving room for the terminating zero. The cast keeps the
  comparison between two signed values. */

  for (i = 0; i < (int)sizeof(name) - 1; i++)
    {
    c = *(++ptr);
    if (c == 0) goto ERROR_RETURN;
    if (c == '}') break;
    name[i] = c;
    }
  if (c != '}') goto ERROR_RETURN;     /* Name too long for the buffer */
  name[i] = 0;
  }

/* Otherwise there is just one following character */

else
  {
  name[0] = c;
  name[1] = 0;
  }

*ptrptr = ptr;

/* Search for a recognized property name using binary chop */

bot = 0;
top = _pcre_utt_size;

while (bot < top)
  {
  i = (bot + top) >> 1;
  c = strcmp(name, _pcre_utt[i].name);
  if (c == 0)
    {
    *dptr = _pcre_utt[i].value;
    return _pcre_utt[i].type;
    }
  if (c > 0) bot = i + 1; else top = i;
  }

/* The name is well formed but not known; the pattern pointer has already been
updated above. */

*errorcodeptr = ERR47;
return -1;

ERROR_RETURN:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
747    
748    
749    
750    
751     /*************************************************
752     * Check for counted repeat *
753     *************************************************/
754    
755     /* This function is called when a '{' is encountered in a place where it might
756     start a quantifier. It looks ahead to see if it really is a quantifier or not.
757     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
758     where the ddds are digits.
759    
760     Arguments:
761     p pointer to the first char after '{'
762    
763     Returns: TRUE or FALSE
764     */
765    
766     static BOOL
767     is_counted_repeat(const uschar *p)
768     {
769     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
770     while ((digitab[*p] & ctype_digit) != 0) p++;
771     if (*p == '}') return TRUE;
772    
773     if (*p++ != ',') return FALSE;
774     if (*p == '}') return TRUE;
775    
776     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
777     while ((digitab[*p] & ctype_digit) != 0) p++;
778    
779     return (*p == '}');
780     }
781    
782    
783    
784     /*************************************************
785     * Read repeat counts *
786     *************************************************/
787    
788     /* Read an item of the form {n,m} and return the values. This is called only
789     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
790     so the syntax is guaranteed to be correct, but we need to check the values.
791    
792     Arguments:
793     p pointer to first char after '{'
794     minp pointer to int for min
795     maxp pointer to int for max
796     returned as -1 if no max
797     errorcodeptr points to error code variable
798    
799     Returns: pointer to '}' on success;
800     current ptr on error, with errorcodeptr set non-zero
801     */
802    
803     static const uschar *
804     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
805     {
806     int min = 0;
807     int max = -1;
808    
809 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
810     an integer overflow. */
811    
812 nigel 77 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
813 nigel 81 if (min < 0 || min > 65535)
814     {
815     *errorcodeptr = ERR5;
816     return p;
817     }
818 nigel 77
819 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
820     Also, max must not be less than min. */
821    
822 nigel 77 if (*p == '}') max = min; else
823     {
824     if (*(++p) != '}')
825     {
826     max = 0;
827     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
828 nigel 81 if (max < 0 || max > 65535)
829     {
830     *errorcodeptr = ERR5;
831     return p;
832     }
833 nigel 77 if (max < min)
834     {
835     *errorcodeptr = ERR4;
836     return p;
837     }
838     }
839     }
840    
841 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
842     '}'. */
843 nigel 77
844 nigel 81 *minp = min;
845     *maxp = max;
846 nigel 77 return p;
847     }
848    
849    
850    
851     /*************************************************
852 nigel 93 * Find forward referenced subpattern *
853 nigel 91 *************************************************/
854    
855 nigel 93 /* This function scans along a pattern's text looking for capturing
856     subpatterns, and counting them. If it finds a named pattern that matches the
857     name it is given, it returns its number. Alternatively, if the name is NULL, it
858     returns when it reaches a given numbered subpattern. This is used for forward
859     references to subpatterns. We know that if (?P< is encountered, the name will
860     be terminated by '>' because that is checked in the first pass.
861 nigel 91
862     Arguments:
863 nigel 93 ptr current position in the pattern
864     count current count of capturing parens so far encountered
865     name name to seek, or NULL if seeking a numbered subpattern
866     lorn name length, or subpattern number if name is NULL
867     xmode TRUE if we are in /x mode
868 nigel 91
869     Returns: the number of the named subpattern, or -1 if not found
870     */
871    
872     static int
873 nigel 93 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
874     BOOL xmode)
875 nigel 91 {
876     const uschar *thisname;
877 nigel 93
878 nigel 91 for (; *ptr != 0; ptr++)
879     {
880 nigel 93 int term;
881    
882     /* Skip over backslashed characters and also entire \Q...\E */
883    
884     if (*ptr == '\\')
885     {
886     if (*(++ptr) == 0) return -1;
887     if (*ptr == 'Q') for (;;)
888     {
889     while (*(++ptr) != 0 && *ptr != '\\');
890     if (*ptr == 0) return -1;
891     if (*(++ptr) == 'E') break;
892     }
893     continue;
894     }
895    
896     /* Skip over character classes */
897    
898     if (*ptr == '[')
899     {
900     while (*(++ptr) != ']')
901     {
902     if (*ptr == '\\')
903     {
904     if (*(++ptr) == 0) return -1;
905     if (*ptr == 'Q') for (;;)
906     {
907     while (*(++ptr) != 0 && *ptr != '\\');
908     if (*ptr == 0) return -1;
909     if (*(++ptr) == 'E') break;
910     }
911     continue;
912     }
913     }
914     continue;
915     }
916    
917     /* Skip comments in /x mode */
918    
919     if (xmode && *ptr == '#')
920     {
921     while (*(++ptr) != 0 && *ptr != '\n');
922     if (*ptr == 0) return -1;
923     continue;
924     }
925    
926     /* An opening parens must now be a real metacharacter */
927    
928 nigel 91 if (*ptr != '(') continue;
929 nigel 93 if (ptr[1] != '?')
930     {
931     count++;
932     if (name == NULL && count == lorn) return count;
933     continue;
934     }
935    
936     ptr += 2;
937     if (*ptr == 'P') ptr++; /* Allow optional P */
938    
939     /* We have to disambiguate (?<! and (?<= from (?<name> */
940    
941     if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
942     *ptr != '\'')
943     continue;
944    
945 nigel 91 count++;
946 nigel 93
947     if (name == NULL && count == lorn) return count;
948     term = *ptr++;
949     if (term == '<') term = '>';
950 nigel 91 thisname = ptr;
951 nigel 93 while (*ptr != term) ptr++;
952     if (name != NULL && lorn == ptr - thisname &&
953     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
954 nigel 91 return count;
955     }
956 nigel 93
957 nigel 91 return -1;
958     }
959    
960    
961    
962     /*************************************************
963 nigel 77 * Find first significant op code *
964     *************************************************/
965    
966     /* This is called by several functions that scan a compiled expression looking
967     for a fixed first character, or an anchoring op code etc. It skips over things
968     that do not influence this. For some calls, a change of option is important.
969     For some calls, it makes sense to skip negative forward and all backward
970     assertions, and also the \b assertion; for others it does not.
971    
972     Arguments:
973     code pointer to the start of the group
974     options pointer to external options
975     optbit the option bit whose changing is significant, or
976     zero if none are
977     skipassert TRUE if certain assertions are to be skipped
978    
979     Returns: pointer to the first significant opcode
980     */
981    
982     static const uschar*
983     first_significant_code(const uschar *code, int *options, int optbit,
984     BOOL skipassert)
985     {
986     for (;;)
987     {
988     switch ((int)*code)
989     {
990     case OP_OPT:
991     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
992     *options = (int)code[1];
993     code += 2;
994     break;
995    
996     case OP_ASSERT_NOT:
997     case OP_ASSERTBACK:
998     case OP_ASSERTBACK_NOT:
999     if (!skipassert) return code;
1000     do code += GET(code, 1); while (*code == OP_ALT);
1001     code += _pcre_OP_lengths[*code];
1002     break;
1003    
1004     case OP_WORD_BOUNDARY:
1005     case OP_NOT_WORD_BOUNDARY:
1006     if (!skipassert) return code;
1007     /* Fall through */
1008    
1009     case OP_CALLOUT:
1010     case OP_CREF:
1011 nigel 93 case OP_RREF:
1012     case OP_DEF:
1013 nigel 77 code += _pcre_OP_lengths[*code];
1014     break;
1015    
1016     default:
1017     return code;
1018     }
1019     }
1020     /* Control never reaches here */
1021     }
1022    
1023    
1024    
1025    
1026     /*************************************************
1027     * Find the fixed length of a pattern *
1028     *************************************************/
1029    
1030     /* Scan a pattern and compute the fixed length of subject that will match it,
1031     if the length is fixed. This is needed for dealing with backward assertions.
1032     In UTF8 mode, the result is in characters rather than bytes.
1033    
1034     Arguments:
1035     code points to the start of the pattern (the bracket)
1036     options the compiling options
1037    
1038     Returns: the fixed length, or -1 if there is no fixed length,
1039     or -2 if \C was encountered
1040     */
1041    
1042     static int
1043     find_fixedlength(uschar *code, int options)
1044     {
1045     int length = -1;
1046    
1047     register int branchlength = 0;
1048     register uschar *cc = code + 1 + LINK_SIZE;
1049    
1050     /* Scan along the opcodes for this branch. If we get to the end of the
1051     branch, check the length against that of the other branches. */
1052    
1053     for (;;)
1054     {
1055     int d;
1056     register int op = *cc;
1057    
1058     switch (op)
1059     {
1060 nigel 93 case OP_CBRA:
1061 nigel 77 case OP_BRA:
1062     case OP_ONCE:
1063     case OP_COND:
1064 nigel 93 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1065 nigel 77 if (d < 0) return d;
1066     branchlength += d;
1067     do cc += GET(cc, 1); while (*cc == OP_ALT);
1068     cc += 1 + LINK_SIZE;
1069     break;
1070    
1071     /* Reached end of a branch; if it's a ket it is the end of a nested
1072     call. If it's ALT it is an alternation in a nested call. If it is
1073     END it's the end of the outer call. All can be handled by the same code. */
1074    
1075     case OP_ALT:
1076     case OP_KET:
1077     case OP_KETRMAX:
1078     case OP_KETRMIN:
1079     case OP_END:
1080     if (length < 0) length = branchlength;
1081     else if (length != branchlength) return -1;
1082     if (*cc != OP_ALT) return length;
1083     cc += 1 + LINK_SIZE;
1084     branchlength = 0;
1085     break;
1086    
1087     /* Skip over assertive subpatterns */
1088    
1089     case OP_ASSERT:
1090     case OP_ASSERT_NOT:
1091     case OP_ASSERTBACK:
1092     case OP_ASSERTBACK_NOT:
1093     do cc += GET(cc, 1); while (*cc == OP_ALT);
1094     /* Fall through */
1095    
1096     /* Skip over things that don't match chars */
1097    
1098     case OP_REVERSE:
1099     case OP_CREF:
1100 nigel 93 case OP_RREF:
1101     case OP_DEF:
1102 nigel 77 case OP_OPT:
1103     case OP_CALLOUT:
1104     case OP_SOD:
1105     case OP_SOM:
1106     case OP_EOD:
1107     case OP_EODN:
1108     case OP_CIRC:
1109     case OP_DOLL:
1110     case OP_NOT_WORD_BOUNDARY:
1111     case OP_WORD_BOUNDARY:
1112     cc += _pcre_OP_lengths[*cc];
1113     break;
1114    
1115     /* Handle literal characters */
1116    
1117     case OP_CHAR:
1118     case OP_CHARNC:
1119 nigel 91 case OP_NOT:
1120 nigel 77 branchlength++;
1121     cc += 2;
1122     #ifdef SUPPORT_UTF8
1123     if ((options & PCRE_UTF8) != 0)
1124     {
1125     while ((*cc & 0xc0) == 0x80) cc++;
1126     }
1127     #endif
1128     break;
1129    
1130     /* Handle exact repetitions. The count is already in characters, but we
1131     need to skip over a multibyte character in UTF8 mode. */
1132    
1133     case OP_EXACT:
1134     branchlength += GET2(cc,1);
1135     cc += 4;
1136     #ifdef SUPPORT_UTF8
1137     if ((options & PCRE_UTF8) != 0)
1138     {
1139     while((*cc & 0x80) == 0x80) cc++;
1140     }
1141     #endif
1142     break;
1143    
1144     case OP_TYPEEXACT:
1145     branchlength += GET2(cc,1);
1146     cc += 4;
1147     break;
1148    
1149     /* Handle single-char matchers */
1150    
1151     case OP_PROP:
1152     case OP_NOTPROP:
1153 nigel 87 cc += 2;
1154 nigel 77 /* Fall through */
1155    
1156     case OP_NOT_DIGIT:
1157     case OP_DIGIT:
1158     case OP_NOT_WHITESPACE:
1159     case OP_WHITESPACE:
1160     case OP_NOT_WORDCHAR:
1161     case OP_WORDCHAR:
1162     case OP_ANY:
1163     branchlength++;
1164     cc++;
1165     break;
1166    
1167     /* The single-byte matcher isn't allowed */
1168    
1169     case OP_ANYBYTE:
1170     return -2;
1171    
1172     /* Check a class for variable quantification */
1173    
1174     #ifdef SUPPORT_UTF8
1175     case OP_XCLASS:
1176     cc += GET(cc, 1) - 33;
1177     /* Fall through */
1178     #endif
1179    
1180     case OP_CLASS:
1181     case OP_NCLASS:
1182     cc += 33;
1183    
1184     switch (*cc)
1185     {
1186     case OP_CRSTAR:
1187     case OP_CRMINSTAR:
1188     case OP_CRQUERY:
1189     case OP_CRMINQUERY:
1190     return -1;
1191    
1192     case OP_CRRANGE:
1193     case OP_CRMINRANGE:
1194     if (GET2(cc,1) != GET2(cc,3)) return -1;
1195     branchlength += GET2(cc,1);
1196     cc += 5;
1197     break;
1198    
1199     default:
1200     branchlength++;
1201     }
1202     break;
1203    
1204     /* Anything else is variable length */
1205    
1206     default:
1207     return -1;
1208     }
1209     }
1210     /* Control never gets here */
1211     }
1212    
1213    
1214    
1215    
1216     /*************************************************
1217     * Scan compiled regex for numbered bracket *
1218     *************************************************/
1219    
1220     /* This little function scans through a compiled pattern until it finds a
1221     capturing bracket with the given number.
1222    
1223     Arguments:
1224     code points to start of expression
1225     utf8 TRUE in UTF-8 mode
1226     number the required bracket number
1227    
1228     Returns: pointer to the opcode for the bracket, or NULL if not found
1229     */
1230    
1231     static const uschar *
1232     find_bracket(const uschar *code, BOOL utf8, int number)
1233     {
1234     for (;;)
1235     {
1236     register int c = *code;
1237     if (c == OP_END) return NULL;
1238 nigel 91
1239     /* XCLASS is used for classes that cannot be represented just by a bit
1240     map. This includes negated single high-valued characters. The length in
1241     the table is zero; the actual length is stored in the compiled code. */
1242    
1243     if (c == OP_XCLASS) code += GET(code, 1);
1244    
1245 nigel 93 /* Handle capturing bracket */
1246 nigel 91
1247 nigel 93 else if (c == OP_CBRA)
1248 nigel 77 {
1249 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1250 nigel 77 if (n == number) return (uschar *)code;
1251 nigel 93 code += _pcre_OP_lengths[c];
1252 nigel 77 }
1253 nigel 91
1254 nigel 93 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1255     a multi-byte character. The length in the table is a minimum, so we have to
1256     arrange to skip the extra bytes. */
1257 nigel 91
1258 nigel 77 else
1259     {
1260     code += _pcre_OP_lengths[c];
1261 ph10 107 #ifdef SUPPORT_UTF8
1262 nigel 77 if (utf8) switch(c)
1263     {
1264     case OP_CHAR:
1265     case OP_CHARNC:
1266     case OP_EXACT:
1267     case OP_UPTO:
1268     case OP_MINUPTO:
1269 nigel 93 case OP_POSUPTO:
1270 nigel 77 case OP_STAR:
1271     case OP_MINSTAR:
1272 nigel 93 case OP_POSSTAR:
1273 nigel 77 case OP_PLUS:
1274     case OP_MINPLUS:
1275 nigel 93 case OP_POSPLUS:
1276 nigel 77 case OP_QUERY:
1277     case OP_MINQUERY:
1278 nigel 93 case OP_POSQUERY:
1279     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1280 nigel 77 break;
1281     }
1282 ph10 111 #endif
1283 nigel 77 }
1284     }
1285     }
1286    
1287    
1288    
1289     /*************************************************
1290     * Scan compiled regex for recursion reference *
1291     *************************************************/
1292    
1293     /* This little function scans through a compiled pattern until it finds an
1294     instance of OP_RECURSE.
1295    
1296     Arguments:
1297     code points to start of expression
1298     utf8 TRUE in UTF-8 mode
1299    
1300     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1301     */
1302    
1303     static const uschar *
1304     find_recurse(const uschar *code, BOOL utf8)
1305     {
1306     for (;;)
1307     {
1308     register int c = *code;
1309     if (c == OP_END) return NULL;
1310 nigel 91 if (c == OP_RECURSE) return code;
1311    
1312     /* XCLASS is used for classes that cannot be represented just by a bit
1313     map. This includes negated single high-valued characters. The length in
1314     the table is zero; the actual length is stored in the compiled code. */
1315    
1316     if (c == OP_XCLASS) code += GET(code, 1);
1317    
1318     /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1319     that are followed by a character may be followed by a multi-byte character.
1320 nigel 93 The length in the table is a minimum, so we have to arrange to skip the extra
1321     bytes. */
1322 nigel 91
1323 nigel 77 else
1324     {
1325     code += _pcre_OP_lengths[c];
1326 ph10 107 #ifdef SUPPORT_UTF8
1327 nigel 77 if (utf8) switch(c)
1328     {
1329     case OP_CHAR:
1330     case OP_CHARNC:
1331     case OP_EXACT:
1332     case OP_UPTO:
1333     case OP_MINUPTO:
1334 nigel 93 case OP_POSUPTO:
1335 nigel 77 case OP_STAR:
1336     case OP_MINSTAR:
1337 nigel 93 case OP_POSSTAR:
1338 nigel 77 case OP_PLUS:
1339     case OP_MINPLUS:
1340 nigel 93 case OP_POSPLUS:
1341 nigel 77 case OP_QUERY:
1342     case OP_MINQUERY:
1343 nigel 93 case OP_POSQUERY:
1344     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1345 nigel 77 break;
1346     }
1347 ph10 111 #endif
1348 nigel 77 }
1349     }
1350     }
1351    
1352    
1353    
1354     /*************************************************
1355     * Scan compiled branch for non-emptiness *
1356     *************************************************/
1357    
1358     /* This function scans through a branch of a compiled pattern to see whether it
1359 nigel 93 can match the empty string or not. It is called from could_be_empty()
1360     below and from compile_branch() when checking for an unlimited repeat of a
1361     group that can match nothing. Note that first_significant_code() skips over
1362     assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1363     struck an inner bracket whose current branch will already have been scanned.
1364 nigel 77
1365     Arguments:
1366     code points to start of search
1367     endcode points to where to stop
1368     utf8 TRUE if in UTF8 mode
1369    
1370     Returns: TRUE if what is matched could be empty
1371     */
1372    
1373     static BOOL
1374     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1375     {
1376     register int c;
1377 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1378 nigel 77 code < endcode;
1379     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1380     {
1381     const uschar *ccode;
1382    
1383     c = *code;
1384 ph10 172
1385 ph10 170 /* Groups with zero repeats can of course be empty; skip them. */
1386 nigel 77
1387 ph10 170 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1388     {
1389 ph10 172 code += _pcre_OP_lengths[c];
1390 ph10 170 do code += GET(code, 1); while (*code == OP_ALT);
1391     c = *code;
1392     continue;
1393     }
1394    
1395     /* For other groups, scan the branches. */
1396 ph10 172
1397 nigel 93 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1398 nigel 77 {
1399     BOOL empty_branch;
1400     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1401    
1402     /* Scan a closed bracket */
1403    
1404     empty_branch = FALSE;
1405     do
1406     {
1407     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1408     empty_branch = TRUE;
1409     code += GET(code, 1);
1410     }
1411     while (*code == OP_ALT);
1412     if (!empty_branch) return FALSE; /* All branches are non-empty */
1413 ph10 172 c = *code;
1414 nigel 93 continue;
1415 nigel 77 }
1416    
1417 nigel 93 /* Handle the other opcodes */
1418    
1419     switch (c)
1420 nigel 77 {
1421     /* Check for quantifiers after a class */
1422    
1423     #ifdef SUPPORT_UTF8
1424     case OP_XCLASS:
1425     ccode = code + GET(code, 1);
1426     goto CHECK_CLASS_REPEAT;
1427     #endif
1428    
1429     case OP_CLASS:
1430     case OP_NCLASS:
1431     ccode = code + 33;
1432    
1433     #ifdef SUPPORT_UTF8
1434     CHECK_CLASS_REPEAT:
1435     #endif
1436    
1437     switch (*ccode)
1438     {
1439     case OP_CRSTAR: /* These could be empty; continue */
1440     case OP_CRMINSTAR:
1441     case OP_CRQUERY:
1442     case OP_CRMINQUERY:
1443     break;
1444    
1445     default: /* Non-repeat => class must match */
1446     case OP_CRPLUS: /* These repeats aren't empty */
1447     case OP_CRMINPLUS:
1448     return FALSE;
1449    
1450     case OP_CRRANGE:
1451     case OP_CRMINRANGE:
1452     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1453     break;
1454     }
1455     break;
1456    
1457     /* Opcodes that must match a character */
1458    
1459     case OP_PROP:
1460     case OP_NOTPROP:
1461     case OP_EXTUNI:
1462     case OP_NOT_DIGIT:
1463     case OP_DIGIT:
1464     case OP_NOT_WHITESPACE:
1465     case OP_WHITESPACE:
1466     case OP_NOT_WORDCHAR:
1467     case OP_WORDCHAR:
1468     case OP_ANY:
1469     case OP_ANYBYTE:
1470     case OP_CHAR:
1471     case OP_CHARNC:
1472     case OP_NOT:
1473     case OP_PLUS:
1474     case OP_MINPLUS:
1475 nigel 93 case OP_POSPLUS:
1476 nigel 77 case OP_EXACT:
1477     case OP_NOTPLUS:
1478     case OP_NOTMINPLUS:
1479 nigel 93 case OP_NOTPOSPLUS:
1480 nigel 77 case OP_NOTEXACT:
1481     case OP_TYPEPLUS:
1482     case OP_TYPEMINPLUS:
1483 nigel 93 case OP_TYPEPOSPLUS:
1484 nigel 77 case OP_TYPEEXACT:
1485     return FALSE;
1486    
1487     /* End of branch */
1488    
1489     case OP_KET:
1490     case OP_KETRMAX:
1491     case OP_KETRMIN:
1492     case OP_ALT:
1493     return TRUE;
1494    
1495 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1496     MINUPTO, and POSUPTO may be followed by a multibyte character */
1497 nigel 77
1498     #ifdef SUPPORT_UTF8
1499     case OP_STAR:
1500     case OP_MINSTAR:
1501 nigel 93 case OP_POSSTAR:
1502 nigel 77 case OP_QUERY:
1503     case OP_MINQUERY:
1504 nigel 93 case OP_POSQUERY:
1505 nigel 77 case OP_UPTO:
1506     case OP_MINUPTO:
1507 nigel 93 case OP_POSUPTO:
1508 nigel 77 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1509     break;
1510     #endif
1511     }
1512     }
1513    
1514     return TRUE;
1515     }
1516    
1517    
1518    
1519     /*************************************************
1520     * Scan compiled regex for non-emptiness *
1521     *************************************************/
1522    
1523     /* This function is called to check for left recursive calls. We want to check
1524     the current branch of the current pattern to see if it could match the empty
1525     string. If it could, we must look outwards for branches at other levels,
1526     stopping when we pass beyond the bracket which is the subject of the recursion.
1527    
1528     Arguments:
1529     code points to start of the recursion
1530     endcode points to where to stop (current RECURSE item)
1531     bcptr points to the chain of current (unclosed) branch starts
1532     utf8 TRUE if in UTF-8 mode
1533    
1534     Returns: TRUE if what is matched could be empty
1535     */
1536    
1537     static BOOL
1538     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1539     BOOL utf8)
1540     {
1541     while (bcptr != NULL && bcptr->current >= code)
1542     {
1543     if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1544     bcptr = bcptr->outer;
1545     }
1546     return TRUE;
1547     }
1548    
1549    
1550    
1551     /*************************************************
1552     * Check for POSIX class syntax *
1553     *************************************************/
1554    
1555     /* This function is called when the sequence "[:" or "[." or "[=" is
1556     encountered in a character class. It checks whether this is followed by an
1557     optional ^ and then a sequence of letters, terminated by a matching ":]" or
1558     ".]" or "=]".
1559    
1560     Arguments:
1561     ptr pointer to the initial [
1562     endptr where to return the end pointer
1563     cd pointer to compile data
1564    
1565     Returns: TRUE or FALSE
1566     */
1567    
1568     static BOOL
1569     check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1570     {
1571     int terminator; /* Don't combine these lines; the Solaris cc */
1572     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1573     if (*(++ptr) == '^') ptr++;
1574     while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1575     if (*ptr == terminator && ptr[1] == ']')
1576     {
1577     *endptr = ptr;
1578     return TRUE;
1579     }
1580     return FALSE;
1581     }
1582    
1583    
1584    
1585    
1586     /*************************************************
1587     * Check POSIX class name *
1588     *************************************************/
1589    
1590     /* This function is called to check the name given in a POSIX-style class entry
1591     such as [:alnum:].
1592    
1593     Arguments:
1594     ptr points to the first letter
1595     len the length of the name
1596    
1597     Returns: a value representing the name, or -1 if unknown
1598     */
1599    
1600     static int
1601     check_posix_name(const uschar *ptr, int len)
1602     {
1603     register int yield = 0;
1604     while (posix_name_lengths[yield] != 0)
1605     {
1606     if (len == posix_name_lengths[yield] &&
1607     strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1608     yield++;
1609     }
1610     return -1;
1611     }
1612    
1613    
1614     /*************************************************
1615     * Adjust OP_RECURSE items in repeated group *
1616     *************************************************/
1617    
1618     /* OP_RECURSE items contain an offset from the start of the regex to the group
1619     that is referenced. This means that groups can be replicated for fixed
1620     repetition simply by copying (because the recursion is allowed to refer to
1621     earlier groups that are outside the current group). However, when a group is
1622     optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1623     it, after it has been compiled. This means that any OP_RECURSE items within it
1624     that refer to the group itself or any contained groups have to have their
1625 nigel 93 offsets adjusted. That is one of the jobs of this function. Before it is called,
1626     the partially compiled regex must be temporarily terminated with OP_END.
1627 nigel 77
1628 nigel 93 This function has been extended with the possibility of forward references for
1629     recursions and subroutine calls. It must also check the list of such references
1630     for the group we are dealing with. If it finds that one of the recursions in
1631     the current group is on this list, it adjusts the offset in the list, not the
1632     value in the reference (which is a group number).
1633    
1634 nigel 77 Arguments:
1635     group points to the start of the group
1636     adjust the amount by which the group is to be moved
1637     utf8 TRUE in UTF-8 mode
1638     cd contains pointers to tables etc.
1639 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
1640 nigel 77
1641     Returns: nothing
1642     */
1643    
1644     static void
1645 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1646     uschar *save_hwm)
1647 nigel 77 {
1648     uschar *ptr = group;
1649     while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1650     {
1651 nigel 93 int offset;
1652     uschar *hc;
1653    
1654     /* See if this recursion is on the forward reference list. If so, adjust the
1655     reference. */
1656    
1657     for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1658     {
1659     offset = GET(hc, 0);
1660     if (cd->start_code + offset == ptr + 1)
1661     {
1662     PUT(hc, 0, offset + adjust);
1663     break;
1664     }
1665     }
1666    
1667     /* Otherwise, adjust the recursion offset if it's after the start of this
1668     group. */
1669    
1670     if (hc >= cd->hwm)
1671     {
1672     offset = GET(ptr, 1);
1673     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1674     }
1675    
1676 nigel 77 ptr += 1 + LINK_SIZE;
1677     }
1678     }
1679    
1680    
1681    
1682     /*************************************************
1683     * Insert an automatic callout point *
1684     *************************************************/
1685    
1686     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1687     callout points before each pattern item.
1688    
1689     Arguments:
1690     code current code pointer
1691     ptr current pattern pointer
1692     cd pointers to tables etc
1693    
1694     Returns: new code pointer
1695     */
1696    
1697     static uschar *
1698     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1699     {
1700     *code++ = OP_CALLOUT;
1701     *code++ = 255;
1702     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1703     PUT(code, LINK_SIZE, 0); /* Default length */
1704     return code + 2*LINK_SIZE;
1705     }
1706    
1707    
1708    
1709     /*************************************************
1710     * Complete a callout item *
1711     *************************************************/
1712    
1713     /* A callout item contains the length of the next item in the pattern, which
1714     we can't fill in till after we have reached the relevant point. This is used
1715     for both automatic and manual callouts.
1716    
1717     Arguments:
1718     previous_callout points to previous callout item
1719     ptr current pattern pointer
1720     cd pointers to tables etc
1721    
1722     Returns: nothing
1723     */
1724    
1725     static void
1726     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1727     {
1728     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1729     PUT(previous_callout, 2 + LINK_SIZE, length);
1730     }
1731    
1732    
1733    
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* This function is used for class ranges in UTF-8 mode with UCP support. It
scans the characters from *cptr up to d, looking for the next run of characters
whose other-case values (as given by _pcre_ucp_othercase()) are consecutive.
Each successful call reports one such run and advances *cptr past the
characters it consumed, so that repeated calls walk through the whole range.
When FALSE is returned, none of the output locations has been written.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int ch = *cptr;
unsigned int first_other = NOTACHAR;
unsigned int expected;

/* Find the first character in the range that has an other case. */

while (ch <= d)
  {
  first_other = _pcre_ucp_othercase(ch);
  if (first_other != NOTACHAR) break;
  ch++;
  }

if (ch > d) return FALSE;

*ocptr = first_other;
expected = first_other + 1;

/* Extend the run for as long as the other-case values stay consecutive. */

ch++;
while (ch <= d && _pcre_ucp_othercase(ch) == expected)
  {
  expected++;
  ch++;
  }

*odptr = expected - 1;
*cptr = ch;

return TRUE;
}
#endif  /* SUPPORT_UCP */
1779    
1780    
1781 nigel 93
1782 nigel 77 /*************************************************
1783 nigel 93 * Check if auto-possessifying is possible *
1784     *************************************************/
1785    
1786     /* This function is called for unlimited repeats of certain items, to see
1787     whether the next thing could possibly match the repeated item. If not, it makes
1788     sense to automatically possessify the repeated item.
1789    
1790     Arguments:
1791     op_code the repeated op code
1792     this data for this item, depends on the opcode
1793     utf8 TRUE in UTF-8 mode
1794     utf8_char used for utf8 character bytes, NULL if not relevant
1795     ptr next character in pattern
1796     options options bits
1797     cd contains pointers to tables etc.
1798    
1799     Returns: TRUE if possessifying is wanted
1800     */
1801    
1802     static BOOL
1803     check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1804     const uschar *ptr, int options, compile_data *cd)
1805     {
1806     int next;
1807    
1808     /* Skip whitespace and comments in extended mode */
1809    
1810     if ((options & PCRE_EXTENDED) != 0)
1811     {
1812     for (;;)
1813     {
1814     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1815     if (*ptr == '#')
1816     {
1817     while (*(++ptr) != 0)
1818     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1819     }
1820     else break;
1821     }
1822     }
1823    
1824     /* If the next item is one that we can handle, get its value. A non-negative
1825     value is a character, a negative value is an escape value. */
1826    
1827     if (*ptr == '\\')
1828     {
1829     int temperrorcode = 0;
1830     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1831     if (temperrorcode != 0) return FALSE;
1832     ptr++; /* Point after the escape sequence */
1833     }
1834    
1835     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1836     {
1837     #ifdef SUPPORT_UTF8
1838     if (utf8) { GETCHARINC(next, ptr); } else
1839     #endif
1840     next = *ptr++;
1841     }
1842    
1843     else return FALSE;
1844    
1845     /* Skip whitespace and comments in extended mode */
1846    
1847     if ((options & PCRE_EXTENDED) != 0)
1848     {
1849     for (;;)
1850     {
1851     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1852     if (*ptr == '#')
1853     {
1854     while (*(++ptr) != 0)
1855     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1856     }
1857     else break;
1858     }
1859     }
1860    
1861     /* If the next thing is itself optional, we have to give up. */
1862    
1863     if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1864     return FALSE;
1865    
1866     /* Now compare the next item with the previous opcode. If the previous is a
1867     positive single character match, "item" either contains the character or, if
1868     "item" is greater than 127 in utf8 mode, the character's bytes are in
1869     utf8_char. */
1870    
1871    
1872     /* Handle cases when the next item is a character. */
1873    
1874     if (next >= 0) switch(op_code)
1875     {
1876     case OP_CHAR:
1877     #ifdef SUPPORT_UTF8
1878     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1879     #endif
1880     return item != next;
1881    
1882     /* For CHARNC (caseless character) we must check the other case. If we have
1883     Unicode property support, we can use it to test the other case of
1884     high-valued characters. */
1885    
1886     case OP_CHARNC:
1887     #ifdef SUPPORT_UTF8
1888     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1889     #endif
1890     if (item == next) return FALSE;
1891     #ifdef SUPPORT_UTF8
1892     if (utf8)
1893     {
1894     unsigned int othercase;
1895     if (next < 128) othercase = cd->fcc[next]; else
1896     #ifdef SUPPORT_UCP
1897     othercase = _pcre_ucp_othercase((unsigned int)next);
1898     #else
1899     othercase = NOTACHAR;
1900     #endif
1901     return (unsigned int)item != othercase;
1902     }
1903     else
1904     #endif /* SUPPORT_UTF8 */
1905     return (item != cd->fcc[next]); /* Non-UTF-8 mode */
1906    
1907     /* For OP_NOT, "item" must be a single-byte character. */
1908    
1909     case OP_NOT:
1910     if (next < 0) return FALSE; /* Not a character */
1911     if (item == next) return TRUE;
1912     if ((options & PCRE_CASELESS) == 0) return FALSE;
1913     #ifdef SUPPORT_UTF8
1914     if (utf8)
1915     {
1916     unsigned int othercase;
1917     if (next < 128) othercase = cd->fcc[next]; else
1918     #ifdef SUPPORT_UCP
1919     othercase = _pcre_ucp_othercase(next);
1920     #else
1921     othercase = NOTACHAR;
1922     #endif
1923     return (unsigned int)item == othercase;
1924     }
1925     else
1926     #endif /* SUPPORT_UTF8 */
1927     return (item == cd->fcc[next]); /* Non-UTF-8 mode */
1928    
1929     case OP_DIGIT:
1930     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1931    
1932     case OP_NOT_DIGIT:
1933     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1934    
1935     case OP_WHITESPACE:
1936     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1937    
1938     case OP_NOT_WHITESPACE:
1939     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1940    
1941     case OP_WORDCHAR:
1942     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1943    
1944     case OP_NOT_WORDCHAR:
1945     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1946    
1947     default:
1948     return FALSE;
1949     }
1950    
1951    
1952     /* Handle the case when the next item is \d, \s, etc. */
1953    
1954     switch(op_code)
1955     {
1956     case OP_CHAR:
1957     case OP_CHARNC:
1958     #ifdef SUPPORT_UTF8
1959     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1960     #endif
1961     switch(-next)
1962     {
1963     case ESC_d:
1964     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
1965    
1966     case ESC_D:
1967     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
1968    
1969     case ESC_s:
1970     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
1971    
1972     case ESC_S:
1973     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
1974    
1975     case ESC_w:
1976     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
1977    
1978     case ESC_W:
1979     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
1980    
1981     default:
1982     return FALSE;
1983     }
1984    
1985     case OP_DIGIT:
1986     return next == -ESC_D || next == -ESC_s || next == -ESC_W;
1987    
1988     case OP_NOT_DIGIT:
1989     return next == -ESC_d;
1990    
1991     case OP_WHITESPACE:
1992     return next == -ESC_S || next == -ESC_d || next == -ESC_w;
1993    
1994     case OP_NOT_WHITESPACE:
1995     return next == -ESC_s;
1996    
1997     case OP_WORDCHAR:
1998     return next == -ESC_W || next == -ESC_s;
1999    
2000     case OP_NOT_WORDCHAR:
2001     return next == -ESC_w || next == -ESC_d;
2002    
2003     default:
2004     return FALSE;
2005     }
2006    
2007     /* Control does not reach here */
2008     }
2009    
2010    
2011    
2012     /*************************************************
2013 nigel 77 * Compile one branch *
2014     *************************************************/
2015    
2016 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
2017 nigel 77 changed during the branch, the pointer is used to change the external options
2018 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2019     to find out the amount of memory needed, as well as during the real compile
2020     phase. The value of lengthptr distinguishes the two phases.
2021 nigel 77
2022     Arguments:
2023     optionsptr pointer to the option bits
2024     codeptr points to the pointer to the current code point
2025     ptrptr points to the current pattern pointer
2026     errorcodeptr points to error code variable
2027     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2028     reqbyteptr set to the last literal character required, else < 0
2029     bcptr points to current branch chain
2030     cd contains pointers to tables etc.
2031 nigel 93 lengthptr NULL during the real compile phase
2032     points to length accumulator during pre-compile phase
2033 nigel 77
2034     Returns: TRUE on success
2035     FALSE, with *errorcodeptr set non-zero on error
2036     */
2037    
2038     static BOOL
2039 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2040     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2041     compile_data *cd, int *lengthptr)
2042 nigel 77 {
2043     int repeat_type, op_type;
2044     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2045     int bravalue = 0;
2046     int greedy_default, greedy_non_default;
2047     int firstbyte, reqbyte;
2048     int zeroreqbyte, zerofirstbyte;
2049     int req_caseopt, reqvary, tempreqvary;
2050     int options = *optionsptr;
2051     int after_manual_callout = 0;
2052 nigel 93 int length_prevgroup = 0;
2053 nigel 77 register int c;
2054     register uschar *code = *codeptr;
2055 nigel 93 uschar *last_code = code;
2056     uschar *orig_code = code;
2057 nigel 77 uschar *tempcode;
2058     BOOL inescq = FALSE;
2059     BOOL groupsetfirstbyte = FALSE;
2060     const uschar *ptr = *ptrptr;
2061     const uschar *tempptr;
2062     uschar *previous = NULL;
2063     uschar *previous_callout = NULL;
2064 nigel 93 uschar *save_hwm = NULL;
2065 nigel 77 uschar classbits[32];
2066    
2067     #ifdef SUPPORT_UTF8
2068     BOOL class_utf8;
2069     BOOL utf8 = (options & PCRE_UTF8) != 0;
2070     uschar *class_utf8data;
2071     uschar utf8_char[6];
2072     #else
2073     BOOL utf8 = FALSE;
2074 nigel 93 uschar *utf8_char = NULL;
2075 nigel 77 #endif
2076    
2077 nigel 93 #ifdef DEBUG
2078     if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2079     #endif
2080    
2081 nigel 77 /* Set up the default and non-default settings for greediness */
2082    
2083     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2084     greedy_non_default = greedy_default ^ 1;
2085    
2086     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2087     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2088     matches a non-fixed char first char; reqbyte just remains unset if we never
2089     find one.
2090    
2091     When we hit a repeat whose minimum is zero, we may have to adjust these values
2092     to take the zero repeat into account. This is implemented by setting them to
2093     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2094     item types that can be repeated set these backoff variables appropriately. */
2095    
2096     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2097    
2098     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2099     according to the current setting of the caseless flag. REQ_CASELESS is a bit
2100     value > 255. It is added into the firstbyte or reqbyte variables to record the
2101     case status of the value. This is used only for ASCII characters. */
2102    
2103     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2104    
2105     /* Switch on next character until the end of the branch */
2106    
2107     for (;; ptr++)
2108     {
2109     BOOL negate_class;
2110     BOOL possessive_quantifier;
2111     BOOL is_quantifier;
2112 nigel 93 BOOL is_recurse;
2113 nigel 77 int class_charcount;
2114     int class_lastchar;
2115     int newoptions;
2116     int recno;
2117 ph10 172 int refsign;
2118 nigel 77 int skipbytes;
2119     int subreqbyte;
2120     int subfirstbyte;
2121 nigel 93 int terminator;
2122 nigel 77 int mclength;
2123     uschar mcbuffer[8];
2124    
2125 nigel 93 /* Get next byte in the pattern */
2126 nigel 77
2127     c = *ptr;
2128    
2129 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
2130     previous cycle of this loop. */
2131    
2132     if (lengthptr != NULL)
2133     {
2134     #ifdef DEBUG
2135     if (code > cd->hwm) cd->hwm = code; /* High water info */
2136     #endif
2137     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2138     {
2139     *errorcodeptr = ERR52;
2140     goto FAILED;
2141     }
2142    
2143     /* There is at least one situation where code goes backwards: this is the
2144     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2145     the class is simply eliminated. However, it is created first, so we have to
2146     allow memory for it. Therefore, don't ever reduce the length at this point.
2147     */
2148    
2149     if (code < last_code) code = last_code;
2150     *lengthptr += code - last_code;
2151     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2152    
2153     /* If "previous" is set and it is not at the start of the work space, move
2154     it back to there, in order to avoid filling up the work space. Otherwise,
2155     if "previous" is NULL, reset the current code pointer to the start. */
2156    
2157     if (previous != NULL)
2158     {
2159     if (previous > orig_code)
2160     {
2161     memmove(orig_code, previous, code - previous);
2162     code -= previous - orig_code;
2163     previous = orig_code;
2164     }
2165     }
2166     else code = orig_code;
2167    
2168     /* Remember where this code item starts so we can pick up the length
2169     next time round. */
2170    
2171     last_code = code;
2172     }
2173    
2174     /* In the real compile phase, just check the workspace used by the forward
2175     reference list. */
2176    
2177     else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2178     {
2179     *errorcodeptr = ERR52;
2180     goto FAILED;
2181     }
2182    
2183 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
2184    
2185     if (inescq && c != 0)
2186     {
2187     if (c == '\\' && ptr[1] == 'E')
2188     {
2189     inescq = FALSE;
2190     ptr++;
2191     continue;
2192     }
2193     else
2194     {
2195     if (previous_callout != NULL)
2196     {
2197 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2198     complete_callout(previous_callout, ptr, cd);
2199 nigel 77 previous_callout = NULL;
2200     }
2201     if ((options & PCRE_AUTO_CALLOUT) != 0)
2202     {
2203     previous_callout = code;
2204     code = auto_callout(code, ptr, cd);
2205     }
2206     goto NORMAL_CHAR;
2207     }
2208     }
2209    
2210     /* Fill in length of a previous callout, except when the next thing is
2211     a quantifier. */
2212    
2213     is_quantifier = c == '*' || c == '+' || c == '?' ||
2214     (c == '{' && is_counted_repeat(ptr+1));
2215    
2216     if (!is_quantifier && previous_callout != NULL &&
2217     after_manual_callout-- <= 0)
2218     {
2219 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2220     complete_callout(previous_callout, ptr, cd);
2221 nigel 77 previous_callout = NULL;
2222     }
2223    
2224     /* In extended mode, skip white space and comments */
2225    
2226     if ((options & PCRE_EXTENDED) != 0)
2227     {
2228     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2229     if (c == '#')
2230     {
2231 nigel 93 while (*(++ptr) != 0)
2232 nigel 91 {
2233 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2234 nigel 91 }
2235 nigel 93 if (*ptr != 0) continue;
2236    
2237 nigel 91 /* Else fall through to handle end of string */
2238     c = 0;
2239 nigel 77 }
2240     }
2241    
2242     /* No auto callout for quantifiers. */
2243    
2244     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2245     {
2246     previous_callout = code;
2247     code = auto_callout(code, ptr, cd);
2248     }
2249    
2250     switch(c)
2251     {
2252 nigel 93 /* ===================================================================*/
2253     case 0: /* The branch terminates at string end */
2254     case '|': /* or | or ) */
2255 nigel 77 case ')':
2256     *firstbyteptr = firstbyte;
2257     *reqbyteptr = reqbyte;
2258     *codeptr = code;
2259     *ptrptr = ptr;
2260 nigel 93 if (lengthptr != NULL)
2261     {
2262     *lengthptr += code - last_code; /* To include callout length */
2263     DPRINTF((">> end branch\n"));
2264     }
2265 nigel 77 return TRUE;
2266    
2267 nigel 93
2268     /* ===================================================================*/
2269 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
2270     the setting of any following char as a first character. */
2271    
2272     case '^':
2273     if ((options & PCRE_MULTILINE) != 0)
2274     {
2275     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2276     }
2277     previous = NULL;
2278     *code++ = OP_CIRC;
2279     break;
2280    
2281     case '$':
2282     previous = NULL;
2283     *code++ = OP_DOLL;
2284     break;
2285    
2286     /* There can never be a first char if '.' is first, whatever happens about
2287     repeats. The value of reqbyte doesn't change either. */
2288    
2289     case '.':
2290     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2291     zerofirstbyte = firstbyte;
2292     zeroreqbyte = reqbyte;
2293     previous = code;
2294     *code++ = OP_ANY;
2295     break;
2296    
2297 nigel 93
2298     /* ===================================================================*/
2299 nigel 87 /* Character classes. If the included characters are all < 256, we build a
2300     32-byte bitmap of the permitted characters, except in the special case
2301     where there is only one such character. For negated classes, we build the
2302     map as usual, then invert it at the end. However, we use a different opcode
2303     so that data characters > 255 can be handled correctly.
2304 nigel 77
2305     If the class contains characters outside the 0-255 range, a different
2306     opcode is compiled. It may optionally have a bit map for characters < 256,
2307     but those above are explicitly listed afterwards. A flag byte tells
2308     whether the bitmap is present, and whether this is a negated class or not.
2309     */
2310    
2311     case '[':
2312     previous = code;
2313    
2314     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2315     they are encountered at the top level, so we'll do that too. */
2316    
2317     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2318     check_posix_syntax(ptr, &tempptr, cd))
2319     {
2320     *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2321     goto FAILED;
2322     }
2323    
2324     /* If the first character is '^', set the negation flag and skip it. */
2325    
2326     if ((c = *(++ptr)) == '^')
2327     {
2328     negate_class = TRUE;
2329     c = *(++ptr);
2330     }
2331     else
2332     {
2333     negate_class = FALSE;
2334     }
2335    
2336     /* Keep a count of chars with values < 256 so that we can optimize the case
2337 nigel 93 of just a single character (as long as it's < 256). However, for higher
2338     valued UTF-8 characters, we don't yet do any optimization. */
2339 nigel 77
2340     class_charcount = 0;
2341     class_lastchar = -1;
2342    
2343 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
2344     temporary bit of memory, in case the class contains only 1 character (less
2345     than 256), because in that case the compiled code doesn't use the bit map.
2346     */
2347    
2348     memset(classbits, 0, 32 * sizeof(uschar));
2349    
2350 nigel 77 #ifdef SUPPORT_UTF8
2351     class_utf8 = FALSE; /* No chars >= 256 */
2352 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2353 nigel 77 #endif
2354    
2355     /* Process characters until ] is reached. By writing this as a "do" it
2356 nigel 93 means that an initial ] is taken as a data character. At the start of the
2357     loop, c contains the first byte of the character. */
2358 nigel 77
2359 nigel 93 if (c != 0) do
2360 nigel 77 {
2361 nigel 93 const uschar *oldptr;
2362    
2363 nigel 77 #ifdef SUPPORT_UTF8
2364     if (utf8 && c > 127)
2365     { /* Braces are required because the */
2366     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2367     }
2368     #endif
2369    
2370     /* Inside \Q...\E everything is literal except \E */
2371    
2372     if (inescq)
2373     {
2374 nigel 93 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2375 nigel 77 {
2376 nigel 93 inescq = FALSE; /* Reset literal state */
2377     ptr++; /* Skip the 'E' */
2378     continue; /* Carry on with next */
2379 nigel 77 }
2380 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
2381 nigel 77 }
2382    
2383     /* Handle POSIX class names. Perl allows a negation extension of the
2384     form [:^name:]. A square bracket that doesn't match the syntax is
2385     treated as a literal. We also recognize the POSIX constructions
2386     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2387     5.6 and 5.8 do. */
2388    
2389     if (c == '[' &&
2390     (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2391     check_posix_syntax(ptr, &tempptr, cd))
2392     {
2393     BOOL local_negate = FALSE;
2394 nigel 87 int posix_class, taboffset, tabopt;
2395 nigel 77 register const uschar *cbits = cd->cbits;
2396 nigel 87 uschar pbits[32];
2397 nigel 77
2398     if (ptr[1] != ':')
2399     {
2400     *errorcodeptr = ERR31;
2401     goto FAILED;
2402     }
2403    
2404     ptr += 2;
2405     if (*ptr == '^')
2406     {
2407     local_negate = TRUE;
2408     ptr++;
2409     }
2410    
2411     posix_class = check_posix_name(ptr, tempptr - ptr);
2412     if (posix_class < 0)
2413     {
2414     *errorcodeptr = ERR30;
2415     goto FAILED;
2416     }
2417    
2418     /* If matching is caseless, upper and lower are converted to
2419     alpha. This relies on the fact that the class table starts with
2420     alpha, lower, upper as the first 3 entries. */
2421    
2422     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2423     posix_class = 0;
2424    
2425 nigel 87 /* We build the bit map for the POSIX class in a chunk of local store
2426     because we may be adding and subtracting from it, and we don't want to
2427     subtract bits that may be in the main map already. At the end we or the
2428     result into the bit map that is being built. */
2429 nigel 77
2430     posix_class *= 3;
2431 nigel 87
2432     /* Copy in the first table (always present) */
2433    
2434     memcpy(pbits, cbits + posix_class_maps[posix_class],
2435     32 * sizeof(uschar));
2436    
2437     /* If there is a second table, add or remove it as required. */
2438    
2439     taboffset = posix_class_maps[posix_class + 1];
2440     tabopt = posix_class_maps[posix_class + 2];
2441    
2442     if (taboffset >= 0)
2443 nigel 77 {
2444 nigel 87 if (tabopt >= 0)
2445     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2446 nigel 77 else
2447 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2448 nigel 77 }
2449    
2450 nigel 87 /* Now see if we need to remove any special characters. An option
2451     value of 1 removes vertical space and 2 removes underscore. */
2452    
2453     if (tabopt < 0) tabopt = -tabopt;
2454     if (tabopt == 1) pbits[1] &= ~0x3c;
2455     else if (tabopt == 2) pbits[11] &= 0x7f;
2456    
2457     /* Add the POSIX table or its complement into the main table that is
2458     being built and we are done. */
2459    
2460     if (local_negate)
2461     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2462     else
2463     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2464    
2465 nigel 77 ptr = tempptr + 1;
2466     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2467     continue; /* End of POSIX syntax handling */
2468     }
2469    
2470     /* Backslash may introduce a single character, or it may introduce one
2471 nigel 93 of the specials, which just set a flag. The sequence \b is a special
2472     case. Inside a class (and only there) it is treated as backspace.
2473     Elsewhere it marks a word boundary. Other escapes have preset maps ready
2474     to or into the one we are building. We assume they have more than one
2475 nigel 77 character in them, so set class_charcount bigger than one. */
2476    
2477     if (c == '\\')
2478     {
2479 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2480     if (*errorcodeptr != 0) goto FAILED;
2481 nigel 77
2482     if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2483     else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2484 nigel 93 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2485 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
2486     {
2487     if (ptr[1] == '\\' && ptr[2] == 'E')
2488     {
2489     ptr += 2; /* avoid empty string */
2490     }
2491     else inescq = TRUE;
2492     continue;
2493     }
2494    
2495     if (c < 0)
2496     {
2497     register const uschar *cbits = cd->cbits;
2498     class_charcount += 2; /* Greater than 1 is what matters */
2499 nigel 93
2500     /* Save time by not doing this in the pre-compile phase. */
2501    
2502     if (lengthptr == NULL) switch (-c)
2503 nigel 77 {
2504     case ESC_d:
2505     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2506     continue;
2507    
2508     case ESC_D:
2509     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2510     continue;
2511    
2512     case ESC_w:
2513     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2514     continue;
2515    
2516     case ESC_W:
2517     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2518     continue;
2519    
2520     case ESC_s:
2521     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2522     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2523     continue;
2524    
2525     case ESC_S:
2526     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2527     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2528     continue;
2529    
2530 nigel 93 case ESC_E: /* Perl ignores an orphan \E */
2531     continue;
2532    
2533     default: /* Not recognized; fall through */
2534     break; /* Need "default" setting to stop compiler warning. */
2535     }
2536    
2537     /* In the pre-compile phase, just do the recognition. */
2538    
2539     else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2540     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2541    
2542     /* We need to deal with \P and \p in both phases. */
2543    
2544 nigel 77 #ifdef SUPPORT_UCP
2545 nigel 93 if (-c == ESC_p || -c == ESC_P)
2546     {
2547     BOOL negated;
2548     int pdata;
2549     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2550     if (ptype < 0) goto FAILED;
2551     class_utf8 = TRUE;
2552     *class_utf8data++ = ((-c == ESC_p) != negated)?
2553     XCL_PROP : XCL_NOTPROP;
2554     *class_utf8data++ = ptype;
2555     *class_utf8data++ = pdata;
2556     class_charcount -= 2; /* Not a < 256 character */
2557 nigel 77 continue;
2558 nigel 93 }
2559 nigel 77 #endif
2560 nigel 93 /* Unrecognized escapes are faulted if PCRE is running in its
2561     strict mode. By default, for compatibility with Perl, they are
2562     treated as literals. */
2563 nigel 77
2564 nigel 93 if ((options & PCRE_EXTRA) != 0)
2565     {
2566     *errorcodeptr = ERR7;
2567     goto FAILED;
2568     }
2569 nigel 77
2570 nigel 93 class_charcount -= 2; /* Undo the default count from above */
2571     c = *ptr; /* Get the final character and fall through */
2572 nigel 77 }
2573    
2574     /* Fall through if we have a single character (c >= 0). This may be
2575 nigel 93 greater than 256 in UTF-8 mode. */
2576 nigel 77
2577     } /* End of backslash handling */
2578    
2579     /* A single character may be followed by '-' to form a range. However,
2580     Perl does not permit ']' to be the end of the range. A '-' character
2581 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
2582     entirely. The code for handling \Q and \E is messy. */
2583 nigel 77
2584 nigel 93 CHECK_RANGE:
2585     while (ptr[1] == '\\' && ptr[2] == 'E')
2586 nigel 77 {
2587 nigel 93 inescq = FALSE;
2588     ptr += 2;
2589     }
2590    
2591     oldptr = ptr;
2592    
2593     if (!inescq && ptr[1] == '-')
2594     {
2595 nigel 77 int d;
2596     ptr += 2;
2597 nigel 93 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2598 nigel 77
2599 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
2600     mode. */
2601    
2602     while (*ptr == '\\' && ptr[1] == 'Q')
2603     {
2604     ptr += 2;
2605     if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2606     inescq = TRUE;
2607     break;
2608     }
2609    
2610     if (*ptr == 0 || (!inescq && *ptr == ']'))
2611     {
2612     ptr = oldptr;
2613     goto LONE_SINGLE_CHARACTER;
2614     }
2615    
2616 nigel 77 #ifdef SUPPORT_UTF8
2617     if (utf8)
2618     { /* Braces are required because the */
2619     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2620     }
2621     else
2622     #endif
2623     d = *ptr; /* Not UTF-8 mode */
2624    
2625     /* The second part of a range can be a single-character escape, but
2626     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2627     in such circumstances. */
2628    
2629 nigel 93 if (!inescq && d == '\\')
2630 nigel 77 {
2631 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2632     if (*errorcodeptr != 0) goto FAILED;
2633 nigel 77
2634 nigel 93 /* \b is backspace; \X is literal X; \R is literal R; any other
2635     special means the '-' was literal */
2636 nigel 77
2637     if (d < 0)
2638     {
2639     if (d == -ESC_b) d = '\b';
2640 nigel 93 else if (d == -ESC_X) d = 'X';
2641     else if (d == -ESC_R) d = 'R'; else
2642 nigel 77 {
2643 nigel 93 ptr = oldptr;
2644 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2645     }
2646     }
2647     }
2648    
2649 nigel 93 /* Check that the two values are in the correct order. Optimize
2650     one-character ranges */
2651 nigel 77
2652 nigel 93 if (d < c)
2653     {
2654     *errorcodeptr = ERR8;
2655     goto FAILED;
2656     }
2657    
2658 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2659    
2660     /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2661     matching, we have to use an XCLASS with extra data items. Caseless
2662     matching for characters > 127 is available only if UCP support is
2663     available. */
2664    
2665     #ifdef SUPPORT_UTF8
2666     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2667     {
2668     class_utf8 = TRUE;
2669    
2670     /* With UCP support, we can find the other case equivalents of
2671     the relevant characters. There may be several ranges. Optimize how
2672     they fit with the basic range. */
2673    
2674     #ifdef SUPPORT_UCP
2675     if ((options & PCRE_CASELESS) != 0)
2676     {
2677 nigel 93 unsigned int occ, ocd;
2678     unsigned int cc = c;
2679     unsigned int origd = d;
2680 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
2681     {
2682     if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
2683    
2684     if (occ < c && ocd >= c - 1) /* Extend the basic range */
2685     { /* if there is overlap, */
2686     c = occ; /* noting that if occ < c */
2687     continue; /* we can't have ocd > d */
2688     } /* because a subrange is */
2689     if (ocd > d && occ <= d + 1) /* always shorter than */
2690     { /* the basic range. */
2691     d = ocd;
2692     continue;
2693     }
2694    
2695     if (occ == ocd)
2696     {
2697     *class_utf8data++ = XCL_SINGLE;
2698     }
2699     else
2700     {
2701     *class_utf8data++ = XCL_RANGE;
2702     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2703     }
2704     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2705     }
2706     }
2707     #endif /* SUPPORT_UCP */
2708    
2709     /* Now record the original range, possibly modified for UCP caseless
2710     overlapping ranges. */
2711    
2712     *class_utf8data++ = XCL_RANGE;
2713     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2714     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2715    
2716     /* With UCP support, we are done. Without UCP support, there is no
2717     caseless matching for UTF-8 characters > 127; we can use the bit map
2718     for the smaller ones. */
2719    
2720     #ifdef SUPPORT_UCP
2721     continue; /* With next character in the class */
2722     #else
2723     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2724    
2725     /* Adjust upper limit and fall through to set up the map */
2726    
2727     d = 127;
2728    
2729     #endif /* SUPPORT_UCP */
2730     }
2731     #endif /* SUPPORT_UTF8 */
2732    
2733     /* We use the bit map for all cases when not in UTF-8 mode; else
2734     ranges that lie entirely within 0-127 when there is UCP support; else
2735     for partial ranges without UCP support. */
2736    
2737 nigel 93 class_charcount += d - c + 1;
2738     class_lastchar = d;
2739    
2740     /* We can save a bit of time by skipping this in the pre-compile. */
2741    
2742     if (lengthptr == NULL) for (; c <= d; c++)
2743 nigel 77 {
2744     classbits[c/8] |= (1 << (c&7));
2745     if ((options & PCRE_CASELESS) != 0)
2746     {
2747     int uc = cd->fcc[c]; /* flip case */
2748     classbits[uc/8] |= (1 << (uc&7));
2749     }
2750     }
2751    
2752     continue; /* Go get the next char in the class */
2753     }
2754    
2755     /* Handle a lone single character - we can get here for a normal
2756     non-escape char, or after \ that introduces a single character or for an
2757     apparent range that isn't. */
2758    
2759     LONE_SINGLE_CHARACTER:
2760    
2761     /* Handle a character that cannot go in the bit map */
2762    
2763     #ifdef SUPPORT_UTF8
2764     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2765     {
2766     class_utf8 = TRUE;
2767     *class_utf8data++ = XCL_SINGLE;
2768     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2769    
2770     #ifdef SUPPORT_UCP
2771     if ((options & PCRE_CASELESS) != 0)
2772     {
2773 nigel 93 unsigned int othercase;
2774     if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
2775 nigel 77 {
2776     *class_utf8data++ = XCL_SINGLE;
2777     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
2778     }
2779     }
2780     #endif /* SUPPORT_UCP */
2781    
2782     }
2783     else
2784     #endif /* SUPPORT_UTF8 */
2785    
2786     /* Handle a single-byte character */
2787     {
2788     classbits[c/8] |= (1 << (c&7));
2789     if ((options & PCRE_CASELESS) != 0)
2790     {
2791     c = cd->fcc[c]; /* flip case */
2792     classbits[c/8] |= (1 << (c&7));
2793     }
2794     class_charcount++;
2795     class_lastchar = c;
2796     }
2797     }
2798    
2799 nigel 93 /* Loop until ']' reached. This "while" is the end of the "do" above. */
2800 nigel 77
2801 nigel 93 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
2802 nigel 77
2803 nigel 93 if (c == 0) /* Missing terminating ']' */
2804     {
2805     *errorcodeptr = ERR6;
2806     goto FAILED;
2807     }
2808    
2809 nigel 77 /* If class_charcount is 1, we saw precisely one character whose value is
2810     less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2811     can optimize the negative case only if there were no characters >= 128
2812     because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2813     single-bytes only. This is an historical hangover. Maybe one day we can
2814     tidy these opcodes to handle multi-byte characters.
2815    
2816     The optimization throws away the bit map. We turn the item into a
2817     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2818     that OP_NOT does not support multibyte characters. In the positive case, it
2819     can cause firstbyte to be set. Otherwise, there can be no first char if
2820     this item is first, whatever repeat count may follow. In the case of
2821     reqbyte, save the previous value for reinstating. */
2822    
2823     #ifdef SUPPORT_UTF8
2824     if (class_charcount == 1 &&
2825     (!utf8 ||
2826     (!class_utf8 && (!negate_class || class_lastchar < 128))))
2827    
2828     #else
2829     if (class_charcount == 1)
2830     #endif
2831     {
2832     zeroreqbyte = reqbyte;
2833    
2834     /* The OP_NOT opcode works on one-byte characters only. */
2835    
2836     if (negate_class)
2837     {
2838     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2839     zerofirstbyte = firstbyte;
2840     *code++ = OP_NOT;
2841     *code++ = class_lastchar;
2842     break;
2843     }
2844    
2845     /* For a single, positive character, get the value into mcbuffer, and
2846     then we can handle this with the normal one-character code. */
2847    
2848     #ifdef SUPPORT_UTF8
2849     if (utf8 && class_lastchar > 127)
2850     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
2851     else
2852     #endif
2853     {
2854     mcbuffer[0] = class_lastchar;
2855     mclength = 1;
2856     }
2857     goto ONE_CHAR;
2858     } /* End of 1-char optimization */
2859    
2860     /* The general case - not the one-char optimization. If this is the first
2861     thing in the branch, there can be no first char setting, whatever the
2862     repeat count. Any reqbyte setting must remain unchanged after any kind of
2863     repeat. */
2864    
2865     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2866     zerofirstbyte = firstbyte;
2867     zeroreqbyte = reqbyte;
2868    
2869     /* If there are characters with values > 255, we have to compile an
2870     extended class, with its own opcode. If there are no characters < 256,
2871 nigel 93 we can omit the bitmap in the actual compiled code. */
2872 nigel 77
2873     #ifdef SUPPORT_UTF8
2874     if (class_utf8)
2875     {
2876     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2877     *code++ = OP_XCLASS;
2878     code += LINK_SIZE;
2879     *code = negate_class? XCL_NOT : 0;
2880    
2881 nigel 93 /* If the map is required, move up the extra data to make room for it;
2882     otherwise just move the code pointer to the end of the extra data. */
2883 nigel 77
2884     if (class_charcount > 0)
2885     {
2886     *code++ |= XCL_MAP;
2887 nigel 93 memmove(code + 32, code, class_utf8data - code);
2888 nigel 77 memcpy(code, classbits, 32);
2889 nigel 93 code = class_utf8data + 32;
2890 nigel 77 }
2891 nigel 93 else code = class_utf8data;
2892 nigel 77
2893     /* Now fill in the complete length of the item */
2894    
2895     PUT(previous, 1, code - previous);
2896     break; /* End of class handling */
2897     }
2898     #endif
2899    
2900     /* If there are no characters > 255, negate the 32-byte map if necessary,
2901     and copy it into the code vector. If this is the first thing in the branch,
2902     there can be no first char setting, whatever the repeat count. Any reqbyte
2903     setting must remain unchanged after any kind of repeat. */
2904    
2905     if (negate_class)
2906     {
2907     *code++ = OP_NCLASS;
2908 nigel 93 if (lengthptr == NULL) /* Save time in the pre-compile phase */
2909     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2910 nigel 77 }
2911     else
2912     {
2913     *code++ = OP_CLASS;
2914     memcpy(code, classbits, 32);
2915     }
2916     code += 32;
2917     break;
2918    
2919 nigel 93
2920     /* ===================================================================*/
2921 nigel 77 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2922     has been tested above. */
2923    
2924     case '{':
2925     if (!is_quantifier) goto NORMAL_CHAR;
2926     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
2927     if (*errorcodeptr != 0) goto FAILED;
2928     goto REPEAT;
2929    
2930     case '*':
2931     repeat_min = 0;
2932     repeat_max = -1;
2933     goto REPEAT;
2934    
2935     case '+':
2936     repeat_min = 1;
2937     repeat_max = -1;
2938     goto REPEAT;
2939    
2940     case '?':
2941     repeat_min = 0;
2942     repeat_max = 1;
2943    
2944     REPEAT:
2945     if (previous == NULL)
2946     {
2947     *errorcodeptr = ERR9;
2948     goto FAILED;
2949     }
2950    
2951     if (repeat_min == 0)
2952     {
2953     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2954     reqbyte = zeroreqbyte; /* Ditto */
2955     }
2956    
2957     /* Remember whether this is a variable length repeat */
2958    
2959     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2960    
2961     op_type = 0; /* Default single-char op codes */
2962     possessive_quantifier = FALSE; /* Default not possessive quantifier */
2963    
2964     /* Save start of previous item, in case we have to move it up to make space
2965     for an inserted OP_ONCE for the additional '+' extension. */
2966    
2967     tempcode = previous;
2968    
2969     /* If the next character is '+', we have a possessive quantifier. This
2970     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2971     If the next character is '?' this is a minimizing repeat, by default,
2972     but if PCRE_UNGREEDY is set, it works the other way round. We change the
2973     repeat type to the non-default. */
2974    
2975     if (ptr[1] == '+')
2976     {
2977     repeat_type = 0; /* Force greedy */
2978     possessive_quantifier = TRUE;
2979     ptr++;
2980     }
2981     else if (ptr[1] == '?')
2982     {
2983     repeat_type = greedy_non_default;
2984     ptr++;
2985     }
2986     else repeat_type = greedy_default;
2987    
2988     /* If previous was a character match, abolish the item and generate a
2989     repeat item instead. If a char item has a minimum of more than one, ensure
2990     that it is set in reqbyte - it might not be if a sequence such as x{3} is
2991     the first thing in a branch because the x will have gone into firstbyte
2992     instead. */
2993    
2994     if (*previous == OP_CHAR || *previous == OP_CHARNC)
2995     {
2996     /* Deal with UTF-8 characters that take up more than one byte. It's
2997     easier to write this out separately than try to macrify it. Use c to
2998     hold the length of the character in bytes, plus 0x80 to flag that it's a
2999     length rather than a small character. */
3000    
3001     #ifdef SUPPORT_UTF8
3002     if (utf8 && (code[-1] & 0x80) != 0)
3003     {
3004     uschar *lastchar = code - 1;
3005     while((*lastchar & 0xc0) == 0x80) lastchar--;
3006     c = code - lastchar; /* Length of UTF-8 character */
3007     memcpy(utf8_char, lastchar, c); /* Save the char */
3008     c |= 0x80; /* Flag c as a length */
3009     }
3010     else
3011     #endif
3012    
3013     /* Handle the case of a single byte - either with no UTF8 support, or
3014     with UTF-8 disabled, or for a UTF-8 character < 128. */
3015    
3016     {
3017     c = code[-1];
3018     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3019     }
3020    
3021 nigel 93 /* If the repetition is unlimited, it pays to see if the next thing on
3022     the line is something that cannot possibly match this character. If so,
3023     automatically possessifying this item gains some performance in the case
3024     where the match fails. */
3025    
3026     if (!possessive_quantifier &&
3027     repeat_max < 0 &&
3028     check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3029     options, cd))
3030     {
3031     repeat_type = 0; /* Force greedy */
3032     possessive_quantifier = TRUE;
3033     }
3034    
3035 nigel 77 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3036     }
3037    
3038     /* If previous was a single negated character ([^a] or similar), we use
3039     one of the special opcodes, replacing it. The code is shared with single-
3040     character repeats by setting op_type to add a suitable offset into
3041 nigel 93 repeat_type. We can also test for auto-possessification. OP_NOT is
3042     currently used only for single-byte chars. */
3043 nigel 77
3044     else if (*previous == OP_NOT)
3045     {
3046     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3047     c = previous[1];
3048 nigel 93 if (!possessive_quantifier &&
3049     repeat_max < 0 &&
3050     check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3051     {
3052     repeat_type = 0; /* Force greedy */
3053     possessive_quantifier = TRUE;
3054     }
3055 nigel 77 goto OUTPUT_SINGLE_REPEAT;
3056     }
3057    
3058     /* If previous was a character type match (\d or similar), abolish it and
3059     create a suitable repeat item. The code is shared with single-character
3060     repeats by setting op_type to add a suitable offset into repeat_type. Note
3061     that the Unicode property types will be present only when SUPPORT_UCP is
3062     defined, but we don't wrap the little bits of code here because it just
3063     makes it horribly messy. */
3064    
3065     else if (*previous < OP_EODN)
3066     {
3067     uschar *oldcode;
3068 nigel 87 int prop_type, prop_value;
3069 nigel 77 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3070     c = *previous;
3071    
3072 nigel 93 if (!possessive_quantifier &&
3073     repeat_max < 0 &&
3074     check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3075     {
3076     repeat_type = 0; /* Force greedy */
3077     possessive_quantifier = TRUE;
3078     }
3079    
3080 nigel 77 OUTPUT_SINGLE_REPEAT:
3081 nigel 87 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3082     {
3083     prop_type = previous[1];
3084     prop_value = previous[2];
3085     }
3086     else prop_type = prop_value = -1;
3087 nigel 77
3088     oldcode = code;
3089     code = previous; /* Usually overwrite previous item */
3090    
3091     /* If the maximum is zero then the minimum must also be zero; Perl allows
3092     this case, so we do too - by simply omitting the item altogether. */
3093    
3094     if (repeat_max == 0) goto END_REPEAT;
3095    
3096     /* All real repeats make it impossible to handle partial matching (maybe
3097     one day we will be able to remove this restriction). */
3098    
3099     if (repeat_max != 1) cd->nopartial = TRUE;
3100    
3101     /* Combine the op_type with the repeat_type */
3102    
3103     repeat_type += op_type;
3104    
3105     /* A minimum of zero is handled either as the special case * or ?, or as
3106     an UPTO, with the maximum given. */
3107    
3108     if (repeat_min == 0)
3109     {
3110     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3111     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3112     else
3113     {
3114     *code++ = OP_UPTO + repeat_type;
3115     PUT2INC(code, 0, repeat_max);
3116     }
3117     }
3118    
3119     /* A repeat minimum of 1 is optimized into some special cases. If the
3120 nigel 93 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3121 nigel 77 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3122     one less than the maximum. */
3123    
3124     else if (repeat_min == 1)
3125     {
3126     if (repeat_max == -1)
3127     *code++ = OP_PLUS + repeat_type;
3128     else
3129     {
3130     code = oldcode; /* leave previous item in place */
3131     if (repeat_max == 1) goto END_REPEAT;
3132     *code++ = OP_UPTO + repeat_type;
3133     PUT2INC(code, 0, repeat_max - 1);
3134     }
3135     }
3136    
3137     /* The case {n,n} is just an EXACT, while the general case {n,m} is
3138     handled as an EXACT followed by an UPTO. */
3139    
3140     else
3141     {
3142     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3143     PUT2INC(code, 0, repeat_min);
3144    
3145     /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3146     we have to insert the character for the previous code. For a repeated
3147 nigel 87 Unicode property match, there are two extra bytes that define the
3148 nigel 77 required property. In UTF-8 mode, long characters have their length in
3149     c, with the 0x80 bit as a flag. */
3150    
3151     if (repeat_max < 0)
3152     {
3153     #ifdef SUPPORT_UTF8
3154     if (utf8 && c >= 128)
3155     {
3156     memcpy(code, utf8_char, c & 7);
3157     code += c & 7;
3158     }
3159     else
3160     #endif
3161     {
3162     *code++ = c;
3163 nigel 87 if (prop_type >= 0)
3164     {
3165     *code++ = prop_type;
3166     *code++ = prop_value;
3167     }
3168 nigel 77 }
3169     *code++ = OP_STAR + repeat_type;
3170     }
3171    
3172     /* Else insert an UPTO if the max is greater than the min, again
3173 nigel 93 preceded by the character, for the previously inserted code. If the
3174     UPTO is just for 1 instance, we can use QUERY instead. */
3175 nigel 77
3176     else if (repeat_max != repeat_min)
3177     {
3178     #ifdef SUPPORT_UTF8
3179     if (utf8 && c >= 128)
3180     {
3181     memcpy(code, utf8_char, c & 7);
3182     code += c & 7;
3183     }
3184     else
3185     #endif
3186     *code++ = c;
3187 nigel 87 if (prop_type >= 0)
3188     {
3189     *code++ = prop_type;
3190     *code++ = prop_value;
3191     }
3192 nigel 77 repeat_max -= repeat_min;
3193 nigel 93
3194     if (repeat_max == 1)
3195     {
3196     *code++ = OP_QUERY + repeat_type;
3197     }
3198     else
3199     {
3200     *code++ = OP_UPTO + repeat_type;
3201     PUT2INC(code, 0, repeat_max);
3202     }
3203 nigel 77 }
3204     }
3205    
3206     /* The character or character type itself comes last in all cases. */
3207    
3208     #ifdef SUPPORT_UTF8
3209     if (utf8 && c >= 128)
3210     {
3211     memcpy(code, utf8_char, c & 7);
3212     code += c & 7;
3213     }
3214     else
3215     #endif
3216     *code++ = c;
3217    
3218 nigel 87 /* For a repeated Unicode property match, there are two extra bytes that
3219     define the required property. */
3220 nigel 77
3221     #ifdef SUPPORT_UCP
3222 nigel 87 if (prop_type >= 0)
3223     {
3224     *code++ = prop_type;
3225     *code++ = prop_value;
3226     }
3227 nigel 77 #endif
3228     }
3229    
3230     /* If previous was a character class or a back reference, we put the repeat
3231     stuff after it, but just skip the item if the repeat was {0,0}. */
3232    
3233     else if (*previous == OP_CLASS ||
3234     *previous == OP_NCLASS ||
3235     #ifdef SUPPORT_UTF8
3236     *previous == OP_XCLASS ||
3237     #endif
3238     *previous == OP_REF)
3239     {
3240     if (repeat_max == 0)
3241     {
3242     code = previous;
3243     goto END_REPEAT;
3244     }
3245    
3246     /* All real repeats make it impossible to handle partial matching (maybe
3247     one day we will be able to remove this restriction). */
3248    
3249     if (repeat_max != 1) cd->nopartial = TRUE;
3250    
3251     if (repeat_min == 0 && repeat_max == -1)
3252     *code++ = OP_CRSTAR + repeat_type;
3253     else if (repeat_min == 1 && repeat_max == -1)
3254     *code++ = OP_CRPLUS + repeat_type;
3255     else if (repeat_min == 0 && repeat_max == 1)
3256     *code++ = OP_CRQUERY + repeat_type;
3257     else
3258     {
3259     *code++ = OP_CRRANGE + repeat_type;
3260     PUT2INC(code, 0, repeat_min);
3261     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3262     PUT2INC(code, 0, repeat_max);
3263     }
3264     }
3265    
3266     /* If previous was a bracket group, we may have to replicate it in certain
3267     cases. */
3268    
3269 nigel 93 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3270     *previous == OP_ONCE || *previous == OP_COND)
3271 nigel 77 {
3272     register int i;
3273     int ketoffset = 0;
3274     int len = code - previous;
3275     uschar *bralink = NULL;
3276    
3277 nigel 93 /* Repeating a DEFINE group is pointless */
3278    
3279     if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3280     {
3281     *errorcodeptr = ERR55;
3282     goto FAILED;
3283     }
3284    
3285     /* This is a paranoid check to stop integer overflow later on */
3286    
3287     if (len > MAX_DUPLENGTH)
3288     {
3289     *errorcodeptr = ERR50;
3290     goto FAILED;
3291     }
3292    
3293 nigel 77 /* If the maximum repeat count is unlimited, find the end of the bracket
3294     by scanning through from the start, and compute the offset back to it
3295     from the current code pointer. There may be an OP_OPT setting following
3296     the final KET, so we can't find the end just by going back from the code
3297     pointer. */
3298    
3299     if (repeat_max == -1)
3300     {
3301     register uschar *ket = previous;
3302     do ket += GET(ket, 1); while (*ket != OP_KET);
3303     ketoffset = code - ket;
3304     }
3305    
3306     /* The case of a zero minimum is special because of the need to stick
3307     OP_BRAZERO in front of it, and because the group appears once in the
3308     data, whereas in other cases it appears the minimum number of times. For
3309     this reason, it is simplest to treat this case separately, as otherwise
3310     the code gets far too messy. There are several special subcases when the
3311     minimum is zero. */
3312    
3313     if (repeat_min == 0)
3314     {
3315     /* If the maximum is also zero, we just omit the group from the output
3316     altogether. */
3317    
3318     if (repeat_max == 0)
3319     {
3320     code = previous;
3321     goto END_REPEAT;
3322     }
3323    
3324     /* If the maximum is 1 or unlimited, we just have to stick in the
3325     BRAZERO and do no more at this point. However, we do need to adjust
3326     any OP_RECURSE calls inside the group that refer to the group itself or
3327 nigel 93 any internal or forward referenced group, because the offset is from
3328     the start of the whole regex. Temporarily terminate the pattern while
3329     doing this. */
3330 nigel 77
3331     if (repeat_max <= 1)
3332     {
3333     *code = OP_END;
3334 nigel 93 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3335 nigel 77 memmove(previous+1, previous, len);
3336     code++;
3337     *previous++ = OP_BRAZERO + repeat_type;
3338     }
3339    
3340     /* If the maximum is greater than 1 and limited, we have to replicate
3341     in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3342     The first one has to be handled carefully because it's the original
3343     copy, which has to be moved up. The remainder can be handled by code
3344     that is common with the non-zero minimum case below. We have to
3345     adjust the value of repeat_max, since one less copy is required. Once
3346     again, we may have to adjust any OP_RECURSE calls inside the group. */
3347    
3348     else
3349     {
3350     int offset;
3351     *code = OP_END;
3352 nigel 93 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3353 nigel 77 memmove(previous + 2 + LINK_SIZE, previous, len);
3354     code += 2 + LINK_SIZE;
3355     *previous++ = OP_BRAZERO + repeat_type;
3356     *previous++ = OP_BRA;
3357    
3358     /* We chain together the bracket offset fields that have to be
3359     filled in later when the ends of the brackets are reached. */
3360    
3361     offset = (bralink == NULL)? 0 : previous - bralink;
3362     bralink = previous;
3363     PUTINC(previous, 0, offset);
3364     }
3365    
3366     repeat_max--;
3367     }
3368    
3369     /* If the minimum is greater than zero, replicate the group as many
3370     times as necessary, and adjust the maximum to the number of subsequent
3371     copies that we need. If we set a first char from the group, and didn't
3372 nigel 93 set a required char, copy the latter from the former. If there are any
3373     forward reference subroutine calls in the group, there will be entries on
3374     the workspace list; replicate these with an appropriate increment. */
3375 nigel 77
3376     else
3377     {
3378     if (repeat_min > 1)
3379     {
3380 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3381     just adjust the length as if we had. */
3382    
3383     if (lengthptr != NULL)
3384     *lengthptr += (repeat_min - 1)*length_prevgroup;
3385    
3386     /* This is compiling for real */
3387    
3388     else
3389 nigel 77 {
3390 nigel 93 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3391     for (i = 1; i < repeat_min; i++)
3392     {
3393     uschar *hc;
3394     uschar *this_hwm = cd->hwm;
3395     memcpy(code, previous, len);
3396     for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3397     {
3398     PUT(cd->hwm, 0, GET(hc, 0) + len);
3399     cd->hwm += LINK_SIZE;
3400     }
3401     save_hwm = this_hwm;
3402     code += len;
3403     }
3404 nigel 77 }
3405     }
3406 nigel 93
3407 nigel 77 if (repeat_max > 0) repeat_max -= repeat_min;
3408     }
3409    
3410     /* This code is common to both the zero and non-zero minimum cases. If
3411     the maximum is limited, it replicates the group in a nested fashion,
3412     remembering the bracket starts on a stack. In the case of a zero minimum,
3413     the first one was set up above. In all cases the repeat_max now specifies
3414 nigel 93 the number of additional copies needed. Again, we must remember to
3415     replicate entries on the forward reference list. */
3416 nigel 77
3417     if (repeat_max >= 0)
3418     {
3419 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3420     just adjust the length as if we had. For each repetition we must add 1
3421     to the length for BRAZERO and for all but the last repetition we must
3422     add 2 + 2*LINK_SIZE to allow for the nesting that occurs. */
3423    
3424     if (lengthptr != NULL && repeat_max > 0)
3425     *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3426     2 - 2*LINK_SIZE; /* Last one doesn't nest */
3427    
3428     /* This is compiling for real */
3429    
3430     else for (i = repeat_max - 1; i >= 0; i--)
3431 nigel 77 {
3432 nigel 93 uschar *hc;
3433     uschar *this_hwm = cd->hwm;
3434    
3435 nigel 77 *code++ = OP_BRAZERO + repeat_type;
3436    
3437     /* All but the final copy start a new nesting, maintaining the
3438     chain of brackets outstanding. */
3439    
3440     if (i != 0)
3441     {
3442     int offset;
3443     *code++ = OP_BRA;
3444     offset = (bralink == NULL)? 0 : code - bralink;
3445     bralink = code;
3446     PUTINC(code, 0, offset);
3447     }
3448    
3449     memcpy(code, previous, len);
3450 nigel 93 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3451     {
3452     PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3453     cd->hwm += LINK_SIZE;
3454     }
3455     save_hwm = this_hwm;
3456 nigel 77 code += len;
3457     }
3458    
3459     /* Now chain through the pending brackets, and fill in their length
3460     fields (which are holding the chain links pro tem). */
3461    
3462     while (bralink != NULL)
3463     {
3464     int oldlinkoffset;
3465     int offset = code - bralink + 1;
3466     uschar *bra = code - offset;
3467     oldlinkoffset = GET(bra, 1);
3468     bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3469     *code++ = OP_KET;
3470     PUTINC(code, 0, offset);
3471     PUT(bra, 1, offset);
3472     }
3473     }
3474    
3475     /* If the maximum is unlimited, set a repeater in the final copy. We
3476     can't just offset backwards from the current code point, because we
3477     don't know if there's been an options resetting after the ket. The
3478 nigel 93 correct offset was computed above.
3479 nigel 77
3480 nigel 93 Then, when we are doing the actual compile phase, check to see whether
3481     this group is a non-atomic one that could match an empty string. If so,
3482     convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3483     that runtime checking can be done. [This check is also applied to
3484     atomic groups at runtime, but in a different way.] */
3485    
3486     else
3487     {
3488     uschar *ketcode = code - ketoffset;
3489     uschar *bracode = ketcode - GET(ketcode, 1);
3490     *ketcode = OP_KETRMAX + repeat_type;
3491     if (lengthptr == NULL && *bracode != OP_ONCE)
3492     {
3493     uschar *scode = bracode;
3494     do
3495     {
3496     if (could_be_empty_branch(scode, ketcode, utf8))
3497     {
3498     *bracode += OP_SBRA - OP_BRA;
3499     break;
3500     }
3501     scode += GET(scode, 1);
3502     }
3503     while (*scode == OP_ALT);
3504     }
3505     }
3506 nigel 77 }
3507    
3508     /* Else there's some kind of shambles */
3509    
3510     else
3511     {
3512     *errorcodeptr = ERR11;
3513     goto FAILED;
3514     }
3515    
3516 nigel 93 /* If the character following a repeat is '+', or if certain optimization
3517     tests above succeeded, possessive_quantifier is TRUE. For some of the
3518     simpler opcodes, there is a special alternative opcode for this. For
3519     anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3520     The '+' notation is just syntactic sugar, taken from Sun's Java package,
3521     but the special opcodes can optimize it a bit. The repeated item starts at
3522     tempcode, not at previous, which might be the first part of a string whose
3523     (former) last char we repeated.
3524 nigel 77
3525 nigel 93 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3526     an 'upto' may follow. We skip over an 'exact' item, and then test the
3527     length of what remains before proceeding. */
3528    
3529 nigel 77 if (possessive_quantifier)
3530     {
3531 nigel 93 int len;
3532     if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3533     *tempcode == OP_NOTEXACT)
3534     tempcode += _pcre_OP_lengths[*tempcode];
3535     len = code - tempcode;
3536     if (len > 0) switch (*tempcode)
3537     {
3538     case OP_STAR: *tempcode = OP_POSSTAR; break;
3539     case OP_PLUS: *tempcode = OP_POSPLUS; break;
3540     case OP_QUERY: *tempcode = OP_POSQUERY; break;
3541     case OP_UPTO: *tempcode = OP_POSUPTO; break;
3542    
3543     case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
3544     case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
3545     case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3546     case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
3547    
3548     case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
3549     case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
3550     case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3551     case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
3552    
3553     default:
3554     memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3555     code += 1 + LINK_SIZE;
3556     len += 1 + LINK_SIZE;
3557     tempcode[0] = OP_ONCE;
3558     *code++ = OP_KET;
3559     PUTINC(code, 0, len);
3560     PUT(tempcode, 1, len);
3561     break;
3562     }
3563 nigel 77 }
3564    
3565     /* In all cases we no longer have a previous item. We also set the
3566     "follows varying string" flag for subsequently encountered reqbytes if
3567     it isn't already set and we have just passed a varying length item. */
3568    
3569     END_REPEAT:
3570     previous = NULL;
3571     cd->req_varyopt |= reqvary;
3572     break;
3573    
3574    
3575 nigel 93 /* ===================================================================*/
3576     /* Start of nested parenthesized sub-expression, or comment or lookahead or
3577     lookbehind or option setting or condition or all the other extended
3578     parenthesis forms. First deal with the specials; all are introduced by ?,
3579     and the appearance of any of them means that this is not a capturing
3580     group. */
3581 nigel 77
3582     case '(':
3583     newoptions = options;
3584     skipbytes = 0;
3585 nigel 93 bravalue = OP_CBRA;
3586     save_hwm = cd->hwm;
3587 nigel 77
3588     if (*(++ptr) == '?')
3589     {
3590 nigel 93 int i, set, unset, namelen;
3591 nigel 77 int *optset;
3592 nigel 93 const uschar *name;
3593     uschar *slot;
3594 nigel 77
3595     switch (*(++ptr))
3596     {
3597     case '#': /* Comment; skip to ket */
3598     ptr++;
3599 nigel 93 while (*ptr != 0 && *ptr != ')') ptr++;
3600     if (*ptr == 0)
3601     {
3602     *errorcodeptr = ERR18;
3603     goto FAILED;
3604     }
3605 nigel 77 continue;
3606    
3607 nigel 93
3608     /* ------------------------------------------------------------ */
3609     case ':': /* Non-capturing bracket */
3610 nigel 77 bravalue = OP_BRA;
3611     ptr++;
3612     break;
3613    
3614 nigel 93
3615     /* ------------------------------------------------------------ */
3616 nigel 77 case '(':
3617     bravalue = OP_COND; /* Conditional group */
3618    
3619 nigel 93 /* A condition can be an assertion, a number (referring to a numbered
3620     group), a name (referring to a named group), or 'R', referring to
3621     recursion. R<digits> and R&name are also permitted for recursion tests.
3622 nigel 77
3623 nigel 93 There are several syntaxes for testing a named group: (?(name)) is used
3624     by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3625    
3626     There are two unfortunate ambiguities, caused by history. (a) 'R' can
3627     be the recursive thing or the name 'R' (and similarly for 'R' followed
3628     by digits), and (b) a number could be a name that consists of digits.
3629     In both cases, we look for a name first; if not found, we try the other
3630     cases. */
3631    
3632     /* For conditions that are assertions, check the syntax, and then exit
3633     the switch. This will take control down to where bracketed groups,
3634     including assertions, are processed. */
3635    
3636     if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3637     break;
3638    
3639     /* Most other conditions use OP_CREF (a couple change to OP_RREF
3640     below), and all need to skip 3 bytes at the start of the group. */
3641    
3642     code[1+LINK_SIZE] = OP_CREF;
3643     skipbytes = 3;
3644 ph10 172 refsign = -1;
3645 nigel 93
3646     /* Check for a test for recursion in a named group. */
3647    
3648     if (ptr[1] == 'R' && ptr[2] == '&')
3649 nigel 77 {
3650 nigel 93 terminator = -1;
3651     ptr += 2;
3652     code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
3653     }
3654 nigel 91
3655 nigel 93 /* Check for a test for a named group's having been set, using the Perl
3656     syntax (?(<name>) or (?('name') */
3657 nigel 91
3658 nigel 93 else if (ptr[1] == '<')
3659     {
3660     terminator = '>';
3661     ptr++;
3662     }
3663     else if (ptr[1] == '\'')
3664     {
3665     terminator = '\'';
3666     ptr++;
3667     }
3668 ph10 172 else
3669 ph10 167 {
3670     terminator = 0;
3671 ph10 172 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
3672     }
3673 nigel 77
3674 nigel 93 /* We now expect to read a name; anything else is an error */
3675 nigel 77
3676 nigel 93 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
3677     {
3678     ptr += 1; /* To get the right offset */
3679     *errorcodeptr = ERR28;
3680     goto FAILED;
3681     }
3682    
3683     /* Read the name, but also get it as a number if it's all digits */
3684    
3685     recno = 0;
3686     name = ++ptr;
3687     while ((cd->ctypes[*ptr] & ctype_word) != 0)
3688     {
3689     if (recno >= 0)
3690     recno = ((digitab[*ptr] & ctype_digit) != 0)?
3691     recno * 10 + *ptr - '0' : -1;
3692 nigel 91 ptr++;
3693 nigel 93 }
3694     namelen = ptr - name;
3695 nigel 91
3696 nigel 93 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
3697     {
3698     ptr--; /* Error offset */
3699     *errorcodeptr = ERR26;
3700     goto FAILED;
3701     }
3702 nigel 91
3703 nigel 93 /* Do no further checking in the pre-compile phase. */
3704 nigel 91
3705 nigel 93 if (lengthptr != NULL) break;
3706 nigel 91
3707 nigel 93 /* In the real compile we do the work of looking for the actual
3708 ph10 167 reference. If the string started with "+" or "-" we require the rest to
3709     be digits, in which case recno will be set. */
3710 ph10 172
3711 ph10 167 if (refsign > 0)
3712     {
3713     if (recno <= 0)
3714     {
3715     *errorcodeptr = ERR58;
3716     goto FAILED;
3717 ph10 172 }
3718 ph10 167 if (refsign == '-')
3719     {
3720 ph10 172 recno = cd->bracount - recno + 1;
3721 ph10 167 if (recno <= 0)
3722     {
3723     *errorcodeptr = ERR15;
3724     goto FAILED;
3725 ph10 172 }
3726 ph10 167 }
3727 ph10 172 else recno += cd->bracount;
3728 ph10 167 PUT2(code, 2+LINK_SIZE, recno);
3729     break;
3730 ph10 172 }
3731 nigel 91
3732 ph10 167 /* Otherwise (did not start with "+" or "-"), start by looking for the
3733     name. */
3734 ph10 172
3735 nigel 93 slot = cd->name_table;
3736     for (i = 0; i < cd->names_found; i++)
3737     {
3738     if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3739     slot += cd->name_entry_size;
3740     }
3741 nigel 91
3742 nigel 93 /* Found a previous named subpattern */
3743 nigel 91
3744 nigel 93 if (i < cd->names_found)
3745     {
3746     recno = GET2(slot, 0);
3747     PUT2(code, 2+LINK_SIZE, recno);
3748     }
3749 nigel 91
3750 nigel 93 /* Search the pattern for a forward reference */
3751 nigel 91
3752 nigel 93 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
3753     (options & PCRE_EXTENDED) != 0)) > 0)
3754     {
3755     PUT2(code, 2+LINK_SIZE, i);
3756     }
3757 nigel 91
3758 nigel 93 /* If terminator == 0 it means that the name followed directly after
3759     the opening parenthesis [e.g. (?(abc)...] and in this case there are
3760     some further alternatives to try. For the cases where terminator != 0
3761     [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
3762     now checked all the possibilities, so give an error. */
3763 nigel 91
3764 nigel 93 else if (terminator != 0)
3765     {
3766     *errorcodeptr = ERR15;
3767     goto FAILED;
3768     }
3769    
3770     /* Check for (?(R) for recursion. Allow digits after R to specify a
3771     specific group number. */
3772    
3773     else if (*name == 'R')
3774     {
3775     recno = 0;
3776     for (i = 1; i < namelen; i++)
3777 nigel 91 {
3778 nigel 93 if ((digitab[name[i]] & ctype_digit) == 0)
3779     {
3780     *errorcodeptr = ERR15;
3781     goto FAILED;
3782     }
3783     recno = recno * 10 + name[i] - '0';
3784 nigel 77 }
3785 nigel 93 if (recno == 0) recno = RREF_ANY;
3786     code[1+LINK_SIZE] = OP_RREF; /* Change test type */
3787     PUT2(code, 2+LINK_SIZE, recno);
3788 nigel 77 }
3789 nigel 91
3790 nigel 93 /* Similarly, check for the (?(DEFINE) "condition", which is always
3791     false. */
3792 nigel 91
3793 nigel 93 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
3794     {
3795     code[1+LINK_SIZE] = OP_DEF;
3796     skipbytes = 1;
3797     }
3798    
3799     /* Check for the "name" actually being a subpattern number. */
3800    
3801     else if (recno > 0)
3802     {
3803     PUT2(code, 2+LINK_SIZE, recno);
3804     }
3805    
3806     /* Either an unidentified subpattern, or a reference to (?(0) */
3807    
3808     else
3809     {
3810     *errorcodeptr = (recno == 0)? ERR35: ERR15;
3811     goto FAILED;
3812     }
3813 nigel 77 break;
3814    
3815 nigel 93
3816     /* ------------------------------------------------------------ */
3817 nigel 77 case '=': /* Positive lookahead */
3818     bravalue = OP_ASSERT;
3819     ptr++;
3820     break;
3821    
3822 nigel 93
3823     /* ------------------------------------------------------------ */
3824 nigel 77 case '!': /* Negative lookahead */
3825     bravalue = OP_ASSERT_NOT;
3826     ptr++;
3827     break;
3828    
3829 nigel 93
3830     /* ------------------------------------------------------------ */
3831     case '<': /* Lookbehind or named define */
3832     switch (ptr[1])
3833 nigel 77 {
3834     case '=': /* Positive lookbehind */
3835     bravalue = OP_ASSERTBACK;
3836 nigel 93 ptr += 2;
3837 nigel 77 break;
3838    
3839     case '!': /* Negative lookbehind */
3840     bravalue = OP_ASSERTBACK_NOT;
3841 nigel 93 ptr += 2;
3842 nigel 77 break;
3843 nigel 93
3844     default: /* Could be name define, else bad */
3845     if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
3846     ptr++; /* Correct offset for error */
3847     *errorcodeptr = ERR24;
3848     goto FAILED;
3849 nigel 77 }
3850     break;
3851    
3852 nigel 93
3853     /* ------------------------------------------------------------ */
3854 nigel 77 case '>': /* One-time brackets */
3855     bravalue = OP_ONCE;
3856     ptr++;
3857     break;
3858    
3859 nigel 93
3860     /* ------------------------------------------------------------ */
3861 nigel 77 case 'C': /* Callout - may be followed by digits; */
3862     previous_callout = code; /* Save for later completion */
3863     after_manual_callout = 1; /* Skip one item before completing */
3864 nigel 93 *code++ = OP_CALLOUT;
3865     {
3866 nigel 77 int n = 0;
3867     while ((digitab[*(++ptr)] & ctype_digit) != 0)
3868     n = n * 10 + *ptr - '0';
3869 nigel 93 if (*ptr != ')')
3870     {
3871     *errorcodeptr = ERR39;
3872     goto FAILED;
3873     }
3874 nigel 77 if (n > 255)
3875     {
3876     *errorcodeptr = ERR38;
3877     goto FAILED;
3878     }
3879     *code++ = n;
3880     PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
3881     PUT(code, LINK_SIZE, 0); /* Default length */
3882     code += 2 * LINK_SIZE;
3883     }
3884     previous = NULL;
3885     continue;
3886    
3887 nigel 93
3888     /* ------------------------------------------------------------ */
3889     case 'P': /* Python-style named subpattern handling */
3890     if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
3891 nigel 77 {
3892 nigel 93 is_recurse = *ptr == '>';
3893     terminator = ')';
3894     goto NAMED_REF_OR_RECURSE;
3895     }
3896     else if (*ptr != '<') /* Test for Python-style definition */
3897     {
3898     *errorcodeptr = ERR41;
3899     goto FAILED;
3900     }
3901     /* Fall through to handle (?P< as (?< is handled */
3902 nigel 77
3903    
3904 nigel 93 /* ------------------------------------------------------------ */
3905     DEFINE_NAME: /* Come here from (?< handling */
3906     case '\'':
3907     {
3908     terminator = (*ptr == '<')? '>' : '\'';
3909     name = ++ptr;
3910    
3911     while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3912     namelen = ptr - name;
3913    
3914     /* In the pre-compile phase, just do a syntax check. */
3915    
3916     if (lengthptr != NULL)
3917 nigel 77 {
3918 nigel 93 if (*ptr != terminator)
3919 nigel 77 {
3920 nigel 93 *errorcodeptr = ERR42;
3921     goto FAILED;
3922     }
3923     if (cd->names_found >= MAX_NAME_COUNT)
3924     {
3925     *errorcodeptr = ERR49;
3926     goto FAILED;
3927     }
3928     if (namelen + 3 > cd->name_entry_size)
3929     {
3930     cd->name_entry_size = namelen + 3;
3931     if (namelen > MAX_NAME_SIZE)
3932 nigel 77 {
3933 nigel 93 *errorcodeptr = ERR48;
3934     goto FAILED;
3935     }
3936     }
3937     }
3938    
3939     /* In the real compile, create the entry in the table */
3940    
3941     else
3942     {
3943     slot = cd->name_table;
3944     for (i = 0; i < cd->names_found; i++)
3945     {
3946     int crc = memcmp(name, slot+2, namelen);
3947     if (crc == 0)
3948     {
3949     if (slot[2+namelen] == 0)
3950 nigel 91 {
3951 nigel 93 if ((options & PCRE_DUPNAMES) == 0)
3952     {
3953     *errorcodeptr = ERR43;
3954     goto FAILED;
3955     }
3956 nigel 91 }
3957 nigel 93 else crc = -1; /* Current name is substring */
3958 nigel 77 }
3959 nigel 93 if (crc < 0)
3960     {
3961     memmove(slot + cd->name_entry_size, slot,
3962     (cd->names_found - i) * cd->name_entry_size);
3963     break;
3964     }
3965     slot += cd->name_entry_size;
3966 nigel 77 }
3967 nigel 93
3968     PUT2(slot, 0, cd->bracount + 1);
3969     memcpy(slot + 2, name, namelen);
3970     slot[2+namelen] = 0;
3971 nigel 77 }
3972     }
3973    
3974 nigel 93 /* In both cases, count the number of names we've encountered. */
3975    
3976     ptr++; /* Move past > or ' */
3977     cd->names_found++;
3978     goto NUMBERED_GROUP;
3979    
3980    
3981     /* ------------------------------------------------------------ */
3982     case '&': /* Perl recursion/subroutine syntax */
3983     terminator = ')';
3984     is_recurse = TRUE;
3985     /* Fall through */
3986    
3987     /* We come here from the Python syntax above that handles both
3988     references (?P=name) and recursion (?P>name), as well as falling
3989     through from the Perl recursion syntax (?&name). */
3990    
3991     NAMED_REF_OR_RECURSE:
3992     name = ++ptr;
3993     while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3994     namelen = ptr - name;
3995    
3996     /* In the pre-compile phase, do a syntax check and set a dummy
3997     reference number. */
3998    
3999     if (lengthptr != NULL)
4000 nigel 77 {
4001 nigel 93 if (*ptr != terminator)
4002     {
4003     *errorcodeptr = ERR42;
4004     goto FAILED;
4005     }
4006     if (namelen > MAX_NAME_SIZE)
4007     {
4008     *errorcodeptr = ERR48;
4009     goto FAILED;
4010     }
4011     recno = 0;
4012     }
4013 nigel 77
4014 nigel 93 /* In the real compile, seek the name in the table */
4015 nigel 77
4016 nigel 93 else
4017     {
4018     slot = cd->name_table;
4019 nigel 77 for (i = 0; i < cd->names_found; i++)
4020     {
4021     if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4022     slot += cd->name_entry_size;
4023     }
4024 nigel 91
4025     if (i < cd->names_found) /* Back reference */
4026 nigel 77 {
4027 nigel 91 recno = GET2(slot, 0);
4028     }
4029     else if ((recno = /* Forward back reference */
4030 nigel 93 find_parens(ptr, cd->bracount, name, namelen,
4031     (options & PCRE_EXTENDED) != 0)) <= 0)
4032 nigel 91 {
4033 nigel 77 *errorcodeptr = ERR15;
4034     goto FAILED;
4035     }
4036 nigel 93 }
4037 nigel 77
4038 nigel 93 /* In both phases, we can now go to the code that handles numerical
4039     recursion or backreferences. */
4040 nigel 77
4041 nigel 93 if (is_recurse) goto HANDLE_RECURSION;
4042     else goto HANDLE_REFERENCE;
4043 nigel 77
4044    
4045 nigel 93 /* ------------------------------------------------------------ */
4046     case 'R': /* Recursion */
4047 nigel 77 ptr++; /* Same as (?0) */
4048     /* Fall through */
4049    
4050    
4051 nigel 93 /* ------------------------------------------------------------ */
4052 ph10 166 case '-': case '+':
4053 nigel 93 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4054     case '5': case '6': case '7': case '8': case '9': /* subroutine */
4055 nigel 77 {
4056     const uschar *called;
4057 ph10 166
4058 ph10 167 if ((refsign = *ptr) == '+') ptr++;
4059 ph10 172 else if (refsign == '-')
4060 ph10 166 {
4061     if ((digitab[ptr[1]] & ctype_digit) == 0)
4062     goto OTHER_CHAR_AFTER_QUERY;
4063 ph10 172 ptr++;
4064     }
4065    
4066 nigel 77 recno = 0;
4067     while((digitab[*ptr] & ctype_digit) != 0)
4068     recno = recno * 10 + *ptr++ - '0';
4069 ph10 166
4070 nigel 93 if (*ptr != ')')
4071     {
4072     *errorcodeptr = ERR29;
4073     goto FAILED;
4074     }
4075 ph10 172
4076 ph10 167 if (refsign == '-')
4077 ph10 166 {
4078     if (recno == 0)
4079     {
4080     *errorcodeptr = ERR58;
4081     goto FAILED;
4082 ph10 172 }
4083     recno = cd->bracount - recno + 1;
4084 ph10 166 if (recno <= 0)
4085     {
4086     *errorcodeptr = ERR15;
4087     goto FAILED;
4088 ph10 172 }
4089 ph10 166 }
4090 ph10 167 else if (refsign == '+')
4091 ph10 166 {
4092     if (recno == 0)
4093     {
4094     *errorcodeptr = ERR58;
4095     goto FAILED;
4096 ph10 172 }
4097     recno += cd->bracount;
4098     }
4099 nigel 77
4100     /* Come here from code above that handles a named recursion */
4101    
4102     HANDLE_RECURSION:
4103    
4104     previous = code;
4105 nigel 93 called = cd->start_code;
4106 nigel 77
4107 nigel 93 /* When we are actually compiling, find the bracket that is being
4108     referenced. Temporarily end the regex in case it doesn't exist before
4109     this point. If we end up with a forward reference, first check that
4110     the bracket does occur later so we can give the error (and position)
4111     now. Then remember this forward reference in the workspace so it can
4112     be filled in at the end. */
4113 nigel 77
4114 nigel 93 if (lengthptr == NULL)
4115 nigel 77 {
4116 nigel 93 *code = OP_END;
4117     if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4118 nigel 77
4119 nigel 93 /* Forward reference */
4120 nigel 77
4121 nigel 93 if (called == NULL)
4122     {
4123     if (find_parens(ptr, cd->bracount, NULL, recno,
4124     (options & PCRE_EXTENDED) != 0) < 0)
4125     {
4126     *errorcodeptr = ERR15;
4127     goto FAILED;
4128     }
4129     called = cd->start_code + recno;
4130     PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4131     }
4132    
4133     /* If not a forward reference, and the subpattern is still open,
4134     this is a recursive call. We check to see if this is a left
4135     recursion that could loop for ever, and diagnose that case. */
4136    
4137     else if (GET(called, 1) == 0 &&
4138     could_be_empty(called, code, bcptr, utf8))
4139     {
4140     *errorcodeptr = ERR40;
4141     goto FAILED;
4142     }
4143 nigel 77 }
4144    
4145 nigel 87 /* Insert the recursion/subroutine item, automatically wrapped inside
4146 nigel 93 "once" brackets. Set up a "previous group" length so that a
4147     subsequent quantifier will work. */