/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 170 - (hide annotations) (download)
Mon Jun 4 11:21:13 2007 UTC (6 years, 10 months ago) by ph10
File MIME type: text/plain
File size: 174621 byte(s)
Fix bug in detecting potentially empty groups.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 117 Copyright (c) 1997-2007 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
/* Short aliases defined ahead of the pcre_internal.h include that follows.
NOTE(review): presumably macros in that header expand to these names - confirm
against the header. */

#define NLBLOCK  cd             /* Block containing newline information */
#define PSSTART  start_pattern  /* Field containing processed string start */
#define PSEND    end_pattern    /* Field containing processed string end */
48    
49    
#include "pcre_internal.h"

#include <limits.h>
51    
52    
53 nigel 85 /* When DEBUG is defined, we need the pcre_printint() function, which is also
54     used by pcretest. DEBUG is not defined when building a production library. */
55    
56     #ifdef DEBUG
57     #include "pcre_printint.src"
58     #endif
59    
60    
61 nigel 77 /*************************************************
62     * Code parameters and static tables *
63     *************************************************/
64    
/* Size of the stack workspace used by both compile phases.

Phase 1 (the pre-compile that works out how much memory is needed) partly
compiles the regex into this space. Compiled parts are thrown away as early as
possible, so an overrun is not expected, but the code checks for one anyway.
The largest usage observed so far is 218, which makes this value very
generous.

Phase 2 (the real compile) reuses the same space to remember forward
references to groups so that they can be filled in at the end. Each entry in
that list takes LINK_SIZE bytes, so there is plenty of room even when LINK_SIZE
is 4. */

#define COMPILE_WORK_SIZE (4096)
78 nigel 77
79 nigel 93
80 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
81     are simple data values; negative values are for special things like \d and so
82     on. Zero means further processing is needed (for things like \x), or the escape
83     is invalid. */
84    
85 ph10 97 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
86 nigel 77 static const short int escapes[] = {
87     0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
88     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
89     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
90 ph10 168 0, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
91 nigel 93 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
92 nigel 77 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
93     '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
94 nigel 93 0, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
95 nigel 77 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
96     0, 0, -ESC_z /* x - z */
97     };
98    
99 ph10 97 #else /* This is the "abnormal" table for EBCDIC systems */
100 nigel 77 static const short int escapes[] = {
101     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
102     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
103     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
104     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
105     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
106     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
107     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
108     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
109     /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
110 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
111 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
112     /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
113     /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
114     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
115     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
116     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
117     /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
118     /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
119 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
120 nigel 77 /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
121     /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
122     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
123     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
124     };
125     #endif
126    
127    
128     /* Tables of names of POSIX character classes and their lengths. The list is
129 nigel 87 terminated by a zero length entry. The first three must be alpha, lower, upper,
130 nigel 77 as this is assumed for handling case independence. */
131    
132     static const char *const posix_names[] = {
133     "alpha", "lower", "upper",
134     "alnum", "ascii", "blank", "cntrl", "digit", "graph",
135     "print", "punct", "space", "word", "xdigit" };
136    
137     static const uschar posix_name_lengths[] = {
138     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
139    
140 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
141     base map, with an optional addition or removal of another map. Then, for some
142     classes, there is some additional tweaking: for [:blank:] the vertical space
143     characters are removed, and for [:alpha:] and [:alnum:] the underscore
144     character is removed. The triples in the table consist of the base map offset,
145     second map offset or -1 if no second map, and a non-negative value for map
146     addition or a negative value for map subtraction (if there are two maps). The
147     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
148     remove vertical space characters, 2 => remove underscore. */
149 nigel 77
150     static const int posix_class_maps[] = {
151 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
152     cbit_lower, -1, 0, /* lower */
153     cbit_upper, -1, 0, /* upper */
154     cbit_word, -1, 2, /* alnum - word without underscore */
155     cbit_print, cbit_cntrl, 0, /* ascii */
156     cbit_space, -1, 1, /* blank - a GNU extension */
157     cbit_cntrl, -1, 0, /* cntrl */
158     cbit_digit, -1, 0, /* digit */
159     cbit_graph, -1, 0, /* graph */
160     cbit_print, -1, 0, /* print */
161     cbit_punct, -1, 0, /* punct */
162     cbit_space, -1, 0, /* space */
163     cbit_word, -1, 0, /* word - a Perl extension */
164     cbit_xdigit,-1, 0 /* xdigit */
165 nigel 77 };
166    
167    
/* Two-level stringification: STRING() turns its argument into a string
literal exactly as written, while XSTRING() macro-expands the argument first
and then stringifies the result. */

#define STRING(arg)   #arg
#define XSTRING(arg)  STRING(arg)
170    
171 nigel 77 /* The texts of compile-time error messages. These are "char *" because they
172 nigel 93 are passed to the outside world. Do not ever re-use any error number, because
173     they are documented. Always add a new error instead. Messages marked DEAD below
174     are no longer used. */
175 nigel 77
176     static const char *error_texts[] = {
177     "no error",
178     "\\ at end of pattern",
179     "\\c at end of pattern",
180     "unrecognized character follows \\",
181     "numbers out of order in {} quantifier",
182     /* 5 */
183     "number too big in {} quantifier",
184     "missing terminating ] for character class",
185     "invalid escape sequence in character class",
186     "range out of order in character class",
187     "nothing to repeat",
188     /* 10 */
189 nigel 93 "operand of unlimited repeat could match the empty string", /** DEAD **/
190 nigel 77 "internal error: unexpected repeat",
191     "unrecognized character after (?",
192     "POSIX named classes are supported only within a class",
193     "missing )",
194     /* 15 */
195     "reference to non-existent subpattern",
196     "erroffset passed as NULL",
197     "unknown option bit(s) set",
198     "missing ) after comment",
199 nigel 93 "parentheses nested too deeply", /** DEAD **/
200 nigel 77 /* 20 */
201     "regular expression too large",
202     "failed to get memory",
203     "unmatched parentheses",
204     "internal error: code overflow",
205     "unrecognized character after (?<",
206     /* 25 */
207     "lookbehind assertion is not fixed length",
208 nigel 91 "malformed number or name after (?(",
209 nigel 77 "conditional group contains more than two branches",
210     "assertion expected after (?(",
211 ph10 166 "(?R or (?[+-]digits must be followed by )",
212 nigel 77 /* 30 */
213     "unknown POSIX class name",
214     "POSIX collating elements are not supported",
215     "this version of PCRE is not compiled with PCRE_UTF8 support",
216 nigel 93 "spare error", /** DEAD **/
217 nigel 77 "character value in \\x{...} sequence is too large",
218     /* 35 */
219     "invalid condition (?(0)",
220     "\\C not allowed in lookbehind assertion",
221     "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
222     "number after (?C is > 255",
223     "closing ) for (?C expected",
224     /* 40 */
225     "recursive call could loop indefinitely",
226     "unrecognized character after (?P",
227 nigel 93 "syntax error in subpattern name (missing terminator)",
228 nigel 91 "two named subpatterns have the same name",
229 nigel 77 "invalid UTF-8 string",
230     /* 45 */
231     "support for \\P, \\p, and \\X has not been compiled",
232     "malformed \\P or \\p sequence",
233 nigel 91 "unknown property name after \\P or \\p",
234 nigel 93 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
235     "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
236 nigel 91 /* 50 */
237     "repeated subpattern is too long",
238 nigel 93 "octal value is greater than \\377 (not in UTF-8 mode)",
239     "internal error: overran compiling workspace",
240     "internal error: previously-checked referenced subpattern not found",
241     "DEFINE group contains more than one branch",
242     /* 55 */
243     "repeating a DEFINE group is not allowed",
244     "inconsistent NEWLINE options",
245 ph10 166 "\\g is not followed by an (optionally braced) non-zero number",
246 ph10 167 "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number"
247 nigel 77 };
248    
249    
/* Private table that identifies decimal and hexadecimal digits while a
pattern is being compiled. The tables in chartables depend on the locale and
may flag arbitrary characters as digits, but the compiling code expects only
0-9, a-z, and A-Z to act as digits, hence this separate table. It costs 256
bytes but is a lot faster than character value tests (at least in some simple
cases that were timed), and some applications want PCRE to compile efficiently
as well as match efficiently.

The bit values are the same as those used in chartables:

  0x04  decimal digit
  0x08  hexadecimal digit

so that ctype_digit and ctype_xdigit can be used in the code. A decimal digit
therefore holds 0x0c, and the letters a-f / A-F hold 0x08. */

#ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */

#else           /* This is the "abnormal" case, for EBCDIC systems */
static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */

/* EBCDIC-only table, described by its author as a partial duplicate of
chartable. check_escape() masks an entry with 0x0E to decide whether a
character is alphameric. NOTE(review): the meaning of the individual bits is
not defined in this part of the file - confirm against chartables. */

static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
  0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
  0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
  0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
  0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
#endif
372    
373    
374     /* Definition to allow mutual recursion */
375    
376     static BOOL
377 nigel 93 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, int, int *,
378     int *, branch_chain *, compile_data *, int *);
379 nigel 77
380    
381    
382     /*************************************************
383     * Handle escapes *
384     *************************************************/
385    
386     /* This function is called when a \ has been encountered. It either returns a
387     positive value for a simple escape such as \n, or a negative value which
388 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
389     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
390     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
391     ptr is pointing at the \. On exit, it is on the final character of the escape
392     sequence.
393 nigel 77
394     Arguments:
395     ptrptr points to the pattern position pointer
396     errorcodeptr points to the errorcode variable
397     bracount number of previous extracting brackets
398     options the options bits
399     isclass TRUE if inside a character class
400    
401     Returns: zero or positive => a data character
402     negative => a special escape sequence
403     on error, errorptr is set
404     */
405    
406     static int
407     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
408     int options, BOOL isclass)
409     {
410 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
411     const uschar *ptr = *ptrptr + 1;
412 nigel 77 int c, i;
413    
414 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
415     ptr--; /* Set pointer back to the last byte */
416    
417 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
418    
419     if (c == 0) *errorcodeptr = ERR1;
420    
421     /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
422     a table. A non-zero result is something that can be returned immediately.
423     Otherwise further processing may be required. */
424    
425 ph10 97 #ifndef EBCDIC /* ASCII coding */
426 nigel 77 else if (c < '0' || c > 'z') {} /* Not alphameric */
427     else if ((i = escapes[c - '0']) != 0) c = i;
428    
429 ph10 97 #else /* EBCDIC coding */
430 nigel 77 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
431     else if ((i = escapes[c - 0x48]) != 0) c = i;
432     #endif
433    
434     /* Escapes that need further processing, or are illegal. */
435    
436     else
437     {
438     const uschar *oldptr;
439 nigel 93 BOOL braced, negated;
440    
441 nigel 77 switch (c)
442     {
443     /* A number of Perl escapes are not handled by PCRE. We give an explicit
444     error. */
445    
446     case 'l':
447     case 'L':
448     case 'N':
449     case 'u':
450     case 'U':
451     *errorcodeptr = ERR37;
452     break;
453    
454 nigel 93 /* \g must be followed by a number, either plain or braced. If positive, it
455     is an absolute backreference. If negative, it is a relative backreference.
456     This is a Perl 5.10 feature. */
457    
458     case 'g':
459     if (ptr[1] == '{')
460     {
461     braced = TRUE;
462     ptr++;
463     }
464     else braced = FALSE;
465    
466     if (ptr[1] == '-')
467     {
468     negated = TRUE;
469     ptr++;
470     }
471     else negated = FALSE;
472    
473     c = 0;
474     while ((digitab[ptr[1]] & ctype_digit) != 0)
475     c = c * 10 + *(++ptr) - '0';
476    
477     if (c == 0 || (braced && *(++ptr) != '}'))
478     {
479     *errorcodeptr = ERR57;
480     return 0;
481     }
482    
483     if (negated)
484     {
485     if (c > bracount)
486     {
487     *errorcodeptr = ERR15;
488     return 0;
489     }
490     c = bracount - (c - 1);
491     }
492    
493     c = -(ESC_REF + c);
494     break;
495    
496 nigel 77 /* The handling of escape sequences consisting of a string of digits
497     starting with one that is not zero is not straightforward. By experiment,
498     the way Perl works seems to be as follows:
499    
500     Outside a character class, the digits are read as a decimal number. If the
501     number is less than 10, or if there are that many previous extracting
502     left brackets, then it is a back reference. Otherwise, up to three octal
503     digits are read to form an escaped byte. Thus \123 is likely to be octal
504     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
505     value is greater than 377, the least significant 8 bits are taken. Inside a
506     character class, \ followed by a digit is always an octal number. */
507    
508     case '1': case '2': case '3': case '4': case '5':
509     case '6': case '7': case '8': case '9':
510    
511     if (!isclass)
512     {
513     oldptr = ptr;
514     c -= '0';
515     while ((digitab[ptr[1]] & ctype_digit) != 0)
516     c = c * 10 + *(++ptr) - '0';
517     if (c < 10 || c <= bracount)
518     {
519     c = -(ESC_REF + c);
520     break;
521     }
522     ptr = oldptr; /* Put the pointer back and fall through */
523     }
524    
525     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
526     generates a binary zero byte and treats the digit as a following literal.
527     Thus we have to pull back the pointer by one. */
528    
529     if ((c = *ptr) >= '8')
530     {
531     ptr--;
532     c = 0;
533     break;
534     }
535    
536     /* \0 always starts an octal number, but we may drop through to here with a
537 nigel 91 larger first octal digit. The original code used just to take the least
538     significant 8 bits of octal numbers (I think this is what early Perls used
539     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
540     than 3 octal digits. */
541 nigel 77
542     case '0':
543     c -= '0';
544     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
545     c = c * 8 + *(++ptr) - '0';
546 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
547 nigel 77 break;
548    
549 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
550     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
551     treated as a data character. */
552 nigel 77
553     case 'x':
554 nigel 87 if (ptr[1] == '{')
555 nigel 77 {
556     const uschar *pt = ptr + 2;
557 nigel 87 int count = 0;
558    
559 nigel 77 c = 0;
560     while ((digitab[*pt] & ctype_xdigit) != 0)
561     {
562 nigel 87 register int cc = *pt++;
563     if (c == 0 && cc == '0') continue; /* Leading zeroes */
564 nigel 77 count++;
565 nigel 87
566 ph10 97 #ifndef EBCDIC /* ASCII coding */
567 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
568 nigel 87 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
569 ph10 97 #else /* EBCDIC coding */
570 nigel 77 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
571 nigel 87 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
572 nigel 77 #endif
573     }
574 nigel 87
575 nigel 77 if (*pt == '}')
576     {
577 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
578 nigel 77 ptr = pt;
579     break;
580     }
581 nigel 87
582 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
583     recognize this construct; fall through to the normal \x handling. */
584     }
585    
586 nigel 87 /* Read just a single-byte hex-defined char */
587 nigel 77
588     c = 0;
589     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
590     {
591     int cc; /* Some compilers don't like ++ */
592     cc = *(++ptr); /* in initializers */
593 ph10 97 #ifndef EBCDIC /* ASCII coding */
594 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
595     c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
596 ph10 97 #else /* EBCDIC coding */
597 nigel 77 if (cc <= 'z') cc += 64; /* Convert to upper case */
598     c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
599     #endif
600     }
601     break;
602    
603 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
604     This coding is ASCII-specific, but then the whole concept of \cx is
605     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
606 nigel 77
607     case 'c':
608     c = *(++ptr);
609     if (c == 0)
610     {
611     *errorcodeptr = ERR2;
612     return 0;
613     }
614    
615 ph10 97 #ifndef EBCDIC /* ASCII coding */
616 nigel 77 if (c >= 'a' && c <= 'z') c -= 32;
617     c ^= 0x40;
618 ph10 97 #else /* EBCDIC coding */
619 nigel 77 if (c >= 'a' && c <= 'z') c += 64;
620     c ^= 0xC0;
621     #endif
622     break;
623    
624     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
625     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
626     for Perl compatibility, it is a literal. This code looks a bit odd, but
627     there used to be some cases other than the default, and there may be again
628     in future, so I haven't "optimized" it. */
629    
630     default:
631     if ((options & PCRE_EXTRA) != 0) switch(c)
632     {
633     default:
634     *errorcodeptr = ERR3;
635     break;
636     }
637     break;
638     }
639     }
640    
641     *ptrptr = ptr;
642     return c;
643     }
644    
645    
646    
647     #ifdef SUPPORT_UCP
648     /*************************************************
649     * Handle \P and \p *
650     *************************************************/
651    
652     /* This function is called after \P or \p has been encountered, provided that
653     PCRE is compiled with support for Unicode properties. On entry, ptrptr is
654     pointing at the P or p. On exit, it is pointing at the final character of the
655     escape sequence.
656    
657     Argument:
658     ptrptr points to the pattern position pointer
659     negptr points to a boolean that is set TRUE for negation else FALSE
660 nigel 87 dptr points to an int that is set to the detailed property value
661 nigel 77 errorcodeptr points to the error code variable
662    
663 nigel 87 Returns: type value from ucp_type_table, or -1 for an invalid type
664 nigel 77 */
665    
666     static int
667 nigel 87 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
668 nigel 77 {
669     int c, i, bot, top;
670     const uschar *ptr = *ptrptr;
671 nigel 87 char name[32];
672 nigel 77
673     c = *(++ptr);
674     if (c == 0) goto ERROR_RETURN;
675    
676     *negptr = FALSE;
677    
678 nigel 87 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
679     negation. */
680 nigel 77
681     if (c == '{')
682     {
683     if (ptr[1] == '^')
684     {
685     *negptr = TRUE;
686     ptr++;
687     }
688 nigel 87 for (i = 0; i < sizeof(name) - 1; i++)
689 nigel 77 {
690     c = *(++ptr);
691     if (c == 0) goto ERROR_RETURN;
692     if (c == '}') break;
693     name[i] = c;
694     }
695 nigel 87 if (c !='}') goto ERROR_RETURN;
696 nigel 77 name[i] = 0;
697     }
698    
699     /* Otherwise there is just one following character */
700    
701     else
702     {
703     name[0] = c;
704     name[1] = 0;
705     }
706    
707     *ptrptr = ptr;
708    
709     /* Search for a recognized property name using binary chop */
710    
711     bot = 0;
712     top = _pcre_utt_size;
713    
714     while (bot < top)
715     {
716 nigel 87 i = (bot + top) >> 1;
717 nigel 77 c = strcmp(name, _pcre_utt[i].name);
718 nigel 87 if (c == 0)
719     {
720     *dptr = _pcre_utt[i].value;
721     return _pcre_utt[i].type;
722     }
723 nigel 77 if (c > 0) bot = i + 1; else top = i;
724     }
725    
726     *errorcodeptr = ERR47;
727     *ptrptr = ptr;
728     return -1;
729    
730     ERROR_RETURN:
731     *errorcodeptr = ERR46;
732     *ptrptr = ptr;
733     return -1;
734     }
735     #endif
736    
737    
738    
739    
740     /*************************************************
741     * Check for counted repeat *
742     *************************************************/
743    
744     /* This function is called when a '{' is encountered in a place where it might
745     start a quantifier. It looks ahead to see if it really is a quantifier or not.
746     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
747     where the ddds are digits.
748    
749     Arguments:
750     p pointer to the first char after '{'
751    
752     Returns: TRUE or FALSE
753     */
754    
755     static BOOL
756     is_counted_repeat(const uschar *p)
757     {
758     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
759     while ((digitab[*p] & ctype_digit) != 0) p++;
760     if (*p == '}') return TRUE;
761    
762     if (*p++ != ',') return FALSE;
763     if (*p == '}') return TRUE;
764    
765     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
766     while ((digitab[*p] & ctype_digit) != 0) p++;
767    
768     return (*p == '}');
769     }
770    
771    
772    
773     /*************************************************
774     * Read repeat counts *
775     *************************************************/
776    
777     /* Read an item of the form {n,m} and return the values. This is called only
778     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
779     so the syntax is guaranteed to be correct, but we need to check the values.
780    
781     Arguments:
782     p pointer to first char after '{'
783     minp pointer to int for min
784     maxp pointer to int for max
785     returned as -1 if no max
786     errorcodeptr points to error code variable
787    
788     Returns: pointer to '}' on success;
789     current ptr on error, with errorcodeptr set non-zero
790     */
791    
792     static const uschar *
793     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
794     {
795     int min = 0;
796     int max = -1;
797    
798 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
799     an integer overflow. */
800    
801 nigel 77 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
802 nigel 81 if (min < 0 || min > 65535)
803     {
804     *errorcodeptr = ERR5;
805     return p;
806     }
807 nigel 77
808 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
809     Also, max must not be less than min. */
810    
811 nigel 77 if (*p == '}') max = min; else
812     {
813     if (*(++p) != '}')
814     {
815     max = 0;
816     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
817 nigel 81 if (max < 0 || max > 65535)
818     {
819     *errorcodeptr = ERR5;
820     return p;
821     }
822 nigel 77 if (max < min)
823     {
824     *errorcodeptr = ERR4;
825     return p;
826     }
827     }
828     }
829    
830 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
831     '}'. */
832 nigel 77
833 nigel 81 *minp = min;
834     *maxp = max;
835 nigel 77 return p;
836     }
837    
838    
839    
840     /*************************************************
841 nigel 93 * Find forward referenced subpattern *
842 nigel 91 *************************************************/
843    
844 nigel 93 /* This function scans along a pattern's text looking for capturing
845     subpatterns, and counting them. If it finds a named pattern that matches the
846     name it is given, it returns its number. Alternatively, if the name is NULL, it
847     returns when it reaches a given numbered subpattern. This is used for forward
848     references to subpatterns. We know that if (?P< is encountered, the name will
849     be terminated by '>' because that is checked in the first pass.
850 nigel 91
851     Arguments:
852 nigel 93 ptr current position in the pattern
853     count current count of capturing parens so far encountered
854     name name to seek, or NULL if seeking a numbered subpattern
855     lorn name length, or subpattern number if name is NULL
856     xmode TRUE if we are in /x mode
857 nigel 91
858     Returns: the number of the named subpattern, or -1 if not found
859     */
860    
861     static int
862 nigel 93 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
863     BOOL xmode)
864 nigel 91 {
865     const uschar *thisname;
866 nigel 93
867 nigel 91 for (; *ptr != 0; ptr++)
868     {
869 nigel 93 int term;
870    
871     /* Skip over backslashed characters and also entire \Q...\E */
872    
873     if (*ptr == '\\')
874     {
875     if (*(++ptr) == 0) return -1;
876     if (*ptr == 'Q') for (;;)
877     {
878     while (*(++ptr) != 0 && *ptr != '\\');
879     if (*ptr == 0) return -1;
880     if (*(++ptr) == 'E') break;
881     }
882     continue;
883     }
884    
885     /* Skip over character classes */
886    
887     if (*ptr == '[')
888     {
889     while (*(++ptr) != ']')
890     {
891     if (*ptr == '\\')
892     {
893     if (*(++ptr) == 0) return -1;
894     if (*ptr == 'Q') for (;;)
895     {
896     while (*(++ptr) != 0 && *ptr != '\\');
897     if (*ptr == 0) return -1;
898     if (*(++ptr) == 'E') break;
899     }
900     continue;
901     }
902     }
903     continue;
904     }
905    
906     /* Skip comments in /x mode */
907    
908     if (xmode && *ptr == '#')
909     {
910     while (*(++ptr) != 0 && *ptr != '\n');
911     if (*ptr == 0) return -1;
912     continue;
913     }
914    
915     /* An opening parens must now be a real metacharacter */
916    
917 nigel 91 if (*ptr != '(') continue;
918 nigel 93 if (ptr[1] != '?')
919     {
920     count++;
921     if (name == NULL && count == lorn) return count;
922     continue;
923     }
924    
925     ptr += 2;
926     if (*ptr == 'P') ptr++; /* Allow optional P */
927    
928     /* We have to disambiguate (?<! and (?<= from (?<name> */
929    
930     if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
931     *ptr != '\'')
932     continue;
933    
934 nigel 91 count++;
935 nigel 93
936     if (name == NULL && count == lorn) return count;
937     term = *ptr++;
938     if (term == '<') term = '>';
939 nigel 91 thisname = ptr;
940 nigel 93 while (*ptr != term) ptr++;
941     if (name != NULL && lorn == ptr - thisname &&
942     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
943 nigel 91 return count;
944     }
945 nigel 93
946 nigel 91 return -1;
947     }
948    
949    
950    
951     /*************************************************
952 nigel 77 * Find first significant op code *
953     *************************************************/
954    
955     /* This is called by several functions that scan a compiled expression looking
956     for a fixed first character, or an anchoring op code etc. It skips over things
957     that do not influence this. For some calls, a change of option is important.
958     For some calls, it makes sense to skip negative forward and all backward
959     assertions, and also the \b assertion; for others it does not.
960    
961     Arguments:
962     code pointer to the start of the group
963     options pointer to external options
964     optbit the option bit whose changing is significant, or
965     zero if none are
966     skipassert TRUE if certain assertions are to be skipped
967    
968     Returns: pointer to the first significant opcode
969     */
970    
971     static const uschar*
972     first_significant_code(const uschar *code, int *options, int optbit,
973     BOOL skipassert)
974     {
975     for (;;)
976     {
977     switch ((int)*code)
978     {
979     case OP_OPT:
980     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
981     *options = (int)code[1];
982     code += 2;
983     break;
984    
985     case OP_ASSERT_NOT:
986     case OP_ASSERTBACK:
987     case OP_ASSERTBACK_NOT:
988     if (!skipassert) return code;
989     do code += GET(code, 1); while (*code == OP_ALT);
990     code += _pcre_OP_lengths[*code];
991     break;
992    
993     case OP_WORD_BOUNDARY:
994     case OP_NOT_WORD_BOUNDARY:
995     if (!skipassert) return code;
996     /* Fall through */
997    
998     case OP_CALLOUT:
999     case OP_CREF:
1000 nigel 93 case OP_RREF:
1001     case OP_DEF:
1002 nigel 77 code += _pcre_OP_lengths[*code];
1003     break;
1004    
1005     default:
1006     return code;
1007     }
1008     }
1009     /* Control never reaches here */
1010     }
1011    
1012    
1013    
1014    
1015     /*************************************************
1016     * Find the fixed length of a pattern *
1017     *************************************************/
1018    
1019     /* Scan a pattern and compute the fixed length of subject that will match it,
1020     if the length is fixed. This is needed for dealing with backward assertions.
1021     In UTF8 mode, the result is in characters rather than bytes.
1022    
1023     Arguments:
1024     code points to the start of the pattern (the bracket)
1025     options the compiling options
1026    
1027     Returns: the fixed length, or -1 if there is no fixed length,
1028     or -2 if \C was encountered
1029     */
1030    
1031     static int
1032     find_fixedlength(uschar *code, int options)
1033     {
1034     int length = -1;
1035    
1036     register int branchlength = 0;
1037     register uschar *cc = code + 1 + LINK_SIZE;
1038    
1039     /* Scan along the opcodes for this branch. If we get to the end of the
1040     branch, check the length against that of the other branches. */
1041    
1042     for (;;)
1043     {
1044     int d;
1045     register int op = *cc;
1046    
1047     switch (op)
1048     {
1049 nigel 93 case OP_CBRA:
1050 nigel 77 case OP_BRA:
1051     case OP_ONCE:
1052     case OP_COND:
1053 nigel 93 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1054 nigel 77 if (d < 0) return d;
1055     branchlength += d;
1056     do cc += GET(cc, 1); while (*cc == OP_ALT);
1057     cc += 1 + LINK_SIZE;
1058     break;
1059    
1060     /* Reached end of a branch; if it's a ket it is the end of a nested
1061     call. If it's ALT it is an alternation in a nested call. If it is
1062     END it's the end of the outer call. All can be handled by the same code. */
1063    
1064     case OP_ALT:
1065     case OP_KET:
1066     case OP_KETRMAX:
1067     case OP_KETRMIN:
1068     case OP_END:
1069     if (length < 0) length = branchlength;
1070     else if (length != branchlength) return -1;
1071     if (*cc != OP_ALT) return length;
1072     cc += 1 + LINK_SIZE;
1073     branchlength = 0;
1074     break;
1075    
1076     /* Skip over assertive subpatterns */
1077    
1078     case OP_ASSERT:
1079     case OP_ASSERT_NOT:
1080     case OP_ASSERTBACK:
1081     case OP_ASSERTBACK_NOT:
1082     do cc += GET(cc, 1); while (*cc == OP_ALT);
1083     /* Fall through */
1084    
1085     /* Skip over things that don't match chars */
1086    
1087     case OP_REVERSE:
1088     case OP_CREF:
1089 nigel 93 case OP_RREF:
1090     case OP_DEF:
1091 nigel 77 case OP_OPT:
1092     case OP_CALLOUT:
1093     case OP_SOD:
1094     case OP_SOM:
1095     case OP_EOD:
1096     case OP_EODN:
1097     case OP_CIRC:
1098     case OP_DOLL:
1099     case OP_NOT_WORD_BOUNDARY:
1100     case OP_WORD_BOUNDARY:
1101     cc += _pcre_OP_lengths[*cc];
1102     break;
1103    
1104     /* Handle literal characters */
1105    
1106     case OP_CHAR:
1107     case OP_CHARNC:
1108 nigel 91 case OP_NOT:
1109 nigel 77 branchlength++;
1110     cc += 2;
1111     #ifdef SUPPORT_UTF8
1112     if ((options & PCRE_UTF8) != 0)
1113     {
1114     while ((*cc & 0xc0) == 0x80) cc++;
1115     }
1116     #endif
1117     break;
1118    
1119     /* Handle exact repetitions. The count is already in characters, but we
1120     need to skip over a multibyte character in UTF8 mode. */
1121    
1122     case OP_EXACT:
1123     branchlength += GET2(cc,1);
1124     cc += 4;
1125     #ifdef SUPPORT_UTF8
1126     if ((options & PCRE_UTF8) != 0)
1127     {
1128     while((*cc & 0x80) == 0x80) cc++;
1129     }
1130     #endif
1131     break;
1132    
1133     case OP_TYPEEXACT:
1134     branchlength += GET2(cc,1);
1135     cc += 4;
1136     break;
1137    
1138     /* Handle single-char matchers */
1139    
1140     case OP_PROP:
1141     case OP_NOTPROP:
1142 nigel 87 cc += 2;
1143 nigel 77 /* Fall through */
1144    
1145     case OP_NOT_DIGIT:
1146     case OP_DIGIT:
1147     case OP_NOT_WHITESPACE:
1148     case OP_WHITESPACE:
1149     case OP_NOT_WORDCHAR:
1150     case OP_WORDCHAR:
1151     case OP_ANY:
1152     branchlength++;
1153     cc++;
1154     break;
1155    
1156     /* The single-byte matcher isn't allowed */
1157    
1158     case OP_ANYBYTE:
1159     return -2;
1160    
1161     /* Check a class for variable quantification */
1162    
1163     #ifdef SUPPORT_UTF8
1164     case OP_XCLASS:
1165     cc += GET(cc, 1) - 33;
1166     /* Fall through */
1167     #endif
1168    
1169     case OP_CLASS:
1170     case OP_NCLASS:
1171     cc += 33;
1172    
1173     switch (*cc)
1174     {
1175     case OP_CRSTAR:
1176     case OP_CRMINSTAR:
1177     case OP_CRQUERY:
1178     case OP_CRMINQUERY:
1179     return -1;
1180    
1181     case OP_CRRANGE:
1182     case OP_CRMINRANGE:
1183     if (GET2(cc,1) != GET2(cc,3)) return -1;
1184     branchlength += GET2(cc,1);
1185     cc += 5;
1186     break;
1187    
1188     default:
1189     branchlength++;
1190     }
1191     break;
1192    
1193     /* Anything else is variable length */
1194    
1195     default:
1196     return -1;
1197     }
1198     }
1199     /* Control never gets here */
1200     }
1201    
1202    
1203    
1204    
1205     /*************************************************
1206     * Scan compiled regex for numbered bracket *
1207     *************************************************/
1208    
1209     /* This little function scans through a compiled pattern until it finds a
1210     capturing bracket with the given number.
1211    
1212     Arguments:
1213     code points to start of expression
1214     utf8 TRUE in UTF-8 mode
1215     number the required bracket number
1216    
1217     Returns: pointer to the opcode for the bracket, or NULL if not found
1218     */
1219    
1220     static const uschar *
1221     find_bracket(const uschar *code, BOOL utf8, int number)
1222     {
1223     for (;;)
1224     {
1225     register int c = *code;
1226     if (c == OP_END) return NULL;
1227 nigel 91
1228     /* XCLASS is used for classes that cannot be represented just by a bit
1229     map. This includes negated single high-valued characters. The length in
1230     the table is zero; the actual length is stored in the compiled code. */
1231    
1232     if (c == OP_XCLASS) code += GET(code, 1);
1233    
1234 nigel 93 /* Handle capturing bracket */
1235 nigel 91
1236 nigel 93 else if (c == OP_CBRA)
1237 nigel 77 {
1238 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1239 nigel 77 if (n == number) return (uschar *)code;
1240 nigel 93 code += _pcre_OP_lengths[c];
1241 nigel 77 }
1242 nigel 91
1243 nigel 93 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1244     a multi-byte character. The length in the table is a minimum, so we have to
1245     arrange to skip the extra bytes. */
1246 nigel 91
1247 nigel 77 else
1248     {
1249     code += _pcre_OP_lengths[c];
1250 ph10 107 #ifdef SUPPORT_UTF8
1251 nigel 77 if (utf8) switch(c)
1252     {
1253     case OP_CHAR:
1254     case OP_CHARNC:
1255     case OP_EXACT:
1256     case OP_UPTO:
1257     case OP_MINUPTO:
1258 nigel 93 case OP_POSUPTO:
1259 nigel 77 case OP_STAR:
1260     case OP_MINSTAR:
1261 nigel 93 case OP_POSSTAR:
1262 nigel 77 case OP_PLUS:
1263     case OP_MINPLUS:
1264 nigel 93 case OP_POSPLUS:
1265 nigel 77 case OP_QUERY:
1266     case OP_MINQUERY:
1267 nigel 93 case OP_POSQUERY:
1268     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1269 nigel 77 break;
1270     }
1271 ph10 111 #endif
1272 nigel 77 }
1273     }
1274     }
1275    
1276    
1277    
1278     /*************************************************
1279     * Scan compiled regex for recursion reference *
1280     *************************************************/
1281    
1282     /* This little function scans through a compiled pattern until it finds an
1283     instance of OP_RECURSE.
1284    
1285     Arguments:
1286     code points to start of expression
1287     utf8 TRUE in UTF-8 mode
1288    
1289     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1290     */
1291    
1292     static const uschar *
1293     find_recurse(const uschar *code, BOOL utf8)
1294     {
1295     for (;;)
1296     {
1297     register int c = *code;
1298     if (c == OP_END) return NULL;
1299 nigel 91 if (c == OP_RECURSE) return code;
1300    
1301     /* XCLASS is used for classes that cannot be represented just by a bit
1302     map. This includes negated single high-valued characters. The length in
1303     the table is zero; the actual length is stored in the compiled code. */
1304    
1305     if (c == OP_XCLASS) code += GET(code, 1);
1306    
1307     /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1308     that are followed by a character may be followed by a multi-byte character.
1309 nigel 93 The length in the table is a minimum, so we have to arrange to skip the extra
1310     bytes. */
1311 nigel 91
1312 nigel 77 else
1313     {
1314     code += _pcre_OP_lengths[c];
1315 ph10 107 #ifdef SUPPORT_UTF8
1316 nigel 77 if (utf8) switch(c)
1317     {
1318     case OP_CHAR:
1319     case OP_CHARNC:
1320     case OP_EXACT:
1321     case OP_UPTO:
1322     case OP_MINUPTO:
1323 nigel 93 case OP_POSUPTO:
1324 nigel 77 case OP_STAR:
1325     case OP_MINSTAR:
1326 nigel 93 case OP_POSSTAR:
1327 nigel 77 case OP_PLUS:
1328     case OP_MINPLUS:
1329 nigel 93 case OP_POSPLUS:
1330 nigel 77 case OP_QUERY:
1331     case OP_MINQUERY:
1332 nigel 93 case OP_POSQUERY:
1333     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1334 nigel 77 break;
1335     }
1336 ph10 111 #endif
1337 nigel 77 }
1338     }
1339     }
1340    
1341    
1342    
1343     /*************************************************
1344     * Scan compiled branch for non-emptiness *
1345     *************************************************/
1346    
1347     /* This function scans through a branch of a compiled pattern to see whether it
1348 nigel 93 can match the empty string or not. It is called from could_be_empty()
1349     below and from compile_branch() when checking for an unlimited repeat of a
1350     group that can match nothing. Note that first_significant_code() skips over
1351     assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1352     struck an inner bracket whose current branch will already have been scanned.
1353 nigel 77
1354     Arguments:
1355     code points to start of search
1356     endcode points to where to stop
1357     utf8 TRUE if in UTF8 mode
1358    
1359     Returns: TRUE if what is matched could be empty
1360     */
1361    
1362     static BOOL
1363     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1364     {
1365     register int c;
1366 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1367 nigel 77 code < endcode;
1368     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1369     {
1370     const uschar *ccode;
1371    
1372     c = *code;
1373 ph10 170
1374     /* Groups with zero repeats can of course be empty; skip them. */
1375 nigel 77
1376 ph10 170 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1377     {
1378     do code += GET(code, 1); while (*code == OP_ALT);
1379     c = *code;
1380     continue;
1381     }
1382    
1383     /* For other groups, scan the branches. */
1384    
1385 nigel 93 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1386 nigel 77 {
1387     BOOL empty_branch;
1388     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1389    
1390     /* Scan a closed bracket */
1391    
1392     empty_branch = FALSE;
1393     do
1394     {
1395     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1396     empty_branch = TRUE;
1397     code += GET(code, 1);
1398     }
1399     while (*code == OP_ALT);
1400     if (!empty_branch) return FALSE; /* All branches are non-empty */
1401 ph10 170 c = *code;
1402 nigel 93 continue;
1403 nigel 77 }
1404    
1405 nigel 93 /* Handle the other opcodes */
1406    
1407     switch (c)
1408 nigel 77 {
1409     /* Check for quantifiers after a class */
1410    
1411     #ifdef SUPPORT_UTF8
1412     case OP_XCLASS:
1413     ccode = code + GET(code, 1);
1414     goto CHECK_CLASS_REPEAT;
1415     #endif
1416    
1417     case OP_CLASS:
1418     case OP_NCLASS:
1419     ccode = code + 33;
1420    
1421     #ifdef SUPPORT_UTF8
1422     CHECK_CLASS_REPEAT:
1423     #endif
1424    
1425     switch (*ccode)
1426     {
1427     case OP_CRSTAR: /* These could be empty; continue */
1428     case OP_CRMINSTAR:
1429     case OP_CRQUERY:
1430     case OP_CRMINQUERY:
1431     break;
1432    
1433     default: /* Non-repeat => class must match */
1434     case OP_CRPLUS: /* These repeats aren't empty */
1435     case OP_CRMINPLUS:
1436     return FALSE;
1437    
1438     case OP_CRRANGE:
1439     case OP_CRMINRANGE:
1440     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1441     break;
1442     }
1443     break;
1444    
1445     /* Opcodes that must match a character */
1446    
1447     case OP_PROP:
1448     case OP_NOTPROP:
1449     case OP_EXTUNI:
1450     case OP_NOT_DIGIT:
1451     case OP_DIGIT:
1452     case OP_NOT_WHITESPACE:
1453     case OP_WHITESPACE:
1454     case OP_NOT_WORDCHAR:
1455     case OP_WORDCHAR:
1456     case OP_ANY:
1457     case OP_ANYBYTE:
1458     case OP_CHAR:
1459     case OP_CHARNC:
1460     case OP_NOT:
1461     case OP_PLUS:
1462     case OP_MINPLUS:
1463 nigel 93 case OP_POSPLUS:
1464 nigel 77 case OP_EXACT:
1465     case OP_NOTPLUS:
1466     case OP_NOTMINPLUS:
1467 nigel 93 case OP_NOTPOSPLUS:
1468 nigel 77 case OP_NOTEXACT:
1469     case OP_TYPEPLUS:
1470     case OP_TYPEMINPLUS:
1471 nigel 93 case OP_TYPEPOSPLUS:
1472 nigel 77 case OP_TYPEEXACT:
1473     return FALSE;
1474    
1475     /* End of branch */
1476    
1477     case OP_KET:
1478     case OP_KETRMAX:
1479     case OP_KETRMIN:
1480     case OP_ALT:
1481     return TRUE;
1482    
1483 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1484     MINUPTO, and POSUPTO may be followed by a multibyte character */
1485 nigel 77
1486     #ifdef SUPPORT_UTF8
1487     case OP_STAR:
1488     case OP_MINSTAR:
1489 nigel 93 case OP_POSSTAR:
1490 nigel 77 case OP_QUERY:
1491     case OP_MINQUERY:
1492 nigel 93 case OP_POSQUERY:
1493 nigel 77 case OP_UPTO:
1494     case OP_MINUPTO:
1495 nigel 93 case OP_POSUPTO:
1496 nigel 77 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1497     break;
1498     #endif
1499     }
1500     }
1501    
1502     return TRUE;
1503     }
1504    
1505    
1506    
1507     /*************************************************
1508     * Scan compiled regex for non-emptiness *
1509     *************************************************/
1510    
1511     /* This function is called to check for left recursive calls. We want to check
1512     the current branch of the current pattern to see if it could match the empty
1513     string. If it could, we must look outwards for branches at other levels,
1514     stopping when we pass beyond the bracket which is the subject of the recursion.
1515    
1516     Arguments:
1517     code points to start of the recursion
1518     endcode points to where to stop (current RECURSE item)
1519     bcptr points to the chain of current (unclosed) branch starts
1520     utf8 TRUE if in UTF-8 mode
1521    
1522     Returns: TRUE if what is matched could be empty
1523     */
1524    
1525     static BOOL
1526     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1527     BOOL utf8)
1528     {
1529     while (bcptr != NULL && bcptr->current >= code)
1530     {
1531     if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1532     bcptr = bcptr->outer;
1533     }
1534     return TRUE;
1535     }
1536    
1537    
1538    
1539     /*************************************************
1540     * Check for POSIX class syntax *
1541     *************************************************/
1542    
1543     /* This function is called when the sequence "[:" or "[." or "[=" is
1544     encountered in a character class. It checks whether this is followed by an
1545     optional ^ and then a sequence of letters, terminated by a matching ":]" or
1546     ".]" or "=]".
1547    
1548     Argument:
1549     ptr pointer to the initial [
1550     endptr where to return the end pointer
1551     cd pointer to compile data
1552    
1553     Returns: TRUE or FALSE
1554     */
1555    
1556     static BOOL
1557     check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1558     {
1559     int terminator; /* Don't combine these lines; the Solaris cc */
1560     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1561     if (*(++ptr) == '^') ptr++;
1562     while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1563     if (*ptr == terminator && ptr[1] == ']')
1564     {
1565     *endptr = ptr;
1566     return TRUE;
1567     }
1568     return FALSE;
1569     }
1570    
1571    
1572    
1573    
1574     /*************************************************
1575     * Check POSIX class name *
1576     *************************************************/
1577    
1578     /* This function is called to check the name given in a POSIX-style class entry
1579     such as [:alnum:].
1580    
1581     Arguments:
1582     ptr points to the first letter
1583     len the length of the name
1584    
1585     Returns: a value representing the name, or -1 if unknown
1586     */
1587    
1588     static int
1589     check_posix_name(const uschar *ptr, int len)
1590     {
1591     register int yield = 0;
1592     while (posix_name_lengths[yield] != 0)
1593     {
1594     if (len == posix_name_lengths[yield] &&
1595     strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1596     yield++;
1597     }
1598     return -1;
1599     }
1600    
1601    
1602     /*************************************************
1603     * Adjust OP_RECURSE items in repeated group *
1604     *************************************************/
1605    
1606     /* OP_RECURSE items contain an offset from the start of the regex to the group
1607     that is referenced. This means that groups can be replicated for fixed
1608     repetition simply by copying (because the recursion is allowed to refer to
1609     earlier groups that are outside the current group). However, when a group is
1610     optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1611     it, after it has been compiled. This means that any OP_RECURSE items within it
1612     that refer to the group itself or any contained groups have to have their
1613 nigel 93 offsets adjusted. That is one of the jobs of this function. Before it is called,
1614     the partially compiled regex must be temporarily terminated with OP_END.
1615 nigel 77
1616 nigel 93 This function has been extended with the possibility of forward references for
1617     recursions and subroutine calls. It must also check the list of such references
1618     for the group we are dealing with. If it finds that one of the recursions in
1619     the current group is on this list, it adjusts the offset in the list, not the
1620     value in the reference (which is a group number).
1621    
1622 nigel 77 Arguments:
1623     group points to the start of the group
1624     adjust the amount by which the group is to be moved
1625     utf8 TRUE in UTF-8 mode
1626     cd contains pointers to tables etc.
1627 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
1628 nigel 77
1629     Returns: nothing
1630     */
1631    
1632     static void
1633 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1634     uschar *save_hwm)
1635 nigel 77 {
1636     uschar *ptr = group;
1637     while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1638     {
1639 nigel 93 int offset;
1640     uschar *hc;
1641    
1642     /* See if this recursion is on the forward reference list. If so, adjust the
1643     reference. */
1644    
1645     for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1646     {
1647     offset = GET(hc, 0);
1648     if (cd->start_code + offset == ptr + 1)
1649     {
1650     PUT(hc, 0, offset + adjust);
1651     break;
1652     }
1653     }
1654    
1655     /* Otherwise, adjust the recursion offset if it's after the start of this
1656     group. */
1657    
1658     if (hc >= cd->hwm)
1659     {
1660     offset = GET(ptr, 1);
1661     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1662     }
1663    
1664 nigel 77 ptr += 1 + LINK_SIZE;
1665     }
1666     }
1667    
1668    
1669    
1670     /*************************************************
1671     * Insert an automatic callout point *
1672     *************************************************/
1673    
1674     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1675     callout points before each pattern item.
1676    
1677     Arguments:
1678     code current code pointer
1679     ptr current pattern pointer
1680     cd pointers to tables etc
1681    
1682     Returns: new code pointer
1683     */
1684    
1685     static uschar *
1686     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1687     {
1688     *code++ = OP_CALLOUT;
1689     *code++ = 255;
1690     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1691     PUT(code, LINK_SIZE, 0); /* Default length */
1692     return code + 2*LINK_SIZE;
1693     }
1694    
1695    
1696    
1697     /*************************************************
1698     * Complete a callout item *
1699     *************************************************/
1700    
1701     /* A callout item contains the length of the next item in the pattern, which
1702     we can't fill in till after we have reached the relevant point. This is used
1703     for both automatic and manual callouts.
1704    
1705     Arguments:
1706     previous_callout points to previous callout item
1707     ptr current pattern pointer
1708     cd pointers to tables etc
1709    
1710     Returns: nothing
1711     */
1712    
1713     static void
1714     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1715     {
1716     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1717     PUT(previous_callout, 2 + LINK_SIZE, length);
1718     }
1719    
1720    
1721    
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* Given the start and end of a class range (UTF-8 mode with UCP support), this
function scans upwards through the characters looking for a run whose "other
case" values are consecutive. Each call hands back one such run and advances
the caller's start value, so that repeated calls enumerate every run.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int ch = *cptr;
unsigned int first_other = 0;
unsigned int expected;

/* Look for the first character in the range that has an other case. */

while (ch <= d)
  {
  first_other = _pcre_ucp_othercase(ch);
  if (first_other != NOTACHAR) break;
  ch++;
  }

if (ch > d) return FALSE;      /* Nothing (more) in the range has one */

/* Extend the run for as long as successive characters map to successive
other-case values. */

*ocptr = first_other;
expected = first_other + 1;

for (ch++; ch <= d && _pcre_ucp_othercase(ch) == expected; ch++)
  expected++;

*odptr = expected - 1;
*cptr = ch;                    /* Where the next call should resume */

return TRUE;
}
#endif  /* SUPPORT_UCP */
1767    
1768    
1769 nigel 93
1770 nigel 77 /*************************************************
1771 nigel 93 * Check if auto-possessifying is possible *
1772     *************************************************/
1773    
1774     /* This function is called for unlimited repeats of certain items, to see
1775     whether the next thing could possibly match the repeated item. If not, it makes
1776     sense to automatically possessify the repeated item.
1777    
1778     Arguments:
1779     op_code the repeated op code
1780     this data for this item, depends on the opcode
1781     utf8 TRUE in UTF-8 mode
1782     utf8_char used for utf8 character bytes, NULL if not relevant
1783     ptr next character in pattern
1784     options options bits
1785     cd contains pointers to tables etc.
1786    
1787     Returns: TRUE if possessifying is wanted
1788     */
1789    
1790     static BOOL
1791     check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1792     const uschar *ptr, int options, compile_data *cd)
1793     {
1794     int next;
1795    
1796     /* Skip whitespace and comments in extended mode */
1797    
1798     if ((options & PCRE_EXTENDED) != 0)
1799     {
1800     for (;;)
1801     {
1802     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1803     if (*ptr == '#')
1804     {
1805     while (*(++ptr) != 0)
1806     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1807     }
1808     else break;
1809     }
1810     }
1811    
1812     /* If the next item is one that we can handle, get its value. A non-negative
1813     value is a character, a negative value is an escape value. */
1814    
1815     if (*ptr == '\\')
1816     {
1817     int temperrorcode = 0;
1818     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1819     if (temperrorcode != 0) return FALSE;
1820     ptr++; /* Point after the escape sequence */
1821     }
1822    
1823     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1824     {
1825     #ifdef SUPPORT_UTF8
1826     if (utf8) { GETCHARINC(next, ptr); } else
1827     #endif
1828     next = *ptr++;
1829     }
1830    
1831     else return FALSE;
1832    
1833     /* Skip whitespace and comments in extended mode */
1834    
1835     if ((options & PCRE_EXTENDED) != 0)
1836     {
1837     for (;;)
1838     {
1839     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1840     if (*ptr == '#')
1841     {
1842     while (*(++ptr) != 0)
1843     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1844     }
1845     else break;
1846     }
1847     }
1848    
1849     /* If the next thing is itself optional, we have to give up. */
1850    
1851     if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1852     return FALSE;
1853    
1854     /* Now compare the next item with the previous opcode. If the previous is a
1855     positive single character match, "item" either contains the character or, if
1856     "item" is greater than 127 in utf8 mode, the character's bytes are in
1857     utf8_char. */
1858    
1859    
1860     /* Handle cases when the next item is a character. */
1861    
1862     if (next >= 0) switch(op_code)
1863     {
1864     case OP_CHAR:
1865     #ifdef SUPPORT_UTF8
1866     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1867     #endif
1868     return item != next;
1869    
1870     /* For CHARNC (caseless character) we must check the other case. If we have
1871     Unicode property support, we can use it to test the other case of
1872     high-valued characters. */
1873    
1874     case OP_CHARNC:
1875     #ifdef SUPPORT_UTF8
1876     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1877     #endif
1878     if (item == next) return FALSE;
1879     #ifdef SUPPORT_UTF8
1880     if (utf8)
1881     {
1882     unsigned int othercase;
1883     if (next < 128) othercase = cd->fcc[next]; else
1884     #ifdef SUPPORT_UCP
1885     othercase = _pcre_ucp_othercase((unsigned int)next);
1886     #else
1887     othercase = NOTACHAR;
1888     #endif
1889     return (unsigned int)item != othercase;
1890     }
1891     else
1892     #endif /* SUPPORT_UTF8 */
1893     return (item != cd->fcc[next]); /* Non-UTF-8 mode */
1894    
1895     /* For OP_NOT, "item" must be a single-byte character. */
1896    
1897     case OP_NOT:
1898     if (next < 0) return FALSE; /* Not a character */
1899     if (item == next) return TRUE;
1900     if ((options & PCRE_CASELESS) == 0) return FALSE;
1901     #ifdef SUPPORT_UTF8
1902     if (utf8)
1903     {
1904     unsigned int othercase;
1905     if (next < 128) othercase = cd->fcc[next]; else
1906     #ifdef SUPPORT_UCP
1907     othercase = _pcre_ucp_othercase(next);
1908     #else
1909     othercase = NOTACHAR;
1910     #endif
1911     return (unsigned int)item == othercase;
1912     }
1913     else
1914     #endif /* SUPPORT_UTF8 */
1915     return (item == cd->fcc[next]); /* Non-UTF-8 mode */
1916    
1917     case OP_DIGIT:
1918     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1919    
1920     case OP_NOT_DIGIT:
1921     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1922    
1923     case OP_WHITESPACE:
1924     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1925    
1926     case OP_NOT_WHITESPACE:
1927     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1928    
1929     case OP_WORDCHAR:
1930     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1931    
1932     case OP_NOT_WORDCHAR:
1933     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1934    
1935     default:
1936     return FALSE;
1937     }
1938    
1939    
1940     /* Handle the case when the next item is \d, \s, etc. */
1941    
1942     switch(op_code)
1943     {
1944     case OP_CHAR:
1945     case OP_CHARNC:
1946     #ifdef SUPPORT_UTF8
1947     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1948     #endif
1949     switch(-next)
1950     {
1951     case ESC_d:
1952     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
1953    
1954     case ESC_D:
1955     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
1956    
1957     case ESC_s:
1958     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
1959    
1960     case ESC_S:
1961     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
1962    
1963     case ESC_w:
1964     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
1965    
1966     case ESC_W:
1967     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
1968    
1969     default:
1970     return FALSE;
1971     }
1972    
1973     case OP_DIGIT:
1974     return next == -ESC_D || next == -ESC_s || next == -ESC_W;
1975    
1976     case OP_NOT_DIGIT:
1977     return next == -ESC_d;
1978    
1979     case OP_WHITESPACE:
1980     return next == -ESC_S || next == -ESC_d || next == -ESC_w;
1981    
1982     case OP_NOT_WHITESPACE:
1983     return next == -ESC_s;
1984    
1985     case OP_WORDCHAR:
1986     return next == -ESC_W || next == -ESC_s;
1987    
1988     case OP_NOT_WORDCHAR:
1989     return next == -ESC_w || next == -ESC_d;
1990    
1991     default:
1992     return FALSE;
1993     }
1994    
1995     /* Control does not reach here */
1996     }
1997    
1998    
1999    
2000     /*************************************************
2001 nigel 77 * Compile one branch *
2002     *************************************************/
2003    
2004 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
2005 nigel 77 changed during the branch, the pointer is used to change the external options
2006 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2007     to find out the amount of memory needed, as well as during the real compile
2008     phase. The value of lengthptr distinguishes the two phases.
2009 nigel 77
2010     Arguments:
2011     optionsptr pointer to the option bits
2012     codeptr points to the pointer to the current code point
2013     ptrptr points to the current pattern pointer
2014     errorcodeptr points to error code variable
2015     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2016     reqbyteptr set to the last literal character required, else < 0
2017     bcptr points to current branch chain
2018     cd contains pointers to tables etc.
2019 nigel 93 lengthptr NULL during the real compile phase
2020     points to length accumulator during pre-compile phase
2021 nigel 77
2022     Returns: TRUE on success
2023     FALSE, with *errorcodeptr set non-zero on error
2024     */
2025    
2026     static BOOL
2027 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2028     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2029     compile_data *cd, int *lengthptr)
2030 nigel 77 {
2031     int repeat_type, op_type;
2032     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2033     int bravalue = 0;
2034     int greedy_default, greedy_non_default;
2035     int firstbyte, reqbyte;
2036     int zeroreqbyte, zerofirstbyte;
2037     int req_caseopt, reqvary, tempreqvary;
2038     int options = *optionsptr;
2039     int after_manual_callout = 0;
2040 nigel 93 int length_prevgroup = 0;
2041 nigel 77 register int c;
2042     register uschar *code = *codeptr;
2043 nigel 93 uschar *last_code = code;
2044     uschar *orig_code = code;
2045 nigel 77 uschar *tempcode;
2046     BOOL inescq = FALSE;
2047     BOOL groupsetfirstbyte = FALSE;
2048     const uschar *ptr = *ptrptr;
2049     const uschar *tempptr;
2050     uschar *previous = NULL;
2051     uschar *previous_callout = NULL;
2052 nigel 93 uschar *save_hwm = NULL;
2053 nigel 77 uschar classbits[32];
2054    
2055     #ifdef SUPPORT_UTF8
2056     BOOL class_utf8;
2057     BOOL utf8 = (options & PCRE_UTF8) != 0;
2058     uschar *class_utf8data;
2059     uschar utf8_char[6];
2060     #else
2061     BOOL utf8 = FALSE;
2062 nigel 93 uschar *utf8_char = NULL;
2063 nigel 77 #endif
2064    
2065 nigel 93 #ifdef DEBUG
2066     if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2067     #endif
2068    
2069 nigel 77 /* Set up the default and non-default settings for greediness */
2070    
2071     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2072     greedy_non_default = greedy_default ^ 1;
2073    
2074     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2075     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2076     matches a non-fixed char first char; reqbyte just remains unset if we never
2077     find one.
2078    
2079     When we hit a repeat whose minimum is zero, we may have to adjust these values
2080     to take the zero repeat into account. This is implemented by setting them to
2081     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2082     item types that can be repeated set these backoff variables appropriately. */
2083    
2084     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2085    
2086     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2087     according to the current setting of the caseless flag. REQ_CASELESS is a bit
2088     value > 255. It is added into the firstbyte or reqbyte variables to record the
2089     case status of the value. This is used only for ASCII characters. */
2090    
2091     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2092    
2093     /* Switch on next character until the end of the branch */
2094    
2095     for (;; ptr++)
2096     {
2097     BOOL negate_class;
2098     BOOL possessive_quantifier;
2099     BOOL is_quantifier;
2100 nigel 93 BOOL is_recurse;
2101 nigel 77 int class_charcount;
2102     int class_lastchar;
2103     int newoptions;
2104     int recno;
2105 ph10 167 int refsign;
2106 nigel 77 int skipbytes;
2107     int subreqbyte;
2108     int subfirstbyte;
2109 nigel 93 int terminator;
2110 nigel 77 int mclength;
2111     uschar mcbuffer[8];
2112    
2113 nigel 93 /* Get next byte in the pattern */
2114 nigel 77
2115     c = *ptr;
2116    
2117 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
2118     previous cycle of this loop. */
2119    
2120     if (lengthptr != NULL)
2121     {
2122     #ifdef DEBUG
2123     if (code > cd->hwm) cd->hwm = code; /* High water info */
2124     #endif
2125     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2126     {
2127     *errorcodeptr = ERR52;
2128     goto FAILED;
2129     }
2130    
2131     /* There is at least one situation where code goes backwards: this is the
2132     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2133     the class is simply eliminated. However, it is created first, so we have to
2134     allow memory for it. Therefore, don't ever reduce the length at this point.
2135     */
2136    
2137     if (code < last_code) code = last_code;
2138     *lengthptr += code - last_code;
2139     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2140    
2141     /* If "previous" is set and it is not at the start of the work space, move
2142     it back to there, in order to avoid filling up the work space. Otherwise,
2143     if "previous" is NULL, reset the current code pointer to the start. */
2144    
2145     if (previous != NULL)
2146     {
2147     if (previous > orig_code)
2148     {
2149     memmove(orig_code, previous, code - previous);
2150     code -= previous - orig_code;
2151     previous = orig_code;
2152     }
2153     }
2154     else code = orig_code;
2155    
2156     /* Remember where this code item starts so we can pick up the length
2157     next time round. */
2158    
2159     last_code = code;
2160     }
2161    
2162     /* In the real compile phase, just check the workspace used by the forward
2163     reference list. */
2164    
2165     else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2166     {
2167     *errorcodeptr = ERR52;
2168     goto FAILED;
2169     }
2170    
2171 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
2172    
2173     if (inescq && c != 0)
2174     {
2175     if (c == '\\' && ptr[1] == 'E')
2176     {
2177     inescq = FALSE;
2178     ptr++;
2179     continue;
2180     }
2181     else
2182     {
2183     if (previous_callout != NULL)
2184     {
2185 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2186     complete_callout(previous_callout, ptr, cd);
2187 nigel 77 previous_callout = NULL;
2188     }
2189     if ((options & PCRE_AUTO_CALLOUT) != 0)
2190     {
2191     previous_callout = code;
2192     code = auto_callout(code, ptr, cd);
2193     }
2194     goto NORMAL_CHAR;
2195     }
2196     }
2197    
2198     /* Fill in length of a previous callout, except when the next thing is
2199     a quantifier. */
2200    
2201     is_quantifier = c == '*' || c == '+' || c == '?' ||
2202     (c == '{' && is_counted_repeat(ptr+1));
2203    
2204     if (!is_quantifier && previous_callout != NULL &&
2205     after_manual_callout-- <= 0)
2206     {
2207 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2208     complete_callout(previous_callout, ptr, cd);
2209 nigel 77 previous_callout = NULL;
2210     }
2211    
2212     /* In extended mode, skip white space and comments */
2213    
2214     if ((options & PCRE_EXTENDED) != 0)
2215     {
2216     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2217     if (c == '#')
2218     {
2219 nigel 93 while (*(++ptr) != 0)
2220 nigel 91 {
2221 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2222 nigel 91 }
2223 nigel 93 if (*ptr != 0) continue;
2224    
2225 nigel 91 /* Else fall through to handle end of string */
2226     c = 0;
2227 nigel 77 }
2228     }
2229    
2230     /* No auto callout for quantifiers. */
2231    
2232     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2233     {
2234     previous_callout = code;
2235     code = auto_callout(code, ptr, cd);
2236     }
2237    
2238     switch(c)
2239     {
2240 nigel 93 /* ===================================================================*/
2241     case 0: /* The branch terminates at string end */
2242     case '|': /* or | or ) */
2243 nigel 77 case ')':
2244     *firstbyteptr = firstbyte;
2245     *reqbyteptr = reqbyte;
2246     *codeptr = code;
2247     *ptrptr = ptr;
2248 nigel 93 if (lengthptr != NULL)
2249     {
2250     *lengthptr += code - last_code; /* To include callout length */
2251     DPRINTF((">> end branch\n"));
2252     }
2253 nigel 77 return TRUE;
2254    
2255 nigel 93
2256     /* ===================================================================*/
2257 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
2258     the setting of any following char as a first character. */
2259    
2260     case '^':
2261     if ((options & PCRE_MULTILINE) != 0)
2262     {
2263     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2264     }
2265     previous = NULL;
2266     *code++ = OP_CIRC;
2267     break;
2268    
2269     case '$':
2270     previous = NULL;
2271     *code++ = OP_DOLL;
2272     break;
2273    
2274     /* There can never be a first char if '.' is first, whatever happens about
2275     repeats. The value of reqbyte doesn't change either. */
2276    
2277     case '.':
2278     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2279     zerofirstbyte = firstbyte;
2280     zeroreqbyte = reqbyte;
2281     previous = code;
2282     *code++ = OP_ANY;
2283     break;
2284    
2285 nigel 93
2286     /* ===================================================================*/
2287 nigel 87 /* Character classes. If the included characters are all < 256, we build a
2288     32-byte bitmap of the permitted characters, except in the special case
2289     where there is only one such character. For negated classes, we build the
2290     map as usual, then invert it at the end. However, we use a different opcode
2291     so that data characters > 255 can be handled correctly.
2292 nigel 77
2293     If the class contains characters outside the 0-255 range, a different
2294     opcode is compiled. It may optionally have a bit map for characters < 256,
2295     but those above are explicitly listed afterwards. A flag byte tells
2296     whether the bitmap is present, and whether this is a negated class or not.
2297     */
2298    
2299     case '[':
2300     previous = code;
2301    
2302     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2303     they are encountered at the top level, so we'll do that too. */
2304    
2305     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2306     check_posix_syntax(ptr, &tempptr, cd))
2307     {
2308     *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2309     goto FAILED;
2310     }
2311    
2312     /* If the first character is '^', set the negation flag and skip it. */
2313    
2314     if ((c = *(++ptr)) == '^')
2315     {
2316     negate_class = TRUE;
2317     c = *(++ptr);
2318     }
2319     else
2320     {
2321     negate_class = FALSE;
2322     }
2323    
2324     /* Keep a count of chars with values < 256 so that we can optimize the case
2325 nigel 93 of just a single character (as long as it's < 256). However, for higher
2326     valued UTF-8 characters, we don't yet do any optimization. */
2327 nigel 77
2328     class_charcount = 0;
2329     class_lastchar = -1;
2330    
2331 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
2332     temporary bit of memory, in case the class contains only 1 character (less
2333     than 256), because in that case the compiled code doesn't use the bit map.
2334     */
2335    
2336     memset(classbits, 0, 32 * sizeof(uschar));
2337    
2338 nigel 77 #ifdef SUPPORT_UTF8
2339     class_utf8 = FALSE; /* No chars >= 256 */
2340 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2341 nigel 77 #endif
2342    
2343     /* Process characters until ] is reached. By writing this as a "do" it
2344 nigel 93 means that an initial ] is taken as a data character. At the start of the
2345     loop, c contains the first byte of the character. */
2346 nigel 77
2347 nigel 93 if (c != 0) do
2348 nigel 77 {
2349 nigel 93 const uschar *oldptr;
2350    
2351 nigel 77 #ifdef SUPPORT_UTF8
2352     if (utf8 && c > 127)
2353     { /* Braces are required because the */
2354     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2355     }
2356     #endif
2357    
2358     /* Inside \Q...\E everything is literal except \E */
2359    
2360     if (inescq)
2361     {
2362 nigel 93 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2363 nigel 77 {
2364 nigel 93 inescq = FALSE; /* Reset literal state */
2365     ptr++; /* Skip the 'E' */
2366     continue; /* Carry on with next */
2367 nigel 77 }
2368 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
2369 nigel 77 }
2370    
2371     /* Handle POSIX class names. Perl allows a negation extension of the
2372     form [:^name:]. A square bracket that doesn't match the syntax is
2373     treated as a literal. We also recognize the POSIX constructions
2374     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2375     5.6 and 5.8 do. */
2376    
2377     if (c == '[' &&
2378     (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2379     check_posix_syntax(ptr, &tempptr, cd))
2380     {
2381     BOOL local_negate = FALSE;
2382 nigel 87 int posix_class, taboffset, tabopt;
2383 nigel 77 register const uschar *cbits = cd->cbits;
2384 nigel 87 uschar pbits[32];
2385 nigel 77
2386     if (ptr[1] != ':')
2387     {
2388     *errorcodeptr = ERR31;
2389     goto FAILED;
2390     }
2391    
2392     ptr += 2;
2393     if (*ptr == '^')
2394     {
2395     local_negate = TRUE;
2396     ptr++;
2397     }
2398    
2399     posix_class = check_posix_name(ptr, tempptr - ptr);
2400     if (posix_class < 0)
2401     {
2402     *errorcodeptr = ERR30;
2403     goto FAILED;
2404     }
2405    
2406     /* If matching is caseless, upper and lower are converted to
2407     alpha. This relies on the fact that the class table starts with
2408     alpha, lower, upper as the first 3 entries. */
2409    
2410     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2411     posix_class = 0;
2412    
2413 nigel 87 /* We build the bit map for the POSIX class in a chunk of local store
2414     because we may be adding and subtracting from it, and we don't want to
2415     subtract bits that may be in the main map already. At the end we or the
2416     result into the bit map that is being built. */
2417 nigel 77
2418     posix_class *= 3;
2419 nigel 87
2420     /* Copy in the first table (always present) */
2421    
2422     memcpy(pbits, cbits + posix_class_maps[posix_class],
2423     32 * sizeof(uschar));
2424    
2425     /* If there is a second table, add or remove it as required. */
2426    
2427     taboffset = posix_class_maps[posix_class + 1];
2428     tabopt = posix_class_maps[posix_class + 2];
2429    
2430     if (taboffset >= 0)
2431 nigel 77 {
2432 nigel 87 if (tabopt >= 0)
2433     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2434 nigel 77 else
2435 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2436 nigel 77 }
2437    
2438 nigel 87 /* Now see if we need to remove any special characters. An option
2439     value of 1 removes vertical space and 2 removes underscore. */
2440    
2441     if (tabopt < 0) tabopt = -tabopt;
2442     if (tabopt == 1) pbits[1] &= ~0x3c;
2443     else if (tabopt == 2) pbits[11] &= 0x7f;
2444    
2445     /* Add the POSIX table or its complement into the main table that is
2446     being built and we are done. */
2447    
2448     if (local_negate)
2449     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2450     else
2451     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2452    
2453 nigel 77 ptr = tempptr + 1;
2454     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2455     continue; /* End of POSIX syntax handling */
2456     }
2457    
2458     /* Backslash may introduce a single character, or it may introduce one
2459 nigel 93 of the specials, which just set a flag. The sequence \b is a special
2460     case. Inside a class (and only there) it is treated as backspace.
2461     Elsewhere it marks a word boundary. Other escapes have preset maps ready
2462     to or into the one we are building. We assume they have more than one
2463 nigel 77 character in them, so set class_charcount bigger than one. */
2464    
2465     if (c == '\\')
2466     {
2467 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2468     if (*errorcodeptr != 0) goto FAILED;
2469 nigel 77
2470     if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2471     else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2472 nigel 93 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2473 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
2474     {
2475     if (ptr[1] == '\\' && ptr[2] == 'E')
2476     {
2477     ptr += 2; /* avoid empty string */
2478     }
2479     else inescq = TRUE;
2480     continue;
2481     }
2482    
2483     if (c < 0)
2484     {
2485     register const uschar *cbits = cd->cbits;
2486     class_charcount += 2; /* Greater than 1 is what matters */
2487 nigel 93
2488     /* Save time by not doing this in the pre-compile phase. */
2489    
2490     if (lengthptr == NULL) switch (-c)
2491 nigel 77 {
2492     case ESC_d:
2493     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2494     continue;
2495    
2496     case ESC_D:
2497     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2498     continue;
2499    
2500     case ESC_w:
2501     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2502     continue;
2503    
2504     case ESC_W:
2505     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2506     continue;
2507    
2508     case ESC_s:
2509     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2510     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2511     continue;
2512    
2513     case ESC_S:
2514     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2515     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2516     continue;
2517    
2518 nigel 93 case ESC_E: /* Perl ignores an orphan \E */
2519     continue;
2520    
2521     default: /* Not recognized; fall through */
2522     break; /* Need "default" setting to stop compiler warning. */
2523     }
2524    
2525     /* In the pre-compile phase, just do the recognition. */
2526    
2527     else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2528     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2529    
2530     /* We need to deal with \P and \p in both phases. */
2531    
2532 nigel 77 #ifdef SUPPORT_UCP
2533 nigel 93 if (-c == ESC_p || -c == ESC_P)
2534     {
2535     BOOL negated;
2536     int pdata;
2537     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2538     if (ptype < 0) goto FAILED;
2539     class_utf8 = TRUE;
2540     *class_utf8data++ = ((-c == ESC_p) != negated)?
2541     XCL_PROP : XCL_NOTPROP;
2542     *class_utf8data++ = ptype;
2543     *class_utf8data++ = pdata;
2544     class_charcount -= 2; /* Not a < 256 character */
2545 nigel 77 continue;
2546 nigel 93 }
2547 nigel 77 #endif
2548 nigel 93 /* Unrecognized escapes are faulted if PCRE is running in its
2549     strict mode. By default, for compatibility with Perl, they are
2550     treated as literals. */
2551 nigel 77
2552 nigel 93 if ((options & PCRE_EXTRA) != 0)
2553     {
2554     *errorcodeptr = ERR7;
2555     goto FAILED;
2556     }
2557 nigel 77
2558 nigel 93 class_charcount -= 2; /* Undo the default count from above */
2559     c = *ptr; /* Get the final character and fall through */
2560 nigel 77 }
2561    
2562     /* Fall through if we have a single character (c >= 0). This may be
2563 nigel 93 greater than 256 in UTF-8 mode. */
2564 nigel 77
2565     } /* End of backslash handling */
2566    
2567     /* A single character may be followed by '-' to form a range. However,
2568     Perl does not permit ']' to be the end of the range. A '-' character
2569 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
2570     entirely. The code for handling \Q and \E is messy. */
2571 nigel 77
2572 nigel 93 CHECK_RANGE:
2573     while (ptr[1] == '\\' && ptr[2] == 'E')
2574 nigel 77 {
2575 nigel 93 inescq = FALSE;
2576     ptr += 2;
2577     }
2578    
2579     oldptr = ptr;
2580    
2581     if (!inescq && ptr[1] == '-')
2582     {
2583 nigel 77 int d;
2584     ptr += 2;
2585 nigel 93 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2586 nigel 77
2587 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
2588     mode. */
2589    
2590     while (*ptr == '\\' && ptr[1] == 'Q')
2591     {
2592     ptr += 2;
2593     if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2594     inescq = TRUE;
2595     break;
2596     }
2597    
2598     if (*ptr == 0 || (!inescq && *ptr == ']'))
2599     {
2600     ptr = oldptr;
2601     goto LONE_SINGLE_CHARACTER;
2602     }
2603    
2604 nigel 77 #ifdef SUPPORT_UTF8
2605     if (utf8)
2606     { /* Braces are required because the */
2607     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2608     }
2609     else
2610     #endif
2611     d = *ptr; /* Not UTF-8 mode */
2612    
2613     /* The second part of a range can be a single-character escape, but
2614     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2615     in such circumstances. */
2616    
2617 nigel 93 if (!inescq && d == '\\')
2618 nigel 77 {
2619 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2620     if (*errorcodeptr != 0) goto FAILED;
2621 nigel 77
2622 nigel 93 /* \b is backspace; \X is literal X; \R is literal R; any other
2623     special means the '-' was literal */
2624 nigel 77
2625     if (d < 0)
2626     {
2627     if (d == -ESC_b) d = '\b';
2628 nigel 93 else if (d == -ESC_X) d = 'X';
2629     else if (d == -ESC_R) d = 'R'; else
2630 nigel 77 {
2631 nigel 93 ptr = oldptr;
2632 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2633     }
2634     }
2635     }
2636    
2637 nigel 93 /* Check that the two values are in the correct order. Optimize
2638     one-character ranges */
2639 nigel 77
2640 nigel 93 if (d < c)
2641     {
2642     *errorcodeptr = ERR8;
2643     goto FAILED;
2644     }
2645    
2646 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2647    
2648     /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2649     matching, we have to use an XCLASS with extra data items. Caseless
2650     matching for characters > 127 is available only if UCP support is
2651     available. */
2652    
2653     #ifdef SUPPORT_UTF8
2654     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2655     {
2656     class_utf8 = TRUE;
2657    
2658     /* With UCP support, we can find the other case equivalents of
2659     the relevant characters. There may be several ranges. Optimize how
2660     they fit with the basic range. */
2661    
2662     #ifdef SUPPORT_UCP
2663     if ((options & PCRE_CASELESS) != 0)
2664     {
2665 nigel 93 unsigned int occ, ocd;
2666     unsigned int cc = c;
2667     unsigned int origd = d;
2668 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
2669     {
2670     if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
2671    
2672     if (occ < c && ocd >= c - 1) /* Extend the basic range */
2673     { /* if there is overlap, */
2674     c = occ; /* noting that if occ < c */
2675     continue; /* we can't have ocd > d */
2676     } /* because a subrange is */
2677     if (ocd > d && occ <= d + 1) /* always shorter than */
2678     { /* the basic range. */
2679     d = ocd;
2680     continue;
2681     }
2682    
2683     if (occ == ocd)
2684     {
2685     *class_utf8data++ = XCL_SINGLE;
2686     }
2687     else
2688     {
2689     *class_utf8data++ = XCL_RANGE;
2690     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2691     }
2692     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2693     }
2694     }
2695     #endif /* SUPPORT_UCP */
2696    
2697     /* Now record the original range, possibly modified for UCP caseless
2698     overlapping ranges. */
2699    
2700     *class_utf8data++ = XCL_RANGE;
2701     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2702     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2703    
2704     /* With UCP support, we are done. Without UCP support, there is no
2705     caseless matching for UTF-8 characters > 127; we can use the bit map
2706     for the smaller ones. */
2707    
2708     #ifdef SUPPORT_UCP
2709     continue; /* With next character in the class */
2710     #else
2711     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2712    
2713     /* Adjust upper limit and fall through to set up the map */
2714    
2715     d = 127;
2716    
2717     #endif /* SUPPORT_UCP */
2718     }
2719     #endif /* SUPPORT_UTF8 */
2720    
2721     /* We use the bit map for all cases when not in UTF-8 mode; else
2722     ranges that lie entirely within 0-127 when there is UCP support; else
2723     for partial ranges without UCP support. */
2724    
2725 nigel 93 class_charcount += d - c + 1;
2726     class_lastchar = d;
2727    
2728     /* We can save a bit of time by skipping this in the pre-compile. */
2729    
2730     if (lengthptr == NULL) for (; c <= d; c++)
2731 nigel 77 {
2732     classbits[c/8] |= (1 << (c&7));
2733     if ((options & PCRE_CASELESS) != 0)
2734     {
2735     int uc = cd->fcc[c]; /* flip case */
2736     classbits[uc/8] |= (1 << (uc&7));
2737     }
2738     }
2739    
2740     continue; /* Go get the next char in the class */
2741     }
2742    
2743     /* Handle a lone single character - we can get here for a normal
2744     non-escape char, or after \ that introduces a single character or for an
2745     apparent range that isn't. */
2746    
2747     LONE_SINGLE_CHARACTER:
2748    
2749     /* Handle a character that cannot go in the bit map */
2750    
2751     #ifdef SUPPORT_UTF8
2752     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2753     {
2754     class_utf8 = TRUE;
2755     *class_utf8data++ = XCL_SINGLE;
2756     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2757    
2758     #ifdef SUPPORT_UCP
2759     if ((options & PCRE_CASELESS) != 0)
2760     {
2761 nigel 93 unsigned int othercase;
2762     if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
2763 nigel 77 {
2764     *class_utf8data++ = XCL_SINGLE;
2765     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
2766     }
2767     }
2768     #endif /* SUPPORT_UCP */
2769    
2770     }
2771     else
2772     #endif /* SUPPORT_UTF8 */
2773    
2774     /* Handle a single-byte character */
2775     {
2776     classbits[c/8] |= (1 << (c&7));
2777     if ((options & PCRE_CASELESS) != 0)
2778     {
2779     c = cd->fcc[c]; /* flip case */
2780     classbits[c/8] |= (1 << (c&7));
2781     }
2782     class_charcount++;
2783     class_lastchar = c;
2784     }
2785     }
2786    
2787 nigel 93 /* Loop until ']' reached. This "while" is the end of the "do" above. */
2788 nigel 77
2789 nigel 93 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
2790 nigel 77
2791 nigel 93 if (c == 0) /* Missing terminating ']' */
2792     {
2793     *errorcodeptr = ERR6;
2794     goto FAILED;
2795     }
2796    
2797 nigel 77 /* If class_charcount is 1, we saw precisely one character whose value is
2798     less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2799     can optimize the negative case only if there were no characters >= 128
2800     because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2801     single-bytes only. This is an historical hangover. Maybe one day we can
2802     tidy these opcodes to handle multi-byte characters.
2803    
2804     The optimization throws away the bit map. We turn the item into a
2805     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2806     that OP_NOT does not support multibyte characters. In the positive case, it
2807     can cause firstbyte to be set. Otherwise, there can be no first char if
2808     this item is first, whatever repeat count may follow. In the case of
2809     reqbyte, save the previous value for reinstating. */
2810    
2811     #ifdef SUPPORT_UTF8
2812     if (class_charcount == 1 &&
2813     (!utf8 ||
2814     (!class_utf8 && (!negate_class || class_lastchar < 128))))
2815    
2816     #else
2817     if (class_charcount == 1)
2818     #endif
2819     {
2820     zeroreqbyte = reqbyte;
2821    
2822     /* The OP_NOT opcode works on one-byte characters only. */
2823    
2824     if (negate_class)
2825     {
2826     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2827     zerofirstbyte = firstbyte;
2828     *code++ = OP_NOT;
2829     *code++ = class_lastchar;
2830     break;
2831     }
2832    
2833     /* For a single, positive character, get the value into mcbuffer, and
2834     then we can handle this with the normal one-character code. */
2835    
2836     #ifdef SUPPORT_UTF8
2837     if (utf8 && class_lastchar > 127)
2838     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
2839     else
2840     #endif
2841     {
2842     mcbuffer[0] = class_lastchar;
2843     mclength = 1;
2844     }
2845     goto ONE_CHAR;
2846     } /* End of 1-char optimization */
2847    
2848     /* The general case - not the one-char optimization. If this is the first
2849     thing in the branch, there can be no first char setting, whatever the
2850     repeat count. Any reqbyte setting must remain unchanged after any kind of
2851     repeat. */
2852    
2853     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2854     zerofirstbyte = firstbyte;
2855     zeroreqbyte = reqbyte;
2856    
2857     /* If there are characters with values > 255, we have to compile an
2858     extended class, with its own opcode. If there are no characters < 256,
2859 nigel 93 we can omit the bitmap in the actual compiled code. */
2860 nigel 77
2861     #ifdef SUPPORT_UTF8
2862     if (class_utf8)
2863     {
2864     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2865     *code++ = OP_XCLASS;
2866     code += LINK_SIZE;
2867     *code = negate_class? XCL_NOT : 0;
2868    
2869 nigel 93 /* If the map is required, move up the extra data to make room for it;
2870     otherwise just move the code pointer to the end of the extra data. */
2871 nigel 77
2872     if (class_charcount > 0)
2873     {
2874     *code++ |= XCL_MAP;
2875 nigel 93 memmove(code + 32, code, class_utf8data - code);
2876 nigel 77 memcpy(code, classbits, 32);
2877 nigel 93 code = class_utf8data + 32;
2878 nigel 77 }
2879 nigel 93 else code = class_utf8data;
2880 nigel 77
2881     /* Now fill in the complete length of the item */
2882    
2883     PUT(previous, 1, code - previous);
2884     break; /* End of class handling */
2885     }
2886     #endif
2887    
2888     /* If there are no characters > 255, negate the 32-byte map if necessary,
2889     and copy it into the code vector. If this is the first thing in the branch,
2890     there can be no first char setting, whatever the repeat count. Any reqbyte
2891     setting must remain unchanged after any kind of repeat. */
2892    
2893     if (negate_class)
2894     {
2895     *code++ = OP_NCLASS;
2896 nigel 93 if (lengthptr == NULL) /* Save time in the pre-compile phase */
2897     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2898 nigel 77 }
2899     else
2900     {
2901     *code++ = OP_CLASS;
2902     memcpy(code, classbits, 32);
2903     }
2904     code += 32;
2905     break;
2906    
2907 nigel 93
2908     /* ===================================================================*/
2909 nigel 77 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2910     has been tested above. */
2911    
2912     case '{':
2913     if (!is_quantifier) goto NORMAL_CHAR;
2914     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
2915     if (*errorcodeptr != 0) goto FAILED;
2916     goto REPEAT;
2917    
2918     case '*':
2919     repeat_min = 0;
2920     repeat_max = -1;
2921     goto REPEAT;
2922    
2923     case '+':
2924     repeat_min = 1;
2925     repeat_max = -1;
2926     goto REPEAT;
2927    
2928     case '?':
2929     repeat_min = 0;
2930     repeat_max = 1;
2931    
2932     REPEAT:
2933     if (previous == NULL)
2934     {
2935     *errorcodeptr = ERR9;
2936     goto FAILED;
2937     }
2938    
2939     if (repeat_min == 0)
2940     {
2941     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2942     reqbyte = zeroreqbyte; /* Ditto */
2943     }
2944    
2945     /* Remember whether this is a variable length repeat */
2946    
2947     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2948    
2949     op_type = 0; /* Default single-char op codes */
2950     possessive_quantifier = FALSE; /* Default not possessive quantifier */
2951    
2952     /* Save start of previous item, in case we have to move it up to make space
2953     for an inserted OP_ONCE for the additional '+' extension. */
2954    
2955     tempcode = previous;
2956    
2957     /* If the next character is '+', we have a possessive quantifier. This
2958     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2959     If the next character is '?' this is a minimizing repeat, by default,
2960     but if PCRE_UNGREEDY is set, it works the other way round. We change the
2961     repeat type to the non-default. */
2962    
2963     if (ptr[1] == '+')
2964     {
2965     repeat_type = 0; /* Force greedy */
2966     possessive_quantifier = TRUE;
2967     ptr++;
2968     }
2969     else if (ptr[1] == '?')
2970     {
2971     repeat_type = greedy_non_default;
2972     ptr++;
2973     }
2974     else repeat_type = greedy_default;
2975    
2976     /* If previous was a character match, abolish the item and generate a
2977     repeat item instead. If a char item has a minimum of more than one, ensure
2978     that it is set in reqbyte - it might not be if a sequence such as x{3} is
2979     the first thing in a branch because the x will have gone into firstbyte
2980     instead. */
2981    
2982     if (*previous == OP_CHAR || *previous == OP_CHARNC)
2983     {
2984     /* Deal with UTF-8 characters that take up more than one byte. It's
2985     easier to write this out separately than try to macrify it. Use c to
2986     hold the length of the character in bytes, plus 0x80 to flag that it's a
2987     length rather than a small character. */
2988    
2989     #ifdef SUPPORT_UTF8
2990     if (utf8 && (code[-1] & 0x80) != 0)
2991     {
2992     uschar *lastchar = code - 1;
2993     while((*lastchar & 0xc0) == 0x80) lastchar--;
2994     c = code - lastchar; /* Length of UTF-8 character */
2995     memcpy(utf8_char, lastchar, c); /* Save the char */
2996     c |= 0x80; /* Flag c as a length */
2997     }
2998     else
2999     #endif
3000    
3001     /* Handle the case of a single byte - either with no UTF8 support, or
3002     with UTF-8 disabled, or for a UTF-8 character < 128. */
3003    
3004     {
3005     c = code[-1];
3006     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3007     }
3008    
3009 nigel 93 /* If the repetition is unlimited, it pays to see if the next thing on
3010     the line is something that cannot possibly match this character. If so,
3011     automatically possessifying this item gains some performance in the case
3012     where the match fails. */
3013    
3014     if (!possessive_quantifier &&
3015     repeat_max < 0 &&
3016     check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3017     options, cd))
3018     {
3019     repeat_type = 0; /* Force greedy */
3020     possessive_quantifier = TRUE;
3021     }
3022    
3023 nigel 77 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3024     }
3025    
3026     /* If previous was a single negated character ([^a] or similar), we use
3027     one of the special opcodes, replacing it. The code is shared with single-
3028     character repeats by setting op_type to add a suitable offset into
3029 nigel 93 repeat_type. We can also test for auto-possessification. OP_NOT is
3030     currently used only for single-byte chars. */
3031 nigel 77
3032     else if (*previous == OP_NOT)
3033     {
3034     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3035     c = previous[1];
3036 nigel 93 if (!possessive_quantifier &&
3037     repeat_max < 0 &&
3038     check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3039     {
3040     repeat_type = 0; /* Force greedy */
3041     possessive_quantifier = TRUE;
3042     }
3043 nigel 77 goto OUTPUT_SINGLE_REPEAT;
3044     }
3045    
3046     /* If previous was a character type match (\d or similar), abolish it and
3047     create a suitable repeat item. The code is shared with single-character
3048     repeats by setting op_type to add a suitable offset into repeat_type. Note
3049     that the Unicode property types will be present only when SUPPORT_UCP is
3050     defined, but we don't wrap the little bits of code here because it just
3051     makes it horribly messy. */
3052    
3053     else if (*previous < OP_EODN)
3054     {
3055     uschar *oldcode;
3056 nigel 87 int prop_type, prop_value;
3057 nigel 77 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3058     c = *previous;
3059    
3060 nigel 93 if (!possessive_quantifier &&
3061     repeat_max < 0 &&
3062     check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3063     {
3064     repeat_type = 0; /* Force greedy */
3065     possessive_quantifier = TRUE;
3066     }
3067    
3068 nigel 77 OUTPUT_SINGLE_REPEAT:
3069 nigel 87 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3070     {
3071     prop_type = previous[1];
3072     prop_value = previous[2];
3073     }
3074     else prop_type = prop_value = -1;
3075 nigel 77
3076     oldcode = code;
3077     code = previous; /* Usually overwrite previous item */
3078    
3079     /* If the maximum is zero then the minimum must also be zero; Perl allows
3080     this case, so we do too - by simply omitting the item altogether. */
3081    
3082     if (repeat_max == 0) goto END_REPEAT;
3083    
3084     /* All real repeats make it impossible to handle partial matching (maybe
3085     one day we will be able to remove this restriction). */
3086    
3087     if (repeat_max != 1) cd->nopartial = TRUE;
3088    
3089     /* Combine the op_type with the repeat_type */
3090    
3091     repeat_type += op_type;
3092    
3093     /* A minimum of zero is handled either as the special case * or ?, or as
3094     an UPTO, with the maximum given. */
3095    
3096     if (repeat_min == 0)
3097     {
3098     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3099     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3100     else
3101     {
3102     *code++ = OP_UPTO + repeat_type;
3103     PUT2INC(code, 0, repeat_max);
3104     }
3105     }
3106    
3107     /* A repeat minimum of 1 is optimized into some special cases. If the
3108 nigel 93 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3109 nigel 77 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3110     one less than the maximum. */
3111    
3112     else if (repeat_min == 1)
3113     {
3114     if (repeat_max == -1)
3115     *code++ = OP_PLUS + repeat_type;
3116     else
3117     {
3118     code = oldcode; /* leave previous item in place */
3119     if (repeat_max == 1) goto END_REPEAT;
3120     *code++ = OP_UPTO + repeat_type;
3121     PUT2INC(code, 0, repeat_max - 1);
3122     }
3123     }
3124    
3125     /* The case {n,n} is just an EXACT, while the general case {n,m} is
3126     handled as an EXACT followed by an UPTO. */
3127    
3128     else
3129     {
3130     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3131     PUT2INC(code, 0, repeat_min);
3132    
3133     /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3134     we have to insert the character for the previous code. For a repeated
3135 nigel 87 Unicode property match, there are two extra bytes that define the
3136 nigel 77 required property. In UTF-8 mode, long characters have their length in
3137     c, with the 0x80 bit as a flag. */
3138    
3139     if (repeat_max < 0)
3140     {
3141     #ifdef SUPPORT_UTF8
3142     if (utf8 && c >= 128)
3143     {
3144     memcpy(code, utf8_char, c & 7);
3145     code += c & 7;
3146     }
3147     else
3148     #endif
3149     {
3150     *code++ = c;
3151 nigel 87 if (prop_type >= 0)
3152     {
3153     *code++ = prop_type;
3154     *code++ = prop_value;
3155     }
3156 nigel 77 }
3157     *code++ = OP_STAR + repeat_type;
3158     }
3159    
3160     /* Else insert an UPTO if the max is greater than the min, again
3161 nigel 93 preceded by the character, for the previously inserted code. If the
3162     UPTO is just for 1 instance, we can use QUERY instead. */
3163 nigel 77
3164     else if (repeat_max != repeat_min)
3165     {
3166     #ifdef SUPPORT_UTF8
3167     if (utf8 && c >= 128)
3168     {
3169     memcpy(code, utf8_char, c & 7);
3170     code += c & 7;
3171     }
3172     else
3173     #endif
3174     *code++ = c;
3175 nigel 87 if (prop_type >= 0)
3176     {
3177     *code++ = prop_type;
3178     *code++ = prop_value;
3179     }
3180 nigel 77 repeat_max -= repeat_min;
3181 nigel 93
3182     if (repeat_max == 1)
3183     {
3184     *code++ = OP_QUERY + repeat_type;
3185     }
3186     else
3187     {
3188     *code++ = OP_UPTO + repeat_type;
3189     PUT2INC(code, 0, repeat_max);
3190     }
3191 nigel 77 }
3192     }
3193    
3194     /* The character or character type itself comes last in all cases. */
3195    
3196     #ifdef SUPPORT_UTF8
3197     if (utf8 && c >= 128)
3198     {
3199     memcpy(code, utf8_char, c & 7);
3200     code += c & 7;
3201     }
3202     else
3203     #endif
3204     *code++ = c;
3205    
3206 nigel 87 /* For a repeated Unicode property match, there are two extra bytes that
3207     define the required property. */
3208 nigel 77
3209     #ifdef SUPPORT_UCP
3210 nigel 87 if (prop_type >= 0)
3211     {
3212     *code++ = prop_type;
3213     *code++ = prop_value;
3214     }
3215 nigel 77 #endif
3216     }
3217    
3218     /* If previous was a character class or a back reference, we put the repeat
3219     stuff after it, but just skip the item if the repeat was {0,0}. */
3220    
3221     else if (*previous == OP_CLASS ||
3222     *previous == OP_NCLASS ||
3223     #ifdef SUPPORT_UTF8
3224     *previous == OP_XCLASS ||
3225     #endif
3226     *previous == OP_REF)
3227     {
3228     if (repeat_max == 0)
3229     {
3230     code = previous;
3231     goto END_REPEAT;
3232     }
3233    
3234     /* All real repeats make it impossible to handle partial matching (maybe
3235     one day we will be able to remove this restriction). */
3236    
3237     if (repeat_max != 1) cd->nopartial = TRUE;
3238    
3239     if (repeat_min == 0 && repeat_max == -1)
3240     *code++ = OP_CRSTAR + repeat_type;
3241     else if (repeat_min == 1 && repeat_max == -1)
3242     *code++ = OP_CRPLUS + repeat_type;
3243     else if (repeat_min == 0 && repeat_max == 1)
3244     *code++ = OP_CRQUERY + repeat_type;
3245     else
3246     {
3247     *code++ = OP_CRRANGE + repeat_type;
3248     PUT2INC(code, 0, repeat_min);
3249     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3250     PUT2INC(code, 0, repeat_max);
3251     }
3252     }
3253    
3254     /* If previous was a bracket group, we may have to replicate it in certain
3255     cases. */
3256    
3257 nigel 93 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3258     *previous == OP_ONCE || *previous == OP_COND)
3259 nigel 77 {
3260     register int i;
3261     int ketoffset = 0;
3262     int len = code - previous;
3263     uschar *bralink = NULL;
3264    
3265 nigel 93 /* Repeating a DEFINE group is pointless */
3266    
3267     if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3268     {
3269     *errorcodeptr = ERR55;
3270     goto FAILED;
3271     }
3272    
3273     /* This is a paranoid check to stop integer overflow later on */
3274    
3275     if (len > MAX_DUPLENGTH)
3276     {
3277     *errorcodeptr = ERR50;
3278     goto FAILED;
3279     }
3280    
3281 nigel 77 /* If the maximum repeat count is unlimited, find the end of the bracket
3282     by scanning through from the start, and compute the offset back to it
3283     from the current code pointer. There may be an OP_OPT setting following
3284     the final KET, so we can't find the end just by going back from the code
3285     pointer. */
3286    
3287     if (repeat_max == -1)
3288     {
3289     register uschar *ket = previous;
3290     do ket += GET(ket, 1); while (*ket != OP_KET);
3291     ketoffset = code - ket;
3292     }
3293    
3294     /* The case of a zero minimum is special because of the need to stick
3295     OP_BRAZERO in front of it, and because the group appears once in the
3296     data, whereas in other cases it appears the minimum number of times. For
3297     this reason, it is simplest to treat this case separately, as otherwise
3298     the code gets far too messy. There are several special subcases when the
3299     minimum is zero. */
3300    
3301     if (repeat_min == 0)
3302     {
3303     /* If the maximum is also zero, we just omit the group from the output
3304     altogether. */
3305    
3306     if (repeat_max == 0)
3307     {
3308     code = previous;
3309     goto END_REPEAT;
3310     }
3311    
3312     /* If the maximum is 1 or unlimited, we just have to stick in the
3313     BRAZERO and do no more at this point. However, we do need to adjust
3314     any OP_RECURSE calls inside the group that refer to the group itself or
3315 nigel 93 any internal or forward referenced group, because the offset is from
3316     the start of the whole regex. Temporarily terminate the pattern while
3317     doing this. */
3318 nigel 77
3319     if (repeat_max <= 1)
3320     {
3321     *code = OP_END;
3322 nigel 93 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3323 nigel 77 memmove(previous+1, previous, len);
3324     code++;
3325     *previous++ = OP_BRAZERO + repeat_type;
3326     }
3327    
3328     /* If the maximum is greater than 1 and limited, we have to replicate
3329     in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3330     The first one has to be handled carefully because it's the original
3331     copy, which has to be moved up. The remainder can be handled by code
3332     that is common with the non-zero minimum case below. We have to
3333     adjust the value of repeat_max, since one less copy is required. Once
3334     again, we may have to adjust any OP_RECURSE calls inside the group. */
3335    
3336     else
3337     {
3338     int offset;
3339     *code = OP_END;
3340 nigel 93 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3341 nigel 77 memmove(previous + 2 + LINK_SIZE, previous, len);
3342     code += 2 + LINK_SIZE;
3343     *previous++ = OP_BRAZERO + repeat_type;
3344     *previous++ = OP_BRA;
3345    
3346     /* We chain together the bracket offset fields that have to be
3347     filled in later when the ends of the brackets are reached. */
3348    
3349     offset = (bralink == NULL)? 0 : previous - bralink;
3350     bralink = previous;
3351     PUTINC(previous, 0, offset);
3352     }
3353    
3354     repeat_max--;
3355     }
3356    
3357     /* If the minimum is greater than zero, replicate the group as many
3358     times as necessary, and adjust the maximum to the number of subsequent
3359     copies that we need. If we set a first char from the group, and didn't
3360 nigel 93 set a required char, copy the latter from the former. If there are any
3361     forward reference subroutine calls in the group, there will be entries on
3362     the workspace list; replicate these with an appropriate increment. */
3363 nigel 77
3364     else
3365     {
3366     if (repeat_min > 1)
3367     {
3368 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3369     just adjust the length as if we had. */
3370    
3371     if (lengthptr != NULL)
3372     *lengthptr += (repeat_min - 1)*length_prevgroup;
3373    
3374     /* This is compiling for real */
3375    
3376     else
3377 nigel 77 {
3378 nigel 93 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3379     for (i = 1; i < repeat_min; i++)
3380     {
3381     uschar *hc;
3382     uschar *this_hwm = cd->hwm;
3383     memcpy(code, previous, len);
3384     for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3385     {
3386     PUT(cd->hwm, 0, GET(hc, 0) + len);
3387     cd->hwm += LINK_SIZE;
3388     }
3389     save_hwm = this_hwm;
3390     code += len;
3391     }
3392 nigel 77 }
3393     }
3394 nigel 93
3395 nigel 77 if (repeat_max > 0) repeat_max -= repeat_min;
3396     }
3397    
3398     /* This code is common to both the zero and non-zero minimum cases. If
3399     the maximum is limited, it replicates the group in a nested fashion,
3400     remembering the bracket starts on a stack. In the case of a zero minimum,
3401     the first one was set up above. In all cases the repeat_max now specifies
3402 nigel 93 the number of additional copies needed. Again, we must remember to
3403     replicate entries on the forward reference list. */
3404 nigel 77
3405     if (repeat_max >= 0)
3406     {
3407 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3408     just adjust the length as if we had. For each repetition we must add 1
3409     to the length for BRAZERO and for all but the last repetition we must
3410     add 2 + 2*LINK_SIZE to allow for the nesting that occurs. */
3411    
3412     if (lengthptr != NULL && repeat_max > 0)
3413     *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3414     2 - 2*LINK_SIZE; /* Last one doesn't nest */
3415    
3416     /* This is compiling for real */
3417    
3418     else for (i = repeat_max - 1; i >= 0; i--)
3419 nigel 77 {
3420 nigel 93 uschar *hc;
3421     uschar *this_hwm = cd->hwm;
3422    
3423 nigel 77 *code++ = OP_BRAZERO + repeat_type;
3424    
3425     /* All but the final copy start a new nesting, maintaining the
3426     chain of brackets outstanding. */
3427    
3428     if (i != 0)
3429     {
3430     int offset;
3431     *code++ = OP_BRA;
3432     offset = (bralink == NULL)? 0 : code - bralink;
3433     bralink = code;
3434     PUTINC(code, 0, offset);
3435     }
3436    
3437     memcpy(code, previous, len);
3438 nigel 93 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3439     {
3440     PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3441     cd->hwm += LINK_SIZE;
3442     }
3443     save_hwm = this_hwm;
3444 nigel 77 code += len;
3445     }
3446    
3447     /* Now chain through the pending brackets, and fill in their length
3448     fields (which are holding the chain links pro tem). */
3449    
3450     while (bralink != NULL)
3451     {
3452     int oldlinkoffset;
3453     int offset = code - bralink + 1;
3454     uschar *bra = code - offset;
3455     oldlinkoffset = GET(bra, 1);
3456     bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3457     *code++ = OP_KET;
3458     PUTINC(code, 0, offset);
3459     PUT(bra, 1, offset);
3460     }
3461     }
3462    
3463     /* If the maximum is unlimited, set a repeater in the final copy. We
3464     can't just offset backwards from the current code point, because we
3465     don't know if there's been an options resetting after the ket. The
3466 nigel 93 correct offset was computed above.
3467 nigel 77
3468 nigel 93 Then, when we are doing the actual compile phase, check to see whether
3469     this group is a non-atomic one that could match an empty string. If so,
3470     convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3471     that runtime checking can be done. [This check is also applied to
3472     atomic groups at runtime, but in a different way.] */
3473    
3474     else
3475     {
3476     uschar *ketcode = code - ketoffset;
3477     uschar *bracode = ketcode - GET(ketcode, 1);
3478     *ketcode = OP_KETRMAX + repeat_type;
3479     if (lengthptr == NULL && *bracode != OP_ONCE)
3480     {
3481     uschar *scode = bracode;
3482     do
3483     {
3484     if (could_be_empty_branch(scode, ketcode, utf8))
3485     {
3486     *bracode += OP_SBRA - OP_BRA;
3487     break;
3488     }
3489     scode += GET(scode, 1);
3490     }
3491     while (*scode == OP_ALT);
3492     }
3493     }
3494 nigel 77 }
3495    
3496     /* Else there's some kind of shambles */
3497    
3498     else
3499     {
3500     *errorcodeptr = ERR11;
3501     goto FAILED;
3502     }
3503    
3504 nigel 93 /* If the character following a repeat is '+', or if certain optimization
3505     tests above succeeded, possessive_quantifier is TRUE. For some of the
3506     simpler opcodes, there is a special alternative opcode for this. For
3507     anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3508     The '+' notation is just syntactic sugar, taken from Sun's Java package,
3509     but the special opcodes can optimize it a bit. The repeated item starts at
3510     tempcode, not at previous, which might be the first part of a string whose
3511     (former) last char we repeated.
3512 nigel 77
3513 nigel 93 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3514     an 'upto' may follow. We skip over an 'exact' item, and then test the
3515     length of what remains before proceeding. */
3516    
3517 nigel 77 if (possessive_quantifier)
3518     {
3519 nigel 93 int len;
3520     if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3521     *tempcode == OP_NOTEXACT)
3522     tempcode += _pcre_OP_lengths[*tempcode];
3523     len = code - tempcode;
3524     if (len > 0) switch (*tempcode)
3525     {
3526     case OP_STAR: *tempcode = OP_POSSTAR; break;
3527     case OP_PLUS: *tempcode = OP_POSPLUS; break;
3528     case OP_QUERY: *tempcode = OP_POSQUERY; break;
3529     case OP_UPTO: *tempcode = OP_POSUPTO; break;
3530    
3531     case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
3532     case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
3533     case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3534     case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
3535    
3536     case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
3537     case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
3538     case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3539     case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
3540    
3541     default:
3542     memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3543     code += 1 + LINK_SIZE;
3544     len += 1 + LINK_SIZE;
3545     tempcode[0] = OP_ONCE;
3546     *code++ = OP_KET;
3547     PUTINC(code, 0, len);
3548     PUT(tempcode, 1, len);
3549     break;
3550     }
3551 nigel 77 }
3552    
3553     /* In all cases we no longer have a previous item. We also set the
3554     "follows varying string" flag for subsequently encountered reqbytes if
3555     it isn't already set and we have just passed a varying length item. */
3556    
3557     END_REPEAT:
3558     previous = NULL;
3559     cd->req_varyopt |= reqvary;
3560     break;
3561    
3562    
3563 nigel 93 /* ===================================================================*/
3564     /* Start of nested parenthesized sub-expression, or comment or lookahead or
3565     lookbehind or option setting or condition or all the other extended
3566     parenthesis forms. First deal with the specials; all are introduced by ?,
3567     and the appearance of any of them means that this is not a capturing
3568     group. */
3569 nigel 77
3570     case '(':
3571     newoptions = options;
3572     skipbytes = 0;
3573 nigel 93 bravalue = OP_CBRA;
3574     save_hwm = cd->hwm;
3575 nigel 77
3576     if (*(++ptr) == '?')
3577     {
3578 nigel 93 int i, set, unset, namelen;
3579 nigel 77 int *optset;
3580 nigel 93 const uschar *name;
3581     uschar *slot;
3582 nigel 77
3583     switch (*(++ptr))
3584     {
3585     case '#': /* Comment; skip to ket */
3586     ptr++;
3587 nigel 93 while (*ptr != 0 && *ptr != ')') ptr++;
3588     if (*ptr == 0)
3589     {
3590     *errorcodeptr = ERR18;
3591     goto FAILED;
3592     }
3593 nigel 77 continue;
3594    
3595 nigel 93
3596     /* ------------------------------------------------------------ */
3597     case ':': /* Non-capturing bracket */
3598 nigel 77 bravalue = OP_BRA;
3599     ptr++;
3600     break;
3601    
3602 nigel 93
3603     /* ------------------------------------------------------------ */
3604 nigel 77 case '(':
3605     bravalue = OP_COND; /* Conditional group */
3606    
3607 nigel 93 /* A condition can be an assertion, a number (referring to a numbered
3608     group), a name (referring to a named group), or 'R', referring to
3609     recursion. R<digits> and R&name are also permitted for recursion tests.
3610 nigel 77
3611 nigel 93 There are several syntaxes for testing a named group: (?(name)) is used
3612     by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3613    
3614     There are two unfortunate ambiguities, caused by history. (a) 'R' can
3615     be the recursive thing or the name 'R' (and similarly for 'R' followed
3616     by digits), and (b) a number could be a name that consists of digits.
3617     In both cases, we look for a name first; if not found, we try the other
3618     cases. */
3619    
3620     /* For conditions that are assertions, check the syntax, and then exit
3621     the switch. This will take control down to where bracketed groups,
3622     including assertions, are processed. */
3623    
3624     if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3625     break;
3626    
3627     /* Most other conditions use OP_CREF (a couple change to OP_RREF
3628     below), and all need to skip 3 bytes at the start of the group. */
3629    
3630     code[1+LINK_SIZE] = OP_CREF;
3631     skipbytes = 3;
3632 ph10 167 refsign = -1;
3633 nigel 93
3634     /* Check for a test for recursion in a named group. */
3635    
3636     if (ptr[1] == 'R' && ptr[2] == '&')
3637 nigel 77 {
3638 nigel 93 terminator = -1;
3639     ptr += 2;
3640     code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
3641     }
3642 nigel 91
3643 nigel 93 /* Check for a test for a named group's having been set, using the Perl
3644     syntax (?(<name>) or (?('name') */
3645 nigel 91
3646 nigel 93 else if (ptr[1] == '<')
3647     {
3648     terminator = '>';
3649     ptr++;
3650     }
3651     else if (ptr[1] == '\'')
3652     {
3653     terminator = '\'';
3654     ptr++;
3655     }
3656 ph10 167 else
3657     {
3658     terminator = 0;
3659     if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
3660     }
3661 nigel 77
3662 nigel 93 /* We now expect to read a name; anything else is an error */
3663 nigel 77
3664 nigel 93 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
3665     {
3666     ptr += 1; /* To get the right offset */
3667     *errorcodeptr = ERR28;
3668     goto FAILED;
3669     }
3670    
3671     /* Read the name, but also get it as a number if it's all digits */
3672    
3673     recno = 0;
3674     name = ++ptr;
3675     while ((cd->ctypes[*ptr] & ctype_word) != 0)
3676     {
3677     if (recno >= 0)
3678     recno = ((digitab[*ptr] & ctype_digit) != 0)?
3679     recno * 10 + *ptr - '0' : -1;
3680 nigel 91 ptr++;
3681 nigel 93 }
3682     namelen = ptr - name;
3683 nigel 91
3684 nigel 93 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
3685     {
3686     ptr--; /* Error offset */
3687     *errorcodeptr = ERR26;
3688     goto FAILED;
3689     }
3690 nigel 91
3691 nigel 93 /* Do no further checking in the pre-compile phase. */
3692 nigel 91
3693 nigel 93 if (lengthptr != NULL) break;
3694 nigel 91
3695 nigel 93 /* In the real compile we do the work of looking for the actual
3696 ph10 167 reference. If the string started with "+" or "-" we require the rest to
3697     be digits, in which case recno will be set. */
3698    
3699     if (refsign > 0)
3700     {
3701     if (recno <= 0)
3702     {
3703     *errorcodeptr = ERR58;
3704     goto FAILED;
3705     }
3706     if (refsign == '-')
3707     {
3708     recno = cd->bracount - recno + 1;
3709     if (recno <= 0)
3710     {
3711     *errorcodeptr = ERR15;
3712     goto FAILED;
3713     }
3714     }
3715     else recno += cd->bracount;
3716     PUT2(code, 2+LINK_SIZE, recno);
3717     break;
3718     }
3719 nigel 91
3720 ph10 167 /* Otherwise (did not start with "+" or "-"), start by looking for the
3721     name. */
3722    
3723 nigel 93 slot = cd->name_table;
3724     for (i = 0; i < cd->names_found; i++)
3725     {
3726     if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3727     slot += cd->name_entry_size;
3728     }
3729 nigel 91
3730 nigel 93 /* Found a previous named subpattern */
3731 nigel 91
3732 nigel 93 if (i < cd->names_found)
3733     {
3734     recno = GET2(slot, 0);
3735     PUT2(code, 2+LINK_SIZE, recno);
3736     }
3737 nigel 91
3738 nigel 93 /* Search the pattern for a forward reference */
3739 nigel 91
3740 nigel 93 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
3741     (options & PCRE_EXTENDED) != 0)) > 0)
3742     {
3743     PUT2(code, 2+LINK_SIZE, i);
3744     }
3745 nigel 91
3746 nigel 93 /* If terminator == 0 it means that the name followed directly after
3747     the opening parenthesis [e.g. (?(abc)...] and in this case there are
3748     some further alternatives to try. For the cases where terminator != 0
3749     [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
3750     now checked all the possibilities, so give an error. */
3751 nigel 91
3752 nigel 93 else if (terminator != 0)
3753     {
3754     *errorcodeptr = ERR15;
3755     goto FAILED;
3756     }
3757    
3758     /* Check for (?(R) for recursion. Allow digits after R to specify a
3759     specific group number. */
3760    
3761     else if (*name == 'R')
3762     {
3763     recno = 0;
3764     for (i = 1; i < namelen; i++)
3765 nigel 91 {
3766 nigel 93 if ((digitab[name[i]] & ctype_digit) == 0)
3767     {
3768     *errorcodeptr = ERR15;
3769     goto FAILED;
3770     }
3771     recno = recno * 10 + name[i] - '0';
3772 nigel 77 }
3773 nigel 93 if (recno == 0) recno = RREF_ANY;
3774     code[1+LINK_SIZE] = OP_RREF; /* Change test type */
3775     PUT2(code, 2+LINK_SIZE, recno);
3776 nigel 77 }
3777 nigel 91
3778 nigel 93 /* Similarly, check for the (?(DEFINE) "condition", which is always
3779     false. */
3780 nigel 91
3781 nigel 93 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
3782     {
3783     code[1+LINK_SIZE] = OP_DEF;
3784     skipbytes = 1;
3785     }
3786    
3787     /* Check for the "name" actually being a subpattern number. */
3788    
3789     else if (recno > 0)
3790     {
3791     PUT2(code, 2+LINK_SIZE, recno);
3792     }
3793    
3794     /* Either an unidentified subpattern, or a reference to (?(0) */
3795    
3796     else
3797     {
3798     *errorcodeptr = (recno == 0)? ERR35: ERR15;
3799     goto FAILED;
3800     }
3801 nigel 77 break;
3802    
3803 nigel 93
3804     /* ------------------------------------------------------------ */
3805 nigel 77 case '=': /* Positive lookahead */
3806     bravalue = OP_ASSERT;
3807     ptr++;
3808     break;
3809    
3810 nigel 93
3811     /* ------------------------------------------------------------ */
3812 nigel 77 case '!': /* Negative lookahead */
3813     bravalue = OP_ASSERT_NOT;
3814     ptr++;
3815     break;
3816    
3817 nigel 93
3818     /* ------------------------------------------------------------ */
3819     case '<': /* Lookbehind or named define */
3820     switch (ptr[1])
3821 nigel 77 {
3822     case '=': /* Positive lookbehind */
3823     bravalue = OP_ASSERTBACK;
3824 nigel 93 ptr += 2;
3825 nigel 77 break;
3826    
3827     case '!': /* Negative lookbehind */
3828     bravalue = OP_ASSERTBACK_NOT;
3829 nigel 93 ptr += 2;
3830 nigel 77 break;
3831 nigel 93
3832     default: /* Could be name define, else bad */
3833     if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
3834     ptr++; /* Correct offset for error */
3835     *errorcodeptr = ERR24;
3836     goto FAILED;
3837 nigel 77 }
3838     break;
3839    
3840 nigel 93
3841     /* ------------------------------------------------------------ */
3842 nigel 77 case '>': /* One-time brackets */
3843     bravalue = OP_ONCE;
3844     ptr++;
3845     break;
3846    
3847 nigel 93
3848     /* ------------------------------------------------------------ */
3849 nigel 77 case 'C': /* Callout - may be followed by digits; */
3850     previous_callout = code; /* Save for later completion */
3851     after_manual_callout = 1; /* Skip one item before completing */
3852 nigel 93 *code++ = OP_CALLOUT;
3853     {
3854 nigel 77 int n = 0;
3855     while ((digitab[*(++ptr)] & ctype_digit) != 0)
3856     n = n * 10 + *ptr - '0';
3857 nigel 93 if (*ptr != ')')
3858     {
3859     *errorcodeptr = ERR39;
3860     goto FAILED;
3861     }
3862 nigel 77 if (n > 255)
3863     {
3864     *errorcodeptr = ERR38;
3865     goto FAILED;
3866     }
3867     *code++ = n;
3868     PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
3869     PUT(code, LINK_SIZE, 0); /* Default length */
3870     code += 2 * LINK_SIZE;
3871     }
3872     previous = NULL;
3873     continue;
3874    
3875 nigel 93
3876     /* ------------------------------------------------------------ */
3877     case 'P': /* Python-style named subpattern handling */
3878     if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
3879 nigel 77 {
3880 nigel 93 is_recurse = *ptr == '>';
3881     terminator = ')';
3882     goto NAMED_REF_OR_RECURSE;
3883     }
3884     else if (*ptr != '<') /* Test for Python-style definition */
3885     {
3886     *errorcodeptr = ERR41;
3887     goto FAILED;
3888     }
3889     /* Fall through to handle (?P< as (?< is handled */
3890 nigel 77
3891    
3892 nigel 93 /* ------------------------------------------------------------ */
3893     DEFINE_NAME: /* Come here from (?< handling */
3894     case '\'':
3895     {
3896     terminator = (*ptr == '<')? '>' : '\'';
3897     name = ++ptr;
3898    
3899     while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3900     namelen = ptr - name;
3901    
3902     /* In the pre-compile phase, just do a syntax check. */
3903    
3904     if (lengthptr != NULL)
3905 nigel 77 {
3906 nigel 93 if (*ptr != terminator)
3907 nigel 77 {
3908 nigel 93 *errorcodeptr = ERR42;
3909     goto FAILED;
3910     }
3911     if (cd->names_found >= MAX_NAME_COUNT)
3912     {
3913     *errorcodeptr = ERR49;
3914     goto FAILED;
3915     }
3916     if (namelen + 3 > cd->name_entry_size)
3917     {
3918     cd->name_entry_size = namelen + 3;
3919     if (namelen > MAX_NAME_SIZE)
3920 nigel 77 {
3921 nigel 93 *errorcodeptr = ERR48;
3922     goto FAILED;
3923     }
3924     }
3925     }
3926    
3927     /* In the real compile, create the entry in the table */
3928    
3929     else
3930     {
3931     slot = cd->name_table;
3932     for (i = 0; i < cd->names_found; i++)
3933     {
3934     int crc = memcmp(name, slot+2, namelen);
3935     if (crc == 0)
3936     {
3937     if (slot[2+namelen] == 0)
3938 nigel 91 {
3939 nigel 93 if ((options & PCRE_DUPNAMES) == 0)
3940     {
3941     *errorcodeptr = ERR43;
3942     goto FAILED;
3943     }
3944 nigel 91 }
3945 nigel 93 else crc = -1; /* Current name is substring */
3946 nigel 77 }
3947 nigel 93 if (crc < 0)
3948     {
3949     memmove(slot + cd->name_entry_size, slot,
3950     (cd->names_found - i) * cd->name_entry_size);
3951     break;
3952     }
3953     slot += cd->name_entry_size;
3954 nigel 77 }
3955 nigel 93
3956     PUT2(slot, 0, cd->bracount + 1);
3957     memcpy(slot + 2, name, namelen);
3958     slot[2+namelen] = 0;
3959 nigel 77 }
3960     }
3961    
3962 nigel 93 /* In both cases, count the number of names we've encountered. */
3963    
3964     ptr++; /* Move past > or ' */
3965     cd->names_found++;
3966     goto NUMBERED_GROUP;
3967    
3968    
3969     /* ------------------------------------------------------------ */
3970     case '&': /* Perl recursion/subroutine syntax */
3971     terminator = ')';
3972     is_recurse = TRUE;
3973     /* Fall through */
3974    
3975     /* We come here from the Python syntax above that handles both
3976     references (?P=name) and recursion (?P>name), as well as falling
3977     through from the Perl recursion syntax (?&name). */
3978    
3979     NAMED_REF_OR_RECURSE:
3980     name = ++ptr;
3981     while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3982     namelen = ptr - name;
3983    
3984     /* In the pre-compile phase, do a syntax check and set a dummy
3985     reference number. */
3986    
3987     if (lengthptr != NULL)
3988 nigel 77 {
3989 nigel 93 if (*ptr != terminator)
3990     {
3991     *errorcodeptr = ERR42;
3992     goto FAILED;
3993     }
3994     if (namelen > MAX_NAME_SIZE)
3995     {
3996     *errorcodeptr = ERR48;
3997     goto FAILED;
3998     }
3999     recno = 0;
4000     }
4001 nigel 77
4002 nigel 93 /* In the real compile, seek the name in the table */
4003 nigel 77
4004 nigel 93 else
4005     {
4006     slot = cd->name_table;
4007 nigel 77 for (i = 0; i < cd->names_found; i++)
4008     {
4009     if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4010     slot += cd->name_entry_size;
4011     }
4012 nigel 91
4013     if (i < cd->names_found) /* Back reference */
4014 nigel 77 {
4015 nigel 91 recno = GET2(slot, 0);
4016     }
4017     else if ((recno = /* Forward back reference */
4018 nigel 93 find_parens(ptr, cd->bracount, name, namelen,
4019     (options & PCRE_EXTENDED) != 0)) <= 0)
4020 nigel 91 {
4021 nigel 77 *errorcodeptr = ERR15;
4022     goto FAILED;
4023     }
4024 nigel 93 }
4025 nigel 77
4026 nigel 93 /* In both phases, we can now go to the code that handles numerical
4027     recursion or backreferences. */
4028 nigel 77
4029 nigel 93 if (is_recurse) goto HANDLE_RECURSION;
4030     else goto HANDLE_REFERENCE;
4031 nigel 77
4032    
4033 nigel 93 /* ------------------------------------------------------------ */
4034     case 'R': /* Recursion */
4035 nigel 77 ptr++; /* Same as (?0) */
4036     /* Fall through */
4037    
4038    
4039 nigel 93 /* ------------------------------------------------------------ */
4040 ph10 166 case '-': case '+':
4041 nigel 93 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4042     case '5': case '6': case '7': case '8': case '9': /* subroutine */
4043 nigel 77 {
4044     const uschar *called;
4045 ph10 166
4046 ph10 167 if ((refsign = *ptr) == '+') ptr++;
4047     else if (refsign == '-')
4048 ph10 166 {
4049     if ((digitab[ptr[1]] & ctype_digit) == 0)
4050     goto OTHER_CHAR_AFTER_QUERY;
4051     ptr++;
4052     }
4053    
4054 nigel 77 recno = 0;
4055     while((digitab[*ptr] & ctype_digit) != 0)
4056     recno = recno * 10 + *ptr++ - '0';
4057 ph10 166
4058 nigel 93 if (*ptr != ')')
4059     {
4060     *errorcodeptr = ERR29;
4061     goto FAILED;
4062     }
4063 ph10 166
4064 ph10 167 if (refsign == '-')
4065 ph10 166 {
4066     if (recno == 0)
4067     {
4068     *errorcodeptr = ERR58;
4069     goto FAILED;
4070     }
4071     recno = cd->bracount - recno + 1;
4072     if (recno <= 0)
4073     {
4074     *errorcodeptr = ERR15;
4075     goto FAILED;
4076     }
4077     }
4078 ph10 167 else if (refsign == '+')
4079 ph10 166 {
4080     if (recno == 0)
4081     {
4082     *errorcodeptr = ERR58;
4083     goto FAILED;
4084     }
4085     recno += cd->bracount;
4086     }
4087 nigel 77
4088     /* Come here from code above that handles a named recursion */
4089    
4090     HANDLE_RECURSION:
4091    
4092     previous = code;
4093 nigel 93 called = cd->start_code;
4094 nigel 77
4095 nigel 93 /* When we are actually compiling, find the bracket that is being
4096     referenced. Temporarily end the regex in case it doesn't exist before
4097     this point. If we end up with a forward reference, first check that
4098     the bracket does occur later so we can give the error (and position)
4099     now. Then remember this forward reference in the workspace so it can
4100     be filled in at the end. */
4101 nigel 77
4102 nigel 93 if (lengthptr == NULL)
4103 nigel 77 {
4104 nigel 93 *code = OP_END;
4105     if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4106 nigel 77
4107 nigel 93 /* Forward reference */
4108 nigel 77
4109 nigel 93 if (called == NULL)
4110     {
4111     if (find_parens(ptr, cd->bracount, NULL, recno,
4112     (options & PCRE_EXTENDED) != 0) < 0)
4113     {
4114     *errorcodeptr = ERR15;
4115     goto FAILED;
4116     }
4117     called = cd->start_code + recno;
4118     PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4119     }
4120    
4121     /* If not a forward reference, and the subpattern is still open,
4122     this is a recursive call. We check to see if this is a left
4123     recursion that could loop for ever, and diagnose that case. */
4124    
4125     else if (GET(called, 1) == 0 &&
4126     could_be_empty(called, code, bcptr, utf8))
4127     {
4128     *errorcodeptr = ERR40;
4129     goto FAILED;
4130     }
4131 nigel 77 }
4132    
4133 nigel 87 /* Insert the recursion/subroutine item, automatically wrapped inside
4134 nigel 93 "once" brackets. Set up a "previous group" length so that a
4135     subsequent quantifier will work. */
4136 nigel 77
4137 nigel 87 *code = OP_ONCE;
4138     PUT(code, 1, 2 + 2*LINK_SIZE);
4139     code += 1 + LINK_SIZE;
4140    
4141 nigel 77 *code = OP_RECURSE;
4142     PUT(code, 1, called - cd->start_code);
4143     code += 1 + LINK_SIZE;
4144 nigel 87
4145     *code = OP_KET;
4146     PUT(code, 1, 2 + 2*LINK_SIZE);
4147     code += 1 + LINK_SIZE;
4148 nigel 93
4149     length_prevgroup = 3 + 3*LINK_SIZE;
4150 nigel 77 }
4151 nigel 93
4152     /* Can't determine a first byte now */
4153    
4154     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4155 nigel 77 continue;
4156    
4157    
4158 nigel 93 /* ------------------------------------------------------------ */
4159     default: /* Other characters: check option setting */
4160 ph10 166 OTHER_CHAR_AFTER_QUERY:
4161 nigel 77 set = unset = 0;
4162