/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 97 - (hide annotations) (download)
Mon Mar 5 12:36:47 2007 UTC (7 years, 4 months ago) by ph10
File MIME type: text/plain
File size: 172500 byte(s)
Applied Bob and Daniel's patches to convert the build system to automake. Added 
the maintain directory, containing files that are used for maintenance, but are 
not distributed. This is an intermediate step.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 nigel 87 Copyright (c) 1997-2006 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
/* These three names are defined before pcre_internal.h is included below.
NOTE(review): presumably macros in that header expand in terms of them (NLBLOCK
naming the variable that holds newline data, PSSTART/PSEND naming the fields
that bound the pattern string) - confirm against pcre_internal.h. */
45 nigel 93 #define NLBLOCK cd /* Block containing newline information */
46     #define PSSTART start_pattern /* Field containing processed string start */
47     #define PSEND end_pattern /* Field containing processed string end */
48    
49    
50 nigel 77 #include "pcre_internal.h"
#include <limits.h> /* INT_MAX */
51    
52    
53 nigel 85 /* When DEBUG is defined, we need the pcre_printint() function, which is also
54     used by pcretest. DEBUG is not defined when building a production library. */
55    
56     #ifdef DEBUG
57     #include "pcre_printint.src"
58     #endif
59    
60    
61 nigel 77 /*************************************************
62     * Code parameters and static tables *
63     *************************************************/
64    
65 nigel 93 /* This value specifies the size of stack workspace that is used during the
66     first pre-compile phase that determines how much memory is required. The regex
67     is partly compiled into this space, but the compiled parts are discarded as
68     soon as they can be, so that hopefully there will never be an overrun. The code
69     does, however, check for an overrun. The largest amount I've seen used is 218,
70     so this number is very generous.
71 nigel 77
72 nigel 93 The same workspace is used during the second, actual compile phase for
73     remembering forward references to groups so that they can be filled in at the
74     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
75     is 4 there is plenty of room. */
76 nigel 77
/* Size of the compile workspace described in the comment above; the
parentheses keep the value safe when the macro is used inside an expression. */
77 nigel 93 #define COMPILE_WORK_SIZE (4096)
78 nigel 77
79 nigel 93
80 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
81     are simple data values; negative values are for special things like \d and so
82     on. Zero means further processing is needed (for things like \x), or the escape
83     is invalid. */
84    
/* Exactly one of the two escapes[] tables below is compiled, selected by
EBCDIC. check_escape() indexes the ASCII table with (c - '0') after checking
that c lies in '0'..'z', and the EBCDIC table with (c - 0x48). A zero entry
sends check_escape() on to its switch statement for further processing. */
85 ph10 97 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
86 nigel 77 static const short int escapes[] = {
87     0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
88     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
89     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
90     0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
91 nigel 93 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
92 nigel 77 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
93     '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
94 nigel 93 0, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
95 nigel 77 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
96     0, 0, -ESC_z /* x - z */
97     };
98    
99 ph10 97 #else /* This is the "abnormal" table for EBCDIC systems */
/* The leading comment on each row is the hexadecimal code of the row's first
entry; the first row starts at 0x48, matching the (c - 0x48) index. */
100 nigel 77 static const short int escapes[] = {
101     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
102     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
103     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
104     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
105     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
106     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
107     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
108     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
109     /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
110 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
111 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
112     /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
113     /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
114     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
115     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
116     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
117     /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
118     /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
119 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
120 nigel 77 /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
121     /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
122     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
123     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
124     };
125     #endif
126    
127    
128     /* Tables of names of POSIX character classes and their lengths. The list is
129 nigel 87 terminated by a zero length entry. The first three must be alpha, lower, upper,
130 nigel 77 as this is assumed for handling case independence. */
131    
/* posix_names[] and posix_name_lengths[] are parallel arrays: entry i of the
second holds the length of entry i of the first. Only the lengths array carries
the terminating zero entry, so it has one more element than the names array. */
132     static const char *const posix_names[] = {
133     "alpha", "lower", "upper",
134     "alnum", "ascii", "blank", "cntrl", "digit", "graph",
135     "print", "punct", "space", "word", "xdigit" };
136    
137     static const uschar posix_name_lengths[] = {
138     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
139    
140 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
141     base map, with an optional addition or removal of another map. Then, for some
142     classes, there is some additional tweaking: for [:blank:] the vertical space
143     characters are removed, and for [:alpha:] and [:alnum:] the underscore
144     character is removed. The triples in the table consist of the base map offset,
145     second map offset or -1 if no second map, and a non-negative value for map
146     addition or a negative value for map subtraction (if there are two maps). The
147     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
148     remove vertical space characters, 2 => remove underscore. */
149 nigel 77
/* One triple per class, listed in the same order as the names in
posix_names[] (see the trailing comment on each row). */
150     static const int posix_class_maps[] = {
151 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
152     cbit_lower, -1, 0, /* lower */
153     cbit_upper, -1, 0, /* upper */
154     cbit_word, -1, 2, /* alnum - word without underscore */
155     cbit_print, cbit_cntrl, 0, /* ascii */
156     cbit_space, -1, 1, /* blank - a GNU extension */
157     cbit_cntrl, -1, 0, /* cntrl */
158     cbit_digit, -1, 0, /* digit */
159     cbit_graph, -1, 0, /* graph */
160     cbit_print, -1, 0, /* print */
161     cbit_punct, -1, 0, /* punct */
162     cbit_space, -1, 0, /* space */
163     cbit_word, -1, 0, /* word - a Perl extension */
164     cbit_xdigit,-1, 0 /* xdigit */
165 nigel 77 };
166    
167    
/* Two-level stringification. STRING(a) turns its argument into a string
literal exactly as written; XSTRING(s) lets the preprocessor expand s first, so
that a macro's value rather than its name is stringified. XSTRING is used in
the error_texts[] messages that embed MAX_NAME_SIZE and MAX_NAME_COUNT. */
168 nigel 93 #define STRING(a) # a
169     #define XSTRING(s) STRING(s)
170    
171 nigel 77 /* The texts of compile-time error messages. These are "char *" because they
172 nigel 93 are passed to the outside world. Do not ever re-use any error number, because
173     they are documented. Always add a new error instead. Messages marked DEAD below
174     are no longer used. */
175 nigel 77
/* The position of a message in this array is its error number; the numbered
comments mark every fifth entry. NOTE(review): the ERRnn constants used in
this file (for example ERR1, ERR15, ERR57) presumably index this array -
confirm against their definitions in pcre_internal.h. Two of the messages are
assembled at compile time from adjacent string literals and
XSTRING(MAX_NAME_SIZE) / XSTRING(MAX_NAME_COUNT). */
176     static const char *error_texts[] = {
177     "no error",
178     "\\ at end of pattern",
179     "\\c at end of pattern",
180     "unrecognized character follows \\",
181     "numbers out of order in {} quantifier",
182     /* 5 */
183     "number too big in {} quantifier",
184     "missing terminating ] for character class",
185     "invalid escape sequence in character class",
186     "range out of order in character class",
187     "nothing to repeat",
188     /* 10 */
189 nigel 93 "operand of unlimited repeat could match the empty string", /** DEAD **/
190 nigel 77 "internal error: unexpected repeat",
191     "unrecognized character after (?",
192     "POSIX named classes are supported only within a class",
193     "missing )",
194     /* 15 */
195     "reference to non-existent subpattern",
196     "erroffset passed as NULL",
197     "unknown option bit(s) set",
198     "missing ) after comment",
199 nigel 93 "parentheses nested too deeply", /** DEAD **/
200 nigel 77 /* 20 */
201     "regular expression too large",
202     "failed to get memory",
203     "unmatched parentheses",
204     "internal error: code overflow",
205     "unrecognized character after (?<",
206     /* 25 */
207     "lookbehind assertion is not fixed length",
208 nigel 91 "malformed number or name after (?(",
209 nigel 77 "conditional group contains more than two branches",
210     "assertion expected after (?(",
211     "(?R or (?digits must be followed by )",
212     /* 30 */
213     "unknown POSIX class name",
214     "POSIX collating elements are not supported",
215     "this version of PCRE is not compiled with PCRE_UTF8 support",
216 nigel 93 "spare error", /** DEAD **/
217 nigel 77 "character value in \\x{...} sequence is too large",
218     /* 35 */
219     "invalid condition (?(0)",
220     "\\C not allowed in lookbehind assertion",
221     "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
222     "number after (?C is > 255",
223     "closing ) for (?C expected",
224     /* 40 */
225     "recursive call could loop indefinitely",
226     "unrecognized character after (?P",
227 nigel 93 "syntax error in subpattern name (missing terminator)",
228 nigel 91 "two named subpatterns have the same name",
229 nigel 77 "invalid UTF-8 string",
230     /* 45 */
231     "support for \\P, \\p, and \\X has not been compiled",
232     "malformed \\P or \\p sequence",
233 nigel 91 "unknown property name after \\P or \\p",
234 nigel 93 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
235     "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
236 nigel 91 /* 50 */
237     "repeated subpattern is too long",
238 nigel 93 "octal value is greater than \\377 (not in UTF-8 mode)",
239     "internal error: overran compiling workspace",
240     "internal error: previously-checked referenced subpattern not found",
241     "DEFINE group contains more than one branch",
242     /* 55 */
243     "repeating a DEFINE group is not allowed",
244     "inconsistent NEWLINE options",
245     "\\g is not followed by an (optionally braced) non-zero number"
246 nigel 77 };
247    
248    
249     /* Table to identify digits and hex digits. This is used when compiling
250     patterns. Note that the tables in chartables are dependent on the locale, and
251     may mark arbitrary characters as digits - but the PCRE compiling code expects
252     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
253     a private table here. It costs 256 bytes, but it is a lot faster than doing
254     character value tests (at least in some simple cases I timed), and in some
255     applications one wants PCRE to compile efficiently as well as match
256     efficiently.
257    
258     For convenience, we use the same bit definitions as in chartables:
259    
260     0x04 decimal digit
261     0x08 hexadecimal digit
262    
263     Then we can use ctype_digit and ctype_xdigit in the code. */
264    
/* Exactly one digitab[] is compiled, selected by EBCDIC. Each table has 256
entries, indexed directly by character value. In both tables the ten decimal
digits carry 0x0c (decimal digit + hex digit) and the letters a-f and A-F carry
0x08 (hex digit only); every other entry is zero. */
265 ph10 97 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
266 nigel 77 static const unsigned char digitab[] =
267     {
268     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
269     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
270     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
271     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
272     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
273     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
274     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
275     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
276     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
277     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
278     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
279     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
280     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
281     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
282     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
283     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
284     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
285     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
286     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
287     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
288     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
289     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
290     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
291     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
292     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
293     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
294     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
295     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
296     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
297     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
298     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
299     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
300    
301 ph10 97 #else /* This is the "abnormal" case, for EBCDIC systems */
302 nigel 77 static const unsigned char digitab[] =
303     {
304     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
305     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
306     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
307     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
308     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
309     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
310     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
311     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
312     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
313     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
314     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
315 ph10 97 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
316 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
317     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
318     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
319     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
320     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
321     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
322     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
323     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
324     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
325     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
326     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
327     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
328     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
329     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
330     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
331     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
332     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
333     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
334     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
335     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
336    
/* EBCDIC only: check_escape() treats a character c as alphameric when
(ebcdic_chartab[c] & 0x0E) is non-zero. NOTE(review): the table's own comment
calls it a partial duplicate of the chartable, so the remaining bits (0x01,
0x02, 0x10, 0x80) presumably follow the ctype bit meanings used there -
confirm before relying on them. */
337     static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
338     0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
339     0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
340     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
341     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
342     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
343     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
344     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
345     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
346     0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
347     0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
348     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
349 ph10 97 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
350 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
351     0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
352     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
353     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
354     0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
355     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
356     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
357     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
358     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
359     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
360     0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
361     0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
362     0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
363     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
364     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
365     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
366     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
367     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
368     0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
369     0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
370     #endif
371    
372    
373     /* Definition to allow mutual recursion */
374    
/* Prototype only: compile_regex() is declared here, without parameter names,
so that functions defined before its body can call it. The parameter meanings
are not visible at this point; see the function's definition. */
375     static BOOL
376 nigel 93 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, int, int *,
377     int *, branch_chain *, compile_data *, int *);
378 nigel 77
379    
380    
381     /*************************************************
382     * Handle escapes *
383     *************************************************/
384    
385     /* This function is called when a \ has been encountered. It either returns a
386     positive value for a simple escape such as \n, or a negative value which
387 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
388     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
389     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
390     ptr is pointing at the \. On exit, it is on the final character of the escape
391     sequence.
392 nigel 77
393     Arguments:
394     ptrptr points to the pattern position pointer
395     errorcodeptr points to the errorcode variable
396     bracount number of previous extracting brackets
397     options the options bits
398     isclass TRUE if inside a character class
399    
400     Returns: zero or positive => a data character
401     negative => a special escape sequence
402     on error, errorptr is set
403     */
404    
405     static int
406     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
407     int options, BOOL isclass)
408     {
409 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
410     const uschar *ptr = *ptrptr + 1;
411 nigel 77 int c, i;
412    
413 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
414     ptr--; /* Set pointer back to the last byte */
415    
416 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
417    
418     if (c == 0) *errorcodeptr = ERR1;
419    
420     /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
421     a table. A non-zero result is something that can be returned immediately.
422     Otherwise further processing may be required. */
423    
424 ph10 97 #ifndef EBCDIC /* ASCII coding */
425 nigel 77 else if (c < '0' || c > 'z') {} /* Not alphameric */
426     else if ((i = escapes[c - '0']) != 0) c = i;
427    
428 ph10 97 #else /* EBCDIC coding */
429 nigel 77 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
430     else if ((i = escapes[c - 0x48]) != 0) c = i;
431     #endif
432    
433     /* Escapes that need further processing, or are illegal. */
434    
435     else
436     {
437     const uschar *oldptr;
438 nigel 93 BOOL braced, negated;
439    
440 nigel 77 switch (c)
441     {
442     /* A number of Perl escapes are not handled by PCRE. We give an explicit
443     error. */
444    
445     case 'l':
446     case 'L':
447     case 'N':
448     case 'u':
449     case 'U':
450     *errorcodeptr = ERR37;
451     break;
452    
453 nigel 93 /* \g must be followed by a number, either plain or braced. If positive, it
454     is an absolute backreference. If negative, it is a relative backreference.
455     This is a Perl 5.10 feature. */
456    
457     case 'g':
458     if (ptr[1] == '{')
459     {
460     braced = TRUE;
461     ptr++;
462     }
463     else braced = FALSE;
464    
465     if (ptr[1] == '-')
466     {
467     negated = TRUE;
468     ptr++;
469     }
470     else negated = FALSE;
471    
472     c = 0;
473     while ((digitab[ptr[1]] & ctype_digit) != 0)
474     c = c * 10 + *(++ptr) - '0';
475    
476     if (c == 0 || (braced && *(++ptr) != '}'))
477     {
478     *errorcodeptr = ERR57;
479     return 0;
480     }
481    
482     if (negated)
483     {
484     if (c > bracount)
485     {
486     *errorcodeptr = ERR15;
487     return 0;
488     }
489     c = bracount - (c - 1);
490     }
491    
492     c = -(ESC_REF + c);
493     break;
494    
495 nigel 77 /* The handling of escape sequences consisting of a string of digits
496     starting with one that is not zero is not straightforward. By experiment,
497     the way Perl works seems to be as follows:
498    
499     Outside a character class, the digits are read as a decimal number. If the
500     number is less than 10, or if there are that many previous extracting
501     left brackets, then it is a back reference. Otherwise, up to three octal
502     digits are read to form an escaped byte. Thus \123 is likely to be octal
503     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
504     value is greater than 377, the least significant 8 bits are taken. Inside a
505     character class, \ followed by a digit is always an octal number. */
506    
507     case '1': case '2': case '3': case '4': case '5':
508     case '6': case '7': case '8': case '9':
509    
510     if (!isclass)
511     {
512     oldptr = ptr;
513     c -= '0';
514     while ((digitab[ptr[1]] & ctype_digit) != 0)
515     c = c * 10 + *(++ptr) - '0';
516     if (c < 10 || c <= bracount)
517     {
518     c = -(ESC_REF + c);
519     break;
520     }
521     ptr = oldptr; /* Put the pointer back and fall through */
522     }
523    
524     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
525     generates a binary zero byte and treats the digit as a following literal.
526     Thus we have to pull back the pointer by one. */
527    
528     if ((c = *ptr) >= '8')
529     {
530     ptr--;
531     c = 0;
532     break;
533     }
534    
535     /* \0 always starts an octal number, but we may drop through to here with a
536 nigel 91 larger first octal digit. The original code used just to take the least
537     significant 8 bits of octal numbers (I think this is what early Perls used
538     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
539     than 3 octal digits. */
540 nigel 77
541     case '0':
542     c -= '0';
543     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
544     c = c * 8 + *(++ptr) - '0';
545 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
546 nigel 77 break;
547    
548 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
549     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
550     treated as a data character. */
551 nigel 77
552     case 'x':
553 nigel 87 if (ptr[1] == '{')
554 nigel 77 {
555     const uschar *pt = ptr + 2;
556 nigel 87 int count = 0;
557    
558 nigel 77 c = 0;
559     while ((digitab[*pt] & ctype_xdigit) != 0)
560     {
561 nigel 87 register int cc = *pt++;
562     if (c == 0 && cc == '0') continue; /* Leading zeroes */
563 nigel 77 count++;
564 nigel 87
565 ph10 97 #ifndef EBCDIC /* ASCII coding */
566 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
567 nigel 87 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
568 ph10 97 #else /* EBCDIC coding */
569 nigel 77 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
570 nigel 87 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
571 nigel 77 #endif
572     }
573 nigel 87
574 nigel 77 if (*pt == '}')
575     {
576 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
577 nigel 77 ptr = pt;
578     break;
579     }
580 nigel 87
581 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
582     recognize this construct; fall through to the normal \x handling. */
583     }
584    
585 nigel 87 /* Read just a single-byte hex-defined char */
586 nigel 77
587     c = 0;
588     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
589     {
590     int cc; /* Some compilers don't like ++ */
591     cc = *(++ptr); /* in initializers */
592 ph10 97 #ifndef EBCDIC /* ASCII coding */
593 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
594     c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
595 ph10 97 #else /* EBCDIC coding */
596 nigel 77 if (cc <= 'z') cc += 64; /* Convert to upper case */
597     c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
598     #endif
599     }
600     break;
601    
602 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
603     This coding is ASCII-specific, but then the whole concept of \cx is
604     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
605 nigel 77
606     case 'c':
607     c = *(++ptr);
608     if (c == 0)
609     {
610     *errorcodeptr = ERR2;
611     return 0;
612     }
613    
614 ph10 97 #ifndef EBCDIC /* ASCII coding */
615 nigel 77 if (c >= 'a' && c <= 'z') c -= 32;
616     c ^= 0x40;
617 ph10 97 #else /* EBCDIC coding */
618 nigel 77 if (c >= 'a' && c <= 'z') c += 64;
619     c ^= 0xC0;
620     #endif
621     break;
622    
623     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
624     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
625     for Perl compatibility, it is a literal. This code looks a bit odd, but
626     there used to be some cases other than the default, and there may be again
627     in future, so I haven't "optimized" it. */
628    
629     default:
630     if ((options & PCRE_EXTRA) != 0) switch(c)
631     {
632     default:
633     *errorcodeptr = ERR3;
634     break;
635     }
636     break;
637     }
638     }
639    
640     *ptrptr = ptr;
641     return c;
642     }
643    
644    
645    
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* This function is called after \P or \p has been encountered, provided that
PCRE is compiled with support for Unicode properties. On entry, ptrptr is
pointing at the P or p. On exit, it is pointing at the final character of the
escape sequence.

The property name is either the single character that follows, or a name in
braces, optionally preceded by ^ for negation. A braced name must fit in the
local name buffer and must be terminated by } before the end of the pattern;
otherwise the sequence is malformed.

Arguments:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE;
                   it is always given a value, even when an error is returned
  dptr           points to an int that is set to the value field of the
                   matching _pcre_utt entry (left untouched on error)
  errorcodeptr   points to the error code variable

Returns:         the type field of the matching _pcre_utt entry, or -1 with
                   *errorcodeptr set to ERR46 (malformed sequence) or ERR47
                   (unknown property name)
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
int c, i, bot, top;
const uschar *ptr = *ptrptr;
char name[32];

/* Set the negation flag before anything can fail, so that the caller never
sees an indeterminate value. */

*negptr = FALSE;

c = *(++ptr);
if (c == 0) goto ERROR_RETURN;

/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
negation. */

if (c == '{')
  {
  if (ptr[1] == '^')
    {
    *negptr = TRUE;
    ptr++;
    }

  /* Copy the name, leaving room for the terminating zero. The cast keeps the
  comparison between two signed values. */

  for (i = 0; i < (int)sizeof(name) - 1; i++)
    {
    c = *(++ptr);
    if (c == 0) goto ERROR_RETURN;
    if (c == '}') break;
    name[i] = c;
    }
  if (c != '}') goto ERROR_RETURN;     /* Name too long for the buffer */
  name[i] = 0;
  }

/* Otherwise there is just one following character */

else
  {
  name[0] = c;
  name[1] = 0;
  }

/* The escape is syntactically complete: tell the caller where it ends. This
position is also the right one to report if the name turns out to be unknown,
because ptr does not move again. */

*ptrptr = ptr;

/* Search for a recognized property name using binary chop; this requires the
names in _pcre_utt to be in strcmp() order. */

bot = 0;
top = _pcre_utt_size;

while (bot < top)
  {
  i = (bot + top) >> 1;
  c = strcmp(name, _pcre_utt[i].name);
  if (c == 0)
    {
    *dptr = _pcre_utt[i].value;
    return _pcre_utt[i].type;
    }
  if (c > 0) bot = i + 1; else top = i;
  }

*errorcodeptr = ERR47;
return -1;

ERROR_RETURN:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
735    
736    
737    
738    
739     /*************************************************
740     * Check for counted repeat *
741     *************************************************/
742    
743     /* This function is called when a '{' is encountered in a place where it might
744     start a quantifier. It looks ahead to see if it really is a quantifier or not.
745     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
746     where the ddds are digits.
747    
748     Arguments:
749     p pointer to the first char after '{'
750    
751     Returns: TRUE or FALSE
752     */
753    
754     static BOOL
755     is_counted_repeat(const uschar *p)
756     {
757     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
758     while ((digitab[*p] & ctype_digit) != 0) p++;
759     if (*p == '}') return TRUE;
760    
761     if (*p++ != ',') return FALSE;
762     if (*p == '}') return TRUE;
763    
764     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
765     while ((digitab[*p] & ctype_digit) != 0) p++;
766    
767     return (*p == '}');
768     }
769    
770    
771    
772     /*************************************************
773     * Read repeat counts *
774     *************************************************/
775    
776     /* Read an item of the form {n,m} and return the values. This is called only
777     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
778     so the syntax is guaranteed to be correct, but we need to check the values.
779    
780     Arguments:
781     p pointer to first char after '{'
782     minp pointer to int for min
783     maxp pointer to int for max
784     returned as -1 if no max
785     errorcodeptr points to error code variable
786    
787     Returns: pointer to '}' on success;
788     current ptr on error, with errorcodeptr set non-zero
789     */
790    
791     static const uschar *
792     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
793     {
794     int min = 0;
795     int max = -1;
796    
797 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
798     an integer overflow. */
799    
800 nigel 77 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
801 nigel 81 if (min < 0 || min > 65535)
802     {
803     *errorcodeptr = ERR5;
804     return p;
805     }
806 nigel 77
807 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
808     Also, max must not be less than min. */
809    
810 nigel 77 if (*p == '}') max = min; else
811     {
812     if (*(++p) != '}')
813     {
814     max = 0;
815     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
816 nigel 81 if (max < 0 || max > 65535)
817     {
818     *errorcodeptr = ERR5;
819     return p;
820     }
821 nigel 77 if (max < min)
822     {
823     *errorcodeptr = ERR4;
824     return p;
825     }
826     }
827     }
828    
829 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
830     '}'. */
831 nigel 77
832 nigel 81 *minp = min;
833     *maxp = max;
834 nigel 77 return p;
835     }
836    
837    
838    
839     /*************************************************
840 nigel 93 * Find forward referenced subpattern *
841 nigel 91 *************************************************/
842    
843 nigel 93 /* This function scans along a pattern's text looking for capturing
844     subpatterns, and counting them. If it finds a named pattern that matches the
845     name it is given, it returns its number. Alternatively, if the name is NULL, it
846     returns when it reaches a given numbered subpattern. This is used for forward
847     references to subpatterns. We know that if (?P< is encountered, the name will
848     be terminated by '>' because that is checked in the first pass.
849 nigel 91
850     Arguments:
851 nigel 93 ptr current position in the pattern
852     count current count of capturing parens so far encountered
853     name name to seek, or NULL if seeking a numbered subpattern
854     lorn name length, or subpattern number if name is NULL
855     xmode TRUE if we are in /x mode
856 nigel 91
857     Returns: the number of the named subpattern, or -1 if not found
858     */
859    
860     static int
861 nigel 93 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
862     BOOL xmode)
863 nigel 91 {
864     const uschar *thisname;
865 nigel 93
866 nigel 91 for (; *ptr != 0; ptr++)
867     {
868 nigel 93 int term;
869    
870     /* Skip over backslashed characters and also entire \Q...\E */
871    
872     if (*ptr == '\\')
873     {
874     if (*(++ptr) == 0) return -1;
875     if (*ptr == 'Q') for (;;)
876     {
877     while (*(++ptr) != 0 && *ptr != '\\');
878     if (*ptr == 0) return -1;
879     if (*(++ptr) == 'E') break;
880     }
881     continue;
882     }
883    
884     /* Skip over character classes */
885    
886     if (*ptr == '[')
887     {
888     while (*(++ptr) != ']')
889     {
890     if (*ptr == '\\')
891     {
892     if (*(++ptr) == 0) return -1;
893     if (*ptr == 'Q') for (;;)
894     {
895     while (*(++ptr) != 0 && *ptr != '\\');
896     if (*ptr == 0) return -1;
897     if (*(++ptr) == 'E') break;
898     }
899     continue;
900     }
901     }
902     continue;
903     }
904    
905     /* Skip comments in /x mode */
906    
907     if (xmode && *ptr == '#')
908     {
909     while (*(++ptr) != 0 && *ptr != '\n');
910     if (*ptr == 0) return -1;
911     continue;
912     }
913    
914     /* An opening parens must now be a real metacharacter */
915    
916 nigel 91 if (*ptr != '(') continue;
917 nigel 93 if (ptr[1] != '?')
918     {
919     count++;
920     if (name == NULL && count == lorn) return count;
921     continue;
922     }
923    
924     ptr += 2;
925     if (*ptr == 'P') ptr++; /* Allow optional P */
926    
927     /* We have to disambiguate (?<! and (?<= from (?<name> */
928    
929     if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
930     *ptr != '\'')
931     continue;
932    
933 nigel 91 count++;
934 nigel 93
935     if (name == NULL && count == lorn) return count;
936     term = *ptr++;
937     if (term == '<') term = '>';
938 nigel 91 thisname = ptr;
939 nigel 93 while (*ptr != term) ptr++;
940     if (name != NULL && lorn == ptr - thisname &&
941     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
942 nigel 91 return count;
943     }
944 nigel 93
945 nigel 91 return -1;
946     }
947    
948    
949    
950     /*************************************************
951 nigel 77 * Find first significant op code *
952     *************************************************/
953    
954     /* This is called by several functions that scan a compiled expression looking
955     for a fixed first character, or an anchoring op code etc. It skips over things
956     that do not influence this. For some calls, a change of option is important.
957     For some calls, it makes sense to skip negative forward and all backward
958     assertions, and also the \b assertion; for others it does not.
959    
960     Arguments:
961     code pointer to the start of the group
962     options pointer to external options
963     optbit the option bit whose changing is significant, or
964     zero if none are
965     skipassert TRUE if certain assertions are to be skipped
966    
967     Returns: pointer to the first significant opcode
968     */
969    
970     static const uschar*
971     first_significant_code(const uschar *code, int *options, int optbit,
972     BOOL skipassert)
973     {
974     for (;;)
975     {
976     switch ((int)*code)
977     {
978     case OP_OPT:
979     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
980     *options = (int)code[1];
981     code += 2;
982     break;
983    
984     case OP_ASSERT_NOT:
985     case OP_ASSERTBACK:
986     case OP_ASSERTBACK_NOT:
987     if (!skipassert) return code;
988     do code += GET(code, 1); while (*code == OP_ALT);
989     code += _pcre_OP_lengths[*code];
990     break;
991    
992     case OP_WORD_BOUNDARY:
993     case OP_NOT_WORD_BOUNDARY:
994     if (!skipassert) return code;
995     /* Fall through */
996    
997     case OP_CALLOUT:
998     case OP_CREF:
999 nigel 93 case OP_RREF:
1000     case OP_DEF:
1001 nigel 77 code += _pcre_OP_lengths[*code];
1002     break;
1003    
1004     default:
1005     return code;
1006     }
1007     }
1008     /* Control never reaches here */
1009     }
1010    
1011    
1012    
1013    
1014     /*************************************************
1015     * Find the fixed length of a pattern *
1016     *************************************************/
1017    
1018     /* Scan a pattern and compute the fixed length of subject that will match it,
1019     if the length is fixed. This is needed for dealing with backward assertions.
1020     In UTF8 mode, the result is in characters rather than bytes.
1021    
1022     Arguments:
1023     code points to the start of the pattern (the bracket)
1024     options the compiling options
1025    
1026     Returns: the fixed length, or -1 if there is no fixed length,
1027     or -2 if \C was encountered
1028     */
1029    
1030     static int
1031     find_fixedlength(uschar *code, int options)
1032     {
1033     int length = -1;
1034    
1035     register int branchlength = 0;
1036     register uschar *cc = code + 1 + LINK_SIZE;
1037    
1038     /* Scan along the opcodes for this branch. If we get to the end of the
1039     branch, check the length against that of the other branches. */
1040    
1041     for (;;)
1042     {
1043     int d;
1044     register int op = *cc;
1045    
1046     switch (op)
1047     {
1048 nigel 93 case OP_CBRA:
1049 nigel 77 case OP_BRA:
1050     case OP_ONCE:
1051     case OP_COND:
1052 nigel 93 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1053 nigel 77 if (d < 0) return d;
1054     branchlength += d;
1055     do cc += GET(cc, 1); while (*cc == OP_ALT);
1056     cc += 1 + LINK_SIZE;
1057     break;
1058    
1059     /* Reached end of a branch; if it's a ket it is the end of a nested
1060     call. If it's ALT it is an alternation in a nested call. If it is
1061     END it's the end of the outer call. All can be handled by the same code. */
1062    
1063     case OP_ALT:
1064     case OP_KET:
1065     case OP_KETRMAX:
1066     case OP_KETRMIN:
1067     case OP_END:
1068     if (length < 0) length = branchlength;
1069     else if (length != branchlength) return -1;
1070     if (*cc != OP_ALT) return length;
1071     cc += 1 + LINK_SIZE;
1072     branchlength = 0;
1073     break;
1074    
1075     /* Skip over assertive subpatterns */
1076    
1077     case OP_ASSERT:
1078     case OP_ASSERT_NOT:
1079     case OP_ASSERTBACK:
1080     case OP_ASSERTBACK_NOT:
1081     do cc += GET(cc, 1); while (*cc == OP_ALT);
1082     /* Fall through */
1083    
1084     /* Skip over things that don't match chars */
1085    
1086     case OP_REVERSE:
1087     case OP_CREF:
1088 nigel 93 case OP_RREF:
1089     case OP_DEF:
1090 nigel 77 case OP_OPT:
1091     case OP_CALLOUT:
1092     case OP_SOD:
1093     case OP_SOM:
1094     case OP_EOD:
1095     case OP_EODN:
1096     case OP_CIRC:
1097     case OP_DOLL:
1098     case OP_NOT_WORD_BOUNDARY:
1099     case OP_WORD_BOUNDARY:
1100     cc += _pcre_OP_lengths[*cc];
1101     break;
1102    
1103     /* Handle literal characters */
1104    
1105     case OP_CHAR:
1106     case OP_CHARNC:
1107 nigel 91 case OP_NOT:
1108 nigel 77 branchlength++;
1109     cc += 2;
1110     #ifdef SUPPORT_UTF8
1111     if ((options & PCRE_UTF8) != 0)
1112     {
1113     while ((*cc & 0xc0) == 0x80) cc++;
1114     }
1115     #endif
1116     break;
1117    
1118     /* Handle exact repetitions. The count is already in characters, but we
1119     need to skip over a multibyte character in UTF8 mode. */
1120    
1121     case OP_EXACT:
1122     branchlength += GET2(cc,1);
1123     cc += 4;
1124     #ifdef SUPPORT_UTF8
1125     if ((options & PCRE_UTF8) != 0)
1126     {
1127     while((*cc & 0x80) == 0x80) cc++;
1128     }
1129     #endif
1130     break;
1131    
1132     case OP_TYPEEXACT:
1133     branchlength += GET2(cc,1);
1134     cc += 4;
1135     break;
1136    
1137     /* Handle single-char matchers */
1138    
1139     case OP_PROP:
1140     case OP_NOTPROP:
1141 nigel 87 cc += 2;
1142 nigel 77 /* Fall through */
1143    
1144     case OP_NOT_DIGIT:
1145     case OP_DIGIT:
1146     case OP_NOT_WHITESPACE:
1147     case OP_WHITESPACE:
1148     case OP_NOT_WORDCHAR:
1149     case OP_WORDCHAR:
1150     case OP_ANY:
1151     branchlength++;
1152     cc++;
1153     break;
1154    
1155     /* The single-byte matcher isn't allowed */
1156    
1157     case OP_ANYBYTE:
1158     return -2;
1159    
1160     /* Check a class for variable quantification */
1161    
1162     #ifdef SUPPORT_UTF8
1163     case OP_XCLASS:
1164     cc += GET(cc, 1) - 33;
1165     /* Fall through */
1166     #endif
1167    
1168     case OP_CLASS:
1169     case OP_NCLASS:
1170     cc += 33;
1171    
1172     switch (*cc)
1173     {
1174     case OP_CRSTAR:
1175     case OP_CRMINSTAR:
1176     case OP_CRQUERY:
1177     case OP_CRMINQUERY:
1178     return -1;
1179    
1180     case OP_CRRANGE:
1181     case OP_CRMINRANGE:
1182     if (GET2(cc,1) != GET2(cc,3)) return -1;
1183     branchlength += GET2(cc,1);
1184     cc += 5;
1185     break;
1186    
1187     default:
1188     branchlength++;
1189     }
1190     break;
1191    
1192     /* Anything else is variable length */
1193    
1194     default:
1195     return -1;
1196     }
1197     }
1198     /* Control never gets here */
1199     }
1200    
1201    
1202    
1203    
1204     /*************************************************
1205     * Scan compiled regex for numbered bracket *
1206     *************************************************/
1207    
1208     /* This little function scans through a compiled pattern until it finds a
1209     capturing bracket with the given number.
1210    
1211     Arguments:
1212     code points to start of expression
1213     utf8 TRUE in UTF-8 mode
1214     number the required bracket number
1215    
1216     Returns: pointer to the opcode for the bracket, or NULL if not found
1217     */
1218    
1219     static const uschar *
1220     find_bracket(const uschar *code, BOOL utf8, int number)
1221     {
1222     for (;;)
1223     {
1224     register int c = *code;
1225     if (c == OP_END) return NULL;
1226 nigel 91
1227     /* XCLASS is used for classes that cannot be represented just by a bit
1228     map. This includes negated single high-valued characters. The length in
1229     the table is zero; the actual length is stored in the compiled code. */
1230    
1231     if (c == OP_XCLASS) code += GET(code, 1);
1232    
1233 nigel 93 /* Handle capturing bracket */
1234 nigel 91
1235 nigel 93 else if (c == OP_CBRA)
1236 nigel 77 {
1237 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1238 nigel 77 if (n == number) return (uschar *)code;
1239 nigel 93 code += _pcre_OP_lengths[c];
1240 nigel 77 }
1241 nigel 91
1242 nigel 93 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1243     a multi-byte character. The length in the table is a minimum, so we have to
1244     arrange to skip the extra bytes. */
1245 nigel 91
1246 nigel 77 else
1247     {
1248     code += _pcre_OP_lengths[c];
1249     if (utf8) switch(c)
1250     {
1251     case OP_CHAR:
1252     case OP_CHARNC:
1253     case OP_EXACT:
1254     case OP_UPTO:
1255     case OP_MINUPTO:
1256 nigel 93 case OP_POSUPTO:
1257 nigel 77 case OP_STAR:
1258     case OP_MINSTAR:
1259 nigel 93 case OP_POSSTAR:
1260 nigel 77 case OP_PLUS:
1261     case OP_MINPLUS:
1262 nigel 93 case OP_POSPLUS:
1263 nigel 77 case OP_QUERY:
1264     case OP_MINQUERY:
1265 nigel 93 case OP_POSQUERY:
1266     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1267 nigel 77 break;
1268     }
1269     }
1270     }
1271     }
1272    
1273    
1274    
1275     /*************************************************
1276     * Scan compiled regex for recursion reference *
1277     *************************************************/
1278    
1279     /* This little function scans through a compiled pattern until it finds an
1280     instance of OP_RECURSE.
1281    
1282     Arguments:
1283     code points to start of expression
1284     utf8 TRUE in UTF-8 mode
1285    
1286     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1287     */
1288    
1289     static const uschar *
1290     find_recurse(const uschar *code, BOOL utf8)
1291     {
1292     for (;;)
1293     {
1294     register int c = *code;
1295     if (c == OP_END) return NULL;
1296 nigel 91 if (c == OP_RECURSE) return code;
1297    
1298     /* XCLASS is used for classes that cannot be represented just by a bit
1299     map. This includes negated single high-valued characters. The length in
1300     the table is zero; the actual length is stored in the compiled code. */
1301    
1302     if (c == OP_XCLASS) code += GET(code, 1);
1303    
1304     /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1305     that are followed by a character may be followed by a multi-byte character.
1306 nigel 93 The length in the table is a minimum, so we have to arrange to skip the extra
1307     bytes. */
1308 nigel 91
1309 nigel 77 else
1310     {
1311     code += _pcre_OP_lengths[c];
1312     if (utf8) switch(c)
1313     {
1314     case OP_CHAR:
1315     case OP_CHARNC:
1316     case OP_EXACT:
1317     case OP_UPTO:
1318     case OP_MINUPTO:
1319 nigel 93 case OP_POSUPTO:
1320 nigel 77 case OP_STAR:
1321     case OP_MINSTAR:
1322 nigel 93 case OP_POSSTAR:
1323 nigel 77 case OP_PLUS:
1324     case OP_MINPLUS:
1325 nigel 93 case OP_POSPLUS:
1326 nigel 77 case OP_QUERY:
1327     case OP_MINQUERY:
1328 nigel 93 case OP_POSQUERY:
1329     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1330 nigel 77 break;
1331     }
1332     }
1333     }
1334     }
1335    
1336    
1337    
1338     /*************************************************
1339     * Scan compiled branch for non-emptiness *
1340     *************************************************/
1341    
1342     /* This function scans through a branch of a compiled pattern to see whether it
1343 nigel 93 can match the empty string or not. It is called from could_be_empty()
1344     below and from compile_branch() when checking for an unlimited repeat of a
1345     group that can match nothing. Note that first_significant_code() skips over
1346     assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1347     struck an inner bracket whose current branch will already have been scanned.
1348 nigel 77
1349     Arguments:
1350     code points to start of search
1351     endcode points to where to stop
1352     utf8 TRUE if in UTF8 mode
1353    
1354     Returns: TRUE if what is matched could be empty
1355     */
1356    
1357     static BOOL
1358     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1359     {
1360     register int c;
1361 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1362 nigel 77 code < endcode;
1363     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1364     {
1365     const uschar *ccode;
1366    
1367     c = *code;
1368    
1369 nigel 93 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1370 nigel 77 {
1371     BOOL empty_branch;
1372     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1373    
1374     /* Scan a closed bracket */
1375    
1376     empty_branch = FALSE;
1377     do
1378     {
1379     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1380     empty_branch = TRUE;
1381     code += GET(code, 1);
1382     }
1383     while (*code == OP_ALT);
1384     if (!empty_branch) return FALSE; /* All branches are non-empty */
1385 nigel 93
1386     /* Move past the KET and fudge things so that the increment in the "for"
1387     above has no effect. */
1388    
1389     c = OP_END;
1390     code += 1 + LINK_SIZE - _pcre_OP_lengths[c];
1391     continue;
1392 nigel 77 }
1393    
1394 nigel 93 /* Handle the other opcodes */
1395    
1396     switch (c)
1397 nigel 77 {
1398     /* Check for quantifiers after a class */
1399    
1400     #ifdef SUPPORT_UTF8
1401     case OP_XCLASS:
1402     ccode = code + GET(code, 1);
1403     goto CHECK_CLASS_REPEAT;
1404     #endif
1405    
1406     case OP_CLASS:
1407     case OP_NCLASS:
1408     ccode = code + 33;
1409    
1410     #ifdef SUPPORT_UTF8
1411     CHECK_CLASS_REPEAT:
1412     #endif
1413    
1414     switch (*ccode)
1415     {
1416     case OP_CRSTAR: /* These could be empty; continue */
1417     case OP_CRMINSTAR:
1418     case OP_CRQUERY:
1419     case OP_CRMINQUERY:
1420     break;
1421    
1422     default: /* Non-repeat => class must match */
1423     case OP_CRPLUS: /* These repeats aren't empty */
1424     case OP_CRMINPLUS:
1425     return FALSE;
1426    
1427     case OP_CRRANGE:
1428     case OP_CRMINRANGE:
1429     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1430     break;
1431     }
1432     break;
1433    
1434     /* Opcodes that must match a character */
1435    
1436     case OP_PROP:
1437     case OP_NOTPROP:
1438     case OP_EXTUNI:
1439     case OP_NOT_DIGIT:
1440     case OP_DIGIT:
1441     case OP_NOT_WHITESPACE:
1442     case OP_WHITESPACE:
1443     case OP_NOT_WORDCHAR:
1444     case OP_WORDCHAR:
1445     case OP_ANY:
1446     case OP_ANYBYTE:
1447     case OP_CHAR:
1448     case OP_CHARNC:
1449     case OP_NOT:
1450     case OP_PLUS:
1451     case OP_MINPLUS:
1452 nigel 93 case OP_POSPLUS:
1453 nigel 77 case OP_EXACT:
1454     case OP_NOTPLUS:
1455     case OP_NOTMINPLUS:
1456 nigel 93 case OP_NOTPOSPLUS:
1457 nigel 77 case OP_NOTEXACT:
1458     case OP_TYPEPLUS:
1459     case OP_TYPEMINPLUS:
1460 nigel 93 case OP_TYPEPOSPLUS:
1461 nigel 77 case OP_TYPEEXACT:
1462     return FALSE;
1463    
1464     /* End of branch */
1465    
1466     case OP_KET:
1467     case OP_KETRMAX:
1468     case OP_KETRMIN:
1469     case OP_ALT:
1470     return TRUE;
1471    
1472 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1473     MINUPTO, and POSUPTO may be followed by a multibyte character */
1474 nigel 77
1475     #ifdef SUPPORT_UTF8
1476     case OP_STAR:
1477     case OP_MINSTAR:
1478 nigel 93 case OP_POSSTAR:
1479 nigel 77 case OP_QUERY:
1480     case OP_MINQUERY:
1481 nigel 93 case OP_POSQUERY:
1482 nigel 77 case OP_UPTO:
1483     case OP_MINUPTO:
1484 nigel 93 case OP_POSUPTO:
1485 nigel 77 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1486     break;
1487     #endif
1488     }
1489     }
1490    
1491     return TRUE;
1492     }
1493    
1494    
1495    
1496     /*************************************************
1497     * Scan compiled regex for non-emptiness *
1498     *************************************************/
1499    
1500     /* This function is called to check for left recursive calls. We want to check
1501     the current branch of the current pattern to see if it could match the empty
1502     string. If it could, we must look outwards for branches at other levels,
1503     stopping when we pass beyond the bracket which is the subject of the recursion.
1504    
1505     Arguments:
1506     code points to start of the recursion
1507     endcode points to where to stop (current RECURSE item)
1508     bcptr points to the chain of current (unclosed) branch starts
1509     utf8 TRUE if in UTF-8 mode
1510    
1511     Returns: TRUE if what is matched could be empty
1512     */
1513    
1514     static BOOL
1515     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1516     BOOL utf8)
1517     {
1518     while (bcptr != NULL && bcptr->current >= code)
1519     {
1520     if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1521     bcptr = bcptr->outer;
1522     }
1523     return TRUE;
1524     }
1525    
1526    
1527    
1528     /*************************************************
1529     * Check for POSIX class syntax *
1530     *************************************************/
1531    
1532     /* This function is called when the sequence "[:" or "[." or "[=" is
1533     encountered in a character class. It checks whether this is followed by an
1534     optional ^ and then a sequence of letters, terminated by a matching ":]" or
1535     ".]" or "=]".
1536    
1537     Argument:
1538     ptr pointer to the initial [
1539     endptr where to return the end pointer
1540     cd pointer to compile data
1541    
1542     Returns: TRUE or FALSE
1543     */
1544    
1545     static BOOL
1546     check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1547     {
1548     int terminator; /* Don't combine these lines; the Solaris cc */
1549     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1550     if (*(++ptr) == '^') ptr++;
1551     while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1552     if (*ptr == terminator && ptr[1] == ']')
1553     {
1554     *endptr = ptr;
1555     return TRUE;
1556     }
1557     return FALSE;
1558     }
1559    
1560    
1561    
1562    
1563     /*************************************************
1564     * Check POSIX class name *
1565     *************************************************/
1566    
1567     /* This function is called to check the name given in a POSIX-style class entry
1568     such as [:alnum:].
1569    
1570     Arguments:
1571     ptr points to the first letter
1572     len the length of the name
1573    
1574     Returns: a value representing the name, or -1 if unknown
1575     */
1576    
1577     static int
1578     check_posix_name(const uschar *ptr, int len)
1579     {
1580     register int yield = 0;
1581     while (posix_name_lengths[yield] != 0)
1582     {
1583     if (len == posix_name_lengths[yield] &&
1584     strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1585     yield++;
1586     }
1587     return -1;
1588     }
1589    
1590    
1591     /*************************************************
1592     * Adjust OP_RECURSE items in repeated group *
1593     *************************************************/
1594    
1595     /* OP_RECURSE items contain an offset from the start of the regex to the group
1596     that is referenced. This means that groups can be replicated for fixed
1597     repetition simply by copying (because the recursion is allowed to refer to
1598     earlier groups that are outside the current group). However, when a group is
1599     optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1600     it, after it has been compiled. This means that any OP_RECURSE items within it
1601     that refer to the group itself or any contained groups have to have their
1602 nigel 93 offsets adjusted. That one of the jobs of this function. Before it is called,
1603     the partially compiled regex must be temporarily terminated with OP_END.
1604 nigel 77
1605 nigel 93 This function has been extended with the possibility of forward references for
1606     recursions and subroutine calls. It must also check the list of such references
1607     for the group we are dealing with. If it finds that one of the recursions in
1608     the current group is on this list, it adjusts the offset in the list, not the
1609     value in the reference (which is a group number).
1610    
1611 nigel 77 Arguments:
1612     group points to the start of the group
1613     adjust the amount by which the group is to be moved
1614     utf8 TRUE in UTF-8 mode
1615     cd contains pointers to tables etc.
1616 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
1617 nigel 77
1618     Returns: nothing
1619     */
1620    
1621     static void
1622 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1623     uschar *save_hwm)
1624 nigel 77 {
1625     uschar *ptr = group;
1626     while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1627     {
1628 nigel 93 int offset;
1629     uschar *hc;
1630    
1631     /* See if this recursion is on the forward reference list. If so, adjust the
1632     reference. */
1633    
1634     for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1635     {
1636     offset = GET(hc, 0);
1637     if (cd->start_code + offset == ptr + 1)
1638     {
1639     PUT(hc, 0, offset + adjust);
1640     break;
1641     }
1642     }
1643    
1644     /* Otherwise, adjust the recursion offset if it's after the start of this
1645     group. */
1646    
1647     if (hc >= cd->hwm)
1648     {
1649     offset = GET(ptr, 1);
1650     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1651     }
1652    
1653 nigel 77 ptr += 1 + LINK_SIZE;
1654     }
1655     }
1656    
1657    
1658    
1659     /*************************************************
1660     * Insert an automatic callout point *
1661     *************************************************/
1662    
1663     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1664     callout points before each pattern item.
1665    
1666     Arguments:
1667     code current code pointer
1668     ptr current pattern pointer
1669     cd pointers to tables etc
1670    
1671     Returns: new code pointer
1672     */
1673    
1674     static uschar *
1675     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1676     {
1677     *code++ = OP_CALLOUT;
1678     *code++ = 255;
1679     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1680     PUT(code, LINK_SIZE, 0); /* Default length */
1681     return code + 2*LINK_SIZE;
1682     }
1683    
1684    
1685    
1686     /*************************************************
1687     * Complete a callout item *
1688     *************************************************/
1689    
1690     /* A callout item contains the length of the next item in the pattern, which
1691     we can't fill in till after we have reached the relevant point. This is used
1692     for both automatic and manual callouts.
1693    
1694     Arguments:
1695     previous_callout points to previous callout item
1696     ptr current pattern pointer
1697     cd pointers to tables etc
1698    
1699     Returns: nothing
1700     */
1701    
1702     static void
1703     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1704     {
1705     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1706     PUT(previous_callout, 2 + LINK_SIZE, length);
1707     }
1708    
1709    
1710    
1711     #ifdef SUPPORT_UCP
1712     /*************************************************
1713     * Get othercase range *
1714     *************************************************/
1715    
1716     /* This function is passed the start and end of a class range, in UTF-8 mode
1717     with UCP support. It searches up the characters, looking for internal ranges of
1718     characters in the "other" case. Each call returns the next one, updating the
1719     start address.
1720    
1721     Arguments:
1722     cptr points to starting character value; updated
1723     d end value
1724     ocptr where to put start of othercase range
1725     odptr where to put end of othercase range
1726    
1727     Yield: TRUE when range returned; FALSE when no more
1728     */
1729    
1730     static BOOL
1731 nigel 93 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1732     unsigned int *odptr)
1733 nigel 77 {
1734 nigel 93 unsigned int c, othercase, next;
1735 nigel 77
1736     for (c = *cptr; c <= d; c++)
1737 nigel 93 { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1738 nigel 77
1739     if (c > d) return FALSE;
1740    
1741     *ocptr = othercase;
1742     next = othercase + 1;
1743    
1744     for (++c; c <= d; c++)
1745     {
1746 nigel 87 if (_pcre_ucp_othercase(c) != next) break;
1747 nigel 77 next++;
1748     }
1749    
1750     *odptr = next - 1;
1751     *cptr = c;
1752    
1753     return TRUE;
1754     }
1755     #endif /* SUPPORT_UCP */
1756    
1757    
1758 nigel 93
1759 nigel 77 /*************************************************
1760 nigel 93 * Check if auto-possessifying is possible *
1761     *************************************************/
1762    
1763     /* This function is called for unlimited repeats of certain items, to see
1764     whether the next thing could possibly match the repeated item. If not, it makes
1765     sense to automatically possessify the repeated item.
1766    
1767     Arguments:
1768     op_code the repeated op code
1769     this data for this item, depends on the opcode
1770     utf8 TRUE in UTF-8 mode
1771     utf8_char used for utf8 character bytes, NULL if not relevant
1772     ptr next character in pattern
1773     options options bits
1774     cd contains pointers to tables etc.
1775    
1776     Returns: TRUE if possessifying is wanted
1777     */
1778    
1779     static BOOL
1780     check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1781     const uschar *ptr, int options, compile_data *cd)
1782     {
1783     int next;
1784    
1785     /* Skip whitespace and comments in extended mode */
1786    
1787     if ((options & PCRE_EXTENDED) != 0)
1788     {
1789     for (;;)
1790     {
1791     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1792     if (*ptr == '#')
1793     {
1794     while (*(++ptr) != 0)
1795     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1796     }
1797     else break;
1798     }
1799     }
1800    
1801     /* If the next item is one that we can handle, get its value. A non-negative
1802     value is a character, a negative value is an escape value. */
1803    
1804     if (*ptr == '\\')
1805     {
1806     int temperrorcode = 0;
1807     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1808     if (temperrorcode != 0) return FALSE;
1809     ptr++; /* Point after the escape sequence */
1810     }
1811    
1812     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1813     {
1814     #ifdef SUPPORT_UTF8
1815     if (utf8) { GETCHARINC(next, ptr); } else
1816     #endif
1817     next = *ptr++;
1818     }
1819    
1820     else return FALSE;
1821    
1822     /* Skip whitespace and comments in extended mode */
1823    
1824     if ((options & PCRE_EXTENDED) != 0)
1825     {
1826     for (;;)
1827     {
1828     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1829     if (*ptr == '#')
1830     {
1831     while (*(++ptr) != 0)
1832     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1833     }
1834     else break;
1835     }
1836     }
1837    
1838     /* If the next thing is itself optional, we have to give up. */
1839    
1840     if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1841     return FALSE;
1842    
1843     /* Now compare the next item with the previous opcode. If the previous is a
1844     positive single character match, "item" either contains the character or, if
1845     "item" is greater than 127 in utf8 mode, the character's bytes are in
1846     utf8_char. */
1847    
1848    
1849     /* Handle cases when the next item is a character. */
1850    
1851     if (next >= 0) switch(op_code)
1852     {
1853     case OP_CHAR:
1854     #ifdef SUPPORT_UTF8
1855     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1856     #endif
1857     return item != next;
1858    
1859     /* For CHARNC (caseless character) we must check the other case. If we have
1860     Unicode property support, we can use it to test the other case of
1861     high-valued characters. */
1862    
1863     case OP_CHARNC:
1864     #ifdef SUPPORT_UTF8
1865     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1866     #endif
1867     if (item == next) return FALSE;
1868     #ifdef SUPPORT_UTF8
1869     if (utf8)
1870     {
1871     unsigned int othercase;
1872     if (next < 128) othercase = cd->fcc[next]; else
1873     #ifdef SUPPORT_UCP
1874     othercase = _pcre_ucp_othercase((unsigned int)next);
1875     #else
1876     othercase = NOTACHAR;
1877     #endif
1878     return (unsigned int)item != othercase;
1879     }
1880     else
1881     #endif /* SUPPORT_UTF8 */
1882     return (item != cd->fcc[next]); /* Non-UTF-8 mode */
1883    
1884     /* For OP_NOT, "item" must be a single-byte character. */
1885    
1886     case OP_NOT:
1887     if (next < 0) return FALSE; /* Not a character */
1888     if (item == next) return TRUE;
1889     if ((options & PCRE_CASELESS) == 0) return FALSE;
1890     #ifdef SUPPORT_UTF8
1891     if (utf8)
1892     {
1893     unsigned int othercase;
1894     if (next < 128) othercase = cd->fcc[next]; else
1895     #ifdef SUPPORT_UCP
1896     othercase = _pcre_ucp_othercase(next);
1897     #else
1898     othercase = NOTACHAR;
1899     #endif
1900     return (unsigned int)item == othercase;
1901     }
1902     else
1903     #endif /* SUPPORT_UTF8 */
1904     return (item == cd->fcc[next]); /* Non-UTF-8 mode */
1905    
1906     case OP_DIGIT:
1907     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1908    
1909     case OP_NOT_DIGIT:
1910     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1911    
1912     case OP_WHITESPACE:
1913     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1914    
1915     case OP_NOT_WHITESPACE:
1916     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1917    
1918     case OP_WORDCHAR:
1919     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1920    
1921     case OP_NOT_WORDCHAR:
1922     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1923    
1924     default:
1925     return FALSE;
1926     }
1927    
1928    
1929     /* Handle the case when the next item is \d, \s, etc. */
1930    
1931     switch(op_code)
1932     {
1933     case OP_CHAR:
1934     case OP_CHARNC:
1935     #ifdef SUPPORT_UTF8
1936     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1937     #endif
1938     switch(-next)
1939     {
1940     case ESC_d:
1941     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
1942    
1943     case ESC_D:
1944     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
1945    
1946     case ESC_s:
1947     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
1948    
1949     case ESC_S:
1950     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
1951    
1952     case ESC_w:
1953     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
1954    
1955     case ESC_W:
1956     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
1957    
1958     default:
1959     return FALSE;
1960     }
1961    
1962     case OP_DIGIT:
1963     return next == -ESC_D || next == -ESC_s || next == -ESC_W;
1964    
1965     case OP_NOT_DIGIT:
1966     return next == -ESC_d;
1967    
1968     case OP_WHITESPACE:
1969     return next == -ESC_S || next == -ESC_d || next == -ESC_w;
1970    
1971     case OP_NOT_WHITESPACE:
1972     return next == -ESC_s;
1973    
1974     case OP_WORDCHAR:
1975     return next == -ESC_W || next == -ESC_s;
1976    
1977     case OP_NOT_WORDCHAR:
1978     return next == -ESC_w || next == -ESC_d;
1979    
1980     default:
1981     return FALSE;
1982     }
1983    
1984     /* Control does not reach here */
1985     }
1986    
1987    
1988    
1989     /*************************************************
1990 nigel 77 * Compile one branch *
1991     *************************************************/
1992    
1993 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
1994 nigel 77 changed during the branch, the pointer is used to change the external options
1995 nigel 93 bits. This function is used during the pre-compile phase when we are trying
1996     to find out the amount of memory needed, as well as during the real compile
1997     phase. The value of lengthptr distinguishes the two phases.
1998 nigel 77
1999     Arguments:
2000     optionsptr pointer to the option bits
2001     codeptr points to the pointer to the current code point
2002     ptrptr points to the current pattern pointer
2003     errorcodeptr points to error code variable
2004     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2005     reqbyteptr set to the last literal character required, else < 0
2006     bcptr points to current branch chain
2007     cd contains pointers to tables etc.
2008 nigel 93 lengthptr NULL during the real compile phase
2009     points to length accumulator during pre-compile phase
2010 nigel 77
2011     Returns: TRUE on success
2012     FALSE, with *errorcodeptr set non-zero on error
2013     */
2014    
2015     static BOOL
2016 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2017     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2018     compile_data *cd, int *lengthptr)
2019 nigel 77 {
2020     int repeat_type, op_type;
2021     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2022     int bravalue = 0;
2023     int greedy_default, greedy_non_default;
2024     int firstbyte, reqbyte;
2025     int zeroreqbyte, zerofirstbyte;
2026     int req_caseopt, reqvary, tempreqvary;
2027     int options = *optionsptr;
2028     int after_manual_callout = 0;
2029 nigel 93 int length_prevgroup = 0;
2030 nigel 77 register int c;
2031     register uschar *code = *codeptr;
2032 nigel 93 uschar *last_code = code;
2033     uschar *orig_code = code;
2034 nigel 77 uschar *tempcode;
2035     BOOL inescq = FALSE;
2036     BOOL groupsetfirstbyte = FALSE;
2037     const uschar *ptr = *ptrptr;
2038     const uschar *tempptr;
2039     uschar *previous = NULL;
2040     uschar *previous_callout = NULL;
2041 nigel 93 uschar *save_hwm = NULL;
2042 nigel 77 uschar classbits[32];
2043    
2044     #ifdef SUPPORT_UTF8
2045     BOOL class_utf8;
2046     BOOL utf8 = (options & PCRE_UTF8) != 0;
2047     uschar *class_utf8data;
2048     uschar utf8_char[6];
2049     #else
2050     BOOL utf8 = FALSE;
2051 nigel 93 uschar *utf8_char = NULL;
2052 nigel 77 #endif
2053    
2054 nigel 93 #ifdef DEBUG
2055     if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2056     #endif
2057    
2058 nigel 77 /* Set up the default and non-default settings for greediness */
2059    
2060     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2061     greedy_non_default = greedy_default ^ 1;
2062    
2063     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2064     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2065     matches a non-fixed char first char; reqbyte just remains unset if we never
2066     find one.
2067    
2068     When we hit a repeat whose minimum is zero, we may have to adjust these values
2069     to take the zero repeat into account. This is implemented by setting them to
2070     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2071     item types that can be repeated set these backoff variables appropriately. */
2072    
2073     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2074    
2075     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2076     according to the current setting of the caseless flag. REQ_CASELESS is a bit
2077     value > 255. It is added into the firstbyte or reqbyte variables to record the
2078     case status of the value. This is used only for ASCII characters. */
2079    
2080     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2081    
2082     /* Switch on next character until the end of the branch */
2083    
2084     for (;; ptr++)
2085     {
2086     BOOL negate_class;
2087     BOOL possessive_quantifier;
2088     BOOL is_quantifier;
2089 nigel 93 BOOL is_recurse;
2090 nigel 77 int class_charcount;
2091     int class_lastchar;
2092     int newoptions;
2093     int recno;
2094     int skipbytes;
2095     int subreqbyte;
2096     int subfirstbyte;
2097 nigel 93 int terminator;
2098 nigel 77 int mclength;
2099     uschar mcbuffer[8];
2100    
2101 nigel 93 /* Get next byte in the pattern */
2102 nigel 77
2103     c = *ptr;
2104    
2105 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
2106     previous cycle of this loop. */
2107    
2108     if (lengthptr != NULL)
2109     {
2110     #ifdef DEBUG
2111     if (code > cd->hwm) cd->hwm = code; /* High water info */
2112     #endif
2113     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2114     {
2115     *errorcodeptr = ERR52;
2116     goto FAILED;
2117     }
2118    
2119     /* There is at least one situation where code goes backwards: this is the
2120     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2121     the class is simply eliminated. However, it is created first, so we have to
2122     allow memory for it. Therefore, don't ever reduce the length at this point.
2123     */
2124    
2125     if (code < last_code) code = last_code;
2126     *lengthptr += code - last_code;
2127     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2128    
2129     /* If "previous" is set and it is not at the start of the work space, move
2130     it back to there, in order to avoid filling up the work space. Otherwise,
2131     if "previous" is NULL, reset the current code pointer to the start. */
2132    
2133     if (previous != NULL)
2134     {
2135     if (previous > orig_code)
2136     {
2137     memmove(orig_code, previous, code - previous);
2138     code -= previous - orig_code;
2139     previous = orig_code;
2140     }
2141     }
2142     else code = orig_code;
2143    
2144     /* Remember where this code item starts so we can pick up the length
2145     next time round. */
2146    
2147     last_code = code;
2148     }
2149    
2150     /* In the real compile phase, just check the workspace used by the forward
2151     reference list. */
2152    
2153     else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2154     {
2155     *errorcodeptr = ERR52;
2156     goto FAILED;
2157     }
2158    
2159 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
2160    
2161     if (inescq && c != 0)
2162     {
2163     if (c == '\\' && ptr[1] == 'E')
2164     {
2165     inescq = FALSE;
2166     ptr++;
2167     continue;
2168     }
2169     else
2170     {
2171     if (previous_callout != NULL)
2172     {
2173 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2174     complete_callout(previous_callout, ptr, cd);
2175 nigel 77 previous_callout = NULL;
2176     }
2177     if ((options & PCRE_AUTO_CALLOUT) != 0)
2178     {
2179     previous_callout = code;
2180     code = auto_callout(code, ptr, cd);
2181     }
2182     goto NORMAL_CHAR;
2183     }
2184     }
2185    
2186     /* Fill in length of a previous callout, except when the next thing is
2187     a quantifier. */
2188    
2189     is_quantifier = c == '*' || c == '+' || c == '?' ||
2190     (c == '{' && is_counted_repeat(ptr+1));
2191    
2192     if (!is_quantifier && previous_callout != NULL &&
2193     after_manual_callout-- <= 0)
2194     {
2195 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2196     complete_callout(previous_callout, ptr, cd);
2197 nigel 77 previous_callout = NULL;
2198     }
2199    
2200     /* In extended mode, skip white space and comments */
2201    
2202     if ((options & PCRE_EXTENDED) != 0)
2203     {
2204     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2205     if (c == '#')
2206     {
2207 nigel 93 while (*(++ptr) != 0)
2208 nigel 91 {
2209 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2210 nigel 91 }
2211 nigel 93 if (*ptr != 0) continue;
2212    
2213 nigel 91 /* Else fall through to handle end of string */
2214     c = 0;
2215 nigel 77 }
2216     }
2217    
2218     /* No auto callout for quantifiers. */
2219    
2220     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2221     {
2222     previous_callout = code;
2223     code = auto_callout(code, ptr, cd);
2224     }
2225    
2226     switch(c)
2227     {
2228 nigel 93 /* ===================================================================*/
2229     case 0: /* The branch terminates at string end */
2230     case '|': /* or | or ) */
2231 nigel 77 case ')':
2232     *firstbyteptr = firstbyte;
2233     *reqbyteptr = reqbyte;
2234     *codeptr = code;
2235     *ptrptr = ptr;
2236 nigel 93 if (lengthptr != NULL)
2237     {
2238     *lengthptr += code - last_code; /* To include callout length */
2239     DPRINTF((">> end branch\n"));
2240     }
2241 nigel 77 return TRUE;
2242    
2243 nigel 93
2244     /* ===================================================================*/
2245 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
2246     the setting of any following char as a first character. */
2247    
2248     case '^':
2249     if ((options & PCRE_MULTILINE) != 0)
2250     {
2251     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2252     }
2253     previous = NULL;
2254     *code++ = OP_CIRC;
2255     break;
2256    
2257     case '$':
2258     previous = NULL;
2259     *code++ = OP_DOLL;
2260     break;
2261    
2262     /* There can never be a first char if '.' is first, whatever happens about
2263     repeats. The value of reqbyte doesn't change either. */
2264    
2265     case '.':
2266     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2267     zerofirstbyte = firstbyte;
2268     zeroreqbyte = reqbyte;
2269     previous = code;
2270     *code++ = OP_ANY;
2271     break;
2272    
2273 nigel 93
2274     /* ===================================================================*/
2275 nigel 87 /* Character classes. If the included characters are all < 256, we build a
2276     32-byte bitmap of the permitted characters, except in the special case
2277     where there is only one such character. For negated classes, we build the
2278     map as usual, then invert it at the end. However, we use a different opcode
2279     so that data characters > 255 can be handled correctly.
2280 nigel 77
2281     If the class contains characters outside the 0-255 range, a different
2282     opcode is compiled. It may optionally have a bit map for characters < 256,
2283     but those above are explicitly listed afterwards. A flag byte tells
2284     whether the bitmap is present, and whether this is a negated class or not.
2285     */
2286    
2287     case '[':
2288     previous = code;
2289    
2290     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2291     they are encountered at the top level, so we'll do that too. */
2292    
2293     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2294     check_posix_syntax(ptr, &tempptr, cd))
2295     {
2296     *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2297     goto FAILED;
2298     }
2299    
2300     /* If the first character is '^', set the negation flag and skip it. */
2301    
2302     if ((c = *(++ptr)) == '^')
2303     {
2304     negate_class = TRUE;
2305     c = *(++ptr);
2306     }
2307     else
2308     {
2309     negate_class = FALSE;
2310     }
2311    
2312     /* Keep a count of chars with values < 256 so that we can optimize the case
2313 nigel 93 of just a single character (as long as it's < 256). However, for higher
2314     valued UTF-8 characters, we don't yet do any optimization. */
2315 nigel 77
2316     class_charcount = 0;
2317     class_lastchar = -1;
2318    
2319 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
2320     temporary bit of memory, in case the class contains only 1 character (less
2321     than 256), because in that case the compiled code doesn't use the bit map.
2322     */
2323    
2324     memset(classbits, 0, 32 * sizeof(uschar));
2325    
2326 nigel 77 #ifdef SUPPORT_UTF8
2327     class_utf8 = FALSE; /* No chars >= 256 */
2328 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2329 nigel 77 #endif
2330    
2331     /* Process characters until ] is reached. By writing this as a "do" it
2332 nigel 93 means that an initial ] is taken as a data character. At the start of the
2333     loop, c contains the first byte of the character. */
2334 nigel 77
2335 nigel 93 if (c != 0) do
2336 nigel 77 {
2337 nigel 93 const uschar *oldptr;
2338    
2339 nigel 77 #ifdef SUPPORT_UTF8
2340     if (utf8 && c > 127)
2341     { /* Braces are required because the */
2342     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2343     }
2344     #endif
2345    
2346     /* Inside \Q...\E everything is literal except \E */
2347    
2348     if (inescq)
2349     {
2350 nigel 93 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2351 nigel 77 {
2352 nigel 93 inescq = FALSE; /* Reset literal state */
2353     ptr++; /* Skip the 'E' */
2354     continue; /* Carry on with next */
2355 nigel 77 }
2356 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
2357 nigel 77 }
2358    
2359     /* Handle POSIX class names. Perl allows a negation extension of the
2360     form [:^name:]. A square bracket that doesn't match the syntax is
2361     treated as a literal. We also recognize the POSIX constructions
2362     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2363     5.6 and 5.8 do. */
2364    
2365     if (c == '[' &&
2366     (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2367     check_posix_syntax(ptr, &tempptr, cd))
2368     {
2369     BOOL local_negate = FALSE;
2370 nigel 87 int posix_class, taboffset, tabopt;
2371 nigel 77 register const uschar *cbits = cd->cbits;
2372 nigel 87 uschar pbits[32];
2373 nigel 77
2374     if (ptr[1] != ':')
2375     {
2376     *errorcodeptr = ERR31;
2377     goto FAILED;
2378     }
2379    
2380     ptr += 2;
2381     if (*ptr == '^')
2382     {
2383     local_negate = TRUE;
2384     ptr++;
2385     }
2386    
2387     posix_class = check_posix_name(ptr, tempptr - ptr);
2388     if (posix_class < 0)
2389     {
2390     *errorcodeptr = ERR30;
2391     goto FAILED;
2392     }
2393    
2394     /* If matching is caseless, upper and lower are converted to
2395     alpha. This relies on the fact that the class table starts with
2396     alpha, lower, upper as the first 3 entries. */
2397    
2398     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2399     posix_class = 0;
2400    
2401 nigel 87 /* We build the bit map for the POSIX class in a chunk of local store
2402     because we may be adding and subtracting from it, and we don't want to
2403     subtract bits that may be in the main map already. At the end we or the
2404     result into the bit map that is being built. */
2405 nigel 77
2406     posix_class *= 3;
2407 nigel 87
2408     /* Copy in the first table (always present) */
2409    
2410     memcpy(pbits, cbits + posix_class_maps[posix_class],
2411     32 * sizeof(uschar));
2412    
2413     /* If there is a second table, add or remove it as required. */
2414    
2415     taboffset = posix_class_maps[posix_class + 1];
2416     tabopt = posix_class_maps[posix_class + 2];
2417    
2418     if (taboffset >= 0)
2419 nigel 77 {
2420 nigel 87 if (tabopt >= 0)
2421     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2422 nigel 77 else
2423 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2424 nigel 77 }
2425    
2426 nigel 87 /* Now see if we need to remove any special characters. An option
2427     value of 1 removes vertical space and 2 removes underscore. */
2428    
2429     if (tabopt < 0) tabopt = -tabopt;
2430     if (tabopt == 1) pbits[1] &= ~0x3c;
2431     else if (tabopt == 2) pbits[11] &= 0x7f;
2432    
2433     /* Add the POSIX table or its complement into the main table that is
2434     being built and we are done. */
2435    
2436     if (local_negate)
2437     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2438     else
2439     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2440    
2441 nigel 77 ptr = tempptr + 1;
2442     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2443     continue; /* End of POSIX syntax handling */
2444     }
2445    
2446     /* Backslash may introduce a single character, or it may introduce one
2447 nigel 93 of the specials, which just set a flag. The sequence \b is a special
2448     case. Inside a class (and only there) it is treated as backspace.
2449     Elsewhere it marks a word boundary. Other escapes have preset maps ready
2450     to or into the one we are building. We assume they have more than one
2451 nigel 77 character in them, so set class_charcount bigger than one. */
2452    
2453     if (c == '\\')
2454     {
2455 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2456     if (*errorcodeptr != 0) goto FAILED;
2457 nigel 77
2458     if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2459     else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2460 nigel 93 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2461 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
2462     {
2463     if (ptr[1] == '\\' && ptr[2] == 'E')
2464     {
2465     ptr += 2; /* avoid empty string */
2466     }
2467     else inescq = TRUE;
2468     continue;
2469     }
2470    
2471     if (c < 0)
2472     {
2473     register const uschar *cbits = cd->cbits;
2474     class_charcount += 2; /* Greater than 1 is what matters */
2475 nigel 93
2476     /* Save time by not doing this in the pre-compile phase. */
2477    
2478     if (lengthptr == NULL) switch (-c)
2479 nigel 77 {
2480     case ESC_d:
2481     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2482     continue;
2483    
2484     case ESC_D:
2485     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2486     continue;
2487    
2488     case ESC_w:
2489     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2490     continue;
2491    
2492     case ESC_W:
2493     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2494     continue;
2495    
2496     case ESC_s:
2497     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2498     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2499     continue;
2500    
2501     case ESC_S:
2502     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2503     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2504     continue;
2505    
2506 nigel 93 case ESC_E: /* Perl ignores an orphan \E */
2507     continue;
2508    
2509     default: /* Not recognized; fall through */
2510     break; /* Need "default" setting to stop compiler warning. */
2511     }
2512    
2513     /* In the pre-compile phase, just do the recognition. */
2514    
2515     else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2516     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2517    
2518     /* We need to deal with \P and \p in both phases. */
2519    
2520 nigel 77 #ifdef SUPPORT_UCP
2521 nigel 93 if (-c == ESC_p || -c == ESC_P)
2522     {
2523     BOOL negated;
2524     int pdata;
2525     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2526     if (ptype < 0) goto FAILED;
2527     class_utf8 = TRUE;
2528     *class_utf8data++ = ((-c == ESC_p) != negated)?
2529     XCL_PROP : XCL_NOTPROP;
2530     *class_utf8data++ = ptype;
2531     *class_utf8data++ = pdata;
2532     class_charcount -= 2; /* Not a < 256 character */
2533 nigel 77 continue;
2534 nigel 93 }
2535 nigel 77 #endif
2536 nigel 93 /* Unrecognized escapes are faulted if PCRE is running in its
2537     strict mode. By default, for compatibility with Perl, they are
2538     treated as literals. */
2539 nigel 77
2540 nigel 93 if ((options & PCRE_EXTRA) != 0)
2541     {
2542     *errorcodeptr = ERR7;
2543     goto FAILED;
2544     }
2545 nigel 77
2546 nigel 93 class_charcount -= 2; /* Undo the default count from above */
2547     c = *ptr; /* Get the final character and fall through */
2548 nigel 77 }
2549    
2550     /* Fall through if we have a single character (c >= 0). This may be
2551 nigel 93 greater than 256 in UTF-8 mode. */
2552 nigel 77
2553     } /* End of backslash handling */
2554    
2555     /* A single character may be followed by '-' to form a range. However,
2556     Perl does not permit ']' to be the end of the range. A '-' character
2557 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
2558     entirely. The code for handling \Q and \E is messy. */
2559 nigel 77
2560 nigel 93 CHECK_RANGE:
2561     while (ptr[1] == '\\' && ptr[2] == 'E')
2562 nigel 77 {
2563 nigel 93 inescq = FALSE;
2564     ptr += 2;
2565     }
2566    
2567     oldptr = ptr;
2568    
2569     if (!inescq && ptr[1] == '-')
2570     {
2571 nigel 77 int d;
2572     ptr += 2;
2573 nigel 93 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2574 nigel 77
2575 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
2576     mode. */
2577    
2578     while (*ptr == '\\' && ptr[1] == 'Q')
2579     {
2580     ptr += 2;
2581     if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2582     inescq = TRUE;
2583     break;
2584     }
2585    
2586     if (*ptr == 0 || (!inescq && *ptr == ']'))
2587     {
2588     ptr = oldptr;
2589     goto LONE_SINGLE_CHARACTER;
2590     }
2591    
2592 nigel 77 #ifdef SUPPORT_UTF8
2593     if (utf8)
2594     { /* Braces are required because the */
2595     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2596     }
2597     else
2598     #endif
2599     d = *ptr; /* Not UTF-8 mode */
2600    
2601     /* The second part of a range can be a single-character escape, but
2602     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2603     in such circumstances. */
2604    
2605 nigel 93 if (!inescq && d == '\\')
2606 nigel 77 {
2607 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2608     if (*errorcodeptr != 0) goto FAILED;
2609 nigel 77
2610 nigel 93 /* \b is backspace; \X is literal X; \R is literal R; any other
2611     special means the '-' was literal */
2612 nigel 77
2613     if (d < 0)
2614     {
2615     if (d == -ESC_b) d = '\b';
2616 nigel 93 else if (d == -ESC_X) d = 'X';
2617     else if (d == -ESC_R) d = 'R'; else
2618 nigel 77 {
2619 nigel 93 ptr = oldptr;
2620 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2621     }
2622     }
2623     }
2624    
2625 nigel 93 /* Check that the two values are in the correct order. Optimize
2626     one-character ranges */
2627 nigel 77
2628 nigel 93 if (d < c)
2629     {
2630     *errorcodeptr = ERR8;
2631     goto FAILED;
2632     }
2633    
2634 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2635    
2636     /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2637     matching, we have to use an XCLASS with extra data items. Caseless
2638     matching for characters > 127 is available only if UCP support is
2639     available. */
2640    
2641     #ifdef SUPPORT_UTF8
2642     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2643     {
2644     class_utf8 = TRUE;
2645    
2646     /* With UCP support, we can find the other case equivalents of
2647     the relevant characters. There may be several ranges. Optimize how
2648     they fit with the basic range. */
2649    
2650     #ifdef SUPPORT_UCP
2651     if ((options & PCRE_CASELESS) != 0)
2652     {
2653 nigel 93 unsigned int occ, ocd;
2654     unsigned int cc = c;
2655     unsigned int origd = d;
2656 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
2657     {
2658     if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
2659    
2660     if (occ < c && ocd >= c - 1) /* Extend the basic range */
2661     { /* if there is overlap, */
2662     c = occ; /* noting that if occ < c */
2663     continue; /* we can't have ocd > d */
2664     } /* because a subrange is */
2665     if (ocd > d && occ <= d + 1) /* always shorter than */
2666     { /* the basic range. */
2667     d = ocd;
2668     continue;
2669     }
2670    
2671     if (occ == ocd)
2672     {
2673     *class_utf8data++ = XCL_SINGLE;
2674     }
2675     else
2676     {
2677     *class_utf8data++ = XCL_RANGE;
2678     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2679     }
2680     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2681     }
2682     }
2683     #endif /* SUPPORT_UCP */
2684    
2685     /* Now record the original range, possibly modified for UCP caseless
2686     overlapping ranges. */
2687    
2688     *class_utf8data++ = XCL_RANGE;
2689     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2690     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2691    
2692     /* With UCP support, we are done. Without UCP support, there is no
2693     caseless matching for UTF-8 characters > 127; we can use the bit map
2694     for the smaller ones. */
2695    
2696     #ifdef SUPPORT_UCP
2697     continue; /* With next character in the class */
2698     #else
2699     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2700    
2701     /* Adjust upper limit and fall through to set up the map */
2702    
2703     d = 127;
2704    
2705     #endif /* SUPPORT_UCP */
2706     }
2707     #endif /* SUPPORT_UTF8 */
2708    
2709     /* We use the bit map for all cases when not in UTF-8 mode; else
2710     ranges that lie entirely within 0-127 when there is UCP support; else
2711     for partial ranges without UCP support. */
2712    
2713 nigel 93 class_charcount += d - c + 1;
2714     class_lastchar = d;
2715    
2716     /* We can save a bit of time by skipping this in the pre-compile. */
2717    
2718     if (lengthptr == NULL) for (; c <= d; c++)
2719 nigel 77 {
2720     classbits[c/8] |= (1 << (c&7));
2721     if ((options & PCRE_CASELESS) != 0)
2722     {
2723     int uc = cd->fcc[c]; /* flip case */
2724     classbits[uc/8] |= (1 << (uc&7));
2725     }
2726     }
2727    
2728     continue; /* Go get the next char in the class */
2729     }
2730    
2731     /* Handle a lone single character - we can get here for a normal
2732     non-escape char, or after \ that introduces a single character or for an
2733     apparent range that isn't. */
2734    
2735     LONE_SINGLE_CHARACTER:
2736    
2737     /* Handle a character that cannot go in the bit map */
2738    
2739     #ifdef SUPPORT_UTF8
2740     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2741     {
2742     class_utf8 = TRUE;
2743     *class_utf8data++ = XCL_SINGLE;
2744     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2745    
2746     #ifdef SUPPORT_UCP
2747     if ((options & PCRE_CASELESS) != 0)
2748     {
2749 nigel 93 unsigned int othercase;
2750     if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
2751 nigel 77 {
2752     *class_utf8data++ = XCL_SINGLE;
2753     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
2754     }
2755     }
2756     #endif /* SUPPORT_UCP */
2757    
2758     }
2759     else
2760     #endif /* SUPPORT_UTF8 */
2761    
2762     /* Handle a single-byte character */
2763     {
2764     classbits[c/8] |= (1 << (c&7));
2765     if ((options & PCRE_CASELESS) != 0)
2766     {
2767     c = cd->fcc[c]; /* flip case */
2768     classbits[c/8] |= (1 << (c&7));
2769     }
2770     class_charcount++;
2771     class_lastchar = c;
2772     }
2773     }
2774    
2775 nigel 93 /* Loop until ']' reached. This "while" is the end of the "do" above. */
2776 nigel 77
2777 nigel 93 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
2778 nigel 77
2779 nigel 93 if (c == 0) /* Missing terminating ']' */
2780     {
2781     *errorcodeptr = ERR6;
2782     goto FAILED;
2783     }
2784    
2785 nigel 77 /* If class_charcount is 1, we saw precisely one character whose value is
2786     less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2787     can optimize the negative case only if there were no characters >= 128
2788     because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2789     single-bytes only. This is an historical hangover. Maybe one day we can
2790     tidy these opcodes to handle multi-byte characters.
2791    
2792     The optimization throws away the bit map. We turn the item into a
2793     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2794     that OP_NOT does not support multibyte characters. In the positive case, it
2795     can cause firstbyte to be set. Otherwise, there can be no first char if
2796     this item is first, whatever repeat count may follow. In the case of
2797     reqbyte, save the previous value for reinstating. */
2798    
2799     #ifdef SUPPORT_UTF8
2800     if (class_charcount == 1 &&
2801     (!utf8 ||
2802     (!class_utf8 && (!negate_class || class_lastchar < 128))))
2803    
2804     #else
2805     if (class_charcount == 1)
2806     #endif
2807     {
2808     zeroreqbyte = reqbyte;
2809    
2810     /* The OP_NOT opcode works on one-byte characters only. */
2811    
2812     if (negate_class)
2813     {
2814     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2815     zerofirstbyte = firstbyte;
2816     *code++ = OP_NOT;
2817     *code++ = class_lastchar;
2818     break;
2819     }
2820    
2821     /* For a single, positive character, get the value into mcbuffer, and
2822     then we can handle this with the normal one-character code. */
2823    
2824     #ifdef SUPPORT_UTF8
2825     if (utf8 && class_lastchar > 127)
2826     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
2827     else
2828     #endif
2829     {
2830     mcbuffer[0] = class_lastchar;
2831     mclength = 1;
2832     }
2833     goto ONE_CHAR;
2834     } /* End of 1-char optimization */
2835    
2836     /* The general case - not the one-char optimization. If this is the first
2837     thing in the branch, there can be no first char setting, whatever the
2838     repeat count. Any reqbyte setting must remain unchanged after any kind of
2839     repeat. */
2840    
2841     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2842     zerofirstbyte = firstbyte;
2843     zeroreqbyte = reqbyte;
2844    
2845     /* If there are characters with values > 255, we have to compile an
2846     extended class, with its own opcode. If there are no characters < 256,
2847 nigel 93 we can omit the bitmap in the actual compiled code. */
2848 nigel 77
2849     #ifdef SUPPORT_UTF8
2850     if (class_utf8)
2851     {
2852     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2853     *code++ = OP_XCLASS;
2854     code += LINK_SIZE;
2855     *code = negate_class? XCL_NOT : 0;
2856    
2857 nigel 93 /* If the map is required, move up the extra data to make room for it;
2858     otherwise just move the code pointer to the end of the extra data. */
2859 nigel 77
2860     if (class_charcount > 0)
2861     {
2862     *code++ |= XCL_MAP;
2863 nigel 93 memmove(code + 32, code, class_utf8data - code);
2864 nigel 77 memcpy(code, classbits, 32);
2865 nigel 93 code = class_utf8data + 32;
2866 nigel 77 }
2867 nigel 93 else code = class_utf8data;
2868 nigel 77
2869     /* Now fill in the complete length of the item */
2870    
2871     PUT(previous, 1, code - previous);
2872     break; /* End of class handling */
2873     }
2874     #endif
2875    
2876     /* If there are no characters > 255, negate the 32-byte map if necessary,
2877     and copy it into the code vector. If this is the first thing in the branch,
2878     there can be no first char setting, whatever the repeat count. Any reqbyte
2879     setting must remain unchanged after any kind of repeat. */
2880    
2881     if (negate_class)
2882     {
2883     *code++ = OP_NCLASS;
2884 nigel 93 if (lengthptr == NULL) /* Save time in the pre-compile phase */
2885     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2886 nigel 77 }
2887     else
2888     {
2889     *code++ = OP_CLASS;
2890     memcpy(code, classbits, 32);
2891     }
2892     code += 32;
2893     break;
2894    
2895 nigel 93
2896     /* ===================================================================*/
2897 nigel 77 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2898     has been tested above. */
2899    
2900     case '{':
2901     if (!is_quantifier) goto NORMAL_CHAR;
2902     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
2903     if (*errorcodeptr != 0) goto FAILED;
2904     goto REPEAT;
2905    
2906     case '*':
2907     repeat_min = 0;
2908     repeat_max = -1;
2909     goto REPEAT;
2910    
2911     case '+':
2912     repeat_min = 1;
2913     repeat_max = -1;
2914     goto REPEAT;
2915    
2916     case '?':
2917     repeat_min = 0;
2918     repeat_max = 1;
2919    
2920     REPEAT:
2921     if (previous == NULL)
2922     {
2923     *errorcodeptr = ERR9;
2924     goto FAILED;
2925     }
2926    
2927     if (repeat_min == 0)
2928     {
2929     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2930     reqbyte = zeroreqbyte; /* Ditto */
2931     }
2932    
2933     /* Remember whether this is a variable length repeat */
2934    
2935     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2936    
2937     op_type = 0; /* Default single-char op codes */
2938     possessive_quantifier = FALSE; /* Default not possessive quantifier */
2939    
2940     /* Save start of previous item, in case we have to move it up to make space
2941     for an inserted OP_ONCE for the additional '+' extension. */
2942    
2943     tempcode = previous;
2944    
2945     /* If the next character is '+', we have a possessive quantifier. This
2946     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2947     If the next character is '?' this is a minimizing repeat, by default,
2948     but if PCRE_UNGREEDY is set, it works the other way round. We change the
2949     repeat type to the non-default. */
2950    
2951     if (ptr[1] == '+')
2952     {
2953     repeat_type = 0; /* Force greedy */
2954     possessive_quantifier = TRUE;
2955     ptr++;
2956     }
2957     else if (ptr[1] == '?')
2958     {
2959     repeat_type = greedy_non_default;
2960     ptr++;
2961     }
2962     else repeat_type = greedy_default;
2963    
2964     /* If previous was a character match, abolish the item and generate a
2965     repeat item instead. If a char item has a minimum of more than one, ensure
2966     that it is set in reqbyte - it might not be if a sequence such as x{3} is
2967     the first thing in a branch because the x will have gone into firstbyte
2968     instead. */
2969    
2970     if (*previous == OP_CHAR || *previous == OP_CHARNC)
2971     {
2972     /* Deal with UTF-8 characters that take up more than one byte. It's
2973     easier to write this out separately than try to macrify it. Use c to
2974     hold the length of the character in bytes, plus 0x80 to flag that it's a
2975     length rather than a small character. */
2976    
2977     #ifdef SUPPORT_UTF8
2978     if (utf8 && (code[-1] & 0x80) != 0)
2979     {
2980     uschar *lastchar = code - 1;
2981     while((*lastchar & 0xc0) == 0x80) lastchar--;
2982     c = code - lastchar; /* Length of UTF-8 character */
2983     memcpy(utf8_char, lastchar, c); /* Save the char */
2984     c |= 0x80; /* Flag c as a length */
2985     }
2986     else
2987     #endif
2988    
2989     /* Handle the case of a single byte - either with no UTF8 support, or
2990     with UTF-8 disabled, or for a UTF-8 character < 128. */
2991    
2992     {
2993     c = code[-1];
2994     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2995     }
2996    
2997 nigel 93 /* If the repetition is unlimited, it pays to see if the next thing on
2998     the line is something that cannot possibly match this character. If so,
2999     automatically possessifying this item gains some performance in the case
3000     where the match fails. */
3001    
3002     if (!possessive_quantifier &&
3003     repeat_max < 0 &&
3004     check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3005     options, cd))
3006     {
3007     repeat_type = 0; /* Force greedy */
3008     possessive_quantifier = TRUE;
3009     }
3010    
3011 nigel 77 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3012     }
3013    
3014     /* If previous was a single negated character ([^a] or similar), we use
3015     one of the special opcodes, replacing it. The code is shared with single-
3016     character repeats by setting op_type to add a suitable offset into
3017 nigel 93 repeat_type. We can also test for auto-possessification. OP_NOT is
3018     currently used only for single-byte chars. */
3019 nigel 77
3020     else if (*previous == OP_NOT)
3021     {
3022     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3023     c = previous[1];
3024 nigel 93 if (!possessive_quantifier &&
3025     repeat_max < 0 &&
3026     check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3027     {
3028     repeat_type = 0; /* Force greedy */
3029     possessive_quantifier = TRUE;
3030     }
3031 nigel 77 goto OUTPUT_SINGLE_REPEAT;
3032     }
3033    
3034     /* If previous was a character type match (\d or similar), abolish it and
3035     create a suitable repeat item. The code is shared with single-character
3036     repeats by setting op_type to add a suitable offset into repeat_type. Note
3037     that the Unicode property types will be present only when SUPPORT_UCP is
3038     defined, but we don't wrap the little bits of code here because it just
3039     makes it horribly messy. */
3040    
3041     else if (*previous < OP_EODN)
3042     {
3043     uschar *oldcode;
3044 nigel 87 int prop_type, prop_value;
3045 nigel 77 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3046     c = *previous;
3047    
3048 nigel 93 if (!possessive_quantifier &&
3049     repeat_max < 0 &&
3050     check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3051     {
3052     repeat_type = 0; /* Force greedy */
3053     possessive_quantifier = TRUE;
3054     }
3055    
3056 nigel 77 OUTPUT_SINGLE_REPEAT:
3057 nigel 87 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3058     {
3059     prop_type = previous[1];
3060     prop_value = previous[2];
3061     }
3062     else prop_type = prop_value = -1;
3063 nigel 77
3064     oldcode = code;
3065     code = previous; /* Usually overwrite previous item */
3066    
3067     /* If the maximum is zero then the minimum must also be zero; Perl allows
3068     this case, so we do too - by simply omitting the item altogether. */
3069    
3070     if (repeat_max == 0) goto END_REPEAT;
3071    
3072     /* All real repeats make it impossible to handle partial matching (maybe
3073     one day we will be able to remove this restriction). */
3074    
3075     if (repeat_max != 1) cd->nopartial = TRUE;
3076    
3077     /* Combine the op_type with the repeat_type */
3078    
3079     repeat_type += op_type;
3080    
3081     /* A minimum of zero is handled either as the special case * or ?, or as
3082     an UPTO, with the maximum given. */
3083    
3084     if (repeat_min == 0)
3085     {
3086     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3087     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3088     else
3089     {
3090     *code++ = OP_UPTO + repeat_type;
3091     PUT2INC(code, 0, repeat_max);
3092     }
3093     }
3094    
3095     /* A repeat minimum of 1 is optimized into some special cases. If the
3096 nigel 93 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3097 nigel 77 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3098     one less than the maximum. */
3099    
3100     else if (repeat_min == 1)
3101     {
3102     if (repeat_max == -1)
3103     *code++ = OP_PLUS + repeat_type;
3104     else
3105     {
3106     code = oldcode; /* leave previous item in place */
3107     if (repeat_max == 1) goto END_REPEAT;
3108     *code++ = OP_UPTO + repeat_type;
3109     PUT2INC(code, 0, repeat_max - 1);
3110     }
3111     }
3112    
3113     /* The case {n,n} is just an EXACT, while the general case {n,m} is
3114     handled as an EXACT followed by an UPTO. */
3115    
3116     else
3117     {
3118     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3119     PUT2INC(code, 0, repeat_min);
3120    
3121     /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3122     we have to insert the character for the previous code. For a repeated
3123 nigel 87 Unicode property match, there are two extra bytes that define the
3124 nigel 77 required property. In UTF-8 mode, long characters have their length in
3125     c, with the 0x80 bit as a flag. */
3126    
3127     if (repeat_max < 0)
3128     {
3129     #ifdef SUPPORT_UTF8
3130     if (utf8 && c >= 128)
3131     {
3132     memcpy(code, utf8_char, c & 7);
3133     code += c & 7;
3134     }
3135     else
3136     #endif
3137     {
3138     *code++ = c;
3139 nigel 87 if (prop_type >= 0)
3140     {
3141     *code++ = prop_type;
3142     *code++ = prop_value;
3143     }
3144 nigel 77 }
3145     *code++ = OP_STAR + repeat_type;
3146     }
3147    
3148     /* Else insert an UPTO if the max is greater than the min, again
3149 nigel 93 preceded by the character, for the previously inserted code. If the
3150     UPTO is just for 1 instance, we can use QUERY instead. */
3151 nigel 77
3152     else if (repeat_max != repeat_min)
3153     {
3154     #ifdef SUPPORT_UTF8
3155     if (utf8 && c >= 128)
3156     {
3157     memcpy(code, utf8_char, c & 7);
3158     code += c & 7;
3159     }
3160     else
3161     #endif
3162     *code++ = c;
3163 nigel 87 if (prop_type >= 0)
3164     {
3165     *code++ = prop_type;
3166     *code++ = prop_value;
3167     }
3168 nigel 77 repeat_max -= repeat_min;
3169 nigel 93
3170     if (repeat_max == 1)
3171     {
3172     *code++ = OP_QUERY + repeat_type;
3173     }
3174     else
3175     {
3176     *code++ = OP_UPTO + repeat_type;
3177     PUT2INC(code, 0, repeat_max);
3178     }
3179 nigel 77 }
3180     }
3181    
3182     /* The character or character type itself comes last in all cases. */
3183    
3184     #ifdef SUPPORT_UTF8
3185     if (utf8 && c >= 128)
3186     {
3187     memcpy(code, utf8_char, c & 7);
3188     code += c & 7;
3189     }
3190     else
3191     #endif
3192     *code++ = c;
3193    
3194 nigel 87 /* For a repeated Unicode property match, there are two extra bytes that
3195     define the required property. */
3196 nigel 77
3197     #ifdef SUPPORT_UCP
3198 nigel 87 if (prop_type >= 0)
3199     {
3200     *code++ = prop_type;
3201     *code++ = prop_value;
3202     }
3203 nigel 77 #endif
3204     }
3205    
3206     /* If previous was a character class or a back reference, we put the repeat
3207     stuff after it, but just skip the item if the repeat was {0,0}. */
3208    
3209     else if (*previous == OP_CLASS ||
3210     *previous == OP_NCLASS ||
3211     #ifdef SUPPORT_UTF8
3212     *previous == OP_XCLASS ||
3213     #endif
3214     *previous == OP_REF)
3215     {
3216     if (repeat_max == 0)
3217     {
3218     code = previous;
3219     goto END_REPEAT;
3220     }
3221    
3222     /* All real repeats make it impossible to handle partial matching (maybe
3223     one day we will be able to remove this restriction). */
3224    
3225     if (repeat_max != 1) cd->nopartial = TRUE;
3226    
3227     if (repeat_min == 0 && repeat_max == -1)
3228     *code++ = OP_CRSTAR + repeat_type;
3229     else if (repeat_min == 1 && repeat_max == -1)
3230     *code++ = OP_CRPLUS + repeat_type;
3231     else if (repeat_min == 0 && repeat_max == 1)
3232     *code++ = OP_CRQUERY + repeat_type;
3233     else
3234     {
3235     *code++ = OP_CRRANGE + repeat_type;
3236     PUT2INC(code, 0, repeat_min);
3237     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3238     PUT2INC(code, 0, repeat_max);
3239     }
3240     }
3241    
3242     /* If previous was a bracket group, we may have to replicate it in certain
3243     cases. */
3244    
3245 nigel 93 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3246     *previous == OP_ONCE || *previous == OP_COND)
3247 nigel 77 {
3248     register int i;
3249     int ketoffset = 0;
3250     int len = code - previous;
3251     uschar *bralink = NULL;
3252    
3253 nigel 93 /* Repeating a DEFINE group is pointless */
3254    
3255     if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3256     {
3257     *errorcodeptr = ERR55;
3258     goto FAILED;
3259     }
3260    
3261     /* This is a paranoid check to stop integer overflow later on */
3262    
3263     if (len > MAX_DUPLENGTH)
3264     {
3265     *errorcodeptr = ERR50;
3266     goto FAILED;
3267     }
3268    
3269 nigel 77 /* If the maximum repeat count is unlimited, find the end of the bracket
3270     by scanning through from the start, and compute the offset back to it
3271     from the current code pointer. There may be an OP_OPT setting following
3272     the final KET, so we can't find the end just by going back from the code
3273     pointer. */
3274    
3275     if (repeat_max == -1)
3276     {
3277     register uschar *ket = previous;
3278     do ket += GET(ket, 1); while (*ket != OP_KET);
3279     ketoffset = code - ket;
3280     }
3281    
3282     /* The case of a zero minimum is special because of the need to stick
3283     OP_BRAZERO in front of it, and because the group appears once in the
3284     data, whereas in other cases it appears the minimum number of times. For
3285     this reason, it is simplest to treat this case separately, as otherwise
3286     the code gets far too messy. There are several special subcases when the
3287     minimum is zero. */
3288    
3289     if (repeat_min == 0)
3290     {
3291     /* If the maximum is also zero, we just omit the group from the output
3292     altogether. */
3293    
3294     if (repeat_max == 0)
3295     {
3296     code = previous;
3297     goto END_REPEAT;
3298     }
3299    
3300     /* If the maximum is 1 or unlimited, we just have to stick in the
3301     BRAZERO and do no more at this point. However, we do need to adjust
3302     any OP_RECURSE calls inside the group that refer to the group itself or
3303 nigel 93 any internal or forward referenced group, because the offset is from
3304     the start of the whole regex. Temporarily terminate the pattern while
3305     doing this. */
3306 nigel 77
3307     if (repeat_max <= 1)
3308     {
3309     *code = OP_END;
3310 nigel 93 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3311 nigel 77 memmove(previous+1, previous, len);
3312     code++;
3313     *previous++ = OP_BRAZERO + repeat_type;
3314     }
3315    
3316     /* If the maximum is greater than 1 and limited, we have to replicate
3317     in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3318     The first one has to be handled carefully because it's the original
3319     copy, which has to be moved up. The remainder can be handled by code
3320     that is common with the non-zero minimum case below. We have to
3321     adjust the value of repeat_max, since one less copy is required. Once
3322     again, we may have to adjust any OP_RECURSE calls inside the group. */
3323    
3324     else
3325     {
3326     int offset;
3327     *code = OP_END;
3328 nigel 93 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3329 nigel 77 memmove(previous + 2 + LINK_SIZE, previous, len);
3330     code += 2 + LINK_SIZE;
3331     *previous++ = OP_BRAZERO + repeat_type;
3332     *previous++ = OP_BRA;
3333    
3334     /* We chain together the bracket offset fields that have to be
3335     filled in later when the ends of the brackets are reached. */
3336    
3337     offset = (bralink == NULL)? 0 : previous - bralink;
3338     bralink = previous;
3339     PUTINC(previous, 0, offset);
3340     }
3341    
3342     repeat_max--;
3343     }
3344    
3345     /* If the minimum is greater than zero, replicate the group as many
3346     times as necessary, and adjust the maximum to the number of subsequent
3347     copies that we need. If we set a first char from the group, and didn't
3348 nigel 93 set a required char, copy the latter from the former. If there are any
3349     forward reference subroutine calls in the group, there will be entries on
3350     the workspace list; replicate these with an appropriate increment. */
3351 nigel 77
3352     else
3353     {
3354     if (repeat_min > 1)
3355     {
3356 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3357     just adjust the length as if we had. */
3358    
3359     if (lengthptr != NULL)
3360     *lengthptr += (repeat_min - 1)*length_prevgroup;
3361    
3362     /* This is compiling for real */
3363    
3364     else
3365 nigel 77 {
3366 nigel 93 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3367     for (i = 1; i < repeat_min; i++)
3368     {
3369     uschar *hc;
3370     uschar *this_hwm = cd->hwm;
3371     memcpy(code, previous, len);
3372     for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3373     {
3374     PUT(cd->hwm, 0, GET(hc, 0) + len);
3375     cd->hwm += LINK_SIZE;
3376     }
3377     save_hwm = this_hwm;
3378     code += len;
3379     }
3380 nigel 77 }
3381     }
3382 nigel 93
3383 nigel 77 if (repeat_max > 0) repeat_max -= repeat_min;
3384     }
3385    
3386     /* This code is common to both the zero and non-zero minimum cases. If
3387     the maximum is limited, it replicates the group in a nested fashion,
3388     remembering the bracket starts on a stack. In the case of a zero minimum,
3389     the first one was set up above. In all cases the repeat_max now specifies
3390 nigel 93 the number of additional copies needed. Again, we must remember to
3391     replicate entries on the forward reference list. */
3392 nigel 77
3393     if (repeat_max >= 0)
3394     {
3395 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3396     just adjust the length as if we had. For each repetition we must add 1
3397     to the length for BRAZERO and for all but the last repetition we must
3398     add 2 + 2*LINK_SIZE to allow for the nesting that occurs. */
3399    
3400     if (lengthptr != NULL && repeat_max > 0)
3401     *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3402     2 - 2*LINK_SIZE; /* Last one doesn't nest */
3403    
3404     /* This is compiling for real */
3405    
3406     else for (i = repeat_max - 1; i >= 0; i--)
3407 nigel 77 {
3408 nigel 93 uschar *hc;
3409     uschar *this_hwm = cd->hwm;
3410    
3411 nigel 77 *code++ = OP_BRAZERO + repeat_type;
3412    
3413     /* All but the final copy start a new nesting, maintaining the
3414     chain of brackets outstanding. */
3415    
3416     if (i != 0)
3417     {
3418     int offset;
3419     *code++ = OP_BRA;
3420     offset = (bralink == NULL)? 0 : code - bralink;
3421     bralink = code;
3422     PUTINC(code, 0, offset);
3423     }
3424    
3425     memcpy(code, previous, len);
3426 nigel 93 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3427     {
3428     PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3429     cd->hwm += LINK_SIZE;
3430     }
3431     save_hwm = this_hwm;
3432 nigel 77 code += len;
3433     }
3434    
3435     /* Now chain through the pending brackets, and fill in their length
3436     fields (which are holding the chain links pro tem). */
3437    
3438     while (bralink != NULL)
3439     {
3440     int oldlinkoffset;
3441     int offset = code - bralink + 1;
3442     uschar *bra = code - offset;
3443     oldlinkoffset = GET(bra, 1);
3444     bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3445     *code++ = OP_KET;
3446     PUTINC(code, 0, offset);
3447     PUT(bra, 1, offset);
3448     }
3449     }
3450    
3451     /* If the maximum is unlimited, set a repeater in the final copy. We
3452     can't just offset backwards from the current code point, because we
3453     don't know if there's been an options resetting after the ket. The
3454 nigel 93 correct offset was computed above.
3455 nigel 77
3456 nigel 93 Then, when we are doing the actual compile phase, check to see whether
3457     this group is a non-atomic one that could match an empty string. If so,
3458     convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3459     that runtime checking can be done. [This check is also applied to
3460     atomic groups at runtime, but in a different way.] */
3461    
3462     else
3463     {
3464     uschar *ketcode = code - ketoffset;
3465     uschar *bracode = ketcode - GET(ketcode, 1);
3466     *ketcode = OP_KETRMAX + repeat_type;
3467     if (lengthptr == NULL && *bracode != OP_ONCE)
3468     {
3469     uschar *scode = bracode;
3470     do
3471     {
3472     if (could_be_empty_branch(scode, ketcode, utf8))
3473     {
3474     *bracode += OP_SBRA - OP_BRA;
3475     break;
3476     }
3477     scode += GET(scode, 1);
3478     }
3479     while (*scode == OP_ALT);
3480     }
3481     }
3482 nigel 77 }
3483    
3484     /* Else there's some kind of shambles */
3485    
3486     else
3487     {
3488     *errorcodeptr = ERR11;
3489     goto FAILED;
3490     }
3491    
3492 nigel 93 /* If the character following a repeat is '+', or if certain optimization
3493     tests above succeeded, possessive_quantifier is TRUE. For some of the
3494     simpler opcodes, there is a special alternative opcode for this. For
3495     anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3496     The '+' notation is just syntactic sugar, taken from Sun's Java package,
3497     but the special opcodes can optimize it a bit. The repeated item starts at
3498     tempcode, not at previous, which might be the first part of a string whose
3499     (former) last char we repeated.
3500 nigel 77
3501 nigel 93 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3502     an 'upto' may follow. We skip over an 'exact' item, and then test the
3503     length of what remains before proceeding. */
3504    
3505 nigel 77 if (possessive_quantifier)
3506     {
3507 nigel 93 int len;
3508     if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3509     *tempcode == OP_NOTEXACT)
3510     tempcode += _pcre_OP_lengths[*tempcode];
3511     len = code - tempcode;
3512     if (len > 0) switch (*tempcode)
3513     {
3514     case OP_STAR: *tempcode = OP_POSSTAR; break;
3515     case OP_PLUS: *tempcode = OP_POSPLUS; break;
3516     case OP_QUERY: *tempcode = OP_POSQUERY; break;
3517     case OP_UPTO: *tempcode = OP_POSUPTO; break;
3518    
3519     case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
3520     case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
3521     case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3522     case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
3523    
3524     case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
3525     case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
3526     case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3527     case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
3528    
3529     default:
3530     memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3531     code += 1 + LINK_SIZE;
3532     len += 1 + LINK_SIZE;
3533     tempcode[0] = OP_ONCE;
3534     *code++ = OP_KET;
3535     PUTINC(code, 0, len);
3536     PUT(tempcode, 1, len);
3537     break;
3538     }
3539 nigel 77 }
3540    
3541     /* In all cases we no longer have a previous item. We also set the
3542     "follows varying string" flag for subsequently encountered reqbytes if
3543     it isn't already set and we have just passed a varying length item. */
3544    
3545     END_REPEAT:
3546     previous = NULL;
3547     cd->req_varyopt |= reqvary;
3548     break;
3549    
3550    
3551 nigel 93 /* ===================================================================*/
3552     /* Start of nested parenthesized sub-expression, or comment or lookahead or
3553     lookbehind or option setting or condition or all the other extended
3554     parenthesis forms. First deal with the specials; all are introduced by ?,
3555     and the appearance of any of them means that this is not a capturing
3556     group. */
3557 nigel 77
3558     case '(':
3559     newoptions = options;
3560     skipbytes = 0;
3561 nigel 93 bravalue = OP_CBRA;
3562     save_hwm = cd->hwm;
3563 nigel 77
3564     if (*(++ptr) == '?')
3565     {
3566 nigel 93 int i, set, unset, namelen;
3567 nigel 77 int *optset;
3568 nigel 93 const uschar *name;
3569     uschar *slot;
3570 nigel 77
3571     switch (*(++ptr))
3572     {
3573     case '#': /* Comment; skip to ket */
3574     ptr++;
3575 nigel 93 while (*ptr != 0 && *ptr != ')') ptr++;
3576     if (*ptr == 0)
3577     {
3578     *errorcodeptr = ERR18;
3579     goto FAILED;
3580     }
3581 nigel 77 continue;
3582    
3583 nigel 93
3584     /* ------------------------------------------------------------ */
3585     case ':': /* Non-capturing bracket */
3586 nigel 77 bravalue = OP_BRA;
3587     ptr++;
3588     break;
3589    
3590 nigel 93
3591     /* ------------------------------------------------------------ */
3592 nigel 77 case '(':
3593     bravalue = OP_COND; /* Conditional group */
3594    
3595 nigel 93 /* A condition can be an assertion, a number (referring to a numbered
3596     group), a name (referring to a named group), or 'R', referring to
3597     recursion. R<digits> and R&name are also permitted for recursion tests.
3598 nigel 77
3599 nigel 93 There are several syntaxes for testing a named group: (?(name)) is used
3600     by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3601    
3602     There are two unfortunate ambiguities, caused by history. (a) 'R' can
3603     be the recursive thing or the name 'R' (and similarly for 'R' followed
3604     by digits), and (b) a number could be a name that consists of digits.
3605     In both cases, we look for a name first; if not found, we try the other
3606     cases. */
3607    
3608     /* For conditions that are assertions, check the syntax, and then exit
3609     the switch. This will take control down to where bracketed groups,
3610     including assertions, are processed. */
3611    
3612     if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3613     break;
3614    
3615     /* Most other conditions use OP_CREF (a couple change to OP_RREF
3616     below), and all need to skip 3 bytes at the start of the group. */
3617    
3618     code[1+LINK_SIZE] = OP_CREF;
3619     skipbytes = 3;
3620    
3621     /* Check for a test for recursion in a named group. */
3622    
3623     if (ptr[1] == 'R' && ptr[2] == '&')
3624 nigel 77 {
3625 nigel 93 terminator = -1;
3626     ptr += 2;
3627     code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
3628     }
3629 nigel 91
3630 nigel 93 /* Check for a test for a named group's having been set, using the Perl
3631     syntax (?(<name>) or (?('name') */
3632 nigel 91
3633 nigel 93 else if (ptr[1] == '<')
3634     {
3635     terminator = '>';
3636     ptr++;
3637     }
3638     else if (ptr[1] == '\'')
3639     {
3640     terminator = '\'';
3641     ptr++;
3642     }
3643     else terminator = 0;
3644 nigel 77
3645 nigel 93 /* We now expect to read a name; anything else is an error */
3646 nigel 77
3647 nigel 93 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
3648     {
3649     ptr += 1; /* To get the right offset */
3650     *errorcodeptr = ERR28;
3651     goto FAILED;
3652     }
3653    
3654     /* Read the name, but also get it as a number if it's all digits */
3655    
3656     recno = 0;
3657     name = ++ptr;
3658     while ((cd->ctypes[*ptr] & ctype_word) != 0)
3659     {
3660     if (recno >= 0)
3661     recno = ((digitab[*ptr] & ctype_digit) != 0)?
3662     recno * 10 + *ptr - '0' : -1;
3663 nigel 91 ptr++;
3664 nigel 93 }
3665     namelen = ptr - name;
3666 nigel 91
3667 nigel 93 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
3668     {
3669     ptr--; /* Error offset */
3670     *errorcodeptr = ERR26;
3671     goto FAILED;
3672     }
3673 nigel 91
3674 nigel 93 /* Do no further checking in the pre-compile phase. */
3675 nigel 91
3676 nigel 93 if (lengthptr != NULL) break;
3677 nigel 91
3678 nigel 93 /* In the real compile we do the work of looking for the actual
3679     reference. */
3680 nigel 91
3681 nigel 93 slot = cd->name_table;
3682     for (i = 0; i < cd->names_found; i++)
3683     {
3684     if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3685     slot += cd->name_entry_size;
3686     }
3687 nigel 91
3688 nigel 93 /* Found a previous named subpattern */
3689 nigel 91
3690 nigel 93 if (i < cd->names_found)
3691     {
3692     recno = GET2(slot, 0);
3693     PUT2(code, 2+LINK_SIZE, recno);
3694     }
3695 nigel 91
3696 nigel 93 /* Search the pattern for a forward reference */
3697 nigel 91
3698 nigel 93 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
3699     (options & PCRE_EXTENDED) != 0)) > 0)
3700     {
3701     PUT2(code, 2+LINK_SIZE, i);
3702     }
3703 nigel 91
3704 nigel 93 /* If terminator == 0 it means that the name followed directly after
3705     the opening parenthesis [e.g. (?(abc)...] and in this case there are
3706     some further alternatives to try. For the cases where terminator != 0
3707     [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
3708     now checked all the possibilities, so give an error. */
3709 nigel 91
3710 nigel 93 else if (terminator != 0)
3711     {
3712     *errorcodeptr = ERR15;
3713     goto FAILED;
3714     }
3715    
3716     /* Check for (?(R) for recursion. Allow digits after R to specify a
3717     specific group number. */
3718    
3719     else if (*name == 'R')
3720     {
3721     recno = 0;
3722     for (i = 1; i < namelen; i++)
3723 nigel 91 {
3724 nigel 93 if ((digitab[name[i]] & ctype_digit) == 0)
3725     {
3726     *errorcodeptr = ERR15;
3727     goto FAILED;
3728     }
3729     recno = recno * 10 + name[i] - '0';
3730 nigel 77 }
3731 nigel 93 if (recno == 0) recno = RREF_ANY;
3732     code[1+LINK_SIZE] = OP_RREF; /* Change test type */
3733     PUT2(code, 2+LINK_SIZE, recno);
3734 nigel 77 }
3735 nigel 91
3736 nigel 93 /* Similarly, check for the (?(DEFINE) "condition", which is always
3737     false. */
3738 nigel 91
3739 nigel 93 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
3740     {
3741     code[1+LINK_SIZE] = OP_DEF;
3742     skipbytes = 1;
3743     }
3744    
3745     /* Check for the "name" actually being a subpattern number. */
3746    
3747     else if (recno > 0)
3748     {
3749     PUT2(code, 2+LINK_SIZE, recno);
3750     }
3751    
3752     /* Either an unidentified subpattern, or a reference to (?(0) */
3753    
3754     else
3755     {
3756     *errorcodeptr = (recno == 0)? ERR35: ERR15;
3757     goto FAILED;
3758     }
3759 nigel 77 break;
3760    
3761 nigel 93
3762     /* ------------------------------------------------------------ */
3763 nigel 77 case '=': /* Positive lookahead */
3764     bravalue = OP_ASSERT;
3765     ptr++;
3766     break;
3767    
3768 nigel 93
3769     /* ------------------------------------------------------------ */
3770 nigel 77 case '!': /* Negative lookahead */
3771     bravalue = OP_ASSERT_NOT;
3772     ptr++;
3773     break;
3774    
3775 nigel 93
3776     /* ------------------------------------------------------------ */
3777     case '<': /* Lookbehind or named define */
3778     switch (ptr[1])
3779 nigel 77 {
3780     case '=': /* Positive lookbehind */
3781     bravalue = OP_ASSERTBACK;
3782 nigel 93 ptr += 2;
3783 nigel 77 break;
3784    
3785     case '!': /* Negative lookbehind */
3786     bravalue = OP_ASSERTBACK_NOT;
3787 nigel 93 ptr += 2;
3788 nigel 77 break;
3789 nigel 93
3790     default: /* Could be name define, else bad */
3791     if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
3792     ptr++; /* Correct offset for error */
3793     *errorcodeptr = ERR24;
3794     goto FAILED;
3795 nigel 77 }
3796     break;
3797    
3798 nigel 93
3799     /* ------------------------------------------------------------ */
3800 nigel 77 case '>': /* One-time brackets */
3801     bravalue = OP_ONCE;
3802     ptr++;
3803     break;
3804    
3805 nigel 93
3806     /* ------------------------------------------------------------ */
3807 nigel 77 case 'C': /* Callout - may be followed by digits; */
3808     previous_callout = code; /* Save for later completion */
3809     after_manual_callout = 1; /* Skip one item before completing */
3810 nigel 93 *code++ = OP_CALLOUT;
3811     {
3812 nigel 77 int n = 0;
3813     while ((digitab[*(++ptr)] & ctype_digit) != 0)
3814     n = n * 10 + *ptr - '0';
3815 nigel 93 if (*ptr != ')')
3816     {
3817     *errorcodeptr = ERR39;
3818     goto FAILED;
3819     }
3820 nigel 77 if (n > 255)
3821     {
3822     *errorcodeptr = ERR38;
3823     goto FAILED;
3824     }
3825     *code++ = n;
3826     PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
3827     PUT(code, LINK_SIZE, 0); /* Default length */
3828     code += 2 * LINK_SIZE;
3829     }
3830     previous = NULL;
3831     continue;
3832    
3833 nigel 93
3834     /* ------------------------------------------------------------ */
3835     case 'P': /* Python-style named subpattern handling */
3836     if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
3837 nigel 77 {
3838 nigel 93 is_recurse = *ptr == '>';
3839     terminator = ')';
3840     goto NAMED_REF_OR_RECURSE;
3841     }
3842     else if (*ptr != '<') /* Test for Python-style definition */
3843     {
3844     *errorcodeptr = ERR41;
3845     goto FAILED;
3846     }
3847     /* Fall through to handle (?P< as (?< is handled */
3848 nigel 77
3849    
3850 nigel 93 /* ------------------------------------------------------------ */
3851     DEFINE_NAME: /* Come here from (?< handling */
3852     case '\'':
3853     {
3854     terminator = (*ptr == '<')? '>' : '\'';
3855     name = ++ptr;
3856    
3857     while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3858     namelen = ptr - name;
3859    
3860     /* In the pre-compile phase, just do a syntax check. */
3861    
3862     if (lengthptr != NULL)
3863 nigel 77 {
3864 nigel 93 if (*ptr != terminator)
3865 nigel 77 {
3866 nigel 93 *errorcodeptr = ERR42;
3867     goto FAILED;
3868     }
3869     if (cd->names_found >= MAX_NAME_COUNT)
3870     {
3871     *errorcodeptr = ERR49;
3872     goto FAILED;
3873     }
3874     if (namelen + 3 > cd->name_entry_size)
3875     {
3876     cd->name_entry_size = namelen + 3;
3877     if (namelen > MAX_NAME_SIZE)
3878 nigel 77 {
3879 nigel 93 *errorcodeptr = ERR48;
3880     goto FAILED;
3881     }
3882     }
3883     }
3884    
3885     /* In the real compile, create the entry in the table */
3886    
3887     else
3888     {
3889     slot = cd->name_table;
3890     for (i = 0; i < cd->names_found; i++)
3891     {
3892     int crc = memcmp(name, slot+2, namelen);
3893     if (crc == 0)
3894     {
3895     if (slot[2+namelen] == 0)
3896 nigel 91 {
3897 nigel 93 if ((options & PCRE_DUPNAMES) == 0)
3898     {
3899     *errorcodeptr = ERR43;
3900     goto FAILED;
3901     }
3902 nigel 91 }
3903 nigel 93 else crc = -1; /* Current name is substring */
3904 nigel 77 }
3905 nigel 93 if (crc < 0)
3906     {
3907     memmove(slot + cd->name_entry_size, slot,
3908     (cd->names_found - i) * cd->name_entry_size);
3909     break;
3910     }
3911     slot += cd->name_entry_size;
3912 nigel 77 }
3913 nigel 93
3914     PUT2(slot, 0, cd->bracount + 1);
3915     memcpy(slot + 2, name, namelen);
3916     slot[2+namelen] = 0;
3917 nigel 77 }
3918     }
3919    
3920 nigel 93 /* In both cases, count the number of names we've encountered. */
3921    
3922     ptr++; /* Move past > or ' */
3923     cd->names_found++;
3924     goto NUMBERED_GROUP;
3925    
3926    
3927     /* ------------------------------------------------------------ */
3928     case '&': /* Perl recursion/subroutine syntax */
3929     terminator = ')';
3930     is_recurse = TRUE;
3931     /* Fall through */
3932    
3933     /* We come here from the Python syntax above that handles both
3934     references (?P=name) and recursion (?P>name), as well as falling
3935     through from the Perl recursion syntax (?&name). */
3936    
3937     NAMED_REF_OR_RECURSE:
3938     name = ++ptr;
3939     while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3940     namelen = ptr - name;
3941    
3942     /* In the pre-compile phase, do a syntax check and set a dummy
3943     reference number. */
3944    
3945     if (lengthptr != NULL)
3946 nigel 77 {
3947 nigel 93 if (*ptr != terminator)
3948     {
3949     *errorcodeptr = ERR42;
3950     goto FAILED;
3951     }
3952     if (namelen > MAX_NAME_SIZE)
3953     {
3954     *errorcodeptr = ERR48;
3955     goto FAILED;
3956     }
3957     recno = 0;
3958     }
3959 nigel 77
3960 nigel 93 /* In the real compile, seek the name in the table */
3961 nigel 77
3962 nigel 93 else
3963     {
3964     slot = cd->name_table;
3965 nigel 77 for (i = 0; i < cd->names_found; i++)
3966     {
3967     if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3968     slot += cd->name_entry_size;
3969     }
3970 nigel 91
3971     if (i < cd->names_found) /* Back reference */
3972 nigel 77 {
3973 nigel 91 recno = GET2(slot, 0);
3974     }
3975     else if ((recno = /* Forward back reference */
3976 nigel 93 find_parens(ptr, cd->bracount, name, namelen,
3977     (options & PCRE_EXTENDED) != 0)) <= 0)
3978 nigel 91 {
3979 nigel 77 *errorcodeptr = ERR15;
3980     goto FAILED;
3981     }
3982 nigel 93 }
3983 nigel 77
3984 nigel 93 /* In both phases, we can now go to the code that handles numerical
3985     recursion or backreferences. */
3986 nigel 77
3987 nigel 93 if (is_recurse) goto HANDLE_RECURSION;
3988     else goto HANDLE_REFERENCE;
3989 nigel 77
3990    
3991 nigel 93 /* ------------------------------------------------------------ */
3992     case 'R': /* Recursion */
3993 nigel 77 ptr++; /* Same as (?0) */
3994     /* Fall through */
3995    
3996    
3997 nigel 93 /* ------------------------------------------------------------ */
3998     case '0': case '1': case '2': case '3': case '4': /* Recursion or */
3999     case '5': case '6': case '7': case '8': case '9': /* subroutine */
4000 nigel 77 {
4001     const uschar *called;
4002     recno = 0;
4003     while((digitab[*ptr] & ctype_digit) != 0)
4004     recno = recno * 10 + *ptr++ - '0';
4005 nigel 93 if (*ptr != ')')
4006     {
4007     *errorcodeptr = ERR29;
4008     goto FAILED;
4009     }
4010 nigel 77
4011     /* Come here from code above that handles a named recursion */
4012    
4013     HANDLE_RECURSION:
4014    
4015     previous = code;
4016 nigel 93 called = cd->start_code;
4017 nigel 77
4018 nigel 93 /* When we are actually compiling, find the bracket that is being
4019     referenced. Temporarily end the regex in case it doesn't exist before
4020     this point. If we end up with a forward reference, first check that
4021     the bracket does occur later so we can give the error (and position)
4022     now. Then remember this forward reference in the workspace so it can
4023     be filled in at the end. */
4024 nigel 77
4025 nigel 93 if (lengthptr == NULL)
4026 nigel 77 {
4027 nigel 93 *code = OP_END;
4028     if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4029 nigel 77
4030 nigel 93 /* Forward reference */
4031 nigel 77
4032 nigel 93 if (called == NULL)
4033     {
4034     if (find_parens(ptr, cd->bracount, NULL, recno,
4035     (options & PCRE_EXTENDED) != 0) < 0)
4036     {
4037     *errorcodeptr = ERR15;
4038     goto FAILED;
4039     }
4040     called = cd->start_code + recno;
4041     PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4042     }
4043    
4044     /* If not a forward reference, and the subpattern is still open,
4045     this is a recursive call. We check to see if this is a left
4046     recursion that could loop for ever, and diagnose that case. */
4047    
4048     else if (GET(called, 1) == 0 &&
4049     could_be_empty(called, code, bcptr, utf8))
4050     {
4051     *errorcodeptr = ERR40;
4052     goto FAILED;
4053     }
4054 nigel 77 }
4055    
4056 nigel 87 /* Insert the recursion/subroutine item, automatically wrapped inside
4057 nigel 93 "once" brackets. Set up a "previous group" length so that a
4058     subsequent quantifier will work. */
4059 nigel 77
4060 nigel 87 *code = OP_ONCE;
4061     PUT(code, 1, 2 + 2*LINK_SIZE);
4062     code += 1 + LINK_SIZE;
4063    
4064 nigel 77 *code = OP_RECURSE;
4065     PUT(code, 1, called - cd->start_code);
4066     code += 1 + LINK_SIZE;
4067 nigel 87
4068     *code = OP_KET;
4069     PUT(code, 1, 2 + 2*LINK_SIZE);
4070     code += 1 + LINK_SIZE;
4071 nigel 93
4072     length_prevgroup = 3 + 3*LINK_SIZE;
4073 nigel 77 }
4074 nigel 93
4075     /* Can't determine a first byte now */
4076    
4077     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4078 nigel 77 continue;
4079    
4080    
4081 nigel 93 /* ------------------------------------------------------------ */
4082     default: /* Other characters: check option setting */
4083 nigel 77 set = unset = 0;
4084     optset = &set;
4085    
4086     while (*ptr != ')' && *ptr != ':')
4087     {
4088     switch (*ptr++)
4089     {
4090     case '-': optset = &unset; break;
4091    
4092 nigel 93 case 'J': /* Record that it changed in the external options */
4093     *optset |= PCRE_DUPNAMES;
4094     cd->external_options |= PCRE_JCHANGED;
4095     break;
4096    
4097 nigel 77 case 'i': *optset |= PCRE_CASELESS; break;
4098     case 'm': *optset |= PCRE_MULTILINE; break;
4099     case 's': *optset |= PCRE_DOTALL; break;
4100     case 'x': *optset |= PCRE_EXTENDED; break;
4101     case 'U': *optset |= PCRE_UNGREEDY; break;
4102     case 'X': *optset |= PCRE_EXTRA; break;
4103 nigel 93
4104     default: *errorcodeptr = ERR12;
4105     ptr--; /* Correct the offset */
4106     goto FAILED;
4107 nigel 77 }
4108     }
4109    
4110     /* Set up the changed option bits, but don't change anything yet. */
4111    
4112     newoptions = (options | set) & (~unset);
4113    
4114     /* If the options ended with ')' this is not the start of a nested
4115 nigel 93 group with option changes, so the options change at this level. If this
4116     item is right at the start of the pattern, the options can be
4117     abstracted and made external in the pre-compile phase, and ignored in
4118     the compile phase. This can be helpful when matching -- for instance in
4119     caseless checking of required bytes.
4120 nigel 77
4121 nigel 93 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4122     definitely *not* at the start of the pattern because something has been
4123     compiled. In the pre-compile phase, however, the code pointer can have
4124     that value after the start, because it gets reset as code is discarded
4125     during the pre-compile. However, this can happen only at top level - if
4126     we are within parentheses, the starting BRA will still be present. At
4127     any parenthesis level, the length value can be used to test if anything
4128     has been compiled at that level. Thus, a test for both these conditions
4129     is necessary to ensure we correctly detect the start of the pattern in
4130     both phases.
4131 nigel 77
4132 nigel 93 If we are not at the pattern start, compile code to change the ims
4133     options if this setting actually changes any of them. We also pass the
4134     new setting back so that it can be put at the start of any following
4135     branches, and when this group ends (if we are in a group), a resetting
4136     item can be compiled. */
4137    
4138 nigel 77 if (*ptr == ')')
4139     {
4140 nigel 93 if (code == cd->start_code + 1 + LINK_SIZE &&
4141     (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4142 nigel 77 {
4143 nigel 93 cd->external_options = newoptions;
4144     options = newoptions;