/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 178 - (hide annotations) (download)
Wed Jun 13 08:44:34 2007 UTC (7 years, 5 months ago) by ph10
File MIME type: text/plain
File size: 181949 byte(s)
Add support for \h, \H, \v, \V.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 117 Copyright (c) 1997-2007 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 nigel 93 #define NLBLOCK cd /* Block containing newline information */
46     #define PSSTART start_pattern /* Field containing processed string start */
47     #define PSEND end_pattern /* Field containing processed string end */
48    
49    
50 nigel 77 #include "pcre_internal.h"
#include <limits.h>   /* INT_MAX, for overflow-safe number parsing */
51    
52    
53 nigel 85 /* When DEBUG is defined, we need the pcre_printint() function, which is also
54     used by pcretest. DEBUG is not defined when building a production library. */
55    
56     #ifdef DEBUG
57     #include "pcre_printint.src"
58     #endif
59    
60    
/* Macro for setting individual bits in class bitmaps: it sets bit number b in
the byte array a. Both arguments and the whole expansion are parenthesized so
that expression arguments such as SETBIT(map, base + n) index and shift as
intended. Note that b is evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
64    
65    
66 nigel 77 /*************************************************
67     * Code parameters and static tables *
68     *************************************************/
69    
70 nigel 93 /* This value specifies the size of stack workspace that is used during the
71     first pre-compile phase that determines how much memory is required. The regex
72     is partly compiled into this space, but the compiled parts are discarded as
73     soon as they can be, so that hopefully there will never be an overrun. The code
74     does, however, check for an overrun. The largest amount I've seen used is 218,
75     so this number is very generous.
76 nigel 77
77 nigel 93 The same workspace is used during the second, actual compile phase for
78     remembering forward references to groups so that they can be filled in at the
79     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
80     is 4 there is plenty of room. */
81 nigel 77
82 nigel 93 #define COMPILE_WORK_SIZE (4096)
83 nigel 77
84 nigel 93
85 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
86     are simple data values; negative values are for special things like \d and so
87     on. Zero means further processing is needed (for things like \x), or the escape
88     is invalid. */
89    
90 ph10 97 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
91 nigel 77 static const short int escapes[] = {
92     0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
93     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
94     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
95 ph10 178 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
96     -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
97 nigel 77 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
98     '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
99 ph10 178 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
100     -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
101 nigel 77 0, 0, -ESC_z /* x - z */
102     };
103    
104 ph10 97 #else /* This is the "abnormal" table for EBCDIC systems */
105 nigel 77 static const short int escapes[] = {
106     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
107     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
108     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
109     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
110     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
111     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
112     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
113     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
114 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
115 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
116 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
117 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
118 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
119     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
120     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
121     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
122 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
123 nigel 77 /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
124 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
125 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
126 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
127     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
128     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
129     };
130     #endif
131    
132    
133     /* Tables of names of POSIX character classes and their lengths. The list is
134 nigel 87 terminated by a zero length entry. The first three must be alpha, lower, upper,
135 nigel 77 as this is assumed for handling case independence. */
136    
137     static const char *const posix_names[] = {
138     "alpha", "lower", "upper",
139     "alnum", "ascii", "blank", "cntrl", "digit", "graph",
140     "print", "punct", "space", "word", "xdigit" };
141    
142     static const uschar posix_name_lengths[] = {
143     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
144    
145 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
146     base map, with an optional addition or removal of another map. Then, for some
147     classes, there is some additional tweaking: for [:blank:] the vertical space
148     characters are removed, and for [:alpha:] and [:alnum:] the underscore
149     character is removed. The triples in the table consist of the base map offset,
150     second map offset or -1 if no second map, and a non-negative value for map
151     addition or a negative value for map subtraction (if there are two maps). The
152     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
153     remove vertical space characters, 2 => remove underscore. */
154 nigel 77
155     static const int posix_class_maps[] = {
156 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
157     cbit_lower, -1, 0, /* lower */
158     cbit_upper, -1, 0, /* upper */
159     cbit_word, -1, 2, /* alnum - word without underscore */
160     cbit_print, cbit_cntrl, 0, /* ascii */
161     cbit_space, -1, 1, /* blank - a GNU extension */
162     cbit_cntrl, -1, 0, /* cntrl */
163     cbit_digit, -1, 0, /* digit */
164     cbit_graph, -1, 0, /* graph */
165     cbit_print, -1, 0, /* print */
166     cbit_punct, -1, 0, /* punct */
167     cbit_space, -1, 0, /* space */
168     cbit_word, -1, 0, /* word - a Perl extension */
169     cbit_xdigit,-1, 0 /* xdigit */
170 nigel 77 };
171    
172    
/* Two-level stringification: XSTRING(m) expands the macro m first and then
turns the result into a string literal. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* The texts of compile-time error messages, indexed by error number. The
strings are "char *" because they are passed to the outside world. Do not ever
re-use any error number, because they are documented. Always add a new error
instead. Messages marked DEAD below are no longer used. The table itself is
never modified, so the array of pointers is declared const as well as the
strings it points to. */

static const char *const error_texts[] = {
  "no error",
  "\\ at end of pattern",
  "\\c at end of pattern",
  "unrecognized character follows \\",
  "numbers out of order in {} quantifier",
  /* 5 */
  "number too big in {} quantifier",
  "missing terminating ] for character class",
  "invalid escape sequence in character class",
  "range out of order in character class",
  "nothing to repeat",
  /* 10 */
  "operand of unlimited repeat could match the empty string",  /** DEAD **/
  "internal error: unexpected repeat",
  "unrecognized character after (?",
  "POSIX named classes are supported only within a class",
  "missing )",
  /* 15 */
  "reference to non-existent subpattern",
  "erroffset passed as NULL",
  "unknown option bit(s) set",
  "missing ) after comment",
  "parentheses nested too deeply",  /** DEAD **/
  /* 20 */
  "regular expression too large",
  "failed to get memory",
  "unmatched parentheses",
  "internal error: code overflow",
  "unrecognized character after (?<",
  /* 25 */
  "lookbehind assertion is not fixed length",
  "malformed number or name after (?(",
  "conditional group contains more than two branches",
  "assertion expected after (?(",
  "(?R or (?[+-]digits must be followed by )",
  /* 30 */
  "unknown POSIX class name",
  "POSIX collating elements are not supported",
  "this version of PCRE is not compiled with PCRE_UTF8 support",
  "spare error",  /** DEAD **/
  "character value in \\x{...} sequence is too large",
  /* 35 */
  "invalid condition (?(0)",
  "\\C not allowed in lookbehind assertion",
  "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
  "number after (?C is > 255",
  "closing ) for (?C expected",
  /* 40 */
  "recursive call could loop indefinitely",
  "unrecognized character after (?P",
  "syntax error in subpattern name (missing terminator)",
  "two named subpatterns have the same name",
  "invalid UTF-8 string",
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled",
  "malformed \\P or \\p sequence",
  "unknown property name after \\P or \\p",
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
  /* 50 */
  "repeated subpattern is too long",
  "octal value is greater than \\377 (not in UTF-8 mode)",
  "internal error: overran compiling workspace",
  "internal error: previously-checked referenced subpattern not found",
  "DEFINE group contains more than one branch",
  /* 55 */
  "repeating a DEFINE group is not allowed",
  "inconsistent NEWLINE options",
  "\\g is not followed by a braced name or an optionally braced non-zero number",
  "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number"
};
253    
254    
255     /* Table to identify digits and hex digits. This is used when compiling
256     patterns. Note that the tables in chartables are dependent on the locale, and
257     may mark arbitrary characters as digits - but the PCRE compiling code expects
258     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
259     a private table here. It costs 256 bytes, but it is a lot faster than doing
260     character value tests (at least in some simple cases I timed), and in some
261     applications one wants PCRE to compile efficiently as well as match
262     efficiently.
263    
264     For convenience, we use the same bit definitions as in chartables:
265    
266     0x04 decimal digit
267     0x08 hexadecimal digit
268    
269     Then we can use ctype_digit and ctype_xdigit in the code. */
270    
271 ph10 97 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
272 nigel 77 static const unsigned char digitab[] =
273     {
274     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
275     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
276     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
277     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
278     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
279     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
280     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
281     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
282     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
283     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
284     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
285     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
286     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
287     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
288     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
289     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
290     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
291     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
292     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
293     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
294     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
295     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
296     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
297     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
298     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
299     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
300     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
301     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
302     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
303     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
304     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
305     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
306    
307 ph10 97 #else /* This is the "abnormal" case, for EBCDIC systems */
308 nigel 77 static const unsigned char digitab[] =
309     {
310     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
311     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
312     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
313     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
314     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
315     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
316     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
317     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
318     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
319     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
320     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
321 ph10 97 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
322 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
323     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
324     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
325     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
326     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
327     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
328     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
329     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
330     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
331     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
332     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
333     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
334     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
335     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
336     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
337     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
338     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
339     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
340     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
341     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
342    
343     static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
344     0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
345     0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
346     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
347     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
348     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
349     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
350     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
351     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
352     0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
353     0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
354     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
355 ph10 97 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
356 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
357     0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
358     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
359     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
360     0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
361     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
362     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
363     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
364     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
365     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
366     0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
367     0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
368     0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
369     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
370     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
371     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
372     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
373     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
374     0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
375     0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
376     #endif
377    
378    
379     /* Definition to allow mutual recursion */
380    
381     static BOOL
382 ph10 175 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
383     int *, int *, branch_chain *, compile_data *, int *);
384 nigel 77
385    
386    
387     /*************************************************
388     * Handle escapes *
389     *************************************************/
390    
391     /* This function is called when a \ has been encountered. It either returns a
392     positive value for a simple escape such as \n, or a negative value which
393 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
394     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
395     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
396     ptr is pointing at the \. On exit, it is on the final character of the escape
397     sequence.
398 nigel 77
399     Arguments:
400     ptrptr points to the pattern position pointer
401     errorcodeptr points to the errorcode variable
402     bracount number of previous extracting brackets
403     options the options bits
404     isclass TRUE if inside a character class
405    
406     Returns: zero or positive => a data character
407     negative => a special escape sequence
408     on error, errorptr is set
409     */
410    
411     static int
412     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
413     int options, BOOL isclass)
414     {
415 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
416     const uschar *ptr = *ptrptr + 1;
417 nigel 77 int c, i;
418    
419 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
420     ptr--; /* Set pointer back to the last byte */
421    
422 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
423    
424     if (c == 0) *errorcodeptr = ERR1;
425    
426     /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
427     a table. A non-zero result is something that can be returned immediately.
428     Otherwise further processing may be required. */
429    
430 ph10 97 #ifndef EBCDIC /* ASCII coding */
431 nigel 77 else if (c < '0' || c > 'z') {} /* Not alphameric */
432     else if ((i = escapes[c - '0']) != 0) c = i;
433    
434 ph10 97 #else /* EBCDIC coding */
435 nigel 77 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
436     else if ((i = escapes[c - 0x48]) != 0) c = i;
437     #endif
438    
439     /* Escapes that need further processing, or are illegal. */
440    
441     else
442     {
443     const uschar *oldptr;
444 nigel 93 BOOL braced, negated;
445    
446 nigel 77 switch (c)
447     {
448     /* A number of Perl escapes are not handled by PCRE. We give an explicit
449     error. */
450    
451     case 'l':
452     case 'L':
453     case 'N':
454     case 'u':
455     case 'U':
456     *errorcodeptr = ERR37;
457     break;
458    
459 nigel 93 /* \g must be followed by a number, either plain or braced. If positive, it
460     is an absolute backreference. If negative, it is a relative backreference.
461 ph10 172 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
462     reference to a named group. This is part of Perl's movement towards a
463     unified syntax for back references. As this is synonymous with \k{name}, we
464 ph10 171 fudge it up by pretending it really was \k. */
465 nigel 93
466     case 'g':
467     if (ptr[1] == '{')
468     {
469 ph10 171 const uschar *p;
470     for (p = ptr+2; *p != 0 && *p != '}'; p++)
471     if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
472 ph10 172 if (*p != 0 && *p != '}')
473 ph10 171 {
474     c = -ESC_k;
475     break;
476 ph10 172 }
477 nigel 93 braced = TRUE;
478     ptr++;
479     }
480     else braced = FALSE;
481    
482     if (ptr[1] == '-')
483     {
484     negated = TRUE;
485     ptr++;
486     }
487     else negated = FALSE;
488    
489     c = 0;
490     while ((digitab[ptr[1]] & ctype_digit) != 0)
491     c = c * 10 + *(++ptr) - '0';
492    
493     if (c == 0 || (braced && *(++ptr) != '}'))
494     {
495     *errorcodeptr = ERR57;
496     return 0;
497     }
498    
499     if (negated)
500     {
501     if (c > bracount)
502     {
503     *errorcodeptr = ERR15;
504     return 0;
505     }
506     c = bracount - (c - 1);
507     }
508    
509     c = -(ESC_REF + c);
510     break;
511    
512 nigel 77 /* The handling of escape sequences consisting of a string of digits
513     starting with one that is not zero is not straightforward. By experiment,
514     the way Perl works seems to be as follows:
515    
516     Outside a character class, the digits are read as a decimal number. If the
517     number is less than 10, or if there are that many previous extracting
518     left brackets, then it is a back reference. Otherwise, up to three octal
519     digits are read to form an escaped byte. Thus \123 is likely to be octal
520     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
521     value is greater than 377, the least significant 8 bits are taken. Inside a
522     character class, \ followed by a digit is always an octal number. */
523    
524     case '1': case '2': case '3': case '4': case '5':
525     case '6': case '7': case '8': case '9':
526    
527     if (!isclass)
528     {
529     oldptr = ptr;
530     c -= '0';
531     while ((digitab[ptr[1]] & ctype_digit) != 0)
532     c = c * 10 + *(++ptr) - '0';
533     if (c < 10 || c <= bracount)
534     {
535     c = -(ESC_REF + c);
536     break;
537     }
538     ptr = oldptr; /* Put the pointer back and fall through */
539     }
540    
541     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
542     generates a binary zero byte and treats the digit as a following literal.
543     Thus we have to pull back the pointer by one. */
544    
545     if ((c = *ptr) >= '8')
546     {
547     ptr--;
548     c = 0;
549     break;
550     }
551    
552     /* \0 always starts an octal number, but we may drop through to here with a
553 nigel 91 larger first octal digit. The original code used just to take the least
554     significant 8 bits of octal numbers (I think this is what early Perls used
555     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
556     than 3 octal digits. */
557 nigel 77
558     case '0':
559     c -= '0';
560     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
561     c = c * 8 + *(++ptr) - '0';
562 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
563 nigel 77 break;
564    
565 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
566     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
567     treated as a data character. */
568 nigel 77
569     case 'x':
570 nigel 87 if (ptr[1] == '{')
571 nigel 77 {
572     const uschar *pt = ptr + 2;
573 nigel 87 int count = 0;
574    
575 nigel 77 c = 0;
576     while ((digitab[*pt] & ctype_xdigit) != 0)
577     {
578 nigel 87 register int cc = *pt++;
579     if (c == 0 && cc == '0') continue; /* Leading zeroes */
580 nigel 77 count++;
581 nigel 87
582 ph10 97 #ifndef EBCDIC /* ASCII coding */
583 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
584 nigel 87 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
585 ph10 97 #else /* EBCDIC coding */
586 nigel 77 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
587 nigel 87 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
588 nigel 77 #endif
589     }
590 nigel 87
591 nigel 77 if (*pt == '}')
592     {
593 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
594 nigel 77 ptr = pt;
595     break;
596     }
597 nigel 87
598 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
599     recognize this construct; fall through to the normal \x handling. */
600     }
601    
602 nigel 87 /* Read just a single-byte hex-defined char */
603 nigel 77
604     c = 0;
605     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
606     {
607     int cc; /* Some compilers don't like ++ */
608     cc = *(++ptr); /* in initializers */
609 ph10 97 #ifndef EBCDIC /* ASCII coding */
610 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
611     c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
612 ph10 97 #else /* EBCDIC coding */
613 nigel 77 if (cc <= 'z') cc += 64; /* Convert to upper case */
614     c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
615     #endif
616     }
617     break;
618    
619 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
620     This coding is ASCII-specific, but then the whole concept of \cx is
621     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
622 nigel 77
623     case 'c':
624     c = *(++ptr);
625     if (c == 0)
626     {
627     *errorcodeptr = ERR2;
628     return 0;
629     }
630    
631 ph10 97 #ifndef EBCDIC /* ASCII coding */
632 nigel 77 if (c >= 'a' && c <= 'z') c -= 32;
633     c ^= 0x40;
634 ph10 97 #else /* EBCDIC coding */
635 nigel 77 if (c >= 'a' && c <= 'z') c += 64;
636     c ^= 0xC0;
637     #endif
638     break;
639    
640     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
641     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
642     for Perl compatibility, it is a literal. This code looks a bit odd, but
643     there used to be some cases other than the default, and there may be again
644     in future, so I haven't "optimized" it. */
645    
646     default:
647     if ((options & PCRE_EXTRA) != 0) switch(c)
648     {
649     default:
650     *errorcodeptr = ERR3;
651     break;
652     }
653     break;
654     }
655     }
656    
657     *ptrptr = ptr;
658     return c;
659     }
660    
661    
662    
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* This function is called after \P or \p has been encountered, provided that
PCRE is compiled with support for Unicode properties. On entry, ptrptr is
pointing at the P or p. On exit, it is pointing at the final character of the
escape sequence (or, after an error, at the character where scanning stopped).

The property name is either a single character or a name in {}, optionally
preceded by ^ for negation. It is looked up in _pcre_utt by binary chop, which
presumes that table is sorted by name in strcmp() order.

Arguments:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
  dptr           points to an int that is set to the detailed property value
  errorcodeptr   points to the error code variable

Returns:         the type field of the matching _pcre_utt entry, or -1 after
                 an error (ERR46: malformed sequence; ERR47: unknown name)
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
int c, i, bot, top;
const uschar *ptr = *ptrptr;
char name[32];

/* Set the negation flag first so that it is defined on every return path,
including the error return taken when the pattern ends straight after \p. */

*negptr = FALSE;

c = *(++ptr);
if (c == 0) goto ERROR_RETURN;

/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
negation. */

if (c == '{')
  {
  if (ptr[1] == '^')
    {
    *negptr = TRUE;
    ptr++;
    }

  /* Copy the name, leaving room for the terminating zero. The cast keeps the
  comparison signed; i is an int. */

  for (i = 0; i < (int)sizeof(name) - 1; i++)
    {
    c = *(++ptr);
    if (c == 0) goto ERROR_RETURN;
    if (c == '}') break;
    name[i] = c;
    }
  if (c !='}') goto ERROR_RETURN;      /* Name too long for the buffer */
  name[i] = 0;
  }

/* Otherwise there is just one following character */

else
  {
  name[0] = c;
  name[1] = 0;
  }

*ptrptr = ptr;

/* Search for a recognized property name using binary chop */

bot = 0;
top = _pcre_utt_size;

while (bot < top)
  {
  i = (bot + top) >> 1;
  c = strcmp(name, _pcre_utt[i].name);
  if (c == 0)
    {
    *dptr = _pcre_utt[i].value;
    return _pcre_utt[i].type;
    }
  if (c > 0) bot = i + 1; else top = i;
  }

/* Unknown name: *ptrptr has already been updated above. */

*errorcodeptr = ERR47;
return -1;

ERROR_RETURN:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
752    
753    
754    
755    
756     /*************************************************
757     * Check for counted repeat *
758     *************************************************/
759    
760     /* This function is called when a '{' is encountered in a place where it might
761     start a quantifier. It looks ahead to see if it really is a quantifier or not.
762     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
763     where the ddds are digits.
764    
765     Arguments:
766     p pointer to the first char after '{'
767    
768     Returns: TRUE or FALSE
769     */
770    
771     static BOOL
772     is_counted_repeat(const uschar *p)
773     {
774     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
775     while ((digitab[*p] & ctype_digit) != 0) p++;
776     if (*p == '}') return TRUE;
777    
778     if (*p++ != ',') return FALSE;
779     if (*p == '}') return TRUE;
780    
781     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
782     while ((digitab[*p] & ctype_digit) != 0) p++;
783    
784     return (*p == '}');
785     }
786    
787    
788    
789     /*************************************************
790     * Read repeat counts *
791     *************************************************/
792    
793     /* Read an item of the form {n,m} and return the values. This is called only
794     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
795     so the syntax is guaranteed to be correct, but we need to check the values.
796    
797     Arguments:
798     p pointer to first char after '{'
799     minp pointer to int for min
800     maxp pointer to int for max
801     returned as -1 if no max
802     errorcodeptr points to error code variable
803    
804     Returns: pointer to '}' on success;
805     current ptr on error, with errorcodeptr set non-zero
806     */
807    
808     static const uschar *
809     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
810     {
811     int min = 0;
812     int max = -1;
813    
814 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
815     an integer overflow. */
816    
817 nigel 77 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
818 nigel 81 if (min < 0 || min > 65535)
819     {
820     *errorcodeptr = ERR5;
821     return p;
822     }
823 nigel 77
824 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
825     Also, max must not be less than min. */
826    
827 nigel 77 if (*p == '}') max = min; else
828     {
829     if (*(++p) != '}')
830     {
831     max = 0;
832     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
833 nigel 81 if (max < 0 || max > 65535)
834     {
835     *errorcodeptr = ERR5;
836     return p;
837     }
838 nigel 77 if (max < min)
839     {
840     *errorcodeptr = ERR4;
841     return p;
842     }
843     }
844     }
845    
846 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
847     '}'. */
848 nigel 77
849 nigel 81 *minp = min;
850     *maxp = max;
851 nigel 77 return p;
852     }
853    
854    
855    
856     /*************************************************
857 nigel 93 * Find forward referenced subpattern *
858 nigel 91 *************************************************/
859    
860 nigel 93 /* This function scans along a pattern's text looking for capturing
861     subpatterns, and counting them. If it finds a named pattern that matches the
862     name it is given, it returns its number. Alternatively, if the name is NULL, it
863     returns when it reaches a given numbered subpattern. This is used for forward
864     references to subpatterns. We know that if (?P< is encountered, the name will
865     be terminated by '>' because that is checked in the first pass.
866 nigel 91
867     Arguments:
868 nigel 93 ptr current position in the pattern
869     count current count of capturing parens so far encountered
870     name name to seek, or NULL if seeking a numbered subpattern
871     lorn name length, or subpattern number if name is NULL
872     xmode TRUE if we are in /x mode
873 nigel 91
874     Returns: the number of the named subpattern, or -1 if not found
875     */
876    
877     static int
878 nigel 93 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
879     BOOL xmode)
880 nigel 91 {
881     const uschar *thisname;
882 nigel 93
883 nigel 91 for (; *ptr != 0; ptr++)
884     {
885 nigel 93 int term;
886    
887     /* Skip over backslashed characters and also entire \Q...\E */
888    
889     if (*ptr == '\\')
890     {
891     if (*(++ptr) == 0) return -1;
892     if (*ptr == 'Q') for (;;)
893     {
894     while (*(++ptr) != 0 && *ptr != '\\');
895     if (*ptr == 0) return -1;
896     if (*(++ptr) == 'E') break;
897     }
898     continue;
899     }
900    
901     /* Skip over character classes */
902    
903     if (*ptr == '[')
904     {
905     while (*(++ptr) != ']')
906     {
907     if (*ptr == '\\')
908     {
909     if (*(++ptr) == 0) return -1;
910     if (*ptr == 'Q') for (;;)
911     {
912     while (*(++ptr) != 0 && *ptr != '\\');
913     if (*ptr == 0) return -1;
914     if (*(++ptr) == 'E') break;
915     }
916     continue;
917     }
918     }
919     continue;
920     }
921    
922     /* Skip comments in /x mode */
923    
924     if (xmode && *ptr == '#')
925     {
926     while (*(++ptr) != 0 && *ptr != '\n');
927     if (*ptr == 0) return -1;
928     continue;
929     }
930    
931     /* An opening parens must now be a real metacharacter */
932    
933 nigel 91 if (*ptr != '(') continue;
934 nigel 93 if (ptr[1] != '?')
935     {
936     count++;
937     if (name == NULL && count == lorn) return count;
938     continue;
939     }
940    
941     ptr += 2;
942     if (*ptr == 'P') ptr++; /* Allow optional P */
943    
944     /* We have to disambiguate (?<! and (?<= from (?<name> */
945    
946     if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
947     *ptr != '\'')
948     continue;
949    
950 nigel 91 count++;
951 nigel 93
952     if (name == NULL && count == lorn) return count;
953     term = *ptr++;
954     if (term == '<') term = '>';
955 nigel 91 thisname = ptr;
956 nigel 93 while (*ptr != term) ptr++;
957     if (name != NULL && lorn == ptr - thisname &&
958     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
959 nigel 91 return count;
960     }
961 nigel 93
962 nigel 91 return -1;
963     }
964    
965    
966    
967     /*************************************************
968 nigel 77 * Find first significant op code *
969     *************************************************/
970    
971     /* This is called by several functions that scan a compiled expression looking
972     for a fixed first character, or an anchoring op code etc. It skips over things
973     that do not influence this. For some calls, a change of option is important.
974     For some calls, it makes sense to skip negative forward and all backward
975     assertions, and also the \b assertion; for others it does not.
976    
977     Arguments:
978     code pointer to the start of the group
979     options pointer to external options
980     optbit the option bit whose changing is significant, or
981     zero if none are
982     skipassert TRUE if certain assertions are to be skipped
983    
984     Returns: pointer to the first significant opcode
985     */
986    
987     static const uschar*
988     first_significant_code(const uschar *code, int *options, int optbit,
989     BOOL skipassert)
990     {
991     for (;;)
992     {
993     switch ((int)*code)
994     {
995     case OP_OPT:
996     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
997     *options = (int)code[1];
998     code += 2;
999     break;
1000    
1001     case OP_ASSERT_NOT:
1002     case OP_ASSERTBACK:
1003     case OP_ASSERTBACK_NOT:
1004     if (!skipassert) return code;
1005     do code += GET(code, 1); while (*code == OP_ALT);
1006     code += _pcre_OP_lengths[*code];
1007     break;
1008    
1009     case OP_WORD_BOUNDARY:
1010     case OP_NOT_WORD_BOUNDARY:
1011     if (!skipassert) return code;
1012     /* Fall through */
1013    
1014     case OP_CALLOUT:
1015     case OP_CREF:
1016 nigel 93 case OP_RREF:
1017     case OP_DEF:
1018 nigel 77 code += _pcre_OP_lengths[*code];
1019     break;
1020    
1021     default:
1022     return code;
1023     }
1024     }
1025     /* Control never reaches here */
1026     }
1027    
1028    
1029    
1030    
1031     /*************************************************
1032     * Find the fixed length of a pattern *
1033     *************************************************/
1034    
1035     /* Scan a pattern and compute the fixed length of subject that will match it,
1036     if the length is fixed. This is needed for dealing with backward assertions.
1037     In UTF8 mode, the result is in characters rather than bytes.
1038    
1039     Arguments:
1040     code points to the start of the pattern (the bracket)
1041     options the compiling options
1042    
1043     Returns: the fixed length, or -1 if there is no fixed length,
1044     or -2 if \C was encountered
1045     */
1046    
1047     static int
1048     find_fixedlength(uschar *code, int options)
1049     {
1050     int length = -1;
1051    
1052     register int branchlength = 0;
1053     register uschar *cc = code + 1 + LINK_SIZE;
1054    
1055     /* Scan along the opcodes for this branch. If we get to the end of the
1056     branch, check the length against that of the other branches. */
1057    
1058     for (;;)
1059     {
1060     int d;
1061     register int op = *cc;
1062    
1063     switch (op)
1064     {
1065 nigel 93 case OP_CBRA:
1066 nigel 77 case OP_BRA:
1067     case OP_ONCE:
1068     case OP_COND:
1069 nigel 93 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1070 nigel 77 if (d < 0) return d;
1071     branchlength += d;
1072     do cc += GET(cc, 1); while (*cc == OP_ALT);
1073     cc += 1 + LINK_SIZE;
1074     break;
1075    
1076     /* Reached end of a branch; if it's a ket it is the end of a nested
1077     call. If it's ALT it is an alternation in a nested call. If it is
1078     END it's the end of the outer call. All can be handled by the same code. */
1079    
1080     case OP_ALT:
1081     case OP_KET:
1082     case OP_KETRMAX:
1083     case OP_KETRMIN:
1084     case OP_END:
1085     if (length < 0) length = branchlength;
1086     else if (length != branchlength) return -1;
1087     if (*cc != OP_ALT) return length;
1088     cc += 1 + LINK_SIZE;
1089     branchlength = 0;
1090     break;
1091    
1092     /* Skip over assertive subpatterns */
1093    
1094     case OP_ASSERT:
1095     case OP_ASSERT_NOT:
1096     case OP_ASSERTBACK:
1097     case OP_ASSERTBACK_NOT:
1098     do cc += GET(cc, 1); while (*cc == OP_ALT);
1099     /* Fall through */
1100    
1101     /* Skip over things that don't match chars */
1102    
1103     case OP_REVERSE:
1104     case OP_CREF:
1105 nigel 93 case OP_RREF:
1106     case OP_DEF:
1107 nigel 77 case OP_OPT:
1108     case OP_CALLOUT:
1109     case OP_SOD:
1110     case OP_SOM:
1111     case OP_EOD:
1112     case OP_EODN:
1113     case OP_CIRC:
1114     case OP_DOLL:
1115     case OP_NOT_WORD_BOUNDARY:
1116     case OP_WORD_BOUNDARY:
1117     cc += _pcre_OP_lengths[*cc];
1118     break;
1119    
1120     /* Handle literal characters */
1121    
1122     case OP_CHAR:
1123     case OP_CHARNC:
1124 nigel 91 case OP_NOT:
1125 nigel 77 branchlength++;
1126     cc += 2;
1127     #ifdef SUPPORT_UTF8
1128     if ((options & PCRE_UTF8) != 0)
1129     {
1130     while ((*cc & 0xc0) == 0x80) cc++;
1131     }
1132     #endif
1133     break;
1134    
1135     /* Handle exact repetitions. The count is already in characters, but we
1136     need to skip over a multibyte character in UTF8 mode. */
1137    
1138     case OP_EXACT:
1139     branchlength += GET2(cc,1);
1140     cc += 4;
1141     #ifdef SUPPORT_UTF8
1142     if ((options & PCRE_UTF8) != 0)
1143     {
1144     while((*cc & 0x80) == 0x80) cc++;
1145     }
1146     #endif
1147     break;
1148    
1149     case OP_TYPEEXACT:
1150     branchlength += GET2(cc,1);
1151     cc += 4;
1152     break;
1153    
1154     /* Handle single-char matchers */
1155    
1156     case OP_PROP:
1157     case OP_NOTPROP:
1158 nigel 87 cc += 2;
1159 nigel 77 /* Fall through */
1160    
1161     case OP_NOT_DIGIT:
1162     case OP_DIGIT:
1163     case OP_NOT_WHITESPACE:
1164     case OP_WHITESPACE:
1165     case OP_NOT_WORDCHAR:
1166     case OP_WORDCHAR:
1167     case OP_ANY:
1168     branchlength++;
1169     cc++;
1170     break;
1171    
1172     /* The single-byte matcher isn't allowed */
1173    
1174     case OP_ANYBYTE:
1175     return -2;
1176    
1177     /* Check a class for variable quantification */
1178    
1179     #ifdef SUPPORT_UTF8
1180     case OP_XCLASS:
1181     cc += GET(cc, 1) - 33;
1182     /* Fall through */
1183     #endif
1184    
1185     case OP_CLASS:
1186     case OP_NCLASS:
1187     cc += 33;
1188    
1189     switch (*cc)
1190     {
1191     case OP_CRSTAR:
1192     case OP_CRMINSTAR:
1193     case OP_CRQUERY:
1194     case OP_CRMINQUERY:
1195     return -1;
1196    
1197     case OP_CRRANGE:
1198     case OP_CRMINRANGE:
1199     if (GET2(cc,1) != GET2(cc,3)) return -1;
1200     branchlength += GET2(cc,1);
1201     cc += 5;
1202     break;
1203    
1204     default:
1205     branchlength++;
1206     }
1207     break;
1208    
1209     /* Anything else is variable length */
1210    
1211     default:
1212     return -1;
1213     }
1214     }
1215     /* Control never gets here */
1216     }
1217    
1218    
1219    
1220    
1221     /*************************************************
1222     * Scan compiled regex for numbered bracket *
1223     *************************************************/
1224    
1225     /* This little function scans through a compiled pattern until it finds a
1226     capturing bracket with the given number.
1227    
1228     Arguments:
1229     code points to start of expression
1230     utf8 TRUE in UTF-8 mode
1231     number the required bracket number
1232    
1233     Returns: pointer to the opcode for the bracket, or NULL if not found
1234     */
1235    
1236     static const uschar *
1237     find_bracket(const uschar *code, BOOL utf8, int number)
1238     {
1239     for (;;)
1240     {
1241     register int c = *code;
1242     if (c == OP_END) return NULL;
1243 nigel 91
1244     /* XCLASS is used for classes that cannot be represented just by a bit
1245     map. This includes negated single high-valued characters. The length in
1246     the table is zero; the actual length is stored in the compiled code. */
1247    
1248     if (c == OP_XCLASS) code += GET(code, 1);
1249    
1250 nigel 93 /* Handle capturing bracket */
1251 nigel 91
1252 nigel 93 else if (c == OP_CBRA)
1253 nigel 77 {
1254 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1255 nigel 77 if (n == number) return (uschar *)code;
1256 nigel 93 code += _pcre_OP_lengths[c];
1257 nigel 77 }
1258 nigel 91
1259 nigel 93 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1260     a multi-byte character. The length in the table is a minimum, so we have to
1261     arrange to skip the extra bytes. */
1262 nigel 91
1263 nigel 77 else
1264     {
1265     code += _pcre_OP_lengths[c];
1266 ph10 107 #ifdef SUPPORT_UTF8
1267 nigel 77 if (utf8) switch(c)
1268     {
1269     case OP_CHAR:
1270     case OP_CHARNC:
1271     case OP_EXACT:
1272     case OP_UPTO:
1273     case OP_MINUPTO:
1274 nigel 93 case OP_POSUPTO:
1275 nigel 77 case OP_STAR:
1276     case OP_MINSTAR:
1277 nigel 93 case OP_POSSTAR:
1278 nigel 77 case OP_PLUS:
1279     case OP_MINPLUS:
1280 nigel 93 case OP_POSPLUS:
1281 nigel 77 case OP_QUERY:
1282     case OP_MINQUERY:
1283 nigel 93 case OP_POSQUERY:
1284     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1285 nigel 77 break;
1286     }
1287 ph10 111 #endif
1288 nigel 77 }
1289     }
1290     }
1291    
1292    
1293    
1294     /*************************************************
1295     * Scan compiled regex for recursion reference *
1296     *************************************************/
1297    
1298     /* This little function scans through a compiled pattern until it finds an
1299     instance of OP_RECURSE.
1300    
1301     Arguments:
1302     code points to start of expression
1303     utf8 TRUE in UTF-8 mode
1304    
1305     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1306     */
1307    
1308     static const uschar *
1309     find_recurse(const uschar *code, BOOL utf8)
1310     {
1311     for (;;)
1312     {
1313     register int c = *code;
1314     if (c == OP_END) return NULL;
1315 nigel 91 if (c == OP_RECURSE) return code;
1316    
1317     /* XCLASS is used for classes that cannot be represented just by a bit
1318     map. This includes negated single high-valued characters. The length in
1319     the table is zero; the actual length is stored in the compiled code. */
1320    
1321     if (c == OP_XCLASS) code += GET(code, 1);
1322    
1323     /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1324     that are followed by a character may be followed by a multi-byte character.
1325 nigel 93 The length in the table is a minimum, so we have to arrange to skip the extra
1326     bytes. */
1327 nigel 91
1328 nigel 77 else
1329     {
1330     code += _pcre_OP_lengths[c];
1331 ph10 107 #ifdef SUPPORT_UTF8
1332 nigel 77 if (utf8) switch(c)
1333     {
1334     case OP_CHAR:
1335     case OP_CHARNC:
1336     case OP_EXACT:
1337     case OP_UPTO:
1338     case OP_MINUPTO:
1339 nigel 93 case OP_POSUPTO:
1340 nigel 77 case OP_STAR:
1341     case OP_MINSTAR:
1342 nigel 93 case OP_POSSTAR:
1343 nigel 77 case OP_PLUS:
1344     case OP_MINPLUS:
1345 nigel 93 case OP_POSPLUS:
1346 nigel 77 case OP_QUERY:
1347     case OP_MINQUERY:
1348 nigel 93 case OP_POSQUERY:
1349     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1350 nigel 77 break;
1351     }
1352 ph10 111 #endif
1353 nigel 77 }
1354     }
1355     }
1356    
1357    
1358    
1359     /*************************************************
1360     * Scan compiled branch for non-emptiness *
1361     *************************************************/
1362    
1363     /* This function scans through a branch of a compiled pattern to see whether it
1364 nigel 93 can match the empty string or not. It is called from could_be_empty()
1365     below and from compile_branch() when checking for an unlimited repeat of a
1366     group that can match nothing. Note that first_significant_code() skips over
1367     assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1368     struck an inner bracket whose current branch will already have been scanned.
1369 nigel 77
1370     Arguments:
1371     code points to start of search
1372     endcode points to where to stop
1373     utf8 TRUE if in UTF8 mode
1374    
1375     Returns: TRUE if what is matched could be empty
1376     */
1377    
1378     static BOOL
1379     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1380     {
1381     register int c;
1382 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1383 nigel 77 code < endcode;
1384     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1385     {
1386     const uschar *ccode;
1387    
1388     c = *code;
1389 ph10 172
1390 ph10 170 /* Groups with zero repeats can of course be empty; skip them. */
1391 nigel 77
1392 ph10 170 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1393     {
1394 ph10 172 code += _pcre_OP_lengths[c];
1395 ph10 170 do code += GET(code, 1); while (*code == OP_ALT);
1396     c = *code;
1397     continue;
1398     }
1399    
1400     /* For other groups, scan the branches. */
1401 ph10 172
1402 nigel 93 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1403 nigel 77 {
1404     BOOL empty_branch;
1405     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1406    
1407     /* Scan a closed bracket */
1408    
1409     empty_branch = FALSE;
1410     do
1411     {
1412     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1413     empty_branch = TRUE;
1414     code += GET(code, 1);
1415     }
1416     while (*code == OP_ALT);
1417     if (!empty_branch) return FALSE; /* All branches are non-empty */
1418 ph10 172 c = *code;
1419 nigel 93 continue;
1420 nigel 77 }
1421    
1422 nigel 93 /* Handle the other opcodes */
1423    
1424     switch (c)
1425 nigel 77 {
1426     /* Check for quantifiers after a class */
1427    
1428     #ifdef SUPPORT_UTF8
1429     case OP_XCLASS:
1430     ccode = code + GET(code, 1);
1431     goto CHECK_CLASS_REPEAT;
1432     #endif
1433    
1434     case OP_CLASS:
1435     case OP_NCLASS:
1436     ccode = code + 33;
1437    
1438     #ifdef SUPPORT_UTF8
1439     CHECK_CLASS_REPEAT:
1440     #endif
1441    
1442     switch (*ccode)
1443     {
1444     case OP_CRSTAR: /* These could be empty; continue */
1445     case OP_CRMINSTAR:
1446     case OP_CRQUERY:
1447     case OP_CRMINQUERY:
1448     break;
1449    
1450     default: /* Non-repeat => class must match */
1451     case OP_CRPLUS: /* These repeats aren't empty */
1452     case OP_CRMINPLUS:
1453     return FALSE;
1454    
1455     case OP_CRRANGE:
1456     case OP_CRMINRANGE:
1457     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1458     break;
1459     }
1460     break;
1461    
1462     /* Opcodes that must match a character */
1463    
1464     case OP_PROP:
1465     case OP_NOTPROP:
1466     case OP_EXTUNI:
1467     case OP_NOT_DIGIT:
1468     case OP_DIGIT:
1469     case OP_NOT_WHITESPACE:
1470     case OP_WHITESPACE:
1471     case OP_NOT_WORDCHAR:
1472     case OP_WORDCHAR:
1473     case OP_ANY:
1474     case OP_ANYBYTE:
1475     case OP_CHAR:
1476     case OP_CHARNC:
1477     case OP_NOT:
1478     case OP_PLUS:
1479     case OP_MINPLUS:
1480 nigel 93 case OP_POSPLUS:
1481 nigel 77 case OP_EXACT:
1482     case OP_NOTPLUS:
1483     case OP_NOTMINPLUS:
1484 nigel 93 case OP_NOTPOSPLUS:
1485 nigel 77 case OP_NOTEXACT:
1486     case OP_TYPEPLUS:
1487     case OP_TYPEMINPLUS:
1488 nigel 93 case OP_TYPEPOSPLUS:
1489 nigel 77 case OP_TYPEEXACT:
1490     return FALSE;
1491    
1492     /* End of branch */
1493    
1494     case OP_KET:
1495     case OP_KETRMAX:
1496     case OP_KETRMIN:
1497     case OP_ALT:
1498     return TRUE;
1499    
1500 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1501     MINUPTO, and POSUPTO may be followed by a multibyte character */
1502 nigel 77
1503     #ifdef SUPPORT_UTF8
1504     case OP_STAR:
1505     case OP_MINSTAR:
1506 nigel 93 case OP_POSSTAR:
1507 nigel 77 case OP_QUERY:
1508     case OP_MINQUERY:
1509 nigel 93 case OP_POSQUERY:
1510 nigel 77 case OP_UPTO:
1511     case OP_MINUPTO:
1512 nigel 93 case OP_POSUPTO:
1513 nigel 77 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1514     break;
1515     #endif
1516     }
1517     }
1518    
1519     return TRUE;
1520     }
1521    
1522    
1523    
1524     /*************************************************
1525     * Scan compiled regex for non-emptiness *
1526     *************************************************/
1527    
1528     /* This function is called to check for left recursive calls. We want to check
1529     the current branch of the current pattern to see if it could match the empty
1530     string. If it could, we must look outwards for branches at other levels,
1531     stopping when we pass beyond the bracket which is the subject of the recursion.
1532    
1533     Arguments:
1534     code points to start of the recursion
1535     endcode points to where to stop (current RECURSE item)
1536     bcptr points to the chain of current (unclosed) branch starts
1537     utf8 TRUE if in UTF-8 mode
1538    
1539     Returns: TRUE if what is matched could be empty
1540     */
1541    
1542     static BOOL
1543     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1544     BOOL utf8)
1545     {
1546     while (bcptr != NULL && bcptr->current >= code)
1547     {
1548     if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1549     bcptr = bcptr->outer;
1550     }
1551     return TRUE;
1552     }
1553    
1554    
1555    
1556     /*************************************************
1557     * Check for POSIX class syntax *
1558     *************************************************/
1559    
1560     /* This function is called when the sequence "[:" or "[." or "[=" is
1561     encountered in a character class. It checks whether this is followed by an
1562     optional ^ and then a sequence of letters, terminated by a matching ":]" or
1563     ".]" or "=]".
1564    
1565     Argument:
1566     ptr pointer to the initial [
1567     endptr where to return the end pointer
1568     cd pointer to compile data
1569    
1570     Returns: TRUE or FALSE
1571     */
1572    
1573     static BOOL
1574     check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1575     {
1576     int terminator; /* Don't combine these lines; the Solaris cc */
1577     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1578     if (*(++ptr) == '^') ptr++;
1579     while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1580     if (*ptr == terminator && ptr[1] == ']')
1581     {
1582     *endptr = ptr;
1583     return TRUE;
1584     }
1585     return FALSE;
1586     }
1587    
1588    
1589    
1590    
1591     /*************************************************
1592     * Check POSIX class name *
1593     *************************************************/
1594    
1595     /* This function is called to check the name given in a POSIX-style class entry
1596     such as [:alnum:].
1597    
1598     Arguments:
1599     ptr points to the first letter
1600     len the length of the name
1601    
1602     Returns: a value representing the name, or -1 if unknown
1603     */
1604    
1605     static int
1606     check_posix_name(const uschar *ptr, int len)
1607     {
1608     register int yield = 0;
1609     while (posix_name_lengths[yield] != 0)
1610     {
1611     if (len == posix_name_lengths[yield] &&
1612     strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1613     yield++;
1614     }
1615     return -1;
1616     }
1617    
1618    
1619     /*************************************************
1620     * Adjust OP_RECURSE items in repeated group *
1621     *************************************************/
1622    
1623     /* OP_RECURSE items contain an offset from the start of the regex to the group
1624     that is referenced. This means that groups can be replicated for fixed
1625     repetition simply by copying (because the recursion is allowed to refer to
1626     earlier groups that are outside the current group). However, when a group is
1627     optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1628     it, after it has been compiled. This means that any OP_RECURSE items within it
1629     that refer to the group itself or any contained groups have to have their
1630 nigel 93 offsets adjusted. That one of the jobs of this function. Before it is called,
1631     the partially compiled regex must be temporarily terminated with OP_END.
1632 nigel 77
1633 nigel 93 This function has been extended with the possibility of forward references for
1634     recursions and subroutine calls. It must also check the list of such references
1635     for the group we are dealing with. If it finds that one of the recursions in
1636     the current group is on this list, it adjusts the offset in the list, not the
1637     value in the reference (which is a group number).
1638    
1639 nigel 77 Arguments:
1640     group points to the start of the group
1641     adjust the amount by which the group is to be moved
1642     utf8 TRUE in UTF-8 mode
1643     cd contains pointers to tables etc.
1644 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
1645 nigel 77
1646     Returns: nothing
1647     */
1648    
1649     static void
1650 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1651     uschar *save_hwm)
1652 nigel 77 {
1653     uschar *ptr = group;
1654     while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1655     {
1656 nigel 93 int offset;
1657     uschar *hc;
1658    
1659     /* See if this recursion is on the forward reference list. If so, adjust the
1660     reference. */
1661    
1662     for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1663     {
1664     offset = GET(hc, 0);
1665     if (cd->start_code + offset == ptr + 1)
1666     {
1667     PUT(hc, 0, offset + adjust);
1668     break;
1669     }
1670     }
1671    
1672     /* Otherwise, adjust the recursion offset if it's after the start of this
1673     group. */
1674    
1675     if (hc >= cd->hwm)
1676     {
1677     offset = GET(ptr, 1);
1678     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1679     }
1680    
1681 nigel 77 ptr += 1 + LINK_SIZE;
1682     }
1683     }
1684    
1685    
1686    
1687     /*************************************************
1688     * Insert an automatic callout point *
1689     *************************************************/
1690    
1691     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1692     callout points before each pattern item.
1693    
1694     Arguments:
1695     code current code pointer
1696     ptr current pattern pointer
1697     cd pointers to tables etc
1698    
1699     Returns: new code pointer
1700     */
1701    
1702     static uschar *
1703     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1704     {
1705     *code++ = OP_CALLOUT;
1706     *code++ = 255;
1707     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1708     PUT(code, LINK_SIZE, 0); /* Default length */
1709     return code + 2*LINK_SIZE;
1710     }
1711    
1712    
1713    
1714     /*************************************************
1715     * Complete a callout item *
1716     *************************************************/
1717    
1718     /* A callout item contains the length of the next item in the pattern, which
1719     we can't fill in till after we have reached the relevant point. This is used
1720     for both automatic and manual callouts.
1721    
1722     Arguments:
1723     previous_callout points to previous callout item
1724     ptr current pattern pointer
1725     cd pointers to tables etc
1726    
1727     Returns: nothing
1728     */
1729    
1730     static void
1731     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1732     {
1733     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1734     PUT(previous_callout, 2 + LINK_SIZE, length);
1735     }
1736    
1737    
1738    
1739     #ifdef SUPPORT_UCP
1740     /*************************************************
1741     * Get othercase range *
1742     *************************************************/
1743    
1744     /* This function is passed the start and end of a class range, in UTF-8 mode
1745     with UCP support. It searches up the characters, looking for internal ranges of
1746     characters in the "other" case. Each call returns the next one, updating the
1747     start address.
1748    
1749     Arguments:
1750     cptr points to starting character value; updated
1751     d end value
1752     ocptr where to put start of othercase range
1753     odptr where to put end of othercase range
1754    
1755     Yield: TRUE when range returned; FALSE when no more
1756     */
1757    
1758     static BOOL
1759 nigel 93 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1760     unsigned int *odptr)
1761 nigel 77 {
1762 nigel 93 unsigned int c, othercase, next;
1763 nigel 77
1764     for (c = *cptr; c <= d; c++)
1765 nigel 93 { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1766 nigel 77
1767     if (c > d) return FALSE;
1768    
1769     *ocptr = othercase;
1770     next = othercase + 1;
1771    
1772     for (++c; c <= d; c++)
1773     {
1774 nigel 87 if (_pcre_ucp_othercase(c) != next) break;
1775 nigel 77 next++;
1776     }
1777    
1778     *odptr = next - 1;
1779     *cptr = c;
1780    
1781     return TRUE;
1782     }
1783     #endif /* SUPPORT_UCP */
1784    
1785    
1786 nigel 93
1787 nigel 77 /*************************************************
1788 nigel 93 * Check if auto-possessifying is possible *
1789     *************************************************/
1790    
1791     /* This function is called for unlimited repeats of certain items, to see
1792     whether the next thing could possibly match the repeated item. If not, it makes
1793     sense to automatically possessify the repeated item.
1794    
1795     Arguments:
1796     op_code the repeated op code
1797     this data for this item, depends on the opcode
1798     utf8 TRUE in UTF-8 mode
1799     utf8_char used for utf8 character bytes, NULL if not relevant
1800     ptr next character in pattern
1801     options options bits
1802     cd contains pointers to tables etc.
1803    
1804     Returns: TRUE if possessifying is wanted
1805     */
1806    
1807     static BOOL
1808     check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1809     const uschar *ptr, int options, compile_data *cd)
1810     {
1811     int next;
1812    
1813     /* Skip whitespace and comments in extended mode */
1814    
1815     if ((options & PCRE_EXTENDED) != 0)
1816     {
1817     for (;;)
1818     {
1819     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1820     if (*ptr == '#')
1821     {
1822     while (*(++ptr) != 0)
1823     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1824     }
1825     else break;
1826     }
1827     }
1828    
1829     /* If the next item is one that we can handle, get its value. A non-negative
1830     value is a character, a negative value is an escape value. */
1831    
1832     if (*ptr == '\\')
1833     {
1834     int temperrorcode = 0;
1835     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1836     if (temperrorcode != 0) return FALSE;
1837     ptr++; /* Point after the escape sequence */
1838     }
1839    
1840     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1841     {
1842     #ifdef SUPPORT_UTF8
1843     if (utf8) { GETCHARINC(next, ptr); } else
1844     #endif
1845     next = *ptr++;
1846     }
1847    
1848     else return FALSE;
1849    
1850     /* Skip whitespace and comments in extended mode */
1851    
1852     if ((options & PCRE_EXTENDED) != 0)
1853     {
1854     for (;;)
1855     {
1856     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1857     if (*ptr == '#')
1858     {
1859     while (*(++ptr) != 0)
1860     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1861     }
1862     else break;
1863     }
1864     }
1865    
1866     /* If the next thing is itself optional, we have to give up. */
1867    
1868     if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1869     return FALSE;
1870    
1871     /* Now compare the next item with the previous opcode. If the previous is a
1872     positive single character match, "item" either contains the character or, if
1873     "item" is greater than 127 in utf8 mode, the character's bytes are in
1874     utf8_char. */
1875    
1876    
1877     /* Handle cases when the next item is a character. */
1878    
1879     if (next >= 0) switch(op_code)
1880     {
1881     case OP_CHAR:
1882     #ifdef SUPPORT_UTF8
1883     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1884     #endif
1885     return item != next;
1886    
1887     /* For CHARNC (caseless character) we must check the other case. If we have
1888     Unicode property support, we can use it to test the other case of
1889     high-valued characters. */
1890    
1891     case OP_CHARNC:
1892     #ifdef SUPPORT_UTF8
1893     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1894     #endif
1895     if (item == next) return FALSE;
1896     #ifdef SUPPORT_UTF8
1897     if (utf8)
1898     {
1899     unsigned int othercase;
1900     if (next < 128) othercase = cd->fcc[next]; else
1901     #ifdef SUPPORT_UCP
1902     othercase = _pcre_ucp_othercase((unsigned int)next);
1903     #else
1904     othercase = NOTACHAR;
1905     #endif
1906     return (unsigned int)item != othercase;
1907     }
1908     else
1909     #endif /* SUPPORT_UTF8 */
1910     return (item != cd->fcc[next]); /* Non-UTF-8 mode */
1911    
1912     /* For OP_NOT, "item" must be a single-byte character. */
1913    
1914     case OP_NOT:
1915     if (next < 0) return FALSE; /* Not a character */
1916     if (item == next) return TRUE;
1917     if ((options & PCRE_CASELESS) == 0) return FALSE;
1918     #ifdef SUPPORT_UTF8
1919     if (utf8)
1920     {
1921     unsigned int othercase;
1922     if (next < 128) othercase = cd->fcc[next]; else
1923     #ifdef SUPPORT_UCP
1924     othercase = _pcre_ucp_othercase(next);
1925     #else
1926     othercase = NOTACHAR;
1927     #endif
1928     return (unsigned int)item == othercase;
1929     }
1930     else
1931     #endif /* SUPPORT_UTF8 */
1932     return (item == cd->fcc[next]); /* Non-UTF-8 mode */
1933    
1934     case OP_DIGIT:
1935     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1936    
1937     case OP_NOT_DIGIT:
1938     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1939    
1940     case OP_WHITESPACE:
1941     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1942    
1943     case OP_NOT_WHITESPACE:
1944     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1945    
1946     case OP_WORDCHAR:
1947     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1948    
1949     case OP_NOT_WORDCHAR:
1950     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1951    
1952     default:
1953     return FALSE;
1954     }
1955    
1956    
1957     /* Handle the case when the next item is \d, \s, etc. */
1958    
1959     switch(op_code)
1960     {
1961     case OP_CHAR:
1962     case OP_CHARNC:
1963     #ifdef SUPPORT_UTF8
1964     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1965     #endif
1966     switch(-next)
1967     {
1968     case ESC_d:
1969     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
1970    
1971     case ESC_D:
1972     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
1973    
1974     case ESC_s:
1975     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
1976    
1977     case ESC_S:
1978     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
1979    
1980     case ESC_w:
1981     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
1982    
1983     case ESC_W:
1984     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
1985    
1986     default:
1987     return FALSE;
1988     }
1989    
1990     case OP_DIGIT:
1991     return next == -ESC_D || next == -ESC_s || next == -ESC_W;
1992    
1993     case OP_NOT_DIGIT:
1994     return next == -ESC_d;
1995    
1996     case OP_WHITESPACE:
1997     return next == -ESC_S || next == -ESC_d || next == -ESC_w;
1998    
1999     case OP_NOT_WHITESPACE:
2000     return next == -ESC_s;
2001    
2002     case OP_WORDCHAR:
2003     return next == -ESC_W || next == -ESC_s;
2004    
2005     case OP_NOT_WORDCHAR:
2006     return next == -ESC_w || next == -ESC_d;
2007    
2008     default:
2009     return FALSE;
2010     }
2011    
2012     /* Control does not reach here */
2013     }
2014    
2015    
2016    
2017     /*************************************************
2018 nigel 77 * Compile one branch *
2019     *************************************************/
2020    
2021 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
2022 nigel 77 changed during the branch, the pointer is used to change the external options
2023 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2024     to find out the amount of memory needed, as well as during the real compile
2025     phase. The value of lengthptr distinguishes the two phases.
2026 nigel 77
2027     Arguments:
2028     optionsptr pointer to the option bits
2029     codeptr points to the pointer to the current code point
2030     ptrptr points to the current pattern pointer
2031     errorcodeptr points to error code variable
2032     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2033     reqbyteptr set to the last literal character required, else < 0
2034     bcptr points to current branch chain
2035     cd contains pointers to tables etc.
2036 nigel 93 lengthptr NULL during the real compile phase
2037     points to length accumulator during pre-compile phase
2038 nigel 77
2039     Returns: TRUE on success
2040     FALSE, with *errorcodeptr set non-zero on error
2041     */
2042    
2043     static BOOL
2044 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2045     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2046     compile_data *cd, int *lengthptr)
2047 nigel 77 {
2048     int repeat_type, op_type;
2049     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2050     int bravalue = 0;
2051     int greedy_default, greedy_non_default;
2052     int firstbyte, reqbyte;
2053     int zeroreqbyte, zerofirstbyte;
2054     int req_caseopt, reqvary, tempreqvary;
2055     int options = *optionsptr;
2056     int after_manual_callout = 0;
2057 nigel 93 int length_prevgroup = 0;
2058 nigel 77 register int c;
2059     register uschar *code = *codeptr;
2060 nigel 93 uschar *last_code = code;
2061     uschar *orig_code = code;
2062 nigel 77 uschar *tempcode;
2063     BOOL inescq = FALSE;
2064     BOOL groupsetfirstbyte = FALSE;
2065     const uschar *ptr = *ptrptr;
2066     const uschar *tempptr;
2067     uschar *previous = NULL;
2068     uschar *previous_callout = NULL;
2069 nigel 93 uschar *save_hwm = NULL;
2070 nigel 77 uschar classbits[32];
2071    
2072     #ifdef SUPPORT_UTF8
2073     BOOL class_utf8;
2074     BOOL utf8 = (options & PCRE_UTF8) != 0;
2075     uschar *class_utf8data;
2076     uschar utf8_char[6];
2077     #else
2078     BOOL utf8 = FALSE;
2079 nigel 93 uschar *utf8_char = NULL;
2080 nigel 77 #endif
2081    
2082 nigel 93 #ifdef DEBUG
2083     if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2084     #endif
2085    
2086 nigel 77 /* Set up the default and non-default settings for greediness */
2087    
2088     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2089     greedy_non_default = greedy_default ^ 1;
2090    
2091     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2092     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2093     matches a non-fixed char first char; reqbyte just remains unset if we never
2094     find one.
2095    
2096     When we hit a repeat whose minimum is zero, we may have to adjust these values
2097     to take the zero repeat into account. This is implemented by setting them to
2098     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2099     item types that can be repeated set these backoff variables appropriately. */
2100    
2101     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2102    
2103     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2104     according to the current setting of the caseless flag. REQ_CASELESS is a bit
2105     value > 255. It is added into the firstbyte or reqbyte variables to record the
2106     case status of the value. This is used only for ASCII characters. */
2107    
2108     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2109    
2110     /* Switch on next character until the end of the branch */
2111    
2112     for (;; ptr++)
2113     {
2114     BOOL negate_class;
2115     BOOL possessive_quantifier;
2116     BOOL is_quantifier;
2117 nigel 93 BOOL is_recurse;
2118 ph10 175 BOOL reset_bracount;
2119 nigel 77 int class_charcount;
2120     int class_lastchar;
2121     int newoptions;
2122     int recno;
2123 ph10 172 int refsign;
2124 nigel 77 int skipbytes;
2125     int subreqbyte;
2126     int subfirstbyte;
2127 nigel 93 int terminator;
2128 nigel 77 int mclength;
2129     uschar mcbuffer[8];
2130    
2131 nigel 93 /* Get next byte in the pattern */
2132 nigel 77
2133     c = *ptr;
2134    
2135 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
2136     previous cycle of this loop. */
2137    
2138     if (lengthptr != NULL)
2139     {
2140     #ifdef DEBUG
2141     if (code > cd->hwm) cd->hwm = code; /* High water info */
2142     #endif
2143     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2144     {
2145     *errorcodeptr = ERR52;
2146     goto FAILED;
2147     }
2148    
2149     /* There is at least one situation where code goes backwards: this is the
2150     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2151     the class is simply eliminated. However, it is created first, so we have to
2152     allow memory for it. Therefore, don't ever reduce the length at this point.
2153     */
2154    
2155     if (code < last_code) code = last_code;
2156     *lengthptr += code - last_code;
2157     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2158    
2159     /* If "previous" is set and it is not at the start of the work space, move
2160     it back to there, in order to avoid filling up the work space. Otherwise,
2161     if "previous" is NULL, reset the current code pointer to the start. */
2162    
2163     if (previous != NULL)
2164     {
2165     if (previous > orig_code)
2166     {
2167     memmove(orig_code, previous, code - previous);
2168     code -= previous - orig_code;
2169     previous = orig_code;
2170     }
2171     }
2172     else code = orig_code;
2173    
2174     /* Remember where this code item starts so we can pick up the length
2175     next time round. */
2176    
2177     last_code = code;
2178     }
2179    
2180     /* In the real compile phase, just check the workspace used by the forward
2181     reference list. */
2182    
2183     else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2184     {
2185     *errorcodeptr = ERR52;
2186     goto FAILED;
2187     }
2188    
2189 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
2190    
2191     if (inescq && c != 0)
2192     {
2193     if (c == '\\' && ptr[1] == 'E')
2194     {
2195     inescq = FALSE;
2196     ptr++;
2197     continue;
2198     }
2199     else
2200     {
2201     if (previous_callout != NULL)
2202     {
2203 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2204     complete_callout(previous_callout, ptr, cd);
2205 nigel 77 previous_callout = NULL;
2206     }
2207     if ((options & PCRE_AUTO_CALLOUT) != 0)
2208     {
2209     previous_callout = code;
2210     code = auto_callout(code, ptr, cd);
2211     }
2212     goto NORMAL_CHAR;
2213     }
2214     }
2215    
2216     /* Fill in length of a previous callout, except when the next thing is
2217     a quantifier. */
2218    
2219     is_quantifier = c == '*' || c == '+' || c == '?' ||
2220     (c == '{' && is_counted_repeat(ptr+1));
2221    
2222     if (!is_quantifier && previous_callout != NULL &&
2223     after_manual_callout-- <= 0)
2224     {
2225 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2226     complete_callout(previous_callout, ptr, cd);
2227 nigel 77 previous_callout = NULL;
2228     }
2229    
2230     /* In extended mode, skip white space and comments */
2231    
2232     if ((options & PCRE_EXTENDED) != 0)
2233     {
2234     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2235     if (c == '#')
2236     {
2237 nigel 93 while (*(++ptr) != 0)
2238 nigel 91 {
2239 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2240 nigel 91 }
2241 nigel 93 if (*ptr != 0) continue;
2242    
2243 nigel 91 /* Else fall through to handle end of string */
2244     c = 0;
2245 nigel 77 }
2246     }
2247    
2248     /* No auto callout for quantifiers. */
2249    
2250     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2251     {
2252     previous_callout = code;
2253     code = auto_callout(code, ptr, cd);
2254     }
2255    
2256     switch(c)
2257     {
2258 nigel 93 /* ===================================================================*/
2259     case 0: /* The branch terminates at string end */
2260     case '|': /* or | or ) */
2261 nigel 77 case ')':
2262     *firstbyteptr = firstbyte;
2263     *reqbyteptr = reqbyte;
2264     *codeptr = code;
2265     *ptrptr = ptr;
2266 nigel 93 if (lengthptr != NULL)
2267     {
2268     *lengthptr += code - last_code; /* To include callout length */
2269     DPRINTF((">> end branch\n"));
2270     }
2271 nigel 77 return TRUE;
2272    
2273 nigel 93
2274     /* ===================================================================*/
2275 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
2276     the setting of any following char as a first character. */
2277    
2278     case '^':
2279     if ((options & PCRE_MULTILINE) != 0)
2280     {
2281     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2282     }
2283     previous = NULL;
2284     *code++ = OP_CIRC;
2285     break;
2286    
2287     case '$':
2288     previous = NULL;
2289     *code++ = OP_DOLL;
2290     break;
2291    
2292     /* There can never be a first char if '.' is first, whatever happens about
2293     repeats. The value of reqbyte doesn't change either. */
2294    
2295     case '.':
2296     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2297     zerofirstbyte = firstbyte;
2298     zeroreqbyte = reqbyte;
2299     previous = code;
2300     *code++ = OP_ANY;
2301     break;
2302    
2303 nigel 93
2304     /* ===================================================================*/
2305 nigel 87 /* Character classes. If the included characters are all < 256, we build a
2306     32-byte bitmap of the permitted characters, except in the special case
2307     where there is only one such character. For negated classes, we build the
2308     map as usual, then invert it at the end. However, we use a different opcode
2309     so that data characters > 255 can be handled correctly.
2310 nigel 77
2311     If the class contains characters outside the 0-255 range, a different
2312     opcode is compiled. It may optionally have a bit map for characters < 256,
2313     but those above are explicitly listed afterwards. A flag byte tells
2314     whether the bitmap is present, and whether this is a negated class or not.
2315     */
2316    
2317     case '[':
2318     previous = code;
2319    
2320     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2321     they are encountered at the top level, so we'll do that too. */
2322    
2323     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2324     check_posix_syntax(ptr, &tempptr, cd))
2325     {
2326     *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2327     goto FAILED;
2328     }
2329    
2330     /* If the first character is '^', set the negation flag and skip it. */
2331    
2332     if ((c = *(++ptr)) == '^')
2333     {
2334     negate_class = TRUE;
2335     c = *(++ptr);
2336     }
2337     else
2338     {
2339     negate_class = FALSE;
2340     }
2341    
2342     /* Keep a count of chars with values < 256 so that we can optimize the case
2343 nigel 93 of just a single character (as long as it's < 256). However, For higher
2344     valued UTF-8 characters, we don't yet do any optimization. */
2345 nigel 77
2346     class_charcount = 0;
2347     class_lastchar = -1;
2348    
2349 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
2350     temporary bit of memory, in case the class contains only 1 character (less
2351     than 256), because in that case the compiled code doesn't use the bit map.
2352     */
2353    
2354     memset(classbits, 0, 32 * sizeof(uschar));
2355    
2356 nigel 77 #ifdef SUPPORT_UTF8
2357     class_utf8 = FALSE; /* No chars >= 256 */
2358 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2359 nigel 77 #endif
2360    
2361     /* Process characters until ] is reached. By writing this as a "do" it
2362 nigel 93 means that an initial ] is taken as a data character. At the start of the
2363     loop, c contains the first byte of the character. */
2364 nigel 77
2365 nigel 93 if (c != 0) do
2366 nigel 77 {
2367 nigel 93 const uschar *oldptr;
2368    
2369 nigel 77 #ifdef SUPPORT_UTF8
2370     if (utf8 && c > 127)
2371     { /* Braces are required because the */
2372     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2373     }
2374     #endif
2375    
2376     /* Inside \Q...\E everything is literal except \E */
2377    
2378     if (inescq)
2379     {
2380 nigel 93 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2381 nigel 77 {
2382 nigel 93 inescq = FALSE; /* Reset literal state */
2383     ptr++; /* Skip the 'E' */
2384     continue; /* Carry on with next */
2385 nigel 77 }
2386 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
2387 nigel 77 }
2388    
2389     /* Handle POSIX class names. Perl allows a negation extension of the
2390     form [:^name:]. A square bracket that doesn't match the syntax is
2391     treated as a literal. We also recognize the POSIX constructions
2392     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2393     5.6 and 5.8 do. */
2394    
2395     if (c == '[' &&
2396     (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2397     check_posix_syntax(ptr, &tempptr, cd))
2398     {
2399     BOOL local_negate = FALSE;
2400 nigel 87 int posix_class, taboffset, tabopt;
2401 nigel 77 register const uschar *cbits = cd->cbits;
2402 nigel 87 uschar pbits[32];
2403 nigel 77
2404     if (ptr[1] != ':')
2405     {
2406     *errorcodeptr = ERR31;
2407     goto FAILED;
2408     }
2409    
2410     ptr += 2;
2411     if (*ptr == '^')
2412     {
2413     local_negate = TRUE;
2414     ptr++;
2415     }
2416    
2417     posix_class = check_posix_name(ptr, tempptr - ptr);
2418     if (posix_class < 0)
2419     {
2420     *errorcodeptr = ERR30;
2421     goto FAILED;
2422     }
2423    
2424     /* If matching is caseless, upper and lower are converted to
2425     alpha. This relies on the fact that the class table starts with
2426     alpha, lower, upper as the first 3 entries. */
2427    
2428     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2429     posix_class = 0;
2430    
2431 nigel 87 /* We build the bit map for the POSIX class in a chunk of local store
2432     because we may be adding and subtracting from it, and we don't want to
2433     subtract bits that may be in the main map already. At the end we or the
2434     result into the bit map that is being built. */
2435 nigel 77
2436     posix_class *= 3;
2437 nigel 87
2438     /* Copy in the first table (always present) */
2439    
2440     memcpy(pbits, cbits + posix_class_maps[posix_class],
2441     32 * sizeof(uschar));
2442    
2443     /* If there is a second table, add or remove it as required. */
2444    
2445     taboffset = posix_class_maps[posix_class + 1];
2446     tabopt = posix_class_maps[posix_class + 2];
2447    
2448     if (taboffset >= 0)
2449 nigel 77 {
2450 nigel 87 if (tabopt >= 0)
2451     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2452 nigel 77 else
2453 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2454 nigel 77 }
2455    
2456 nigel 87 /* Now see if we need to remove any special characters. An option
2457     value of 1 removes vertical space and 2 removes underscore. */
2458    
2459     if (tabopt < 0) tabopt = -tabopt;
2460     if (tabopt == 1) pbits[1] &= ~0x3c;
2461     else if (tabopt == 2) pbits[11] &= 0x7f;
2462    
2463     /* Add the POSIX table or its complement into the main table that is
2464     being built and we are done. */
2465    
2466     if (local_negate)
2467     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2468     else
2469     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2470    
2471 nigel 77 ptr = tempptr + 1;
2472     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2473     continue; /* End of POSIX syntax handling */
2474     }
2475    
2476     /* Backslash may introduce a single character, or it may introduce one
2477 nigel 93 of the specials, which just set a flag. The sequence \b is a special
2478     case. Inside a class (and only there) it is treated as backspace.
2479     Elsewhere it marks a word boundary. Other escapes have preset maps ready
2480     to or into the one we are building. We assume they have more than one
2481 nigel 77 character in them, so set class_charcount bigger than one. */
2482    
2483     if (c == '\\')
2484     {
2485 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2486     if (*errorcodeptr != 0) goto FAILED;
2487 nigel 77
2488     if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2489     else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2490 nigel 93 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2491 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
2492     {
2493     if (ptr[1] == '\\' && ptr[2] == 'E')
2494     {
2495     ptr += 2; /* avoid empty string */
2496     }
2497     else inescq = TRUE;
2498     continue;
2499     }
2500    
2501     if (c < 0)
2502     {
2503     register const uschar *cbits = cd->cbits;
2504     class_charcount += 2; /* Greater than 1 is what matters */
2505 nigel 93
2506     /* Save time by not doing this in the pre-compile phase. */
2507    
2508     if (lengthptr == NULL) switch (-c)
2509 nigel 77 {
2510     case ESC_d:
2511     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2512     continue;
2513    
2514     case ESC_D:
2515     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2516     continue;
2517    
2518     case ESC_w:
2519     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2520     continue;
2521    
2522     case ESC_W:
2523     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2524     continue;
2525    
2526     case ESC_s:
2527     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2528     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2529     continue;
2530    
2531     case ESC_S:
2532     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2533     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2534     continue;
2535    
2536 nigel 93 case ESC_E: /* Perl ignores an orphan \E */
2537     continue;
2538 ph10 178
2539 nigel 93 default: /* Not recognized; fall through */
2540     break; /* Need "default" setting to stop compiler warning. */
2541     }
2542    
2543     /* In the pre-compile phase, just do the recognition. */
2544    
2545     else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2546     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2547 ph10 178
2548     /* We need to deal with \H, \h, \V, and \v in both phases because
2549     they use extra memory. */
2550    
2551     if (-c == ESC_h)
2552     {
2553     SETBIT(classbits, 0x09); /* VT */
2554     SETBIT(classbits, 0x20); /* SPACE */
2555     SETBIT(classbits, 0xa0); /* NSBP */
2556     #ifdef SUPPORT_UTF8
2557     if (utf8)
2558     {
2559     class_utf8 = TRUE;
2560     *class_utf8data++ = XCL_SINGLE;
2561     class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2562     *class_utf8data++ = XCL_SINGLE;
2563     class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2564     *class_utf8data++ = XCL_RANGE;
2565     class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2566     class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2567     *class_utf8data++ = XCL_SINGLE;
2568     class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2569     *class_utf8data++ = XCL_SINGLE;
2570     class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2571     *class_utf8data++ = XCL_SINGLE;
2572     class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2573     }
2574     #endif
2575     continue;
2576     }
2577 nigel 93
2578 ph10 178 if (-c == ESC_H)
2579     {
2580     for (c = 0; c < 32; c++)
2581     {
2582     int x = 0xff;
2583     switch (c)
2584     {
2585     case 0x09/8: x ^= 1 << (0x09%8); break;
2586     case 0x20/8: x ^= 1 << (0x20%8); break;
2587     case 0xa0/8: x ^= 1 << (0xa0%8); break;
2588     default: break;
2589     }
2590     classbits[c] |= x;
2591     }
2592    
2593     #ifdef SUPPORT_UTF8
2594     if (utf8)
2595     {
2596     class_utf8 = TRUE;
2597     *class_utf8data++ = XCL_RANGE;
2598     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2599     class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2600     *class_utf8data++ = XCL_RANGE;
2601     class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2602     class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2603     *class_utf8data++ = XCL_RANGE;
2604     class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2605     class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2606     *class_utf8data++ = XCL_RANGE;
2607     class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2608     class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2609     *class_utf8data++ = XCL_RANGE;
2610     class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2611     class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2612     *class_utf8data++ = XCL_RANGE;
2613     class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2614     class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2615     *class_utf8data++ = XCL_RANGE;
2616     class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2617     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2618     }
2619     #endif
2620     continue;
2621     }
2622    
2623     if (-c == ESC_v)
2624     {
2625     SETBIT(classbits, 0x0a); /* LF */
2626     SETBIT(classbits, 0x0b); /* VT */
2627     SETBIT(classbits, 0x0c); /* FF */
2628     SETBIT(classbits, 0x0d); /* CR */
2629     SETBIT(classbits, 0x85); /* NEL */
2630     #ifdef SUPPORT_UTF8
2631     if (utf8)
2632     {
2633     class_utf8 = TRUE;
2634     *class_utf8data++ = XCL_RANGE;
2635     class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2636     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2637     }
2638     #endif
2639     continue;
2640     }
2641    
2642     if (-c == ESC_V)
2643     {
2644     for (c = 0; c < 32; c++)
2645     {
2646     int x = 0xff;
2647     switch (c)
2648     {
2649     case 0x0a/8: x ^= 1 << (0x0a%8);
2650     x ^= 1 << (0x0b%8);
2651     x ^= 1 << (0x0c%8);
2652     x ^= 1 << (0x0d%8);
2653     break;
2654     case 0x85/8: x ^= 1 << (0x85%8); break;
2655     default: break;
2656     }
2657     classbits[c] |= x;
2658     }
2659    
2660     #ifdef SUPPORT_UTF8
2661     if (utf8)
2662     {
2663     class_utf8 = TRUE;
2664     *class_utf8data++ = XCL_RANGE;
2665     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2666     class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2667     *class_utf8data++ = XCL_RANGE;
2668     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2669     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2670     }
2671     #endif
2672     continue;
2673     }
2674    
2675 nigel 93 /* We need to deal with \P and \p in both phases. */
2676    
2677 nigel 77 #ifdef SUPPORT_UCP
2678 nigel 93 if (-c == ESC_p || -c == ESC_P)
2679     {
2680     BOOL negated;
2681     int pdata;
2682     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2683     if (ptype < 0) goto FAILED;
2684     class_utf8 = TRUE;
2685     *class_utf8data++ = ((-c == ESC_p) != negated)?
2686     XCL_PROP : XCL_NOTPROP;
2687     *class_utf8data++ = ptype;
2688     *class_utf8data++ = pdata;
2689     class_charcount -= 2; /* Not a < 256 character */
2690 nigel 77 continue;
2691 nigel 93 }
2692 nigel 77 #endif
2693 nigel 93 /* Unrecognized escapes are faulted if PCRE is running in its
2694     strict mode. By default, for compatibility with Perl, they are
2695     treated as literals. */
2696 nigel 77
2697 nigel 93 if ((options & PCRE_EXTRA) != 0)
2698     {
2699     *errorcodeptr = ERR7;
2700     goto FAILED;
2701     }
2702 nigel 77
2703 nigel 93 class_charcount -= 2; /* Undo the default count from above */
2704     c = *ptr; /* Get the final character and fall through */
2705 nigel 77 }
2706    
2707     /* Fall through if we have a single character (c >= 0). This may be
2708 nigel 93 greater than 256 in UTF-8 mode. */
2709 nigel 77
2710     } /* End of backslash handling */
2711    
2712     /* A single character may be followed by '-' to form a range. However,
2713     Perl does not permit ']' to be the end of the range. A '-' character
2714 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
2715     entirely. The code for handling \Q and \E is messy. */
2716 nigel 77
2717 nigel 93 CHECK_RANGE:
2718     while (ptr[1] == '\\' && ptr[2] == 'E')
2719 nigel 77 {
2720 nigel 93 inescq = FALSE;
2721     ptr += 2;
2722     }
2723    
2724     oldptr = ptr;
2725    
2726     if (!inescq && ptr[1] == '-')
2727     {
2728 nigel 77 int d;
2729     ptr += 2;
2730 nigel 93 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2731 nigel 77
2732 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
2733     mode. */
2734    
2735     while (*ptr == '\\' && ptr[1] == 'Q')
2736     {
2737     ptr += 2;
2738     if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2739     inescq = TRUE;
2740     break;
2741     }
2742    
2743     if (*ptr == 0 || (!inescq && *ptr == ']'))
2744     {
2745     ptr = oldptr;
2746     goto LONE_SINGLE_CHARACTER;
2747     }
2748    
2749 nigel 77 #ifdef SUPPORT_UTF8
2750     if (utf8)
2751     { /* Braces are required because the */
2752     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2753     }
2754     else
2755     #endif
2756     d = *ptr; /* Not UTF-8 mode */
2757    
2758     /* The second part of a range can be a single-character escape, but
2759     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2760     in such circumstances. */
2761    
2762 nigel 93 if (!inescq && d == '\\')
2763 nigel 77 {
2764 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2765     if (*errorcodeptr != 0) goto FAILED;
2766 nigel 77
2767 nigel 93 /* \b is backspace; \X is literal X; \R is literal R; any other
2768     special means the '-' was literal */
2769 nigel 77
2770     if (d < 0)
2771     {
2772     if (d == -ESC_b) d = '\b';
2773 nigel 93 else if (d == -ESC_X) d = 'X';
2774     else if (d == -ESC_R) d = 'R'; else
2775 nigel 77 {
2776 nigel 93 ptr = oldptr;
2777 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2778     }
2779     }
2780     }
2781    
2782 nigel 93 /* Check that the two values are in the correct order. Optimize
2783     one-character ranges */
2784 nigel 77
2785 nigel 93 if (d < c)
2786     {
2787     *errorcodeptr = ERR8;
2788     goto FAILED;
2789     }
2790    
2791 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2792    
2793     /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2794     matching, we have to use an XCLASS with extra data items. Caseless
2795     matching for characters > 127 is available only if UCP support is
2796     available. */
2797    
2798     #ifdef SUPPORT_UTF8
2799     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2800     {
2801     class_utf8 = TRUE;
2802    
2803     /* With UCP support, we can find the other case equivalents of
2804     the relevant characters. There may be several ranges. Optimize how
2805     they fit with the basic range. */
2806    
2807     #ifdef SUPPORT_UCP
2808     if ((options & PCRE_CASELESS) != 0)
2809     {
2810 nigel 93 unsigned int occ, ocd;
2811     unsigned int cc = c;
2812     unsigned int origd = d;
2813 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
2814     {
2815 ph10 176 if (occ >= (unsigned int)c &&
2816     ocd <= (unsigned int)d)
2817     continue; /* Skip embedded ranges */
2818 nigel 77
2819 ph10 176 if (occ < (unsigned int)c &&
2820     ocd >= (unsigned int)c - 1) /* Extend the basic range */
2821 nigel 77 { /* if there is overlap, */
2822     c = occ; /* noting that if occ < c */
2823     continue; /* we can't have ocd > d */
2824     } /* because a subrange is */
2825 ph10 176 if (ocd > (unsigned int)d &&
2826     occ <= (unsigned int)d + 1) /* always shorter than */
2827 nigel 77 { /* the basic range. */
2828     d = ocd;
2829     continue;
2830     }
2831    
2832     if (occ == ocd)
2833     {
2834     *class_utf8data++ = XCL_SINGLE;
2835     }
2836     else
2837     {
2838     *class_utf8data++ = XCL_RANGE;
2839     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2840     }
2841     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2842     }
2843     }
2844     #endif /* SUPPORT_UCP */
2845    
2846     /* Now record the original range, possibly modified for UCP caseless
2847     overlapping ranges. */
2848    
2849     *class_utf8data++ = XCL_RANGE;
2850     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2851     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2852    
2853     /* With UCP support, we are done. Without UCP support, there is no
2854     caseless matching for UTF-8 characters > 127; we can use the bit map
2855     for the smaller ones. */
2856    
2857     #ifdef SUPPORT_UCP
2858     continue; /* With next character in the class */
2859     #else
2860     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2861    
2862     /* Adjust upper limit and fall through to set up the map */
2863    
2864     d = 127;
2865    
2866     #endif /* SUPPORT_UCP */
2867     }
2868     #endif /* SUPPORT_UTF8 */
2869    
2870     /* We use the bit map for all cases when not in UTF-8 mode; else
2871     ranges that lie entirely within 0-127 when there is UCP support; else
2872     for partial ranges without UCP support. */
2873    
2874 nigel 93 class_charcount += d - c + 1;
2875     class_lastchar = d;
2876    
2877     /* We can save a bit of time by skipping this in the pre-compile. */
2878    
2879     if (lengthptr == NULL) for (; c <= d; c++)
2880 nigel 77 {
2881     classbits[c/8] |= (1 << (c&7));
2882     if ((options & PCRE_CASELESS) != 0)
2883     {
2884     int uc = cd->fcc[c]; /* flip case */
2885     classbits[uc/8] |= (1 << (uc&7));
2886     }
2887     }
2888    
2889     continue; /* Go get the next char in the class */
2890     }
2891    
2892     /* Handle a lone single character - we can get here for a normal
2893     non-escape char, or after \ that introduces a single character or for an
2894     apparent range that isn't. */
2895    
2896     LONE_SINGLE_CHARACTER:
2897    
2898     /* Handle a character that cannot go in the bit map */
2899    
2900     #ifdef SUPPORT_UTF8
2901     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2902     {
2903     class_utf8 = TRUE;
2904     *class_utf8data++ = XCL_SINGLE;
2905     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2906    
2907     #ifdef SUPPORT_UCP
2908     if ((options & PCRE_CASELESS) != 0)
2909     {
2910 nigel 93 unsigned int othercase;
2911     if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
2912 nigel 77 {
2913     *class_utf8data++ = XCL_SINGLE;
2914     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
2915     }
2916     }
2917     #endif /* SUPPORT_UCP */
2918    
2919     }
2920     else
2921     #endif /* SUPPORT_UTF8 */
2922    
2923     /* Handle a single-byte character */
2924     {
2925     classbits[c/8] |= (1 << (c&7));
2926     if ((options & PCRE_CASELESS) != 0)
2927     {
2928     c = cd->fcc[c]; /* flip case */
2929     classbits[c/8] |= (1 << (c&7));
2930     }
2931     class_charcount++;
2932     class_lastchar = c;
2933     }
2934     }
2935    
2936 nigel 93 /* Loop until ']' reached. This "while" is the end of the "do" above. */
2937 nigel 77
2938 nigel 93 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
2939 nigel 77
2940 nigel 93 if (c == 0) /* Missing terminating ']' */
2941     {
2942     *errorcodeptr = ERR6;
2943     goto FAILED;
2944     }
2945    
2946 nigel 77 /* If class_charcount is 1, we saw precisely one character whose value is
2947     less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2948     can optimize the negative case only if there were no characters >= 128
2949     because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2950     single-bytes only. This is an historical hangover. Maybe one day we can
2951     tidy these opcodes to handle multi-byte characters.
2952    
2953     The optimization throws away the bit map. We turn the item into a
2954     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2955     that OP_NOT does not support multibyte characters. In the positive case, it
2956     can cause firstbyte to be set. Otherwise, there can be no first char if
2957     this item is first, whatever repeat count may follow. In the case of
2958     reqbyte, save the previous value for reinstating. */
2959    
2960     #ifdef SUPPORT_UTF8
2961     if (class_charcount == 1 &&
2962     (!utf8 ||
2963     (!class_utf8 && (!negate_class || class_lastchar < 128))))
2964    
2965     #else
2966     if (class_charcount == 1)
2967     #endif
2968     {
2969     zeroreqbyte = reqbyte;
2970    
2971     /* The OP_NOT opcode works on one-byte characters only. */
2972    
2973     if (negate_class)
2974     {
2975     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2976     zerofirstbyte = firstbyte;
2977     *code++ = OP_NOT;
2978     *code++ = class_lastchar;
2979     break;
2980     }
2981    
2982     /* For a single, positive character, get the value into mcbuffer, and
2983     then we can handle this with the normal one-character code. */
2984    
2985     #ifdef SUPPORT_UTF8
2986     if (utf8 && class_lastchar > 127)
2987     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
2988     else
2989     #endif
2990     {
2991     mcbuffer[0] = class_lastchar;
2992     mclength = 1;
2993     }
2994     goto ONE_CHAR;
2995     } /* End of 1-char optimization */
2996    
2997     /* The general case - not the one-char optimization. If this is the first
2998     thing in the branch, there can be no first char setting, whatever the
2999     repeat count. Any reqbyte setting must remain unchanged after any kind of
3000     repeat. */
3001    
3002     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3003     zerofirstbyte = firstbyte;
3004     zeroreqbyte = reqbyte;
3005    
3006     /* If there are characters with values > 255, we have to compile an
3007     extended class, with its own opcode. If there are no characters < 256,
3008 nigel 93 we can omit the bitmap in the actual compiled code. */
3009 nigel 77
3010     #ifdef SUPPORT_UTF8
3011     if (class_utf8)
3012     {
3013     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3014     *code++ = OP_XCLASS;
3015     code += LINK_SIZE;
3016     *code = negate_class? XCL_NOT : 0;
3017    
3018 nigel 93 /* If the map is required, move up the extra data to make room for it;
3019     otherwise just move the code pointer to the end of the extra data. */
3020 nigel 77
3021     if (class_charcount > 0)
3022     {
3023     *code++ |= XCL_MAP;
3024 nigel 93 memmove(code + 32, code, class_utf8data - code);
3025 nigel 77 memcpy(code, classbits, 32);
3026 nigel 93 code = class_utf8data + 32;
3027 nigel 77 }
3028 nigel 93 else code = class_utf8data;
3029 nigel 77
3030     /* Now fill in the complete length of the item */
3031    
3032     PUT(previous, 1, code - previous);
3033     break; /* End of class handling */
3034     }
3035     #endif
3036    
3037     /* If there are no characters > 255, negate the 32-byte map if necessary,
3038     and copy it into the code vector. If this is the first thing in the branch,
3039     there can be no first char setting, whatever the repeat count. Any reqbyte
3040     setting must remain unchanged after any kind of repeat. */
3041    
3042     if (negate_class)
3043     {
3044     *code++ = OP_NCLASS;
3045 nigel 93 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3046     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3047 nigel 77 }
3048     else
3049     {
3050     *code++ = OP_CLASS;
3051     memcpy(code, classbits, 32);
3052     }
3053     code += 32;
3054     break;
3055    
3056 nigel 93
3057     /* ===================================================================*/
3058 nigel 77 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3059     has been tested above. */
3060    
3061     case '{':
3062     if (!is_quantifier) goto NORMAL_CHAR;
3063     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3064     if (*errorcodeptr != 0) goto FAILED;
3065     goto REPEAT;
3066    
3067     case '*':
3068     repeat_min = 0;
3069     repeat_max = -1;
3070     goto REPEAT;
3071    
3072     case '+':
3073     repeat_min = 1;
3074     repeat_max = -1;
3075     goto REPEAT;
3076    
3077     case '?':
3078     repeat_min = 0;
3079     repeat_max = 1;
3080    
3081     REPEAT:
3082     if (previous == NULL)
3083     {
3084     *errorcodeptr = ERR9;
3085     goto FAILED;
3086     }
3087    
3088     if (repeat_min == 0)
3089     {
3090     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3091     reqbyte = zeroreqbyte; /* Ditto */
3092     }
3093    
3094     /* Remember whether this is a variable length repeat */
3095    
3096     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3097    
3098     op_type = 0; /* Default single-char op codes */
3099     possessive_quantifier = FALSE; /* Default not possessive quantifier */
3100    
3101     /* Save start of previous item, in case we have to move it up to make space
3102     for an inserted OP_ONCE for the additional '+' extension. */
3103    
3104     tempcode = previous;
3105    
3106     /* If the next character is '+', we have a possessive quantifier. This
3107     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3108     If the next character is '?' this is a minimizing repeat, by default,
3109     but if PCRE_UNGREEDY is set, it works the other way round. We change the
3110     repeat type to the non-default. */
3111    
3112     if (ptr[1] == '+')
3113     {
3114     repeat_type = 0; /* Force greedy */
3115     possessive_quantifier = TRUE;
3116     ptr++;
3117     }
3118     else if (ptr[1] == '?')
3119     {
3120     repeat_type = greedy_non_default;
3121     ptr++;
3122     }
3123     else repeat_type = greedy_default;
3124    
3125     /* If previous was a character match, abolish the item and generate a
3126     repeat item instead. If a char item has a minimum of more than one, ensure
3127     that it is set in reqbyte - it might not be if a sequence such as x{3} is
3128     the first thing in a branch because the x will have gone into firstbyte
3129     instead. */
3130    
3131     if (*previous == OP_CHAR || *previous == OP_CHARNC)
3132     {
3133     /* Deal with UTF-8 characters that take up more than one byte. It's
3134     easier to write this out separately than try to macrify it. Use c to
3135     hold the length of the character in bytes, plus 0x80 to flag that it's a
3136     length rather than a small character. */
3137    
3138     #ifdef SUPPORT_UTF8
3139     if (utf8 && (code[-1] & 0x80) != 0)
3140     {
3141     uschar *lastchar = code - 1;
3142     while((*lastchar & 0xc0) == 0x80) lastchar--;
3143     c = code - lastchar; /* Length of UTF-8 character */
3144     memcpy(utf8_char, lastchar, c); /* Save the char */
3145     c |= 0x80; /* Flag c as a length */
3146     }
3147     else
3148     #endif
3149    
3150     /* Handle the case of a single byte - either with no UTF8 support, or
3151     with UTF-8 disabled, or for a UTF-8 character < 128. */
3152    
3153     {
3154     c = code[-1];
3155     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3156     }
3157    
3158 nigel 93 /* If the repetition is unlimited, it pays to see if the next thing on
3159     the line is something that cannot possibly match this character. If so,
3160     automatically possessifying this item gains some performance in the case
3161     where the match fails. */
3162    
3163     if (!possessive_quantifier &&
3164     repeat_max < 0 &&
3165     check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3166     options, cd))
3167     {
3168     repeat_type = 0; /* Force greedy */
3169     possessive_quantifier = TRUE;
3170     }
3171    
3172 nigel 77 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3173     }
3174    
3175     /* If previous was a single negated character ([^a] or similar), we use
3176     one of the special opcodes, replacing it. The code is shared with single-
3177     character repeats by setting op_type to add a suitable offset into
3178 nigel 93 repeat_type. We can also test for auto-possessification. OP_NOT is
3179     currently used only for single-byte chars. */
3180 nigel 77
3181     else if (*previous == OP_NOT)
3182     {
3183     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3184     c = previous[1];
3185 nigel 93 if (!possessive_quantifier &&
3186     repeat_max < 0 &&
3187     check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3188     {
3189     repeat_type = 0; /* Force greedy */
3190     possessive_quantifier = TRUE;
3191     }
3192 nigel 77 goto OUTPUT_SINGLE_REPEAT;
3193     }
3194    
3195     /* If previous was a character type match (\d or similar), abolish it and
3196     create a suitable repeat item. The code is shared with single-character
3197     repeats by setting op_type to add a suitable offset into repeat_type. Note
3198     that the Unicode property types will be present only when SUPPORT_UCP is
3199     defined, but we don't wrap the little bits of code here because it just
3200     makes it horribly messy. */
3201    
3202     else if (*previous < OP_EODN)
3203     {
3204     uschar *oldcode;
3205 nigel 87 int prop_type, prop_value;
3206 nigel 77 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3207     c = *previous;
3208    
3209 nigel 93 if (!possessive_quantifier &&
3210     repeat_max < 0 &&
3211     check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3212     {
3213     repeat_type = 0; /* Force greedy */
3214     possessive_quantifier = TRUE;
3215     }
3216    
3217 nigel 77 OUTPUT_SINGLE_REPEAT:
3218 nigel 87 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3219     {
3220     prop_type = previous[1];
3221     prop_value = previous[2];
3222     }
3223     else prop_type = prop_value = -1;
3224 nigel 77
3225     oldcode = code;
3226     code = previous; /* Usually overwrite previous item */
3227    
3228     /* If the maximum is zero then the minimum must also be zero; Perl allows
3229     this case, so we do too - by simply omitting the item altogether. */
3230    
3231     if (repeat_max == 0) goto END_REPEAT;
3232    
3233     /* All real repeats make it impossible to handle partial matching (maybe
3234     one day we will be able to remove this restriction). */
3235    
3236     if (repeat_max != 1) cd->nopartial = TRUE;
3237    
3238     /* Combine the op_type with the repeat_type */
3239    
3240     repeat_type += op_type;
3241    
3242     /* A minimum of zero is handled either as the special case * or ?, or as
3243     an UPTO, with the maximum given. */
3244    
3245     if (repeat_min == 0)
3246     {
3247     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3248     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3249     else
3250     {
3251     *code++ = OP_UPTO + repeat_type;
3252     PUT2INC(code, 0, repeat_max);
3253     }
3254     }
3255    
3256     /* A repeat minimum of 1 is optimized into some special cases. If the
3257 nigel 93 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3258 nigel 77 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3259     one less than the maximum. */
3260    
3261     else if (repeat_min == 1)
3262     {
3263     if (repeat_max == -1)
3264     *code++ = OP_PLUS + repeat_type;
3265     else
3266     {
3267     code = oldcode; /* leave previous item in place */
3268     if (repeat_max == 1) goto END_REPEAT;
3269     *code++ = OP_UPTO + repeat_type;
3270     PUT2INC(code, 0, repeat_max - 1);
3271     }
3272     }
3273    
3274     /* The case {n,n} is just an EXACT, while the general case {n,m} is
3275     handled as an EXACT followed by an UPTO. */
3276    
3277     else
3278     {
3279     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3280     PUT2INC(code, 0, repeat_min);
3281    
3282     /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3283     we have to insert the character for the previous code. For a repeated
3284 nigel 87 Unicode property match, there are two extra bytes that define the
3285 nigel 77 required property. In UTF-8 mode, long characters have their length in
3286     c, with the 0x80 bit as a flag. */
3287    
3288     if (repeat_max < 0)
3289     {
3290     #ifdef SUPPORT_UTF8
3291     if (utf8 && c >= 128)
3292     {
3293     memcpy(code, utf8_char, c & 7);
3294     code += c & 7;
3295     }
3296     else
3297     #endif
3298     {
3299     *code++ = c;
3300 nigel 87 if (prop_type >= 0)
3301     {
3302     *code++ = prop_type;
3303     *code++ = prop_value;
3304     }
3305 nigel 77 }
3306     *code++ = OP_STAR + repeat_type;
3307     }
3308    
3309     /* Else insert an UPTO if the max is greater than the min, again
3310 nigel 93 preceded by the character, for the previously inserted code. If the
3311     UPTO is just for 1 instance, we can use QUERY instead. */
3312 nigel 77
3313     else if (repeat_max != repeat_min)
3314     {
3315     #ifdef SUPPORT_UTF8
3316     if (utf8 && c >= 128)
3317     {
3318     memcpy(code, utf8_char, c & 7);
3319     code += c & 7;
3320     }
3321     else
3322     #endif
3323     *code++ = c;
3324 nigel 87 if (prop_type >= 0)
3325     {
3326     *code++ = prop_type;
3327     *code++ = prop_value;
3328     }
3329 nigel 77 repeat_max -= repeat_min;
3330 nigel 93
3331     if (repeat_max == 1)
3332     {
3333     *code++ = OP_QUERY + repeat_type;
3334     }
3335     else
3336     {
3337     *code++ = OP_UPTO + repeat_type;
3338     PUT2INC(code, 0, repeat_max);
3339     }
3340 nigel 77 }
3341     }
3342    
3343     /* The character or character type itself comes last in all cases. */
3344    
3345     #ifdef SUPPORT_UTF8
3346     if (utf8 && c >= 128)
3347     {
3348     memcpy(code, utf8_char, c & 7);
3349     code += c & 7;
3350     }
3351     else
3352     #endif
3353     *code++ = c;
3354    
3355 nigel 87 /* For a repeated Unicode property match, there are two extra bytes that
3356     define the required property. */
3357 nigel 77
3358     #ifdef SUPPORT_UCP
3359 nigel 87 if (prop_type >= 0)
3360     {
3361     *code++ = prop_type;
3362     *code++ = prop_value;
3363     }
3364 nigel 77 #endif
3365     }
3366    
3367     /* If previous was a character class or a back reference, we put the repeat
3368     stuff after it, but just skip the item if the repeat was {0,0}. */
3369    
3370     else if (*previous == OP_CLASS ||
3371     *previous == OP_NCLASS ||
3372     #ifdef SUPPORT_UTF8
3373     *previous == OP_XCLASS ||
3374     #endif
3375     *previous == OP_REF)
3376     {
3377     if (repeat_max == 0)
3378     {
3379     code = previous;
3380     goto END_REPEAT;
3381     }
3382    
3383     /* All real repeats make it impossible to handle partial matching (maybe
3384     one day we will be able to remove this restriction). */
3385    
3386     if (repeat_max != 1) cd->nopartial = TRUE;
3387    
3388     if (repeat_min == 0 && repeat_max == -1)
3389     *code++ = OP_CRSTAR + repeat_type;
3390     else if (repeat_min == 1 && repeat_max == -1)
3391     *code++ = OP_CRPLUS + repeat_type;
3392     else if (repeat_min == 0 && repeat_max == 1)
3393     *code++ = OP_CRQUERY + repeat_type;
3394     else
3395     {
3396     *code++ = OP_CRRANGE + repeat_type;
3397     PUT2INC(code, 0, repeat_min);
3398     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3399     PUT2INC(code, 0, repeat_max);
3400     }
3401     }
3402    
3403     /* If previous was a bracket group, we may have to replicate it in certain
3404     cases. */
3405    
3406 nigel 93 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3407     *previous == OP_ONCE || *previous == OP_COND)
3408 nigel 77 {
3409     register int i;
3410     int ketoffset = 0;
3411     int len = code - previous;
3412     uschar *bralink = NULL;
3413    
3414 nigel 93 /* Repeating a DEFINE group is pointless */
3415    
3416     if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3417     {
3418     *errorcodeptr = ERR55;
3419     goto FAILED;
3420     }
3421    
3422     /* This is a paranoid check to stop integer overflow later on */
3423    
3424     if (len > MAX_DUPLENGTH)
3425     {
3426     *errorcodeptr = ERR50;
3427     goto FAILED;
3428     }
3429    
3430 nigel 77 /* If the maximum repeat count is unlimited, find the end of the bracket
3431     by scanning through from the start, and compute the offset back to it
3432     from the current code pointer. There may be an OP_OPT setting following
3433     the final KET, so we can't find the end just by going back from the code
3434     pointer. */
3435    
3436     if (repeat_max == -1)
3437     {
3438     register uschar *ket = previous;
3439     do ket += GET(ket, 1); while (*ket != OP_KET);
3440     ketoffset = code - ket;
3441     }
3442    
3443     /* The case of a zero minimum is special because of the need to stick
3444     OP_BRAZERO in front of it, and because the group appears once in the
3445     data, whereas in other cases it appears the minimum number of times. For
3446     this reason, it is simplest to treat this case separately, as otherwise
3447     the code gets far too messy. There are several special subcases when the
3448     minimum is zero. */
3449    
3450     if (repeat_min == 0)
3451     {
3452     /* If the maximum is also zero, we just omit the group from the output
3453     altogether. */
3454    
3455     if (repeat_max == 0)
3456     {
3457     code = previous;
3458     goto END_REPEAT;
3459     }
3460    
3461     /* If the maximum is 1 or unlimited, we just have to stick in the
3462     BRAZERO and do no more at this point. However, we do need to adjust
3463     any OP_RECURSE calls inside the group that refer to the group itself or
3464 nigel 93 any internal or forward referenced group, because the offset is from
3465     the start of the whole regex. Temporarily terminate the pattern while
3466     doing this. */
3467 nigel 77
3468     if (repeat_max <= 1)
3469     {
3470     *code = OP_END;
3471 nigel 93 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3472 nigel 77 memmove(previous+1, previous, len);
3473     code++;
3474     *previous++ = OP_BRAZERO + repeat_type;
3475     }
3476    
3477     /* If the maximum is greater than 1 and limited, we have to replicate
3478     in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3479     The first one has to be handled carefully because it's the original
3480     copy, which has to be moved up. The remainder can be handled by code
3481     that is common with the non-zero minimum case below. We have to
3482     adjust the value of repeat_max, since one less copy is required. Once
3483     again, we may have to adjust any OP_RECURSE calls inside the group. */
3484    
3485     else
3486     {
3487     int offset;
3488     *code = OP_END;
3489 nigel 93 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3490 nigel 77 memmove(previous + 2 + LINK_SIZE, previous, len);
3491     code += 2 + LINK_SIZE;
3492     *previous++ = OP_BRAZERO + repeat_type;
3493     *previous++ = OP_BRA;
3494    
3495     /* We chain together the bracket offset fields that have to be
3496     filled in later when the ends of the brackets are reached. */
3497    
3498     offset = (bralink == NULL)? 0 : previous - bralink;
3499     bralink = previous;
3500     PUTINC(previous, 0, offset);
3501     }
3502    
3503     repeat_max--;
3504     }
3505    
3506     /* If the minimum is greater than zero, replicate the group as many
3507     times as necessary, and adjust the maximum to the number of subsequent
3508     copies that we need. If we set a first char from the group, and didn't
3509 nigel 93 set a required char, copy the latter from the former. If there are any
3510     forward reference subroutine calls in the group, there will be entries on
3511     the workspace list; replicate these with an appropriate increment. */
3512 nigel 77
3513     else
3514     {
3515     if (repeat_min > 1)
3516     {
3517 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3518     just adjust the length as if we had. */
3519    
3520     if (lengthptr != NULL)
3521     *lengthptr += (repeat_min - 1)*length_prevgroup;
3522    
3523     /* This is compiling for real */
3524    
3525     else
3526 nigel 77 {
3527 nigel 93 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3528     for (i = 1; i < repeat_min; i++)
3529     {
3530     uschar *hc;
3531     uschar *this_hwm = cd->hwm;
3532     memcpy(code, previous, len);
3533     for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3534     {
3535     PUT(cd->hwm, 0, GET(hc, 0) + len);
3536     cd->hwm += LINK_SIZE;
3537     }
3538     save_hwm = this_hwm;
3539     code += len;
3540     }
3541 nigel 77 }
3542     }
3543 nigel 93
3544 nigel 77 if (repeat_max > 0) repeat_max -= repeat_min;
3545     }
3546    
3547     /* This code is common to both the zero and non-zero minimum cases. If
3548     the maximum is limited, it replicates the group in a nested fashion,
3549     remembering the bracket starts on a stack. In the case of a zero minimum,
3550     the first one was set up above. In all cases the repeat_max now specifies
3551 nigel 93 the number of additional copies needed. Again, we must remember to
3552     replicate entries on the forward reference list. */
3553 nigel 77
3554     if (repeat_max >= 0)
3555     {
3556 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3557     just adjust the length as if we had. For each repetition we must add 1
3558     to the length for BRAZERO and for all but the last repetition we must
3559     add 2 + 2*LINK_SIZE to allow for the nesting that occurs. */
3560    
3561     if (lengthptr != NULL && repeat_max > 0)
3562     *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3563     2 - 2*LINK_SIZE; /* Last one doesn't nest */
3564    
3565     /* This is compiling for real */
3566    
3567     else for (i = repeat_max - 1; i >= 0; i--)
3568 nigel 77 {
3569 nigel 93 uschar *hc;
3570     uschar *this_hwm = cd->hwm;
3571    
3572 nigel 77 *code++ = OP_BRAZERO + repeat_type;
3573    
3574     /* All but the final copy start a new nesting, maintaining the
3575     chain of brackets outstanding. */
3576    
3577     if (i != 0)
3578     {
3579     int offset;
3580     *code++ = OP_BRA;
3581     offset = (bralink == NULL)? 0 : code - bralink;
3582     bralink = code;
3583     PUTINC(code, 0, offset);
3584     }
3585    
3586     memcpy(code, previous, len);
3587 nigel 93 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3588     {
3589     PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3590     cd->hwm += LINK_SIZE;
3591     }
3592     save_hwm = this_hwm;
3593 nigel 77 code += len;
3594     }
3595    
3596     /* Now chain through the pending brackets, and fill in their length
3597     fields (which are holding the chain links pro tem). */
3598    
3599     while (bralink != NULL)
3600     {
3601     int oldlinkoffset;
3602     int offset = code - bralink + 1;
3603     uschar *bra = code - offset;
3604     oldlinkoffset = GET(bra, 1);
3605     bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3606     *code++ = OP_KET;
3607     PUTINC(code, 0, offset);
3608     PUT(bra, 1, offset);
3609     }
3610     }
3611    
3612     /* If the maximum is unlimited, set a repeater in the final copy. We
3613     can't just offset backwards from the current code point, because we
3614     don't know if there's been an options resetting after the ket. The
3615 nigel 93 correct offset was computed above.
3616 nigel 77
3617 nigel 93 Then, when we are doing the actual compile phase, check to see whether
3618     this group is a non-atomic one that could match an empty string. If so,
3619     convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3620     that runtime checking can be done. [This check is also applied to
3621     atomic groups at runtime, but in a different way.] */
3622    
3623     else
3624     {
3625     uschar *ketcode = code - ketoffset;
3626     uschar *bracode = ketcode - GET(ketcode, 1);
3627     *ketcode = OP_KETRMAX + repeat_type;
3628     if (lengthptr == NULL && *bracode != OP_ONCE)
3629     {
3630     uschar *scode = bracode;
3631     do
3632     {
3633     if (could_be_empty_branch(scode, ketcode, utf8))
3634     {
3635     *bracode += OP_SBRA - OP_BRA;
3636     break;
3637     }
3638     scode += GET(scode, 1);
3639     }
3640     while (*scode == OP_ALT);
3641     }
3642     }
3643 nigel 77 }
3644    
3645     /* Else there's some kind of shambles */
3646    
3647     else
3648     {
3649     *errorcodeptr = ERR11;
3650     goto FAILED;
3651     }
3652    
3653 nigel 93 /* If the character following a repeat is '+', or if certain optimization
3654     tests above succeeded, possessive_quantifier is TRUE. For some of the
3655     simpler opcodes, there is a special alternative opcode for this. For
3656     anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3657     The '+' notation is just syntactic sugar, taken from Sun's Java package,
3658     but the special opcodes can optimize it a bit. The repeated item starts at
3659     tempcode, not at previous, which might be the first part of a string whose
3660     (former) last char we repeated.
3661 nigel 77
3662 nigel 93 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3663     an 'upto' may follow. We skip over an 'exact' item, and then test the
3664     length of what remains before proceeding. */
3665    
3666 nigel 77 if (possessive_quantifier)
3667     {
3668 nigel 93 int len;
3669     if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3670     *tempcode == OP_NOTEXACT)
3671     tempcode += _pcre_OP_lengths[*tempcode];
3672     len = code - tempcode;
3673     if (len > 0) switch (*tempcode)
3674     {
3675     case OP_STAR: *tempcode = OP_POSSTAR; break;
3676     case OP_PLUS: *tempcode = OP_POSPLUS; break;
3677     case OP_QUERY: *tempcode = OP_POSQUERY; break;
3678     case OP_UPTO: *tempcode = OP_POSUPTO; break;
3679    
3680     case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
3681     case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
3682     case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3683     case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
3684    
3685     case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
3686     case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
3687     case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3688     case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
3689    
3690     default:
3691     memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3692     code += 1 + LINK_SIZE;
3693     len += 1 + LINK_SIZE;
3694     tempcode[0] = OP_ONCE;
3695     *code++ = OP_KET;
3696     PUTINC(code, 0, len);
3697     PUT(tempcode, 1, len);
3698     break;
3699     }
3700 nigel 77 }
3701    
3702     /* In all cases we no longer have a previous item. We also set the
3703     "follows varying string" flag for subsequently encountered reqbytes if
3704     it isn't already set and we have just passed a varying length item. */
3705    
3706     END_REPEAT:
3707     previous = NULL;
3708     cd->req_varyopt |= reqvary;
3709     break;
3710    
3711    
3712 nigel 93 /* ===================================================================*/
3713     /* Start of nested parenthesized sub-expression, or comment or lookahead or
3714     lookbehind or option setting or condition or all the other extended
3715     parenthesis forms. First deal with the specials; all are introduced by ?,
3716     and the appearance of any of them means that this is not a capturing
3717     group. */
3718 nigel 77
3719     case '(':
3720     newoptions = options;
3721     skipbytes = 0;
3722 nigel 93 bravalue = OP_CBRA;
3723     save_hwm = cd->hwm;
3724 ph10 175 reset_bracount = FALSE;
3725 nigel 77
3726     if (*(++ptr) == '?')
3727     {
3728 nigel 93 int i, set, unset, namelen;
3729 nigel 77 int *optset;
3730 nigel 93 const uschar *name;
3731     uschar *slot;
3732 nigel 77
3733     switch (*(++ptr))
3734     {
3735     case '#': /* Comment; skip to ket */
3736     ptr++;
3737 nigel 93 while (*ptr != 0 && *ptr != ')') ptr++;
3738     if (*ptr == 0)
3739     {
3740     *errorcodeptr = ERR18;
3741     goto FAILED;
3742     }
3743 nigel 77 continue;
3744    
3745 nigel 93
3746     /* ------------------------------------------------------------ */
3747 ph10 175 case '|': /* Reset capture count for each branch */
3748     reset_bracount = TRUE;
3749     /* Fall through */
3750    
3751     /* ------------------------------------------------------------ */
3752 nigel 93 case ':': /* Non-capturing bracket */
3753 nigel 77 bravalue = OP_BRA;
3754     ptr++;
3755     break;
3756    
3757 nigel 93
3758     /* ------------------------------------------------------------ */
3759 nigel 77 case '(':
3760     bravalue = OP_COND; /* Conditional group */
3761    
3762 nigel 93 /* A condition can be an assertion, a number (referring to a numbered
3763     group), a name (referring to a named group), or 'R', referring to
3764     recursion. R<digits> and R&name are also permitted for recursion tests.
3765 nigel 77
3766 nigel 93 There are several syntaxes for testing a named group: (?(name)) is used
3767     by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3768    
3769     There are two unfortunate ambiguities, caused by history. (a) 'R' can
3770     be the recursive thing or the name 'R' (and similarly for 'R' followed
3771     by digits), and (b) a number could be a name that consists of digits.
3772     In both cases, we look for a name first; if not found, we try the other
3773     cases. */
3774    
3775     /* For conditions that are assertions, check the syntax, and then exit
3776     the switch. This will take control down to where bracketed groups,
3777     including assertions, are processed. */
3778    
3779     if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3780     break;
3781    
3782     /* Most other conditions use OP_CREF (a couple change to OP_RREF
3783     below), and all need to skip 3 bytes at the start of the group. */
3784    
3785     code[1+LINK_SIZE] = OP_CREF;
3786     skipbytes = 3;
3787 ph10 172 refsign = -1;
3788 nigel 93
3789     /* Check for a test for recursion in a named group. */
3790    
3791     if (ptr[1] == 'R' && ptr[2] == '&')
3792 nigel 77 {
3793 nigel 93 terminator = -1;
3794     ptr += 2;
3795     code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
3796     }
3797 nigel 91
3798 nigel 93 /* Check for a test for a named group's having been set, using the Perl
3799     syntax (?(<name>) or (?('name') */
3800 nigel 91
3801 nigel 93 else if (ptr[1] == '<')
3802     {
3803     terminator = '>';
3804     ptr++;
3805     }
3806     else if (ptr[1] == '\'')
3807     {
3808     terminator = '\'';
3809     ptr++;
3810     }
3811 ph10 172 else
3812 ph10 167 {
3813     terminator = 0;
3814 ph10 172 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
3815     }
3816 nigel 77
3817 nigel 93 /* We now expect to read a name; anything else is an error */
3818 nigel 77
3819 nigel 93 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
3820     {
3821     ptr += 1; /* To get the right offset */
3822     *errorcodeptr = ERR28;
3823     goto FAILED;
3824     }
3825    
3826     /* Read the name, but also get it as a number if it's all digits */
3827    
3828     recno = 0;
3829     name = ++ptr;
3830     while ((cd->ctypes[*ptr] & ctype_word) != 0)
3831     {
3832     if (recno >= 0)
3833     recno = ((digitab[*ptr] & ctype_digit) != 0)?
3834     recno * 10 + *ptr - '0' : -1;
3835 nigel 91 ptr++;
3836 nigel 93 }
3837     namelen = ptr - name;
3838 nigel 91
3839 nigel 93 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
3840     {
3841     ptr--; /* Error offset */
3842     *errorcodeptr = ERR26;
3843     goto FAILED;
3844     }
3845 nigel 91
3846 nigel 93 /* Do no further checking in the pre-compile phase. */
3847 nigel 91
3848 nigel 93 if (lengthptr != NULL) break;
3849 nigel 91
3850 nigel 93 /* In the real compile we do the work of looking for the actual
3851 ph10 167 reference. If the string started with "+" or "-" we require the rest to
3852     be digits, in which case recno will be set. */
3853 ph10 172
3854 ph10 167 if (refsign > 0)
3855     {
3856     if (recno <= 0)
3857     {
3858     *errorcodeptr = ERR58;
3859     goto FAILED;
3860 ph10 172 }
3861 ph10 167 if (refsign == '-')
3862     {
3863 ph10 172 recno = cd->bracount - recno + 1;
3864 ph10 167 if (recno <= 0)
3865     {
3866     *errorcodeptr = ERR15;
3867     goto FAILED;
3868 ph10 172 }
3869 ph10 167 }
3870 ph10 172 else recno += cd->bracount;
3871 ph10 167 PUT2(code, 2+LINK_SIZE, recno);
3872     break;
3873 ph10 172 }
3874 nigel 91
3875 ph10 167 /* Otherwise (did not start with "+" or "-"), start by looking for the
3876     name. */
3877 ph10 172
3878 nigel 93 slot = cd->name_table;
3879     for (i = 0; i < cd->names_found; i++)
3880     {
3881     if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3882     slot += cd->name_entry_size;
3883     }
3884 nigel 91
3885 nigel 93 /* Found a previous named subpattern */
3886 nigel 91
3887 nigel 93 if (i < cd->names_found)
3888     {
3889     recno = GET2(slot, 0);
3890     PUT2(code, 2+LINK_SIZE, recno);
3891     }
3892 nigel 91
3893 nigel 93 /* Search the pattern for a forward reference */
3894 nigel 91
3895 nigel 93 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
3896     (options & PCRE_EXTENDED) != 0)) > 0)
3897     {
3898     PUT2(code, 2+LINK_SIZE, i);
3899     }
3900 nigel 91
3901 nigel 93 /* If terminator == 0 it means that the name followed directly after
3902     the opening parenthesis [e.g. (?(abc)...] and in this case there are
3903     some further alternatives to try. For the cases where terminator != 0
3904     [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
3905     now checked all the possibilities, so give an error. */
3906 nigel 91
3907 nigel 93 else if (terminator != 0)
3908     {
3909     *errorcodeptr = ERR15;
3910     goto FAILED;
3911     }
3912    
3913     /* Check for (?(R) for recursion. Allow digits after R to specify a
3914     specific group number. */
3915    
3916     else if (*name == 'R')
3917     {
3918     recno = 0;
3919     for (i = 1; i < namelen; i++)
3920 nigel 91 {
3921 nigel 93 if ((digitab[name[i]] & ctype_digit) == 0)
3922     {
3923     *errorcodeptr = ERR15;
3924     goto FAILED;
3925     }
3926     recno = recno * 10 + name[i] - '0';
3927 nigel 77 }
3928 nigel 93 if (recno == 0) recno = RREF_ANY;
3929     code[1+LINK_SIZE] = OP_RREF; /* Change test type */
3930     PUT2(code, 2+LINK_SIZE, recno);
3931 nigel 77 }
3932 nigel 91
3933 nigel 93 /* Similarly, check for the (?(DEFINE) "condition", which is always
3934     false. */
3935 nigel 91
3936 nigel 93 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
3937     {
3938     code[1+LINK_SIZE] = OP_DEF;
3939     skipbytes = 1;
3940     }
3941    
3942     /* Check for the "name" actually being a subpattern number. */
3943    
3944     else if (recno > 0)
3945     {
3946     PUT2(code, 2+LINK_SIZE, recno);
3947     }
3948    
3949     /* Either an unidentified subpattern, or a reference to (?(0) */
3950    
3951     else
3952     {
3953     *errorcodeptr = (recno == 0)? ERR35: ERR15;
3954     goto FAILED;
3955     }
3956 nigel 77 break;
3957    
3958 nigel 93
3959     /* ------------------------------------------------------------ */
3960 nigel 77 case '=': /* Positive lookahead */
3961     bravalue = OP_ASSERT;
3962     ptr++;
3963     break;
3964    
3965 nigel 93
3966     /* ------------------------------------------------------------ */
3967 nigel 77 case '!': /* Negative lookahead */
3968     bravalue = OP_ASSERT_NOT;
3969     ptr++;
3970     break;
3971    
3972 nigel 93
3973     /* ------------------------------------------------------------ */
3974     case '<': /* Lookbehind or named define */
3975     switch (ptr[1])
3976 nigel 77 {
3977     case '=': /* Positive lookbehind */
3978     bravalue = OP_ASSERTBACK;
3979 nigel 93 ptr += 2;
3980 nigel 77 break;
3981    
3982     case '!': /* Negative lookbehind */
3983     bravalue = OP_ASSERTBACK_NOT;
3984 nigel 93 ptr += 2;
3985 nigel 77 break;
3986 nigel 93
3987     default: /* Could be name define, else bad */
3988     if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
3989     ptr++; /* Correct offset for error */
3990     *errorcodeptr = ERR24;
3991     goto FAILED;
3992 nigel 77 }
3993     break;
3994    
3995 nigel 93
3996     /* ------------------------------------------------------------ */
3997 nigel 77 case '>': /* One-time brackets */
3998     bravalue = OP_ONCE;
3999     ptr++;
4000     break;
4001    
4002 nigel 93
4003     /* ------------------------------------------------------------ */
4004 nigel 77 case 'C': /* Callout - may be followed by digits; */
4005     previous_callout = code; /* Save for later completion */
4006     after_manual_callout = 1; /* Skip one item before completing */
4007 nigel 93 *code++ = OP_CALLOUT;
4008     {
4009 nigel 77 int n = 0;
4010     while ((digitab[*(++ptr)] & ctype_digit) != 0)
4011     n = n * 10 + *ptr - '0';
4012 nigel 93 if (*ptr != ')')
4013     {
4014     *errorcodeptr = ERR39;
4015     goto FAILED;
4016     }
4017 nigel 77 if (n > 255)
4018     {
4019     *errorcodeptr = ERR38;
4020     goto FAILED;
4021     }
4022     *code++ = n;
4023     PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4024     PUT(code, LINK_SIZE, 0); /* Default length */
4025     code += 2 * LINK_SIZE;
4026     }
4027     previous = NULL;
4028     continue;
4029    
4030 nigel 93
4031     /* ------------------------------------------------------------ */
4032     case 'P': /* Python-style named subpattern handling */
4033     if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
4034 nigel 77 {
4035 nigel 93 is_recurse = *ptr == '>';
4036     terminator = ')';
4037     goto NAMED_REF_OR_RECURSE;
4038     }
4039     else if (*ptr != '<') /* Test for Python-style definition */
4040     {
4041     *errorcodeptr = ERR41;
4042     goto FAILED;
4043     }
4044     /* Fall through to handle (?P< as (?< is handled */
4045 nigel 77
4046    
4047 nigel 93 /* ------------------------------------------------------------ */
4048     DEFINE_NAME: /* Come here from (?< handling */
4049     case '\'':
4050     {
4051     terminator = (*ptr == '<')? '>' : '\'';
4052     name = ++ptr;
4053    
4054     while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4055     namelen = ptr - name;
4056    
4057     /* In the pre-compile phase, just do a syntax check. */
4058    
4059     if (lengthptr != NULL)
4060 nigel 77 {
4061 nigel 93 if (*ptr != terminator)
4062 nigel 77 {
4063 nigel 93 *errorcodeptr = ERR42;
4064     goto FAILED;
4065     }
4066     if (cd->names_found >= MAX_NAME_COUNT)
4067     {
4068     *errorcodeptr = ERR49;
4069     goto FAILED;
4070     }
4071     if (namelen + 3 > cd->name_entry_size)
4072     {
4073     cd->name_entry_size = namelen + 3;
4074     if (namelen > MAX_NAME_SIZE)
4075 nigel 77 {
4076 nigel 93 *errorcodeptr = ERR48;
4077     goto FAILED;
4078     }
4079     }
4080     }
4081    
4082     /* In the real compile, create the entry in the table */
4083    
4084     else
4085     {
4086     slot = cd->name_table;
4087     for (i = 0; i < cd->names_found; i++)
4088     {
4089     int crc = memcmp(name, slot+2, namelen);
4090     if (crc == 0)
4091     {
4092     if (slot[2+namelen] == 0)
4093 nigel 91 {
4094 nigel 93 if ((options & PCRE_DUPNAMES) == 0)
4095     {
4096     *errorcodeptr = ERR43;
4097     goto FAILED;
4098     }
4099 nigel 91 }
4100 nigel 93 else crc = -1; /* Current name is substring */
4101 nigel 77 }
4102 nigel 93 if (crc < 0)
4103     {
4104     memmove(slot + cd->name_entry_size, slot,
4105     (cd->names_found - i) * cd->name_entry_size);
4106     break;
4107     }
4108     slot += cd->name_entry_size;
4109 nigel 77 }
4110 nigel 93
4111     PUT2(slot, 0, cd->bracount + 1);
4112     memcpy(slot + 2, name, namelen);
4113     slot[2+namelen] = 0;
4114 nigel 77 }
4115     }
4116    
4117 nigel 93 /* In both cases, count the number of names we've encountered. */
4118    
4119     ptr++; /* Move past > or ' */
4120     cd->names_found++;
4121     goto NUMBERED_GROUP;
4122    
4123    
4124     /* ------------------------------------------------------------ */
4125     case '&': /* Perl recursion/subroutine syntax */
4126     terminator = ')';
4127     is_rec