/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 91 - (show annotations) (download)
Sat Feb 24 21:41:34 2007 UTC (7 years, 8 months ago) by nigel
File MIME type: text/plain
File size: 166857 byte(s)
Load pcre-6.7 into code/trunk.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2006 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #define NLBLOCK cd /* The block containing newline information */
46 #include "pcre_internal.h"
47
48
49 /* When DEBUG is defined, we need the pcre_printint() function, which is also
50 used by pcretest. DEBUG is not defined when building a production library. */
51
52 #ifdef DEBUG
53 #include "pcre_printint.src"
54 #endif
55
56
57
58 /*************************************************
59 * Code parameters and static tables *
60 *************************************************/
61
62 /* Maximum number of items on the nested bracket stacks at compile time. This
63 applies to the nesting of all kinds of parentheses. It does not limit
64 un-nested, non-capturing parentheses. This number can be made bigger if
65 necessary - it is used to dimension one int and one unsigned char vector at
66 compile time. */
67
68 #define BRASTACK_SIZE 200
69
70
71 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
72 are simple data values; negative values are for special things like \d and so
73 on. Zero means further processing is needed (for things like \x), or the escape
74 is invalid. */
75
76 #if !EBCDIC /* This is the "normal" table for ASCII systems */
/* ASCII layout: entry k describes the character with code '0' + k, and the
table ends at 'z'. check_escape() indexes it with (c - '0') only after it has
established that '0' <= c <= 'z'. A positive entry is the literal value of the
escape, a negative entry is -ESC_xxx for a special escape, and 0 sends the
character on to the switch in check_escape(). */
77 static const short int escapes[] = {
78 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
79 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
80 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
81 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
82 -ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
83 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
84 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
85 0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
86 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
87 0, 0, -ESC_z /* x - z */
88 };
89
90 #else /* This is the "abnormal" table for EBCDIC systems */
/* EBCDIC layout: entry k describes code point 0x48 + k (the hex value in the
comment at the start of each row is the code point of that row's first entry),
and the table runs up to 0xFF. check_escape() indexes it with (c - 0x48).
NOTE(review): the fourth entry of the 0x90 row is the literal 'l', whereas the
ASCII table has 0 for 'l' so that check_escape() reports ERR37 for \l. Confirm
whether this difference between the two builds is intended. */
91 static const short int escapes[] = {
92 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
93 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
94 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
95 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
96 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
97 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
98 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
99 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
100 /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
101 /* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p,
102 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
103 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
104 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
105 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
106 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
107 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
108 /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
109 /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
110 /* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0,
111 /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
112 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
113 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
114 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
115 };
116 #endif
117
118
119 /* Tables of names of POSIX character classes and their lengths. The list is
120 terminated by a zero length entry. The first three must be alpha, lower, upper,
121 as this is assumed for handling case independence. */
122
/* The 14 POSIX class names. Entry k here pairs with entry k of
posix_name_lengths below, so the two tables must be edited together. This
table has no terminating entry of its own. */
123 static const char *const posix_names[] = {
124 "alpha", "lower", "upper",
125 "alnum", "ascii", "blank", "cntrl", "digit", "graph",
126 "print", "punct", "space", "word", "xdigit" };
127
/* Length of each name above, in the same order, followed by a final 0 that
marks the end of the list (15 entries in all). */
128 static const uschar posix_name_lengths[] = {
129 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
130
131 /* Table of class bit maps for each POSIX class. Each class is formed from a
132 base map, with an optional addition or removal of another map. Then, for some
133 classes, there is some additional tweaking: for [:blank:] the vertical space
134 characters are removed, and for [:alpha:] and [:alnum:] the underscore
135 character is removed. The triples in the table consist of the base map offset,
136 second map offset or -1 if no second map, and a non-negative value for map
137 addition or a negative value for map subtraction (if there are two maps). The
138 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
139 remove vertical space characters, 2 => remove underscore. */
140
/* One triple of ints per POSIX class, in the order shown by the comment at
the end of each row: base map offset, second map offset (or -1 for none), and
the signed tweak/combination code. For example, "alpha" is the word map with
the digit map subtracted (negative third field) and the underscore removed
(absolute value 2). */
141 static const int posix_class_maps[] = {
142 cbit_word, cbit_digit, -2, /* alpha */
143 cbit_lower, -1, 0, /* lower */
144 cbit_upper, -1, 0, /* upper */
145 cbit_word, -1, 2, /* alnum - word without underscore */
146 cbit_print, cbit_cntrl, 0, /* ascii */
147 cbit_space, -1, 1, /* blank - a GNU extension */
148 cbit_cntrl, -1, 0, /* cntrl */
149 cbit_digit, -1, 0, /* digit */
150 cbit_graph, -1, 0, /* graph */
151 cbit_print, -1, 0, /* print */
152 cbit_punct, -1, 0, /* punct */
153 cbit_space, -1, 0, /* space */
154 cbit_word, -1, 0, /* word - a Perl extension */
155 cbit_xdigit,-1, 0 /* xdigit */
156 };
157
158
159 /* The texts of compile-time error messages. These are "char *" because they
160 are passed to the outside world. */
161
/* Compile-time error messages, indexed by error number: entry n is the text
for error code n (the numeric comments mark every fifth entry). The strings
are plain "const char *" because pointers to them are handed to callers. The
array itself is also const, like posix_names, so that neither the pointers nor
the strings can be modified after initialisation. */

static const char *const error_texts[] = {
  "no error",
  "\\ at end of pattern",
  "\\c at end of pattern",
  "unrecognized character follows \\",
  "numbers out of order in {} quantifier",
  /* 5 */
  "number too big in {} quantifier",
  "missing terminating ] for character class",
  "invalid escape sequence in character class",
  "range out of order in character class",
  "nothing to repeat",
  /* 10 */
  "operand of unlimited repeat could match the empty string",
  "internal error: unexpected repeat",
  "unrecognized character after (?",
  "POSIX named classes are supported only within a class",
  "missing )",
  /* 15 */
  "reference to non-existent subpattern",
  "erroffset passed as NULL",
  "unknown option bit(s) set",
  "missing ) after comment",
  "parentheses nested too deeply",
  /* 20 */
  "regular expression too large",
  "failed to get memory",
  "unmatched parentheses",
  "internal error: code overflow",
  "unrecognized character after (?<",
  /* 25 */
  "lookbehind assertion is not fixed length",
  "malformed number or name after (?(",
  "conditional group contains more than two branches",
  "assertion expected after (?(",
  "(?R or (?digits must be followed by )",
  /* 30 */
  "unknown POSIX class name",
  "POSIX collating elements are not supported",
  "this version of PCRE is not compiled with PCRE_UTF8 support",
  "spare error",
  "character value in \\x{...} sequence is too large",
  /* 35 */
  "invalid condition (?(0)",
  "\\C not allowed in lookbehind assertion",
  "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
  "number after (?C is > 255",
  "closing ) for (?C expected",
  /* 40 */
  "recursive call could loop indefinitely",
  "unrecognized character after (?P",
  "syntax error after (?P",
  "two named subpatterns have the same name",
  "invalid UTF-8 string",
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled",
  "malformed \\P or \\p sequence",
  "unknown property name after \\P or \\p",
  "subpattern name is too long (maximum 32 characters)",
  "too many named subpatterns (maximum 10,000)",
  /* 50 */
  "repeated subpattern is too long",
  "octal value is greater than \\377 (not in UTF-8 mode)"
};
226
227
228 /* Table to identify digits and hex digits. This is used when compiling
229 patterns. Note that the tables in chartables are dependent on the locale, and
230 may mark arbitrary characters as digits - but the PCRE compiling code expects
231 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
232 a private table here. It costs 256 bytes, but it is a lot faster than doing
233 character value tests (at least in some simple cases I timed), and in some
234 applications one wants PCRE to compile efficiently as well as match
235 efficiently.
236
237 For convenience, we use the same bit definitions as in chartables:
238
239 0x04 decimal digit
240 0x08 hexadecimal digit
241
242 Then we can use ctype_digit and ctype_xdigit in the code. */
243
244 #if !EBCDIC /* This is the "normal" case, for ASCII systems */
/* Indexed by character code 0-255. 0x0c (0x04 | 0x08) marks the decimal
digits, which are also hex digits; 0x08 alone marks A-F and a-f. Everything
else is 0. */
245 static const unsigned char digitab[] =
246 {
247 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
248 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
249 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
250 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
251 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
252 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
253 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
254 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
255 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
256 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
257 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
258 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
259 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
260 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
261 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
262 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
263 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
264 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
265 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
266 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
267 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
268 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
269 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
270 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
271 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
272 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
273 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
274 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
275 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
276 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
277 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
278 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
279
280 #else /* This is the "abnormal" case, for EBCDIC systems */
/* Same bit meanings as the ASCII table, laid out by EBCDIC code point: the
0x08 entries sit in the rows starting at 0x80 and 0xC0 (the hex letters) and
the 0x0c entries in the row starting at 0xF0 (the decimal digits). */
281 static const unsigned char digitab[] =
282 {
283 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
284 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
285 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
286 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
287 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
288 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
289 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
290 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
291 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
292 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
293 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
294 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- */
295 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
296 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
297 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
298 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
299 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
300 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
301 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
302 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
303 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
304 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
305 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
306 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
307 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
308 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
309 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
310 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
311 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
312 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
313 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
314 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
315
/* EBCDIC-only character-type table. check_escape() treats a character c as
"not alphameric" when (ebcdic_chartab[c] & 0x0E) is zero.
NOTE(review): the individual bits presumably follow the ctype_xxx assignments
used by chartables (0x04 digit and 0x08 hex digit are stated in the comment
preceding the digit tables; the others are not visible here) - confirm. */
316 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
317 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
318 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
319 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
320 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
321 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
322 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
323 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
324 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
325 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
326 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
327 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
328 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- */
329 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
330 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
331 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
332 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
333 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
334 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
335 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
336 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
337 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
338 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
339 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
340 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
341 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
342 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
343 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
344 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
345 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
346 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
347 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
348 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
349 #endif
350
351
352 /* Definition to allow mutual recursion */
353
/* Prototype only (no body here): it lets functions that appear before the
definition of compile_regex() call it. The parameters are unnamed in this
declaration.
NOTE(review): the meaning of each argument is not visible at this point in
the file - see the function's definition. */
354 static BOOL
355 compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,
356 int *, int *, branch_chain *, compile_data *);
357
358
359
360 /*************************************************
361 * Handle escapes *
362 *************************************************/
363
364 /* This function is called when a \ has been encountered. It either returns a
365 positive value for a simple escape such as \n, or a negative value which
366 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
367 a positive value greater than 255 may be returned. On entry, ptr is pointing at
368 the \. On exit, it is on the final character of the escape sequence.
369
370 Arguments:
371 ptrptr points to the pattern position pointer
372 errorcodeptr points to the errorcode variable
373 bracount number of previous extracting brackets
374 options the options bits
375 isclass TRUE if inside a character class
376
377 Returns: zero or positive => a data character
378 negative => a special escape sequence
379 on error, errorptr is set
380 */
381
382 static int
383 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
384 int options, BOOL isclass)
385 {
386 BOOL utf8 = (options & PCRE_UTF8) != 0;
387 const uschar *ptr = *ptrptr + 1;
388 int c, i;
389
390 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
391 ptr--; /* Set pointer back to the last byte */
392
393 /* If backslash is at the end of the pattern, it's an error. */
394
395 if (c == 0) *errorcodeptr = ERR1;
396
397 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
398 a table. A non-zero result is something that can be returned immediately.
399 Otherwise further processing may be required. */
400
401 #if !EBCDIC /* ASCII coding */
402 else if (c < '0' || c > 'z') {} /* Not alphameric */
403 else if ((i = escapes[c - '0']) != 0) c = i;
404
405 #else /* EBCDIC coding */
406 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
407 else if ((i = escapes[c - 0x48]) != 0) c = i;
408 #endif
409
410 /* Escapes that need further processing, or are illegal. */
411
412 else
413 {
414 const uschar *oldptr;
415 switch (c)
416 {
417 /* A number of Perl escapes are not handled by PCRE. We give an explicit
418 error. */
419
420 case 'l':
421 case 'L':
422 case 'N':
423 case 'u':
424 case 'U':
425 *errorcodeptr = ERR37;
426 break;
427
428 /* The handling of escape sequences consisting of a string of digits
429 starting with one that is not zero is not straightforward. By experiment,
430 the way Perl works seems to be as follows:
431
432 Outside a character class, the digits are read as a decimal number. If the
433 number is less than 10, or if there are that many previous extracting
434 left brackets, then it is a back reference. Otherwise, up to three octal
435 digits are read to form an escaped byte. Thus \123 is likely to be octal
436 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
437 value is greater than 377, the least significant 8 bits are taken. Inside a
438 character class, \ followed by a digit is always an octal number. */
439
440 case '1': case '2': case '3': case '4': case '5':
441 case '6': case '7': case '8': case '9':
442
443 if (!isclass)
444 {
445 oldptr = ptr;
446 c -= '0';
447 while ((digitab[ptr[1]] & ctype_digit) != 0)
448 c = c * 10 + *(++ptr) - '0';
449 if (c < 10 || c <= bracount)
450 {
451 c = -(ESC_REF + c);
452 break;
453 }
454 ptr = oldptr; /* Put the pointer back and fall through */
455 }
456
457 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
458 generates a binary zero byte and treats the digit as a following literal.
459 Thus we have to pull back the pointer by one. */
460
461 if ((c = *ptr) >= '8')
462 {
463 ptr--;
464 c = 0;
465 break;
466 }
467
468 /* \0 always starts an octal number, but we may drop through to here with a
469 larger first octal digit. The original code used just to take the least
470 significant 8 bits of octal numbers (I think this is what early Perls used
471 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
472 than 3 octal digits. */
473
474 case '0':
475 c -= '0';
476 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
477 c = c * 8 + *(++ptr) - '0';
478 if (!utf8 && c > 255) *errorcodeptr = ERR51;
479 break;
480
481 /* \x is complicated. \x{ddd} is a character number which can be greater
482 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
483 treated as a data character. */
484
485 case 'x':
486 if (ptr[1] == '{')
487 {
488 const uschar *pt = ptr + 2;
489 int count = 0;
490
491 c = 0;
492 while ((digitab[*pt] & ctype_xdigit) != 0)
493 {
494 register int cc = *pt++;
495 if (c == 0 && cc == '0') continue; /* Leading zeroes */
496 count++;
497
498 #if !EBCDIC /* ASCII coding */
499 if (cc >= 'a') cc -= 32; /* Convert to upper case */
500 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
501 #else /* EBCDIC coding */
502 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
503 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
504 #endif
505 }
506
507 if (*pt == '}')
508 {
509 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
510 ptr = pt;
511 break;
512 }
513
514 /* If the sequence of hex digits does not end with '}', then we don't
515 recognize this construct; fall through to the normal \x handling. */
516 }
517
518 /* Read just a single-byte hex-defined char */
519
520 c = 0;
521 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
522 {
523 int cc; /* Some compilers don't like ++ */
524 cc = *(++ptr); /* in initializers */
525 #if !EBCDIC /* ASCII coding */
526 if (cc >= 'a') cc -= 32; /* Convert to upper case */
527 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
528 #else /* EBCDIC coding */
529 if (cc <= 'z') cc += 64; /* Convert to upper case */
530 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
531 #endif
532 }
533 break;
534
535 /* Other special escapes not starting with a digit are straightforward */
536
537 case 'c':
538 c = *(++ptr);
539 if (c == 0)
540 {
541 *errorcodeptr = ERR2;
542 return 0;
543 }
544
545 /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
546 is ASCII-specific, but then the whole concept of \cx is ASCII-specific.
547 (However, an EBCDIC equivalent has now been added.) */
548
549 #if !EBCDIC /* ASCII coding */
550 if (c >= 'a' && c <= 'z') c -= 32;
551 c ^= 0x40;
552 #else /* EBCDIC coding */
553 if (c >= 'a' && c <= 'z') c += 64;
554 c ^= 0xC0;
555 #endif
556 break;
557
558 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
559 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
560 for Perl compatibility, it is a literal. This code looks a bit odd, but
561 there used to be some cases other than the default, and there may be again
562 in future, so I haven't "optimized" it. */
563
564 default:
565 if ((options & PCRE_EXTRA) != 0) switch(c)
566 {
567 default:
568 *errorcodeptr = ERR3;
569 break;
570 }
571 break;
572 }
573 }
574
575 *ptrptr = ptr;
576 return c;
577 }
578
579
580
581 #ifdef SUPPORT_UCP
582 /*************************************************
583 * Handle \P and \p *
584 *************************************************/
585
586 /* This function is called after \P or \p has been encountered, provided that
587 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
588 pointing at the P or p. On exit, it is pointing at the final character of the
589 escape sequence.
590
591 Argument:
592 ptrptr points to the pattern position pointer
593 negptr points to a boolean that is set TRUE for negation else FALSE
594 dptr points to an int that is set to the detailed property value
595 errorcodeptr points to the error code variable
596
597 Returns: type value from ucp_type_table, or -1 for an invalid type
598 */
599
600 static int
601 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
602 {
603 int c, i, bot, top;
604 const uschar *ptr = *ptrptr;
605 char name[32];
606
607 c = *(++ptr);
608 if (c == 0) goto ERROR_RETURN;
609
610 *negptr = FALSE;
611
612 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
613 negation. */
614
615 if (c == '{')
616 {
617 if (ptr[1] == '^')
618 {
619 *negptr = TRUE;
620 ptr++;
621 }
622 for (i = 0; i < sizeof(name) - 1; i++)
623 {
624 c = *(++ptr);
625 if (c == 0) goto ERROR_RETURN;
626 if (c == '}') break;
627 name[i] = c;
628 }
629 if (c !='}') goto ERROR_RETURN;
630 name[i] = 0;
631 }
632
633 /* Otherwise there is just one following character */
634
635 else
636 {
637 name[0] = c;
638 name[1] = 0;
639 }
640
641 *ptrptr = ptr;
642
643 /* Search for a recognized property name using binary chop */
644
645 bot = 0;
646 top = _pcre_utt_size;
647
648 while (bot < top)
649 {
650 i = (bot + top) >> 1;
651 c = strcmp(name, _pcre_utt[i].name);
652 if (c == 0)
653 {
654 *dptr = _pcre_utt[i].value;
655 return _pcre_utt[i].type;
656 }
657 if (c > 0) bot = i + 1; else top = i;
658 }
659
660 *errorcodeptr = ERR47;
661 *ptrptr = ptr;
662 return -1;
663
664 ERROR_RETURN:
665 *errorcodeptr = ERR46;
666 *ptrptr = ptr;
667 return -1;
668 }
669 #endif
670
671
672
673
674 /*************************************************
675 * Check for counted repeat *
676 *************************************************/
677
678 /* This function is called when a '{' is encountered in a place where it might
679 start a quantifier. It looks ahead to see if it really is a quantifier or not.
680 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
681 where the ddds are digits.
682
683 Arguments:
684 p pointer to the first char after '{'
685
686 Returns: TRUE or FALSE
687 */
688
689 static BOOL
690 is_counted_repeat(const uschar *p)
691 {
692 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
693 while ((digitab[*p] & ctype_digit) != 0) p++;
694 if (*p == '}') return TRUE;
695
696 if (*p++ != ',') return FALSE;
697 if (*p == '}') return TRUE;
698
699 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
700 while ((digitab[*p] & ctype_digit) != 0) p++;
701
702 return (*p == '}');
703 }
704
705
706
707 /*************************************************
708 * Read repeat counts *
709 *************************************************/
710
711 /* Read an item of the form {n,m} and return the values. This is called only
712 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
713 so the syntax is guaranteed to be correct, but we need to check the values.
714
715 Arguments:
716 p pointer to first char after '{'
717 minp pointer to int for min
718 maxp pointer to int for max
719 returned as -1 if no max
720 errorcodeptr points to error code variable
721
722 Returns: pointer to '}' on success;
723 current ptr on error, with errorcodeptr set non-zero
724 */
725
726 static const uschar *
727 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
728 {
729 int min = 0;
730 int max = -1;
731
732 /* Read the minimum value and do a paranoid check: a negative value indicates
733 an integer overflow. */
734
735 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
736 if (min < 0 || min > 65535)
737 {
738 *errorcodeptr = ERR5;
739 return p;
740 }
741
742 /* Read the maximum value if there is one, and again do a paranoid on its size.
743 Also, max must not be less than min. */
744
745 if (*p == '}') max = min; else
746 {
747 if (*(++p) != '}')
748 {
749 max = 0;
750 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
751 if (max < 0 || max > 65535)
752 {
753 *errorcodeptr = ERR5;
754 return p;
755 }
756 if (max < min)
757 {
758 *errorcodeptr = ERR4;
759 return p;
760 }
761 }
762 }
763
764 /* Fill in the required variables, and pass back the pointer to the terminating
765 '}'. */
766
767 *minp = min;
768 *maxp = max;
769 return p;
770 }
771
772
773
774 /*************************************************
775 * Find forward referenced named subpattern *
776 *************************************************/
777
778 /* This function scans along a pattern looking for capturing subpatterns, and
779 counting them. If it finds a named pattern that matches the name it is given,
780 it returns its number. This is used for forward references to named
781 subpatterns. We know that if (?P< is encountered, the name will be terminated
782 by '>' because that is checked in the first pass.
783
784 Arguments:
785 pointer current position in the pattern
786 count current count of capturing parens
787 name name to seek
788 namelen name length
789
790 Returns: the number of the named subpattern, or -1 if not found
791 */
792
793 static int
794 find_named_parens(const uschar *ptr, int count, const uschar *name, int namelen)
795 {
796 const uschar *thisname;
797 for (; *ptr != 0; ptr++)
798 {
799 if (*ptr == '\\' && ptr[1] != 0) { ptr++; continue; }
800 if (*ptr != '(') continue;
801 if (ptr[1] != '?') { count++; continue; }
802 if (ptr[2] == '(') { ptr += 2; continue; }
803 if (ptr[2] != 'P' || ptr[3] != '<') continue;
804 count++;
805 ptr += 4;
806 thisname = ptr;
807 while (*ptr != '>') ptr++;
808 if (namelen == ptr - thisname && strncmp(name, thisname, namelen) == 0)
809 return count;
810 }
811 return -1;
812 }
813
814
815
816 /*************************************************
817 * Find first significant op code *
818 *************************************************/
819
820 /* This is called by several functions that scan a compiled expression looking
821 for a fixed first character, or an anchoring op code etc. It skips over things
822 that do not influence this. For some calls, a change of option is important.
823 For some calls, it makes sense to skip negative forward and all backward
824 assertions, and also the \b assertion; for others it does not.
825
826 Arguments:
827 code pointer to the start of the group
828 options pointer to external options
829 optbit the option bit whose changing is significant, or
830 zero if none are
831 skipassert TRUE if certain assertions are to be skipped
832
833 Returns: pointer to the first significant opcode
834 */
835
836 static const uschar*
837 first_significant_code(const uschar *code, int *options, int optbit,
838 BOOL skipassert)
839 {
840 for (;;)
841 {
842 switch ((int)*code)
843 {
844 case OP_OPT:
845 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
846 *options = (int)code[1];
847 code += 2;
848 break;
849
850 case OP_ASSERT_NOT:
851 case OP_ASSERTBACK:
852 case OP_ASSERTBACK_NOT:
853 if (!skipassert) return code;
854 do code += GET(code, 1); while (*code == OP_ALT);
855 code += _pcre_OP_lengths[*code];
856 break;
857
858 case OP_WORD_BOUNDARY:
859 case OP_NOT_WORD_BOUNDARY:
860 if (!skipassert) return code;
861 /* Fall through */
862
863 case OP_CALLOUT:
864 case OP_CREF:
865 case OP_BRANUMBER:
866 code += _pcre_OP_lengths[*code];
867 break;
868
869 default:
870 return code;
871 }
872 }
873 /* Control never reaches here */
874 }
875
876
877
878
879 /*************************************************
880 * Find the fixed length of a pattern *
881 *************************************************/
882
883 /* Scan a pattern and compute the fixed length of subject that will match it,
884 if the length is fixed. This is needed for dealing with backward assertions.
885 In UTF8 mode, the result is in characters rather than bytes.
886
887 Arguments:
888 code points to the start of the pattern (the bracket)
889 options the compiling options
890
891 Returns: the fixed length, or -1 if there is no fixed length,
892 or -2 if \C was encountered
893 */
894
895 static int
896 find_fixedlength(uschar *code, int options)
897 {
898 int length = -1;
899
900 register int branchlength = 0;
901 register uschar *cc = code + 1 + LINK_SIZE;
902
903 /* Scan along the opcodes for this branch. If we get to the end of the
904 branch, check the length against that of the other branches. */
905
906 for (;;)
907 {
908 int d;
909 register int op = *cc;
910 if (op >= OP_BRA) op = OP_BRA;
911
912 switch (op)
913 {
914 case OP_BRA:
915 case OP_ONCE:
916 case OP_COND:
917 d = find_fixedlength(cc, options);
918 if (d < 0) return d;
919 branchlength += d;
920 do cc += GET(cc, 1); while (*cc == OP_ALT);
921 cc += 1 + LINK_SIZE;
922 break;
923
924 /* Reached end of a branch; if it's a ket it is the end of a nested
925 call. If it's ALT it is an alternation in a nested call. If it is
926 END it's the end of the outer call. All can be handled by the same code. */
927
928 case OP_ALT:
929 case OP_KET:
930 case OP_KETRMAX:
931 case OP_KETRMIN:
932 case OP_END:
933 if (length < 0) length = branchlength;
934 else if (length != branchlength) return -1;
935 if (*cc != OP_ALT) return length;
936 cc += 1 + LINK_SIZE;
937 branchlength = 0;
938 break;
939
940 /* Skip over assertive subpatterns */
941
942 case OP_ASSERT:
943 case OP_ASSERT_NOT:
944 case OP_ASSERTBACK:
945 case OP_ASSERTBACK_NOT:
946 do cc += GET(cc, 1); while (*cc == OP_ALT);
947 /* Fall through */
948
949 /* Skip over things that don't match chars */
950
951 case OP_REVERSE:
952 case OP_BRANUMBER:
953 case OP_CREF:
954 case OP_OPT:
955 case OP_CALLOUT:
956 case OP_SOD:
957 case OP_SOM:
958 case OP_EOD:
959 case OP_EODN:
960 case OP_CIRC:
961 case OP_DOLL:
962 case OP_NOT_WORD_BOUNDARY:
963 case OP_WORD_BOUNDARY:
964 cc += _pcre_OP_lengths[*cc];
965 break;
966
967 /* Handle literal characters */
968
969 case OP_CHAR:
970 case OP_CHARNC:
971 case OP_NOT:
972 branchlength++;
973 cc += 2;
974 #ifdef SUPPORT_UTF8
975 if ((options & PCRE_UTF8) != 0)
976 {
977 while ((*cc & 0xc0) == 0x80) cc++;
978 }
979 #endif
980 break;
981
982 /* Handle exact repetitions. The count is already in characters, but we
983 need to skip over a multibyte character in UTF8 mode. */
984
985 case OP_EXACT:
986 branchlength += GET2(cc,1);
987 cc += 4;
988 #ifdef SUPPORT_UTF8
989 if ((options & PCRE_UTF8) != 0)
990 {
991 while((*cc & 0x80) == 0x80) cc++;
992 }
993 #endif
994 break;
995
996 case OP_TYPEEXACT:
997 branchlength += GET2(cc,1);
998 cc += 4;
999 break;
1000
1001 /* Handle single-char matchers */
1002
1003 case OP_PROP:
1004 case OP_NOTPROP:
1005 cc += 2;
1006 /* Fall through */
1007
1008 case OP_NOT_DIGIT:
1009 case OP_DIGIT:
1010 case OP_NOT_WHITESPACE:
1011 case OP_WHITESPACE:
1012 case OP_NOT_WORDCHAR:
1013 case OP_WORDCHAR:
1014 case OP_ANY:
1015 branchlength++;
1016 cc++;
1017 break;
1018
1019 /* The single-byte matcher isn't allowed */
1020
1021 case OP_ANYBYTE:
1022 return -2;
1023
1024 /* Check a class for variable quantification */
1025
1026 #ifdef SUPPORT_UTF8
1027 case OP_XCLASS:
1028 cc += GET(cc, 1) - 33;
1029 /* Fall through */
1030 #endif
1031
1032 case OP_CLASS:
1033 case OP_NCLASS:
1034 cc += 33;
1035
1036 switch (*cc)
1037 {
1038 case OP_CRSTAR:
1039 case OP_CRMINSTAR:
1040 case OP_CRQUERY:
1041 case OP_CRMINQUERY:
1042 return -1;
1043
1044 case OP_CRRANGE:
1045 case OP_CRMINRANGE:
1046 if (GET2(cc,1) != GET2(cc,3)) return -1;
1047 branchlength += GET2(cc,1);
1048 cc += 5;
1049 break;
1050
1051 default:
1052 branchlength++;
1053 }
1054 break;
1055
1056 /* Anything else is variable length */
1057
1058 default:
1059 return -1;
1060 }
1061 }
1062 /* Control never gets here */
1063 }
1064
1065
1066
1067
1068 /*************************************************
1069 * Scan compiled regex for numbered bracket *
1070 *************************************************/
1071
1072 /* This little function scans through a compiled pattern until it finds a
1073 capturing bracket with the given number.
1074
1075 Arguments:
1076 code points to start of expression
1077 utf8 TRUE in UTF-8 mode
1078 number the required bracket number
1079
1080 Returns: pointer to the opcode for the bracket, or NULL if not found
1081 */
1082
1083 static const uschar *
1084 find_bracket(const uschar *code, BOOL utf8, int number)
1085 {
1086 for (;;)
1087 {
1088 register int c = *code;
1089 if (c == OP_END) return NULL;
1090
1091 /* XCLASS is used for classes that cannot be represented just by a bit
1092 map. This includes negated single high-valued characters. The length in
1093 the table is zero; the actual length is stored in the compiled code. */
1094
1095 if (c == OP_XCLASS) code += GET(code, 1);
1096
1097 /* Handle bracketed group */
1098
1099 else if (c > OP_BRA)
1100 {
1101 int n = c - OP_BRA;
1102 if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1103 if (n == number) return (uschar *)code;
1104 code += _pcre_OP_lengths[OP_BRA];
1105 }
1106
1107 /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1108 that are followed by a character may be followed by a multi-byte character.
1109 The length in the table is a minimum, so we have to scan along to skip the
1110 extra bytes. All opcodes are less than 128, so we can use relatively
1111 efficient code. */
1112
1113 else
1114 {
1115 code += _pcre_OP_lengths[c];
1116 if (utf8) switch(c)
1117 {
1118 case OP_CHAR:
1119 case OP_CHARNC:
1120 case OP_EXACT:
1121 case OP_UPTO:
1122 case OP_MINUPTO:
1123 case OP_STAR:
1124 case OP_MINSTAR:
1125 case OP_PLUS:
1126 case OP_MINPLUS:
1127 case OP_QUERY:
1128 case OP_MINQUERY:
1129 while ((*code & 0xc0) == 0x80) code++;
1130 break;
1131 }
1132 }
1133 }
1134 }
1135
1136
1137
1138 /*************************************************
1139 * Scan compiled regex for recursion reference *
1140 *************************************************/
1141
1142 /* This little function scans through a compiled pattern until it finds an
1143 instance of OP_RECURSE.
1144
1145 Arguments:
1146 code points to start of expression
1147 utf8 TRUE in UTF-8 mode
1148
1149 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1150 */
1151
1152 static const uschar *
1153 find_recurse(const uschar *code, BOOL utf8)
1154 {
1155 for (;;)
1156 {
1157 register int c = *code;
1158 if (c == OP_END) return NULL;
1159 if (c == OP_RECURSE) return code;
1160
1161 /* XCLASS is used for classes that cannot be represented just by a bit
1162 map. This includes negated single high-valued characters. The length in
1163 the table is zero; the actual length is stored in the compiled code. */
1164
1165 if (c == OP_XCLASS) code += GET(code, 1);
1166
1167 /* All bracketed groups have the same length. */
1168
1169 else if (c > OP_BRA)
1170 {
1171 code += _pcre_OP_lengths[OP_BRA];
1172 }
1173
1174 /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1175 that are followed by a character may be followed by a multi-byte character.
1176 The length in the table is a minimum, so we have to scan along to skip the
1177 extra bytes. All opcodes are less than 128, so we can use relatively
1178 efficient code. */
1179
1180 else
1181 {
1182 code += _pcre_OP_lengths[c];
1183 if (utf8) switch(c)
1184 {
1185 case OP_CHAR:
1186 case OP_CHARNC:
1187 case OP_EXACT:
1188 case OP_UPTO:
1189 case OP_MINUPTO:
1190 case OP_STAR:
1191 case OP_MINSTAR:
1192 case OP_PLUS:
1193 case OP_MINPLUS:
1194 case OP_QUERY:
1195 case OP_MINQUERY:
1196 while ((*code & 0xc0) == 0x80) code++;
1197 break;
1198 }
1199 }
1200 }
1201 }
1202
1203
1204
1205 /*************************************************
1206 * Scan compiled branch for non-emptiness *
1207 *************************************************/
1208
1209 /* This function scans through a branch of a compiled pattern to see whether it
1210 can match the empty string or not. It is called only from could_be_empty()
1211 below. Note that first_significant_code() skips over assertions. If we hit an
1212 unclosed bracket, we return "empty" - this means we've struck an inner bracket
1213 whose current branch will already have been scanned.
1214
1215 Arguments:
1216 code points to start of search
1217 endcode points to where to stop
1218 utf8 TRUE if in UTF8 mode
1219
1220 Returns: TRUE if what is matched could be empty
1221 */
1222
1223 static BOOL
1224 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1225 {
1226 register int c;
1227 for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);
1228 code < endcode;
1229 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1230 {
1231 const uschar *ccode;
1232
1233 c = *code;
1234
1235 if (c >= OP_BRA)
1236 {
1237 BOOL empty_branch;
1238 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1239
1240 /* Scan a closed bracket */
1241
1242 empty_branch = FALSE;
1243 do
1244 {
1245 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1246 empty_branch = TRUE;
1247 code += GET(code, 1);
1248 }
1249 while (*code == OP_ALT);
1250 if (!empty_branch) return FALSE; /* All branches are non-empty */
1251 code += 1 + LINK_SIZE;
1252 c = *code;
1253 }
1254
1255 else switch (c)
1256 {
1257 /* Check for quantifiers after a class */
1258
1259 #ifdef SUPPORT_UTF8
1260 case OP_XCLASS:
1261 ccode = code + GET(code, 1);
1262 goto CHECK_CLASS_REPEAT;
1263 #endif
1264
1265 case OP_CLASS:
1266 case OP_NCLASS:
1267 ccode = code + 33;
1268
1269 #ifdef SUPPORT_UTF8
1270 CHECK_CLASS_REPEAT:
1271 #endif
1272
1273 switch (*ccode)
1274 {
1275 case OP_CRSTAR: /* These could be empty; continue */
1276 case OP_CRMINSTAR:
1277 case OP_CRQUERY:
1278 case OP_CRMINQUERY:
1279 break;
1280
1281 default: /* Non-repeat => class must match */
1282 case OP_CRPLUS: /* These repeats aren't empty */
1283 case OP_CRMINPLUS:
1284 return FALSE;
1285
1286 case OP_CRRANGE:
1287 case OP_CRMINRANGE:
1288 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1289 break;
1290 }
1291 break;
1292
1293 /* Opcodes that must match a character */
1294
1295 case OP_PROP:
1296 case OP_NOTPROP:
1297 case OP_EXTUNI:
1298 case OP_NOT_DIGIT:
1299 case OP_DIGIT:
1300 case OP_NOT_WHITESPACE:
1301 case OP_WHITESPACE:
1302 case OP_NOT_WORDCHAR:
1303 case OP_WORDCHAR:
1304 case OP_ANY:
1305 case OP_ANYBYTE:
1306 case OP_CHAR:
1307 case OP_CHARNC:
1308 case OP_NOT:
1309 case OP_PLUS:
1310 case OP_MINPLUS:
1311 case OP_EXACT:
1312 case OP_NOTPLUS:
1313 case OP_NOTMINPLUS:
1314 case OP_NOTEXACT:
1315 case OP_TYPEPLUS:
1316 case OP_TYPEMINPLUS:
1317 case OP_TYPEEXACT:
1318 return FALSE;
1319
1320 /* End of branch */
1321
1322 case OP_KET:
1323 case OP_KETRMAX:
1324 case OP_KETRMIN:
1325 case OP_ALT:
1326 return TRUE;
1327
1328 /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be
1329 followed by a multibyte character */
1330
1331 #ifdef SUPPORT_UTF8
1332 case OP_STAR:
1333 case OP_MINSTAR:
1334 case OP_QUERY:
1335 case OP_MINQUERY:
1336 case OP_UPTO:
1337 case OP_MINUPTO:
1338 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1339 break;
1340 #endif
1341 }
1342 }
1343
1344 return TRUE;
1345 }
1346
1347
1348
1349 /*************************************************
1350 * Scan compiled regex for non-emptiness *
1351 *************************************************/
1352
1353 /* This function is called to check for left recursive calls. We want to check
1354 the current branch of the current pattern to see if it could match the empty
1355 string. If it could, we must look outwards for branches at other levels,
1356 stopping when we pass beyond the bracket which is the subject of the recursion.
1357
1358 Arguments:
1359 code points to start of the recursion
1360 endcode points to where to stop (current RECURSE item)
1361 bcptr points to the chain of current (unclosed) branch starts
1362 utf8 TRUE if in UTF-8 mode
1363
1364 Returns: TRUE if what is matched could be empty
1365 */
1366
1367 static BOOL
1368 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1369 BOOL utf8)
1370 {
1371 while (bcptr != NULL && bcptr->current >= code)
1372 {
1373 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1374 bcptr = bcptr->outer;
1375 }
1376 return TRUE;
1377 }
1378
1379
1380
1381 /*************************************************
1382 * Check for POSIX class syntax *
1383 *************************************************/
1384
1385 /* This function is called when the sequence "[:" or "[." or "[=" is
1386 encountered in a character class. It checks whether this is followed by an
1387 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1388 ".]" or "=]".
1389
1390 Argument:
1391 ptr pointer to the initial [
1392 endptr where to return the end pointer
1393 cd pointer to compile data
1394
1395 Returns: TRUE or FALSE
1396 */
1397
1398 static BOOL
1399 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1400 {
1401 int terminator; /* Don't combine these lines; the Solaris cc */
1402 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1403 if (*(++ptr) == '^') ptr++;
1404 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1405 if (*ptr == terminator && ptr[1] == ']')
1406 {
1407 *endptr = ptr;
1408 return TRUE;
1409 }
1410 return FALSE;
1411 }
1412
1413
1414
1415
1416 /*************************************************
1417 * Check POSIX class name *
1418 *************************************************/
1419
1420 /* This function is called to check the name given in a POSIX-style class entry
1421 such as [:alnum:].
1422
1423 Arguments:
1424 ptr points to the first letter
1425 len the length of the name
1426
1427 Returns: a value representing the name, or -1 if unknown
1428 */
1429
1430 static int
1431 check_posix_name(const uschar *ptr, int len)
1432 {
1433 register int yield = 0;
1434 while (posix_name_lengths[yield] != 0)
1435 {
1436 if (len == posix_name_lengths[yield] &&
1437 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1438 yield++;
1439 }
1440 return -1;
1441 }
1442
1443
1444 /*************************************************
1445 * Adjust OP_RECURSE items in repeated group *
1446 *************************************************/
1447
1448 /* OP_RECURSE items contain an offset from the start of the regex to the group
1449 that is referenced. This means that groups can be replicated for fixed
1450 repetition simply by copying (because the recursion is allowed to refer to
1451 earlier groups that are outside the current group). However, when a group is
1452 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1453 it, after it has been compiled. This means that any OP_RECURSE items within it
1454 that refer to the group itself or any contained groups have to have their
1455 offsets adjusted. That is the job of this function. Before it is called, the
1456 partially compiled regex must be temporarily terminated with OP_END.
1457
1458 Arguments:
1459 group points to the start of the group
1460 adjust the amount by which the group is to be moved
1461 utf8 TRUE in UTF-8 mode
1462 cd contains pointers to tables etc.
1463
1464 Returns: nothing
1465 */
1466
1467 static void
1468 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)
1469 {
1470 uschar *ptr = group;
1471 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1472 {
1473 int offset = GET(ptr, 1);
1474 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1475 ptr += 1 + LINK_SIZE;
1476 }
1477 }
1478
1479
1480
1481 /*************************************************
1482 * Insert an automatic callout point *
1483 *************************************************/
1484
1485 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1486 callout points before each pattern item.
1487
1488 Arguments:
1489 code current code pointer
1490 ptr current pattern pointer
1491 cd pointers to tables etc
1492
1493 Returns: new code pointer
1494 */
1495
1496 static uschar *
1497 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1498 {
1499 *code++ = OP_CALLOUT;
1500 *code++ = 255;
1501 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1502 PUT(code, LINK_SIZE, 0); /* Default length */
1503 return code + 2*LINK_SIZE;
1504 }
1505
1506
1507
1508 /*************************************************
1509 * Complete a callout item *
1510 *************************************************/
1511
1512 /* A callout item contains the length of the next item in the pattern, which
1513 we can't fill in till after we have reached the relevant point. This is used
1514 for both automatic and manual callouts.
1515
1516 Arguments:
1517 previous_callout points to previous callout item
1518 ptr current pattern pointer
1519 cd pointers to tables etc
1520
1521 Returns: nothing
1522 */
1523
1524 static void
1525 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1526 {
1527 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1528 PUT(previous_callout, 2 + LINK_SIZE, length);
1529 }
1530
1531
1532
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* Given the start and end of a class range (UTF-8 mode with UCP support),
find the next run of consecutive characters within it whose "other case"
values are themselves consecutive. Each call yields one such run and moves the
start value on, so that repeated calls work through the whole range.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)
{
int ch = *cptr;
int first_other = -1;
int expected;

/* Find the first character in the range that has an other case */

while (ch <= d && (first_other = _pcre_ucp_othercase(ch)) < 0) ch++;
if (ch > d) return FALSE;

*ocptr = first_other;
expected = first_other + 1;

/* Extend the run while successive characters map to successive values */

ch++;
while (ch <= d && _pcre_ucp_othercase(ch) == expected)
  {
  ch++;
  expected++;
  }

*odptr = expected - 1;
*cptr = ch;

return TRUE;
}
#endif  /* SUPPORT_UCP */
1577
1578
1579 /*************************************************
1580 * Compile one branch *
1581 *************************************************/
1582
1583 /* Scan the pattern, compiling it into the code vector. If the options are
1584 changed during the branch, the pointer is used to change the external options
1585 bits.
1586
1587 Arguments:
1588 optionsptr pointer to the option bits
1589 brackets points to number of extracting brackets used
1590 codeptr points to the pointer to the current code point
1591 ptrptr points to the current pattern pointer
1592 errorcodeptr points to error code variable
1593 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
1594 reqbyteptr set to the last literal character required, else < 0
1595 bcptr points to current branch chain
1596 cd contains pointers to tables etc.
1597
1598 Returns: TRUE on success
1599 FALSE, with *errorcodeptr set non-zero on error
1600 */
1601
1602 static BOOL
1603 compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
1604 const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,
1605 int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
1606 {
1607 int repeat_type, op_type;
1608 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
1609 int bravalue = 0;
1610 int greedy_default, greedy_non_default;
1611 int firstbyte, reqbyte;
1612 int zeroreqbyte, zerofirstbyte;
1613 int req_caseopt, reqvary, tempreqvary;
1614 int options = *optionsptr;
1615 int after_manual_callout = 0;
1616 register int c;
1617 register uschar *code = *codeptr;
1618 uschar *tempcode;
1619 BOOL inescq = FALSE;
1620 BOOL groupsetfirstbyte = FALSE;
1621 const uschar *ptr = *ptrptr;
1622 const uschar *tempptr;
1623 uschar *previous = NULL;
1624 uschar *previous_callout = NULL;
1625 uschar classbits[32];
1626
1627 #ifdef SUPPORT_UTF8
1628 BOOL class_utf8;
1629 BOOL utf8 = (options & PCRE_UTF8) != 0;
1630 uschar *class_utf8data;
1631 uschar utf8_char[6];
1632 #else
1633 BOOL utf8 = FALSE;
1634 #endif
1635
1636 /* Set up the default and non-default settings for greediness */
1637
1638 greedy_default = ((options & PCRE_UNGREEDY) != 0);
1639 greedy_non_default = greedy_default ^ 1;
1640
1641 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
1642 matching encountered yet". It gets changed to REQ_NONE if we hit something that
1643 matches a non-fixed char first char; reqbyte just remains unset if we never
1644 find one.
1645
1646 When we hit a repeat whose minimum is zero, we may have to adjust these values
1647 to take the zero repeat into account. This is implemented by setting them to
1648 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
1649 item types that can be repeated set these backoff variables appropriately. */
1650
1651 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
1652
1653 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
1654 according to the current setting of the caseless flag. REQ_CASELESS is a bit
1655 value > 255. It is added into the firstbyte or reqbyte variables to record the
1656 case status of the value. This is used only for ASCII characters. */
1657
1658 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
1659
1660 /* Switch on next character until the end of the branch */
1661
1662 for (;; ptr++)
1663 {
1664 BOOL negate_class;
1665 BOOL possessive_quantifier;
1666 BOOL is_quantifier;
1667 int class_charcount;
1668 int class_lastchar;
1669 int newoptions;
1670 int recno;
1671 int skipbytes;
1672 int subreqbyte;
1673 int subfirstbyte;
1674 int mclength;
1675 uschar mcbuffer[8];
1676
1677 /* Next byte in the pattern */
1678
1679 c = *ptr;
1680
1681 /* If in \Q...\E, check for the end; if not, we have a literal */
1682
1683 if (inescq && c != 0)
1684 {
1685 if (c == '\\' && ptr[1] == 'E')
1686 {
1687 inescq = FALSE;
1688 ptr++;
1689 continue;
1690 }
1691 else
1692 {
1693 if (previous_callout != NULL)
1694 {
1695 complete_callout(previous_callout, ptr, cd);
1696 previous_callout = NULL;
1697 }
1698 if ((options & PCRE_AUTO_CALLOUT) != 0)
1699 {
1700 previous_callout = code;
1701 code = auto_callout(code, ptr, cd);
1702 }
1703 goto NORMAL_CHAR;
1704 }
1705 }
1706
1707 /* Fill in length of a previous callout, except when the next thing is
1708 a quantifier. */
1709
1710 is_quantifier = c == '*' || c == '+' || c == '?' ||
1711 (c == '{' && is_counted_repeat(ptr+1));
1712
1713 if (!is_quantifier && previous_callout != NULL &&
1714 after_manual_callout-- <= 0)
1715 {
1716 complete_callout(previous_callout, ptr, cd);
1717 previous_callout = NULL;
1718 }
1719
1720 /* In extended mode, skip white space and comments */
1721
1722 if ((options & PCRE_EXTENDED) != 0)
1723 {
1724 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1725 if (c == '#')
1726 {
1727 while (*(++ptr) != 0) if (IS_NEWLINE(ptr)) break;
1728 if (*ptr != 0)
1729 {
1730 ptr += cd->nllen - 1;
1731 continue;
1732 }
1733 /* Else fall through to handle end of string */
1734 c = 0;
1735 }
1736 }
1737
1738 /* No auto callout for quantifiers. */
1739
1740 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
1741 {
1742 previous_callout = code;
1743 code = auto_callout(code, ptr, cd);
1744 }
1745
1746 switch(c)
1747 {
1748 /* The branch terminates at end of string, |, or ). */
1749
1750 case 0:
1751 case '|':
1752 case ')':
1753 *firstbyteptr = firstbyte;
1754 *reqbyteptr = reqbyte;
1755 *codeptr = code;
1756 *ptrptr = ptr;
1757 return TRUE;
1758
1759 /* Handle single-character metacharacters. In multiline mode, ^ disables
1760 the setting of any following char as a first character. */
1761
1762 case '^':
1763 if ((options & PCRE_MULTILINE) != 0)
1764 {
1765 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1766 }
1767 previous = NULL;
1768 *code++ = OP_CIRC;
1769 break;
1770
1771 case '$':
1772 previous = NULL;
1773 *code++ = OP_DOLL;
1774 break;
1775
1776 /* There can never be a first char if '.' is first, whatever happens about
1777 repeats. The value of reqbyte doesn't change either. */
1778
1779 case '.':
1780 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1781 zerofirstbyte = firstbyte;
1782 zeroreqbyte = reqbyte;
1783 previous = code;
1784 *code++ = OP_ANY;
1785 break;
1786
1787 /* Character classes. If the included characters are all < 256, we build a
1788 32-byte bitmap of the permitted characters, except in the special case
1789 where there is only one such character. For negated classes, we build the
1790 map as usual, then invert it at the end. However, we use a different opcode
1791 so that data characters > 255 can be handled correctly.
1792
1793 If the class contains characters outside the 0-255 range, a different
1794 opcode is compiled. It may optionally have a bit map for characters < 256,
1795 but those above are explicitly listed afterwards. A flag byte tells
1796 whether the bitmap is present, and whether this is a negated class or not.
1797 */
1798
1799 case '[':
1800 previous = code;
1801
1802 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
1803 they are encountered at the top level, so we'll do that too. */
1804
1805 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1806 check_posix_syntax(ptr, &tempptr, cd))
1807 {
1808 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
1809 goto FAILED;
1810 }
1811
1812 /* If the first character is '^', set the negation flag and skip it. */
1813
1814 if ((c = *(++ptr)) == '^')
1815 {
1816 negate_class = TRUE;
1817 c = *(++ptr);
1818 }
1819 else
1820 {
1821 negate_class = FALSE;
1822 }
1823
1824 /* Keep a count of chars with values < 256 so that we can optimize the case
1825 of just a single character (as long as it's < 256). For higher valued UTF-8
1826 characters, we don't yet do any optimization. */
1827
1828 class_charcount = 0;
1829 class_lastchar = -1;
1830
1831 #ifdef SUPPORT_UTF8
1832 class_utf8 = FALSE; /* No chars >= 256 */
1833 class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
1834 #endif
1835
1836 /* Initialize the 32-char bit map to all zeros. We have to build the
1837 map in a temporary bit of store, in case the class contains only 1
1838 character (< 256), because in that case the compiled code doesn't use the
1839 bit map. */
1840
1841 memset(classbits, 0, 32 * sizeof(uschar));
1842
1843 /* Process characters until ] is reached. By writing this as a "do" it
1844 means that an initial ] is taken as a data character. The first pass
1845 through the regex checked the overall syntax, so we don't need to be very
1846 strict here. At the start of the loop, c contains the first byte of the
1847 character. */
1848
1849 do
1850 {
1851 #ifdef SUPPORT_UTF8
1852 if (utf8 && c > 127)
1853 { /* Braces are required because the */
1854 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
1855 }
1856 #endif
1857
1858 /* Inside \Q...\E everything is literal except \E */
1859
1860 if (inescq)
1861 {
1862 if (c == '\\' && ptr[1] == 'E')
1863 {
1864 inescq = FALSE;
1865 ptr++;
1866 continue;
1867 }
1868 else goto LONE_SINGLE_CHARACTER;
1869 }
1870
1871 /* Handle POSIX class names. Perl allows a negation extension of the
1872 form [:^name:]. A square bracket that doesn't match the syntax is
1873 treated as a literal. We also recognize the POSIX constructions
1874 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1875 5.6 and 5.8 do. */
1876
1877 if (c == '[' &&
1878 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1879 check_posix_syntax(ptr, &tempptr, cd))
1880 {
1881 BOOL local_negate = FALSE;
1882 int posix_class, taboffset, tabopt;
1883 register const uschar *cbits = cd->cbits;
1884 uschar pbits[32];
1885
1886 if (ptr[1] != ':')
1887 {
1888 *errorcodeptr = ERR31;
1889 goto FAILED;
1890 }
1891
1892 ptr += 2;
1893 if (*ptr == '^')
1894 {
1895 local_negate = TRUE;
1896 ptr++;
1897 }
1898
1899 posix_class = check_posix_name(ptr, tempptr - ptr);
1900 if (posix_class < 0)
1901 {
1902 *errorcodeptr = ERR30;
1903 goto FAILED;
1904 }
1905
1906 /* If matching is caseless, upper and lower are converted to
1907 alpha. This relies on the fact that the class table starts with
1908 alpha, lower, upper as the first 3 entries. */
1909
1910 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1911 posix_class = 0;
1912
1913 /* We build the bit map for the POSIX class in a chunk of local store
1914 because we may be adding and subtracting from it, and we don't want to
1915 subtract bits that may be in the main map already. At the end we or the
1916 result into the bit map that is being built. */
1917
1918 posix_class *= 3;
1919
1920 /* Copy in the first table (always present) */
1921
1922 memcpy(pbits, cbits + posix_class_maps[posix_class],
1923 32 * sizeof(uschar));
1924
1925 /* If there is a second table, add or remove it as required. */
1926
1927 taboffset = posix_class_maps[posix_class + 1];
1928 tabopt = posix_class_maps[posix_class + 2];
1929
1930 if (taboffset >= 0)
1931 {
1932 if (tabopt >= 0)
1933 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
1934 else
1935 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
1936 }
1937
1938 /* Now see if we need to remove any special characters. An option
1939 value of 1 removes vertical space and 2 removes underscore. */
1940
1941 if (tabopt < 0) tabopt = -tabopt;
1942 if (tabopt == 1) pbits[1] &= ~0x3c;
1943 else if (tabopt == 2) pbits[11] &= 0x7f;
1944
1945 /* Add the POSIX table or its complement into the main table that is
1946 being built and we are done. */
1947
1948 if (local_negate)
1949 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
1950 else
1951 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
1952
1953 ptr = tempptr + 1;
1954 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
1955 continue; /* End of POSIX syntax handling */
1956 }
1957
1958 /* Backslash may introduce a single character, or it may introduce one
1959 of the specials, which just set a flag. Escaped items are checked for
1960 validity in the pre-compiling pass. The sequence \b is a special case.
1961 Inside a class (and only there) it is treated as backspace. Elsewhere
1962 it marks a word boundary. Other escapes have preset maps ready to
1963 or into the one we are building. We assume they have more than one
1964 character in them, so set class_charcount bigger than one. */
1965
1966 if (c == '\\')
1967 {
1968 c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);
1969
1970 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
1971 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
1972 else if (-c == ESC_Q) /* Handle start of quoted string */
1973 {
1974 if (ptr[1] == '\\' && ptr[2] == 'E')
1975 {
1976 ptr += 2; /* avoid empty string */
1977 }
1978 else inescq = TRUE;
1979 continue;
1980 }
1981
1982 if (c < 0)
1983 {
1984 register const uschar *cbits = cd->cbits;
1985 class_charcount += 2; /* Greater than 1 is what matters */
1986 switch (-c)
1987 {
1988 case ESC_d:
1989 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
1990 continue;
1991
1992 case ESC_D:
1993 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
1994 continue;
1995
1996 case ESC_w:
1997 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
1998 continue;
1999
2000 case ESC_W:
2001 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2002 continue;
2003
2004 case ESC_s:
2005 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2006 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2007 continue;
2008
2009 case ESC_S:
2010 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2011 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2012 continue;
2013
2014 #ifdef SUPPORT_UCP
2015 case ESC_p:
2016 case ESC_P:
2017 {
2018 BOOL negated;
2019 int pdata;
2020 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2021 if (ptype < 0) goto FAILED;
2022 class_utf8 = TRUE;
2023 *class_utf8data++ = ((-c == ESC_p) != negated)?
2024 XCL_PROP : XCL_NOTPROP;
2025 *class_utf8data++ = ptype;
2026 *class_utf8data++ = pdata;
2027 class_charcount -= 2; /* Not a < 256 character */
2028 }
2029 continue;
2030 #endif
2031
2032 /* Unrecognized escapes are faulted if PCRE is running in its
2033 strict mode. By default, for compatibility with Perl, they are
2034 treated as literals. */
2035
2036 default:
2037 if ((options & PCRE_EXTRA) != 0)
2038 {
2039 *errorcodeptr = ERR7;
2040 goto FAILED;
2041 }
2042 c = *ptr; /* The final character */
2043 class_charcount -= 2; /* Undo the default count from above */
2044 }
2045 }
2046
2047 /* Fall through if we have a single character (c >= 0). This may be
2048 > 256 in UTF-8 mode. */
2049
2050 } /* End of backslash handling */
2051
2052 /* A single character may be followed by '-' to form a range. However,
2053 Perl does not permit ']' to be the end of the range. A '-' character
2054 here is treated as a literal. */
2055
2056 if (ptr[1] == '-' && ptr[2] != ']')
2057 {
2058 int d;
2059 ptr += 2;
2060
2061 #ifdef SUPPORT_UTF8
2062 if (utf8)
2063 { /* Braces are required because the */
2064 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2065 }
2066 else
2067 #endif
2068 d = *ptr; /* Not UTF-8 mode */
2069
2070 /* The second part of a range can be a single-character escape, but
2071 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2072 in such circumstances. */
2073
2074 if (d == '\\')
2075 {
2076 const uschar *oldptr = ptr;
2077 d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);
2078
2079 /* \b is backspace; \X is literal X; any other special means the '-'
2080 was literal */
2081
2082 if (d < 0)
2083 {
2084 if (d == -ESC_b) d = '\b';
2085 else if (d == -ESC_X) d = 'X'; else
2086 {
2087 ptr = oldptr - 2;
2088 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2089 }
2090 }
2091 }
2092
2093 /* The check that the two values are in the correct order happens in
2094 the pre-pass. Optimize one-character ranges */
2095
2096 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2097
2098 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2099 matching, we have to use an XCLASS with extra data items. Caseless
2100 matching for characters > 127 is available only if UCP support is
2101 available. */
2102
2103 #ifdef SUPPORT_UTF8
2104 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2105 {
2106 class_utf8 = TRUE;
2107
2108 /* With UCP support, we can find the other case equivalents of
2109 the relevant characters. There may be several ranges. Optimize how
2110 they fit with the basic range. */
2111
2112 #ifdef SUPPORT_UCP
2113 if ((options & PCRE_CASELESS) != 0)
2114 {
2115 int occ, ocd;
2116 int cc = c;
2117 int origd = d;
2118 while (get_othercase_range(&cc, origd, &occ, &ocd))
2119 {
2120 if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
2121
2122 if (occ < c && ocd >= c - 1) /* Extend the basic range */
2123 { /* if there is overlap, */
2124 c = occ; /* noting that if occ < c */
2125 continue; /* we can't have ocd > d */
2126 } /* because a subrange is */
2127 if (ocd > d && occ <= d + 1) /* always shorter than */
2128 { /* the basic range. */
2129 d = ocd;
2130 continue;
2131 }
2132
2133 if (occ == ocd)
2134 {
2135 *class_utf8data++ = XCL_SINGLE;
2136 }
2137 else
2138 {
2139 *class_utf8data++ = XCL_RANGE;
2140 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2141 }
2142 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2143 }
2144 }
2145 #endif /* SUPPORT_UCP */
2146
2147 /* Now record the original range, possibly modified for UCP caseless
2148 overlapping ranges. */
2149
2150 *class_utf8data++ = XCL_RANGE;
2151 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2152 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2153
2154 /* With UCP support, we are done. Without UCP support, there is no
2155 caseless matching for UTF-8 characters > 127; we can use the bit map
2156 for the smaller ones. */
2157
2158 #ifdef SUPPORT_UCP
2159 continue; /* With next character in the class */
2160 #else
2161 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2162
2163 /* Adjust upper limit and fall through to set up the map */
2164
2165 d = 127;
2166
2167 #endif /* SUPPORT_UCP */
2168 }
2169 #endif /* SUPPORT_UTF8 */
2170
2171 /* We use the bit map for all cases when not in UTF-8 mode; else
2172 ranges that lie entirely within 0-127 when there is UCP support; else
2173 for partial ranges without UCP support. */
2174
2175 for (; c <= d; c++)
2176 {
2177 classbits[c/8] |= (1 << (c&7));
2178 if ((options & PCRE_CASELESS) != 0)
2179 {
2180 int uc = cd->fcc[c]; /* flip case */
2181 classbits[uc/8] |= (1 << (uc&7));
2182 }
2183 class_charcount++; /* in case a one-char range */
2184 class_lastchar = c;
2185 }
2186
2187 continue; /* Go get the next char in the class */
2188 }
2189
2190 /* Handle a lone single character - we can get here for a normal
2191 non-escape char, or after \ that introduces a single character or for an
2192 apparent range that isn't. */
2193
2194 LONE_SINGLE_CHARACTER:
2195
2196 /* Handle a character that cannot go in the bit map */
2197
2198 #ifdef SUPPORT_UTF8
2199 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2200 {
2201 class_utf8 = TRUE;
2202 *class_utf8data++ = XCL_SINGLE;
2203 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2204
2205 #ifdef SUPPORT_UCP
2206 if ((options & PCRE_CASELESS) != 0)
2207 {
2208 int othercase;
2209 if ((othercase = _pcre_ucp_othercase(c)) >= 0)
2210 {
2211 *class_utf8data++ = XCL_SINGLE;
2212 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
2213 }
2214 }
2215 #endif /* SUPPORT_UCP */
2216
2217 }
2218 else
2219 #endif /* SUPPORT_UTF8 */
2220
2221 /* Handle a single-byte character */
2222 {
2223 classbits[c/8] |= (1 << (c&7));
2224 if ((options & PCRE_CASELESS) != 0)
2225 {
2226 c = cd->fcc[c]; /* flip case */
2227 classbits[c/8] |= (1 << (c&7));
2228 }
2229 class_charcount++;
2230 class_lastchar = c;
2231 }
2232 }
2233
2234 /* Loop until ']' reached; the check for end of string happens inside the
2235 loop. This "while" is the end of the "do" above. */
2236
2237 while ((c = *(++ptr)) != ']' || inescq);
2238
2239 /* If class_charcount is 1, we saw precisely one character whose value is
2240 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2241 can optimize the negative case only if there were no characters >= 128
2242 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2243 single-bytes only. This is an historical hangover. Maybe one day we can
2244 tidy these opcodes to handle multi-byte characters.
2245
2246 The optimization throws away the bit map. We turn the item into a
2247 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2248 that OP_NOT does not support multibyte characters. In the positive case, it
2249 can cause firstbyte to be set. Otherwise, there can be no first char if
2250 this item is first, whatever repeat count may follow. In the case of
2251 reqbyte, save the previous value for reinstating. */
2252
2253 #ifdef SUPPORT_UTF8
2254 if (class_charcount == 1 &&
2255 (!utf8 ||
2256 (!class_utf8 && (!negate_class || class_lastchar < 128))))
2257
2258 #else
2259 if (class_charcount == 1)
2260 #endif
2261 {
2262 zeroreqbyte = reqbyte;
2263
2264 /* The OP_NOT opcode works on one-byte characters only. */
2265
2266 if (negate_class)
2267 {
2268 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2269 zerofirstbyte = firstbyte;
2270 *code++ = OP_NOT;
2271 *code++ = class_lastchar;
2272 break;
2273 }
2274
2275 /* For a single, positive character, get the value into mcbuffer, and
2276 then we can handle this with the normal one-character code. */
2277
2278 #ifdef SUPPORT_UTF8
2279 if (utf8 && class_lastchar > 127)
2280 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
2281 else
2282 #endif
2283 {
2284 mcbuffer[0] = class_lastchar;
2285 mclength = 1;
2286 }
2287 goto ONE_CHAR;
2288 } /* End of 1-char optimization */
2289
2290 /* The general case - not the one-char optimization. If this is the first
2291 thing in the branch, there can be no first char setting, whatever the
2292 repeat count. Any reqbyte setting must remain unchanged after any kind of
2293 repeat. */
2294
2295 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2296 zerofirstbyte = firstbyte;
2297 zeroreqbyte = reqbyte;
2298
2299 /* If there are characters with values > 255, we have to compile an
2300 extended class, with its own opcode. If there are no characters < 256,
2301 we can omit the bitmap. */
2302
2303 #ifdef SUPPORT_UTF8
2304 if (class_utf8)
2305 {
2306 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2307 *code++ = OP_XCLASS;
2308 code += LINK_SIZE;
2309 *code = negate_class? XCL_NOT : 0;
2310
2311 /* If the map is required, install it, and move on to the end of
2312 the extra data */
2313
2314 if (class_charcount > 0)
2315 {
2316 *code++ |= XCL_MAP;
2317 memcpy(code, classbits, 32);
2318 code = class_utf8data;
2319 }
2320
2321 /* If the map is not required, slide down the extra data. */
2322
2323 else
2324 {
2325 int len = class_utf8data - (code + 33);
2326 memmove(code + 1, code + 33, len);
2327 code += len + 1;
2328 }
2329
2330 /* Now fill in the complete length of the item */
2331
2332 PUT(previous, 1, code - previous);
2333 break; /* End of class handling */
2334 }
2335 #endif
2336
2337 /* If there are no characters > 255, negate the 32-byte map if necessary,
2338 and copy it into the code vector. If this is the first thing in the branch,
2339 there can be no first char setting, whatever the repeat count. Any reqbyte
2340 setting must remain unchanged after any kind of repeat. */
2341
2342 if (negate_class)
2343 {
2344 *code++ = OP_NCLASS;
2345 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2346 }
2347 else
2348 {
2349 *code++ = OP_CLASS;
2350 memcpy(code, classbits, 32);
2351 }
2352 code += 32;
2353 break;
2354
2355 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2356 has been tested above. */
2357
2358 case '{':
2359 if (!is_quantifier) goto NORMAL_CHAR;
2360 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
2361 if (*errorcodeptr != 0) goto FAILED;
2362 goto REPEAT;
2363
2364 case '*':
2365 repeat_min = 0;
2366 repeat_max = -1;
2367 goto REPEAT;
2368
2369 case '+':
2370 repeat_min = 1;
2371 repeat_max = -1;
2372 goto REPEAT;
2373
2374 case '?':
2375 repeat_min = 0;
2376 repeat_max = 1;
2377
2378 REPEAT:
2379 if (previous == NULL)
2380 {
2381 *errorcodeptr = ERR9;
2382 goto FAILED;
2383 }
2384
2385 if (repeat_min == 0)
2386 {
2387 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2388 reqbyte = zeroreqbyte; /* Ditto */
2389 }
2390
2391 /* Remember whether this is a variable length repeat */
2392
2393 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2394
2395 op_type = 0; /* Default single-char op codes */
2396 possessive_quantifier = FALSE; /* Default not possessive quantifier */
2397
2398 /* Save start of previous item, in case we have to move it up to make space
2399 for an inserted OP_ONCE for the additional '+' extension. */
2400
2401 tempcode = previous;
2402
2403 /* If the next character is '+', we have a possessive quantifier. This
2404 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2405 If the next character is '?' this is a minimizing repeat, by default,
2406 but if PCRE_UNGREEDY is set, it works the other way round. We change the
2407 repeat type to the non-default. */
2408
2409 if (ptr[1] == '+')
2410 {
2411 repeat_type = 0; /* Force greedy */
2412 possessive_quantifier = TRUE;
2413 ptr++;
2414 }
2415 else if (ptr[1] == '?')
2416 {
2417 repeat_type = greedy_non_default;
2418 ptr++;
2419 }
2420 else repeat_type = greedy_default;
2421
2422 /* If previous was a recursion, we need to wrap it inside brackets so that
2423 it can be replicated if necessary. */
2424
2425 if (*previous == OP_RECURSE)
2426 {
2427 memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2428 code += 1 + LINK_SIZE;
2429 *previous = OP_BRA;
2430 PUT(previous, 1, code - previous);
2431 *code = OP_KET;
2432 PUT(code, 1, code - previous);
2433 code += 1 + LINK_SIZE;
2434 }
2435
2436 /* If previous was a character match, abolish the item and generate a
2437 repeat item instead. If a char item has a minimum of more than one, ensure
2438 that it is set in reqbyte - it might not be if a sequence such as x{3} is
2439 the first thing in a branch because the x will have gone into firstbyte
2440 instead. */
2441
2442 if (*previous == OP_CHAR || *previous == OP_CHARNC)
2443 {
2444 /* Deal with UTF-8 characters that take up more than one byte. It's
2445 easier to write this out separately than try to macrify it. Use c to
2446 hold the length of the character in bytes, plus 0x80 to flag that it's a
2447 length rather than a small character. */
2448
2449 #ifdef SUPPORT_UTF8
2450 if (utf8 && (code[-1] & 0x80) != 0)
2451 {
2452 uschar *lastchar = code - 1;
2453 while((*lastchar & 0xc0) == 0x80) lastchar--;
2454 c = code - lastchar; /* Length of UTF-8 character */
2455 memcpy(utf8_char, lastchar, c); /* Save the char */
2456 c |= 0x80; /* Flag c as a length */
2457 }
2458 else
2459 #endif
2460
2461 /* Handle the case of a single byte - either with no UTF8 support, or
2462 with UTF-8 disabled, or for a UTF-8 character < 128. */
2463
2464 {
2465 c = code[-1];
2466 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2467 }
2468
2469 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
2470 }
2471
2472 /* If previous was a single negated character ([^a] or similar), we use
2473 one of the special opcodes, replacing it. The code is shared with single-
2474 character repeats by setting op_type to add a suitable offset into
2475 repeat_type. OP_NOT is currently used only for single-byte chars. */
2476
2477 else if (*previous == OP_NOT)
2478 {
2479 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
2480 c = previous[1];
2481 goto OUTPUT_SINGLE_REPEAT;
2482 }
2483
2484 /* If previous was a character type match (\d or similar), abolish it and
2485 create a suitable repeat item. The code is shared with single-character
2486 repeats by setting op_type to add a suitable offset into repeat_type. Note
2487 that the Unicode property types will be present only when SUPPORT_UCP is
2488 defined, but we don't wrap the little bits of code here because it just
2489 makes it horribly messy. */
2490
2491 else if (*previous < OP_EODN)
2492 {
2493 uschar *oldcode;
2494 int prop_type, prop_value;
2495 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
2496 c = *previous;
2497
2498 OUTPUT_SINGLE_REPEAT:
2499 if (*previous == OP_PROP || *previous == OP_NOTPROP)
2500 {
2501 prop_type = previous[1];
2502 prop_value = previous[2];
2503 }
2504 else prop_type = prop_value = -1;
2505
2506 oldcode = code;
2507 code = previous; /* Usually overwrite previous item */
2508
2509 /* If the maximum is zero then the minimum must also be zero; Perl allows
2510 this case, so we do too - by simply omitting the item altogether. */
2511
2512 if (repeat_max == 0) goto END_REPEAT;
2513
2514 /* All real repeats make it impossible to handle partial matching (maybe
2515 one day we will be able to remove this restriction). */
2516
2517 if (repeat_max != 1) cd->nopartial = TRUE;
2518
2519 /* Combine the op_type with the repeat_type */
2520
2521 repeat_type += op_type;
2522
2523 /* A minimum of zero is handled either as the special case * or ?, or as
2524 an UPTO, with the maximum given. */
2525
2526 if (repeat_min == 0)
2527 {
2528 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
2529 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
2530 else
2531 {
2532 *code++ = OP_UPTO + repeat_type;
2533 PUT2INC(code, 0, repeat_max);
2534 }
2535 }
2536
2537 /* A repeat minimum of 1 is optimized into some special cases. If the
2538 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
2539 left in place and, if the maximum is greater than 1, we use OP_UPTO with
2540 one less than the maximum. */
2541
2542 else if (repeat_min == 1)
2543 {
2544 if (repeat_max == -1)
2545 *code++ = OP_PLUS + repeat_type;
2546 else
2547 {
2548 code = oldcode; /* leave previous item in place */
2549 if (repeat_max == 1) goto END_REPEAT;
2550 *code++ = OP_UPTO + repeat_type;
2551 PUT2INC(code, 0, repeat_max - 1);
2552 }
2553 }
2554
2555 /* The case {n,n} is just an EXACT, while the general case {n,m} is
2556 handled as an EXACT followed by an UPTO. */
2557
2558 else
2559 {
2560 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
2561 PUT2INC(code, 0, repeat_min);
2562
2563 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
2564 we have to insert the character for the previous code. For a repeated
2565 Unicode property match, there are two extra bytes that define the
2566 required property. In UTF-8 mode, long characters have their length in
2567 c, with the 0x80 bit as a flag. */
2568
2569 if (repeat_max < 0)
2570 {
2571 #ifdef SUPPORT_UTF8
2572 if (utf8 && c >= 128)
2573 {
2574 memcpy(code, utf8_char, c & 7);
2575 code += c & 7;
2576 }
2577 else
2578 #endif
2579 {
2580 *code++ = c;
2581 if (prop_type >= 0)
2582 {
2583 *code++ = prop_type;
2584 *code++ = prop_value;
2585 }
2586 }
2587 *code++ = OP_STAR + repeat_type;
2588 }
2589
2590 /* Else insert an UPTO if the max is greater than the min, again
2591 preceded by the character, for the previously inserted code. */
2592
2593 else if (repeat_max != repeat_min)
2594 {
2595 #ifdef SUPPORT_UTF8
2596 if (utf8 && c >= 128)
2597 {
2598 memcpy(code, utf8_char, c & 7);
2599 code += c & 7;
2600 }
2601 else
2602 #endif
2603 *code++ = c;
2604 if (prop_type >= 0)
2605 {
2606 *code++ = prop_type;
2607 *code++ = prop_value;
2608 }
2609 repeat_max -= repeat_min;
2610 *code++ = OP_UPTO + repeat_type;
2611 PUT2INC(code, 0, repeat_max);
2612 }
2613 }
2614
2615 /* The character or character type itself comes last in all cases. */
2616
2617 #ifdef SUPPORT_UTF8
2618 if (utf8 && c >= 128)
2619 {
2620 memcpy(code, utf8_char, c & 7);
2621 code += c & 7;
2622 }
2623 else
2624 #endif
2625 *code++ = c;
2626
2627 /* For a repeated Unicode property match, there are two extra bytes that
2628 define the required property. */
2629
2630 #ifdef SUPPORT_UCP
2631 if (prop_type >= 0)
2632 {
2633 *code++ = prop_type;
2634 *code++ = prop_value;
2635 }
2636 #endif
2637 }
2638
2639 /* If previous was a character class or a back reference, we put the repeat
2640 stuff after it, but just skip the item if the repeat was {0,0}. */
2641
2642 else if (*previous == OP_CLASS ||
2643 *previous == OP_NCLASS ||
2644 #ifdef SUPPORT_UTF8
2645 *previous == OP_XCLASS ||
2646 #endif
2647 *previous == OP_REF)
2648 {
2649 if (repeat_max == 0)
2650 {
2651 code = previous;
2652 goto END_REPEAT;
2653 }
2654
2655 /* All real repeats make it impossible to handle partial matching (maybe
2656 one day we will be able to remove this restriction). */
2657
2658 if (repeat_max != 1) cd->nopartial = TRUE;
2659
2660 if (repeat_min == 0 && repeat_max == -1)
2661 *code++ = OP_CRSTAR + repeat_type;
2662 else if (repeat_min == 1 && repeat_max == -1)
2663 *code++ = OP_CRPLUS + repeat_type;
2664 else if (repeat_min == 0 && repeat_max == 1)
2665 *code++ = OP_CRQUERY + repeat_type;
2666 else
2667 {
2668 *code++ = OP_CRRANGE + repeat_type;
2669 PUT2INC(code, 0, repeat_min);
2670 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
2671 PUT2INC(code, 0, repeat_max);
2672 }
2673 }
2674
2675 /* If previous was a bracket group, we may have to replicate it in certain
2676 cases. */
2677
2678 else if (*previous >= OP_BRA || *previous == OP_ONCE ||
2679 *previous == OP_COND)
2680 {
2681 register int i;
2682 int ketoffset = 0;
2683 int len = code - previous;
2684 uschar *bralink = NULL;
2685
2686 /* If the maximum repeat count is unlimited, find the end of the bracket
2687 by scanning through from the start, and compute the offset back to it
2688 from the current code pointer. There may be an OP_OPT setting following
2689 the final KET, so we can't find the end just by going back from the code
2690 pointer. */
2691
2692 if (repeat_max == -1)
2693 {
2694 register uschar *ket = previous;
2695 do ket += GET(ket, 1); while (*ket != OP_KET);
2696 ketoffset = code - ket;
2697 }
2698
2699 /* The case of a zero minimum is special because of the need to stick
2700 OP_BRAZERO in front of it, and because the group appears once in the
2701 data, whereas in other cases it appears the minimum number of times. For
2702 this reason, it is simplest to treat this case separately, as otherwise
2703 the code gets far too messy. There are several special subcases when the
2704 minimum is zero. */
2705
2706 if (repeat_min == 0)
2707 {
2708 /* If the maximum is also zero, we just omit the group from the output
2709 altogether. */
2710
2711 if (repeat_max == 0)
2712 {
2713 code = previous;
2714 goto END_REPEAT;
2715 }
2716
2717 /* If the maximum is 1 or unlimited, we just have to stick in the
2718 BRAZERO and do no more at this point. However, we do need to adjust
2719 any OP_RECURSE calls inside the group that refer to the group itself or
2720 any internal group, because the offset is from the start of the whole
2721 regex. Temporarily terminate the pattern while doing this. */
2722
2723 if (repeat_max <= 1)
2724 {
2725 *code = OP_END;
2726 adjust_recurse(previous, 1, utf8, cd);
2727 memmove(previous+1, previous, len);
2728 code++;
2729 *previous++ = OP_BRAZERO + repeat_type;
2730 }
2731
2732 /* If the maximum is greater than 1 and limited, we have to replicate
2733 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
2734 The first one has to be handled carefully because it's the original
2735 copy, which has to be moved up. The remainder can be handled by code
2736 that is common with the non-zero minimum case below. We have to
2737 adjust the value of repeat_max, since one less copy is required. Once
2738 again, we may have to adjust any OP_RECURSE calls inside the group. */
2739
2740 else
2741 {
2742 int offset;
2743 *code = OP_END;
2744 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);
2745 memmove(previous + 2 + LINK_SIZE, previous, len);
2746 code += 2 + LINK_SIZE;
2747 *previous++ = OP_BRAZERO + repeat_type;
2748 *previous++ = OP_BRA;
2749
2750 /* We chain together the bracket offset fields that have to be
2751 filled in later when the ends of the brackets are reached. */
2752
2753 offset = (bralink == NULL)? 0 : previous - bralink;
2754 bralink = previous;
2755 PUTINC(previous, 0, offset);
2756 }
2757
2758 repeat_max--;
2759 }
2760
2761 /* If the minimum is greater than zero, replicate the group as many
2762 times as necessary, and adjust the maximum to the number of subsequent
2763 copies that we need. If we set a first char from the group, and didn't
2764 set a required char, copy the latter from the former. */
2765
2766 else
2767 {
2768 if (repeat_min > 1)
2769 {
2770 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
2771 for (i = 1; i < repeat_min; i++)
2772 {
2773 memcpy(code, previous, len);
2774 code += len;
2775 }
2776 }
2777 if (repeat_max > 0) repeat_max -= repeat_min;
2778 }
2779
2780 /* This code is common to both the zero and non-zero minimum cases. If
2781 the maximum is limited, it replicates the group in a nested fashion,
2782 remembering the bracket starts on a stack. In the case of a zero minimum,
2783 the first one was set up above. In all cases the repeat_max now specifies
2784 the number of additional copies needed. */
2785
2786 if (repeat_max >= 0)
2787 {
2788 for (i = repeat_max - 1; i >= 0; i--)
2789 {
2790 *code++ = OP_BRAZERO + repeat_type;
2791
2792 /* All but the final copy start a new nesting, maintaining the
2793 chain of brackets outstanding. */
2794
2795 if (i != 0)
2796 {
2797 int offset;
2798 *code++ = OP_BRA;
2799 offset = (bralink == NULL)? 0 : code - bralink;
2800 bralink = code;
2801 PUTINC(code, 0, offset);
2802 }
2803
2804 memcpy(code, previous, len);
2805 code += len;
2806 }
2807
2808 /* Now chain through the pending brackets, and fill in their length
2809 fields (which are holding the chain links pro tem). */
2810
2811 while (bralink != NULL)
2812 {
2813 int oldlinkoffset;
2814 int offset = code - bralink + 1;
2815 uschar *bra = code - offset;
2816 oldlinkoffset = GET(bra, 1);
2817 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
2818 *code++ = OP_KET;
2819 PUTINC(code, 0, offset);
2820 PUT(bra, 1, offset);
2821 }
2822 }
2823
2824 /* If the maximum is unlimited, set a repeater in the final copy. We
2825 can't just offset backwards from the current code point, because we
2826 don't know if there's been an options resetting after the ket. The
2827 correct offset was computed above. */
2828
2829 else code[-ketoffset] = OP_KETRMAX + repeat_type;
2830 }
2831
2832 /* Else there's some kind of shambles */
2833
2834 else
2835 {
2836 *errorcodeptr = ERR11;
2837 goto FAILED;
2838 }
2839
2840 /* If the character following a repeat is '+', we wrap the entire repeated
2841 item inside OP_ONCE brackets. This is just syntactic sugar, taken from
2842 Sun's Java package. The repeated item starts at tempcode, not at previous,
2843 which might be the first part of a string whose (former) last char we
2844 repeated. However, we don't support '+' after a greediness '?'. */
2845
2846 if (possessive_quantifier)
2847 {
2848 int len = code - tempcode;
2849 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
2850 code += 1 + LINK_SIZE;
2851 len += 1 + LINK_SIZE;
2852 tempcode[0] = OP_ONCE;
2853 *code++ = OP_KET;
2854 PUTINC(code, 0, len);
2855 PUT(tempcode, 1, len);
2856 }
2857
2858 /* In all cases we no longer have a previous item. We also set the
2859 "follows varying string" flag for subsequently encountered reqbytes if
2860 it isn't already set and we have just passed a varying length item. */
2861
2862 END_REPEAT:
2863 previous = NULL;
2864 cd->req_varyopt |= reqvary;
2865 break;
2866
2867
2868 /* Start of nested bracket sub-expression, or comment or lookahead or
2869 lookbehind or option setting or condition. First deal with special things
2870 that can come after a bracket; all are introduced by ?, and the appearance
2871 of any of them means that this is not a referencing group. They were
2872 checked for validity in the first pass over the string, so we don't have to
2873 check for syntax errors here. */
2874
2875 case '(':
2876 newoptions = options;
2877 skipbytes = 0;
2878
2879 if (*(++ptr) == '?')
2880 {
2881 int set, unset;
2882 int *optset;
2883
2884 switch (*(++ptr))
2885 {
2886 case '#': /* Comment; skip to ket */
2887 ptr++;
2888 while (*ptr != ')') ptr++;
2889 continue;
2890
2891 case ':': /* Non-extracting bracket */
2892 bravalue = OP_BRA;
2893 ptr++;
2894 break;
2895
2896 case '(':
2897 bravalue = OP_COND; /* Conditional group */
2898
2899 /* A condition can be a number, referring to a numbered group, a name,
2900 referring to a named group, 'R', referring to recursion, or an
2901 assertion. There are two unfortunate ambiguities, caused by history.
2902 (a) 'R' can be the recursive thing or the name 'R', and (b) a number
2903 could be a name that consists of digits. In both cases, we look for a
2904 name first; if not found, we try the other cases. If the first
2905 character after (?( is a word character, we know the rest up to ) will
2906 also be word characters because the syntax was checked in the first
2907 pass. */
2908
2909 if ((cd->ctypes[ptr[1]] & ctype_word) != 0)
2910 {
2911 int i, namelen;
2912 int condref = 0;
2913 const uschar *name;
2914 uschar *slot = cd->name_table;
2915
2916 /* This is needed for all successful cases. */
2917
2918 skipbytes = 3;
2919
2920 /* Read the name, but also get it as a number if it's all digits */
2921
2922 name = ++ptr;
2923 while (*ptr != ')')
2924 {
2925 if (condref >= 0)
2926 condref = ((digitab[*ptr] & ctype_digit) != 0)?
2927 condref * 10 + *ptr - '0' : -1;
2928 ptr++;
2929 }
2930 namelen = ptr - name;
2931 ptr++;
2932
2933 for (i = 0; i < cd->names_found; i++)
2934 {
2935 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
2936 slot += cd->name_entry_size;
2937 }
2938
2939 /* Found a previous named subpattern */
2940
2941 if (i < cd->names_found)
2942 {
2943 condref = GET2(slot, 0);
2944 code[1+LINK_SIZE] = OP_CREF;
2945 PUT2(code, 2+LINK_SIZE, condref);
2946 }
2947
2948 /* Search the pattern for a forward reference */
2949
2950 else if ((i = find_named_parens(ptr, *brackets, name, namelen)) > 0)
2951 {
2952 code[1+LINK_SIZE] = OP_CREF;
2953 PUT2(code, 2+LINK_SIZE, i);
2954 }
2955
2956 /* Check for 'R' for recursion */
2957
2958 else if (namelen == 1 && *name == 'R')
2959 {
2960 code[1+LINK_SIZE] = OP_CREF;
2961 PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
2962 }
2963
2964 /* Check for a subpattern number */
2965
2966 else if (condref > 0)
2967 {
2968 code[1+LINK_SIZE] = OP_CREF;
2969 PUT2(code, 2+LINK_SIZE, condref);
2970 }
2971
2972 /* Either an unidentified subpattern, or a reference to (?(0) */
2973
2974 else
2975 {
2976 *errorcodeptr = (condref == 0)? ERR35: ERR15;
2977 goto FAILED;
2978 }
2979 }
2980
2981 /* For conditions that are assertions, we just fall through, having
2982 set bravalue above. */
2983
2984 break;
2985
2986 case '=': /* Positive lookahead */
2987 bravalue = OP_ASSERT;
2988 ptr++;
2989 break;
2990
2991 case '!': /* Negative lookahead */
2992 bravalue = OP_ASSERT_NOT;
2993 ptr++;
2994 break;
2995
2996 case '<': /* Lookbehinds */
2997 switch (*(++ptr))
2998 {
2999 case '=': /* Positive lookbehind */
3000 bravalue = OP_ASSERTBACK;
3001 ptr++;
3002 break;
3003
3004 case '!': /* Negative lookbehind */
3005 bravalue = OP_ASSERTBACK_NOT;
3006 ptr++;
3007 break;
3008 }
3009 break;
3010
3011 case '>': /* One-time brackets */
3012 bravalue = OP_ONCE;
3013 ptr++;
3014 break;
3015
3016 case 'C': /* Callout - may be followed by digits; */
3017 previous_callout = code; /* Save for later completion */
3018 after_manual_callout = 1; /* Skip one item before completing */
3019 *code++ = OP_CALLOUT; /* Already checked that the terminating */
3020 { /* closing parenthesis is present. */
3021 int n = 0;
3022 while ((digitab[*(++ptr)] & ctype_digit) != 0)
3023 n = n * 10 + *ptr - '0';
3024 if (n > 255)
3025 {
3026 *errorcodeptr = ERR38;
3027 goto FAILED;
3028 }
3029 *code++ = n;
3030 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
3031 PUT(code, LINK_SIZE, 0); /* Default length */
3032 code += 2 * LINK_SIZE;
3033 }
3034 previous = NULL;
3035 continue;
3036
3037 case 'P': /* Named subpattern handling */
3038 if (*(++ptr) == '<') /* Definition */
3039 {
3040 int i, namelen;
3041 uschar *slot = cd->name_table;
3042 const uschar *name; /* Don't amalgamate; some compilers */
3043 name = ++ptr; /* grumble at autoincrement in declaration */
3044
3045 while (*ptr++ != '>');
3046 namelen = ptr - name - 1;
3047
3048 for (i = 0; i < cd->names_found; i++)
3049 {
3050 int crc = memcmp(name, slot+2, namelen);
3051 if (crc == 0)
3052 {
3053 if (slot[2+namelen] == 0)
3054 {
3055 if ((options & PCRE_DUPNAMES) == 0)
3056 {
3057 *errorcodeptr = ERR43;
3058 goto FAILED;
3059 }
3060 }
3061 else crc = -1; /* Current name is substring */
3062 }
3063 if (crc < 0)
3064 {
3065 memmove(slot + cd->name_entry_size, slot,
3066 (cd->names_found - i) * cd->name_entry_size);
3067 break;
3068 }
3069 slot += cd->name_entry_size;
3070 }
3071
3072 PUT2(slot, 0, *brackets + 1);
3073 memcpy(slot + 2, name, namelen);
3074 slot[2+namelen] = 0;
3075 cd->names_found++;
3076 goto NUMBERED_GROUP;
3077 }
3078
3079 if (*ptr == '=' || *ptr == '>') /* Reference or recursion */
3080 {
3081 int i, namelen;
3082 int type = *ptr++;
3083 const uschar *name = ptr;
3084 uschar *slot = cd->name_table;
3085
3086 while (*ptr != ')') ptr++;
3087 namelen = ptr - name;
3088
3089 for (i = 0; i < cd->names_found; i++)
3090 {
3091 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3092 slot += cd->name_entry_size;
3093 }
3094
3095 if (i < cd->names_found) /* Back reference */
3096 {
3097 recno = GET2(slot, 0);
3098 }
3099 else if ((recno = /* Forward back reference */
3100 find_named_parens(ptr, *brackets, name, namelen)) <= 0)
3101 {
3102 *errorcodeptr = ERR15;
3103 goto FAILED;
3104 }
3105
3106 if (type == '>') goto HANDLE_RECURSION; /* A few lines below */
3107
3108 /* Back reference */
3109
3110 previous = code;
3111 *code++ = OP_REF;
3112 PUT2INC(code, 0, recno);
3113 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
3114 if (recno > cd->top_backref) cd->top_backref = recno;
3115 continue;
3116 }
3117
3118 /* Should never happen */
3119 break;
3120
3121 case 'R': /* Pattern recursion */
3122 ptr++; /* Same as (?0) */
3123 /* Fall through */
3124
3125 /* Recursion or "subroutine" call */
3126
3127 case '0': case '1': case '2': case '3': case '4':
3128 case '5': case '6': case '7': case '8': case '9':
3129 {
3130 const uschar *called;
3131 recno = 0;
3132 while((digitab[*ptr] & ctype_digit) != 0)
3133 recno = recno * 10 + *ptr++ - '0';
3134
3135 /* Come here from code above that handles a named recursion */
3136
3137 HANDLE_RECURSION:
3138
3139 previous = code;
3140
3141 /* Find the bracket that is being referenced. Temporarily end the
3142 regex in case it doesn't exist. */
3143
3144 *code = OP_END;
3145 called = (recno == 0)? cd->start_code :
3146 find_bracket(cd->start_code, utf8, recno);
3147 if (called == NULL)
3148 {
3149 *errorcodeptr = ERR15;
3150 goto FAILED;
3151 }
3152
3153 /* If the subpattern is still open, this is a recursive call. We
3154 check to see if this is a left recursion that could loop for ever,
3155 and diagnose that case. */
3156
3157 if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
3158 {
3159 *errorcodeptr = ERR40;
3160 goto FAILED;
3161 }
3162
3163 /* Insert the recursion/subroutine item, automatically wrapped inside
3164 "once" brackets. */
3165
3166 *code = OP_ONCE;
3167 PUT(code, 1, 2 + 2*LINK_SIZE);
3168 code += 1 + LINK_SIZE;
3169
3170 *code = OP_RECURSE;
3171 PUT(code, 1, called - cd->start_code);
3172 code += 1 + LINK_SIZE;
3173
3174 *code = OP_KET;
3175 PUT(code, 1, 2 + 2*LINK_SIZE);
3176 code += 1 + LINK_SIZE;
3177 }
3178 continue;
3179
3180 /* Character after (? not specially recognized */
3181
3182 default: /* Option setting */
3183 set = unset = 0;
3184 optset = &set;
3185
3186 while (*ptr != ')' && *ptr != ':')
3187 {
3188 switch (*ptr++)
3189 {
3190 case '-': optset = &unset; break;
3191
3192 case 'i': *optset |= PCRE_CASELESS; break;
3193 case 'J': *optset |= PCRE_DUPNAMES; break;
3194 case 'm': *optset |= PCRE_MULTILINE; break;
3195 case 's': *optset |= PCRE_DOTALL; break;
3196 case 'x': *optset |= PCRE_EXTENDED; break;
3197 case 'U': *optset |= PCRE_UNGREEDY; break;
3198 case 'X': *optset |= PCRE_EXTRA; break;
3199 }
3200 }
3201
3202 /* Set up the changed option bits, but don't change anything yet. */
3203
3204 newoptions = (options | set) & (~unset);
3205
3206 /* If the options ended with ')' this is not the start of a nested
3207 group with option changes, so the options change at this level. Compile
3208 code to change the ims options if this setting actually changes any of
3209 them. We also pass the new setting back so that it can be put at the
3210 start of any following branches, and when this group ends (if we are in
3211 a group), a resetting item can be compiled.
3212
3213 Note that if this item is right at the start of the pattern, the
3214 options will have been abstracted and made global, so there will be no
3215 change to compile. */
3216
3217 if (*ptr == ')')
3218 {
3219 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
3220 {
3221 *code++ = OP_OPT;
3222 *code++ = newoptions & PCRE_IMS;
3223 }
3224
3225 /* Change options at this level, and pass them back for use
3226 in subsequent branches. Reset the greedy defaults and the case
3227 value for firstbyte and reqbyte. */
3228
3229 *optionsptr = options = newoptions;
3230 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
3231 greedy_non_default = greedy_default ^ 1;
3232 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3233
3234 previous = NULL; /* This item can't be repeated */
3235 continue; /* It is complete */
3236 }
3237
3238 /* If the options ended with ':' we are heading into a nested group
3239 with possible change of options. Such groups are non-capturing and are
3240 not assertions of any kind. All we need to do is skip over the ':';
3241 the newoptions value is handled below. */
3242
3243 bravalue = OP_BRA;
3244 ptr++;
3245 }
3246 }
3247
3248 /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
3249 non-capturing and behave like (?:...) brackets */
3250
3251 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
3252 {
3253 bravalue = OP_BRA;
3254 }
3255
3256 /* Else we have a referencing group; adjust the opcode. If the bracket
3257 number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
3258 arrange for the true number to follow later, in an OP_BRANUMBER item. */
3259
3260 else
3261 {
3262 NUMBERED_GROUP:
3263 if (++(*brackets) > EXTRACT_BASIC_MAX)
3264 {
3265 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
3266 code[1+LINK_SIZE] = OP_BRANUMBER;
3267 PUT2(code, 2+LINK_SIZE, *brackets);
3268 skipbytes = 3;
3269 }
3270 else bravalue = OP_BRA + *brackets;
3271 }
3272
3273 /* Process nested bracketed re. Assertions may not be repeated, but other
3274 kinds can be. We copy code into a non-register variable in order to be able
3275 to pass its address because some compilers complain otherwise. Pass in a
3276 new setting for the ims options if they have changed. */
3277
3278 previous = (bravalue >= OP_ONCE)? code : NULL;
3279 *code = bravalue;
3280 tempcode = code;
3281 tempreqvary = cd->req_varyopt; /* Save value before bracket */
3282
3283 if (!compile_regex(
3284 newoptions, /* The complete new option state */
3285 options & PCRE_IMS, /* The previous ims option state */
3286 brackets, /* Extracting bracket count */
3287 &tempcode, /* Where to put code (updated) */
3288 &ptr, /* Input pointer (updated) */
3289 errorcodeptr, /* Where to put an error message */
3290 (bravalue == OP_ASSERTBACK ||
3291 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
3292 skipbytes, /* Skip over OP_COND/OP_BRANUMBER */
3293 &subfirstbyte, /* For possible first char */
3294 &subreqbyte, /* For possible last char */
3295 bcptr, /* Current branch chain */
3296 cd)) /* Tables block */
3297 goto FAILED;
3298
3299 /* At the end of compiling, code is still pointing to the start of the
3300 group, while tempcode has been updated to point past the end of the group
3301 and any option resetting that may follow it. The pattern pointer (ptr)
3302 is on the bracket. */
3303
3304 /* If this is a conditional bracket, check that there are no more than
3305 two branches in the group. */
3306
3307 else if (bravalue == OP_COND)
3308 {
3309 uschar *tc = code;
3310 int condcount = 0;
3311
3312 do {
3313 condcount++;
3314 tc += GET(tc,1);
3315 }
3316 while (*tc != OP_KET);
3317
3318 if (condcount > 2)
3319 {
3320 *errorcodeptr = ERR27;
3321 goto FAILED;
3322 }
3323
3324 /* If there is just one branch, we must not make use of its firstbyte or
3325 reqbyte, because this is equivalent to an empty second branch. */
3326
3327 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
3328 }
3329
3330 /* Handle updating of the required and first characters. Update for normal
3331 brackets of all kinds, and conditions with two branches (see code above).
3332 If the bracket is followed by a quantifier with zero repeat, we have to
3333 back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
3334 main loop so that they can be accessed for the back off. */
3335
3336 zeroreqbyte = reqbyte;
3337 zerofirstbyte = firstbyte;
3338 groupsetfirstbyte = FALSE;
3339
3340 if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
3341 {
3342 /* If we have not yet set a firstbyte in this branch, take it from the
3343 subpattern, remembering that it was set here so that a repeat of more
3344 than one can replicate it as reqbyte if necessary. If the subpattern has
3345 no firstbyte, set "none" for the whole branch. In both cases, a zero
3346 repeat forces firstbyte to "none". */
3347
3348 if (firstbyte == REQ_UNSET)
3349 {
3350 if (subfirstbyte >= 0)
3351 {
3352 firstbyte = subfirstbyte;
3353 groupsetfirstbyte = TRUE;
3354 }
3355 else firstbyte = REQ_NONE;
3356 zerofirstbyte = REQ_NONE;
3357 }
3358
3359 /* If firstbyte was previously set, convert the subpattern's firstbyte
3360 into reqbyte if there wasn't one, using the vary flag that was in
3361 existence beforehand. */
3362
3363 else if (subfirstbyte >= 0 && subreqbyte < 0)
3364 subreqbyte = subfirstbyte | tempreqvary;
3365
3366 /* If the subpattern set a required byte (or set a first byte that isn't
3367 really the first byte - see above), set it. */
3368
3369 if (subreqbyte >= 0) reqbyte = subreqbyte;
3370 }
3371
3372 /* For a forward assertion, we take the reqbyte, if set. This can be
3373 helpful if the pattern that follows the assertion doesn't set a different
3374 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
3375 for an assertion, however because it leads to incorrect effect for patterns
3376 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
3377 of a firstbyte. This is overcome by a scan at the end if there's no
3378 firstbyte, looking for an asserted first char. */
3379
3380 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
3381
3382 /* Now update the main code pointer to the end of the group. */
3383
3384 code = tempcode;
3385
3386 /* Error if hit end of pattern */
3387
3388 if (*ptr != ')')
3389 {
3390 *errorcodeptr = ERR14;
3391 goto FAILED;
3392 }
3393 break;
3394
3395 /* Check \ for being a real metacharacter; if not, fall through and handle
3396 it as a data character at the start of a string. Escape items are checked
3397 for validity in the pre-compiling pass. */
3398
3399 case '\\':
3400 tempptr = ptr;
3401 c = check_escape(&ptr, errorcodeptr, *brackets, options, FALSE);
3402
3403 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
3404 are arranged to be the negation of the corresponding OP_values. For the
3405 back references, the values are ESC_REF plus the reference number. Only
3406 back references and those types that consume a character may be repeated.
3407 We can test for values between ESC_b and ESC_Z for the latter; this may
3408 have to change if any new ones are ever created. */
3409
3410 if (c < 0)
3411 {
3412 if (-c == ESC_Q) /* Handle start of quoted string */
3413 {
3414 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
3415 else inescq = TRUE;
3416 continue;
3417 }
3418
3419 /* For metasequences that actually match a character, we disable the
3420 setting of a first character if it hasn't already been set. */
3421
3422 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
3423 firstbyte = REQ_NONE;
3424
3425 /* Set values to reset to if this is followed by a zero repeat. */
3426
3427 zerofirstbyte = firstbyte;
3428 zeroreqbyte = reqbyte;
3429
3430 /* Back references are handled specially */
3431
3432 if (-c >= ESC_REF)
3433 {
3434 int number = -c - ESC_REF;
3435 previous = code;
3436 *code++ = OP_REF;
3437 PUT2INC(code, 0, number);
3438 }
3439
3440 /* So are Unicode property matches, if supported. We know that get_ucp
3441 won't fail because it was tested in the pre-pass. */
3442
3443 #ifdef SUPPORT_UCP
3444 else if (-c == ESC_P || -c == ESC_p)
3445 {
3446 BOOL negated;
3447 int pdata;
3448 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3449 previous = code;
3450 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
3451 *code++ = ptype;
3452 *code++ = pdata;
3453 }
3454 #endif
3455
3456 /* For the rest, we can obtain the OP value by negating the escape
3457 value */
3458
3459 else
3460 {
3461 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
3462 *code++ = -c;
3463 }
3464 continue;
3465 }
3466
3467 /* We have a data character whose value is in c. In UTF-8 mode it may have
3468 a value > 127. We set its representation in the length/buffer, and then
3469 handle it as a data character. */
3470
3471 #ifdef SUPPORT_UTF8
3472 if (utf8 && c > 127)
3473 mclength = _pcre_ord2utf8(c, mcbuffer);
3474 else
3475 #endif
3476
3477 {
3478 mcbuffer[0] = c;
3479 mclength = 1;
3480 }
3481
3482 goto ONE_CHAR;
3483
3484 /* Handle a literal character. It is guaranteed not to be whitespace or #
3485 when the extended flag is set. If we are in UTF-8 mode, it may be a
3486 multi-byte literal character. */
3487
3488 default:
3489 NORMAL_CHAR:
3490 mclength = 1;
3491 mcbuffer[0] = c;
3492
3493 #ifdef SUPPORT_UTF8
3494 if (utf8 && (c & 0xc0) == 0xc0)
3495 {
3496 while ((ptr[1] & 0xc0) == 0x80)
3497 mcbuffer[mclength++] = *(++ptr);
3498 }
3499 #endif
3500
3501 /* At this point we have the character's bytes in mcbuffer, and the length
3502 in mclength. When not in UTF-8 mode, the length is always 1. */
3503
3504 ONE_CHAR:
3505 previous = code;
3506 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
3507 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
3508
3509 /* Set the first and required bytes appropriately. If no previous first
3510 byte, set it from this character, but revert to none on a zero repeat.
3511 Otherwise, leave the firstbyte value alone, and don't change it on a zero
3512 repeat. */
3513
3514 if (firstbyte == REQ_UNSET)
3515 {
3516 zerofirstbyte = REQ_NONE;
3517 zeroreqbyte = reqbyte;
3518
3519 /* If the character is more than one byte long, we can set firstbyte
3520 only if it is not to be matched caselessly. */
3521
3522 if (mclength == 1 || req_caseopt == 0)
3523 {
3524 firstbyte = mcbuffer[0] | req_caseopt;
3525 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
3526 }
3527 else firstbyte = reqbyte = REQ_NONE;
3528 }
3529
3530 /* firstbyte was previously set; we can set reqbyte only if the length is
3531 1 or the matching is caseful. */
3532
3533 else
3534 {
3535 zerofirstbyte = firstbyte;
3536 zeroreqbyte = reqbyte;
3537 if (mclength == 1 || req_caseopt == 0)
3538 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3539 }
3540
3541 break; /* End of literal character handling */
3542 }
3543 } /* end of big loop */
3544
3545 /* Control never reaches here by falling through, only by a goto for all the
3546 error states. Pass back the position in the pattern so that it can be displayed
3547 to the user for diagnosing the error. */
3548
3549 FAILED:
3550 *ptrptr = ptr;
3551 return FALSE;
3552 }
3553
3554
3555
3556
3557 /*************************************************
3558 * Compile sequence of alternatives *
3559 *************************************************/
3560
3561 /* On entry, ptr is pointing past the bracket character, but on return
3562 it points to the closing bracket, or vertical bar, or end of string.
3563 The code variable is pointing at the byte into which the BRA operator has been
3564 stored. If the ims options are changed at the start (for a (?ims: group) or
3565 during any branch, we need to insert an OP_OPT item at the start of every
3566 following branch to ensure they get set correctly at run time, and also pass
3567 the new options into every subsequent branch compile.
3568
3569 Argument:
3570 options option bits, including any changes for this subpattern
3571 oldims previous settings of ims option bits
3572 brackets -> int containing the number of extracting brackets used
3573 codeptr -> the address of the current code pointer
3574 ptrptr -> the address of the current pattern pointer
3575 errorcodeptr -> pointer to error code variable
3576 lookbehind TRUE if this is a lookbehind assertion
3577 skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER)
3578 firstbyteptr place to put the first required character, or a negative number
3579 reqbyteptr place to put the last required character, or a negative number
3580 bcptr pointer to the chain of currently open branches
3581 cd points to the data block with tables pointers etc.
3582
3583 Returns: TRUE on success
3584 */
3585
3586 static BOOL
3587 compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3588 const uschar **ptrptr, int *errorcodeptr, BOOL lookbehind, int skipbytes,
3589 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
3590 {
3591 const uschar *ptr = *ptrptr;
3592 uschar *code = *codeptr;
3593 uschar *last_branch = code;
3594 uschar *start_bracket = code;
3595 uschar *reverse_count = NULL;
3596 int firstbyte, reqbyte;
3597 int branchfirstbyte, branchreqbyte;
3598 branch_chain bc;
3599
3600 bc.outer = bcptr;
3601 bc.current = code;
3602
3603 firstbyte = reqbyte = REQ_UNSET;
3604
3605 /* Offset is set zero to mark that this bracket is still open */
3606
3607 PUT(code, 1, 0);
3608 code += 1 + LINK_SIZE + skipbytes;
3609
3610 /* Loop for each alternative branch */
3611
3612 for (;;)
3613 {
3614 /* Handle a change of ims options at the start of the branch */
3615
3616 if ((options & PCRE_IMS) != oldims)
3617 {
3618 *code++ = OP_OPT;
3619 *code++ = options & PCRE_IMS;
3620 }
3621
3622 /* Set up dummy OP_REVERSE if lookbehind assertion */
3623
3624 if (lookbehind)
3625 {
3626 *code++ = OP_REVERSE;
3627 reverse_count = code;
3628 PUTINC(code, 0, 0);
3629 }
3630
3631 /* Now compile the branch */
3632
3633 if (!compile_branch(&options, brackets, &code, &ptr, errorcodeptr,
3634 &branchfirstbyte, &branchreqbyte, &bc, cd))
3635 {
3636 *ptrptr = ptr;
3637 return FALSE;
3638 }
3639
3640 /* If this is the first branch, the firstbyte and reqbyte values for the
3641 branch become the values for the regex. */
3642
3643 if (*last_branch != OP_ALT)
3644 {
3645 firstbyte = branchfirstbyte;
3646 reqbyte = branchreqbyte;
3647 }
3648
3649 /* If this is not the first branch, the first char and reqbyte have to
3650 match the values from all the previous branches, except that if the previous
3651 value for reqbyte didn't have REQ_VARY set, it can still match, and we set
3652 REQ_VARY for the regex. */
3653
3654 else
3655 {
3656 /* If we previously had a firstbyte, but it doesn't match the new branch,
3657 we have to abandon the firstbyte for the regex, but if there was previously
3658 no reqbyte, it takes on the value of the old firstbyte. */
3659
3660 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
3661 {
3662 if (reqbyte < 0) reqbyte = firstbyte;
3663 firstbyte = REQ_NONE;
3664 }
3665
3666 /* If we (now or from before) have no firstbyte, a firstbyte from the
3667 branch becomes a reqbyte if there isn't a branch reqbyte. */
3668
3669 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
3670 branchreqbyte = branchfirstbyte;
3671
3672 /* Now ensure that the reqbytes match */
3673
3674 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
3675 reqbyte = REQ_NONE;
3676 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
3677 }
3678
3679 /* If lookbehind, check that this branch matches a fixed-length string,
3680 and put the length into the OP_REVERSE item. Temporarily mark the end of
3681 the branch with OP_END. */
3682
3683 if (lookbehind)
3684 {
3685 int length;
3686 *code = OP_END;
3687 length = find_fixedlength(last_branch, options);
3688 DPRINTF(("fixed length = %d\n", length));
3689 if (length < 0)
3690 {
3691 *errorcodeptr = (length == -2)? ERR36 : ERR25;
3692 *ptrptr = ptr;
3693 return FALSE;
3694 }
3695 PUT(reverse_count, 0, length);
3696 }
3697
3698 /* Reached end of expression, either ')' or end of pattern. Go back through
3699 the alternative branches and reverse the chain of offsets, with the field in
3700 the BRA item now becoming an offset to the first alternative. If there are
3701 no alternatives, it points to the end of the group. The length in the
3702 terminating ket is always the length of the whole bracketed item. If any of
3703 the ims options were changed inside the group, compile a resetting op-code
3704 following, except at the very end of the pattern. Return leaving the pointer
3705 at the terminating char. */
3706
3707 if (*ptr != '|')
3708 {
3709 int length = code - last_branch;
3710 do
3711 {
3712 int prev_length = GET(last_branch, 1);
3713 PUT(last_branch, 1, length);
3714 length = prev_length;
3715 last_branch -= length;
3716 }
3717 while (length > 0);
3718
3719 /* Fill in the ket */
3720
3721 *code = OP_KET;
3722 PUT(code, 1, code - start_bracket);
3723 code += 1 + LINK_SIZE;
3724
3725 /* Resetting option if needed */
3726
3727 if ((options & PCRE_IMS) != oldims && *ptr == ')')
3728 {
3729 *code++ = OP_OPT;
3730 *code++ = oldims;
3731 }
3732
3733 /* Set values to pass back */
3734
3735 *codeptr = code;
3736 *ptrptr = ptr;
3737 *firstbyteptr = firstbyte;
3738 *reqbyteptr = reqbyte;
3739 return TRUE;
3740 }
3741
3742 /* Another branch follows; insert an "or" node. Its length field points back
3743 to the previous branch while the bracket remains open. At the end the chain
3744 is reversed. It's done like this so that the start of the bracket has a
3745 zero offset until it is closed, making it possible to detect recursion. */
3746
3747 *code = OP_ALT;
3748 PUT(code, 1, code - last_branch);
3749 bc.current = last_branch = code;
3750 code += 1 + LINK_SIZE;
3751 ptr++;
3752 }
3753 /* Control never reaches here */
3754 }
3755
3756
3757
3758
3759 /*************************************************
3760 * Check for anchored expression *
3761 *************************************************/
3762
3763 /* Try to find out if this is an anchored regular expression. Consider each
3764 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
3765 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
3766 it's anchored. However, if this is a multiline pattern, then only OP_SOD
3767 counts, since OP_CIRC can match in the middle.
3768
3769 We can also consider a regex to be anchored if OP_SOM starts all its branches.
3770 This is the code for \G, which means "match at start of match position, taking
3771 into account the match offset".
3772
3773 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
3774 because that will try the rest of the pattern at all possible matching points,
3775 so there is no point trying again.... er ....
3776
3777 .... except when the .* appears inside capturing parentheses, and there is a
3778 subsequent back reference to those parentheses. We haven't enough information
3779 to catch that case precisely.
3780
3781 At first, the best we could do was to detect when .* was in capturing brackets
3782 and the highest back reference was greater than or equal to that level.
3783 However, by keeping a bitmap of the first 31 back references, we can catch some
3784 of the more common cases more precisely.
3785
3786 Arguments:
3787 code points to start of expression (the bracket)
3788 options points to the options setting
3789 bracket_map a bitmap of which brackets we are inside while testing; this
3790 handles up to substring 31; after that we just have to take
3791 the less precise approach
3792 backref_map the back reference bitmap
3793
3794 Returns: TRUE or FALSE
3795 */
3796
3797 static BOOL
3798 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
3799 unsigned int backref_map)
3800 {
3801 do {
3802 const uschar *scode =
3803 first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE);
3804 register int op = *scode;
3805
3806 /* Capturing brackets */
3807
3808 if (op > OP_BRA)
3809 {
3810 int new_map;
3811 op -= OP_BRA;
3812 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3813 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3814 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
3815 }
3816
3817 /* Other brackets */
3818
3819 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3820 {
3821 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
3822 }
3823
3824 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
3825 are or may be referenced. */
3826
3827 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
3828 (*options & PCRE_DOTALL) != 0)
3829 {
3830 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3831 }
3832
3833 /* Check for explicit anchoring */
3834
3835 else if (op != OP_SOD && op != OP_SOM &&
3836 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
3837 return FALSE;
3838 code += GET(code, 1);
3839 }
3840 while (*code == OP_ALT); /* Loop for each alternative */
3841 return TRUE;
3842 }
3843
3844
3845
3846 /*************************************************
3847 * Check for starting with ^ or .* *
3848 *************************************************/
3849
3850 /* This is called to find out if every branch starts with ^ or .* so that
3851 "first char" processing can be done to speed things up in multiline
3852 matching and for non-DOTALL patterns that start with .* (which must start at
3853 the beginning or after \n). As in the case of is_anchored() (see above), we
3854 have to take account of back references to capturing brackets that contain .*
3855 because in that case we can't make the assumption.
3856
3857 Arguments:
3858 code points to start of expression (the bracket)
3859 bracket_map a bitmap of which brackets we are inside while testing; this
3860 handles up to substring 31; after that we just have to take
3861 the less precise approach
3862 backref_map the back reference bitmap
3863
3864 Returns: TRUE or FALSE
3865 */
3866
3867 static BOOL
3868 is_startline(const uschar *code, unsigned int bracket_map,
3869 unsigned int backref_map)
3870 {
3871 do {
3872 const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0,
3873 FALSE);
3874 register int op = *scode;
3875
3876 /* Capturing brackets */
3877
3878 if (op > OP_BRA)
3879 {
3880 int new_map;
3881 op -= OP_BRA;
3882 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3883 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3884 if (!is_startline(scode, new_map, backref_map)) return FALSE;
3885 }
3886
3887 /* Other brackets */
3888
3889 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3890 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
3891
3892 /* .* means "start at start or after \n" if it isn't in brackets that
3893 may be referenced. */
3894
3895 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
3896 {
3897 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3898 }
3899
3900 /* Check for explicit circumflex */
3901
3902 else if (op != OP_CIRC) return FALSE;
3903
3904 /* Move on to the next alternative */
3905
3906 code += GET(code, 1);
3907 }
3908 while (*code == OP_ALT); /* Loop for each alternative */
3909 return TRUE;
3910 }
3911
3912
3913
3914 /*************************************************
3915 * Check for asserted fixed first char *
3916 *************************************************/
3917
3918 /* During compilation, the "first char" settings from forward assertions are
3919 discarded, because they can cause conflicts with actual literals that follow.
3920 However, if we end up without a first char setting for an unanchored pattern,
3921 it is worth scanning the regex to see if there is an initial asserted first
3922 char. If all branches start with the same asserted char, or with a bracket all
3923 of whose alternatives start with the same asserted char (recurse ad lib), then
3924 we return that char, otherwise -1.
3925
3926 Arguments:
3927 code points to start of expression (the bracket)
3928 options pointer to the options (used to check casing changes)
3929 inassert TRUE if in an assertion
3930
3931 Returns: -1 or the fixed first char
3932 */
3933
3934 static int
3935 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
3936 {
3937 register int c = -1;
3938 do {
3939 int d;
3940 const uschar *scode =
3941 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
3942 register int op = *scode;
3943
3944 if (op >= OP_BRA) op = OP_BRA;
3945
3946 switch(op)
3947 {
3948 default:
3949 return -1;
3950
3951 case OP_BRA:
3952 case OP_ASSERT:
3953 case OP_ONCE:
3954 case OP_COND:
3955 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
3956 return -1;
3957 if (c < 0) c = d; else if (c != d) return -1;
3958 break;
3959
3960 case OP_EXACT: /* Fall through */
3961 scode += 2;
3962
3963 case OP_CHAR:
3964 case OP_CHARNC:
3965 case OP_PLUS:
3966 case OP_MINPLUS:
3967 if (!inassert) return -1;
3968 if (c < 0)
3969 {
3970 c = scode[1];
3971 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
3972 }
3973 else if (c != scode[1]) return -1;
3974 break;
3975 }
3976
3977 code += GET(code, 1);
3978 }
3979 while (*code == OP_ALT);
3980 return c;
3981 }
3982
3983
3984
3985 /*************************************************
3986 * Compile a Regular Expression *
3987 *************************************************/
3988
3989 /* This function takes a string and returns a pointer to a block of store
3990 holding a compiled version of the expression. The original API for this
3991 function had no error code return variable; it is retained for backwards
3992 compatibility. The new function is given a new name.
3993
3994 Arguments:
3995 pattern the regular expression
3996 options various option bits
3997 errorcodeptr pointer to error code variable (pcre_compile2() only)
3998 can be NULL if you don't want a code value
3999 errorptr pointer to pointer to error text
4000 erroroffset ptr offset in pattern where error was detected
4001 tables pointer to character tables or NULL
4002
4003 Returns: pointer to compiled data block, or NULL on error,
4004 with errorptr and erroroffset set
4005 */
4006
4007 PCRE_DATA_SCOPE pcre *
4008 pcre_compile(const char *pattern, int options, const char **errorptr,
4009 int *erroroffset, const unsigned char *tables)
4010 {
4011 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
4012 }
4013
4014
4015
4016 PCRE_DATA_SCOPE pcre *
4017 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
4018 const char **errorptr, int *erroroffset, const unsigned char *tables)
4019 {
4020 real_pcre *re;
4021 int length = 1 + LINK_SIZE; /* For initial BRA plus length */
4022 int c, firstbyte, reqbyte, newline;
4023 int bracount = 0;
4024 int branch_extra = 0;
4025 int branch_newextra;
4026 int item_count = -1;
4027 int name_count = 0;
4028 int max_name_size = 0;
4029 int lastitemlength = 0;
4030 int errorcode = 0;
4031 #ifdef SUPPORT_UTF8
4032 BOOL utf8;
4033 BOOL class_utf8;
4034 #endif
4035 BOOL inescq = FALSE;
4036 BOOL capturing;
4037 unsigned int brastackptr = 0;
4038 size_t size;
4039 uschar *code;
4040 const uschar *codestart;
4041 const uschar *ptr;
4042 compile_data compile_block;
4043 compile_data *cd = &compile_block;
4044 int brastack[BRASTACK_SIZE];
4045 uschar bralenstack[BRASTACK_SIZE];
4046
4047 /* We can't pass back an error message if errorptr is NULL; I guess the best we
4048 can do is just return NULL, but we can set a code value if there is a code
4049 pointer. */
4050
4051 if (errorptr == NULL)
4052 {
4053 if (errorcodeptr != NULL) *errorcodeptr = 99;
4054 return NULL;
4055 }
4056
4057 *errorptr = NULL;
4058 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
4059
4060 /* However, we can give a message for this error */
4061
4062 if (erroroffset == NULL)
4063 {
4064 errorcode = ERR16;
4065 goto PCRE_EARLY_ERROR_RETURN;
4066 }
4067
4068 *erroroffset = 0;
4069
4070 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
4071
4072 #ifdef SUPPORT_UTF8
4073 utf8 = (options & PCRE_UTF8) != 0;
4074 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
4075 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
4076 {
4077 errorcode = ERR44;
4078 goto PCRE_EARLY_ERROR_RETURN;
4079 }
4080 #else
4081 if ((options & PCRE_UTF8) != 0)
4082 {
4083 errorcode = ERR32;
4084 goto PCRE_EARLY_ERROR_RETURN;
4085 }
4086 #endif
4087
4088 if ((options & ~PUBLIC_OPTIONS) != 0)
4089 {
4090 errorcode = ERR17;
4091 goto PCRE_EARLY_ERROR_RETURN;
4092 }
4093
4094 /* Set up pointers to the individual character tables */
4095
4096 if (tables == NULL) tables = _pcre_default_tables;
4097 cd->lcc = tables + lcc_offset;
4098 cd->fcc = tables + fcc_offset;
4099 cd->cbits = tables + cbits_offset;
4100 cd->ctypes = tables + ctypes_offset;
4101
4102 /* Handle different types of newline. The two bits give four cases. The current
4103 code allows for one- or two-byte sequences. */
4104
4105 switch (options & PCRE_NEWLINE_CRLF)
4106 {
4107 default: newline = NEWLINE; break; /* Compile-time default */
4108 case PCRE_NEWLINE_CR: newline = '\r'; break;
4109 case PCRE_NEWLINE_LF: newline = '\n'; break;
4110 case PCRE_NEWLINE_CR+
4111 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
4112 }
4113
4114 if (newline > 255)
4115 {
4116 cd->nllen = 2;
4117 cd->nl[0] = (newline >> 8) & 255;
4118 cd->nl[1] = newline & 255;
4119 }
4120 else
4121 {
4122 cd->nllen = 1;
4123 cd->nl[0] = newline;
4124 }
4125
4126 /* Maximum back reference and backref bitmap. This is updated for numeric
4127 references during the first pass, but for named references during the actual
4128 compile pass. The bitmap records up to 31 back references to help in deciding
4129 whether (.*) can be treated as anchored or not. */
4130
4131 cd->top_backref = 0;
4132 cd->backref_map = 0;
4133
4134 /* Reflect pattern for debugging output */
4135
4136 DPRINTF(("------------------------------------------------------------------\n"));
4137 DPRINTF(("%s\n", pattern));
4138
4139 /* The first thing to do is to make a pass over the pattern to compute the
4140 amount of store required to hold the compiled code. This does not have to be
4141 perfect as long as errors are overestimates. At the same time we can detect any
4142 flag settings right at the start, and extract them. Make an attempt to correct
4143 for any counted white space if an "extended" flag setting appears late in the
4144 pattern. We can't be so clever for #-comments. */
4145
4146 ptr = (const uschar *)(pattern - 1);
4147 while ((c = *(++ptr)) != 0)
4148 {
4149 int min, max;
4150 int class_optcount;
4151 int bracket_length;
4152 int duplength;
4153
4154 /* If we are inside a \Q...\E sequence, all chars are literal */
4155
4156 if (inescq)
4157 {
4158 if ((options & PCRE_AUTO_CALLOUT) != 0) length += 2 + 2*LINK_SIZE;
4159 goto NORMAL_CHAR;
4160 }
4161
4162 /* Otherwise, first check for ignored whitespace and comments */
4163
4164 if ((options & PCRE_EXTENDED) != 0)
4165 {
4166 if ((cd->ctypes[c] & ctype_space) != 0) continue;
4167 if (c == '#')
4168 {
4169 while (*(++ptr) != 0) if (IS_NEWLINE(ptr)) break;
4170 if (*ptr != 0)
4171 {
4172 ptr += cd->nllen - 1;
4173 continue;
4174 }
4175 break; /* End loop at end of pattern */
4176 }
4177 }
4178
4179 item_count++; /* Is zero for the first non-comment item */
4180
4181 /* Allow space for auto callout before every item except quantifiers. */
4182
4183 if ((options & PCRE_AUTO_CALLOUT) != 0 &&
4184 c != '*' && c != '+' && c != '?' &&
4185 (c != '{' || !is_counted_repeat(ptr + 1)))
4186 length += 2 + 2*LINK_SIZE;
4187
4188 switch(c)
4189 {
4190 /* A backslashed item may be an escaped data character or it may be a
4191 character type. */
4192
4193 case '\\':
4194 c = check_escape(&ptr, &errorcode, bracount, options, FALSE);
4195 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4196
4197 lastitemlength = 1; /* Default length of last item for repeats */
4198
4199 if (c >= 0) /* Data character */
4200 {
4201 length += 2; /* For a one-byte character */
4202
4203 #ifdef SUPPORT_UTF8
4204 if (utf8 && c > 127)
4205 {
4206 int i;
4207 for (i = 0; i < _pcre_utf8_table1_size; i++)
4208 if (c <= _pcre_utf8_table1[i]) break;
4209 length += i;
4210 lastitemlength += i;
4211 }
4212 #endif
4213
4214 continue;
4215 }
4216
4217 /* If \Q, enter "literal" mode */
4218
4219 if (-c == ESC_Q)
4220 {
4221 inescq = TRUE;
4222 continue;
4223 }
4224
4225 /* \X is supported only if Unicode property support is compiled */
4226
4227 #ifndef SUPPORT_UCP
4228 if (-c == ESC_X)
4229 {
4230 errorcode = ERR45;
4231 goto PCRE_ERROR_RETURN;
4232 }
4233 #endif
4234
4235 /* \P and \p are for Unicode properties, but only when the support has
4236 been compiled. Each item needs 3 bytes. */
4237
4238 else if (-c == ESC_P || -c == ESC_p)
4239 {
4240 #ifdef SUPPORT_UCP
4241 BOOL negated;
4242 BOOL pdata;
4243 length += 3;
4244 lastitemlength = 3;
4245 if (get_ucp(&ptr, &negated, &pdata, &errorcode) < 0)
4246 goto PCRE_ERROR_RETURN;
4247 continue;
4248 #else
4249 errorcode = ERR45;
4250 goto PCRE_ERROR_RETURN;
4251 #endif
4252 }
4253
4254 /* Other escapes need one byte */
4255
4256 length++;
4257
4258 /* A back reference needs an additional 2 bytes, plus either one or 5
4259 bytes for a repeat. We also need to keep the value of the highest
4260 back reference. */
4261
4262 if (c <= -ESC_REF)
4263 {
4264 int refnum = -c - ESC_REF;
4265 cd->backref_map |= (refnum < 32)? (1 << refnum) : 1;
4266 if (refnum > cd->top_backref)
4267 cd->top_backref = refnum;
4268 length += 2; /* For single back reference */
4269 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
4270 {
4271 ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
4272 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4273 if ((min == 0 && (max == 1 || max == -1)) ||
4274 (min == 1 && max == -1))
4275 length++;
4276 else length += 5;
4277 if (ptr[1] == '?') ptr++;
4278 }
4279 }
4280 continue;
4281
4282 case '^': /* Single-byte metacharacters */
4283 case '.':
4284 case '$':
4285 length++;
4286 lastitemlength = 1;
4287 continue;
4288
4289 case '*': /* These repeats won't be after brackets; */
4290 case '+': /* those are handled separately */
4291 case '?':
4292 length++;
4293 goto POSESSIVE; /* A few lines below */
4294
4295 /* This covers the cases of braced repeats after a single char, metachar,
4296 class, or back reference. */
4297
4298 case '{':
4299 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
4300 ptr = read_repeat_counts(ptr+1, &min, &max, &errorcode);
4301 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4302
4303 /* These special cases just insert one extra opcode */
4304
4305 if ((min == 0 && (max == 1 || max == -1)) ||
4306 (min == 1 && max == -1))
4307 length++;
4308
4309 /* These cases might insert additional copies of a preceding character. */
4310
4311 else
4312 {
4313 if (min != 1)
4314 {
4315 length -= lastitemlength; /* Uncount the original char or metachar */
4316 if (min > 0) length += 3 + lastitemlength;
4317 }
4318 length += lastitemlength + ((max > 0)? 3 : 1);
4319 }
4320
4321 if (ptr[1] == '?') ptr++; /* Needs no extra length */
4322
4323 POSESSIVE: /* Test for possessive quantifier */
4324 if (ptr[1] == '+')
4325 {
4326 ptr++;
4327 length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */
4328 }
4329 continue;
4330
4331 /* An alternation contains an offset to the next branch or ket. If any ims
4332 options changed in the previous branch(es), and/or if we are in a
4333 lookbehind assertion, extra space will be needed at the start of the
4334 branch. This is handled by branch_extra. */
4335
4336 case '|':
4337 length += 1 + LINK_SIZE + branch_extra;
4338 continue;
4339
4340 /* A character class uses 33 characters provided that all the character
4341 values are less than 256. Otherwise, it uses a bit map for low valued
4342 characters, and individual items for others. Don't worry about character
4343 types that aren't allowed in classes - they'll get picked up during the
4344 compile. A character class that contains only one single-byte character
4345 uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
4346 where we can. (In UTF-8 mode we can do this only for chars < 128.) */
4347
4348 case '[':
4349 if (*(++ptr) == '^')
4350 {
4351 class_optcount = 10; /* Greater than one */
4352 ptr++;
4353 }
4354 else class_optcount = 0;
4355
4356 #ifdef SUPPORT_UTF8
4357 class_utf8 = FALSE;
4358 #endif
4359
4360 /* Written as a "do" so that an initial ']' is taken as data */
4361
4362 if (*ptr != 0) do
4363 {
4364 /* Inside \Q...\E everything is literal except \E */
4365
4366 if (inescq)
4367 {
4368 if (*ptr != '\\' || ptr[1] != 'E') goto GET_ONE_CHARACTER;
4369 inescq = FALSE;
4370 ptr += 1;
4371 continue;
4372 }
4373
4374 /* Outside \Q...\E, check for escapes */
4375
4376 if (*ptr == '\\')
4377 {
4378 c = check_escape(&ptr, &errorcode, bracount, options, TRUE);
4379 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4380
4381 /* \b is backspace inside a class; \X is literal */
4382
4383 if (-c == ESC_b) c = '\b';
4384 else if (-c == ESC_X) c = 'X';
4385
4386 /* \Q enters quoting mode */
4387
4388 else if (-c == ESC_Q)
4389 {
4390 inescq = TRUE;
4391 continue;
4392 }
4393
4394 /* Handle escapes that turn into characters */
4395
4396 if (c >= 0) goto NON_SPECIAL_CHARACTER;
4397
4398 /* Escapes that are meta-things. The normal ones just affect the
4399 bit map, but Unicode properties require an XCLASS extended item. */
4400
4401 else
4402 {
4403 class_optcount = 10; /* \d, \s etc; make sure > 1 */
4404 #ifdef SUPPORT_UTF8
4405 if (-c == ESC_p || -c == ESC_P)
4406 {
4407 if (!class_utf8)
4408 {
4409 class_utf8 = TRUE;
4410 length += LINK_SIZE + 2;
4411 }
4412 length += 3;
4413 }
4414 #endif
4415 }
4416 }
4417
4418 /* Check the syntax for POSIX stuff. The bits we actually handle are
4419 checked during the real compile phase. */
4420
4421 else if (*ptr == '[' &&
4422 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
4423 check_posix_syntax(ptr, &ptr, cd))
4424 {
4425 ptr++;
4426 class_optcount = 10; /* Make sure > 1 */
4427 }
4428
4429 /* Anything else increments the possible optimization count. We have to
4430 detect ranges here so that we can compute the number of extra ranges for
4431 caseless wide characters when UCP support is available. If there are wide
4432 characters, we are going to have to use an XCLASS, even for single
4433 characters. */
4434
4435 else
4436 {
4437 int d;
4438
4439 GET_ONE_CHARACTER:
4440
4441 #ifdef SUPPORT_UTF8
4442 if (utf8)
4443 {
4444 int extra = 0;
4445 GETCHARLEN(c, ptr, extra);
4446 ptr += extra;
4447 }
4448 else c = *ptr;
4449 #else
4450 c = *ptr;
4451 #endif
4452
4453 /* Come here from handling \ above when it escapes to a char value */
4454
4455 NON_SPECIAL_CHARACTER:
4456 class_optcount++;
4457
4458 d = -1;
4459 if (ptr[1] == '-')
4460 {
4461 uschar const *hyptr = ptr++;
4462 if (ptr[1] == '\\')
4463 {
4464 ptr++;
4465 d = check_escape(&ptr, &errorcode, bracount, options, TRUE);
4466 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4467 if (-d == ESC_b) d = '\b'; /* backspace */
4468 else if (-d == ESC_X) d = 'X'; /* literal X in a class */
4469 }
4470 else if (ptr[1] != 0 && ptr[1] != ']')
4471 {
4472 ptr++;
4473 #ifdef SUPPORT_UTF8
4474 if (utf8)
4475 {
4476 int extra = 0;
4477 GETCHARLEN(d, ptr, extra);
4478 ptr += extra;
4479 }
4480 else
4481 #endif
4482 d = *ptr;
4483 }
4484 if (d < 0) ptr = hyptr; /* go back to hyphen as data */
4485 }
4486
4487 /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
4488 127 for caseless matching, we will need to use an XCLASS. */
4489
4490 if (d >= 0)
4491 {
4492 class_optcount = 10; /* Ensure > 1 */
4493 if (d < c)
4494 {
4495 errorcode = ERR8;
4496 goto PCRE_ERROR_RETURN;
4497 }
4498
4499 #ifdef SUPPORT_UTF8
4500 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4501 {
4502 uschar buffer[6];
4503 if (!class_utf8) /* Allow for XCLASS overhead */
4504 {
4505 class_utf8 = TRUE;
4506 length += LINK_SIZE + 2;
4507 }
4508
4509 #ifdef SUPPORT_UCP
4510 /* If we have UCP support, find out how many extra ranges are
4511 needed to map the other case of characters within this range. We
4512 have to mimic the range optimization here, because extending the
4513 range upwards might push d over a boundary that makes it use
4514 another byte in the UTF-8 representation. */
4515
4516 if ((options & PCRE_CASELESS) != 0)
4517 {
4518 int occ, ocd;
4519 int cc = c;
4520 int origd = d;
4521 while (get_othercase_range(&cc, origd, &occ, &ocd))
4522 {
4523 if (occ >= c && ocd <= d) continue; /* Skip embedded */
4524
4525 if (occ < c && ocd >= c - 1) /* Extend the basic range */
4526 { /* if there is overlap, */
4527 c = occ; /* noting that if occ < c */
4528 continue; /* we can't have ocd > d */
4529 } /* because a subrange is */
4530 if (ocd > d && occ <= d + 1) /* always shorter than */
4531 { /* the basic range. */
4532 d = ocd;
4533 continue;
4534 }
4535
4536 /* An extra item is needed */
4537
4538 length += 1 + _pcre_ord2utf8(occ, buffer) +
4539 ((occ == ocd)? 0 : _pcre_ord2utf8(ocd, buffer));
4540 }
4541 }
4542 #endif /* SUPPORT_UCP */
4543
4544 /* The length of the (possibly extended) range */
4545
4546 length += 1 + _pcre_ord2utf8(c, buffer) + _pcre_ord2utf8(d, buffer);
4547 }
4548 #endif /* SUPPORT_UTF8 */
4549
4550 }
4551
4552 /* We have a single character. There is nothing to be done unless we
4553 are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
4554 allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
4555 support. */
4556
4557 else
4558 {
4559 #ifdef SUPPORT_UTF8
4560 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
4561 {
4562 uschar buffer[6];
4563 class_optcount = 10; /* Ensure > 1 */
4564 if (!class_utf8) /* Allow for XCLASS overhead */
4565 {
4566 class_utf8 = TRUE;
4567 length += LINK_SIZE + 2;
4568 }
4569 #ifdef SUPPORT_UCP
4570 length += (((options & PCRE_CASELESS) != 0)? 2 : 1) *
4571 (1 + _pcre_ord2utf8(c, buffer));
4572 #else /* SUPPORT_UCP */
4573 length += 1 + _pcre_ord2utf8(c, buffer);
4574 #endif /* SUPPORT_UCP */
4575 }
4576 #endif /* SUPPORT_UTF8 */
4577 }
4578 }
4579 }
4580 while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */
4581
4582 if (*ptr == 0) /* Missing terminating ']' */
4583 {
4584 errorcode = ERR6;
4585 goto PCRE_ERROR_RETURN;
4586 }
4587
4588 /* We can optimize when there was only one optimizable character. Repeats
4589 for positive and negated single one-byte chars are handled by the general
4590 code. Here, we handle repeats for the class opcodes. */
4591
4592 if (class_optcount == 1) length += 3; else
4593 {
4594 length += 33;
4595
4596 /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
4597 we also need extra for wrapping the whole thing in a sub-pattern. */
4598
4599 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
4600 {
4601 ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
4602 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4603 if ((min == 0 && (max == 1 || max == -1)) ||
4604 (min == 1 && max == -1))
4605 length++;
4606 else length += 5;
4607 if (ptr[1] == '+')
4608 {
4609 ptr++;
4610 length += 2 + 2*LINK_SIZE;
4611 }
4612 else if (ptr[1] == '?') ptr++;
4613 }
4614 }
4615 continue;
4616
4617 /* Brackets may be genuine groups or special things */
4618
4619 case '(':
4620 branch_newextra = 0;
4621 bracket_length = 1 + LINK_SIZE;
4622 capturing = FALSE;
4623
4624 /* Handle special forms of bracket, which all start (? */
4625
4626 if (ptr[1] == '?')
4627 {
4628 int set, unset;
4629 int *optset;
4630
4631 switch (c = ptr[2])
4632 {
4633 /* Skip over comments entirely */
4634 case '#':
4635 ptr += 3;
4636 while (*ptr != 0 && *ptr != ')') ptr++;
4637 if (*ptr == 0)
4638 {
4639 errorcode = ERR18;
4640 goto PCRE_ERROR_RETURN;
4641 }
4642 continue;
4643
4644 /* Non-referencing groups and lookaheads just move the pointer on, and
4645 then behave like a non-special bracket, except that they don't increment
4646 the count of extracting brackets. Ditto for the "once only" bracket,
4647 which is in Perl from version 5.005. */
4648
4649 case ':':
4650 case '=':
4651 case '!':
4652 case '>':
4653 ptr += 2;
4654 break;
4655
4656 /* Named subpatterns are an extension copied from Python */
4657
4658 case 'P':
4659 ptr += 3;
4660
4661 /* Handle the definition of a named subpattern */
4662
4663 if (*ptr == '<')
4664 {
4665 const uschar *p; /* Don't amalgamate; some compilers */
4666 p = ++ptr; /* grumble at autoincrement in declaration */
4667 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4668 if (*ptr != '>')
4669 {
4670 errorcode = ERR42;
4671 goto PCRE_ERROR_RETURN;
4672 }
4673 name_count++;
4674 if (name_count > MAX_NAME_COUNT)
4675 {
4676 errorcode = ERR49;
4677 goto PCRE_ERROR_RETURN;
4678 }
4679 if (ptr - p > max_name_size)
4680 {
4681 max_name_size = (ptr - p);
4682 if (max_name_size > MAX_NAME_SIZE)
4683 {
4684 errorcode = ERR48;
4685 goto PCRE_ERROR_RETURN;
4686 }
4687 }
4688 capturing = TRUE; /* Named parentheses are always capturing */
4689 break; /* Go handle capturing parentheses */
4690 }
4691
4692 /* Handle back references and recursive calls to named subpatterns */
4693
4694 if (*ptr == '=' || *ptr == '>')
4695 {
4696 length += 3 + 3*LINK_SIZE; /* Allow for the automatic "once" */
4697 while ((cd->ctypes[*(++ptr)] & ctype_word) != 0);
4698 if (*ptr != ')')
4699 {
4700 errorcode = ERR42;
4701 goto PCRE_ERROR_RETURN;
4702 }
4703 goto RECURSE_CHECK_QUANTIFIED;
4704 }
4705
4706 /* Unknown character after (?P */
4707
4708 errorcode = ERR41;
4709 goto PCRE_ERROR_RETURN;
4710
4711 /* (?R) specifies a recursive call to the regex, which is an extension
4712 to provide the facility which can be obtained by (?p{perl-code}) in
4713 Perl 5.6. In Perl 5.8 this has become (??{perl-code}).
4714
4715 From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to
4716 the appropriate numbered brackets. This includes both recursive and
4717 non-recursive calls. (?R) is now synonymous with (?0). */
4718
4719 case 'R':
4720 ptr++;
4721
4722 case '0': case '1': case '2': case '3': case '4':
4723 case '5': case '6': case '7': case '8': case '9':
4724 ptr += 2;
4725 if (c != 'R')
4726 while ((digitab[*(++ptr)] & ctype_digit) != 0);
4727 if (*ptr != ')')
4728 {
4729 errorcode = ERR29;
4730 goto PCRE_ERROR_RETURN;
4731 }
4732 length += 3 + 3*LINK_SIZE; /* Allows for the automatic "once" */
4733
4734 /* If this item is quantified, it will get wrapped inside brackets so
4735 as to use the code for quantified brackets. We jump down and use the
4736 code that handles this for real brackets. Come here from code for
4737 named recursions/subroutines. */
4738
4739 RECURSE_CHECK_QUANTIFIED:
4740 if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')
4741 {
4742 length += 2 + 2 * LINK_SIZE; /* to make bracketed */
4743 duplength = 5 + 3 * LINK_SIZE;
4744 goto HANDLE_QUANTIFIED_BRACKETS;
4745 }
4746 continue;
4747
4748 /* (?C) is an extension which provides "callout" - to provide a bit of
4749 the functionality of the Perl (?{...}) feature. An optional number may
4750 follow (default is zero). */
4751
4752 case 'C':
4753 ptr += 2;
4754 while ((digitab[*(++ptr)] & ctype_digit) != 0);
4755 if (*ptr != ')')
4756 {
4757 errorcode = ERR39;
4758 goto PCRE_ERROR_RETURN;
4759 }
4760 length += 2 + 2*LINK_SIZE;
4761 continue;
4762
4763 /* Lookbehinds are in Perl from version 5.005 */
4764
4765 case '<':
4766 ptr += 3;
4767 if (*ptr == '=' || *ptr == '!')
4768 {
4769 branch_newextra = 1 + LINK_SIZE;
4770 length += 1 + LINK_SIZE; /* For the first branch */
4771 break;
4772 }
4773 errorcode = ERR24;
4774 goto PCRE_ERROR_RETURN;
4775
4776 /* Conditionals are in Perl from version 5.005. The bracket must either
4777 be followed by a number (for bracket reference) or by an assertion
4778 group. PCRE extends this by allowing a name to reference a named group;
4779 unfortunately, previously 'R' was implemented for a recursion test.
4780 When this is compiled, we look for the named group 'R' first. At this
4781 point we just do a basic syntax check. */
4782
4783 case '(':
4784 if ((cd->ctypes[ptr[3]] & ctype_word) != 0)
4785 {
4786 ptr += 4;
4787 length += 3;
4788 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4789 if (*ptr != ')')
4790 {
4791 errorcode = ERR26;
4792 goto PCRE_ERROR_RETURN;
4793 }
4794 }
4795 else /* An assertion must follow */
4796 {
4797 ptr++; /* Can treat like ':' as far as spacing is concerned */
4798 if (ptr[2] != '?' ||
4799 (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
4800 {
4801 ptr += 2; /* To get right offset in message */
4802 errorcode = ERR28;
4803 goto PCRE_ERROR_RETURN;
4804 }
4805 }
4806 break;
4807
4808 /* Else loop checking valid options until ) is met. Anything else is an
4809 error. If we are without any brackets, i.e. at top level, the settings
4810 act as if specified in the options, so massage the options immediately.
4811 This is for backward compatibility with Perl 5.004. */
4812
4813 default:
4814 set = unset = 0;
4815 optset = &set;
4816 ptr += 2;
4817
4818 for (;; ptr++)
4819 {
4820 c = *ptr;
4821 switch (c)
4822 {
4823 case 'i':
4824 *optset |= PCRE_CASELESS;
4825 continue;
4826
4827 case 'J':
4828 *optset |= PCRE_DUPNAMES;
4829 options |= PCRE_JCHANGED; /* Record that it changed */
4830 continue;
4831
4832 case 'm':
4833 *optset |= PCRE_MULTILINE;
4834 continue;
4835
4836 case 's':
4837 *optset |= PCRE_DOTALL;
4838 continue;
4839
4840 case 'x':
4841 *optset |= PCRE_EXTENDED;
4842 continue;
4843
4844 case 'X':
4845 *optset |= PCRE_EXTRA;
4846 continue;
4847
4848 case 'U':
4849 *optset |= PCRE_UNGREEDY;
4850 continue;
4851
4852 case '-':
4853 optset = &unset;
4854 continue;
4855
4856 /* A termination by ')' indicates an options-setting-only item; if
4857 this is at the very start of the pattern (indicated by item_count
4858 being zero), we use it to set the global options. This is helpful
4859 when analyzing the pattern for first characters, etc. Otherwise
4860 nothing is done here and it is handled during the compiling
4861 process.
4862
4863 We allow for more than one options setting at the start. If such
4864 settings do not change the existing options, nothing is compiled.
4865 However, we must leave space just in case something is compiled.
4866 This can happen for pathological sequences such as (?i)(?-i)
4867 because the global options will end up with -i set. The space is
4868 small and not significant. (Before I did this there was a reported
4869 bug with (?i)(?-i) in a machine-generated pattern.)
4870
4871 [Historical note: Up to Perl 5.8, options settings at top level
4872 were always global settings, wherever they appeared in the pattern.
4873 That is, they were equivalent to an external setting. From 5.8
4874 onwards, they apply only to what follows (which is what you might
4875 expect).] */
4876
4877 case ')':
4878 if (item_count == 0)
4879 {
4880 options = (options | set) & (~unset);
4881 set = unset = 0; /* To save length */
4882 item_count--; /* To allow for several */
4883 length += 2;
4884 }
4885
4886 /* Fall through */
4887
4888 /* A termination by ':' indicates the start of a nested group with
4889 the given options set. This is again handled at compile time, but
4890 we must allow for compiled space if any of the ims options are
4891 set. We also have to allow for resetting space at the end of
4892 the group, which is why 4 is added to the length and not just 2.
4893 If there are several changes of options within the same group, this
4894 will lead to an over-estimate on the length, but this shouldn't
4895 matter very much. We also have to allow for resetting options at
4896 the start of any alternations, which we do by setting
4897 branch_newextra to 2. */
4898
4899 case ':':
4900 if (((set|unset) & PCRE_IMS) != 0)
4901 {
4902 length += 4;
4903 branch_newextra = 2;
4904 }
4905 goto END_OPTIONS;
4906
4907 /* Unrecognized option character */
4908
4909 default:
4910 errorcode = ERR12;
4911 goto PCRE_ERROR_RETURN;
4912 }
4913 }
4914
4915 /* If we hit a closing bracket, that's it - this is a freestanding
4916 option-setting. We need to ensure that branch_extra is updated if
4917 necessary. The only values branch_newextra can have here are 0 or 2.
4918 If the value is 2, then branch_extra must either be 2 or 5, depending
4919 on whether this is a lookbehind group or not. */
4920
4921 END_OPTIONS:
4922 if (c == ')')
4923 {
4924 if (branch_newextra == 2 &&
4925 (branch_extra == 0 || branch_extra == 1+LINK_SIZE))
4926 branch_extra += branch_newextra;
4927 continue;
4928 }
4929
4930 /* If options were terminated by ':' control comes here. This is a
4931 non-capturing group with an options change. There is nothing more that
4932 needs to be done because "capturing" is already set FALSE by default;
4933 we can just fall through. */
4934
4935 }
4936 }
4937
4938 /* Ordinary parentheses, not followed by '?', are capturing unless
4939 PCRE_NO_AUTO_CAPTURE is set. */
4940
4941 else capturing = (options & PCRE_NO_AUTO_CAPTURE) == 0;
4942
4943 /* Capturing brackets must be counted so we can process escapes in a
4944 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to need
4945 an additional 3 bytes of memory per capturing bracket. */
4946
4947 if (capturing)
4948 {
4949 bracount++;
4950 if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
4951 }
4952
4953 /* Save length for computing whole length at end if there's a repeat that
4954 requires duplication of the group. Also save the current value of
4955 branch_extra, and start the new group with the new value. If non-zero, this
4956 will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
4957
4958 if (brastackptr >= sizeof(brastack)/sizeof(int))
4959 {
4960 errorcode = ERR19;
4961 goto PCRE_ERROR_RETURN;
4962 }
4963
4964 bralenstack[brastackptr] = branch_extra;
4965 branch_extra = branch_newextra;
4966
4967 brastack[brastackptr++] = length;
4968 length += bracket_length;
4969 continue;
4970
4971 /* Handle ket. Look for subsequent max/min; for certain sets of values we
4972 have to replicate this bracket up to that many times. If brastackptr is
4973 0 this is an unmatched bracket which will generate an error, but take care
4974 not to try to access brastack[-1] when computing the length and restoring
4975 the branch_extra value. */
4976
4977 case ')':
4978 length += 1 + LINK_SIZE;
4979 if (brastackptr > 0)
4980 {
4981 duplength = length - brastack[--brastackptr];
4982 branch_extra = bralenstack[brastackptr];
4983 /* This is a paranoid check to stop integer overflow later on */
4984 if (duplength > MAX_DUPLENGTH)
4985 {
4986 errorcode = ERR50;
4987 goto PCRE_ERROR_RETURN;
4988 }
4989 }
4990 else duplength = 0;
4991
4992 /* The following code is also used when a recursion such as (?3) is
4993 followed by a quantifier, because in that case, it has to be wrapped inside
4994 brackets so that the quantifier works. The value of duplength must be
4995 set before arrival. */
4996
4997 HANDLE_QUANTIFIED_BRACKETS:
4998
4999 /* Leave ptr at the final char; for read_repeat_counts this happens
5000 automatically; for the others we need an increment. */
5001
5002 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
5003 {
5004 ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
5005 if (errorcode != 0) goto PCRE_ERROR_RETURN;
5006 }
5007 else if (c == '*') { min = 0; max = -1; ptr++; }
5008 else if (c == '+') { min = 1; max = -1; ptr++; }
5009 else if (c == '?') { min = 0; max = 1; ptr++; }
5010 else { min = 1; max = 1; }
5011
5012 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
5013 group, and if the maximum is greater than zero, we have to replicate
5014 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
5015 bracket set. */
5016
5017 if (min == 0)
5018 {
5019 length++;
5020 if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
5021 }
5022
5023 /* When the minimum is greater than zero, we have to replicate up to
5024 minval-1 times, with no additions required in the copies. Then, if there
5025 is a limited maximum we have to replicate up to maxval-1 times allowing
5026 for a BRAZERO item before each optional copy and nesting brackets for all
5027 but one of the optional copies. */
5028
5029 else
5030 {
5031 length += (min - 1) * duplength;
5032 if (max > min) /* Need this test as max=-1 means no limit */
5033 length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
5034 - (2 + 2*LINK_SIZE);
5035 }
5036
5037 /* Allow space for once brackets for "possessive quantifier" */
5038
5039 if (ptr[1] == '+')
5040 {
5041 ptr++;
5042 length += 2 + 2*LINK_SIZE;
5043 }
5044 continue;
5045
5046 /* Non-special character. It won't be space or # in extended mode, so it is
5047 always a genuine character. If we are in a \Q...\E sequence, check for the
5048 end; if not, we have a literal. */
5049
5050 default:
5051 NORMAL_CHAR:
5052
5053 if (inescq && c == '\\' && ptr[1] == 'E')
5054 {
5055 inescq = FALSE;
5056 ptr++;
5057 continue;
5058 }
5059
5060 length += 2; /* For a one-byte character */
5061 lastitemlength = 1; /* Default length of last item for repeats */
5062
5063 /* In UTF-8 mode, check for additional bytes. */
5064
5065 #ifdef SUPPORT_UTF8
5066 if (utf8 && (c & 0xc0) == 0xc0)
5067 {
5068 while ((ptr[1] & 0xc0) == 0x80) /* Can't flow over the end */
5069 { /* because the end is marked */
5070 lastitemlength++; /* by a zero byte. */
5071 length++;
5072 ptr++;
5073 }
5074 }
5075 #endif
5076
5077 continue;
5078 }
5079 }
5080
5081 length += 2 + LINK_SIZE; /* For final KET and END */
5082
5083 if ((options & PCRE_AUTO_CALLOUT) != 0)
5084 length += 2 + 2*LINK_SIZE; /* For final callout */
5085
5086 if (length > MAX_PATTERN_SIZE)
5087 {
5088 errorcode = ERR20;
5089 goto PCRE_EARLY_ERROR_RETURN;
5090 }
5091
5092 /* Compute the size of data block needed and get it, either from malloc or
5093 externally provided function. Integer overflow should no longer be possible
5094 because nowadays we limit the maximum value of name_count and max_name_size. */
5095
5096 size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
5097 re = (real_pcre *)(pcre_malloc)(size);
5098
5099 if (re == NULL)
5100 {
5101 errorcode = ERR21;
5102 goto PCRE_EARLY_ERROR_RETURN;
5103 }
5104
5105 /* Put in the magic number, and save the sizes, options, and character table
5106 pointer. NULL is used for the default character tables. The nullpad field is at
5107 the end; it's there to help in the case when a regex compiled on a system with
5108 4-byte pointers is run on another with 8-byte pointers. */
5109
5110 re->magic_number = MAGIC_NUMBER;
5111 re->size = size;
5112 re->options = options;
5113 re->dummy1 = 0;
5114 re->name_table_offset = sizeof(real_pcre);
5115 re->name_entry_size = max_name_size + 3;
5116 re->name_count = name_count;
5117 re->ref_count = 0;
5118 re->tables = (tables == _pcre_default_tables)? NULL : tables;
5119 re->nullpad = NULL;
5120
5121 /* The starting points of the name/number translation table and of the code are
5122 passed around in the compile data block. */
5123
5124 cd->names_found = 0;
5125 cd->name_entry_size = max_name_size + 3;
5126 cd->name_table = (uschar *)re + re->name_table_offset;
5127 codestart = cd->name_table + re->name_entry_size * re->name_count;
5128 cd->start_code = codestart;
5129 cd->start_pattern = (const uschar *)pattern;
5130 cd->req_varyopt = 0;
5131 cd->nopartial = FALSE;
5132
5133 /* Set up a starting, non-extracting bracket, then compile the expression. On
5134 error, errorcode will be set non-zero, so we don't need to look at the result
5135 of the function here. */
5136
5137 ptr = (const uschar *)pattern;
5138 code = (uschar *)codestart;
5139 *code = OP_BRA;
5140 bracount = 0;
5141 (void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,
5142 &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd);
5143 re->top_bracket = bracount;
5144 re->top_backref = cd->top_backref;
5145
5146 if (cd->nopartial) re->options |= PCRE_NOPARTIAL;
5147
5148 /* If not reached end of pattern on success, there's an excess bracket. */
5149
5150 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
5151
5152 /* Fill in the terminating state and check for disastrous overflow, but
5153 if debugging, leave the test till after things are printed out. */
5154
5155 *code++ = OP_END;
5156
5157 #ifndef DEBUG
5158 if (code - codestart > length) errorcode = ERR23;
5159 #endif
5160
5161 /* Give an error if there's a back reference to a non-existent capturing
5162 subpattern. */
5163
5164 if (re->top_backref > re->top_bracket) errorcode = ERR15;
5165
5166 /* Failed to compile, or error while post-processing */
5167
5168 if (errorcode != 0)
5169 {
5170 (pcre_free)(re);
5171 PCRE_ERROR_RETURN:
5172 *erroroffset = ptr - (const uschar *)pattern;
5173 PCRE_EARLY_ERROR_RETURN:
5174 *errorptr = error_texts[errorcode];
5175 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
5176 return NULL;
5177 }
5178
5179 /* If the anchored option was not passed, set the flag if we can determine that
5180 the pattern is anchored by virtue of ^ characters or \A or anything else (such
5181 as starting with .* when DOTALL is set).
5182
5183 Otherwise, if we know what the first character has to be, save it, because that
5184 speeds up unanchored matches no end. If not, see if we can set the
5185 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
5186 start with ^, and also when all branches start with .* for non-DOTALL matches.
5187 */
5188
5189 if ((options & PCRE_ANCHORED) == 0)
5190 {
5191 int temp_options = options;
5192 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
5193 re->options |= PCRE_ANCHORED;
5194 else
5195 {
5196 if (firstbyte < 0)
5197 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
5198 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
5199 {
5200 int ch = firstbyte & 255;
5201 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
5202 cd->fcc[ch] == ch)? ch : firstbyte;
5203 re->options |= PCRE_FIRSTSET;
5204 }
5205 else if (is_startline(codestart, 0, cd->backref_map))
5206 re->options |= PCRE_STARTLINE;
5207 }
5208 }
5209
5210 /* For an anchored pattern, we use the "required byte" only if it follows a
5211 variable length item in the regex. Remove the caseless flag for non-caseable
5212 bytes. */
5213
5214 if (reqbyte >= 0 &&
5215 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
5216 {
5217 int ch = reqbyte & 255;
5218 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
5219 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5220 re->options |= PCRE_REQCHSET;
5221 }
5222
5223 /* Print out the compiled data if debugging is enabled. This is never the
5224 case when building a production library. */
5225
5226 #ifdef DEBUG
5227
5228 printf("Length = %d top_bracket = %d top_backref = %d\n",
5229 length, re->top_bracket, re->top_backref);
5230
5231 if (re->options != 0)
5232 {
5233 printf("%s%s%s%s%s%s%s%s%s\n",
5234 ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5235 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5236 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5237 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5238 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5239 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5240 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5241 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5242 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5243 }
5244
5245 if ((re->options & PCRE_FIRSTSET) != 0)
5246 {
5247 int ch = re->first_byte & 255;
5248 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
5249 "" : " (caseless)";
5250 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5251 else printf("First char = \\x%02x%s\n", ch, caseless);
5252 }
5253
5254 if ((re->options & PCRE_REQCHSET) != 0)
5255 {
5256 int ch = re->req_byte & 255;
5257 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
5258 "" : " (caseless)";
5259 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5260 else printf("Req char = \\x%02x%s\n", ch, caseless);
5261 }
5262
5263 pcre_printint(re, stdout);
5264
5265 /* This check is done here in the debugging case so that the code that
5266 was compiled can be seen. */
5267
5268 if (code - codestart > length)
5269 {
5270 (pcre_free)(re);
5271 *errorptr = error_texts[ERR23];
5272 *erroroffset = ptr - (uschar *)pattern;
5273 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
5274 return NULL;
5275 }
5276 #endif
5277
5278 return (pcre *)re;
5279 }
5280
5281 /* End of pcre_compile.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12