/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 87 - (show annotations) (download)
Sat Feb 24 21:41:21 2007 UTC (7 years, 5 months ago) by nigel
File MIME type: text/plain
File size: 162063 byte(s)
Load pcre-6.5 into code/trunk.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2006 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #include "pcre_internal.h"
46
47
48 /* When DEBUG is defined, we need the pcre_printint() function, which is also
49 used by pcretest. DEBUG is not defined when building a production library. */
50
51 #ifdef DEBUG
52 #include "pcre_printint.src"
53 #endif
54
55
56
57 /*************************************************
58 * Code parameters and static tables *
59 *************************************************/
60
61 /* Maximum number of items on the nested bracket stacks at compile time. This
62 applies to the nesting of all kinds of parentheses. It does not limit
63 un-nested, non-capturing parentheses. This number can be made bigger if
64 necessary - it is used to dimension one int and one unsigned char vector at
65 compile time. */
66
67 #define BRASTACK_SIZE 200 /* nesting limit for parentheses; dimensions the compile-time bracket stacks */
68
69
70 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
71 are simple data values; negative values are for special things like \d and so
72 on. Zero means further processing is needed (for things like \x), or the escape
73 is invalid. */
74
75 #if !EBCDIC /* This is the "normal" table for ASCII systems; check_escape() indexes it by (c - '0') */
76 static const short int escapes[] = {
77 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
78 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
79 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
80 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
81 -ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
82 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
83 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
84 0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
85 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
86 0, 0, -ESC_z /* x - z */
87 };
88
89 #else /* This is the "abnormal" table for EBCDIC systems; check_escape() indexes it by (c - 0x48) */
/* The hexadecimal number in the comment at the start of each row is the
character code that corresponds to the first entry of that row. */
90 static const short int escapes[] = {
91 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
92 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
93 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
94 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
95 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
96 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
97 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
98 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
99 /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
100 /* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p,
101 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
102 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
103 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
104 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
105 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
106 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
107 /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
108 /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
109 /* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0,
110 /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
111 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
112 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
113 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
114 };
115 #endif
116
117
118 /* Tables of names of POSIX character classes and their lengths. The list is
119 terminated by a zero length entry. The first three must be alpha, lower, upper,
120 as this is assumed for handling case independence. */
121
/* Names of the POSIX character classes, one per line. The order matters: the
entries are positional, and alpha, lower, upper must stay in the first three
slots. */

static const char *const posix_names[] = {
  "alpha",
  "lower",
  "upper",
  "alnum",
  "ascii",
  "blank",
  "cntrl",
  "digit",
  "graph",
  "print",
  "punct",
  "space",
  "word",
  "xdigit"
};
126
127 static const uschar posix_name_lengths[] = {
128 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
129
130 /* Table of class bit maps for each POSIX class. Each class is formed from a
131 base map, with an optional addition or removal of another map. Then, for some
132 classes, there is some additional tweaking: for [:blank:] the vertical space
133 characters are removed, and for [:alpha:] and [:alnum:] the underscore
134 character is removed. The triples in the table consist of the base map offset,
135 second map offset or -1 if no second map, and a non-negative value for map
136 addition or a negative value for map subtraction (if there are two maps). The
137 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
138 remove vertical space characters, 2 => remove underscore. */
139
140 static const int posix_class_maps[] = {
141 cbit_word, cbit_digit, -2, /* alpha */
142 cbit_lower, -1, 0, /* lower */
143 cbit_upper, -1, 0, /* upper */
144 cbit_word, -1, 2, /* alnum - word without underscore */
145 cbit_print, cbit_cntrl, 0, /* ascii */
146 cbit_space, -1, 1, /* blank - a GNU extension */
147 cbit_cntrl, -1, 0, /* cntrl */
148 cbit_digit, -1, 0, /* digit */
149 cbit_graph, -1, 0, /* graph */
150 cbit_print, -1, 0, /* print */
151 cbit_punct, -1, 0, /* punct */
152 cbit_space, -1, 0, /* space */
153 cbit_word, -1, 0, /* word - a Perl extension */
154 cbit_xdigit,-1, 0 /* xdigit */
155 };
156
157
158 /* The texts of compile-time error messages. These are "char *" because they
159 are passed to the outside world. */
160
/* Compile-time error messages. The position of a message in this table is
significant: the number in the comment at the head of each group of five is
the index of the first message in that group (for example, the quantifier code
in this file reports "numbers out of order" as ERR4 and "number too big" as
ERR5, matching entries 4 and 5 below). Do not insert or reorder entries.

The strings are plain "char" because they are handed to the caller. Both the
characters and the pointers in the table are const, so the table itself cannot
be modified at run time. */

static const char *const error_texts[] = {
  /* 0 */
  "no error",
  "\\ at end of pattern",
  "\\c at end of pattern",
  "unrecognized character follows \\",
  "numbers out of order in {} quantifier",
  /* 5 */
  "number too big in {} quantifier",
  "missing terminating ] for character class",
  "invalid escape sequence in character class",
  "range out of order in character class",
  "nothing to repeat",
  /* 10 */
  "operand of unlimited repeat could match the empty string",
  "internal error: unexpected repeat",
  "unrecognized character after (?",
  "POSIX named classes are supported only within a class",
  "missing )",
  /* 15 */
  "reference to non-existent subpattern",
  "erroffset passed as NULL",
  "unknown option bit(s) set",
  "missing ) after comment",
  "parentheses nested too deeply",
  /* 20 */
  "regular expression too large",
  "failed to get memory",
  "unmatched parentheses",
  "internal error: code overflow",
  "unrecognized character after (?<",
  /* 25 */
  "lookbehind assertion is not fixed length",
  "malformed number after (?(",
  "conditional group contains more than two branches",
  "assertion expected after (?(",
  "(?R or (?digits must be followed by )",
  /* 30 */
  "unknown POSIX class name",
  "POSIX collating elements are not supported",
  "this version of PCRE is not compiled with PCRE_UTF8 support",
  "spare error",
  "character value in \\x{...} sequence is too large",
  /* 35 */
  "invalid condition (?(0)",
  "\\C not allowed in lookbehind assertion",
  "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
  "number after (?C is > 255",
  "closing ) for (?C expected",
  /* 40 */
  "recursive call could loop indefinitely",
  "unrecognized character after (?P",
  "syntax error after (?P",
  "two named groups have the same name",
  "invalid UTF-8 string",
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled",
  "malformed \\P or \\p sequence",
  "unknown property name after \\P or \\p"
};
220
221
222 /* Table to identify digits and hex digits. This is used when compiling
223 patterns. Note that the tables in chartables are dependent on the locale, and
224 may mark arbitrary characters as digits - but the PCRE compiling code expects
225 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
226 a private table here. It costs 256 bytes, but it is a lot faster than doing
227 character value tests (at least in some simple cases I timed), and in some
228 applications one wants PCRE to compile efficiently as well as match
229 efficiently.
230
231 For convenience, we use the same bit definitions as in chartables:
232
233 0x04 decimal digit
234 0x08 hexadecimal digit
235
236 Then we can use ctype_digit and ctype_xdigit in the code. */
237
238 #if !EBCDIC /* This is the "normal" case, for ASCII systems */
/* digitab is indexed directly by an unsigned byte value taken from the
pattern; 0x0c marks a decimal digit (which is also a hex digit) and 0x08 marks
a letter that is a hex digit only. */
239 static const unsigned char digitab[] =
240 {
241 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
242 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
243 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
244 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
245 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
246 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
247 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
248 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
249 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
250 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
251 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
252 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
253 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
254 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
255 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
256 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
257 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
258 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
259 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
260 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
261 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
262 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
263 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
264 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
265 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
266 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
267 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
268 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
269 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
270 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
271 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
272 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
273
274 #else /* This is the "abnormal" case, for EBCDIC systems */
/* Same layout and bit meanings as the ASCII table, but with the digit and
hex-letter entries at their EBCDIC code points. The hexadecimal value at the
far right of every second row is the code of that row's first entry. */
275 static const unsigned char digitab[] =
276 {
277 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
278 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
279 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
280 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
281 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
282 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
283 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
284 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
285 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
286 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
287 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
288 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- */
289 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
290 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
291 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
292 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
293 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
294 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
295 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
296 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
297 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
298 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
299 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
300 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
301 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
302 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
303 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
304 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
305 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
306 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
307 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
308 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
309
/* EBCDIC builds only. check_escape() tests an entry of this table against the
mask 0x0E to decide whether a character is alphameric. */
310 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
311 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
312 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
313 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
314 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
315 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
316 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
317 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
318 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
319 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
320 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
321 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
322 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- */
323 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
324 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
325 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
326 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
327 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
328 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
329 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
330 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
331 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
332 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
333 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
334 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
335 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
336 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
337 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
338 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
339 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
340 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
341 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
342 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
343 #endif
344
345
346 /* Definition to allow mutual recursion */
347
/* Prototype only. The parameters are unnamed here, so the meaning of each
argument has to be read from the function's definition. */
348 static BOOL
349 compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,
350 int *, int *, branch_chain *, compile_data *);
351
352
353
354 /*************************************************
355 * Handle escapes *
356 *************************************************/
357
358 /* This function is called when a \ has been encountered. It either returns a
359 positive value for a simple escape such as \n, or a negative value which
360 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
361 a positive value greater than 255 may be returned. On entry, ptr is pointing at
362 the \. On exit, it is on the final character of the escape sequence.
363
364 Arguments:
365 ptrptr points to the pattern position pointer
366 errorcodeptr points to the errorcode variable
367 bracount number of previous extracting brackets
368 options the options bits
369 isclass TRUE if inside a character class
370
371 Returns: zero or positive => a data character
372 negative => a special escape sequence
373 on error, errorptr is set
374 */
375
376 static int
377 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
378 int options, BOOL isclass)
379 {
380 BOOL utf8 = (options & PCRE_UTF8) != 0;
381 const uschar *ptr = *ptrptr + 1;
382 int c, i;
383
384 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
385 ptr--; /* Set pointer back to the last byte */
386
387 /* If backslash is at the end of the pattern, it's an error. */
388
389 if (c == 0) *errorcodeptr = ERR1;
390
391 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
392 a table. A non-zero result is something that can be returned immediately.
393 Otherwise further processing may be required. */
394
395 #if !EBCDIC /* ASCII coding */
396 else if (c < '0' || c > 'z') {} /* Not alphameric */
397 else if ((i = escapes[c - '0']) != 0) c = i;
398
399 #else /* EBCDIC coding */
400 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
401 else if ((i = escapes[c - 0x48]) != 0) c = i;
402 #endif
403
404 /* Escapes that need further processing, or are illegal. */
405
406 else
407 {
408 const uschar *oldptr;
409 switch (c)
410 {
411 /* A number of Perl escapes are not handled by PCRE. We give an explicit
412 error. */
413
414 case 'l':
415 case 'L':
416 case 'N':
417 case 'u':
418 case 'U':
419 *errorcodeptr = ERR37;
420 break;
421
422 /* The handling of escape sequences consisting of a string of digits
423 starting with one that is not zero is not straightforward. By experiment,
424 the way Perl works seems to be as follows:
425
426 Outside a character class, the digits are read as a decimal number. If the
427 number is less than 10, or if there are that many previous extracting
428 left brackets, then it is a back reference. Otherwise, up to three octal
429 digits are read to form an escaped byte. Thus \123 is likely to be octal
430 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
431 value is greater than 377, the least significant 8 bits are taken. Inside a
432 character class, \ followed by a digit is always an octal number. */
433
434 case '1': case '2': case '3': case '4': case '5':
435 case '6': case '7': case '8': case '9':
436
437 if (!isclass)
438 {
439 oldptr = ptr;
440 c -= '0';
441 while ((digitab[ptr[1]] & ctype_digit) != 0)
442 c = c * 10 + *(++ptr) - '0';
443 if (c < 10 || c <= bracount)
444 {
445 c = -(ESC_REF + c);
446 break;
447 }
448 ptr = oldptr; /* Put the pointer back and fall through */
449 }
450
451 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
452 generates a binary zero byte and treats the digit as a following literal.
453 Thus we have to pull back the pointer by one. */
454
455 if ((c = *ptr) >= '8')
456 {
457 ptr--;
458 c = 0;
459 break;
460 }
461
462 /* \0 always starts an octal number, but we may drop through to here with a
463 larger first octal digit. */
464
465 case '0':
466 c -= '0';
467 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
468 c = c * 8 + *(++ptr) - '0';
469 c &= 255; /* Take least significant 8 bits */
470 break;
471
472 /* \x is complicated. \x{ddd} is a character number which can be greater
473 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
474 treated as a data character. */
475
476 case 'x':
477 if (ptr[1] == '{')
478 {
479 const uschar *pt = ptr + 2;
480 int count = 0;
481
482 c = 0;
483 while ((digitab[*pt] & ctype_xdigit) != 0)
484 {
485 register int cc = *pt++;
486 if (c == 0 && cc == '0') continue; /* Leading zeroes */
487 count++;
488
489 #if !EBCDIC /* ASCII coding */
490 if (cc >= 'a') cc -= 32; /* Convert to upper case */
491 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
492 #else /* EBCDIC coding */
493 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
494 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
495 #endif
496 }
497
498 if (*pt == '}')
499 {
500 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
501 ptr = pt;
502 break;
503 }
504
505 /* If the sequence of hex digits does not end with '}', then we don't
506 recognize this construct; fall through to the normal \x handling. */
507 }
508
509 /* Read just a single-byte hex-defined char */
510
511 c = 0;
512 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
513 {
514 int cc; /* Some compilers don't like ++ */
515 cc = *(++ptr); /* in initializers */
516 #if !EBCDIC /* ASCII coding */
517 if (cc >= 'a') cc -= 32; /* Convert to upper case */
518 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
519 #else /* EBCDIC coding */
520 if (cc <= 'z') cc += 64; /* Convert to upper case */
521 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
522 #endif
523 }
524 break;
525
526 /* Other special escapes not starting with a digit are straightforward */
527
528 case 'c':
529 c = *(++ptr);
530 if (c == 0)
531 {
532 *errorcodeptr = ERR2;
533 return 0;
534 }
535
536 /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
537 is ASCII-specific, but then the whole concept of \cx is ASCII-specific.
538 (However, an EBCDIC equivalent has now been added.) */
539
540 #if !EBCDIC /* ASCII coding */
541 if (c >= 'a' && c <= 'z') c -= 32;
542 c ^= 0x40;
543 #else /* EBCDIC coding */
544 if (c >= 'a' && c <= 'z') c += 64;
545 c ^= 0xC0;
546 #endif
547 break;
548
549 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
550 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
551 for Perl compatibility, it is a literal. This code looks a bit odd, but
552 there used to be some cases other than the default, and there may be again
553 in future, so I haven't "optimized" it. */
554
555 default:
556 if ((options & PCRE_EXTRA) != 0) switch(c)
557 {
558 default:
559 *errorcodeptr = ERR3;
560 break;
561 }
562 break;
563 }
564 }
565
566 *ptrptr = ptr;
567 return c;
568 }
569
570
571
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* Decode the property name that follows \P or \p. This is compiled only when
Unicode property support is enabled. On entry, *ptrptr points at the P or p;
on exit it points at the last character that was consumed.

The name is either a single character, or a string in braces that may start
with ^ to request negation. A braced name must be shorter than the local
32-byte buffer, and must be closed by } before the end of the pattern. The
name is then looked up in _pcre_utt by binary chop.

Arguments:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
                   (left untouched if the pattern ends right after P or p)
  dptr           points to an int that receives the value field of the
                   matching table entry
  errorcodeptr   points to the error code variable

Returns:         the type field of the matching table entry, or -1 with
                 *errorcodeptr set to ERR46 (malformed sequence) or ERR47
                 (name not found in the table)
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
const uschar *p = *ptrptr;
char name[32];
int ch, len, lo, hi;

ch = *(++p);
if (ch == 0) goto MALFORMED;

*negptr = FALSE;

if (ch != '{')
  {
  /* Just one following character forms the whole name */
  name[0] = ch;
  name[1] = 0;
  }
else
  {
  /* A name in braces, optionally preceded by ^ for negation */
  if (p[1] == '^')
    {
    *negptr = TRUE;
    p++;
    }

  for (len = 0; ; len++)
    {
    if (len == (int)sizeof(name) - 1) goto MALFORMED;   /* Name too long */
    ch = *(++p);
    if (ch == 0) goto MALFORMED;                        /* Pattern ended */
    if (ch == '}') break;
    name[len] = ch;
    }
  name[len] = 0;
  }

*ptrptr = p;

/* Binary chop over the property name table */

lo = 0;
hi = _pcre_utt_size;

while (lo < hi)
  {
  int mid = (lo + hi) >> 1;
  int cmp = strcmp(name, _pcre_utt[mid].name);

  if (cmp > 0) lo = mid + 1;
  else if (cmp < 0) hi = mid;
  else
    {
    *dptr = _pcre_utt[mid].value;
    return _pcre_utt[mid].type;
    }
  }

/* The name is well formed, but it is not in the table */

*errorcodeptr = ERR47;
return -1;

MALFORMED:
*errorcodeptr = ERR46;
*ptrptr = p;
return -1;
}
#endif
661
662
663
664
665 /*************************************************
666 * Check for counted repeat *
667 *************************************************/
668
669 /* This function is called when a '{' is encountered in a place where it might
670 start a quantifier. It looks ahead to see if it really is a quantifier or not.
671 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
672 where the ddds are digits.
673
674 Arguments:
675 p pointer to the first char after '{'
676
677 Returns: TRUE or FALSE
678 */
679
680 static BOOL
681 is_counted_repeat(const uschar *p)
682 {
683 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
684 while ((digitab[*p] & ctype_digit) != 0) p++;
685 if (*p == '}') return TRUE;
686
687 if (*p++ != ',') return FALSE;
688 if (*p == '}') return TRUE;
689
690 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
691 while ((digitab[*p] & ctype_digit) != 0) p++;
692
693 return (*p == '}');
694 }
695
696
697
698 /*************************************************
699 * Read repeat counts *
700 *************************************************/
701
702 /* Read an item of the form {n,m} and return the values. This is called only
703 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
704 so the syntax is guaranteed to be correct, but we need to check the values.
705
706 Arguments:
707 p pointer to first char after '{'
708 minp pointer to int for min
709 maxp pointer to int for max
710 returned as -1 if no max
711 errorcodeptr points to error code variable
712
713 Returns: pointer to '}' on success;
714 current ptr on error, with errorcodeptr set non-zero
715 */
716
717 static const uschar *
718 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
719 {
720 int min = 0;
721 int max = -1;
722
723 /* Read the minimum value and do a paranoid check: a negative value indicates
724 an integer overflow. */
725
726 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
727 if (min < 0 || min > 65535)
728 {
729 *errorcodeptr = ERR5;
730 return p;
731 }
732
733 /* Read the maximum value if there is one, and again do a paranoid on its size.
734 Also, max must not be less than min. */
735
736 if (*p == '}') max = min; else
737 {
738 if (*(++p) != '}')
739 {
740 max = 0;
741 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
742 if (max < 0 || max > 65535)
743 {
744 *errorcodeptr = ERR5;
745 return p;
746 }
747 if (max < min)
748 {
749 *errorcodeptr = ERR4;
750 return p;
751 }
752 }
753 }
754
755 /* Fill in the required variables, and pass back the pointer to the terminating
756 '}'. */
757
758 *minp = min;
759 *maxp = max;
760 return p;
761 }
762
763
764
765 /*************************************************
766 * Find first significant op code *
767 *************************************************/
768
769 /* This is called by several functions that scan a compiled expression looking
770 for a fixed first character, or an anchoring op code etc. It skips over things
771 that do not influence this. For some calls, a change of option is important.
772 For some calls, it makes sense to skip negative forward and all backward
773 assertions, and also the \b assertion; for others it does not.
774
775 Arguments:
776 code pointer to the start of the group
777 options pointer to external options
778 optbit the option bit whose changing is significant, or
779 zero if none are
780 skipassert TRUE if certain assertions are to be skipped
781
782 Returns: pointer to the first significant opcode
783 */
784
785 static const uschar*
786 first_significant_code(const uschar *code, int *options, int optbit,
787 BOOL skipassert)
788 {
789 for (;;)
790 {
791 switch ((int)*code)
792 {
793 case OP_OPT:
794 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
795 *options = (int)code[1];
796 code += 2;
797 break;
798
799 case OP_ASSERT_NOT:
800 case OP_ASSERTBACK:
801 case OP_ASSERTBACK_NOT:
802 if (!skipassert) return code;
803 do code += GET(code, 1); while (*code == OP_ALT);
804 code += _pcre_OP_lengths[*code];
805 break;
806
807 case OP_WORD_BOUNDARY:
808 case OP_NOT_WORD_BOUNDARY:
809 if (!skipassert) return code;
810 /* Fall through */
811
812 case OP_CALLOUT:
813 case OP_CREF:
814 case OP_BRANUMBER:
815 code += _pcre_OP_lengths[*code];
816 break;
817
818 default:
819 return code;
820 }
821 }
822 /* Control never reaches here */
823 }
824
825
826
827
828 /*************************************************
829 * Find the fixed length of a pattern *
830 *************************************************/
831
832 /* Scan a pattern and compute the fixed length of subject that will match it,
833 if the length is fixed. This is needed for dealing with backward assertions.
834 In UTF8 mode, the result is in characters rather than bytes.
835
836 Arguments:
837 code points to the start of the pattern (the bracket)
838 options the compiling options
839
840 Returns: the fixed length, or -1 if there is no fixed length,
841 or -2 if \C was encountered
842 */
843
844 static int
845 find_fixedlength(uschar *code, int options)
846 {
847 int length = -1;
848
849 register int branchlength = 0;
850 register uschar *cc = code + 1 + LINK_SIZE;
851
852 /* Scan along the opcodes for this branch. If we get to the end of the
853 branch, check the length against that of the other branches. */
854
855 for (;;)
856 {
857 int d;
858 register int op = *cc;
859 if (op >= OP_BRA) op = OP_BRA;
860
861 switch (op)
862 {
863 case OP_BRA:
864 case OP_ONCE:
865 case OP_COND:
866 d = find_fixedlength(cc, options);
867 if (d < 0) return d;
868 branchlength += d;
869 do cc += GET(cc, 1); while (*cc == OP_ALT);
870 cc += 1 + LINK_SIZE;
871 break;
872
873 /* Reached end of a branch; if it's a ket it is the end of a nested
874 call. If it's ALT it is an alternation in a nested call. If it is
875 END it's the end of the outer call. All can be handled by the same code. */
876
877 case OP_ALT:
878 case OP_KET:
879 case OP_KETRMAX:
880 case OP_KETRMIN:
881 case OP_END:
882 if (length < 0) length = branchlength;
883 else if (length != branchlength) return -1;
884 if (*cc != OP_ALT) return length;
885 cc += 1 + LINK_SIZE;
886 branchlength = 0;
887 break;
888
889 /* Skip over assertive subpatterns */
890
891 case OP_ASSERT:
892 case OP_ASSERT_NOT:
893 case OP_ASSERTBACK:
894 case OP_ASSERTBACK_NOT:
895 do cc += GET(cc, 1); while (*cc == OP_ALT);
896 /* Fall through */
897
898 /* Skip over things that don't match chars */
899
900 case OP_REVERSE:
901 case OP_BRANUMBER:
902 case OP_CREF:
903 case OP_OPT:
904 case OP_CALLOUT:
905 case OP_SOD:
906 case OP_SOM:
907 case OP_EOD:
908 case OP_EODN:
909 case OP_CIRC:
910 case OP_DOLL:
911 case OP_NOT_WORD_BOUNDARY:
912 case OP_WORD_BOUNDARY:
913 cc += _pcre_OP_lengths[*cc];
914 break;
915
916 /* Handle literal characters */
917
918 case OP_CHAR:
919 case OP_CHARNC:
920 branchlength++;
921 cc += 2;
922 #ifdef SUPPORT_UTF8
923 if ((options & PCRE_UTF8) != 0)
924 {
925 while ((*cc & 0xc0) == 0x80) cc++;
926 }
927 #endif
928 break;
929
930 /* Handle exact repetitions. The count is already in characters, but we
931 need to skip over a multibyte character in UTF8 mode. */
932
933 case OP_EXACT:
934 branchlength += GET2(cc,1);
935 cc += 4;
936 #ifdef SUPPORT_UTF8
937 if ((options & PCRE_UTF8) != 0)
938 {
939 while((*cc & 0x80) == 0x80) cc++;
940 }
941 #endif
942 break;
943
944 case OP_TYPEEXACT:
945 branchlength += GET2(cc,1);
946 cc += 4;
947 break;
948
949 /* Handle single-char matchers */
950
951 case OP_PROP:
952 case OP_NOTPROP:
953 cc += 2;
954 /* Fall through */
955
956 case OP_NOT_DIGIT:
957 case OP_DIGIT:
958 case OP_NOT_WHITESPACE:
959 case OP_WHITESPACE:
960 case OP_NOT_WORDCHAR:
961 case OP_WORDCHAR:
962 case OP_ANY:
963 branchlength++;
964 cc++;
965 break;
966
967 /* The single-byte matcher isn't allowed */
968
969 case OP_ANYBYTE:
970 return -2;
971
972 /* Check a class for variable quantification */
973
974 #ifdef SUPPORT_UTF8
975 case OP_XCLASS:
976 cc += GET(cc, 1) - 33;
977 /* Fall through */
978 #endif
979
980 case OP_CLASS:
981 case OP_NCLASS:
982 cc += 33;
983
984 switch (*cc)
985 {
986 case OP_CRSTAR:
987 case OP_CRMINSTAR:
988 case OP_CRQUERY:
989 case OP_CRMINQUERY:
990 return -1;
991
992 case OP_CRRANGE:
993 case OP_CRMINRANGE:
994 if (GET2(cc,1) != GET2(cc,3)) return -1;
995 branchlength += GET2(cc,1);
996 cc += 5;
997 break;
998
999 default:
1000 branchlength++;
1001 }
1002 break;
1003
1004 /* Anything else is variable length */
1005
1006 default:
1007 return -1;
1008 }
1009 }
1010 /* Control never gets here */
1011 }
1012
1013
1014
1015
1016 /*************************************************
1017 * Scan compiled regex for numbered bracket *
1018 *************************************************/
1019
1020 /* This little function scans through a compiled pattern until it finds a
1021 capturing bracket with the given number.
1022
1023 Arguments:
1024 code points to start of expression
1025 utf8 TRUE in UTF-8 mode
1026 number the required bracket number
1027
1028 Returns: pointer to the opcode for the bracket, or NULL if not found
1029 */
1030
1031 static const uschar *
1032 find_bracket(const uschar *code, BOOL utf8, int number)
1033 {
1034 #ifndef SUPPORT_UTF8
1035 utf8 = utf8; /* Stop pedantic compilers complaining */
1036 #endif
1037
1038 for (;;)
1039 {
1040 register int c = *code;
1041 if (c == OP_END) return NULL;
1042 else if (c > OP_BRA)
1043 {
1044 int n = c - OP_BRA;
1045 if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1046 if (n == number) return (uschar *)code;
1047 code += _pcre_OP_lengths[OP_BRA];
1048 }
1049 else
1050 {
1051 code += _pcre_OP_lengths[c];
1052
1053 #ifdef SUPPORT_UTF8
1054
1055 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1056 by a multi-byte character. The length in the table is a minimum, so we have
1057 to scan along to skip the extra bytes. All opcodes are less than 128, so we
1058 can use relatively efficient code. */
1059
1060 if (utf8) switch(c)
1061 {
1062 case OP_CHAR:
1063 case OP_CHARNC:
1064 case OP_EXACT:
1065 case OP_UPTO:
1066 case OP_MINUPTO:
1067 case OP_STAR:
1068 case OP_MINSTAR:
1069 case OP_PLUS:
1070 case OP_MINPLUS:
1071 case OP_QUERY:
1072 case OP_MINQUERY:
1073 while ((*code & 0xc0) == 0x80) code++;
1074 break;
1075
1076 /* XCLASS is used for classes that cannot be represented just by a bit
1077 map. This includes negated single high-valued characters. The length in
1078 the table is zero; the actual length is stored in the compiled code. */
1079
1080 case OP_XCLASS:
1081 code += GET(code, 1) + 1;
1082 break;
1083 }
1084 #endif
1085 }
1086 }
1087 }
1088
1089
1090
1091 /*************************************************
1092 * Scan compiled regex for recursion reference *
1093 *************************************************/
1094
1095 /* This little function scans through a compiled pattern until it finds an
1096 instance of OP_RECURSE.
1097
1098 Arguments:
1099 code points to start of expression
1100 utf8 TRUE in UTF-8 mode
1101
1102 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1103 */
1104
1105 static const uschar *
1106 find_recurse(const uschar *code, BOOL utf8)
1107 {
1108 #ifndef SUPPORT_UTF8
1109 utf8 = utf8; /* Stop pedantic compilers complaining */
1110 #endif
1111
1112 for (;;)
1113 {
1114 register int c = *code;
1115 if (c == OP_END) return NULL;
1116 else if (c == OP_RECURSE) return code;
1117 else if (c > OP_BRA)
1118 {
1119 code += _pcre_OP_lengths[OP_BRA];
1120 }
1121 else
1122 {
1123 code += _pcre_OP_lengths[c];
1124
1125 #ifdef SUPPORT_UTF8
1126
1127 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1128 by a multi-byte character. The length in the table is a minimum, so we have
1129 to scan along to skip the extra bytes. All opcodes are less than 128, so we
1130 can use relatively efficient code. */
1131
1132 if (utf8) switch(c)
1133 {
1134 case OP_CHAR:
1135 case OP_CHARNC:
1136 case OP_EXACT:
1137 case OP_UPTO:
1138 case OP_MINUPTO:
1139 case OP_STAR:
1140 case OP_MINSTAR:
1141 case OP_PLUS:
1142 case OP_MINPLUS:
1143 case OP_QUERY:
1144 case OP_MINQUERY:
1145 while ((*code & 0xc0) == 0x80) code++;
1146 break;
1147
1148 /* XCLASS is used for classes that cannot be represented just by a bit
1149 map. This includes negated single high-valued characters. The length in
1150 the table is zero; the actual length is stored in the compiled code. */
1151
1152 case OP_XCLASS:
1153 code += GET(code, 1) + 1;
1154 break;
1155 }
1156 #endif
1157 }
1158 }
1159 }
1160
1161
1162
1163 /*************************************************
1164 * Scan compiled branch for non-emptiness *
1165 *************************************************/
1166
1167 /* This function scans through a branch of a compiled pattern to see whether it
1168 can match the empty string or not. It is called only from could_be_empty()
1169 below. Note that first_significant_code() skips over assertions. If we hit an
1170 unclosed bracket, we return "empty" - this means we've struck an inner bracket
1171 whose current branch will already have been scanned.
1172
1173 Arguments:
1174 code points to start of search
1175 endcode points to where to stop
1176 utf8 TRUE if in UTF8 mode
1177
1178 Returns: TRUE if what is matched could be empty
1179 */
1180
1181 static BOOL
1182 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1183 {
1184 register int c;
1185 for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);
1186 code < endcode;
1187 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1188 {
1189 const uschar *ccode;
1190
1191 c = *code;
1192
1193 if (c >= OP_BRA)
1194 {
1195 BOOL empty_branch;
1196 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1197
1198 /* Scan a closed bracket */
1199
1200 empty_branch = FALSE;
1201 do
1202 {
1203 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1204 empty_branch = TRUE;
1205 code += GET(code, 1);
1206 }
1207 while (*code == OP_ALT);
1208 if (!empty_branch) return FALSE; /* All branches are non-empty */
1209 code += 1 + LINK_SIZE;
1210 c = *code;
1211 }
1212
1213 else switch (c)
1214 {
1215 /* Check for quantifiers after a class */
1216
1217 #ifdef SUPPORT_UTF8
1218 case OP_XCLASS:
1219 ccode = code + GET(code, 1);
1220 goto CHECK_CLASS_REPEAT;
1221 #endif
1222
1223 case OP_CLASS:
1224 case OP_NCLASS:
1225 ccode = code + 33;
1226
1227 #ifdef SUPPORT_UTF8
1228 CHECK_CLASS_REPEAT:
1229 #endif
1230
1231 switch (*ccode)
1232 {
1233 case OP_CRSTAR: /* These could be empty; continue */
1234 case OP_CRMINSTAR:
1235 case OP_CRQUERY:
1236 case OP_CRMINQUERY:
1237 break;
1238
1239 default: /* Non-repeat => class must match */
1240 case OP_CRPLUS: /* These repeats aren't empty */
1241 case OP_CRMINPLUS:
1242 return FALSE;
1243
1244 case OP_CRRANGE:
1245 case OP_CRMINRANGE:
1246 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1247 break;
1248 }
1249 break;
1250
1251 /* Opcodes that must match a character */
1252
1253 case OP_PROP:
1254 case OP_NOTPROP:
1255 case OP_EXTUNI:
1256 case OP_NOT_DIGIT:
1257 case OP_DIGIT:
1258 case OP_NOT_WHITESPACE:
1259 case OP_WHITESPACE:
1260 case OP_NOT_WORDCHAR:
1261 case OP_WORDCHAR:
1262 case OP_ANY:
1263 case OP_ANYBYTE:
1264 case OP_CHAR:
1265 case OP_CHARNC:
1266 case OP_NOT:
1267 case OP_PLUS:
1268 case OP_MINPLUS:
1269 case OP_EXACT:
1270 case OP_NOTPLUS:
1271 case OP_NOTMINPLUS:
1272 case OP_NOTEXACT:
1273 case OP_TYPEPLUS:
1274 case OP_TYPEMINPLUS:
1275 case OP_TYPEEXACT:
1276 return FALSE;
1277
1278 /* End of branch */
1279
1280 case OP_KET:
1281 case OP_KETRMAX:
1282 case OP_KETRMIN:
1283 case OP_ALT:
1284 return TRUE;
1285
1286 /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be
1287 followed by a multibyte character */
1288
1289 #ifdef SUPPORT_UTF8
1290 case OP_STAR:
1291 case OP_MINSTAR:
1292 case OP_QUERY:
1293 case OP_MINQUERY:
1294 case OP_UPTO:
1295 case OP_MINUPTO:
1296 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1297 break;
1298 #endif
1299 }
1300 }
1301
1302 return TRUE;
1303 }
1304
1305
1306
1307 /*************************************************
1308 * Scan compiled regex for non-emptiness *
1309 *************************************************/
1310
1311 /* This function is called to check for left recursive calls. We want to check
1312 the current branch of the current pattern to see if it could match the empty
1313 string. If it could, we must look outwards for branches at other levels,
1314 stopping when we pass beyond the bracket which is the subject of the recursion.
1315
1316 Arguments:
1317 code points to start of the recursion
1318 endcode points to where to stop (current RECURSE item)
1319 bcptr points to the chain of current (unclosed) branch starts
1320 utf8 TRUE if in UTF-8 mode
1321
1322 Returns: TRUE if what is matched could be empty
1323 */
1324
1325 static BOOL
1326 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1327 BOOL utf8)
1328 {
1329 while (bcptr != NULL && bcptr->current >= code)
1330 {
1331 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1332 bcptr = bcptr->outer;
1333 }
1334 return TRUE;
1335 }
1336
1337
1338
1339 /*************************************************
1340 * Check for POSIX class syntax *
1341 *************************************************/
1342
1343 /* This function is called when the sequence "[:" or "[." or "[=" is
1344 encountered in a character class. It checks whether this is followed by an
1345 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1346 ".]" or "=]".
1347
1348 Argument:
1349 ptr pointer to the initial [
1350 endptr where to return the end pointer
1351 cd pointer to compile data
1352
1353 Returns: TRUE or FALSE
1354 */
1355
1356 static BOOL
1357 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1358 {
1359 int terminator; /* Don't combine these lines; the Solaris cc */
1360 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1361 if (*(++ptr) == '^') ptr++;
1362 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1363 if (*ptr == terminator && ptr[1] == ']')
1364 {
1365 *endptr = ptr;
1366 return TRUE;
1367 }
1368 return FALSE;
1369 }
1370
1371
1372
1373
1374 /*************************************************
1375 * Check POSIX class name *
1376 *************************************************/
1377
1378 /* This function is called to check the name given in a POSIX-style class entry
1379 such as [:alnum:].
1380
1381 Arguments:
1382 ptr points to the first letter
1383 len the length of the name
1384
1385 Returns: a value representing the name, or -1 if unknown
1386 */
1387
1388 static int
1389 check_posix_name(const uschar *ptr, int len)
1390 {
1391 register int yield = 0;
1392 while (posix_name_lengths[yield] != 0)
1393 {
1394 if (len == posix_name_lengths[yield] &&
1395 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1396 yield++;
1397 }
1398 return -1;
1399 }
1400
1401
1402 /*************************************************
1403 * Adjust OP_RECURSE items in repeated group *
1404 *************************************************/
1405
1406 /* OP_RECURSE items contain an offset from the start of the regex to the group
1407 that is referenced. This means that groups can be replicated for fixed
1408 repetition simply by copying (because the recursion is allowed to refer to
1409 earlier groups that are outside the current group). However, when a group is
1410 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1411 it, after it has been compiled. This means that any OP_RECURSE items within it
1412 that refer to the group itself or any contained groups have to have their
1413 offsets adjusted. That is the job of this function. Before it is called, the
1414 partially compiled regex must be temporarily terminated with OP_END.
1415
1416 Arguments:
1417 group points to the start of the group
1418 adjust the amount by which the group is to be moved
1419 utf8 TRUE in UTF-8 mode
1420 cd contains pointers to tables etc.
1421
1422 Returns: nothing
1423 */
1424
1425 static void
1426 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)
1427 {
1428 uschar *ptr = group;
1429 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1430 {
1431 int offset = GET(ptr, 1);
1432 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1433 ptr += 1 + LINK_SIZE;
1434 }
1435 }
1436
1437
1438
1439 /*************************************************
1440 * Insert an automatic callout point *
1441 *************************************************/
1442
1443 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1444 callout points before each pattern item.
1445
1446 Arguments:
1447 code current code pointer
1448 ptr current pattern pointer
1449 cd pointers to tables etc
1450
1451 Returns: new code pointer
1452 */
1453
1454 static uschar *
1455 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1456 {
1457 *code++ = OP_CALLOUT;
1458 *code++ = 255;
1459 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1460 PUT(code, LINK_SIZE, 0); /* Default length */
1461 return code + 2*LINK_SIZE;
1462 }
1463
1464
1465
1466 /*************************************************
1467 * Complete a callout item *
1468 *************************************************/
1469
1470 /* A callout item contains the length of the next item in the pattern, which
1471 we can't fill in till after we have reached the relevant point. This is used
1472 for both automatic and manual callouts.
1473
1474 Arguments:
1475 previous_callout points to previous callout item
1476 ptr current pattern pointer
1477 cd pointers to tables etc
1478
1479 Returns: nothing
1480 */
1481
1482 static void
1483 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1484 {
1485 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1486 PUT(previous_callout, 2 + LINK_SIZE, length);
1487 }
1488
1489
1490
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* This function is passed the start and end of a class range, in UTF-8 mode
with UCP support. It searches up the characters, looking for internal ranges of
characters in the "other" case. Each call returns the next one, updating the
start address. A returned range is a run of consecutive characters whose other
case values are themselves consecutive.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)
{
int c = *cptr;
int first = -1;
int expected;

/* Find the first character in c..d that has an other case */

while (c <= d && (first = _pcre_ucp_othercase(c)) < 0) c++;
if (c > d) return FALSE;

*ocptr = first;
expected = first + 1;

/* Extend for as long as the other case values stay consecutive */

c++;
while (c <= d && _pcre_ucp_othercase(c) == expected)
  {
  c++;
  expected++;
  }

*odptr = expected - 1;
*cptr = c;                  /* Where the next call resumes */

return TRUE;
}
#endif  /* SUPPORT_UCP */
1535
1536
1537 /*************************************************
1538 * Compile one branch *
1539 *************************************************/
1540
1541 /* Scan the pattern, compiling it into the code vector. If the options are
1542 changed during the branch, the pointer is used to change the external options
1543 bits.
1544
1545 Arguments:
1546 optionsptr pointer to the option bits
1547 brackets points to number of extracting brackets used
1548 codeptr points to the pointer to the current code point
1549 ptrptr points to the current pattern pointer
1550 errorcodeptr points to error code variable
1551 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
1552 reqbyteptr set to the last literal character required, else < 0
1553 bcptr points to current branch chain
1554 cd contains pointers to tables etc.
1555
1556 Returns: TRUE on success
1557 FALSE, with *errorcodeptr set non-zero on error
1558 */
1559
1560 static BOOL
1561 compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
1562 const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,
1563 int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
1564 {
1565 int repeat_type, op_type;
1566 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
1567 int bravalue = 0;
1568 int greedy_default, greedy_non_default;
1569 int firstbyte, reqbyte;
1570 int zeroreqbyte, zerofirstbyte;
1571 int req_caseopt, reqvary, tempreqvary;
1572 int condcount = 0;
1573 int options = *optionsptr;
1574 int after_manual_callout = 0;
1575 register int c;
1576 register uschar *code = *codeptr;
1577 uschar *tempcode;
1578 BOOL inescq = FALSE;
1579 BOOL groupsetfirstbyte = FALSE;
1580 const uschar *ptr = *ptrptr;
1581 const uschar *tempptr;
1582 uschar *previous = NULL;
1583 uschar *previous_callout = NULL;
1584 uschar classbits[32];
1585
1586 #ifdef SUPPORT_UTF8
1587 BOOL class_utf8;
1588 BOOL utf8 = (options & PCRE_UTF8) != 0;
1589 uschar *class_utf8data;
1590 uschar utf8_char[6];
1591 #else
1592 BOOL utf8 = FALSE;
1593 #endif
1594
1595 /* Set up the default and non-default settings for greediness */
1596
1597 greedy_default = ((options & PCRE_UNGREEDY) != 0);
1598 greedy_non_default = greedy_default ^ 1;
1599
1600 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
1601 matching encountered yet". It gets changed to REQ_NONE if we hit something that
1602 matches a non-fixed char first char; reqbyte just remains unset if we never
1603 find one.
1604
1605 When we hit a repeat whose minimum is zero, we may have to adjust these values
1606 to take the zero repeat into account. This is implemented by setting them to
1607 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
1608 item types that can be repeated set these backoff variables appropriately. */
1609
1610 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
1611
1612 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
1613 according to the current setting of the caseless flag. REQ_CASELESS is a bit
1614 value > 255. It is added into the firstbyte or reqbyte variables to record the
1615 case status of the value. This is used only for ASCII characters. */
1616
1617 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
1618
1619 /* Switch on next character until the end of the branch */
1620
1621 for (;; ptr++)
1622 {
1623 BOOL negate_class;
1624 BOOL possessive_quantifier;
1625 BOOL is_quantifier;
1626 int class_charcount;
1627 int class_lastchar;
1628 int newoptions;
1629 int recno;
1630 int skipbytes;
1631 int subreqbyte;
1632 int subfirstbyte;
1633 int mclength;
1634 uschar mcbuffer[8];
1635
1636 /* Next byte in the pattern */
1637
1638 c = *ptr;
1639
1640 /* If in \Q...\E, check for the end; if not, we have a literal */
1641
1642 if (inescq && c != 0)
1643 {
1644 if (c == '\\' && ptr[1] == 'E')
1645 {
1646 inescq = FALSE;
1647 ptr++;
1648 continue;
1649 }
1650 else
1651 {
1652 if (previous_callout != NULL)
1653 {
1654 complete_callout(previous_callout, ptr, cd);
1655 previous_callout = NULL;
1656 }
1657 if ((options & PCRE_AUTO_CALLOUT) != 0)
1658 {
1659 previous_callout = code;
1660 code = auto_callout(code, ptr, cd);
1661 }
1662 goto NORMAL_CHAR;
1663 }
1664 }
1665
1666 /* Fill in length of a previous callout, except when the next thing is
1667 a quantifier. */
1668
1669 is_quantifier = c == '*' || c == '+' || c == '?' ||
1670 (c == '{' && is_counted_repeat(ptr+1));
1671
1672 if (!is_quantifier && previous_callout != NULL &&
1673 after_manual_callout-- <= 0)
1674 {
1675 complete_callout(previous_callout, ptr, cd);
1676 previous_callout = NULL;
1677 }
1678
1679 /* In extended mode, skip white space and comments */
1680
1681 if ((options & PCRE_EXTENDED) != 0)
1682 {
1683 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1684 if (c == '#')
1685 {
1686 /* The space before the ; is to avoid a warning on a silly compiler
1687 on the Macintosh. */
1688 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
1689 if (c != 0) continue; /* Else fall through to handle end of string */
1690 }
1691 }
1692
1693 /* No auto callout for quantifiers. */
1694
1695 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
1696 {
1697 previous_callout = code;
1698 code = auto_callout(code, ptr, cd);
1699 }
1700
1701 switch(c)
1702 {
1703 /* The branch terminates at end of string, |, or ). */
1704
1705 case 0:
1706 case '|':
1707 case ')':
1708 *firstbyteptr = firstbyte;
1709 *reqbyteptr = reqbyte;
1710 *codeptr = code;
1711 *ptrptr = ptr;
1712 return TRUE;
1713
1714 /* Handle single-character metacharacters. In multiline mode, ^ disables
1715 the setting of any following char as a first character. */
1716
1717 case '^':
1718 if ((options & PCRE_MULTILINE) != 0)
1719 {
1720 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1721 }
1722 previous = NULL;
1723 *code++ = OP_CIRC;
1724 break;
1725
1726 case '$':
1727 previous = NULL;
1728 *code++ = OP_DOLL;
1729 break;
1730
1731 /* There can never be a first char if '.' is first, whatever happens about
1732 repeats. The value of reqbyte doesn't change either. */
1733
1734 case '.':
1735 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1736 zerofirstbyte = firstbyte;
1737 zeroreqbyte = reqbyte;
1738 previous = code;
1739 *code++ = OP_ANY;
1740 break;
1741
1742 /* Character classes. If the included characters are all < 256, we build a
1743 32-byte bitmap of the permitted characters, except in the special case
1744 where there is only one such character. For negated classes, we build the
1745 map as usual, then invert it at the end. However, we use a different opcode
1746 so that data characters > 255 can be handled correctly.
1747
1748 If the class contains characters outside the 0-255 range, a different
1749 opcode is compiled. It may optionally have a bit map for characters < 256,
1750 but those above are explicitly listed afterwards. A flag byte tells
1751 whether the bitmap is present, and whether this is a negated class or not.
1752 */
1753
1754 case '[':
1755 previous = code;
1756
1757 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
1758 they are encountered at the top level, so we'll do that too. */
1759
1760 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1761 check_posix_syntax(ptr, &tempptr, cd))
1762 {
1763 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
1764 goto FAILED;
1765 }
1766
1767 /* If the first character is '^', set the negation flag and skip it. */
1768
1769 if ((c = *(++ptr)) == '^')
1770 {
1771 negate_class = TRUE;
1772 c = *(++ptr);
1773 }
1774 else
1775 {
1776 negate_class = FALSE;
1777 }
1778
1779 /* Keep a count of chars with values < 256 so that we can optimize the case
1780 of just a single character (as long as it's < 256). For higher valued UTF-8
1781 characters, we don't yet do any optimization. */
1782
1783 class_charcount = 0;
1784 class_lastchar = -1;
1785
1786 #ifdef SUPPORT_UTF8
1787 class_utf8 = FALSE; /* No chars >= 256 */
1788 class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
1789 #endif
1790
1791 /* Initialize the 32-char bit map to all zeros. We have to build the
1792 map in a temporary bit of store, in case the class contains only 1
1793 character (< 256), because in that case the compiled code doesn't use the
1794 bit map. */
1795
1796 memset(classbits, 0, 32 * sizeof(uschar));
1797
1798 /* Process characters until ] is reached. By writing this as a "do" it
1799 means that an initial ] is taken as a data character. The first pass
1800 through the regex checked the overall syntax, so we don't need to be very
1801 strict here. At the start of the loop, c contains the first byte of the
1802 character. */
1803
1804 do
1805 {
1806 #ifdef SUPPORT_UTF8
1807 if (utf8 && c > 127)
1808 { /* Braces are required because the */
1809 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
1810 }
1811 #endif
1812
1813 /* Inside \Q...\E everything is literal except \E */
1814
1815 if (inescq)
1816 {
1817 if (c == '\\' && ptr[1] == 'E')
1818 {
1819 inescq = FALSE;
1820 ptr++;
1821 continue;
1822 }
1823 else goto LONE_SINGLE_CHARACTER;
1824 }
1825
1826 /* Handle POSIX class names. Perl allows a negation extension of the
1827 form [:^name:]. A square bracket that doesn't match the syntax is
1828 treated as a literal. We also recognize the POSIX constructions
1829 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1830 5.6 and 5.8 do. */
1831
1832 if (c == '[' &&
1833 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1834 check_posix_syntax(ptr, &tempptr, cd))
1835 {
1836 BOOL local_negate = FALSE;
1837 int posix_class, taboffset, tabopt;
1838 register const uschar *cbits = cd->cbits;
1839 uschar pbits[32];
1840
1841 if (ptr[1] != ':')
1842 {
1843 *errorcodeptr = ERR31;
1844 goto FAILED;
1845 }
1846
1847 ptr += 2;
1848 if (*ptr == '^')
1849 {
1850 local_negate = TRUE;
1851 ptr++;
1852 }
1853
1854 posix_class = check_posix_name(ptr, tempptr - ptr);
1855 if (posix_class < 0)
1856 {
1857 *errorcodeptr = ERR30;
1858 goto FAILED;
1859 }
1860
1861 /* If matching is caseless, upper and lower are converted to
1862 alpha. This relies on the fact that the class table starts with
1863 alpha, lower, upper as the first 3 entries. */
1864
1865 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1866 posix_class = 0;
1867
1868 /* We build the bit map for the POSIX class in a chunk of local store
1869 because we may be adding and subtracting from it, and we don't want to
1870 subtract bits that may be in the main map already. At the end we or the
1871 result into the bit map that is being built. */
1872
1873 posix_class *= 3;
1874
1875 /* Copy in the first table (always present) */
1876
1877 memcpy(pbits, cbits + posix_class_maps[posix_class],
1878 32 * sizeof(uschar));
1879
1880 /* If there is a second table, add or remove it as required. */
1881
1882 taboffset = posix_class_maps[posix_class + 1];
1883 tabopt = posix_class_maps[posix_class + 2];
1884
1885 if (taboffset >= 0)
1886 {
1887 if (tabopt >= 0)
1888 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
1889 else
1890 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
1891 }
1892
1893 /* Now see if we need to remove any special characters. An option
1894 value of 1 removes vertical space and 2 removes underscore. */
1895
1896 if (tabopt < 0) tabopt = -tabopt;
1897 if (tabopt == 1) pbits[1] &= ~0x3c;
1898 else if (tabopt == 2) pbits[11] &= 0x7f;
1899
1900 /* Add the POSIX table or its complement into the main table that is
1901 being built and we are done. */
1902
1903 if (local_negate)
1904 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
1905 else
1906 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
1907
1908 ptr = tempptr + 1;
1909 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
1910 continue; /* End of POSIX syntax handling */
1911 }
1912
1913 /* Backslash may introduce a single character, or it may introduce one
1914 of the specials, which just set a flag. Escaped items are checked for
1915 validity in the pre-compiling pass. The sequence \b is a special case.
1916 Inside a class (and only there) it is treated as backspace. Elsewhere
1917 it marks a word boundary. Other escapes have preset maps ready to
1918 or into the one we are building. We assume they have more than one
1919 character in them, so set class_charcount bigger than one. */
1920
1921 if (c == '\\')
1922 {
1923 c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);
1924
1925 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
1926 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
1927 else if (-c == ESC_Q) /* Handle start of quoted string */
1928 {
1929 if (ptr[1] == '\\' && ptr[2] == 'E')
1930 {
1931 ptr += 2; /* avoid empty string */
1932 }
1933 else inescq = TRUE;
1934 continue;
1935 }
1936
1937 if (c < 0)
1938 {
1939 register const uschar *cbits = cd->cbits;
1940 class_charcount += 2; /* Greater than 1 is what matters */
1941 switch (-c)
1942 {
1943 case ESC_d:
1944 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
1945 continue;
1946
1947 case ESC_D:
1948 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
1949 continue;
1950
1951 case ESC_w:
1952 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
1953 continue;
1954
1955 case ESC_W:
1956 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
1957 continue;
1958
1959 case ESC_s:
1960 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
1961 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
1962 continue;
1963
1964 case ESC_S:
1965 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
1966 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
1967 continue;
1968
1969 #ifdef SUPPORT_UCP
1970 case ESC_p:
1971 case ESC_P:
1972 {
1973 BOOL negated;
1974 int pdata;
1975 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
1976 if (ptype < 0) goto FAILED;
1977 class_utf8 = TRUE;
1978 *class_utf8data++ = ((-c == ESC_p) != negated)?
1979 XCL_PROP : XCL_NOTPROP;
1980 *class_utf8data++ = ptype;
1981 *class_utf8data++ = pdata;
1982 class_charcount -= 2; /* Not a < 256 character */
1983 }
1984 continue;
1985 #endif
1986
1987 /* Unrecognized escapes are faulted if PCRE is running in its
1988 strict mode. By default, for compatibility with Perl, they are
1989 treated as literals. */
1990
1991 default:
1992 if ((options & PCRE_EXTRA) != 0)
1993 {
1994 *errorcodeptr = ERR7;
1995 goto FAILED;
1996 }
1997 c = *ptr; /* The final character */
1998 class_charcount -= 2; /* Undo the default count from above */
1999 }
2000 }
2001
2002 /* Fall through if we have a single character (c >= 0). This may be
2003 > 256 in UTF-8 mode. */
2004
2005 } /* End of backslash handling */
2006
2007 /* A single character may be followed by '-' to form a range. However,
2008 Perl does not permit ']' to be the end of the range. A '-' character
2009 here is treated as a literal. */
2010
2011 if (ptr[1] == '-' && ptr[2] != ']')
2012 {
2013 int d;
2014 ptr += 2;
2015
2016 #ifdef SUPPORT_UTF8
2017 if (utf8)
2018 { /* Braces are required because the */
2019 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2020 }
2021 else
2022 #endif
2023 d = *ptr; /* Not UTF-8 mode */
2024
2025 /* The second part of a range can be a single-character escape, but
2026 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2027 in such circumstances. */
2028
2029 if (d == '\\')
2030 {
2031 const uschar *oldptr = ptr;
2032 d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);
2033
2034 /* \b is backspace; \X is literal X; any other special means the '-'
2035 was literal */
2036
2037 if (d < 0)
2038 {
2039 if (d == -ESC_b) d = '\b';
2040 else if (d == -ESC_X) d = 'X'; else
2041 {
2042 ptr = oldptr - 2;
2043 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2044 }
2045 }
2046 }
2047
2048 /* The check that the two values are in the correct order happens in
2049 the pre-pass. Optimize one-character ranges */
2050
2051 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2052
2053 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2054 matching, we have to use an XCLASS with extra data items. Caseless
2055 matching for characters > 127 is available only if UCP support is
2056 available. */
2057
2058 #ifdef SUPPORT_UTF8
2059 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2060 {
2061 class_utf8 = TRUE;
2062
2063 /* With UCP support, we can find the other case equivalents of
2064 the relevant characters. There may be several ranges. Optimize how
2065 they fit with the basic range. */
2066
2067 #ifdef SUPPORT_UCP
2068 if ((options & PCRE_CASELESS) != 0)
2069 {
2070 int occ, ocd;
2071 int cc = c;
2072 int origd = d;
2073 while (get_othercase_range(&cc, origd, &occ, &ocd))
2074 {
2075 if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
2076
2077 if (occ < c && ocd >= c - 1) /* Extend the basic range */
2078 { /* if there is overlap, */
2079 c = occ; /* noting that if occ < c */
2080 continue; /* we can't have ocd > d */
2081 } /* because a subrange is */
2082 if (ocd > d && occ <= d + 1) /* always shorter than */
2083 { /* the basic range. */
2084 d = ocd;
2085 continue;
2086 }
2087
2088 if (occ == ocd)
2089 {
2090 *class_utf8data++ = XCL_SINGLE;
2091 }
2092 else
2093 {
2094 *class_utf8data++ = XCL_RANGE;
2095 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2096 }
2097 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2098 }
2099 }
2100 #endif /* SUPPORT_UCP */
2101
2102 /* Now record the original range, possibly modified for UCP caseless
2103 overlapping ranges. */
2104
2105 *class_utf8data++ = XCL_RANGE;
2106 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2107 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2108
2109 /* With UCP support, we are done. Without UCP support, there is no
2110 caseless matching for UTF-8 characters > 127; we can use the bit map
2111 for the smaller ones. */
2112
2113 #ifdef SUPPORT_UCP
2114 continue; /* With next character in the class */
2115 #else
2116 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2117
2118 /* Adjust upper limit and fall through to set up the map */
2119
2120 d = 127;
2121
2122 #endif /* SUPPORT_UCP */
2123 }
2124 #endif /* SUPPORT_UTF8 */
2125
2126 /* We use the bit map for all cases when not in UTF-8 mode; else
2127 ranges that lie entirely within 0-127 when there is UCP support; else
2128 for partial ranges without UCP support. */
2129
2130 for (; c <= d; c++)
2131 {
2132 classbits[c/8] |= (1 << (c&7));
2133 if ((options & PCRE_CASELESS) != 0)
2134 {
2135 int uc = cd->fcc[c]; /* flip case */
2136 classbits[uc/8] |= (1 << (uc&7));
2137 }
2138 class_charcount++; /* in case a one-char range */
2139 class_lastchar = c;
2140 }
2141
2142 continue; /* Go get the next char in the class */
2143 }
2144
2145 /* Handle a lone single character - we can get here for a normal
2146 non-escape char, or after \ that introduces a single character or for an
2147 apparent range that isn't. */
2148
2149 LONE_SINGLE_CHARACTER:
2150
2151 /* Handle a character that cannot go in the bit map */
2152
2153 #ifdef SUPPORT_UTF8
2154 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2155 {
2156 class_utf8 = TRUE;
2157 *class_utf8data++ = XCL_SINGLE;
2158 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2159
2160 #ifdef SUPPORT_UCP
2161 if ((options & PCRE_CASELESS) != 0)
2162 {
2163 int othercase;
2164 if ((othercase = _pcre_ucp_othercase(c)) >= 0)
2165 {
2166 *class_utf8data++ = XCL_SINGLE;
2167 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
2168 }
2169 }
2170 #endif /* SUPPORT_UCP */
2171
2172 }
2173 else
2174 #endif /* SUPPORT_UTF8 */
2175
2176 /* Handle a single-byte character */
2177 {
2178 classbits[c/8] |= (1 << (c&7));
2179 if ((options & PCRE_CASELESS) != 0)
2180 {
2181 c = cd->fcc[c]; /* flip case */
2182 classbits[c/8] |= (1 << (c&7));
2183 }
2184 class_charcount++;
2185 class_lastchar = c;
2186 }
2187 }
2188
2189 /* Loop until ']' reached; the check for end of string happens inside the
2190 loop. This "while" is the end of the "do" above. */
2191
2192 while ((c = *(++ptr)) != ']' || inescq);
2193
2194 /* If class_charcount is 1, we saw precisely one character whose value is
2195 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2196 can optimize the negative case only if there were no characters >= 128
2197 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2198 single-bytes only. This is an historical hangover. Maybe one day we can
2199 tidy these opcodes to handle multi-byte characters.
2200
2201 The optimization throws away the bit map. We turn the item into a
2202 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2203 that OP_NOT does not support multibyte characters. In the positive case, it
2204 can cause firstbyte to be set. Otherwise, there can be no first char if
2205 this item is first, whatever repeat count may follow. In the case of
2206 reqbyte, save the previous value for reinstating. */
2207
2208 #ifdef SUPPORT_UTF8
2209 if (class_charcount == 1 &&
2210 (!utf8 ||
2211 (!class_utf8 && (!negate_class || class_lastchar < 128))))
2212
2213 #else
2214 if (class_charcount == 1)
2215 #endif
2216 {
2217 zeroreqbyte = reqbyte;
2218
2219 /* The OP_NOT opcode works on one-byte characters only. */
2220
2221 if (negate_class)
2222 {
2223 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2224 zerofirstbyte = firstbyte;
2225 *code++ = OP_NOT;
2226 *code++ = class_lastchar;
2227 break;
2228 }
2229
2230 /* For a single, positive character, get the value into mcbuffer, and
2231 then we can handle this with the normal one-character code. */
2232
2233 #ifdef SUPPORT_UTF8
2234 if (utf8 && class_lastchar > 127)
2235 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
2236 else
2237 #endif
2238 {
2239 mcbuffer[0] = class_lastchar;
2240 mclength = 1;
2241 }
2242 goto ONE_CHAR;
2243 } /* End of 1-char optimization */
2244
2245 /* The general case - not the one-char optimization. If this is the first
2246 thing in the branch, there can be no first char setting, whatever the
2247 repeat count. Any reqbyte setting must remain unchanged after any kind of
2248 repeat. */
2249
2250 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2251 zerofirstbyte = firstbyte;
2252 zeroreqbyte = reqbyte;
2253
2254 /* If there are characters with values > 255, we have to compile an
2255 extended class, with its own opcode. If there are no characters < 256,
2256 we can omit the bitmap. */
2257
2258 #ifdef SUPPORT_UTF8
2259 if (class_utf8)
2260 {
2261 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2262 *code++ = OP_XCLASS;
2263 code += LINK_SIZE;
2264 *code = negate_class? XCL_NOT : 0;
2265
2266 /* If the map is required, install it, and move on to the end of
2267 the extra data */
2268
2269 if (class_charcount > 0)
2270 {
2271 *code++ |= XCL_MAP;
2272 memcpy(code, classbits, 32);
2273 code = class_utf8data;
2274 }
2275
2276 /* If the map is not required, slide down the extra data. */
2277
2278 else
2279 {
2280 int len = class_utf8data - (code + 33);
2281 memmove(code + 1, code + 33, len);
2282 code += len + 1;
2283 }
2284
2285 /* Now fill in the complete length of the item */
2286
2287 PUT(previous, 1, code - previous);
2288 break; /* End of class handling */
2289 }
2290 #endif
2291
2292 /* If there are no characters > 255, negate the 32-byte map if necessary,
2293 and copy it into the code vector. If this is the first thing in the branch,
2294 there can be no first char setting, whatever the repeat count. Any reqbyte
2295 setting must remain unchanged after any kind of repeat. */
2296
2297 if (negate_class)
2298 {
2299 *code++ = OP_NCLASS;
2300 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2301 }
2302 else
2303 {
2304 *code++ = OP_CLASS;
2305 memcpy(code, classbits, 32);
2306 }
2307 code += 32;
2308 break;
2309
2310 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2311 has been tested above. */
2312
2313 case '{':
2314 if (!is_quantifier) goto NORMAL_CHAR;
2315 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
2316 if (*errorcodeptr != 0) goto FAILED;
2317 goto REPEAT;
2318
2319 case '*':
2320 repeat_min = 0;
2321 repeat_max = -1;
2322 goto REPEAT;
2323
2324 case '+':
2325 repeat_min = 1;
2326 repeat_max = -1;
2327 goto REPEAT;
2328
2329 case '?':
2330 repeat_min = 0;
2331 repeat_max = 1;
2332
2333 REPEAT:
2334 if (previous == NULL)
2335 {
2336 *errorcodeptr = ERR9;
2337 goto FAILED;
2338 }
2339
2340 if (repeat_min == 0)
2341 {
2342 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2343 reqbyte = zeroreqbyte; /* Ditto */
2344 }
2345
2346 /* Remember whether this is a variable length repeat */
2347
2348 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2349
2350 op_type = 0; /* Default single-char op codes */
2351 possessive_quantifier = FALSE; /* Default not possessive quantifier */
2352
2353 /* Save start of previous item, in case we have to move it up to make space
2354 for an inserted OP_ONCE for the additional '+' extension. */
2355
2356 tempcode = previous;
2357
2358 /* If the next character is '+', we have a possessive quantifier. This
2359 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2360 If the next character is '?' this is a minimizing repeat, by default,
2361 but if PCRE_UNGREEDY is set, it works the other way round. We change the
2362 repeat type to the non-default. */
2363
2364 if (ptr[1] == '+')
2365 {
2366 repeat_type = 0; /* Force greedy */
2367 possessive_quantifier = TRUE;
2368 ptr++;
2369 }
2370 else if (ptr[1] == '?')
2371 {
2372 repeat_type = greedy_non_default;
2373 ptr++;
2374 }
2375 else repeat_type = greedy_default;
2376
2377 /* If previous was a recursion, we need to wrap it inside brackets so that
2378 it can be replicated if necessary. */
2379
2380 if (*previous == OP_RECURSE)
2381 {
2382 memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2383 code += 1 + LINK_SIZE;
2384 *previous = OP_BRA;
2385 PUT(previous, 1, code - previous);
2386 *code = OP_KET;
2387 PUT(code, 1, code - previous);
2388 code += 1 + LINK_SIZE;
2389 }
2390
2391 /* If previous was a character match, abolish the item and generate a
2392 repeat item instead. If a char item has a minimum of more than one, ensure
2393 that it is set in reqbyte - it might not be if a sequence such as x{3} is
2394 the first thing in a branch because the x will have gone into firstbyte
2395 instead. */
2396
2397 if (*previous == OP_CHAR || *previous == OP_CHARNC)
2398 {
2399 /* Deal with UTF-8 characters that take up more than one byte. It's
2400 easier to write this out separately than try to macrify it. Use c to
2401 hold the length of the character in bytes, plus 0x80 to flag that it's a
2402 length rather than a small character. */
2403
2404 #ifdef SUPPORT_UTF8
2405 if (utf8 && (code[-1] & 0x80) != 0)
2406 {
2407 uschar *lastchar = code - 1;
2408 while((*lastchar & 0xc0) == 0x80) lastchar--;
2409 c = code - lastchar; /* Length of UTF-8 character */
2410 memcpy(utf8_char, lastchar, c); /* Save the char */
2411 c |= 0x80; /* Flag c as a length */
2412 }
2413 else
2414 #endif
2415
2416 /* Handle the case of a single byte - either with no UTF8 support, or
2417 with UTF-8 disabled, or for a UTF-8 character < 128. */
2418
2419 {
2420 c = code[-1];
2421 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2422 }
2423
2424 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
2425 }
2426
2427 /* If previous was a single negated character ([^a] or similar), we use
2428 one of the special opcodes, replacing it. The code is shared with single-
2429 character repeats by setting op_type to add a suitable offset into
2430 repeat_type. OP_NOT is currently used only for single-byte chars. */
2431
2432 else if (*previous == OP_NOT)
2433 {
2434 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
2435 c = previous[1];
2436 goto OUTPUT_SINGLE_REPEAT;
2437 }
2438
2439 /* If previous was a character type match (\d or similar), abolish it and
2440 create a suitable repeat item. The code is shared with single-character
2441 repeats by setting op_type to add a suitable offset into repeat_type. Note
2442 that the Unicode property types will be present only when SUPPORT_UCP is
2443 defined, but we don't wrap the little bits of code here because it just
2444 makes it horribly messy. */
2445
2446 else if (*previous < OP_EODN)
2447 {
2448 uschar *oldcode;
2449 int prop_type, prop_value;
2450 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
2451 c = *previous;
2452
2453 OUTPUT_SINGLE_REPEAT:
2454 if (*previous == OP_PROP || *previous == OP_NOTPROP)
2455 {
2456 prop_type = previous[1];
2457 prop_value = previous[2];
2458 }
2459 else prop_type = prop_value = -1;
2460
2461 oldcode = code;
2462 code = previous; /* Usually overwrite previous item */
2463
2464 /* If the maximum is zero then the minimum must also be zero; Perl allows
2465 this case, so we do too - by simply omitting the item altogether. */
2466
2467 if (repeat_max == 0) goto END_REPEAT;
2468
2469 /* All real repeats make it impossible to handle partial matching (maybe
2470 one day we will be able to remove this restriction). */
2471
2472 if (repeat_max != 1) cd->nopartial = TRUE;
2473
2474 /* Combine the op_type with the repeat_type */
2475
2476 repeat_type += op_type;
2477
2478 /* A minimum of zero is handled either as the special case * or ?, or as
2479 an UPTO, with the maximum given. */
2480
2481 if (repeat_min == 0)
2482 {
2483 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
2484 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
2485 else
2486 {
2487 *code++ = OP_UPTO + repeat_type;
2488 PUT2INC(code, 0, repeat_max);
2489 }
2490 }
2491
2492 /* A repeat minimum of 1 is optimized into some special cases. If the
2493 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
2494 left in place and, if the maximum is greater than 1, we use OP_UPTO with
2495 one less than the maximum. */
2496
2497 else if (repeat_min == 1)
2498 {
2499 if (repeat_max == -1)
2500 *code++ = OP_PLUS + repeat_type;
2501 else
2502 {
2503 code = oldcode; /* leave previous item in place */
2504 if (repeat_max == 1) goto END_REPEAT;
2505 *code++ = OP_UPTO + repeat_type;
2506 PUT2INC(code, 0, repeat_max - 1);
2507 }
2508 }
2509
2510 /* The case {n,n} is just an EXACT, while the general case {n,m} is
2511 handled as an EXACT followed by an UPTO. */
2512
2513 else
2514 {
2515 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
2516 PUT2INC(code, 0, repeat_min);
2517
2518 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
2519 we have to insert the character for the previous code. For a repeated
2520 Unicode property match, there are two extra bytes that define the
2521 required property. In UTF-8 mode, long characters have their length in
2522 c, with the 0x80 bit as a flag. */
2523
2524 if (repeat_max < 0)
2525 {
2526 #ifdef SUPPORT_UTF8
2527 if (utf8 && c >= 128)
2528 {
2529 memcpy(code, utf8_char, c & 7);
2530 code += c & 7;
2531 }
2532 else
2533 #endif
2534 {
2535 *code++ = c;
2536 if (prop_type >= 0)
2537 {
2538 *code++ = prop_type;
2539 *code++ = prop_value;
2540 }
2541 }
2542 *code++ = OP_STAR + repeat_type;
2543 }
2544
2545 /* Else insert an UPTO if the max is greater than the min, again
2546 preceded by the character, for the previously inserted code. */
2547
2548 else if (repeat_max != repeat_min)
2549 {
2550 #ifdef SUPPORT_UTF8
2551 if (utf8 && c >= 128)
2552 {
2553 memcpy(code, utf8_char, c & 7);
2554 code += c & 7;
2555 }
2556 else
2557 #endif
2558 *code++ = c;
2559 if (prop_type >= 0)
2560 {
2561 *code++ = prop_type;
2562 *code++ = prop_value;
2563 }
2564 repeat_max -= repeat_min;
2565 *code++ = OP_UPTO + repeat_type;
2566 PUT2INC(code, 0, repeat_max);
2567 }
2568 }
2569
2570 /* The character or character type itself comes last in all cases. */
2571
2572 #ifdef SUPPORT_UTF8
2573 if (utf8 && c >= 128)
2574 {
2575 memcpy(code, utf8_char, c & 7);
2576 code += c & 7;
2577 }
2578 else
2579 #endif
2580 *code++ = c;
2581
2582 /* For a repeated Unicode property match, there are two extra bytes that
2583 define the required property. */
2584
2585 #ifdef SUPPORT_UCP
2586 if (prop_type >= 0)
2587 {
2588 *code++ = prop_type;
2589 *code++ = prop_value;
2590 }
2591 #endif
2592 }
2593
2594 /* If previous was a character class or a back reference, we put the repeat
2595 stuff after it, but just skip the item if the repeat was {0,0}. */
2596
2597 else if (*previous == OP_CLASS ||
2598 *previous == OP_NCLASS ||
2599 #ifdef SUPPORT_UTF8
2600 *previous == OP_XCLASS ||
2601 #endif
2602 *previous == OP_REF)
2603 {
2604 if (repeat_max == 0)
2605 {
2606 code = previous;
2607 goto END_REPEAT;
2608 }
2609
2610 /* All real repeats make it impossible to handle partial matching (maybe
2611 one day we will be able to remove this restriction). */
2612
2613 if (repeat_max != 1) cd->nopartial = TRUE;
2614
2615 if (repeat_min == 0 && repeat_max == -1)
2616 *code++ = OP_CRSTAR + repeat_type;
2617 else if (repeat_min == 1 && repeat_max == -1)
2618 *code++ = OP_CRPLUS + repeat_type;
2619 else if (repeat_min == 0 && repeat_max == 1)
2620 *code++ = OP_CRQUERY + repeat_type;
2621 else
2622 {
2623 *code++ = OP_CRRANGE + repeat_type;
2624 PUT2INC(code, 0, repeat_min);
2625 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
2626 PUT2INC(code, 0, repeat_max);
2627 }
2628 }
2629
2630 /* If previous was a bracket group, we may have to replicate it in certain
2631 cases. */
2632
2633 else if (*previous >= OP_BRA || *previous == OP_ONCE ||
2634 *previous == OP_COND)
2635 {
2636 register int i;
2637 int ketoffset = 0;
2638 int len = code - previous;
2639 uschar *bralink = NULL;
2640
2641 /* If the maximum repeat count is unlimited, find the end of the bracket
2642 by scanning through from the start, and compute the offset back to it
2643 from the current code pointer. There may be an OP_OPT setting following
2644 the final KET, so we can't find the end just by going back from the code
2645 pointer. */
2646
2647 if (repeat_max == -1)
2648 {
2649 register uschar *ket = previous;
2650 do ket += GET(ket, 1); while (*ket != OP_KET);
2651 ketoffset = code - ket;
2652 }
2653
2654 /* The case of a zero minimum is special because of the need to stick
2655 OP_BRAZERO in front of it, and because the group appears once in the
2656 data, whereas in other cases it appears the minimum number of times. For
2657 this reason, it is simplest to treat this case separately, as otherwise
2658 the code gets far too messy. There are several special subcases when the
2659 minimum is zero. */
2660
2661 if (repeat_min == 0)
2662 {
2663 /* If the maximum is also zero, we just omit the group from the output
2664 altogether. */
2665
2666 if (repeat_max == 0)
2667 {
2668 code = previous;
2669 goto END_REPEAT;
2670 }
2671
2672 /* If the maximum is 1 or unlimited, we just have to stick in the
2673 BRAZERO and do no more at this point. However, we do need to adjust
2674 any OP_RECURSE calls inside the group that refer to the group itself or
2675 any internal group, because the offset is from the start of the whole
2676 regex. Temporarily terminate the pattern while doing this. */
2677
2678 if (repeat_max <= 1)
2679 {
2680 *code = OP_END;
2681 adjust_recurse(previous, 1, utf8, cd);
2682 memmove(previous+1, previous, len);
2683 code++;
2684 *previous++ = OP_BRAZERO + repeat_type;
2685 }
2686
2687 /* If the maximum is greater than 1 and limited, we have to replicate
2688 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
2689 The first one has to be handled carefully because it's the original
2690 copy, which has to be moved up. The remainder can be handled by code
2691 that is common with the non-zero minimum case below. We have to
2692 adjust the value of repeat_max, since one less copy is required. Once
2693 again, we may have to adjust any OP_RECURSE calls inside the group. */
2694
2695 else
2696 {
2697 int offset;
2698 *code = OP_END;
2699 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);
2700 memmove(previous + 2 + LINK_SIZE, previous, len);
2701 code += 2 + LINK_SIZE;
2702 *previous++ = OP_BRAZERO + repeat_type;
2703 *previous++ = OP_BRA;
2704
2705 /* We chain together the bracket offset fields that have to be
2706 filled in later when the ends of the brackets are reached. */
2707
2708 offset = (bralink == NULL)? 0 : previous - bralink;
2709 bralink = previous;
2710 PUTINC(previous, 0, offset);
2711 }
2712
2713 repeat_max--;
2714 }
2715
2716 /* If the minimum is greater than zero, replicate the group as many
2717 times as necessary, and adjust the maximum to the number of subsequent
2718 copies that we need. If we set a first char from the group, and didn't
2719 set a required char, copy the latter from the former. */
2720
2721 else
2722 {
2723 if (repeat_min > 1)
2724 {
2725 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
2726 for (i = 1; i < repeat_min; i++)
2727 {
2728 memcpy(code, previous, len);
2729 code += len;
2730 }
2731 }
2732 if (repeat_max > 0) repeat_max -= repeat_min;
2733 }
2734
2735 /* This code is common to both the zero and non-zero minimum cases. If
2736 the maximum is limited, it replicates the group in a nested fashion,
2737 remembering the bracket starts on a stack. In the case of a zero minimum,
2738 the first one was set up above. In all cases the repeat_max now specifies
2739 the number of additional copies needed. */
2740
2741 if (repeat_max >= 0)
2742 {
2743 for (i = repeat_max - 1; i >= 0; i--)
2744 {
2745 *code++ = OP_BRAZERO + repeat_type;
2746
2747 /* All but the final copy start a new nesting, maintaining the
2748 chain of brackets outstanding. */
2749
2750 if (i != 0)
2751 {
2752 int offset;
2753 *code++ = OP_BRA;
2754 offset = (bralink == NULL)? 0 : code - bralink;
2755 bralink = code;
2756 PUTINC(code, 0, offset);
2757 }
2758
2759 memcpy(code, previous, len);
2760 code += len;
2761 }
2762
2763 /* Now chain through the pending brackets, and fill in their length
2764 fields (which are holding the chain links pro tem). */
2765
2766 while (bralink != NULL)
2767 {
2768 int oldlinkoffset;
2769 int offset = code - bralink + 1;
2770 uschar *bra = code - offset;
2771 oldlinkoffset = GET(bra, 1);
2772 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
2773 *code++ = OP_KET;
2774 PUTINC(code, 0, offset);
2775 PUT(bra, 1, offset);
2776 }
2777 }
2778
2779 /* If the maximum is unlimited, set a repeater in the final copy. We
2780 can't just offset backwards from the current code point, because we
2781 don't know if there's been an options resetting after the ket. The
2782 correct offset was computed above. */
2783
2784 else code[-ketoffset] = OP_KETRMAX + repeat_type;
2785 }
2786
2787 /* Else there's some kind of shambles */
2788
2789 else
2790 {
2791 *errorcodeptr = ERR11;
2792 goto FAILED;
2793 }
2794
2795 /* If the character following a repeat is '+', we wrap the entire repeated
2796 item inside OP_ONCE brackets. This is just syntactic sugar, taken from
2797 Sun's Java package. The repeated item starts at tempcode, not at previous,
2798 which might be the first part of a string whose (former) last char we
2799 repeated. However, we don't support '+' after a greediness '?'. */
2800
2801 if (possessive_quantifier)
2802 {
2803 int len = code - tempcode;
2804 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
2805 code += 1 + LINK_SIZE;
2806 len += 1 + LINK_SIZE;
2807 tempcode[0] = OP_ONCE;
2808 *code++ = OP_KET;
2809 PUTINC(code, 0, len);
2810 PUT(tempcode, 1, len);
2811 }
2812
2813 /* In all cases we no longer have a previous item. We also set the
2814 "follows varying string" flag for subsequently encountered reqbytes if
2815 it isn't already set and we have just passed a varying length item. */
2816
2817 END_REPEAT:
2818 previous = NULL;
2819 cd->req_varyopt |= reqvary;
2820 break;
2821
2822
2823 /* Start of nested bracket sub-expression, or comment or lookahead or
2824 lookbehind or option setting or condition. First deal with special things
2825 that can come after a bracket; all are introduced by ?, and the appearance
2826 of any of them means that this is not a referencing group. They were
2827 checked for validity in the first pass over the string, so we don't have to
2828 check for syntax errors here. */
2829
2830 case '(':
2831 newoptions = options;
2832 skipbytes = 0;
2833
2834 if (*(++ptr) == '?')
2835 {
2836 int set, unset;
2837 int *optset;
2838
2839 switch (*(++ptr))
2840 {
2841 case '#': /* Comment; skip to ket */
2842 ptr++;
2843 while (*ptr != ')') ptr++;
2844 continue;
2845
2846 case ':': /* Non-extracting bracket */
2847 bravalue = OP_BRA;
2848 ptr++;
2849 break;
2850
2851 case '(':
2852 bravalue = OP_COND; /* Conditional group */
2853
2854 /* Condition to test for recursion */
2855
2856 if (ptr[1] == 'R')
2857 {
2858 code[1+LINK_SIZE] = OP_CREF;
2859 PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
2860 skipbytes = 3;
2861 ptr += 3;
2862 }
2863
2864 /* Condition to test for a numbered subpattern match. We know that
2865 if a digit follows ( then there will just be digits until ) because
2866 the syntax was checked in the first pass. */
2867
2868 else if ((digitab[ptr[1]] && ctype_digit) != 0)
2869 {
2870 int condref; /* Don't amalgamate; some compilers */
2871 condref = *(++ptr) - '0'; /* grumble at autoincrement in declaration */
2872 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
2873 if (condref == 0)
2874 {
2875 *errorcodeptr = ERR35;
2876 goto FAILED;
2877 }
2878 ptr++;
2879 code[1+LINK_SIZE] = OP_CREF;
2880 PUT2(code, 2+LINK_SIZE, condref);
2881 skipbytes = 3;
2882 }
2883 /* For conditions that are assertions, we just fall through, having
2884 set bravalue above. */
2885 break;
2886
2887 case '=': /* Positive lookahead */
2888 bravalue = OP_ASSERT;
2889 ptr++;
2890 break;
2891
2892 case '!': /* Negative lookahead */
2893 bravalue = OP_ASSERT_NOT;
2894 ptr++;
2895 break;
2896
2897 case '<': /* Lookbehinds */
2898 switch (*(++ptr))
2899 {
2900 case '=': /* Positive lookbehind */
2901 bravalue = OP_ASSERTBACK;
2902 ptr++;
2903 break;
2904
2905 case '!': /* Negative lookbehind */
2906 bravalue = OP_ASSERTBACK_NOT;
2907 ptr++;
2908 break;
2909 }
2910 break;
2911
2912 case '>': /* One-time brackets */
2913 bravalue = OP_ONCE;
2914 ptr++;
2915 break;
2916
2917 case 'C': /* Callout - may be followed by digits; */
2918 previous_callout = code; /* Save for later completion */
2919 after_manual_callout = 1; /* Skip one item before completing */
2920 *code++ = OP_CALLOUT; /* Already checked that the terminating */
2921 { /* closing parenthesis is present. */
2922 int n = 0;
2923 while ((digitab[*(++ptr)] & ctype_digit) != 0)
2924 n = n * 10 + *ptr - '0';
2925 if (n > 255)
2926 {
2927 *errorcodeptr = ERR38;
2928 goto FAILED;
2929 }
2930 *code++ = n;
2931 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
2932 PUT(code, LINK_SIZE, 0); /* Default length */
2933 code += 2 * LINK_SIZE;
2934 }
2935 previous = NULL;
2936 continue;
2937
2938 case 'P': /* Named subpattern handling */
2939 if (*(++ptr) == '<') /* Definition */
2940 {
2941 int i, namelen;
2942 uschar *slot = cd->name_table;
2943 const uschar *name; /* Don't amalgamate; some compilers */
2944 name = ++ptr; /* grumble at autoincrement in declaration */
2945
2946 while (*ptr++ != '>');
2947 namelen = ptr - name - 1;
2948
2949 for (i = 0; i < cd->names_found; i++)
2950 {
2951 int crc = memcmp(name, slot+2, namelen);
2952 if (crc == 0)
2953 {
2954 if (slot[2+namelen] == 0)
2955 {
2956 *errorcodeptr = ERR43;
2957 goto FAILED;
2958 }
2959 crc = -1; /* Current name is substring */
2960 }
2961 if (crc < 0)
2962 {
2963 memmove(slot + cd->name_entry_size, slot,
2964 (cd->names_found - i) * cd->name_entry_size);
2965 break;
2966 }
2967 slot += cd->name_entry_size;
2968 }
2969
2970 PUT2(slot, 0, *brackets + 1);
2971 memcpy(slot + 2, name, namelen);
2972 slot[2+namelen] = 0;
2973 cd->names_found++;
2974 goto NUMBERED_GROUP;
2975 }
2976
2977 if (*ptr == '=' || *ptr == '>') /* Reference or recursion */
2978 {
2979 int i, namelen;
2980 int type = *ptr++;
2981 const uschar *name = ptr;
2982 uschar *slot = cd->name_table;
2983
2984 while (*ptr != ')') ptr++;
2985 namelen = ptr - name;
2986
2987 for (i = 0; i < cd->names_found; i++)
2988 {
2989 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
2990 slot += cd->name_entry_size;
2991 }
2992 if (i >= cd->names_found)
2993 {
2994 *errorcodeptr = ERR15;
2995 goto FAILED;
2996 }
2997
2998 recno = GET2(slot, 0);
2999
3000 if (type == '>') goto HANDLE_RECURSION; /* A few lines below */
3001
3002 /* Back reference */
3003
3004 previous = code;
3005 *code++ = OP_REF;
3006 PUT2INC(code, 0, recno);
3007 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
3008 if (recno > cd->top_backref) cd->top_backref = recno;
3009 continue;
3010 }
3011
3012 /* Should never happen */
3013 break;
3014
3015 case 'R': /* Pattern recursion */
3016 ptr++; /* Same as (?0) */
3017 /* Fall through */
3018
3019 /* Recursion or "subroutine" call */
3020
3021 case '0': case '1': case '2': case '3': case '4':
3022 case '5': case '6': case '7': case '8': case '9':
3023 {
3024 const uschar *called;
3025 recno = 0;
3026 while((digitab[*ptr] & ctype_digit) != 0)
3027 recno = recno * 10 + *ptr++ - '0';
3028
3029 /* Come here from code above that handles a named recursion */
3030
3031 HANDLE_RECURSION:
3032
3033 previous = code;
3034
3035 /* Find the bracket that is being referenced. Temporarily end the
3036 regex in case it doesn't exist. */
3037
3038 *code = OP_END;
3039 called = (recno == 0)?
3040 cd->start_code : find_bracket(cd->start_code, utf8, recno);
3041
3042 if (called == NULL)
3043 {
3044 *errorcodeptr = ERR15;
3045 goto FAILED;
3046 }
3047
3048 /* If the subpattern is still open, this is a recursive call. We
3049 check to see if this is a left recursion that could loop for ever,
3050 and diagnose that case. */
3051
3052 if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
3053 {
3054 *errorcodeptr = ERR40;
3055 goto FAILED;
3056 }
3057
3058 /* Insert the recursion/subroutine item, automatically wrapped inside
3059 "once" brackets. */
3060
3061 *code = OP_ONCE;
3062 PUT(code, 1, 2 + 2*LINK_SIZE);
3063 code += 1 + LINK_SIZE;
3064
3065 *code = OP_RECURSE;
3066 PUT(code, 1, called - cd->start_code);
3067 code += 1 + LINK_SIZE;
3068
3069 *code = OP_KET;
3070 PUT(code, 1, 2 + 2*LINK_SIZE);
3071 code += 1 + LINK_SIZE;
3072 }
3073 continue;
3074
3075 /* Character after (? not specially recognized */
3076
3077 default: /* Option setting */
3078 set = unset = 0;
3079 optset = &set;
3080
3081 while (*ptr != ')' && *ptr != ':')
3082 {
3083 switch (*ptr++)
3084 {
3085 case '-': optset = &unset; break;
3086
3087 case 'i': *optset |= PCRE_CASELESS; break;
3088 case 'm': *optset |= PCRE_MULTILINE; break;
3089 case 's': *optset |= PCRE_DOTALL; break;
3090 case 'x': *optset |= PCRE_EXTENDED; break;
3091 case 'U': *optset |= PCRE_UNGREEDY; break;
3092 case 'X': *optset |= PCRE_EXTRA; break;
3093 }
3094 }
3095
3096 /* Set up the changed option bits, but don't change anything yet. */
3097
3098 newoptions = (options | set) & (~unset);
3099
3100 /* If the options ended with ')' this is not the start of a nested
3101 group with option changes, so the options change at this level. Compile
3102 code to change the ims options if this setting actually changes any of
3103 them. We also pass the new setting back so that it can be put at the
3104 start of any following branches, and when this group ends (if we are in
3105 a group), a resetting item can be compiled.
3106
3107 Note that if this item is right at the start of the pattern, the
3108 options will have been abstracted and made global, so there will be no
3109 change to compile. */
3110
3111 if (*ptr == ')')
3112 {
3113 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
3114 {
3115 *code++ = OP_OPT;
3116 *code++ = newoptions & PCRE_IMS;
3117 }
3118
3119 /* Change options at this level, and pass them back for use
3120 in subsequent branches. Reset the greedy defaults and the case
3121 value for firstbyte and reqbyte. */
3122
3123 *optionsptr = options = newoptions;
3124 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
3125 greedy_non_default = greedy_default ^ 1;
3126 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3127
3128 previous = NULL; /* This item can't be repeated */
3129 continue; /* It is complete */
3130 }
3131
3132 /* If the options ended with ':' we are heading into a nested group
3133 with possible change of options. Such groups are non-capturing and are
3134 not assertions of any kind. All we need to do is skip over the ':';
3135 the newoptions value is handled below. */
3136
3137 bravalue = OP_BRA;
3138 ptr++;
3139 }
3140 }
3141
3142 /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
3143 non-capturing and behave like (?:...) brackets */
3144
3145 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
3146 {
3147 bravalue = OP_BRA;
3148 }
3149
3150 /* Else we have a referencing group; adjust the opcode. If the bracket
3151 number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
3152 arrange for the true number to follow later, in an OP_BRANUMBER item. */
3153
3154 else
3155 {
3156 NUMBERED_GROUP:
3157 if (++(*brackets) > EXTRACT_BASIC_MAX)
3158 {
3159 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
3160 code[1+LINK_SIZE] = OP_BRANUMBER;
3161 PUT2(code, 2+LINK_SIZE, *brackets);
3162 skipbytes = 3;
3163 }
3164 else bravalue = OP_BRA + *brackets;
3165 }
3166
3167 /* Process nested bracketed re. Assertions may not be repeated, but other
3168 kinds can be. We copy code into a non-register variable in order to be able
3169 to pass its address because some compilers complain otherwise. Pass in a
3170 new setting for the ims options if they have changed. */
3171
3172 previous = (bravalue >= OP_ONCE)? code : NULL;
3173 *code = bravalue;
3174 tempcode = code;
3175 tempreqvary = cd->req_varyopt; /* Save value before bracket */
3176
3177 if (!compile_regex(
3178 newoptions, /* The complete new option state */
3179 options & PCRE_IMS, /* The previous ims option state */
3180 brackets, /* Extracting bracket count */
3181 &tempcode, /* Where to put code (updated) */
3182 &ptr, /* Input pointer (updated) */
3183 errorcodeptr, /* Where to put an error message */
3184 (bravalue == OP_ASSERTBACK ||
3185 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
3186 skipbytes, /* Skip over OP_COND/OP_BRANUMBER */
3187 &subfirstbyte, /* For possible first char */
3188 &subreqbyte, /* For possible last char */
3189 bcptr, /* Current branch chain */
3190 cd)) /* Tables block */
3191 goto FAILED;
3192
3193 /* At the end of compiling, code is still pointing to the start of the
3194 group, while tempcode has been updated to point past the end of the group
3195 and any option resetting that may follow it. The pattern pointer (ptr)
3196 is on the bracket. */
3197
3198 /* If this is a conditional bracket, check that there are no more than
3199 two branches in the group. */
3200
3201 else if (bravalue == OP_COND)
3202 {
3203 uschar *tc = code;
3204 condcount = 0;
3205
3206 do {
3207 condcount++;
3208 tc += GET(tc,1);
3209 }
3210 while (*tc != OP_KET);
3211
3212 if (condcount > 2)
3213 {
3214 *errorcodeptr = ERR27;
3215 goto FAILED;
3216 }
3217
3218 /* If there is just one branch, we must not make use of its firstbyte or
3219 reqbyte, because this is equivalent to an empty second branch. */
3220
3221 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
3222 }
3223
3224 /* Handle updating of the required and first characters. Update for normal
3225 brackets of all kinds, and conditions with two branches (see code above).
3226 If the bracket is followed by a quantifier with zero repeat, we have to
3227 back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
3228 main loop so that they can be accessed for the back off. */
3229
3230 zeroreqbyte = reqbyte;
3231 zerofirstbyte = firstbyte;
3232 groupsetfirstbyte = FALSE;
3233
3234 if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
3235 {
3236 /* If we have not yet set a firstbyte in this branch, take it from the
3237 subpattern, remembering that it was set here so that a repeat of more
3238 than one can replicate it as reqbyte if necessary. If the subpattern has
3239 no firstbyte, set "none" for the whole branch. In both cases, a zero
3240 repeat forces firstbyte to "none". */
3241
3242 if (firstbyte == REQ_UNSET)
3243 {
3244 if (subfirstbyte >= 0)
3245 {
3246 firstbyte = subfirstbyte;
3247 groupsetfirstbyte = TRUE;
3248 }
3249 else firstbyte = REQ_NONE;
3250 zerofirstbyte = REQ_NONE;
3251 }
3252
3253 /* If firstbyte was previously set, convert the subpattern's firstbyte
3254 into reqbyte if there wasn't one, using the vary flag that was in
3255 existence beforehand. */
3256
3257 else if (subfirstbyte >= 0 && subreqbyte < 0)
3258 subreqbyte = subfirstbyte | tempreqvary;
3259
3260 /* If the subpattern set a required byte (or set a first byte that isn't
3261 really the first byte - see above), set it. */
3262
3263 if (subreqbyte >= 0) reqbyte = subreqbyte;
3264 }
3265
3266 /* For a forward assertion, we take the reqbyte, if set. This can be
3267 helpful if the pattern that follows the assertion doesn't set a different
3268 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
3269 for an assertion, however because it leads to incorrect effect for patterns
3270 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
3271 of a firstbyte. This is overcome by a scan at the end if there's no
3272 firstbyte, looking for an asserted first char. */
3273
3274 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
3275
3276 /* Now update the main code pointer to the end of the group. */
3277
3278 code = tempcode;
3279
3280 /* Error if hit end of pattern */
3281
3282 if (*ptr != ')')
3283 {
3284 *errorcodeptr = ERR14;
3285 goto FAILED;
3286 }
3287 break;
3288
3289 /* Check \ for being a real metacharacter; if not, fall through and handle
3290 it as a data character at the start of a string. Escape items are checked
3291 for validity in the pre-compiling pass. */
3292
3293 case '\\':
3294 tempptr = ptr;
3295 c = check_escape(&ptr, errorcodeptr, *brackets, options, FALSE);
3296
3297 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
3298 are arranged to be the negation of the corresponding OP_values. For the
3299 back references, the values are ESC_REF plus the reference number. Only
3300 back references and those types that consume a character may be repeated.
3301 We can test for values between ESC_b and ESC_Z for the latter; this may
3302 have to change if any new ones are ever created. */
3303
3304 if (c < 0)
3305 {
3306 if (-c == ESC_Q) /* Handle start of quoted string */
3307 {
3308 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
3309 else inescq = TRUE;
3310 continue;
3311 }
3312
3313 /* For metasequences that actually match a character, we disable the
3314 setting of a first character if it hasn't already been set. */
3315
3316 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
3317 firstbyte = REQ_NONE;
3318
3319 /* Set values to reset to if this is followed by a zero repeat. */
3320
3321 zerofirstbyte = firstbyte;
3322 zeroreqbyte = reqbyte;
3323
3324 /* Back references are handled specially */
3325
3326 if (-c >= ESC_REF)
3327 {
3328 int number = -c - ESC_REF;
3329 previous = code;
3330 *code++ = OP_REF;
3331 PUT2INC(code, 0, number);
3332 }
3333
3334 /* So are Unicode property matches, if supported. We know that get_ucp
3335 won't fail because it was tested in the pre-pass. */
3336
3337 #ifdef SUPPORT_UCP
3338 else if (-c == ESC_P || -c == ESC_p)
3339 {
3340 BOOL negated;
3341 int pdata;
3342 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3343 previous = code;
3344 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
3345 *code++ = ptype;
3346 *code++ = pdata;
3347 }
3348 #endif
3349
3350 /* For the rest, we can obtain the OP value by negating the escape
3351 value */
3352
3353 else
3354 {
3355 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
3356 *code++ = -c;
3357 }
3358 continue;
3359 }
3360
3361 /* We have a data character whose value is in c. In UTF-8 mode it may have
3362 a value > 127. We set its representation in the length/buffer, and then
3363 handle it as a data character. */
3364
3365 #ifdef SUPPORT_UTF8
3366 if (utf8 && c > 127)
3367 mclength = _pcre_ord2utf8(c, mcbuffer);
3368 else
3369 #endif
3370
3371 {
3372 mcbuffer[0] = c;
3373 mclength = 1;
3374 }
3375
3376 goto ONE_CHAR;
3377
3378 /* Handle a literal character. It is guaranteed not to be whitespace or #
3379 when the extended flag is set. If we are in UTF-8 mode, it may be a
3380 multi-byte literal character. */
3381
3382 default:
3383 NORMAL_CHAR:
3384 mclength = 1;
3385 mcbuffer[0] = c;
3386
3387 #ifdef SUPPORT_UTF8
3388 if (utf8 && (c & 0xc0) == 0xc0)
3389 {
3390 while ((ptr[1] & 0xc0) == 0x80)
3391 mcbuffer[mclength++] = *(++ptr);
3392 }
3393 #endif
3394
3395 /* At this point we have the character's bytes in mcbuffer, and the length
3396 in mclength. When not in UTF-8 mode, the length is always 1. */
3397
3398 ONE_CHAR:
3399 previous = code;
3400 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
3401 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
3402
3403 /* Set the first and required bytes appropriately. If no previous first
3404 byte, set it from this character, but revert to none on a zero repeat.
3405 Otherwise, leave the firstbyte value alone, and don't change it on a zero
3406 repeat. */
3407
3408 if (firstbyte == REQ_UNSET)
3409 {
3410 zerofirstbyte = REQ_NONE;
3411 zeroreqbyte = reqbyte;
3412
3413 /* If the character is more than one byte long, we can set firstbyte
3414 only if it is not to be matched caselessly. */
3415
3416 if (mclength == 1 || req_caseopt == 0)
3417 {
3418 firstbyte = mcbuffer[0] | req_caseopt;
3419 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
3420 }
3421 else firstbyte = reqbyte = REQ_NONE;
3422 }
3423
3424 /* firstbyte was previously set; we can set reqbyte only if the length is
3425 1 or the matching is caseful. */
3426
3427 else
3428 {
3429 zerofirstbyte = firstbyte;
3430 zeroreqbyte = reqbyte;
3431 if (mclength == 1 || req_caseopt == 0)
3432 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3433 }
3434
3435 break; /* End of literal character handling */
3436 }
3437 } /* end of big loop */
3438
3439 /* Control never reaches here by falling through, only by a goto for all the
3440 error states. Pass back the position in the pattern so that it can be displayed
3441 to the user for diagnosing the error. */
3442
3443 FAILED:
3444 *ptrptr = ptr;
3445 return FALSE;
3446 }
3447
3448
3449
3450
3451 /*************************************************
3452 * Compile sequence of alternatives *
3453 *************************************************/
3454
3455 /* On entry, ptr is pointing past the bracket character, but on return
3456 it points to the closing bracket, or vertical bar, or end of string.
3457 The code variable is pointing at the byte into which the BRA operator has been
3458 stored. If the ims options are changed at the start (for a (?ims: group) or
3459 during any branch, we need to insert an OP_OPT item at the start of every
3460 following branch to ensure they get set correctly at run time, and also pass
3461 the new options into every subsequent branch compile.
3462
3463 Argument:
3464 options option bits, including any changes for this subpattern
3465 oldims previous settings of ims option bits
3466 brackets -> int containing the number of extracting brackets used
3467 codeptr -> the address of the current code pointer
3468 ptrptr -> the address of the current pattern pointer
3469 errorcodeptr -> pointer to error code variable
3470 lookbehind TRUE if this is a lookbehind assertion
3471 skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER)
3472 firstbyteptr place to put the first required character, or a negative number
3473 reqbyteptr place to put the last required character, or a negative number
3474 bcptr pointer to the chain of currently open branches
3475 cd points to the data block with tables pointers etc.
3476
3477 Returns: TRUE on success
3478 */
3479
3480 static BOOL
3481 compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3482 const uschar **ptrptr, int *errorcodeptr, BOOL lookbehind, int skipbytes,
3483 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
3484 {
3485 const uschar *ptr = *ptrptr;
3486 uschar *code = *codeptr;
3487 uschar *last_branch = code;
3488 uschar *start_bracket = code;
3489 uschar *reverse_count = NULL;
3490 int firstbyte, reqbyte;
3491 int branchfirstbyte, branchreqbyte;
3492 branch_chain bc;
3493
3494 bc.outer = bcptr;
3495 bc.current = code;
3496
3497 firstbyte = reqbyte = REQ_UNSET;
3498
3499 /* Offset is set zero to mark that this bracket is still open */
3500
3501 PUT(code, 1, 0);
3502 code += 1 + LINK_SIZE + skipbytes;
3503
3504 /* Loop for each alternative branch */
3505
3506 for (;;)
3507 {
3508 /* Handle a change of ims options at the start of the branch */
3509
3510 if ((options & PCRE_IMS) != oldims)
3511 {
3512 *code++ = OP_OPT;
3513 *code++ = options & PCRE_IMS;
3514 }
3515
3516 /* Set up dummy OP_REVERSE if lookbehind assertion */
3517
3518 if (lookbehind)
3519 {
3520 *code++ = OP_REVERSE;
3521 reverse_count = code;
3522 PUTINC(code, 0, 0);
3523 }
3524
3525 /* Now compile the branch */
3526
3527 if (!compile_branch(&options, brackets, &code, &ptr, errorcodeptr,
3528 &branchfirstbyte, &branchreqbyte, &bc, cd))
3529 {
3530 *ptrptr = ptr;
3531 return FALSE;
3532 }
3533
3534 /* If this is the first branch, the firstbyte and reqbyte values for the
3535 branch become the values for the regex. */
3536
3537 if (*last_branch != OP_ALT)
3538 {
3539 firstbyte = branchfirstbyte;
3540 reqbyte = branchreqbyte;
3541 }
3542
3543 /* If this is not the first branch, the first char and reqbyte have to
3544 match the values from all the previous branches, except that if the previous
3545 value for reqbyte didn't have REQ_VARY set, it can still match, and we set
3546 REQ_VARY for the regex. */
3547
3548 else
3549 {
3550 /* If we previously had a firstbyte, but it doesn't match the new branch,
3551 we have to abandon the firstbyte for the regex, but if there was previously
3552 no reqbyte, it takes on the value of the old firstbyte. */
3553
3554 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
3555 {
3556 if (reqbyte < 0) reqbyte = firstbyte;
3557 firstbyte = REQ_NONE;
3558 }
3559
3560 /* If we (now or from before) have no firstbyte, a firstbyte from the
3561 branch becomes a reqbyte if there isn't a branch reqbyte. */
3562
3563 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
3564 branchreqbyte = branchfirstbyte;
3565
3566 /* Now ensure that the reqbytes match */
3567
3568 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
3569 reqbyte = REQ_NONE;
3570 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
3571 }
3572
3573 /* If lookbehind, check that this branch matches a fixed-length string,
3574 and put the length into the OP_REVERSE item. Temporarily mark the end of
3575 the branch with OP_END. */
3576
3577 if (lookbehind)
3578 {
3579 int length;
3580 *code = OP_END;
3581 length = find_fixedlength(last_branch, options);
3582 DPRINTF(("fixed length = %d\n", length));
3583 if (length < 0)
3584 {
3585 *errorcodeptr = (length == -2)? ERR36 : ERR25;
3586 *ptrptr = ptr;
3587 return FALSE;
3588 }
3589 PUT(reverse_count, 0, length);
3590 }
3591
3592 /* Reached end of expression, either ')' or end of pattern. Go back through
3593 the alternative branches and reverse the chain of offsets, with the field in
3594 the BRA item now becoming an offset to the first alternative. If there are
3595 no alternatives, it points to the end of the group. The length in the
3596 terminating ket is always the length of the whole bracketed item. If any of
3597 the ims options were changed inside the group, compile a resetting op-code
3598 following, except at the very end of the pattern. Return leaving the pointer
3599 at the terminating char. */
3600
3601 if (*ptr != '|')
3602 {
3603 int length = code - last_branch;
3604 do
3605 {
3606 int prev_length = GET(last_branch, 1);
3607 PUT(last_branch, 1, length);
3608 length = prev_length;
3609 last_branch -= length;
3610 }
3611 while (length > 0);
3612
3613 /* Fill in the ket */
3614
3615 *code = OP_KET;
3616 PUT(code, 1, code - start_bracket);
3617 code += 1 + LINK_SIZE;
3618
3619 /* Resetting option if needed */
3620
3621 if ((options & PCRE_IMS) != oldims && *ptr == ')')
3622 {
3623 *code++ = OP_OPT;
3624 *code++ = oldims;
3625 }
3626
3627 /* Set values to pass back */
3628
3629 *codeptr = code;
3630 *ptrptr = ptr;
3631 *firstbyteptr = firstbyte;
3632 *reqbyteptr = reqbyte;
3633 return TRUE;
3634 }
3635
3636 /* Another branch follows; insert an "or" node. Its length field points back
3637 to the previous branch while the bracket remains open. At the end the chain
3638 is reversed. It's done like this so that the start of the bracket has a
3639 zero offset until it is closed, making it possible to detect recursion. */
3640
3641 *code = OP_ALT;
3642 PUT(code, 1, code - last_branch);
3643 bc.current = last_branch = code;
3644 code += 1 + LINK_SIZE;
3645 ptr++;
3646 }
3647 /* Control never reaches here */
3648 }
3649
3650
3651
3652
3653 /*************************************************
3654 * Check for anchored expression *
3655 *************************************************/
3656
3657 /* Try to find out if this is an anchored regular expression. Consider each
3658 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
3659 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
3660 it's anchored. However, if this is a multiline pattern, then only OP_SOD
3661 counts, since OP_CIRC can match in the middle.
3662
3663 We can also consider a regex to be anchored if OP_SOM starts all its branches.
3664 This is the code for \G, which means "match at start of match position, taking
3665 into account the match offset".
3666
3667 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
3668 because that will try the rest of the pattern at all possible matching points,
3669 so there is no point trying again.... er ....
3670
3671 .... except when the .* appears inside capturing parentheses, and there is a
3672 subsequent back reference to those parentheses. We haven't enough information
3673 to catch that case precisely.
3674
3675 At first, the best we could do was to detect when .* was in capturing brackets
3676 and the highest back reference was greater than or equal to that level.
3677 However, by keeping a bitmap of the first 31 back references, we can catch some
3678 of the more common cases more precisely.
3679
3680 Arguments:
3681 code points to start of expression (the bracket)
3682 options points to the options setting
3683 bracket_map a bitmap of which brackets we are inside while testing; this
3684 handles up to substring 31; after that we just have to take
3685 the less precise approach
3686 backref_map the back reference bitmap
3687
3688 Returns: TRUE or FALSE
3689 */
3690
3691 static BOOL
3692 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
3693 unsigned int backref_map)
3694 {
3695 do {
3696 const uschar *scode =
3697 first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE);
3698 register int op = *scode;
3699
3700 /* Capturing brackets */
3701
3702 if (op > OP_BRA)
3703 {
3704 int new_map;
3705 op -= OP_BRA;
3706 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3707 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3708 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
3709 }
3710
3711 /* Other brackets */
3712
3713 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3714 {
3715 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
3716 }
3717
3718 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
3719 are or may be referenced. */
3720
3721 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
3722 (*options & PCRE_DOTALL) != 0)
3723 {
3724 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3725 }
3726
3727 /* Check for explicit anchoring */
3728
3729 else if (op != OP_SOD && op != OP_SOM &&
3730 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
3731 return FALSE;
3732 code += GET(code, 1);
3733 }
3734 while (*code == OP_ALT); /* Loop for each alternative */
3735 return TRUE;
3736 }
3737
3738
3739
3740 /*************************************************
3741 * Check for starting with ^ or .* *
3742 *************************************************/
3743
3744 /* This is called to find out if every branch starts with ^ or .* so that
3745 "first char" processing can be done to speed things up in multiline
3746 matching and for non-DOTALL patterns that start with .* (which must start at
3747 the beginning or after \n). As in the case of is_anchored() (see above), we
3748 have to take account of back references to capturing brackets that contain .*
3749 because in that case we can't make the assumption.
3750
3751 Arguments:
3752 code points to start of expression (the bracket)
3753 bracket_map a bitmap of which brackets we are inside while testing; this
3754 handles up to substring 31; after that we just have to take
3755 the less precise approach
3756 backref_map the back reference bitmap
3757
3758 Returns: TRUE or FALSE
3759 */
3760
3761 static BOOL
3762 is_startline(const uschar *code, unsigned int bracket_map,
3763 unsigned int backref_map)
3764 {
3765 do {
3766 const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0,
3767 FALSE);
3768 register int op = *scode;
3769
3770 /* Capturing brackets */
3771
3772 if (op > OP_BRA)
3773 {
3774 int new_map;
3775 op -= OP_BRA;
3776 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3777 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3778 if (!is_startline(scode, new_map, backref_map)) return FALSE;
3779 }
3780
3781 /* Other brackets */
3782
3783 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3784 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
3785
3786 /* .* means "start at start or after \n" if it isn't in brackets that
3787 may be referenced. */
3788
3789 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
3790 {
3791 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3792 }
3793
3794 /* Check for explicit circumflex */
3795
3796 else if (op != OP_CIRC) return FALSE;
3797
3798 /* Move on to the next alternative */
3799
3800 code += GET(code, 1);
3801 }
3802 while (*code == OP_ALT); /* Loop for each alternative */
3803 return TRUE;
3804 }
3805
3806
3807
3808 /*************************************************
3809 * Check for asserted fixed first char *
3810 *************************************************/
3811
3812 /* During compilation, the "first char" settings from forward assertions are
3813 discarded, because they can cause conflicts with actual literals that follow.
3814 However, if we end up without a first char setting for an unanchored pattern,
3815 it is worth scanning the regex to see if there is an initial asserted first
3816 char. If all branches start with the same asserted char, or with a bracket all
3817 of whose alternatives start with the same asserted char (recurse ad lib), then
3818 we return that char, otherwise -1.
3819
3820 Arguments:
3821 code points to start of expression (the bracket)
3822 options pointer to the options (used to check casing changes)
3823 inassert TRUE if in an assertion
3824
3825 Returns: -1 or the fixed first char
3826 */
3827
3828 static int
3829 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
3830 {
3831 register int c = -1;
3832 do {
3833 int d;
3834 const uschar *scode =
3835 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
3836 register int op = *scode;
3837
3838 if (op >= OP_BRA) op = OP_BRA;
3839
3840 switch(op)
3841 {
3842 default:
3843 return -1;
3844
3845 case OP_BRA:
3846 case OP_ASSERT:
3847 case OP_ONCE:
3848 case OP_COND:
3849 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
3850 return -1;
3851 if (c < 0) c = d; else if (c != d) return -1;
3852 break;
3853
3854 case OP_EXACT: /* Fall through */
3855 scode += 2;
3856
3857 case OP_CHAR:
3858 case OP_CHARNC:
3859 case OP_PLUS:
3860 case OP_MINPLUS:
3861 if (!inassert) return -1;
3862 if (c < 0)
3863 {
3864 c = scode[1];
3865 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
3866 }
3867 else if (c != scode[1]) return -1;
3868 break;
3869 }
3870
3871 code += GET(code, 1);
3872 }
3873 while (*code == OP_ALT);
3874 return c;
3875 }
3876
3877
3878
3879 /*************************************************
3880 * Compile a Regular Expression *
3881 *************************************************/
3882
3883 /* This function takes a string and returns a pointer to a block of store
3884 holding a compiled version of the expression. The original API for this
3885 function had no error code return variable; it is retained for backwards
3886 compatibility. The new function is given a new name.
3887
3888 Arguments:
3889 pattern the regular expression
3890 options various option bits
3891 errorcodeptr pointer to error code variable (pcre_compile2() only)
3892 can be NULL if you don't want a code value
3893 errorptr pointer to pointer to error text
3894 erroroffset ptr offset in pattern where error was detected
3895 tables pointer to character tables or NULL
3896
3897 Returns: pointer to compiled data block, or NULL on error,
3898 with errorptr and erroroffset set
3899 */
3900
3901 PCRE_DATA_SCOPE pcre *
3902 pcre_compile(const char *pattern, int options, const char **errorptr,
3903 int *erroroffset, const unsigned char *tables)
3904 {
3905 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
3906 }
3907
3908
3909 PCRE_DATA_SCOPE pcre *
3910 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
3911 const char **errorptr, int *erroroffset, const unsigned char *tables)
3912 {
3913 real_pcre *re;
3914 int length = 1 + LINK_SIZE; /* For initial BRA plus length */
3915 int c, firstbyte, reqbyte;
3916 int bracount = 0;
3917 int branch_extra = 0;
3918 int branch_newextra;
3919 int item_count = -1;
3920 int name_count = 0;
3921 int max_name_size = 0;
3922 int lastitemlength = 0;
3923 int errorcode = 0;
3924 #ifdef SUPPORT_UTF8
3925 BOOL utf8;
3926 BOOL class_utf8;
3927 #endif
3928 BOOL inescq = FALSE;
3929 BOOL capturing;
3930 unsigned int brastackptr = 0;
3931 size_t size;
3932 uschar *code;
3933 const uschar *codestart;
3934 const uschar *ptr;
3935 compile_data compile_block;
3936 int brastack[BRASTACK_SIZE];
3937 uschar bralenstack[BRASTACK_SIZE];
3938
3939 /* We can't pass back an error message if errorptr is NULL; I guess the best we
3940 can do is just return NULL, but we can set a code value if there is a code
3941 pointer. */
3942
3943 if (errorptr == NULL)
3944 {
3945 if (errorcodeptr != NULL) *errorcodeptr = 99;
3946 return NULL;
3947 }
3948
3949 *errorptr = NULL;
3950 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
3951
3952 /* However, we can give a message for this error */
3953
3954 if (erroroffset == NULL)
3955 {
3956 errorcode = ERR16;
3957 goto PCRE_EARLY_ERROR_RETURN;
3958 }
3959
3960 *erroroffset = 0;
3961
3962 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
3963
3964 #ifdef SUPPORT_UTF8
3965 utf8 = (options & PCRE_UTF8) != 0;
3966 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
3967 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
3968 {
3969 errorcode = ERR44;
3970 goto PCRE_EARLY_ERROR_RETURN;
3971 }
3972 #else
3973 if ((options & PCRE_UTF8) != 0)
3974 {
3975 errorcode = ERR32;
3976 goto PCRE_EARLY_ERROR_RETURN;
3977 }
3978 #endif
3979
3980 if ((options & ~PUBLIC_OPTIONS) != 0)
3981 {
3982 errorcode = ERR17;
3983 goto PCRE_EARLY_ERROR_RETURN;
3984 }
3985
3986 /* Set up pointers to the individual character tables */
3987
3988 if (tables == NULL) tables = _pcre_default_tables;
3989 compile_block.lcc = tables + lcc_offset;
3990 compile_block.fcc = tables + fcc_offset;
3991 compile_block.cbits = tables + cbits_offset;
3992 compile_block.ctypes = tables + ctypes_offset;
3993
3994 /* Maximum back reference and backref bitmap. This is updated for numeric
3995 references during the first pass, but for named references during the actual
3996 compile pass. The bitmap records up to 31 back references to help in deciding
3997 whether (.*) can be treated as anchored or not. */
3998
3999 compile_block.top_backref = 0;
4000 compile_block.backref_map = 0;
4001
4002 /* Reflect pattern for debugging output */
4003
4004 DPRINTF(("------------------------------------------------------------------\n"));
4005 DPRINTF(("%s\n", pattern));
4006
4007 /* The first thing to do is to make a pass over the pattern to compute the
4008 amount of store required to hold the compiled code. This does not have to be
4009 perfect as long as errors are overestimates. At the same time we can detect any
4010 flag settings right at the start, and extract them. Make an attempt to correct
4011 for any counted white space if an "extended" flag setting appears late in the
4012 pattern. We can't be so clever for #-comments. */
4013
4014 ptr = (const uschar *)(pattern - 1);
4015 while ((c = *(++ptr)) != 0)
4016 {
4017 int min, max;
4018 int class_optcount;
4019 int bracket_length;
4020 int duplength;
4021
4022 /* If we are inside a \Q...\E sequence, all chars are literal */
4023
4024 if (inescq)
4025 {
4026 if ((options & PCRE_AUTO_CALLOUT) != 0) length += 2 + 2*LINK_SIZE;
4027 goto NORMAL_CHAR;
4028 }
4029
4030 /* Otherwise, first check for ignored whitespace and comments */
4031
4032 if ((options & PCRE_EXTENDED) != 0)
4033 {
4034 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
4035 if (c == '#')
4036 {
4037 /* The space before the ; is to avoid a warning on a silly compiler
4038 on the Macintosh. */
4039 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
4040 if (c == 0) break;
4041 continue;
4042 }
4043 }
4044
4045 item_count++; /* Is zero for the first non-comment item */
4046
4047 /* Allow space for auto callout before every item except quantifiers. */
4048
4049 if ((options & PCRE_AUTO_CALLOUT) != 0 &&
4050 c != '*' && c != '+' && c != '?' &&
4051 (c != '{' || !is_counted_repeat(ptr + 1)))
4052 length += 2 + 2*LINK_SIZE;
4053
4054 switch(c)
4055 {
4056 /* A backslashed item may be an escaped data character or it may be a
4057 character type. */
4058
4059 case '\\':
4060 c = check_escape(&ptr, &errorcode, bracount, options, FALSE);
4061 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4062
4063 lastitemlength = 1; /* Default length of last item for repeats */
4064
4065 if (c >= 0) /* Data character */
4066 {
4067 length += 2; /* For a one-byte character */
4068
4069 #ifdef SUPPORT_UTF8
4070 if (utf8 && c > 127)
4071 {
4072 int i;
4073 for (i = 0; i < _pcre_utf8_table1_size; i++)
4074 if (c <= _pcre_utf8_table1[i]) break;
4075 length += i;
4076 lastitemlength += i;
4077 }
4078 #endif
4079
4080 continue;
4081 }
4082
4083 /* If \Q, enter "literal" mode */
4084
4085 if (-c == ESC_Q)
4086 {
4087 inescq = TRUE;
4088 continue;
4089 }
4090
4091 /* \X is supported only if Unicode property support is compiled */
4092
4093 #ifndef SUPPORT_UCP
4094 if (-c == ESC_X)
4095 {
4096 errorcode = ERR45;
4097 goto PCRE_ERROR_RETURN;
4098 }
4099 #endif
4100
4101 /* \P and \p are for Unicode properties, but only when the support has
4102 been compiled. Each item needs 3 bytes. */
4103
4104 else if (-c == ESC_P || -c == ESC_p)
4105 {
4106 #ifdef SUPPORT_UCP
4107 BOOL negated;
4108 BOOL pdata;
4109 length += 3;
4110 lastitemlength = 3;
4111 if (get_ucp(&ptr, &negated, &pdata, &errorcode) < 0)
4112 goto PCRE_ERROR_RETURN;
4113 continue;
4114 #else
4115 errorcode = ERR45;
4116 goto PCRE_ERROR_RETURN;
4117 #endif
4118 }
4119
4120 /* Other escapes need one byte */
4121
4122 length++;
4123
4124 /* A back reference needs an additional 2 bytes, plus either one or 5
4125 bytes for a repeat. We also need to keep the value of the highest
4126 back reference. */
4127
4128 if (c <= -ESC_REF)
4129 {
4130 int refnum = -c - ESC_REF;
4131 compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
4132 if (refnum > compile_block.top_backref)
4133 compile_block.top_backref = refnum;
4134 length += 2; /* For single back reference */
4135 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
4136 {
4137 ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
4138 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4139 if ((min == 0 && (max == 1 || max == -1)) ||
4140 (min == 1 && max == -1))
4141 length++;
4142 else length += 5;
4143 if (ptr[1] == '?') ptr++;
4144 }
4145 }
4146 continue;
4147
4148 case '^': /* Single-byte metacharacters */
4149 case '.':
4150 case '$':
4151 length++;
4152 lastitemlength = 1;
4153 continue;
4154
4155 case '*': /* These repeats won't be after brackets; */
4156 case '+': /* those are handled separately */
4157 case '?':
4158 length++;
4159 goto POSESSIVE; /* A few lines below */
4160
4161 /* This covers the cases of braced repeats after a single char, metachar,
4162 class, or back reference. */
4163
4164 case '{':
4165 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
4166 ptr = read_repeat_counts(ptr+1, &min, &max, &errorcode);
4167 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4168
4169 /* These special cases just insert one extra opcode */
4170
4171 if ((min == 0 && (max == 1 || max == -1)) ||
4172 (min == 1 && max == -1))
4173 length++;
4174
4175 /* These cases might insert additional copies of a preceding character. */
4176
4177 else
4178 {
4179 if (min != 1)
4180 {
4181 length -= lastitemlength; /* Uncount the original char or metachar */
4182 if (min > 0) length += 3 + lastitemlength;
4183 }
4184 length += lastitemlength + ((max > 0)? 3 : 1);
4185 }
4186
4187 if (ptr[1] == '?') ptr++; /* Needs no extra length */
4188
4189 POSESSIVE: /* Test for possessive quantifier */
4190 if (ptr[1] == '+')
4191 {
4192 ptr++;
4193 length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */
4194 }
4195 continue;
4196
4197 /* An alternation contains an offset to the next branch or ket. If any ims
4198 options changed in the previous branch(es), and/or if we are in a
4199 lookbehind assertion, extra space will be needed at the start of the
4200 branch. This is handled by branch_extra. */
4201
4202 case '|':
4203 length += 1 + LINK_SIZE + branch_extra;
4204 continue;
4205
4206 /* A character class uses 33 characters provided that all the character
4207 values are less than 256. Otherwise, it uses a bit map for low valued
4208 characters, and individual items for others. Don't worry about character
4209 types that aren't allowed in classes - they'll get picked up during the
4210 compile. A character class that contains only one single-byte character
4211 uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
4212 where we can. (In UTF-8 mode we can do this only for chars < 128.) */
4213
4214 case '[':
4215 if (*(++ptr) == '^')
4216 {
4217 class_optcount = 10; /* Greater than one */
4218 ptr++;
4219 }
4220 else class_optcount = 0;
4221
4222 #ifdef SUPPORT_UTF8
4223 class_utf8 = FALSE;
4224 #endif
4225
4226 /* Written as a "do" so that an initial ']' is taken as data */
4227
4228 if (*ptr != 0) do
4229 {
4230 /* Inside \Q...\E everything is literal except \E */
4231
4232 if (inescq)
4233 {
4234 if (*ptr != '\\' || ptr[1] != 'E') goto GET_ONE_CHARACTER;
4235 inescq = FALSE;
4236 ptr += 1;
4237 continue;
4238 }
4239
4240 /* Outside \Q...\E, check for escapes */
4241
4242 if (*ptr == '\\')
4243 {
4244 c = check_escape(&ptr, &errorcode, bracount, options, TRUE);
4245 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4246
4247 /* \b is backspace inside a class; \X is literal */
4248
4249 if (-c == ESC_b) c = '\b';
4250 else if (-c == ESC_X) c = 'X';
4251
4252 /* \Q enters quoting mode */
4253
4254 else if (-c == ESC_Q)
4255 {
4256 inescq = TRUE;
4257 continue;
4258 }
4259
4260 /* Handle escapes that turn into characters */
4261
4262 if (c >= 0) goto NON_SPECIAL_CHARACTER;
4263
4264 /* Escapes that are meta-things. The normal ones just affect the
4265 bit map, but Unicode properties require an XCLASS extended item. */
4266
4267 else
4268 {
4269 class_optcount = 10; /* \d, \s etc; make sure > 1 */
4270 #ifdef SUPPORT_UTF8
4271 if (-c == ESC_p || -c == ESC_P)
4272 {
4273 if (!class_utf8)
4274 {
4275 class_utf8 = TRUE;
4276 length += LINK_SIZE + 2;
4277 }
4278 length += 3;
4279 }
4280 #endif
4281 }
4282 }
4283
4284 /* Check the syntax for POSIX stuff. The bits we actually handle are
4285 checked during the real compile phase. */
4286
4287 else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))
4288 {
4289 ptr++;
4290 class_optcount = 10; /* Make sure > 1 */
4291 }
4292
4293 /* Anything else increments the possible optimization count. We have to
4294 detect ranges here so that we can compute the number of extra ranges for
4295 caseless wide characters when UCP support is available. If there are wide
4296 characters, we are going to have to use an XCLASS, even for single
4297 characters. */
4298
4299 else
4300 {
4301 int d;
4302
4303 GET_ONE_CHARACTER:
4304
4305 #ifdef SUPPORT_UTF8
4306 if (utf8)
4307 {
4308 int extra = 0;
4309 GETCHARLEN(c, ptr, extra);
4310 ptr += extra;
4311 }
4312 else c = *ptr;
4313 #else
4314 c = *ptr;
4315 #endif
4316
4317 /* Come here from handling \ above when it escapes to a char value */
4318
4319 NON_SPECIAL_CHARACTER:
4320 class_optcount++;
4321
4322 d = -1;
4323 if (ptr[1] == '-')
4324 {
4325 uschar const *hyptr = ptr++;
4326 if (ptr[1] == '\\')
4327 {
4328 ptr++;
4329 d = check_escape(&ptr, &errorcode, bracount, options, TRUE);
4330 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4331 if (-d == ESC_b) d = '\b'; /* backspace */
4332 else if (-d == ESC_X) d = 'X'; /* literal X in a class */
4333 }
4334 else if (ptr[1] != 0 && ptr[1] != ']')
4335 {
4336 ptr++;
4337 #ifdef SUPPORT_UTF8
4338 if (utf8)
4339 {
4340 int extra = 0;
4341 GETCHARLEN(d, ptr, extra);
4342 ptr += extra;
4343 }
4344 else
4345 #endif
4346 d = *ptr;
4347 }
4348 if (d < 0) ptr = hyptr; /* go back to hyphen as data */
4349 }
4350
4351 /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
4352 127 for caseless matching, we will need to use an XCLASS. */
4353
4354 if (d >= 0)
4355 {
4356 class_optcount = 10; /* Ensure > 1 */
4357 if (d < c)
4358 {
4359 errorcode = ERR8;
4360 goto PCRE_ERROR_RETURN;
4361 }
4362
4363 #ifdef SUPPORT_UTF8
4364 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4365 {
4366 uschar buffer[6];
4367 if (!class_utf8) /* Allow for XCLASS overhead */
4368 {
4369 class_utf8 = TRUE;
4370 length += LINK_SIZE + 2;
4371 }
4372
4373 #ifdef SUPPORT_UCP
4374 /* If we have UCP support, find out how many extra ranges are
4375 needed to map the other case of characters within this range. We
4376 have to mimic the range optimization here, because extending the
4377 range upwards might push d over a boundary that makes it use
4378 another byte in the UTF-8 representation. */
4379
4380 if ((options & PCRE_CASELESS) != 0)
4381 {
4382 int occ, ocd;
4383 int cc = c;
4384 int origd = d;
4385 while (get_othercase_range(&cc, origd, &occ, &ocd))
4386 {
4387 if (occ >= c && ocd <= d) continue; /* Skip embedded */
4388
4389 if (occ < c && ocd >= c - 1) /* Extend the basic range */
4390 { /* if there is overlap, */
4391 c = occ; /* noting that if occ < c */
4392 continue; /* we can't have ocd > d */
4393 } /* because a subrange is */
4394 if (ocd > d && occ <= d + 1) /* always shorter than */
4395 { /* the basic range. */
4396 d = ocd;
4397 continue;
4398 }
4399
4400 /* An extra item is needed */
4401
4402 length += 1 + _pcre_ord2utf8(occ, buffer) +
4403 ((occ == ocd)? 0 : _pcre_ord2utf8(ocd, buffer));
4404 }
4405 }
4406 #endif /* SUPPORT_UCP */
4407
4408 /* The length of the (possibly extended) range */
4409
4410 length += 1 + _pcre_ord2utf8(c, buffer) + _pcre_ord2utf8(d, buffer);
4411 }
4412 #endif /* SUPPORT_UTF8 */
4413
4414 }
4415
4416 /* We have a single character. There is nothing to be done unless we
4417 are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
4418 allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
4419 support. */
4420
4421 else
4422 {
4423 #ifdef SUPPORT_UTF8
4424 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
4425 {
4426 uschar buffer[6];
4427 class_optcount = 10; /* Ensure > 1 */
4428 if (!class_utf8) /* Allow for XCLASS overhead */
4429 {
4430 class_utf8 = TRUE;
4431 length += LINK_SIZE + 2;
4432 }
4433 #ifdef SUPPORT_UCP
4434 length += (((options & PCRE_CASELESS) != 0)? 2 : 1) *
4435 (1 + _pcre_ord2utf8(c, buffer));
4436 #else /* SUPPORT_UCP */
4437 length += 1 + _pcre_ord2utf8(c, buffer);
4438 #endif /* SUPPORT_UCP */
4439 }
4440 #endif /* SUPPORT_UTF8 */
4441 }
4442 }
4443 }
4444 while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */
4445
4446 if (*ptr == 0) /* Missing terminating ']' */
4447 {
4448 errorcode = ERR6;
4449 goto PCRE_ERROR_RETURN;
4450 }
4451
4452 /* We can optimize when there was only one optimizable character. Repeats
4453 for positive and negated single one-byte chars are handled by the general
4454 code. Here, we handle repeats for the class opcodes. */
4455
4456 if (class_optcount == 1) length += 3; else
4457 {
4458 length += 33;
4459
4460 /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
4461 we also need extra for wrapping the whole thing in a sub-pattern. */
4462
4463 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
4464 {
4465 ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
4466 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4467 if ((min == 0 && (max == 1 || max == -1)) ||
4468 (min == 1 && max == -1))
4469 length++;
4470 else length += 5;
4471 if (ptr[1] == '+')
4472 {
4473 ptr++;
4474 length += 2 + 2*LINK_SIZE;
4475 }
4476 else if (ptr[1] == '?') ptr++;
4477 }
4478 }
4479 continue;
4480
4481 /* Brackets may be genuine groups or special things */
4482
4483 case '(':
4484 branch_newextra = 0;
4485 bracket_length = 1 + LINK_SIZE;
4486 capturing = FALSE;
4487
4488 /* Handle special forms of bracket, which all start (? */
4489
4490 if (ptr[1] == '?')
4491 {
4492 int set, unset;
4493 int *optset;
4494
4495 switch (c = ptr[2])
4496 {
4497 /* Skip over comments entirely */
4498 case '#':
4499 ptr += 3;
4500 while (*ptr != 0 && *ptr != ')') ptr++;
4501 if (*ptr == 0)
4502 {
4503 errorcode = ERR18;
4504 goto PCRE_ERROR_RETURN;
4505 }
4506 continue;
4507
4508 /* Non-referencing groups and lookaheads just move the pointer on, and
4509 then behave like a non-special bracket, except that they don't increment
4510 the count of extracting brackets. Ditto for the "once only" bracket,
4511 which is in Perl from version 5.005. */
4512
4513 case ':':
4514 case '=':
4515 case '!':
4516 case '>':
4517 ptr += 2;
4518 break;
4519
4520 /* (?R) specifies a recursive call to the regex, which is an extension
4521 to provide the facility which can be obtained by (?p{perl-code}) in
4522 Perl 5.6. In Perl 5.8 this has become (??{perl-code}).
4523
4524 From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to
4525 the appropriate numbered brackets. This includes both recursive and
4526 non-recursive calls. (?R) is now synonymous with (?0). */
4527
4528 case 'R':
4529 ptr++;
4530
4531 case '0': case '1': case '2': case '3': case '4':
4532 case '5': case '6': case '7': case '8': case '9':
4533 ptr += 2;
4534 if (c != 'R')
4535 while ((digitab[*(++ptr)] & ctype_digit) != 0);
4536 if (*ptr != ')')
4537 {
4538 errorcode = ERR29;
4539 goto PCRE_ERROR_RETURN;
4540 }
4541 length += 3 + 3*LINK_SIZE; /* Allows for the automatic "once" */
4542
4543 /* If this item is quantified, it will get wrapped inside brackets so
4544 as to use the code for quantified brackets. We jump down and use the
4545 code that handles this for real brackets. */
4546
4547 if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')
4548 {
4549 length += 2 + 2 * LINK_SIZE; /* to make bracketed */
4550 duplength = 5 + 3 * LINK_SIZE;
4551 goto HANDLE_QUANTIFIED_BRACKETS;
4552 }
4553 continue;
4554
4555 /* (?C) is an extension which provides "callout" - to provide a bit of
4556 the functionality of the Perl (?{...}) feature. An optional number may
4557 follow (default is zero). */
4558
4559 case 'C':
4560 ptr += 2;
4561 while ((digitab[*(++ptr)] & ctype_digit) != 0);
4562 if (*ptr != ')')
4563 {
4564 errorcode = ERR39;
4565 goto PCRE_ERROR_RETURN;
4566 }
4567 length += 2 + 2*LINK_SIZE;
4568 continue;
4569
4570 /* Named subpatterns are an extension copied from Python */
4571
4572 case 'P':
4573 ptr += 3;
4574
4575 /* Handle the definition of a named subpattern */
4576
4577 if (*ptr == '<')
4578 {
4579 const uschar *p; /* Don't amalgamate; some compilers */
4580 p = ++ptr; /* grumble at autoincrement in declaration */
4581 while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;
4582 if (*ptr != '>')
4583 {
4584 errorcode = ERR42;
4585 goto PCRE_ERROR_RETURN;
4586 }
4587 name_count++;
4588 if (ptr - p > max_name_size) max_name_size = (ptr - p);
4589 capturing = TRUE; /* Named parentheses are always capturing */
4590 break;
4591 }
4592
4593 /* Handle back references and recursive calls to named subpatterns */
4594
4595 if (*ptr == '=' || *ptr == '>')
4596 {
4597 length += 2 + 2*LINK_SIZE; /* Allow for the automatic "once" */
4598 while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0);
4599 if (*ptr != ')')
4600 {
4601 errorcode = ERR42;
4602 goto PCRE_ERROR_RETURN;
4603 }
4604 break;
4605 }
4606
4607 /* Unknown character after (?P */
4608
4609 errorcode = ERR41;
4610 goto PCRE_ERROR_RETURN;
4611
4612 /* Lookbehinds are in Perl from version 5.005 */
4613
4614 case '<':
4615 ptr += 3;
4616 if (*ptr == '=' || *ptr == '!')
4617 {
4618 branch_newextra = 1 + LINK_SIZE;
4619 length += 1 + LINK_SIZE; /* For the first branch */
4620 break;
4621 }
4622 errorcode = ERR24;
4623 goto PCRE_ERROR_RETURN;
4624
4625 /* Conditionals are in Perl from version 5.005. The bracket must either
4626 be followed by a number (for bracket reference) or by an assertion
4627 group, or (a PCRE extension) by 'R' for a recursion test. */
4628
4629 case '(':
4630 if (ptr[3] == 'R' && ptr[4] == ')')
4631 {
4632 ptr += 4;
4633 length += 3;
4634 }
4635 else if ((digitab[ptr[3]] & ctype_digit) != 0)
4636 {
4637 ptr += 4;
4638 length += 3;
4639 while ((digitab[*ptr] & ctype_digit) != 0) ptr++;
4640 if (*ptr != ')')
4641 {
4642 errorcode = ERR26;
4643 goto PCRE_ERROR_RETURN;
4644 }
4645 }
4646 else /* An assertion must follow */
4647 {
4648 ptr++; /* Can treat like ':' as far as spacing is concerned */
4649 if (ptr[2] != '?' ||
4650 (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
4651 {
4652 ptr += 2; /* To get right offset in message */
4653 errorcode = ERR28;
4654 goto PCRE_ERROR_RETURN;
4655 }
4656 }
4657 break;
4658
4659 /* Else loop checking valid options until ) is met. Anything else is an
4660 error. If we are without any brackets, i.e. at top level, the settings
4661 act as if specified in the options, so massage the options immediately.
4662 This is for backward compatibility with Perl 5.004. */
4663
4664 default:
4665 set = unset = 0;
4666 optset = &set;
4667 ptr += 2;
4668
4669 for (;; ptr++)
4670 {
4671 c = *ptr;
4672 switch (c)
4673 {
4674 case 'i':
4675 *optset |= PCRE_CASELESS;
4676 continue;
4677
4678 case 'm':
4679 *optset |= PCRE_MULTILINE;
4680 continue;
4681
4682 case 's':
4683 *optset |= PCRE_DOTALL;
4684 continue;
4685
4686 case 'x':
4687 *optset |= PCRE_EXTENDED;
4688 continue;
4689
4690 case 'X':
4691 *optset |= PCRE_EXTRA;
4692 continue;
4693
4694 case 'U':
4695 *optset |= PCRE_UNGREEDY;
4696 continue;
4697
4698 case '-':
4699 optset = &unset;
4700 continue;
4701
4702 /* A termination by ')' indicates an options-setting-only item; if
4703 this is at the very start of the pattern (indicated by item_count
4704 being zero), we use it to set the global options. This is helpful
4705 when analyzing the pattern for first characters, etc. Otherwise
4706 nothing is done here and it is handled during the compiling
4707 process.
4708
4709 We allow for more than one options setting at the start. If such
4710 settings do not change the existing options, nothing is compiled.
4711 However, we must leave space just in case something is compiled.
4712 This can happen for pathological sequences such as (?i)(?-i)
4713 because the global options will end up with -i set. The space is
4714 small and not significant. (Before I did this there was a reported
4715 bug with (?i)(?-i) in a machine-generated pattern.)
4716
4717 [Historical note: Up to Perl 5.8, options settings at top level
4718 were always global settings, wherever they appeared in the pattern.
4719 That is, they were equivalent to an external setting. From 5.8
4720 onwards, they apply only to what follows (which is what you might
4721 expect).] */
4722
4723 case ')':
4724 if (item_count == 0)
4725 {
4726 options = (options | set) & (~unset);
4727 set = unset = 0; /* To save length */
4728 item_count--; /* To allow for several */
4729 length += 2;
4730 }
4731
4732 /* Fall through */
4733
4734 /* A termination by ':' indicates the start of a nested group with
4735 the given options set. This is again handled at compile time, but
4736 we must allow for compiled space if any of the ims options are
4737 set. We also have to allow for resetting space at the end of
4738 the group, which is why 4 is added to the length and not just 2.
4739 If there are several changes of options within the same group, this
4740 will lead to an over-estimate on the length, but this shouldn't
4741 matter very much. We also have to allow for resetting options at
4742 the start of any alternations, which we do by setting
4743 branch_newextra to 2. Finally, we record whether the case-dependent
4744 flag ever changes within the regex. This is used by the "required
4745 character" code. */
4746
4747 case ':':
4748 if (((set|unset) & PCRE_IMS) != 0)
4749 {
4750 length += 4;
4751 branch_newextra = 2;
4752 if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
4753 }
4754 goto END_OPTIONS;
4755
4756 /* Unrecognized option character */
4757
4758 default:
4759 errorcode = ERR12;
4760 goto PCRE_ERROR_RETURN;
4761 }
4762 }
4763
4764 /* If we hit a closing bracket, that's it - this is a freestanding
4765 option-setting. We need to ensure that branch_extra is updated if
4766 necessary. The only values branch_newextra can have here are 0 or 2.
4767 If the value is 2, then branch_extra must either be 2 or 5, depending
4768 on whether this is a lookbehind group or not. */
4769
4770 END_OPTIONS:
4771 if (c == ')')
4772 {
4773 if (branch_newextra == 2 &&
4774 (branch_extra == 0 || branch_extra == 1+LINK_SIZE))
4775 branch_extra += branch_newextra;
4776 continue;
4777 }
4778
4779 /* If options were terminated by ':' control comes here. This is a
4780 non-capturing group with an options change. There is nothing more that
4781 needs to be done because "capturing" is already set FALSE by default;
4782 we can just fall through. */
4783
4784 }
4785 }
4786
4787 /* Ordinary parentheses, not followed by '?', are capturing unless
4788 PCRE_NO_AUTO_CAPTURE is set. */
4789
4790 else capturing = (options & PCRE_NO_AUTO_CAPTURE) == 0;
4791
4792 /* Capturing brackets must be counted so we can process escapes in a
4793 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to need
4794 an additional 3 bytes of memory per capturing bracket. */
4795
4796 if (capturing)
4797 {
4798 bracount++;
4799 if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
4800 }
4801
4802 /* Save length for computing whole length at end if there's a repeat that
4803 requires duplication of the group. Also save the current value of
4804 branch_extra, and start the new group with the new value. If non-zero, this
4805 will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
4806
4807 if (brastackptr >= sizeof(brastack)/sizeof(int))
4808 {
4809 errorcode = ERR19;
4810 goto PCRE_ERROR_RETURN;
4811 }
4812
4813 bralenstack[brastackptr] = branch_extra;
4814 branch_extra = branch_newextra;
4815
4816 brastack[brastackptr++] = length;
4817 length += bracket_length;
4818 continue;
4819
4820 /* Handle ket. Look for subsequent max/min; for certain sets of values we
4821 have to replicate this bracket up to that many times. If brastackptr is
4822 0 this is an unmatched bracket which will generate an error, but take care
4823 not to try to access brastack[-1] when computing the length and restoring
4824 the branch_extra value. */
4825
4826 case ')':
4827 length += 1 + LINK_SIZE;
4828 if (brastackptr > 0)
4829 {
4830 duplength = length - brastack[--brastackptr];
4831 branch_extra = bralenstack[brastackptr];
4832 }
4833 else duplength = 0;
4834
4835 /* The following code is also used when a recursion such as (?3) is
4836 followed by a quantifier, because in that case, it has to be wrapped inside
4837 brackets so that the quantifier works. The value of duplength must be
4838 set before arrival. */
4839
4840 HANDLE_QUANTIFIED_BRACKETS:
4841
4842 /* Leave ptr at the final char; for read_repeat_counts this happens
4843 automatically; for the others we need an increment. */
4844
4845 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
4846 {
4847 ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
4848 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4849 }
4850 else if (c == '*') { min = 0; max = -1; ptr++; }
4851 else if (c == '+') { min = 1; max = -1; ptr++; }
4852 else if (c == '?') { min = 0; max = 1; ptr++; }
4853 else { min = 1; max = 1; }
4854
4855 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
4856 group, and if the maximum is greater than zero, we have to replicate
4857 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
4858 bracket set. */
4859
4860 if (min == 0)
4861 {
4862 length++;
4863 if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
4864 }
4865
4866 /* When the minimum is greater than zero, we have to replicate up to
4867 minval-1 times, with no additions required in the copies. Then, if there
4868 is a limited maximum we have to replicate up to maxval-1 times allowing
4869 for a BRAZERO item before each optional copy and nesting brackets for all
4870 but one of the optional copies. */
4871
4872 else
4873 {
4874 length += (min - 1) * duplength;
4875 if (max > min) /* Need this test as max=-1 means no limit */
4876 length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
4877 - (2 + 2*LINK_SIZE);
4878 }
4879
4880 /* Allow space for once brackets for "possessive quantifier" */
4881
4882 if (ptr[1] == '+')
4883 {
4884 ptr++;
4885 length += 2 + 2*LINK_SIZE;
4886 }
4887 continue;
4888
4889 /* Non-special character. It won't be space or # in extended mode, so it is
4890 always a genuine character. If we are in a \Q...\E sequence, check for the
4891 end; if not, we have a literal. */
4892
4893 default:
4894 NORMAL_CHAR:
4895
4896 if (inescq && c == '\\' && ptr[1] == 'E')
4897 {
4898 inescq = FALSE;
4899 ptr++;
4900 continue;
4901 }
4902
4903 length += 2; /* For a one-byte character */
4904 lastitemlength = 1; /* Default length of last item for repeats */
4905
4906 /* In UTF-8 mode, check for additional bytes. */
4907
4908 #ifdef SUPPORT_UTF8
4909 if (utf8 && (c & 0xc0) == 0xc0)
4910 {
4911 while ((ptr[1] & 0xc0) == 0x80) /* Can't flow over the end */
4912 { /* because the end is marked */
4913 lastitemlength++; /* by a zero byte. */
4914 length++;
4915 ptr++;
4916 }
4917 }
4918 #endif
4919
4920 continue;
4921 }
4922 }
4923
4924 length += 2 + LINK_SIZE; /* For final KET and END */
4925
4926 if ((options & PCRE_AUTO_CALLOUT) != 0)
4927 length += 2 + 2*LINK_SIZE; /* For final callout */
4928
4929 if (length > MAX_PATTERN_SIZE)
4930 {
4931 errorcode = ERR20;
4932 goto PCRE_EARLY_ERROR_RETURN;
4933 }
4934
4935 /* Compute the size of data block needed and get it, either from malloc or
4936 externally provided function. */
4937
4938 size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
4939 re = (real_pcre *)(pcre_malloc)(size);
4940
4941 if (re == NULL)
4942 {
4943 errorcode = ERR21;
4944 goto PCRE_EARLY_ERROR_RETURN;
4945 }
4946
4947 /* Put in the magic number, and save the sizes, options, and character table
4948 pointer. NULL is used for the default character tables. The nullpad field is at
4949 the end; it's there to help in the case when a regex compiled on a system with
4950 4-byte pointers is run on another with 8-byte pointers. */
4951
4952 re->magic_number = MAGIC_NUMBER;
4953 re->size = size;
4954 re->options = options;
4955 re->dummy1 = 0;
4956 re->name_table_offset = sizeof(real_pcre);
4957 re->name_entry_size = max_name_size + 3;
4958 re->name_count = name_count;
4959 re->ref_count = 0;
4960 re->tables = (tables == _pcre_default_tables)? NULL : tables;
4961 re->nullpad = NULL;
4962
4963 /* The starting points of the name/number translation table and of the code are
4964 passed around in the compile data block. */
4965
4966 compile_block.names_found = 0;
4967 compile_block.name_entry_size = max_name_size + 3;
4968 compile_block.name_table = (uschar *)re + re->name_table_offset;
4969 codestart = compile_block.name_table + re->name_entry_size * re->name_count;
4970 compile_block.start_code = codestart;
4971 compile_block.start_pattern = (const uschar *)pattern;
4972 compile_block.req_varyopt = 0;
4973 compile_block.nopartial = FALSE;
4974
4975 /* Set up a starting, non-extracting bracket, then compile the expression. On
4976 error, errorcode will be set non-zero, so we don't need to look at the result
4977 of the function here. */
4978
4979 ptr = (const uschar *)pattern;
4980 code = (uschar *)codestart;
4981 *code = OP_BRA;
4982 bracount = 0;
4983 (void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,
4984 &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, &compile_block);
4985 re->top_bracket = bracount;
4986 re->top_backref = compile_block.top_backref;
4987
4988 if (compile_block.nopartial) re->options |= PCRE_NOPARTIAL;
4989
4990 /* If not reached end of pattern on success, there's an excess bracket. */
4991
4992 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
4993
4994 /* Fill in the terminating state and check for disastrous overflow, but
4995 if debugging, leave the test till after things are printed out. */
4996
4997 *code++ = OP_END;
4998
4999 #ifndef DEBUG
5000 if (code - codestart > length) errorcode = ERR23;
5001 #endif
5002
5003 /* Give an error if there's back reference to a non-existent capturing
5004 subpattern. */
5005
5006 if (re->top_backref > re->top_bracket) errorcode = ERR15;
5007
5008 /* Failed to compile, or error while post-processing */
5009
5010 if (errorcode != 0)
5011 {
5012 (pcre_free)(re);
5013 PCRE_ERROR_RETURN:
5014 *erroroffset = ptr - (const uschar *)pattern;
5015 PCRE_EARLY_ERROR_RETURN:
5016 *errorptr = error_texts[errorcode];
5017 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
5018 return NULL;
5019 }
5020
5021 /* If the anchored option was not passed, set the flag if we can determine that
5022 the pattern is anchored by virtue of ^ characters or \A or anything else (such
5023 as starting with .* when DOTALL is set).
5024
5025 Otherwise, if we know what the first character has to be, save it, because that
5026 speeds up unanchored matches no end. If not, see if we can set the
5027 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
5028 start with ^, and also when all branches start with .* for non-DOTALL matches.
5029 */
5030
5031 if ((options & PCRE_ANCHORED) == 0)
5032 {
5033 int temp_options = options;
5034 if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map))
5035 re->options |= PCRE_ANCHORED;
5036 else
5037 {
5038 if (firstbyte < 0)
5039 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
5040 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
5041 {
5042 int ch = firstbyte & 255;
5043 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
5044 compile_block.fcc[ch] == ch)? ch : firstbyte;
5045 re->options |= PCRE_FIRSTSET;
5046 }
5047 else if (is_startline(codestart, 0, compile_block.backref_map))
5048 re->options |= PCRE_STARTLINE;
5049 }
5050 }
5051
5052 /* For an anchored pattern, we use the "required byte" only if it follows a
5053 variable length item in the regex. Remove the caseless flag for non-caseable
5054 bytes. */
5055
5056 if (reqbyte >= 0 &&
5057 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
5058 {
5059 int ch = reqbyte & 255;
5060 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
5061 compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5062 re->options |= PCRE_REQCHSET;
5063 }
5064
5065 /* Print out the compiled data if debugging is enabled. This is never the
5066 case when building a production library. */
5067
5068 #ifdef DEBUG
5069
5070 printf("Length = %d top_bracket = %d top_backref = %d\n",
5071 length, re->top_bracket, re->top_backref);
5072
5073 if (re->options != 0)
5074 {
5075 printf("%s%s%s%s%s%s%s%s%s%s\n",
5076 ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5077 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5078 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5079 ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
5080 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5081 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5082 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5083 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5084 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5085 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5086 }
5087
5088 if ((re->options & PCRE_FIRSTSET) != 0)
5089 {
5090 int ch = re->first_byte & 255;
5091 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
5092 "" : " (caseless)";
5093 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5094 else printf("First char = \\x%02x%s\n", ch, caseless);
5095 }
5096
5097 if ((re->options & PCRE_REQCHSET) != 0)
5098 {
5099 int ch = re->req_byte & 255;
5100 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
5101 "" : " (caseless)";
5102 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5103 else printf("Req char = \\x%02x%s\n", ch, caseless);
5104 }
5105
5106 pcre_printint(re, stdout);
5107
5108 /* This check is done here in the debugging case so that the code that
5109 was compiled can be seen. */
5110
5111 if (code - codestart > length)
5112 {
5113 (pcre_free)(re);
5114 *errorptr = error_texts[ERR23];
5115 *erroroffset = ptr - (uschar *)pattern;
5116 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
5117 return NULL;
5118 }
5119 #endif
5120
5121 return (pcre *)re;
5122 }
5123
5124 /* End of pcre_compile.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12