/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 79 - (show annotations) (download)
Sat Feb 24 21:40:52 2007 UTC (7 years, 8 months ago) by nigel
File MIME type: text/plain
File size: 159122 byte(s)
Load pcre-6.1 into code/trunk.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2005 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #include "pcre_internal.h"
46
47
48 /*************************************************
49 * Code parameters and static tables *
50 *************************************************/
51
52 /* Maximum number of items on the nested bracket stacks at compile time. This
53 applies to the nesting of all kinds of parentheses. It does not limit
54 un-nested, non-capturing parentheses. This number can be made bigger if
55 necessary - it is used to dimension one int and one unsigned char vector at
56 compile time. */
57
58 #define BRASTACK_SIZE 200
59
60
61 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
62 are simple data values; negative values are for special things like \d and so
63 on. Zero means further processing is needed (for things like \x), or the escape
64 is invalid. */
65
66 #if !EBCDIC /* This is the "normal" table for ASCII systems */
67 static const short int escapes[] = {
68 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
69 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
70 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
71 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
72 -ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
73 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
74 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
75 0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
76 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
77 0, 0, -ESC_z /* x - z */
78 };
79
80 #else /* This is the "abnormal" table for EBCDIC systems */
81 static const short int escapes[] = {
82 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
83 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
84 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
85 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
86 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
87 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
88 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
89 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
90 /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
91 /* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p,
92 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
93 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
94 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
95 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
96 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
97 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
98 /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
99 /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
100 /* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0,
101 /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
102 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
103 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
104 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
105 };
106 #endif
107
108
109 /* Tables of names of POSIX character classes and their lengths. The list is
110 terminated by a zero length entry. The first three must be alpha, lower, upper,
111 as this is assumed for handling case independence. */
112
113 static const char *const posix_names[] = {
114 "alpha", "lower", "upper",
115 "alnum", "ascii", "blank", "cntrl", "digit", "graph",
116 "print", "punct", "space", "word", "xdigit" };
117
118 static const uschar posix_name_lengths[] = {
119 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
120
121 /* Table of class bit maps for each POSIX class; up to three may be combined
122 to form the class. The table for [:blank:] is dynamically modified to remove
123 the vertical space characters. */
124
125 static const int posix_class_maps[] = {
126 cbit_lower, cbit_upper, -1, /* alpha */
127 cbit_lower, -1, -1, /* lower */
128 cbit_upper, -1, -1, /* upper */
129 cbit_digit, cbit_lower, cbit_upper, /* alnum */
130 cbit_print, cbit_cntrl, -1, /* ascii */
131 cbit_space, -1, -1, /* blank - a GNU extension */
132 cbit_cntrl, -1, -1, /* cntrl */
133 cbit_digit, -1, -1, /* digit */
134 cbit_graph, -1, -1, /* graph */
135 cbit_print, -1, -1, /* print */
136 cbit_punct, -1, -1, /* punct */
137 cbit_space, -1, -1, /* space */
138 cbit_word, -1, -1, /* word - a Perl extension */
139 cbit_xdigit,-1, -1 /* xdigit */
140 };
141
142
143 /* The texts of compile-time error messages. These are "char *" because they
144 are passed to the outside world. */
145
146 static const char *error_texts[] = {
147 "no error",
148 "\\ at end of pattern",
149 "\\c at end of pattern",
150 "unrecognized character follows \\",
151 "numbers out of order in {} quantifier",
152 /* 5 */
153 "number too big in {} quantifier",
154 "missing terminating ] for character class",
155 "invalid escape sequence in character class",
156 "range out of order in character class",
157 "nothing to repeat",
158 /* 10 */
159 "operand of unlimited repeat could match the empty string",
160 "internal error: unexpected repeat",
161 "unrecognized character after (?",
162 "POSIX named classes are supported only within a class",
163 "missing )",
164 /* 15 */
165 "reference to non-existent subpattern",
166 "erroffset passed as NULL",
167 "unknown option bit(s) set",
168 "missing ) after comment",
169 "parentheses nested too deeply",
170 /* 20 */
171 "regular expression too large",
172 "failed to get memory",
173 "unmatched parentheses",
174 "internal error: code overflow",
175 "unrecognized character after (?<",
176 /* 25 */
177 "lookbehind assertion is not fixed length",
178 "malformed number after (?(",
179 "conditional group contains more than two branches",
180 "assertion expected after (?(",
181 "(?R or (?digits must be followed by )",
182 /* 30 */
183 "unknown POSIX class name",
184 "POSIX collating elements are not supported",
185 "this version of PCRE is not compiled with PCRE_UTF8 support",
186 "spare error",
187 "character value in \\x{...} sequence is too large",
188 /* 35 */
189 "invalid condition (?(0)",
190 "\\C not allowed in lookbehind assertion",
191 "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
192 "number after (?C is > 255",
193 "closing ) for (?C expected",
194 /* 40 */
195 "recursive call could loop indefinitely",
196 "unrecognized character after (?P",
197 "syntax error after (?P",
198 "two named groups have the same name",
199 "invalid UTF-8 string",
200 /* 45 */
201 "support for \\P, \\p, and \\X has not been compiled",
202 "malformed \\P or \\p sequence",
203 "unknown property name after \\P or \\p"
204 };
205
206
207 /* Table to identify digits and hex digits. This is used when compiling
208 patterns. Note that the tables in chartables are dependent on the locale, and
209 may mark arbitrary characters as digits - but the PCRE compiling code expects
210 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
211 a private table here. It costs 256 bytes, but it is a lot faster than doing
212 character value tests (at least in some simple cases I timed), and in some
213 applications one wants PCRE to compile efficiently as well as match
214 efficiently.
215
216 For convenience, we use the same bit definitions as in chartables:
217
218 0x04 decimal digit
219 0x08 hexadecimal digit
220
221 Then we can use ctype_digit and ctype_xdigit in the code. */
222
223 #if !EBCDIC /* This is the "normal" case, for ASCII systems */
224 static const unsigned char digitab[] =
225 {
226 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
227 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
228 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
229 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
230 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
231 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
232 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
233 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
234 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
235 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
236 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
237 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
238 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
239 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
240 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
241 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
242 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
243 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
244 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
245 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
246 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
247 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
248 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
249 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
250 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
251 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
252 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
253 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
254 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
255 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
256 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
257 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
258
259 #else /* This is the "abnormal" case, for EBCDIC systems */
260 static const unsigned char digitab[] =
261 {
262 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
263 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
264 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
265 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
266 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
267 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
268 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
269 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
270 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
271 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
272 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
273 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- */
274 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
275 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
276 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
277 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
278 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
279 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
280 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
281 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
282 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
283 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
284 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
285 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
286 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
287 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
288 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
289 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
290 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
291 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
292 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
293 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
294
295 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
296 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
297 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
298 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
299 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
300 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
301 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
302 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
303 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
304 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
305 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
306 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
307 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- */
308 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
309 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
310 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
311 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
312 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
313 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
314 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
315 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
316 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
317 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
318 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
319 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
320 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
321 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
322 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
323 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
324 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
325 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
326 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
327 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
328 #endif
329
330
331 /* Definition to allow mutual recursion */
332
333 static BOOL
334 compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,
335 int *, int *, branch_chain *, compile_data *);
336
337
338
339 /*************************************************
340 * Handle escapes *
341 *************************************************/
342
343 /* This function is called when a \ has been encountered. It either returns a
344 positive value for a simple escape such as \n, or a negative value which
345 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
346 a positive value greater than 255 may be returned. On entry, ptr is pointing at
347 the \. On exit, it is on the final character of the escape sequence.
348
349 Arguments:
350 ptrptr points to the pattern position pointer
351 errorcodeptr points to the errorcode variable
352 bracount number of previous extracting brackets
353 options the options bits
354 isclass TRUE if inside a character class
355
356 Returns: zero or positive => a data character
357 negative => a special escape sequence
358 on error, errorptr is set
359 */
360
361 static int
362 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
363 int options, BOOL isclass)
364 {
365 const uschar *ptr = *ptrptr;
366 int c, i;
367
368 /* If backslash is at the end of the pattern, it's an error. */
369
370 c = *(++ptr);
371 if (c == 0) *errorcodeptr = ERR1;
372
373 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
374 a table. A non-zero result is something that can be returned immediately.
375 Otherwise further processing may be required. */
376
377 #if !EBCDIC /* ASCII coding */
378 else if (c < '0' || c > 'z') {} /* Not alphameric */
379 else if ((i = escapes[c - '0']) != 0) c = i;
380
381 #else /* EBCDIC coding */
382 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
383 else if ((i = escapes[c - 0x48]) != 0) c = i;
384 #endif
385
386 /* Escapes that need further processing, or are illegal. */
387
388 else
389 {
390 const uschar *oldptr;
391 switch (c)
392 {
393 /* A number of Perl escapes are not handled by PCRE. We give an explicit
394 error. */
395
396 case 'l':
397 case 'L':
398 case 'N':
399 case 'u':
400 case 'U':
401 *errorcodeptr = ERR37;
402 break;
403
404 /* The handling of escape sequences consisting of a string of digits
405 starting with one that is not zero is not straightforward. By experiment,
406 the way Perl works seems to be as follows:
407
408 Outside a character class, the digits are read as a decimal number. If the
409 number is less than 10, or if there are that many previous extracting
410 left brackets, then it is a back reference. Otherwise, up to three octal
411 digits are read to form an escaped byte. Thus \123 is likely to be octal
412 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
413 value is greater than 377, the least significant 8 bits are taken. Inside a
414 character class, \ followed by a digit is always an octal number. */
415
416 case '1': case '2': case '3': case '4': case '5':
417 case '6': case '7': case '8': case '9':
418
419 if (!isclass)
420 {
421 oldptr = ptr;
422 c -= '0';
423 while ((digitab[ptr[1]] & ctype_digit) != 0)
424 c = c * 10 + *(++ptr) - '0';
425 if (c < 10 || c <= bracount)
426 {
427 c = -(ESC_REF + c);
428 break;
429 }
430 ptr = oldptr; /* Put the pointer back and fall through */
431 }
432
433 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
434 generates a binary zero byte and treats the digit as a following literal.
435 Thus we have to pull back the pointer by one. */
436
437 if ((c = *ptr) >= '8')
438 {
439 ptr--;
440 c = 0;
441 break;
442 }
443
444 /* \0 always starts an octal number, but we may drop through to here with a
445 larger first octal digit. */
446
447 case '0':
448 c -= '0';
449 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
450 c = c * 8 + *(++ptr) - '0';
451 c &= 255; /* Take least significant 8 bits */
452 break;
453
454 /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
455 which can be greater than 0xff, but only if the ddd are hex digits. */
456
457 case 'x':
458 #ifdef SUPPORT_UTF8
459 if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
460 {
461 const uschar *pt = ptr + 2;
462 register int count = 0;
463 c = 0;
464 while ((digitab[*pt] & ctype_xdigit) != 0)
465 {
466 int cc = *pt++;
467 count++;
468 #if !EBCDIC /* ASCII coding */
469 if (cc >= 'a') cc -= 32; /* Convert to upper case */
470 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
471 #else /* EBCDIC coding */
472 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
473 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
474 #endif
475 }
476 if (*pt == '}')
477 {
478 if (c < 0 || count > 8) *errorcodeptr = ERR34;
479 ptr = pt;
480 break;
481 }
482 /* If the sequence of hex digits does not end with '}', then we don't
483 recognize this construct; fall through to the normal \x handling. */
484 }
485 #endif
486
487 /* Read just a single hex char */
488
489 c = 0;
490 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
491 {
492 int cc; /* Some compilers don't like ++ */
493 cc = *(++ptr); /* in initializers */
494 #if !EBCDIC /* ASCII coding */
495 if (cc >= 'a') cc -= 32; /* Convert to upper case */
496 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
497 #else /* EBCDIC coding */
498 if (cc <= 'z') cc += 64; /* Convert to upper case */
499 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
500 #endif
501 }
502 break;
503
504 /* Other special escapes not starting with a digit are straightforward */
505
506 case 'c':
507 c = *(++ptr);
508 if (c == 0)
509 {
510 *errorcodeptr = ERR2;
511 return 0;
512 }
513
514 /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
515 is ASCII-specific, but then the whole concept of \cx is ASCII-specific.
516 (However, an EBCDIC equivalent has now been added.) */
517
518 #if !EBCDIC /* ASCII coding */
519 if (c >= 'a' && c <= 'z') c -= 32;
520 c ^= 0x40;
521 #else /* EBCDIC coding */
522 if (c >= 'a' && c <= 'z') c += 64;
523 c ^= 0xC0;
524 #endif
525 break;
526
527 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
528 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
529 for Perl compatibility, it is a literal. This code looks a bit odd, but
530 there used to be some cases other than the default, and there may be again
531 in future, so I haven't "optimized" it. */
532
533 default:
534 if ((options & PCRE_EXTRA) != 0) switch(c)
535 {
536 default:
537 *errorcodeptr = ERR3;
538 break;
539 }
540 break;
541 }
542 }
543
544 *ptrptr = ptr;
545 return c;
546 }
547
548
549
550 #ifdef SUPPORT_UCP
551 /*************************************************
552 * Handle \P and \p *
553 *************************************************/
554
555 /* This function is called after \P or \p has been encountered, provided that
556 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
557 pointing at the P or p. On exit, it is pointing at the final character of the
558 escape sequence.
559
560 Argument:
561 ptrptr points to the pattern position pointer
562 negptr points to a boolean that is set TRUE for negation else FALSE
563 errorcodeptr points to the error code variable
564
565 Returns: value from ucp_type_table, or -1 for an invalid type
566 */
567
568 static int
569 get_ucp(const uschar **ptrptr, BOOL *negptr, int *errorcodeptr)
570 {
571 int c, i, bot, top;
572 const uschar *ptr = *ptrptr;
573 char name[4];
574
575 c = *(++ptr);
576 if (c == 0) goto ERROR_RETURN;
577
578 *negptr = FALSE;
579
580 /* \P or \p can be followed by a one- or two-character name in {}, optionally
581 preceded by ^ for negation. */
582
583 if (c == '{')
584 {
585 if (ptr[1] == '^')
586 {
587 *negptr = TRUE;
588 ptr++;
589 }
590 for (i = 0; i <= 2; i++)
591 {
592 c = *(++ptr);
593 if (c == 0) goto ERROR_RETURN;
594 if (c == '}') break;
595 name[i] = c;
596 }
597 if (c !='}') /* Try to distinguish error cases */
598 {
599 while (*(++ptr) != 0 && *ptr != '}');
600 if (*ptr == '}') goto UNKNOWN_RETURN; else goto ERROR_RETURN;
601 }
602 name[i] = 0;
603 }
604
605 /* Otherwise there is just one following character */
606
607 else
608 {
609 name[0] = c;
610 name[1] = 0;
611 }
612
613 *ptrptr = ptr;
614
615 /* Search for a recognized property name using binary chop */
616
617 bot = 0;
618 top = _pcre_utt_size;
619
620 while (bot < top)
621 {
622 i = (bot + top)/2;
623 c = strcmp(name, _pcre_utt[i].name);
624 if (c == 0) return _pcre_utt[i].value;
625 if (c > 0) bot = i + 1; else top = i;
626 }
627
628 UNKNOWN_RETURN:
629 *errorcodeptr = ERR47;
630 *ptrptr = ptr;
631 return -1;
632
633 ERROR_RETURN:
634 *errorcodeptr = ERR46;
635 *ptrptr = ptr;
636 return -1;
637 }
638 #endif
639
640
641
642
643 /*************************************************
644 * Check for counted repeat *
645 *************************************************/
646
647 /* This function is called when a '{' is encountered in a place where it might
648 start a quantifier. It looks ahead to see if it really is a quantifier or not.
649 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
650 where the ddds are digits.
651
652 Arguments:
653 p pointer to the first char after '{'
654
655 Returns: TRUE or FALSE
656 */
657
658 static BOOL
659 is_counted_repeat(const uschar *p)
660 {
661 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
662 while ((digitab[*p] & ctype_digit) != 0) p++;
663 if (*p == '}') return TRUE;
664
665 if (*p++ != ',') return FALSE;
666 if (*p == '}') return TRUE;
667
668 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
669 while ((digitab[*p] & ctype_digit) != 0) p++;
670
671 return (*p == '}');
672 }
673
674
675
676 /*************************************************
677 * Read repeat counts *
678 *************************************************/
679
680 /* Read an item of the form {n,m} and return the values. This is called only
681 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
682 so the syntax is guaranteed to be correct, but we need to check the values.
683
684 Arguments:
685 p pointer to first char after '{'
686 minp pointer to int for min
687 maxp pointer to int for max
688 returned as -1 if no max
689 errorcodeptr points to error code variable
690
691 Returns: pointer to '}' on success;
692 current ptr on error, with errorcodeptr set non-zero
693 */
694
695 static const uschar *
696 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
697 {
698 int min = 0;
699 int max = -1;
700
701 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
702
703 if (*p == '}') max = min; else
704 {
705 if (*(++p) != '}')
706 {
707 max = 0;
708 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
709 if (max < min)
710 {
711 *errorcodeptr = ERR4;
712 return p;
713 }
714 }
715 }
716
717 /* Do paranoid checks, then fill in the required variables, and pass back the
718 pointer to the terminating '}'. */
719
720 if (min > 65535 || max > 65535)
721 *errorcodeptr = ERR5;
722 else
723 {
724 *minp = min;
725 *maxp = max;
726 }
727 return p;
728 }
729
730
731
732 /*************************************************
733 * Find first significant op code *
734 *************************************************/
735
736 /* This is called by several functions that scan a compiled expression looking
737 for a fixed first character, or an anchoring op code etc. It skips over things
738 that do not influence this. For some calls, a change of option is important.
739 For some calls, it makes sense to skip negative forward and all backward
740 assertions, and also the \b assertion; for others it does not.
741
742 Arguments:
743 code pointer to the start of the group
744 options pointer to external options
745 optbit the option bit whose changing is significant, or
746 zero if none are
747 skipassert TRUE if certain assertions are to be skipped
748
749 Returns: pointer to the first significant opcode
750 */
751
752 static const uschar*
753 first_significant_code(const uschar *code, int *options, int optbit,
754 BOOL skipassert)
755 {
756 for (;;)
757 {
758 switch ((int)*code)
759 {
760 case OP_OPT:
761 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
762 *options = (int)code[1];
763 code += 2;
764 break;
765
766 case OP_ASSERT_NOT:
767 case OP_ASSERTBACK:
768 case OP_ASSERTBACK_NOT:
769 if (!skipassert) return code;
770 do code += GET(code, 1); while (*code == OP_ALT);
771 code += _pcre_OP_lengths[*code];
772 break;
773
774 case OP_WORD_BOUNDARY:
775 case OP_NOT_WORD_BOUNDARY:
776 if (!skipassert) return code;
777 /* Fall through */
778
779 case OP_CALLOUT:
780 case OP_CREF:
781 case OP_BRANUMBER:
782 code += _pcre_OP_lengths[*code];
783 break;
784
785 default:
786 return code;
787 }
788 }
789 /* Control never reaches here */
790 }
791
792
793
794
795 /*************************************************
796 * Find the fixed length of a pattern *
797 *************************************************/
798
799 /* Scan a pattern and compute the fixed length of subject that will match it,
800 if the length is fixed. This is needed for dealing with backward assertions.
801 In UTF8 mode, the result is in characters rather than bytes.
802
803 Arguments:
804 code points to the start of the pattern (the bracket)
805 options the compiling options
806
807 Returns: the fixed length, or -1 if there is no fixed length,
808 or -2 if \C was encountered
809 */
810
811 static int
812 find_fixedlength(uschar *code, int options)
813 {
814 int length = -1;
815
816 register int branchlength = 0;
817 register uschar *cc = code + 1 + LINK_SIZE;
818
819 /* Scan along the opcodes for this branch. If we get to the end of the
820 branch, check the length against that of the other branches. */
821
822 for (;;)
823 {
824 int d;
825 register int op = *cc;
826 if (op >= OP_BRA) op = OP_BRA;
827
828 switch (op)
829 {
830 case OP_BRA:
831 case OP_ONCE:
832 case OP_COND:
833 d = find_fixedlength(cc, options);
834 if (d < 0) return d;
835 branchlength += d;
836 do cc += GET(cc, 1); while (*cc == OP_ALT);
837 cc += 1 + LINK_SIZE;
838 break;
839
840 /* Reached end of a branch; if it's a ket it is the end of a nested
841 call. If it's ALT it is an alternation in a nested call. If it is
842 END it's the end of the outer call. All can be handled by the same code. */
843
844 case OP_ALT:
845 case OP_KET:
846 case OP_KETRMAX:
847 case OP_KETRMIN:
848 case OP_END:
849 if (length < 0) length = branchlength;
850 else if (length != branchlength) return -1;
851 if (*cc != OP_ALT) return length;
852 cc += 1 + LINK_SIZE;
853 branchlength = 0;
854 break;
855
856 /* Skip over assertive subpatterns */
857
858 case OP_ASSERT:
859 case OP_ASSERT_NOT:
860 case OP_ASSERTBACK:
861 case OP_ASSERTBACK_NOT:
862 do cc += GET(cc, 1); while (*cc == OP_ALT);
863 /* Fall through */
864
865 /* Skip over things that don't match chars */
866
867 case OP_REVERSE:
868 case OP_BRANUMBER:
869 case OP_CREF:
870 case OP_OPT:
871 case OP_CALLOUT:
872 case OP_SOD:
873 case OP_SOM:
874 case OP_EOD:
875 case OP_EODN:
876 case OP_CIRC:
877 case OP_DOLL:
878 case OP_NOT_WORD_BOUNDARY:
879 case OP_WORD_BOUNDARY:
880 cc += _pcre_OP_lengths[*cc];
881 break;
882
883 /* Handle literal characters */
884
885 case OP_CHAR:
886 case OP_CHARNC:
887 branchlength++;
888 cc += 2;
889 #ifdef SUPPORT_UTF8
890 if ((options & PCRE_UTF8) != 0)
891 {
892 while ((*cc & 0xc0) == 0x80) cc++;
893 }
894 #endif
895 break;
896
897 /* Handle exact repetitions. The count is already in characters, but we
898 need to skip over a multibyte character in UTF8 mode. */
899
900 case OP_EXACT:
901 branchlength += GET2(cc,1);
902 cc += 4;
903 #ifdef SUPPORT_UTF8
904 if ((options & PCRE_UTF8) != 0)
905 {
906 while((*cc & 0x80) == 0x80) cc++;
907 }
908 #endif
909 break;
910
911 case OP_TYPEEXACT:
912 branchlength += GET2(cc,1);
913 cc += 4;
914 break;
915
916 /* Handle single-char matchers */
917
918 case OP_PROP:
919 case OP_NOTPROP:
920 cc++;
921 /* Fall through */
922
923 case OP_NOT_DIGIT:
924 case OP_DIGIT:
925 case OP_NOT_WHITESPACE:
926 case OP_WHITESPACE:
927 case OP_NOT_WORDCHAR:
928 case OP_WORDCHAR:
929 case OP_ANY:
930 branchlength++;
931 cc++;
932 break;
933
934 /* The single-byte matcher isn't allowed */
935
936 case OP_ANYBYTE:
937 return -2;
938
939 /* Check a class for variable quantification */
940
941 #ifdef SUPPORT_UTF8
942 case OP_XCLASS:
943 cc += GET(cc, 1) - 33;
944 /* Fall through */
945 #endif
946
947 case OP_CLASS:
948 case OP_NCLASS:
949 cc += 33;
950
951 switch (*cc)
952 {
953 case OP_CRSTAR:
954 case OP_CRMINSTAR:
955 case OP_CRQUERY:
956 case OP_CRMINQUERY:
957 return -1;
958
959 case OP_CRRANGE:
960 case OP_CRMINRANGE:
961 if (GET2(cc,1) != GET2(cc,3)) return -1;
962 branchlength += GET2(cc,1);
963 cc += 5;
964 break;
965
966 default:
967 branchlength++;
968 }
969 break;
970
971 /* Anything else is variable length */
972
973 default:
974 return -1;
975 }
976 }
977 /* Control never gets here */
978 }
979
980
981
982
983 /*************************************************
984 * Scan compiled regex for numbered bracket *
985 *************************************************/
986
987 /* This little function scans through a compiled pattern until it finds a
988 capturing bracket with the given number.
989
990 Arguments:
991 code points to start of expression
992 utf8 TRUE in UTF-8 mode
993 number the required bracket number
994
995 Returns: pointer to the opcode for the bracket, or NULL if not found
996 */
997
998 static const uschar *
999 find_bracket(const uschar *code, BOOL utf8, int number)
1000 {
1001 #ifndef SUPPORT_UTF8
1002 utf8 = utf8; /* Stop pedantic compilers complaining */
1003 #endif
1004
1005 for (;;)
1006 {
1007 register int c = *code;
1008 if (c == OP_END) return NULL;
1009 else if (c > OP_BRA)
1010 {
1011 int n = c - OP_BRA;
1012 if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1013 if (n == number) return (uschar *)code;
1014 code += _pcre_OP_lengths[OP_BRA];
1015 }
1016 else
1017 {
1018 code += _pcre_OP_lengths[c];
1019
1020 #ifdef SUPPORT_UTF8
1021
1022 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1023 by a multi-byte character. The length in the table is a minimum, so we have
1024 to scan along to skip the extra bytes. All opcodes are less than 128, so we
1025 can use relatively efficient code. */
1026
1027 if (utf8) switch(c)
1028 {
1029 case OP_CHAR:
1030 case OP_CHARNC:
1031 case OP_EXACT:
1032 case OP_UPTO:
1033 case OP_MINUPTO:
1034 case OP_STAR:
1035 case OP_MINSTAR:
1036 case OP_PLUS:
1037 case OP_MINPLUS:
1038 case OP_QUERY:
1039 case OP_MINQUERY:
1040 while ((*code & 0xc0) == 0x80) code++;
1041 break;
1042
1043 /* XCLASS is used for classes that cannot be represented just by a bit
1044 map. This includes negated single high-valued characters. The length in
1045 the table is zero; the actual length is stored in the compiled code. */
1046
1047 case OP_XCLASS:
1048 code += GET(code, 1) + 1;
1049 break;
1050 }
1051 #endif
1052 }
1053 }
1054 }
1055
1056
1057
1058 /*************************************************
1059 * Scan compiled regex for recursion reference *
1060 *************************************************/
1061
1062 /* This little function scans through a compiled pattern until it finds an
1063 instance of OP_RECURSE.
1064
1065 Arguments:
1066 code points to start of expression
1067 utf8 TRUE in UTF-8 mode
1068
1069 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1070 */
1071
1072 static const uschar *
1073 find_recurse(const uschar *code, BOOL utf8)
1074 {
1075 #ifndef SUPPORT_UTF8
1076 utf8 = utf8; /* Stop pedantic compilers complaining */
1077 #endif
1078
1079 for (;;)
1080 {
1081 register int c = *code;
1082 if (c == OP_END) return NULL;
1083 else if (c == OP_RECURSE) return code;
1084 else if (c > OP_BRA)
1085 {
1086 code += _pcre_OP_lengths[OP_BRA];
1087 }
1088 else
1089 {
1090 code += _pcre_OP_lengths[c];
1091
1092 #ifdef SUPPORT_UTF8
1093
1094 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1095 by a multi-byte character. The length in the table is a minimum, so we have
1096 to scan along to skip the extra bytes. All opcodes are less than 128, so we
1097 can use relatively efficient code. */
1098
1099 if (utf8) switch(c)
1100 {
1101 case OP_CHAR:
1102 case OP_CHARNC:
1103 case OP_EXACT:
1104 case OP_UPTO:
1105 case OP_MINUPTO:
1106 case OP_STAR:
1107 case OP_MINSTAR:
1108 case OP_PLUS:
1109 case OP_MINPLUS:
1110 case OP_QUERY:
1111 case OP_MINQUERY:
1112 while ((*code & 0xc0) == 0x80) code++;
1113 break;
1114
1115 /* XCLASS is used for classes that cannot be represented just by a bit
1116 map. This includes negated single high-valued characters. The length in
1117 the table is zero; the actual length is stored in the compiled code. */
1118
1119 case OP_XCLASS:
1120 code += GET(code, 1) + 1;
1121 break;
1122 }
1123 #endif
1124 }
1125 }
1126 }
1127
1128
1129
1130 /*************************************************
1131 * Scan compiled branch for non-emptiness *
1132 *************************************************/
1133
1134 /* This function scans through a branch of a compiled pattern to see whether it
1135 can match the empty string or not. It is called only from could_be_empty()
1136 below. Note that first_significant_code() skips over assertions. If we hit an
1137 unclosed bracket, we return "empty" - this means we've struck an inner bracket
1138 whose current branch will already have been scanned.
1139
1140 Arguments:
1141 code points to start of search
1142 endcode points to where to stop
1143 utf8 TRUE if in UTF8 mode
1144
1145 Returns: TRUE if what is matched could be empty
1146 */
1147
1148 static BOOL
1149 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1150 {
1151 register int c;
1152 for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);
1153 code < endcode;
1154 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1155 {
1156 const uschar *ccode;
1157
1158 c = *code;
1159
1160 if (c >= OP_BRA)
1161 {
1162 BOOL empty_branch;
1163 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1164
1165 /* Scan a closed bracket */
1166
1167 empty_branch = FALSE;
1168 do
1169 {
1170 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1171 empty_branch = TRUE;
1172 code += GET(code, 1);
1173 }
1174 while (*code == OP_ALT);
1175 if (!empty_branch) return FALSE; /* All branches are non-empty */
1176 code += 1 + LINK_SIZE;
1177 c = *code;
1178 }
1179
1180 else switch (c)
1181 {
1182 /* Check for quantifiers after a class */
1183
1184 #ifdef SUPPORT_UTF8
1185 case OP_XCLASS:
1186 ccode = code + GET(code, 1);
1187 goto CHECK_CLASS_REPEAT;
1188 #endif
1189
1190 case OP_CLASS:
1191 case OP_NCLASS:
1192 ccode = code + 33;
1193
1194 #ifdef SUPPORT_UTF8
1195 CHECK_CLASS_REPEAT:
1196 #endif
1197
1198 switch (*ccode)
1199 {
1200 case OP_CRSTAR: /* These could be empty; continue */
1201 case OP_CRMINSTAR:
1202 case OP_CRQUERY:
1203 case OP_CRMINQUERY:
1204 break;
1205
1206 default: /* Non-repeat => class must match */
1207 case OP_CRPLUS: /* These repeats aren't empty */
1208 case OP_CRMINPLUS:
1209 return FALSE;
1210
1211 case OP_CRRANGE:
1212 case OP_CRMINRANGE:
1213 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1214 break;
1215 }
1216 break;
1217
1218 /* Opcodes that must match a character */
1219
1220 case OP_PROP:
1221 case OP_NOTPROP:
1222 case OP_EXTUNI:
1223 case OP_NOT_DIGIT:
1224 case OP_DIGIT:
1225 case OP_NOT_WHITESPACE:
1226 case OP_WHITESPACE:
1227 case OP_NOT_WORDCHAR:
1228 case OP_WORDCHAR:
1229 case OP_ANY:
1230 case OP_ANYBYTE:
1231 case OP_CHAR:
1232 case OP_CHARNC:
1233 case OP_NOT:
1234 case OP_PLUS:
1235 case OP_MINPLUS:
1236 case OP_EXACT:
1237 case OP_NOTPLUS:
1238 case OP_NOTMINPLUS:
1239 case OP_NOTEXACT:
1240 case OP_TYPEPLUS:
1241 case OP_TYPEMINPLUS:
1242 case OP_TYPEEXACT:
1243 return FALSE;
1244
1245 /* End of branch */
1246
1247 case OP_KET:
1248 case OP_KETRMAX:
1249 case OP_KETRMIN:
1250 case OP_ALT:
1251 return TRUE;
1252
1253 /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be
1254 followed by a multibyte character */
1255
1256 #ifdef SUPPORT_UTF8
1257 case OP_STAR:
1258 case OP_MINSTAR:
1259 case OP_QUERY:
1260 case OP_MINQUERY:
1261 case OP_UPTO:
1262 case OP_MINUPTO:
1263 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1264 break;
1265 #endif
1266 }
1267 }
1268
1269 return TRUE;
1270 }
1271
1272
1273
1274 /*************************************************
1275 * Scan compiled regex for non-emptiness *
1276 *************************************************/
1277
1278 /* This function is called to check for left recursive calls. We want to check
1279 the current branch of the current pattern to see if it could match the empty
1280 string. If it could, we must look outwards for branches at other levels,
1281 stopping when we pass beyond the bracket which is the subject of the recursion.
1282
1283 Arguments:
1284 code points to start of the recursion
1285 endcode points to where to stop (current RECURSE item)
1286 bcptr points to the chain of current (unclosed) branch starts
1287 utf8 TRUE if in UTF-8 mode
1288
1289 Returns: TRUE if what is matched could be empty
1290 */
1291
1292 static BOOL
1293 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1294 BOOL utf8)
1295 {
1296 while (bcptr != NULL && bcptr->current >= code)
1297 {
1298 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1299 bcptr = bcptr->outer;
1300 }
1301 return TRUE;
1302 }
1303
1304
1305
1306 /*************************************************
1307 * Check for POSIX class syntax *
1308 *************************************************/
1309
1310 /* This function is called when the sequence "[:" or "[." or "[=" is
1311 encountered in a character class. It checks whether this is followed by an
1312 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1313 ".]" or "=]".
1314
1315 Argument:
1316 ptr pointer to the initial [
1317 endptr where to return the end pointer
1318 cd pointer to compile data
1319
1320 Returns: TRUE or FALSE
1321 */
1322
1323 static BOOL
1324 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1325 {
1326 int terminator; /* Don't combine these lines; the Solaris cc */
1327 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1328 if (*(++ptr) == '^') ptr++;
1329 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1330 if (*ptr == terminator && ptr[1] == ']')
1331 {
1332 *endptr = ptr;
1333 return TRUE;
1334 }
1335 return FALSE;
1336 }
1337
1338
1339
1340
1341 /*************************************************
1342 * Check POSIX class name *
1343 *************************************************/
1344
1345 /* This function is called to check the name given in a POSIX-style class entry
1346 such as [:alnum:].
1347
1348 Arguments:
1349 ptr points to the first letter
1350 len the length of the name
1351
1352 Returns: a value representing the name, or -1 if unknown
1353 */
1354
1355 static int
1356 check_posix_name(const uschar *ptr, int len)
1357 {
1358 register int yield = 0;
1359 while (posix_name_lengths[yield] != 0)
1360 {
1361 if (len == posix_name_lengths[yield] &&
1362 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1363 yield++;
1364 }
1365 return -1;
1366 }
1367
1368
1369 /*************************************************
1370 * Adjust OP_RECURSE items in repeated group *
1371 *************************************************/
1372
1373 /* OP_RECURSE items contain an offset from the start of the regex to the group
1374 that is referenced. This means that groups can be replicated for fixed
1375 repetition simply by copying (because the recursion is allowed to refer to
1376 earlier groups that are outside the current group). However, when a group is
1377 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1378 it, after it has been compiled. This means that any OP_RECURSE items within it
1379 that refer to the group itself or any contained groups have to have their
1380 offsets adjusted. That is the job of this function. Before it is called, the
1381 partially compiled regex must be temporarily terminated with OP_END.
1382
1383 Arguments:
1384 group points to the start of the group
1385 adjust the amount by which the group is to be moved
1386 utf8 TRUE in UTF-8 mode
1387 cd contains pointers to tables etc.
1388
1389 Returns: nothing
1390 */
1391
1392 static void
1393 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)
1394 {
1395 uschar *ptr = group;
1396 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1397 {
1398 int offset = GET(ptr, 1);
1399 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1400 ptr += 1 + LINK_SIZE;
1401 }
1402 }
1403
1404
1405
1406 /*************************************************
1407 * Insert an automatic callout point *
1408 *************************************************/
1409
1410 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1411 callout points before each pattern item.
1412
1413 Arguments:
1414 code current code pointer
1415 ptr current pattern pointer
1416 cd pointers to tables etc
1417
1418 Returns: new code pointer
1419 */
1420
1421 static uschar *
1422 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1423 {
1424 *code++ = OP_CALLOUT;
1425 *code++ = 255;
1426 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1427 PUT(code, LINK_SIZE, 0); /* Default length */
1428 return code + 2*LINK_SIZE;
1429 }
1430
1431
1432
1433 /*************************************************
1434 * Complete a callout item *
1435 *************************************************/
1436
1437 /* A callout item contains the length of the next item in the pattern, which
1438 we can't fill in till after we have reached the relevant point. This is used
1439 for both automatic and manual callouts.
1440
1441 Arguments:
1442 previous_callout points to previous callout item
1443 ptr current pattern pointer
1444 cd pointers to tables etc
1445
1446 Returns: nothing
1447 */
1448
1449 static void
1450 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1451 {
1452 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1453 PUT(previous_callout, 2 + LINK_SIZE, length);
1454 }
1455
1456
1457
1458 #ifdef SUPPORT_UCP
1459 /*************************************************
1460 * Get othercase range *
1461 *************************************************/
1462
1463 /* This function is passed the start and end of a class range, in UTF-8 mode
1464 with UCP support. It searches up the characters, looking for internal ranges of
1465 characters in the "other" case. Each call returns the next one, updating the
1466 start address.
1467
1468 Arguments:
1469 cptr points to starting character value; updated
1470 d end value
1471 ocptr where to put start of othercase range
1472 odptr where to put end of othercase range
1473
1474 Yield: TRUE when range returned; FALSE when no more
1475 */
1476
1477 static BOOL
1478 get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)
1479 {
1480 int c, chartype, othercase, next;
1481
1482 for (c = *cptr; c <= d; c++)
1483 {
1484 if (_pcre_ucp_findchar(c, &chartype, &othercase) == ucp_L && othercase != 0)
1485 break;
1486 }
1487
1488 if (c > d) return FALSE;
1489
1490 *ocptr = othercase;
1491 next = othercase + 1;
1492
1493 for (++c; c <= d; c++)
1494 {
1495 if (_pcre_ucp_findchar(c, &chartype, &othercase) != ucp_L ||
1496 othercase != next)
1497 break;
1498 next++;
1499 }
1500
1501 *odptr = next - 1;
1502 *cptr = c;
1503
1504 return TRUE;
1505 }
1506 #endif /* SUPPORT_UCP */
1507
1508
1509 /*************************************************
1510 * Compile one branch *
1511 *************************************************/
1512
1513 /* Scan the pattern, compiling it into the code vector. If the options are
1514 changed during the branch, the pointer is used to change the external options
1515 bits.
1516
1517 Arguments:
1518 optionsptr pointer to the option bits
1519 brackets points to number of extracting brackets used
1520 codeptr points to the pointer to the current code point
1521 ptrptr points to the current pattern pointer
1522 errorcodeptr points to error code variable
1523 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
1524 reqbyteptr set to the last literal character required, else < 0
1525 bcptr points to current branch chain
1526 cd contains pointers to tables etc.
1527
1528 Returns: TRUE on success
1529 FALSE, with *errorcodeptr set non-zero on error
1530 */
1531
1532 static BOOL
1533 compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
1534 const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,
1535 int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
1536 {
1537 int repeat_type, op_type;
1538 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
1539 int bravalue = 0;
1540 int greedy_default, greedy_non_default;
1541 int firstbyte, reqbyte;
1542 int zeroreqbyte, zerofirstbyte;
1543 int req_caseopt, reqvary, tempreqvary;
1544 int condcount = 0;
1545 int options = *optionsptr;
1546 int after_manual_callout = 0;
1547 register int c;
1548 register uschar *code = *codeptr;
1549 uschar *tempcode;
1550 BOOL inescq = FALSE;
1551 BOOL groupsetfirstbyte = FALSE;
1552 const uschar *ptr = *ptrptr;
1553 const uschar *tempptr;
1554 uschar *previous = NULL;
1555 uschar *previous_callout = NULL;
1556 uschar classbits[32];
1557
1558 #ifdef SUPPORT_UTF8
1559 BOOL class_utf8;
1560 BOOL utf8 = (options & PCRE_UTF8) != 0;
1561 uschar *class_utf8data;
1562 uschar utf8_char[6];
1563 #else
1564 BOOL utf8 = FALSE;
1565 #endif
1566
1567 /* Set up the default and non-default settings for greediness */
1568
1569 greedy_default = ((options & PCRE_UNGREEDY) != 0);
1570 greedy_non_default = greedy_default ^ 1;
1571
1572 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
1573 matching encountered yet". It gets changed to REQ_NONE if we hit something that
1574 matches a non-fixed char first char; reqbyte just remains unset if we never
1575 find one.
1576
1577 When we hit a repeat whose minimum is zero, we may have to adjust these values
1578 to take the zero repeat into account. This is implemented by setting them to
1579 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
1580 item types that can be repeated set these backoff variables appropriately. */
1581
1582 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
1583
1584 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
1585 according to the current setting of the caseless flag. REQ_CASELESS is a bit
1586 value > 255. It is added into the firstbyte or reqbyte variables to record the
1587 case status of the value. This is used only for ASCII characters. */
1588
1589 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
1590
1591 /* Switch on next character until the end of the branch */
1592
1593 for (;; ptr++)
1594 {
1595 BOOL negate_class;
1596 BOOL possessive_quantifier;
1597 BOOL is_quantifier;
1598 int class_charcount;
1599 int class_lastchar;
1600 int newoptions;
1601 int recno;
1602 int skipbytes;
1603 int subreqbyte;
1604 int subfirstbyte;
1605 int mclength;
1606 uschar mcbuffer[8];
1607
1608 /* Next byte in the pattern */
1609
1610 c = *ptr;
1611
1612 /* If in \Q...\E, check for the end; if not, we have a literal */
1613
1614 if (inescq && c != 0)
1615 {
1616 if (c == '\\' && ptr[1] == 'E')
1617 {
1618 inescq = FALSE;
1619 ptr++;
1620 continue;
1621 }
1622 else
1623 {
1624 if (previous_callout != NULL)
1625 {
1626 complete_callout(previous_callout, ptr, cd);
1627 previous_callout = NULL;
1628 }
1629 if ((options & PCRE_AUTO_CALLOUT) != 0)
1630 {
1631 previous_callout = code;
1632 code = auto_callout(code, ptr, cd);
1633 }
1634 goto NORMAL_CHAR;
1635 }
1636 }
1637
1638 /* Fill in length of a previous callout, except when the next thing is
1639 a quantifier. */
1640
1641 is_quantifier = c == '*' || c == '+' || c == '?' ||
1642 (c == '{' && is_counted_repeat(ptr+1));
1643
1644 if (!is_quantifier && previous_callout != NULL &&
1645 after_manual_callout-- <= 0)
1646 {
1647 complete_callout(previous_callout, ptr, cd);
1648 previous_callout = NULL;
1649 }
1650
1651 /* In extended mode, skip white space and comments */
1652
1653 if ((options & PCRE_EXTENDED) != 0)
1654 {
1655 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1656 if (c == '#')
1657 {
1658 /* The space before the ; is to avoid a warning on a silly compiler
1659 on the Macintosh. */
1660 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
1661 if (c != 0) continue; /* Else fall through to handle end of string */
1662 }
1663 }
1664
1665 /* No auto callout for quantifiers. */
1666
1667 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
1668 {
1669 previous_callout = code;
1670 code = auto_callout(code, ptr, cd);
1671 }
1672
1673 switch(c)
1674 {
1675 /* The branch terminates at end of string, |, or ). */
1676
1677 case 0:
1678 case '|':
1679 case ')':
1680 *firstbyteptr = firstbyte;
1681 *reqbyteptr = reqbyte;
1682 *codeptr = code;
1683 *ptrptr = ptr;
1684 return TRUE;
1685
1686 /* Handle single-character metacharacters. In multiline mode, ^ disables
1687 the setting of any following char as a first character. */
1688
1689 case '^':
1690 if ((options & PCRE_MULTILINE) != 0)
1691 {
1692 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1693 }
1694 previous = NULL;
1695 *code++ = OP_CIRC;
1696 break;
1697
1698 case '$':
1699 previous = NULL;
1700 *code++ = OP_DOLL;
1701 break;
1702
1703 /* There can never be a first char if '.' is first, whatever happens about
1704 repeats. The value of reqbyte doesn't change either. */
1705
1706 case '.':
1707 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1708 zerofirstbyte = firstbyte;
1709 zeroreqbyte = reqbyte;
1710 previous = code;
1711 *code++ = OP_ANY;
1712 break;
1713
1714 /* Character classes. If the included characters are all < 255 in value, we
1715 build a 32-byte bitmap of the permitted characters, except in the special
1716 case where there is only one such character. For negated classes, we build
1717 the map as usual, then invert it at the end. However, we use a different
1718 opcode so that data characters > 255 can be handled correctly.
1719
1720 If the class contains characters outside the 0-255 range, a different
1721 opcode is compiled. It may optionally have a bit map for characters < 256,
1722 but those above are are explicitly listed afterwards. A flag byte tells
1723 whether the bitmap is present, and whether this is a negated class or not.
1724 */
1725
1726 case '[':
1727 previous = code;
1728
1729 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
1730 they are encountered at the top level, so we'll do that too. */
1731
1732 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1733 check_posix_syntax(ptr, &tempptr, cd))
1734 {
1735 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
1736 goto FAILED;
1737 }
1738
1739 /* If the first character is '^', set the negation flag and skip it. */
1740
1741 if ((c = *(++ptr)) == '^')
1742 {
1743 negate_class = TRUE;
1744 c = *(++ptr);
1745 }
1746 else
1747 {
1748 negate_class = FALSE;
1749 }
1750
1751 /* Keep a count of chars with values < 256 so that we can optimize the case
1752 of just a single character (as long as it's < 256). For higher valued UTF-8
1753 characters, we don't yet do any optimization. */
1754
1755 class_charcount = 0;
1756 class_lastchar = -1;
1757
1758 #ifdef SUPPORT_UTF8
1759 class_utf8 = FALSE; /* No chars >= 256 */
1760 class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
1761 #endif
1762
1763 /* Initialize the 32-char bit map to all zeros. We have to build the
1764 map in a temporary bit of store, in case the class contains only 1
1765 character (< 256), because in that case the compiled code doesn't use the
1766 bit map. */
1767
1768 memset(classbits, 0, 32 * sizeof(uschar));
1769
1770 /* Process characters until ] is reached. By writing this as a "do" it
1771 means that an initial ] is taken as a data character. The first pass
1772 through the regex checked the overall syntax, so we don't need to be very
1773 strict here. At the start of the loop, c contains the first byte of the
1774 character. */
1775
1776 do
1777 {
1778 #ifdef SUPPORT_UTF8
1779 if (utf8 && c > 127)
1780 { /* Braces are required because the */
1781 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
1782 }
1783 #endif
1784
1785 /* Inside \Q...\E everything is literal except \E */
1786
1787 if (inescq)
1788 {
1789 if (c == '\\' && ptr[1] == 'E')
1790 {
1791 inescq = FALSE;
1792 ptr++;
1793 continue;
1794 }
1795 else goto LONE_SINGLE_CHARACTER;
1796 }
1797
1798 /* Handle POSIX class names. Perl allows a negation extension of the
1799 form [:^name:]. A square bracket that doesn't match the syntax is
1800 treated as a literal. We also recognize the POSIX constructions
1801 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1802 5.6 and 5.8 do. */
1803
1804 if (c == '[' &&
1805 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1806 check_posix_syntax(ptr, &tempptr, cd))
1807 {
1808 BOOL local_negate = FALSE;
1809 int posix_class, i;
1810 register const uschar *cbits = cd->cbits;
1811
1812 if (ptr[1] != ':')
1813 {
1814 *errorcodeptr = ERR31;
1815 goto FAILED;
1816 }
1817
1818 ptr += 2;
1819 if (*ptr == '^')
1820 {
1821 local_negate = TRUE;
1822 ptr++;
1823 }
1824
1825 posix_class = check_posix_name(ptr, tempptr - ptr);
1826 if (posix_class < 0)
1827 {
1828 *errorcodeptr = ERR30;
1829 goto FAILED;
1830 }
1831
1832 /* If matching is caseless, upper and lower are converted to
1833 alpha. This relies on the fact that the class table starts with
1834 alpha, lower, upper as the first 3 entries. */
1835
1836 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1837 posix_class = 0;
1838
1839 /* Or into the map we are building up to 3 of the static class
1840 tables, or their negations. The [:blank:] class sets up the same
1841 chars as the [:space:] class (all white space). We remove the vertical
1842 white space chars afterwards. */
1843
1844 posix_class *= 3;
1845 for (i = 0; i < 3; i++)
1846 {
1847 BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;
1848 int taboffset = posix_class_maps[posix_class + i];
1849 if (taboffset < 0) break;
1850 if (local_negate)
1851 {
1852 if (i == 0)
1853 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];
1854 else
1855 for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];
1856 if (blankclass) classbits[1] |= 0x3c;
1857 }
1858 else
1859 {
1860 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];
1861 if (blankclass) classbits[1] &= ~0x3c;
1862 }
1863 }
1864
1865 ptr = tempptr + 1;
1866 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
1867 continue; /* End of POSIX syntax handling */
1868 }
1869
1870 /* Backslash may introduce a single character, or it may introduce one
1871 of the specials, which just set a flag. Escaped items are checked for
1872 validity in the pre-compiling pass. The sequence \b is a special case.
1873 Inside a class (and only there) it is treated as backspace. Elsewhere
1874 it marks a word boundary. Other escapes have preset maps ready to
1875 or into the one we are building. We assume they have more than one
1876 character in them, so set class_charcount bigger than one. */
1877
1878 if (c == '\\')
1879 {
1880 c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);
1881
1882 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
1883 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
1884 else if (-c == ESC_Q) /* Handle start of quoted string */
1885 {
1886 if (ptr[1] == '\\' && ptr[2] == 'E')
1887 {
1888 ptr += 2; /* avoid empty string */
1889 }
1890 else inescq = TRUE;
1891 continue;
1892 }
1893
1894 if (c < 0)
1895 {
1896 register const uschar *cbits = cd->cbits;
1897 class_charcount += 2; /* Greater than 1 is what matters */
1898 switch (-c)
1899 {
1900 case ESC_d:
1901 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
1902 continue;
1903
1904 case ESC_D:
1905 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
1906 continue;
1907
1908 case ESC_w:
1909 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
1910 continue;
1911
1912 case ESC_W:
1913 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
1914 continue;
1915
1916 case ESC_s:
1917 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
1918 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
1919 continue;
1920
1921 case ESC_S:
1922 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
1923 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
1924 continue;
1925
1926 #ifdef SUPPORT_UCP
1927 case ESC_p:
1928 case ESC_P:
1929 {
1930 BOOL negated;
1931 int property = get_ucp(&ptr, &negated, errorcodeptr);
1932 if (property < 0) goto FAILED;
1933 class_utf8 = TRUE;
1934 *class_utf8data++ = ((-c == ESC_p) != negated)?
1935 XCL_PROP : XCL_NOTPROP;
1936 *class_utf8data++ = property;
1937 class_charcount -= 2; /* Not a < 256 character */
1938 }
1939 continue;
1940 #endif
1941
1942 /* Unrecognized escapes are faulted if PCRE is running in its
1943 strict mode. By default, for compatibility with Perl, they are
1944 treated as literals. */
1945
1946 default:
1947 if ((options & PCRE_EXTRA) != 0)
1948 {
1949 *errorcodeptr = ERR7;
1950 goto FAILED;
1951 }
1952 c = *ptr; /* The final character */
1953 class_charcount -= 2; /* Undo the default count from above */
1954 }
1955 }
1956
1957 /* Fall through if we have a single character (c >= 0). This may be
1958 > 256 in UTF-8 mode. */
1959
1960 } /* End of backslash handling */
1961
1962 /* A single character may be followed by '-' to form a range. However,
1963 Perl does not permit ']' to be the end of the range. A '-' character
1964 here is treated as a literal. */
1965
1966 if (ptr[1] == '-' && ptr[2] != ']')
1967 {
1968 int d;
1969 ptr += 2;
1970
1971 #ifdef SUPPORT_UTF8
1972 if (utf8)
1973 { /* Braces are required because the */
1974 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
1975 }
1976 else
1977 #endif
1978 d = *ptr; /* Not UTF-8 mode */
1979
1980 /* The second part of a range can be a single-character escape, but
1981 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1982 in such circumstances. */
1983
1984 if (d == '\\')
1985 {
1986 const uschar *oldptr = ptr;
1987 d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);
1988
1989 /* \b is backslash; \X is literal X; any other special means the '-'
1990 was literal */
1991
1992 if (d < 0)
1993 {
1994 if (d == -ESC_b) d = '\b';
1995 else if (d == -ESC_X) d = 'X'; else
1996 {
1997 ptr = oldptr - 2;
1998 goto LONE_SINGLE_CHARACTER; /* A few lines below */
1999 }
2000 }
2001 }
2002
2003 /* The check that the two values are in the correct order happens in
2004 the pre-pass. Optimize one-character ranges */
2005
2006 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2007
2008 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2009 matching, we have to use an XCLASS with extra data items. Caseless
2010 matching for characters > 127 is available only if UCP support is
2011 available. */
2012
2013 #ifdef SUPPORT_UTF8
2014 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2015 {
2016 class_utf8 = TRUE;
2017
2018 /* With UCP support, we can find the other case equivalents of
2019 the relevant characters. There may be several ranges. Optimize how
2020 they fit with the basic range. */
2021
2022 #ifdef SUPPORT_UCP
2023 if ((options & PCRE_CASELESS) != 0)
2024 {
2025 int occ, ocd;
2026 int cc = c;
2027 int origd = d;
2028 while (get_othercase_range(&cc, origd, &occ, &ocd))
2029 {
2030 if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
2031
2032 if (occ < c && ocd >= c - 1) /* Extend the basic range */
2033 { /* if there is overlap, */
2034 c = occ; /* noting that if occ < c */
2035 continue; /* we can't have ocd > d */
2036 } /* because a subrange is */
2037 if (ocd > d && occ <= d + 1) /* always shorter than */
2038 { /* the basic range. */
2039 d = ocd;
2040 continue;
2041 }
2042
2043 if (occ == ocd)
2044 {
2045 *class_utf8data++ = XCL_SINGLE;
2046 }
2047 else
2048 {
2049 *class_utf8data++ = XCL_RANGE;
2050 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2051 }
2052 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2053 }
2054 }
2055 #endif /* SUPPORT_UCP */
2056
2057 /* Now record the original range, possibly modified for UCP caseless
2058 overlapping ranges. */
2059
2060 *class_utf8data++ = XCL_RANGE;
2061 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2062 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2063
2064 /* With UCP support, we are done. Without UCP support, there is no
2065 caseless matching for UTF-8 characters > 127; we can use the bit map
2066 for the smaller ones. */
2067
2068 #ifdef SUPPORT_UCP
2069 continue; /* With next character in the class */
2070 #else
2071 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2072
2073 /* Adjust upper limit and fall through to set up the map */
2074
2075 d = 127;
2076
2077 #endif /* SUPPORT_UCP */
2078 }
2079 #endif /* SUPPORT_UTF8 */
2080
2081 /* We use the bit map for all cases when not in UTF-8 mode; else
2082 ranges that lie entirely within 0-127 when there is UCP support; else
2083 for partial ranges without UCP support. */
2084
2085 for (; c <= d; c++)
2086 {
2087 classbits[c/8] |= (1 << (c&7));
2088 if ((options & PCRE_CASELESS) != 0)
2089 {
2090 int uc = cd->fcc[c]; /* flip case */
2091 classbits[uc/8] |= (1 << (uc&7));
2092 }
2093 class_charcount++; /* in case a one-char range */
2094 class_lastchar = c;
2095 }
2096
2097 continue; /* Go get the next char in the class */
2098 }
2099
2100 /* Handle a lone single character - we can get here for a normal
2101 non-escape char, or after \ that introduces a single character or for an
2102 apparent range that isn't. */
2103
2104 LONE_SINGLE_CHARACTER:
2105
2106 /* Handle a character that cannot go in the bit map */
2107
2108 #ifdef SUPPORT_UTF8
2109 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2110 {
2111 class_utf8 = TRUE;
2112 *class_utf8data++ = XCL_SINGLE;
2113 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2114
2115 #ifdef SUPPORT_UCP
2116 if ((options & PCRE_CASELESS) != 0)
2117 {
2118 int chartype;
2119 int othercase;
2120 if (_pcre_ucp_findchar(c, &chartype, &othercase) >= 0 &&
2121 othercase > 0)
2122 {
2123 *class_utf8data++ = XCL_SINGLE;
2124 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
2125 }
2126 }
2127 #endif /* SUPPORT_UCP */
2128
2129 }
2130 else
2131 #endif /* SUPPORT_UTF8 */
2132
2133 /* Handle a single-byte character */
2134 {
2135 classbits[c/8] |= (1 << (c&7));
2136 if ((options & PCRE_CASELESS) != 0)
2137 {
2138 c = cd->fcc[c]; /* flip case */
2139 classbits[c/8] |= (1 << (c&7));
2140 }
2141 class_charcount++;
2142 class_lastchar = c;
2143 }
2144 }
2145
2146 /* Loop until ']' reached; the check for end of string happens inside the
2147 loop. This "while" is the end of the "do" above. */
2148
2149 while ((c = *(++ptr)) != ']' || inescq);
2150
2151 /* If class_charcount is 1, we saw precisely one character whose value is
2152 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2153 can optimize the negative case only if there were no characters >= 128
2154 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2155 single-bytes only. This is an historical hangover. Maybe one day we can
2156 tidy these opcodes to handle multi-byte characters.
2157
2158 The optimization throws away the bit map. We turn the item into a
2159 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2160 that OP_NOT does not support multibyte characters. In the positive case, it
2161 can cause firstbyte to be set. Otherwise, there can be no first char if
2162 this item is first, whatever repeat count may follow. In the case of
2163 reqbyte, save the previous value for reinstating. */
2164
2165 #ifdef SUPPORT_UTF8
2166 if (class_charcount == 1 &&
2167 (!utf8 ||
2168 (!class_utf8 && (!negate_class || class_lastchar < 128))))
2169
2170 #else
2171 if (class_charcount == 1)
2172 #endif
2173 {
2174 zeroreqbyte = reqbyte;
2175
2176 /* The OP_NOT opcode works on one-byte characters only. */
2177
2178 if (negate_class)
2179 {
2180 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2181 zerofirstbyte = firstbyte;
2182 *code++ = OP_NOT;
2183 *code++ = class_lastchar;
2184 break;
2185 }
2186
2187 /* For a single, positive character, get the value into mcbuffer, and
2188 then we can handle this with the normal one-character code. */
2189
2190 #ifdef SUPPORT_UTF8
2191 if (utf8 && class_lastchar > 127)
2192 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
2193 else
2194 #endif
2195 {
2196 mcbuffer[0] = class_lastchar;
2197 mclength = 1;
2198 }
2199 goto ONE_CHAR;
2200 } /* End of 1-char optimization */
2201
2202 /* The general case - not the one-char optimization. If this is the first
2203 thing in the branch, there can be no first char setting, whatever the
2204 repeat count. Any reqbyte setting must remain unchanged after any kind of
2205 repeat. */
2206
2207 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2208 zerofirstbyte = firstbyte;
2209 zeroreqbyte = reqbyte;
2210
2211 /* If there are characters with values > 255, we have to compile an
2212 extended class, with its own opcode. If there are no characters < 256,
2213 we can omit the bitmap. */
2214
2215 #ifdef SUPPORT_UTF8
2216 if (class_utf8)
2217 {
2218 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2219 *code++ = OP_XCLASS;
2220 code += LINK_SIZE;
2221 *code = negate_class? XCL_NOT : 0;
2222
2223 /* If the map is required, install it, and move on to the end of
2224 the extra data */
2225
2226 if (class_charcount > 0)
2227 {
2228 *code++ |= XCL_MAP;
2229 memcpy(code, classbits, 32);
2230 code = class_utf8data;
2231 }
2232
2233 /* If the map is not required, slide down the extra data. */
2234
2235 else
2236 {
2237 int len = class_utf8data - (code + 33);
2238 memmove(code + 1, code + 33, len);
2239 code += len + 1;
2240 }
2241
2242 /* Now fill in the complete length of the item */
2243
2244 PUT(previous, 1, code - previous);
2245 break; /* End of class handling */
2246 }
2247 #endif
2248
2249 /* If there are no characters > 255, negate the 32-byte map if necessary,
2250 and copy it into the code vector. If this is the first thing in the branch,
2251 there can be no first char setting, whatever the repeat count. Any reqbyte
2252 setting must remain unchanged after any kind of repeat. */
2253
2254 if (negate_class)
2255 {
2256 *code++ = OP_NCLASS;
2257 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2258 }
2259 else
2260 {
2261 *code++ = OP_CLASS;
2262 memcpy(code, classbits, 32);
2263 }
2264 code += 32;
2265 break;
2266
2267 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2268 has been tested above. */
2269
2270 case '{':
2271 if (!is_quantifier) goto NORMAL_CHAR;
2272 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
2273 if (*errorcodeptr != 0) goto FAILED;
2274 goto REPEAT;
2275
2276 case '*':
2277 repeat_min = 0;
2278 repeat_max = -1;
2279 goto REPEAT;
2280
2281 case '+':
2282 repeat_min = 1;
2283 repeat_max = -1;
2284 goto REPEAT;
2285
2286 case '?':
2287 repeat_min = 0;
2288 repeat_max = 1;
2289
2290 REPEAT:
2291 if (previous == NULL)
2292 {
2293 *errorcodeptr = ERR9;
2294 goto FAILED;
2295 }
2296
2297 if (repeat_min == 0)
2298 {
2299 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2300 reqbyte = zeroreqbyte; /* Ditto */
2301 }
2302
2303 /* Remember whether this is a variable length repeat */
2304
2305 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2306
2307 op_type = 0; /* Default single-char op codes */
2308 possessive_quantifier = FALSE; /* Default not possessive quantifier */
2309
2310 /* Save start of previous item, in case we have to move it up to make space
2311 for an inserted OP_ONCE for the additional '+' extension. */
2312
2313 tempcode = previous;
2314
2315 /* If the next character is '+', we have a possessive quantifier. This
2316 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2317 If the next character is '?' this is a minimizing repeat, by default,
2318 but if PCRE_UNGREEDY is set, it works the other way round. We change the
2319 repeat type to the non-default. */
2320
2321 if (ptr[1] == '+')
2322 {
2323 repeat_type = 0; /* Force greedy */
2324 possessive_quantifier = TRUE;
2325 ptr++;
2326 }
2327 else if (ptr[1] == '?')
2328 {
2329 repeat_type = greedy_non_default;
2330 ptr++;
2331 }
2332 else repeat_type = greedy_default;
2333
2334 /* If previous was a recursion, we need to wrap it inside brackets so that
2335 it can be replicated if necessary. */
2336
2337 if (*previous == OP_RECURSE)
2338 {
2339 memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2340 code += 1 + LINK_SIZE;
2341 *previous = OP_BRA;
2342 PUT(previous, 1, code - previous);
2343 *code = OP_KET;
2344 PUT(code, 1, code - previous);
2345 code += 1 + LINK_SIZE;
2346 }
2347
2348 /* If previous was a character match, abolish the item and generate a
2349 repeat item instead. If a char item has a minimum of more than one, ensure
2350 that it is set in reqbyte - it might not be if a sequence such as x{3} is
2351 the first thing in a branch because the x will have gone into firstbyte
2352 instead. */
2353
2354 if (*previous == OP_CHAR || *previous == OP_CHARNC)
2355 {
2356 /* Deal with UTF-8 characters that take up more than one byte. It's
2357 easier to write this out separately than try to macrify it. Use c to
2358 hold the length of the character in bytes, plus 0x80 to flag that it's a
2359 length rather than a small character. */
2360
2361 #ifdef SUPPORT_UTF8
2362 if (utf8 && (code[-1] & 0x80) != 0)
2363 {
2364 uschar *lastchar = code - 1;
2365 while((*lastchar & 0xc0) == 0x80) lastchar--;
2366 c = code - lastchar; /* Length of UTF-8 character */
2367 memcpy(utf8_char, lastchar, c); /* Save the char */
2368 c |= 0x80; /* Flag c as a length */
2369 }
2370 else
2371 #endif
2372
2373 /* Handle the case of a single byte - either with no UTF8 support, or
2374 with UTF-8 disabled, or for a UTF-8 character < 128. */
2375
2376 {
2377 c = code[-1];
2378 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2379 }
2380
2381 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
2382 }
2383
2384 /* If previous was a single negated character ([^a] or similar), we use
2385 one of the special opcodes, replacing it. The code is shared with single-
2386 character repeats by setting op_type to add a suitable offset into
2387 repeat_type. OP_NOT is currently used only for single-byte chars. */
2388
2389 else if (*previous == OP_NOT)
2390 {
2391 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
2392 c = previous[1];
2393 goto OUTPUT_SINGLE_REPEAT;
2394 }
2395
2396 /* If previous was a character type match (\d or similar), abolish it and
2397 create a suitable repeat item. The code is shared with single-character
2398 repeats by setting op_type to add a suitable offset into repeat_type. Note
2399 that the Unicode property types will be present only when SUPPORT_UCP is
2400 defined, but we don't wrap the little bits of code here because it just
2401 makes it horribly messy. */
2402
2403 else if (*previous < OP_EODN)
2404 {
2405 uschar *oldcode;
2406 int prop_type;
2407 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
2408 c = *previous;
2409
2410 OUTPUT_SINGLE_REPEAT:
2411 prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?
2412 previous[1] : -1;
2413
2414 oldcode = code;
2415 code = previous; /* Usually overwrite previous item */
2416
2417 /* If the maximum is zero then the minimum must also be zero; Perl allows
2418 this case, so we do too - by simply omitting the item altogether. */
2419
2420 if (repeat_max == 0) goto END_REPEAT;
2421
2422 /* All real repeats make it impossible to handle partial matching (maybe
2423 one day we will be able to remove this restriction). */
2424
2425 if (repeat_max != 1) cd->nopartial = TRUE;
2426
2427 /* Combine the op_type with the repeat_type */
2428
2429 repeat_type += op_type;
2430
2431 /* A minimum of zero is handled either as the special case * or ?, or as
2432 an UPTO, with the maximum given. */
2433
2434 if (repeat_min == 0)
2435 {
2436 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
2437 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
2438 else
2439 {
2440 *code++ = OP_UPTO + repeat_type;
2441 PUT2INC(code, 0, repeat_max);
2442 }
2443 }
2444
2445 /* A repeat minimum of 1 is optimized into some special cases. If the
2446 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
2447 left in place and, if the maximum is greater than 1, we use OP_UPTO with
2448 one less than the maximum. */
2449
2450 else if (repeat_min == 1)
2451 {
2452 if (repeat_max == -1)
2453 *code++ = OP_PLUS + repeat_type;
2454 else
2455 {
2456 code = oldcode; /* leave previous item in place */
2457 if (repeat_max == 1) goto END_REPEAT;
2458 *code++ = OP_UPTO + repeat_type;
2459 PUT2INC(code, 0, repeat_max - 1);
2460 }
2461 }
2462
2463 /* The case {n,n} is just an EXACT, while the general case {n,m} is
2464 handled as an EXACT followed by an UPTO. */
2465
2466 else
2467 {
2468 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
2469 PUT2INC(code, 0, repeat_min);
2470
2471 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
2472 we have to insert the character for the previous code. For a repeated
2473 Unicode property match, there is an extra byte that defines the
2474 required property. In UTF-8 mode, long characters have their length in
2475 c, with the 0x80 bit as a flag. */
2476
2477 if (repeat_max < 0)
2478 {
2479 #ifdef SUPPORT_UTF8
2480 if (utf8 && c >= 128)
2481 {
2482 memcpy(code, utf8_char, c & 7);
2483 code += c & 7;
2484 }
2485 else
2486 #endif
2487 {
2488 *code++ = c;
2489 if (prop_type >= 0) *code++ = prop_type;
2490 }
2491 *code++ = OP_STAR + repeat_type;
2492 }
2493
2494 /* Else insert an UPTO if the max is greater than the min, again
2495 preceded by the character, for the previously inserted code. */
2496
2497 else if (repeat_max != repeat_min)
2498 {
2499 #ifdef SUPPORT_UTF8
2500 if (utf8 && c >= 128)
2501 {
2502 memcpy(code, utf8_char, c & 7);
2503 code += c & 7;
2504 }
2505 else
2506 #endif
2507 *code++ = c;
2508 if (prop_type >= 0) *code++ = prop_type;
2509 repeat_max -= repeat_min;
2510 *code++ = OP_UPTO + repeat_type;
2511 PUT2INC(code, 0, repeat_max);
2512 }
2513 }
2514
2515 /* The character or character type itself comes last in all cases. */
2516
2517 #ifdef SUPPORT_UTF8
2518 if (utf8 && c >= 128)
2519 {
2520 memcpy(code, utf8_char, c & 7);
2521 code += c & 7;
2522 }
2523 else
2524 #endif
2525 *code++ = c;
2526
2527 /* For a repeated Unicode property match, there is an extra byte that
2528 defines the required property. */
2529
2530 #ifdef SUPPORT_UCP
2531 if (prop_type >= 0) *code++ = prop_type;
2532 #endif
2533 }
2534
2535 /* If previous was a character class or a back reference, we put the repeat
2536 stuff after it, but just skip the item if the repeat was {0,0}. */
2537
2538 else if (*previous == OP_CLASS ||
2539 *previous == OP_NCLASS ||
2540 #ifdef SUPPORT_UTF8
2541 *previous == OP_XCLASS ||
2542 #endif
2543 *previous == OP_REF)
2544 {
2545 if (repeat_max == 0)
2546 {
2547 code = previous;
2548 goto END_REPEAT;
2549 }
2550
2551 /* All real repeats make it impossible to handle partial matching (maybe
2552 one day we will be able to remove this restriction). */
2553
2554 if (repeat_max != 1) cd->nopartial = TRUE;
2555
2556 if (repeat_min == 0 && repeat_max == -1)
2557 *code++ = OP_CRSTAR + repeat_type;
2558 else if (repeat_min == 1 && repeat_max == -1)
2559 *code++ = OP_CRPLUS + repeat_type;
2560 else if (repeat_min == 0 && repeat_max == 1)
2561 *code++ = OP_CRQUERY + repeat_type;
2562 else
2563 {
2564 *code++ = OP_CRRANGE + repeat_type;
2565 PUT2INC(code, 0, repeat_min);
2566 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
2567 PUT2INC(code, 0, repeat_max);
2568 }
2569 }
2570
2571 /* If previous was a bracket group, we may have to replicate it in certain
2572 cases. */
2573
2574 else if (*previous >= OP_BRA || *previous == OP_ONCE ||
2575 *previous == OP_COND)
2576 {
2577 register int i;
2578 int ketoffset = 0;
2579 int len = code - previous;
2580 uschar *bralink = NULL;
2581
2582 /* If the maximum repeat count is unlimited, find the end of the bracket
2583 by scanning through from the start, and compute the offset back to it
2584 from the current code pointer. There may be an OP_OPT setting following
2585 the final KET, so we can't find the end just by going back from the code
2586 pointer. */
2587
2588 if (repeat_max == -1)
2589 {
2590 register uschar *ket = previous;
2591 do ket += GET(ket, 1); while (*ket != OP_KET);
2592 ketoffset = code - ket;
2593 }
2594
2595 /* The case of a zero minimum is special because of the need to stick
2596 OP_BRAZERO in front of it, and because the group appears once in the
2597 data, whereas in other cases it appears the minimum number of times. For
2598 this reason, it is simplest to treat this case separately, as otherwise
2599 the code gets far too messy. There are several special subcases when the
2600 minimum is zero. */
2601
2602 if (repeat_min == 0)
2603 {
2604 /* If the maximum is also zero, we just omit the group from the output
2605 altogether. */
2606
2607 if (repeat_max == 0)
2608 {
2609 code = previous;
2610 goto END_REPEAT;
2611 }
2612
2613 /* If the maximum is 1 or unlimited, we just have to stick in the
2614 BRAZERO and do no more at this point. However, we do need to adjust
2615 any OP_RECURSE calls inside the group that refer to the group itself or
2616 any internal group, because the offset is from the start of the whole
2617 regex. Temporarily terminate the pattern while doing this. */
2618
2619 if (repeat_max <= 1)
2620 {
2621 *code = OP_END;
2622 adjust_recurse(previous, 1, utf8, cd);
2623 memmove(previous+1, previous, len);
2624 code++;
2625 *previous++ = OP_BRAZERO + repeat_type;
2626 }
2627
2628 /* If the maximum is greater than 1 and limited, we have to replicate
2629 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
2630 The first one has to be handled carefully because it's the original
2631 copy, which has to be moved up. The remainder can be handled by code
2632 that is common with the non-zero minimum case below. We have to
2633 adjust the value of repeat_max, since one less copy is required. Once
2634 again, we may have to adjust any OP_RECURSE calls inside the group. */
2635
2636 else
2637 {
2638 int offset;
2639 *code = OP_END;
2640 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);
2641 memmove(previous + 2 + LINK_SIZE, previous, len);
2642 code += 2 + LINK_SIZE;
2643 *previous++ = OP_BRAZERO + repeat_type;
2644 *previous++ = OP_BRA;
2645
2646 /* We chain together the bracket offset fields that have to be
2647 filled in later when the ends of the brackets are reached. */
2648
2649 offset = (bralink == NULL)? 0 : previous - bralink;
2650 bralink = previous;
2651 PUTINC(previous, 0, offset);
2652 }
2653
2654 repeat_max--;
2655 }
2656
2657 /* If the minimum is greater than zero, replicate the group as many
2658 times as necessary, and adjust the maximum to the number of subsequent
2659 copies that we need. If we set a first char from the group, and didn't
2660 set a required char, copy the latter from the former. */
2661
2662 else
2663 {
2664 if (repeat_min > 1)
2665 {
2666 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
2667 for (i = 1; i < repeat_min; i++)
2668 {
2669 memcpy(code, previous, len);
2670 code += len;
2671 }
2672 }
2673 if (repeat_max > 0) repeat_max -= repeat_min;
2674 }
2675
2676 /* This code is common to both the zero and non-zero minimum cases. If
2677 the maximum is limited, it replicates the group in a nested fashion,
2678 remembering the bracket starts on a stack. In the case of a zero minimum,
2679 the first one was set up above. In all cases the repeat_max now specifies
2680 the number of additional copies needed. */
2681
2682 if (repeat_max >= 0)
2683 {
2684 for (i = repeat_max - 1; i >= 0; i--)
2685 {
2686 *code++ = OP_BRAZERO + repeat_type;
2687
2688 /* All but the final copy start a new nesting, maintaining the
2689 chain of brackets outstanding. */
2690
2691 if (i != 0)
2692 {
2693 int offset;
2694 *code++ = OP_BRA;
2695 offset = (bralink == NULL)? 0 : code - bralink;
2696 bralink = code;
2697 PUTINC(code, 0, offset);
2698 }
2699
2700 memcpy(code, previous, len);
2701 code += len;
2702 }
2703
2704 /* Now chain through the pending brackets, and fill in their length
2705 fields (which are holding the chain links pro tem). */
2706
2707 while (bralink != NULL)
2708 {
2709 int oldlinkoffset;
2710 int offset = code - bralink + 1;
2711 uschar *bra = code - offset;
2712 oldlinkoffset = GET(bra, 1);
2713 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
2714 *code++ = OP_KET;
2715 PUTINC(code, 0, offset);
2716 PUT(bra, 1, offset);
2717 }
2718 }
2719
2720 /* If the maximum is unlimited, set a repeater in the final copy. We
2721 can't just offset backwards from the current code point, because we
2722 don't know if there's been an options resetting after the ket. The
2723 correct offset was computed above. */
2724
2725 else code[-ketoffset] = OP_KETRMAX + repeat_type;
2726 }
2727
2728 /* Else there's some kind of shambles */
2729
2730 else
2731 {
2732 *errorcodeptr = ERR11;
2733 goto FAILED;
2734 }
2735
2736 /* If the character following a repeat is '+', we wrap the entire repeated
2737 item inside OP_ONCE brackets. This is just syntactic sugar, taken from
2738 Sun's Java package. The repeated item starts at tempcode, not at previous,
2739 which might be the first part of a string whose (former) last char we
2740 repeated. However, we don't support '+' after a greediness '?'. */
2741
2742 if (possessive_quantifier)
2743 {
2744 int len = code - tempcode;
2745 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
2746 code += 1 + LINK_SIZE;
2747 len += 1 + LINK_SIZE;
2748 tempcode[0] = OP_ONCE;
2749 *code++ = OP_KET;
2750 PUTINC(code, 0, len);
2751 PUT(tempcode, 1, len);
2752 }
2753
2754 /* In all cases we no longer have a previous item. We also set the
2755 "follows varying string" flag for subsequently encountered reqbytes if
2756 it isn't already set and we have just passed a varying length item. */
2757
2758 END_REPEAT:
2759 previous = NULL;
2760 cd->req_varyopt |= reqvary;
2761 break;
2762
2763
2764 /* Start of nested bracket sub-expression, or comment or lookahead or
2765 lookbehind or option setting or condition. First deal with special things
2766 that can come after a bracket; all are introduced by ?, and the appearance
2767 of any of them means that this is not a referencing group. They were
2768 checked for validity in the first pass over the string, so we don't have to
2769 check for syntax errors here. */
2770
2771 case '(':
2772 newoptions = options;
2773 skipbytes = 0;
2774
2775 if (*(++ptr) == '?')
2776 {
2777 int set, unset;
2778 int *optset;
2779
2780 switch (*(++ptr))
2781 {
2782 case '#': /* Comment; skip to ket */
2783 ptr++;
2784 while (*ptr != ')') ptr++;
2785 continue;
2786
2787 case ':': /* Non-extracting bracket */
2788 bravalue = OP_BRA;
2789 ptr++;
2790 break;
2791
2792 case '(':
2793 bravalue = OP_COND; /* Conditional group */
2794
2795 /* Condition to test for recursion */
2796
2797 if (ptr[1] == 'R')
2798 {
2799 code[1+LINK_SIZE] = OP_CREF;
2800 PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
2801 skipbytes = 3;
2802 ptr += 3;
2803 }
2804
2805 /* Condition to test for a numbered subpattern match. We know that
2806 if a digit follows ( then there will just be digits until ) because
2807 the syntax was checked in the first pass. */
2808
2809 else if ((digitab[ptr[1]] && ctype_digit) != 0)
2810 {
2811 int condref; /* Don't amalgamate; some compilers */
2812 condref = *(++ptr) - '0'; /* grumble at autoincrement in declaration */
2813 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
2814 if (condref == 0)
2815 {
2816 *errorcodeptr = ERR35;
2817 goto FAILED;
2818 }
2819 ptr++;
2820 code[1+LINK_SIZE] = OP_CREF;
2821 PUT2(code, 2+LINK_SIZE, condref);
2822 skipbytes = 3;
2823 }
2824 /* For conditions that are assertions, we just fall through, having
2825 set bravalue above. */
2826 break;
2827
2828 case '=': /* Positive lookahead */
2829 bravalue = OP_ASSERT;
2830 ptr++;
2831 break;
2832
2833 case '!': /* Negative lookahead */
2834 bravalue = OP_ASSERT_NOT;
2835 ptr++;
2836 break;
2837
2838 case '<': /* Lookbehinds */
2839 switch (*(++ptr))
2840 {
2841 case '=': /* Positive lookbehind */
2842 bravalue = OP_ASSERTBACK;
2843 ptr++;
2844 break;
2845
2846 case '!': /* Negative lookbehind */
2847 bravalue = OP_ASSERTBACK_NOT;
2848 ptr++;
2849 break;
2850 }
2851 break;
2852
2853 case '>': /* One-time brackets */
2854 bravalue = OP_ONCE;
2855 ptr++;
2856 break;
2857
2858 case 'C': /* Callout - may be followed by digits; */
2859 previous_callout = code; /* Save for later completion */
2860 after_manual_callout = 1; /* Skip one item before completing */
2861 *code++ = OP_CALLOUT; /* Already checked that the terminating */
2862 { /* closing parenthesis is present. */
2863 int n = 0;
2864 while ((digitab[*(++ptr)] & ctype_digit) != 0)
2865 n = n * 10 + *ptr - '0';
2866 if (n > 255)
2867 {
2868 *errorcodeptr = ERR38;
2869 goto FAILED;
2870 }
2871 *code++ = n;
2872 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
2873 PUT(code, LINK_SIZE, 0); /* Default length */
2874 code += 2 * LINK_SIZE;
2875 }
2876 previous = NULL;
2877 continue;
2878
2879 case 'P': /* Named subpattern handling */
2880 if (*(++ptr) == '<') /* Definition */
2881 {
2882 int i, namelen;
2883 uschar *slot = cd->name_table;
2884 const uschar *name; /* Don't amalgamate; some compilers */
2885 name = ++ptr; /* grumble at autoincrement in declaration */
2886
2887 while (*ptr++ != '>');
2888 namelen = ptr - name - 1;
2889
2890 for (i = 0; i < cd->names_found; i++)
2891 {
2892 int crc = memcmp(name, slot+2, namelen);
2893 if (crc == 0)
2894 {
2895 if (slot[2+namelen] == 0)
2896 {
2897 *errorcodeptr = ERR43;
2898 goto FAILED;
2899 }
2900 crc = -1; /* Current name is substring */
2901 }
2902 if (crc < 0)
2903 {
2904 memmove(slot + cd->name_entry_size, slot,
2905 (cd->names_found - i) * cd->name_entry_size);
2906 break;
2907 }
2908 slot += cd->name_entry_size;
2909 }
2910
2911 PUT2(slot, 0, *brackets + 1);
2912 memcpy(slot + 2, name, namelen);
2913 slot[2+namelen] = 0;
2914 cd->names_found++;
2915 goto NUMBERED_GROUP;
2916 }
2917
2918 if (*ptr == '=' || *ptr == '>') /* Reference or recursion */
2919 {
2920 int i, namelen;
2921 int type = *ptr++;
2922 const uschar *name = ptr;
2923 uschar *slot = cd->name_table;
2924
2925 while (*ptr != ')') ptr++;
2926 namelen = ptr - name;
2927
2928 for (i = 0; i < cd->names_found; i++)
2929 {
2930 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
2931 slot += cd->name_entry_size;
2932 }
2933 if (i >= cd->names_found)
2934 {
2935 *errorcodeptr = ERR15;
2936 goto FAILED;
2937 }
2938
2939 recno = GET2(slot, 0);
2940
2941 if (type == '>') goto HANDLE_RECURSION; /* A few lines below */
2942
2943 /* Back reference */
2944
2945 previous = code;
2946 *code++ = OP_REF;
2947 PUT2INC(code, 0, recno);
2948 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
2949 if (recno > cd->top_backref) cd->top_backref = recno;
2950 continue;
2951 }
2952
2953 /* Should never happen */
2954 break;
2955
2956 case 'R': /* Pattern recursion */
2957 ptr++; /* Same as (?0) */
2958 /* Fall through */
2959
2960 /* Recursion or "subroutine" call */
2961
2962 case '0': case '1': case '2': case '3': case '4':
2963 case '5': case '6': case '7': case '8': case '9':
2964 {
2965 const uschar *called;
2966 recno = 0;
2967 while((digitab[*ptr] & ctype_digit) != 0)
2968 recno = recno * 10 + *ptr++ - '0';
2969
2970 /* Come here from code above that handles a named recursion */
2971
2972 HANDLE_RECURSION:
2973
2974 previous = code;
2975
2976 /* Find the bracket that is being referenced. Temporarily end the
2977 regex in case it doesn't exist. */
2978
2979 *code = OP_END;
2980 called = (recno == 0)?
2981 cd->start_code : find_bracket(cd->start_code, utf8, recno);
2982
2983 if (called == NULL)
2984 {
2985 *errorcodeptr = ERR15;
2986 goto FAILED;
2987 }
2988
2989 /* If the subpattern is still open, this is a recursive call. We
2990 check to see if this is a left recursion that could loop for ever,
2991 and diagnose that case. */
2992
2993 if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
2994 {
2995 *errorcodeptr = ERR40;
2996 goto FAILED;
2997 }
2998
2999 /* Insert the recursion/subroutine item */
3000
3001 *code = OP_RECURSE;
3002 PUT(code, 1, called - cd->start_code);
3003 code += 1 + LINK_SIZE;
3004 }
3005 continue;
3006
3007 /* Character after (? not specially recognized */
3008
3009 default: /* Option setting */
3010 set = unset = 0;
3011 optset = &set;
3012
3013 while (*ptr != ')' && *ptr != ':')
3014 {
3015 switch (*ptr++)
3016 {
3017 case '-': optset = &unset; break;
3018
3019 case 'i': *optset |= PCRE_CASELESS; break;
3020 case 'm': *optset |= PCRE_MULTILINE; break;
3021 case 's': *optset |= PCRE_DOTALL; break;
3022 case 'x': *optset |= PCRE_EXTENDED; break;
3023 case 'U': *optset |= PCRE_UNGREEDY; break;
3024 case 'X': *optset |= PCRE_EXTRA; break;
3025 }
3026 }
3027
3028 /* Set up the changed option bits, but don't change anything yet. */
3029
3030 newoptions = (options | set) & (~unset);
3031
3032 /* If the options ended with ')' this is not the start of a nested
3033 group with option changes, so the options change at this level. Compile
3034 code to change the ims options if this setting actually changes any of
3035 them. We also pass the new setting back so that it can be put at the
3036 start of any following branches, and when this group ends (if we are in
3037 a group), a resetting item can be compiled.
3038
3039 Note that if this item is right at the start of the pattern, the
3040 options will have been abstracted and made global, so there will be no
3041 change to compile. */
3042
3043 if (*ptr == ')')
3044 {
3045 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
3046 {
3047 *code++ = OP_OPT;
3048 *code++ = newoptions & PCRE_IMS;
3049 }
3050
3051 /* Change options at this level, and pass them back for use
3052 in subsequent branches. Reset the greedy defaults and the case
3053 value for firstbyte and reqbyte. */
3054
3055 *optionsptr = options = newoptions;
3056 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
3057 greedy_non_default = greedy_default ^ 1;
3058 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3059
3060 previous = NULL; /* This item can't be repeated */
3061 continue; /* It is complete */
3062 }
3063
3064 /* If the options ended with ':' we are heading into a nested group
3065 with possible change of options. Such groups are non-capturing and are
3066 not assertions of any kind. All we need to do is skip over the ':';
3067 the newoptions value is handled below. */
3068
3069 bravalue = OP_BRA;
3070 ptr++;
3071 }
3072 }
3073
3074 /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
3075 non-capturing and behave like (?:...) brackets */
3076
3077 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
3078 {
3079 bravalue = OP_BRA;
3080 }
3081
3082 /* Else we have a referencing group; adjust the opcode. If the bracket
3083 number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
3084 arrange for the true number to follow later, in an OP_BRANUMBER item. */
3085
3086 else
3087 {
3088 NUMBERED_GROUP:
3089 if (++(*brackets) > EXTRACT_BASIC_MAX)
3090 {
3091 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
3092 code[1+LINK_SIZE] = OP_BRANUMBER;
3093 PUT2(code, 2+LINK_SIZE, *brackets);
3094 skipbytes = 3;
3095 }
3096 else bravalue = OP_BRA + *brackets;
3097 }
3098
3099 /* Process nested bracketed re. Assertions may not be repeated, but other
3100 kinds can be. We copy code into a non-register variable in order to be able
3101 to pass its address because some compilers complain otherwise. Pass in a
3102 new setting for the ims options if they have changed. */
3103
3104 previous = (bravalue >= OP_ONCE)? code : NULL;
3105 *code = bravalue;
3106 tempcode = code;
3107 tempreqvary = cd->req_varyopt; /* Save value before bracket */
3108
3109 if (!compile_regex(
3110 newoptions, /* The complete new option state */
3111 options & PCRE_IMS, /* The previous ims option state */
3112 brackets, /* Extracting bracket count */
3113 &tempcode, /* Where to put code (updated) */
3114 &ptr, /* Input pointer (updated) */
3115 errorcodeptr, /* Where to put an error message */
3116 (bravalue == OP_ASSERTBACK ||
3117 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
3118 skipbytes, /* Skip over OP_COND/OP_BRANUMBER */
3119 &subfirstbyte, /* For possible first char */
3120 &subreqbyte, /* For possible last char */
3121 bcptr, /* Current branch chain */
3122 cd)) /* Tables block */
3123 goto FAILED;
3124
3125 /* At the end of compiling, code is still pointing to the start of the
3126 group, while tempcode has been updated to point past the end of the group
3127 and any option resetting that may follow it. The pattern pointer (ptr)
3128 is on the bracket. */
3129
3130 /* If this is a conditional bracket, check that there are no more than
3131 two branches in the group. */
3132
3133 else if (bravalue == OP_COND)
3134 {
3135 uschar *tc = code;
3136 condcount = 0;
3137
3138 do {
3139 condcount++;
3140 tc += GET(tc,1);
3141 }
3142 while (*tc != OP_KET);
3143
3144 if (condcount > 2)
3145 {
3146 *errorcodeptr = ERR27;
3147 goto FAILED;
3148 }
3149
3150 /* If there is just one branch, we must not make use of its firstbyte or
3151 reqbyte, because this is equivalent to an empty second branch. */
3152
3153 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
3154 }
3155
3156 /* Handle updating of the required and first characters. Update for normal
3157 brackets of all kinds, and conditions with two branches (see code above).
3158 If the bracket is followed by a quantifier with zero repeat, we have to
3159 back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
3160 main loop so that they can be accessed for the back off. */
3161
3162 zeroreqbyte = reqbyte;
3163 zerofirstbyte = firstbyte;
3164 groupsetfirstbyte = FALSE;
3165
3166 if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
3167 {
3168 /* If we have not yet set a firstbyte in this branch, take it from the
3169 subpattern, remembering that it was set here so that a repeat of more
3170 than one can replicate it as reqbyte if necessary. If the subpattern has
3171 no firstbyte, set "none" for the whole branch. In both cases, a zero
3172 repeat forces firstbyte to "none". */
3173
3174 if (firstbyte == REQ_UNSET)
3175 {
3176 if (subfirstbyte >= 0)
3177 {
3178 firstbyte = subfirstbyte;
3179 groupsetfirstbyte = TRUE;
3180 }
3181 else firstbyte = REQ_NONE;
3182 zerofirstbyte = REQ_NONE;
3183 }
3184
3185 /* If firstbyte was previously set, convert the subpattern's firstbyte
3186 into reqbyte if there wasn't one, using the vary flag that was in
3187 existence beforehand. */
3188
3189 else if (subfirstbyte >= 0 && subreqbyte < 0)
3190 subreqbyte = subfirstbyte | tempreqvary;
3191
3192 /* If the subpattern set a required byte (or set a first byte that isn't
3193 really the first byte - see above), set it. */
3194
3195 if (subreqbyte >= 0) reqbyte = subreqbyte;
3196 }
3197
3198 /* For a forward assertion, we take the reqbyte, if set. This can be
3199 helpful if the pattern that follows the assertion doesn't set a different
3200 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
3201 for an assertion, however because it leads to incorrect effect for patterns
3202 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
3203 of a firstbyte. This is overcome by a scan at the end if there's no
3204 firstbyte, looking for an asserted first char. */
3205
3206 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
3207
3208 /* Now update the main code pointer to the end of the group. */
3209
3210 code = tempcode;
3211
3212 /* Error if hit end of pattern */
3213
3214 if (*ptr != ')')
3215 {
3216 *errorcodeptr = ERR14;
3217 goto FAILED;
3218 }
3219 break;
3220
3221 /* Check \ for being a real metacharacter; if not, fall through and handle
3222 it as a data character at the start of a string. Escape items are checked
3223 for validity in the pre-compiling pass. */
3224
3225 case '\\':
3226 tempptr = ptr;
3227 c = check_escape(&ptr, errorcodeptr, *brackets, options, FALSE);
3228
3229 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
3230 are arranged to be the negation of the corresponding OP_values. For the
3231 back references, the values are ESC_REF plus the reference number. Only
3232 back references and those types that consume a character may be repeated.
3233 We can test for values between ESC_b and ESC_Z for the latter; this may
3234 have to change if any new ones are ever created. */
3235
3236 if (c < 0)
3237 {
3238 if (-c == ESC_Q) /* Handle start of quoted string */
3239 {
3240 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
3241 else inescq = TRUE;
3242 continue;
3243 }
3244
3245 /* For metasequences that actually match a character, we disable the
3246 setting of a first character if it hasn't already been set. */
3247
3248 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
3249 firstbyte = REQ_NONE;
3250
3251 /* Set values to reset to if this is followed by a zero repeat. */
3252
3253 zerofirstbyte = firstbyte;
3254 zeroreqbyte = reqbyte;
3255
3256 /* Back references are handled specially */
3257
3258 if (-c >= ESC_REF)
3259 {
3260 int number = -c - ESC_REF;
3261 previous = code;
3262 *code++ = OP_REF;
3263 PUT2INC(code, 0, number);
3264 }
3265
3266 /* So are Unicode property matches, if supported. We know that get_ucp
3267 won't fail because it was tested in the pre-pass. */
3268
3269 #ifdef SUPPORT_UCP
3270 else if (-c == ESC_P || -c == ESC_p)
3271 {
3272 BOOL negated;
3273 int value = get_ucp(&ptr, &negated, errorcodeptr);
3274 previous = code;
3275 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
3276 *code++ = value;
3277 }
3278 #endif
3279
3280 /* For the rest, we can obtain the OP value by negating the escape
3281 value */
3282
3283 else
3284 {
3285 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
3286 *code++ = -c;
3287 }
3288 continue;
3289 }
3290
3291 /* We have a data character whose value is in c. In UTF-8 mode it may have
3292 a value > 127. We set its representation in the length/buffer, and then
3293 handle it as a data character. */
3294
3295 #ifdef SUPPORT_UTF8
3296 if (utf8 && c > 127)
3297 mclength = _pcre_ord2utf8(c, mcbuffer);
3298 else
3299 #endif
3300
3301 {
3302 mcbuffer[0] = c;
3303 mclength = 1;
3304 }
3305
3306 goto ONE_CHAR;
3307
3308 /* Handle a literal character. It is guaranteed not to be whitespace or #
3309 when the extended flag is set. If we are in UTF-8 mode, it may be a
3310 multi-byte literal character. */
3311
3312 default:
3313 NORMAL_CHAR:
3314 mclength = 1;
3315 mcbuffer[0] = c;
3316
3317 #ifdef SUPPORT_UTF8
3318 if (utf8 && (c & 0xc0) == 0xc0)
3319 {
3320 while ((ptr[1] & 0xc0) == 0x80)
3321 mcbuffer[mclength++] = *(++ptr);
3322 }
3323 #endif
3324
3325 /* At this point we have the character's bytes in mcbuffer, and the length
3326 in mclength. When not in UTF-8 mode, the length is always 1. */
3327
3328 ONE_CHAR:
3329 previous = code;
3330 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
3331 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
3332
3333 /* Set the first and required bytes appropriately. If no previous first
3334 byte, set it from this character, but revert to none on a zero repeat.
3335 Otherwise, leave the firstbyte value alone, and don't change it on a zero
3336 repeat. */
3337
3338 if (firstbyte == REQ_UNSET)
3339 {
3340 zerofirstbyte = REQ_NONE;
3341 zeroreqbyte = reqbyte;
3342
3343 /* If the character is more than one byte long, we can set firstbyte
3344 only if it is not to be matched caselessly. */
3345
3346 if (mclength == 1 || req_caseopt == 0)
3347 {
3348 firstbyte = mcbuffer[0] | req_caseopt;
3349 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
3350 }
3351 else firstbyte = reqbyte = REQ_NONE;
3352 }
3353
3354 /* firstbyte was previously set; we can set reqbyte only if the length is
3355 1 or the matching is caseful. */
3356
3357 else
3358 {
3359 zerofirstbyte = firstbyte;
3360 zeroreqbyte = reqbyte;
3361 if (mclength == 1 || req_caseopt == 0)
3362 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3363 }
3364
3365 break; /* End of literal character handling */
3366 }
3367 } /* end of big loop */
3368
3369 /* Control never reaches here by falling through, only by a goto for all the
3370 error states. Pass back the position in the pattern so that it can be displayed
3371 to the user for diagnosing the error. */
3372
3373 FAILED:
3374 *ptrptr = ptr;
3375 return FALSE;
3376 }
3377
3378
3379
3380
3381 /*************************************************
3382 * Compile sequence of alternatives *
3383 *************************************************/
3384
3385 /* On entry, ptr is pointing past the bracket character, but on return
3386 it points to the closing bracket, or vertical bar, or end of string.
3387 The code variable is pointing at the byte into which the BRA operator has been
3388 stored. If the ims options are changed at the start (for a (?ims: group) or
3389 during any branch, we need to insert an OP_OPT item at the start of every
3390 following branch to ensure they get set correctly at run time, and also pass
3391 the new options into every subsequent branch compile.
3392
3393 Arguments:
3394 options option bits, including any changes for this subpattern
3395 oldims previous settings of ims option bits
3396 brackets -> int containing the number of extracting brackets used
3397 codeptr -> the address of the current code pointer
3398 ptrptr -> the address of the current pattern pointer
3399 errorcodeptr -> pointer to error code variable
3400 lookbehind TRUE if this is a lookbehind assertion
3401 skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER)
3402 firstbyteptr place to put the first required character, or a negative number
3403 reqbyteptr place to put the last required character, or a negative number
3404 bcptr pointer to the chain of currently open branches
3405 cd points to the data block with tables pointers etc.
3406
3407 Returns: TRUE on success
3408 */
3409
3410 static BOOL
3411 compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3412 const uschar **ptrptr, int *errorcodeptr, BOOL lookbehind, int skipbytes,
3413 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
3414 {
3415 const uschar *ptr = *ptrptr;
3416 uschar *code = *codeptr;
3417 uschar *last_branch = code;
3418 uschar *start_bracket = code;
3419 uschar *reverse_count = NULL;
3420 int firstbyte, reqbyte;
3421 int branchfirstbyte, branchreqbyte;
3422 branch_chain bc;
3423
3424 bc.outer = bcptr;
3425 bc.current = code;
3426
3427 firstbyte = reqbyte = REQ_UNSET;
3428
3429 /* Offset is set zero to mark that this bracket is still open */
3430
3431 PUT(code, 1, 0);
3432 code += 1 + LINK_SIZE + skipbytes;
3433
3434 /* Loop for each alternative branch */
3435
3436 for (;;)
3437 {
3438 /* Handle a change of ims options at the start of the branch */
3439
3440 if ((options & PCRE_IMS) != oldims)
3441 {
3442 *code++ = OP_OPT;
3443 *code++ = options & PCRE_IMS;
3444 }
3445
3446 /* Set up dummy OP_REVERSE if lookbehind assertion */
3447
3448 if (lookbehind)
3449 {
3450 *code++ = OP_REVERSE;
3451 reverse_count = code;
3452 PUTINC(code, 0, 0);
3453 }
3454
3455 /* Now compile the branch */
3456
3457 if (!compile_branch(&options, brackets, &code, &ptr, errorcodeptr,
3458 &branchfirstbyte, &branchreqbyte, &bc, cd))
3459 {
3460 *ptrptr = ptr;
3461 return FALSE;
3462 }
3463
3464 /* If this is the first branch, the firstbyte and reqbyte values for the
3465 branch become the values for the regex. */
3466
3467 if (*last_branch != OP_ALT)
3468 {
3469 firstbyte = branchfirstbyte;
3470 reqbyte = branchreqbyte;
3471 }
3472
3473 /* If this is not the first branch, the first char and reqbyte have to
3474 match the values from all the previous branches, except that if the previous
3475 value for reqbyte didn't have REQ_VARY set, it can still match, and we set
3476 REQ_VARY for the regex. */
3477
3478 else
3479 {
3480 /* If we previously had a firstbyte, but it doesn't match the new branch,
3481 we have to abandon the firstbyte for the regex, but if there was previously
3482 no reqbyte, it takes on the value of the old firstbyte. */
3483
3484 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
3485 {
3486 if (reqbyte < 0) reqbyte = firstbyte;
3487 firstbyte = REQ_NONE;
3488 }
3489
3490 /* If we (now or from before) have no firstbyte, a firstbyte from the
3491 branch becomes a reqbyte if there isn't a branch reqbyte. */
3492
3493 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
3494 branchreqbyte = branchfirstbyte;
3495
3496 /* Now ensure that the reqbytes match */
3497
3498 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
3499 reqbyte = REQ_NONE;
3500 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
3501 }
3502
3503 /* If lookbehind, check that this branch matches a fixed-length string,
3504 and put the length into the OP_REVERSE item. Temporarily mark the end of
3505 the branch with OP_END. */
3506
3507 if (lookbehind)
3508 {
3509 int length;
3510 *code = OP_END;
3511 length = find_fixedlength(last_branch, options);
3512 DPRINTF(("fixed length = %d\n", length));
3513 if (length < 0)
3514 {
3515 *errorcodeptr = (length == -2)? ERR36 : ERR25;
3516 *ptrptr = ptr;
3517 return FALSE;
3518 }
3519 PUT(reverse_count, 0, length);
3520 }
3521
3522 /* Reached end of expression, either ')' or end of pattern. Go back through
3523 the alternative branches and reverse the chain of offsets, with the field in
3524 the BRA item now becoming an offset to the first alternative. If there are
3525 no alternatives, it points to the end of the group. The length in the
3526 terminating ket is always the length of the whole bracketed item. If any of
3527 the ims options were changed inside the group, compile a resetting op-code
3528 following, except at the very end of the pattern. Return leaving the pointer
3529 at the terminating char. */
3530
3531 if (*ptr != '|')
3532 {
3533 int length = code - last_branch;
3534 do
3535 {
3536 int prev_length = GET(last_branch, 1);
3537 PUT(last_branch, 1, length);
3538 length = prev_length;
3539 last_branch -= length;
3540 }
3541 while (length > 0);
3542
3543 /* Fill in the ket */
3544
3545 *code = OP_KET;
3546 PUT(code, 1, code - start_bracket);
3547 code += 1 + LINK_SIZE;
3548
3549 /* Resetting option if needed */
3550
3551 if ((options & PCRE_IMS) != oldims && *ptr == ')')
3552 {
3553 *code++ = OP_OPT;
3554 *code++ = oldims;
3555 }
3556
3557 /* Set values to pass back */
3558
3559 *codeptr = code;
3560 *ptrptr = ptr;
3561 *firstbyteptr = firstbyte;
3562 *reqbyteptr = reqbyte;
3563 return TRUE;
3564 }
3565
3566 /* Another branch follows; insert an "or" node. Its length field points back
3567 to the previous branch while the bracket remains open. At the end the chain
3568 is reversed. It's done like this so that the start of the bracket has a
3569 zero offset until it is closed, making it possible to detect recursion. */
3570
3571 *code = OP_ALT;
3572 PUT(code, 1, code - last_branch);
3573 bc.current = last_branch = code;
3574 code += 1 + LINK_SIZE;
3575 ptr++;
3576 }
3577 /* Control never reaches here */
3578 }
3579
3580
3581
3582
3583 /*************************************************
3584 * Check for anchored expression *
3585 *************************************************/
3586
3587 /* Try to find out if this is an anchored regular expression. Consider each
3588 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
3589 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
3590 it's anchored. However, if this is a multiline pattern, then only OP_SOD
3591 counts, since OP_CIRC can match in the middle.
3592
3593 We can also consider a regex to be anchored if OP_SOM starts all its branches.
3594 This is the code for \G, which means "match at start of match position, taking
3595 into account the match offset".
3596
3597 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
3598 because that will try the rest of the pattern at all possible matching points,
3599 so there is no point trying again.... er ....
3600
3601 .... except when the .* appears inside capturing parentheses, and there is a
3602 subsequent back reference to those parentheses. We haven't enough information
3603 to catch that case precisely.
3604
3605 At first, the best we could do was to detect when .* was in capturing brackets
3606 and the highest back reference was greater than or equal to that level.
3607 However, by keeping a bitmap of the first 31 back references, we can catch some
3608 of the more common cases more precisely.
3609
3610 Arguments:
3611 code points to start of expression (the bracket)
3612 options points to the options setting
3613 bracket_map a bitmap of which brackets we are inside while testing; this
3614 handles up to substring 31; after that we just have to take
3615 the less precise approach
3616 backref_map the back reference bitmap
3617
3618 Returns: TRUE or FALSE
3619 */
3620
3621 static BOOL
3622 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
3623 unsigned int backref_map)
3624 {
3625 do {
3626 const uschar *scode =
3627 first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE);
3628 register int op = *scode;
3629
3630 /* Capturing brackets */
3631
3632 if (op > OP_BRA)
3633 {
3634 int new_map;
3635 op -= OP_BRA;
3636 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3637 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3638 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
3639 }
3640
3641 /* Other brackets */
3642
3643 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3644 {
3645 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
3646 }
3647
3648 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
3649 are or may be referenced. */
3650
3651 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
3652 (*options & PCRE_DOTALL) != 0)
3653 {
3654 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3655 }
3656
3657 /* Check for explicit anchoring */
3658
3659 else if (op != OP_SOD && op != OP_SOM &&
3660 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
3661 return FALSE;
3662 code += GET(code, 1);
3663 }
3664 while (*code == OP_ALT); /* Loop for each alternative */
3665 return TRUE;
3666 }
3667
3668
3669
3670 /*************************************************
3671 * Check for starting with ^ or .* *
3672 *************************************************/
3673
3674 /* This is called to find out if every branch starts with ^ or .* so that
3675 "first char" processing can be done to speed things up in multiline
3676 matching and for non-DOTALL patterns that start with .* (which must start at
3677 the beginning or after \n). As in the case of is_anchored() (see above), we
3678 have to take account of back references to capturing brackets that contain .*
3679 because in that case we can't make the assumption.
3680
3681 Arguments:
3682 code points to start of expression (the bracket)
3683 bracket_map a bitmap of which brackets we are inside while testing; this
3684 handles up to substring 31; after that we just have to take
3685 the less precise approach
3686 backref_map the back reference bitmap
3687
3688 Returns: TRUE or FALSE
3689 */
3690
3691 static BOOL
3692 is_startline(const uschar *code, unsigned int bracket_map,
3693 unsigned int backref_map)
3694 {
3695 do {
3696 const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0,
3697 FALSE);
3698 register int op = *scode;
3699
3700 /* Capturing brackets */
3701
3702 if (op > OP_BRA)
3703 {
3704 int new_map;
3705 op -= OP_BRA;
3706 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3707 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3708 if (!is_startline(scode, new_map, backref_map)) return FALSE;
3709 }
3710
3711 /* Other brackets */
3712
3713 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3714 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
3715
3716 /* .* means "start at start or after \n" if it isn't in brackets that
3717 may be referenced. */
3718
3719 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
3720 {
3721 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3722 }
3723
3724 /* Check for explicit circumflex */
3725
3726 else if (op != OP_CIRC) return FALSE;
3727
3728 /* Move on to the next alternative */
3729
3730 code += GET(code, 1);
3731 }
3732 while (*code == OP_ALT); /* Loop for each alternative */
3733 return TRUE;
3734 }
3735
3736
3737
3738 /*************************************************
3739 * Check for asserted fixed first char *
3740 *************************************************/
3741
3742 /* During compilation, the "first char" settings from forward assertions are
3743 discarded, because they can cause conflicts with actual literals that follow.
3744 However, if we end up without a first char setting for an unanchored pattern,
3745 it is worth scanning the regex to see if there is an initial asserted first
3746 char. If all branches start with the same asserted char, or with a bracket all
3747 of whose alternatives start with the same asserted char (recurse ad lib), then
3748 we return that char, otherwise -1.
3749
3750 Arguments:
3751 code points to start of expression (the bracket)
3752 options pointer to the options (used to check casing changes)
3753 inassert TRUE if in an assertion
3754
3755 Returns: -1 or the fixed first char
3756 */
3757
3758 static int
3759 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
3760 {
3761 register int c = -1;
3762 do {
3763 int d;
3764 const uschar *scode =
3765 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
3766 register int op = *scode;
3767
3768 if (op >= OP_BRA) op = OP_BRA;
3769
3770 switch(op)
3771 {
3772 default:
3773 return -1;
3774
3775 case OP_BRA:
3776 case OP_ASSERT:
3777 case OP_ONCE:
3778 case OP_COND:
3779 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
3780 return -1;
3781 if (c < 0) c = d; else if (c != d) return -1;
3782 break;
3783
3784 case OP_EXACT: /* Fall through */
3785 scode += 2;
3786
3787 case OP_CHAR:
3788 case OP_CHARNC:
3789 case OP_PLUS:
3790 case OP_MINPLUS:
3791 if (!inassert) return -1;
3792 if (c < 0)
3793 {
3794 c = scode[1];
3795 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
3796 }
3797 else if (c != scode[1]) return -1;
3798 break;
3799 }
3800
3801 code += GET(code, 1);
3802 }
3803 while (*code == OP_ALT);
3804 return c;
3805 }
3806
3807
3808
3809 /*************************************************
3810 * Compile a Regular Expression *
3811 *************************************************/
3812
3813 /* This function takes a string and returns a pointer to a block of store
3814 holding a compiled version of the expression. The original API for this
3815 function had no error code return variable; it is retained for backwards
3816 compatibility. The new function is given a new name.
3817
3818 Arguments:
3819 pattern the regular expression
3820 options various option bits
3821 errorcodeptr pointer to error code variable (pcre_compile2() only)
3822 can be NULL if you don't want a code value
3823 errorptr pointer to pointer to error text
3824 erroroffset ptr offset in pattern where error was detected
3825 tables pointer to character tables or NULL
3826
3827 Returns: pointer to compiled data block, or NULL on error,
3828 with errorptr and erroroffset set
3829 */
3830
3831 EXPORT pcre *
3832 pcre_compile(const char *pattern, int options, const char **errorptr,
3833 int *erroroffset, const unsigned char *tables)
3834 {
3835 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
3836 }
3837
3838
3839 EXPORT pcre *
3840 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
3841 const char **errorptr, int *erroroffset, const unsigned char *tables)
3842 {
3843 real_pcre *re;
3844 int length = 1 + LINK_SIZE; /* For initial BRA plus length */
3845 int c, firstbyte, reqbyte;
3846 int bracount = 0;
3847 int branch_extra = 0;
3848 int branch_newextra;
3849 int item_count = -1;
3850 int name_count = 0;
3851 int max_name_size = 0;
3852 int lastitemlength = 0;
3853 int errorcode = 0;
3854 #ifdef SUPPORT_UTF8
3855 BOOL utf8;
3856 BOOL class_utf8;
3857 #endif
3858 BOOL inescq = FALSE;
3859 unsigned int brastackptr = 0;
3860 size_t size;
3861 uschar *code;
3862 const uschar *codestart;
3863 const uschar *ptr;
3864 compile_data compile_block;
3865 int brastack[BRASTACK_SIZE];
3866 uschar bralenstack[BRASTACK_SIZE];
3867
3868 /* We can't pass back an error message if errorptr is NULL; I guess the best we
3869 can do is just return NULL, but we can set a code value if there is a code
3870 pointer. */
3871
3872 if (errorptr == NULL)
3873 {
3874 if (errorcodeptr != NULL) *errorcodeptr = 99;
3875 return NULL;
3876 }
3877
3878 *errorptr = NULL;
3879 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
3880
3881 /* However, we can give a message for this error */
3882
3883 if (erroroffset == NULL)
3884 {
3885 errorcode = ERR16;
3886 goto PCRE_EARLY_ERROR_RETURN;
3887 }
3888
3889 *erroroffset = 0;
3890
3891 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
3892
3893 #ifdef SUPPORT_UTF8
3894 utf8 = (options & PCRE_UTF8) != 0;
3895 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
3896 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
3897 {
3898 errorcode = ERR44;
3899 goto PCRE_EARLY_ERROR_RETURN;
3900 }
3901 #else
3902 if ((options & PCRE_UTF8) != 0)
3903 {
3904 errorcode = ERR32;
3905 goto PCRE_EARLY_ERROR_RETURN;
3906 }
3907 #endif
3908
3909 if ((options & ~PUBLIC_OPTIONS) != 0)
3910 {
3911 errorcode = ERR17;
3912 goto PCRE_EARLY_ERROR_RETURN;
3913 }
3914
3915 /* Set up pointers to the individual character tables */
3916
3917 if (tables == NULL) tables = _pcre_default_tables;
3918 compile_block.lcc = tables + lcc_offset;
3919 compile_block.fcc = tables + fcc_offset;
3920 compile_block.cbits = tables + cbits_offset;
3921 compile_block.ctypes = tables + ctypes_offset;
3922
3923 /* Maximum back reference and backref bitmap. This is updated for numeric
3924 references during the first pass, but for named references during the actual
3925 compile pass. The bitmap records up to 31 back references to help in deciding
3926 whether (.*) can be treated as anchored or not. */
3927
3928 compile_block.top_backref = 0;
3929 compile_block.backref_map = 0;
3930
3931 /* Reflect pattern for debugging output */
3932
3933 DPRINTF(("------------------------------------------------------------------\n"));
3934 DPRINTF(("%s\n", pattern));
3935
3936 /* The first thing to do is to make a pass over the pattern to compute the
3937 amount of store required to hold the compiled code. This does not have to be
3938 perfect as long as errors are overestimates. At the same time we can detect any
3939 flag settings right at the start, and extract them. Make an attempt to correct
3940 for any counted white space if an "extended" flag setting appears late in the
3941 pattern. We can't be so clever for #-comments. */
3942
3943 ptr = (const uschar *)(pattern - 1);
3944 while ((c = *(++ptr)) != 0)
3945 {
3946 int min, max;
3947 int class_optcount;
3948 int bracket_length;
3949 int duplength;
3950
3951 /* If we are inside a \Q...\E sequence, all chars are literal */
3952
3953 if (inescq)
3954 {
3955 if ((options & PCRE_AUTO_CALLOUT) != 0) length += 2 + 2*LINK_SIZE;
3956 goto NORMAL_CHAR;
3957 }
3958
3959 /* Otherwise, first check for ignored whitespace and comments */
3960
3961 if ((options & PCRE_EXTENDED) != 0)
3962 {
3963 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
3964 if (c == '#')
3965 {
3966 /* The space before the ; is to avoid a warning on a silly compiler
3967 on the Macintosh. */
3968 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
3969 if (c == 0) break;
3970 continue;
3971 }
3972 }
3973
3974 item_count++; /* Is zero for the first non-comment item */
3975
3976 /* Allow space for auto callout before every item except quantifiers. */
3977
3978 if ((options & PCRE_AUTO_CALLOUT) != 0 &&
3979 c != '*' && c != '+' && c != '?' &&
3980 (c != '{' || !is_counted_repeat(ptr + 1)))
3981 length += 2 + 2*LINK_SIZE;
3982
3983 switch(c)
3984 {
3985 /* A backslashed item may be an escaped data character or it may be a
3986 character type. */
3987
3988 case '\\':
3989 c = check_escape(&ptr, &errorcode, bracount, options, FALSE);
3990 if (errorcode != 0) goto PCRE_ERROR_RETURN;
3991
3992 lastitemlength = 1; /* Default length of last item for repeats */
3993
3994 if (c >= 0) /* Data character */
3995 {
3996 length += 2; /* For a one-byte character */
3997
3998 #ifdef SUPPORT_UTF8
3999 if (utf8 && c > 127)
4000 {
4001 int i;
4002 for (i = 0; i < _pcre_utf8_table1_size; i++)
4003 if (c <= _pcre_utf8_table1[i]) break;
4004 length += i;
4005 lastitemlength += i;
4006 }
4007 #endif
4008
4009 continue;
4010 }
4011
4012 /* If \Q, enter "literal" mode */
4013
4014 if (-c == ESC_Q)
4015 {
4016 inescq = TRUE;
4017 continue;
4018 }
4019
4020 /* \X is supported only if Unicode property support is compiled */
4021
4022 #ifndef SUPPORT_UCP
4023 if (-c == ESC_X)
4024 {
4025 errorcode = ERR45;
4026 goto PCRE_ERROR_RETURN;
4027 }
4028 #endif
4029
4030 /* \P and \p are for Unicode properties, but only when the support has
4031 been compiled. Each item needs 2 bytes. */
4032
4033 else if (-c == ESC_P || -c == ESC_p)
4034 {
4035 #ifdef SUPPORT_UCP
4036 BOOL negated;
4037 length += 2;
4038 lastitemlength = 2;
4039 if (get_ucp(&ptr, &negated, &errorcode) < 0) goto PCRE_ERROR_RETURN;
4040 continue;
4041 #else
4042 errorcode = ERR45;
4043 goto PCRE_ERROR_RETURN;
4044 #endif
4045 }
4046
4047 /* Other escapes need one byte */
4048
4049 length++;
4050
4051 /* A back reference needs an additional 2 bytes, plus either one or 5
4052 bytes for a repeat. We also need to keep the value of the highest
4053 back reference. */
4054
4055 if (c <= -ESC_REF)
4056 {
4057 int refnum = -c - ESC_REF;
4058 compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
4059 if (refnum > compile_block.top_backref)
4060 compile_block.top_backref = refnum;
4061 length += 2; /* For single back reference */
4062 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
4063 {
4064 ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
4065 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4066 if ((min == 0 && (max == 1 || max == -1)) ||
4067 (min == 1 && max == -1))
4068 length++;
4069 else length += 5;
4070 if (ptr[1] == '?') ptr++;
4071 }
4072 }
4073 continue;
4074
4075 case '^': /* Single-byte metacharacters */
4076 case '.':
4077 case '$':
4078 length++;
4079 lastitemlength = 1;
4080 continue;
4081
4082 case '*': /* These repeats won't be after brackets; */
4083 case '+': /* those are handled separately */
4084 case '?':
4085 length++;
4086 goto POSESSIVE; /* A few lines below */
4087
4088 /* This covers the cases of braced repeats after a single char, metachar,
4089 class, or back reference. */
4090
4091 case '{':
4092 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
4093 ptr = read_repeat_counts(ptr+1, &min, &max, &errorcode);
4094 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4095
4096 /* These special cases just insert one extra opcode */
4097
4098 if ((min == 0 && (max == 1 || max == -1)) ||
4099 (min == 1 && max == -1))
4100 length++;
4101
4102 /* These cases might insert additional copies of a preceding character. */
4103
4104 else
4105 {
4106 if (min != 1)
4107 {
4108 length -= lastitemlength; /* Uncount the original char or metachar */
4109 if (min > 0) length += 3 + lastitemlength;
4110 }
4111 length += lastitemlength + ((max > 0)? 3 : 1);
4112 }
4113
4114 if (ptr[1] == '?') ptr++; /* Needs no extra length */
4115
4116 POSESSIVE: /* Test for possessive quantifier */
4117 if (ptr[1] == '+')
4118 {
4119 ptr++;
4120 length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */
4121 }
4122 continue;
4123
4124 /* An alternation contains an offset to the next branch or ket. If any ims
4125 options changed in the previous branch(es), and/or if we are in a
4126 lookbehind assertion, extra space will be needed at the start of the
4127 branch. This is handled by branch_extra. */
4128
4129 case '|':
4130 length += 1 + LINK_SIZE + branch_extra;
4131 continue;
4132
4133 /* A character class uses 33 characters provided that all the character
4134 values are less than 256. Otherwise, it uses a bit map for low valued
4135 characters, and individual items for others. Don't worry about character
4136 types that aren't allowed in classes - they'll get picked up during the
4137 compile. A character class that contains only one single-byte character
4138 uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
4139 where we can. (In UTF-8 mode we can do this only for chars < 128.) */
4140
4141 case '[':
4142 if (*(++ptr) == '^')
4143 {
4144 class_optcount = 10; /* Greater than one */
4145 ptr++;
4146 }
4147 else class_optcount = 0;
4148
4149 #ifdef SUPPORT_UTF8
4150 class_utf8 = FALSE;
4151 #endif
4152
4153 /* Written as a "do" so that an initial ']' is taken as data */
4154
4155 if (*ptr != 0) do
4156 {
4157 /* Inside \Q...\E everything is literal except \E */
4158
4159 if (inescq)
4160 {
4161 if (*ptr != '\\' || ptr[1] != 'E') goto GET_ONE_CHARACTER;
4162 inescq = FALSE;
4163 ptr += 1;
4164 continue;
4165 }
4166
4167 /* Outside \Q...\E, check for escapes */
4168
4169 if (*ptr == '\\')
4170 {
4171 c = check_escape(&ptr, &errorcode, bracount, options, TRUE);
4172 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4173
4174 /* \b is backspace inside a class; \X is literal */
4175
4176 if (-c == ESC_b) c = '\b';
4177 else if (-c == ESC_X) c = 'X';
4178
4179 /* \Q enters quoting mode */
4180
4181 else if (-c == ESC_Q)
4182 {
4183 inescq = TRUE;
4184 continue;
4185 }
4186
4187 /* Handle escapes that turn into characters */
4188
4189 if (c >= 0) goto NON_SPECIAL_CHARACTER;
4190
4191 /* Escapes that are meta-things. The normal ones just affect the
4192 bit map, but Unicode properties require an XCLASS extended item. */
4193
4194 else
4195 {
4196 class_optcount = 10; /* \d, \s etc; make sure > 1 */
4197 #ifdef SUPPORT_UTF8
4198 if (-c == ESC_p || -c == ESC_P)
4199 {
4200 if (!class_utf8)
4201 {
4202 class_utf8 = TRUE;
4203 length += LINK_SIZE + 2;
4204 }
4205 length += 2;
4206 }
4207 #endif
4208 }
4209 }
4210
4211 /* Check the syntax for POSIX stuff. The bits we actually handle are
4212 checked during the real compile phase. */
4213
4214 else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))
4215 {
4216 ptr++;
4217 class_optcount = 10; /* Make sure > 1 */
4218 }
4219
4220 /* Anything else increments the possible optimization count. We have to
4221 detect ranges here so that we can compute the number of extra ranges for
4222 caseless wide characters when UCP support is available. If there are wide
4223 characters, we are going to have to use an XCLASS, even for single
4224 characters. */
4225
4226 else
4227 {
4228 int d;
4229
4230 GET_ONE_CHARACTER:
4231
4232 #ifdef SUPPORT_UTF8
4233 if (utf8)
4234 {
4235 int extra = 0;
4236 GETCHARLEN(c, ptr, extra);
4237 ptr += extra;
4238 }
4239 else c = *ptr;
4240 #else
4241 c = *ptr;
4242 #endif
4243
4244 /* Come here from handling \ above when it escapes to a char value */
4245
4246 NON_SPECIAL_CHARACTER:
4247 class_optcount++;
4248
4249 d = -1;
4250 if (ptr[1] == '-')
4251 {
4252 uschar const *hyptr = ptr++;
4253 if (ptr[1] == '\\')
4254 {
4255 ptr++;
4256 d = check_escape(&ptr, &errorcode, bracount, options, TRUE);
4257 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4258 if (-d == ESC_b) d = '\b'; /* backspace */
4259 else if (-d == ESC_X) d = 'X'; /* literal X in a class */
4260 }
4261 else if (ptr[1] != 0 && ptr[1] != ']')
4262 {
4263 ptr++;
4264 #ifdef SUPPORT_UTF8
4265 if (utf8)
4266 {
4267 int extra = 0;
4268 GETCHARLEN(d, ptr, extra);
4269 ptr += extra;
4270 }
4271 else
4272 #endif
4273 d = *ptr;
4274 }
4275 if (d < 0) ptr = hyptr; /* go back to hyphen as data */
4276 }
4277
4278 /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
4279 127 for caseless matching, we will need to use an XCLASS. */
4280
4281 if (d >= 0)
4282 {
4283 class_optcount = 10; /* Ensure > 1 */
4284 if (d < c)
4285 {
4286 errorcode = ERR8;
4287 goto PCRE_ERROR_RETURN;
4288 }
4289
4290 #ifdef SUPPORT_UTF8
4291 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4292 {
4293 uschar buffer[6];
4294 if (!class_utf8) /* Allow for XCLASS overhead */
4295 {
4296 class_utf8 = TRUE;
4297 length += LINK_SIZE + 2;
4298 }
4299
4300 #ifdef SUPPORT_UCP
4301 /* If we have UCP support, find out how many extra ranges are
4302 needed to map the other case of characters within this range. We
4303 have to mimic the range optimization here, because extending the
4304 range upwards might push d over a boundary that makes it use
4305 another byte in the UTF-8 representation. */
4306
4307 if ((options & PCRE_CASELESS) != 0)
4308 {
4309 int occ, ocd;
4310 int cc = c;
4311 int origd = d;
4312 while (get_othercase_range(&cc, origd, &occ, &ocd))
4313 {
4314 if (occ >= c && ocd <= d) continue; /* Skip embedded */
4315
4316 if (occ < c && ocd >= c - 1) /* Extend the basic range */
4317 { /* if there is overlap, */
4318 c = occ; /* noting that if occ < c */
4319 continue; /* we can't have ocd > d */
4320 } /* because a subrange is */
4321 if (ocd > d && occ <= d + 1) /* always shorter than */
4322 { /* the basic range. */
4323 d = ocd;
4324 continue;
4325 }
4326
4327 /* An extra item is needed */
4328
4329 length += 1 + _pcre_ord2utf8(occ, buffer) +
4330 ((occ == ocd)? 0 : _pcre_ord2utf8(ocd, buffer));
4331 }
4332 }
4333 #endif /* SUPPORT_UCP */
4334
4335 /* The length of the (possibly extended) range */
4336
4337 length += 1 + _pcre_ord2utf8(c, buffer) + _pcre_ord2utf8(d, buffer);
4338 }
4339 #endif /* SUPPORT_UTF8 */
4340
4341 }
4342
4343 /* We have a single character. There is nothing to be done unless we
4344 are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
4345 allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
4346 support. */
4347
4348 else
4349 {
4350 #ifdef SUPPORT_UTF8
4351 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
4352 {
4353 uschar buffer[6];
4354 class_optcount = 10; /* Ensure > 1 */
4355 if (!class_utf8) /* Allow for XCLASS overhead */
4356 {
4357 class_utf8 = TRUE;
4358 length += LINK_SIZE + 2;
4359 }
4360 #ifdef SUPPORT_UCP
4361 length += (((options & PCRE_CASELESS) != 0)? 2 : 1) *
4362 (1 + _pcre_ord2utf8(c, buffer));
4363 #else /* SUPPORT_UCP */
4364 length += 1 + _pcre_ord2utf8(c, buffer);
4365 #endif /* SUPPORT_UCP */
4366 }
4367 #endif /* SUPPORT_UTF8 */
4368 }
4369 }
4370 }
4371 while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */
4372
4373 if (*ptr == 0) /* Missing terminating ']' */
4374 {
4375 errorcode = ERR6;
4376 goto PCRE_ERROR_RETURN;
4377 }
4378
4379 /* We can optimize when there was only one optimizable character. Repeats
4380 for positive and negated single one-byte chars are handled by the general
4381 code. Here, we handle repeats for the class opcodes. */
4382
4383 if (class_optcount == 1) length += 3; else
4384 {
4385 length += 33;
4386
4387 /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
4388 we also need extra for wrapping the whole thing in a sub-pattern. */
4389
4390 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
4391 {
4392 ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
4393 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4394 if ((min == 0 && (max == 1 || max == -1)) ||
4395 (min == 1 && max == -1))
4396 length++;
4397 else length += 5;
4398 if (ptr[1] == '+')
4399 {
4400 ptr++;
4401 length += 2 + 2*LINK_SIZE;
4402 }
4403 else if (ptr[1] == '?') ptr++;
4404 }
4405 }
4406 continue;
4407
4408 /* Brackets may be genuine groups or special things */
4409
4410 case '(':
4411 branch_newextra = 0;
4412 bracket_length = 1 + LINK_SIZE;
4413
4414 /* Handle special forms of bracket, which all start (? */
4415
4416 if (ptr[1] == '?')
4417 {
4418 int set, unset;
4419 int *optset;
4420
4421 switch (c = ptr[2])
4422 {
4423 /* Skip over comments entirely */
4424 case '#':
4425 ptr += 3;
4426 while (*ptr != 0 && *ptr != ')') ptr++;
4427 if (*ptr == 0)
4428 {
4429 errorcode = ERR18;
4430 goto PCRE_ERROR_RETURN;
4431 }
4432 continue;
4433
4434 /* Non-referencing groups and lookaheads just move the pointer on, and
4435 then behave like a non-special bracket, except that they don't increment
4436 the count of extracting brackets. Ditto for the "once only" bracket,
4437 which is in Perl from version 5.005. */
4438
4439 case ':':
4440 case '=':
4441 case '!':
4442 case '>':
4443 ptr += 2;
4444 break;
4445
4446 /* (?R) specifies a recursive call to the regex, which is an extension
4447 to provide the facility which can be obtained by (?p{perl-code}) in
4448 Perl 5.6. In Perl 5.8 this has become (??{perl-code}).
4449
4450 From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to
4451 the appropriate numbered brackets. This includes both recursive and
4452 non-recursive calls. (?R) is now synonymous with (?0). */
4453
4454 case 'R':
4455 ptr++;
4456
4457 case '0': case '1': case '2': case '3': case '4':
4458 case '5': case '6': case '7': case '8': case '9':
4459 ptr += 2;
4460 if (c != 'R')
4461 while ((digitab[*(++ptr)] & ctype_digit) != 0);
4462 if (*ptr != ')')
4463 {
4464 errorcode = ERR29;
4465 goto PCRE_ERROR_RETURN;
4466 }
4467 length += 1 + LINK_SIZE;
4468
4469 /* If this item is quantified, it will get wrapped inside brackets so
4470 as to use the code for quantified brackets. We jump down and use the
4471 code that handles this for real brackets. */
4472
4473 if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')
4474 {
4475 length += 2 + 2 * LINK_SIZE; /* to make bracketed */
4476 duplength = 5 + 3 * LINK_SIZE;
4477 goto HANDLE_QUANTIFIED_BRACKETS;
4478 }
4479 continue;
4480
4481 /* (?C) is an extension which provides "callout" - to provide a bit of
4482 the functionality of the Perl (?{...}) feature. An optional number may
4483 follow (default is zero). */
4484
4485 case 'C':
4486 ptr += 2;
4487 while ((digitab[*(++ptr)] & ctype_digit) != 0);
4488 if (*ptr != ')')
4489 {
4490 errorcode = ERR39;
4491 goto PCRE_ERROR_RETURN;
4492 }
4493 length += 2 + 2*LINK_SIZE;
4494 continue;
4495
4496 /* Named subpatterns are an extension copied from Python */
4497
4498 case 'P':
4499 ptr += 3;
4500 if (*ptr == '<')
4501 {
4502 const uschar *p; /* Don't amalgamate; some compilers */
4503 p = ++ptr; /* grumble at autoincrement in declaration */
4504 while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;
4505 if (*ptr != '>')
4506 {
4507 errorcode = ERR42;
4508 goto PCRE_ERROR_RETURN;
4509 }
4510 name_count++;
4511 if (ptr - p > max_name_size) max_name_size = (ptr - p);
4512 break;
4513 }
4514
4515 if (*ptr == '=' || *ptr == '>')
4516 {
4517 while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0);
4518 if (*ptr != ')')
4519 {
4520 errorcode = ERR42;
4521 goto PCRE_ERROR_RETURN;
4522 }
4523 break;
4524 }
4525
4526 /* Unknown character after (?P */
4527
4528 errorcode = ERR41;
4529 goto PCRE_ERROR_RETURN;
4530
4531 /* Lookbehinds are in Perl from version 5.005 */
4532
4533 case '<':
4534 ptr += 3;
4535 if (*ptr == '=' || *ptr == '!')
4536 {
4537 branch_newextra = 1 + LINK_SIZE;
4538 length += 1 + LINK_SIZE; /* For the first branch */
4539 break;
4540 }
4541 errorcode = ERR24;
4542 goto PCRE_ERROR_RETURN;
4543
4544 /* Conditionals are in Perl from version 5.005. The bracket must either
4545 be followed by a number (for bracket reference) or by an assertion
4546 group, or (a PCRE extension) by 'R' for a recursion test. */
4547
4548 case '(':
4549 if (ptr[3] == 'R' && ptr[4] == ')')
4550 {
4551 ptr += 4;
4552 length += 3;
4553 }
4554 else if ((digitab[ptr[3]] & ctype_digit) != 0)
4555 {
4556 ptr += 4;
4557 length += 3;
4558 while ((digitab[*ptr] & ctype_digit) != 0) ptr++;
4559 if (*ptr != ')')
4560 {
4561 errorcode = ERR26;
4562 goto PCRE_ERROR_RETURN;
4563 }
4564 }
4565 else /* An assertion must follow */
4566 {
4567 ptr++; /* Can treat like ':' as far as spacing is concerned */
4568 if (ptr[2] != '?' ||
4569 (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
4570 {
4571 ptr += 2; /* To get right offset in message */
4572 errorcode = ERR28;
4573 goto PCRE_ERROR_RETURN;
4574 }
4575 }
4576 break;
4577
4578 /* Else loop checking valid options until ) is met. Anything else is an
4579 error. If we are without any brackets, i.e. at top level, the settings
4580 act as if specified in the options, so massage the options immediately.
4581 This is for backward compatibility with Perl 5.004. */
4582
4583 default:
4584 set = unset = 0;
4585 optset = &set;
4586 ptr += 2;
4587
4588 for (;; ptr++)
4589 {
4590 c = *ptr;
4591 switch (c)
4592 {
4593 case 'i':
4594 *optset |= PCRE_CASELESS;
4595 continue;
4596
4597 case 'm':
4598 *optset |= PCRE_MULTILINE;
4599 continue;
4600
4601 case 's':
4602 *optset |= PCRE_DOTALL;
4603 continue;
4604
4605 case 'x':
4606 *optset |= PCRE_EXTENDED;
4607 continue;
4608
4609 case 'X':
4610 *optset |= PCRE_EXTRA;
4611 continue;
4612
4613 case 'U':
4614 *optset |= PCRE_UNGREEDY;
4615 continue;
4616
4617 case '-':
4618 optset = &unset;
4619 continue;
4620
4621 /* A termination by ')' indicates an options-setting-only item; if
4622 this is at the very start of the pattern (indicated by item_count
4623 being zero), we use it to set the global options. This is helpful
4624 when analyzing the pattern for first characters, etc. Otherwise
4625 nothing is done here and it is handled during the compiling
4626 process.
4627
4628 We allow for more than one options setting at the start. If such
4629 settings do not change the existing options, nothing is compiled.
4630 However, we must leave space just in case something is compiled.
4631 This can happen for pathological sequences such as (?i)(?-i)
4632 because the global options will end up with -i set. The space is
4633 small and not significant. (Before I did this there was a reported
4634 bug with (?i)(?-i) in a machine-generated pattern.)
4635
4636 [Historical note: Up to Perl 5.8, options settings at top level
4637 were always global settings, wherever they appeared in the pattern.
4638 That is, they were equivalent to an external setting. From 5.8
4639 onwards, they apply only to what follows (which is what you might
4640 expect).] */
4641
4642 case ')':
4643 if (item_count == 0)
4644 {
4645 options = (options | set) & (~unset);
4646 set = unset = 0; /* To save length */
4647 item_count--; /* To allow for several */
4648 length += 2;
4649 }
4650
4651 /* Fall through */
4652
4653 /* A termination by ':' indicates the start of a nested group with
4654 the given options set. This is again handled at compile time, but
4655 we must allow for compiled space if any of the ims options are
4656 set. We also have to allow for resetting space at the end of
4657 the group, which is why 4 is added to the length and not just 2.
4658 If there are several changes of options within the same group, this
4659 will lead to an over-estimate on the length, but this shouldn't
4660 matter very much. We also have to allow for resetting options at
4661 the start of any alternations, which we do by setting
4662 branch_newextra to 2. Finally, we record whether the case-dependent
4663 flag ever changes within the regex. This is used by the "required
4664 character" code. */
4665
4666 case ':':
4667 if (((set|unset) & PCRE_IMS) != 0)
4668 {
4669 length += 4;
4670 branch_newextra = 2;
4671 if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
4672 }
4673 goto END_OPTIONS;
4674
4675 /* Unrecognized option character */
4676
4677 default:
4678 errorcode = ERR12;
4679 goto PCRE_ERROR_RETURN;
4680 }
4681 }
4682
4683 /* If we hit a closing bracket, that's it - this is a freestanding
4684 option-setting. We need to ensure that branch_extra is updated if
4685 necessary. The only values branch_newextra can have here are 0 or 2.
4686 If the value is 2, then branch_extra must either be 2 or 5, depending
4687 on whether this is a lookbehind group or not. */
4688
4689 END_OPTIONS:
4690 if (c == ')')
4691 {
4692 if (branch_newextra == 2 &&
4693 (branch_extra == 0 || branch_extra == 1+LINK_SIZE))
4694 branch_extra += branch_newextra;
4695 continue;
4696 }
4697
4698 /* If options were terminated by ':' control comes here. Fall through
4699 to handle the group below. */
4700 }
4701 }
4702
4703 /* Extracting brackets must be counted so we can process escapes in a
4704 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
4705 need an additional 3 bytes of store per extracting bracket. However, if
4706 PCRE_NO_AUTO_CAPTURE is set, unadorned brackets become non-capturing, so we
4707 must leave the count alone (it will always be zero). */
4708
4709 else if ((options & PCRE_NO_AUTO_CAPTURE) == 0)
4710 {
4711 bracount++;
4712 if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
4713 }
4714
4715 /* Save length for computing whole length at end if there's a repeat that
4716 requires duplication of the group. Also save the current value of
4717 branch_extra, and start the new group with the new value. If non-zero, this
4718 will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
4719
4720 if (brastackptr >= sizeof(brastack)/sizeof(int))
4721 {
4722 errorcode = ERR19;
4723 goto PCRE_ERROR_RETURN;
4724 }
4725
4726 bralenstack[brastackptr] = branch_extra;
4727 branch_extra = branch_newextra;
4728
4729 brastack[brastackptr++] = length;
4730 length += bracket_length;
4731 continue;
4732
4733 /* Handle ket. Look for subsequent max/min; for certain sets of values we
4734 have to replicate this bracket up to that many times. If brastackptr is
4735 0 this is an unmatched bracket which will generate an error, but take care
4736 not to try to access brastack[-1] when computing the length and restoring
4737 the branch_extra value. */
4738
4739 case ')':
4740 length += 1 + LINK_SIZE;
4741 if (brastackptr > 0)
4742 {
4743 duplength = length - brastack[--brastackptr];
4744 branch_extra = bralenstack[brastackptr];
4745 }
4746 else duplength = 0;
4747
4748 /* The following code is also used when a recursion such as (?3) is
4749 followed by a quantifier, because in that case, it has to be wrapped inside
4750 brackets so that the quantifier works. The value of duplength must be
4751 set before arrival. */
4752
4753 HANDLE_QUANTIFIED_BRACKETS:
4754
4755 /* Leave ptr at the final char; for read_repeat_counts this happens
4756 automatically; for the others we need an increment. */
4757
4758 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
4759 {
4760 ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
4761 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4762 }
4763 else if (c == '*') { min = 0; max = -1; ptr++; }
4764 else if (c == '+') { min = 1; max = -1; ptr++; }
4765 else if (c == '?') { min = 0; max = 1; ptr++; }
4766 else { min = 1; max = 1; }
4767
4768 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
4769 group, and if the maximum is greater than zero, we have to replicate
4770 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
4771 bracket set. */
4772
4773 if (min == 0)
4774 {
4775 length++;
4776 if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
4777 }
4778
4779 /* When the minimum is greater than zero, we have to replicate up to
4780 minval-1 times, with no additions required in the copies. Then, if there
4781 is a limited maximum we have to replicate up to maxval-1 times allowing
4782 for a BRAZERO item before each optional copy and nesting brackets for all
4783 but one of the optional copies. */
4784
4785 else
4786 {
4787 length += (min - 1) * duplength;
4788 if (max > min) /* Need this test as max=-1 means no limit */
4789 length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
4790 - (2 + 2*LINK_SIZE);
4791 }
4792
4793 /* Allow space for once brackets for "possessive quantifier" */
4794
4795 if (ptr[1] == '+')
4796 {
4797 ptr++;
4798 length += 2 + 2*LINK_SIZE;
4799 }
4800 continue;
4801
4802 /* Non-special character. It won't be space or # in extended mode, so it is
4803 always a genuine character. If we are in a \Q...\E sequence, check for the
4804 end; if not, we have a literal. */
4805
4806 default:
4807 NORMAL_CHAR:
4808
4809 if (inescq && c == '\\' && ptr[1] == 'E')
4810 {
4811 inescq = FALSE;
4812 ptr++;
4813 continue;
4814 }
4815
4816 length += 2; /* For a one-byte character */
4817 lastitemlength = 1; /* Default length of last item for repeats */
4818
4819 /* In UTF-8 mode, check for additional bytes. */
4820
4821 #ifdef SUPPORT_UTF8
4822 if (utf8 && (c & 0xc0) == 0xc0)
4823 {
4824 while ((ptr[1] & 0xc0) == 0x80) /* Can't flow over the end */
4825 { /* because the end is marked */
4826 lastitemlength++; /* by a zero byte. */
4827 length++;
4828 ptr++;
4829 }
4830 }
4831 #endif
4832
4833 continue;
4834 }
4835 }
4836
4837 length += 2 + LINK_SIZE; /* For final KET and END */
4838
4839 if ((options & PCRE_AUTO_CALLOUT) != 0)
4840 length += 2 + 2*LINK_SIZE; /* For final callout */
4841
4842 if (length > MAX_PATTERN_SIZE)
4843 {
4844 errorcode = ERR20;
4845 goto PCRE_EARLY_ERROR_RETURN;
4846 }
4847
4848 /* Compute the size of data block needed and get it, either from malloc or
4849 externally provided function. */
4850
4851 size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
4852 re = (real_pcre *)(pcre_malloc)(size);
4853
4854 if (re == NULL)
4855 {
4856 errorcode = ERR21;
4857 goto PCRE_EARLY_ERROR_RETURN;
4858 }
4859
4860 /* Put in the magic number, and save the sizes, options, and character table
4861 pointer. NULL is used for the default character tables. The nullpad field is at
4862 the end; it's there to help in the case when a regex compiled on a system with
4863 4-byte pointers is run on another with 8-byte pointers. */
4864
4865 re->magic_number = MAGIC_NUMBER;
4866 re->size = size;
4867 re->options = options;
4868 re->dummy1 = 0;
4869 re->name_table_offset = sizeof(real_pcre);
4870 re->name_entry_size = max_name_size + 3;
4871 re->name_count = name_count;
4872 re->ref_count = 0;
4873 re->tables = (tables == _pcre_default_tables)? NULL : tables;
4874 re->nullpad = NULL;
4875
4876 /* The starting points of the name/number translation table and of the code are
4877 passed around in the compile data block. */
4878
4879 compile_block.names_found = 0;
4880 compile_block.name_entry_size = max_name_size + 3;
4881 compile_block.name_table = (uschar *)re + re->name_table_offset;
4882 codestart = compile_block.name_table + re->name_entry_size * re->name_count;
4883 compile_block.start_code = codestart;
4884 compile_block.start_pattern = (const uschar *)pattern;
4885 compile_block.req_varyopt = 0;
4886 compile_block.nopartial = FALSE;
4887
4888 /* Set up a starting, non-extracting bracket, then compile the expression. On
4889 error, errorcode will be set non-zero, so we don't need to look at the result
4890 of the function here. */
4891
4892 ptr = (const uschar *)pattern;
4893 code = (uschar *)codestart;
4894 *code = OP_BRA;
4895 bracount = 0;
4896 (void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,
4897 &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, &compile_block);
4898 re->top_bracket = bracount;
4899 re->top_backref = compile_block.top_backref;
4900
4901 if (compile_block.nopartial) re->options |= PCRE_NOPARTIAL;
4902
4903 /* If not reached end of pattern on success, there's an excess bracket. */
4904
4905 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
4906
4907 /* Fill in the terminating state and check for disastrous overflow, but
4908 if debugging, leave the test till after things are printed out. */
4909
4910 *code++ = OP_END;
4911
4912 #ifndef DEBUG
4913 if (code - codestart > length) errorcode = ERR23;
4914 #endif
4915
4916 /* Give an error if there's back reference to a non-existent capturing
4917 subpattern. */
4918
4919 if (re->top_backref > re->top_bracket) errorcode = ERR15;
4920
4921 /* Failed to compile, or error while post-processing */
4922
4923 if (errorcode != 0)
4924 {
4925 (pcre_free)(re);
4926 PCRE_ERROR_RETURN:
4927 *erroroffset = ptr - (const uschar *)pattern;
4928 PCRE_EARLY_ERROR_RETURN:
4929 *errorptr = error_texts[errorcode];
4930 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
4931 return NULL;
4932 }
4933
4934 /* If the anchored option was not passed, set the flag if we can determine that
4935 the pattern is anchored by virtue of ^ characters or \A or anything else (such
4936 as starting with .* when DOTALL is set).
4937
4938 Otherwise, if we know what the first character has to be, save it, because that
4939 speeds up unanchored matches no end. If not, see if we can set the
4940 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
4941 start with ^, and also when all branches start with .* for non-DOTALL matches.
4942 */
4943
4944 if ((options & PCRE_ANCHORED) == 0)
4945 {
4946 int temp_options = options;
4947 if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map))
4948 re->options |= PCRE_ANCHORED;
4949 else
4950 {
4951 if (firstbyte < 0)
4952 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
4953 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
4954 {
4955 int ch = firstbyte & 255;
4956 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
4957 compile_block.fcc[ch] == ch)? ch : firstbyte;
4958 re->options |= PCRE_FIRSTSET;
4959 }
4960 else if (is_startline(codestart, 0, compile_block.backref_map))
4961 re->options |= PCRE_STARTLINE;
4962 }
4963 }
4964
4965 /* For an anchored pattern, we use the "required byte" only if it follows a
4966 variable length item in the regex. Remove the caseless flag for non-caseable
4967 bytes. */
4968
4969 if (reqbyte >= 0 &&
4970 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
4971 {
4972 int ch = reqbyte & 255;
4973 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
4974 compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
4975 re->options |= PCRE_REQCHSET;
4976 }
4977
4978 /* Print out the compiled data for debugging */
4979
4980 #ifdef DEBUG
4981
4982 printf("Length = %d top_bracket = %d top_backref = %d\n",
4983 length, re->top_bracket, re->top_backref);
4984
4985 if (re->options != 0)
4986 {
4987 printf("%s%s%s%s%s%s%s%s%s%s\n",
4988 ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
4989 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
4990 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
4991 ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
4992 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
4993 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
4994 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
4995 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
4996 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
4997 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
4998 }
4999
5000 if ((re->options & PCRE_FIRSTSET) != 0)
5001 {
5002 int ch = re->first_byte & 255;
5003 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
5004 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5005 else printf("First char = \\x%02x%s\n", ch, caseless);
5006 }
5007
5008 if ((re->options & PCRE_REQCHSET) != 0)
5009 {
5010 int ch = re->req_byte & 255;
5011 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
5012 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5013 else printf("Req char = \\x%02x%s\n", ch, caseless);
5014 }
5015
5016 _pcre_printint(re, stdout);
5017
5018 /* This check is done here in the debugging case so that the code that
5019 was compiled can be seen. */
5020
5021 if (code - codestart > length)
5022 {
5023 (pcre_free)(re);
5024 *errorptr = error_texts[ERR23];
5025 *erroroffset = ptr - (uschar *)pattern;
5026 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
5027 return NULL;
5028 }
5029 #endif
5030
5031 return (pcre *)re;
5032 }
5033
5034 /* End of pcre_compile.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12