/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 81 - (show annotations) (download)
Sat Feb 24 21:40:59 2007 UTC (7 years, 4 months ago) by nigel
File MIME type: text/plain
File size: 159806 byte(s)
Load pcre-6.2 into code/trunk.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2005 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #include "pcre_internal.h"
46
47
48 /*************************************************
49 * Code parameters and static tables *
50 *************************************************/
51
52 /* Maximum number of items on the nested bracket stacks at compile time. This
53 applies to the nesting of all kinds of parentheses. It does not limit
54 un-nested, non-capturing parentheses. This number can be made bigger if
55 necessary - it is used to dimension one int and one unsigned char vector at
56 compile time. */
57
58 #define BRASTACK_SIZE 200
59
60
61 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
62 are simple data values; negative values are for special things like \d and so
63 on. Zero means further processing is needed (for things like \x), or the escape
64 is invalid. */
65
66 #if !EBCDIC /* This is the "normal" table for ASCII systems */
67 static const short int escapes[] = {
68 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
69 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
70 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
71 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
72 -ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
73 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
74 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
75 0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
76 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
77 0, 0, -ESC_z /* x - z */
78 };
79
80 #else /* This is the "abnormal" table for EBCDIC systems */
81 static const short int escapes[] = {
82 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
83 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
84 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
85 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
86 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
87 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
88 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
89 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
90 /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
91 /* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p,
92 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
93 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
94 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
95 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
96 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
97 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
98 /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
99 /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
100 /* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0,
101 /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
102 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
103 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
104 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
105 };
106 #endif
107
108
109 /* Tables of names of POSIX character classes and their lengths. The list is
110 terminated by a zero length entry. The first three must be alpha, upper, lower,
111 as this is assumed for handling case independence. */
112
113 static const char *const posix_names[] = {
114 "alpha", "lower", "upper",
115 "alnum", "ascii", "blank", "cntrl", "digit", "graph",
116 "print", "punct", "space", "word", "xdigit" };
117
118 static const uschar posix_name_lengths[] = {
119 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
120
121 /* Table of class bit maps for each POSIX class; up to three may be combined
122 to form the class. The table for [:blank:] is dynamically modified to remove
123 the vertical space characters. */
124
125 static const int posix_class_maps[] = {
126 cbit_lower, cbit_upper, -1, /* alpha */
127 cbit_lower, -1, -1, /* lower */
128 cbit_upper, -1, -1, /* upper */
129 cbit_digit, cbit_lower, cbit_upper, /* alnum */
130 cbit_print, cbit_cntrl, -1, /* ascii */
131 cbit_space, -1, -1, /* blank - a GNU extension */
132 cbit_cntrl, -1, -1, /* cntrl */
133 cbit_digit, -1, -1, /* digit */
134 cbit_graph, -1, -1, /* graph */
135 cbit_print, -1, -1, /* print */
136 cbit_punct, -1, -1, /* punct */
137 cbit_space, -1, -1, /* space */
138 cbit_word, -1, -1, /* word - a Perl extension */
139 cbit_xdigit,-1, -1 /* xdigit */
140 };
141
142
/* Compile-time error messages, indexed by error number. The element type is
"char *" because these strings are handed to the outside world. */

static const char *error_texts[] = {
  /*  0 */ "no error",
  /*  1 */ "\\ at end of pattern",
  /*  2 */ "\\c at end of pattern",
  /*  3 */ "unrecognized character follows \\",
  /*  4 */ "numbers out of order in {} quantifier",

  /*  5 */ "number too big in {} quantifier",
  /*  6 */ "missing terminating ] for character class",
  /*  7 */ "invalid escape sequence in character class",
  /*  8 */ "range out of order in character class",
  /*  9 */ "nothing to repeat",

  /* 10 */ "operand of unlimited repeat could match the empty string",
  /* 11 */ "internal error: unexpected repeat",
  /* 12 */ "unrecognized character after (?",
  /* 13 */ "POSIX named classes are supported only within a class",
  /* 14 */ "missing )",

  /* 15 */ "reference to non-existent subpattern",
  /* 16 */ "erroffset passed as NULL",
  /* 17 */ "unknown option bit(s) set",
  /* 18 */ "missing ) after comment",
  /* 19 */ "parentheses nested too deeply",

  /* 20 */ "regular expression too large",
  /* 21 */ "failed to get memory",
  /* 22 */ "unmatched parentheses",
  /* 23 */ "internal error: code overflow",
  /* 24 */ "unrecognized character after (?<",

  /* 25 */ "lookbehind assertion is not fixed length",
  /* 26 */ "malformed number after (?(",
  /* 27 */ "conditional group contains more than two branches",
  /* 28 */ "assertion expected after (?(",
  /* 29 */ "(?R or (?digits must be followed by )",

  /* 30 */ "unknown POSIX class name",
  /* 31 */ "POSIX collating elements are not supported",
  /* 32 */ "this version of PCRE is not compiled with PCRE_UTF8 support",
  /* 33 */ "spare error",
  /* 34 */ "character value in \\x{...} sequence is too large",

  /* 35 */ "invalid condition (?(0)",
  /* 36 */ "\\C not allowed in lookbehind assertion",
  /* 37 */ "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
  /* 38 */ "number after (?C is > 255",
  /* 39 */ "closing ) for (?C expected",

  /* 40 */ "recursive call could loop indefinitely",
  /* 41 */ "unrecognized character after (?P",
  /* 42 */ "syntax error after (?P",
  /* 43 */ "two named groups have the same name",
  /* 44 */ "invalid UTF-8 string",

  /* 45 */ "support for \\P, \\p, and \\X has not been compiled",
  /* 46 */ "malformed \\P or \\p sequence",
  /* 47 */ "unknown property name after \\P or \\p"
};
205
206
/* Private table that classifies decimal and hexadecimal digits while a
pattern is being compiled. The locale-dependent tables in chartables may mark
arbitrary characters as digits, whereas the compiling code expects only 0-9,
a-z, and A-Z to be involved, so it cannot use them. The table costs 256 bytes
but was measured to be faster than explicit character value tests, which
matters to applications that need fast compilation as well as fast matching.

The bit values are the same as those used in chartables, so that ctype_digit
and ctype_xdigit can be used to test the entries:

  0x04   decimal digit
  0x08   hexadecimal digit
*/

#if !EBCDIC    /* ASCII systems */
static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*   0-  7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* space - ' */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  ( - /  */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,   /*  0 - 7  */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,   /*  8 - ?  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /*  @ - G  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  H - O  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  P - W  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  X - _  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /*  ` - g  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  h - o  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  p - w  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  x -127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 128-135 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 136-143 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 144-151 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 152-159 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 160-167 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 168-175 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 176-183 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 184-191 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 192-199 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 200-207 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 208-215 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 216-223 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 224-231 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 232-239 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 240-247 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00    /* 248-255 */
  };

#else          /* EBCDIC systems */
static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*   0-  7  0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*   8- 15    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  16- 23 10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  24- 31    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  32- 39 20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  40- 47    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  48- 55 30 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  56- 63    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*    - 71 40 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  72- |     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  & - 87 50 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  88-       */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  - -103 60 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 104- ?     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 112-119 70 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 120- "     */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /* 128- g  80 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  h -143    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 144- p  90 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  q -159    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 160- x  A0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  y -175    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  ^ -183 B0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 184-191    */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /*  { - G  C0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  H -207    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  } - P  D0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  Q -223    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  \ - X  E0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  Y -239    */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,   /*  0 - 7  F0 */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00    /*  8 -255    */
  };

/* Partial duplicate of the character table, needed only in the EBCDIC build */

static const unsigned char ebcdic_chartab[] =
  {
  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00,   /*   0-  7 */
  0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00,   /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,   /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,   /*  32- 39 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  40- 47 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  48- 55 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  56- 63 */
  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*    - 71 */
  0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80,   /*  72- |  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  & - 87 */
  0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00,   /*  88-    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  - -103 */
  0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80,   /* 104- ?  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 112-119 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 120- "  */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,   /* 128- g  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /*  h -143 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,   /* 144- p  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /*  q -159 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,   /* 160- x  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /*  y -175 */
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  ^ -183 */
  0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00,   /* 184-191 */
  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,   /*  { - G  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /*  H -207 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,   /*  } - P  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /*  Q -223 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,   /*  \ - X  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /*  Y -239 */
  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,   /*  0 - 7  */
  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00    /*  8 -255 */
  };
#endif
329
330
331 /* Definition to allow mutual recursion */
332
333 static BOOL
334 compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,
335 int *, int *, branch_chain *, compile_data *);
336
337
338
339 /*************************************************
340 * Handle escapes *
341 *************************************************/
342
343 /* This function is called when a \ has been encountered. It either returns a
344 positive value for a simple escape such as \n, or a negative value which
345 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
346 a positive value greater than 255 may be returned. On entry, ptr is pointing at
347 the \. On exit, it is on the final character of the escape sequence.
348
349 Arguments:
350 ptrptr points to the pattern position pointer
351 errorcodeptr points to the errorcode variable
352 bracount number of previous extracting brackets
353 options the options bits
354 isclass TRUE if inside a character class
355
356 Returns: zero or positive => a data character
357 negative => a special escape sequence
358 on error, errorptr is set
359 */
360
361 static int
362 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
363 int options, BOOL isclass)
364 {
365 const uschar *ptr = *ptrptr;
366 int c, i;
367
368 /* If backslash is at the end of the pattern, it's an error. */
369
370 c = *(++ptr);
371 if (c == 0) *errorcodeptr = ERR1;
372
373 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
374 a table. A non-zero result is something that can be returned immediately.
375 Otherwise further processing may be required. */
376
377 #if !EBCDIC /* ASCII coding */
378 else if (c < '0' || c > 'z') {} /* Not alphameric */
379 else if ((i = escapes[c - '0']) != 0) c = i;
380
381 #else /* EBCDIC coding */
382 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
383 else if ((i = escapes[c - 0x48]) != 0) c = i;
384 #endif
385
386 /* Escapes that need further processing, or are illegal. */
387
388 else
389 {
390 const uschar *oldptr;
391 switch (c)
392 {
393 /* A number of Perl escapes are not handled by PCRE. We give an explicit
394 error. */
395
396 case 'l':
397 case 'L':
398 case 'N':
399 case 'u':
400 case 'U':
401 *errorcodeptr = ERR37;
402 break;
403
404 /* The handling of escape sequences consisting of a string of digits
405 starting with one that is not zero is not straightforward. By experiment,
406 the way Perl works seems to be as follows:
407
408 Outside a character class, the digits are read as a decimal number. If the
409 number is less than 10, or if there are that many previous extracting
410 left brackets, then it is a back reference. Otherwise, up to three octal
411 digits are read to form an escaped byte. Thus \123 is likely to be octal
412 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
413 value is greater than 377, the least significant 8 bits are taken. Inside a
414 character class, \ followed by a digit is always an octal number. */
415
416 case '1': case '2': case '3': case '4': case '5':
417 case '6': case '7': case '8': case '9':
418
419 if (!isclass)
420 {
421 oldptr = ptr;
422 c -= '0';
423 while ((digitab[ptr[1]] & ctype_digit) != 0)
424 c = c * 10 + *(++ptr) - '0';
425 if (c < 10 || c <= bracount)
426 {
427 c = -(ESC_REF + c);
428 break;
429 }
430 ptr = oldptr; /* Put the pointer back and fall through */
431 }
432
433 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
434 generates a binary zero byte and treats the digit as a following literal.
435 Thus we have to pull back the pointer by one. */
436
437 if ((c = *ptr) >= '8')
438 {
439 ptr--;
440 c = 0;
441 break;
442 }
443
444 /* \0 always starts an octal number, but we may drop through to here with a
445 larger first octal digit. */
446
447 case '0':
448 c -= '0';
449 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
450 c = c * 8 + *(++ptr) - '0';
451 c &= 255; /* Take least significant 8 bits */
452 break;
453
454 /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
455 which can be greater than 0xff, but only if the ddd are hex digits. */
456
457 case 'x':
458 #ifdef SUPPORT_UTF8
459 if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
460 {
461 const uschar *pt = ptr + 2;
462 register int count = 0;
463 c = 0;
464 while ((digitab[*pt] & ctype_xdigit) != 0)
465 {
466 int cc = *pt++;
467 count++;
468 #if !EBCDIC /* ASCII coding */
469 if (cc >= 'a') cc -= 32; /* Convert to upper case */
470 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
471 #else /* EBCDIC coding */
472 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
473 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
474 #endif
475 }
476 if (*pt == '}')
477 {
478 if (c < 0 || count > 8) *errorcodeptr = ERR34;
479 ptr = pt;
480 break;
481 }
482 /* If the sequence of hex digits does not end with '}', then we don't
483 recognize this construct; fall through to the normal \x handling. */
484 }
485 #endif
486
487 /* Read just a single hex char */
488
489 c = 0;
490 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
491 {
492 int cc; /* Some compilers don't like ++ */
493 cc = *(++ptr); /* in initializers */
494 #if !EBCDIC /* ASCII coding */
495 if (cc >= 'a') cc -= 32; /* Convert to upper case */
496 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
497 #else /* EBCDIC coding */
498 if (cc <= 'z') cc += 64; /* Convert to upper case */
499 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
500 #endif
501 }
502 break;
503
504 /* Other special escapes not starting with a digit are straightforward */
505
506 case 'c':
507 c = *(++ptr);
508 if (c == 0)
509 {
510 *errorcodeptr = ERR2;
511 return 0;
512 }
513
514 /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
515 is ASCII-specific, but then the whole concept of \cx is ASCII-specific.
516 (However, an EBCDIC equivalent has now been added.) */
517
518 #if !EBCDIC /* ASCII coding */
519 if (c >= 'a' && c <= 'z') c -= 32;
520 c ^= 0x40;
521 #else /* EBCDIC coding */
522 if (c >= 'a' && c <= 'z') c += 64;
523 c ^= 0xC0;
524 #endif
525 break;
526
527 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
528 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
529 for Perl compatibility, it is a literal. This code looks a bit odd, but
530 there used to be some cases other than the default, and there may be again
531 in future, so I haven't "optimized" it. */
532
533 default:
534 if ((options & PCRE_EXTRA) != 0) switch(c)
535 {
536 default:
537 *errorcodeptr = ERR3;
538 break;
539 }
540 break;
541 }
542 }
543
544 *ptrptr = ptr;
545 return c;
546 }
547
548
549
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* This function is called after \P or \p has been encountered, provided that
PCRE is compiled with support for Unicode properties. On entry, ptrptr is
pointing at the P or p. On exit, it is pointing at the final character of the
escape sequence.

The property name is either the single character that follows, or a one- or
two-character name in braces, optionally preceded by ^ for negation. The name
is looked up in _pcre_utt by binary chop.

Arguments:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
                   (it is always set, including on the error returns)
  errorcodeptr   points to the error code variable

Returns:         the value from the property table for a recognized name;
                 -1 with ERR46 set for a malformed sequence;
                 -1 with ERR47 set for an unknown property name
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *errorcodeptr)
{
int c, i, bot, top;
const uschar *ptr = *ptrptr;
char name[4];

/* Give the caller's flag a defined value before any exit can be taken. */

*negptr = FALSE;

c = *(++ptr);
if (c == 0) goto ERROR_RETURN;

/* The braced form: at most two name characters are accepted before the
closing brace. */

if (c == '{')
  {
  if (ptr[1] == '^')
    {
    *negptr = TRUE;
    ptr++;
    }

  for (i = 0; i <= 2; i++)
    {
    c = *(++ptr);
    if (c == 0) goto ERROR_RETURN;
    if (c == '}') break;
    name[i] = c;
    }

  /* No closing brace within three characters. Try to distinguish the error
  cases: if a brace turns up later the name is merely unknown; if the pattern
  ends first the sequence is malformed. */

  if (c != '}')
    {
    while (*(++ptr) != 0 && *ptr != '}');
    if (*ptr == '}') goto UNKNOWN_RETURN; else goto ERROR_RETURN;
    }

  name[i] = 0;
  }

/* Otherwise there is just one following character */

else
  {
  name[0] = c;
  name[1] = 0;
  }

*ptrptr = ptr;

/* Search for a recognized property name using binary chop */

bot = 0;
top = _pcre_utt_size;

while (bot < top)
  {
  i = (bot + top)/2;
  c = strcmp(name, _pcre_utt[i].name);
  if (c == 0) return _pcre_utt[i].value;
  if (c > 0) bot = i + 1; else top = i;
  }

UNKNOWN_RETURN:
*errorcodeptr = ERR47;
*ptrptr = ptr;
return -1;

ERROR_RETURN:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
639
640
641
642
643 /*************************************************
644 * Check for counted repeat *
645 *************************************************/
646
647 /* This function is called when a '{' is encountered in a place where it might
648 start a quantifier. It looks ahead to see if it really is a quantifier or not.
649 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
650 where the ddds are digits.
651
652 Arguments:
653 p pointer to the first char after '{'
654
655 Returns: TRUE or FALSE
656 */
657
658 static BOOL
659 is_counted_repeat(const uschar *p)
660 {
661 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
662 while ((digitab[*p] & ctype_digit) != 0) p++;
663 if (*p == '}') return TRUE;
664
665 if (*p++ != ',') return FALSE;
666 if (*p == '}') return TRUE;
667
668 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
669 while ((digitab[*p] & ctype_digit) != 0) p++;
670
671 return (*p == '}');
672 }
673
674
675
676 /*************************************************
677 * Read repeat counts *
678 *************************************************/
679
680 /* Read an item of the form {n,m} and return the values. This is called only
681 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
682 so the syntax is guaranteed to be correct, but we need to check the values.
683
684 Arguments:
685 p pointer to first char after '{'
686 minp pointer to int for min
687 maxp pointer to int for max
688 returned as -1 if no max
689 errorcodeptr points to error code variable
690
691 Returns: pointer to '}' on success;
692 current ptr on error, with errorcodeptr set non-zero
693 */
694
695 static const uschar *
696 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
697 {
698 int min = 0;
699 int max = -1;
700
701 /* Read the minimum value and do a paranoid check: a negative value indicates
702 an integer overflow. */
703
704 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
705 if (min < 0 || min > 65535)
706 {
707 *errorcodeptr = ERR5;
708 return p;
709 }
710
711 /* Read the maximum value if there is one, and again do a paranoid on its size.
712 Also, max must not be less than min. */
713
714 if (*p == '}') max = min; else
715 {
716 if (*(++p) != '}')
717 {
718 max = 0;
719 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
720 if (max < 0 || max > 65535)
721 {
722 *errorcodeptr = ERR5;
723 return p;
724 }
725 if (max < min)
726 {
727 *errorcodeptr = ERR4;
728 return p;
729 }
730 }
731 }
732
733 /* Fill in the required variables, and pass back the pointer to the terminating
734 '}'. */
735
736 *minp = min;
737 *maxp = max;
738 return p;
739 }
740
741
742
743 /*************************************************
744 * Find first significant op code *
745 *************************************************/
746
747 /* This is called by several functions that scan a compiled expression looking
748 for a fixed first character, or an anchoring op code etc. It skips over things
749 that do not influence this. For some calls, a change of option is important.
750 For some calls, it makes sense to skip negative forward and all backward
751 assertions, and also the \b assertion; for others it does not.
752
753 Arguments:
754 code pointer to the start of the group
755 options pointer to external options
756 optbit the option bit whose changing is significant, or
757 zero if none are
758 skipassert TRUE if certain assertions are to be skipped
759
760 Returns: pointer to the first significant opcode
761 */
762
763 static const uschar*
764 first_significant_code(const uschar *code, int *options, int optbit,
765 BOOL skipassert)
766 {
767 for (;;)
768 {
769 switch ((int)*code)
770 {
771 case OP_OPT:
772 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
773 *options = (int)code[1];
774 code += 2;
775 break;
776
777 case OP_ASSERT_NOT:
778 case OP_ASSERTBACK:
779 case OP_ASSERTBACK_NOT:
780 if (!skipassert) return code;
781 do code += GET(code, 1); while (*code == OP_ALT);
782 code += _pcre_OP_lengths[*code];
783 break;
784
785 case OP_WORD_BOUNDARY:
786 case OP_NOT_WORD_BOUNDARY:
787 if (!skipassert) return code;
788 /* Fall through */
789
790 case OP_CALLOUT:
791 case OP_CREF:
792 case OP_BRANUMBER:
793 code += _pcre_OP_lengths[*code];
794 break;
795
796 default:
797 return code;
798 }
799 }
800 /* Control never reaches here */
801 }
802
803
804
805
806 /*************************************************
807 * Find the fixed length of a pattern *
808 *************************************************/
809
810 /* Scan a pattern and compute the fixed length of subject that will match it,
811 if the length is fixed. This is needed for dealing with backward assertions.
812 In UTF8 mode, the result is in characters rather than bytes.
813
814 Arguments:
815 code points to the start of the pattern (the bracket)
816 options the compiling options
817
818 Returns: the fixed length, or -1 if there is no fixed length,
819 or -2 if \C was encountered
820 */
821
822 static int
823 find_fixedlength(uschar *code, int options)
824 {
825 int length = -1;
826
827 register int branchlength = 0;
828 register uschar *cc = code + 1 + LINK_SIZE;
829
830 /* Scan along the opcodes for this branch. If we get to the end of the
831 branch, check the length against that of the other branches. */
832
833 for (;;)
834 {
835 int d;
836 register int op = *cc;
837 if (op >= OP_BRA) op = OP_BRA;
838
839 switch (op)
840 {
841 case OP_BRA:
842 case OP_ONCE:
843 case OP_COND:
844 d = find_fixedlength(cc, options);
845 if (d < 0) return d;
846 branchlength += d;
847 do cc += GET(cc, 1); while (*cc == OP_ALT);
848 cc += 1 + LINK_SIZE;
849 break;
850
851 /* Reached end of a branch; if it's a ket it is the end of a nested
852 call. If it's ALT it is an alternation in a nested call. If it is
853 END it's the end of the outer call. All can be handled by the same code. */
854
855 case OP_ALT:
856 case OP_KET:
857 case OP_KETRMAX:
858 case OP_KETRMIN:
859 case OP_END:
860 if (length < 0) length = branchlength;
861 else if (length != branchlength) return -1;
862 if (*cc != OP_ALT) return length;
863 cc += 1 + LINK_SIZE;
864 branchlength = 0;
865 break;
866
867 /* Skip over assertive subpatterns */
868
869 case OP_ASSERT:
870 case OP_ASSERT_NOT:
871 case OP_ASSERTBACK:
872 case OP_ASSERTBACK_NOT:
873 do cc += GET(cc, 1); while (*cc == OP_ALT);
874 /* Fall through */
875
876 /* Skip over things that don't match chars */
877
878 case OP_REVERSE:
879 case OP_BRANUMBER:
880 case OP_CREF:
881 case OP_OPT:
882 case OP_CALLOUT:
883 case OP_SOD:
884 case OP_SOM:
885 case OP_EOD:
886 case OP_EODN:
887 case OP_CIRC:
888 case OP_DOLL:
889 case OP_NOT_WORD_BOUNDARY:
890 case OP_WORD_BOUNDARY:
891 cc += _pcre_OP_lengths[*cc];
892 break;
893
894 /* Handle literal characters */
895
896 case OP_CHAR:
897 case OP_CHARNC:
898 branchlength++;
899 cc += 2;
900 #ifdef SUPPORT_UTF8
901 if ((options & PCRE_UTF8) != 0)
902 {
903 while ((*cc & 0xc0) == 0x80) cc++;
904 }
905 #endif
906 break;
907
908 /* Handle exact repetitions. The count is already in characters, but we
909 need to skip over a multibyte character in UTF8 mode. */
910
911 case OP_EXACT:
912 branchlength += GET2(cc,1);
913 cc += 4;
914 #ifdef SUPPORT_UTF8
915 if ((options & PCRE_UTF8) != 0)
916 {
917 while((*cc & 0x80) == 0x80) cc++;
918 }
919 #endif
920 break;
921
922 case OP_TYPEEXACT:
923 branchlength += GET2(cc,1);
924 cc += 4;
925 break;
926
927 /* Handle single-char matchers */
928
929 case OP_PROP:
930 case OP_NOTPROP:
931 cc++;
932 /* Fall through */
933
934 case OP_NOT_DIGIT:
935 case OP_DIGIT:
936 case OP_NOT_WHITESPACE:
937 case OP_WHITESPACE:
938 case OP_NOT_WORDCHAR:
939 case OP_WORDCHAR:
940 case OP_ANY:
941 branchlength++;
942 cc++;
943 break;
944
945 /* The single-byte matcher isn't allowed */
946
947 case OP_ANYBYTE:
948 return -2;
949
950 /* Check a class for variable quantification */
951
952 #ifdef SUPPORT_UTF8
953 case OP_XCLASS:
954 cc += GET(cc, 1) - 33;
955 /* Fall through */
956 #endif
957
958 case OP_CLASS:
959 case OP_NCLASS:
960 cc += 33;
961
962 switch (*cc)
963 {
964 case OP_CRSTAR:
965 case OP_CRMINSTAR:
966 case OP_CRQUERY:
967 case OP_CRMINQUERY:
968 return -1;
969
970 case OP_CRRANGE:
971 case OP_CRMINRANGE:
972 if (GET2(cc,1) != GET2(cc,3)) return -1;
973 branchlength += GET2(cc,1);
974 cc += 5;
975 break;
976
977 default:
978 branchlength++;
979 }
980 break;
981
982 /* Anything else is variable length */
983
984 default:
985 return -1;
986 }
987 }
988 /* Control never gets here */
989 }
990
991
992
993
994 /*************************************************
995 * Scan compiled regex for numbered bracket *
996 *************************************************/
997
998 /* This little function scans through a compiled pattern until it finds a
999 capturing bracket with the given number, or reaches OP_END.
1000
1001 Arguments:
1002   code        points to start of expression
1003   utf8        TRUE in UTF-8 mode
1004   number      the required bracket number
1005
1006 Returns:      pointer to the opcode for the bracket, or NULL if not found
1007 */
1008
1009 static const uschar *
1010 find_bracket(const uschar *code, BOOL utf8, int number)
1011 {
1012 #ifndef SUPPORT_UTF8
1013 utf8 = utf8;               /* Stop pedantic compilers complaining */
1014 #endif
1015
1016 for (;;)
1017   {
1018   register int c = *code;
1019   if (c == OP_END) return NULL;
1020   else if (c > OP_BRA)     /* Capturing bracket: number is the excess over OP_BRA */
1021     {
1022     int n = c - OP_BRA;
1023     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);  /* Held separately */
1024     if (n == number) return (uschar *)code;
1025     code += _pcre_OP_lengths[OP_BRA];
1026     }
1027   else
1028     {
1029     code += _pcre_OP_lengths[c];
1030
1031 #ifdef SUPPORT_UTF8
1032
1033     /* In UTF-8 mode, opcodes that are followed by a character may be followed
1034     by a multi-byte character. The length in the table is a minimum, so we have
1035     to scan along to skip the extra bytes. All opcodes are less than 128, so we
1036     can use relatively efficient code. */
1037
1038     if (utf8) switch(c)
1039       {
1040       case OP_CHAR:
1041       case OP_CHARNC:
1042       case OP_EXACT:
1043       case OP_UPTO:
1044       case OP_MINUPTO:
1045       case OP_STAR:
1046       case OP_MINSTAR:
1047       case OP_PLUS:
1048       case OP_MINPLUS:
1049       case OP_QUERY:
1050       case OP_MINQUERY:
1051       while ((*code & 0xc0) == 0x80) code++;
1052       break;
1053
1054       /* XCLASS is used for classes that cannot be represented just by a bit
1055       map. Its length in the table is zero, so code is still at the opcode; the
1056       stored length spans the whole item, so it is exactly the step to take. */
1057
1058       case OP_XCLASS:
1059       code += GET(code, 1);
1060       break;
1061       }
1062 #endif
1063     }
1064   }
1065 }
1066
1067
1068
1069 /*************************************************
1070 * Scan compiled regex for recursion reference *
1071 *************************************************/
1072
1073 /* This little function scans through a compiled pattern until it finds an
1074 instance of OP_RECURSE, or reaches OP_END.
1075
1076 Arguments:
1077   code        points to start of expression
1078   utf8        TRUE in UTF-8 mode
1079
1080 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
1081 */
1082
1083 static const uschar *
1084 find_recurse(const uschar *code, BOOL utf8)
1085 {
1086 #ifndef SUPPORT_UTF8
1087 utf8 = utf8;               /* Stop pedantic compilers complaining */
1088 #endif
1089
1090 for (;;)
1091   {
1092   register int c = *code;
1093   if (c == OP_END) return NULL;
1094   else if (c == OP_RECURSE) return code;
1095   else if (c > OP_BRA)     /* Capturing brackets all have the length of OP_BRA */
1096     {
1097     code += _pcre_OP_lengths[OP_BRA];
1098     }
1099   else
1100     {
1101     code += _pcre_OP_lengths[c];
1102
1103 #ifdef SUPPORT_UTF8
1104
1105     /* In UTF-8 mode, opcodes that are followed by a character may be followed
1106     by a multi-byte character. The length in the table is a minimum, so we have
1107     to scan along to skip the extra bytes. All opcodes are less than 128, so we
1108     can use relatively efficient code. */
1109
1110     if (utf8) switch(c)
1111       {
1112       case OP_CHAR:
1113       case OP_CHARNC:
1114       case OP_EXACT:
1115       case OP_UPTO:
1116       case OP_MINUPTO:
1117       case OP_STAR:
1118       case OP_MINSTAR:
1119       case OP_PLUS:
1120       case OP_MINPLUS:
1121       case OP_QUERY:
1122       case OP_MINQUERY:
1123       while ((*code & 0xc0) == 0x80) code++;
1124       break;
1125
1126       /* XCLASS is used for classes that cannot be represented just by a bit
1127       map. Its length in the table is zero, so code is still at the opcode; the
1128       stored length spans the whole item, so it is exactly the step to take. */
1129
1130       case OP_XCLASS:
1131       code += GET(code, 1);
1132       break;
1133       }
1134 #endif
1135     }
1136   }
1137 }
1138
1139
1140
1141 /*************************************************
1142 * Scan compiled branch for non-emptiness *
1143 *************************************************/
1144
1145 /* This function scans through a branch of a compiled pattern to see whether it
1146 can match the empty string or not. It is called from could_be_empty() below,
1147 and calls itself for each branch of a nested bracket. Note that
1148 first_significant_code() skips over assertions. If we hit an unclosed bracket,
1149 we return "empty" - its current branch will already have been scanned.
1150
1151 Arguments:
1152   code        points to start of search
1153   endcode     points to where to stop
1154   utf8        TRUE if in UTF8 mode
1155
1156 Returns:      TRUE if what is matched could be empty
1157 */
1158
1159 static BOOL
1160 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1161 {
1162 register int c;
1163 for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);
1164      code < endcode;
1165      code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1166   {
1167   const uschar *ccode;
1168
1169   c = *code;
1170
1171   if (c >= OP_BRA)
1172     {
1173     BOOL empty_branch;
1174     if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
1175
1176     /* Scan a closed bracket */
1177
1178     empty_branch = FALSE;
1179     do
1180       {
1181       if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1182         empty_branch = TRUE;
1183       code += GET(code, 1);
1184       }
1185     while (*code == OP_ALT);
1186     if (!empty_branch) return FALSE;   /* All branches are non-empty */
1187     c = *code;                         /* The ket: the loop step moves past it, */
1188     continue;                          /* so the item after it is examined too */
1189     }
1190
1191   else switch (c)
1192     {
1193     /* Check for quantifiers after a class. XCLASS has a zero entry in the length
1194     table, so code is moved past the item here or the scan would never advance. */
1195 #ifdef SUPPORT_UTF8
1196     case OP_XCLASS:
1197     ccode = code += GET(code, 1);
1198     goto CHECK_CLASS_REPEAT;
1199 #endif
1200
1201     case OP_CLASS:
1202     case OP_NCLASS:
1203     ccode = code + 33;
1204
1205 #ifdef SUPPORT_UTF8
1206     CHECK_CLASS_REPEAT:
1207 #endif
1208
1209     switch (*ccode)
1210       {
1211       case OP_CRSTAR:            /* These could be empty; continue */
1212       case OP_CRMINSTAR:
1213       case OP_CRQUERY:
1214       case OP_CRMINQUERY:
1215       break;
1216
1217       default:                   /* Non-repeat => class must match */
1218       case OP_CRPLUS:            /* These repeats aren't empty */
1219       case OP_CRMINPLUS:
1220       return FALSE;
1221
1222       case OP_CRRANGE:
1223       case OP_CRMINRANGE:
1224       if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
1225       break;
1226       }
1227     break;
1228
1229     /* Opcodes that must match a character */
1230
1231     case OP_PROP:
1232     case OP_NOTPROP:
1233     case OP_EXTUNI:
1234     case OP_NOT_DIGIT:
1235     case OP_DIGIT:
1236     case OP_NOT_WHITESPACE:
1237     case OP_WHITESPACE:
1238     case OP_NOT_WORDCHAR:
1239     case OP_WORDCHAR:
1240     case OP_ANY:
1241     case OP_ANYBYTE:
1242     case OP_CHAR:
1243     case OP_CHARNC:
1244     case OP_NOT:
1245     case OP_PLUS:
1246     case OP_MINPLUS:
1247     case OP_EXACT:
1248     case OP_NOTPLUS:
1249     case OP_NOTMINPLUS:
1250     case OP_NOTEXACT:
1251     case OP_TYPEPLUS:
1252     case OP_TYPEMINPLUS:
1253     case OP_TYPEEXACT:
1254     return FALSE;
1255
1256     /* End of branch */
1257
1258     case OP_KET:
1259     case OP_KETRMAX:
1260     case OP_KETRMIN:
1261     case OP_ALT:
1262     return TRUE;
1263
1264     /* In UTF-8 mode the repeated character may be multibyte; skip its extra bytes */
1265 #ifdef SUPPORT_UTF8
1266     case OP_STAR:
1267     case OP_MINSTAR:
1268     case OP_QUERY:
1269     case OP_MINQUERY:
1270     if (utf8) while ((code[2] & 0xc0) == 0x80) code++;   /* Character is at code[1] */
1271     break;
1272     case OP_UPTO:
1273     case OP_MINUPTO:
1274     if (utf8) while ((code[4] & 0xc0) == 0x80) code++;   /* 2-byte count comes first */
1275     break;
1276 #endif
1277     }
1278   }
1279
1280 return TRUE;
1281 }
1282
1283
1284
1285 /*************************************************
1286 * Scan compiled regex for non-emptiness *
1287 *************************************************/
1288
1289 /* This function is used when checking for left recursive calls. Starting with
1290 the current branch of the current pattern, it tests whether each enclosing
1291 branch could match the empty string, working outwards through the chain and
1292 stopping once a branch lies before the bracket that is being recursed into.
1293
1294 Arguments:
1295   code        points to start of the recursion
1296   endcode     points to where to stop (current RECURSE item)
1297   bcptr       points to the chain of current (unclosed) branch starts
1298   utf8        TRUE if in UTF-8 mode
1299
1300 Returns:      TRUE if what is matched could be empty
1301 */
1302
1303 static BOOL
1304 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1305   BOOL utf8)
1306 {
1307 branch_chain *bc;
1308 for (bc = bcptr; bc != NULL && bc->current >= code; bc = bc->outer)
1309   {
1310   if (!could_be_empty_branch(bc->current, endcode, utf8)) return FALSE;
1311   }
1312 return TRUE;
1313 }
1314
1315
1316
1317 /*************************************************
1318 * Check for POSIX class syntax *
1319 *************************************************/
1320
1321 /* This function is called when the sequence "[:" or "[." or "[=" is
1322 encountered in a character class. It accepts an optional ^ followed by a run
1323 of letters, provided the run is closed by the matching ":]" or ".]" or "=]".
1324 The end pointer is set only when the syntax is accepted.
1325
1326 Arguments:
1327   ptr        pointer to the initial [
1328   endptr     where to return the end pointer
1329   cd         pointer to compile data
1330
1331 Returns:     TRUE or FALSE
1332 */
1333
1334 static BOOL
1335 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1336 {
1337 int closer;                 /* Assigned separately from its declaration: the */
1338 closer = ptr[1];            /* Solaris cc warns about a "non-constant" initializer. */
1339 ptr += 2;                   /* Step over the [ and the opening delimiter */
1340 if (*ptr == '^') ptr++;     /* Optional negation */
1341
1342 for (; (cd->ctypes[*ptr] & ctype_letter) != 0; ptr++) ;   /* Run of letters */
1343
1344 if (*ptr != closer || ptr[1] != ']') return FALSE;
1345 *endptr = ptr;              /* Points at the closing delimiter */
1346 return TRUE;
1347 }
1348
1349
1350
1351
1352 /*************************************************
1353 * Check POSIX class name *
1354 *************************************************/
1355
1356 /* This function looks up the name given in a POSIX-style class entry such as
1357 [:alnum:] in the table of known names, whose end is marked by a zero length.
1358
1359 Arguments:
1360   ptr        points to the first letter
1361   len        the length of the name
1362
1363 Returns:     a value representing the name, or -1 if unknown
1364 */
1365
1366 static int
1367 check_posix_name(const uschar *ptr, int len)
1368 {
1369 int idx;
1370 for (idx = 0; posix_name_lengths[idx] != 0; idx++)
1371   {
1372   if (posix_name_lengths[idx] != len) continue;   /* Wrong length: cannot match */
1373   if (strncmp((const char *)ptr, posix_names[idx], len) == 0)
1374     return idx;
1375   }
1376 return -1;
1377 }
1378
1379
1380 /*************************************************
1381 * Adjust OP_RECURSE items in repeated group *
1382 *************************************************/
1383
1384 /* An OP_RECURSE item holds the offset, measured from the start of the regex,
1385 of the group it refers to. Groups can therefore be replicated for a fixed
1386 repetition simply by copying, because a recursion is allowed to refer to
1387 earlier groups that are outside the current group. An optional group (one
1388 whose minimum quantifier is zero) is different: OP_BRAZERO is inserted before
1389 it after it has been compiled, which moves the group. Any OP_RECURSE items
1390 inside it that refer to the group itself or to groups it contains must then
1391 have their offsets adjusted, and that is what this function does. The caller
1392 must temporarily terminate the partially compiled regex with OP_END first.
1393
1394 Arguments:
1395   group      points to the start of the group
1396   adjust     the amount by which the group is to be moved
1397   utf8       TRUE in UTF-8 mode
1398   cd         contains pointers to tables etc.
1399
1400 Returns:     nothing
1401 */
1402
1403 static void
1404 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)
1405 {
1406 uschar *rec;
1407 for (rec = (uschar *)find_recurse(group, utf8); rec != NULL;
1408      rec = (uschar *)find_recurse(rec + 1 + LINK_SIZE, utf8))
1409   {
1410   int target = GET(rec, 1);   /* Only references at or beyond the group move */
1411   if (cd->start_code + target >= group) PUT(rec, 1, target + adjust);
1412   }
1413 }
1414
1415
1416
1417 /*************************************************
1418 * Insert an automatic callout point *
1419 *************************************************/
1420
1421 /* This function is called when the PCRE_AUTO_CALLOUT option is set. It writes
1422 a callout item at the current code position, ahead of the next pattern item.
1423
1424 Arguments:
1425   code           current code pointer
1426   ptr            current pattern pointer
1427   cd             pointers to tables etc
1428
1429 Returns:         new code pointer
1430 */
1431
1432 static uschar *
1433 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1434 {
1435 code[0] = OP_CALLOUT;
1436 code[1] = 255;
1437 PUT(code, 2, ptr - cd->start_pattern);    /* Pattern offset */
1438 PUT(code, 2 + LINK_SIZE, 0);              /* Default length */
1439 return code + 2 + 2*LINK_SIZE;
1440 }
1441
1442
1443
1444 /*************************************************
1445 * Complete a callout item *
1446 *************************************************/
1447
1448 /* A callout item records the length of the pattern item that follows it. That
1449 length is not known until the relevant point in the pattern has been reached,
1450 so it is filled in here. Both automatic and manual callouts use this.
1451
1452 Arguments:
1453   previous_callout   points to previous callout item
1454   ptr                current pattern pointer
1455   cd                 pointers to tables etc
1456
1457 Returns:             nothing */
1458
1459 static void
1460 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1461 {
1462 int span = ptr - cd->start_pattern;     /* Offset now reached in the pattern */
1463 span -= GET(previous_callout, 2);       /* Less the offset held in the item */
1464 PUT(previous_callout, 2 + LINK_SIZE, span);
1465 }
1466
1467
1468
1469 #ifdef SUPPORT_UCP
1470 /*************************************************
1471 * Get othercase range *
1472 *************************************************/
1473
1474 /* This function is given the start and end of a class range, in UTF-8 mode
1475 with UCP support. It walks up the characters looking for a run whose "other"
1476 case values are consecutive. Each call hands back the next such run and moves
1477 the start value on, so the caller can simply call it repeatedly.
1478
1479 Arguments:
1480   cptr        points to starting character value; updated
1481   d           end value
1482   ocptr       where to put start of othercase range
1483   odptr       where to put end of othercase range
1484
1485 Yield:        TRUE when range returned; FALSE when no more
1486 */
1487
1488 static BOOL
1489 get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)
1490 {
1491 int ch = *cptr;
1492 int category, other, expected;
1493
1494 /* Advance to the first letter in the range that has an other case */
1495
1496 while (ch <= d &&
1497        (_pcre_ucp_findchar(ch, &category, &other) != ucp_L || other == 0))
1498   ch++;
1499 if (ch > d) return FALSE;
1500
1501 /* Extend the run for as long as the other case values stay consecutive */
1502
1503 *ocptr = other;
1504 expected = other + 1;
1505 ch++;
1506 while (ch <= d && _pcre_ucp_findchar(ch, &category, &other) == ucp_L &&
1507        other == expected)
1508   {
1509   ch++;
1510   expected++;
1511   }
1512
1513 *odptr = expected - 1;
1514 *cptr = ch;
1515 return TRUE;
1516 }
1517 #endif /* SUPPORT_UCP */
1518
1519
1520 /*************************************************
1521 * Compile one branch *
1522 *************************************************/
1523
1524 /* Scan the pattern, compiling it into the code vector. If the options are
1525 changed during the branch, the pointer is used to change the external options
1526 bits.
1527
1528 Arguments:
1529 optionsptr pointer to the option bits
1530 brackets points to number of extracting brackets used
1531 codeptr points to the pointer to the current code point
1532 ptrptr points to the current pattern pointer
1533 errorcodeptr points to error code variable
1534 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
1535 reqbyteptr set to the last literal character required, else < 0
1536 bcptr points to current branch chain
1537 cd contains pointers to tables etc.
1538
1539 Returns: TRUE on success
1540 FALSE, with *errorcodeptr set non-zero on error
1541 */
1542
1543 static BOOL
1544 compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
1545 const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,
1546 int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
1547 {
1548 int repeat_type, op_type;
1549 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
1550 int bravalue = 0;
1551 int greedy_default, greedy_non_default;
1552 int firstbyte, reqbyte;
1553 int zeroreqbyte, zerofirstbyte;
1554 int req_caseopt, reqvary, tempreqvary;
1555 int condcount = 0;
1556 int options = *optionsptr;
1557 int after_manual_callout = 0;
1558 register int c;
1559 register uschar *code = *codeptr;
1560 uschar *tempcode;
1561 BOOL inescq = FALSE;
1562 BOOL groupsetfirstbyte = FALSE;
1563 const uschar *ptr = *ptrptr;
1564 const uschar *tempptr;
1565 uschar *previous = NULL;
1566 uschar *previous_callout = NULL;
1567 uschar classbits[32];
1568
1569 #ifdef SUPPORT_UTF8
1570 BOOL class_utf8;
1571 BOOL utf8 = (options & PCRE_UTF8) != 0;
1572 uschar *class_utf8data;
1573 uschar utf8_char[6];
1574 #else
1575 BOOL utf8 = FALSE;
1576 #endif
1577
1578 /* Set up the default and non-default settings for greediness */
1579
1580 greedy_default = ((options & PCRE_UNGREEDY) != 0);
1581 greedy_non_default = greedy_default ^ 1;
1582
1583 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
1584 matching encountered yet". It gets changed to REQ_NONE if we hit something that
1585 matches a non-fixed char first char; reqbyte just remains unset if we never
1586 find one.
1587
1588 When we hit a repeat whose minimum is zero, we may have to adjust these values
1589 to take the zero repeat into account. This is implemented by setting them to
1590 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
1591 item types that can be repeated set these backoff variables appropriately. */
1592
1593 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
1594
1595 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
1596 according to the current setting of the caseless flag. REQ_CASELESS is a bit
1597 value > 255. It is added into the firstbyte or reqbyte variables to record the
1598 case status of the value. This is used only for ASCII characters. */
1599
1600 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
1601
1602 /* Switch on next character until the end of the branch */
1603
1604 for (;; ptr++)
1605 {
1606 BOOL negate_class;
1607 BOOL possessive_quantifier;
1608 BOOL is_quantifier;
1609 int class_charcount;
1610 int class_lastchar;
1611 int newoptions;
1612 int recno;
1613 int skipbytes;
1614 int subreqbyte;
1615 int subfirstbyte;
1616 int mclength;
1617 uschar mcbuffer[8];
1618
1619 /* Next byte in the pattern */
1620
1621 c = *ptr;
1622
1623 /* If in \Q...\E, check for the end; if not, we have a literal */
1624
1625 if (inescq && c != 0)
1626 {
1627 if (c == '\\' && ptr[1] == 'E')
1628 {
1629 inescq = FALSE;
1630 ptr++;
1631 continue;
1632 }
1633 else
1634 {
1635 if (previous_callout != NULL)
1636 {
1637 complete_callout(previous_callout, ptr, cd);
1638 previous_callout = NULL;
1639 }
1640 if ((options & PCRE_AUTO_CALLOUT) != 0)
1641 {
1642 previous_callout = code;
1643 code = auto_callout(code, ptr, cd);
1644 }
1645 goto NORMAL_CHAR;
1646 }
1647 }
1648
1649 /* Fill in length of a previous callout, except when the next thing is
1650 a quantifier. */
1651
1652 is_quantifier = c == '*' || c == '+' || c == '?' ||
1653 (c == '{' && is_counted_repeat(ptr+1));
1654
1655 if (!is_quantifier && previous_callout != NULL &&
1656 after_manual_callout-- <= 0)
1657 {
1658 complete_callout(previous_callout, ptr, cd);
1659 previous_callout = NULL;
1660 }
1661
1662 /* In extended mode, skip white space and comments */
1663
1664 if ((options & PCRE_EXTENDED) != 0)
1665 {
1666 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1667 if (c == '#')
1668 {
1669 /* The space before the ; is to avoid a warning on a silly compiler
1670 on the Macintosh. */
1671 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
1672 if (c != 0) continue; /* Else fall through to handle end of string */
1673 }
1674 }
1675
1676 /* No auto callout for quantifiers. */
1677
1678 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
1679 {
1680 previous_callout = code;
1681 code = auto_callout(code, ptr, cd);
1682 }
1683
1684 switch(c)
1685 {
1686 /* The branch terminates at end of string, |, or ). */
1687
1688 case 0:
1689 case '|':
1690 case ')':
1691 *firstbyteptr = firstbyte;
1692 *reqbyteptr = reqbyte;
1693 *codeptr = code;
1694 *ptrptr = ptr;
1695 return TRUE;
1696
1697 /* Handle single-character metacharacters. In multiline mode, ^ disables
1698 the setting of any following char as a first character. */
1699
1700 case '^':
1701 if ((options & PCRE_MULTILINE) != 0)
1702 {
1703 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1704 }
1705 previous = NULL;
1706 *code++ = OP_CIRC;
1707 break;
1708
1709 case '$':
1710 previous = NULL;
1711 *code++ = OP_DOLL;
1712 break;
1713
1714 /* There can never be a first char if '.' is first, whatever happens about
1715 repeats. The value of reqbyte doesn't change either. */
1716
1717 case '.':
1718 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1719 zerofirstbyte = firstbyte;
1720 zeroreqbyte = reqbyte;
1721 previous = code;
1722 *code++ = OP_ANY;
1723 break;
1724
1725 /* Character classes. If the included characters are all < 255 in value, we
1726 build a 32-byte bitmap of the permitted characters, except in the special
1727 case where there is only one such character. For negated classes, we build
1728 the map as usual, then invert it at the end. However, we use a different
1729 opcode so that data characters > 255 can be handled correctly.
1730
1731 If the class contains characters outside the 0-255 range, a different
1732 opcode is compiled. It may optionally have a bit map for characters < 256,
1733 but those above are explicitly listed afterwards. A flag byte tells
1734 whether the bitmap is present, and whether this is a negated class or not.
1735 */
1736
1737 case '[':
1738 previous = code;
1739
1740 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
1741 they are encountered at the top level, so we'll do that too. */
1742
1743 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1744 check_posix_syntax(ptr, &tempptr, cd))
1745 {
1746 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
1747 goto FAILED;
1748 }
1749
1750 /* If the first character is '^', set the negation flag and skip it. */
1751
1752 if ((c = *(++ptr)) == '^')
1753 {
1754 negate_class = TRUE;
1755 c = *(++ptr);
1756 }
1757 else
1758 {
1759 negate_class = FALSE;
1760 }
1761
1762 /* Keep a count of chars with values < 256 so that we can optimize the case
1763 of just a single character (as long as it's < 256). For higher valued UTF-8
1764 characters, we don't yet do any optimization. */
1765
1766 class_charcount = 0;
1767 class_lastchar = -1;
1768
1769 #ifdef SUPPORT_UTF8
1770 class_utf8 = FALSE; /* No chars >= 256 */
1771 class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
1772 #endif
1773
1774 /* Initialize the 32-char bit map to all zeros. We have to build the
1775 map in a temporary bit of store, in case the class contains only 1
1776 character (< 256), because in that case the compiled code doesn't use the
1777 bit map. */
1778
1779 memset(classbits, 0, 32 * sizeof(uschar));
1780
1781 /* Process characters until ] is reached. By writing this as a "do" it
1782 means that an initial ] is taken as a data character. The first pass
1783 through the regex checked the overall syntax, so we don't need to be very
1784 strict here. At the start of the loop, c contains the first byte of the
1785 character. */
1786
1787 do
1788 {
1789 #ifdef SUPPORT_UTF8
1790 if (utf8 && c > 127)
1791 { /* Braces are required because the */
1792 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
1793 }
1794 #endif
1795
1796 /* Inside \Q...\E everything is literal except \E */
1797
1798 if (inescq)
1799 {
1800 if (c == '\\' && ptr[1] == 'E')
1801 {
1802 inescq = FALSE;
1803 ptr++;
1804 continue;
1805 }
1806 else goto LONE_SINGLE_CHARACTER;
1807 }
1808
1809 /* Handle POSIX class names. Perl allows a negation extension of the
1810 form [:^name:]. A square bracket that doesn't match the syntax is
1811 treated as a literal. We also recognize the POSIX constructions
1812 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1813 5.6 and 5.8 do. */
1814
1815 if (c == '[' &&
1816 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1817 check_posix_syntax(ptr, &tempptr, cd))
1818 {
1819 BOOL local_negate = FALSE;
1820 int posix_class, i;
1821 register const uschar *cbits = cd->cbits;
1822
1823 if (ptr[1] != ':')
1824 {
1825 *errorcodeptr = ERR31;
1826 goto FAILED;
1827 }
1828
1829 ptr += 2;
1830 if (*ptr == '^')
1831 {
1832 local_negate = TRUE;
1833 ptr++;
1834 }
1835
1836 posix_class = check_posix_name(ptr, tempptr - ptr);
1837 if (posix_class < 0)
1838 {
1839 *errorcodeptr = ERR30;
1840 goto FAILED;
1841 }
1842
1843 /* If matching is caseless, upper and lower are converted to
1844 alpha. This relies on the fact that the class table starts with
1845 alpha, lower, upper as the first 3 entries. */
1846
1847 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1848 posix_class = 0;
1849
1850 /* Or into the map we are building up to 3 of the static class
1851 tables, or their negations. The [:blank:] class sets up the same
1852 chars as the [:space:] class (all white space). We remove the vertical
1853 white space chars afterwards. */
1854
1855 posix_class *= 3;
1856 for (i = 0; i < 3; i++)
1857 {
1858 BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;
1859 int taboffset = posix_class_maps[posix_class + i];
1860 if (taboffset < 0) break;
1861 if (local_negate)
1862 {
1863 if (i == 0)
1864 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];
1865 else
1866 for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];
1867 if (blankclass) classbits[1] |= 0x3c;
1868 }
1869 else
1870 {
1871 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];
1872 if (blankclass) classbits[1] &= ~0x3c;
1873 }
1874 }
1875
1876 ptr = tempptr + 1;
1877 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
1878 continue; /* End of POSIX syntax handling */
1879 }
1880
1881 /* Backslash may introduce a single character, or it may introduce one
1882 of the specials, which just set a flag. Escaped items are checked for
1883 validity in the pre-compiling pass. The sequence \b is a special case.
1884 Inside a class (and only there) it is treated as backspace. Elsewhere
1885 it marks a word boundary. Other escapes have preset maps ready to
1886 or into the one we are building. We assume they have more than one
1887 character in them, so set class_charcount bigger than one. */
1888
1889 if (c == '\\')
1890 {
1891 c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);
1892
1893 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
1894 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
1895 else if (-c == ESC_Q) /* Handle start of quoted string */
1896 {
1897 if (ptr[1] == '\\' && ptr[2] == 'E')
1898 {
1899 ptr += 2; /* avoid empty string */
1900 }
1901 else inescq = TRUE;
1902 continue;
1903 }
1904
1905 if (c < 0)
1906 {
1907 register const uschar *cbits = cd->cbits;
1908 class_charcount += 2; /* Greater than 1 is what matters */
1909 switch (-c)
1910 {
1911 case ESC_d:
1912 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
1913 continue;
1914
1915 case ESC_D:
1916 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
1917 continue;
1918
1919 case ESC_w:
1920 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
1921 continue;
1922
1923 case ESC_W:
1924 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
1925 continue;
1926
1927 case ESC_s:
1928 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
1929 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
1930 continue;
1931
1932 case ESC_S:
1933 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
1934 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
1935 continue;
1936
1937 #ifdef SUPPORT_UCP
1938 case ESC_p:
1939 case ESC_P:
1940 {
1941 BOOL negated;
1942 int property = get_ucp(&ptr, &negated, errorcodeptr);
1943 if (property < 0) goto FAILED;
1944 class_utf8 = TRUE;
1945 *class_utf8data++ = ((-c == ESC_p) != negated)?
1946 XCL_PROP : XCL_NOTPROP;
1947 *class_utf8data++ = property;
1948 class_charcount -= 2; /* Not a < 256 character */
1949 }
1950 continue;
1951 #endif
1952
1953 /* Unrecognized escapes are faulted if PCRE is running in its
1954 strict mode. By default, for compatibility with Perl, they are
1955 treated as literals. */
1956
1957 default:
1958 if ((options & PCRE_EXTRA) != 0)
1959 {
1960 *errorcodeptr = ERR7;
1961 goto FAILED;
1962 }
1963 c = *ptr; /* The final character */
1964 class_charcount -= 2; /* Undo the default count from above */
1965 }
1966 }
1967
1968 /* Fall through if we have a single character (c >= 0). This may be
1969 > 256 in UTF-8 mode. */
1970
1971 } /* End of backslash handling */
1972
1973 /* A single character may be followed by '-' to form a range. However,
1974 Perl does not permit ']' to be the end of the range. A '-' character
1975 here is treated as a literal. */
1976
1977 if (ptr[1] == '-' && ptr[2] != ']')
1978 {
1979 int d;
1980 ptr += 2;
1981
1982 #ifdef SUPPORT_UTF8
1983 if (utf8)
1984 { /* Braces are required because the */
1985 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
1986 }
1987 else
1988 #endif
1989 d = *ptr; /* Not UTF-8 mode */
1990
1991 /* The second part of a range can be a single-character escape, but
1992 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1993 in such circumstances. */
1994
1995 if (d == '\\')
1996 {
1997 const uschar *oldptr = ptr;
1998 d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);
1999
2000 /* \b is backspace; \X is literal X; any other special means the '-'
2001 was literal */
2002
2003 if (d < 0)
2004 {
2005 if (d == -ESC_b) d = '\b';
2006 else if (d == -ESC_X) d = 'X'; else
2007 {
2008 ptr = oldptr - 2;
2009 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2010 }
2011 }
2012 }
2013
2014 /* The check that the two values are in the correct order happens in
2015 the pre-pass. Optimize one-character ranges */
2016
2017 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2018
2019 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2020 matching, we have to use an XCLASS with extra data items. Caseless
2021 matching for characters > 127 is available only if UCP support is
2022 available. */
2023
2024 #ifdef SUPPORT_UTF8
2025 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2026 {
2027 class_utf8 = TRUE;
2028
2029 /* With UCP support, we can find the other case equivalents of
2030 the relevant characters. There may be several ranges. Optimize how
2031 they fit with the basic range. */
2032
2033 #ifdef SUPPORT_UCP
2034 if ((options & PCRE_CASELESS) != 0)
2035 {
2036 int occ, ocd;
2037 int cc = c;
2038 int origd = d;
2039 while (get_othercase_range(&cc, origd, &occ, &ocd))
2040 {
2041 if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
2042
2043 if (occ < c && ocd >= c - 1) /* Extend the basic range */
2044 { /* if there is overlap, */
2045 c = occ; /* noting that if occ < c */
2046 continue; /* we can't have ocd > d */
2047 } /* because a subrange is */
2048 if (ocd > d && occ <= d + 1) /* always shorter than */
2049 { /* the basic range. */
2050 d = ocd;
2051 continue;
2052 }
2053
2054 if (occ == ocd)
2055 {
2056 *class_utf8data++ = XCL_SINGLE;
2057 }
2058 else
2059 {
2060 *class_utf8data++ = XCL_RANGE;
2061 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2062 }
2063 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2064 }
2065 }
2066 #endif /* SUPPORT_UCP */
2067
2068 /* Now record the original range, possibly modified for UCP caseless
2069 overlapping ranges. */
2070
2071 *class_utf8data++ = XCL_RANGE;
2072 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2073 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2074
2075 /* With UCP support, we are done. Without UCP support, there is no
2076 caseless matching for UTF-8 characters > 127; we can use the bit map
2077 for the smaller ones. */
2078
2079 #ifdef SUPPORT_UCP
2080 continue; /* With next character in the class */
2081 #else
2082 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2083
2084 /* Adjust upper limit and fall through to set up the map */
2085
2086 d = 127;
2087
2088 #endif /* SUPPORT_UCP */
2089 }
2090 #endif /* SUPPORT_UTF8 */
2091
2092 /* We use the bit map for all cases when not in UTF-8 mode; else
2093 ranges that lie entirely within 0-127 when there is UCP support; else
2094 for partial ranges without UCP support. */
2095
2096 for (; c <= d; c++)
2097 {
2098 classbits[c/8] |= (1 << (c&7));
2099 if ((options & PCRE_CASELESS) != 0)
2100 {
2101 int uc = cd->fcc[c]; /* flip case */
2102 classbits[uc/8] |= (1 << (uc&7));
2103 }
2104 class_charcount++; /* in case a one-char range */
2105 class_lastchar = c;
2106 }
2107
2108 continue; /* Go get the next char in the class */
2109 }
2110
2111 /* Handle a lone single character - we can get here for a normal
2112 non-escape char, or after \ that introduces a single character or for an
2113 apparent range that isn't. */
2114
2115 LONE_SINGLE_CHARACTER:
2116
2117 /* Handle a character that cannot go in the bit map */
2118
2119 #ifdef SUPPORT_UTF8
2120 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2121 {
2122 class_utf8 = TRUE;
2123 *class_utf8data++ = XCL_SINGLE;
2124 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2125
2126 #ifdef SUPPORT_UCP
2127 if ((options & PCRE_CASELESS) != 0)
2128 {
2129 int chartype;
2130 int othercase;
2131 if (_pcre_ucp_findchar(c, &chartype, &othercase) >= 0 &&
2132 othercase > 0)
2133 {
2134 *class_utf8data++ = XCL_SINGLE;
2135 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
2136 }
2137 }
2138 #endif /* SUPPORT_UCP */
2139
2140 }
2141 else
2142 #endif /* SUPPORT_UTF8 */
2143
2144 /* Handle a single-byte character */
2145 {
2146 classbits[c/8] |= (1 << (c&7));
2147 if ((options & PCRE_CASELESS) != 0)
2148 {
2149 c = cd->fcc[c]; /* flip case */
2150 classbits[c/8] |= (1 << (c&7));
2151 }
2152 class_charcount++;
2153 class_lastchar = c;
2154 }
2155 }
2156
2157 /* Loop until ']' reached; the check for end of string happens inside the
2158 loop. This "while" is the end of the "do" above. */
2159
2160 while ((c = *(++ptr)) != ']' || inescq);
2161
2162 /* If class_charcount is 1, we saw precisely one character whose value is
2163 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2164 can optimize the negative case only if there were no characters >= 128
2165 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2166 single-bytes only. This is an historical hangover. Maybe one day we can
2167 tidy these opcodes to handle multi-byte characters.
2168
2169 The optimization throws away the bit map. We turn the item into a
2170 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2171 that OP_NOT does not support multibyte characters. In the positive case, it
2172 can cause firstbyte to be set. Otherwise, there can be no first char if
2173 this item is first, whatever repeat count may follow. In the case of
2174 reqbyte, save the previous value for reinstating. */
2175
2176 #ifdef SUPPORT_UTF8
2177 if (class_charcount == 1 &&
2178 (!utf8 ||
2179 (!class_utf8 && (!negate_class || class_lastchar < 128))))
2180
2181 #else
2182 if (class_charcount == 1)
2183 #endif
2184 {
2185 zeroreqbyte = reqbyte;
2186
2187 /* The OP_NOT opcode works on one-byte characters only. */
2188
2189 if (negate_class)
2190 {
2191 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2192 zerofirstbyte = firstbyte;
2193 *code++ = OP_NOT;
2194 *code++ = class_lastchar;
2195 break;
2196 }
2197
2198 /* For a single, positive character, get the value into mcbuffer, and
2199 then we can handle this with the normal one-character code. */
2200
2201 #ifdef SUPPORT_UTF8
2202 if (utf8 && class_lastchar > 127)
2203 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
2204 else
2205 #endif
2206 {
2207 mcbuffer[0] = class_lastchar;
2208 mclength = 1;
2209 }
2210 goto ONE_CHAR;
2211 } /* End of 1-char optimization */
2212
2213 /* The general case - not the one-char optimization. If this is the first
2214 thing in the branch, there can be no first char setting, whatever the
2215 repeat count. Any reqbyte setting must remain unchanged after any kind of
2216 repeat. */
2217
2218 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2219 zerofirstbyte = firstbyte;
2220 zeroreqbyte = reqbyte;
2221
2222 /* If there are characters with values > 255, we have to compile an
2223 extended class, with its own opcode. If there are no characters < 256,
2224 we can omit the bitmap. */
2225
2226 #ifdef SUPPORT_UTF8
2227 if (class_utf8)
2228 {
2229 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2230 *code++ = OP_XCLASS;
2231 code += LINK_SIZE;
2232 *code = negate_class? XCL_NOT : 0;
2233
2234 /* If the map is required, install it, and move on to the end of
2235 the extra data */
2236
2237 if (class_charcount > 0)
2238 {
2239 *code++ |= XCL_MAP;
2240 memcpy(code, classbits, 32);
2241 code = class_utf8data;
2242 }
2243
2244 /* If the map is not required, slide down the extra data. */
2245
2246 else
2247 {
2248 int len = class_utf8data - (code + 33);
2249 memmove(code + 1, code + 33, len);
2250 code += len + 1;
2251 }
2252
2253 /* Now fill in the complete length of the item */
2254
2255 PUT(previous, 1, code - previous);
2256 break; /* End of class handling */
2257 }
2258 #endif
2259
2260 /* If there are no characters > 255, negate the 32-byte map if necessary,
2261 and copy it into the code vector. If this is the first thing in the branch,
2262 there can be no first char setting, whatever the repeat count. Any reqbyte
2263 setting must remain unchanged after any kind of repeat. */
2264
2265 if (negate_class)
2266 {
2267 *code++ = OP_NCLASS;
2268 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2269 }
2270 else
2271 {
2272 *code++ = OP_CLASS;
2273 memcpy(code, classbits, 32);
2274 }
2275 code += 32;
2276 break;
2277
2278 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2279 has been tested above. */
2280
2281 case '{':
2282 if (!is_quantifier) goto NORMAL_CHAR;
2283 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
2284 if (*errorcodeptr != 0) goto FAILED;
2285 goto REPEAT;
2286
2287 case '*':
2288 repeat_min = 0;
2289 repeat_max = -1;
2290 goto REPEAT;
2291
2292 case '+':
2293 repeat_min = 1;
2294 repeat_max = -1;
2295 goto REPEAT;
2296
2297 case '?':
2298 repeat_min = 0;
2299 repeat_max = 1;
2300
2301 REPEAT:
2302 if (previous == NULL)
2303 {
2304 *errorcodeptr = ERR9;
2305 goto FAILED;
2306 }
2307
2308 if (repeat_min == 0)
2309 {
2310 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2311 reqbyte = zeroreqbyte; /* Ditto */
2312 }
2313
2314 /* Remember whether this is a variable length repeat */
2315
2316 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2317
2318 op_type = 0; /* Default single-char op codes */
2319 possessive_quantifier = FALSE; /* Default not possessive quantifier */
2320
2321 /* Save start of previous item, in case we have to move it up to make space
2322 for an inserted OP_ONCE for the additional '+' extension. */
2323
2324 tempcode = previous;
2325
2326 /* If the next character is '+', we have a possessive quantifier. This
2327 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2328 If the next character is '?' this is a minimizing repeat, by default,
2329 but if PCRE_UNGREEDY is set, it works the other way round. We change the
2330 repeat type to the non-default. */
2331
2332 if (ptr[1] == '+')
2333 {
2334 repeat_type = 0; /* Force greedy */
2335 possessive_quantifier = TRUE;
2336 ptr++;
2337 }
2338 else if (ptr[1] == '?')
2339 {
2340 repeat_type = greedy_non_default;
2341 ptr++;
2342 }
2343 else repeat_type = greedy_default;
2344
2345 /* If previous was a recursion, we need to wrap it inside brackets so that
2346 it can be replicated if necessary. */
2347
2348 if (*previous == OP_RECURSE)
2349 {
2350 memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2351 code += 1 + LINK_SIZE;
2352 *previous = OP_BRA;
2353 PUT(previous, 1, code - previous);
2354 *code = OP_KET;
2355 PUT(code, 1, code - previous);
2356 code += 1 + LINK_SIZE;
2357 }
2358
2359 /* If previous was a character match, abolish the item and generate a
2360 repeat item instead. If a char item has a minimum of more than one, ensure
2361 that it is set in reqbyte - it might not be if a sequence such as x{3} is
2362 the first thing in a branch because the x will have gone into firstbyte
2363 instead. */
2364
2365 if (*previous == OP_CHAR || *previous == OP_CHARNC)
2366 {
2367 /* Deal with UTF-8 characters that take up more than one byte. It's
2368 easier to write this out separately than try to macrify it. Use c to
2369 hold the length of the character in bytes, plus 0x80 to flag that it's a
2370 length rather than a small character. */
2371
2372 #ifdef SUPPORT_UTF8
2373 if (utf8 && (code[-1] & 0x80) != 0)
2374 {
2375 uschar *lastchar = code - 1;
2376 while((*lastchar & 0xc0) == 0x80) lastchar--;
2377 c = code - lastchar; /* Length of UTF-8 character */
2378 memcpy(utf8_char, lastchar, c); /* Save the char */
2379 c |= 0x80; /* Flag c as a length */
2380 }
2381 else
2382 #endif
2383
2384 /* Handle the case of a single byte - either with no UTF8 support, or
2385 with UTF-8 disabled, or for a UTF-8 character < 128. */
2386
2387 {
2388 c = code[-1];
2389 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2390 }
2391
2392 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
2393 }
2394
2395 /* If previous was a single negated character ([^a] or similar), we use
2396 one of the special opcodes, replacing it. The code is shared with single-
2397 character repeats by setting op_type to add a suitable offset into
2398 repeat_type. OP_NOT is currently used only for single-byte chars. */
2399
2400 else if (*previous == OP_NOT)
2401 {
2402 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
2403 c = previous[1];
2404 goto OUTPUT_SINGLE_REPEAT;
2405 }
2406
2407 /* If previous was a character type match (\d or similar), abolish it and
2408 create a suitable repeat item. The code is shared with single-character
2409 repeats by setting op_type to add a suitable offset into repeat_type. Note
2410 that the Unicode property types will be present only when SUPPORT_UCP is
2411 defined, but we don't wrap the little bits of code here because it just
2412 makes it horribly messy. */
2413
2414 else if (*previous < OP_EODN)
2415 {
2416 uschar *oldcode;
2417 int prop_type;
2418 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
2419 c = *previous;
2420
2421 OUTPUT_SINGLE_REPEAT:
2422 prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?
2423 previous[1] : -1;
2424
2425 oldcode = code;
2426 code = previous; /* Usually overwrite previous item */
2427
2428 /* If the maximum is zero then the minimum must also be zero; Perl allows
2429 this case, so we do too - by simply omitting the item altogether. */
2430
2431 if (repeat_max == 0) goto END_REPEAT;
2432
2433 /* All real repeats make it impossible to handle partial matching (maybe
2434 one day we will be able to remove this restriction). */
2435
2436 if (repeat_max != 1) cd->nopartial = TRUE;
2437
2438 /* Combine the op_type with the repeat_type */
2439
2440 repeat_type += op_type;
2441
2442 /* A minimum of zero is handled either as the special case * or ?, or as
2443 an UPTO, with the maximum given. */
2444
2445 if (repeat_min == 0)
2446 {
2447 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
2448 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
2449 else
2450 {
2451 *code++ = OP_UPTO + repeat_type;
2452 PUT2INC(code, 0, repeat_max);
2453 }
2454 }
2455
2456 /* A repeat minimum of 1 is optimized into some special cases. If the
2457 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
2458 left in place and, if the maximum is greater than 1, we use OP_UPTO with
2459 one less than the maximum. */
2460
2461 else if (repeat_min == 1)
2462 {
2463 if (repeat_max == -1)
2464 *code++ = OP_PLUS + repeat_type;
2465 else
2466 {
2467 code = oldcode; /* leave previous item in place */
2468 if (repeat_max == 1) goto END_REPEAT;
2469 *code++ = OP_UPTO + repeat_type;
2470 PUT2INC(code, 0, repeat_max - 1);
2471 }
2472 }
2473
2474 /* The case {n,n} is just an EXACT, while the general case {n,m} is
2475 handled as an EXACT followed by an UPTO. */
2476
2477 else
2478 {
2479 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
2480 PUT2INC(code, 0, repeat_min);
2481
2482 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
2483 we have to insert the character for the previous code. For a repeated
2484 Unicode property match, there is an extra byte that defines the
2485 required property. In UTF-8 mode, long characters have their length in
2486 c, with the 0x80 bit as a flag. */
2487
2488 if (repeat_max < 0)
2489 {
2490 #ifdef SUPPORT_UTF8
2491 if (utf8 && c >= 128)
2492 {
2493 memcpy(code, utf8_char, c & 7);
2494 code += c & 7;
2495 }
2496 else
2497 #endif
2498 {
2499 *code++ = c;
2500 if (prop_type >= 0) *code++ = prop_type;
2501 }
2502 *code++ = OP_STAR + repeat_type;
2503 }
2504
2505 /* Else insert an UPTO if the max is greater than the min, again
2506 preceded by the character, for the previously inserted code. */
2507
2508 else if (repeat_max != repeat_min)
2509 {
2510 #ifdef SUPPORT_UTF8
2511 if (utf8 && c >= 128)
2512 {
2513 memcpy(code, utf8_char, c & 7);
2514 code += c & 7;
2515 }
2516 else
2517 #endif
2518 *code++ = c;
2519 if (prop_type >= 0) *code++ = prop_type;
2520 repeat_max -= repeat_min;
2521 *code++ = OP_UPTO + repeat_type;
2522 PUT2INC(code, 0, repeat_max);
2523 }
2524 }
2525
2526 /* The character or character type itself comes last in all cases. */
2527
2528 #ifdef SUPPORT_UTF8
2529 if (utf8 && c >= 128)
2530 {
2531 memcpy(code, utf8_char, c & 7);
2532 code += c & 7;
2533 }
2534 else
2535 #endif
2536 *code++ = c;
2537
2538 /* For a repeated Unicode property match, there is an extra byte that
2539 defines the required property. */
2540
2541 #ifdef SUPPORT_UCP
2542 if (prop_type >= 0) *code++ = prop_type;
2543 #endif
2544 }
2545
2546 /* If previous was a character class or a back reference, we put the repeat
2547 stuff after it, but just skip the item if the repeat was {0,0}. */
2548
2549 else if (*previous == OP_CLASS ||
2550 *previous == OP_NCLASS ||
2551 #ifdef SUPPORT_UTF8
2552 *previous == OP_XCLASS ||
2553 #endif
2554 *previous == OP_REF)
2555 {
2556 if (repeat_max == 0)
2557 {
2558 code = previous;
2559 goto END_REPEAT;
2560 }
2561
2562 /* All real repeats make it impossible to handle partial matching (maybe
2563 one day we will be able to remove this restriction). */
2564
2565 if (repeat_max != 1) cd->nopartial = TRUE;
2566
2567 if (repeat_min == 0 && repeat_max == -1)
2568 *code++ = OP_CRSTAR + repeat_type;
2569 else if (repeat_min == 1 && repeat_max == -1)
2570 *code++ = OP_CRPLUS + repeat_type;
2571 else if (repeat_min == 0 && repeat_max == 1)
2572 *code++ = OP_CRQUERY + repeat_type;
2573 else
2574 {
2575 *code++ = OP_CRRANGE + repeat_type;
2576 PUT2INC(code, 0, repeat_min);
2577 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
2578 PUT2INC(code, 0, repeat_max);
2579 }
2580 }
2581
2582 /* If previous was a bracket group, we may have to replicate it in certain
2583 cases. */
2584
2585 else if (*previous >= OP_BRA || *previous == OP_ONCE ||
2586 *previous == OP_COND)
2587 {
2588 register int i;
2589 int ketoffset = 0;
2590 int len = code - previous;
2591 uschar *bralink = NULL;
2592
2593 /* If the maximum repeat count is unlimited, find the end of the bracket
2594 by scanning through from the start, and compute the offset back to it
2595 from the current code pointer. There may be an OP_OPT setting following
2596 the final KET, so we can't find the end just by going back from the code
2597 pointer. */
2598
2599 if (repeat_max == -1)
2600 {
2601 register uschar *ket = previous;
2602 do ket += GET(ket, 1); while (*ket != OP_KET);
2603 ketoffset = code - ket;
2604 }
2605
2606 /* The case of a zero minimum is special because of the need to stick
2607 OP_BRAZERO in front of it, and because the group appears once in the
2608 data, whereas in other cases it appears the minimum number of times. For
2609 this reason, it is simplest to treat this case separately, as otherwise
2610 the code gets far too messy. There are several special subcases when the
2611 minimum is zero. */
2612
2613 if (repeat_min == 0)
2614 {
2615 /* If the maximum is also zero, we just omit the group from the output
2616 altogether. */
2617
2618 if (repeat_max == 0)
2619 {
2620 code = previous;
2621 goto END_REPEAT;
2622 }
2623
2624 /* If the maximum is 1 or unlimited, we just have to stick in the
2625 BRAZERO and do no more at this point. However, we do need to adjust
2626 any OP_RECURSE calls inside the group that refer to the group itself or
2627 any internal group, because the offset is from the start of the whole
2628 regex. Temporarily terminate the pattern while doing this. */
2629
2630 if (repeat_max <= 1)
2631 {
2632 *code = OP_END;
2633 adjust_recurse(previous, 1, utf8, cd);
2634 memmove(previous+1, previous, len);
2635 code++;
2636 *previous++ = OP_BRAZERO + repeat_type;
2637 }
2638
2639 /* If the maximum is greater than 1 and limited, we have to replicate
2640 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
2641 The first one has to be handled carefully because it's the original
2642 copy, which has to be moved up. The remainder can be handled by code
2643 that is common with the non-zero minimum case below. We have to
2644 adjust the value of repeat_max, since one less copy is required. Once
2645 again, we may have to adjust any OP_RECURSE calls inside the group. */
2646
2647 else
2648 {
2649 int offset;
2650 *code = OP_END;
2651 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);
2652 memmove(previous + 2 + LINK_SIZE, previous, len);
2653 code += 2 + LINK_SIZE;
2654 *previous++ = OP_BRAZERO + repeat_type;
2655 *previous++ = OP_BRA;
2656
2657 /* We chain together the bracket offset fields that have to be
2658 filled in later when the ends of the brackets are reached. */
2659
2660 offset = (bralink == NULL)? 0 : previous - bralink;
2661 bralink = previous;
2662 PUTINC(previous, 0, offset);
2663 }
2664
2665 repeat_max--;
2666 }
2667
2668 /* If the minimum is greater than zero, replicate the group as many
2669 times as necessary, and adjust the maximum to the number of subsequent
2670 copies that we need. If we set a first char from the group, and didn't
2671 set a required char, copy the latter from the former. */
2672
2673 else
2674 {
2675 if (repeat_min > 1)
2676 {
2677 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
2678 for (i = 1; i < repeat_min; i++)
2679 {
2680 memcpy(code, previous, len);
2681 code += len;
2682 }
2683 }
2684 if (repeat_max > 0) repeat_max -= repeat_min;
2685 }
2686
2687 /* This code is common to both the zero and non-zero minimum cases. If
2688 the maximum is limited, it replicates the group in a nested fashion,
2689 remembering the bracket starts on a stack. In the case of a zero minimum,
2690 the first one was set up above. In all cases the repeat_max now specifies
2691 the number of additional copies needed. */
2692
2693 if (repeat_max >= 0)
2694 {
2695 for (i = repeat_max - 1; i >= 0; i--)
2696 {
2697 *code++ = OP_BRAZERO + repeat_type;
2698
2699 /* All but the final copy start a new nesting, maintaining the
2700 chain of brackets outstanding. */
2701
2702 if (i != 0)
2703 {
2704 int offset;
2705 *code++ = OP_BRA;
2706 offset = (bralink == NULL)? 0 : code - bralink;
2707 bralink = code;
2708 PUTINC(code, 0, offset);
2709 }
2710
2711 memcpy(code, previous, len);
2712 code += len;
2713 }
2714
2715 /* Now chain through the pending brackets, and fill in their length
2716 fields (which are holding the chain links pro tem). */
2717
2718 while (bralink != NULL)
2719 {
2720 int oldlinkoffset;
2721 int offset = code - bralink + 1;
2722 uschar *bra = code - offset;
2723 oldlinkoffset = GET(bra, 1);
2724 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
2725 *code++ = OP_KET;
2726 PUTINC(code, 0, offset);
2727 PUT(bra, 1, offset);
2728 }
2729 }
2730
2731 /* If the maximum is unlimited, set a repeater in the final copy. We
2732 can't just offset backwards from the current code point, because we
2733 don't know if there's been an options resetting after the ket. The
2734 correct offset was computed above. */
2735
2736 else code[-ketoffset] = OP_KETRMAX + repeat_type;
2737 }
2738
2739 /* Else there's some kind of shambles */
2740
2741 else
2742 {
2743 *errorcodeptr = ERR11;
2744 goto FAILED;
2745 }
2746
2747 /* If the character following a repeat is '+', we wrap the entire repeated
2748 item inside OP_ONCE brackets. This is just syntactic sugar, taken from
2749 Sun's Java package. The repeated item starts at tempcode, not at previous,
2750 which might be the first part of a string whose (former) last char we
2751 repeated. However, we don't support '+' after a greediness '?'. */
2752
2753 if (possessive_quantifier)
2754 {
2755 int len = code - tempcode;
2756 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
2757 code += 1 + LINK_SIZE;
2758 len += 1 + LINK_SIZE;
2759 tempcode[0] = OP_ONCE;
2760 *code++ = OP_KET;
2761 PUTINC(code, 0, len);
2762 PUT(tempcode, 1, len);
2763 }
2764
2765 /* In all cases we no longer have a previous item. We also set the
2766 "follows varying string" flag for subsequently encountered reqbytes if
2767 it isn't already set and we have just passed a varying length item. */
2768
2769 END_REPEAT:
2770 previous = NULL;
2771 cd->req_varyopt |= reqvary;
2772 break;
2773
2774
2775 /* Start of nested bracket sub-expression, or comment or lookahead or
2776 lookbehind or option setting or condition. First deal with special things
2777 that can come after a bracket; all are introduced by ?, and the appearance
2778 of any of them means that this is not a referencing group. They were
2779 checked for validity in the first pass over the string, so we don't have to
2780 check for syntax errors here. */
2781
2782 case '(':
2783 newoptions = options;
2784 skipbytes = 0;
2785
2786 if (*(++ptr) == '?')
2787 {
2788 int set, unset;
2789 int *optset;
2790
2791 switch (*(++ptr))
2792 {
2793 case '#': /* Comment; skip to ket */
2794 ptr++;
2795 while (*ptr != ')') ptr++;
2796 continue;
2797
2798 case ':': /* Non-extracting bracket */
2799 bravalue = OP_BRA;
2800 ptr++;
2801 break;
2802
2803 case '(':
2804 bravalue = OP_COND; /* Conditional group */
2805
2806 /* Condition to test for recursion */
2807
2808 if (ptr[1] == 'R')
2809 {
2810 code[1+LINK_SIZE] = OP_CREF;
2811 PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
2812 skipbytes = 3;
2813 ptr += 3;
2814 }
2815
2816 /* Condition to test for a numbered subpattern match. We know that
2817 if a digit follows ( then there will just be digits until ) because
2818 the syntax was checked in the first pass. */
2819
2820 else if ((digitab[ptr[1]] && ctype_digit) != 0)
2821 {
2822 int condref; /* Don't amalgamate; some compilers */
2823 condref = *(++ptr) - '0'; /* grumble at autoincrement in declaration */
2824 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
2825 if (condref == 0)
2826 {
2827 *errorcodeptr = ERR35;
2828 goto FAILED;
2829 }
2830 ptr++;
2831 code[1+LINK_SIZE] = OP_CREF;
2832 PUT2(code, 2+LINK_SIZE, condref);
2833 skipbytes = 3;
2834 }
2835 /* For conditions that are assertions, we just fall through, having
2836 set bravalue above. */
2837 break;
2838
2839 case '=': /* Positive lookahead */
2840 bravalue = OP_ASSERT;
2841 ptr++;
2842 break;
2843
2844 case '!': /* Negative lookahead */
2845 bravalue = OP_ASSERT_NOT;
2846 ptr++;
2847 break;
2848
2849 case '<': /* Lookbehinds */
2850 switch (*(++ptr))
2851 {
2852 case '=': /* Positive lookbehind */
2853 bravalue = OP_ASSERTBACK;
2854 ptr++;
2855 break;
2856
2857 case '!': /* Negative lookbehind */
2858 bravalue = OP_ASSERTBACK_NOT;
2859 ptr++;
2860 break;
2861 }
2862 break;
2863
2864 case '>': /* One-time brackets */
2865 bravalue = OP_ONCE;
2866 ptr++;
2867 break;
2868
2869 case 'C': /* Callout - may be followed by digits; */
2870 previous_callout = code; /* Save for later completion */
2871 after_manual_callout = 1; /* Skip one item before completing */
2872 *code++ = OP_CALLOUT; /* Already checked that the terminating */
2873 { /* closing parenthesis is present. */
2874 int n = 0;
2875 while ((digitab[*(++ptr)] & ctype_digit) != 0)
2876 n = n * 10 + *ptr - '0';
2877 if (n > 255)
2878 {
2879 *errorcodeptr = ERR38;
2880 goto FAILED;
2881 }
2882 *code++ = n;
2883 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
2884 PUT(code, LINK_SIZE, 0); /* Default length */
2885 code += 2 * LINK_SIZE;
2886 }
2887 previous = NULL;
2888 continue;
2889
2890 case 'P': /* Named subpattern handling */
2891 if (*(++ptr) == '<') /* Definition */
2892 {
2893 int i, namelen;
2894 uschar *slot = cd->name_table;
2895 const uschar *name; /* Don't amalgamate; some compilers */
2896 name = ++ptr; /* grumble at autoincrement in declaration */
2897
2898 while (*ptr++ != '>');
2899 namelen = ptr - name - 1;
2900
2901 for (i = 0; i < cd->names_found; i++)
2902 {
2903 int crc = memcmp(name, slot+2, namelen);
2904 if (crc == 0)
2905 {
2906 if (slot[2+namelen] == 0)
2907 {
2908 *errorcodeptr = ERR43;
2909 goto FAILED;
2910 }
2911 crc = -1; /* Current name is substring */
2912 }
2913 if (crc < 0)
2914 {
2915 memmove(slot + cd->name_entry_size, slot,
2916 (cd->names_found - i) * cd->name_entry_size);
2917 break;
2918 }
2919 slot += cd->name_entry_size;
2920 }
2921
2922 PUT2(slot, 0, *brackets + 1);
2923 memcpy(slot + 2, name, namelen);
2924 slot[2+namelen] = 0;
2925 cd->names_found++;
2926 goto NUMBERED_GROUP;
2927 }
2928
2929 if (*ptr == '=' || *ptr == '>') /* Reference or recursion */
2930 {
2931 int i, namelen;
2932 int type = *ptr++;
2933 const uschar *name = ptr;
2934 uschar *slot = cd->name_table;
2935
2936 while (*ptr != ')') ptr++;
2937 namelen = ptr - name;
2938
2939 for (i = 0; i < cd->names_found; i++)
2940 {
2941 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
2942 slot += cd->name_entry_size;
2943 }
2944 if (i >= cd->names_found)
2945 {
2946 *errorcodeptr = ERR15;
2947 goto FAILED;
2948 }
2949
2950 recno = GET2(slot, 0);
2951
2952 if (type == '>') goto HANDLE_RECURSION; /* A few lines below */
2953
2954 /* Back reference */
2955
2956 previous = code;
2957 *code++ = OP_REF;
2958 PUT2INC(code, 0, recno);
2959 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
2960 if (recno > cd->top_backref) cd->top_backref = recno;
2961 continue;
2962 }
2963
2964 /* Should never happen */
2965 break;
2966
2967 case 'R': /* Pattern recursion */
2968 ptr++; /* Same as (?0) */
2969 /* Fall through */
2970
2971 /* Recursion or "subroutine" call */
2972
2973 case '0': case '1': case '2': case '3': case '4':
2974 case '5': case '6': case '7': case '8': case '9':
2975 {
2976 const uschar *called;
2977 recno = 0;
2978 while((digitab[*ptr] & ctype_digit) != 0)
2979 recno = recno * 10 + *ptr++ - '0';
2980
2981 /* Come here from code above that handles a named recursion */
2982
2983 HANDLE_RECURSION:
2984
2985 previous = code;
2986
2987 /* Find the bracket that is being referenced. Temporarily end the
2988 regex in case it doesn't exist. */
2989
2990 *code = OP_END;
2991 called = (recno == 0)?
2992 cd->start_code : find_bracket(cd->start_code, utf8, recno);
2993
2994 if (called == NULL)
2995 {
2996 *errorcodeptr = ERR15;
2997 goto FAILED;
2998 }
2999
3000 /* If the subpattern is still open, this is a recursive call. We
3001 check to see if this is a left recursion that could loop for ever,
3002 and diagnose that case. */
3003
3004 if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
3005 {
3006 *errorcodeptr = ERR40;
3007 goto FAILED;
3008 }
3009
3010 /* Insert the recursion/subroutine item */
3011
3012 *code = OP_RECURSE;
3013 PUT(code, 1, called - cd->start_code);
3014 code += 1 + LINK_SIZE;
3015 }
3016 continue;
3017
3018 /* Character after (? not specially recognized */
3019
3020 default: /* Option setting */
3021 set = unset = 0;
3022 optset = &set;
3023
3024 while (*ptr != ')' && *ptr != ':')
3025 {
3026 switch (*ptr++)
3027 {
3028 case '-': optset = &unset; break;
3029
3030 case 'i': *optset |= PCRE_CASELESS; break;
3031 case 'm': *optset |= PCRE_MULTILINE; break;
3032 case 's': *optset |= PCRE_DOTALL; break;
3033 case 'x': *optset |= PCRE_EXTENDED; break;
3034 case 'U': *optset |= PCRE_UNGREEDY; break;
3035 case 'X': *optset |= PCRE_EXTRA; break;
3036 }
3037 }
3038
3039 /* Set up the changed option bits, but don't change anything yet. */
3040
3041 newoptions = (options | set) & (~unset);
3042
3043 /* If the options ended with ')' this is not the start of a nested
3044 group with option changes, so the options change at this level. Compile
3045 code to change the ims options if this setting actually changes any of
3046 them. We also pass the new setting back so that it can be put at the
3047 start of any following branches, and when this group ends (if we are in
3048 a group), a resetting item can be compiled.
3049
3050 Note that if this item is right at the start of the pattern, the
3051 options will have been abstracted and made global, so there will be no
3052 change to compile. */
3053
3054 if (*ptr == ')')
3055 {
3056 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
3057 {
3058 *code++ = OP_OPT;
3059 *code++ = newoptions & PCRE_IMS;
3060 }
3061
3062 /* Change options at this level, and pass them back for use
3063 in subsequent branches. Reset the greedy defaults and the case
3064 value for firstbyte and reqbyte. */
3065
3066 *optionsptr = options = newoptions;
3067 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
3068 greedy_non_default = greedy_default ^ 1;
3069 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3070
3071 previous = NULL; /* This item can't be repeated */
3072 continue; /* It is complete */
3073 }
3074
3075 /* If the options ended with ':' we are heading into a nested group
3076 with possible change of options. Such groups are non-capturing and are
3077 not assertions of any kind. All we need to do is skip over the ':';
3078 the newoptions value is handled below. */
3079
3080 bravalue = OP_BRA;
3081 ptr++;
3082 }
3083 }
3084
3085 /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
3086 non-capturing and behave like (?:...) brackets */
3087
3088 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
3089 {
3090 bravalue = OP_BRA;
3091 }
3092
3093 /* Else we have a referencing group; adjust the opcode. If the bracket
3094 number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
3095 arrange for the true number to follow later, in an OP_BRANUMBER item. */
3096
3097 else
3098 {
3099 NUMBERED_GROUP:
3100 if (++(*brackets) > EXTRACT_BASIC_MAX)
3101 {
3102 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
3103 code[1+LINK_SIZE] = OP_BRANUMBER;
3104 PUT2(code, 2+LINK_SIZE, *brackets);
3105 skipbytes = 3;
3106 }
3107 else bravalue = OP_BRA + *brackets;
3108 }
3109
3110 /* Process nested bracketed re. Assertions may not be repeated, but other
3111 kinds can be. We copy code into a non-register variable in order to be able
3112 to pass its address because some compilers complain otherwise. Pass in a
3113 new setting for the ims options if they have changed. */
3114
3115 previous = (bravalue >= OP_ONCE)? code : NULL;
3116 *code = bravalue;
3117 tempcode = code;
3118 tempreqvary = cd->req_varyopt; /* Save value before bracket */
3119
3120 if (!compile_regex(
3121 newoptions, /* The complete new option state */
3122 options & PCRE_IMS, /* The previous ims option state */
3123 brackets, /* Extracting bracket count */
3124 &tempcode, /* Where to put code (updated) */
3125 &ptr, /* Input pointer (updated) */
3126 errorcodeptr, /* Where to put an error message */
3127 (bravalue == OP_ASSERTBACK ||
3128 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
3129 skipbytes, /* Skip over OP_COND/OP_BRANUMBER */
3130 &subfirstbyte, /* For possible first char */
3131 &subreqbyte, /* For possible last char */
3132 bcptr, /* Current branch chain */
3133 cd)) /* Tables block */
3134 goto FAILED;
3135
3136 /* At the end of compiling, code is still pointing to the start of the
3137 group, while tempcode has been updated to point past the end of the group
3138 and any option resetting that may follow it. The pattern pointer (ptr)
3139 is on the bracket. */
3140
3141 /* If this is a conditional bracket, check that there are no more than
3142 two branches in the group. */
3143
3144 else if (bravalue == OP_COND)
3145 {
3146 uschar *tc = code;
3147 condcount = 0;
3148
3149 do {
3150 condcount++;
3151 tc += GET(tc,1);
3152 }
3153 while (*tc != OP_KET);
3154
3155 if (condcount > 2)
3156 {
3157 *errorcodeptr = ERR27;
3158 goto FAILED;
3159 }
3160
3161 /* If there is just one branch, we must not make use of its firstbyte or
3162 reqbyte, because this is equivalent to an empty second branch. */
3163
3164 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
3165 }
3166
3167 /* Handle updating of the required and first characters. Update for normal
3168 brackets of all kinds, and conditions with two branches (see code above).
3169 If the bracket is followed by a quantifier with zero repeat, we have to
3170 back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
3171 main loop so that they can be accessed for the back off. */
3172
3173 zeroreqbyte = reqbyte;
3174 zerofirstbyte = firstbyte;
3175 groupsetfirstbyte = FALSE;
3176
3177 if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
3178 {
3179 /* If we have not yet set a firstbyte in this branch, take it from the
3180 subpattern, remembering that it was set here so that a repeat of more
3181 than one can replicate it as reqbyte if necessary. If the subpattern has
3182 no firstbyte, set "none" for the whole branch. In both cases, a zero
3183 repeat forces firstbyte to "none". */
3184
3185 if (firstbyte == REQ_UNSET)
3186 {
3187 if (subfirstbyte >= 0)
3188 {
3189 firstbyte = subfirstbyte;
3190 groupsetfirstbyte = TRUE;
3191 }
3192 else firstbyte = REQ_NONE;
3193 zerofirstbyte = REQ_NONE;
3194 }
3195
3196 /* If firstbyte was previously set, convert the subpattern's firstbyte
3197 into reqbyte if there wasn't one, using the vary flag that was in
3198 existence beforehand. */
3199
3200 else if (subfirstbyte >= 0 && subreqbyte < 0)
3201 subreqbyte = subfirstbyte | tempreqvary;
3202
3203 /* If the subpattern set a required byte (or set a first byte that isn't
3204 really the first byte - see above), set it. */
3205
3206 if (subreqbyte >= 0) reqbyte = subreqbyte;
3207 }
3208
3209 /* For a forward assertion, we take the reqbyte, if set. This can be
3210 helpful if the pattern that follows the assertion doesn't set a different
3211 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
3212 for an assertion, however because it leads to incorrect effect for patterns
3213 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
3214 of a firstbyte. This is overcome by a scan at the end if there's no
3215 firstbyte, looking for an asserted first char. */
3216
3217 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
3218
3219 /* Now update the main code pointer to the end of the group. */
3220
3221 code = tempcode;
3222
3223 /* Error if hit end of pattern */
3224
3225 if (*ptr != ')')
3226 {
3227 *errorcodeptr = ERR14;
3228 goto FAILED;
3229 }
3230 break;
3231
3232 /* Check \ for being a real metacharacter; if not, fall through and handle
3233 it as a data character at the start of a string. Escape items are checked
3234 for validity in the pre-compiling pass. */
3235
3236 case '\\':
3237 tempptr = ptr;
3238 c = check_escape(&ptr, errorcodeptr, *brackets, options, FALSE);
3239
3240 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
3241 are arranged to be the negation of the corresponding OP_values. For the
3242 back references, the values are ESC_REF plus the reference number. Only
3243 back references and those types that consume a character may be repeated.
3244 We can test for values between ESC_b and ESC_Z for the latter; this may
3245 have to change if any new ones are ever created. */
3246
3247 if (c < 0)
3248 {
3249 if (-c == ESC_Q) /* Handle start of quoted string */
3250 {
3251 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
3252 else inescq = TRUE;
3253 continue;
3254 }
3255
3256 /* For metasequences that actually match a character, we disable the
3257 setting of a first character if it hasn't already been set. */
3258
3259 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
3260 firstbyte = REQ_NONE;
3261
3262 /* Set values to reset to if this is followed by a zero repeat. */
3263
3264 zerofirstbyte = firstbyte;
3265 zeroreqbyte = reqbyte;
3266
3267 /* Back references are handled specially */
3268
3269 if (-c >= ESC_REF)
3270 {
3271 int number = -c - ESC_REF;
3272 previous = code;
3273 *code++ = OP_REF;
3274 PUT2INC(code, 0, number);
3275 }
3276
3277 /* So are Unicode property matches, if supported. We know that get_ucp
3278 won't fail because it was tested in the pre-pass. */
3279
3280 #ifdef SUPPORT_UCP
3281 else if (-c == ESC_P || -c == ESC_p)
3282 {
3283 BOOL negated;
3284 int value = get_ucp(&ptr, &negated, errorcodeptr);
3285 previous = code;
3286 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
3287 *code++ = value;
3288 }
3289 #endif
3290
3291 /* For the rest, we can obtain the OP value by negating the escape
3292 value */
3293
3294 else
3295 {
3296 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
3297 *code++ = -c;
3298 }
3299 continue;
3300 }
3301
3302 /* We have a data character whose value is in c. In UTF-8 mode it may have
3303 a value > 127. We set its representation in the length/buffer, and then
3304 handle it as a data character. */
3305
3306 #ifdef SUPPORT_UTF8
3307 if (utf8 && c > 127)
3308 mclength = _pcre_ord2utf8(c, mcbuffer);
3309 else
3310 #endif
3311
3312 {
3313 mcbuffer[0] = c;
3314 mclength = 1;
3315 }
3316
3317 goto ONE_CHAR;
3318
3319 /* Handle a literal character. It is guaranteed not to be whitespace or #
3320 when the extended flag is set. If we are in UTF-8 mode, it may be a
3321 multi-byte literal character. */
3322
3323 default:
3324 NORMAL_CHAR:
3325 mclength = 1;
3326 mcbuffer[0] = c;
3327
3328 #ifdef SUPPORT_UTF8
3329 if (utf8 && (c & 0xc0) == 0xc0)
3330 {
3331 while ((ptr[1] & 0xc0) == 0x80)
3332 mcbuffer[mclength++] = *(++ptr);
3333 }
3334 #endif
3335
3336 /* At this point we have the character's bytes in mcbuffer, and the length
3337 in mclength. When not in UTF-8 mode, the length is always 1. */
3338
3339 ONE_CHAR:
3340 previous = code;
3341 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
3342 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
3343
3344 /* Set the first and required bytes appropriately. If no previous first
3345 byte, set it from this character, but revert to none on a zero repeat.
3346 Otherwise, leave the firstbyte value alone, and don't change it on a zero
3347 repeat. */
3348
3349 if (firstbyte == REQ_UNSET)
3350 {
3351 zerofirstbyte = REQ_NONE;
3352 zeroreqbyte = reqbyte;
3353
3354 /* If the character is more than one byte long, we can set firstbyte
3355 only if it is not to be matched caselessly. */
3356
3357 if (mclength == 1 || req_caseopt == 0)
3358 {
3359 firstbyte = mcbuffer[0] | req_caseopt;
3360 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
3361 }
3362 else firstbyte = reqbyte = REQ_NONE;
3363 }
3364
3365 /* firstbyte was previously set; we can set reqbyte only if the length is
3366 1 or the matching is caseful. */
3367
3368 else
3369 {
3370 zerofirstbyte = firstbyte;
3371 zeroreqbyte = reqbyte;
3372 if (mclength == 1 || req_caseopt == 0)
3373 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3374 }
3375
3376 break; /* End of literal character handling */
3377 }
3378 } /* end of big loop */
3379
3380 /* Control never reaches here by falling through, only by a goto for all the
3381 error states. Pass back the position in the pattern so that it can be displayed
3382 to the user for diagnosing the error. */
3383
3384 FAILED:
3385 *ptrptr = ptr;
3386 return FALSE;
3387 }
3388
3389
3390
3391
3392 /*************************************************
3393 * Compile sequence of alternatives *
3394 *************************************************/
3395
3396 /* On entry, ptr is pointing past the bracket character, but on return
3397 it points to the closing bracket, or vertical bar, or end of string.
3398 The code variable is pointing at the byte into which the BRA operator has been
3399 stored. If the ims options are changed at the start (for a (?ims: group) or
3400 during any branch, we need to insert an OP_OPT item at the start of every
3401 following branch to ensure they get set correctly at run time, and also pass
3402 the new options into every subsequent branch compile.
3403
3404 Arguments:
3405 options option bits, including any changes for this subpattern
3406 oldims previous settings of ims option bits
3407 brackets -> int containing the number of extracting brackets used
3408 codeptr -> the address of the current code pointer
3409 ptrptr -> the address of the current pattern pointer
3410 errorcodeptr -> pointer to error code variable
3411 lookbehind TRUE if this is a lookbehind assertion
3412 skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER)
3413 firstbyteptr place to put the first required character, or a negative number
3414 reqbyteptr place to put the last required character, or a negative number
3415 bcptr pointer to the chain of currently open branches
3416 cd points to the data block with tables pointers etc.
3417
3418 Returns: TRUE on success
3419 */
3420
3421 static BOOL
3422 compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3423 const uschar **ptrptr, int *errorcodeptr, BOOL lookbehind, int skipbytes,
3424 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
3425 {
3426 const uschar *ptr = *ptrptr;
3427 uschar *code = *codeptr;
3428 uschar *last_branch = code;
3429 uschar *start_bracket = code;
3430 uschar *reverse_count = NULL;
3431 int firstbyte, reqbyte;
3432 int branchfirstbyte, branchreqbyte;
3433 branch_chain bc;
3434
3435 bc.outer = bcptr;
3436 bc.current = code;
3437
3438 firstbyte = reqbyte = REQ_UNSET;
3439
3440 /* Offset is set zero to mark that this bracket is still open */
3441
3442 PUT(code, 1, 0);
3443 code += 1 + LINK_SIZE + skipbytes;
3444
3445 /* Loop for each alternative branch */
3446
3447 for (;;)
3448 {
3449 /* Handle a change of ims options at the start of the branch */
3450
3451 if ((options & PCRE_IMS) != oldims)
3452 {
3453 *code++ = OP_OPT;
3454 *code++ = options & PCRE_IMS;
3455 }
3456
3457 /* Set up dummy OP_REVERSE if lookbehind assertion */
3458
3459 if (lookbehind)
3460 {
3461 *code++ = OP_REVERSE;
3462 reverse_count = code;
3463 PUTINC(code, 0, 0);
3464 }
3465
3466 /* Now compile the branch */
3467
3468 if (!compile_branch(&options, brackets, &code, &ptr, errorcodeptr,
3469 &branchfirstbyte, &branchreqbyte, &bc, cd))
3470 {
3471 *ptrptr = ptr;
3472 return FALSE;
3473 }
3474
3475 /* If this is the first branch, the firstbyte and reqbyte values for the
3476 branch become the values for the regex. */
3477
3478 if (*last_branch != OP_ALT)
3479 {
3480 firstbyte = branchfirstbyte;
3481 reqbyte = branchreqbyte;
3482 }
3483
3484 /* If this is not the first branch, the first char and reqbyte have to
3485 match the values from all the previous branches, except that if the previous
3486 value for reqbyte didn't have REQ_VARY set, it can still match, and we set
3487 REQ_VARY for the regex. */
3488
3489 else
3490 {
3491 /* If we previously had a firstbyte, but it doesn't match the new branch,
3492 we have to abandon the firstbyte for the regex, but if there was previously
3493 no reqbyte, it takes on the value of the old firstbyte. */
3494
3495 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
3496 {
3497 if (reqbyte < 0) reqbyte = firstbyte;
3498 firstbyte = REQ_NONE;
3499 }
3500
3501 /* If we (now or from before) have no firstbyte, a firstbyte from the
3502 branch becomes a reqbyte if there isn't a branch reqbyte. */
3503
3504 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
3505 branchreqbyte = branchfirstbyte;
3506
3507 /* Now ensure that the reqbytes match */
3508
3509 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
3510 reqbyte = REQ_NONE;
3511 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
3512 }
3513
3514 /* If lookbehind, check that this branch matches a fixed-length string,
3515 and put the length into the OP_REVERSE item. Temporarily mark the end of
3516 the branch with OP_END. */
3517
3518 if (lookbehind)
3519 {
3520 int length;
3521 *code = OP_END;
3522 length = find_fixedlength(last_branch, options);
3523 DPRINTF(("fixed length = %d\n", length));
3524 if (length < 0)
3525 {
3526 *errorcodeptr = (length == -2)? ERR36 : ERR25;
3527 *ptrptr = ptr;
3528 return FALSE;
3529 }
3530 PUT(reverse_count, 0, length);
3531 }
3532
3533 /* Reached end of expression, either ')' or end of pattern. Go back through
3534 the alternative branches and reverse the chain of offsets, with the field in
3535 the BRA item now becoming an offset to the first alternative. If there are
3536 no alternatives, it points to the end of the group. The length in the
3537 terminating ket is always the length of the whole bracketed item. If any of
3538 the ims options were changed inside the group, compile a resetting op-code
3539 following, except at the very end of the pattern. Return leaving the pointer
3540 at the terminating char. */
3541
3542 if (*ptr != '|')
3543 {
3544 int length = code - last_branch;
3545 do
3546 {
3547 int prev_length = GET(last_branch, 1);
3548 PUT(last_branch, 1, length);
3549 length = prev_length;
3550 last_branch -= length;
3551 }
3552 while (length > 0);
3553
3554 /* Fill in the ket */
3555
3556 *code = OP_KET;
3557 PUT(code, 1, code - start_bracket);
3558 code += 1 + LINK_SIZE;
3559
3560 /* Resetting option if needed */
3561
3562 if ((options & PCRE_IMS) != oldims && *ptr == ')')
3563 {
3564 *code++ = OP_OPT;
3565 *code++ = oldims;
3566 }
3567
3568 /* Set values to pass back */
3569
3570 *codeptr = code;
3571 *ptrptr = ptr;
3572 *firstbyteptr = firstbyte;
3573 *reqbyteptr = reqbyte;
3574 return TRUE;
3575 }
3576
3577 /* Another branch follows; insert an "or" node. Its length field points back
3578 to the previous branch while the bracket remains open. At the end the chain
3579 is reversed. It's done like this so that the start of the bracket has a
3580 zero offset until it is closed, making it possible to detect recursion. */
3581
3582 *code = OP_ALT;
3583 PUT(code, 1, code - last_branch);
3584 bc.current = last_branch = code;
3585 code += 1 + LINK_SIZE;
3586 ptr++;
3587 }
3588 /* Control never reaches here */
3589 }
3590
3591
3592
3593
3594 /*************************************************
3595 * Check for anchored expression *
3596 *************************************************/
3597
3598 /* Try to find out if this is an anchored regular expression. Consider each
3599 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
3600 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
3601 it's anchored. However, if this is a multiline pattern, then only OP_SOD
3602 counts, since OP_CIRC can match in the middle.
3603
3604 We can also consider a regex to be anchored if OP_SOM starts all its branches.
3605 This is the code for \G, which means "match at start of match position, taking
3606 into account the match offset".
3607
3608 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
3609 because that will try the rest of the pattern at all possible matching points,
3610 so there is no point trying again.... er ....
3611
3612 .... except when the .* appears inside capturing parentheses, and there is a
3613 subsequent back reference to those parentheses. We haven't enough information
3614 to catch that case precisely.
3615
3616 At first, the best we could do was to detect when .* was in capturing brackets
3617 and the highest back reference was greater than or equal to that level.
3618 However, by keeping a bitmap of the first 31 back references, we can catch some
3619 of the more common cases more precisely.
3620
3621 Arguments:
3622 code points to start of expression (the bracket)
3623 options points to the options setting
3624 bracket_map a bitmap of which brackets we are inside while testing; this
3625 handles up to substring 31; after that we just have to take
3626 the less precise approach
3627 backref_map the back reference bitmap
3628
3629 Returns: TRUE or FALSE
3630 */
3631
3632 static BOOL
3633 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
3634 unsigned int backref_map)
3635 {
3636 do {
3637 const uschar *scode =
3638 first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE);
3639 register int op = *scode;
3640
3641 /* Capturing brackets */
3642
3643 if (op > OP_BRA)
3644 {
3645 int new_map;
3646 op -= OP_BRA;
3647 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3648 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3649 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
3650 }
3651
3652 /* Other brackets */
3653
3654 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3655 {
3656 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
3657 }
3658
3659 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
3660 are or may be referenced. */
3661
3662 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
3663 (*options & PCRE_DOTALL) != 0)
3664 {
3665 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3666 }
3667
3668 /* Check for explicit anchoring */
3669
3670 else if (op != OP_SOD && op != OP_SOM &&
3671 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
3672 return FALSE;
3673 code += GET(code, 1);
3674 }
3675 while (*code == OP_ALT); /* Loop for each alternative */
3676 return TRUE;
3677 }
3678
3679
3680
3681 /*************************************************
3682 * Check for starting with ^ or .* *
3683 *************************************************/
3684
3685 /* This is called to find out if every branch starts with ^ or .* so that
3686 "first char" processing can be done to speed things up in multiline
3687 matching and for non-DOTALL patterns that start with .* (which must start at
3688 the beginning or after \n). As in the case of is_anchored() (see above), we
3689 have to take account of back references to capturing brackets that contain .*
3690 because in that case we can't make the assumption.
3691
3692 Arguments:
3693 code points to start of expression (the bracket)
3694 bracket_map a bitmap of which brackets we are inside while testing; this
3695 handles up to substring 31; after that we just have to take
3696 the less precise approach
3697 backref_map the back reference bitmap
3698
3699 Returns: TRUE or FALSE
3700 */
3701
3702 static BOOL
3703 is_startline(const uschar *code, unsigned int bracket_map,
3704 unsigned int backref_map)
3705 {
3706 do {
3707 const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0,
3708 FALSE);
3709 register int op = *scode;
3710
3711 /* Capturing brackets */
3712
3713 if (op > OP_BRA)
3714 {
3715 int new_map;
3716 op -= OP_BRA;
3717 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3718 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3719 if (!is_startline(scode, new_map, backref_map)) return FALSE;
3720 }
3721
3722 /* Other brackets */
3723
3724 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3725 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
3726
3727 /* .* means "start at start or after \n" if it isn't in brackets that
3728 may be referenced. */
3729
3730 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
3731 {
3732 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3733 }
3734
3735 /* Check for explicit circumflex */
3736
3737 else if (op != OP_CIRC) return FALSE;
3738
3739 /* Move on to the next alternative */
3740
3741 code += GET(code, 1);
3742 }
3743 while (*code == OP_ALT); /* Loop for each alternative */
3744 return TRUE;
3745 }
3746
3747
3748
3749 /*************************************************
3750 * Check for asserted fixed first char *
3751 *************************************************/
3752
3753 /* During compilation, the "first char" settings from forward assertions are
3754 discarded, because they can cause conflicts with actual literals that follow.
3755 However, if we end up without a first char setting for an unanchored pattern,
3756 it is worth scanning the regex to see if there is an initial asserted first
3757 char. If all branches start with the same asserted char, or with a bracket all
3758 of whose alternatives start with the same asserted char (recurse ad lib), then
3759 we return that char, otherwise -1.
3760
3761 Arguments:
3762 code points to start of expression (the bracket)
3763 options pointer to the options (used to check casing changes)
3764 inassert TRUE if in an assertion
3765
3766 Returns: -1 or the fixed first char
3767 */
3768
3769 static int
3770 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
3771 {
3772 register int c = -1;
3773 do {
3774 int d;
3775 const uschar *scode =
3776 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
3777 register int op = *scode;
3778
3779 if (op >= OP_BRA) op = OP_BRA;
3780
3781 switch(op)
3782 {
3783 default:
3784 return -1;
3785
3786 case OP_BRA:
3787 case OP_ASSERT:
3788 case OP_ONCE:
3789 case OP_COND:
3790 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
3791 return -1;
3792 if (c < 0) c = d; else if (c != d) return -1;
3793 break;
3794
3795 case OP_EXACT: /* Fall through */
3796 scode += 2;
3797
3798 case OP_CHAR:
3799 case OP_CHARNC:
3800 case OP_PLUS:
3801 case OP_MINPLUS:
3802 if (!inassert) return -1;
3803 if (c < 0)
3804 {
3805 c = scode[1];
3806 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
3807 }
3808 else if (c != scode[1]) return -1;
3809 break;
3810 }
3811
3812 code += GET(code, 1);
3813 }
3814 while (*code == OP_ALT);
3815 return c;
3816 }
3817
3818
3819
3820 /*************************************************
3821 * Compile a Regular Expression *
3822 *************************************************/
3823
3824 /* This function takes a string and returns a pointer to a block of store
3825 holding a compiled version of the expression. The original API for this
3826 function had no error code return variable; it is retained for backwards
3827 compatibility. The new function is given a new name.
3828
3829 Arguments:
3830 pattern the regular expression
3831 options various option bits
3832 errorcodeptr pointer to error code variable (pcre_compile2() only)
3833 can be NULL if you don't want a code value
3834 errorptr pointer to pointer to error text
3835 erroroffset ptr offset in pattern where error was detected
3836 tables pointer to character tables or NULL
3837
3838 Returns: pointer to compiled data block, or NULL on error,
3839 with errorptr and erroroffset set
3840 */
3841
3842 EXPORT pcre *
3843 pcre_compile(const char *pattern, int options, const char **errorptr,
3844 int *erroroffset, const unsigned char *tables)
3845 {
3846 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
3847 }
3848
3849
3850 EXPORT pcre *
3851 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
3852 const char **errorptr, int *erroroffset, const unsigned char *tables)
3853 {
3854 real_pcre *re;
3855 int length = 1 + LINK_SIZE; /* For initial BRA plus length */
3856 int c, firstbyte, reqbyte;
3857 int bracount = 0;
3858 int branch_extra = 0;
3859 int branch_newextra;
3860 int item_count = -1;
3861 int name_count = 0;
3862 int max_name_size = 0;
3863 int lastitemlength = 0;
3864 int errorcode = 0;
3865 #ifdef SUPPORT_UTF8
3866 BOOL utf8;
3867 BOOL class_utf8;
3868 #endif
3869 BOOL inescq = FALSE;
3870 BOOL capturing;
3871 unsigned int brastackptr = 0;
3872 size_t size;
3873 uschar *code;
3874 const uschar *codestart;
3875 const uschar *ptr;
3876 compile_data compile_block;
3877 int brastack[BRASTACK_SIZE];
3878 uschar bralenstack[BRASTACK_SIZE];
3879
3880 /* We can't pass back an error message if errorptr is NULL; I guess the best we
3881 can do is just return NULL, but we can set a code value if there is a code
3882 pointer. */
3883
3884 if (errorptr == NULL)
3885 {
3886 if (errorcodeptr != NULL) *errorcodeptr = 99;
3887 return NULL;
3888 }
3889
3890 *errorptr = NULL;
3891 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
3892
3893 /* However, we can give a message for this error */
3894
3895 if (erroroffset == NULL)
3896 {
3897 errorcode = ERR16;
3898 goto PCRE_EARLY_ERROR_RETURN;
3899 }
3900
3901 *erroroffset = 0;
3902
3903 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
3904
3905 #ifdef SUPPORT_UTF8
3906 utf8 = (options & PCRE_UTF8) != 0;
3907 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
3908 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
3909 {
3910 errorcode = ERR44;
3911 goto PCRE_EARLY_ERROR_RETURN;
3912 }
3913 #else
3914 if ((options & PCRE_UTF8) != 0)
3915 {
3916 errorcode = ERR32;
3917 goto PCRE_EARLY_ERROR_RETURN;
3918 }
3919 #endif
3920
3921 if ((options & ~PUBLIC_OPTIONS) != 0)
3922 {
3923 errorcode = ERR17;
3924 goto PCRE_EARLY_ERROR_RETURN;
3925 }
3926
3927 /* Set up pointers to the individual character tables */
3928
3929 if (tables == NULL) tables = _pcre_default_tables;
3930 compile_block.lcc = tables + lcc_offset;
3931 compile_block.fcc = tables + fcc_offset;
3932 compile_block.cbits = tables + cbits_offset;
3933 compile_block.ctypes = tables + ctypes_offset;
3934
3935 /* Maximum back reference and backref bitmap. This is updated for numeric
3936 references during the first pass, but for named references during the actual
3937 compile pass. The bitmap records up to 31 back references to help in deciding
3938 whether (.*) can be treated as anchored or not. */
3939
3940 compile_block.top_backref = 0;
3941 compile_block.backref_map = 0;
3942
3943 /* Reflect pattern for debugging output */
3944
3945 DPRINTF(("------------------------------------------------------------------\n"));
3946 DPRINTF(("%s\n", pattern));
3947
3948 /* The first thing to do is to make a pass over the pattern to compute the
3949 amount of store required to hold the compiled code. This does not have to be
3950 perfect as long as errors are overestimates. At the same time we can detect any
3951 flag settings right at the start, and extract them. Make an attempt to correct
3952 for any counted white space if an "extended" flag setting appears late in the
3953 pattern. We can't be so clever for #-comments. */
3954
3955 ptr = (const uschar *)(pattern - 1);
3956 while ((c = *(++ptr)) != 0)
3957 {
3958 int min, max;
3959 int class_optcount;
3960 int bracket_length;
3961 int duplength;
3962
3963 /* If we are inside a \Q...\E sequence, all chars are literal */
3964
3965 if (inescq)
3966 {
3967 if ((options & PCRE_AUTO_CALLOUT) != 0) length += 2 + 2*LINK_SIZE;
3968 goto NORMAL_CHAR;
3969 }
3970
3971 /* Otherwise, first check for ignored whitespace and comments */
3972
3973 if ((options & PCRE_EXTENDED) != 0)
3974 {
3975 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
3976 if (c == '#')
3977 {
3978 /* The space before the ; is to avoid a warning on a silly compiler
3979 on the Macintosh. */
3980 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
3981 if (c == 0) break;
3982 continue;
3983 }
3984 }
3985
3986 item_count++; /* Is zero for the first non-comment item */
3987
3988 /* Allow space for auto callout before every item except quantifiers. */
3989
3990 if ((options & PCRE_AUTO_CALLOUT) != 0 &&
3991 c != '*' && c != '+' && c != '?' &&
3992 (c != '{' || !is_counted_repeat(ptr + 1)))
3993 length += 2 + 2*LINK_SIZE;
3994
3995 switch(c)
3996 {
3997 /* A backslashed item may be an escaped data character or it may be a
3998 character type. */
3999
4000 case '\\':
4001 c = check_escape(&ptr, &errorcode, bracount, options, FALSE);
4002 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4003
4004 lastitemlength = 1; /* Default length of last item for repeats */
4005
4006 if (c >= 0) /* Data character */
4007 {
4008 length += 2; /* For a one-byte character */
4009
4010 #ifdef SUPPORT_UTF8
4011 if (utf8 && c > 127)
4012 {
4013 int i;
4014 for (i = 0; i < _pcre_utf8_table1_size; i++)
4015 if (c <= _pcre_utf8_table1[i]) break;
4016 length += i;
4017 lastitemlength += i;
4018 }
4019 #endif
4020
4021 continue;
4022 }
4023
4024 /* If \Q, enter "literal" mode */
4025
4026 if (-c == ESC_Q)
4027 {
4028 inescq = TRUE;
4029 continue;
4030 }
4031
4032 /* \X is supported only if Unicode property support is compiled */
4033
4034 #ifndef SUPPORT_UCP
4035 if (-c == ESC_X)
4036 {
4037 errorcode = ERR45;
4038 goto PCRE_ERROR_RETURN;
4039 }
4040 #endif
4041
4042 /* \P and \p are for Unicode properties, but only when the support has
4043 been compiled. Each item needs 2 bytes. */
4044
4045 else if (-c == ESC_P || -c == ESC_p)
4046 {
4047 #ifdef SUPPORT_UCP
4048 BOOL negated;
4049 length += 2;
4050 lastitemlength = 2;
4051 if (get_ucp(&ptr, &negated, &errorcode) < 0) goto PCRE_ERROR_RETURN;
4052 continue;
4053 #else
4054 errorcode = ERR45;
4055 goto PCRE_ERROR_RETURN;
4056 #endif
4057 }
4058
4059 /* Other escapes need one byte */
4060
4061 length++;
4062
4063 /* A back reference needs an additional 2 bytes, plus either one or 5
4064 bytes for a repeat. We also need to keep the value of the highest
4065 back reference. */
4066
4067 if (c <= -ESC_REF)
4068 {
4069 int refnum = -c - ESC_REF;
4070 compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
4071 if (refnum > compile_block.top_backref)
4072 compile_block.top_backref = refnum;
4073 length += 2; /* For single back reference */
4074 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
4075 {
4076 ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
4077 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4078 if ((min == 0 && (max == 1 || max == -1)) ||
4079 (min == 1 && max == -1))
4080 length++;
4081 else length += 5;
4082 if (ptr[1] == '?') ptr++;
4083 }
4084 }
4085 continue;
4086
4087 case '^': /* Single-byte metacharacters */
4088 case '.':
4089 case '$':
4090 length++;
4091 lastitemlength = 1;
4092 continue;
4093
4094 case '*': /* These repeats won't be after brackets; */
4095 case '+': /* those are handled separately */
4096 case '?':
4097 length++;
4098 goto POSESSIVE; /* A few lines below */
4099
4100 /* This covers the cases of braced repeats after a single char, metachar,
4101 class, or back reference. */
4102
4103 case '{':
4104 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
4105 ptr = read_repeat_counts(ptr+1, &min, &max, &errorcode);
4106 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4107
4108 /* These special cases just insert one extra opcode */
4109
4110 if ((min == 0 && (max == 1 || max == -1)) ||
4111 (min == 1 && max == -1))
4112 length++;
4113
4114 /* These cases might insert additional copies of a preceding character. */
4115
4116 else
4117 {
4118 if (min != 1)
4119 {
4120 length -= lastitemlength; /* Uncount the original char or metachar */
4121 if (min > 0) length += 3 + lastitemlength;
4122 }
4123 length += lastitemlength + ((max > 0)? 3 : 1);
4124 }
4125
4126 if (ptr[1] == '?') ptr++; /* Needs no extra length */
4127
4128 POSESSIVE: /* Test for possessive quantifier */
4129 if (ptr[1] == '+')
4130 {
4131 ptr++;
4132 length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */
4133 }
4134 continue;
4135
4136 /* An alternation contains an offset to the next branch or ket. If any ims
4137 options changed in the previous branch(es), and/or if we are in a
4138 lookbehind assertion, extra space will be needed at the start of the
4139 branch. This is handled by branch_extra. */
4140
4141 case '|':
4142 length += 1 + LINK_SIZE + branch_extra;
4143 continue;
4144
4145 /* A character class uses 33 characters provided that all the character
4146 values are less than 256. Otherwise, it uses a bit map for low valued
4147 characters, and individual items for others. Don't worry about character
4148 types that aren't allowed in classes - they'll get picked up during the
4149 compile. A character class that contains only one single-byte character
4150 uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
4151 where we can. (In UTF-8 mode we can do this only for chars < 128.) */
4152
4153 case '[':
4154 if (*(++ptr) == '^')
4155 {
4156 class_optcount = 10; /* Greater than one */
4157 ptr++;
4158 }
4159 else class_optcount = 0;
4160
4161 #ifdef SUPPORT_UTF8
4162 class_utf8 = FALSE;
4163 #endif
4164
4165 /* Written as a "do" so that an initial ']' is taken as data */
4166
4167 if (*ptr != 0) do
4168 {
4169 /* Inside \Q...\E everything is literal except \E */
4170
4171 if (inescq)
4172 {
4173 if (*ptr != '\\' || ptr[1] != 'E') goto GET_ONE_CHARACTER;
4174 inescq = FALSE;
4175 ptr += 1;
4176 continue;
4177 }
4178
4179 /* Outside \Q...\E, check for escapes */
4180
4181 if (*ptr == '\\')
4182 {
4183 c = check_escape(&ptr, &errorcode, bracount, options, TRUE);
4184 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4185
4186 /* \b is backspace inside a class; \X is literal */
4187
4188 if (-c == ESC_b) c = '\b';
4189 else if (-c == ESC_X) c = 'X';
4190
4191 /* \Q enters quoting mode */
4192
4193 else if (-c == ESC_Q)
4194 {
4195 inescq = TRUE;
4196 continue;
4197 }
4198
4199 /* Handle escapes that turn into characters */
4200
4201 if (c >= 0) goto NON_SPECIAL_CHARACTER;
4202
4203 /* Escapes that are meta-things. The normal ones just affect the
4204 bit map, but Unicode properties require an XCLASS extended item. */
4205
4206 else
4207 {
4208 class_optcount = 10; /* \d, \s etc; make sure > 1 */
4209 #ifdef SUPPORT_UTF8
4210 if (-c == ESC_p || -c == ESC_P)
4211 {
4212 if (!class_utf8)
4213 {
4214 class_utf8 = TRUE;
4215 length += LINK_SIZE + 2;
4216 }
4217 length += 2;
4218 }
4219 #endif
4220 }
4221 }
4222
4223 /* Check the syntax for POSIX stuff. The bits we actually handle are
4224 checked during the real compile phase. */
4225
4226 else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))
4227 {
4228 ptr++;
4229 class_optcount = 10; /* Make sure > 1 */
4230 }
4231
4232 /* Anything else increments the possible optimization count. We have to
4233 detect ranges here so that we can compute the number of extra ranges for
4234 caseless wide characters when UCP support is available. If there are wide
4235 characters, we are going to have to use an XCLASS, even for single
4236 characters. */
4237
4238 else
4239 {
4240 int d;
4241
4242 GET_ONE_CHARACTER:
4243
4244 #ifdef SUPPORT_UTF8
4245 if (utf8)
4246 {
4247 int extra = 0;
4248 GETCHARLEN(c, ptr, extra);
4249 ptr += extra;
4250 }
4251 else c = *ptr;
4252 #else
4253 c = *ptr;
4254 #endif
4255
4256 /* Come here from handling \ above when it escapes to a char value */
4257
4258 NON_SPECIAL_CHARACTER:
4259 class_optcount++;
4260
4261 d = -1;
4262 if (ptr[1] == '-')
4263 {
4264 uschar const *hyptr = ptr++;
4265 if (ptr[1] == '\\')
4266 {
4267 ptr++;
4268 d = check_escape(&ptr, &errorcode, bracount, options, TRUE);
4269 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4270 if (-d == ESC_b) d = '\b'; /* backspace */
4271 else if (-d == ESC_X) d = 'X'; /* literal X in a class */
4272 }
4273 else if (ptr[1] != 0 && ptr[1] != ']')
4274 {
4275 ptr++;
4276 #ifdef SUPPORT_UTF8
4277 if (utf8)
4278 {
4279 int extra = 0;
4280 GETCHARLEN(d, ptr, extra);
4281 ptr += extra;
4282 }
4283 else
4284 #endif
4285 d = *ptr;
4286 }
4287 if (d < 0) ptr = hyptr; /* go back to hyphen as data */
4288 }
4289
4290 /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
4291 127 for caseless matching, we will need to use an XCLASS. */
4292
4293 if (d >= 0)
4294 {
4295 class_optcount = 10; /* Ensure > 1 */
4296 if (d < c)
4297 {
4298 errorcode = ERR8;
4299 goto PCRE_ERROR_RETURN;
4300 }
4301
4302 #ifdef SUPPORT_UTF8
4303 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4304 {
4305 uschar buffer[6];
4306 if (!class_utf8) /* Allow for XCLASS overhead */
4307 {
4308 class_utf8 = TRUE;
4309 length += LINK_SIZE + 2;
4310 }
4311
4312 #ifdef SUPPORT_UCP
4313 /* If we have UCP support, find out how many extra ranges are
4314 needed to map the other case of characters within this range. We
4315 have to mimic the range optimization here, because extending the
4316 range upwards might push d over a boundary that makes it use
4317 another byte in the UTF-8 representation. */
4318
4319 if ((options & PCRE_CASELESS) != 0)
4320 {
4321 int occ, ocd;
4322 int cc = c;
4323 int origd = d;
4324 while (get_othercase_range(&cc, origd, &occ, &ocd))
4325 {
4326 if (occ >= c && ocd <= d) continue; /* Skip embedded */
4327
4328 if (occ < c && ocd >= c - 1) /* Extend the basic range */
4329 { /* if there is overlap, */
4330 c = occ; /* noting that if occ < c */
4331 continue; /* we can't have ocd > d */
4332 } /* because a subrange is */
4333 if (ocd > d && occ <= d + 1) /* always shorter than */
4334 { /* the basic range. */
4335 d = ocd;
4336 continue;
4337 }
4338
4339 /* An extra item is needed */
4340
4341 length += 1 + _pcre_ord2utf8(occ, buffer) +
4342 ((occ == ocd)? 0 : _pcre_ord2utf8(ocd, buffer));
4343 }
4344 }
4345 #endif /* SUPPORT_UCP */
4346
4347 /* The length of the (possibly extended) range */
4348
4349 length += 1 + _pcre_ord2utf8(c, buffer) + _pcre_ord2utf8(d, buffer);
4350 }
4351 #endif /* SUPPORT_UTF8 */
4352
4353 }
4354
4355 /* We have a single character. There is nothing to be done unless we
4356 are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
4357 allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
4358 support. */
4359
4360 else
4361 {
4362 #ifdef SUPPORT_UTF8
4363 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
4364 {
4365 uschar buffer[6];
4366 class_optcount = 10; /* Ensure > 1 */
4367 if (!class_utf8) /* Allow for XCLASS overhead */
4368 {
4369 class_utf8 = TRUE;
4370 length += LINK_SIZE + 2;
4371 }
4372 #ifdef SUPPORT_UCP
4373 length += (((options & PCRE_CASELESS) != 0)? 2 : 1) *
4374 (1 + _pcre_ord2utf8(c, buffer));
4375 #else /* SUPPORT_UCP */
4376 length += 1 + _pcre_ord2utf8(c, buffer);
4377 #endif /* SUPPORT_UCP */
4378 }
4379 #endif /* SUPPORT_UTF8 */
4380 }
4381 }
4382 }
4383 while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */
4384
4385 if (*ptr == 0) /* Missing terminating ']' */
4386 {
4387 errorcode = ERR6;
4388 goto PCRE_ERROR_RETURN;
4389 }
4390
4391 /* We can optimize when there was only one optimizable character. Repeats
4392 for positive and negated single one-byte chars are handled by the general
4393 code. Here, we handle repeats for the class opcodes. */
4394
4395 if (class_optcount == 1) length += 3; else
4396 {
4397 length += 33;
4398
4399 /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
4400 we also need extra for wrapping the whole thing in a sub-pattern. */
4401
4402 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
4403 {
4404 ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
4405 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4406 if ((min == 0 && (max == 1 || max == -1)) ||
4407 (min == 1 && max == -1))
4408 length++;
4409 else length += 5;
4410 if (ptr[1] == '+')
4411 {
4412 ptr++;
4413 length += 2 + 2*LINK_SIZE;
4414 }
4415 else if (ptr[1] == '?') ptr++;
4416 }
4417 }
4418 continue;
4419
4420 /* Brackets may be genuine groups or special things */
4421
4422 case '(':
4423 branch_newextra = 0;
4424 bracket_length = 1 + LINK_SIZE;
4425 capturing = FALSE;
4426
4427 /* Handle special forms of bracket, which all start (? */
4428
4429 if (ptr[1] == '?')
4430 {
4431 int set, unset;
4432 int *optset;
4433
4434 switch (c = ptr[2])
4435 {
4436 /* Skip over comments entirely */
4437 case '#':
4438 ptr += 3;
4439 while (*ptr != 0 && *ptr != ')') ptr++;
4440 if (*ptr == 0)
4441 {
4442 errorcode = ERR18;
4443 goto PCRE_ERROR_RETURN;
4444 }
4445 continue;
4446
4447 /* Non-referencing groups and lookaheads just move the pointer on, and
4448 then behave like a non-special bracket, except that they don't increment
4449 the count of extracting brackets. Ditto for the "once only" bracket,
4450 which is in Perl from version 5.005. */
4451
4452 case ':':
4453 case '=':
4454 case '!':
4455 case '>':
4456 ptr += 2;
4457 break;
4458
4459 /* (?R) specifies a recursive call to the regex, which is an extension
4460 to provide the facility which can be obtained by (?p{perl-code}) in
4461 Perl 5.6. In Perl 5.8 this has become (??{perl-code}).
4462
4463 From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to
4464 the appropriate numbered brackets. This includes both recursive and
4465 non-recursive calls. (?R) is now synonymous with (?0). */
4466
4467 case 'R':
4468 ptr++;
4469
4470 case '0': case '1': case '2': case '3': case '4':
4471 case '5': case '6': case '7': case '8': case '9':
4472 ptr += 2;
4473 if (c != 'R')
4474 while ((digitab[*(++ptr)] & ctype_digit) != 0);
4475 if (*ptr != ')')
4476 {
4477 errorcode = ERR29;
4478 goto PCRE_ERROR_RETURN;
4479 }
4480 length += 1 + LINK_SIZE;
4481
4482 /* If this item is quantified, it will get wrapped inside brackets so
4483 as to use the code for quantified brackets. We jump down and use the
4484 code that handles this for real brackets. */
4485
4486 if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')
4487 {
4488 length += 2 + 2 * LINK_SIZE; /* to make bracketed */
4489 duplength = 5 + 3 * LINK_SIZE;
4490 goto HANDLE_QUANTIFIED_BRACKETS;
4491 }
4492 continue;
4493
4494 /* (?C) is an extension which provides "callout" - to provide a bit of
4495 the functionality of the Perl (?{...}) feature. An optional number may
4496 follow (default is zero). */
4497
4498 case 'C':
4499 ptr += 2;
4500 while ((digitab[*(++ptr)] & ctype_digit) != 0);
4501 if (*ptr != ')')
4502 {
4503 errorcode = ERR39;
4504 goto PCRE_ERROR_RETURN;
4505 }
4506 length += 2 + 2*LINK_SIZE;
4507 continue;
4508
4509 /* Named subpatterns are an extension copied from Python */
4510
4511 case 'P':
4512 ptr += 3;
4513
4514 /* Handle the definition of a named subpattern */
4515
4516 if (*ptr == '<')
4517 {
4518 const uschar *p; /* Don't amalgamate; some compilers */
4519 p = ++ptr; /* grumble at autoincrement in declaration */
4520 while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;
4521 if (*ptr != '>')
4522 {
4523 errorcode = ERR42;
4524 goto PCRE_ERROR_RETURN;
4525 }
4526 name_count++;
4527 if (ptr - p > max_name_size) max_name_size = (ptr - p);
4528 capturing = TRUE; /* Named parentheses are always capturing */
4529 break;
4530 }
4531
4532 /* Handle back references and recursive calls to named subpatterns */
4533
4534 if (*ptr == '=' || *ptr == '>')
4535 {
4536 while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0);
4537 if (*ptr != ')')
4538 {
4539 errorcode = ERR42;
4540 goto PCRE_ERROR_RETURN;
4541 }
4542 break;
4543 }
4544
4545 /* Unknown character after (?P */
4546
4547 errorcode = ERR41;
4548 goto PCRE_ERROR_RETURN;
4549
4550 /* Lookbehinds are in Perl from version 5.005 */
4551
4552 case '<':
4553 ptr += 3;
4554 if (*ptr == '=' || *ptr == '!')
4555 {
4556 branch_newextra = 1 + LINK_SIZE;
4557 length += 1 + LINK_SIZE; /* For the first branch */
4558 break;
4559 }
4560 errorcode = ERR24;
4561 goto PCRE_ERROR_RETURN;
4562
4563 /* Conditionals are in Perl from version 5.005. The bracket must either
4564 be followed by a number (for bracket reference) or by an assertion
4565 group, or (a PCRE extension) by 'R' for a recursion test. */
4566
4567 case '(':
4568 if (ptr[3] == 'R' && ptr[4] == ')')
4569 {
4570 ptr += 4;
4571 length += 3;
4572 }
4573 else if ((digitab[ptr[3]] & ctype_digit) != 0)
4574 {
4575 ptr += 4;
4576 length += 3;
4577 while ((digitab[*ptr] & ctype_digit) != 0) ptr++;
4578 if (*ptr != ')')
4579 {
4580 errorcode = ERR26;
4581 goto PCRE_ERROR_RETURN;
4582 }
4583 }
4584 else /* An assertion must follow */
4585 {
4586 ptr++; /* Can treat like ':' as far as spacing is concerned */
4587 if (ptr[2] != '?' ||
4588 (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
4589 {
4590 ptr += 2; /* To get right offset in message */
4591 errorcode = ERR28;
4592 goto PCRE_ERROR_RETURN;
4593 }
4594 }
4595 break;
4596
4597 /* Else loop checking valid options until ) is met. Anything else is an
4598 error. If we are without any brackets, i.e. at top level, the settings
4599 act as if specified in the options, so massage the options immediately.
4600 This is for backward compatibility with Perl 5.004. */
4601
4602 default:
4603 set = unset = 0;
4604 optset = &set;
4605 ptr += 2;
4606
4607 for (;; ptr++)
4608 {
4609 c = *ptr;
4610 switch (c)
4611 {
4612 case 'i':
4613 *optset |= PCRE_CASELESS;
4614 continue;
4615
4616 case 'm':
4617 *optset |= PCRE_MULTILINE;
4618 continue;
4619
4620 case 's':
4621 *optset |= PCRE_DOTALL;
4622 continue;
4623
4624 case 'x':
4625 *optset |= PCRE_EXTENDED;
4626 continue;
4627
4628 case 'X':
4629 *optset |= PCRE_EXTRA;
4630 continue;
4631
4632 case 'U':
4633 *optset |= PCRE_UNGREEDY;
4634 continue;
4635
4636 case '-':
4637 optset = &unset;
4638 continue;
4639
4640 /* A termination by ')' indicates an options-setting-only item; if
4641 this is at the very start of the pattern (indicated by item_count
4642 being zero), we use it to set the global options. This is helpful
4643 when analyzing the pattern for first characters, etc. Otherwise
4644 nothing is done here and it is handled during the compiling
4645 process.
4646
4647 We allow for more than one options setting at the start. If such
4648 settings do not change the existing options, nothing is compiled.
4649 However, we must leave space just in case something is compiled.
4650 This can happen for pathological sequences such as (?i)(?-i)
4651 because the global options will end up with -i set. The space is
4652 small and not significant. (Before I did this there was a reported
4653 bug with (?i)(?-i) in a machine-generated pattern.)
4654
4655 [Historical note: Up to Perl 5.8, options settings at top level
4656 were always global settings, wherever they appeared in the pattern.
4657 That is, they were equivalent to an external setting. From 5.8
4658 onwards, they apply only to what follows (which is what you might
4659 expect).] */
4660
4661 case ')':
4662 if (item_count == 0)
4663 {
4664 options = (options | set) & (~unset);
4665 set = unset = 0; /* To save length */
4666 item_count--; /* To allow for several */
4667 length += 2;
4668 }
4669
4670 /* Fall through */
4671
4672 /* A termination by ':' indicates the start of a nested group with
4673 the given options set. This is again handled at compile time, but
4674 we must allow for compiled space if any of the ims options are
4675 set. We also have to allow for resetting space at the end of
4676 the group, which is why 4 is added to the length and not just 2.
4677 If there are several changes of options within the same group, this
4678 will lead to an over-estimate on the length, but this shouldn't
4679 matter very much. We also have to allow for resetting options at
4680 the start of any alternations, which we do by setting
4681 branch_newextra to 2. Finally, we record whether the case-dependent
4682 flag ever changes within the regex. This is used by the "required
4683 character" code. */
4684
4685 case ':':
4686 if (((set|unset) & PCRE_IMS) != 0)
4687 {
4688 length += 4;
4689 branch_newextra = 2;
4690 if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
4691 }
4692 goto END_OPTIONS;
4693
4694 /* Unrecognized option character */
4695
4696 default:
4697 errorcode = ERR12;
4698 goto PCRE_ERROR_RETURN;
4699 }
4700 }
4701
4702 /* If we hit a closing bracket, that's it - this is a freestanding
4703 option-setting. We need to ensure that branch_extra is updated if
4704 necessary. The only values branch_newextra can have here are 0 or 2.
4705 If the value is 2, then branch_extra must either be 2 or 5, depending
4706 on whether this is a lookbehind group or not. */
4707
4708 END_OPTIONS:
4709 if (c == ')')
4710 {
4711 if (branch_newextra == 2 &&
4712 (branch_extra == 0 || branch_extra == 1+LINK_SIZE))
4713 branch_extra += branch_newextra;
4714 continue;
4715 }
4716
4717 /* If options were terminated by ':' control comes here. This is a
4718 non-capturing group with an options change. There is nothing more that
4719 needs to be done because "capturing" is already set FALSE by default;
4720 we can just fall through. */
4721
4722 }
4723 }
4724
4725 /* Ordinary parentheses, not followed by '?', are capturing unless
4726 PCRE_NO_AUTO_CAPTURE is set. */
4727
4728 else capturing = (options & PCRE_NO_AUTO_CAPTURE) == 0;
4729
4730 /* Capturing brackets must be counted so we can process escapes in a
4731 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to need
4732 an additional 3 bytes of memory per capturing bracket. */
4733
4734 if (capturing)
4735 {
4736 bracount++;
4737 if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
4738 }
4739
4740 /* Save length for computing whole length at end if there's a repeat that
4741 requires duplication of the group. Also save the current value of
4742 branch_extra, and start the new group with the new value. If non-zero, this
4743 will either be 2 for a (?imsx: group, or 1+LINK_SIZE for a lookbehind assertion. */
4744
4745 if (brastackptr >= sizeof(brastack)/sizeof(int))
4746 {
4747 errorcode = ERR19;
4748 goto PCRE_ERROR_RETURN;
4749 }
4750
4751 bralenstack[brastackptr] = branch_extra;
4752 branch_extra = branch_newextra;
4753
4754 brastack[brastackptr++] = length;
4755 length += bracket_length;
4756 continue;
4757
4758 /* Handle ket. Look for subsequent max/min; for certain sets of values we
4759 have to replicate this bracket up to that many times. If brastackptr is
4760 0 this is an unmatched bracket which will generate an error, but take care
4761 not to try to access brastack[-1] when computing the length and restoring
4762 the branch_extra value. */
4763
4764 case ')':
4765 length += 1 + LINK_SIZE;
4766 if (brastackptr > 0)
4767 {
4768 duplength = length - brastack[--brastackptr];
4769 branch_extra = bralenstack[brastackptr];
4770 }
4771 else duplength = 0;
4772
4773 /* The following code is also used when a recursion such as (?3) is
4774 followed by a quantifier, because in that case, it has to be wrapped inside
4775 brackets so that the quantifier works. The value of duplength must be
4776 set before arrival. */
4777
4778 HANDLE_QUANTIFIED_BRACKETS:
4779
4780 /* Leave ptr at the final char; for read_repeat_counts this happens
4781 automatically; for the others we need an increment. */
4782
4783 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
4784 {
4785 ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
4786 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4787 }
4788 else if (c == '*') { min = 0; max = -1; ptr++; }
4789 else if (c == '+') { min = 1; max = -1; ptr++; }
4790 else if (c == '?') { min = 0; max = 1; ptr++; }
4791 else { min = 1; max = 1; }
4792
4793 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
4794 group, and if the maximum is greater than zero, we have to replicate
4795 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
4796 bracket set. */
4797
4798 if (min == 0)
4799 {
4800 length++;
4801 if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
4802 }
4803
4804 /* When the minimum is greater than zero, we have to replicate up to
4805 minval-1 times, with no additions required in the copies. Then, if there
4806 is a limited maximum we have to replicate up to maxval-1 times allowing
4807 for a BRAZERO item before each optional copy and nesting brackets for all
4808 but one of the optional copies. */
4809
4810 else
4811 {
4812 length += (min - 1) * duplength;
4813 if (max > min) /* Need this test as max=-1 means no limit */
4814 length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
4815 - (2 + 2*LINK_SIZE);
4816 }
4817
4818 /* Allow space for once brackets for "possessive quantifier" */
4819
4820 if (ptr[1] == '+')
4821 {
4822 ptr++;
4823 length += 2 + 2*LINK_SIZE;
4824 }
4825 continue;
4826
4827 /* Non-special character. It won't be space or # in extended mode, so it is
4828 always a genuine character. If we are in a \Q...\E sequence, check for the
4829 end; if not, we have a literal. */
4830
4831 default:
4832 NORMAL_CHAR:
4833
4834 if (inescq && c == '\\' && ptr[1] == 'E')
4835 {
4836 inescq = FALSE;
4837 ptr++;
4838 continue;
4839 }
4840
4841 length += 2; /* For a one-byte character */
4842 lastitemlength = 1; /* Default length of last item for repeats */
4843
4844 /* In UTF-8 mode, check for additional bytes. */
4845
4846 #ifdef SUPPORT_UTF8
4847 if (utf8 && (c & 0xc0) == 0xc0)
4848 {
4849 while ((ptr[1] & 0xc0) == 0x80) /* Can't flow over the end */
4850 { /* because the end is marked */
4851 lastitemlength++; /* by a zero byte. */
4852 length++;
4853 ptr++;
4854 }
4855 }
4856 #endif
4857
4858 continue;
4859 }
4860 }
4861
4862 length += 2 + LINK_SIZE; /* For final KET and END */
4863
4864 if ((options & PCRE_AUTO_CALLOUT) != 0)
4865 length += 2 + 2*LINK_SIZE; /* For final callout */
4866
4867 if (length > MAX_PATTERN_SIZE)
4868 {
4869 errorcode = ERR20;
4870 goto PCRE_EARLY_ERROR_RETURN;
4871 }
4872
4873 /* Compute the size of data block needed and get it, either from malloc or
4874 externally provided function. */
4875
4876 size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
4877 re = (real_pcre *)(pcre_malloc)(size);
4878
4879 if (re == NULL)
4880 {
4881 errorcode = ERR21;
4882 goto PCRE_EARLY_ERROR_RETURN;
4883 }
4884
4885 /* Put in the magic number, and save the sizes, options, and character table
4886 pointer. NULL is used for the default character tables. The nullpad field is at
4887 the end; it's there to help in the case when a regex compiled on a system with
4888 4-byte pointers is run on another with 8-byte pointers. */
4889
4890 re->magic_number = MAGIC_NUMBER;
4891 re->size = size;
4892 re->options = options;
4893 re->dummy1 = 0;
4894 re->name_table_offset = sizeof(real_pcre);
4895 re->name_entry_size = max_name_size + 3;
4896 re->name_count = name_count;
4897 re->ref_count = 0;
4898 re->tables = (tables == _pcre_default_tables)? NULL : tables;
4899 re->nullpad = NULL;
4900
4901 /* The starting points of the name/number translation table and of the code are
4902 passed around in the compile data block. */
4903
4904 compile_block.names_found = 0;
4905 compile_block.name_entry_size = max_name_size + 3;
4906 compile_block.name_table = (uschar *)re + re->name_table_offset;
4907 codestart = compile_block.name_table + re->name_entry_size * re->name_count;
4908 compile_block.start_code = codestart;
4909 compile_block.start_pattern = (const uschar *)pattern;
4910 compile_block.req_varyopt = 0;
4911 compile_block.nopartial = FALSE;
4912
4913 /* Set up a starting, non-extracting bracket, then compile the expression. On
4914 error, errorcode will be set non-zero, so we don't need to look at the result
4915 of the function here. */
4916
4917 ptr = (const uschar *)pattern;
4918 code = (uschar *)codestart;
4919 *code = OP_BRA;
4920 bracount = 0;
4921 (void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,
4922 &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, &compile_block);
4923 re->top_bracket = bracount;
4924 re->top_backref = compile_block.top_backref;
4925
4926 if (compile_block.nopartial) re->options |= PCRE_NOPARTIAL;
4927
4928 /* If not reached end of pattern on success, there's an excess bracket. */
4929
4930 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
4931
4932 /* Fill in the terminating state and check for disastrous overflow, but
4933 if debugging, leave the test till after things are printed out. */
4934
4935 *code++ = OP_END;
4936
4937 #ifndef DEBUG
4938 if (code - codestart > length) errorcode = ERR23;
4939 #endif
4940
4941 /* Give an error if there's a back reference to a non-existent capturing
4942 subpattern. */
4943
4944 if (re->top_backref > re->top_bracket) errorcode = ERR15;
4945
4946 /* Failed to compile, or error while post-processing */
4947
4948 if (errorcode != 0)
4949 {
4950 (pcre_free)(re);
4951 PCRE_ERROR_RETURN:
4952 *erroroffset = ptr - (const uschar *)pattern;
4953 PCRE_EARLY_ERROR_RETURN:
4954 *errorptr = error_texts[errorcode];
4955 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
4956 return NULL;
4957 }
4958
4959 /* If the anchored option was not passed, set the flag if we can determine that
4960 the pattern is anchored by virtue of ^ characters or \A or anything else (such
4961 as starting with .* when DOTALL is set).
4962
4963 Otherwise, if we know what the first character has to be, save it, because that
4964 speeds up unanchored matches no end. If not, see if we can set the
4965 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
4966 start with ^, and also when all branches start with .* for non-DOTALL matches.
4967 */
4968
4969 if ((options & PCRE_ANCHORED) == 0)
4970 {
4971 int temp_options = options;
4972 if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map))
4973 re->options |= PCRE_ANCHORED;
4974 else
4975 {
4976 if (firstbyte < 0)
4977 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
4978 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
4979 {
4980 int ch = firstbyte & 255;
4981 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
4982 compile_block.fcc[ch] == ch)? ch : firstbyte;
4983 re->options |= PCRE_FIRSTSET;
4984 }
4985 else if (is_startline(codestart, 0, compile_block.backref_map))
4986 re->options |= PCRE_STARTLINE;
4987 }
4988 }
4989
4990 /* For an anchored pattern, we use the "required byte" only if it follows a
4991 variable length item in the regex. Remove the caseless flag for non-caseable
4992 bytes. */
4993
4994 if (reqbyte >= 0 &&
4995 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
4996 {
4997 int ch = reqbyte & 255;
4998 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
4999 compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5000 re->options |= PCRE_REQCHSET;
5001 }
5002
5003 /* Print out the compiled data for debugging */
5004
5005 #ifdef DEBUG
5006
5007 printf("Length = %d top_bracket = %d top_backref = %d\n",
5008 length, re->top_bracket, re->top_backref);
5009
5010 if (re->options != 0)
5011 {
5012 printf("%s%s%s%s%s%s%s%s%s%s\n",
5013 ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5014 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5015 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5016 ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
5017 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5018 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5019 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5020 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5021 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5022 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5023 }
5024
5025 if ((re->options & PCRE_FIRSTSET) != 0)
5026 {
5027 int ch = re->first_byte & 255;
5028 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
5029 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5030 else printf("First char = \\x%02x%s\n", ch, caseless);
5031 }
5032
5033 if ((re->options & PCRE_REQCHSET) != 0)
5034 {
5035 int ch = re->req_byte & 255;
5036 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
5037 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5038 else printf("Req char = \\x%02x%s\n", ch, caseless);
5039 }
5040
5041 _pcre_printint(re, stdout);
5042
5043 /* This check is done here in the debugging case so that the code that
5044 was compiled can be seen. */
5045
5046 if (code - codestart > length)
5047 {
5048 (pcre_free)(re);
5049 *errorptr = error_texts[ERR23];
5050 *erroroffset = ptr - (uschar *)pattern;
5051 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
5052 return NULL;
5053 }
5054 #endif
5055
5056 return (pcre *)re;
5057 }
5058
5059 /* End of pcre_compile.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12