/[pcre]/code/tags/pcre-6.4/pcre_compile.c
ViewVC logotype

Contents of /code/tags/pcre-6.4/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 86 - (show annotations) (download)
Sat Feb 24 21:41:15 2007 UTC (7 years, 1 month ago) by nigel
File MIME type: text/plain
File size: 160094 byte(s)
Tag code/trunk as code/tags/pcre-6.4.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2005 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #include "pcre_internal.h"
46
47
48 /* When DEBUG is defined, we need the pcre_printint() function, which is also
49 used by pcretest. DEBUG is not defined when building a production library. */
50
51 #ifdef DEBUG
52 #include "pcre_printint.src"
53 #endif
54
55
56
57 /*************************************************
58 * Code parameters and static tables *
59 *************************************************/
60
/* Maximum number of items on the nested bracket stacks at compile time. This
limit applies to the nesting of all kinds of parentheses; it does not limit
un-nested, non-capturing parentheses. The value can be increased if necessary:
it is only used to dimension one int vector and one unsigned char vector at
compile time. */

#define BRASTACK_SIZE 200
68
69
70 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
71 are simple data values; negative values are for special things like \d and so
72 on. Zero means further processing is needed (for things like \x), or the escape
73 is invalid. */
74
75 #if !EBCDIC /* This is the "normal" table for ASCII systems */
76 static const short int escapes[] = {
77 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
78 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
79 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
80 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
81 -ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
82 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
83 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
84 0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
85 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
86 0, 0, -ESC_z /* x - z */
87 };
88
89 #else /* This is the "abnormal" table for EBCDIC systems */
90 static const short int escapes[] = {
91 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
92 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
93 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
94 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
95 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
96 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
97 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
98 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
99 /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
100 /* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p,
101 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
102 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
103 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
104 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
105 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
106 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
107 /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
108 /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
109 /* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0,
110 /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
111 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
112 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
113 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
114 };
115 #endif
116
117
/* Tables of names of POSIX character classes and their lengths. The list is
terminated by a zero length entry. The first three must be alpha, lower, upper,
as this is assumed for handling case independence. */
121
/* POSIX class names, one per slot. The slot number is the class index; alpha,
lower and upper occupy slots 0 to 2. */

static const char *const posix_names[] = {
  "alpha",    /*  0 */
  "lower",    /*  1 */
  "upper",    /*  2 */
  "alnum",    /*  3 */
  "ascii",    /*  4 */
  "blank",    /*  5 */
  "cntrl",    /*  6 */
  "digit",    /*  7 */
  "graph",    /*  8 */
  "print",    /*  9 */
  "punct",    /* 10 */
  "space",    /* 11 */
  "word",     /* 12 */
  "xdigit"    /* 13 */
};
126
127 static const uschar posix_name_lengths[] = {
128 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
129
130 /* Table of class bit maps for each POSIX class; up to three may be combined
131 to form the class. The table for [:blank:] is dynamically modified to remove
132 the vertical space characters. */
133
134 static const int posix_class_maps[] = {
135 cbit_lower, cbit_upper, -1, /* alpha */
136 cbit_lower, -1, -1, /* lower */
137 cbit_upper, -1, -1, /* upper */
138 cbit_digit, cbit_lower, cbit_upper, /* alnum */
139 cbit_print, cbit_cntrl, -1, /* ascii */
140 cbit_space, -1, -1, /* blank - a GNU extension */
141 cbit_cntrl, -1, -1, /* cntrl */
142 cbit_digit, -1, -1, /* digit */
143 cbit_graph, -1, -1, /* graph */
144 cbit_print, -1, -1, /* print */
145 cbit_punct, -1, -1, /* punct */
146 cbit_space, -1, -1, /* space */
147 cbit_word, -1, -1, /* word - a Perl extension */
148 cbit_xdigit,-1, -1 /* xdigit */
149 };
150
151
/* The texts of compile-time error messages, indexed by error number. The
strings are "char *" because they are passed to the outside world. The table
itself is never modified, so the pointers are const as well as the text. */

static const char *const error_texts[] = {
  "no error",
  "\\ at end of pattern",
  "\\c at end of pattern",
  "unrecognized character follows \\",
  "numbers out of order in {} quantifier",
  /* 5 */
  "number too big in {} quantifier",
  "missing terminating ] for character class",
  "invalid escape sequence in character class",
  "range out of order in character class",
  "nothing to repeat",
  /* 10 */
  "operand of unlimited repeat could match the empty string",
  "internal error: unexpected repeat",
  "unrecognized character after (?",
  "POSIX named classes are supported only within a class",
  "missing )",
  /* 15 */
  "reference to non-existent subpattern",
  "erroffset passed as NULL",
  "unknown option bit(s) set",
  "missing ) after comment",
  "parentheses nested too deeply",
  /* 20 */
  "regular expression too large",
  "failed to get memory",
  "unmatched parentheses",
  "internal error: code overflow",
  "unrecognized character after (?<",
  /* 25 */
  "lookbehind assertion is not fixed length",
  "malformed number after (?(",
  "conditional group contains more than two branches",
  "assertion expected after (?(",
  "(?R or (?digits must be followed by )",
  /* 30 */
  "unknown POSIX class name",
  "POSIX collating elements are not supported",
  "this version of PCRE is not compiled with PCRE_UTF8 support",
  "spare error",
  "character value in \\x{...} sequence is too large",
  /* 35 */
  "invalid condition (?(0)",
  "\\C not allowed in lookbehind assertion",
  "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
  "number after (?C is > 255",
  "closing ) for (?C expected",
  /* 40 */
  "recursive call could loop indefinitely",
  "unrecognized character after (?P",
  "syntax error after (?P",
  "two named groups have the same name",
  "invalid UTF-8 string",
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled",
  "malformed \\P or \\p sequence",
  "unknown property name after \\P or \\p"
};
214
215
/* Table to identify digits and hex digits. This is used when compiling
patterns. Note that the tables in chartables are dependent on the locale, and
may mark arbitrary characters as digits - but the PCRE compiling code expects
to handle only 0-9, a-f, and A-F as digits when compiling. That is why we have
a private table here. It costs 256 bytes, but it is a lot faster than doing
character value tests (at least in some simple cases I timed), and in some
applications one wants PCRE to compile efficiently as well as match
efficiently.

For convenience, we use the same bit definitions as in chartables:

  0x04   decimal digit
  0x08   hexadecimal digit

Then we can use ctype_digit and ctype_xdigit in the code. */
231
#if !EBCDIC    /* ASCII systems */

/* One entry per byte value: 0x0c for '0'-'9' (decimal and hex digit), 0x08
for 'a'-'f' and 'A'-'F' (hex digit only), zero for everything else. */

static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x00 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x08 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x18 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x20  space .. ' */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x28  ( .. / */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,   /* 0x30  0 .. 7 */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x38  8 .. ? */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /* 0x40  @ .. G */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x48  H .. O */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x50  P .. W */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x58  X .. _ */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /* 0x60  ` .. g */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x68  h .. o */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x70  p .. w */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x78  x .. 127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x80 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x88 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x90 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x98 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xA0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xA8 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xB0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xB8 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xC0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xC8 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xD0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xD8 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xE0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xE8 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xF0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};  /* 0xF8 */

#else          /* EBCDIC systems */

/* Same bit meanings as the ASCII table, laid out for EBCDIC code points. The
non-zero rows are 0x80 and 0xC0 (hex letters) and 0xF0/0xF8 (decimal digits). */

static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x00 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x08 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x18 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x28 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x30 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x38 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x40 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x48 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x50 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x58 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x60 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x68 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x70 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x78 */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /* 0x80  .. g */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x88 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x90 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x98 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xA0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xA8 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xB0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xB8 */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /* 0xC0  { .. G */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xC8 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xD0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xD8 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xE0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xE8 */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,   /* 0xF0  0 .. 7 */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};  /* 0xF8  8 .. 255 */

/* Partial duplicate of the character table, for EBCDIC code points. */

static const unsigned char ebcdic_chartab[] = {
  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00,   /* 0x00 */
  0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00,   /* 0x08 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,   /* 0x10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x18 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,   /* 0x20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x28 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x30 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x38 */
  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x40 */
  0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80,   /* 0x48 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x50 */
  0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00,   /* 0x58 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x60 */
  0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80,   /* 0x68 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x70 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x78 */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,   /* 0x80 */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x88 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,   /* 0x90 */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x98 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,   /* 0xA0 */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xA8 */
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xB0 */
  0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00,   /* 0xB8 */
  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,   /* 0xC0 */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xC8 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,   /* 0xD0 */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xD8 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,   /* 0xE0 */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xE8 */
  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,   /* 0xF0 */
  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};  /* 0xF8 */
#endif
338
339
/* Forward declaration of compile_regex(), to allow mutual recursion: it lets
functions that appear in this file before the definition of compile_regex()
call it. */

static BOOL
  compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,
    int *, int *, branch_chain *, compile_data *);
345
346
347
348 /*************************************************
349 * Handle escapes *
350 *************************************************/
351
352 /* This function is called when a \ has been encountered. It either returns a
353 positive value for a simple escape such as \n, or a negative value which
354 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
355 a positive value greater than 255 may be returned. On entry, ptr is pointing at
356 the \. On exit, it is on the final character of the escape sequence.
357
358 Arguments:
359 ptrptr points to the pattern position pointer
360 errorcodeptr points to the errorcode variable
361 bracount number of previous extracting brackets
362 options the options bits
363 isclass TRUE if inside a character class
364
365 Returns: zero or positive => a data character
366 negative => a special escape sequence
367 on error, errorptr is set
368 */
369
370 static int
371 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
372 int options, BOOL isclass)
373 {
374 const uschar *ptr = *ptrptr;
375 int c, i;
376
377 /* If backslash is at the end of the pattern, it's an error. */
378
379 c = *(++ptr);
380 if (c == 0) *errorcodeptr = ERR1;
381
382 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
383 a table. A non-zero result is something that can be returned immediately.
384 Otherwise further processing may be required. */
385
386 #if !EBCDIC /* ASCII coding */
387 else if (c < '0' || c > 'z') {} /* Not alphameric */
388 else if ((i = escapes[c - '0']) != 0) c = i;
389
390 #else /* EBCDIC coding */
391 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
392 else if ((i = escapes[c - 0x48]) != 0) c = i;
393 #endif
394
395 /* Escapes that need further processing, or are illegal. */
396
397 else
398 {
399 const uschar *oldptr;
400 switch (c)
401 {
402 /* A number of Perl escapes are not handled by PCRE. We give an explicit
403 error. */
404
405 case 'l':
406 case 'L':
407 case 'N':
408 case 'u':
409 case 'U':
410 *errorcodeptr = ERR37;
411 break;
412
413 /* The handling of escape sequences consisting of a string of digits
414 starting with one that is not zero is not straightforward. By experiment,
415 the way Perl works seems to be as follows:
416
417 Outside a character class, the digits are read as a decimal number. If the
418 number is less than 10, or if there are that many previous extracting
419 left brackets, then it is a back reference. Otherwise, up to three octal
420 digits are read to form an escaped byte. Thus \123 is likely to be octal
421 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
422 value is greater than 377, the least significant 8 bits are taken. Inside a
423 character class, \ followed by a digit is always an octal number. */
424
425 case '1': case '2': case '3': case '4': case '5':
426 case '6': case '7': case '8': case '9':
427
428 if (!isclass)
429 {
430 oldptr = ptr;
431 c -= '0';
432 while ((digitab[ptr[1]] & ctype_digit) != 0)
433 c = c * 10 + *(++ptr) - '0';
434 if (c < 10 || c <= bracount)
435 {
436 c = -(ESC_REF + c);
437 break;
438 }
439 ptr = oldptr; /* Put the pointer back and fall through */
440 }
441
442 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
443 generates a binary zero byte and treats the digit as a following literal.
444 Thus we have to pull back the pointer by one. */
445
446 if ((c = *ptr) >= '8')
447 {
448 ptr--;
449 c = 0;
450 break;
451 }
452
453 /* \0 always starts an octal number, but we may drop through to here with a
454 larger first octal digit. */
455
456 case '0':
457 c -= '0';
458 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
459 c = c * 8 + *(++ptr) - '0';
460 c &= 255; /* Take least significant 8 bits */
461 break;
462
463 /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
464 which can be greater than 0xff, but only if the ddd are hex digits. */
465
466 case 'x':
467 #ifdef SUPPORT_UTF8
468 if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
469 {
470 const uschar *pt = ptr + 2;
471 register int count = 0;
472 c = 0;
473 while ((digitab[*pt] & ctype_xdigit) != 0)
474 {
475 int cc = *pt++;
476 count++;
477 #if !EBCDIC /* ASCII coding */
478 if (cc >= 'a') cc -= 32; /* Convert to upper case */
479 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
480 #else /* EBCDIC coding */
481 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
482 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
483 #endif
484 }
485 if (*pt == '}')
486 {
487 if (c < 0 || count > 8) *errorcodeptr = ERR34;
488 ptr = pt;
489 break;
490 }
491 /* If the sequence of hex digits does not end with '}', then we don't
492 recognize this construct; fall through to the normal \x handling. */
493 }
494 #endif
495
496 /* Read just a single hex char */
497
498 c = 0;
499 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
500 {
501 int cc; /* Some compilers don't like ++ */
502 cc = *(++ptr); /* in initializers */
503 #if !EBCDIC /* ASCII coding */
504 if (cc >= 'a') cc -= 32; /* Convert to upper case */
505 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
506 #else /* EBCDIC coding */
507 if (cc <= 'z') cc += 64; /* Convert to upper case */
508 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
509 #endif
510 }
511 break;
512
513 /* Other special escapes not starting with a digit are straightforward */
514
515 case 'c':
516 c = *(++ptr);
517 if (c == 0)
518 {
519 *errorcodeptr = ERR2;
520 return 0;
521 }
522
523 /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
524 is ASCII-specific, but then the whole concept of \cx is ASCII-specific.
525 (However, an EBCDIC equivalent has now been added.) */
526
527 #if !EBCDIC /* ASCII coding */
528 if (c >= 'a' && c <= 'z') c -= 32;
529 c ^= 0x40;
530 #else /* EBCDIC coding */
531 if (c >= 'a' && c <= 'z') c += 64;
532 c ^= 0xC0;
533 #endif
534 break;
535
536 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
537 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
538 for Perl compatibility, it is a literal. This code looks a bit odd, but
539 there used to be some cases other than the default, and there may be again
540 in future, so I haven't "optimized" it. */
541
542 default:
543 if ((options & PCRE_EXTRA) != 0) switch(c)
544 {
545 default:
546 *errorcodeptr = ERR3;
547 break;
548 }
549 break;
550 }
551 }
552
553 *ptrptr = ptr;
554 return c;
555 }
556
557
558
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* This function is called after \P or \p has been encountered, provided that
PCRE is compiled with support for Unicode properties. On entry, ptrptr is
pointing at the P or p. On exit, it is pointing at the final character of the
escape sequence.

Argument:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE;
                   it is always written, including on the error returns
  errorcodeptr   points to the error code variable

Returns:         the value field of the matching _pcre_utt entry, or -1 for
                   an invalid type, in which case *errorcodeptr is set to
                   ERR46 (malformed sequence) or ERR47 (unknown name)
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *errorcodeptr)
{
int c, i, bot, top;
const uschar *ptr = *ptrptr;
char name[4];

/* Give the negation flag a defined value before anything can fail, so that
the caller never sees an unset flag. */

*negptr = FALSE;

c = *(++ptr);
if (c == 0) goto ERROR_RETURN;

/* \P or \p can be followed by a one- or two-character name in {}, optionally
preceded by ^ for negation. */

if (c == '{')
  {
  if (ptr[1] == '^')
    {
    *negptr = TRUE;
    ptr++;
    }

  /* Collect up to two name characters; the third iteration exists only to
  find the closing brace. */

  for (i = 0; i <= 2; i++)
    {
    c = *(++ptr);
    if (c == 0) goto ERROR_RETURN;
    if (c == '}') break;
    name[i] = c;
    }
  if (c !='}')   /* Try to distinguish error cases */
    {
    while (*(++ptr) != 0 && *ptr != '}');
    if (*ptr == '}') goto UNKNOWN_RETURN; else goto ERROR_RETURN;
    }
  name[i] = 0;
  }

/* Otherwise there is just one following character */

else
  {
  name[0] = c;
  name[1] = 0;
  }

*ptrptr = ptr;

/* Search for a recognized property name using binary chop */

bot = 0;
top = _pcre_utt_size;

while (bot < top)
  {
  i = (bot + top)/2;
  c = strcmp(name, _pcre_utt[i].name);
  if (c == 0) return _pcre_utt[i].value;
  if (c > 0) bot = i + 1; else top = i;
  }

UNKNOWN_RETURN:
*errorcodeptr = ERR47;
*ptrptr = ptr;
return -1;

ERROR_RETURN:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
648
649
650
651
652 /*************************************************
653 * Check for counted repeat *
654 *************************************************/
655
656 /* This function is called when a '{' is encountered in a place where it might
657 start a quantifier. It looks ahead to see if it really is a quantifier or not.
658 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
659 where the ddds are digits.
660
661 Arguments:
662 p pointer to the first char after '{'
663
664 Returns: TRUE or FALSE
665 */
666
667 static BOOL
668 is_counted_repeat(const uschar *p)
669 {
670 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
671 while ((digitab[*p] & ctype_digit) != 0) p++;
672 if (*p == '}') return TRUE;
673
674 if (*p++ != ',') return FALSE;
675 if (*p == '}') return TRUE;
676
677 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
678 while ((digitab[*p] & ctype_digit) != 0) p++;
679
680 return (*p == '}');
681 }
682
683
684
685 /*************************************************
686 * Read repeat counts *
687 *************************************************/
688
689 /* Read an item of the form {n,m} and return the values. This is called only
690 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
691 so the syntax is guaranteed to be correct, but we need to check the values.
692
693 Arguments:
694 p pointer to first char after '{'
695 minp pointer to int for min
696 maxp pointer to int for max
697 returned as -1 if no max
698 errorcodeptr points to error code variable
699
700 Returns: pointer to '}' on success;
701 current ptr on error, with errorcodeptr set non-zero
702 */
703
704 static const uschar *
705 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
706 {
707 int min = 0;
708 int max = -1;
709
710 /* Read the minimum value and do a paranoid check: a negative value indicates
711 an integer overflow. */
712
713 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
714 if (min < 0 || min > 65535)
715 {
716 *errorcodeptr = ERR5;
717 return p;
718 }
719
720 /* Read the maximum value if there is one, and again do a paranoid on its size.
721 Also, max must not be less than min. */
722
723 if (*p == '}') max = min; else
724 {
725 if (*(++p) != '}')
726 {
727 max = 0;
728 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
729 if (max < 0 || max > 65535)
730 {
731 *errorcodeptr = ERR5;
732 return p;
733 }
734 if (max < min)
735 {
736 *errorcodeptr = ERR4;
737 return p;
738 }
739 }
740 }
741
742 /* Fill in the required variables, and pass back the pointer to the terminating
743 '}'. */
744
745 *minp = min;
746 *maxp = max;
747 return p;
748 }
749
750
751
752 /*************************************************
753 * Find first significant op code *
754 *************************************************/
755
756 /* This is called by several functions that scan a compiled expression looking
757 for a fixed first character, or an anchoring op code etc. It skips over things
758 that do not influence this. For some calls, a change of option is important.
759 For some calls, it makes sense to skip negative forward and all backward
760 assertions, and also the \b assertion; for others it does not.
761
762 Arguments:
763 code pointer to the start of the group
764 options pointer to external options
765 optbit the option bit whose changing is significant, or
766 zero if none are
767 skipassert TRUE if certain assertions are to be skipped
768
769 Returns: pointer to the first significant opcode
770 */
771
772 static const uschar*
773 first_significant_code(const uschar *code, int *options, int optbit,
774 BOOL skipassert)
775 {
776 for (;;)
777 {
778 switch ((int)*code)
779 {
780 case OP_OPT:
781 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
782 *options = (int)code[1];
783 code += 2;
784 break;
785
786 case OP_ASSERT_NOT:
787 case OP_ASSERTBACK:
788 case OP_ASSERTBACK_NOT:
789 if (!skipassert) return code;
790 do code += GET(code, 1); while (*code == OP_ALT);
791 code += _pcre_OP_lengths[*code];
792 break;
793
794 case OP_WORD_BOUNDARY:
795 case OP_NOT_WORD_BOUNDARY:
796 if (!skipassert) return code;
797 /* Fall through */
798
799 case OP_CALLOUT:
800 case OP_CREF:
801 case OP_BRANUMBER:
802 code += _pcre_OP_lengths[*code];
803 break;
804
805 default:
806 return code;
807 }
808 }
809 /* Control never reaches here */
810 }
811
812
813
814
815 /*************************************************
816 * Find the fixed length of a pattern *
817 *************************************************/
818
819 /* Scan a pattern and compute the fixed length of subject that will match it,
820 if the length is fixed. This is needed for dealing with backward assertions.
821 In UTF8 mode, the result is in characters rather than bytes.
822
823 Arguments:
824 code points to the start of the pattern (the bracket)
825 options the compiling options
826
827 Returns: the fixed length, or -1 if there is no fixed length,
828 or -2 if \C was encountered
829 */
830
831 static int
832 find_fixedlength(uschar *code, int options)
833 {
834 int length = -1;
835
836 register int branchlength = 0;
837 register uschar *cc = code + 1 + LINK_SIZE;
838
839 /* Scan along the opcodes for this branch. If we get to the end of the
840 branch, check the length against that of the other branches. */
841
842 for (;;)
843 {
844 int d;
845 register int op = *cc;
846 if (op >= OP_BRA) op = OP_BRA;
847
848 switch (op)
849 {
850 case OP_BRA:
851 case OP_ONCE:
852 case OP_COND:
853 d = find_fixedlength(cc, options);
854 if (d < 0) return d;
855 branchlength += d;
856 do cc += GET(cc, 1); while (*cc == OP_ALT);
857 cc += 1 + LINK_SIZE;
858 break;
859
860 /* Reached end of a branch; if it's a ket it is the end of a nested
861 call. If it's ALT it is an alternation in a nested call. If it is
862 END it's the end of the outer call. All can be handled by the same code. */
863
864 case OP_ALT:
865 case OP_KET:
866 case OP_KETRMAX:
867 case OP_KETRMIN:
868 case OP_END:
869 if (length < 0) length = branchlength;
870 else if (length != branchlength) return -1;
871 if (*cc != OP_ALT) return length;
872 cc += 1 + LINK_SIZE;
873 branchlength = 0;
874 break;
875
876 /* Skip over assertive subpatterns */
877
878 case OP_ASSERT:
879 case OP_ASSERT_NOT:
880 case OP_ASSERTBACK:
881 case OP_ASSERTBACK_NOT:
882 do cc += GET(cc, 1); while (*cc == OP_ALT);
883 /* Fall through */
884
885 /* Skip over things that don't match chars */
886
887 case OP_REVERSE:
888 case OP_BRANUMBER:
889 case OP_CREF:
890 case OP_OPT:
891 case OP_CALLOUT:
892 case OP_SOD:
893 case OP_SOM:
894 case OP_EOD:
895 case OP_EODN:
896 case OP_CIRC:
897 case OP_DOLL:
898 case OP_NOT_WORD_BOUNDARY:
899 case OP_WORD_BOUNDARY:
900 cc += _pcre_OP_lengths[*cc];
901 break;
902
903 /* Handle literal characters */
904
905 case OP_CHAR:
906 case OP_CHARNC:
907 branchlength++;
908 cc += 2;
909 #ifdef SUPPORT_UTF8
910 if ((options & PCRE_UTF8) != 0)
911 {
912 while ((*cc & 0xc0) == 0x80) cc++;
913 }
914 #endif
915 break;
916
917 /* Handle exact repetitions. The count is already in characters, but we
918 need to skip over a multibyte character in UTF8 mode. */
919
920 case OP_EXACT:
921 branchlength += GET2(cc,1);
922 cc += 4;
923 #ifdef SUPPORT_UTF8
924 if ((options & PCRE_UTF8) != 0)
925 {
926 while((*cc & 0x80) == 0x80) cc++;
927 }
928 #endif
929 break;
930
931 case OP_TYPEEXACT:
932 branchlength += GET2(cc,1);
933 cc += 4;
934 break;
935
936 /* Handle single-char matchers */
937
938 case OP_PROP:
939 case OP_NOTPROP:
940 cc++;
941 /* Fall through */
942
943 case OP_NOT_DIGIT:
944 case OP_DIGIT:
945 case OP_NOT_WHITESPACE:
946 case OP_WHITESPACE:
947 case OP_NOT_WORDCHAR:
948 case OP_WORDCHAR:
949 case OP_ANY:
950 branchlength++;
951 cc++;
952 break;
953
954 /* The single-byte matcher isn't allowed */
955
956 case OP_ANYBYTE:
957 return -2;
958
959 /* Check a class for variable quantification */
960
961 #ifdef SUPPORT_UTF8
962 case OP_XCLASS:
963 cc += GET(cc, 1) - 33;
964 /* Fall through */
965 #endif
966
967 case OP_CLASS:
968 case OP_NCLASS:
969 cc += 33;
970
971 switch (*cc)
972 {
973 case OP_CRSTAR:
974 case OP_CRMINSTAR:
975 case OP_CRQUERY:
976 case OP_CRMINQUERY:
977 return -1;
978
979 case OP_CRRANGE:
980 case OP_CRMINRANGE:
981 if (GET2(cc,1) != GET2(cc,3)) return -1;
982 branchlength += GET2(cc,1);
983 cc += 5;
984 break;
985
986 default:
987 branchlength++;
988 }
989 break;
990
991 /* Anything else is variable length */
992
993 default:
994 return -1;
995 }
996 }
997 /* Control never gets here */
998 }
999
1000
1001
1002
1003 /*************************************************
1004 * Scan compiled regex for numbered bracket *
1005 *************************************************/
1006
1007 /* This little function scans through a compiled pattern until it finds a
1008 capturing bracket with the given number.
1009
1010 Arguments:
1011 code points to start of expression
1012 utf8 TRUE in UTF-8 mode
1013 number the required bracket number
1014
1015 Returns: pointer to the opcode for the bracket, or NULL if not found
1016 */
1017
1018 static const uschar *
1019 find_bracket(const uschar *code, BOOL utf8, int number)
1020 {
1021 #ifndef SUPPORT_UTF8
1022 utf8 = utf8; /* Stop pedantic compilers complaining */
1023 #endif
1024
1025 for (;;)
1026 {
1027 register int c = *code;
1028 if (c == OP_END) return NULL;
1029 else if (c > OP_BRA)
1030 {
1031 int n = c - OP_BRA;
1032 if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1033 if (n == number) return (uschar *)code;
1034 code += _pcre_OP_lengths[OP_BRA];
1035 }
1036 else
1037 {
1038 code += _pcre_OP_lengths[c];
1039
1040 #ifdef SUPPORT_UTF8
1041
1042 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1043 by a multi-byte character. The length in the table is a minimum, so we have
1044 to scan along to skip the extra bytes. All opcodes are less than 128, so we
1045 can use relatively efficient code. */
1046
1047 if (utf8) switch(c)
1048 {
1049 case OP_CHAR:
1050 case OP_CHARNC:
1051 case OP_EXACT:
1052 case OP_UPTO:
1053 case OP_MINUPTO:
1054 case OP_STAR:
1055 case OP_MINSTAR:
1056 case OP_PLUS:
1057 case OP_MINPLUS:
1058 case OP_QUERY:
1059 case OP_MINQUERY:
1060 while ((*code & 0xc0) == 0x80) code++;
1061 break;
1062
1063 /* XCLASS is used for classes that cannot be represented just by a bit
1064 map. This includes negated single high-valued characters. The length in
1065 the table is zero; the actual length is stored in the compiled code. */
1066
1067 case OP_XCLASS:
1068 code += GET(code, 1) + 1;
1069 break;
1070 }
1071 #endif
1072 }
1073 }
1074 }
1075
1076
1077
1078 /*************************************************
1079 * Scan compiled regex for recursion reference *
1080 *************************************************/
1081
1082 /* This little function scans through a compiled pattern until it finds an
1083 instance of OP_RECURSE.
1084
1085 Arguments:
1086 code points to start of expression
1087 utf8 TRUE in UTF-8 mode
1088
1089 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1090 */
1091
1092 static const uschar *
1093 find_recurse(const uschar *code, BOOL utf8)
1094 {
1095 #ifndef SUPPORT_UTF8
1096 utf8 = utf8; /* Stop pedantic compilers complaining */
1097 #endif
1098
1099 for (;;)
1100 {
1101 register int c = *code;
1102 if (c == OP_END) return NULL;
1103 else if (c == OP_RECURSE) return code;
1104 else if (c > OP_BRA)
1105 {
1106 code += _pcre_OP_lengths[OP_BRA];
1107 }
1108 else
1109 {
1110 code += _pcre_OP_lengths[c];
1111
1112 #ifdef SUPPORT_UTF8
1113
1114 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1115 by a multi-byte character. The length in the table is a minimum, so we have
1116 to scan along to skip the extra bytes. All opcodes are less than 128, so we
1117 can use relatively efficient code. */
1118
1119 if (utf8) switch(c)
1120 {
1121 case OP_CHAR:
1122 case OP_CHARNC:
1123 case OP_EXACT:
1124 case OP_UPTO:
1125 case OP_MINUPTO:
1126 case OP_STAR:
1127 case OP_MINSTAR:
1128 case OP_PLUS:
1129 case OP_MINPLUS:
1130 case OP_QUERY:
1131 case OP_MINQUERY:
1132 while ((*code & 0xc0) == 0x80) code++;
1133 break;
1134
1135 /* XCLASS is used for classes that cannot be represented just by a bit
1136 map. This includes negated single high-valued characters. The length in
1137 the table is zero; the actual length is stored in the compiled code. */
1138
1139 case OP_XCLASS:
1140 code += GET(code, 1) + 1;
1141 break;
1142 }
1143 #endif
1144 }
1145 }
1146 }
1147
1148
1149
1150 /*************************************************
1151 * Scan compiled branch for non-emptiness *
1152 *************************************************/
1153
1154 /* This function scans through a branch of a compiled pattern to see whether it
1155 can match the empty string or not. It is called only from could_be_empty()
1156 below. Note that first_significant_code() skips over assertions. If we hit an
1157 unclosed bracket, we return "empty" - this means we've struck an inner bracket
1158 whose current branch will already have been scanned.
1159
1160 Arguments:
1161 code points to start of search
1162 endcode points to where to stop
1163 utf8 TRUE if in UTF8 mode
1164
1165 Returns: TRUE if what is matched could be empty
1166 */
1167
1168 static BOOL
1169 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1170 {
1171 register int c;
1172 for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);
1173 code < endcode;
1174 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1175 {
1176 const uschar *ccode;
1177
1178 c = *code;
1179
1180 if (c >= OP_BRA)
1181 {
1182 BOOL empty_branch;
1183 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1184
1185 /* Scan a closed bracket */
1186
1187 empty_branch = FALSE;
1188 do
1189 {
1190 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1191 empty_branch = TRUE;
1192 code += GET(code, 1);
1193 }
1194 while (*code == OP_ALT);
1195 if (!empty_branch) return FALSE; /* All branches are non-empty */
1196 code += 1 + LINK_SIZE;
1197 c = *code;
1198 }
1199
1200 else switch (c)
1201 {
1202 /* Check for quantifiers after a class */
1203
1204 #ifdef SUPPORT_UTF8
1205 case OP_XCLASS:
1206 ccode = code + GET(code, 1);
1207 goto CHECK_CLASS_REPEAT;
1208 #endif
1209
1210 case OP_CLASS:
1211 case OP_NCLASS:
1212 ccode = code + 33;
1213
1214 #ifdef SUPPORT_UTF8
1215 CHECK_CLASS_REPEAT:
1216 #endif
1217
1218 switch (*ccode)
1219 {
1220 case OP_CRSTAR: /* These could be empty; continue */
1221 case OP_CRMINSTAR:
1222 case OP_CRQUERY:
1223 case OP_CRMINQUERY:
1224 break;
1225
1226 default: /* Non-repeat => class must match */
1227 case OP_CRPLUS: /* These repeats aren't empty */
1228 case OP_CRMINPLUS:
1229 return FALSE;
1230
1231 case OP_CRRANGE:
1232 case OP_CRMINRANGE:
1233 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1234 break;
1235 }
1236 break;
1237
1238 /* Opcodes that must match a character */
1239
1240 case OP_PROP:
1241 case OP_NOTPROP:
1242 case OP_EXTUNI:
1243 case OP_NOT_DIGIT:
1244 case OP_DIGIT:
1245 case OP_NOT_WHITESPACE:
1246 case OP_WHITESPACE:
1247 case OP_NOT_WORDCHAR:
1248 case OP_WORDCHAR:
1249 case OP_ANY:
1250 case OP_ANYBYTE:
1251 case OP_CHAR:
1252 case OP_CHARNC:
1253 case OP_NOT:
1254 case OP_PLUS:
1255 case OP_MINPLUS:
1256 case OP_EXACT:
1257 case OP_NOTPLUS:
1258 case OP_NOTMINPLUS:
1259 case OP_NOTEXACT:
1260 case OP_TYPEPLUS:
1261 case OP_TYPEMINPLUS:
1262 case OP_TYPEEXACT:
1263 return FALSE;
1264
1265 /* End of branch */
1266
1267 case OP_KET:
1268 case OP_KETRMAX:
1269 case OP_KETRMIN:
1270 case OP_ALT:
1271 return TRUE;
1272
1273 /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be
1274 followed by a multibyte character */
1275
1276 #ifdef SUPPORT_UTF8
1277 case OP_STAR:
1278 case OP_MINSTAR:
1279 case OP_QUERY:
1280 case OP_MINQUERY:
1281 case OP_UPTO:
1282 case OP_MINUPTO:
1283 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1284 break;
1285 #endif
1286 }
1287 }
1288
1289 return TRUE;
1290 }
1291
1292
1293
1294 /*************************************************
1295 * Scan compiled regex for non-emptiness *
1296 *************************************************/
1297
1298 /* This function is called to check for left recursive calls. We want to check
1299 the current branch of the current pattern to see if it could match the empty
1300 string. If it could, we must look outwards for branches at other levels,
1301 stopping when we pass beyond the bracket which is the subject of the recursion.
1302
1303 Arguments:
1304 code points to start of the recursion
1305 endcode points to where to stop (current RECURSE item)
1306 bcptr points to the chain of current (unclosed) branch starts
1307 utf8 TRUE if in UTF-8 mode
1308
1309 Returns: TRUE if what is matched could be empty
1310 */
1311
1312 static BOOL
1313 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1314 BOOL utf8)
1315 {
1316 while (bcptr != NULL && bcptr->current >= code)
1317 {
1318 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1319 bcptr = bcptr->outer;
1320 }
1321 return TRUE;
1322 }
1323
1324
1325
1326 /*************************************************
1327 * Check for POSIX class syntax *
1328 *************************************************/
1329
1330 /* This function is called when the sequence "[:" or "[." or "[=" is
1331 encountered in a character class. It checks whether this is followed by an
1332 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1333 ".]" or "=]".
1334
1335 Argument:
1336 ptr pointer to the initial [
1337 endptr where to return the end pointer
1338 cd pointer to compile data
1339
1340 Returns: TRUE or FALSE
1341 */
1342
1343 static BOOL
1344 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1345 {
1346 int terminator; /* Don't combine these lines; the Solaris cc */
1347 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1348 if (*(++ptr) == '^') ptr++;
1349 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1350 if (*ptr == terminator && ptr[1] == ']')
1351 {
1352 *endptr = ptr;
1353 return TRUE;
1354 }
1355 return FALSE;
1356 }
1357
1358
1359
1360
1361 /*************************************************
1362 * Check POSIX class name *
1363 *************************************************/
1364
1365 /* This function is called to check the name given in a POSIX-style class entry
1366 such as [:alnum:].
1367
1368 Arguments:
1369 ptr points to the first letter
1370 len the length of the name
1371
1372 Returns: a value representing the name, or -1 if unknown
1373 */
1374
1375 static int
1376 check_posix_name(const uschar *ptr, int len)
1377 {
1378 register int yield = 0;
1379 while (posix_name_lengths[yield] != 0)
1380 {
1381 if (len == posix_name_lengths[yield] &&
1382 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1383 yield++;
1384 }
1385 return -1;
1386 }
1387
1388
1389 /*************************************************
1390 * Adjust OP_RECURSE items in repeated group *
1391 *************************************************/
1392
1393 /* OP_RECURSE items contain an offset from the start of the regex to the group
1394 that is referenced. This means that groups can be replicated for fixed
1395 repetition simply by copying (because the recursion is allowed to refer to
1396 earlier groups that are outside the current group). However, when a group is
1397 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1398 it, after it has been compiled. This means that any OP_RECURSE items within it
1399 that refer to the group itself or any contained groups have to have their
1400 offsets adjusted. That is the job of this function. Before it is called, the
1401 partially compiled regex must be temporarily terminated with OP_END.
1402
1403 Arguments:
1404 group points to the start of the group
1405 adjust the amount by which the group is to be moved
1406 utf8 TRUE in UTF-8 mode
1407 cd contains pointers to tables etc.
1408
1409 Returns: nothing
1410 */
1411
1412 static void
1413 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)
1414 {
1415 uschar *ptr = group;
1416 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1417 {
1418 int offset = GET(ptr, 1);
1419 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1420 ptr += 1 + LINK_SIZE;
1421 }
1422 }
1423
1424
1425
1426 /*************************************************
1427 * Insert an automatic callout point *
1428 *************************************************/
1429
1430 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1431 callout points before each pattern item.
1432
1433 Arguments:
1434 code current code pointer
1435 ptr current pattern pointer
1436 cd pointers to tables etc
1437
1438 Returns: new code pointer
1439 */
1440
1441 static uschar *
1442 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1443 {
1444 *code++ = OP_CALLOUT;
1445 *code++ = 255;
1446 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1447 PUT(code, LINK_SIZE, 0); /* Default length */
1448 return code + 2*LINK_SIZE;
1449 }
1450
1451
1452
1453 /*************************************************
1454 * Complete a callout item *
1455 *************************************************/
1456
1457 /* A callout item contains the length of the next item in the pattern, which
1458 we can't fill in till after we have reached the relevant point. This is used
1459 for both automatic and manual callouts.
1460
1461 Arguments:
1462 previous_callout points to previous callout item
1463 ptr current pattern pointer
1464 cd pointers to tables etc
1465
1466 Returns: nothing
1467 */
1468
1469 static void
1470 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1471 {
1472 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1473 PUT(previous_callout, 2 + LINK_SIZE, length);
1474 }
1475
1476
1477
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* Given the start and end of a class range, find the next run of characters
within it whose "other case" values are consecutive. The search begins at
*cptr and looks for the first character that _pcre_ucp_findchar() reports as
ucp_L with a non-zero other case; the run then continues for as long as each
following character is also ucp_L and its other case is exactly one more than
the previous one. Each call returns one such run and updates *cptr so that the
next call carries on from where this one stopped.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more (in which case
                none of the output values is written)
*/

static BOOL
get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)
{
int c = *cptr;
int chartype, othercase, expected;

/* Skip characters that have no other case */

while (c <= d &&
       (_pcre_ucp_findchar(c, &chartype, &othercase) != ucp_L ||
        othercase == 0))
  c++;

if (c > d) return FALSE;

/* Found the start of a run; extend it while the other case values stay
consecutive. */

*ocptr = othercase;
expected = othercase + 1;
c++;

while (c <= d &&
       _pcre_ucp_findchar(c, &chartype, &othercase) == ucp_L &&
       othercase == expected)
  {
  c++;
  expected++;
  }

*odptr = expected - 1;
*cptr = c;

return TRUE;
}
#endif  /* SUPPORT_UCP */
1527
1528
1529 /*************************************************
1530 * Compile one branch *
1531 *************************************************/
1532
1533 /* Scan the pattern, compiling it into the code vector. If the options are
1534 changed during the branch, the pointer is used to change the external options
1535 bits.
1536
1537 Arguments:
1538 optionsptr pointer to the option bits
1539 brackets points to number of extracting brackets used
1540 codeptr points to the pointer to the current code point
1541 ptrptr points to the current pattern pointer
1542 errorcodeptr points to error code variable
1543 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
1544 reqbyteptr set to the last literal character required, else < 0
1545 bcptr points to current branch chain
1546 cd contains pointers to tables etc.
1547
1548 Returns: TRUE on success
1549 FALSE, with *errorcodeptr set non-zero on error
1550 */
1551
1552 static BOOL
1553 compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
1554 const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,
1555 int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
1556 {
1557 int repeat_type, op_type;
1558 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
1559 int bravalue = 0;
1560 int greedy_default, greedy_non_default;
1561 int firstbyte, reqbyte;
1562 int zeroreqbyte, zerofirstbyte;
1563 int req_caseopt, reqvary, tempreqvary;
1564 int condcount = 0;
1565 int options = *optionsptr;
1566 int after_manual_callout = 0;
1567 register int c;
1568 register uschar *code = *codeptr;
1569 uschar *tempcode;
1570 BOOL inescq = FALSE;
1571 BOOL groupsetfirstbyte = FALSE;
1572 const uschar *ptr = *ptrptr;
1573 const uschar *tempptr;
1574 uschar *previous = NULL;
1575 uschar *previous_callout = NULL;
1576 uschar classbits[32];
1577
1578 #ifdef SUPPORT_UTF8
1579 BOOL class_utf8;
1580 BOOL utf8 = (options & PCRE_UTF8) != 0;
1581 uschar *class_utf8data;
1582 uschar utf8_char[6];
1583 #else
1584 BOOL utf8 = FALSE;
1585 #endif
1586
1587 /* Set up the default and non-default settings for greediness */
1588
1589 greedy_default = ((options & PCRE_UNGREEDY) != 0);
1590 greedy_non_default = greedy_default ^ 1;
1591
1592 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
1593 matching encountered yet". It gets changed to REQ_NONE if we hit something that
1594 matches a non-fixed char first char; reqbyte just remains unset if we never
1595 find one.
1596
1597 When we hit a repeat whose minimum is zero, we may have to adjust these values
1598 to take the zero repeat into account. This is implemented by setting them to
1599 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
1600 item types that can be repeated set these backoff variables appropriately. */
1601
1602 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
1603
1604 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
1605 according to the current setting of the caseless flag. REQ_CASELESS is a bit
1606 value > 255. It is added into the firstbyte or reqbyte variables to record the
1607 case status of the value. This is used only for ASCII characters. */
1608
1609 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
1610
1611 /* Switch on next character until the end of the branch */
1612
1613 for (;; ptr++)
1614 {
1615 BOOL negate_class;
1616 BOOL possessive_quantifier;
1617 BOOL is_quantifier;
1618 int class_charcount;
1619 int class_lastchar;
1620 int newoptions;
1621 int recno;
1622 int skipbytes;
1623 int subreqbyte;
1624 int subfirstbyte;
1625 int mclength;
1626 uschar mcbuffer[8];
1627
1628 /* Next byte in the pattern */
1629
1630 c = *ptr;
1631
1632 /* If in \Q...\E, check for the end; if not, we have a literal */
1633
1634 if (inescq && c != 0)
1635 {
1636 if (c == '\\' && ptr[1] == 'E')
1637 {
1638 inescq = FALSE;
1639 ptr++;
1640 continue;
1641 }
1642 else
1643 {
1644 if (previous_callout != NULL)
1645 {
1646 complete_callout(previous_callout, ptr, cd);
1647 previous_callout = NULL;
1648 }
1649 if ((options & PCRE_AUTO_CALLOUT) != 0)
1650 {
1651 previous_callout = code;
1652 code = auto_callout(code, ptr, cd);
1653 }
1654 goto NORMAL_CHAR;
1655 }
1656 }
1657
1658 /* Fill in length of a previous callout, except when the next thing is
1659 a quantifier. */
1660
1661 is_quantifier = c == '*' || c == '+' || c == '?' ||
1662 (c == '{' && is_counted_repeat(ptr+1));
1663
1664 if (!is_quantifier && previous_callout != NULL &&
1665 after_manual_callout-- <= 0)
1666 {
1667 complete_callout(previous_callout, ptr, cd);
1668 previous_callout = NULL;
1669 }
1670
1671 /* In extended mode, skip white space and comments */
1672
1673 if ((options & PCRE_EXTENDED) != 0)
1674 {
1675 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1676 if (c == '#')
1677 {
1678 /* The space before the ; is to avoid a warning on a silly compiler
1679 on the Macintosh. */
1680 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
1681 if (c != 0) continue; /* Else fall through to handle end of string */
1682 }
1683 }
1684
1685 /* No auto callout for quantifiers. */
1686
1687 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
1688 {
1689 previous_callout = code;
1690 code = auto_callout(code, ptr, cd);
1691 }
1692
1693 switch(c)
1694 {
1695 /* The branch terminates at end of string, |, or ). */
1696
1697 case 0:
1698 case '|':
1699 case ')':
1700 *firstbyteptr = firstbyte;
1701 *reqbyteptr = reqbyte;
1702 *codeptr = code;
1703 *ptrptr = ptr;
1704 return TRUE;
1705
1706 /* Handle single-character metacharacters. In multiline mode, ^ disables
1707 the setting of any following char as a first character. */
1708
1709 case '^':
1710 if ((options & PCRE_MULTILINE) != 0)
1711 {
1712 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1713 }
1714 previous = NULL;
1715 *code++ = OP_CIRC;
1716 break;
1717
1718 case '$':
1719 previous = NULL;
1720 *code++ = OP_DOLL;
1721 break;
1722
1723 /* There can never be a first char if '.' is first, whatever happens about
1724 repeats. The value of reqbyte doesn't change either. */
1725
1726 case '.':
1727 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1728 zerofirstbyte = firstbyte;
1729 zeroreqbyte = reqbyte;
1730 previous = code;
1731 *code++ = OP_ANY;
1732 break;
1733
1734 /* Character classes. If the included characters are all < 255 in value, we
1735 build a 32-byte bitmap of the permitted characters, except in the special
1736 case where there is only one such character. For negated classes, we build
1737 the map as usual, then invert it at the end. However, we use a different
1738 opcode so that data characters > 255 can be handled correctly.
1739
1740 If the class contains characters outside the 0-255 range, a different
1741 opcode is compiled. It may optionally have a bit map for characters < 256,
1742 but those above are explicitly listed afterwards. A flag byte tells
1743 whether the bitmap is present, and whether this is a negated class or not.
1744 */
1745
1746 case '[':
1747 previous = code;
1748
1749 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
1750 they are encountered at the top level, so we'll do that too. */
1751
1752 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1753 check_posix_syntax(ptr, &tempptr, cd))
1754 {
1755 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
1756 goto FAILED;
1757 }
1758
1759 /* If the first character is '^', set the negation flag and skip it. */
1760
1761 if ((c = *(++ptr)) == '^')
1762 {
1763 negate_class = TRUE;
1764 c = *(++ptr);
1765 }
1766 else
1767 {
1768 negate_class = FALSE;
1769 }
1770
1771 /* Keep a count of chars with values < 256 so that we can optimize the case
1772 of just a single character (as long as it's < 256). For higher valued UTF-8
1773 characters, we don't yet do any optimization. */
1774
1775 class_charcount = 0;
1776 class_lastchar = -1;
1777
1778 #ifdef SUPPORT_UTF8
1779 class_utf8 = FALSE; /* No chars >= 256 */
1780 class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
1781 #endif
1782
1783 /* Initialize the 32-char bit map to all zeros. We have to build the
1784 map in a temporary bit of store, in case the class contains only 1
1785 character (< 256), because in that case the compiled code doesn't use the
1786 bit map. */
1787
1788 memset(classbits, 0, 32 * sizeof(uschar));
1789
1790 /* Process characters until ] is reached. By writing this as a "do" it
1791 means that an initial ] is taken as a data character. The first pass
1792 through the regex checked the overall syntax, so we don't need to be very
1793 strict here. At the start of the loop, c contains the first byte of the
1794 character. */
1795
1796 do
1797 {
1798 #ifdef SUPPORT_UTF8
1799 if (utf8 && c > 127)
1800 { /* Braces are required because the */
1801 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
1802 }
1803 #endif
1804
1805 /* Inside \Q...\E everything is literal except \E */
1806
1807 if (inescq)
1808 {
1809 if (c == '\\' && ptr[1] == 'E')
1810 {
1811 inescq = FALSE;
1812 ptr++;
1813 continue;
1814 }
1815 else goto LONE_SINGLE_CHARACTER;
1816 }
1817
1818 /* Handle POSIX class names. Perl allows a negation extension of the
1819 form [:^name:]. A square bracket that doesn't match the syntax is
1820 treated as a literal. We also recognize the POSIX constructions
1821 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1822 5.6 and 5.8 do. */
1823
1824 if (c == '[' &&
1825 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1826 check_posix_syntax(ptr, &tempptr, cd))
1827 {
1828 BOOL local_negate = FALSE;
1829 int posix_class, i;
1830 register const uschar *cbits = cd->cbits;
1831
1832 if (ptr[1] != ':')
1833 {
1834 *errorcodeptr = ERR31;
1835 goto FAILED;
1836 }
1837
1838 ptr += 2;
1839 if (*ptr == '^')
1840 {
1841 local_negate = TRUE;
1842 ptr++;
1843 }
1844
1845 posix_class = check_posix_name(ptr, tempptr - ptr);
1846 if (posix_class < 0)
1847 {
1848 *errorcodeptr = ERR30;
1849 goto FAILED;
1850 }
1851
1852 /* If matching is caseless, upper and lower are converted to
1853 alpha. This relies on the fact that the class table starts with
1854 alpha, lower, upper as the first 3 entries. */
1855
1856 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1857 posix_class = 0;
1858
1859 /* Or into the map we are building up to 3 of the static class
1860 tables, or their negations. The [:blank:] class sets up the same
1861 chars as the [:space:] class (all white space). We remove the vertical
1862 white space chars afterwards. */
1863
1864 posix_class *= 3;
1865 for (i = 0; i < 3; i++)
1866 {
1867 BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;
1868 int taboffset = posix_class_maps[posix_class + i];
1869 if (taboffset < 0) break;
1870 if (local_negate)
1871 {
1872 if (i == 0)
1873 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];
1874 else
1875 for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];
1876 if (blankclass) classbits[1] |= 0x3c;
1877 }
1878 else
1879 {
1880 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];
1881 if (blankclass) classbits[1] &= ~0x3c;
1882 }
1883 }
1884
1885 ptr = tempptr + 1;
1886 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
1887 continue; /* End of POSIX syntax handling */
1888 }
1889
1890 /* Backslash may introduce a single character, or it may introduce one
1891 of the specials, which just set a flag. Escaped items are checked for
1892 validity in the pre-compiling pass. The sequence \b is a special case.
1893 Inside a class (and only there) it is treated as backspace. Elsewhere
1894 it marks a word boundary. Other escapes have preset maps ready to
1895 or into the one we are building. We assume they have more than one
1896 character in them, so set class_charcount bigger than one. */
1897
1898 if (c == '\\')
1899 {
1900 c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);
1901
1902 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
1903 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
1904 else if (-c == ESC_Q) /* Handle start of quoted string */
1905 {
1906 if (ptr[1] == '\\' && ptr[2] == 'E')
1907 {
1908 ptr += 2; /* avoid empty string */
1909 }
1910 else inescq = TRUE;
1911 continue;
1912 }
1913
1914 if (c < 0)
1915 {
1916 register const uschar *cbits = cd->cbits;
1917 class_charcount += 2; /* Greater than 1 is what matters */
1918 switch (-c)
1919 {
1920 case ESC_d:
1921 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
1922 continue;
1923
1924 case ESC_D:
1925 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
1926 continue;
1927
1928 case ESC_w:
1929 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
1930 continue;
1931
1932 case ESC_W:
1933 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
1934 continue;
1935
1936 case ESC_s:
1937 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
1938 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
1939 continue;
1940
1941 case ESC_S:
1942 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
1943 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
1944 continue;
1945
1946 #ifdef SUPPORT_UCP
1947 case ESC_p:
1948 case ESC_P:
1949 {
1950 BOOL negated;
1951 int property = get_ucp(&ptr, &negated, errorcodeptr);
1952 if (property < 0) goto FAILED;
1953 class_utf8 = TRUE;
1954 *class_utf8data++ = ((-c == ESC_p) != negated)?
1955 XCL_PROP : XCL_NOTPROP;
1956 *class_utf8data++ = property;
1957 class_charcount -= 2; /* Not a < 256 character */
1958 }
1959 continue;
1960 #endif
1961
1962 /* Unrecognized escapes are faulted if PCRE is running in its
1963 strict mode. By default, for compatibility with Perl, they are
1964 treated as literals. */
1965
1966 default:
1967 if ((options & PCRE_EXTRA) != 0)
1968 {
1969 *errorcodeptr = ERR7;
1970 goto FAILED;
1971 }
1972 c = *ptr; /* The final character */
1973 class_charcount -= 2; /* Undo the default count from above */
1974 }
1975 }
1976
1977 /* Fall through if we have a single character (c >= 0). This may be
1978 > 256 in UTF-8 mode. */
1979
1980 } /* End of backslash handling */
1981
1982 /* A single character may be followed by '-' to form a range. However,
1983 Perl does not permit ']' to be the end of the range. A '-' character
1984 here is treated as a literal. */
1985
1986 if (ptr[1] == '-' && ptr[2] != ']')
1987 {
1988 int d;
1989 ptr += 2;
1990
1991 #ifdef SUPPORT_UTF8
1992 if (utf8)
1993 { /* Braces are required because the */
1994 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
1995 }
1996 else
1997 #endif
1998 d = *ptr; /* Not UTF-8 mode */
1999
2000 /* The second part of a range can be a single-character escape, but
2001 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2002 in such circumstances. */
2003
2004 if (d == '\\')
2005 {
2006 const uschar *oldptr = ptr;
2007 d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);
2008
2009 /* \b is backspace; \X is literal X; any other special means the '-'
2010 was literal */
2011
2012 if (d < 0)
2013 {
2014 if (d == -ESC_b) d = '\b';
2015 else if (d == -ESC_X) d = 'X'; else
2016 {
2017 ptr = oldptr - 2;
2018 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2019 }
2020 }
2021 }
2022
2023 /* The check that the two values are in the correct order happens in
2024 the pre-pass. Optimize one-character ranges */
2025
2026 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2027
2028 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2029 matching, we have to use an XCLASS with extra data items. Caseless
2030 matching for characters > 127 is available only if UCP support is
2031 available. */
2032
2033 #ifdef SUPPORT_UTF8
2034 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2035 {
2036 class_utf8 = TRUE;
2037
2038 /* With UCP support, we can find the other case equivalents of
2039 the relevant characters. There may be several ranges. Optimize how
2040 they fit with the basic range. */
2041
2042 #ifdef SUPPORT_UCP
2043 if ((options & PCRE_CASELESS) != 0)
2044 {
2045 int occ, ocd;
2046 int cc = c;
2047 int origd = d;
2048 while (get_othercase_range(&cc, origd, &occ, &ocd))
2049 {
2050 if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
2051
2052 if (occ < c && ocd >= c - 1) /* Extend the basic range */
2053 { /* if there is overlap, */
2054 c = occ; /* noting that if occ < c */
2055 continue; /* we can't have ocd > d */
2056 } /* because a subrange is */
2057 if (ocd > d && occ <= d + 1) /* always shorter than */
2058 { /* the basic range. */
2059 d = ocd;
2060 continue;
2061 }
2062
2063 if (occ == ocd)
2064 {
2065 *class_utf8data++ = XCL_SINGLE;
2066 }
2067 else
2068 {
2069 *class_utf8data++ = XCL_RANGE;
2070 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2071 }
2072 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2073 }
2074 }
2075 #endif /* SUPPORT_UCP */
2076
2077 /* Now record the original range, possibly modified for UCP caseless
2078 overlapping ranges. */
2079
2080 *class_utf8data++ = XCL_RANGE;
2081 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2082 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2083
2084 /* With UCP support, we are done. Without UCP support, there is no
2085 caseless matching for UTF-8 characters > 127; we can use the bit map
2086 for the smaller ones. */
2087
2088 #ifdef SUPPORT_UCP
2089 continue; /* With next character in the class */
2090 #else
2091 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2092
2093 /* Adjust upper limit and fall through to set up the map */
2094
2095 d = 127;
2096
2097 #endif /* SUPPORT_UCP */
2098 }
2099 #endif /* SUPPORT_UTF8 */
2100
2101 /* We use the bit map for all cases when not in UTF-8 mode; else
2102 ranges that lie entirely within 0-127 when there is UCP support; else
2103 for partial ranges without UCP support. */
2104
2105 for (; c <= d; c++)
2106 {
2107 classbits[c/8] |= (1 << (c&7));
2108 if ((options & PCRE_CASELESS) != 0)
2109 {
2110 int uc = cd->fcc[c]; /* flip case */
2111 classbits[uc/8] |= (1 << (uc&7));
2112 }
2113 class_charcount++; /* in case a one-char range */
2114 class_lastchar = c;
2115 }
2116
2117 continue; /* Go get the next char in the class */
2118 }
2119
2120 /* Handle a lone single character - we can get here for a normal
2121 non-escape char, or after \ that introduces a single character or for an
2122 apparent range that isn't. */
2123
2124 LONE_SINGLE_CHARACTER:
2125
2126 /* Handle a character that cannot go in the bit map */
2127
2128 #ifdef SUPPORT_UTF8
2129 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2130 {
2131 class_utf8 = TRUE;
2132 *class_utf8data++ = XCL_SINGLE;
2133 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2134
2135 #ifdef SUPPORT_UCP
2136 if ((options & PCRE_CASELESS) != 0)
2137 {
2138 int chartype;
2139 int othercase;
2140 if (_pcre_ucp_findchar(c, &chartype, &othercase) >= 0 &&
2141 othercase > 0)
2142 {
2143 *class_utf8data++ = XCL_SINGLE;
2144 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
2145 }
2146 }
2147 #endif /* SUPPORT_UCP */
2148
2149 }
2150 else
2151 #endif /* SUPPORT_UTF8 */
2152
2153 /* Handle a single-byte character */
2154 {
2155 classbits[c/8] |= (1 << (c&7));
2156 if ((options & PCRE_CASELESS) != 0)
2157 {
2158 c = cd->fcc[c]; /* flip case */
2159 classbits[c/8] |= (1 << (c&7));
2160 }
2161 class_charcount++;
2162 class_lastchar = c;
2163 }
2164 }
2165
2166 /* Loop until ']' reached; the check for end of string happens inside the
2167 loop. This "while" is the end of the "do" above. */
2168
2169 while ((c = *(++ptr)) != ']' || inescq);
2170
2171 /* If class_charcount is 1, we saw precisely one character whose value is
2172 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2173 can optimize the negative case only if there were no characters >= 128
2174 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2175 single-bytes only. This is an historical hangover. Maybe one day we can
2176 tidy these opcodes to handle multi-byte characters.
2177
2178 The optimization throws away the bit map. We turn the item into a
2179 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2180 that OP_NOT does not support multibyte characters. In the positive case, it
2181 can cause firstbyte to be set. Otherwise, there can be no first char if
2182 this item is first, whatever repeat count may follow. In the case of
2183 reqbyte, save the previous value for reinstating. */
2184
2185 #ifdef SUPPORT_UTF8
2186 if (class_charcount == 1 &&
2187 (!utf8 ||
2188 (!class_utf8 && (!negate_class || class_lastchar < 128))))
2189
2190 #else
2191 if (class_charcount == 1)
2192 #endif
2193 {
2194 zeroreqbyte = reqbyte;
2195
2196 /* The OP_NOT opcode works on one-byte characters only. */
2197
2198 if (negate_class)
2199 {
2200 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2201 zerofirstbyte = firstbyte;
2202 *code++ = OP_NOT;
2203 *code++ = class_lastchar;
2204 break;
2205 }
2206
2207 /* For a single, positive character, get the value into mcbuffer, and
2208 then we can handle this with the normal one-character code. */
2209
2210 #ifdef SUPPORT_UTF8
2211 if (utf8 && class_lastchar > 127)
2212 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
2213 else
2214 #endif
2215 {
2216 mcbuffer[0] = class_lastchar;
2217 mclength = 1;
2218 }
2219 goto ONE_CHAR;
2220 } /* End of 1-char optimization */
2221
2222 /* The general case - not the one-char optimization. If this is the first
2223 thing in the branch, there can be no first char setting, whatever the
2224 repeat count. Any reqbyte setting must remain unchanged after any kind of
2225 repeat. */
2226
2227 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2228 zerofirstbyte = firstbyte;
2229 zeroreqbyte = reqbyte;
2230
2231 /* If there are characters with values > 255, we have to compile an
2232 extended class, with its own opcode. If there are no characters < 256,
2233 we can omit the bitmap. */
2234
2235 #ifdef SUPPORT_UTF8
2236 if (class_utf8)
2237 {
2238 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2239 *code++ = OP_XCLASS;
2240 code += LINK_SIZE;
2241 *code = negate_class? XCL_NOT : 0;
2242
2243 /* If the map is required, install it, and move on to the end of
2244 the extra data */
2245
2246 if (class_charcount > 0)
2247 {
2248 *code++ |= XCL_MAP;
2249 memcpy(code, classbits, 32);
2250 code = class_utf8data;
2251 }
2252
2253 /* If the map is not required, slide down the extra data. */
2254
2255 else
2256 {
2257 int len = class_utf8data - (code + 33);
2258 memmove(code + 1, code + 33, len);
2259 code += len + 1;
2260 }
2261
2262 /* Now fill in the complete length of the item */
2263
2264 PUT(previous, 1, code - previous);
2265 break; /* End of class handling */
2266 }
2267 #endif
2268
2269 /* If there are no characters > 255, negate the 32-byte map if necessary,
2270 and copy it into the code vector. If this is the first thing in the branch,
2271 there can be no first char setting, whatever the repeat count. Any reqbyte
2272 setting must remain unchanged after any kind of repeat. */
2273
2274 if (negate_class)
2275 {
2276 *code++ = OP_NCLASS;
2277 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2278 }
2279 else
2280 {
2281 *code++ = OP_CLASS;
2282 memcpy(code, classbits, 32);
2283 }
2284 code += 32;
2285 break;
2286
2287 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2288 has been tested above. */
2289
2290 case '{':
2291 if (!is_quantifier) goto NORMAL_CHAR;
2292 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
2293 if (*errorcodeptr != 0) goto FAILED;
2294 goto REPEAT;
2295
2296 case '*':
2297 repeat_min = 0;
2298 repeat_max = -1;
2299 goto REPEAT;
2300
2301 case '+':
2302 repeat_min = 1;
2303 repeat_max = -1;
2304 goto REPEAT;
2305
2306 case '?':
2307 repeat_min = 0;
2308 repeat_max = 1;
2309
2310 REPEAT:
2311 if (previous == NULL)
2312 {
2313 *errorcodeptr = ERR9;
2314 goto FAILED;
2315 }
2316
2317 if (repeat_min == 0)
2318 {
2319 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2320 reqbyte = zeroreqbyte; /* Ditto */
2321 }
2322
2323 /* Remember whether this is a variable length repeat */
2324
2325 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2326
2327 op_type = 0; /* Default single-char op codes */
2328 possessive_quantifier = FALSE; /* Default not possessive quantifier */
2329
2330 /* Save start of previous item, in case we have to move it up to make space
2331 for an inserted OP_ONCE for the additional '+' extension. */
2332
2333 tempcode = previous;
2334
2335 /* If the next character is '+', we have a possessive quantifier. This
2336 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2337 If the next character is '?' this is a minimizing repeat, by default,
2338 but if PCRE_UNGREEDY is set, it works the other way round. We change the
2339 repeat type to the non-default. */
2340
2341 if (ptr[1] == '+')
2342 {
2343 repeat_type = 0; /* Force greedy */
2344 possessive_quantifier = TRUE;
2345 ptr++;
2346 }
2347 else if (ptr[1] == '?')
2348 {
2349 repeat_type = greedy_non_default;
2350 ptr++;
2351 }
2352 else repeat_type = greedy_default;
2353
2354 /* If previous was a recursion, we need to wrap it inside brackets so that
2355 it can be replicated if necessary. */
2356
2357 if (*previous == OP_RECURSE)
2358 {
2359 memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2360 code += 1 + LINK_SIZE;
2361 *previous = OP_BRA;
2362 PUT(previous, 1, code - previous);
2363 *code = OP_KET;
2364 PUT(code, 1, code - previous);
2365 code += 1 + LINK_SIZE;
2366 }
2367
2368 /* If previous was a character match, abolish the item and generate a
2369 repeat item instead. If a char item has a minimum of more than one, ensure
2370 that it is set in reqbyte - it might not be if a sequence such as x{3} is
2371 the first thing in a branch because the x will have gone into firstbyte
2372 instead. */
2373
2374 if (*previous == OP_CHAR || *previous == OP_CHARNC)
2375 {
2376 /* Deal with UTF-8 characters that take up more than one byte. It's
2377 easier to write this out separately than try to macrify it. Use c to
2378 hold the length of the character in bytes, plus 0x80 to flag that it's a
2379 length rather than a small character. */
2380
2381 #ifdef SUPPORT_UTF8
2382 if (utf8 && (code[-1] & 0x80) != 0)
2383 {
2384 uschar *lastchar = code - 1;
2385 while((*lastchar & 0xc0) == 0x80) lastchar--;
2386 c = code - lastchar; /* Length of UTF-8 character */
2387 memcpy(utf8_char, lastchar, c); /* Save the char */
2388 c |= 0x80; /* Flag c as a length */
2389 }
2390 else
2391 #endif
2392
2393 /* Handle the case of a single byte - either with no UTF8 support, or
2394 with UTF-8 disabled, or for a UTF-8 character < 128. */
2395
2396 {
2397 c = code[-1];
2398 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2399 }
2400
2401 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
2402 }
2403
2404 /* If previous was a single negated character ([^a] or similar), we use
2405 one of the special opcodes, replacing it. The code is shared with single-
2406 character repeats by setting op_type to add a suitable offset into
2407 repeat_type. OP_NOT is currently used only for single-byte chars. */
2408
2409 else if (*previous == OP_NOT)
2410 {
2411 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
2412 c = previous[1];
2413 goto OUTPUT_SINGLE_REPEAT;
2414 }
2415
2416 /* If previous was a character type match (\d or similar), abolish it and
2417 create a suitable repeat item. The code is shared with single-character
2418 repeats by setting op_type to add a suitable offset into repeat_type. Note
2419 that the Unicode property types will be present only when SUPPORT_UCP is
2420 defined, but we don't wrap the little bits of code here because it just
2421 makes it horribly messy. */
2422
2423 else if (*previous < OP_EODN)
2424 {
2425 uschar *oldcode;
2426 int prop_type;
2427 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
2428 c = *previous;
2429
2430 OUTPUT_SINGLE_REPEAT:
2431 prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?
2432 previous[1] : -1;
2433
2434 oldcode = code;
2435 code = previous; /* Usually overwrite previous item */
2436
2437 /* If the maximum is zero then the minimum must also be zero; Perl allows
2438 this case, so we do too - by simply omitting the item altogether. */
2439
2440 if (repeat_max == 0) goto END_REPEAT;
2441
2442 /* All real repeats make it impossible to handle partial matching (maybe
2443 one day we will be able to remove this restriction). */
2444
2445 if (repeat_max != 1) cd->nopartial = TRUE;
2446
2447 /* Combine the op_type with the repeat_type */
2448
2449 repeat_type += op_type;
2450
2451 /* A minimum of zero is handled either as the special case * or ?, or as
2452 an UPTO, with the maximum given. */
2453
2454 if (repeat_min == 0)
2455 {
2456 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
2457 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
2458 else
2459 {
2460 *code++ = OP_UPTO + repeat_type;
2461 PUT2INC(code, 0, repeat_max);
2462 }
2463 }
2464
2465 /* A repeat minimum of 1 is optimized into some special cases. If the
2466 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
2467 left in place and, if the maximum is greater than 1, we use OP_UPTO with
2468 one less than the maximum. */
2469
2470 else if (repeat_min == 1)
2471 {
2472 if (repeat_max == -1)
2473 *code++ = OP_PLUS + repeat_type;
2474 else
2475 {
2476 code = oldcode; /* leave previous item in place */
2477 if (repeat_max == 1) goto END_REPEAT;
2478 *code++ = OP_UPTO + repeat_type;
2479 PUT2INC(code, 0, repeat_max - 1);
2480 }
2481 }
2482
2483 /* The case {n,n} is just an EXACT, while the general case {n,m} is
2484 handled as an EXACT followed by an UPTO. */
2485
2486 else
2487 {
2488 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
2489 PUT2INC(code, 0, repeat_min);
2490
2491 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
2492 we have to insert the character for the previous code. For a repeated
2493 Unicode property match, there is an extra byte that defines the
2494 required property. In UTF-8 mode, long characters have their length in
2495 c, with the 0x80 bit as a flag. */
2496
2497 if (repeat_max < 0)
2498 {
2499 #ifdef SUPPORT_UTF8
2500 if (utf8 && c >= 128)
2501 {
2502 memcpy(code, utf8_char, c & 7);
2503 code += c & 7;
2504 }
2505 else
2506 #endif
2507 {
2508 *code++ = c;
2509 if (prop_type >= 0) *code++ = prop_type;
2510 }
2511 *code++ = OP_STAR + repeat_type;
2512 }
2513
2514 /* Else insert an UPTO if the max is greater than the min, again
2515 preceded by the character, for the previously inserted code. */
2516
2517 else if (repeat_max != repeat_min)
2518 {
2519 #ifdef SUPPORT_UTF8
2520 if (utf8 && c >= 128)
2521 {
2522 memcpy(code, utf8_char, c & 7);
2523 code += c & 7;
2524 }
2525 else
2526 #endif
2527 *code++ = c;
2528 if (prop_type >= 0) *code++ = prop_type;
2529 repeat_max -= repeat_min;
2530 *code++ = OP_UPTO + repeat_type;
2531 PUT2INC(code, 0, repeat_max);
2532 }
2533 }
2534
2535 /* The character or character type itself comes last in all cases. */
2536
2537 #ifdef SUPPORT_UTF8
2538 if (utf8 && c >= 128)
2539 {
2540 memcpy(code, utf8_char, c & 7);
2541 code += c & 7;
2542 }
2543 else
2544 #endif
2545 *code++ = c;
2546
2547 /* For a repeated Unicode property match, there is an extra byte that
2548 defines the required property. */
2549
2550 #ifdef SUPPORT_UCP
2551 if (prop_type >= 0) *code++ = prop_type;
2552 #endif
2553 }
2554
2555 /* If previous was a character class or a back reference, we put the repeat
2556 stuff after it, but just skip the item if the repeat was {0,0}. */
2557
2558 else if (*previous == OP_CLASS ||
2559 *previous == OP_NCLASS ||
2560 #ifdef SUPPORT_UTF8
2561 *previous == OP_XCLASS ||
2562 #endif
2563 *previous == OP_REF)
2564 {
2565 if (repeat_max == 0)
2566 {
2567 code = previous;
2568 goto END_REPEAT;
2569 }
2570
2571 /* All real repeats make it impossible to handle partial matching (maybe
2572 one day we will be able to remove this restriction). */
2573
2574 if (repeat_max != 1) cd->nopartial = TRUE;
2575
2576 if (repeat_min == 0 && repeat_max == -1)
2577 *code++ = OP_CRSTAR + repeat_type;
2578 else if (repeat_min == 1 && repeat_max == -1)
2579 *code++ = OP_CRPLUS + repeat_type;
2580 else if (repeat_min == 0 && repeat_max == 1)
2581 *code++ = OP_CRQUERY + repeat_type;
2582 else
2583 {
2584 *code++ = OP_CRRANGE + repeat_type;
2585 PUT2INC(code, 0, repeat_min);
2586 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
2587 PUT2INC(code, 0, repeat_max);
2588 }
2589 }
2590
2591 /* If previous was a bracket group, we may have to replicate it in certain
2592 cases. */
2593
2594 else if (*previous >= OP_BRA || *previous == OP_ONCE ||
2595 *previous == OP_COND)
2596 {
2597 register int i;
2598 int ketoffset = 0;
2599 int len = code - previous;
2600 uschar *bralink = NULL;
2601
2602 /* If the maximum repeat count is unlimited, find the end of the bracket
2603 by scanning through from the start, and compute the offset back to it
2604 from the current code pointer. There may be an OP_OPT setting following
2605 the final KET, so we can't find the end just by going back from the code
2606 pointer. */
2607
2608 if (repeat_max == -1)
2609 {
2610 register uschar *ket = previous;
2611 do ket += GET(ket, 1); while (*ket != OP_KET);
2612 ketoffset = code - ket;
2613 }
2614
2615 /* The case of a zero minimum is special because of the need to stick
2616 OP_BRAZERO in front of it, and because the group appears once in the
2617 data, whereas in other cases it appears the minimum number of times. For
2618 this reason, it is simplest to treat this case separately, as otherwise
2619 the code gets far too messy. There are several special subcases when the
2620 minimum is zero. */
2621
2622 if (repeat_min == 0)
2623 {
2624 /* If the maximum is also zero, we just omit the group from the output
2625 altogether. */
2626
2627 if (repeat_max == 0)
2628 {
2629 code = previous;
2630 goto END_REPEAT;
2631 }
2632
2633 /* If the maximum is 1 or unlimited, we just have to stick in the
2634 BRAZERO and do no more at this point. However, we do need to adjust
2635 any OP_RECURSE calls inside the group that refer to the group itself or
2636 any internal group, because the offset is from the start of the whole
2637 regex. Temporarily terminate the pattern while doing this. */
2638
2639 if (repeat_max <= 1)
2640 {
2641 *code = OP_END;
2642 adjust_recurse(previous, 1, utf8, cd);
2643 memmove(previous+1, previous, len);
2644 code++;
2645 *previous++ = OP_BRAZERO + repeat_type;
2646 }
2647
2648 /* If the maximum is greater than 1 and limited, we have to replicate
2649 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
2650 The first one has to be handled carefully because it's the original
2651 copy, which has to be moved up. The remainder can be handled by code
2652 that is common with the non-zero minimum case below. We have to
2653 adjust the value of repeat_max, since one less copy is required. Once
2654 again, we may have to adjust any OP_RECURSE calls inside the group. */
2655
2656 else
2657 {
2658 int offset;
2659 *code = OP_END;
2660 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);
2661 memmove(previous + 2 + LINK_SIZE, previous, len);
2662 code += 2 + LINK_SIZE;
2663 *previous++ = OP_BRAZERO + repeat_type;
2664 *previous++ = OP_BRA;
2665
2666 /* We chain together the bracket offset fields that have to be
2667 filled in later when the ends of the brackets are reached. */
2668
2669 offset = (bralink == NULL)? 0 : previous - bralink;
2670 bralink = previous;
2671 PUTINC(previous, 0, offset);
2672 }
2673
2674 repeat_max--;
2675 }
2676
2677 /* If the minimum is greater than zero, replicate the group as many
2678 times as necessary, and adjust the maximum to the number of subsequent
2679 copies that we need. If we set a first char from the group, and didn't
2680 set a required char, copy the latter from the former. */
2681
2682 else
2683 {
2684 if (repeat_min > 1)
2685 {
2686 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
2687 for (i = 1; i < repeat_min; i++)
2688 {
2689 memcpy(code, previous, len);
2690 code += len;
2691 }
2692 }
2693 if (repeat_max > 0) repeat_max -= repeat_min;
2694 }
2695
2696 /* This code is common to both the zero and non-zero minimum cases. If
2697 the maximum is limited, it replicates the group in a nested fashion,
2698 remembering the bracket starts on a stack. In the case of a zero minimum,
2699 the first one was set up above. In all cases the repeat_max now specifies
2700 the number of additional copies needed. */
2701
2702 if (repeat_max >= 0)
2703 {
2704 for (i = repeat_max - 1; i >= 0; i--)
2705 {
2706 *code++ = OP_BRAZERO + repeat_type;
2707
2708 /* All but the final copy start a new nesting, maintaining the
2709 chain of brackets outstanding. */
2710
2711 if (i != 0)
2712 {
2713 int offset;
2714 *code++ = OP_BRA;
2715 offset = (bralink == NULL)? 0 : code - bralink;
2716 bralink = code;
2717 PUTINC(code, 0, offset);
2718 }
2719
2720 memcpy(code, previous, len);
2721 code += len;
2722 }
2723
2724 /* Now chain through the pending brackets, and fill in their length
2725 fields (which are holding the chain links pro tem). */
2726
2727 while (bralink != NULL)
2728 {
2729 int oldlinkoffset;
2730 int offset = code - bralink + 1;
2731 uschar *bra = code - offset;
2732 oldlinkoffset = GET(bra, 1);
2733 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
2734 *code++ = OP_KET;
2735 PUTINC(code, 0, offset);
2736 PUT(bra, 1, offset);
2737 }
2738 }
2739
2740 /* If the maximum is unlimited, set a repeater in the final copy. We
2741 can't just offset backwards from the current code point, because we
2742 don't know if there's been an options resetting after the ket. The
2743 correct offset was computed above. */
2744
2745 else code[-ketoffset] = OP_KETRMAX + repeat_type;
2746 }
2747
2748 /* Else there's some kind of shambles */
2749
2750 else
2751 {
2752 *errorcodeptr = ERR11;
2753 goto FAILED;
2754 }
2755
2756 /* If the character following a repeat is '+', we wrap the entire repeated
2757 item inside OP_ONCE brackets. This is just syntactic sugar, taken from
2758 Sun's Java package. The repeated item starts at tempcode, not at previous,
2759 which might be the first part of a string whose (former) last char we
2760 repeated. However, we don't support '+' after a greediness '?'. */
2761
2762 if (possessive_quantifier)
2763 {
2764 int len = code - tempcode;
2765 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
2766 code += 1 + LINK_SIZE;
2767 len += 1 + LINK_SIZE;
2768 tempcode[0] = OP_ONCE;
2769 *code++ = OP_KET;
2770 PUTINC(code, 0, len);
2771 PUT(tempcode, 1, len);
2772 }
2773
2774 /* In all cases we no longer have a previous item. We also set the
2775 "follows varying string" flag for subsequently encountered reqbytes if
2776 it isn't already set and we have just passed a varying length item. */
2777
2778 END_REPEAT:
2779 previous = NULL;
2780 cd->req_varyopt |= reqvary;
2781 break;
2782
2783
2784 /* Start of nested bracket sub-expression, or comment or lookahead or
2785 lookbehind or option setting or condition. First deal with special things
2786 that can come after a bracket; all are introduced by ?, and the appearance
2787 of any of them means that this is not a referencing group. They were
2788 checked for validity in the first pass over the string, so we don't have to
2789 check for syntax errors here. */
2790
2791 case '(':
2792 newoptions = options;
2793 skipbytes = 0;
2794
2795 if (*(++ptr) == '?')
2796 {
2797 int set, unset;
2798 int *optset;
2799
2800 switch (*(++ptr))
2801 {
2802 case '#': /* Comment; skip to ket */
2803 ptr++;
2804 while (*ptr != ')') ptr++;
2805 continue;
2806
2807 case ':': /* Non-extracting bracket */
2808 bravalue = OP_BRA;
2809 ptr++;
2810 break;
2811
2812 case '(':
2813 bravalue = OP_COND; /* Conditional group */
2814
2815 /* Condition to test for recursion */
2816
2817 if (ptr[1] == 'R')
2818 {
2819 code[1+LINK_SIZE] = OP_CREF;
2820 PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
2821 skipbytes = 3;
2822 ptr += 3;
2823 }
2824
2825 /* Condition to test for a numbered subpattern match. We know that
2826 if a digit follows ( then there will just be digits until ) because
2827 the syntax was checked in the first pass. */
2828
2829 else if ((digitab[ptr[1]] && ctype_digit) != 0)
2830 {
2831 int condref; /* Don't amalgamate; some compilers */
2832 condref = *(++ptr) - '0'; /* grumble at autoincrement in declaration */
2833 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
2834 if (condref == 0)
2835 {
2836 *errorcodeptr = ERR35;
2837 goto FAILED;
2838 }
2839 ptr++;
2840 code[1+LINK_SIZE] = OP_CREF;
2841 PUT2(code, 2+LINK_SIZE, condref);
2842 skipbytes = 3;
2843 }
2844 /* For conditions that are assertions, we just fall through, having
2845 set bravalue above. */
2846 break;
2847
2848 case '=': /* Positive lookahead */
2849 bravalue = OP_ASSERT;
2850 ptr++;
2851 break;
2852
2853 case '!': /* Negative lookahead */
2854 bravalue = OP_ASSERT_NOT;
2855 ptr++;
2856 break;
2857
2858 case '<': /* Lookbehinds */
2859 switch (*(++ptr))
2860 {
2861 case '=': /* Positive lookbehind */
2862 bravalue = OP_ASSERTBACK;
2863 ptr++;
2864 break;
2865
2866 case '!': /* Negative lookbehind */
2867 bravalue = OP_ASSERTBACK_NOT;
2868 ptr++;
2869 break;
2870 }
2871 break;
2872
2873 case '>': /* One-time brackets */
2874 bravalue = OP_ONCE;
2875 ptr++;
2876 break;
2877
2878 case 'C': /* Callout - may be followed by digits; */
2879 previous_callout = code; /* Save for later completion */
2880 after_manual_callout = 1; /* Skip one item before completing */
2881 *code++ = OP_CALLOUT; /* Already checked that the terminating */
2882 { /* closing parenthesis is present. */
2883 int n = 0;
2884 while ((digitab[*(++ptr)] & ctype_digit) != 0)
2885 n = n * 10 + *ptr - '0';
2886 if (n > 255)
2887 {
2888 *errorcodeptr = ERR38;
2889 goto FAILED;
2890 }
2891 *code++ = n;
2892 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
2893 PUT(code, LINK_SIZE, 0); /* Default length */
2894 code += 2 * LINK_SIZE;
2895 }
2896 previous = NULL;
2897 continue;
2898
2899 case 'P': /* Named subpattern handling */
2900 if (*(++ptr) == '<') /* Definition */
2901 {
2902 int i, namelen;
2903 uschar *slot = cd->name_table;
2904 const uschar *name; /* Don't amalgamate; some compilers */
2905 name = ++ptr; /* grumble at autoincrement in declaration */
2906
2907 while (*ptr++ != '>');
2908 namelen = ptr - name - 1;
2909
2910 for (i = 0; i < cd->names_found; i++)
2911 {
2912 int crc = memcmp(name, slot+2, namelen);
2913 if (crc == 0)
2914 {
2915 if (slot[2+namelen] == 0)
2916 {
2917 *errorcodeptr = ERR43;
2918 goto FAILED;
2919 }
2920 crc = -1; /* Current name is substring */
2921 }
2922 if (crc < 0)
2923 {
2924 memmove(slot + cd->name_entry_size, slot,
2925 (cd->names_found - i) * cd->name_entry_size);
2926 break;
2927 }
2928 slot += cd->name_entry_size;
2929 }
2930
2931 PUT2(slot, 0, *brackets + 1);
2932 memcpy(slot + 2, name, namelen);
2933 slot[2+namelen] = 0;
2934 cd->names_found++;
2935 goto NUMBERED_GROUP;
2936 }
2937
2938 if (*ptr == '=' || *ptr == '>') /* Reference or recursion */
2939 {
2940 int i, namelen;
2941 int type = *ptr++;
2942 const uschar *name = ptr;
2943 uschar *slot = cd->name_table;
2944
2945 while (*ptr != ')') ptr++;
2946 namelen = ptr - name;
2947
2948 for (i = 0; i < cd->names_found; i++)
2949 {
2950 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
2951 slot += cd->name_entry_size;
2952 }
2953 if (i >= cd->names_found)
2954 {
2955 *errorcodeptr = ERR15;
2956 goto FAILED;
2957 }
2958
2959 recno = GET2(slot, 0);
2960
2961 if (type == '>') goto HANDLE_RECURSION; /* A few lines below */
2962
2963 /* Back reference */
2964
2965 previous = code;
2966 *code++ = OP_REF;
2967 PUT2INC(code, 0, recno);
2968 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
2969 if (recno > cd->top_backref) cd->top_backref = recno;
2970 continue;
2971 }
2972
2973 /* Should never happen */
2974 break;
2975
2976 case 'R': /* Pattern recursion */
2977 ptr++; /* Same as (?0) */
2978 /* Fall through */
2979
2980 /* Recursion or "subroutine" call */
2981
2982 case '0': case '1': case '2': case '3': case '4':
2983 case '5': case '6': case '7': case '8': case '9':
2984 {
2985 const uschar *called;
2986 recno = 0;
2987 while((digitab[*ptr] & ctype_digit) != 0)
2988 recno = recno * 10 + *ptr++ - '0';
2989
2990 /* Come here from code above that handles a named recursion */
2991
2992 HANDLE_RECURSION:
2993
2994 previous = code;
2995
2996 /* Find the bracket that is being referenced. Temporarily end the
2997 regex in case it doesn't exist. */
2998
2999 *code = OP_END;
3000 called = (recno == 0)?
3001 cd->start_code : find_bracket(cd->start_code, utf8, recno);
3002
3003 if (called == NULL)
3004 {
3005 *errorcodeptr = ERR15;
3006 goto FAILED;
3007 }
3008
3009 /* If the subpattern is still open, this is a recursive call. We
3010 check to see if this is a left recursion that could loop for ever,
3011 and diagnose that case. */
3012
3013 if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
3014 {
3015 *errorcodeptr = ERR40;
3016 goto FAILED;
3017 }
3018
3019 /* Insert the recursion/subroutine item */
3020
3021 *code = OP_RECURSE;
3022 PUT(code, 1, called - cd->start_code);
3023 code += 1 + LINK_SIZE;
3024 }
3025 continue;
3026
3027 /* Character after (? not specially recognized */
3028
3029 default: /* Option setting */
3030 set = unset = 0;
3031 optset = &set;
3032
3033 while (*ptr != ')' && *ptr != ':')
3034 {
3035 switch (*ptr++)
3036 {
3037 case '-': optset = &unset; break;
3038
3039 case 'i': *optset |= PCRE_CASELESS; break;
3040 case 'm': *optset |= PCRE_MULTILINE; break;
3041 case 's': *optset |= PCRE_DOTALL; break;
3042 case 'x': *optset |= PCRE_EXTENDED; break;
3043 case 'U': *optset |= PCRE_UNGREEDY; break;
3044 case 'X': *optset |= PCRE_EXTRA; break;
3045 }
3046 }
3047
3048 /* Set up the changed option bits, but don't change anything yet. */
3049
3050 newoptions = (options | set) & (~unset);
3051
3052 /* If the options ended with ')' this is not the start of a nested
3053 group with option changes, so the options change at this level. Compile
3054 code to change the ims options if this setting actually changes any of
3055 them. We also pass the new setting back so that it can be put at the
3056 start of any following branches, and when this group ends (if we are in
3057 a group), a resetting item can be compiled.
3058
3059 Note that if this item is right at the start of the pattern, the
3060 options will have been abstracted and made global, so there will be no
3061 change to compile. */
3062
3063 if (*ptr == ')')
3064 {
3065 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
3066 {
3067 *code++ = OP_OPT;
3068 *code++ = newoptions & PCRE_IMS;
3069 }
3070
3071 /* Change options at this level, and pass them back for use
3072 in subsequent branches. Reset the greedy defaults and the case
3073 value for firstbyte and reqbyte. */
3074
3075 *optionsptr = options = newoptions;
3076 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
3077 greedy_non_default = greedy_default ^ 1;
3078 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3079
3080 previous = NULL; /* This item can't be repeated */
3081 continue; /* It is complete */
3082 }
3083
3084 /* If the options ended with ':' we are heading into a nested group
3085 with possible change of options. Such groups are non-capturing and are
3086 not assertions of any kind. All we need to do is skip over the ':';
3087 the newoptions value is handled below. */
3088
3089 bravalue = OP_BRA;
3090 ptr++;
3091 }
3092 }
3093
3094 /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
3095 non-capturing and behave like (?:...) brackets */
3096
3097 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
3098 {
3099 bravalue = OP_BRA;
3100 }
3101
3102 /* Else we have a referencing group; adjust the opcode. If the bracket
3103 number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
3104 arrange for the true number to follow later, in an OP_BRANUMBER item. */
3105
3106 else
3107 {
3108 NUMBERED_GROUP:
3109 if (++(*brackets) > EXTRACT_BASIC_MAX)
3110 {
3111 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
3112 code[1+LINK_SIZE] = OP_BRANUMBER;
3113 PUT2(code, 2+LINK_SIZE, *brackets);
3114 skipbytes = 3;
3115 }
3116 else bravalue = OP_BRA + *brackets;
3117 }
3118
3119 /* Process nested bracketed re. Assertions may not be repeated, but other
3120 kinds can be. We copy code into a non-register variable in order to be able
3121 to pass its address because some compilers complain otherwise. Pass in a
3122 new setting for the ims options if they have changed. */
3123
3124 previous = (bravalue >= OP_ONCE)? code : NULL;
3125 *code = bravalue;
3126 tempcode = code;
3127 tempreqvary = cd->req_varyopt; /* Save value before bracket */
3128
3129 if (!compile_regex(
3130 newoptions, /* The complete new option state */
3131 options & PCRE_IMS, /* The previous ims option state */
3132 brackets, /* Extracting bracket count */
3133 &tempcode, /* Where to put code (updated) */
3134 &ptr, /* Input pointer (updated) */
3135 errorcodeptr, /* Where to put an error message */
3136 (bravalue == OP_ASSERTBACK ||
3137 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
3138 skipbytes, /* Skip over OP_COND/OP_BRANUMBER */
3139 &subfirstbyte, /* For possible first char */
3140 &subreqbyte, /* For possible last char */
3141 bcptr, /* Current branch chain */
3142 cd)) /* Tables block */
3143 goto FAILED;
3144
3145 /* At the end of compiling, code is still pointing to the start of the
3146 group, while tempcode has been updated to point past the end of the group
3147 and any option resetting that may follow it. The pattern pointer (ptr)
3148 is on the bracket. */
3149
3150 /* If this is a conditional bracket, check that there are no more than
3151 two branches in the group. */
3152
3153 else if (bravalue == OP_COND)
3154 {
3155 uschar *tc = code;
3156 condcount = 0;
3157
3158 do {
3159 condcount++;
3160 tc += GET(tc,1);
3161 }
3162 while (*tc != OP_KET);
3163
3164 if (condcount > 2)
3165 {
3166 *errorcodeptr = ERR27;
3167 goto FAILED;
3168 }
3169
3170 /* If there is just one branch, we must not make use of its firstbyte or
3171 reqbyte, because this is equivalent to an empty second branch. */
3172
3173 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
3174 }
3175
3176 /* Handle updating of the required and first characters. Update for normal
3177 brackets of all kinds, and conditions with two branches (see code above).
3178 If the bracket is followed by a quantifier with zero repeat, we have to
3179 back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
3180 main loop so that they can be accessed for the back off. */
3181
3182 zeroreqbyte = reqbyte;
3183 zerofirstbyte = firstbyte;
3184 groupsetfirstbyte = FALSE;
3185
3186 if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
3187 {
3188 /* If we have not yet set a firstbyte in this branch, take it from the
3189 subpattern, remembering that it was set here so that a repeat of more
3190 than one can replicate it as reqbyte if necessary. If the subpattern has
3191 no firstbyte, set "none" for the whole branch. In both cases, a zero
3192 repeat forces firstbyte to "none". */
3193
3194 if (firstbyte == REQ_UNSET)
3195 {
3196 if (subfirstbyte >= 0)
3197 {
3198 firstbyte = subfirstbyte;
3199 groupsetfirstbyte = TRUE;
3200 }
3201 else firstbyte = REQ_NONE;
3202 zerofirstbyte = REQ_NONE;
3203 }
3204
3205 /* If firstbyte was previously set, convert the subpattern's firstbyte
3206 into reqbyte if there wasn't one, using the vary flag that was in
3207 existence beforehand. */
3208
3209 else if (subfirstbyte >= 0 && subreqbyte < 0)
3210 subreqbyte = subfirstbyte | tempreqvary;
3211
3212 /* If the subpattern set a required byte (or set a first byte that isn't
3213 really the first byte - see above), set it. */
3214
3215 if (subreqbyte >= 0) reqbyte = subreqbyte;
3216 }
3217
3218 /* For a forward assertion, we take the reqbyte, if set. This can be
3219 helpful if the pattern that follows the assertion doesn't set a different
3220 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
3221 for an assertion, however because it leads to incorrect effect for patterns
3222 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
3223 of a firstbyte. This is overcome by a scan at the end if there's no
3224 firstbyte, looking for an asserted first char. */
3225
3226 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
3227
3228 /* Now update the main code pointer to the end of the group. */
3229
3230 code = tempcode;
3231
3232 /* Error if hit end of pattern */
3233
3234 if (*ptr != ')')
3235 {
3236 *errorcodeptr = ERR14;
3237 goto FAILED;
3238 }
3239 break;
3240
3241 /* Check \ for being a real metacharacter; if not, fall through and handle
3242 it as a data character at the start of a string. Escape items are checked
3243 for validity in the pre-compiling pass. */
3244
3245 case '\\':
3246 tempptr = ptr;
3247 c = check_escape(&ptr, errorcodeptr, *brackets, options, FALSE);
3248
3249 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
3250 are arranged to be the negation of the corresponding OP_values. For the
3251 back references, the values are ESC_REF plus the reference number. Only
3252 back references and those types that consume a character may be repeated.
3253 We can test for values between ESC_b and ESC_Z for the latter; this may
3254 have to change if any new ones are ever created. */
3255
3256 if (c < 0)
3257 {
3258 if (-c == ESC_Q) /* Handle start of quoted string */
3259 {
3260 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
3261 else inescq = TRUE;
3262 continue;
3263 }
3264
3265 /* For metasequences that actually match a character, we disable the
3266 setting of a first character if it hasn't already been set. */
3267
3268 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
3269 firstbyte = REQ_NONE;
3270
3271 /* Set values to reset to if this is followed by a zero repeat. */
3272
3273 zerofirstbyte = firstbyte;
3274 zeroreqbyte = reqbyte;
3275
3276 /* Back references are handled specially */
3277
3278 if (-c >= ESC_REF)
3279 {
3280 int number = -c - ESC_REF;
3281 previous = code;
3282 *code++ = OP_REF;
3283 PUT2INC(code, 0, number);
3284 }
3285
3286 /* So are Unicode property matches, if supported. We know that get_ucp
3287 won't fail because it was tested in the pre-pass. */
3288
3289 #ifdef SUPPORT_UCP
3290 else if (-c == ESC_P || -c == ESC_p)
3291 {
3292 BOOL negated;
3293 int value = get_ucp(&ptr, &negated, errorcodeptr);
3294 previous = code;
3295 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
3296 *code++ = value;
3297 }
3298 #endif
3299
3300 /* For the rest, we can obtain the OP value by negating the escape
3301 value */
3302
3303 else
3304 {
3305 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
3306 *code++ = -c;
3307 }
3308 continue;
3309 }
3310
3311 /* We have a data character whose value is in c. In UTF-8 mode it may have
3312 a value > 127. We set its representation in the length/buffer, and then
3313 handle it as a data character. */
3314
3315 #ifdef SUPPORT_UTF8
3316 if (utf8 && c > 127)
3317 mclength = _pcre_ord2utf8(c, mcbuffer);
3318 else
3319 #endif
3320
3321 {
3322 mcbuffer[0] = c;
3323 mclength = 1;
3324 }
3325
3326 goto ONE_CHAR;
3327
3328 /* Handle a literal character. It is guaranteed not to be whitespace or #
3329 when the extended flag is set. If we are in UTF-8 mode, it may be a
3330 multi-byte literal character. */
3331
3332 default:
3333 NORMAL_CHAR:
3334 mclength = 1;
3335 mcbuffer[0] = c;
3336
3337 #ifdef SUPPORT_UTF8
3338 if (utf8 && (c & 0xc0) == 0xc0)
3339 {
3340 while ((ptr[1] & 0xc0) == 0x80)
3341 mcbuffer[mclength++] = *(++ptr);
3342 }
3343 #endif
3344
3345 /* At this point we have the character's bytes in mcbuffer, and the length
3346 in mclength. When not in UTF-8 mode, the length is always 1. */
3347
3348 ONE_CHAR:
3349 previous = code;
3350 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
3351 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
3352
3353 /* Set the first and required bytes appropriately. If no previous first
3354 byte, set it from this character, but revert to none on a zero repeat.
3355 Otherwise, leave the firstbyte value alone, and don't change it on a zero
3356 repeat. */
3357
3358 if (firstbyte == REQ_UNSET)
3359 {
3360 zerofirstbyte = REQ_NONE;
3361 zeroreqbyte = reqbyte;
3362
3363 /* If the character is more than one byte long, we can set firstbyte
3364 only if it is not to be matched caselessly. */
3365
3366 if (mclength == 1 || req_caseopt == 0)
3367 {
3368 firstbyte = mcbuffer[0] | req_caseopt;
3369 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
3370 }
3371 else firstbyte = reqbyte = REQ_NONE;
3372 }
3373
3374 /* firstbyte was previously set; we can set reqbyte only if the length is
3375 1 or the matching is caseful. */
3376
3377 else
3378 {
3379 zerofirstbyte = firstbyte;
3380 zeroreqbyte = reqbyte;
3381 if (mclength == 1 || req_caseopt == 0)
3382 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3383 }
3384
3385 break; /* End of literal character handling */
3386 }
3387 } /* end of big loop */
3388
3389 /* Control never reaches here by falling through, only by a goto for all the
3390 error states. Pass back the position in the pattern so that it can be displayed
3391 to the user for diagnosing the error. */
3392
3393 FAILED:
3394 *ptrptr = ptr;
3395 return FALSE;
3396 }
3397
3398
3399
3400
3401 /*************************************************
3402 * Compile sequence of alternatives *
3403 *************************************************/
3404
3405 /* On entry, ptr is pointing past the bracket character, but on return
3406 it points to the closing bracket, or vertical bar, or end of string.
3407 The code variable is pointing at the byte into which the BRA operator has been
3408 stored. If the ims options are changed at the start (for a (?ims: group) or
3409 during any branch, we need to insert an OP_OPT item at the start of every
3410 following branch to ensure they get set correctly at run time, and also pass
3411 the new options into every subsequent branch compile.
3412
3413 Argument:
3414 options option bits, including any changes for this subpattern
3415 oldims previous settings of ims option bits
3416 brackets -> int containing the number of extracting brackets used
3417 codeptr -> the address of the current code pointer
3418 ptrptr -> the address of the current pattern pointer
3419 errorcodeptr -> pointer to error code variable
3420 lookbehind TRUE if this is a lookbehind assertion
3421 skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER)
3422 firstbyteptr place to put the first required character, or a negative number
3423 reqbyteptr place to put the last required character, or a negative number
3424 bcptr pointer to the chain of currently open branches
3425 cd points to the data block with tables pointers etc.
3426
3427 Returns: TRUE on success
3428 */
3429
3430 static BOOL
3431 compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3432 const uschar **ptrptr, int *errorcodeptr, BOOL lookbehind, int skipbytes,
3433 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
3434 {
3435 const uschar *ptr = *ptrptr;
3436 uschar *code = *codeptr;
3437 uschar *last_branch = code;
3438 uschar *start_bracket = code;
3439 uschar *reverse_count = NULL;
3440 int firstbyte, reqbyte;
3441 int branchfirstbyte, branchreqbyte;
3442 branch_chain bc;
3443
3444 bc.outer = bcptr;
3445 bc.current = code;
3446
3447 firstbyte = reqbyte = REQ_UNSET;
3448
3449 /* Offset is set zero to mark that this bracket is still open */
3450
3451 PUT(code, 1, 0);
3452 code += 1 + LINK_SIZE + skipbytes;
3453
3454 /* Loop for each alternative branch */
3455
3456 for (;;)
3457 {
3458 /* Handle a change of ims options at the start of the branch */
3459
3460 if ((options & PCRE_IMS) != oldims)
3461 {
3462 *code++ = OP_OPT;
3463 *code++ = options & PCRE_IMS;
3464 }
3465
3466 /* Set up dummy OP_REVERSE if lookbehind assertion */
3467
3468 if (lookbehind)
3469 {
3470 *code++ = OP_REVERSE;
3471 reverse_count = code;
3472 PUTINC(code, 0, 0);
3473 }
3474
3475 /* Now compile the branch */
3476
3477 if (!compile_branch(&options, brackets, &code, &ptr, errorcodeptr,
3478 &branchfirstbyte, &branchreqbyte, &bc, cd))
3479 {
3480 *ptrptr = ptr;
3481 return FALSE;
3482 }
3483
3484 /* If this is the first branch, the firstbyte and reqbyte values for the
3485 branch become the values for the regex. */
3486
3487 if (*last_branch != OP_ALT)
3488 {
3489 firstbyte = branchfirstbyte;
3490 reqbyte = branchreqbyte;
3491 }
3492
3493 /* If this is not the first branch, the first char and reqbyte have to
3494 match the values from all the previous branches, except that if the previous
3495 value for reqbyte didn't have REQ_VARY set, it can still match, and we set
3496 REQ_VARY for the regex. */
3497
3498 else
3499 {
3500 /* If we previously had a firstbyte, but it doesn't match the new branch,
3501 we have to abandon the firstbyte for the regex, but if there was previously
3502 no reqbyte, it takes on the value of the old firstbyte. */
3503
3504 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
3505 {
3506 if (reqbyte < 0) reqbyte = firstbyte;
3507 firstbyte = REQ_NONE;
3508 }
3509
3510 /* If we (now or from before) have no firstbyte, a firstbyte from the
3511 branch becomes a reqbyte if there isn't a branch reqbyte. */
3512
3513 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
3514 branchreqbyte = branchfirstbyte;
3515
3516 /* Now ensure that the reqbytes match */
3517
3518 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
3519 reqbyte = REQ_NONE;
3520 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
3521 }
3522
3523 /* If lookbehind, check that this branch matches a fixed-length string,
3524 and put the length into the OP_REVERSE item. Temporarily mark the end of
3525 the branch with OP_END. */
3526
3527 if (lookbehind)
3528 {
3529 int length;
3530 *code = OP_END;
3531 length = find_fixedlength(last_branch, options);
3532 DPRINTF(("fixed length = %d\n", length));
3533 if (length < 0)
3534 {
3535 *errorcodeptr = (length == -2)? ERR36 : ERR25;
3536 *ptrptr = ptr;
3537 return FALSE;
3538 }
3539 PUT(reverse_count, 0, length);
3540 }
3541
3542 /* Reached end of expression, either ')' or end of pattern. Go back through
3543 the alternative branches and reverse the chain of offsets, with the field in
3544 the BRA item now becoming an offset to the first alternative. If there are
3545 no alternatives, it points to the end of the group. The length in the
3546 terminating ket is always the length of the whole bracketed item. If any of
3547 the ims options were changed inside the group, compile a resetting op-code
3548 following, except at the very end of the pattern. Return leaving the pointer
3549 at the terminating char. */
3550
3551 if (*ptr != '|')
3552 {
3553 int length = code - last_branch;
3554 do
3555 {
3556 int prev_length = GET(last_branch, 1);
3557 PUT(last_branch, 1, length);
3558 length = prev_length;
3559 last_branch -= length;
3560 }
3561 while (length > 0);
3562
3563 /* Fill in the ket */
3564
3565 *code = OP_KET;
3566 PUT(code, 1, code - start_bracket);
3567 code += 1 + LINK_SIZE;
3568
3569 /* Resetting option if needed */
3570
3571 if ((options & PCRE_IMS) != oldims && *ptr == ')')
3572 {
3573 *code++ = OP_OPT;
3574 *code++ = oldims;
3575 }
3576
3577 /* Set values to pass back */
3578
3579 *codeptr = code;
3580 *ptrptr = ptr;
3581 *firstbyteptr = firstbyte;
3582 *reqbyteptr = reqbyte;
3583 return TRUE;
3584 }
3585
3586 /* Another branch follows; insert an "or" node. Its length field points back
3587 to the previous branch while the bracket remains open. At the end the chain
3588 is reversed. It's done like this so that the start of the bracket has a
3589 zero offset until it is closed, making it possible to detect recursion. */
3590
3591 *code = OP_ALT;
3592 PUT(code, 1, code - last_branch);
3593 bc.current = last_branch = code;
3594 code += 1 + LINK_SIZE;
3595 ptr++;
3596 }
3597 /* Control never reaches here */
3598 }
3599
3600
3601
3602
3603 /*************************************************
3604 * Check for anchored expression *
3605 *************************************************/
3606
3607 /* Try to find out if this is an anchored regular expression. Consider each
3608 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
3609 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
3610 it's anchored. However, if this is a multiline pattern, then only OP_SOD
3611 counts, since OP_CIRC can match in the middle.
3612
3613 We can also consider a regex to be anchored if OP_SOM starts all its branches.
3614 This is the code for \G, which means "match at start of match position, taking
3615 into account the match offset".
3616
3617 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
3618 because that will try the rest of the pattern at all possible matching points,
3619 so there is no point trying again.... er ....
3620
3621 .... except when the .* appears inside capturing parentheses, and there is a
3622 subsequent back reference to those parentheses. We haven't enough information
3623 to catch that case precisely.
3624
3625 At first, the best we could do was to detect when .* was in capturing brackets
3626 and the highest back reference was greater than or equal to that level.
3627 However, by keeping a bitmap of the first 31 back references, we can catch some
3628 of the more common cases more precisely.
3629
3630 Arguments:
3631 code points to start of expression (the bracket)
3632 options points to the options setting
3633 bracket_map a bitmap of which brackets we are inside while testing; this
3634 handles up to substring 31; after that we just have to take
3635 the less precise approach
3636 backref_map the back reference bitmap
3637
3638 Returns: TRUE or FALSE
3639 */
3640
3641 static BOOL
3642 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
3643 unsigned int backref_map)
3644 {
3645 do {
3646 const uschar *scode =
3647 first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE);
3648 register int op = *scode;
3649
3650 /* Capturing brackets */
3651
3652 if (op > OP_BRA)
3653 {
3654 int new_map;
3655 op -= OP_BRA;
3656 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3657 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3658 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
3659 }
3660
3661 /* Other brackets */
3662
3663 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3664 {
3665 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
3666 }
3667
3668 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
3669 are or may be referenced. */
3670
3671 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
3672 (*options & PCRE_DOTALL) != 0)
3673 {
3674 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3675 }
3676
3677 /* Check for explicit anchoring */
3678
3679 else if (op != OP_SOD && op != OP_SOM &&
3680 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
3681 return FALSE;
3682 code += GET(code, 1);
3683 }
3684 while (*code == OP_ALT); /* Loop for each alternative */
3685 return TRUE;
3686 }
3687
3688
3689
3690 /*************************************************
3691 * Check for starting with ^ or .* *
3692 *************************************************/
3693
3694 /* This is called to find out if every branch starts with ^ or .* so that
3695 "first char" processing can be done to speed things up in multiline
3696 matching and for non-DOTALL patterns that start with .* (which must start at
3697 the beginning or after \n). As in the case of is_anchored() (see above), we
3698 have to take account of back references to capturing brackets that contain .*
3699 because in that case we can't make the assumption.
3700
3701 Arguments:
3702 code points to start of expression (the bracket)
3703 bracket_map a bitmap of which brackets we are inside while testing; this
3704 handles up to substring 31; after that we just have to take
3705 the less precise approach
3706 backref_map the back reference bitmap
3707
3708 Returns: TRUE or FALSE
3709 */
3710
3711 static BOOL
3712 is_startline(const uschar *code, unsigned int bracket_map,
3713 unsigned int backref_map)
3714 {
3715 do {
3716 const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0,
3717 FALSE);
3718 register int op = *scode;
3719
3720 /* Capturing brackets */
3721
3722 if (op > OP_BRA)
3723 {
3724 int new_map;
3725 op -= OP_BRA;
3726 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3727 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3728 if (!is_startline(scode, new_map, backref_map)) return FALSE;
3729 }
3730
3731 /* Other brackets */
3732
3733 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3734 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
3735
3736 /* .* means "start at start or after \n" if it isn't in brackets that
3737 may be referenced. */
3738
3739 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
3740 {
3741 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3742 }
3743
3744 /* Check for explicit circumflex */
3745
3746 else if (op != OP_CIRC) return FALSE;
3747
3748 /* Move on to the next alternative */
3749
3750 code += GET(code, 1);
3751 }
3752 while (*code == OP_ALT); /* Loop for each alternative */
3753 return TRUE;
3754 }
3755
3756
3757
3758 /*************************************************
3759 * Check for asserted fixed first char *
3760 *************************************************/
3761
3762 /* During compilation, the "first char" settings from forward assertions are
3763 discarded, because they can cause conflicts with actual literals that follow.
3764 However, if we end up without a first char setting for an unanchored pattern,
3765 it is worth scanning the regex to see if there is an initial asserted first
3766 char. If all branches start with the same asserted char, or with a bracket all
3767 of whose alternatives start with the same asserted char (recurse ad lib), then
3768 we return that char, otherwise -1.
3769
3770 Arguments:
3771 code points to start of expression (the bracket)
3772 options pointer to the options (used to check casing changes)
3773 inassert TRUE if in an assertion
3774
3775 Returns: -1 or the fixed first char
3776 */
3777
3778 static int
3779 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
3780 {
3781 register int c = -1;
3782 do {
3783 int d;
3784 const uschar *scode =
3785 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
3786 register int op = *scode;
3787
3788 if (op >= OP_BRA) op = OP_BRA;
3789
3790 switch(op)
3791 {
3792 default:
3793 return -1;
3794
3795 case OP_BRA:
3796 case OP_ASSERT:
3797 case OP_ONCE:
3798 case OP_COND:
3799 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
3800 return -1;
3801 if (c < 0) c = d; else if (c != d) return -1;
3802 break;
3803
3804 case OP_EXACT: /* Fall through */
3805 scode += 2;
3806
3807 case OP_CHAR:
3808 case OP_CHARNC:
3809 case OP_PLUS:
3810 case OP_MINPLUS:
3811 if (!inassert) return -1;
3812 if (c < 0)
3813 {
3814 c = scode[1];
3815 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
3816 }
3817 else if (c != scode[1]) return -1;
3818 break;
3819 }
3820
3821 code += GET(code, 1);
3822 }
3823 while (*code == OP_ALT);
3824 return c;
3825 }
3826
3827
3828
3829 /*************************************************
3830 * Compile a Regular Expression *
3831 *************************************************/
3832
3833 /* This function takes a string and returns a pointer to a block of store
3834 holding a compiled version of the expression. The original API for this
3835 function had no error code return variable; it is retained for backwards
3836 compatibility. The new function is given a new name.
3837
3838 Arguments:
3839 pattern the regular expression
3840 options various option bits
3841 errorcodeptr pointer to error code variable (pcre_compile2() only)
3842 can be NULL if you don't want a code value
3843 errorptr pointer to pointer to error text
3844 erroroffset ptr offset in pattern where error was detected
3845 tables pointer to character tables or NULL
3846
3847 Returns: pointer to compiled data block, or NULL on error,
3848 with errorptr and erroroffset set
3849 */
3850
3851 PCRE_EXPORT pcre *
3852 pcre_compile(const char *pattern, int options, const char **errorptr,
3853 int *erroroffset, const unsigned char *tables)
3854 {
3855 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
3856 }
3857
3858
3859 PCRE_EXPORT pcre *
3860 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
3861 const char **errorptr, int *erroroffset, const unsigned char *tables)
3862 {
3863 real_pcre *re;
3864 int length = 1 + LINK_SIZE; /* For initial BRA plus length */
3865 int c, firstbyte, reqbyte;
3866 int bracount = 0;
3867 int branch_extra = 0;
3868 int branch_newextra;
3869 int item_count = -1;
3870 int name_count = 0;
3871 int max_name_size = 0;
3872 int lastitemlength = 0;
3873 int errorcode = 0;
3874 #ifdef SUPPORT_UTF8
3875 BOOL utf8;
3876 BOOL class_utf8;
3877 #endif
3878 BOOL inescq = FALSE;
3879 BOOL capturing;
3880 unsigned int brastackptr = 0;
3881 size_t size;
3882 uschar *code;
3883 const uschar *codestart;
3884 const uschar *ptr;
3885 compile_data compile_block;
3886 int brastack[BRASTACK_SIZE];
3887 uschar bralenstack[BRASTACK_SIZE];
3888
3889 /* We can't pass back an error message if errorptr is NULL; I guess the best we
3890 can do is just return NULL, but we can set a code value if there is a code
3891 pointer. */
3892
3893 if (errorptr == NULL)
3894 {
3895 if (errorcodeptr != NULL) *errorcodeptr = 99;
3896 return NULL;
3897 }
3898
3899 *errorptr = NULL;
3900 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
3901
3902 /* However, we can give a message for this error */
3903
3904 if (erroroffset == NULL)
3905 {
3906 errorcode = ERR16;
3907 goto PCRE_EARLY_ERROR_RETURN;
3908 }
3909
3910 *erroroffset = 0;
3911
3912 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
3913
3914 #ifdef SUPPORT_UTF8
3915 utf8 = (options & PCRE_UTF8) != 0;
3916 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
3917 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
3918 {
3919 errorcode = ERR44;
3920 goto PCRE_EARLY_ERROR_RETURN;
3921 }
3922 #else
3923 if ((options & PCRE_UTF8) != 0)
3924 {
3925 errorcode = ERR32;
3926 goto PCRE_EARLY_ERROR_RETURN;
3927 }
3928 #endif
3929
3930 if ((options & ~PUBLIC_OPTIONS) != 0)
3931 {
3932 errorcode = ERR17;
3933 goto PCRE_EARLY_ERROR_RETURN;
3934 }
3935
3936 /* Set up pointers to the individual character tables */
3937
3938 if (tables == NULL) tables = _pcre_default_tables;
3939 compile_block.lcc = tables + lcc_offset;
3940 compile_block.fcc = tables + fcc_offset;
3941 compile_block.cbits = tables + cbits_offset;
3942 compile_block.ctypes = tables + ctypes_offset;
3943
3944 /* Maximum back reference and backref bitmap. This is updated for numeric
3945 references during the first pass, but for named references during the actual
3946 compile pass. The bitmap records up to 31 back references to help in deciding
3947 whether (.*) can be treated as anchored or not. */
3948
3949 compile_block.top_backref = 0;
3950 compile_block.backref_map = 0;
3951
3952 /* Reflect pattern for debugging output */
3953
3954 DPRINTF(("------------------------------------------------------------------\n"));
3955 DPRINTF(("%s\n", pattern));
3956
3957 /* The first thing to do is to make a pass over the pattern to compute the
3958 amount of store required to hold the compiled code. This does not have to be
3959 perfect as long as errors are overestimates. At the same time we can detect any
3960 flag settings right at the start, and extract them. Make an attempt to correct
3961 for any counted white space if an "extended" flag setting appears late in the
3962 pattern. We can't be so clever for #-comments. */
3963
3964 ptr = (const uschar *)(pattern - 1);
3965 while ((c = *(++ptr)) != 0)
3966 {
3967 int min, max;
3968 int class_optcount;
3969 int bracket_length;
3970 int duplength;
3971
3972 /* If we are inside a \Q...\E sequence, all chars are literal */
3973
3974 if (inescq)
3975 {
3976 if ((options & PCRE_AUTO_CALLOUT) != 0) length += 2 + 2*LINK_SIZE;
3977 goto NORMAL_CHAR;
3978 }
3979
3980 /* Otherwise, first check for ignored whitespace and comments */
3981
3982 if ((options & PCRE_EXTENDED) != 0)
3983 {
3984 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
3985 if (c == '#')
3986 {
3987 /* The space before the ; is to avoid a warning on a silly compiler
3988 on the Macintosh. */
3989 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
3990 if (c == 0) break;
3991 continue;
3992 }
3993 }
3994
3995 item_count++; /* Is zero for the first non-comment item */
3996
3997 /* Allow space for auto callout before every item except quantifiers. */
3998
3999 if ((options & PCRE_AUTO_CALLOUT) != 0 &&
4000 c != '*' && c != '+' && c != '?' &&
4001 (c != '{' || !is_counted_repeat(ptr + 1)))
4002 length += 2 + 2*LINK_SIZE;
4003
4004 switch(c)
4005 {
4006 /* A backslashed item may be an escaped data character or it may be a
4007 character type. */
4008
4009 case '\\':
4010 c = check_escape(&ptr, &errorcode, bracount, options, FALSE);
4011 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4012
4013 lastitemlength = 1; /* Default length of last item for repeats */
4014
4015 if (c >= 0) /* Data character */
4016 {
4017 length += 2; /* For a one-byte character */
4018
4019 #ifdef SUPPORT_UTF8
4020 if (utf8 && c > 127)
4021 {
4022 int i;
4023 for (i = 0; i < _pcre_utf8_table1_size; i++)
4024 if (c <= _pcre_utf8_table1[i]) break;
4025 length += i;
4026 lastitemlength += i;
4027 }
4028 #endif
4029
4030 continue;
4031 }
4032
4033 /* If \Q, enter "literal" mode */
4034
4035 if (-c == ESC_Q)
4036 {
4037 inescq = TRUE;
4038 continue;
4039 }
4040
4041 /* \X is supported only if Unicode property support is compiled */
4042
4043 #ifndef SUPPORT_UCP
4044 if (-c == ESC_X)
4045 {
4046 errorcode = ERR45;
4047 goto PCRE_ERROR_RETURN;
4048 }
4049 #endif
4050
4051 /* \P and \p are for Unicode properties, but only when the support has
4052 been compiled. Each item needs 2 bytes. */
4053
4054 else if (-c == ESC_P || -c == ESC_p)
4055 {
4056 #ifdef SUPPORT_UCP
4057 BOOL negated;
4058 length += 2;
4059 lastitemlength = 2;
4060 if (get_ucp(&ptr, &negated, &errorcode) < 0) goto PCRE_ERROR_RETURN;
4061 continue;
4062 #else
4063 errorcode = ERR45;
4064 goto PCRE_ERROR_RETURN;
4065 #endif
4066 }
4067
4068 /* Other escapes need one byte */
4069
4070 length++;
4071
4072 /* A back reference needs an additional 2 bytes, plus either one or 5
4073 bytes for a repeat. We also need to keep the value of the highest
4074 back reference. */
4075
4076 if (c <= -ESC_REF)
4077 {
4078 int refnum = -c - ESC_REF;
4079 compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
4080 if (refnum > compile_block.top_backref)
4081 compile_block.top_backref = refnum;
4082 length += 2; /* For single back reference */
4083 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
4084 {
4085 ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
4086 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4087 if ((min == 0 && (max == 1 || max == -1)) ||
4088 (min == 1 && max == -1))
4089 length++;
4090 else length += 5;
4091 if (ptr[1] == '?') ptr++;
4092 }
4093 }
4094 continue;
4095
4096 case '^': /* Single-byte metacharacters */
4097 case '.':
4098 case '$':
4099 length++;
4100 lastitemlength = 1;
4101 continue;
4102
4103 case '*': /* These repeats won't be after brackets; */
4104 case '+': /* those are handled separately */
4105 case '?':
4106 length++;
4107 goto POSESSIVE; /* A few lines below */
4108
4109 /* This covers the cases of braced repeats after a single char, metachar,
4110 class, or back reference. */
4111
4112 case '{':
4113 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
4114 ptr = read_repeat_counts(ptr+1, &min, &max, &errorcode);
4115 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4116
4117 /* These special cases just insert one extra opcode */
4118
4119 if ((min == 0 && (max == 1 || max == -1)) ||
4120 (min == 1 && max == -1))
4121 length++;
4122
4123 /* These cases might insert additional copies of a preceding character. */
4124
4125 else
4126 {
4127 if (min != 1)
4128 {
4129 length -= lastitemlength; /* Uncount the original char or metachar */
4130 if (min > 0) length += 3 + lastitemlength;
4131 }
4132 length += lastitemlength + ((max > 0)? 3 : 1);
4133 }
4134
4135 if (ptr[1] == '?') ptr++; /* Needs no extra length */
4136
4137 POSESSIVE: /* Test for possessive quantifier */
4138 if (ptr[1] == '+')
4139 {
4140 ptr++;
4141 length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */
4142 }
4143 continue;
4144
4145 /* An alternation contains an offset to the next branch or ket. If any ims
4146 options changed in the previous branch(es), and/or if we are in a
4147 lookbehind assertion, extra space will be needed at the start of the
4148 branch. This is handled by branch_extra. */
4149
4150 case '|':
4151 length += 1 + LINK_SIZE + branch_extra;
4152 continue;
4153
4154 /* A character class uses 33 characters provided that all the character
4155 values are less than 256. Otherwise, it uses a bit map for low valued
4156 characters, and individual items for others. Don't worry about character
4157 types that aren't allowed in classes - they'll get picked up during the
4158 compile. A character class that contains only one single-byte character
4159 uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
4160 where we can. (In UTF-8 mode we can do this only for chars < 128.) */
4161
4162 case '[':
4163 if (*(++ptr) == '^')
4164 {
4165 class_optcount = 10; /* Greater than one */
4166 ptr++;
4167 }
4168 else class_optcount = 0;
4169
4170 #ifdef SUPPORT_UTF8
4171 class_utf8 = FALSE;
4172 #endif
4173
4174 /* Written as a "do" so that an initial ']' is taken as data */
4175
4176 if (*ptr != 0) do
4177 {
4178 /* Inside \Q...\E everything is literal except \E */
4179
4180 if (inescq)
4181 {
4182 if (*ptr != '\\' || ptr[1] != 'E') goto GET_ONE_CHARACTER;
4183 inescq = FALSE;
4184 ptr += 1;
4185 continue;
4186 }
4187
4188 /* Outside \Q...\E, check for escapes */
4189
4190 if (*ptr == '\\')
4191 {
4192 c = check_escape(&ptr, &errorcode, bracount, options, TRUE);
4193 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4194
4195 /* \b is backspace inside a class; \X is literal */
4196
4197 if (-c == ESC_b) c = '\b';
4198 else if (-c == ESC_X) c = 'X';
4199
4200 /* \Q enters quoting mode */
4201
4202 else if (-c == ESC_Q)
4203 {
4204 inescq = TRUE;
4205 continue;
4206 }
4207
4208 /* Handle escapes that turn into characters */
4209
4210 if (c >= 0) goto NON_SPECIAL_CHARACTER;
4211
4212 /* Escapes that are meta-things. The normal ones just affect the
4213 bit map, but Unicode properties require an XCLASS extended item. */
4214
4215 else
4216 {
4217 class_optcount = 10; /* \d, \s etc; make sure > 1 */
4218 #ifdef SUPPORT_UTF8
4219 if (-c == ESC_p || -c == ESC_P)
4220 {
4221 if (!class_utf8)
4222 {
4223 class_utf8 = TRUE;
4224 length += LINK_SIZE + 2;
4225 }
4226 length += 2;
4227 }
4228 #endif
4229 }
4230 }
4231
4232 /* Check the syntax for POSIX stuff. The bits we actually handle are
4233 checked during the real compile phase. */
4234
4235 else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))
4236 {
4237 ptr++;
4238 class_optcount = 10; /* Make sure > 1 */
4239 }
4240
4241 /* Anything else increments the possible optimization count. We have to
4242 detect ranges here so that we can compute the number of extra ranges for
4243 caseless wide characters when UCP support is available. If there are wide
4244 characters, we are going to have to use an XCLASS, even for single
4245 characters. */
4246
4247 else
4248 {
4249 int d;
4250
4251 GET_ONE_CHARACTER:
4252
4253 #ifdef SUPPORT_UTF8
4254 if (utf8)
4255 {
4256 int extra = 0;
4257 GETCHARLEN(c, ptr, extra);
4258 ptr += extra;
4259 }
4260 else c = *ptr;
4261 #else
4262 c = *ptr;
4263 #endif
4264
4265 /* Come here from handling \ above when it escapes to a char value */
4266
4267 NON_SPECIAL_CHARACTER:
4268 class_optcount++;
4269
4270 d = -1;
4271 if (ptr[1] == '-')
4272 {
4273 uschar const *hyptr = ptr++;
4274 if (ptr[1] == '\\')
4275 {
4276 ptr++;
4277 d = check_escape(&ptr, &errorcode, bracount, options, TRUE);
4278 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4279 if (-d == ESC_b) d = '\b'; /* backspace */
4280 else if (-d == ESC_X) d = 'X'; /* literal X in a class */
4281 }
4282 else if (ptr[1] != 0 && ptr[1] != ']')
4283 {
4284 ptr++;
4285 #ifdef SUPPORT_UTF8
4286 if (utf8)
4287 {
4288 int extra = 0;
4289 GETCHARLEN(d, ptr, extra);
4290 ptr += extra;
4291 }
4292 else
4293 #endif
4294 d = *ptr;
4295 }
4296 if (d < 0) ptr = hyptr; /* go back to hyphen as data */
4297 }
4298
4299 /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
4300 127 for caseless matching, we will need to use an XCLASS. */
4301
4302 if (d >= 0)
4303 {
4304 class_optcount = 10; /* Ensure > 1 */
4305 if (d < c)
4306 {
4307 errorcode = ERR8;
4308 goto PCRE_ERROR_RETURN;
4309 }
4310
4311 #ifdef SUPPORT_UTF8
4312 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4313 {
4314 uschar buffer[6];
4315 if (!class_utf8) /* Allow for XCLASS overhead */
4316 {
4317 class_utf8 = TRUE;
4318 length += LINK_SIZE + 2;
4319 }
4320
4321 #ifdef SUPPORT_UCP
4322 /* If we have UCP support, find out how many extra ranges are
4323 needed to map the other case of characters within this range. We
4324 have to mimic the range optimization here, because extending the
4325 range upwards might push d over a boundary that makes it use
4326 another byte in the UTF-8 representation. */
4327
4328 if ((options & PCRE_CASELESS) != 0)
4329 {
4330 int occ, ocd;
4331 int cc = c;
4332 int origd = d;
4333 while (get_othercase_range(&cc, origd, &occ, &ocd))
4334 {
4335 if (occ >= c && ocd <= d) continue; /* Skip embedded */
4336
4337 if (occ < c && ocd >= c - 1) /* Extend the basic range */
4338 { /* if there is overlap, */
4339 c = occ; /* noting that if occ < c */
4340 continue; /* we can't have ocd > d */
4341 } /* because a subrange is */
4342 if (ocd > d && occ <= d + 1) /* always shorter than */
4343 { /* the basic range. */
4344 d = ocd;
4345 continue;
4346 }
4347
4348 /* An extra item is needed */
4349
4350 length += 1 + _pcre_ord2utf8(occ, buffer) +
4351 ((occ == ocd)? 0 : _pcre_ord2utf8(ocd, buffer));
4352 }
4353 }
4354 #endif /* SUPPORT_UCP */
4355
4356 /* The length of the (possibly extended) range */
4357
4358 length += 1 + _pcre_ord2utf8(c, buffer) + _pcre_ord2utf8(d, buffer);
4359 }
4360 #endif /* SUPPORT_UTF8 */
4361
4362 }
4363
4364 /* We have a single character. There is nothing to be done unless we
4365 are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
4366 allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
4367 support. */
4368
4369 else
4370 {
4371 #ifdef SUPPORT_UTF8
4372 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
4373 {
4374 uschar buffer[6];
4375 class_optcount = 10; /* Ensure > 1 */
4376 if (!class_utf8) /* Allow for XCLASS overhead */
4377 {
4378 class_utf8 = TRUE;
4379 length += LINK_SIZE + 2;
4380 }
4381 #ifdef SUPPORT_UCP
4382 length += (((options & PCRE_CASELESS) != 0)? 2 : 1) *
4383 (1 + _pcre_ord2utf8(c, buffer));
4384 #else /* SUPPORT_UCP */
4385 length += 1 + _pcre_ord2utf8(c, buffer);
4386 #endif /* SUPPORT_UCP */
4387 }
4388 #endif /* SUPPORT_UTF8 */
4389 }
4390 }
4391 }
4392 while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */
4393
4394 if (*ptr == 0) /* Missing terminating ']' */
4395 {
4396 errorcode = ERR6;
4397 goto PCRE_ERROR_RETURN;
4398 }
4399
4400 /* We can optimize when there was only one optimizable character. Repeats
4401 for positive and negated single one-byte chars are handled by the general
4402 code. Here, we handle repeats for the class opcodes. */
4403
4404 if (class_optcount == 1) length += 3; else
4405 {
4406 length += 33;
4407
4408 /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
4409 we also need extra for wrapping the whole thing in a sub-pattern. */
4410
4411 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
4412 {
4413 ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
4414 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4415 if ((min == 0 && (max == 1 || max == -1)) ||
4416 (min == 1 && max == -1))
4417 length++;
4418 else length += 5;
4419 if (ptr[1] == '+')
4420 {
4421 ptr++;
4422 length += 2 + 2*LINK_SIZE;
4423 }
4424 else if (ptr[1] == '?') ptr++;
4425 }
4426 }
4427 continue;
4428
4429 /* Brackets may be genuine groups or special things */
4430
4431 case '(':
4432 branch_newextra = 0;
4433 bracket_length = 1 + LINK_SIZE;
4434 capturing = FALSE;
4435
4436 /* Handle special forms of bracket, which all start (? */
4437
4438 if (ptr[1] == '?')
4439 {
4440 int set, unset;
4441 int *optset;
4442
4443 switch (c = ptr[2])
4444 {
4445 /* Skip over comments entirely */
4446 case '#':
4447 ptr += 3;
4448 while (*ptr != 0 && *ptr != ')') ptr++;
4449 if (*ptr == 0)
4450 {
4451 errorcode = ERR18;
4452 goto PCRE_ERROR_RETURN;
4453 }
4454 continue;
4455
4456 /* Non-referencing groups and lookaheads just move the pointer on, and
4457 then behave like a non-special bracket, except that they don't increment
4458 the count of extracting brackets. Ditto for the "once only" bracket,
4459 which is in Perl from version 5.005. */
4460
4461 case ':':
4462 case '=':
4463 case '!':
4464 case '>':
4465 ptr += 2;
4466 break;
4467
4468 /* (?R) specifies a recursive call to the regex, which is an extension
4469 to provide the facility which can be obtained by (?p{perl-code}) in
4470 Perl 5.6. In Perl 5.8 this has become (??{perl-code}).
4471
4472 From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to
4473 the appropriate numbered brackets. This includes both recursive and
4474 non-recursive calls. (?R) is now synonymous with (?0). */
4475
4476 case 'R':
4477 ptr++;
4478
4479 case '0': case '1': case '2': case '3': case '4':
4480 case '5': case '6': case '7': case '8': case '9':
4481 ptr += 2;
4482 if (c != 'R')
4483 while ((digitab[*(++ptr)] & ctype_digit) != 0);
4484 if (*ptr != ')')
4485 {
4486 errorcode = ERR29;
4487 goto PCRE_ERROR_RETURN;
4488 }
4489 length += 1 + LINK_SIZE;
4490
4491 /* If this item is quantified, it will get wrapped inside brackets so
4492 as to use the code for quantified brackets. We jump down and use the
4493 code that handles this for real brackets. */
4494
4495 if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')
4496 {
4497 length += 2 + 2 * LINK_SIZE; /* to make bracketed */
4498 duplength = 5 + 3 * LINK_SIZE;
4499 goto HANDLE_QUANTIFIED_BRACKETS;
4500 }
4501 continue;
4502
4503 /* (?C) is an extension which provides "callout" - to provide a bit of
4504 the functionality of the Perl (?{...}) feature. An optional number may
4505 follow (default is zero). */
4506
4507 case 'C':
4508 ptr += 2;
4509 while ((digitab[*(++ptr)] & ctype_digit) != 0);
4510 if (*ptr != ')')
4511 {
4512 errorcode = ERR39;
4513 goto PCRE_ERROR_RETURN;
4514 }
4515 length += 2 + 2*LINK_SIZE;
4516 continue;
4517
4518 /* Named subpatterns are an extension copied from Python */
4519
4520 case 'P':
4521 ptr += 3;
4522
4523 /* Handle the definition of a named subpattern */
4524
4525 if (*ptr == '<')
4526 {
4527 const uschar *p; /* Don't amalgamate; some compilers */
4528 p = ++ptr; /* grumble at autoincrement in declaration */
4529 while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;
4530 if (*ptr != '>')
4531 {
4532 errorcode = ERR42;
4533 goto PCRE_ERROR_RETURN;
4534 }
4535 name_count++;
4536 if (ptr - p > max_name_size) max_name_size = (ptr - p);
4537 capturing = TRUE; /* Named parentheses are always capturing */
4538 break;
4539 }
4540
4541 /* Handle back references and recursive calls to named subpatterns */
4542
4543 if (*ptr == '=' || *ptr == '>')
4544 {
4545 while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0);
4546 if (*ptr != ')')
4547 {
4548 errorcode = ERR42;
4549 goto PCRE_ERROR_RETURN;
4550 }
4551 break;
4552 }
4553
4554 /* Unknown character after (?P */
4555
4556 errorcode = ERR41;
4557 goto PCRE_ERROR_RETURN;
4558
4559 /* Lookbehinds are in Perl from version 5.005 */
4560
4561 case '<':
4562 ptr += 3;
4563 if (*ptr == '=' || *ptr == '!')
4564 {
4565 branch_newextra = 1 + LINK_SIZE;
4566 length += 1 + LINK_SIZE; /* For the first branch */
4567 break;
4568 }
4569 errorcode = ERR24;
4570 goto PCRE_ERROR_RETURN;
4571
4572 /* Conditionals are in Perl from version 5.005. The bracket must either
4573 be followed by a number (for bracket reference) or by an assertion
4574 group, or (a PCRE extension) by 'R' for a recursion test. */
4575
4576 case '(':
4577 if (ptr[3] == 'R' && ptr[4] == ')')
4578 {
4579 ptr += 4;
4580 length += 3;
4581 }
4582 else if ((digitab[ptr[3]] & ctype_digit) != 0)
4583 {
4584 ptr += 4;
4585 length += 3;
4586 while ((digitab[*ptr] & ctype_digit) != 0) ptr++;
4587 if (*ptr != ')')
4588 {
4589 errorcode = ERR26;
4590 goto PCRE_ERROR_RETURN;
4591 }
4592 }
4593 else /* An assertion must follow */
4594 {
4595 ptr++; /* Can treat like ':' as far as spacing is concerned */
4596 if (ptr[2] != '?' ||
4597 (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
4598 {
4599 ptr += 2; /* To get right offset in message */
4600 errorcode = ERR28;
4601 goto PCRE_ERROR_RETURN;
4602 }
4603 }
4604 break;
4605
4606 /* Else loop checking valid options until ) is met. Anything else is an
4607 error. If we are without any brackets, i.e. at top level, the settings
4608 act as if specified in the options, so massage the options immediately.
4609 This is for backward compatibility with Perl 5.004. */
4610
4611 default:
4612 set = unset = 0;
4613 optset = &set;
4614 ptr += 2;
4615
4616 for (;; ptr++)
4617 {
4618 c = *ptr;
4619 switch (c)
4620 {
4621 case 'i':
4622 *optset |= PCRE_CASELESS;
4623 continue;
4624
4625 case 'm':
4626 *optset |= PCRE_MULTILINE;
4627 continue;
4628
4629 case 's':
4630 *optset |= PCRE_DOTALL;
4631 continue;
4632
4633 case 'x':
4634 *optset |= PCRE_EXTENDED;
4635 continue;
4636
4637 case 'X':
4638 *optset |= PCRE_EXTRA;
4639 continue;
4640
4641 case 'U':
4642 *optset |= PCRE_UNGREEDY;
4643 continue;
4644
4645 case '-':
4646 optset = &unset;
4647 continue;
4648
4649 /* A termination by ')' indicates an options-setting-only item; if
4650 this is at the very start of the pattern (indicated by item_count
4651 being zero), we use it to set the global options. This is helpful
4652 when analyzing the pattern for first characters, etc. Otherwise
4653 nothing is done here and it is handled during the compiling
4654 process.
4655
4656 We allow for more than one options setting at the start. If such
4657 settings do not change the existing options, nothing is compiled.
4658 However, we must leave space just in case something is compiled.
4659 This can happen for pathological sequences such as (?i)(?-i)
4660 because the global options will end up with -i set. The space is
4661 small and not significant. (Before I did this there was a reported
4662 bug with (?i)(?-i) in a machine-generated pattern.)
4663
4664 [Historical note: Up to Perl 5.8, options settings at top level
4665 were always global settings, wherever they appeared in the pattern.
4666 That is, they were equivalent to an external setting. From 5.8
4667 onwards, they apply only to what follows (which is what you might
4668 expect).] */
4669
4670 case ')':
4671 if (item_count == 0)
4672 {
4673 options = (options | set) & (~unset);
4674 set = unset = 0; /* To save length */
4675 item_count--; /* To allow for several */
4676 length += 2;
4677 }
4678
4679 /* Fall through */
4680
4681 /* A termination by ':' indicates the start of a nested group with
4682 the given options set. This is again handled at compile time, but
4683 we must allow for compiled space if any of the ims options are
4684 set. We also have to allow for resetting space at the end of
4685 the group, which is why 4 is added to the length and not just 2.
4686 If there are several changes of options within the same group, this
4687 will lead to an over-estimate on the length, but this shouldn't
4688 matter very much. We also have to allow for resetting options at
4689 the start of any alternations, which we do by setting
4690 branch_newextra to 2. Finally, we record whether the case-dependent
4691 flag ever changes within the regex. This is used by the "required
4692 character" code. */
4693
4694 case ':':
4695 if (((set|unset) & PCRE_IMS) != 0)
4696 {
4697 length += 4;
4698 branch_newextra = 2;
4699 if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
4700 }
4701 goto END_OPTIONS;
4702
4703 /* Unrecognized option character */
4704
4705 default:
4706 errorcode = ERR12;
4707 goto PCRE_ERROR_RETURN;
4708 }
4709 }
4710
4711 /* If we hit a closing bracket, that's it - this is a freestanding
4712 option-setting. We need to ensure that branch_extra is updated if
4713 necessary. The only values branch_newextra can have here are 0 or 2.
4714 If the value is 2, then branch_extra must end up as either 2 or 3+LINK_SIZE,
4715 depending on whether this is a lookbehind group or not. */
4716
4717 END_OPTIONS:
4718 if (c == ')')
4719 {
4720 if (branch_newextra == 2 &&
4721 (branch_extra == 0 || branch_extra == 1+LINK_SIZE))
4722 branch_extra += branch_newextra;
4723 continue;
4724 }
4725
4726 /* If options were terminated by ':' control comes here. This is a
4727 non-capturing group with an options change. There is nothing more that
4728 needs to be done because "capturing" is already set FALSE by default;
4729 we can just fall through. */
4730
4731 }
4732 }
4733
4734 /* Ordinary parentheses, not followed by '?', are capturing unless
4735 PCRE_NO_AUTO_CAPTURE is set. */
4736
4737 else capturing = (options & PCRE_NO_AUTO_CAPTURE) == 0;
4738
4739 /* Capturing brackets must be counted so we can process escapes in a
4740 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to need
4741 an additional 3 bytes of memory per capturing bracket. */
4742
4743 if (capturing)
4744 {
4745 bracount++;
4746 if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
4747 }
4748
4749 /* Save length for computing whole length at end if there's a repeat that
4750 requires duplication of the group. Also save the current value of
4751 branch_extra, and start the new group with the new value. If non-zero, this
4752 will either be 2 for a (?imsx: group, or 1+LINK_SIZE for a lookbehind assertion. */
4753
4754 if (brastackptr >= sizeof(brastack)/sizeof(int))
4755 {
4756 errorcode = ERR19;
4757 goto PCRE_ERROR_RETURN;
4758 }
4759
4760 bralenstack[brastackptr] = branch_extra;
4761 branch_extra = branch_newextra;
4762
4763 brastack[brastackptr++] = length;
4764 length += bracket_length;
4765 continue;
4766
4767 /* Handle ket. Look for subsequent max/min; for certain sets of values we
4768 have to replicate this bracket up to that many times. If brastackptr is
4769 0 this is an unmatched bracket which will generate an error, but take care
4770 not to try to access brastack[-1] when computing the length and restoring
4771 the branch_extra value. */
4772
4773 case ')':
4774 length += 1 + LINK_SIZE;
4775 if (brastackptr > 0)
4776 {
4777 duplength = length - brastack[--brastackptr];
4778 branch_extra = bralenstack[brastackptr];
4779 }
4780 else duplength = 0;
4781
4782 /* The following code is also used when a recursion such as (?3) is
4783 followed by a quantifier, because in that case, it has to be wrapped inside
4784 brackets so that the quantifier works. The value of duplength must be
4785 set before arrival. */
4786
4787 HANDLE_QUANTIFIED_BRACKETS:
4788
4789 /* Leave ptr at the final char; for read_repeat_counts this happens
4790 automatically; for the others we need an increment. */
4791
4792 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
4793 {
4794 ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
4795 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4796 }
4797 else if (c == '*') { min = 0; max = -1; ptr++; }
4798 else if (c == '+') { min = 1; max = -1; ptr++; }
4799 else if (c == '?') { min = 0; max = 1; ptr++; }
4800 else { min = 1; max = 1; }
4801
4802 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
4803 group, and if the maximum is greater than zero, we have to replicate
4804 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
4805 bracket set. */
4806
4807 if (min == 0)
4808 {
4809 length++;
4810 if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
4811 }
4812
4813 /* When the minimum is greater than zero, we have to replicate up to
4814 minval-1 times, with no additions required in the copies. Then, if there
4815 is a limited maximum we have to replicate up to maxval-1 times allowing
4816 for a BRAZERO item before each optional copy and nesting brackets for all
4817 but one of the optional copies. */
4818
4819 else
4820 {
4821 length += (min - 1) * duplength;
4822 if (max > min) /* Need this test as max=-1 means no limit */
4823 length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
4824 - (2 + 2*LINK_SIZE);
4825 }
4826
4827 /* Allow space for once brackets for "possessive quantifier" */
4828
4829 if (ptr[1] == '+')
4830 {
4831 ptr++;
4832 length += 2 + 2*LINK_SIZE;
4833 }
4834 continue;
4835
4836 /* Non-special character. It won't be space or # in extended mode, so it is
4837 always a genuine character. If we are in a \Q...\E sequence, check for the
4838 end; if not, we have a literal. */
4839
4840 default:
4841 NORMAL_CHAR:
4842
4843 if (inescq && c == '\\' && ptr[1] == 'E')
4844 {
4845 inescq = FALSE;
4846 ptr++;
4847 continue;
4848 }
4849
4850 length += 2; /* For a one-byte character */
4851 lastitemlength = 1; /* Default length of last item for repeats */
4852
4853 /* In UTF-8 mode, check for additional bytes. */
4854
4855 #ifdef SUPPORT_UTF8
4856 if (utf8 && (c & 0xc0) == 0xc0)
4857 {
4858 while ((ptr[1] & 0xc0) == 0x80) /* Can't flow over the end */
4859 { /* because the end is marked */
4860 lastitemlength++; /* by a zero byte. */
4861 length++;
4862 ptr++;
4863 }
4864 }
4865 #endif
4866
4867 continue;
4868 }
4869 }
4870
4871 length += 2 + LINK_SIZE; /* For final KET and END */
4872
4873 if ((options & PCRE_AUTO_CALLOUT) != 0)
4874 length += 2 + 2*LINK_SIZE; /* For final callout */
4875
4876 if (length > MAX_PATTERN_SIZE)
4877 {
4878 errorcode = ERR20;
4879 goto PCRE_EARLY_ERROR_RETURN;
4880 }
4881
4882 /* Compute the size of data block needed and get it, either from malloc or
4883 externally provided function. */
4884
4885 size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
4886 re = (real_pcre *)(pcre_malloc)(size);
4887
4888 if (re == NULL)
4889 {
4890 errorcode = ERR21;
4891 goto PCRE_EARLY_ERROR_RETURN;
4892 }
4893
4894 /* Put in the magic number, and save the sizes, options, and character table
4895 pointer. NULL is used for the default character tables. The nullpad field is at
4896 the end; it's there to help in the case when a regex compiled on a system with
4897 4-byte pointers is run on another with 8-byte pointers. */
4898
4899 re->magic_number = MAGIC_NUMBER;
4900 re->size = size;
4901 re->options = options;
4902 re->dummy1 = 0;
4903 re->name_table_offset = sizeof(real_pcre);
4904 re->name_entry_size = max_name_size + 3;
4905 re->name_count = name_count;
4906 re->ref_count = 0;
4907 re->tables = (tables == _pcre_default_tables)? NULL : tables;
4908 re->nullpad = NULL;
4909
4910 /* The starting points of the name/number translation table and of the code are
4911 passed around in the compile data block. */
4912
4913 compile_block.names_found = 0;
4914 compile_block.name_entry_size = max_name_size + 3;
4915 compile_block.name_table = (uschar *)re + re->name_table_offset;
4916 codestart = compile_block.name_table + re->name_entry_size * re->name_count;
4917 compile_block.start_code = codestart;
4918 compile_block.start_pattern = (const uschar *)pattern;
4919 compile_block.req_varyopt = 0;
4920 compile_block.nopartial = FALSE;
4921
4922 /* Set up a starting, non-extracting bracket, then compile the expression. On
4923 error, errorcode will be set non-zero, so we don't need to look at the result
4924 of the function here. */
4925
4926 ptr = (const uschar *)pattern;
4927 code = (uschar *)codestart;
4928 *code = OP_BRA;
4929 bracount = 0;
4930 (void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,
4931 &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, &compile_block);
4932 re->top_bracket = bracount;
4933 re->top_backref = compile_block.top_backref;
4934
4935 if (compile_block.nopartial) re->options |= PCRE_NOPARTIAL;
4936
4937 /* If not reached end of pattern on success, there's an excess bracket. */
4938
4939 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
4940
4941 /* Fill in the terminating state and check for disastrous overflow, but
4942 if debugging, leave the test till after things are printed out. */
4943
4944 *code++ = OP_END;
4945
4946 #ifndef DEBUG
4947 if (code - codestart > length) errorcode = ERR23;
4948 #endif
4949
4950 /* Give an error if there's a back reference to a non-existent capturing
4951 subpattern. */
4952
4953 if (re->top_backref > re->top_bracket) errorcode = ERR15;
4954
4955 /* Failed to compile, or error while post-processing */
4956
4957 if (errorcode != 0)
4958 {
4959 (pcre_free)(re);
4960 PCRE_ERROR_RETURN:
4961 *erroroffset = ptr - (const uschar *)pattern;
4962 PCRE_EARLY_ERROR_RETURN:
4963 *errorptr = error_texts[errorcode];
4964 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
4965 return NULL;
4966 }
4967
4968 /* If the anchored option was not passed, set the flag if we can determine that
4969 the pattern is anchored by virtue of ^ characters or \A or anything else (such
4970 as starting with .* when DOTALL is set).
4971
4972 Otherwise, if we know what the first character has to be, save it, because that
4973 speeds up unanchored matches no end. If not, see if we can set the
4974 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
4975 start with ^, and also when all branches start with .* for non-DOTALL matches.
4976 */
4977
4978 if ((options & PCRE_ANCHORED) == 0)
4979 {
4980 int temp_options = options;
4981 if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map))
4982 re->options |= PCRE_ANCHORED;
4983 else
4984 {
4985 if (firstbyte < 0)
4986 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
4987 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
4988 {
4989 int ch = firstbyte & 255;
4990 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
4991 compile_block.fcc[ch] == ch)? ch : firstbyte;
4992 re->options |= PCRE_FIRSTSET;
4993 }
4994 else if (is_startline(codestart, 0, compile_block.backref_map))
4995 re->options |= PCRE_STARTLINE;
4996 }
4997 }
4998
4999 /* For an anchored pattern, we use the "required byte" only if it follows a
5000 variable length item in the regex. Remove the caseless flag for non-caseable
5001 bytes. */
5002
5003 if (reqbyte >= 0 &&
5004 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
5005 {
5006 int ch = reqbyte & 255;
5007 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
5008 compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5009 re->options |= PCRE_REQCHSET;
5010 }
5011
5012 /* Print out the compiled data if debugging is enabled. This is never the
5013 case when building a production library. */
5014
5015 #ifdef DEBUG
5016
5017 printf("Length = %d top_bracket = %d top_backref = %d\n",
5018 length, re->top_bracket, re->top_backref);
5019
5020 if (re->options != 0)
5021 {
5022 printf("%s%s%s%s%s%s%s%s%s%s\n",
5023 ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5024 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5025 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5026 ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
5027 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5028 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5029 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5030 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5031 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5032 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5033 }
5034
5035 if ((re->options & PCRE_FIRSTSET) != 0)
5036 {
5037 int ch = re->first_byte & 255;
5038 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
5039 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5040 else printf("First char = \\x%02x%s\n", ch, caseless);
5041 }
5042
5043 if ((re->options & PCRE_REQCHSET) != 0)
5044 {
5045 int ch = re->req_byte & 255;
5046 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
5047 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5048 else printf("Req char = \\x%02x%s\n", ch, caseless);
5049 }
5050
5051 pcre_printint(re, stdout);
5052
5053 /* This check is done here in the debugging case so that the code that
5054 was compiled can be seen. */
5055
5056 if (code - codestart > length)
5057 {
5058 (pcre_free)(re);
5059 *errorptr = error_texts[ERR23];
5060 *erroroffset = ptr - (uschar *)pattern;
5061 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
5062 return NULL;
5063 }
5064 #endif
5065
5066 return (pcre *)re;
5067 }
5068
5069 /* End of pcre_compile.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12