/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 213 - (show annotations) (download)
Wed Aug 15 11:34:14 2007 UTC (7 years, 4 months ago) by ph10
File MIME type: text/plain
File size: 187479 byte(s)
Add integer overflow tests to escape processing.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2007 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include <config.h>
47 #endif
48
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57 used by pcretest. DEBUG is not defined when building a production library. */
58
59 #ifdef DEBUG
60 #include "pcre_printint.src"
61 #endif
62
63
/* Macro for setting individual bits in class bitmaps: sets bit number b in
the byte-array bitmap a. Every argument and the whole expansion are
parenthesized so that expression arguments such as SETBIT(map, x + 1) index
and shift correctly. Note that b is evaluated twice, so it must not have side
effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67
68 /* Maximum length value to check against when making sure that the integer that
69 holds the compiled pattern length does not overflow. We make it a bit less than
70 INT_MAX to allow for adding in group terminating bytes, so that we don't have
71 to check them every time. */
72
73 #define OFLOW_MAX (INT_MAX - 20)
74
75
76 /*************************************************
77 * Code parameters and static tables *
78 *************************************************/
79
80 /* This value specifies the size of stack workspace that is used during the
81 first pre-compile phase that determines how much memory is required. The regex
82 is partly compiled into this space, but the compiled parts are discarded as
83 soon as they can be, so that hopefully there will never be an overrun. The code
84 does, however, check for an overrun. The largest amount I've seen used is 218,
85 so this number is very generous.
86
87 The same workspace is used during the second, actual compile phase for
88 remembering forward references to groups so that they can be filled in at the
89 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90 is 4 there is plenty of room. */
91
92 #define COMPILE_WORK_SIZE (4096)
93
94
95 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96 are simple data values; negative values are for special things like \d and so
97 on. Zero means further processing is needed (for things like \x), or the escape
98 is invalid. */
99
100 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
101 static const short int escapes[] = {
102 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
103 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
104 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
105 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
106 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
107 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
108 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
109 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
110 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
111 0, 0, -ESC_z /* x - z */
112 };
113
114 #else /* This is the "abnormal" table for EBCDIC systems */
115 static const short int escapes[] = {
116 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
117 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
118 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
119 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
120 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
121 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
122 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
123 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
124 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
125 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
126 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
127 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
128 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
129 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
131 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
132 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
133 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
134 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
135 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
136 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
137 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
138 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
139 };
140 #endif
141
142
143 /* Table of special "verbs" like (*PRUNE) */
144
145 typedef struct verbitem {
146 const char *name;
147 int len;
148 int op;
149 } verbitem;
150
151 static verbitem verbs[] = {
152 { "ACCEPT", 6, OP_ACCEPT },
153 { "COMMIT", 6, OP_COMMIT },
154 { "F", 1, OP_FAIL },
155 { "FAIL", 4, OP_FAIL },
156 { "PRUNE", 5, OP_PRUNE },
157 { "SKIP", 4, OP_SKIP },
158 { "THEN", 4, OP_THEN }
159 };
160
161 static int verbcount = sizeof(verbs)/sizeof(verbitem);
162
163
164 /* Tables of names of POSIX character classes and their lengths. The list is
165 terminated by a zero length entry. The first three must be alpha, lower, upper,
166 as this is assumed for handling case independence. */
167
168 static const char *const posix_names[] = {
169 "alpha", "lower", "upper",
170 "alnum", "ascii", "blank", "cntrl", "digit", "graph",
171 "print", "punct", "space", "word", "xdigit" };
172
173 static const uschar posix_name_lengths[] = {
174 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
175
176 /* Table of class bit maps for each POSIX class. Each class is formed from a
177 base map, with an optional addition or removal of another map. Then, for some
178 classes, there is some additional tweaking: for [:blank:] the vertical space
179 characters are removed, and for [:alpha:] and [:alnum:] the underscore
180 character is removed. The triples in the table consist of the base map offset,
181 second map offset or -1 if no second map, and a non-negative value for map
182 addition or a negative value for map subtraction (if there are two maps). The
183 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
184 remove vertical space characters, 2 => remove underscore. */
185
186 static const int posix_class_maps[] = {
187 cbit_word, cbit_digit, -2, /* alpha */
188 cbit_lower, -1, 0, /* lower */
189 cbit_upper, -1, 0, /* upper */
190 cbit_word, -1, 2, /* alnum - word without underscore */
191 cbit_print, cbit_cntrl, 0, /* ascii */
192 cbit_space, -1, 1, /* blank - a GNU extension */
193 cbit_cntrl, -1, 0, /* cntrl */
194 cbit_digit, -1, 0, /* digit */
195 cbit_graph, -1, 0, /* graph */
196 cbit_print, -1, 0, /* print */
197 cbit_punct, -1, 0, /* punct */
198 cbit_space, -1, 0, /* space */
199 cbit_word, -1, 0, /* word - a Perl extension */
200 cbit_xdigit,-1, 0 /* xdigit */
201 };
202
203
204 #define STRING(a) # a
205 #define XSTRING(s) STRING(s)
206
207 /* The texts of compile-time error messages. These are "char *" because they
208 are passed to the outside world. Do not ever re-use any error number, because
209 they are documented. Always add a new error instead. Messages marked DEAD below
210 are no longer used. */
211
212 static const char *error_texts[] = {
213 "no error",
214 "\\ at end of pattern",
215 "\\c at end of pattern",
216 "unrecognized character follows \\",
217 "numbers out of order in {} quantifier",
218 /* 5 */
219 "number too big in {} quantifier",
220 "missing terminating ] for character class",
221 "invalid escape sequence in character class",
222 "range out of order in character class",
223 "nothing to repeat",
224 /* 10 */
225 "operand of unlimited repeat could match the empty string", /** DEAD **/
226 "internal error: unexpected repeat",
227 "unrecognized character after (?",
228 "POSIX named classes are supported only within a class",
229 "missing )",
230 /* 15 */
231 "reference to non-existent subpattern",
232 "erroffset passed as NULL",
233 "unknown option bit(s) set",
234 "missing ) after comment",
235 "parentheses nested too deeply", /** DEAD **/
236 /* 20 */
237 "regular expression is too large",
238 "failed to get memory",
239 "unmatched parentheses",
240 "internal error: code overflow",
241 "unrecognized character after (?<",
242 /* 25 */
243 "lookbehind assertion is not fixed length",
244 "malformed number or name after (?(",
245 "conditional group contains more than two branches",
246 "assertion expected after (?(",
247 "(?R or (?[+-]digits must be followed by )",
248 /* 30 */
249 "unknown POSIX class name",
250 "POSIX collating elements are not supported",
251 "this version of PCRE is not compiled with PCRE_UTF8 support",
252 "spare error", /** DEAD **/
253 "character value in \\x{...} sequence is too large",
254 /* 35 */
255 "invalid condition (?(0)",
256 "\\C not allowed in lookbehind assertion",
257 "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
258 "number after (?C is > 255",
259 "closing ) for (?C expected",
260 /* 40 */
261 "recursive call could loop indefinitely",
262 "unrecognized character after (?P",
263 "syntax error in subpattern name (missing terminator)",
264 "two named subpatterns have the same name",
265 "invalid UTF-8 string",
266 /* 45 */
267 "support for \\P, \\p, and \\X has not been compiled",
268 "malformed \\P or \\p sequence",
269 "unknown property name after \\P or \\p",
270 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
271 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
272 /* 50 */
273 "repeated subpattern is too long", /** DEAD **/
274 "octal value is greater than \\377 (not in UTF-8 mode)",
275 "internal error: overran compiling workspace",
276 "internal error: previously-checked referenced subpattern not found",
277 "DEFINE group contains more than one branch",
278 /* 55 */
279 "repeating a DEFINE group is not allowed",
280 "inconsistent NEWLINE options",
281 "\\g is not followed by a braced name or an optionally braced non-zero number",
282 "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number",
283 "(*VERB) with an argument is not supported",
284 /* 60 */
285 "(*VERB) not recognized",
286 "number is too big"
287 };
288
289
290 /* Table to identify digits and hex digits. This is used when compiling
291 patterns. Note that the tables in chartables are dependent on the locale, and
292 may mark arbitrary characters as digits - but the PCRE compiling code expects
293 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
294 a private table here. It costs 256 bytes, but it is a lot faster than doing
295 character value tests (at least in some simple cases I timed), and in some
296 applications one wants PCRE to compile efficiently as well as match
297 efficiently.
298
299 For convenience, we use the same bit definitions as in chartables:
300
301 0x04 decimal digit
302 0x08 hexadecimal digit
303
304 Then we can use ctype_digit and ctype_xdigit in the code. */
305
306 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
307 static const unsigned char digitab[] =
308 {
309 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
310 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
311 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
312 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
313 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
314 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
315 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
316 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
317 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
318 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
319 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
320 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
321 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
322 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
323 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
324 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
325 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
326 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
327 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
328 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
329 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
330 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
331 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
332 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
333 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
334 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
335 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
336 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
337 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
338 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
339 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
340 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
341
342 #else /* This is the "abnormal" case, for EBCDIC systems */
343 static const unsigned char digitab[] =
344 {
345 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
346 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
347 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
348 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
349 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
350 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
351 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
352 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
353 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
354 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
355 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
356 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
357 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
358 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
359 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
360 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
361 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
362 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
363 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
364 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
365 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
366 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
367 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
368 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
369 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
370 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
371 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
372 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
373 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
374 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
375 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
376 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
377
378 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
379 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
380 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
381 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
382 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
383 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
384 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
385 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
386 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
387 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
388 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
389 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
390 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
391 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
392 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
393 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
394 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
395 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
396 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
397 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
398 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
399 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
400 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
401 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
402 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
403 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
404 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
405 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
406 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
407 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
408 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
409 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
410 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
411 #endif
412
413
414 /* Definition to allow mutual recursion */
415
416 static BOOL
417 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
418 int *, int *, branch_chain *, compile_data *, int *);
419
420
421
422 /*************************************************
423 * Handle escapes *
424 *************************************************/
425
426 /* This function is called when a \ has been encountered. It either returns a
427 positive value for a simple escape such as \n, or a negative value which
428 encodes one of the more complicated things such as \d. A backreference to group
429 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
430 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
431 ptr is pointing at the \. On exit, it is on the final character of the escape
432 sequence.
433
434 Arguments:
435 ptrptr points to the pattern position pointer
436 errorcodeptr points to the errorcode variable
437 bracount number of previous extracting brackets
438 options the options bits
439 isclass TRUE if inside a character class
440
441 Returns: zero or positive => a data character
442 negative => a special escape sequence
443 on error, errorcodeptr is set
444 */
445
446 static int
447 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
448 int options, BOOL isclass)
449 {
450 BOOL utf8 = (options & PCRE_UTF8) != 0;
451 const uschar *ptr = *ptrptr + 1;
452 int c, i;
453
454 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
455 ptr--; /* Set pointer back to the last byte */
456
457 /* If backslash is at the end of the pattern, it's an error. */
458
459 if (c == 0) *errorcodeptr = ERR1;
460
461 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
462 a table. A non-zero result is something that can be returned immediately.
463 Otherwise further processing may be required. */
464
465 #ifndef EBCDIC /* ASCII coding */
466 else if (c < '0' || c > 'z') {} /* Not alphameric */
467 else if ((i = escapes[c - '0']) != 0) c = i;
468
469 #else /* EBCDIC coding */
470 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
471 else if ((i = escapes[c - 0x48]) != 0) c = i;
472 #endif
473
474 /* Escapes that need further processing, or are illegal. */
475
476 else
477 {
478 const uschar *oldptr;
479 BOOL braced, negated;
480
481 switch (c)
482 {
483 /* A number of Perl escapes are not handled by PCRE. We give an explicit
484 error. */
485
486 case 'l':
487 case 'L':
488 case 'N':
489 case 'u':
490 case 'U':
491 *errorcodeptr = ERR37;
492 break;
493
494 /* \g must be followed by a number, either plain or braced. If positive, it
495 is an absolute backreference. If negative, it is a relative backreference.
496 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
497 reference to a named group. This is part of Perl's movement towards a
498 unified syntax for back references. As this is synonymous with \k{name}, we
499 fudge it up by pretending it really was \k. */
500
501 case 'g':
502 if (ptr[1] == '{')
503 {
504 const uschar *p;
505 for (p = ptr+2; *p != 0 && *p != '}'; p++)
506 if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
507 if (*p != 0 && *p != '}')
508 {
509 c = -ESC_k;
510 break;
511 }
512 braced = TRUE;
513 ptr++;
514 }
515 else braced = FALSE;
516
517 if (ptr[1] == '-')
518 {
519 negated = TRUE;
520 ptr++;
521 }
522 else negated = FALSE;
523
524 c = 0;
525 while ((digitab[ptr[1]] & ctype_digit) != 0)
526 c = c * 10 + *(++ptr) - '0';
527
528 if (c < 0)
529 {
530 *errorcodeptr = ERR61;
531 break;
532 }
533
534 if (c == 0 || (braced && *(++ptr) != '}'))
535 {
536 *errorcodeptr = ERR57;
537 break;
538 }
539
540 if (negated)
541 {
542 if (c > bracount)
543 {
544 *errorcodeptr = ERR15;
545 break;
546 }
547 c = bracount - (c - 1);
548 }
549
550 c = -(ESC_REF + c);
551 break;
552
553 /* The handling of escape sequences consisting of a string of digits
554 starting with one that is not zero is not straightforward. By experiment,
555 the way Perl works seems to be as follows:
556
557 Outside a character class, the digits are read as a decimal number. If the
558 number is less than 10, or if there are that many previous extracting
559 left brackets, then it is a back reference. Otherwise, up to three octal
560 digits are read to form an escaped byte. Thus \123 is likely to be octal
561 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
562 value is greater than 377, the least significant 8 bits are taken. Inside a
563 character class, \ followed by a digit is always an octal number. */
564
565 case '1': case '2': case '3': case '4': case '5':
566 case '6': case '7': case '8': case '9':
567
568 if (!isclass)
569 {
570 oldptr = ptr;
571 c -= '0';
572 while ((digitab[ptr[1]] & ctype_digit) != 0)
573 c = c * 10 + *(++ptr) - '0';
574 if (c < 0)
575 {
576 *errorcodeptr = ERR61;
577 break;
578 }
579 if (c < 10 || c <= bracount)
580 {
581 c = -(ESC_REF + c);
582 break;
583 }
584 ptr = oldptr; /* Put the pointer back and fall through */
585 }
586
587 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
588 generates a binary zero byte and treats the digit as a following literal.
589 Thus we have to pull back the pointer by one. */
590
591 if ((c = *ptr) >= '8')
592 {
593 ptr--;
594 c = 0;
595 break;
596 }
597
598 /* \0 always starts an octal number, but we may drop through to here with a
599 larger first octal digit. The original code used just to take the least
600 significant 8 bits of octal numbers (I think this is what early Perls used
601 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
602 than 3 octal digits. */
603
604 case '0':
605 c -= '0';
606 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
607 c = c * 8 + *(++ptr) - '0';
608 if (!utf8 && c > 255) *errorcodeptr = ERR51;
609 break;
610
611 /* \x is complicated. \x{ddd} is a character number which can be greater
612 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
613 treated as a data character. */
614
615 case 'x':
616 if (ptr[1] == '{')
617 {
618 const uschar *pt = ptr + 2;
619 int count = 0;
620
621 c = 0;
622 while ((digitab[*pt] & ctype_xdigit) != 0)
623 {
624 register int cc = *pt++;
625 if (c == 0 && cc == '0') continue; /* Leading zeroes */
626 count++;
627
628 #ifndef EBCDIC /* ASCII coding */
629 if (cc >= 'a') cc -= 32; /* Convert to upper case */
630 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
631 #else /* EBCDIC coding */
632 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
633 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
634 #endif
635 }
636
637 if (*pt == '}')
638 {
639 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
640 ptr = pt;
641 break;
642 }
643
644 /* If the sequence of hex digits does not end with '}', then we don't
645 recognize this construct; fall through to the normal \x handling. */
646 }
647
648 /* Read just a single-byte hex-defined char */
649
650 c = 0;
651 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
652 {
653 int cc; /* Some compilers don't like ++ */
654 cc = *(++ptr); /* in initializers */
655 #ifndef EBCDIC /* ASCII coding */
656 if (cc >= 'a') cc -= 32; /* Convert to upper case */
657 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
658 #else /* EBCDIC coding */
659 if (cc <= 'z') cc += 64; /* Convert to upper case */
660 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
661 #endif
662 }
663 break;
664
665 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
666 This coding is ASCII-specific, but then the whole concept of \cx is
667 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
668
669 case 'c':
670 c = *(++ptr);
671 if (c == 0)
672 {
673 *errorcodeptr = ERR2;
674 break;
675 }
676
677 #ifndef EBCDIC /* ASCII coding */
678 if (c >= 'a' && c <= 'z') c -= 32;
679 c ^= 0x40;
680 #else /* EBCDIC coding */
681 if (c >= 'a' && c <= 'z') c += 64;
682 c ^= 0xC0;
683 #endif
684 break;
685
686 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
687 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
688 for Perl compatibility, it is a literal. This code looks a bit odd, but
689 there used to be some cases other than the default, and there may be again
690 in future, so I haven't "optimized" it. */
691
692 default:
693 if ((options & PCRE_EXTRA) != 0) switch(c)
694 {
695 default:
696 *errorcodeptr = ERR3;
697 break;
698 }
699 break;
700 }
701 }
702
703 *ptrptr = ptr;
704 return c;
705 }
706
707
708
709 #ifdef SUPPORT_UCP
710 /*************************************************
711 * Handle \P and \p *
712 *************************************************/
713
714 /* This function is called after \P or \p has been encountered, provided that
715 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
716 pointing at the P or p. On exit, it is pointing at the final character of the
717 escape sequence.
718
719 Argument:
720 ptrptr points to the pattern position pointer
721 negptr points to a boolean that is set TRUE for negation else FALSE
722 dptr points to an int that is set to the detailed property value
723 errorcodeptr points to the error code variable
724
725 Returns: type value from ucp_type_table, or -1 for an invalid type
726 */
727
728 static int
729 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
730 {
731 int c, i, bot, top;
732 const uschar *ptr = *ptrptr;
733 char name[32];
734
735 c = *(++ptr);
736 if (c == 0) goto ERROR_RETURN;
737
738 *negptr = FALSE;
739
740 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
741 negation. */
742
743 if (c == '{')
744 {
745 if (ptr[1] == '^')
746 {
747 *negptr = TRUE;
748 ptr++;
749 }
750 for (i = 0; i < (int)sizeof(name) - 1; i++)
751 {
752 c = *(++ptr);
753 if (c == 0) goto ERROR_RETURN;
754 if (c == '}') break;
755 name[i] = c;
756 }
757 if (c !='}') goto ERROR_RETURN;
758 name[i] = 0;
759 }
760
761 /* Otherwise there is just one following character */
762
763 else
764 {
765 name[0] = c;
766 name[1] = 0;
767 }
768
769 *ptrptr = ptr;
770
771 /* Search for a recognized property name using binary chop */
772
773 bot = 0;
774 top = _pcre_utt_size;
775
776 while (bot < top)
777 {
778 i = (bot + top) >> 1;
779 c = strcmp(name, _pcre_utt[i].name);
780 if (c == 0)
781 {
782 *dptr = _pcre_utt[i].value;
783 return _pcre_utt[i].type;
784 }
785 if (c > 0) bot = i + 1; else top = i;
786 }
787
788 *errorcodeptr = ERR47;
789 *ptrptr = ptr;
790 return -1;
791
792 ERROR_RETURN:
793 *errorcodeptr = ERR46;
794 *ptrptr = ptr;
795 return -1;
796 }
797 #endif
798
799
800
801
802 /*************************************************
803 * Check for counted repeat *
804 *************************************************/
805
806 /* This function is called when a '{' is encountered in a place where it might
807 start a quantifier. It looks ahead to see if it really is a quantifier or not.
808 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
809 where the ddds are digits.
810
811 Arguments:
812 p pointer to the first char after '{'
813
814 Returns: TRUE or FALSE
815 */
816
817 static BOOL
818 is_counted_repeat(const uschar *p)
819 {
820 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
821 while ((digitab[*p] & ctype_digit) != 0) p++;
822 if (*p == '}') return TRUE;
823
824 if (*p++ != ',') return FALSE;
825 if (*p == '}') return TRUE;
826
827 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
828 while ((digitab[*p] & ctype_digit) != 0) p++;
829
830 return (*p == '}');
831 }
832
833
834
835 /*************************************************
836 * Read repeat counts *
837 *************************************************/
838
839 /* Read an item of the form {n,m} and return the values. This is called only
840 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
841 so the syntax is guaranteed to be correct, but we need to check the values.
842
843 Arguments:
844 p pointer to first char after '{'
845 minp pointer to int for min
846 maxp pointer to int for max
847 returned as -1 if no max
848 errorcodeptr points to error code variable
849
850 Returns: pointer to '}' on success;
851 current ptr on error, with errorcodeptr set non-zero
852 */
853
854 static const uschar *
855 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
856 {
857 int min = 0;
858 int max = -1;
859
860 /* Read the minimum value and do a paranoid check: a negative value indicates
861 an integer overflow. */
862
863 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
864 if (min < 0 || min > 65535)
865 {
866 *errorcodeptr = ERR5;
867 return p;
868 }
869
870 /* Read the maximum value if there is one, and again do a paranoid on its size.
871 Also, max must not be less than min. */
872
873 if (*p == '}') max = min; else
874 {
875 if (*(++p) != '}')
876 {
877 max = 0;
878 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
879 if (max < 0 || max > 65535)
880 {
881 *errorcodeptr = ERR5;
882 return p;
883 }
884 if (max < min)
885 {
886 *errorcodeptr = ERR4;
887 return p;
888 }
889 }
890 }
891
892 /* Fill in the required variables, and pass back the pointer to the terminating
893 '}'. */
894
895 *minp = min;
896 *maxp = max;
897 return p;
898 }
899
900
901
902 /*************************************************
903 * Find forward referenced subpattern *
904 *************************************************/
905
906 /* This function scans along a pattern's text looking for capturing
907 subpatterns, and counting them. If it finds a named pattern that matches the
908 name it is given, it returns its number. Alternatively, if the name is NULL, it
909 returns when it reaches a given numbered subpattern. This is used for forward
910 references to subpatterns. We know that if (?P< is encountered, the name will
911 be terminated by '>' because that is checked in the first pass.
912
913 Arguments:
914 ptr current position in the pattern
915 count current count of capturing parens so far encountered
916 name name to seek, or NULL if seeking a numbered subpattern
917 lorn name length, or subpattern number if name is NULL
918 xmode TRUE if we are in /x mode
919
920 Returns: the number of the named subpattern, or -1 if not found
921 */
922
923 static int
924 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
925 BOOL xmode)
926 {
927 const uschar *thisname;
928
929 for (; *ptr != 0; ptr++)
930 {
931 int term;
932
933 /* Skip over backslashed characters and also entire \Q...\E */
934
935 if (*ptr == '\\')
936 {
937 if (*(++ptr) == 0) return -1;
938 if (*ptr == 'Q') for (;;)
939 {
940 while (*(++ptr) != 0 && *ptr != '\\');
941 if (*ptr == 0) return -1;
942 if (*(++ptr) == 'E') break;
943 }
944 continue;
945 }
946
947 /* Skip over character classes */
948
949 if (*ptr == '[')
950 {
951 while (*(++ptr) != ']')
952 {
953 if (*ptr == '\\')
954 {
955 if (*(++ptr) == 0) return -1;
956 if (*ptr == 'Q') for (;;)
957 {
958 while (*(++ptr) != 0 && *ptr != '\\');
959 if (*ptr == 0) return -1;
960 if (*(++ptr) == 'E') break;
961 }
962 continue;
963 }
964 }
965 continue;
966 }
967
968 /* Skip comments in /x mode */
969
970 if (xmode && *ptr == '#')
971 {
972 while (*(++ptr) != 0 && *ptr != '\n');
973 if (*ptr == 0) return -1;
974 continue;
975 }
976
977 /* An opening parens must now be a real metacharacter */
978
979 if (*ptr != '(') continue;
980 if (ptr[1] != '?' && ptr[1] != '*')
981 {
982 count++;
983 if (name == NULL && count == lorn) return count;
984 continue;
985 }
986
987 ptr += 2;
988 if (*ptr == 'P') ptr++; /* Allow optional P */
989
990 /* We have to disambiguate (?<! and (?<= from (?<name> */
991
992 if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
993 *ptr != '\'')
994 continue;
995
996 count++;
997
998 if (name == NULL && count == lorn) return count;
999 term = *ptr++;
1000 if (term == '<') term = '>';
1001 thisname = ptr;
1002 while (*ptr != term) ptr++;
1003 if (name != NULL && lorn == ptr - thisname &&
1004 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1005 return count;
1006 }
1007
1008 return -1;
1009 }
1010
1011
1012
1013 /*************************************************
1014 * Find first significant op code *
1015 *************************************************/
1016
1017 /* This is called by several functions that scan a compiled expression looking
1018 for a fixed first character, or an anchoring op code etc. It skips over things
1019 that do not influence this. For some calls, a change of option is important.
1020 For some calls, it makes sense to skip negative forward and all backward
1021 assertions, and also the \b assertion; for others it does not.
1022
1023 Arguments:
1024 code pointer to the start of the group
1025 options pointer to external options
1026 optbit the option bit whose changing is significant, or
1027 zero if none are
1028 skipassert TRUE if certain assertions are to be skipped
1029
1030 Returns: pointer to the first significant opcode
1031 */
1032
1033 static const uschar*
1034 first_significant_code(const uschar *code, int *options, int optbit,
1035 BOOL skipassert)
1036 {
1037 for (;;)
1038 {
1039 switch ((int)*code)
1040 {
1041 case OP_OPT:
1042 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1043 *options = (int)code[1];
1044 code += 2;
1045 break;
1046
1047 case OP_ASSERT_NOT:
1048 case OP_ASSERTBACK:
1049 case OP_ASSERTBACK_NOT:
1050 if (!skipassert) return code;
1051 do code += GET(code, 1); while (*code == OP_ALT);
1052 code += _pcre_OP_lengths[*code];
1053 break;
1054
1055 case OP_WORD_BOUNDARY:
1056 case OP_NOT_WORD_BOUNDARY:
1057 if (!skipassert) return code;
1058 /* Fall through */
1059
1060 case OP_CALLOUT:
1061 case OP_CREF:
1062 case OP_RREF:
1063 case OP_DEF:
1064 code += _pcre_OP_lengths[*code];
1065 break;
1066
1067 default:
1068 return code;
1069 }
1070 }
1071 /* Control never reaches here */
1072 }
1073
1074
1075
1076
1077 /*************************************************
1078 * Find the fixed length of a pattern *
1079 *************************************************/
1080
1081 /* Scan a pattern and compute the fixed length of subject that will match it,
1082 if the length is fixed. This is needed for dealing with backward assertions.
1083 In UTF8 mode, the result is in characters rather than bytes.
1084
1085 Arguments:
1086 code points to the start of the pattern (the bracket)
1087 options the compiling options
1088
1089 Returns: the fixed length, or -1 if there is no fixed length,
1090 or -2 if \C was encountered
1091 */
1092
1093 static int
1094 find_fixedlength(uschar *code, int options)
1095 {
1096 int length = -1;
1097
1098 register int branchlength = 0;
1099 register uschar *cc = code + 1 + LINK_SIZE;
1100
1101 /* Scan along the opcodes for this branch. If we get to the end of the
1102 branch, check the length against that of the other branches. */
1103
1104 for (;;)
1105 {
1106 int d;
1107 register int op = *cc;
1108
1109 switch (op)
1110 {
1111 case OP_CBRA:
1112 case OP_BRA:
1113 case OP_ONCE:
1114 case OP_COND:
1115 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1116 if (d < 0) return d;
1117 branchlength += d;
1118 do cc += GET(cc, 1); while (*cc == OP_ALT);
1119 cc += 1 + LINK_SIZE;
1120 break;
1121
1122 /* Reached end of a branch; if it's a ket it is the end of a nested
1123 call. If it's ALT it is an alternation in a nested call. If it is
1124 END it's the end of the outer call. All can be handled by the same code. */
1125
1126 case OP_ALT:
1127 case OP_KET:
1128 case OP_KETRMAX:
1129 case OP_KETRMIN:
1130 case OP_END:
1131 if (length < 0) length = branchlength;
1132 else if (length != branchlength) return -1;
1133 if (*cc != OP_ALT) return length;
1134 cc += 1 + LINK_SIZE;
1135 branchlength = 0;
1136 break;
1137
1138 /* Skip over assertive subpatterns */
1139
1140 case OP_ASSERT:
1141 case OP_ASSERT_NOT:
1142 case OP_ASSERTBACK:
1143 case OP_ASSERTBACK_NOT:
1144 do cc += GET(cc, 1); while (*cc == OP_ALT);
1145 /* Fall through */
1146
1147 /* Skip over things that don't match chars */
1148
1149 case OP_REVERSE:
1150 case OP_CREF:
1151 case OP_RREF:
1152 case OP_DEF:
1153 case OP_OPT:
1154 case OP_CALLOUT:
1155 case OP_SOD:
1156 case OP_SOM:
1157 case OP_EOD:
1158 case OP_EODN:
1159 case OP_CIRC:
1160 case OP_DOLL:
1161 case OP_NOT_WORD_BOUNDARY:
1162 case OP_WORD_BOUNDARY:
1163 cc += _pcre_OP_lengths[*cc];
1164 break;
1165
1166 /* Handle literal characters */
1167
1168 case OP_CHAR:
1169 case OP_CHARNC:
1170 case OP_NOT:
1171 branchlength++;
1172 cc += 2;
1173 #ifdef SUPPORT_UTF8
1174 if ((options & PCRE_UTF8) != 0)
1175 {
1176 while ((*cc & 0xc0) == 0x80) cc++;
1177 }
1178 #endif
1179 break;
1180
1181 /* Handle exact repetitions. The count is already in characters, but we
1182 need to skip over a multibyte character in UTF8 mode. */
1183
1184 case OP_EXACT:
1185 branchlength += GET2(cc,1);
1186 cc += 4;
1187 #ifdef SUPPORT_UTF8
1188 if ((options & PCRE_UTF8) != 0)
1189 {
1190 while((*cc & 0x80) == 0x80) cc++;
1191 }
1192 #endif
1193 break;
1194
1195 case OP_TYPEEXACT:
1196 branchlength += GET2(cc,1);
1197 cc += 4;
1198 break;
1199
1200 /* Handle single-char matchers */
1201
1202 case OP_PROP:
1203 case OP_NOTPROP:
1204 cc += 2;
1205 /* Fall through */
1206
1207 case OP_NOT_DIGIT:
1208 case OP_DIGIT:
1209 case OP_NOT_WHITESPACE:
1210 case OP_WHITESPACE:
1211 case OP_NOT_WORDCHAR:
1212 case OP_WORDCHAR:
1213 case OP_ANY:
1214 branchlength++;
1215 cc++;
1216 break;
1217
1218 /* The single-byte matcher isn't allowed */
1219
1220 case OP_ANYBYTE:
1221 return -2;
1222
1223 /* Check a class for variable quantification */
1224
1225 #ifdef SUPPORT_UTF8
1226 case OP_XCLASS:
1227 cc += GET(cc, 1) - 33;
1228 /* Fall through */
1229 #endif
1230
1231 case OP_CLASS:
1232 case OP_NCLASS:
1233 cc += 33;
1234
1235 switch (*cc)
1236 {
1237 case OP_CRSTAR:
1238 case OP_CRMINSTAR:
1239 case OP_CRQUERY:
1240 case OP_CRMINQUERY:
1241 return -1;
1242
1243 case OP_CRRANGE:
1244 case OP_CRMINRANGE:
1245 if (GET2(cc,1) != GET2(cc,3)) return -1;
1246 branchlength += GET2(cc,1);
1247 cc += 5;
1248 break;
1249
1250 default:
1251 branchlength++;
1252 }
1253 break;
1254
1255 /* Anything else is variable length */
1256
1257 default:
1258 return -1;
1259 }
1260 }
1261 /* Control never gets here */
1262 }
1263
1264
1265
1266
1267 /*************************************************
1268 * Scan compiled regex for numbered bracket *
1269 *************************************************/
1270
1271 /* This little function scans through a compiled pattern until it finds a
1272 capturing bracket with the given number.
1273
1274 Arguments:
1275 code points to start of expression
1276 utf8 TRUE in UTF-8 mode
1277 number the required bracket number
1278
1279 Returns: pointer to the opcode for the bracket, or NULL if not found
1280 */
1281
1282 static const uschar *
1283 find_bracket(const uschar *code, BOOL utf8, int number)
1284 {
1285 for (;;)
1286 {
1287 register int c = *code;
1288 if (c == OP_END) return NULL;
1289
1290 /* XCLASS is used for classes that cannot be represented just by a bit
1291 map. This includes negated single high-valued characters. The length in
1292 the table is zero; the actual length is stored in the compiled code. */
1293
1294 if (c == OP_XCLASS) code += GET(code, 1);
1295
1296 /* Handle capturing bracket */
1297
1298 else if (c == OP_CBRA)
1299 {
1300 int n = GET2(code, 1+LINK_SIZE);
1301 if (n == number) return (uschar *)code;
1302 code += _pcre_OP_lengths[c];
1303 }
1304
1305 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1306 a multi-byte character. The length in the table is a minimum, so we have to
1307 arrange to skip the extra bytes. */
1308
1309 else
1310 {
1311 code += _pcre_OP_lengths[c];
1312 #ifdef SUPPORT_UTF8
1313 if (utf8) switch(c)
1314 {
1315 case OP_CHAR:
1316 case OP_CHARNC:
1317 case OP_EXACT:
1318 case OP_UPTO:
1319 case OP_MINUPTO:
1320 case OP_POSUPTO:
1321 case OP_STAR:
1322 case OP_MINSTAR:
1323 case OP_POSSTAR:
1324 case OP_PLUS:
1325 case OP_MINPLUS:
1326 case OP_POSPLUS:
1327 case OP_QUERY:
1328 case OP_MINQUERY:
1329 case OP_POSQUERY:
1330 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1331 break;
1332 }
1333 #endif
1334 }
1335 }
1336 }
1337
1338
1339
1340 /*************************************************
1341 * Scan compiled regex for recursion reference *
1342 *************************************************/
1343
1344 /* This little function scans through a compiled pattern until it finds an
1345 instance of OP_RECURSE.
1346
1347 Arguments:
1348 code points to start of expression
1349 utf8 TRUE in UTF-8 mode
1350
1351 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1352 */
1353
1354 static const uschar *
1355 find_recurse(const uschar *code, BOOL utf8)
1356 {
1357 for (;;)
1358 {
1359 register int c = *code;
1360 if (c == OP_END) return NULL;
1361 if (c == OP_RECURSE) return code;
1362
1363 /* XCLASS is used for classes that cannot be represented just by a bit
1364 map. This includes negated single high-valued characters. The length in
1365 the table is zero; the actual length is stored in the compiled code. */
1366
1367 if (c == OP_XCLASS) code += GET(code, 1);
1368
1369 /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1370 that are followed by a character may be followed by a multi-byte character.
1371 The length in the table is a minimum, so we have to arrange to skip the extra
1372 bytes. */
1373
1374 else
1375 {
1376 code += _pcre_OP_lengths[c];
1377 #ifdef SUPPORT_UTF8
1378 if (utf8) switch(c)
1379 {
1380 case OP_CHAR:
1381 case OP_CHARNC:
1382 case OP_EXACT:
1383 case OP_UPTO:
1384 case OP_MINUPTO:
1385 case OP_POSUPTO:
1386 case OP_STAR:
1387 case OP_MINSTAR:
1388 case OP_POSSTAR:
1389 case OP_PLUS:
1390 case OP_MINPLUS:
1391 case OP_POSPLUS:
1392 case OP_QUERY:
1393 case OP_MINQUERY:
1394 case OP_POSQUERY:
1395 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1396 break;
1397 }
1398 #endif
1399 }
1400 }
1401 }
1402
1403
1404
1405 /*************************************************
1406 * Scan compiled branch for non-emptiness *
1407 *************************************************/
1408
1409 /* This function scans through a branch of a compiled pattern to see whether it
1410 can match the empty string or not. It is called from could_be_empty()
1411 below and from compile_branch() when checking for an unlimited repeat of a
1412 group that can match nothing. Note that first_significant_code() skips over
1413 assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1414 struck an inner bracket whose current branch will already have been scanned.
1415
1416 Arguments:
1417 code points to start of search
1418 endcode points to where to stop
1419 utf8 TRUE if in UTF8 mode
1420
1421 Returns: TRUE if what is matched could be empty
1422 */
1423
1424 static BOOL
1425 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1426 {
1427 register int c;
1428 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1429 code < endcode;
1430 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1431 {
1432 const uschar *ccode;
1433
1434 c = *code;
1435
1436 /* Groups with zero repeats can of course be empty; skip them. */
1437
1438 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1439 {
1440 code += _pcre_OP_lengths[c];
1441 do code += GET(code, 1); while (*code == OP_ALT);
1442 c = *code;
1443 continue;
1444 }
1445
1446 /* For other groups, scan the branches. */
1447
1448 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1449 {
1450 BOOL empty_branch;
1451 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1452
1453 /* Scan a closed bracket */
1454
1455 empty_branch = FALSE;
1456 do
1457 {
1458 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1459 empty_branch = TRUE;
1460 code += GET(code, 1);
1461 }
1462 while (*code == OP_ALT);
1463 if (!empty_branch) return FALSE; /* All branches are non-empty */
1464 c = *code;
1465 continue;
1466 }
1467
1468 /* Handle the other opcodes */
1469
1470 switch (c)
1471 {
1472 /* Check for quantifiers after a class */
1473
1474 #ifdef SUPPORT_UTF8
1475 case OP_XCLASS:
1476 ccode = code + GET(code, 1);
1477 goto CHECK_CLASS_REPEAT;
1478 #endif
1479
1480 case OP_CLASS:
1481 case OP_NCLASS:
1482 ccode = code + 33;
1483
1484 #ifdef SUPPORT_UTF8
1485 CHECK_CLASS_REPEAT:
1486 #endif
1487
1488 switch (*ccode)
1489 {
1490 case OP_CRSTAR: /* These could be empty; continue */
1491 case OP_CRMINSTAR:
1492 case OP_CRQUERY:
1493 case OP_CRMINQUERY:
1494 break;
1495
1496 default: /* Non-repeat => class must match */
1497 case OP_CRPLUS: /* These repeats aren't empty */
1498 case OP_CRMINPLUS:
1499 return FALSE;
1500
1501 case OP_CRRANGE:
1502 case OP_CRMINRANGE:
1503 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1504 break;
1505 }
1506 break;
1507
1508 /* Opcodes that must match a character */
1509
1510 case OP_PROP:
1511 case OP_NOTPROP:
1512 case OP_EXTUNI:
1513 case OP_NOT_DIGIT:
1514 case OP_DIGIT:
1515 case OP_NOT_WHITESPACE:
1516 case OP_WHITESPACE:
1517 case OP_NOT_WORDCHAR:
1518 case OP_WORDCHAR:
1519 case OP_ANY:
1520 case OP_ANYBYTE:
1521 case OP_CHAR:
1522 case OP_CHARNC:
1523 case OP_NOT:
1524 case OP_PLUS:
1525 case OP_MINPLUS:
1526 case OP_POSPLUS:
1527 case OP_EXACT:
1528 case OP_NOTPLUS:
1529 case OP_NOTMINPLUS:
1530 case OP_NOTPOSPLUS:
1531 case OP_NOTEXACT:
1532 case OP_TYPEPLUS:
1533 case OP_TYPEMINPLUS:
1534 case OP_TYPEPOSPLUS:
1535 case OP_TYPEEXACT:
1536 return FALSE;
1537
1538 /* End of branch */
1539
1540 case OP_KET:
1541 case OP_KETRMAX:
1542 case OP_KETRMIN:
1543 case OP_ALT:
1544 return TRUE;
1545
1546 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1547 MINUPTO, and POSUPTO may be followed by a multibyte character */
1548
1549 #ifdef SUPPORT_UTF8
1550 case OP_STAR:
1551 case OP_MINSTAR:
1552 case OP_POSSTAR:
1553 case OP_QUERY:
1554 case OP_MINQUERY:
1555 case OP_POSQUERY:
1556 case OP_UPTO:
1557 case OP_MINUPTO:
1558 case OP_POSUPTO:
1559 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1560 break;
1561 #endif
1562 }
1563 }
1564
1565 return TRUE;
1566 }
1567
1568
1569
1570 /*************************************************
1571 * Scan compiled regex for non-emptiness *
1572 *************************************************/
1573
1574 /* This function is called to check for left recursive calls. We want to check
1575 the current branch of the current pattern to see if it could match the empty
1576 string. If it could, we must look outwards for branches at other levels,
1577 stopping when we pass beyond the bracket which is the subject of the recursion.
1578
1579 Arguments:
1580 code points to start of the recursion
1581 endcode points to where to stop (current RECURSE item)
1582 bcptr points to the chain of current (unclosed) branch starts
1583 utf8 TRUE if in UTF-8 mode
1584
1585 Returns: TRUE if what is matched could be empty
1586 */
1587
1588 static BOOL
1589 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1590 BOOL utf8)
1591 {
1592 while (bcptr != NULL && bcptr->current >= code)
1593 {
1594 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1595 bcptr = bcptr->outer;
1596 }
1597 return TRUE;
1598 }
1599
1600
1601
1602 /*************************************************
1603 * Check for POSIX class syntax *
1604 *************************************************/
1605
1606 /* This function is called when the sequence "[:" or "[." or "[=" is
1607 encountered in a character class. It checks whether this is followed by an
1608 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1609 ".]" or "=]".
1610
1611 Arguments:
1612 ptr pointer to the initial [
1613 endptr where to return the end pointer
1614 cd pointer to compile data
1615
1616 Returns: TRUE or FALSE
1617 */
1618
1619 static BOOL
1620 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1621 {
1622 int terminator; /* Don't combine these lines; the Solaris cc */
1623 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1624 if (*(++ptr) == '^') ptr++;
1625 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1626 if (*ptr == terminator && ptr[1] == ']')
1627 {
1628 *endptr = ptr;
1629 return TRUE;
1630 }
1631 return FALSE;
1632 }
1633
1634
1635
1636
1637 /*************************************************
1638 * Check POSIX class name *
1639 *************************************************/
1640
1641 /* This function is called to check the name given in a POSIX-style class entry
1642 such as [:alnum:].
1643
1644 Arguments:
1645 ptr points to the first letter
1646 len the length of the name
1647
1648 Returns: a value representing the name, or -1 if unknown
1649 */
1650
1651 static int
1652 check_posix_name(const uschar *ptr, int len)
1653 {
1654 register int yield = 0;
1655 while (posix_name_lengths[yield] != 0)
1656 {
1657 if (len == posix_name_lengths[yield] &&
1658 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1659 yield++;
1660 }
1661 return -1;
1662 }
1663
1664
1665 /*************************************************
1666 * Adjust OP_RECURSE items in repeated group *
1667 *************************************************/
1668
1669 /* OP_RECURSE items contain an offset from the start of the regex to the group
1670 that is referenced. This means that groups can be replicated for fixed
1671 repetition simply by copying (because the recursion is allowed to refer to
1672 earlier groups that are outside the current group). However, when a group is
1673 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1674 it, after it has been compiled. This means that any OP_RECURSE items within it
1675 that refer to the group itself or any contained groups have to have their
1676 offsets adjusted. That is one of the jobs of this function. Before it is called,
1677 the partially compiled regex must be temporarily terminated with OP_END.
1678
1679 This function has been extended with the possibility of forward references for
1680 recursions and subroutine calls. It must also check the list of such references
1681 for the group we are dealing with. If it finds that one of the recursions in
1682 the current group is on this list, it adjusts the offset in the list, not the
1683 value in the reference (which is a group number).
1684
1685 Arguments:
1686 group points to the start of the group
1687 adjust the amount by which the group is to be moved
1688 utf8 TRUE in UTF-8 mode
1689 cd contains pointers to tables etc.
1690 save_hwm the hwm forward reference pointer at the start of the group
1691
1692 Returns: nothing
1693 */
1694
1695 static void
1696 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1697 uschar *save_hwm)
1698 {
1699 uschar *ptr = group;
1700 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1701 {
1702 int offset;
1703 uschar *hc;
1704
1705 /* See if this recursion is on the forward reference list. If so, adjust the
1706 reference. */
1707
1708 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1709 {
1710 offset = GET(hc, 0);
1711 if (cd->start_code + offset == ptr + 1)
1712 {
1713 PUT(hc, 0, offset + adjust);
1714 break;
1715 }
1716 }
1717
1718 /* Otherwise, adjust the recursion offset if it's after the start of this
1719 group. */
1720
1721 if (hc >= cd->hwm)
1722 {
1723 offset = GET(ptr, 1);
1724 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1725 }
1726
1727 ptr += 1 + LINK_SIZE;
1728 }
1729 }
1730
1731
1732
1733 /*************************************************
1734 * Insert an automatic callout point *
1735 *************************************************/
1736
1737 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1738 callout points before each pattern item.
1739
1740 Arguments:
1741 code current code pointer
1742 ptr current pattern pointer
1743 cd pointers to tables etc
1744
1745 Returns: new code pointer
1746 */
1747
1748 static uschar *
1749 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1750 {
1751 *code++ = OP_CALLOUT;
1752 *code++ = 255;
1753 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1754 PUT(code, LINK_SIZE, 0); /* Default length */
1755 return code + 2*LINK_SIZE;
1756 }
1757
1758
1759
1760 /*************************************************
1761 * Complete a callout item *
1762 *************************************************/
1763
1764 /* A callout item contains the length of the next item in the pattern, which
1765 we can't fill in till after we have reached the relevant point. This is used
1766 for both automatic and manual callouts.
1767
1768 Arguments:
1769 previous_callout points to previous callout item
1770 ptr current pattern pointer
1771 cd pointers to tables etc
1772
1773 Returns: nothing
1774 */
1775
1776 static void
1777 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1778 {
1779 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1780 PUT(previous_callout, 2 + LINK_SIZE, length);
1781 }
1782
1783
1784
1785 #ifdef SUPPORT_UCP
1786 /*************************************************
1787 * Get othercase range *
1788 *************************************************/
1789
1790 /* This function is passed the start and end of a class range, in UTF-8 mode
1791 with UCP support. It searches up the characters, looking for internal ranges of
1792 characters in the "other" case. Each call returns the next one, updating the
1793 start address.
1794
1795 Arguments:
1796 cptr points to starting character value; updated
1797 d end value
1798 ocptr where to put start of othercase range
1799 odptr where to put end of othercase range
1800
1801 Yield: TRUE when range returned; FALSE when no more
1802 */
1803
1804 static BOOL
1805 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1806 unsigned int *odptr)
1807 {
1808 unsigned int c, othercase, next;
1809
1810 for (c = *cptr; c <= d; c++)
1811 { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1812
1813 if (c > d) return FALSE;
1814
1815 *ocptr = othercase;
1816 next = othercase + 1;
1817
1818 for (++c; c <= d; c++)
1819 {
1820 if (_pcre_ucp_othercase(c) != next) break;
1821 next++;
1822 }
1823
1824 *odptr = next - 1;
1825 *cptr = c;
1826
1827 return TRUE;
1828 }
1829 #endif /* SUPPORT_UCP */
1830
1831
1832
1833 /*************************************************
1834 * Check if auto-possessifying is possible *
1835 *************************************************/
1836
1837 /* This function is called for unlimited repeats of certain items, to see
1838 whether the next thing could possibly match the repeated item. If not, it makes
1839 sense to automatically possessify the repeated item.
1840
1841 Arguments:
1842 op_code the repeated op code
1843 item data for this item, depends on the opcode
1844 utf8 TRUE in UTF-8 mode
1845 utf8_char used for utf8 character bytes, NULL if not relevant
1846 ptr next character in pattern
1847 options options bits
1848 cd contains pointers to tables etc.
1849
1850 Returns: TRUE if possessifying is wanted
1851 */
1852
1853 static BOOL
1854 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1855 const uschar *ptr, int options, compile_data *cd)
1856 {
1857 int next;
1858
1859 /* Skip whitespace and comments in extended mode */
1860
1861 if ((options & PCRE_EXTENDED) != 0)
1862 {
1863 for (;;)
1864 {
1865 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1866 if (*ptr == '#')
1867 {
1868 while (*(++ptr) != 0)
1869 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1870 }
1871 else break;
1872 }
1873 }
1874
1875 /* If the next item is one that we can handle, get its value. A non-negative
1876 value is a character, a negative value is an escape value. */
1877
1878 if (*ptr == '\\')
1879 {
1880 int temperrorcode = 0;
1881 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1882 if (temperrorcode != 0) return FALSE;
1883 ptr++; /* Point after the escape sequence */
1884 }
1885
1886 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1887 {
1888 #ifdef SUPPORT_UTF8
1889 if (utf8) { GETCHARINC(next, ptr); } else
1890 #endif
1891 next = *ptr++;
1892 }
1893
1894 else return FALSE;
1895
1896 /* Skip whitespace and comments in extended mode */
1897
1898 if ((options & PCRE_EXTENDED) != 0)
1899 {
1900 for (;;)
1901 {
1902 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1903 if (*ptr == '#')
1904 {
1905 while (*(++ptr) != 0)
1906 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1907 }
1908 else break;
1909 }
1910 }
1911
1912 /* If the next thing is itself optional, we have to give up. */
1913
1914 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1915 return FALSE;
1916
1917 /* Now compare the next item with the previous opcode. If the previous is a
1918 positive single character match, "item" either contains the character or, if
1919 "item" is greater than 127 in utf8 mode, the character's bytes are in
1920 utf8_char. */
1921
1922
1923 /* Handle cases when the next item is a character. */
1924
1925 if (next >= 0) switch(op_code)
1926 {
1927 case OP_CHAR:
1928 #ifdef SUPPORT_UTF8
1929 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1930 #endif
1931 return item != next;
1932
1933 /* For CHARNC (caseless character) we must check the other case. If we have
1934 Unicode property support, we can use it to test the other case of
1935 high-valued characters. */
1936
1937 case OP_CHARNC:
1938 #ifdef SUPPORT_UTF8
1939 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1940 #endif
1941 if (item == next) return FALSE;
1942 #ifdef SUPPORT_UTF8
1943 if (utf8)
1944 {
1945 unsigned int othercase;
1946 if (next < 128) othercase = cd->fcc[next]; else
1947 #ifdef SUPPORT_UCP
1948 othercase = _pcre_ucp_othercase((unsigned int)next);
1949 #else
1950 othercase = NOTACHAR;
1951 #endif
1952 return (unsigned int)item != othercase;
1953 }
1954 else
1955 #endif /* SUPPORT_UTF8 */
1956 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
1957
1958 /* For OP_NOT, "item" must be a single-byte character. */
1959
1960 case OP_NOT:
1961 if (next < 0) return FALSE; /* Not a character */
1962 if (item == next) return TRUE;
1963 if ((options & PCRE_CASELESS) == 0) return FALSE;
1964 #ifdef SUPPORT_UTF8
1965 if (utf8)
1966 {
1967 unsigned int othercase;
1968 if (next < 128) othercase = cd->fcc[next]; else
1969 #ifdef SUPPORT_UCP
1970 othercase = _pcre_ucp_othercase(next);
1971 #else
1972 othercase = NOTACHAR;
1973 #endif
1974 return (unsigned int)item == othercase;
1975 }
1976 else
1977 #endif /* SUPPORT_UTF8 */
1978 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
1979
1980 case OP_DIGIT:
1981 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1982
1983 case OP_NOT_DIGIT:
1984 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1985
1986 case OP_WHITESPACE:
1987 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1988
1989 case OP_NOT_WHITESPACE:
1990 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1991
1992 case OP_WORDCHAR:
1993 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1994
1995 case OP_NOT_WORDCHAR:
1996 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1997
1998 case OP_HSPACE:
1999 case OP_NOT_HSPACE:
2000 switch(next)
2001 {
2002 case 0x09:
2003 case 0x20:
2004 case 0xa0:
2005 case 0x1680:
2006 case 0x180e:
2007 case 0x2000:
2008 case 0x2001:
2009 case 0x2002:
2010 case 0x2003:
2011 case 0x2004:
2012 case 0x2005:
2013 case 0x2006:
2014 case 0x2007:
2015 case 0x2008:
2016 case 0x2009:
2017 case 0x200A:
2018 case 0x202f:
2019 case 0x205f:
2020 case 0x3000:
2021 return op_code != OP_HSPACE;
2022 default:
2023 return op_code == OP_HSPACE;
2024 }
2025
2026 case OP_VSPACE:
2027 case OP_NOT_VSPACE:
2028 switch(next)
2029 {
2030 case 0x0a:
2031 case 0x0b:
2032 case 0x0c:
2033 case 0x0d:
2034 case 0x85:
2035 case 0x2028:
2036 case 0x2029:
2037 return op_code != OP_VSPACE;
2038 default:
2039 return op_code == OP_VSPACE;
2040 }
2041
2042 default:
2043 return FALSE;
2044 }
2045
2046
2047 /* Handle the case when the next item is \d, \s, etc. */
2048
2049 switch(op_code)
2050 {
2051 case OP_CHAR:
2052 case OP_CHARNC:
2053 #ifdef SUPPORT_UTF8
2054 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2055 #endif
2056 switch(-next)
2057 {
2058 case ESC_d:
2059 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2060
2061 case ESC_D:
2062 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2063
2064 case ESC_s:
2065 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2066
2067 case ESC_S:
2068 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2069
2070 case ESC_w:
2071 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2072
2073 case ESC_W:
2074 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2075
2076 case ESC_h:
2077 case ESC_H:
2078 switch(item)
2079 {
2080 case 0x09:
2081 case 0x20:
2082 case 0xa0:
2083 case 0x1680:
2084 case 0x180e:
2085 case 0x2000:
2086 case 0x2001:
2087 case 0x2002:
2088 case 0x2003:
2089 case 0x2004:
2090 case 0x2005:
2091 case 0x2006:
2092 case 0x2007:
2093 case 0x2008:
2094 case 0x2009:
2095 case 0x200A:
2096 case 0x202f:
2097 case 0x205f:
2098 case 0x3000:
2099 return -next != ESC_h;
2100 default:
2101 return -next == ESC_h;
2102 }
2103
2104 case ESC_v:
2105 case ESC_V:
2106 switch(item)
2107 {
2108 case 0x0a:
2109 case 0x0b:
2110 case 0x0c:
2111 case 0x0d:
2112 case 0x85:
2113 case 0x2028:
2114 case 0x2029:
2115 return -next != ESC_v;
2116 default:
2117 return -next == ESC_v;
2118 }
2119
2120 default:
2121 return FALSE;
2122 }
2123
2124 case OP_DIGIT:
2125 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2126 next == -ESC_h || next == -ESC_v;
2127
2128 case OP_NOT_DIGIT:
2129 return next == -ESC_d;
2130
2131 case OP_WHITESPACE:
2132 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2133
2134 case OP_NOT_WHITESPACE:
2135 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2136
2137 case OP_HSPACE:
2138 return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2139
2140 case OP_NOT_HSPACE:
2141 return next == -ESC_h;
2142
2143 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2144 case OP_VSPACE:
2145 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2146
2147 case OP_NOT_VSPACE:
2148 return next == -ESC_v;
2149
2150 case OP_WORDCHAR:
2151 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2152
2153 case OP_NOT_WORDCHAR:
2154 return next == -ESC_w || next == -ESC_d;
2155
2156 default:
2157 return FALSE;
2158 }
2159
2160 /* Control does not reach here */
2161 }
2162
2163
2164
2165 /*************************************************
2166 * Compile one branch *
2167 *************************************************/
2168
2169 /* Scan the pattern, compiling it into a vector. If the options are
2170 changed during the branch, the pointer is used to change the external options
2171 bits. This function is used during the pre-compile phase when we are trying
2172 to find out the amount of memory needed, as well as during the real compile
2173 phase. The value of lengthptr distinguishes the two phases.
2174
2175 Arguments:
2176 optionsptr pointer to the option bits
2177 codeptr points to the pointer to the current code point
2178 ptrptr points to the current pattern pointer
2179 errorcodeptr points to error code variable
2180 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2181 reqbyteptr set to the last literal character required, else < 0
2182 bcptr points to current branch chain
2183 cd contains pointers to tables etc.
2184 lengthptr NULL during the real compile phase
2185 points to length accumulator during pre-compile phase
2186
2187 Returns: TRUE on success
2188 FALSE, with *errorcodeptr set non-zero on error
2189 */
2190
2191 static BOOL
2192 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2193 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2194 compile_data *cd, int *lengthptr)
2195 {
2196 int repeat_type, op_type;
2197 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2198 int bravalue = 0;
2199 int greedy_default, greedy_non_default;
2200 int firstbyte, reqbyte;
2201 int zeroreqbyte, zerofirstbyte;
2202 int req_caseopt, reqvary, tempreqvary;
2203 int options = *optionsptr;
2204 int after_manual_callout = 0;
2205 int length_prevgroup = 0;
2206 register int c;
2207 register uschar *code = *codeptr;
2208 uschar *last_code = code;
2209 uschar *orig_code = code;
2210 uschar *tempcode;
2211 BOOL inescq = FALSE;
2212 BOOL groupsetfirstbyte = FALSE;
2213 const uschar *ptr = *ptrptr;
2214 const uschar *tempptr;
2215 uschar *previous = NULL;
2216 uschar *previous_callout = NULL;
2217 uschar *save_hwm = NULL;
2218 uschar classbits[32];
2219
2220 #ifdef SUPPORT_UTF8
2221 BOOL class_utf8;
2222 BOOL utf8 = (options & PCRE_UTF8) != 0;
2223 uschar *class_utf8data;
2224 uschar utf8_char[6];
2225 #else
2226 BOOL utf8 = FALSE;
2227 uschar *utf8_char = NULL;
2228 #endif
2229
2230 #ifdef DEBUG
2231 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2232 #endif
2233
2234 /* Set up the default and non-default settings for greediness */
2235
2236 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2237 greedy_non_default = greedy_default ^ 1;
2238
2239 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2240 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2241 matches a non-fixed first char; reqbyte just remains unset if we never
2242 find one.
2243
2244 When we hit a repeat whose minimum is zero, we may have to adjust these values
2245 to take the zero repeat into account. This is implemented by setting them to
2246 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2247 item types that can be repeated set these backoff variables appropriately. */
2248
2249 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2250
2251 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2252 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2253 value > 255. It is added into the firstbyte or reqbyte variables to record the
2254 case status of the value. This is used only for ASCII characters. */
2255
2256 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2257
2258 /* Switch on next character until the end of the branch */
2259
2260 for (;; ptr++)
2261 {
2262 BOOL negate_class;
2263 BOOL possessive_quantifier;
2264 BOOL is_quantifier;
2265 BOOL is_recurse;
2266 BOOL reset_bracount;
2267 int class_charcount;
2268 int class_lastchar;
2269 int newoptions;
2270 int recno;
2271 int refsign;
2272 int skipbytes;
2273 int subreqbyte;
2274 int subfirstbyte;
2275 int terminator;
2276 int mclength;
2277 uschar mcbuffer[8];
2278
2279 /* Get next byte in the pattern */
2280
2281 c = *ptr;
2282
2283 /* If we are in the pre-compile phase, accumulate the length used for the
2284 previous cycle of this loop. */
2285
2286 if (lengthptr != NULL)
2287 {
2288 #ifdef DEBUG
2289 if (code > cd->hwm) cd->hwm = code; /* High water info */
2290 #endif
2291 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2292 {
2293 *errorcodeptr = ERR52;
2294 goto FAILED;
2295 }
2296
2297 /* There is at least one situation where code goes backwards: this is the
2298 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2299 the class is simply eliminated. However, it is created first, so we have to
2300 allow memory for it. Therefore, don't ever reduce the length at this point.
2301 */
2302
2303 if (code < last_code) code = last_code;
2304
2305 /* Paranoid check for integer overflow */
2306
2307 if (OFLOW_MAX - *lengthptr < code - last_code)
2308 {
2309 *errorcodeptr = ERR20;
2310 goto FAILED;
2311 }
2312
2313 *lengthptr += code - last_code;
2314 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2315
2316 /* If "previous" is set and it is not at the start of the work space, move
2317 it back to there, in order to avoid filling up the work space. Otherwise,
2318 if "previous" is NULL, reset the current code pointer to the start. */
2319
2320 if (previous != NULL)
2321 {
2322 if (previous > orig_code)
2323 {
2324 memmove(orig_code, previous, code - previous);
2325 code -= previous - orig_code;
2326 previous = orig_code;
2327 }
2328 }
2329 else code = orig_code;
2330
2331 /* Remember where this code item starts so we can pick up the length
2332 next time round. */
2333
2334 last_code = code;
2335 }
2336
2337 /* In the real compile phase, just check the workspace used by the forward
2338 reference list. */
2339
2340 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2341 {
2342 *errorcodeptr = ERR52;
2343 goto FAILED;
2344 }
2345
2346 /* If in \Q...\E, check for the end; if not, we have a literal */
2347
2348 if (inescq && c != 0)
2349 {
2350 if (c == '\\' && ptr[1] == 'E')
2351 {
2352 inescq = FALSE;
2353 ptr++;
2354 continue;
2355 }
2356 else
2357 {
2358 if (previous_callout != NULL)
2359 {
2360 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2361 complete_callout(previous_callout, ptr, cd);
2362 previous_callout = NULL;
2363 }
2364 if ((options & PCRE_AUTO_CALLOUT) != 0)
2365 {
2366 previous_callout = code;
2367 code = auto_callout(code, ptr, cd);
2368 }
2369 goto NORMAL_CHAR;
2370 }
2371 }
2372
2373 /* Fill in length of a previous callout, except when the next thing is
2374 a quantifier. */
2375
2376 is_quantifier = c == '*' || c == '+' || c == '?' ||
2377 (c == '{' && is_counted_repeat(ptr+1));
2378
2379 if (!is_quantifier && previous_callout != NULL &&
2380 after_manual_callout-- <= 0)
2381 {
2382 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2383 complete_callout(previous_callout, ptr, cd);
2384 previous_callout = NULL;
2385 }
2386
2387 /* In extended mode, skip white space and comments */
2388
2389 if ((options & PCRE_EXTENDED) != 0)
2390 {
2391 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2392 if (c == '#')
2393 {
2394 while (*(++ptr) != 0)
2395 {
2396 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2397 }
2398 if (*ptr != 0) continue;
2399
2400 /* Else fall through to handle end of string */
2401 c = 0;
2402 }
2403 }
2404
2405 /* No auto callout for quantifiers. */
2406
2407 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2408 {
2409 previous_callout = code;
2410 code = auto_callout(code, ptr, cd);
2411 }
2412
2413 switch(c)
2414 {
2415 /* ===================================================================*/
2416 case 0: /* The branch terminates at string end */
2417 case '|': /* or | or ) */
2418 case ')':
2419 *firstbyteptr = firstbyte;
2420 *reqbyteptr = reqbyte;
2421 *codeptr = code;
2422 *ptrptr = ptr;
2423 if (lengthptr != NULL)
2424 {
2425 if (OFLOW_MAX - *lengthptr < code - last_code)
2426 {
2427 *errorcodeptr = ERR20;
2428 goto FAILED;
2429 }
2430 *lengthptr += code - last_code; /* To include callout length */
2431 DPRINTF((">> end branch\n"));
2432 }
2433 return TRUE;
2434
2435
2436 /* ===================================================================*/
2437 /* Handle single-character metacharacters. In multiline mode, ^ disables
2438 the setting of any following char as a first character. */
2439
2440 case '^':
2441 if ((options & PCRE_MULTILINE) != 0)
2442 {
2443 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2444 }
2445 previous = NULL;
2446 *code++ = OP_CIRC;
2447 break;
2448
2449 case '$':
2450 previous = NULL;
2451 *code++ = OP_DOLL;
2452 break;
2453
2454 /* There can never be a first char if '.' is first, whatever happens about
2455 repeats. The value of reqbyte doesn't change either. */
2456
2457 case '.':
2458 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2459 zerofirstbyte = firstbyte;
2460 zeroreqbyte = reqbyte;
2461 previous = code;
2462 *code++ = OP_ANY;
2463 break;
2464
2465
2466 /* ===================================================================*/
2467 /* Character classes. If the included characters are all < 256, we build a
2468 32-byte bitmap of the permitted characters, except in the special case
2469 where there is only one such character. For negated classes, we build the
2470 map as usual, then invert it at the end. However, we use a different opcode
2471 so that data characters > 255 can be handled correctly.
2472
2473 If the class contains characters outside the 0-255 range, a different
2474 opcode is compiled. It may optionally have a bit map for characters < 256,
2475 but those above are explicitly listed afterwards. A flag byte tells
2476 whether the bitmap is present, and whether this is a negated class or not.
2477 */
2478
2479 case '[':
2480 previous = code;
2481
2482 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2483 they are encountered at the top level, so we'll do that too. */
2484
2485 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2486 check_posix_syntax(ptr, &tempptr, cd))
2487 {
2488 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2489 goto FAILED;
2490 }
2491
2492 /* If the first character is '^', set the negation flag and skip it. Also,
2493 if the first few characters (either before or after ^) are \Q\E or \E we
2494 skip them too. This makes for compatibility with Perl. */
2495
2496 negate_class = FALSE;
2497 for (;;)
2498 {
2499 c = *(++ptr);
2500 if (c == '\\')
2501 {
2502 if (ptr[1] == 'E') ptr++;
2503 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2504 else break;
2505 }
2506 else if (!negate_class && c == '^')
2507 negate_class = TRUE;
2508 else break;
2509 }
2510
2511 /* Keep a count of chars with values < 256 so that we can optimize the case
2512 of just a single character (as long as it's < 256). However, for higher
2513 valued UTF-8 characters, we don't yet do any optimization. */
2514
2515 class_charcount = 0;
2516 class_lastchar = -1;
2517
2518 /* Initialize the 32-char bit map to all zeros. We build the map in a
2519 temporary bit of memory, in case the class contains only 1 character (less
2520 than 256), because in that case the compiled code doesn't use the bit map.
2521 */
2522
2523 memset(classbits, 0, 32 * sizeof(uschar));
2524
2525 #ifdef SUPPORT_UTF8
2526 class_utf8 = FALSE; /* No chars >= 256 */
2527 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2528 #endif
2529
2530 /* Process characters until ] is reached. By writing this as a "do" it
2531 means that an initial ] is taken as a data character. At the start of the
2532 loop, c contains the first byte of the character. */
2533
2534 if (c != 0) do
2535 {
2536 const uschar *oldptr;
2537
2538 #ifdef SUPPORT_UTF8
2539 if (utf8 && c > 127)
2540 { /* Braces are required because the */
2541 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2542 }
2543 #endif
2544
2545 /* Inside \Q...\E everything is literal except \E */
2546
2547 if (inescq)
2548 {
2549 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2550 {
2551 inescq = FALSE; /* Reset literal state */
2552 ptr++; /* Skip the 'E' */
2553 continue; /* Carry on with next */
2554 }
2555 goto CHECK_RANGE; /* Could be range if \E follows */
2556 }
2557
2558 /* Handle POSIX class names. Perl allows a negation extension of the
2559 form [:^name:]. A square bracket that doesn't match the syntax is
2560 treated as a literal. We also recognize the POSIX constructions
2561 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2562 5.6 and 5.8 do. */
2563
2564 if (c == '[' &&
2565 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2566 check_posix_syntax(ptr, &tempptr, cd))
2567 {
2568 BOOL local_negate = FALSE;
2569 int posix_class, taboffset, tabopt;
2570 register const uschar *cbits = cd->cbits;
2571 uschar pbits[32];
2572
2573 if (ptr[1] != ':')
2574 {
2575 *errorcodeptr = ERR31;
2576 goto FAILED;
2577 }
2578
2579 ptr += 2;
2580 if (*ptr == '^')
2581 {
2582 local_negate = TRUE;
2583 ptr++;
2584 }
2585
2586 posix_class = check_posix_name(ptr, tempptr - ptr);
2587 if (posix_class < 0)
2588 {
2589 *errorcodeptr = ERR30;
2590 goto FAILED;
2591 }
2592
2593 /* If matching is caseless, upper and lower are converted to
2594 alpha. This relies on the fact that the class table starts with
2595 alpha, lower, upper as the first 3 entries. */
2596
2597 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2598 posix_class = 0;
2599
2600 /* We build the bit map for the POSIX class in a chunk of local store
2601 because we may be adding and subtracting from it, and we don't want to
2602 subtract bits that may be in the main map already. At the end we or the
2603 result into the bit map that is being built. */
2604
2605 posix_class *= 3;
2606
2607 /* Copy in the first table (always present) */
2608
2609 memcpy(pbits, cbits + posix_class_maps[posix_class],
2610 32 * sizeof(uschar));
2611
2612 /* If there is a second table, add or remove it as required. */
2613
2614 taboffset = posix_class_maps[posix_class + 1];
2615 tabopt = posix_class_maps[posix_class + 2];
2616
2617 if (taboffset >= 0)
2618 {
2619 if (tabopt >= 0)
2620 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2621 else
2622 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2623 }
2624
2625 /* Now see if we need to remove any special characters. An option
2626 value of 1 removes vertical space and 2 removes underscore. */
2627
2628 if (tabopt < 0) tabopt = -tabopt;
2629 if (tabopt == 1) pbits[1] &= ~0x3c;
2630 else if (tabopt == 2) pbits[11] &= 0x7f;
2631
2632 /* Add the POSIX table or its complement into the main table that is
2633 being built and we are done. */
2634
2635 if (local_negate)
2636 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2637 else
2638 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2639
2640 ptr = tempptr + 1;
2641 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2642 continue; /* End of POSIX syntax handling */
2643 }
2644
2645 /* Backslash may introduce a single character, or it may introduce one
2646 of the specials, which just set a flag. The sequence \b is a special
2647 case. Inside a class (and only there) it is treated as backspace.
2648 Elsewhere it marks a word boundary. Other escapes have preset maps ready
2649 to 'or' into the one we are building. We assume they have more than one
2650 character in them, so set class_charcount bigger than one. */
2651
2652 if (c == '\\')
2653 {
2654 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2655 if (*errorcodeptr != 0) goto FAILED;
2656
2657 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2658 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2659 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2660 else if (-c == ESC_Q) /* Handle start of quoted string */
2661 {
2662 if (ptr[1] == '\\' && ptr[2] == 'E')
2663 {
2664 ptr += 2; /* avoid empty string */
2665 }
2666 else inescq = TRUE;
2667 continue;
2668 }
2669
2670 if (c < 0)
2671 {
2672 register const uschar *cbits = cd->cbits;
2673 class_charcount += 2; /* Greater than 1 is what matters */
2674
2675 /* Save time by not doing this in the pre-compile phase. */
2676
2677 if (lengthptr == NULL) switch (-c)
2678 {
2679 case ESC_d:
2680 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2681 continue;
2682
2683 case ESC_D:
2684 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2685 continue;
2686
2687 case ESC_w:
2688 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2689 continue;
2690
2691 case ESC_W:
2692 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2693 continue;
2694
2695 case ESC_s:
2696 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2697 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2698 continue;
2699
2700 case ESC_S:
2701 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2702 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2703 continue;
2704
2705 case ESC_E: /* Perl ignores an orphan \E */
2706 continue;
2707
2708 default: /* Not recognized; fall through */
2709 break; /* Need "default" setting to stop compiler warning. */
2710 }
2711
2712 /* In the pre-compile phase, just do the recognition. */
2713
2714 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2715 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2716
2717 /* We need to deal with \H, \h, \V, and \v in both phases because
2718 they use extra memory. */
2719
2720 if (-c == ESC_h)
2721 {
2722 SETBIT(classbits, 0x09); /* VT */
2723 SETBIT(classbits, 0x20); /* SPACE */
2724 SETBIT(classbits, 0xa0); /* NSBP */
2725 #ifdef SUPPORT_UTF8
2726 if (utf8)
2727 {
2728 class_utf8 = TRUE;
2729 *class_utf8data++ = XCL_SINGLE;
2730 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2731 *class_utf8data++ = XCL_SINGLE;
2732 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2733 *class_utf8data++ = XCL_RANGE;
2734 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2735 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2736 *class_utf8data++ = XCL_SINGLE;
2737 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2738 *class_utf8data++ = XCL_SINGLE;
2739 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2740 *class_utf8data++ = XCL_SINGLE;
2741 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2742 }
2743 #endif
2744 continue;
2745 }
2746
2747 if (-c == ESC_H)
2748 {
2749 for (c = 0; c < 32; c++)
2750 {
2751 int x = 0xff;
2752 switch (c)
2753 {
2754 case 0x09/8: x ^= 1 << (0x09%8); break;
2755 case 0x20/8: x ^= 1 << (0x20%8); break;
2756 case 0xa0/8: x ^= 1 << (0xa0%8); break;
2757 default: break;
2758 }
2759 classbits[c] |= x;
2760 }
2761
2762 #ifdef SUPPORT_UTF8
2763 if (utf8)
2764 {
2765 class_utf8 = TRUE;
2766 *class_utf8data++ = XCL_RANGE;
2767 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2768 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2769 *class_utf8data++ = XCL_RANGE;
2770 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2771 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2772 *class_utf8data++ = XCL_RANGE;
2773 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2774 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2775 *class_utf8data++ = XCL_RANGE;
2776 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2777 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2778 *class_utf8data++ = XCL_RANGE;
2779 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2780 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2781 *class_utf8data++ = XCL_RANGE;
2782 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2783 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2784 *class_utf8data++ = XCL_RANGE;
2785 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2786 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2787 }
2788 #endif
2789 continue;
2790 }
2791
2792 if (-c == ESC_v)
2793 {
2794 SETBIT(classbits, 0x0a); /* LF */
2795 SETBIT(classbits, 0x0b); /* VT */
2796 SETBIT(classbits, 0x0c); /* FF */
2797 SETBIT(classbits, 0x0d); /* CR */
2798 SETBIT(classbits, 0x85); /* NEL */
2799 #ifdef SUPPORT_UTF8
2800 if (utf8)
2801 {
2802 class_utf8 = TRUE;
2803 *class_utf8data++ = XCL_RANGE;
2804 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2805 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2806 }
2807 #endif
2808 continue;
2809 }
2810
2811 if (-c == ESC_V)
2812 {
2813 for (c = 0; c < 32; c++)
2814 {
2815 int x = 0xff;
2816 switch (c)
2817 {
2818 case 0x0a/8: x ^= 1 << (0x0a%8);
2819 x ^= 1 << (0x0b%8);
2820 x ^= 1 << (0x0c%8);
2821 x ^= 1 << (0x0d%8);
2822 break;
2823 case 0x85/8: x ^= 1 << (0x85%8); break;
2824 default: break;
2825 }
2826 classbits[c] |= x;
2827 }
2828
2829 #ifdef SUPPORT_UTF8
2830 if (utf8)
2831 {
2832 class_utf8 = TRUE;
2833 *class_utf8data++ = XCL_RANGE;
2834 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2835 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2836 *class_utf8data++ = XCL_RANGE;
2837 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2838 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2839 }
2840 #endif
2841 continue;
2842 }
2843
2844 /* We need to deal with \P and \p in both phases. */
2845
2846 #ifdef SUPPORT_UCP
2847 if (-c == ESC_p || -c == ESC_P)
2848 {
2849 BOOL negated;
2850 int pdata;
2851 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2852 if (ptype < 0) goto FAILED;
2853 class_utf8 = TRUE;
2854 *class_utf8data++ = ((-c == ESC_p) != negated)?
2855 XCL_PROP : XCL_NOTPROP;
2856 *class_utf8data++ = ptype;
2857 *class_utf8data++ = pdata;
2858 class_charcount -= 2; /* Not a < 256 character */
2859 continue;
2860 }
2861 #endif
2862 /* Unrecognized escapes are faulted if PCRE is running in its
2863 strict mode. By default, for compatibility with Perl, they are
2864 treated as literals. */
2865
2866 if ((options & PCRE_EXTRA) != 0)
2867 {
2868 *errorcodeptr = ERR7;
2869 goto FAILED;
2870 }
2871
2872 class_charcount -= 2; /* Undo the default count from above */
2873 c = *ptr; /* Get the final character and fall through */
2874 }
2875
2876 /* Fall through if we have a single character (c >= 0). This may be
2877 greater than 256 in UTF-8 mode. */
2878
2879 } /* End of backslash handling */
2880
2881 /* A single character may be followed by '-' to form a range. However,
2882 Perl does not permit ']' to be the end of the range. A '-' character
2883 at the end is treated as a literal. Perl ignores orphaned \E sequences
2884 entirely. The code for handling \Q and \E is messy. */
2885
2886 CHECK_RANGE:
2887 while (ptr[1] == '\\' && ptr[2] == 'E')
2888 {
2889 inescq = FALSE;
2890 ptr += 2;
2891 }
2892
2893 oldptr = ptr;
2894
2895 if (!inescq && ptr[1] == '-')
2896 {
2897 int d;
2898 ptr += 2;
2899 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2900
2901 /* If we hit \Q (not followed by \E) at this point, go into escaped
2902 mode. */
2903
2904 while (*ptr == '\\' && ptr[1] == 'Q')
2905 {
2906 ptr += 2;
2907 if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2908 inescq = TRUE;
2909 break;
2910 }
2911
2912 if (*ptr == 0 || (!inescq && *ptr == ']'))
2913 {
2914 ptr = oldptr;
2915 goto LONE_SINGLE_CHARACTER;
2916 }
2917
2918 #ifdef SUPPORT_UTF8
2919 if (utf8)
2920 { /* Braces are required because the */
2921 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2922 }
2923 else
2924 #endif
2925 d = *ptr; /* Not UTF-8 mode */
2926
2927 /* The second part of a range can be a single-character escape, but
2928 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2929 in such circumstances. */
2930
2931 if (!inescq && d == '\\')
2932 {
2933 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2934 if (*errorcodeptr != 0) goto FAILED;
2935
2936 /* \b is backspace; \X is literal X; \R is literal R; any other
2937 special means the '-' was literal */
2938
2939 if (d < 0)
2940 {
2941 if (d == -ESC_b) d = '\b';
2942 else if (d == -ESC_X) d = 'X';
2943 else if (d == -ESC_R) d = 'R'; else
2944 {
2945 ptr = oldptr;
2946 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2947 }
2948 }
2949 }
2950
2951 /* Check that the two values are in the correct order. Optimize
2952 one-character ranges */
2953
2954 if (d < c)
2955 {
2956 *errorcodeptr = ERR8;
2957 goto FAILED;
2958 }
2959
2960 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2961
2962 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2963 matching, we have to use an XCLASS with extra data items. Caseless
2964 matching for characters > 127 is available only if UCP support is
2965 available. */
2966
2967 #ifdef SUPPORT_UTF8
2968 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2969 {
2970 class_utf8 = TRUE;
2971
2972 /* With UCP support, we can find the other case equivalents of
2973 the relevant characters. There may be several ranges. Optimize how
2974 they fit with the basic range. */
2975
2976 #ifdef SUPPORT_UCP
2977 if ((options & PCRE_CASELESS) != 0)
2978 {
2979 unsigned int occ, ocd;
2980 unsigned int cc = c;
2981 unsigned int origd = d;
2982 while (get_othercase_range(&cc, origd, &occ, &ocd))
2983 {
2984 if (occ >= (unsigned int)c &&
2985 ocd <= (unsigned int)d)
2986 continue; /* Skip embedded ranges */
2987
2988 if (occ < (unsigned int)c &&
2989 ocd >= (unsigned int)c - 1) /* Extend the basic range */
2990 { /* if there is overlap, */
2991 c = occ; /* noting that if occ < c */
2992 continue; /* we can't have ocd > d */
2993 } /* because a subrange is */
2994 if (ocd > (unsigned int)d &&
2995 occ <= (unsigned int)d + 1) /* always shorter than */
2996 { /* the basic range. */
2997 d = ocd;
2998 continue;
2999 }
3000
3001 if (occ == ocd)
3002 {
3003 *class_utf8data++ = XCL_SINGLE;
3004 }
3005 else
3006 {
3007 *class_utf8data++ = XCL_RANGE;
3008 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3009 }
3010 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3011 }
3012 }
3013 #endif /* SUPPORT_UCP */
3014
3015 /* Now record the original range, possibly modified for UCP caseless
3016 overlapping ranges. */
3017
3018 *class_utf8data++ = XCL_RANGE;
3019 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3020 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3021
3022 /* With UCP support, we are done. Without UCP support, there is no
3023 caseless matching for UTF-8 characters > 127; we can use the bit map
3024 for the smaller ones. */
3025
3026 #ifdef SUPPORT_UCP
3027 continue; /* With next character in the class */
3028 #else
3029 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3030
3031 /* Adjust upper limit and fall through to set up the map */
3032
3033 d = 127;
3034
3035 #endif /* SUPPORT_UCP */
3036 }
3037 #endif /* SUPPORT_UTF8 */
3038
3039 /* We use the bit map for all cases when not in UTF-8 mode; else
3040 ranges that lie entirely within 0-127 when there is UCP support; else
3041 for partial ranges without UCP support. */
3042
3043 class_charcount += d - c + 1;
3044 class_lastchar = d;
3045
3046 /* We can save a bit of time by skipping this in the pre-compile. */
3047
3048 if (lengthptr == NULL) for (; c <= d; c++)
3049 {
3050 classbits[c/8] |= (1 << (c&7));
3051 if ((options & PCRE_CASELESS) != 0)
3052 {
3053 int uc = cd->fcc[c]; /* flip case */
3054 classbits[uc/8] |= (1 << (uc&7));
3055 }
3056 }
3057
3058 continue; /* Go get the next char in the class */
3059 }
3060
3061 /* Handle a lone single character - we can get here for a normal
3062 non-escape char, or after \ that introduces a single character or for an
3063 apparent range that isn't. */
3064
3065 LONE_SINGLE_CHARACTER:
3066
3067 /* Handle a character that cannot go in the bit map */
3068
3069 #ifdef SUPPORT_UTF8
3070 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3071 {
3072 class_utf8 = TRUE;
3073 *class_utf8data++ = XCL_SINGLE;
3074 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3075
3076 #ifdef SUPPORT_UCP
3077 if ((options & PCRE_CASELESS) != 0)
3078 {
3079 unsigned int othercase;
3080 if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3081 {
3082 *class_utf8data++ = XCL_SINGLE;
3083 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3084 }
3085 }
3086 #endif /* SUPPORT_UCP */
3087
3088 }
3089 else
3090 #endif /* SUPPORT_UTF8 */
3091
3092 /* Handle a single-byte character */
3093 {
3094 classbits[c/8] |= (1 << (c&7));
3095 if ((options & PCRE_CASELESS) != 0)
3096 {
3097 c = cd->fcc[c]; /* flip case */
3098 classbits[c/8] |= (1 << (c&7));
3099 }
3100 class_charcount++;
3101 class_lastchar = c;
3102 }
3103 }
3104
3105 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3106
3107 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3108
3109 if (c == 0) /* Missing terminating ']' */
3110 {
3111 *errorcodeptr = ERR6;
3112 goto FAILED;
3113 }
3114
3115 /* If class_charcount is 1, we saw precisely one character whose value is
3116 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
3117 can optimize the negative case only if there were no characters >= 128
3118 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
3119 single-bytes only. This is an historical hangover. Maybe one day we can
3120 tidy these opcodes to handle multi-byte characters.
3121
3122 The optimization throws away the bit map. We turn the item into a
3123 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3124 that OP_NOT does not support multibyte characters. In the positive case, it
3125 can cause firstbyte to be set. Otherwise, there can be no first char if
3126 this item is first, whatever repeat count may follow. In the case of
3127 reqbyte, save the previous value for reinstating. */
3128
3129 #ifdef SUPPORT_UTF8
3130 if (class_charcount == 1 &&
3131 (!utf8 ||
3132 (!class_utf8 && (!negate_class || class_lastchar < 128))))
3133
3134 #else
3135 if (class_charcount == 1)
3136 #endif
3137 {
3138 zeroreqbyte = reqbyte;
3139
3140 /* The OP_NOT opcode works on one-byte characters only. */
3141
3142 if (negate_class)
3143 {
3144 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3145 zerofirstbyte = firstbyte;
3146 *code++ = OP_NOT;
3147 *code++ = class_lastchar;
3148 break;
3149 }
3150
3151 /* For a single, positive character, get the value into mcbuffer, and
3152 then we can handle this with the normal one-character code. */
3153
3154 #ifdef SUPPORT_UTF8
3155 if (utf8 && class_lastchar > 127)
3156 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3157 else
3158 #endif
3159 {
3160 mcbuffer[0] = class_lastchar;
3161 mclength = 1;
3162 }
3163 goto ONE_CHAR;
3164 } /* End of 1-char optimization */
3165
3166 /* The general case - not the one-char optimization. If this is the first
3167 thing in the branch, there can be no first char setting, whatever the
3168 repeat count. Any reqbyte setting must remain unchanged after any kind of
3169 repeat. */
3170
3171 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3172 zerofirstbyte = firstbyte;
3173 zeroreqbyte = reqbyte;
3174
3175 /* If there are characters with values > 255, we have to compile an
3176 extended class, with its own opcode. If there are no characters < 256,
3177 we can omit the bitmap in the actual compiled code. */
3178
3179 #ifdef SUPPORT_UTF8
3180 if (class_utf8)
3181 {
3182 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3183 *code++ = OP_XCLASS;
3184 code += LINK_SIZE;
3185 *code = negate_class? XCL_NOT : 0;
3186
3187 /* If the map is required, move up the extra data to make room for it;
3188 otherwise just move the code pointer to the end of the extra data. */
3189
3190 if (class_charcount > 0)
3191 {
3192 *code++ |= XCL_MAP;
3193 memmove(code + 32, code, class_utf8data - code);
3194 memcpy(code, classbits, 32);
3195 code = class_utf8data + 32;
3196 }
3197 else code = class_utf8data;
3198
3199 /* Now fill in the complete length of the item */
3200
3201 PUT(previous, 1, code - previous);
3202 break; /* End of class handling */
3203 }
3204 #endif
3205
3206 /* If there are no characters > 255, negate the 32-byte map if necessary,
3207 and copy it into the code vector. If this is the first thing in the branch,
3208 there can be no first char setting, whatever the repeat count. Any reqbyte
3209 setting must remain unchanged after any kind of repeat. */
3210
3211 if (negate_class)
3212 {
3213 *code++ = OP_NCLASS;
3214 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3215 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3216 }
3217 else
3218 {
3219 *code++ = OP_CLASS;
3220 memcpy(code, classbits, 32);
3221 }
3222 code += 32;
3223 break;
3224
3225
3226 /* ===================================================================*/
3227 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3228 has been tested above. */
3229
3230 case '{':
3231 if (!is_quantifier) goto NORMAL_CHAR;
3232 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3233 if (*errorcodeptr != 0) goto FAILED;
3234 goto REPEAT;
3235
3236 case '*':
3237 repeat_min = 0;
3238 repeat_max = -1;
3239 goto REPEAT;
3240
3241 case '+':
3242 repeat_min = 1;
3243 repeat_max = -1;
3244 goto REPEAT;
3245
3246 case '?':
3247 repeat_min = 0;
3248 repeat_max = 1;
3249
3250 REPEAT:
3251 if (previous == NULL)
3252 {
3253 *errorcodeptr = ERR9;
3254 goto FAILED;
3255 }
3256
3257 if (repeat_min == 0)
3258 {
3259 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3260 reqbyte = zeroreqbyte; /* Ditto */
3261 }
3262
3263 /* Remember whether this is a variable length repeat */
3264
3265 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3266
3267 op_type = 0; /* Default single-char op codes */
3268 possessive_quantifier = FALSE; /* Default not possessive quantifier */
3269
3270 /* Save start of previous item, in case we have to move it up to make space
3271 for an inserted OP_ONCE for the additional '+' extension. */
3272
3273 tempcode = previous;
3274
3275 /* If the next character is '+', we have a possessive quantifier. This
3276 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3277 If the next character is '?' this is a minimizing repeat, by default,
3278 but if PCRE_UNGREEDY is set, it works the other way round. We change the
3279 repeat type to the non-default. */
3280
3281 if (ptr[1] == '+')
3282 {
3283 repeat_type = 0; /* Force greedy */
3284 possessive_quantifier = TRUE;
3285 ptr++;
3286 }
3287 else if (ptr[1] == '?')
3288 {
3289 repeat_type = greedy_non_default;
3290 ptr++;
3291 }
3292 else repeat_type = greedy_default;
3293
3294 /* If previous was a character match, abolish the item and generate a
3295 repeat item instead. If a char item has a minimum of more than one, ensure
3296 that it is set in reqbyte - it might not be if a sequence such as x{3} is
3297 the first thing in a branch because the x will have gone into firstbyte
3298 instead. */
3299
3300 if (*previous == OP_CHAR || *previous == OP_CHARNC)
3301 {
3302 /* Deal with UTF-8 characters that take up more than one byte. It's
3303 easier to write this out separately than try to macrify it. Use c to
3304 hold the length of the character in bytes, plus 0x80 to flag that it's a
3305 length rather than a small character. */
3306
3307 #ifdef SUPPORT_UTF8
3308 if (utf8 && (code[-1] & 0x80) != 0)
3309 {
3310 uschar *lastchar = code - 1;
3311 while((*lastchar & 0xc0) == 0x80) lastchar--;
3312 c = code - lastchar; /* Length of UTF-8 character */
3313 memcpy(utf8_char, lastchar, c); /* Save the char */
3314 c |= 0x80; /* Flag c as a length */
3315 }
3316 else
3317 #endif
3318
3319 /* Handle the case of a single byte - either with no UTF8 support, or
3320 with UTF-8 disabled, or for a UTF-8 character < 128. */
3321
3322 {
3323 c = code[-1];
3324 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3325 }
3326
3327 /* If the repetition is unlimited, it pays to see if the next thing on
3328 the line is something that cannot possibly match this character. If so,
3329 automatically possessifying this item gains some performance in the case
3330 where the match fails. */
3331
3332 if (!possessive_quantifier &&
3333 repeat_max < 0 &&
3334 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3335 options, cd))
3336 {
3337 repeat_type = 0; /* Force greedy */
3338 possessive_quantifier = TRUE;
3339 }
3340
3341 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3342 }
3343
3344 /* If previous was a single negated character ([^a] or similar), we use
3345 one of the special opcodes, replacing it. The code is shared with single-
3346 character repeats by setting op_type to add a suitable offset into
3347 repeat_type. We can also test for auto-possessification. OP_NOT is
3348 currently used only for single-byte chars. */
3349
3350 else if (*previous == OP_NOT)
3351 {
3352 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3353 c = previous[1];
3354 if (!possessive_quantifier &&
3355 repeat_max < 0 &&
3356 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3357 {
3358 repeat_type = 0; /* Force greedy */
3359 possessive_quantifier = TRUE;
3360 }
3361 goto OUTPUT_SINGLE_REPEAT;
3362 }
3363
3364 /* If previous was a character type match (\d or similar), abolish it and
3365 create a suitable repeat item. The code is shared with single-character
3366 repeats by setting op_type to add a suitable offset into repeat_type. Note
3367 that the Unicode property types will be present only when SUPPORT_UCP is
3368 defined, but we don't wrap the little bits of code here because it just
3369 makes it horribly messy. */
3370
3371 else if (*previous < OP_EODN)
3372 {
3373 uschar *oldcode;
3374 int prop_type, prop_value;
3375 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3376 c = *previous;
3377
3378 if (!possessive_quantifier &&
3379 repeat_max < 0 &&
3380 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3381 {
3382 repeat_type = 0; /* Force greedy */
3383 possessive_quantifier = TRUE;
3384 }
3385
3386 OUTPUT_SINGLE_REPEAT:
3387 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3388 {
3389 prop_type = previous[1];
3390 prop_value = previous[2];
3391 }
3392 else prop_type = prop_value = -1;
3393
3394 oldcode = code;
3395 code = previous; /* Usually overwrite previous item */
3396
3397 /* If the maximum is zero then the minimum must also be zero; Perl allows
3398 this case, so we do too - by simply omitting the item altogether. */
3399
3400 if (repeat_max == 0) goto END_REPEAT;
3401
3402 /* All real repeats make it impossible to handle partial matching (maybe
3403 one day we will be able to remove this restriction). */
3404
3405 if (repeat_max != 1) cd->nopartial = TRUE;
3406
3407 /* Combine the op_type with the repeat_type */
3408
3409 repeat_type += op_type;
3410
3411 /* A minimum of zero is handled either as the special case * or ?, or as
3412 an UPTO, with the maximum given. */
3413
3414 if (repeat_min == 0)
3415 {
3416 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3417 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3418 else
3419 {
3420 *code++ = OP_UPTO + repeat_type;
3421 PUT2INC(code, 0, repeat_max);
3422 }
3423 }
3424
3425 /* A repeat minimum of 1 is optimized into some special cases. If the
3426 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3427 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3428 one less than the maximum. */
3429
3430 else if (repeat_min == 1)
3431 {
3432 if (repeat_max == -1)
3433 *code++ = OP_PLUS + repeat_type;
3434 else
3435 {
3436 code = oldcode; /* leave previous item in place */
3437 if (repeat_max == 1) goto END_REPEAT;
3438 *code++ = OP_UPTO + repeat_type;
3439 PUT2INC(code, 0, repeat_max - 1);
3440 }
3441 }
3442
3443 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3444 handled as an EXACT followed by an UPTO. */
3445
3446 else
3447 {
3448 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3449 PUT2INC(code, 0, repeat_min);
3450
3451 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3452 we have to insert the character for the previous code. For a repeated
3453 Unicode property match, there are two extra bytes that define the
3454 required property. In UTF-8 mode, long characters have their length in
3455 c, with the 0x80 bit as a flag. */
3456
3457 if (repeat_max < 0)
3458 {
3459 #ifdef SUPPORT_UTF8
3460 if (utf8 && c >= 128)
3461 {
3462 memcpy(code, utf8_char, c & 7);
3463 code += c & 7;
3464 }
3465 else
3466 #endif
3467 {
3468 *code++ = c;
3469 if (prop_type >= 0)
3470 {
3471 *code++ = prop_type;
3472 *code++ = prop_value;
3473 }
3474 }
3475 *code++ = OP_STAR + repeat_type;
3476 }
3477
3478 /* Else insert an UPTO if the max is greater than the min, again
3479 preceded by the character, for the previously inserted code. If the
3480 UPTO is just for 1 instance, we can use QUERY instead. */
3481
3482 else if (repeat_max != repeat_min)
3483 {
3484 #ifdef SUPPORT_UTF8
3485 if (utf8 && c >= 128)
3486 {
3487 memcpy(code, utf8_char, c & 7);
3488 code += c & 7;
3489 }
3490 else
3491 #endif
3492 *code++ = c;
3493 if (prop_type >= 0)
3494 {
3495 *code++ = prop_type;
3496 *code++ = prop_value;
3497 }
3498 repeat_max -= repeat_min;
3499
3500 if (repeat_max == 1)
3501 {
3502 *code++ = OP_QUERY + repeat_type;
3503 }
3504 else
3505 {
3506 *code++ = OP_UPTO + repeat_type;
3507 PUT2INC(code, 0, repeat_max);
3508 }
3509 }
3510 }
3511
3512 /* The character or character type itself comes last in all cases. */
3513
3514 #ifdef SUPPORT_UTF8
3515 if (utf8 && c >= 128)
3516 {
3517 memcpy(code, utf8_char, c & 7);
3518 code += c & 7;
3519 }
3520 else
3521 #endif
3522 *code++ = c;
3523
3524 /* For a repeated Unicode property match, there are two extra bytes that
3525 define the required property. */
3526
3527 #ifdef SUPPORT_UCP
3528 if (prop_type >= 0)
3529 {
3530 *code++ = prop_type;
3531 *code++ = prop_value;
3532 }
3533 #endif
3534 }
3535
3536 /* If previous was a character class or a back reference, we put the repeat
3537 stuff after it, but just skip the item if the repeat was {0,0}. */
3538
3539 else if (*previous == OP_CLASS ||
3540 *previous == OP_NCLASS ||
3541 #ifdef SUPPORT_UTF8
3542 *previous == OP_XCLASS ||
3543 #endif
3544 *previous == OP_REF)
3545 {
3546 if (repeat_max == 0)
3547 {
3548 code = previous;
3549 goto END_REPEAT;
3550 }
3551
3552 /* All real repeats make it impossible to handle partial matching (maybe
3553 one day we will be able to remove this restriction). */
3554
3555 if (repeat_max != 1) cd->nopartial = TRUE;
3556
3557 if (repeat_min == 0 && repeat_max == -1)
3558 *code++ = OP_CRSTAR + repeat_type;
3559 else if (repeat_min == 1 && repeat_max == -1)
3560 *code++ = OP_CRPLUS + repeat_type;
3561 else if (repeat_min == 0 && repeat_max == 1)
3562 *code++ = OP_CRQUERY + repeat_type;
3563 else
3564 {
3565 *code++ = OP_CRRANGE + repeat_type;
3566 PUT2INC(code, 0, repeat_min);
3567 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3568 PUT2INC(code, 0, repeat_max);
3569 }
3570 }
3571
3572 /* If previous was a bracket group, we may have to replicate it in certain
3573 cases. */
3574
3575 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3576 *previous == OP_ONCE || *previous == OP_COND)
3577 {
3578 register int i;
3579 int ketoffset = 0;
3580 int len = code - previous;
3581 uschar *bralink = NULL;
3582
3583 /* Repeating a DEFINE group is pointless */
3584
3585 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3586 {
3587 *errorcodeptr = ERR55;
3588 goto FAILED;
3589 }
3590
3591 /* If the maximum repeat count is unlimited, find the end of the bracket
3592 by scanning through from the start, and compute the offset back to it
3593 from the current code pointer. There may be an OP_OPT setting following
3594 the final KET, so we can't find the end just by going back from the code
3595 pointer. */
3596
3597 if (repeat_max == -1)
3598 {
3599 register uschar *ket = previous;
3600 do ket += GET(ket, 1); while (*ket != OP_KET);
3601 ketoffset = code - ket;
3602 }
3603
3604 /* The case of a zero minimum is special because of the need to stick
3605 OP_BRAZERO in front of it, and because the group appears once in the
3606 data, whereas in other cases it appears the minimum number of times. For
3607 this reason, it is simplest to treat this case separately, as otherwise
3608 the code gets far too messy. There are several special subcases when the
3609 minimum is zero. */
3610
3611 if (repeat_min == 0)
3612 {
3613 /* If the maximum is also zero, we just omit the group from the output
3614 altogether. */
3615
3616 if (repeat_max == 0)
3617 {
3618 code = previous;
3619 goto END_REPEAT;
3620 }
3621
3622 /* If the maximum is 1 or unlimited, we just have to stick in the
3623 BRAZERO and do no more at this point. However, we do need to adjust
3624 any OP_RECURSE calls inside the group that refer to the group itself or
3625 any internal or forward referenced group, because the offset is from
3626 the start of the whole regex. Temporarily terminate the pattern while
3627 doing this. */
3628
3629 if (repeat_max <= 1)
3630 {
3631 *code = OP_END;
3632 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3633 memmove(previous+1, previous, len);
3634 code++;
3635 *previous++ = OP_BRAZERO + repeat_type;
3636 }
3637
3638 /* If the maximum is greater than 1 and limited, we have to replicate
3639 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3640 The first one has to be handled carefully because it's the original
3641 copy, which has to be moved up. The remainder can be handled by code
3642 that is common with the non-zero minimum case below. We have to
3643 adjust the value of repeat_max, since one less copy is required. Once
3644 again, we may have to adjust any OP_RECURSE calls inside the group. */
3645
3646 else
3647 {
3648 int offset;
3649 *code = OP_END;
3650 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3651 memmove(previous + 2 + LINK_SIZE, previous, len);
3652 code += 2 + LINK_SIZE;
3653 *previous++ = OP_BRAZERO + repeat_type;
3654 *previous++ = OP_BRA;
3655
3656 /* We chain together the bracket offset fields that have to be
3657 filled in later when the ends of the brackets are reached. */
3658
3659 offset = (bralink == NULL)? 0 : previous - bralink;
3660 bralink = previous;
3661 PUTINC(previous, 0, offset);
3662 }
3663
3664 repeat_max--;
3665 }
3666
3667 /* If the minimum is greater than zero, replicate the group as many
3668 times as necessary, and adjust the maximum to the number of subsequent
3669 copies that we need. If we set a first char from the group, and didn't
3670 set a required char, copy the latter from the former. If there are any
3671 forward reference subroutine calls in the group, there will be entries on
3672 the workspace list; replicate these with an appropriate increment. */
3673
3674 else
3675 {
3676 if (repeat_min > 1)
3677 {
3678 /* In the pre-compile phase, we don't actually do the replication. We
3679 just adjust the length as if we had. Do some paranoid checks for
3680 potential integer overflow. */
3681
3682 if (lengthptr != NULL)
3683 {
3684 int delta = (repeat_min - 1)*length_prevgroup;
3685 if ((double)(repeat_min - 1)*(double)length_prevgroup >
3686 (double)INT_MAX ||
3687 OFLOW_MAX - *lengthptr < delta)
3688 {
3689 *errorcodeptr = ERR20;
3690 goto FAILED;
3691 }
3692 *lengthptr += delta;
3693 }
3694
3695 /* This is compiling for real */
3696
3697 else
3698 {
3699 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3700 for (i = 1; i < repeat_min; i++)
3701 {
3702 uschar *hc;
3703 uschar *this_hwm = cd->hwm;
3704 memcpy(code, previous, len);
3705 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3706 {
3707 PUT(cd->hwm, 0, GET(hc, 0) + len);
3708 cd->hwm += LINK_SIZE;
3709 }
3710 save_hwm = this_hwm;
3711 code += len;
3712 }
3713 }
3714 }
3715
3716 if (repeat_max > 0) repeat_max -= repeat_min;
3717 }
3718
3719 /* This code is common to both the zero and non-zero minimum cases. If
3720 the maximum is limited, it replicates the group in a nested fashion,
3721 remembering the bracket starts on a stack. In the case of a zero minimum,
3722 the first one was set up above. In all cases the repeat_max now specifies
3723 the number of additional copies needed. Again, we must remember to
3724 replicate entries on the forward reference list. */
3725
3726 if (repeat_max >= 0)
3727 {
3728 /* In the pre-compile phase, we don't actually do the replication. We
3729 just adjust the length as if we had. For each repetition we must add 1
3730 to the length for BRAZERO and for all but the last repetition we must
3731 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3732 paranoid checks to avoid integer overflow. */
3733
3734 if (lengthptr != NULL && repeat_max > 0)
3735 {
3736 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3737 2 - 2*LINK_SIZE; /* Last one doesn't nest */
3738 if ((double)repeat_max *
3739 (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3740 > (double)INT_MAX ||
3741 OFLOW_MAX - *lengthptr < delta)
3742 {
3743 *errorcodeptr = ERR20;
3744 goto FAILED;
3745 }
3746 *lengthptr += delta;
3747 }
3748
3749 /* This is compiling for real */
3750
3751 else for (i = repeat_max - 1; i >= 0; i--)
3752 {
3753 uschar *hc;
3754 uschar *this_hwm = cd->hwm;
3755
3756 *code++ = OP_BRAZERO + repeat_type;
3757
3758 /* All but the final copy start a new nesting, maintaining the
3759 chain of brackets outstanding. */
3760
3761 if (i != 0)
3762 {
3763 int offset;
3764 *code++ = OP_BRA;
3765 offset = (bralink == NULL)? 0 : code - bralink;
3766 bralink = code;
3767 PUTINC(code, 0, offset);
3768 }
3769
3770 memcpy(code, previous, len);
3771 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3772 {
3773 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3774 cd->hwm += LINK_SIZE;
3775 }
3776 save_hwm = this_hwm;
3777 code += len;
3778 }
3779
3780 /* Now chain through the pending brackets, and fill in their length
3781 fields (which are holding the chain links pro tem). */
3782
3783 while (bralink != NULL)
3784 {
3785 int oldlinkoffset;
3786 int offset = code - bralink + 1;
3787 uschar *bra = code - offset;
3788 oldlinkoffset = GET(bra, 1);
3789 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3790 *code++ = OP_KET;
3791 PUTINC(code, 0, offset);
3792 PUT(bra, 1, offset);
3793 }
3794 }
3795
3796 /* If the maximum is unlimited, set a repeater in the final copy. We
3797 can't just offset backwards from the current code point, because we
3798 don't know if there's been an options resetting after the ket. The
3799 correct offset was computed above.
3800
3801 Then, when we are doing the actual compile phase, check to see whether
3802 this group is a non-atomic one that could match an empty string. If so,
3803 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3804 that runtime checking can be done. [This check is also applied to
3805 atomic groups at runtime, but in a different way.] */
3806
3807 else
3808 {
3809 uschar *ketcode = code - ketoffset;
3810 uschar *bracode = ketcode - GET(ketcode, 1);
3811 *ketcode = OP_KETRMAX + repeat_type;
3812 if (lengthptr == NULL && *bracode != OP_ONCE)
3813 {
3814 uschar *scode = bracode;
3815 do
3816 {
3817 if (could_be_empty_branch(scode, ketcode, utf8))
3818 {
3819 *bracode += OP_SBRA - OP_BRA;
3820 break;
3821 }
3822 scode += GET(scode, 1);
3823 }
3824 while (*scode == OP_ALT);
3825 }
3826 }
3827 }
3828
3829 /* Else there's some kind of shambles */
3830
3831 else
3832 {
3833 *errorcodeptr = ERR11;
3834 goto FAILED;
3835 }
3836
3837 /* If the character following a repeat is '+', or if certain optimization
3838 tests above succeeded, possessive_quantifier is TRUE. For some of the
3839 simpler opcodes, there is a special alternative opcode for this. For
3840 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3841 The '+' notation is just syntactic sugar, taken from Sun's Java package,
3842 but the special opcodes can optimize it a bit. The repeated item starts at
3843 tempcode, not at previous, which might be the first part of a string whose
3844 (former) last char we repeated.
3845
3846 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3847 an 'upto' may follow. We skip over an 'exact' item, and then test the
3848 length of what remains before proceeding. */
3849
3850 if (possessive_quantifier)
3851 {
3852 int len;
3853 if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3854 *tempcode == OP_NOTEXACT)
3855 tempcode += _pcre_OP_lengths[*tempcode];
3856 len = code - tempcode;
3857 if (len > 0) switch (*tempcode)
3858 {
3859 case OP_STAR: *tempcode = OP_POSSTAR; break;
3860 case OP_PLUS: *tempcode = OP_POSPLUS; break;
3861 case OP_QUERY: *tempcode = OP_POSQUERY; break;
3862 case OP_UPTO: *tempcode = OP_POSUPTO; break;
3863
3864 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
3865 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
3866 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3867 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
3868
3869 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
3870 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
3871 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3872 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
3873
3874 default:
3875 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3876 code += 1 + LINK_SIZE;
3877 len += 1 + LINK_SIZE;
3878 tempcode[0] = OP_ONCE;
3879 *code++ = OP_KET;
3880 PUTINC(code, 0, len);
3881 PUT(tempcode, 1, len);
3882 break;
3883 }
3884 }
3885
3886 /* In all cases we no longer have a previous item. We also set the
3887 "follows varying string" flag for subsequently encountered reqbytes if
3888 it isn't already set and we have just passed a varying length item. */
3889
3890 END_REPEAT:
3891 previous = NULL;
3892 cd->req_varyopt |= reqvary;
3893 break;
3894
3895
3896 /* ===================================================================*/
3897 /* Start of nested parenthesized sub-expression, or comment or lookahead or
3898 lookbehind or option setting or condition or all the other extended
3899 parenthesis forms. */
3900
3901 case '(':
3902 newoptions = options;
3903 skipbytes = 0;
3904 bravalue = OP_CBRA;
3905 save_hwm = cd->hwm;
3906 reset_bracount = FALSE;
3907
3908 /* First deal with various "verbs" that can be introduced by '*'. */
3909
3910 if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
3911 {
3912 int i, namelen;
3913 const uschar *name = ++ptr;
3914 previous = NULL;
3915 while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
3916 if (*ptr == ':')
3917 {
3918 *errorcodeptr = ERR59; /* Not supported */
3919 goto FAILED;
3920 }
3921 if (*ptr != ')')
3922 {
3923 *errorcodeptr = ERR60;
3924 goto FAILED;
3925 }
3926 namelen = ptr - name;
3927 for (i = 0; i < verbcount; i++)
3928 {
3929 if (namelen == verbs[i].len &&
3930 strncmp((char *)name, verbs[i].name, namelen) == 0)
3931 {
3932 *code = verbs[i].op;
3933 if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
3934 break;
3935 }
3936 }
3937 if (i < verbcount) continue;
3938 *errorcodeptr = ERR60;
3939 goto FAILED;
3940 }
3941
3942 /* Deal with the extended parentheses; all are introduced by '?', and the
3943 appearance of any of them means that this is not a capturing group. */
3944
3945 else if (*ptr == '?')
3946 {
3947 int i, set, unset, namelen;
3948 int *optset;
3949 const uschar *name;
3950 uschar *slot;
3951
3952 switch (*(++ptr))
3953 {
3954 case '#': /* Comment; skip to ket */
3955 ptr++;
3956 while (*ptr != 0 && *ptr != ')') ptr++;
3957 if (*ptr == 0)
3958 {
3959 *errorcodeptr = ERR18;
3960 goto FAILED;
3961 }
3962 continue;
3963
3964
3965 /* ------------------------------------------------------------ */
3966 case '|': /* Reset capture count for each branch */
3967 reset_bracount = TRUE;
3968 /* Fall through */
3969
3970 /* ------------------------------------------------------------ */
3971 case ':': /* Non-capturing bracket */
3972 bravalue = OP_BRA;
3973 ptr++;
3974 break;
3975
3976
3977 /* ------------------------------------------------------------ */
3978 case '(':
3979 bravalue = OP_COND; /* Conditional group */
3980
3981 /* A condition can be an assertion, a number (referring to a numbered
3982 group), a name (referring to a named group), or 'R', referring to
3983 recursion. R<digits> and R&name are also permitted for recursion tests.
3984
3985 There are several syntaxes for testing a named group: (?(name)) is used
3986 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3987
3988 There are two unfortunate ambiguities, caused by history. (a) 'R' can
3989 be the recursive thing or the name 'R' (and similarly for 'R' followed
3990 by digits), and (b) a number could be a name that consists of digits.
3991 In both cases, we look for a name first; if not found, we try the other
3992 cases. */
3993
3994 /* For conditions that are assertions, check the syntax, and then exit
3995 the switch. This will take control down to where bracketed groups,
3996 including assertions, are processed. */
3997
3998 if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3999 break;
4000
4001 /* Most other conditions use OP_CREF (a couple change to OP_RREF
4002 below), and all need to skip 3 bytes at the start of the group. */
4003
4004 code[1+LINK_SIZE] = OP_CREF;
4005 skipbytes = 3;
4006 refsign = -1;
4007
4008 /* Check for a test for recursion in a named group. */
4009
4010 if (ptr[1] == 'R' && ptr[2] == '&')
4011 {
4012 terminator = -1;
4013 ptr += 2;
4014 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4015 }
4016
4017 /* Check for a test for a named group's having been set, using the Perl
4018 syntax (?(<name>) or (?('name') */
4019
4020 else if (ptr[1] == '<')
4021 {
4022 terminator = '>';
4023 ptr++;
4024 }
4025 else if (ptr[1] == '\'')
4026 {
4027 terminator = '\'';
4028 ptr++;
4029 }
4030 else
4031 {
4032 terminator = 0;
4033 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4034 }
4035
4036 /* We now expect to read a name; anything else is an error */
4037
4038 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4039 {
4040 ptr += 1; /* To get the right offset */
4041 *errorcodeptr = ERR28;
4042 goto FAILED;
4043 }
4044
4045 /* Read the name, but also get it as a number if it's all digits */
4046
4047 recno = 0;
4048 name = ++ptr;
4049 while ((cd->ctypes[*ptr] & ctype_word) != 0)
4050 {
4051 if (recno >= 0)
4052 recno = ((digitab[*ptr] & ctype_digit) != 0)?
4053 recno * 10 + *ptr - '0' : -1;
4054 ptr++;
4055 }
4056 namelen = ptr - name;
4057
4058 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4059 {
4060 ptr--; /* Error offset */
4061 *errorcodeptr = ERR26;
4062 goto FAILED;
4063 }
4064
4065 /* Do no further checking in the pre-compile phase. */
4066
4067 if (lengthptr != NULL) break;
4068
4069 /* In the real compile we do the work of looking for the actual
4070 reference. If the string started with "+" or "-" we require the rest to
4071 be digits, in which case recno will be set. */
4072
4073 if (refsign > 0)
4074 {
4075 if (recno <= 0)
4076 {
4077 *errorcodeptr = ERR58;
4078 goto FAILED;
4079 }
4080 if (refsign == '-')
4081 {
4082 recno = cd->bracount - recno + 1;
4083 if (recno <= 0)
4084 {
4085 *errorcodeptr = ERR15;
4086 goto FAILED;
4087 }
4088 }
4089 else recno += cd->bracount;
4090 PUT2(code, 2+LINK_SIZE, recno);
4091 break;
4092 }
4093
4094 /* Otherwise (did not start with "+" or "-"), start by looking for the
4095 name. */
4096
4097 slot = cd->name_table;
4098 for (i = 0; i < cd->names_found; i++)
4099 {
4100 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4101 slot += cd->name_entry_size;
4102 }
4103
4104 /* Found a previous named subpattern */
4105
4106 if (i < cd->names_found)
4107 {
4108 recno = GET2(slot, 0);
4109 PUT2(code, 2+LINK_SIZE, recno);
4110 }
4111
4112 /* Search the pattern for a forward reference */
4113
4114 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4115 (options & PCRE_EXTENDED) != 0)) > 0)
4116 {
4117 PUT2(code, 2+LINK_SIZE, i);
4118 }
4119
4120 /* If terminator == 0 it means that the name followed directly after
4121 the opening parenthesis [e.g. (?(abc)...] and in this case there are
4122 some further alternatives to try. For the cases where terminator != 0
4123 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4124 now checked all the possibilities, so give an error. */
4125
4126 else if (terminator != 0)
4127 {
4128 *errorcodeptr = ERR15;
4129 goto FAILED;
4130 }
4131
4132 /* Check for (?(R) for recursion. Allow digits after R to specify a
4133 specific group number. */
4134
4135 else if (*name == 'R')
4136 {
4137 recno = 0;
4138 for (i = 1; i < namelen; i++)
4139 {
4140 if ((digitab[name[i]] & ctype_digit) == 0)
4141 {
4142 *errorcodeptr = ERR15;
4143 goto FAILED;
4144 }
4145 recno = recno * 10 + name[i] - '0';
4146 }
4147 if (recno == 0) recno = RREF_ANY;
4148 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4149 PUT2(code, 2+LINK_SIZE, recno);
4150 }
4151
4152 /* Similarly, check for the (?(DEFINE) "condition", which is always
4153 false. */
4154
4155 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4156 {
4157 code[1+LINK_SIZE] = OP_DEF;
4158 skipbytes = 1;
4159 }
4160
4161 /* Check for the "name" actually being a subpattern number. */
4162
4163 else if (recno > 0)
4164 {
4165 PUT2(code, 2+LINK_SIZE, recno);
4166 }
4167
4168 /* Either an unidentified subpattern, or a reference to (?(0) */
4169
4170 else
4171 {
4172 *errorcodeptr = (recno == 0)? ERR35: ERR15;
4173 goto FAILED;
4174 }
4175 break;
4176
4177
4178 /* ------------------------------------------------------------ */
4179 case '=': /* Positive lookahead */
4180 bravalue = OP_ASSERT;
4181 ptr++;
4182 break;
4183
4184
4185 /* ------------------------------------------------------------ */
4186 case '!': /* Negative lookahead */
4187 ptr++;
4188 if (*ptr == ')') /* Optimize (?!) */
4189 {
4190 *code++ = OP_FAIL;
4191 previous = NULL;
4192 continue;
4193 }
4194 bravalue = OP_ASSERT_NOT;
4195 break;
4196
4197
4198 /* ------------------------------------------------------------ */
4199 case '<': /* Lookbehind or named define */
4200 switch (ptr[1])
4201 {
4202 case '=': /* Positive lookbehind */
4203 bravalue = OP_ASSERTBACK;
4204 ptr += 2;
4205 break;
4206
4207 case '!': /* Negative lookbehind */
4208 bravalue = OP_ASSERTBACK_NOT;
4209 ptr += 2;
4210 break;
4211
4212 default: /* Could be name define, else bad */
4213 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4214 ptr++; /* Correct offset for error */
4215 *errorcodeptr = ERR24;
4216 goto FAILED;
4217 }
4218 break;
4219
4220
4221 /* ------------------------------------------------------------ */
4222 case '>': /* One-time brackets */
4223 bravalue = OP_ONCE;
4224 ptr++;
4225 break;
4226
4227
4228 /* ------------------------------------------------------------ */
4229 case 'C': /* Callout - may be followed by digits; */
4230 previous_callout = code; /* Save for later completion */
4231 after_manual_callout = 1; /* Skip one item before completing */
4232 *code++ = OP_CALLOUT;
4233 {
4234 int n = 0;
4235 while ((digitab[*(++ptr)] & ctype_digit) != 0)
4236 n = n * 10 + *ptr - '0';
4237 if (*ptr != ')')
4238 {
4239 *errorcodeptr = ERR39;
4240 goto FAILED;
4241 }
4242 if (n > 255)
4243 {
4244 *errorcodeptr = ERR38;
4245 goto FAILED;
4246 }
4247 *code++ = n;
4248 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4249 PUT(code, LINK_SIZE, 0); /* Default length */
4250 code += 2 * LINK_SIZE;
4251 }
4252 previous = NULL;
4253 continue;
4254
4255
4256 /* ------------------------------------------------------------ */
4257 case 'P': /* Python-style named subpattern handling */
4258 if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
4259 {
4260 is_recurse = *ptr == '>';
4261 terminator = ')';
4262 goto NAMED_REF_OR_RECURSE;
4263 }
4264 else if (*ptr != '<') /* Test for Python-style definition */
4265 {
4266 *errorcodeptr = ERR41;
4267 goto FAILED;
4268 }
4269 /* Fall through to handle (?P< as (?< is handled */
4270
4271
4272 /* ------------------------------------------------------------ */
4273 DEFINE_NAME: /* Come here from (?< handling */
4274 case '\'':
4275 {
4276 terminator = (*ptr == '<')? '>' : '\'';
4277 name = ++ptr;
4278
4279 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4280 namelen = ptr - name;
4281
4282 /* In the pre-compile phase, just do a syntax check. */
4283
4284 if (lengthptr != NULL)
4285 {
4286 if (*ptr != terminator)
4287 {
4288 *errorcodeptr = ERR42;
4289 goto FAILED;
4290 }
4291 if (cd->names_found >= MAX_NAME_COUNT)
4292 {
4293 *errorcodeptr = ERR49;
4294 goto FAILED;
4295 }
4296 if (namelen + 3 > cd->name_entry_size)
4297 {
4298 cd->name_entry_size = namelen + 3;
4299 if (namelen > MAX_NAME_SIZE)
4300 {
4301 *errorcodeptr = ERR48;
4302 goto FAILED;
4303 }
4304 }
4305 }
4306
4307 /* In the real compile, create the entry in the table */
4308
4309 else
4310 {
4311 slot = cd->name_table;
4312 for (i = 0; i < cd->names_found; i++)
4313 {
4314 int crc = memcmp(name, slot+2, namelen);
4315 if (crc == 0)
4316 {
4317 if (slot[2+namelen] == 0)
4318 {
4319 if ((options & PCRE_DUPNAMES) == 0)
4320 {
4321 *errorcodeptr = ERR43;
4322 goto FAILED;
4323 }
4324 }
4325 else crc = -1; /* Current name is substring */
4326 }
4327 if (crc < 0)
4328 {
4329 memmove(slot + cd->name_entry_size, slot,
4330 (cd->names_found - i) * cd->name_entry_size);
4331 break;
4332 }
4333 slot += cd->name_entry_size;
4334 }
4335
4336 PUT2(slot, 0, cd->bracount + 1);
4337 memcpy(slot + 2, name, namelen);
4338 slot[2+namelen] = 0;
4339 }
4340 }
4341
4342 /* In both cases, count the number of names we've encountered. */
4343
4344 ptr++; /* Move past > or ' */
4345 cd->names_found++;
4346 goto NUMBERED_GROUP;
4347
4348
4349 /* ------------------------------------------------------------ */
4350 case '&': /* Perl recursion/subroutine syntax */
4351 terminator = ')';
4352 is_recurse = TRUE;
4353 /* Fall through */
4354
4355 /* We come here from the Python syntax above that handles both
4356 references (?P=name) and recursion (?P>name), as well as falling
4357 through from the Perl recursion syntax (?&name). */
4358
4359 NAMED_REF_OR_RECURSE:
4360 name = ++ptr;
4361 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4362 namelen = ptr - name;
4363
4364 /* In the pre-compile phase, do a syntax check and set a dummy
4365 reference number. */
4366
4367 if (lengthptr != NULL)
4368 {
4369 if (*ptr != terminator)
4370 {
4371 *errorcodeptr = ERR42;
4372 goto FAILED;
4373 }
4374 if (namelen > MAX_NAME_SIZE)
4375 {
4376 *errorcodeptr = ERR48;
4377 goto FAILED;
4378 }
4379 recno = 0;
4380 }
4381
4382 /* In the real compile, seek the name in the table */
4383
4384 else
4385 {
4386 slot = cd->name_table;
4387 for (i = 0; i < cd->names_found; i++)
4388 {
4389 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4390 slot += cd->name_entry_size;
4391 }
4392
4393 if (i < cd->names_found) /* Back reference */
4394 {
4395 recno = GET2(slot, 0);
4396 }
4397 else if ((recno = /* Forward back reference */
4398 find_parens(ptr, cd->bracount, name, namelen,
4399 (options & PCRE_EXTENDED) != 0)) <= 0)
4400 {
4401 *errorcodeptr = ERR15;
4402 goto FAILED;
4403 }
4404 }
4405
4406 /* In both phases, we can now go to the code that handles numerical
4407 recursion or backreferences. */
4408
4409 if (is_recurse) goto HANDLE_RECURSION;
4410 else goto HANDLE_REFERENCE;
4411
4412
4413 /* ------------------------------------------------------------ */
4414 case 'R': /* Recursion */
4415 ptr++; /* Same as (?0) */
4416 /* Fall through */
4417
4418
4419 /* ------------------------------------------------------------ */
4420 case '-': case '+':
4421 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4422 case '5': case '6': case '7': case '8': case '9': /* subroutine */
4423 {
4424 const uschar *called;
4425
4426 if ((refsign = *ptr) == '+') ptr++;
4427 else if (refsign == '-')
4428 {
4429 if ((digitab[ptr[1]] & ctype_digit) == 0)
4430 goto OTHER_CHAR_AFTER_QUERY;
4431 ptr++;
4432 }
4433
4434 recno = 0;
4435 while((digitab[*ptr] & ctype_digit) != 0)
4436 recno = recno * 10 + *ptr++ - '0';
4437
4438 if (*ptr != ')')
4439 {
4440 *errorcodeptr = ERR29;
4441 goto FAILED;
4442 }
4443
4444 if (refsign == '-')
4445 {
4446 if (recno == 0)
4447 {
4448 *errorcodeptr = ERR58;
4449 goto FAILED;
4450 }
4451 recno = cd->bracount - recno + 1;
4452 if (recno <= 0)
4453 {
4454 *errorcodeptr = ERR15;
4455 goto FAILED;
4456 }
4457 }
4458 else if (refsign == '+')
4459 {
4460 if (recno == 0)
4461 {
4462 *errorcodeptr = ERR58;
4463 goto FAILED;
4464 }
4465 recno += cd->bracount;
4466 }
4467
4468 /* Come here from code above that handles a named recursion */
4469
4470 HANDLE_RECURSION:
4471
4472 previous = code;
4473 called = cd->start_code;
4474
4475 /* When we are actually compiling, find the bracket that is being
4476 referenced. Temporarily end the regex in case it doesn't exist before
4477 this point. If we end up with a forward reference, first check that
4478 the bracket does occur later so we can give the error (and position)
4479 now. Then remember this forward reference in the workspace so it can
4480 be filled in at the end. */
4481
4482 if (lengthptr == NULL)
4483 {
4484 *code = OP_END;
4485 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4486
4487 /* Forward reference */
4488
4489 if (called == NULL)
4490 {
4491 if (find_parens(ptr, cd->bracount, NULL, recno,
4492 (options & PCRE_EXTENDED) != 0) < 0)
4493 {
4494 *errorcodeptr = ERR15;
4495 goto FAILED;
4496 }
4497 called = cd->start_code + recno;
4498 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4499 }
4500
4501 /* If not a forward reference, and the subpattern is still open,
4502 this is a recursive call. We check to see if this is a left
4503 recursion that could loop for ever, and diagnose that case. */
4504
4505 else if (GET(called, 1) == 0 &&
4506 could_be_empty(called, code, bcptr, utf8))
4507 {
4508 *errorcodeptr = ERR40;
4509 goto FAILED;
4510 }
4511 }
4512
4513 /* Insert the recursion/subroutine item, automatically wrapped inside
4514 "once" brackets. Set up a "previous group" length so that a
4515 subsequent quantifier will work. */
4516
4517 *code = OP_ONCE;
4518 PUT(code, 1, 2 + 2*LINK_SIZE);
4519 code += 1 + LINK_SIZE;
4520
4521 *code = OP_RECURSE;
4522 PUT(code, 1, called - cd->start_code);
4523 code += 1 + LINK_SIZE;
4524
4525 *code = OP_KET;
4526 PUT(code, 1, 2 + 2*LINK_SIZE);
4527 code += 1 + LINK_SIZE;
4528
4529 length_prevgroup = 3 + 3*LINK_SIZE;
4530 }
4531
4532 /* Can't determine a first byte now */
4533
4534 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4535 continue;
4536
4537
4538 /* ------------------------------------------------------------ */
4539 default: /* Other characters: check option setting */
4540 OTHER_CHAR_AFTER_QUERY:
4541 set = unset = 0;
4542 optset = &set;
4543
4544 while (*ptr != ')' && *ptr != ':')
4545 {
4546 switch (*ptr++)
4547 {
4548 case '-': optset = &unset; break;
4549
4550 case 'J': /* Record that it changed in the external options */
4551 *optset |= PCRE_DUPNAMES;
4552 cd->external_options |= PCRE_JCHANGED;
4553 break;
4554
4555 case 'i': *optset |= PCRE_CASELESS; break;
4556 case 'm': *optset |= PCRE_MULTILINE; break;
4557 case 's': *optset |= PCRE_DOTALL; break;
4558 case 'x': *optset |= PCRE_EXTENDED; break;
4559 case 'U': *optset |= PCRE_UNGREEDY; break;
4560 case 'X': *optset |= PCRE_EXTRA; break;
4561
4562 default: *errorcodeptr = ERR12;
4563 ptr--; /* Correct the offset */
4564 goto FAILED;
4565 }
4566 }
4567
4568 /* Set up the changed option bits, but don't change anything yet. */
4569
4570 newoptions = (options | set) & (~unset);
4571
4572 /* If the options ended with ')' this is not the start of a nested
4573 group with option changes, so the options change at this level. If this
4574 item is right at the start of the pattern, the options can be
4575 abstracted and made external in the pre-compile phase, and ignored in
4576 the compile phase. This can be helpful when matching -- for instance in
4577 caseless checking of required bytes.
4578
4579 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4580 definitely *not* at the start of the pattern because something has been
4581 compiled. In the pre-compile phase, however, the code pointer can have
4582 that value after the start, because it gets reset as code is discarded
4583 during the pre-compile. However, this can happen only at top level - if
4584 we are within parentheses, the starting BRA will still be present. At
4585 any parenthesis level, the length value can be used to test if anything
4586 has been compiled at that level. Thus, a test for both these conditions
4587 is necessary to ensure we correctly detect the start of the pattern in
4588 both phases.
4589
4590 If we are not at the pattern start, compile code to change the ims
4591 options if this setting actually changes any of them. We also pass the
4592 new setting back so that it can be put at the start of any following
4593 branches, and when this group ends (if we are in a group), a resetting
4594 item can be compiled. */
4595
4596 if (*ptr == ')')
4597 {
4598 if (code == cd->start_code + 1 + LINK_SIZE &&
4599 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4600 {
4601 cd->external_options = newoptions;
4602 options = newoptions;
4603 }
4604 else
4605 {
4606 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4607 {
4608 *code++ = OP_OPT;
4609 *code++ = newoptions & PCRE_IMS;
4610 }
4611
4612 /* Change options at this level, and pass them back for use
4613 in subsequent branches. Reset the greedy defaults and the case
4614 value for firstbyte and reqbyte. */
4615
4616 *optionsptr = options = newoptions;
4617 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4618 greedy_non_default = greedy_default ^ 1;
4619 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4620 }
4621
4622 previous = NULL; /* This item can't be repeated */
4623 continue; /* It is complete */
4624 }
4625
4626 /* If the options ended with ':' we are heading into a nested group
4627 with possible change of options. Such groups are non-capturing and are
4628 not assertions of any kind. All we need to do is skip over the ':';
4629 the newoptions value is handled below. */
4630
4631 bravalue = OP_BRA;
4632 ptr++;
4633 } /* End of switch for character following (? */
4634 } /* End of (? handling */
4635
4636 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4637 all unadorned brackets become non-capturing and behave like (?:...)
4638 brackets. */
4639
4640 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4641 {
4642 bravalue = OP_BRA;
4643 }
4644
4645 /* Else we have a capturing group. */
4646
4647 else
4648 {
4649 NUMBERED_GROUP:
4650 cd->bracount += 1;
4651 PUT2(code, 1+LINK_SIZE, cd->bracount);
4652 skipbytes = 2;
4653 }
4654
4655 /* Process nested bracketed regex. Assertions may not be repeated, but
4656 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4657 non-register variable in order to be able to pass its address because some
4658 compilers complain otherwise. Pass in a new setting for the ims options if
4659 they have changed. */
4660
4661 previous = (bravalue >= OP_ONCE)? code : NULL;
4662 *code = bravalue;
4663 tempcode = code;
4664 tempreqvary = cd->req_varyopt; /* Save value before bracket */
4665 length_prevgroup = 0; /* Initialize for pre-compile phase */
4666
4667 if (!compile_regex(
4668 newoptions, /* The complete new option state */
4669 options & PCRE_IMS, /* The previous ims option state */
4670 &tempcode, /* Where to put code (updated) */
4671 &ptr, /* Input pointer (updated) */
4672 errorcodeptr, /* Where to put an error message */
4673 (bravalue == OP_ASSERTBACK ||
4674 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4675 reset_bracount, /* True if (?| group */
4676 skipbytes, /* Skip over bracket number */
4677 &subfirstbyte, /* For possible first char */
4678 &subreqbyte, /* For possible last char */
4679 bcptr, /* Current branch chain */
4680 cd, /* Tables block */
4681 (lengthptr == NULL)? NULL : /* Actual compile phase */
4682 &length_prevgroup /* Pre-compile phase */
4683 ))
4684 goto FAILED;
4685
4686 /* At the end of compiling, code is still pointing to the start of the
4687 group, while tempcode has been updated to point past the end of the group
4688 and any option resetting that may follow it. The pattern pointer (ptr)
4689 is on the bracket. */
4690
4691 /* If this is a conditional bracket, check that there are no more than
4692 two branches in the group, or just one if it's a DEFINE group. We do this
4693 in the real compile phase, not in the pre-pass, where the whole group may
4694 not be available. */
4695
4696 if (bravalue == OP_COND && lengthptr == NULL)
4697 {
4698 uschar *tc = code;
4699 int condcount = 0;
4700
4701 do {
4702 condcount++;
4703 tc += GET(tc,1);
4704 }
4705 while (*tc != OP_KET);
4706
4707 /* A DEFINE group is never obeyed inline (the "condition" is always
4708 false). It must have only one branch. */
4709
4710 if (code[LINK_SIZE+1] == OP_DEF)
4711 {
4712 if (condcount > 1)
4713 {
4714 *errorcodeptr = ERR54;
4715 goto FAILED;
4716 }
4717 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
4718 }
4719
4720 /* A "normal" conditional group. If there is just one branch, we must not
4721 make use of its firstbyte or reqbyte, because this is equivalent to an
4722 empty second branch. */
4723
4724 else
4725 {
4726 if (condcount > 2)
4727 {
4728 *errorcodeptr = ERR27;
4729 goto FAILED;
4730 }
4731 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4732 }
4733 }
4734
4735 /* Error if hit end of pattern */
4736
4737 if (*ptr != ')')
4738 {
4739 *errorcodeptr = ERR14;
4740 goto FAILED;
4741 }
4742
4743 /* In the pre-compile phase, update the length by the length of the nested
4744 group, less the brackets at either end. Then reduce the compiled code to
4745 just the brackets so that it doesn't use much memory if it is duplicated by
4746 a quantifier. */
4747
4748 if (lengthptr != NULL)
4749 {
4750 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
4751 {
4752 *errorcodeptr = ERR20;
4753 goto FAILED;
4754 }
4755 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4756 code++;
4757 PUTINC(code, 0, 1 + LINK_SIZE);
4758 *code++ = OP_KET;
4759 PUTINC(code, 0, 1 + LINK_SIZE);
4760 }
4761
4762 /* Otherwise update the main code pointer to the end of the group. */
4763
4764 else code = tempcode;
4765
4766 /* For a DEFINE group, required and first character settings are not
4767 relevant. */
4768
4769 if (bravalue == OP_DEF) break;
4770
4771 /* Handle updating of the required and first characters for other types of
4772 group. Update for normal brackets of all kinds, and conditions with two
4773 branches (see code above). If the bracket is followed by a quantifier with
4774 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4775 zerofirstbyte outside the main loop so that they can be accessed for the
4776 back off. */
4777
4778 zeroreqbyte = reqbyte;
4779 zerofirstbyte = firstbyte;
4780 groupsetfirstbyte = FALSE;
4781
4782 if (bravalue >= OP_ONCE)
4783 {
4784 /* If we have not yet set a firstbyte in this branch, take it from the
4785 subpattern, remembering that it was set here so that a repeat of more
4786 than one can replicate it as reqbyte if necessary. If the subpattern has
4787 no firstbyte, set "none" for the whole branch. In both cases, a zero
4788 repeat forces firstbyte to "none". */
4789
4790 if (firstbyte == REQ_UNSET)
4791 {
4792 if (subfirstbyte >= 0)
4793 {
4794 firstbyte = subfirstbyte;
4795 groupsetfirstbyte = TRUE;
4796 }
4797 else firstbyte = REQ_NONE;
4798 zerofirstbyte = REQ_NONE;
4799 }
4800
4801 /* If firstbyte was previously set, convert the subpattern's firstbyte
4802 into reqbyte if there wasn't one, using the vary flag that was in
4803 existence beforehand. */
4804
4805 else if (subfirstbyte >= 0 && subreqbyte < 0)
4806 subreqbyte = subfirstbyte | tempreqvary;
4807
4808 /* If the subpattern set a required byte (or set a first byte that isn't
4809 really the first byte - see above), set it. */
4810
4811 if (subreqbyte >= 0) reqbyte = subreqbyte;
4812 }
4813
4814 /* For a forward assertion, we take the reqbyte, if set. This can be
4815 helpful if the pattern that follows the assertion doesn't set a different
4816 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
4817 for an assertion, however, because it leads to incorrect effect for patterns
4818 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
4819 of a firstbyte. This is overcome by a scan at the end if there's no
4820 firstbyte, looking for an asserted first char. */
4821
4822 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4823 break; /* End of processing '(' */
4824
4825
4826 /* ===================================================================*/
4827 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
4828 are arranged to be the negation of the corresponding OP_values. For the
4829 back references, the values are ESC_REF plus the reference number. Only
4830 back references and those types that consume a character may be repeated.
4831 We can test for values between ESC_b and ESC_Z for the latter; this may
4832 have to change if any new ones are ever created. */
4833
4834 case '\\':
4835 tempptr = ptr;
4836 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
4837 if (*errorcodeptr != 0) goto FAILED;
4838
4839 if (c < 0)
4840 {
4841 if (-c == ESC_Q) /* Handle start of quoted string */
4842 {
4843 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
4844 else inescq = TRUE;
4845 continue;
4846 }
4847
4848 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
4849
4850 /* For metasequences that actually match a character, we disable the
4851 setting of a first character if it hasn't already been set. */
4852
4853 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
4854 firstbyte = REQ_NONE;
4855
4856 /* Set values to reset to if this is followed by a zero repeat. */
4857
4858 zerofirstbyte = firstbyte;
4859 zeroreqbyte = reqbyte;
4860
4861 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
4862 We also support \k{name} (.NET syntax) */
4863
4864 if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
4865 {
4866 is_recurse = FALSE;
4867 terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
4868 goto NAMED_REF_OR_RECURSE;
4869 }
4870
4871 /* Back references are handled specially; must disable firstbyte if
4872 not set to cope with cases like (?=(\w+))\1: which would otherwise set
4873 ':' later. */
4874
4875 if (-c >= ESC_REF)
4876 {
4877 recno = -c - ESC_REF;
4878
4879 HANDLE_REFERENCE: /* Come here from named backref handling */
4880 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4881 previous = code;
4882 *code++ = OP_REF;
4883 PUT2INC(code, 0, recno);
4884 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
4885 if (recno > cd->top_backref) cd->top_backref = recno;
4886 }
4887
4888 /* So are Unicode property matches, if supported. */
4889
4890 #ifdef SUPPORT_UCP
4891 else if (-c == ESC_P || -c == ESC_p)
4892 {
4893 BOOL negated;
4894 int pdata;
4895 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4896 if (ptype < 0) goto FAILED;
4897 previous = code;
4898 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
4899 *code++ = ptype;
4900 *code++ = pdata;
4901 }
4902 #else
4903
4904 /* If Unicode properties are not supported, \X, \P, and \p are not
4905 allowed. */
4906
4907 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
4908 {
4909 *errorcodeptr = ERR45;
4910 goto FAILED;
4911 }
4912 #endif
4913
4914 /* For the rest (including \X when Unicode properties are supported), we
4915 can obtain the OP value by negating the escape value. */
4916
4917 else
4918 {
4919 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
4920 *code++ = -c;
4921 }
4922 continue;
4923 }
4924
4925 /* We have a data character whose value is in c. In UTF-8 mode it may have
4926 a value > 127. We set its representation in the length/buffer, and then
4927 handle it as a data character. */
4928
4929 #ifdef SUPPORT_UTF8
4930 if (utf8 && c > 127)
4931 mclength = _pcre_ord2utf8(c, mcbuffer);
4932 else
4933 #endif
4934
4935 {
4936 mcbuffer[0] = c;
4937 mclength = 1;
4938 }
4939 goto ONE_CHAR;
4940
4941
4942 /* ===================================================================*/
4943 /* Handle a literal character. It is guaranteed not to be whitespace or #
4944 when the extended flag is set. If we are in UTF-8 mode, it may be a
4945 multi-byte literal character. */
4946
4947 default:
4948 NORMAL_CHAR:
4949 mclength = 1;
4950 mcbuffer[0] = c;
4951
4952 #ifdef SUPPORT_UTF8
4953 if (utf8 && c >= 0xc0)
4954 {
4955 while ((ptr[1] & 0xc0) == 0x80)
4956 mcbuffer[mclength++] = *(++ptr);
4957 }
4958 #endif
4959
4960 /* At this point we have the character's bytes in mcbuffer, and the length
4961 in mclength. When not in UTF-8 mode, the length is always 1. */
4962
4963 ONE_CHAR:
4964 previous = code;
4965 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
4966 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
4967
4968 /* Set the first and required bytes appropriately. If no previous first
4969 byte, set it from this character, but revert to none on a zero repeat.
4970 Otherwise, leave the firstbyte value alone, and don't change it on a zero
4971 repeat. */
4972
4973 if (firstbyte == REQ_UNSET)
4974 {
4975 zerofirstbyte = REQ_NONE;
4976 zeroreqbyte = reqbyte;
4977
4978 /* If the character is more than one byte long, we can set firstbyte
4979 only if it is not to be matched caselessly. */
4980
4981 if (mclength == 1 || req_caseopt == 0)
4982 {
4983 firstbyte = mcbuffer[0] | req_caseopt;
4984 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
4985 }
4986 else firstbyte = reqbyte = REQ_NONE;
4987 }
4988
4989 /* firstbyte was previously set; we can set reqbyte only if the length is
4990 1 or the matching is caseful. */
4991
4992 else
4993 {
4994 zerofirstbyte = firstbyte;
4995 zeroreqbyte = reqbyte;
4996 if (mclength == 1 || req_caseopt == 0)
4997 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
4998 }
4999
5000 break; /* End of literal character handling */
5001 }
5002 } /* end of big loop */
5003
5004
5005 /* Control never reaches here by falling through, only by a goto for all the
5006 error states. Pass back the position in the pattern so that it can be displayed
5007 to the user for diagnosing the error. */
5008
5009 FAILED:
5010 *ptrptr = ptr;
5011 return FALSE;
5012 }
5013
5014
5015
5016
5017 /*************************************************
5018 * Compile sequence of alternatives *
5019 *************************************************/
5020
5021 /* On entry, ptr is pointing past the bracket character, but on return it
5022 points to the closing bracket, or vertical bar, or end of string. The code
5023 variable is pointing at the byte into which the BRA operator has been stored.
5024 If the ims options are changed at the start (for a (?ims: group) or during any
5025 branch, we need to insert an OP_OPT item at the start of every following branch
5026 to ensure they get set correctly at run time, and also pass the new options
5027 into every subsequent branch compile.
5028
5029 This function is used during the pre-compile phase when we are trying to find
5030 out the amount of memory needed, as well as during the real compile phase. The
5031 value of lengthptr distinguishes the two phases.
5032
5033 Arguments:
5034 options option bits, including any changes for this subpattern
5035 oldims previous settings of ims option bits
5036 codeptr -> the address of the current code pointer
5037 ptrptr -> the address of the current pattern pointer
5038 errorcodeptr -> pointer to error code variable
5039 lookbehind TRUE if this is a lookbehind assertion
5040 reset_bracount TRUE to reset the count for each branch
5041 skipbytes skip this many bytes at start (for brackets and OP_COND)
5042 firstbyteptr place to put the first required character, or a negative number
5043 reqbyteptr place to put the last required character, or a negative number
5044 bcptr pointer to the chain of currently open branches
5045 cd points to the data block with tables pointers etc.
5046 lengthptr NULL during the real compile phase
5047 points to length accumulator during pre-compile phase
5048
5049 Returns: TRUE on success
5050 */
5051
5052 static BOOL
5053 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5054 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5055 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5056 int *lengthptr)
5057 {
5058 const uschar *ptr = *ptrptr;
5059 uschar *code = *codeptr;
5060 uschar *last_branch = code;
5061 uschar *start_bracket = code;
5062 uschar *reverse_count = NULL;
5063 int firstbyte, reqbyte;
5064 int branchfirstbyte, branchreqbyte;
5065 int length;
5066 int orig_bracount;
5067 int max_bracount;
5068 branch_chain bc;
5069
5070 bc.outer = bcptr;
5071 bc.current = code;
5072
5073 firstbyte = reqbyte = REQ_UNSET;
5074
5075 /* Accumulate the length for use in the pre-compile phase. Start with the
5076 length of the BRA and KET and any extra bytes that are required at the
5077 beginning. We accumulate in a local variable to save frequent testing of
5078 lengthptr for NULL. We cannot do this by looking at the value of code at the
5079 start and end of each alternative, because compiled items are discarded during
5080 the pre-compile phase so that the work space is not exceeded. */
5081
5082 length = 2 + 2*LINK_SIZE + skipbytes;
5083
5084 /* WARNING: If the above line is changed for any reason, you must also change
5085 the code that abstracts option settings at the start of the pattern and makes
5086 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5087 pre-compile phase to find out whether anything has yet been compiled or not. */
5088
5089 /* Offset is set zero to mark that this bracket is still open */
5090
5091 PUT(code, 1, 0);
5092 code += 1 + LINK_SIZE + skipbytes;
5093
5094 /* Loop for each alternative branch */
5095
5096 orig_bracount = max_bracount = cd->bracount;
5097 for (;;)
5098 {
5099 /* For a (?| group, reset the capturing bracket count so that each branch
5100 uses the same numbers. */
5101
5102 if (reset_bracount) cd->bracount = orig_bracount;
5103
5104 /* Handle a change of ims options at the start of the branch */
5105
5106 if ((options & PCRE_IMS) != oldims)
5107 {
5108 *code++ = OP_OPT;
5109 *code++ = options & PCRE_IMS;
5110 length += 2;
5111 }
5112
5113 /* Set up dummy OP_REVERSE if lookbehind assertion */
5114
5115 if (lookbehind)
5116 {
5117 *code++ = OP_REVERSE;
5118 reverse_count = code;
5119 PUTINC(code, 0, 0);
5120 length += 1 + LINK_SIZE;
5121 }
5122
5123 /* Now compile the branch; in the pre-compile phase its length gets added
5124 into the length. */
5125
5126 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5127 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5128 {
5129 *ptrptr = ptr;
5130 return FALSE;
5131 }
5132
5133 /* Keep the highest bracket count in case (?| was used and some branch
5134 has fewer than the rest. */
5135
5136 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5137
5138 /* In the real compile phase, there is some post-processing to be done. */
5139
5140 if (lengthptr == NULL)
5141 {
5142 /* If this is the first branch, the firstbyte and reqbyte values for the
5143 branch become the values for the regex. */
5144
5145 if (*last_branch != OP_ALT)
5146 {
5147 firstbyte = branchfirstbyte;
5148 reqbyte = branchreqbyte;
5149 }
5150
5151 /* If this is not the first branch, the first char and reqbyte have to
5152 match the values from all the previous branches, except that if the
5153 previous value for reqbyte didn't have REQ_VARY set, it can still match,
5154 and we set REQ_VARY for the regex. */
5155
5156 else
5157 {
5158 /* If we previously had a firstbyte, but it doesn't match the new branch,
5159 we have to abandon the firstbyte for the regex, but if there was
5160 previously no reqbyte, it takes on the value of the old firstbyte. */
5161
5162 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5163 {
5164 if (reqbyte < 0) reqbyte = firstbyte;
5165 firstbyte = REQ_NONE;
5166 }
5167
5168 /* If we (now or from before) have no firstbyte, a firstbyte from the
5169 branch becomes a reqbyte if there isn't a branch reqbyte. */
5170
5171 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5172 branchreqbyte = branchfirstbyte;
5173
5174 /* Now ensure that the reqbytes match */
5175
5176 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5177 reqbyte = REQ_NONE;
5178 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
5179 }
5180
5181 /* If lookbehind, check that this branch matches a fixed-length string, and
5182 put the length into the OP_REVERSE item. Temporarily mark the end of the
5183 branch with OP_END. */
5184
5185 if (lookbehind)
5186 {
5187 int fixed_length;
5188 *code = OP_END;
5189 fixed_length = find_fixedlength(last_branch, options);
5190 DPRINTF(("fixed length = %d\n", fixed_length));
5191 if (fixed_length < 0)
5192 {
5193 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5194 *ptrptr = ptr;
5195 return FALSE;
5196 }
5197 PUT(reverse_count, 0, fixed_length);
5198 }
5199 }
5200
5201 /* Reached end of expression, either ')' or end of pattern. In the real
5202 compile phase, go back through the alternative branches and reverse the chain
5203 of offsets, with the field in the BRA item now becoming an offset to the
5204 first alternative. If there are no alternatives, it points to the end of the
5205 group. The length in the terminating ket is always the length of the whole
5206 bracketed item. If any of the ims options were changed inside the group,
5207 compile a resetting op-code following, except at the very end of the pattern.
5208 Return leaving the pointer at the terminating char. */
5209
5210 if (*ptr != '|')
5211 {
5212 if (lengthptr == NULL)
5213 {
5214 int branch_length = code - last_branch;
5215 do
5216 {
5217 int prev_length = GET(last_branch, 1);
5218 PUT(last_branch, 1, branch_length);
5219 branch_length = prev_length;
5220 last_branch -= branch_length;
5221 }
5222 while (branch_length > 0);
5223 }
5224
5225 /* Fill in the ket */
5226
5227 *code = OP_KET;
5228 PUT(code, 1, code - start_bracket);
5229 code += 1 + LINK_SIZE;
5230
5231 /* Resetting option if needed */
5232
5233 if ((options & PCRE_IMS) != oldims && *ptr == ')')
5234 {
5235 *code++ = OP_OPT;
5236 *code++ = oldims;
5237 length += 2;
5238 }
5239
5240 /* Retain the highest bracket number, in case resetting was used. */
5241
5242 cd->bracount = max_bracount;
5243
5244 /* Set values to pass back */
5245
5246 *codeptr = code;
5247 *ptrptr = ptr;
5248 *firstbyteptr = firstbyte;
5249 *reqbyteptr = reqbyte;
5250 if (lengthptr != NULL)
5251 {
5252 if (OFLOW_MAX - *lengthptr < length)
5253 {
5254 *errorcodeptr = ERR20;
5255 return FALSE;
5256 }
5257 *lengthptr += length;
5258 }
5259 return TRUE;
5260 }
5261
5262 /* Another branch follows. In the pre-compile phase, we can move the code
5263 pointer back to where it was for the start of the first branch. (That is,
5264 pretend that each branch is the only one.)
5265
5266 In the real compile phase, insert an ALT node. Its length field points back
5267 to the previous branch while the bracket remains open. At the end the chain
5268 is reversed. It's done like this so that the start of the bracket has a
5269 zero offset until it is closed, making it possible to detect recursion. */
5270
5271 if (lengthptr != NULL)
5272 {
5273 code = *codeptr + 1 + LINK_SIZE + skipbytes;
5274 length += 1 + LINK_SIZE;
5275 }
5276 else
5277 {
5278 *code = OP_ALT;
5279 PUT(code, 1, code - last_branch);
5280 bc.current = last_branch = code;
5281 code += 1 + LINK_SIZE;
5282 }
5283
5284 ptr++;
5285 }
5286 /* Control never reaches here */
5287 }
5288
5289
5290
5291
5292 /*************************************************
5293 * Check for anchored expression *
5294 *************************************************/
5295
5296 /* Try to find out if this is an anchored regular expression. Consider each
5297 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5298 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5299 it's anchored. However, if this is a multiline pattern, then only OP_SOD
5300 counts, since OP_CIRC can match in the middle.
5301
5302 We can also consider a regex to be anchored if OP_SOM starts all its branches.
5303 This is the code for \G, which means "match at start of match position, taking
5304 into account the match offset".
5305
5306 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5307 because that will try the rest of the pattern at all possible matching points,
5308 so there is no point trying again.... er ....
5309
5310 .... except when the .* appears inside capturing parentheses, and there is a
5311 subsequent back reference to those parentheses. We haven't enough information
5312 to catch that case precisely.
5313
5314 At first, the best we could do was to detect when .* was in capturing brackets
5315 and the highest back reference was greater than or equal to that level.
5316 However, by keeping a bitmap of the first 31 back references, we can catch some
5317 of the more common cases more precisely.
5318
5319 Arguments:
5320 code points to start of expression (the bracket)
5321 options points to the options setting
5322 bracket_map a bitmap of which brackets we are inside while testing; this
5323 handles up to substring 31; after that we just have to take
5324 the less precise approach
5325 backref_map the back reference bitmap
5326
5327 Returns: TRUE or FALSE
5328 */
5329
5330 static BOOL
5331 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
5332 unsigned int backref_map)
5333 {
5334 do {
5335 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5336 options, PCRE_MULTILINE, FALSE);
5337 register int op = *scode;
5338
5339 /* Non-capturing brackets */
5340
5341 if (op == OP_BRA)
5342 {
5343 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5344 }
5345
5346 /* Capturing brackets */
5347
5348 else if (op == OP_CBRA)
5349 {
5350 int n = GET2(scode, 1+LINK_SIZE);
5351 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5352 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
5353 }
5354
5355 /* Other brackets */
5356
5357 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5358 {
5359 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5360 }
5361
5362 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
5363 are or may be referenced. */
5364
5365 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5366 op == OP_TYPEPOSSTAR) &&
5367 (*options & PCRE_DOTALL) != 0)
5368 {
5369 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5370 }
5371
5372 /* Check for explicit anchoring */
5373
5374 else if (op != OP_SOD && op != OP_SOM &&
5375 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
5376 return FALSE;
5377 code += GET(code, 1);
5378 }
5379 while (*code == OP_ALT); /* Loop for each alternative */
5380 return TRUE;
5381 }
5382
5383
5384
5385 /*************************************************
5386 * Check for starting with ^ or .* *
5387 *************************************************/
5388
5389 /* This is called to find out if every branch starts with ^ or .* so that
5390 "first char" processing can be done to speed things up in multiline
5391 matching and for non-DOTALL patterns that start with .* (which must start at
5392 the beginning or after \n). As in the case of is_anchored() (see above), we
5393 have to take account of back references to capturing brackets that contain .*
5394 because in that case we can't make the assumption.
5395
5396 Arguments:
5397 code points to start of expression (the bracket)
5398 bracket_map a bitmap of which brackets we are inside while testing; this
5399 handles up to substring 31; after that we just have to take
5400 the less precise approach
5401 backref_map the back reference bitmap
5402
5403 Returns: TRUE or FALSE
5404 */
5405
5406 static BOOL
5407 is_startline(const uschar *code, unsigned int bracket_map,
5408 unsigned int backref_map)
5409 {
5410 do {
5411 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5412 NULL, 0, FALSE);
5413 register int op = *scode;
5414
5415 /* Non-capturing brackets */
5416
5417 if (op == OP_BRA)
5418 {
5419 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5420 }
5421
5422 /* Capturing brackets */
5423
5424 else if (op == OP_CBRA)
5425 {
5426 int n = GET2(scode, 1+LINK_SIZE);
5427 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5428 if (!is_startline(scode, new_map, backref_map)) return FALSE;
5429 }
5430
5431 /* Other brackets */
5432
5433 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5434 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5435
5436 /* .* means "start at start or after \n" if it isn't in brackets that
5437 may be referenced. */
5438
5439 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5440 {
5441 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5442 }
5443
5444 /* Check for explicit circumflex */
5445
5446 else if (op != OP_CIRC) return FALSE;
5447
5448 /* Move on to the next alternative */
5449
5450 code += GET(code, 1);
5451 }
5452 while (*code == OP_ALT); /* Loop for each alternative */
5453 return TRUE;
5454 }
5455
5456
5457
5458 /*************************************************
5459 * Check for asserted fixed first char *
5460 *************************************************/
5461
5462 /* During compilation, the "first char" settings from forward assertions are
5463 discarded, because they can cause conflicts with actual literals that follow.
5464 However, if we end up without a first char setting for an unanchored pattern,
5465 it is worth scanning the regex to see if there is an initial asserted first
5466 char. If all branches start with the same asserted char, or with a bracket all
5467 of whose alternatives start with the same asserted char (recurse ad lib), then
5468 we return that char, otherwise -1.
5469
5470 Arguments:
5471 code points to start of expression (the bracket)
5472 options pointer to the options (used to check casing changes)
5473 inassert TRUE if in an assertion
5474
5475 Returns: -1 or the fixed first char
5476 */
5477
5478 static int
5479 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
5480 {
5481 register int c = -1;
5482 do {
5483 int d;
5484 const uschar *scode =
5485 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5486 register int op = *scode;
5487
5488 switch(op)
5489 {
5490 default:
5491 return -1;
5492
5493 case OP_BRA:
5494 case OP_CBRA:
5495 case OP_ASSERT:
5496 case OP_ONCE:
5497 case OP_COND:
5498 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
5499 return -1;
5500 if (c < 0) c = d; else if (c != d) return -1;
5501 break;
5502
5503 case OP_EXACT: /* Fall through */
5504 scode += 2;
5505
5506 case OP_CHAR:
5507 case OP_CHARNC:
5508 case OP_PLUS:
5509 case OP_MINPLUS:
5510 case OP_POSPLUS:
5511 if (!inassert) return -1;
5512 if (c < 0)
5513 {
5514 c = scode[1];
5515 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
5516 }
5517 else if (c != scode[1]) return -1;
5518 break;
5519 }
5520
5521 code += GET(code, 1);
5522 }
5523 while (*code == OP_ALT);
5524 return c;
5525 }
5526
5527
5528
5529 /*************************************************
5530 * Compile a Regular Expression *
5531 *************************************************/
5532
5533 /* This function takes a string and returns a pointer to a block of store
5534 holding a compiled version of the expression. The original API for this
5535 function had no error code return variable; it is retained for backwards
5536 compatibility. The new function is given a new name.
5537
5538 Arguments:
5539 pattern the regular expression
5540 options various option bits
5541 errorcodeptr pointer to error code variable (pcre_compile2() only)
5542 can be NULL if you don't want a code value
5543 errorptr pointer to pointer to error text
5544 erroroffset ptr offset in pattern where error was detected
5545 tables pointer to character tables or NULL
5546
5547 Returns: pointer to compiled data block, or NULL on error,
5548 with errorptr and erroroffset set
5549 */
5550
5551 PCRE_EXP_DEFN pcre *
5552 pcre_compile(const char *pattern, int options, const char **errorptr,
5553 int *erroroffset, const unsigned char *tables)
5554 {
5555 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5556 }
5557
5558
5559 PCRE_EXP_DEFN pcre *
5560 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5561 const char **errorptr, int *erroroffset, const unsigned char *tables)
5562 {
5563 real_pcre *re;
5564 int length = 1; /* For final END opcode */
5565 int firstbyte, reqbyte, newline;
5566 int errorcode = 0;
5567 #ifdef SUPPORT_UTF8
5568 BOOL utf8;
5569 #endif
5570 size_t size;
5571 uschar *code;
5572 const uschar *codestart;
5573 const uschar *ptr;
5574 compile_data compile_block;
5575 compile_data *cd = &compile_block;
5576
5577 /* This space is used for "compiling" into during the first phase, when we are
5578 computing the amount of memory that is needed. Compiled items are thrown away
5579 as soon as possible, so that a fairly large buffer should be sufficient for
5580 this purpose. The same space is used in the second phase for remembering where
5581 to fill in forward references to subpatterns. */
5582
5583 uschar cworkspace[COMPILE_WORK_SIZE];
5584
5585
5586 /* Set this early so that early errors get offset 0. */
5587
5588 ptr = (const uschar *)pattern;
5589
5590 /* We can't pass back an error message if errorptr is NULL; I guess the best we
5591 can do is just return NULL, but we can set a code value if there is a code
5592 pointer. */
5593
5594 if (errorptr == NULL)
5595 {
5596 if (errorcodeptr != NULL) *errorcodeptr = 99;
5597 return NULL;
5598 }
5599
5600 *errorptr = NULL;
5601 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5602
5603 /* However, we can give a message for this error */
5604
5605 if (erroroffset == NULL)
5606 {
5607 errorcode = ERR16;
5608 goto PCRE_EARLY_ERROR_RETURN2;
5609 }
5610
5611 *erroroffset = 0;
5612
5613 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
5614
5615 #ifdef SUPPORT_UTF8
5616 utf8 = (options & PCRE_UTF8) != 0;
5617 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5618 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5619 {
5620 errorcode = ERR44;
5621 goto PCRE_EARLY_ERROR_RETURN2;
5622 }
5623 #else
5624 if ((options & PCRE_UTF8) != 0)
5625 {
5626 errorcode = ERR32;
5627 goto PCRE_EARLY_ERROR_RETURN;
5628 }
5629 #endif
5630
5631 if ((options & ~PUBLIC_OPTIONS) != 0)
5632 {
5633 errorcode = ERR17;
5634 goto PCRE_EARLY_ERROR_RETURN;
5635 }
5636
5637 /* Set up pointers to the individual character tables */
5638
5639 if (tables == NULL) tables = _pcre_default_tables;
5640 cd->lcc = tables + lcc_offset;
5641 cd->fcc = tables + fcc_offset;
5642 cd->cbits = tables + cbits_offset;
5643 cd->ctypes = tables + ctypes_offset;
5644
5645 /* Handle different types of newline. The three bits give seven cases. The
5646 current code allows for fixed one- or two-byte sequences, plus "any" and
5647 "anycrlf". */
5648
5649 switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))
5650 {
5651 case 0: newline = NEWLINE; break; /* Compile-time default */
5652 case PCRE_NEWLINE_CR: newline = '\r'; break;
5653 case PCRE_NEWLINE_LF: newline = '\n'; break;
5654 case PCRE_NEWLINE_CR+
5655 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5656 case PCRE_NEWLINE_ANY: newline = -1; break;
5657 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5658 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5659 }
5660
5661 if (newline == -2)
5662 {
5663 cd->nltype = NLTYPE_ANYCRLF;
5664 }
5665 else if (newline < 0)
5666 {
5667 cd->nltype = NLTYPE_ANY;
5668 }
5669 else
5670 {
5671 cd->nltype = NLTYPE_FIXED;
5672 if (newline > 255)
5673 {
5674 cd->nllen = 2;
5675 cd->nl[0] = (newline >> 8) & 255;
5676 cd->nl[1] = newline & 255;
5677 }
5678 else
5679 {
5680 cd->nllen = 1;
5681 cd->nl[0] = newline;
5682 }
5683 }
5684
5685 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
5686 references to help in deciding whether (.*) can be treated as anchored or not.
5687 */
5688
5689 cd->top_backref = 0;
5690 cd->backref_map = 0;
5691
5692 /* Reflect pattern for debugging output */
5693
5694 DPRINTF(("------------------------------------------------------------------\n"));
5695 DPRINTF(("%s\n", pattern));
5696
5697 /* Pretend to compile the pattern while actually just accumulating the length
5698 of memory required. This behaviour is triggered by passing a non-NULL final
5699 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
5700 to compile parts of the pattern into; the compiled code is discarded when it is
5701 no longer needed, so hopefully this workspace will never overflow, though there
5702 is a test for its doing so. */
5703
5704 cd->bracount = 0;
5705 cd->names_found = 0;
5706 cd->name_entry_size = 0;
5707 cd->name_table = NULL;
5708 cd->start_workspace = cworkspace;
5709 cd->start_code = cworkspace;
5710 cd->hwm = cworkspace;
5711 cd->start_pattern = (const uschar *)pattern;
5712 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
5713 cd->req_varyopt = 0;
5714 cd->nopartial = FALSE;
5715 cd->external_options = options;
5716
5717 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
5718 don't need to look at the result of the function here. The initial options have
5719 been put into the cd block so that they can be changed if an option setting is
5720 found within the regex right at the beginning. Bringing initial option settings
5721 outside can help speed up starting point checks. */
5722
5723 code = cworkspace;
5724 *code = OP_BRA;
5725 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
5726 &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
5727 &length);
5728 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
5729
5730 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
5731 cd->hwm - cworkspace));
5732
5733 if (length > MAX_PATTERN_SIZE)
5734 {
5735 errorcode = ERR20;
5736 goto PCRE_EARLY_ERROR_RETURN;
5737 }
5738
5739 /* Compute the size of data block needed and get it, either from malloc or
5740 externally provided function. Integer overflow should no longer be possible
5741 because nowadays we limit the maximum value of cd->names_found and
5742 cd->name_entry_size. */
5743
5744 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
5745 re = (real_pcre *)(pcre_malloc)(size);
5746
5747 if (re == NULL)
5748 {
5749 errorcode = ERR21;
5750 goto PCRE_EARLY_ERROR_RETURN;
5751 }
5752
5753 /* Put in the magic number, and save the sizes, initial options, and character
5754 table pointer. NULL is used for the default character tables. The nullpad field
5755 is at the end; it's there to help in the case when a regex compiled on a system
5756 with 4-byte pointers is run on another with 8-byte pointers. */
5757
5758 re->magic_number = MAGIC_NUMBER;
5759 re->size = size;
5760 re->options = cd->external_options;
5761 re->dummy1 = 0;
5762 re->first_byte = 0;
5763 re->req_byte = 0;
5764 re->name_table_offset = sizeof(real_pcre);
5765 re->name_entry_size = cd->name_entry_size;
5766 re->name_count = cd->names_found;
5767 re->ref_count = 0;
5768 re->tables = (tables == _pcre_default_tables)? NULL : tables;
5769 re->nullpad = NULL;
5770
5771 /* The starting points of the name/number translation table and of the code are
5772 passed around in the compile data block. The start/end pattern and initial
5773 options are already set from the pre-compile phase, as is the name_entry_size
5774 field. Reset the bracket count and the names_found field. Also reset the hwm
5775 field; this time it's used for remembering forward references to subpatterns.
5776 */
5777
5778 cd->bracount = 0;
5779 cd->names_found = 0;
5780 cd->name_table = (uschar *)re + re->name_table_offset;
5781 codestart = cd->name_table + re->name_entry_size * re->name_count;
5782 cd->start_code = codestart;
5783 cd->hwm = cworkspace;
5784 cd->req_varyopt = 0;
5785 cd->nopartial = FALSE;
5786 cd->had_accept = FALSE;
5787
5788 /* Set up a starting, non-extracting bracket, then compile the expression. On
5789 error, errorcode will be set non-zero, so we don't need to look at the result
5790 of the function here. */
5791
5792 ptr = (const uschar *)pattern;
5793 code = (uschar *)codestart;
5794 *code = OP_BRA;
5795 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
5796 &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
5797 re->top_bracket = cd->bracount;
5798 re->top_backref = cd->top_backref;
5799
5800 if (cd->nopartial) re->options |= PCRE_NOPARTIAL;
5801 if (cd->had_accept) reqbyte = -1; /* Must disable after (*ACCEPT) */
5802
5803 /* If not reached end of pattern on success, there's an excess bracket. */
5804
5805 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
5806
5807 /* Fill in the terminating state and check for disastrous overflow, but
5808 if debugging, leave the test till after things are printed out. */
5809
5810 *code++ = OP_END;
5811
5812 #ifndef DEBUG
5813 if (code - codestart > length) errorcode = ERR23;
5814 #endif
5815
5816 /* Fill in any forward references that are required. */
5817
5818 while (errorcode == 0 && cd->hwm > cworkspace)
5819 {
5820 int offset, recno;
5821 const uschar *groupptr;
5822 cd->hwm -= LINK_SIZE;
5823 offset = GET(cd->hwm, 0);
5824 recno = GET(codestart, offset);
5825 groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
5826 if (groupptr == NULL) errorcode = ERR53;
5827 else PUT(((uschar *)codestart), offset, groupptr - codestart);
5828 }
5829
5830 /* Give an error if there's back reference to a non-existent capturing
5831 subpattern. */
5832
5833 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
5834
5835 /* Failed to compile, or error while post-processing */
5836
5837 if (errorcode != 0)
5838 {
5839 (pcre_free)(re);
5840 PCRE_EARLY_ERROR_RETURN:
5841 *erroroffset = ptr - (const uschar *)pattern;
5842 PCRE_EARLY_ERROR_RETURN2:
5843 *errorptr = error_texts[errorcode];
5844 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
5845 return NULL;
5846 }
5847
5848 /* If the anchored option was not passed, set the flag if we can determine that
5849 the pattern is anchored by virtue of ^ characters or \A or anything else (such
5850 as starting with .* when DOTALL is set).
5851
5852 Otherwise, if we know what the first byte has to be, save it, because that
5853 speeds up unanchored matches no end. If not, see if we can set the
5854 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
5855 start with ^. and also when all branches start with .* for non-DOTALL matches.
5856 */
5857
5858 if ((re->options & PCRE_ANCHORED) == 0)
5859 {
5860 int temp_options = re->options; /* May get changed during these scans */
5861 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
5862 re->options |= PCRE_ANCHORED;
5863 else
5864 {
5865 if (firstbyte < 0)
5866 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
5867 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
5868 {
5869 int ch = firstbyte & 255;
5870 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
5871 cd->fcc[ch] == ch)? ch : firstbyte;
5872 re->options |= PCRE_FIRSTSET;
5873 }
5874 else if (is_startline(codestart, 0, cd->backref_map))
5875 re->options |= PCRE_STARTLINE;
5876 }
5877 }
5878
5879 /* For an anchored pattern, we use the "required byte" only if it follows a
5880 variable length item in the regex. Remove the caseless flag for non-caseable
5881 bytes. */
5882
5883 if (reqbyte >= 0 &&
5884 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
5885 {
5886 int ch = reqbyte & 255;
5887 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
5888 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5889 re->options |= PCRE_REQCHSET;
5890 }
5891
5892 /* Print out the compiled data if debugging is enabled. This is never the
5893 case when building a production library. */
5894
5895 #ifdef DEBUG
5896
5897 printf("Length = %d top_bracket = %d top_backref = %d\n",
5898 length, re->top_bracket, re->top_backref);
5899
5900 if (re->options != 0)
5901 {
5902 printf("%s%s%s%s%s%s%s%s%s\n",
5903 ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5904 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5905 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5906 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5907 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5908 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5909 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5910 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5911 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5912 }
5913
5914 if ((re->options & PCRE_FIRSTSET) != 0)
5915 {
5916 int ch = re->first_byte & 255;
5917 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
5918 "" : " (caseless)";
5919 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5920 else printf("First char = \\x%02x%s\n", ch, caseless);
5921 }
5922
5923 if ((re->options & PCRE_REQCHSET) != 0)
5924 {
5925 int ch = re->req_byte & 255;
5926 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
5927 "" : " (caseless)";
5928 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5929 else printf("Req char = \\x%02x%s\n", ch, caseless);
5930 }
5931
5932 pcre_printint(re, stdout, TRUE);
5933
5934 /* This check is done here in the debugging case so that the code that
5935 was compiled can be seen. */
5936
5937 if (code - codestart > length)
5938 {
5939 (pcre_free)(re);
5940 *errorptr = error_texts[ERR23];
5941 *erroroffset = ptr - (uschar *)pattern;
5942 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
5943 return NULL;
5944 }
5945 #endif /* DEBUG */
5946
5947 return (pcre *)re;
5948 }
5949
5950 /* End of pcre_compile.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12