/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 221 - (show annotations) (download)
Fri Aug 17 09:25:08 2007 UTC (6 years, 11 months ago) by ph10
File MIME type: text/plain
File size: 189461 byte(s)
Fix bad fix for repeated \p and \P.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2007 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include <config.h>
47 #endif
48
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57 used by pcretest. DEBUG is not defined when building a production library. */
58
59 #ifdef DEBUG
60 #include "pcre_printint.src"
61 #endif
62
63
/* Macro for setting individual bits in class bitmaps: it sets bit number b in
the byte-array bitmap a. Both arguments are fully parenthesized so that an
expression argument such as SETBIT(map, c + 1) selects the intended byte and
bit, and the whole expansion is a single parenthesized expression. Note that b
is evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67
/* Upper limit used when checking that the integer holding the compiled
pattern length does not overflow. It is deliberately a little smaller than
INT_MAX so that group terminating bytes can be added in without having to
repeat the check every time. */

#define OFLOW_MAX  (INT_MAX - 20)
74
75
76 /*************************************************
77 * Code parameters and static tables *
78 *************************************************/
79
/* Size of the stack workspace used by the first, pre-compile, phase, whose job
is to work out how much memory is required. The regex is partly compiled into
this space, and compiled parts are thrown away as soon as possible, so an
overrun is not expected; the code nevertheless checks for one. The largest
amount the author reports having seen used is 218, so this value is generous.

The second, actual compile phase re-uses the same workspace to remember forward
references to groups so that they can be filled in at the end. Each entry in
that list occupies LINK_SIZE bytes, so there is plenty of room even when
LINK_SIZE is 4. */

#define COMPILE_WORK_SIZE  (4096)
93
94
95 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96 are simple data values; negative values are for special things like \d and so
97 on. Zero means further processing is needed (for things like \x), or the escape
98 is invalid. */
99
100 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
101 static const short int escapes[] = {
102 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
103 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
104 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
105 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
106 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
107 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
108 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
109 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
110 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
111 0, 0, -ESC_z /* x - z */
112 };
113
114 #else /* This is the "abnormal" table for EBCDIC systems */
115 static const short int escapes[] = {
116 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
117 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
118 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
119 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
120 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
121 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
122 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
123 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
124 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
125 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
126 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
127 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
128 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
129 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
131 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
132 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
133 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
134 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
135 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
136 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
137 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
138 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
139 };
140 #endif
141
142
143 /* Table of special "verbs" like (*PRUNE) */
144
145 typedef struct verbitem {
146 const char *name;
147 int len;
148 int op;
149 } verbitem;
150
151 static verbitem verbs[] = {
152 { "ACCEPT", 6, OP_ACCEPT },
153 { "COMMIT", 6, OP_COMMIT },
154 { "F", 1, OP_FAIL },
155 { "FAIL", 4, OP_FAIL },
156 { "PRUNE", 5, OP_PRUNE },
157 { "SKIP", 4, OP_SKIP },
158 { "THEN", 4, OP_THEN }
159 };
160
161 static int verbcount = sizeof(verbs)/sizeof(verbitem);
162
163
/* Table of the names of the POSIX character classes. A companion table holds
the length of each name, in the same order, and is terminated by a zero length
entry. The first three names must be alpha, lower, upper, because this is
assumed for handling case independence. */

static const char *const posix_names[] = {
  "alpha",
  "lower",
  "upper",
  "alnum",
  "ascii",
  "blank",
  "cntrl",
  "digit",
  "graph",
  "print",
  "punct",
  "space",
  "word",
  "xdigit"
};
172
173 static const uschar posix_name_lengths[] = {
174 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
175
176 /* Table of class bit maps for each POSIX class. Each class is formed from a
177 base map, with an optional addition or removal of another map. Then, for some
178 classes, there is some additional tweaking: for [:blank:] the vertical space
179 characters are removed, and for [:alpha:] and [:alnum:] the underscore
180 character is removed. The triples in the table consist of the base map offset,
181 second map offset or -1 if no second map, and a non-negative value for map
182 addition or a negative value for map subtraction (if there are two maps). The
183 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
184 remove vertical space characters, 2 => remove underscore. */
185
186 static const int posix_class_maps[] = {
187 cbit_word, cbit_digit, -2, /* alpha */
188 cbit_lower, -1, 0, /* lower */
189 cbit_upper, -1, 0, /* upper */
190 cbit_word, -1, 2, /* alnum - word without underscore */
191 cbit_print, cbit_cntrl, 0, /* ascii */
192 cbit_space, -1, 1, /* blank - a GNU extension */
193 cbit_cntrl, -1, 0, /* cntrl */
194 cbit_digit, -1, 0, /* digit */
195 cbit_graph, -1, 0, /* graph */
196 cbit_print, -1, 0, /* print */
197 cbit_punct, -1, 0, /* punct */
198 cbit_space, -1, 0, /* space */
199 cbit_word, -1, 0, /* word - a Perl extension */
200 cbit_xdigit,-1, 0 /* xdigit */
201 };
202
203
/* Two-level stringification: XSTRING(s) yields the text of s after macro
expansion, so that numeric limits can be embedded in the messages below. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* The texts of compile-time error messages, indexed by error number. These
are "char *" because they are passed to the outside world. Do not ever re-use
any error number, because they are documented. Always add a new error instead.
Messages marked DEAD below are no longer used. The array of pointers is itself
const, as the table is never modified (this matches the declaration of the
POSIX class name table in this file). */

static const char *const error_texts[] = {
  /* 0 */
  "no error",
  "\\ at end of pattern",
  "\\c at end of pattern",
  "unrecognized character follows \\",
  "numbers out of order in {} quantifier",
  /* 5 */
  "number too big in {} quantifier",
  "missing terminating ] for character class",
  "invalid escape sequence in character class",
  "range out of order in character class",
  "nothing to repeat",
  /* 10 */
  "operand of unlimited repeat could match the empty string",  /** DEAD **/
  "internal error: unexpected repeat",
  "unrecognized character after (?",
  "POSIX named classes are supported only within a class",
  "missing )",
  /* 15 */
  "reference to non-existent subpattern",
  "erroffset passed as NULL",
  "unknown option bit(s) set",
  "missing ) after comment",
  "parentheses nested too deeply",  /** DEAD **/
  /* 20 */
  "regular expression is too large",
  "failed to get memory",
  "unmatched parentheses",
  "internal error: code overflow",
  "unrecognized character after (?<",
  /* 25 */
  "lookbehind assertion is not fixed length",
  "malformed number or name after (?(",
  "conditional group contains more than two branches",
  "assertion expected after (?(",
  "(?R or (?[+-]digits must be followed by )",
  /* 30 */
  "unknown POSIX class name",
  "POSIX collating elements are not supported",
  "this version of PCRE is not compiled with PCRE_UTF8 support",
  "spare error",  /** DEAD **/
  "character value in \\x{...} sequence is too large",
  /* 35 */
  "invalid condition (?(0)",
  "\\C not allowed in lookbehind assertion",
  "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
  "number after (?C is > 255",
  "closing ) for (?C expected",
  /* 40 */
  "recursive call could loop indefinitely",
  "unrecognized character after (?P",
  "syntax error in subpattern name (missing terminator)",
  "two named subpatterns have the same name",
  "invalid UTF-8 string",
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled",
  "malformed \\P or \\p sequence",
  "unknown property name after \\P or \\p",
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
  /* 50 */
  "repeated subpattern is too long",  /** DEAD **/
  "octal value is greater than \\377 (not in UTF-8 mode)",
  "internal error: overran compiling workspace",
  "internal error: previously-checked referenced subpattern not found",
  "DEFINE group contains more than one branch",
  /* 55 */
  "repeating a DEFINE group is not allowed",
  "inconsistent NEWLINE options",
  "\\g is not followed by a braced name or an optionally braced non-zero number",
  "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number",
  "(*VERB) with an argument is not supported",
  /* 60 */
  "(*VERB) not recognized",
  "number is too big"
};
288
289
/* Private table that identifies decimal digits and hexadecimal digits while a
pattern is being compiled. The tables in chartables depend on the locale and
may mark arbitrary characters as digits, whereas the compiling code expects to
handle only 0-9, a-z, and A-Z as digits; hence this separate table. It costs
256 bytes, but the author timed it as a lot faster than doing character value
tests (at least in some simple cases), and some applications want PCRE to
compile efficiently as well as match efficiently.

The bit definitions are the same as those used in chartables, so that
ctype_digit and ctype_xdigit can be used in the code:

  0x04    decimal digit
  0x08    hexadecimal digit
*/

#ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
static const unsigned char digitab[] = {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*   0-  7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*    - '  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  ( - /  */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,   /*  0 - 7  */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,   /*  8 - ?  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /*  @ - G  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  H - O  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  P - W  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  X - _  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /*  ` - g  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  h - o  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  p - w  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  x -127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 128-135 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 136-143 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 144-151 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 152-159 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 160-167 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 168-175 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 176-183 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 184-191 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 192-199 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 200-207 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 208-215 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 216-223 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 224-231 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 232-239 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 240-247 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00    /* 248-255 */
};

#else           /* This is the "abnormal" case, for EBCDIC systems */
static const unsigned char digitab[] = {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*   0-  7  0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*   8- 15    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  16- 23 10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  24- 31    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  32- 39 20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  40- 47    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  48- 55 30 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  56- 63    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*    - 71 40 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  72- |     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  & - 87 50 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  88- 95    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  - -103 60 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 104- ?     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 112-119 70 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 120- "     */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /* 128- g  80 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  h -143    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 144- p  90 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  q -159    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 160- x  A0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  y -175    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  ^ -183 B0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 184-191    */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /*  { - G  C0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  H -207    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  } - P  D0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  Q -223    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  \ - X  E0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  Y -239    */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,   /*  0 - 7  F0 */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00    /*  8 -255    */
};

/* Partial duplicate of the chartables character table, for EBCDIC systems */

static const unsigned char ebcdic_chartab[] = {
  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00,   /*   0-  7 */
  0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00,   /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,   /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,   /*  32- 39 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  40- 47 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  48- 55 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  56- 63 */
  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*    - 71 */
  0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80,   /*  72- |  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  & - 87 */
  0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00,   /*  88- 95 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  - -103 */
  0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80,   /* 104- ?  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 112-119 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 120- "  */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,   /* 128- g  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /*  h -143 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,   /* 144- p  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /*  q -159 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,   /* 160- x  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /*  y -175 */
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  ^ -183 */
  0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00,   /* 184-191 */
  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,   /*  { - G  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /*  H -207 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,   /*  } - P  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /*  Q -223 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,   /*  \ - X  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /*  Y -239 */
  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,   /*  0 - 7  */
  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00    /*  8 -255 */
};
#endif
412
413
414 /* Definition to allow mutual recursion */
415
416 static BOOL
417 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
418 int *, int *, branch_chain *, compile_data *, int *);
419
420
421
422 /*************************************************
423 * Handle escapes *
424 *************************************************/
425
426 /* This function is called when a \ has been encountered. It either returns a
427 positive value for a simple escape such as \n, or a negative value which
428 encodes one of the more complicated things such as \d. A backreference to group
429 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
430 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
431 ptr is pointing at the \. On exit, it is on the final character of the escape
432 sequence.
433
434 Arguments:
435 ptrptr points to the pattern position pointer
436 errorcodeptr points to the errorcode variable
437 bracount number of previous extracting brackets
438 options the options bits
439 isclass TRUE if inside a character class
440
441 Returns: zero or positive => a data character
442 negative => a special escape sequence
443 on error, errorcodeptr is set
444 */
445
446 static int
447 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
448 int options, BOOL isclass)
449 {
450 BOOL utf8 = (options & PCRE_UTF8) != 0;
451 const uschar *ptr = *ptrptr + 1;
452 int c, i;
453
454 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
455 ptr--; /* Set pointer back to the last byte */
456
457 /* If backslash is at the end of the pattern, it's an error. */
458
459 if (c == 0) *errorcodeptr = ERR1;
460
461 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
462 a table. A non-zero result is something that can be returned immediately.
463 Otherwise further processing may be required. */
464
465 #ifndef EBCDIC /* ASCII coding */
466 else if (c < '0' || c > 'z') {} /* Not alphameric */
467 else if ((i = escapes[c - '0']) != 0) c = i;
468
469 #else /* EBCDIC coding */
470 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
471 else if ((i = escapes[c - 0x48]) != 0) c = i;
472 #endif
473
474 /* Escapes that need further processing, or are illegal. */
475
476 else
477 {
478 const uschar *oldptr;
479 BOOL braced, negated;
480
481 switch (c)
482 {
483 /* A number of Perl escapes are not handled by PCRE. We give an explicit
484 error. */
485
486 case 'l':
487 case 'L':
488 case 'N':
489 case 'u':
490 case 'U':
491 *errorcodeptr = ERR37;
492 break;
493
494 /* \g must be followed by a number, either plain or braced. If positive, it
495 is an absolute backreference. If negative, it is a relative backreference.
496 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
497 reference to a named group. This is part of Perl's movement towards a
498 unified syntax for back references. As this is synonymous with \k{name}, we
499 fudge it up by pretending it really was \k. */
500
501 case 'g':
502 if (ptr[1] == '{')
503 {
504 const uschar *p;
505 for (p = ptr+2; *p != 0 && *p != '}'; p++)
506 if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
507 if (*p != 0 && *p != '}')
508 {
509 c = -ESC_k;
510 break;
511 }
512 braced = TRUE;
513 ptr++;
514 }
515 else braced = FALSE;
516
517 if (ptr[1] == '-')
518 {
519 negated = TRUE;
520 ptr++;
521 }
522 else negated = FALSE;
523
524 c = 0;
525 while ((digitab[ptr[1]] & ctype_digit) != 0)
526 c = c * 10 + *(++ptr) - '0';
527
528 if (c < 0)
529 {
530 *errorcodeptr = ERR61;
531 break;
532 }
533
534 if (c == 0 || (braced && *(++ptr) != '}'))
535 {
536 *errorcodeptr = ERR57;
537 break;
538 }
539
540 if (negated)
541 {
542 if (c > bracount)
543 {
544 *errorcodeptr = ERR15;
545 break;
546 }
547 c = bracount - (c - 1);
548 }
549
550 c = -(ESC_REF + c);
551 break;
552
553 /* The handling of escape sequences consisting of a string of digits
554 starting with one that is not zero is not straightforward. By experiment,
555 the way Perl works seems to be as follows:
556
557 Outside a character class, the digits are read as a decimal number. If the
558 number is less than 10, or if there are that many previous extracting
559 left brackets, then it is a back reference. Otherwise, up to three octal
560 digits are read to form an escaped byte. Thus \123 is likely to be octal
561 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
562 value is greater than 377, the least significant 8 bits are taken. Inside a
563 character class, \ followed by a digit is always an octal number. */
564
565 case '1': case '2': case '3': case '4': case '5':
566 case '6': case '7': case '8': case '9':
567
568 if (!isclass)
569 {
570 oldptr = ptr;
571 c -= '0';
572 while ((digitab[ptr[1]] & ctype_digit) != 0)
573 c = c * 10 + *(++ptr) - '0';
574 if (c < 0)
575 {
576 *errorcodeptr = ERR61;
577 break;
578 }
579 if (c < 10 || c <= bracount)
580 {
581 c = -(ESC_REF + c);
582 break;
583 }
584 ptr = oldptr; /* Put the pointer back and fall through */
585 }
586
587 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
588 generates a binary zero byte and treats the digit as a following literal.
589 Thus we have to pull back the pointer by one. */
590
591 if ((c = *ptr) >= '8')
592 {
593 ptr--;
594 c = 0;
595 break;
596 }
597
598 /* \0 always starts an octal number, but we may drop through to here with a
599 larger first octal digit. The original code used just to take the least
600 significant 8 bits of octal numbers (I think this is what early Perls used
601 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
602 than 3 octal digits. */
603
604 case '0':
605 c -= '0';
606 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
607 c = c * 8 + *(++ptr) - '0';
608 if (!utf8 && c > 255) *errorcodeptr = ERR51;
609 break;
610
611 /* \x is complicated. \x{ddd} is a character number which can be greater
612 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
613 treated as a data character. */
614
615 case 'x':
616 if (ptr[1] == '{')
617 {
618 const uschar *pt = ptr + 2;
619 int count = 0;
620
621 c = 0;
622 while ((digitab[*pt] & ctype_xdigit) != 0)
623 {
624 register int cc = *pt++;
625 if (c == 0 && cc == '0') continue; /* Leading zeroes */
626 count++;
627
628 #ifndef EBCDIC /* ASCII coding */
629 if (cc >= 'a') cc -= 32; /* Convert to upper case */
630 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
631 #else /* EBCDIC coding */
632 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
633 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
634 #endif
635 }
636
637 if (*pt == '}')
638 {
639 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
640 ptr = pt;
641 break;
642 }
643
644 /* If the sequence of hex digits does not end with '}', then we don't
645 recognize this construct; fall through to the normal \x handling. */
646 }
647
648 /* Read just a single-byte hex-defined char */
649
650 c = 0;
651 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
652 {
653 int cc; /* Some compilers don't like ++ */
654 cc = *(++ptr); /* in initializers */
655 #ifndef EBCDIC /* ASCII coding */
656 if (cc >= 'a') cc -= 32; /* Convert to upper case */
657 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
658 #else /* EBCDIC coding */
659 if (cc <= 'z') cc += 64; /* Convert to upper case */
660 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
661 #endif
662 }
663 break;
664
665 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
666 This coding is ASCII-specific, but then the whole concept of \cx is
667 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
668
669 case 'c':
670 c = *(++ptr);
671 if (c == 0)
672 {
673 *errorcodeptr = ERR2;
674 break;
675 }
676
677 #ifndef EBCDIC /* ASCII coding */
678 if (c >= 'a' && c <= 'z') c -= 32;
679 c ^= 0x40;
680 #else /* EBCDIC coding */
681 if (c >= 'a' && c <= 'z') c += 64;
682 c ^= 0xC0;
683 #endif
684 break;
685
686 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
687 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
688 for Perl compatibility, it is a literal. This code looks a bit odd, but
689 there used to be some cases other than the default, and there may be again
690 in future, so I haven't "optimized" it. */
691
692 default:
693 if ((options & PCRE_EXTRA) != 0) switch(c)
694 {
695 default:
696 *errorcodeptr = ERR3;
697 break;
698 }
699 break;
700 }
701 }
702
703 *ptrptr = ptr;
704 return c;
705 }
706
707
708
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* This function is called after \P or \p has been encountered, provided that
PCRE is compiled with support for Unicode properties. On entry, ptrptr is
pointing at the P or p. On exit, it is pointing at the final character of the
escape sequence.

The property name is either the single character that follows, or a name in
braces, optionally preceded by ^ for negation. A braced name must fit in the
local buffer (at most 30 characters plus the terminating zero); anything longer
is treated as malformed.

Argument:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE;
                   it is always set, even when an error is returned
  dptr           points to an int that is set to the detailed property value
                   (set only when the name is recognized)
  errorcodeptr   points to the error code variable; set to ERR46 for a
                   malformed sequence and ERR47 for an unknown name

Returns:         the type field of the matching _pcre_utt entry, or -1 for an
                   invalid type
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
int c, i, bot, top;
const uschar *ptr = *ptrptr;
char name[32];

/* Give the negation flag a defined value before any error exit can be taken,
so that the caller never sees an unset value. */

*negptr = FALSE;

c = *(++ptr);
if (c == 0) goto ERROR_RETURN;

/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
negation. */

if (c == '{')
  {
  if (ptr[1] == '^')
    {
    *negptr = TRUE;
    ptr++;
    }
  for (i = 0; i < (int)sizeof(name) - 1; i++)
    {
    c = *(++ptr);
    if (c == 0) goto ERROR_RETURN;
    if (c == '}') break;
    name[i] = c;
    }
  if (c !='}') goto ERROR_RETURN;     /* Name too long for the buffer */
  name[i] = 0;
  }

/* Otherwise there is just one following character */

else
  {
  name[0] = c;
  name[1] = 0;
  }

/* The sequence is well formed: ptr is now on its final character. It is not
changed again below. */

*ptrptr = ptr;

/* Search for a recognized property name using binary chop */

bot = 0;
top = _pcre_utt_size;

while (bot < top)
  {
  i = (bot + top) >> 1;
  c = strcmp(name, _pcre_utt[i].name);
  if (c == 0)
    {
    *dptr = _pcre_utt[i].value;
    return _pcre_utt[i].type;
    }
  if (c > 0) bot = i + 1; else top = i;
  }

/* The name was not found in the table */

*errorcodeptr = ERR47;
return -1;

ERROR_RETURN:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
798
799
800
801
802 /*************************************************
803 * Check for counted repeat *
804 *************************************************/
805
806 /* This function is called when a '{' is encountered in a place where it might
807 start a quantifier. It looks ahead to see if it really is a quantifier or not.
808 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
809 where the ddds are digits.
810
811 Arguments:
812 p pointer to the first char after '{'
813
814 Returns: TRUE or FALSE
815 */
816
817 static BOOL
818 is_counted_repeat(const uschar *p)
819 {
820 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
821 while ((digitab[*p] & ctype_digit) != 0) p++;
822 if (*p == '}') return TRUE;
823
824 if (*p++ != ',') return FALSE;
825 if (*p == '}') return TRUE;
826
827 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
828 while ((digitab[*p] & ctype_digit) != 0) p++;
829
830 return (*p == '}');
831 }
832
833
834
835 /*************************************************
836 * Read repeat counts *
837 *************************************************/
838
839 /* Read an item of the form {n,m} and return the values. This is called only
840 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
841 so the syntax is guaranteed to be correct, but we need to check the values.
842
843 Arguments:
844 p pointer to first char after '{'
845 minp pointer to int for min
846 maxp pointer to int for max
847 returned as -1 if no max
848 errorcodeptr points to error code variable
849
850 Returns: pointer to '}' on success;
851 current ptr on error, with errorcodeptr set non-zero
852 */
853
854 static const uschar *
855 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
856 {
857 int min = 0;
858 int max = -1;
859
860 /* Read the minimum value and do a paranoid check: a negative value indicates
861 an integer overflow. */
862
863 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
864 if (min < 0 || min > 65535)
865 {
866 *errorcodeptr = ERR5;
867 return p;
868 }
869
870 /* Read the maximum value if there is one, and again do a paranoid on its size.
871 Also, max must not be less than min. */
872
873 if (*p == '}') max = min; else
874 {
875 if (*(++p) != '}')
876 {
877 max = 0;
878 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
879 if (max < 0 || max > 65535)
880 {
881 *errorcodeptr = ERR5;
882 return p;
883 }
884 if (max < min)
885 {
886 *errorcodeptr = ERR4;
887 return p;
888 }
889 }
890 }
891
892 /* Fill in the required variables, and pass back the pointer to the terminating
893 '}'. */
894
895 *minp = min;
896 *maxp = max;
897 return p;
898 }
899
900
901
902 /*************************************************
903 * Find forward referenced subpattern *
904 *************************************************/
905
906 /* This function scans along a pattern's text looking for capturing
907 subpatterns, and counting them. If it finds a named pattern that matches the
908 name it is given, it returns its number. Alternatively, if the name is NULL, it
909 returns when it reaches a given numbered subpattern. This is used for forward
910 references to subpatterns. We know that if (?P< is encountered, the name will
911 be terminated by '>' because that is checked in the first pass.
912
913 Arguments:
914 ptr current position in the pattern
915 count current count of capturing parens so far encountered
916 name name to seek, or NULL if seeking a numbered subpattern
917 lorn name length, or subpattern number if name is NULL
918 xmode TRUE if we are in /x mode
919
920 Returns: the number of the named subpattern, or -1 if not found
921 */
922
923 static int
924 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
925 BOOL xmode)
926 {
927 const uschar *thisname;
928
929 for (; *ptr != 0; ptr++)
930 {
931 int term;
932
933 /* Skip over backslashed characters and also entire \Q...\E */
934
935 if (*ptr == '\\')
936 {
937 if (*(++ptr) == 0) return -1;
938 if (*ptr == 'Q') for (;;)
939 {
940 while (*(++ptr) != 0 && *ptr != '\\');
941 if (*ptr == 0) return -1;
942 if (*(++ptr) == 'E') break;
943 }
944 continue;
945 }
946
947 /* Skip over character classes */
948
949 if (*ptr == '[')
950 {
951 while (*(++ptr) != ']')
952 {
953 if (*ptr == 0) return -1;
954 if (*ptr == '\\')
955 {
956 if (*(++ptr) == 0) return -1;
957 if (*ptr == 'Q') for (;;)
958 {
959 while (*(++ptr) != 0 && *ptr != '\\');
960 if (*ptr == 0) return -1;
961 if (*(++ptr) == 'E') break;
962 }
963 continue;
964 }
965 }
966 continue;
967 }
968
969 /* Skip comments in /x mode */
970
971 if (xmode && *ptr == '#')
972 {
973 while (*(++ptr) != 0 && *ptr != '\n');
974 if (*ptr == 0) return -1;
975 continue;
976 }
977
978 /* An opening parens must now be a real metacharacter */
979
980 if (*ptr != '(') continue;
981 if (ptr[1] != '?' && ptr[1] != '*')
982 {
983 count++;
984 if (name == NULL && count == lorn) return count;
985 continue;
986 }
987
988 ptr += 2;
989 if (*ptr == 'P') ptr++; /* Allow optional P */
990
991 /* We have to disambiguate (?<! and (?<= from (?<name> */
992
993 if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
994 *ptr != '\'')
995 continue;
996
997 count++;
998
999 if (name == NULL && count == lorn) return count;
1000 term = *ptr++;
1001 if (term == '<') term = '>';
1002 thisname = ptr;
1003 while (*ptr != term) ptr++;
1004 if (name != NULL && lorn == ptr - thisname &&
1005 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1006 return count;
1007 }
1008
1009 return -1;
1010 }
1011
1012
1013
1014 /*************************************************
1015 * Find first significant op code *
1016 *************************************************/
1017
1018 /* This is called by several functions that scan a compiled expression looking
1019 for a fixed first character, or an anchoring op code etc. It skips over things
1020 that do not influence this. For some calls, a change of option is important.
1021 For some calls, it makes sense to skip negative forward and all backward
1022 assertions, and also the \b assertion; for others it does not.
1023
1024 Arguments:
1025 code pointer to the start of the group
1026 options pointer to external options
1027 optbit the option bit whose changing is significant, or
1028 zero if none are
1029 skipassert TRUE if certain assertions are to be skipped
1030
1031 Returns: pointer to the first significant opcode
1032 */
1033
1034 static const uschar*
1035 first_significant_code(const uschar *code, int *options, int optbit,
1036 BOOL skipassert)
1037 {
1038 for (;;)
1039 {
1040 switch ((int)*code)
1041 {
1042 case OP_OPT:
1043 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1044 *options = (int)code[1];
1045 code += 2;
1046 break;
1047
1048 case OP_ASSERT_NOT:
1049 case OP_ASSERTBACK:
1050 case OP_ASSERTBACK_NOT:
1051 if (!skipassert) return code;
1052 do code += GET(code, 1); while (*code == OP_ALT);
1053 code += _pcre_OP_lengths[*code];
1054 break;
1055
1056 case OP_WORD_BOUNDARY:
1057 case OP_NOT_WORD_BOUNDARY:
1058 if (!skipassert) return code;
1059 /* Fall through */
1060
1061 case OP_CALLOUT:
1062 case OP_CREF:
1063 case OP_RREF:
1064 case OP_DEF:
1065 code += _pcre_OP_lengths[*code];
1066 break;
1067
1068 default:
1069 return code;
1070 }
1071 }
1072 /* Control never reaches here */
1073 }
1074
1075
1076
1077
1078 /*************************************************
1079 * Find the fixed length of a pattern *
1080 *************************************************/
1081
1082 /* Scan a pattern and compute the fixed length of subject that will match it,
1083 if the length is fixed. This is needed for dealing with backward assertions.
1084 In UTF8 mode, the result is in characters rather than bytes.
1085
1086 Arguments:
1087 code points to the start of the pattern (the bracket)
1088 options the compiling options
1089
1090 Returns: the fixed length, or -1 if there is no fixed length,
1091 or -2 if \C was encountered
1092 */
1093
1094 static int
1095 find_fixedlength(uschar *code, int options)
1096 {
1097 int length = -1;
1098
1099 register int branchlength = 0;
1100 register uschar *cc = code + 1 + LINK_SIZE;
1101
1102 /* Scan along the opcodes for this branch. If we get to the end of the
1103 branch, check the length against that of the other branches. */
1104
1105 for (;;)
1106 {
1107 int d;
1108 register int op = *cc;
1109 switch (op)
1110 {
1111 case OP_CBRA:
1112 case OP_BRA:
1113 case OP_ONCE:
1114 case OP_COND:
1115 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1116 if (d < 0) return d;
1117 branchlength += d;
1118 do cc += GET(cc, 1); while (*cc == OP_ALT);
1119 cc += 1 + LINK_SIZE;
1120 break;
1121
1122 /* Reached end of a branch; if it's a ket it is the end of a nested
1123 call. If it's ALT it is an alternation in a nested call. If it is
1124 END it's the end of the outer call. All can be handled by the same code. */
1125
1126 case OP_ALT:
1127 case OP_KET:
1128 case OP_KETRMAX:
1129 case OP_KETRMIN:
1130 case OP_END:
1131 if (length < 0) length = branchlength;
1132 else if (length != branchlength) return -1;
1133 if (*cc != OP_ALT) return length;
1134 cc += 1 + LINK_SIZE;
1135 branchlength = 0;
1136 break;
1137
1138 /* Skip over assertive subpatterns */
1139
1140 case OP_ASSERT:
1141 case OP_ASSERT_NOT:
1142 case OP_ASSERTBACK:
1143 case OP_ASSERTBACK_NOT:
1144 do cc += GET(cc, 1); while (*cc == OP_ALT);
1145 /* Fall through */
1146
1147 /* Skip over things that don't match chars */
1148
1149 case OP_REVERSE:
1150 case OP_CREF:
1151 case OP_RREF:
1152 case OP_DEF:
1153 case OP_OPT:
1154 case OP_CALLOUT:
1155 case OP_SOD:
1156 case OP_SOM:
1157 case OP_EOD:
1158 case OP_EODN:
1159 case OP_CIRC:
1160 case OP_DOLL:
1161 case OP_NOT_WORD_BOUNDARY:
1162 case OP_WORD_BOUNDARY:
1163 cc += _pcre_OP_lengths[*cc];
1164 break;
1165
1166 /* Handle literal characters */
1167
1168 case OP_CHAR:
1169 case OP_CHARNC:
1170 case OP_NOT:
1171 branchlength++;
1172 cc += 2;
1173 #ifdef SUPPORT_UTF8
1174 if ((options & PCRE_UTF8) != 0)
1175 {
1176 while ((*cc & 0xc0) == 0x80) cc++;
1177 }
1178 #endif
1179 break;
1180
1181 /* Handle exact repetitions. The count is already in characters, but we
1182 need to skip over a multibyte character in UTF8 mode. */
1183
1184 case OP_EXACT:
1185 branchlength += GET2(cc,1);
1186 cc += 4;
1187 #ifdef SUPPORT_UTF8
1188 if ((options & PCRE_UTF8) != 0)
1189 {
1190 while((*cc & 0x80) == 0x80) cc++;
1191 }
1192 #endif
1193 break;
1194
1195 case OP_TYPEEXACT:
1196 branchlength += GET2(cc,1);
1197 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1198 cc += 4;
1199 break;
1200
1201 /* Handle single-char matchers */
1202
1203 case OP_PROP:
1204 case OP_NOTPROP:
1205 cc += 2;
1206 /* Fall through */
1207
1208 case OP_NOT_DIGIT:
1209 case OP_DIGIT:
1210 case OP_NOT_WHITESPACE:
1211 case OP_WHITESPACE:
1212 case OP_NOT_WORDCHAR:
1213 case OP_WORDCHAR:
1214 case OP_ANY:
1215 branchlength++;
1216 cc++;
1217 break;
1218
1219 /* The single-byte matcher isn't allowed */
1220
1221 case OP_ANYBYTE:
1222 return -2;
1223
1224 /* Check a class for variable quantification */
1225
1226 #ifdef SUPPORT_UTF8
1227 case OP_XCLASS:
1228 cc += GET(cc, 1) - 33;
1229 /* Fall through */
1230 #endif
1231
1232 case OP_CLASS:
1233 case OP_NCLASS:
1234 cc += 33;
1235
1236 switch (*cc)
1237 {
1238 case OP_CRSTAR:
1239 case OP_CRMINSTAR:
1240 case OP_CRQUERY:
1241 case OP_CRMINQUERY:
1242 return -1;
1243
1244 case OP_CRRANGE:
1245 case OP_CRMINRANGE:
1246 if (GET2(cc,1) != GET2(cc,3)) return -1;
1247 branchlength += GET2(cc,1);
1248 cc += 5;
1249 break;
1250
1251 default:
1252 branchlength++;
1253 }
1254 break;
1255
1256 /* Anything else is variable length */
1257
1258 default:
1259 return -1;
1260 }
1261 }
1262 /* Control never gets here */
1263 }
1264
1265
1266
1267
1268 /*************************************************
1269 * Scan compiled regex for numbered bracket *
1270 *************************************************/
1271
1272 /* This little function scans through a compiled pattern until it finds a
1273 capturing bracket with the given number.
1274
1275 Arguments:
1276 code points to start of expression
1277 utf8 TRUE in UTF-8 mode
1278 number the required bracket number
1279
1280 Returns: pointer to the opcode for the bracket, or NULL if not found
1281 */
1282
1283 static const uschar *
1284 find_bracket(const uschar *code, BOOL utf8, int number)
1285 {
1286 for (;;)
1287 {
1288 register int c = *code;
1289 if (c == OP_END) return NULL;
1290
1291 /* XCLASS is used for classes that cannot be represented just by a bit
1292 map. This includes negated single high-valued characters. The length in
1293 the table is zero; the actual length is stored in the compiled code. */
1294
1295 if (c == OP_XCLASS) code += GET(code, 1);
1296
1297 /* Handle capturing bracket */
1298
1299 else if (c == OP_CBRA)
1300 {
1301 int n = GET2(code, 1+LINK_SIZE);
1302 if (n == number) return (uschar *)code;
1303 code += _pcre_OP_lengths[c];
1304 }
1305
1306 /* Otherwise, we can get the item's length from the table, except that for
1307 repeated character types, we have to test for \p and \P, which have an extra
1308 two bytes of parameters. */
1309
1310 else
1311 {
1312 switch(c)
1313 {
1314 case OP_TYPESTAR:
1315 case OP_TYPEMINSTAR:
1316 case OP_TYPEPLUS:
1317 case OP_TYPEMINPLUS:
1318 case OP_TYPEQUERY:
1319 case OP_TYPEMINQUERY:
1320 case OP_TYPEPOSSTAR:
1321 case OP_TYPEPOSPLUS:
1322 case OP_TYPEPOSQUERY:
1323 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1324 break;
1325
1326 case OP_TYPEUPTO:
1327 case OP_TYPEMINUPTO:
1328 case OP_TYPEEXACT:
1329 case OP_TYPEPOSUPTO:
1330 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1331 break;
1332 }
1333
1334 /* Add in the fixed length from the table */
1335
1336 code += _pcre_OP_lengths[c];
1337
1338 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1339 a multi-byte character. The length in the table is a minimum, so we have to
1340 arrange to skip the extra bytes. */
1341
1342 #ifdef SUPPORT_UTF8
1343 if (utf8) switch(c)
1344 {
1345 case OP_CHAR:
1346 case OP_CHARNC:
1347 case OP_EXACT:
1348 case OP_UPTO:
1349 case OP_MINUPTO:
1350 case OP_POSUPTO:
1351 case OP_STAR:
1352 case OP_MINSTAR:
1353 case OP_POSSTAR:
1354 case OP_PLUS:
1355 case OP_MINPLUS:
1356 case OP_POSPLUS:
1357 case OP_QUERY:
1358 case OP_MINQUERY:
1359 case OP_POSQUERY:
1360 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1361 break;
1362 }
1363 #endif
1364 }
1365 }
1366 }
1367
1368
1369
1370 /*************************************************
1371 * Scan compiled regex for recursion reference *
1372 *************************************************/
1373
1374 /* This little function scans through a compiled pattern until it finds an
1375 instance of OP_RECURSE.
1376
1377 Arguments:
1378 code points to start of expression
1379 utf8 TRUE in UTF-8 mode
1380
1381 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1382 */
1383
1384 static const uschar *
1385 find_recurse(const uschar *code, BOOL utf8)
1386 {
1387 for (;;)
1388 {
1389 register int c = *code;
1390 if (c == OP_END) return NULL;
1391 if (c == OP_RECURSE) return code;
1392
1393 /* XCLASS is used for classes that cannot be represented just by a bit
1394 map. This includes negated single high-valued characters. The length in
1395 the table is zero; the actual length is stored in the compiled code. */
1396
1397 if (c == OP_XCLASS) code += GET(code, 1);
1398
1399 /* Otherwise, we can get the item's length from the table, except that for
1400 repeated character types, we have to test for \p and \P, which have an extra
1401 two bytes of parameters. */
1402
1403 else
1404 {
1405 switch(c)
1406 {
1407 case OP_TYPESTAR:
1408 case OP_TYPEMINSTAR:
1409 case OP_TYPEPLUS:
1410 case OP_TYPEMINPLUS:
1411 case OP_TYPEQUERY:
1412 case OP_TYPEMINQUERY:
1413 case OP_TYPEPOSSTAR:
1414 case OP_TYPEPOSPLUS:
1415 case OP_TYPEPOSQUERY:
1416 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1417 break;
1418
1419 case OP_TYPEPOSUPTO:
1420 case OP_TYPEUPTO:
1421 case OP_TYPEMINUPTO:
1422 case OP_TYPEEXACT:
1423 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1424 break;
1425 }
1426
1427 /* Add in the fixed length from the table */
1428
1429 code += _pcre_OP_lengths[c];
1430
1431 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1432 by a multi-byte character. The length in the table is a minimum, so we have
1433 to arrange to skip the extra bytes. */
1434
1435 #ifdef SUPPORT_UTF8
1436 if (utf8) switch(c)
1437 {
1438 case OP_CHAR:
1439 case OP_CHARNC:
1440 case OP_EXACT:
1441 case OP_UPTO:
1442 case OP_MINUPTO:
1443 case OP_POSUPTO:
1444 case OP_STAR:
1445 case OP_MINSTAR:
1446 case OP_POSSTAR:
1447 case OP_PLUS:
1448 case OP_MINPLUS:
1449 case OP_POSPLUS:
1450 case OP_QUERY:
1451 case OP_MINQUERY:
1452 case OP_POSQUERY:
1453 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1454 break;
1455 }
1456 #endif
1457 }
1458 }
1459 }
1460
1461
1462
1463 /*************************************************
1464 * Scan compiled branch for non-emptiness *
1465 *************************************************/
1466
1467 /* This function scans through a branch of a compiled pattern to see whether it
1468 can match the empty string or not. It is called from could_be_empty()
1469 below and from compile_branch() when checking for an unlimited repeat of a
1470 group that can match nothing. Note that first_significant_code() skips over
1471 assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1472 struck an inner bracket whose current branch will already have been scanned.
1473
1474 Arguments:
1475 code points to start of search
1476 endcode points to where to stop
1477 utf8 TRUE if in UTF8 mode
1478
1479 Returns: TRUE if what is matched could be empty
1480 */
1481
1482 static BOOL
1483 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1484 {
1485 register int c;
1486 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1487 code < endcode;
1488 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1489 {
1490 const uschar *ccode;
1491
1492 c = *code;
1493
1494 /* Groups with zero repeats can of course be empty; skip them. */
1495
1496 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1497 {
1498 code += _pcre_OP_lengths[c];
1499 do code += GET(code, 1); while (*code == OP_ALT);
1500 c = *code;
1501 continue;
1502 }
1503
1504 /* For other groups, scan the branches. */
1505
1506 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1507 {
1508 BOOL empty_branch;
1509 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1510
1511 /* Scan a closed bracket */
1512
1513 empty_branch = FALSE;
1514 do
1515 {
1516 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1517 empty_branch = TRUE;
1518 code += GET(code, 1);
1519 }
1520 while (*code == OP_ALT);
1521 if (!empty_branch) return FALSE; /* All branches are non-empty */
1522 c = *code;
1523 continue;
1524 }
1525
1526 /* Handle the other opcodes */
1527
1528 switch (c)
1529 {
1530 /* Check for quantifiers after a class. XCLASS is used for classes that
1531 cannot be represented just by a bit map. This includes negated single
1532 high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1533 actual length is stored in the compiled code, so we must update "code"
1534 here. */
1535
1536 #ifdef SUPPORT_UTF8
1537 case OP_XCLASS:
1538 ccode = code += GET(code, 1);
1539 goto CHECK_CLASS_REPEAT;
1540 #endif
1541
1542 case OP_CLASS:
1543 case OP_NCLASS:
1544 ccode = code + 33;
1545
1546 #ifdef SUPPORT_UTF8
1547 CHECK_CLASS_REPEAT:
1548 #endif
1549
1550 switch (*ccode)
1551 {
1552 case OP_CRSTAR: /* These could be empty; continue */
1553 case OP_CRMINSTAR:
1554 case OP_CRQUERY:
1555 case OP_CRMINQUERY:
1556 break;
1557
1558 default: /* Non-repeat => class must match */
1559 case OP_CRPLUS: /* These repeats aren't empty */
1560 case OP_CRMINPLUS:
1561 return FALSE;
1562
1563 case OP_CRRANGE:
1564 case OP_CRMINRANGE:
1565 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1566 break;
1567 }
1568 break;
1569
1570 /* Opcodes that must match a character */
1571
1572 case OP_PROP:
1573 case OP_NOTPROP:
1574 case OP_EXTUNI:
1575 case OP_NOT_DIGIT:
1576 case OP_DIGIT:
1577 case OP_NOT_WHITESPACE:
1578 case OP_WHITESPACE:
1579 case OP_NOT_WORDCHAR:
1580 case OP_WORDCHAR:
1581 case OP_ANY:
1582 case OP_ANYBYTE:
1583 case OP_CHAR:
1584 case OP_CHARNC:
1585 case OP_NOT:
1586 case OP_PLUS:
1587 case OP_MINPLUS:
1588 case OP_POSPLUS:
1589 case OP_EXACT:
1590 case OP_NOTPLUS:
1591 case OP_NOTMINPLUS:
1592 case OP_NOTPOSPLUS:
1593 case OP_NOTEXACT:
1594 case OP_TYPEPLUS:
1595 case OP_TYPEMINPLUS:
1596 case OP_TYPEPOSPLUS:
1597 case OP_TYPEEXACT:
1598 return FALSE;
1599
1600 /* End of branch */
1601
1602 case OP_KET:
1603 case OP_KETRMAX:
1604 case OP_KETRMIN:
1605 case OP_ALT:
1606 return TRUE;
1607
1608 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1609 MINUPTO, and POSUPTO may be followed by a multibyte character */
1610
1611 #ifdef SUPPORT_UTF8
1612 case OP_STAR:
1613 case OP_MINSTAR:
1614 case OP_POSSTAR:
1615 case OP_QUERY:
1616 case OP_MINQUERY:
1617 case OP_POSQUERY:
1618 case OP_UPTO:
1619 case OP_MINUPTO:
1620 case OP_POSUPTO:
1621 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1622 break;
1623 #endif
1624 }
1625 }
1626
1627 return TRUE;
1628 }
1629
1630
1631
1632 /*************************************************
1633 * Scan compiled regex for non-emptiness *
1634 *************************************************/
1635
1636 /* This function is called to check for left recursive calls. We want to check
1637 the current branch of the current pattern to see if it could match the empty
1638 string. If it could, we must look outwards for branches at other levels,
1639 stopping when we pass beyond the bracket which is the subject of the recursion.
1640
1641 Arguments:
1642 code points to start of the recursion
1643 endcode points to where to stop (current RECURSE item)
1644 bcptr points to the chain of current (unclosed) branch starts
1645 utf8 TRUE if in UTF-8 mode
1646
1647 Returns: TRUE if what is matched could be empty
1648 */
1649
1650 static BOOL
1651 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1652 BOOL utf8)
1653 {
1654 while (bcptr != NULL && bcptr->current >= code)
1655 {
1656 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1657 bcptr = bcptr->outer;
1658 }
1659 return TRUE;
1660 }
1661
1662
1663
1664 /*************************************************
1665 * Check for POSIX class syntax *
1666 *************************************************/
1667
1668 /* This function is called when the sequence "[:" or "[." or "[=" is
1669 encountered in a character class. It checks whether this is followed by an
1670 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1671 ".]" or "=]".
1672
1673 Argument:
1674 ptr pointer to the initial [
1675 endptr where to return the end pointer
1676 cd pointer to compile data
1677
1678 Returns: TRUE or FALSE
1679 */
1680
1681 static BOOL
1682 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1683 {
1684 int terminator; /* Don't combine these lines; the Solaris cc */
1685 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1686 if (*(++ptr) == '^') ptr++;
1687 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1688 if (*ptr == terminator && ptr[1] == ']')
1689 {
1690 *endptr = ptr;
1691 return TRUE;
1692 }
1693 return FALSE;
1694 }
1695
1696
1697
1698
1699 /*************************************************
1700 * Check POSIX class name *
1701 *************************************************/
1702
1703 /* This function is called to check the name given in a POSIX-style class entry
1704 such as [:alnum:].
1705
1706 Arguments:
1707 ptr points to the first letter
1708 len the length of the name
1709
1710 Returns: a value representing the name, or -1 if unknown
1711 */
1712
1713 static int
1714 check_posix_name(const uschar *ptr, int len)
1715 {
1716 register int yield = 0;
1717 while (posix_name_lengths[yield] != 0)
1718 {
1719 if (len == posix_name_lengths[yield] &&
1720 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1721 yield++;
1722 }
1723 return -1;
1724 }
1725
1726
1727 /*************************************************
1728 * Adjust OP_RECURSE items in repeated group *
1729 *************************************************/
1730
1731 /* OP_RECURSE items contain an offset from the start of the regex to the group
1732 that is referenced. This means that groups can be replicated for fixed
1733 repetition simply by copying (because the recursion is allowed to refer to
1734 earlier groups that are outside the current group). However, when a group is
1735 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1736 it, after it has been compiled. This means that any OP_RECURSE items within it
1737 that refer to the group itself or any contained groups have to have their
1738 offsets adjusted. That one of the jobs of this function. Before it is called,
1739 the partially compiled regex must be temporarily terminated with OP_END.
1740
1741 This function has been extended with the possibility of forward references for
1742 recursions and subroutine calls. It must also check the list of such references
1743 for the group we are dealing with. If it finds that one of the recursions in
1744 the current group is on this list, it adjusts the offset in the list, not the
1745 value in the reference (which is a group number).
1746
1747 Arguments:
1748 group points to the start of the group
1749 adjust the amount by which the group is to be moved
1750 utf8 TRUE in UTF-8 mode
1751 cd contains pointers to tables etc.
1752 save_hwm the hwm forward reference pointer at the start of the group
1753
1754 Returns: nothing
1755 */
1756
1757 static void
1758 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1759 uschar *save_hwm)
1760 {
1761 uschar *ptr = group;
1762 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1763 {
1764 int offset;
1765 uschar *hc;
1766
1767 /* See if this recursion is on the forward reference list. If so, adjust the
1768 reference. */
1769
1770 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1771 {
1772 offset = GET(hc, 0);
1773 if (cd->start_code + offset == ptr + 1)
1774 {
1775 PUT(hc, 0, offset + adjust);
1776 break;
1777 }
1778 }
1779
1780 /* Otherwise, adjust the recursion offset if it's after the start of this
1781 group. */
1782
1783 if (hc >= cd->hwm)
1784 {
1785 offset = GET(ptr, 1);
1786 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1787 }
1788
1789 ptr += 1 + LINK_SIZE;
1790 }
1791 }
1792
1793
1794
1795 /*************************************************
1796 * Insert an automatic callout point *
1797 *************************************************/
1798
1799 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1800 callout points before each pattern item.
1801
1802 Arguments:
1803 code current code pointer
1804 ptr current pattern pointer
1805 cd pointers to tables etc
1806
1807 Returns: new code pointer
1808 */
1809
1810 static uschar *
1811 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1812 {
1813 *code++ = OP_CALLOUT;
1814 *code++ = 255;
1815 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1816 PUT(code, LINK_SIZE, 0); /* Default length */
1817 return code + 2*LINK_SIZE;
1818 }
1819
1820
1821
1822 /*************************************************
1823 * Complete a callout item *
1824 *************************************************/
1825
1826 /* A callout item contains the length of the next item in the pattern, which
1827 we can't fill in till after we have reached the relevant point. This is used
1828 for both automatic and manual callouts.
1829
1830 Arguments:
1831 previous_callout points to previous callout item
1832 ptr current pattern pointer
1833 cd pointers to tables etc
1834
1835 Returns: nothing
1836 */
1837
1838 static void
1839 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1840 {
1841 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1842 PUT(previous_callout, 2 + LINK_SIZE, length);
1843 }
1844
1845
1846
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* This function is given the start and end of a class range, in UTF-8 mode
with UCP support. It looks upwards from the start for the first character that
has an "other" case, and then for as long as the following characters have
other cases that are consecutive, thus finding one range of other case
characters. The start value is updated, so that repeated calls return the
ranges one by one.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more (in which case
                nothing is written via any of the pointers)
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int c = *cptr;
unsigned int oc_first;
unsigned int oc_last;

/* Find the first character in the range that has another case */

for (;;)
  {
  if (c > d) return FALSE;
  oc_first = _pcre_ucp_othercase(c);
  if (oc_first != NOTACHAR) break;
  c++;
  }

*ocptr = oc_first;

/* Extend the range for as long as each following character's other case is
one more than that of the character before. */

oc_last = oc_first;
c++;

while (c <= d && _pcre_ucp_othercase(c) == oc_last + 1)
  {
  oc_last++;
  c++;
  }

*odptr = oc_last;
*cptr = c;

return TRUE;
}
#endif  /* SUPPORT_UCP */
1892
1893
1894
1895 /*************************************************
1896 * Check if auto-possessifying is possible *
1897 *************************************************/
1898
1899 /* This function is called for unlimited repeats of certain items, to see
1900 whether the next thing could possibly match the repeated item. If not, it makes
1901 sense to automatically possessify the repeated item.
1902
1903 Arguments:
1904 op_code the repeated op code
1905 this data for this item, depends on the opcode
1906 utf8 TRUE in UTF-8 mode
1907 utf8_char used for utf8 character bytes, NULL if not relevant
1908 ptr next character in pattern
1909 options options bits
1910 cd contains pointers to tables etc.
1911
1912 Returns: TRUE if possessifying is wanted
1913 */
1914
1915 static BOOL
1916 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1917 const uschar *ptr, int options, compile_data *cd)
1918 {
1919 int next;
1920
1921 /* Skip whitespace and comments in extended mode */
1922
1923 if ((options & PCRE_EXTENDED) != 0)
1924 {
1925 for (;;)
1926 {
1927 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1928 if (*ptr == '#')
1929 {
1930 while (*(++ptr) != 0)
1931 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1932 }
1933 else break;
1934 }
1935 }
1936
1937 /* If the next item is one that we can handle, get its value. A non-negative
1938 value is a character, a negative value is an escape value. */
1939
1940 if (*ptr == '\\')
1941 {
1942 int temperrorcode = 0;
1943 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1944 if (temperrorcode != 0) return FALSE;
1945 ptr++; /* Point after the escape sequence */
1946 }
1947
1948 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1949 {
1950 #ifdef SUPPORT_UTF8
1951 if (utf8) { GETCHARINC(next, ptr); } else
1952 #endif
1953 next = *ptr++;
1954 }
1955
1956 else return FALSE;
1957
1958 /* Skip whitespace and comments in extended mode */
1959
1960 if ((options & PCRE_EXTENDED) != 0)
1961 {
1962 for (;;)
1963 {
1964 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1965 if (*ptr == '#')
1966 {
1967 while (*(++ptr) != 0)
1968 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1969 }
1970 else break;
1971 }
1972 }
1973
1974 /* If the next thing is itself optional, we have to give up. */
1975
1976 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1977 return FALSE;
1978
1979 /* Now compare the next item with the previous opcode. If the previous is a
1980 positive single character match, "item" either contains the character or, if
1981 "item" is greater than 127 in utf8 mode, the character's bytes are in
1982 utf8_char. */
1983
1984
1985 /* Handle cases when the next item is a character. */
1986
1987 if (next >= 0) switch(op_code)
1988 {
1989 case OP_CHAR:
1990 #ifdef SUPPORT_UTF8
1991 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1992 #endif
1993 return item != next;
1994
1995 /* For CHARNC (caseless character) we must check the other case. If we have
1996 Unicode property support, we can use it to test the other case of
1997 high-valued characters. */
1998
1999 case OP_CHARNC:
2000 #ifdef SUPPORT_UTF8
2001 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2002 #endif
2003 if (item == next) return FALSE;
2004 #ifdef SUPPORT_UTF8
2005 if (utf8)
2006 {
2007 unsigned int othercase;
2008 if (next < 128) othercase = cd->fcc[next]; else
2009 #ifdef SUPPORT_UCP
2010 othercase = _pcre_ucp_othercase((unsigned int)next);
2011 #else
2012 othercase = NOTACHAR;
2013 #endif
2014 return (unsigned int)item != othercase;
2015 }
2016 else
2017 #endif /* SUPPORT_UTF8 */
2018 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2019
2020 /* For OP_NOT, "item" must be a single-byte character. */
2021
2022 case OP_NOT:
2023 if (next < 0) return FALSE; /* Not a character */
2024 if (item == next) return TRUE;
2025 if ((options & PCRE_CASELESS) == 0) return FALSE;
2026 #ifdef SUPPORT_UTF8
2027 if (utf8)
2028 {
2029 unsigned int othercase;
2030 if (next < 128) othercase = cd->fcc[next]; else
2031 #ifdef SUPPORT_UCP
2032 othercase = _pcre_ucp_othercase(next);
2033 #else
2034 othercase = NOTACHAR;
2035 #endif
2036 return (unsigned int)item == othercase;
2037 }
2038 else
2039 #endif /* SUPPORT_UTF8 */
2040 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2041
2042 case OP_DIGIT:
2043 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2044
2045 case OP_NOT_DIGIT:
2046 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2047
2048 case OP_WHITESPACE:
2049 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2050
2051 case OP_NOT_WHITESPACE:
2052 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2053
2054 case OP_WORDCHAR:
2055 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2056
2057 case OP_NOT_WORDCHAR:
2058 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2059
2060 case OP_HSPACE:
2061 case OP_NOT_HSPACE:
2062 switch(next)
2063 {
2064 case 0x09:
2065 case 0x20:
2066 case 0xa0:
2067 case 0x1680:
2068 case 0x180e:
2069 case 0x2000:
2070 case 0x2001:
2071 case 0x2002:
2072 case 0x2003:
2073 case 0x2004:
2074 case 0x2005:
2075 case 0x2006:
2076 case 0x2007:
2077 case 0x2008:
2078 case 0x2009:
2079 case 0x200A:
2080 case 0x202f:
2081 case 0x205f:
2082 case 0x3000:
2083 return op_code != OP_HSPACE;
2084 default:
2085 return op_code == OP_HSPACE;
2086 }
2087
2088 case OP_VSPACE:
2089 case OP_NOT_VSPACE:
2090 switch(next)
2091 {
2092 case 0x0a:
2093 case 0x0b:
2094 case 0x0c:
2095 case 0x0d:
2096 case 0x85:
2097 case 0x2028:
2098 case 0x2029:
2099 return op_code != OP_VSPACE;
2100 default:
2101 return op_code == OP_VSPACE;
2102 }
2103
2104 default:
2105 return FALSE;
2106 }
2107
2108
2109 /* Handle the case when the next item is \d, \s, etc. */
2110
2111 switch(op_code)
2112 {
2113 case OP_CHAR:
2114 case OP_CHARNC:
2115 #ifdef SUPPORT_UTF8
2116 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2117 #endif
2118 switch(-next)
2119 {
2120 case ESC_d:
2121 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2122
2123 case ESC_D:
2124 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2125
2126 case ESC_s:
2127 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2128
2129 case ESC_S:
2130 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2131
2132 case ESC_w:
2133 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2134
2135 case ESC_W:
2136 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2137
2138 case ESC_h:
2139 case ESC_H:
2140 switch(item)
2141 {
2142 case 0x09:
2143 case 0x20:
2144 case 0xa0:
2145 case 0x1680:
2146 case 0x180e:
2147 case 0x2000:
2148 case 0x2001:
2149 case 0x2002:
2150 case 0x2003:
2151 case 0x2004:
2152 case 0x2005:
2153 case 0x2006:
2154 case 0x2007:
2155 case 0x2008:
2156 case 0x2009:
2157 case 0x200A:
2158 case 0x202f:
2159 case 0x205f:
2160 case 0x3000:
2161 return -next != ESC_h;
2162 default:
2163 return -next == ESC_h;
2164 }
2165
2166 case ESC_v:
2167 case ESC_V:
2168 switch(item)
2169 {
2170 case 0x0a:
2171 case 0x0b:
2172 case 0x0c:
2173 case 0x0d:
2174 case 0x85:
2175 case 0x2028:
2176 case 0x2029:
2177 return -next != ESC_v;
2178 default:
2179 return -next == ESC_v;
2180 }
2181
2182 default:
2183 return FALSE;
2184 }
2185
2186 case OP_DIGIT:
2187 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2188 next == -ESC_h || next == -ESC_v;
2189
2190 case OP_NOT_DIGIT:
2191 return next == -ESC_d;
2192
2193 case OP_WHITESPACE:
2194 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2195
2196 case OP_NOT_WHITESPACE:
2197 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2198
2199 case OP_HSPACE:
2200 return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2201
2202 case OP_NOT_HSPACE:
2203 return next == -ESC_h;
2204
2205 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2206 case OP_VSPACE:
2207 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2208
2209 case OP_NOT_VSPACE:
2210 return next == -ESC_v;
2211
2212 case OP_WORDCHAR:
2213 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2214
2215 case OP_NOT_WORDCHAR:
2216 return next == -ESC_w || next == -ESC_d;
2217
2218 default:
2219 return FALSE;
2220 }
2221
2222 /* Control does not reach here */
2223 }
2224
2225
2226
2227 /*************************************************
2228 * Compile one branch *
2229 *************************************************/
2230
2231 /* Scan the pattern, compiling it into a vector. If the options are
2232 changed during the branch, the pointer is used to change the external options
2233 bits. This function is used during the pre-compile phase when we are trying
2234 to find out the amount of memory needed, as well as during the real compile
2235 phase. The value of lengthptr distinguishes the two phases.
2236
2237 Arguments:
2238 optionsptr pointer to the option bits
2239 codeptr points to the pointer to the current code point
2240 ptrptr points to the current pattern pointer
2241 errorcodeptr points to error code variable
2242 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2243 reqbyteptr set to the last literal character required, else < 0
2244 bcptr points to current branch chain
2245 cd contains pointers to tables etc.
2246 lengthptr NULL during the real compile phase
2247 points to length accumulator during pre-compile phase
2248
2249 Returns: TRUE on success
2250 FALSE, with *errorcodeptr set non-zero on error
2251 */
2252
2253 static BOOL
2254 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2255 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2256 compile_data *cd, int *lengthptr)
2257 {
2258 int repeat_type, op_type;
2259 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2260 int bravalue = 0;
2261 int greedy_default, greedy_non_default;
2262 int firstbyte, reqbyte;
2263 int zeroreqbyte, zerofirstbyte;
2264 int req_caseopt, reqvary, tempreqvary;
2265 int options = *optionsptr;
2266 int after_manual_callout = 0;
2267 int length_prevgroup = 0;
2268 register int c;
2269 register uschar *code = *codeptr;
2270 uschar *last_code = code;
2271 uschar *orig_code = code;
2272 uschar *tempcode;
2273 BOOL inescq = FALSE;
2274 BOOL groupsetfirstbyte = FALSE;
2275 const uschar *ptr = *ptrptr;
2276 const uschar *tempptr;
2277 uschar *previous = NULL;
2278 uschar *previous_callout = NULL;
2279 uschar *save_hwm = NULL;
2280 uschar classbits[32];
2281
2282 #ifdef SUPPORT_UTF8
2283 BOOL class_utf8;
2284 BOOL utf8 = (options & PCRE_UTF8) != 0;
2285 uschar *class_utf8data;
2286 uschar utf8_char[6];
2287 #else
2288 BOOL utf8 = FALSE;
2289 uschar *utf8_char = NULL;
2290 #endif
2291
2292 #ifdef DEBUG
2293 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2294 #endif
2295
2296 /* Set up the default and non-default settings for greediness */
2297
2298 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2299 greedy_non_default = greedy_default ^ 1;
2300
2301 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2302 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2303 matches a non-fixed char first char; reqbyte just remains unset if we never
2304 find one.
2305
2306 When we hit a repeat whose minimum is zero, we may have to adjust these values
2307 to take the zero repeat into account. This is implemented by setting them to
2308 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2309 item types that can be repeated set these backoff variables appropriately. */
2310
2311 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2312
2313 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2314 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2315 value > 255. It is added into the firstbyte or reqbyte variables to record the
2316 case status of the value. This is used only for ASCII characters. */
2317
2318 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2319
2320 /* Switch on next character until the end of the branch */
2321
2322 for (;; ptr++)
2323 {
2324 BOOL negate_class;
2325 BOOL possessive_quantifier;
2326 BOOL is_quantifier;
2327 BOOL is_recurse;
2328 BOOL reset_bracount;
2329 int class_charcount;
2330 int class_lastchar;
2331 int newoptions;
2332 int recno;
2333 int refsign;
2334 int skipbytes;
2335 int subreqbyte;
2336 int subfirstbyte;
2337 int terminator;
2338 int mclength;
2339 uschar mcbuffer[8];
2340
2341 /* Get next byte in the pattern */
2342
2343 c = *ptr;
2344
2345 /* If we are in the pre-compile phase, accumulate the length used for the
2346 previous cycle of this loop. */
2347
2348 if (lengthptr != NULL)
2349 {
2350 #ifdef DEBUG
2351 if (code > cd->hwm) cd->hwm = code; /* High water info */
2352 #endif
2353 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2354 {
2355 *errorcodeptr = ERR52;
2356 goto FAILED;
2357 }
2358
2359 /* There is at least one situation where code goes backwards: this is the
2360 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2361 the class is simply eliminated. However, it is created first, so we have to
2362 allow memory for it. Therefore, don't ever reduce the length at this point.
2363 */
2364
2365 if (code < last_code) code = last_code;
2366
2367 /* Paranoid check for integer overflow */
2368
2369 if (OFLOW_MAX - *lengthptr < code - last_code)
2370 {
2371 *errorcodeptr = ERR20;
2372 goto FAILED;
2373 }
2374
2375 *lengthptr += code - last_code;
2376 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2377
2378 /* If "previous" is set and it is not at the start of the work space, move
2379 it back to there, in order to avoid filling up the work space. Otherwise,
2380 if "previous" is NULL, reset the current code pointer to the start. */
2381
2382 if (previous != NULL)
2383 {
2384 if (previous > orig_code)
2385 {
2386 memmove(orig_code, previous, code - previous);
2387 code -= previous - orig_code;
2388 previous = orig_code;
2389 }
2390 }
2391 else code = orig_code;
2392
2393 /* Remember where this code item starts so we can pick up the length
2394 next time round. */
2395
2396 last_code = code;
2397 }
2398
2399 /* In the real compile phase, just check the workspace used by the forward
2400 reference list. */
2401
2402 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2403 {
2404 *errorcodeptr = ERR52;
2405 goto FAILED;
2406 }
2407
2408 /* If in \Q...\E, check for the end; if not, we have a literal */
2409
2410 if (inescq && c != 0)
2411 {
2412 if (c == '\\' && ptr[1] == 'E')
2413 {
2414 inescq = FALSE;
2415 ptr++;
2416 continue;
2417 }
2418 else
2419 {
2420 if (previous_callout != NULL)
2421 {
2422 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2423 complete_callout(previous_callout, ptr, cd);
2424 previous_callout = NULL;
2425 }
2426 if ((options & PCRE_AUTO_CALLOUT) != 0)
2427 {
2428 previous_callout = code;
2429 code = auto_callout(code, ptr, cd);
2430 }
2431 goto NORMAL_CHAR;
2432 }
2433 }
2434
2435 /* Fill in length of a previous callout, except when the next thing is
2436 a quantifier. */
2437
2438 is_quantifier = c == '*' || c == '+' || c == '?' ||
2439 (c == '{' && is_counted_repeat(ptr+1));
2440
2441 if (!is_quantifier && previous_callout != NULL &&
2442 after_manual_callout-- <= 0)
2443 {
2444 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2445 complete_callout(previous_callout, ptr, cd);
2446 previous_callout = NULL;
2447 }
2448
2449 /* In extended mode, skip white space and comments */
2450
2451 if ((options & PCRE_EXTENDED) != 0)
2452 {
2453 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2454 if (c == '#')
2455 {
2456 while (*(++ptr) != 0)
2457 {
2458 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2459 }
2460 if (*ptr != 0) continue;
2461
2462 /* Else fall through to handle end of string */
2463 c = 0;
2464 }
2465 }
2466
2467 /* No auto callout for quantifiers. */
2468
2469 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2470 {
2471 previous_callout = code;
2472 code = auto_callout(code, ptr, cd);
2473 }
2474
2475 switch(c)
2476 {
2477 /* ===================================================================*/
2478 case 0: /* The branch terminates at string end */
2479 case '|': /* or | or ) */
2480 case ')':
2481 *firstbyteptr = firstbyte;
2482 *reqbyteptr = reqbyte;
2483 *codeptr = code;
2484 *ptrptr = ptr;
2485 if (lengthptr != NULL)
2486 {
2487 if (OFLOW_MAX - *lengthptr < code - last_code)
2488 {
2489 *errorcodeptr = ERR20;
2490 goto FAILED;
2491 }
2492 *lengthptr += code - last_code; /* To include callout length */
2493 DPRINTF((">> end branch\n"));
2494 }
2495 return TRUE;
2496
2497
2498 /* ===================================================================*/
2499 /* Handle single-character metacharacters. In multiline mode, ^ disables
2500 the setting of any following char as a first character. */
2501
2502 case '^':
2503 if ((options & PCRE_MULTILINE) != 0)
2504 {
2505 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2506 }
2507 previous = NULL;
2508 *code++ = OP_CIRC;
2509 break;
2510
2511 case '$':
2512 previous = NULL;
2513 *code++ = OP_DOLL;
2514 break;
2515
2516 /* There can never be a first char if '.' is first, whatever happens about
2517 repeats. The value of reqbyte doesn't change either. */
2518
2519 case '.':
2520 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2521 zerofirstbyte = firstbyte;
2522 zeroreqbyte = reqbyte;
2523 previous = code;
2524 *code++ = OP_ANY;
2525 break;
2526
2527
2528 /* ===================================================================*/
2529 /* Character classes. If the included characters are all < 256, we build a
2530 32-byte bitmap of the permitted characters, except in the special case
2531 where there is only one such character. For negated classes, we build the
2532 map as usual, then invert it at the end. However, we use a different opcode
2533 so that data characters > 255 can be handled correctly.
2534
2535 If the class contains characters outside the 0-255 range, a different
2536 opcode is compiled. It may optionally have a bit map for characters < 256,
2537 but those above are explicitly listed afterwards. A flag byte tells
2538 whether the bitmap is present, and whether this is a negated class or not.
2539 */
2540
2541 case '[':
2542 previous = code;
2543
2544 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2545 they are encountered at the top level, so we'll do that too. */
2546
2547 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2548 check_posix_syntax(ptr, &tempptr, cd))
2549 {
2550 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2551 goto FAILED;
2552 }
2553
2554 /* If the first character is '^', set the negation flag and skip it. Also,
2555 if the first few characters (either before or after ^) are \Q\E or \E we
2556 skip them too. This makes for compatibility with Perl. */
2557
2558 negate_class = FALSE;
2559 for (;;)
2560 {
2561 c = *(++ptr);
2562 if (c == '\\')
2563 {
2564 if (ptr[1] == 'E') ptr++;
2565 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2566 else break;
2567 }
2568 else if (!negate_class && c == '^')
2569 negate_class = TRUE;
2570 else break;
2571 }
2572
2573 /* Keep a count of chars with values < 256 so that we can optimize the case
2574 of just a single character (as long as it's < 256). However, for higher
2575 valued UTF-8 characters, we don't yet do any optimization. */
2576
2577 class_charcount = 0;
2578 class_lastchar = -1;
2579
2580 /* Initialize the 32-char bit map to all zeros. We build the map in a
2581 temporary bit of memory, in case the class contains only 1 character (less
2582 than 256), because in that case the compiled code doesn't use the bit map.
2583 */
2584
2585 memset(classbits, 0, 32 * sizeof(uschar));
2586
2587 #ifdef SUPPORT_UTF8
2588 class_utf8 = FALSE; /* No chars >= 256 */
2589 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2590 #endif
2591
2592 /* Process characters until ] is reached. By writing this as a "do" it
2593 means that an initial ] is taken as a data character. At the start of the
2594 loop, c contains the first byte of the character. */
2595
2596 if (c != 0) do
2597 {
2598 const uschar *oldptr;
2599
2600 #ifdef SUPPORT_UTF8
2601 if (utf8 && c > 127)
2602 { /* Braces are required because the */
2603 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2604 }
2605 #endif
2606
2607 /* Inside \Q...\E everything is literal except \E */
2608
2609 if (inescq)
2610 {
2611 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2612 {
2613 inescq = FALSE; /* Reset literal state */
2614 ptr++; /* Skip the 'E' */
2615 continue; /* Carry on with next */
2616 }
2617 goto CHECK_RANGE; /* Could be range if \E follows */
2618 }
2619
2620 /* Handle POSIX class names. Perl allows a negation extension of the
2621 form [:^name:]. A square bracket that doesn't match the syntax is
2622 treated as a literal. We also recognize the POSIX constructions
2623 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2624 5.6 and 5.8 do. */
2625
2626 if (c == '[' &&
2627 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2628 check_posix_syntax(ptr, &tempptr, cd))
2629 {
2630 BOOL local_negate = FALSE;
2631 int posix_class, taboffset, tabopt;
2632 register const uschar *cbits = cd->cbits;
2633 uschar pbits[32];
2634
2635 if (ptr[1] != ':')
2636 {
2637 *errorcodeptr = ERR31;
2638 goto FAILED;
2639 }
2640
2641 ptr += 2;
2642 if (*ptr == '^')
2643 {
2644 local_negate = TRUE;
2645 ptr++;
2646 }
2647
2648 posix_class = check_posix_name(ptr, tempptr - ptr);
2649 if (posix_class < 0)
2650 {
2651 *errorcodeptr = ERR30;
2652 goto FAILED;
2653 }
2654
2655 /* If matching is caseless, upper and lower are converted to
2656 alpha. This relies on the fact that the class table starts with
2657 alpha, lower, upper as the first 3 entries. */
2658
2659 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2660 posix_class = 0;
2661
2662 /* We build the bit map for the POSIX class in a chunk of local store
2663 because we may be adding and subtracting from it, and we don't want to
2664 subtract bits that may be in the main map already. At the end we or the
2665 result into the bit map that is being built. */
2666
2667 posix_class *= 3;
2668
2669 /* Copy in the first table (always present) */
2670
2671 memcpy(pbits, cbits + posix_class_maps[posix_class],
2672 32 * sizeof(uschar));
2673
2674 /* If there is a second table, add or remove it as required. */
2675
2676 taboffset = posix_class_maps[posix_class + 1];
2677 tabopt = posix_class_maps[posix_class + 2];
2678
2679 if (taboffset >= 0)
2680 {
2681 if (tabopt >= 0)
2682 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2683 else
2684 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2685 }
2686
2687 /* Now see if we need to remove any special characters. An option
2688 value of 1 removes vertical space and 2 removes underscore. */
2689
2690 if (tabopt < 0) tabopt = -tabopt;
2691 if (tabopt == 1) pbits[1] &= ~0x3c;
2692 else if (tabopt == 2) pbits[11] &= 0x7f;
2693
2694 /* Add the POSIX table or its complement into the main table that is
2695 being built and we are done. */
2696
2697 if (local_negate)
2698 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2699 else
2700 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2701
2702 ptr = tempptr + 1;
2703 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2704 continue; /* End of POSIX syntax handling */
2705 }
2706
2707 /* Backslash may introduce a single character, or it may introduce one
2708 of the specials, which just set a flag. The sequence \b is a special
2709 case. Inside a class (and only there) it is treated as backspace.
2710 Elsewhere it marks a word boundary. Other escapes have preset maps ready
2711 to 'or' into the one we are building. We assume they have more than one
2712 character in them, so set class_charcount bigger than one. */
2713
2714 if (c == '\\')
2715 {
2716 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2717 if (*errorcodeptr != 0) goto FAILED;
2718
2719 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2720 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2721 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2722 else if (-c == ESC_Q) /* Handle start of quoted string */
2723 {
2724 if (ptr[1] == '\\' && ptr[2] == 'E')
2725 {
2726 ptr += 2; /* avoid empty string */
2727 }
2728 else inescq = TRUE;
2729 continue;
2730 }
2731 else if (-c == ESC_E) continue; /* Ignore orphan \E */
2732
2733 if (c < 0)
2734 {
2735 register const uschar *cbits = cd->cbits;
2736 class_charcount += 2; /* Greater than 1 is what matters */
2737
2738 /* Save time by not doing this in the pre-compile phase. */
2739
2740 if (lengthptr == NULL) switch (-c)
2741 {
2742 case ESC_d:
2743 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2744 continue;
2745
2746 case ESC_D:
2747 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2748 continue;
2749
2750 case ESC_w:
2751 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2752 continue;
2753
2754 case ESC_W:
2755 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2756 continue;
2757
2758 case ESC_s:
2759 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2760 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2761 continue;
2762
2763 case ESC_S:
2764 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2765 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2766 continue;
2767
2768 case ESC_E: /* Perl ignores an orphan \E */
2769 continue;
2770
2771 default: /* Not recognized; fall through */
2772 break; /* Need "default" setting to stop compiler warning. */
2773 }
2774
2775 /* In the pre-compile phase, just do the recognition. */
2776
2777 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2778 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2779
2780 /* We need to deal with \H, \h, \V, and \v in both phases because
2781 they use extra memory. */
2782
2783 if (-c == ESC_h)
2784 {
2785 SETBIT(classbits, 0x09); /* VT */
2786 SETBIT(classbits, 0x20); /* SPACE */
2787 SETBIT(classbits, 0xa0); /* NSBP */
2788 #ifdef SUPPORT_UTF8
2789 if (utf8)
2790 {
2791 class_utf8 = TRUE;
2792 *class_utf8data++ = XCL_SINGLE;
2793 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2794 *class_utf8data++ = XCL_SINGLE;
2795 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2796 *class_utf8data++ = XCL_RANGE;
2797 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2798 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2799 *class_utf8data++ = XCL_SINGLE;
2800 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2801 *class_utf8data++ = XCL_SINGLE;
2802 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2803 *class_utf8data++ = XCL_SINGLE;
2804 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2805 }
2806 #endif
2807 continue;
2808 }
2809
2810 if (-c == ESC_H)
2811 {
2812 for (c = 0; c < 32; c++)
2813 {
2814 int x = 0xff;
2815 switch (c)
2816 {
2817 case 0x09/8: x ^= 1 << (0x09%8); break;
2818 case 0x20/8: x ^= 1 << (0x20%8); break;
2819 case 0xa0/8: x ^= 1 << (0xa0%8); break;
2820 default: break;
2821 }
2822 classbits[c] |= x;
2823 }
2824
2825 #ifdef SUPPORT_UTF8
2826 if (utf8)
2827 {
2828 class_utf8 = TRUE;
2829 *class_utf8data++ = XCL_RANGE;
2830 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2831 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2832 *class_utf8data++ = XCL_RANGE;
2833 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2834 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2835 *class_utf8data++ = XCL_RANGE;
2836 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2837 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2838 *class_utf8data++ = XCL_RANGE;
2839 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2840 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2841 *class_utf8data++ = XCL_RANGE;
2842 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2843 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2844 *class_utf8data++ = XCL_RANGE;
2845 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2846 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2847 *class_utf8data++ = XCL_RANGE;
2848 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2849 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2850 }
2851 #endif
2852 continue;
2853 }
2854
2855 if (-c == ESC_v)
2856 {
2857 SETBIT(classbits, 0x0a); /* LF */
2858 SETBIT(classbits, 0x0b); /* VT */
2859 SETBIT(classbits, 0x0c); /* FF */
2860 SETBIT(classbits, 0x0d); /* CR */
2861 SETBIT(classbits, 0x85); /* NEL */
2862 #ifdef SUPPORT_UTF8
2863 if (utf8)
2864 {
2865 class_utf8 = TRUE;
2866 *class_utf8data++ = XCL_RANGE;
2867 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2868 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2869 }
2870 #endif
2871 continue;
2872 }
2873
2874 if (-c == ESC_V)
2875 {
2876 for (c = 0; c < 32; c++)
2877 {
2878 int x = 0xff;
2879 switch (c)
2880 {
2881 case 0x0a/8: x ^= 1 << (0x0a%8);
2882 x ^= 1 << (0x0b%8);
2883 x ^= 1 << (0x0c%8);
2884 x ^= 1 << (0x0d%8);
2885 break;
2886 case 0x85/8: x ^= 1 << (0x85%8); break;
2887 default: break;
2888 }
2889 classbits[c] |= x;
2890 }
2891
2892 #ifdef SUPPORT_UTF8
2893 if (utf8)
2894 {
2895 class_utf8 = TRUE;
2896 *class_utf8data++ = XCL_RANGE;
2897 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2898 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2899 *class_utf8data++ = XCL_RANGE;
2900 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2901 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2902 }
2903 #endif
2904 continue;
2905 }
2906
2907 /* We need to deal with \P and \p in both phases. */
2908
2909 #ifdef SUPPORT_UCP
2910 if (-c == ESC_p || -c == ESC_P)
2911 {
2912 BOOL negated;
2913 int pdata;
2914 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2915 if (ptype < 0) goto FAILED;
2916 class_utf8 = TRUE;
2917 *class_utf8data++ = ((-c == ESC_p) != negated)?
2918 XCL_PROP : XCL_NOTPROP;
2919 *class_utf8data++ = ptype;
2920 *class_utf8data++ = pdata;
2921 class_charcount -= 2; /* Not a < 256 character */
2922 continue;
2923 }
2924 #endif
2925 /* Unrecognized escapes are faulted if PCRE is running in its
2926 strict mode. By default, for compatibility with Perl, they are
2927 treated as literals. */
2928
2929 if ((options & PCRE_EXTRA) != 0)
2930 {
2931 *errorcodeptr = ERR7;
2932 goto FAILED;
2933 }
2934
2935 class_charcount -= 2; /* Undo the default count from above */
2936 c = *ptr; /* Get the final character and fall through */
2937 }
2938
2939 /* Fall through if we have a single character (c >= 0). This may be
2940 greater than 256 in UTF-8 mode. */
2941
2942 } /* End of backslash handling */
2943
2944 /* A single character may be followed by '-' to form a range. However,
2945 Perl does not permit ']' to be the end of the range. A '-' character
2946 at the end is treated as a literal. Perl ignores orphaned \E sequences
2947 entirely. The code for handling \Q and \E is messy. */
2948
2949 CHECK_RANGE:
2950 while (ptr[1] == '\\' && ptr[2] == 'E')
2951 {
2952 inescq = FALSE;
2953 ptr += 2;
2954 }
2955
2956 oldptr = ptr;
2957
2958 if (!inescq && ptr[1] == '-')
2959 {
2960 int d;
2961 ptr += 2;
2962 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2963
2964 /* If we hit \Q (not followed by \E) at this point, go into escaped
2965 mode. */
2966
2967 while (*ptr == '\\' && ptr[1] == 'Q')
2968 {
2969 ptr += 2;
2970 if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2971 inescq = TRUE;
2972 break;
2973 }
2974
2975 if (*ptr == 0 || (!inescq && *ptr == ']'))
2976 {
2977 ptr = oldptr;
2978 goto LONE_SINGLE_CHARACTER;
2979 }
2980
2981 #ifdef SUPPORT_UTF8
2982 if (utf8)
2983 { /* Braces are required because the */
2984 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2985 }
2986 else
2987 #endif
2988 d = *ptr; /* Not UTF-8 mode */
2989
2990 /* The second part of a range can be a single-character escape, but
2991 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2992 in such circumstances. */
2993
2994 if (!inescq && d == '\\')
2995 {
2996 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2997 if (*errorcodeptr != 0) goto FAILED;
2998
2999 /* \b is backspace; \X is literal X; \R is literal R; any other
3000 special means the '-' was literal */
3001
3002 if (d < 0)
3003 {
3004 if (d == -ESC_b) d = '\b';
3005 else if (d == -ESC_X) d = 'X';
3006 else if (d == -ESC_R) d = 'R'; else
3007 {
3008 ptr = oldptr;
3009 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3010 }
3011 }
3012 }
3013
3014 /* Check that the two values are in the correct order. Optimize
3015 one-character ranges */
3016
3017 if (d < c)
3018 {
3019 *errorcodeptr = ERR8;
3020 goto FAILED;
3021 }
3022
3023 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3024
3025 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3026 matching, we have to use an XCLASS with extra data items. Caseless
3027 matching for characters > 127 is available only if UCP support is
3028 available. */
3029
3030 #ifdef SUPPORT_UTF8
3031 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3032 {
3033 class_utf8 = TRUE;
3034
3035 /* With UCP support, we can find the other case equivalents of
3036 the relevant characters. There may be several ranges. Optimize how
3037 they fit with the basic range. */
3038
3039 #ifdef SUPPORT_UCP
3040 if ((options & PCRE_CASELESS) != 0)
3041 {
3042 unsigned int occ, ocd;
3043 unsigned int cc = c;
3044 unsigned int origd = d;
3045 while (get_othercase_range(&cc, origd, &occ, &ocd))
3046 {
3047 if (occ >= (unsigned int)c &&
3048 ocd <= (unsigned int)d)
3049 continue; /* Skip embedded ranges */
3050
3051 if (occ < (unsigned int)c &&
3052 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3053 { /* if there is overlap, */
3054 c = occ; /* noting that if occ < c */
3055 continue; /* we can't have ocd > d */
3056 } /* because a subrange is */
3057 if (ocd > (unsigned int)d &&
3058 occ <= (unsigned int)d + 1) /* always shorter than */
3059 { /* the basic range. */
3060 d = ocd;
3061 continue;
3062 }
3063
3064 if (occ == ocd)
3065 {
3066 *class_utf8data++ = XCL_SINGLE;
3067 }
3068 else
3069 {
3070 *class_utf8data++ = XCL_RANGE;
3071 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3072 }
3073 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3074 }
3075 }
3076 #endif /* SUPPORT_UCP */
3077
3078 /* Now record the original range, possibly modified for UCP caseless
3079 overlapping ranges. */
3080
3081 *class_utf8data++ = XCL_RANGE;
3082 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3083 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3084
3085 /* With UCP support, we are done. Without UCP support, there is no
3086 caseless matching for UTF-8 characters > 127; we can use the bit map
3087 for the smaller ones. */
3088
3089 #ifdef SUPPORT_UCP
3090 continue; /* With next character in the class */
3091 #else
3092 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3093
3094 /* Adjust upper limit and fall through to set up the map */
3095
3096 d = 127;
3097
3098 #endif /* SUPPORT_UCP */
3099 }
3100 #endif /* SUPPORT_UTF8 */
3101
3102 /* We use the bit map for all cases when not in UTF-8 mode; else
3103 ranges that lie entirely within 0-127 when there is UCP support; else
3104 for partial ranges without UCP support. */
3105
3106 class_charcount += d - c + 1;
3107 class_lastchar = d;
3108
3109 /* We can save a bit of time by skipping this in the pre-compile. */
3110
3111 if (lengthptr == NULL) for (; c <= d; c++)
3112 {
3113 classbits[c/8] |= (1 << (c&7));
3114 if ((options & PCRE_CASELESS) != 0)
3115 {
3116 int uc = cd->fcc[c]; /* flip case */
3117 classbits[uc/8] |= (1 << (uc&7));
3118 }
3119 }
3120
3121 continue; /* Go get the next char in the class */
3122 }
3123
3124 /* Handle a lone single character - we can get here for a normal
3125 non-escape char, or after \ that introduces a single character or for an
3126 apparent range that isn't. */
3127
3128 LONE_SINGLE_CHARACTER:
3129
3130 /* Handle a character that cannot go in the bit map */
3131
3132 #ifdef SUPPORT_UTF8
3133 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3134 {
3135 class_utf8 = TRUE;
3136 *class_utf8data++ = XCL_SINGLE;
3137 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3138
3139 #ifdef SUPPORT_UCP
3140 if ((options & PCRE_CASELESS) != 0)
3141 {
3142 unsigned int othercase;
3143 if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3144 {
3145 *class_utf8data++ = XCL_SINGLE;
3146 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3147 }
3148 }
3149 #endif /* SUPPORT_UCP */
3150
3151 }
3152 else
3153 #endif /* SUPPORT_UTF8 */
3154
3155 /* Handle a single-byte character */
3156 {
3157 classbits[c/8] |= (1 << (c&7));
3158 if ((options & PCRE_CASELESS) != 0)
3159 {
3160 c = cd->fcc[c]; /* flip case */
3161 classbits[c/8] |= (1 << (c&7));
3162 }
3163 class_charcount++;
3164 class_lastchar = c;
3165 }
3166 }
3167
3168 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3169
3170 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3171
3172 if (c == 0) /* Missing terminating ']' */
3173 {
3174 *errorcodeptr = ERR6;
3175 goto FAILED;
3176 }
3177
3178 /* If class_charcount is 1, we saw precisely one character whose value is
3179 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
3180 can optimize the negative case only if there were no characters >= 128
3181 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
3182 single-bytes only. This is an historical hangover. Maybe one day we can
3183 tidy these opcodes to handle multi-byte characters.
3184
3185 The optimization throws away the bit map. We turn the item into a
3186 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3187 that OP_NOT does not support multibyte characters. In the positive case, it
3188 can cause firstbyte to be set. Otherwise, there can be no first char if
3189 this item is first, whatever repeat count may follow. In the case of
3190 reqbyte, save the previous value for reinstating. */
3191
3192 #ifdef SUPPORT_UTF8
3193 if (class_charcount == 1 &&
3194 (!utf8 ||
3195 (!class_utf8 && (!negate_class || class_lastchar < 128))))
3196
3197 #else
3198 if (class_charcount == 1)
3199 #endif
3200 {
3201 zeroreqbyte = reqbyte;
3202
3203 /* The OP_NOT opcode works on one-byte characters only. */
3204
3205 if (negate_class)
3206 {
3207 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3208 zerofirstbyte = firstbyte;
3209 *code++ = OP_NOT;
3210 *code++ = class_lastchar;
3211 break;
3212 }
3213
3214 /* For a single, positive character, get the value into mcbuffer, and
3215 then we can handle this with the normal one-character code. */
3216
3217 #ifdef SUPPORT_UTF8
3218 if (utf8 && class_lastchar > 127)
3219 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3220 else
3221 #endif
3222 {
3223 mcbuffer[0] = class_lastchar;
3224 mclength = 1;
3225 }
3226 goto ONE_CHAR;
3227 } /* End of 1-char optimization */
3228
3229 /* The general case - not the one-char optimization. If this is the first
3230 thing in the branch, there can be no first char setting, whatever the
3231 repeat count. Any reqbyte setting must remain unchanged after any kind of
3232 repeat. */
3233
3234 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3235 zerofirstbyte = firstbyte;
3236 zeroreqbyte = reqbyte;
3237
3238 /* If there are characters with values > 255, we have to compile an
3239 extended class, with its own opcode. If there are no characters < 256,
3240 we can omit the bitmap in the actual compiled code. */
3241
3242 #ifdef SUPPORT_UTF8
3243 if (class_utf8)
3244 {
3245 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3246 *code++ = OP_XCLASS;
3247 code += LINK_SIZE;
3248 *code = negate_class? XCL_NOT : 0;
3249
3250 /* If the map is required, move up the extra data to make room for it;
3251 otherwise just move the code pointer to the end of the extra data. */
3252
3253 if (class_charcount > 0)
3254 {
3255 *code++ |= XCL_MAP;
3256 memmove(code + 32, code, class_utf8data - code);
3257 memcpy(code, classbits, 32);
3258 code = class_utf8data + 32;
3259 }
3260 else code = class_utf8data;
3261
3262 /* Now fill in the complete length of the item */
3263
3264 PUT(previous, 1, code - previous);
3265 break; /* End of class handling */
3266 }
3267 #endif
3268
3269 /* If there are no characters > 255, negate the 32-byte map if necessary,
3270 and copy it into the code vector. If this is the first thing in the branch,
3271 there can be no first char setting, whatever the repeat count. Any reqbyte
3272 setting must remain unchanged after any kind of repeat. */
3273
3274 if (negate_class)
3275 {
3276 *code++ = OP_NCLASS;
3277 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3278 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3279 }
3280 else
3281 {
3282 *code++ = OP_CLASS;
3283 memcpy(code, classbits, 32);
3284 }
3285 code += 32;
3286 break;
3287
3288
3289 /* ===================================================================*/
3290 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3291 has been tested above. */
3292
3293 case '{':
3294 if (!is_quantifier) goto NORMAL_CHAR;
3295 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3296 if (*errorcodeptr != 0) goto FAILED;
3297 goto REPEAT;
3298
3299 case '*':
3300 repeat_min = 0;
3301 repeat_max = -1;
3302 goto REPEAT;
3303
3304 case '+':
3305 repeat_min = 1;
3306 repeat_max = -1;
3307 goto REPEAT;
3308
3309 case '?':
3310 repeat_min = 0;
3311 repeat_max = 1;
3312
3313 REPEAT:
3314 if (previous == NULL)
3315 {
3316 *errorcodeptr = ERR9;
3317 goto FAILED;
3318 }
3319
3320 if (repeat_min == 0)
3321 {
3322 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3323 reqbyte = zeroreqbyte; /* Ditto */
3324 }
3325
3326 /* Remember whether this is a variable length repeat */
3327
3328 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3329
3330 op_type = 0; /* Default single-char op codes */
3331 possessive_quantifier = FALSE; /* Default not possessive quantifier */
3332
3333 /* Save start of previous item, in case we have to move it up to make space
3334 for an inserted OP_ONCE for the additional '+' extension. */
3335
3336 tempcode = previous;
3337
3338 /* If the next character is '+', we have a possessive quantifier. This
3339 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3340 If the next character is '?' this is a minimizing repeat, by default,
3341 but if PCRE_UNGREEDY is set, it works the other way round. We change the
3342 repeat type to the non-default. */
3343
3344 if (ptr[1] == '+')
3345 {
3346 repeat_type = 0; /* Force greedy */
3347 possessive_quantifier = TRUE;
3348 ptr++;
3349 }
3350 else if (ptr[1] == '?')
3351 {
3352 repeat_type = greedy_non_default;
3353 ptr++;
3354 }
3355 else repeat_type = greedy_default;
3356
3357 /* If previous was a character match, abolish the item and generate a
3358 repeat item instead. If a char item has a minimum of more than one, ensure
3359 that it is set in reqbyte - it might not be if a sequence such as x{3} is
3360 the first thing in a branch because the x will have gone into firstbyte
3361 instead. */
3362
3363 if (*previous == OP_CHAR || *previous == OP_CHARNC)
3364 {
3365 /* Deal with UTF-8 characters that take up more than one byte. It's
3366 easier to write this out separately than try to macrify it. Use c to
3367 hold the length of the character in bytes, plus 0x80 to flag that it's a
3368 length rather than a small character. */
3369
3370 #ifdef SUPPORT_UTF8
3371 if (utf8 && (code[-1] & 0x80) != 0)
3372 {
3373 uschar *lastchar = code - 1;
3374 while((*lastchar & 0xc0) == 0x80) lastchar--;
3375 c = code - lastchar; /* Length of UTF-8 character */
3376 memcpy(utf8_char, lastchar, c); /* Save the char */
3377 c |= 0x80; /* Flag c as a length */
3378 }
3379 else
3380 #endif
3381
3382 /* Handle the case of a single byte - either with no UTF8 support, or
3383 with UTF-8 disabled, or for a UTF-8 character < 128. */
3384
3385 {
3386 c = code[-1];
3387 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3388 }
3389
3390 /* If the repetition is unlimited, it pays to see if the next thing on
3391 the line is something that cannot possibly match this character. If so,
3392 automatically possessifying this item gains some performance in the case
3393 where the match fails. */
3394
3395 if (!possessive_quantifier &&
3396 repeat_max < 0 &&
3397 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3398 options, cd))
3399 {
3400 repeat_type = 0; /* Force greedy */
3401 possessive_quantifier = TRUE;
3402 }
3403
3404 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3405 }
3406
3407 /* If previous was a single negated character ([^a] or similar), we use
3408 one of the special opcodes, replacing it. The code is shared with single-
3409 character repeats by setting op_type to add a suitable offset into
3410 repeat_type. We can also test for auto-possessification. OP_NOT is
3411 currently used only for single-byte chars. */
3412
3413 else if (*previous == OP_NOT)
3414 {
3415 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3416 c = previous[1];
3417 if (!possessive_quantifier &&
3418 repeat_max < 0 &&
3419 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3420 {
3421 repeat_type = 0; /* Force greedy */
3422 possessive_quantifier = TRUE;
3423 }
3424 goto OUTPUT_SINGLE_REPEAT;
3425 }
3426
3427 /* If previous was a character type match (\d or similar), abolish it and
3428 create a suitable repeat item. The code is shared with single-character
3429 repeats by setting op_type to add a suitable offset into repeat_type. Note
3430 that the Unicode property types will be present only when SUPPORT_UCP is
3431 defined, but we don't wrap the little bits of code here because it just
3432 makes it horribly messy. */
3433
3434 else if (*previous < OP_EODN)
3435 {
3436 uschar *oldcode;
3437 int prop_type, prop_value;
3438 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3439 c = *previous;
3440
3441 if (!possessive_quantifier &&
3442 repeat_max < 0 &&
3443 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3444 {
3445 repeat_type = 0; /* Force greedy */
3446 possessive_quantifier = TRUE;
3447 }
3448
3449 OUTPUT_SINGLE_REPEAT:
3450 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3451 {
3452 prop_type = previous[1];
3453 prop_value = previous[2];
3454 }
3455 else prop_type = prop_value = -1;
3456
3457 oldcode = code;
3458 code = previous; /* Usually overwrite previous item */
3459
3460 /* If the maximum is zero then the minimum must also be zero; Perl allows
3461 this case, so we do too - by simply omitting the item altogether. */
3462
3463 if (repeat_max == 0) goto END_REPEAT;
3464
3465 /* All real repeats make it impossible to handle partial matching (maybe
3466 one day we will be able to remove this restriction). */
3467
3468 if (repeat_max != 1) cd->nopartial = TRUE;
3469
3470 /* Combine the op_type with the repeat_type */
3471
3472 repeat_type += op_type;
3473
3474 /* A minimum of zero is handled either as the special case * or ?, or as
3475 an UPTO, with the maximum given. */
3476
3477 if (repeat_min == 0)
3478 {
3479 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3480 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3481 else
3482 {
3483 *code++ = OP_UPTO + repeat_type;
3484 PUT2INC(code, 0, repeat_max);
3485 }
3486 }
3487
3488 /* A repeat minimum of 1 is optimized into some special cases. If the
3489 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3490 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3491 one less than the maximum. */
3492
3493 else if (repeat_min == 1)
3494 {
3495 if (repeat_max == -1)
3496 *code++ = OP_PLUS + repeat_type;
3497 else
3498 {
3499 code = oldcode; /* leave previous item in place */
3500 if (repeat_max == 1) goto END_REPEAT;
3501 *code++ = OP_UPTO + repeat_type;
3502 PUT2INC(code, 0, repeat_max - 1);
3503 }
3504 }
3505
3506 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3507 handled as an EXACT followed by an UPTO. */
3508
3509 else
3510 {
3511 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3512 PUT2INC(code, 0, repeat_min);
3513
3514 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3515 we have to insert the character for the previous code. For a repeated
3516 Unicode property match, there are two extra bytes that define the
3517 required property. In UTF-8 mode, long characters have their length in
3518 c, with the 0x80 bit as a flag. */
3519
3520 if (repeat_max < 0)
3521 {
3522 #ifdef SUPPORT_UTF8
3523 if (utf8 && c >= 128)
3524 {
3525 memcpy(code, utf8_char, c & 7);
3526 code += c & 7;
3527 }
3528 else
3529 #endif
3530 {
3531 *code++ = c;
3532 if (prop_type >= 0)
3533 {
3534 *code++ = prop_type;
3535 *code++ = prop_value;
3536 }
3537 }
3538 *code++ = OP_STAR + repeat_type;
3539 }
3540
3541 /* Else insert an UPTO if the max is greater than the min, again
3542 preceded by the character, for the previously inserted code. If the
3543 UPTO is just for 1 instance, we can use QUERY instead. */
3544
3545 else if (repeat_max != repeat_min)
3546 {
3547 #ifdef SUPPORT_UTF8
3548 if (utf8 && c >= 128)
3549 {
3550 memcpy(code, utf8_char, c & 7);
3551 code += c & 7;
3552 }
3553 else
3554 #endif
3555 *code++ = c;
3556 if (prop_type >= 0)
3557 {
3558 *code++ = prop_type;
3559 *code++ = prop_value;
3560 }
3561 repeat_max -= repeat_min;
3562
3563 if (repeat_max == 1)
3564 {
3565 *code++ = OP_QUERY + repeat_type;
3566 }
3567 else
3568 {
3569 *code++ = OP_UPTO + repeat_type;
3570 PUT2INC(code, 0, repeat_max);
3571 }
3572 }
3573 }
3574
3575 /* The character or character type itself comes last in all cases. */
3576
3577 #ifdef SUPPORT_UTF8
3578 if (utf8 && c >= 128)
3579 {
3580 memcpy(code, utf8_char, c & 7);
3581 code += c & 7;
3582 }
3583 else
3584 #endif
3585 *code++ = c;
3586
3587 /* For a repeated Unicode property match, there are two extra bytes that
3588 define the required property. */
3589
3590 #ifdef SUPPORT_UCP
3591 if (prop_type >= 0)
3592 {
3593 *code++ = prop_type;
3594 *code++ = prop_value;
3595 }
3596 #endif
3597 }
3598
3599 /* If previous was a character class or a back reference, we put the repeat
3600 stuff after it, but just skip the item if the repeat was {0,0}. */
3601
3602 else if (*previous == OP_CLASS ||
3603 *previous == OP_NCLASS ||
3604 #ifdef SUPPORT_UTF8
3605 *previous == OP_XCLASS ||
3606 #endif
3607 *previous == OP_REF)
3608 {
3609 if (repeat_max == 0)
3610 {
3611 code = previous;
3612 goto END_REPEAT;
3613 }
3614
3615 /* All real repeats make it impossible to handle partial matching (maybe
3616 one day we will be able to remove this restriction). */
3617
3618 if (repeat_max != 1) cd->nopartial = TRUE;
3619
3620 if (repeat_min == 0 && repeat_max == -1)
3621 *code++ = OP_CRSTAR + repeat_type;
3622 else if (repeat_min == 1 && repeat_max == -1)
3623 *code++ = OP_CRPLUS + repeat_type;
3624 else if (repeat_min == 0 && repeat_max == 1)
3625 *code++ = OP_CRQUERY + repeat_type;
3626 else
3627 {
3628 *code++ = OP_CRRANGE + repeat_type;
3629 PUT2INC(code, 0, repeat_min);
3630 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3631 PUT2INC(code, 0, repeat_max);
3632 }
3633 }
3634
3635 /* If previous was a bracket group, we may have to replicate it in certain
3636 cases. */
3637
3638 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3639 *previous == OP_ONCE || *previous == OP_COND)
3640 {
3641 register int i;
3642 int ketoffset = 0;
3643 int len = code - previous;
3644 uschar *bralink = NULL;
3645
3646 /* Repeating a DEFINE group is pointless */
3647
3648 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3649 {
3650 *errorcodeptr = ERR55;
3651 goto FAILED;
3652 }
3653
3654 /* If the maximum repeat count is unlimited, find the end of the bracket
3655 by scanning through from the start, and compute the offset back to it
3656 from the current code pointer. There may be an OP_OPT setting following
3657 the final KET, so we can't find the end just by going back from the code
3658 pointer. */
3659
3660 if (repeat_max == -1)
3661 {
3662 register uschar *ket = previous;
3663 do ket += GET(ket, 1); while (*ket != OP_KET);
3664 ketoffset = code - ket;
3665 }
3666
3667 /* The case of a zero minimum is special because of the need to stick
3668 OP_BRAZERO in front of it, and because the group appears once in the
3669 data, whereas in other cases it appears the minimum number of times. For
3670 this reason, it is simplest to treat this case separately, as otherwise
3671 the code gets far too messy. There are several special subcases when the
3672 minimum is zero. */
3673
3674 if (repeat_min == 0)
3675 {
3676 /* If the maximum is also zero, we just omit the group from the output
3677 altogether. */
3678
3679 if (repeat_max == 0)
3680 {
3681 code = previous;
3682 goto END_REPEAT;
3683 }
3684
3685 /* If the maximum is 1 or unlimited, we just have to stick in the
3686 BRAZERO and do no more at this point. However, we do need to adjust
3687 any OP_RECURSE calls inside the group that refer to the group itself or
3688 any internal or forward referenced group, because the offset is from
3689 the start of the whole regex. Temporarily terminate the pattern while
3690 doing this. */
3691
3692 if (repeat_max <= 1)
3693 {
3694 *code = OP_END;
3695 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3696 memmove(previous+1, previous, len);
3697 code++;
3698 *previous++ = OP_BRAZERO + repeat_type;
3699 }
3700
3701 /* If the maximum is greater than 1 and limited, we have to replicate
3702 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3703 The first one has to be handled carefully because it's the original
3704 copy, which has to be moved up. The remainder can be handled by code
3705 that is common with the non-zero minimum case below. We have to
3706 adjust the value of repeat_max, since one less copy is required. Once
3707 again, we may have to adjust any OP_RECURSE calls inside the group. */
3708
3709 else
3710 {
3711 int offset;
3712 *code = OP_END;
3713 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3714 memmove(previous + 2 + LINK_SIZE, previous, len);
3715 code += 2 + LINK_SIZE;
3716 *previous++ = OP_BRAZERO + repeat_type;
3717 *previous++ = OP_BRA;
3718
3719 /* We chain together the bracket offset fields that have to be
3720 filled in later when the ends of the brackets are reached. */
3721
3722 offset = (bralink == NULL)? 0 : previous - bralink;
3723 bralink = previous;
3724 PUTINC(previous, 0, offset);
3725 }
3726
3727 repeat_max--;
3728 }
3729
3730 /* If the minimum is greater than zero, replicate the group as many
3731 times as necessary, and adjust the maximum to the number of subsequent
3732 copies that we need. If we set a first char from the group, and didn't
3733 set a required char, copy the latter from the former. If there are any
3734 forward reference subroutine calls in the group, there will be entries on
3735 the workspace list; replicate these with an appropriate increment. */
3736
3737 else
3738 {
3739 if (repeat_min > 1)
3740 {
3741 /* In the pre-compile phase, we don't actually do the replication. We
3742 just adjust the length as if we had. Do some paranoid checks for
3743 potential integer overflow. */
3744
3745 if (lengthptr != NULL)
3746 {
3747 int delta = (repeat_min - 1)*length_prevgroup;
3748 if ((double)(repeat_min - 1)*(double)length_prevgroup >
3749 (double)INT_MAX ||
3750 OFLOW_MAX - *lengthptr < delta)
3751 {
3752 *errorcodeptr = ERR20;
3753 goto FAILED;
3754 }
3755 *lengthptr += delta;
3756 }
3757
3758 /* This is compiling for real */
3759
3760 else
3761 {
3762 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3763 for (i = 1; i < repeat_min; i++)
3764 {
3765 uschar *hc;
3766 uschar *this_hwm = cd->hwm;
3767 memcpy(code, previous, len);
3768 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3769 {
3770 PUT(cd->hwm, 0, GET(hc, 0) + len);
3771 cd->hwm += LINK_SIZE;
3772 }
3773 save_hwm = this_hwm;
3774 code += len;
3775 }
3776 }
3777 }
3778
3779 if (repeat_max > 0) repeat_max -= repeat_min;
3780 }
3781
3782 /* This code is common to both the zero and non-zero minimum cases. If
3783 the maximum is limited, it replicates the group in a nested fashion,
3784 remembering the bracket starts on a stack. In the case of a zero minimum,
3785 the first one was set up above. In all cases the repeat_max now specifies
3786 the number of additional copies needed. Again, we must remember to
3787 replicate entries on the forward reference list. */
3788
3789 if (repeat_max >= 0)
3790 {
3791 /* In the pre-compile phase, we don't actually do the replication. We
3792 just adjust the length as if we had. For each repetition we must add 1
3793 to the length for BRAZERO and for all but the last repetition we must
3794 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3795 paranoid checks to avoid integer overflow. */
3796
3797 if (lengthptr != NULL && repeat_max > 0)
3798 {
3799 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3800 2 - 2*LINK_SIZE; /* Last one doesn't nest */
3801 if ((double)repeat_max *
3802 (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3803 > (double)INT_MAX ||
3804 OFLOW_MAX - *lengthptr < delta)
3805 {
3806 *errorcodeptr = ERR20;
3807 goto FAILED;
3808 }
3809 *lengthptr += delta;
3810 }
3811
3812 /* This is compiling for real */
3813
3814 else for (i = repeat_max - 1; i >= 0; i--)
3815 {
3816 uschar *hc;
3817 uschar *this_hwm = cd->hwm;
3818
3819 *code++ = OP_BRAZERO + repeat_type;
3820
3821 /* All but the final copy start a new nesting, maintaining the
3822 chain of brackets outstanding. */
3823
3824 if (i != 0)
3825 {
3826 int offset;
3827 *code++ = OP_BRA;
3828 offset = (bralink == NULL)? 0 : code - bralink;
3829 bralink = code;
3830 PUTINC(code, 0, offset);
3831 }
3832
3833 memcpy(code, previous, len);
3834 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3835 {
3836 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3837 cd->hwm += LINK_SIZE;
3838 }
3839 save_hwm = this_hwm;
3840 code += len;
3841 }
3842
3843 /* Now chain through the pending brackets, and fill in their length
3844 fields (which are holding the chain links pro tem). */
3845
3846 while (bralink != NULL)
3847 {
3848 int oldlinkoffset;
3849 int offset = code - bralink + 1;
3850 uschar *bra = code - offset;
3851 oldlinkoffset = GET(bra, 1);
3852 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3853 *code++ = OP_KET;
3854 PUTINC(code, 0, offset);
3855 PUT(bra, 1, offset);
3856 }
3857 }
3858
3859 /* If the maximum is unlimited, set a repeater in the final copy. We
3860 can't just offset backwards from the current code point, because we
3861 don't know if there's been an options resetting after the ket. The
3862 correct offset was computed above.
3863
3864 Then, when we are doing the actual compile phase, check to see whether
3865 this group is a non-atomic one that could match an empty string. If so,
3866 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3867 that runtime checking can be done. [This check is also applied to
3868 atomic groups at runtime, but in a different way.] */
3869
3870 else
3871 {
3872 uschar *ketcode = code - ketoffset;
3873 uschar *bracode = ketcode - GET(ketcode, 1);
3874 *ketcode = OP_KETRMAX + repeat_type;
3875 if (lengthptr == NULL && *bracode != OP_ONCE)
3876 {
3877 uschar *scode = bracode;
3878 do
3879 {
3880 if (could_be_empty_branch(scode, ketcode, utf8))
3881 {
3882 *bracode += OP_SBRA - OP_BRA;
3883 break;
3884 }
3885 scode += GET(scode, 1);
3886 }
3887 while (*scode == OP_ALT);
3888 }
3889 }
3890 }
3891
3892 /* Else there's some kind of shambles */
3893
3894 else
3895 {
3896 *errorcodeptr = ERR11;
3897 goto FAILED;
3898 }
3899
3900 /* If the character following a repeat is '+', or if certain optimization
3901 tests above succeeded, possessive_quantifier is TRUE. For some of the
3902 simpler opcodes, there is a special alternative opcode for this. For
3903 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3904 The '+' notation is just syntactic sugar, taken from Sun's Java package,
3905 but the special opcodes can optimize it a bit. The repeated item starts at
3906 tempcode, not at previous, which might be the first part of a string whose
3907 (former) last char we repeated.
3908
3909 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3910 an 'upto' may follow. We skip over an 'exact' item, and then test the
3911 length of what remains before proceeding. */
3912
3913 if (possessive_quantifier)
3914 {
3915 int len;
3916 if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3917 *tempcode == OP_NOTEXACT)
3918 tempcode += _pcre_OP_lengths[*tempcode];
3919 len = code - tempcode;
3920 if (len > 0) switch (*tempcode)
3921 {
3922 case OP_STAR: *tempcode = OP_POSSTAR; break;
3923 case OP_PLUS: *tempcode = OP_POSPLUS; break;
3924 case OP_QUERY: *tempcode = OP_POSQUERY; break;
3925 case OP_UPTO: *tempcode = OP_POSUPTO; break;
3926
3927 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
3928 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
3929 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3930 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
3931
3932 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
3933 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
3934 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3935 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
3936
3937 default:
3938 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3939 code += 1 + LINK_SIZE;
3940 len += 1 + LINK_SIZE;
3941 tempcode[0] = OP_ONCE;
3942 *code++ = OP_KET;
3943 PUTINC(code, 0, len);
3944 PUT(tempcode, 1, len);
3945 break;
3946 }
3947 }
3948
3949 /* In all cases we no longer have a previous item. We also set the
3950 "follows varying string" flag for subsequently encountered reqbytes if
3951 it isn't already set and we have just passed a varying length item. */
3952
3953 END_REPEAT:
3954 previous = NULL;
3955 cd->req_varyopt |= reqvary;
3956 break;
3957
3958
3959 /* ===================================================================*/
3960 /* Start of nested parenthesized sub-expression, or comment or lookahead or
3961 lookbehind or option setting or condition or all the other extended
3962 parenthesis forms. */
3963
3964 case '(':
3965 newoptions = options;
3966 skipbytes = 0;
3967 bravalue = OP_CBRA;
3968 save_hwm = cd->hwm;
3969 reset_bracount = FALSE;
3970
3971 /* First deal with various "verbs" that can be introduced by '*'. */
3972
3973 if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
3974 {
3975 int i, namelen;
3976 const uschar *name = ++ptr;
3977 previous = NULL;
3978 while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
3979 if (*ptr == ':')
3980 {
3981 *errorcodeptr = ERR59; /* Not supported */
3982 goto FAILED;
3983 }
3984 if (*ptr != ')')
3985 {
3986 *errorcodeptr = ERR60;
3987 goto FAILED;
3988 }
3989 namelen = ptr - name;
3990 for (i = 0; i < verbcount; i++)
3991 {
3992 if (namelen == verbs[i].len &&
3993 strncmp((char *)name, verbs[i].name, namelen) == 0)
3994 {
3995 *code = verbs[i].op;
3996 if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
3997 break;
3998 }
3999 }
4000 if (i < verbcount) continue;
4001 *errorcodeptr = ERR60;
4002 goto FAILED;
4003 }
4004
4005 /* Deal with the extended parentheses; all are introduced by '?', and the
4006 appearance of any of them means that this is not a capturing group. */
4007
4008 else if (*ptr == '?')
4009 {
4010 int i, set, unset, namelen;
4011 int *optset;
4012 const uschar *name;
4013 uschar *slot;
4014
4015 switch (*(++ptr))
4016 {
4017 case '#': /* Comment; skip to ket */
4018 ptr++;
4019 while (*ptr != 0 && *ptr != ')') ptr++;
4020 if (*ptr == 0)
4021 {
4022 *errorcodeptr = ERR18;
4023 goto FAILED;
4024 }
4025 continue;
4026
4027
4028 /* ------------------------------------------------------------ */
4029 case '|': /* Reset capture count for each branch */
4030 reset_bracount = TRUE;
4031 /* Fall through */
4032
4033 /* ------------------------------------------------------------ */
4034 case ':': /* Non-capturing bracket */
4035 bravalue = OP_BRA;
4036 ptr++;
4037 break;
4038
4039
4040 /* ------------------------------------------------------------ */
4041 case '(':
4042 bravalue = OP_COND; /* Conditional group */
4043
4044 /* A condition can be an assertion, a number (referring to a numbered
4045 group), a name (referring to a named group), or 'R', referring to
4046 recursion. R<digits> and R&name are also permitted for recursion tests.
4047
4048 There are several syntaxes for testing a named group: (?(name)) is used
4049 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4050
4051 There are two unfortunate ambiguities, caused by history. (a) 'R' can
4052 be the recursive thing or the name 'R' (and similarly for 'R' followed
4053 by digits), and (b) a number could be a name that consists of digits.
4054 In both cases, we look for a name first; if not found, we try the other
4055 cases. */
4056
4057 /* For conditions that are assertions, check the syntax, and then exit
4058 the switch. This will take control down to where bracketed groups,
4059 including assertions, are processed. */
4060
4061 if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4062 break;
4063
4064 /* Most other conditions use OP_CREF (a couple change to OP_RREF
4065 below), and all need to skip 3 bytes at the start of the group. */
4066
4067 code[1+LINK_SIZE] = OP_CREF;
4068 skipbytes = 3;
4069 refsign = -1;
4070
4071 /* Check for a test for recursion in a named group. */
4072
4073 if (ptr[1] == 'R' && ptr[2] == '&')
4074 {
4075 terminator = -1;
4076 ptr += 2;
4077 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4078 }
4079
4080 /* Check for a test for a named group's having been set, using the Perl
4081 syntax (?(<name>) or (?('name') */
4082
4083 else if (ptr[1] == '<')
4084 {
4085 terminator = '>';
4086 ptr++;
4087 }
4088 else if (ptr[1] == '\'')
4089 {
4090 terminator = '\'';
4091 ptr++;
4092 }
4093 else
4094 {
4095 terminator = 0;
4096 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4097 }
4098
4099 /* We now expect to read a name; anything else is an error */
4100
4101 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4102 {
4103 ptr += 1; /* To get the right offset */
4104 *errorcodeptr = ERR28;
4105 goto FAILED;
4106 }
4107
4108 /* Read the name, but also get it as a number if it's all digits */
4109
4110 recno = 0;
4111 name = ++ptr;
4112 while ((cd->ctypes[*ptr] & ctype_word) != 0)
4113 {
4114 if (recno >= 0)
4115 recno = ((digitab[*ptr] & ctype_digit) != 0)?
4116 recno * 10 + *ptr - '0' : -1;
4117 ptr++;
4118 }
4119 namelen = ptr - name;
4120
4121 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4122 {
4123 ptr--; /* Error offset */
4124 *errorcodeptr = ERR26;
4125 goto FAILED;
4126 }
4127
4128 /* Do no further checking in the pre-compile phase. */
4129
4130 if (lengthptr != NULL) break;
4131
4132 /* In the real compile we do the work of looking for the actual
4133 reference. If the string started with "+" or "-" we require the rest to
4134 be digits, in which case recno will be set. */
4135
4136 if (refsign > 0)
4137 {
4138 if (recno <= 0)
4139 {
4140 *errorcodeptr = ERR58;
4141 goto FAILED;
4142 }
4143 if (refsign == '-')
4144 {
4145 recno = cd->bracount - recno + 1;
4146 if (recno <= 0)
4147 {
4148 *errorcodeptr = ERR15;
4149 goto FAILED;
4150 }
4151 }
4152 else recno += cd->bracount;
4153 PUT2(code, 2+LINK_SIZE, recno);
4154 break;
4155 }
4156
4157 /* Otherwise (did not start with "+" or "-"), start by looking for the
4158 name. */
4159
4160 slot = cd->name_table;
4161 for (i = 0; i < cd->names_found; i++)
4162 {
4163 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4164 slot += cd->name_entry_size;
4165 }
4166
4167 /* Found a previous named subpattern */
4168
4169 if (i < cd->names_found)
4170 {
4171 recno = GET2(slot, 0);
4172 PUT2(code, 2+LINK_SIZE, recno);
4173 }
4174
4175 /* Search the pattern for a forward reference */
4176
4177 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4178 (options & PCRE_EXTENDED) != 0)) > 0)
4179 {
4180 PUT2(code, 2+LINK_SIZE, i);
4181 }
4182
4183 /* If terminator == 0 it means that the name followed directly after
4184 the opening parenthesis [e.g. (?(abc)...] and in this case there are
4185 some further alternatives to try. For the cases where terminator != 0
4186 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4187 now checked all the possibilities, so give an error. */
4188
4189 else if (terminator != 0)
4190 {
4191 *errorcodeptr = ERR15;
4192 goto FAILED;
4193 }
4194
4195 /* Check for (?(R) for recursion. Allow digits after R to specify a
4196 specific group number. */
4197
4198 else if (*name == 'R')
4199 {
4200 recno = 0;
4201 for (i = 1; i < namelen; i++)
4202 {
4203 if ((digitab[name[i]] & ctype_digit) == 0)
4204 {
4205 *errorcodeptr = ERR15;
4206 goto FAILED;
4207 }
4208 recno = recno * 10 + name[i] - '0';
4209 }
4210 if (recno == 0) recno = RREF_ANY;
4211 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4212 PUT2(code, 2+LINK_SIZE, recno);
4213 }
4214
4215 /* Similarly, check for the (?(DEFINE) "condition", which is always
4216 false. */
4217
4218 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4219 {
4220 code[1+LINK_SIZE] = OP_DEF;
4221 skipbytes = 1;
4222 }
4223
4224 /* Check for the "name" actually being a subpattern number. */
4225
4226 else if (recno > 0)
4227 {
4228 PUT2(code, 2+LINK_SIZE, recno);
4229 }
4230
4231 /* Either an unidentified subpattern, or a reference to (?(0) */
4232
4233 else
4234 {
4235 *errorcodeptr = (recno == 0)? ERR35: ERR15;
4236 goto FAILED;
4237 }
4238 break;
4239
4240
4241 /* ------------------------------------------------------------ */
4242 case '=': /* Positive lookahead */
4243 bravalue = OP_ASSERT;
4244 ptr++;
4245 break;
4246
4247
4248 /* ------------------------------------------------------------ */
4249 case '!': /* Negative lookahead */
4250 ptr++;
4251 if (*ptr == ')') /* Optimize (?!) */
4252 {
4253 *code++ = OP_FAIL;
4254 previous = NULL;
4255 continue;
4256 }
4257 bravalue = OP_ASSERT_NOT;
4258 break;
4259
4260
4261 /* ------------------------------------------------------------ */
4262 case '<': /* Lookbehind or named define */
4263 switch (ptr[1])
4264 {
4265 case '=': /* Positive lookbehind */
4266 bravalue = OP_ASSERTBACK;
4267 ptr += 2;
4268 break;
4269
4270 case '!': /* Negative lookbehind */
4271 bravalue = OP_ASSERTBACK_NOT;
4272 ptr += 2;
4273 break;
4274
4275 default: /* Could be name define, else bad */
4276 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4277 ptr++; /* Correct offset for error */
4278 *errorcodeptr = ERR24;
4279 goto FAILED;
4280 }
4281 break;
4282
4283
4284 /* ------------------------------------------------------------ */
4285 case '>': /* One-time brackets */
4286 bravalue = OP_ONCE;
4287 ptr++;
4288 break;
4289
4290
4291 /* ------------------------------------------------------------ */
4292 case 'C': /* Callout - may be followed by digits; */
4293 previous_callout = code; /* Save for later completion */
4294 after_manual_callout = 1; /* Skip one item before completing */
4295 *code++ = OP_CALLOUT;
4296 {
4297 int n = 0;
4298 while ((digitab[*(++ptr)] & ctype_digit) != 0)
4299 n = n * 10 + *ptr - '0';
4300 if (*ptr != ')')
4301 {
4302 *errorcodeptr = ERR39;
4303 goto FAILED;
4304 }
4305 if (n > 255)
4306 {
4307 *errorcodeptr = ERR38;
4308 goto FAILED;
4309 }
4310 *code++ = n;
4311 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4312 PUT(code, LINK_SIZE, 0); /* Default length */
4313 code += 2 * LINK_SIZE;
4314 }
4315 previous = NULL;
4316 continue;
4317
4318
4319 /* ------------------------------------------------------------ */
4320 case 'P': /* Python-style named subpattern handling */
4321 if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
4322 {
4323 is_recurse = *ptr == '>';
4324 terminator = ')';
4325 goto NAMED_REF_OR_RECURSE;
4326 }
4327 else if (*ptr != '<') /* Test for Python-style definition */
4328 {
4329 *errorcodeptr = ERR41;
4330 goto FAILED;
4331 }
4332 /* Fall through to handle (?P< as (?< is handled */
4333
4334
4335 /* ------------------------------------------------------------ */
4336 DEFINE_NAME: /* Come here from (?< handling */
4337 case '\'':
4338 {
4339 terminator = (*ptr == '<')? '>' : '\'';
4340 name = ++ptr;
4341
4342 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4343 namelen = ptr - name;
4344
4345 /* In the pre-compile phase, just do a syntax check. */
4346
4347 if (lengthptr != NULL)
4348 {
4349 if (*ptr != terminator)
4350 {
4351 *errorcodeptr = ERR42;
4352 goto FAILED;
4353 }
4354 if (cd->names_found >= MAX_NAME_COUNT)
4355 {
4356 *errorcodeptr = ERR49;
4357 goto FAILED;
4358 }
4359 if (namelen + 3 > cd->name_entry_size)
4360 {
4361 cd->name_entry_size = namelen + 3;
4362 if (namelen > MAX_NAME_SIZE)
4363 {
4364 *errorcodeptr = ERR48;
4365 goto FAILED;
4366 }
4367 }
4368 }
4369
4370 /* In the real compile, create the entry in the table */
4371
4372 else
4373 {
4374 slot = cd->name_table;
4375 for (i = 0; i < cd->names_found; i++)
4376 {
4377 int crc = memcmp(name, slot+2, namelen);
4378 if (crc == 0)
4379 {
4380 if (slot[2+namelen] == 0)
4381 {
4382 if ((options & PCRE_DUPNAMES) == 0)
4383 {
4384 *errorcodeptr = ERR43;
4385 goto FAILED;
4386 }
4387 }
4388 else crc = -1; /* Current name is substring */
4389 }
4390 if (crc < 0)
4391 {
4392 memmove(slot + cd->name_entry_size, slot,
4393 (cd->names_found - i) * cd->name_entry_size);
4394 break;
4395 }
4396 slot += cd->name_entry_size;
4397 }
4398
4399 PUT2(slot, 0, cd->bracount + 1);
4400 memcpy(slot + 2, name, namelen);
4401 slot[2+namelen] = 0;
4402 }
4403 }
4404
4405 /* In both cases, count the number of names we've encountered. */
4406
4407 ptr++; /* Move past > or ' */
4408 cd->names_found++;
4409 goto NUMBERED_GROUP;
4410
4411
4412 /* ------------------------------------------------------------ */
4413 case '&': /* Perl recursion/subroutine syntax */
4414 terminator = ')';
4415 is_recurse = TRUE;
4416 /* Fall through */
4417
4418 /* We come here from the Python syntax above that handles both
4419 references (?P=name) and recursion (?P>name), as well as falling
4420 through from the Perl recursion syntax (?&name). */
4421
4422 NAMED_REF_OR_RECURSE:
4423 name = ++ptr;
4424 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4425 namelen = ptr - name;
4426
4427 /* In the pre-compile phase, do a syntax check and set a dummy
4428 reference number. */
4429
4430 if (lengthptr != NULL)
4431 {
4432 if (*ptr != terminator)
4433 {
4434 *errorcodeptr = ERR42;
4435 goto FAILED;
4436 }
4437 if (namelen > MAX_NAME_SIZE)
4438 {
4439 *errorcodeptr = ERR48;
4440 goto FAILED;
4441 }
4442 recno = 0;
4443 }
4444
4445 /* In the real compile, seek the name in the table */
4446
4447 else
4448 {
4449 slot = cd->name_table;
4450 for (i = 0; i < cd->names_found; i++)
4451 {
4452 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4453 slot += cd->name_entry_size;
4454 }
4455
4456 if (i < cd->names_found) /* Back reference */
4457 {
4458 recno = GET2(slot, 0);
4459 }
4460 else if ((recno = /* Forward back reference */
4461 find_parens(ptr, cd->bracount, name, namelen,
4462 (options & PCRE_EXTENDED) != 0)) <= 0)
4463 {
4464 *errorcodeptr = ERR15;
4465 goto FAILED;
4466 }
4467 }
4468
4469 /* In both phases, we can now go to the code that handles numerical
4470 recursion or backreferences. */
4471
4472 if (is_recurse) goto HANDLE_RECURSION;
4473 else goto HANDLE_REFERENCE;
4474
4475
4476 /* ------------------------------------------------------------ */
4477 case 'R': /* Recursion */
4478 ptr++; /* Same as (?0) */
4479 /* Fall through */
4480
4481
4482 /* ------------------------------------------------------------ */
4483 case '-': case '+':
4484 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4485 case '5': case '6': case '7': case '8': case '9': /* subroutine */
4486 {
4487 const uschar *called;
4488
4489 if ((refsign = *ptr) == '+') ptr++;
4490 else if (refsign == '-')
4491 {
4492 if ((digitab[ptr[1]] & ctype_digit) == 0)
4493 goto OTHER_CHAR_AFTER_QUERY;
4494 ptr++;
4495 }
4496
4497 recno = 0;
4498 while((digitab[*ptr] & ctype_digit) != 0)
4499 recno = recno * 10 + *ptr++ - '0';
4500
4501 if (*ptr != ')')
4502 {
4503 *errorcodeptr = ERR29;
4504 goto FAILED;
4505 }
4506
4507 if (refsign == '-')
4508 {
4509 if (recno == 0)
4510 {
4511 *errorcodeptr = ERR58;
4512 goto FAILED;
4513 }
4514 recno = cd->bracount - recno + 1;
4515 if (recno <= 0)
4516 {
4517 *errorcodeptr = ERR15;
4518 goto FAILED;
4519 }
4520 }
4521 else if (refsign == '+')
4522 {
4523 if (recno == 0)
4524 {
4525 *errorcodeptr = ERR58;
4526 goto FAILED;
4527 }
4528 recno += cd->bracount;
4529 }
4530
4531 /* Come here from code above that handles a named recursion */
4532
4533 HANDLE_RECURSION:
4534
4535 previous = code;
4536 called = cd->start_code;
4537
4538 /* When we are actually compiling, find the bracket that is being
4539 referenced. Temporarily end the regex in case it doesn't exist before
4540 this point. If we end up with a forward reference, first check that
4541 the bracket does occur later so we can give the error (and position)
4542 now. Then remember this forward reference in the workspace so it can
4543 be filled in at the end. */
4544
4545 if (lengthptr == NULL)
4546 {
4547 *code = OP_END;
4548 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4549
4550 /* Forward reference */
4551
4552 if (called == NULL)
4553 {
4554 if (find_parens(ptr, cd->bracount, NULL, recno,
4555 (options & PCRE_EXTENDED) != 0) < 0)
4556 {
4557 *errorcodeptr = ERR15;
4558 goto FAILED;
4559 }
4560 called = cd->start_code + recno;
4561 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4562 }
4563
4564 /* If not a forward reference, and the subpattern is still open,
4565 this is a recursive call. We check to see if this is a left
4566 recursion that could loop for ever, and diagnose that case. */
4567
4568 else if (GET(called, 1) == 0 &&
4569 could_be_empty(called, code, bcptr, utf8))
4570 {
4571 *errorcodeptr = ERR40;
4572 goto FAILED;
4573 }
4574 }
4575
4576 /* Insert the recursion/subroutine item, automatically wrapped inside
4577 "once" brackets. Set up a "previous group" length so that a
4578 subsequent quantifier will work. */
4579
4580 *code = OP_ONCE;
4581 PUT(code, 1, 2 + 2*LINK_SIZE);
4582 code += 1 + LINK_SIZE;
4583
4584 *code = OP_RECURSE;
4585 PUT(code, 1, called - cd->start_code);
4586 code += 1 + LINK_SIZE;
4587
4588 *code = OP_KET;
4589 PUT(code, 1, 2 + 2*LINK_SIZE);
4590 code += 1 + LINK_SIZE;
4591
4592 length_prevgroup = 3 + 3*LINK_SIZE;
4593 }
4594
4595 /* Can't determine a first byte now */
4596
4597 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4598 continue;
4599
4600
4601 /* ------------------------------------------------------------ */
4602 default: /* Other characters: check option setting */
4603 OTHER_CHAR_AFTER_QUERY:
4604 set = unset = 0;
4605 optset = &set;
4606
4607 while (*ptr != ')' && *ptr != ':')
4608 {
4609 switch (*ptr++)
4610 {
4611 case '-': optset = &unset; break;
4612
4613 case 'J': /* Record that it changed in the external options */
4614 *optset |= PCRE_DUPNAMES;
4615 cd->external_options |= PCRE_JCHANGED;
4616 break;
4617
4618 case 'i': *optset |= PCRE_CASELESS; break;
4619 case 'm': *optset |= PCRE_MULTILINE; break;
4620 case 's': *optset |= PCRE_DOTALL; break;
4621 case 'x': *optset |= PCRE_EXTENDED; break;
4622 case 'U': *optset |= PCRE_UNGREEDY; break;
4623 case 'X': *optset |= PCRE_EXTRA; break;
4624
4625 default: *errorcodeptr = ERR12;
4626 ptr--; /* Correct the offset */
4627 goto FAILED;
4628 }
4629 }
4630
4631 /* Set up the changed option bits, but don't change anything yet. */
4632
4633 newoptions = (options | set) & (~unset);
4634
4635 /* If the options ended with ')' this is not the start of a nested
4636 group with option changes, so the options change at this level. If this
4637 item is right at the start of the pattern, the options can be
4638 abstracted and made external in the pre-compile phase, and ignored in
4639 the compile phase. This can be helpful when matching -- for instance in
4640 caseless checking of required bytes.
4641
4642 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4643 definitely *not* at the start of the pattern because something has been
4644 compiled. In the pre-compile phase, however, the code pointer can have
4645 that value after the start, because it gets reset as code is discarded
4646 during the pre-compile. However, this can happen only at top level - if
4647 we are within parentheses, the starting BRA will still be present. At
4648 any parenthesis level, the length value can be used to test if anything
4649 has been compiled at that level. Thus, a test for both these conditions
4650 is necessary to ensure we correctly detect the start of the pattern in
4651 both phases.
4652
4653 If we are not at the pattern start, compile code to change the ims
4654 options if this setting actually changes any of them. We also pass the
4655 new setting back so that it can be put at the start of any following
4656 branches, and when this group ends (if we are in a group), a resetting
4657 item can be compiled. */
4658
4659 if (*ptr == ')')
4660 {
4661 if (code == cd->start_code + 1 + LINK_SIZE &&
4662 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4663 {
4664 cd->external_options = newoptions;
4665 options = newoptions;
4666 }
4667 else
4668 {
4669 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4670 {
4671 *code++ = OP_OPT;
4672 *code++ = newoptions & PCRE_IMS;
4673 }
4674
4675 /* Change options at this level, and pass them back for use
4676 in subsequent branches. Reset the greedy defaults and the case
4677 value for firstbyte and reqbyte. */
4678
4679 *optionsptr = options = newoptions;
4680 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4681 greedy_non_default = greedy_default ^ 1;
4682 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4683 }
4684
4685 previous = NULL; /* This item can't be repeated */
4686 continue; /* It is complete */
4687 }
4688
4689 /* If the options ended with ':' we are heading into a nested group
4690 with possible change of options. Such groups are non-capturing and are
4691 not assertions of any kind. All we need to do is skip over the ':';
4692 the newoptions value is handled below. */
4693
4694 bravalue = OP_BRA;
4695 ptr++;
4696 } /* End of switch for character following (? */
4697 } /* End of (? handling */
4698
4699 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4700 all unadorned brackets become non-capturing and behave like (?:...)
4701 brackets. */
4702
4703 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4704 {
4705 bravalue = OP_BRA;
4706 }
4707
4708 /* Else we have a capturing group. */
4709
4710 else
4711 {
4712 NUMBERED_GROUP:
4713 cd->bracount += 1;
4714 PUT2(code, 1+LINK_SIZE, cd->bracount);
4715 skipbytes = 2;
4716 }
4717
4718 /* Process nested bracketed regex. Assertions may not be repeated, but
4719 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4720 non-register variable in order to be able to pass its address because some
4721 compilers complain otherwise. Pass in a new setting for the ims options if
4722 they have changed. */
4723
4724 previous = (bravalue >= OP_ONCE)? code : NULL;
4725 *code = bravalue;
4726 tempcode = code;
4727 tempreqvary = cd->req_varyopt; /* Save value before bracket */
4728 length_prevgroup = 0; /* Initialize for pre-compile phase */
4729
4730 if (!compile_regex(
4731 newoptions, /* The complete new option state */
4732 options & PCRE_IMS, /* The previous ims option state */
4733 &tempcode, /* Where to put code (updated) */
4734 &ptr, /* Input pointer (updated) */
4735 errorcodeptr, /* Where to put an error message */
4736 (bravalue == OP_ASSERTBACK ||
4737 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4738 reset_bracount, /* True if (?| group */
4739 skipbytes, /* Skip over bracket number */
4740 &subfirstbyte, /* For possible first char */
4741 &subreqbyte, /* For possible last char */
4742 bcptr, /* Current branch chain */
4743 cd, /* Tables block */
4744 (lengthptr == NULL)? NULL : /* Actual compile phase */
4745 &length_prevgroup /* Pre-compile phase */
4746 ))
4747 goto FAILED;
4748
4749 /* At the end of compiling, code is still pointing to the start of the
4750 group, while tempcode has been updated to point past the end of the group
4751 and any option resetting that may follow it. The pattern pointer (ptr)
4752 is on the bracket. */
4753
4754 /* If this is a conditional bracket, check that there are no more than
4755 two branches in the group, or just one if it's a DEFINE group. We do this
4756 in the real compile phase, not in the pre-pass, where the whole group may
4757 not be available. */
4758
4759 if (bravalue == OP_COND && lengthptr == NULL)
4760 {
4761 uschar *tc = code;
4762 int condcount = 0;
4763
4764 do {
4765 condcount++;
4766 tc += GET(tc,1);
4767 }
4768 while (*tc != OP_KET);
4769
4770 /* A DEFINE group is never obeyed inline (the "condition" is always
4771 false). It must have only one branch. */
4772
4773 if (code[LINK_SIZE+1] == OP_DEF)
4774 {
4775 if (condcount > 1)
4776 {
4777 *errorcodeptr = ERR54;
4778 goto FAILED;
4779 }
4780 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
4781 }
4782
4783 /* A "normal" conditional group. If there is just one branch, we must not
4784 make use of its firstbyte or reqbyte, because this is equivalent to an
4785 empty second branch. */
4786
4787 else
4788 {
4789 if (condcount > 2)
4790 {
4791 *errorcodeptr = ERR27;
4792 goto FAILED;
4793 }
4794 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4795 }
4796 }
4797
4798 /* Error if hit end of pattern */
4799
4800 if (*ptr != ')')
4801 {
4802 *errorcodeptr = ERR14;
4803 goto FAILED;
4804 }
4805
4806 /* In the pre-compile phase, update the length by the length of the group,
4807 less the brackets at either end. Then reduce the compiled code to just a
4808 set of non-capturing brackets so that it doesn't use much memory if it is
4809 duplicated by a quantifier.*/
4810
4811 if (lengthptr != NULL)
4812 {
4813 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
4814 {
4815 *errorcodeptr = ERR20;
4816 goto FAILED;
4817 }
4818 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4819 *code++ = OP_BRA;
4820 PUTINC(code, 0, 1 + LINK_SIZE);
4821 *code++ = OP_KET;
4822 PUTINC(code, 0, 1 + LINK_SIZE);
4823 break; /* No need to waste time with special character handling */
4824 }
4825
4826 /* Otherwise update the main code pointer to the end of the group. */
4827
4828 code = tempcode;
4829
4830 /* For a DEFINE group, required and first character settings are not
4831 relevant. */
4832
4833 if (bravalue == OP_DEF) break;
4834
4835 /* Handle updating of the required and first characters for other types of
4836 group. Update for normal brackets of all kinds, and conditions with two
4837 branches (see code above). If the bracket is followed by a quantifier with
4838 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4839 zerofirstbyte outside the main loop so that they can be accessed for the
4840 back off. */
4841
4842 zeroreqbyte = reqbyte;
4843 zerofirstbyte = firstbyte;
4844 groupsetfirstbyte = FALSE;
4845
4846 if (bravalue >= OP_ONCE)
4847 {
4848 /* If we have not yet set a firstbyte in this branch, take it from the
4849 subpattern, remembering that it was set here so that a repeat of more
4850 than one can replicate it as reqbyte if necessary. If the subpattern has
4851 no firstbyte, set "none" for the whole branch. In both cases, a zero
4852 repeat forces firstbyte to "none". */
4853
4854 if (firstbyte == REQ_UNSET)
4855 {
4856 if (subfirstbyte >= 0)
4857 {
4858 firstbyte = subfirstbyte;
4859 groupsetfirstbyte = TRUE;
4860 }
4861 else firstbyte = REQ_NONE;
4862 zerofirstbyte = REQ_NONE;
4863 }
4864
4865 /* If firstbyte was previously set, convert the subpattern's firstbyte
4866 into reqbyte if there wasn't one, using the vary flag that was in
4867 existence beforehand. */
4868
4869 else if (subfirstbyte >= 0 && subreqbyte < 0)
4870 subreqbyte = subfirstbyte | tempreqvary;
4871
4872 /* If the subpattern set a required byte (or set a first byte that isn't
4873 really the first byte - see above), set it. */
4874
4875 if (subreqbyte >= 0) reqbyte = subreqbyte;
4876 }
4877
4878 /* For a forward assertion, we take the reqbyte, if set. This can be
4879 helpful if the pattern that follows the assertion doesn't set a different
4880 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
4881 for an assertion, however, because it leads to an incorrect effect for patterns
4882 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
4883 of a firstbyte. This is overcome by a scan at the end if there's no
4884 firstbyte, looking for an asserted first char. */
4885
4886 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4887 break; /* End of processing '(' */
4888
4889
4890 /* ===================================================================*/
4891 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
4892 are arranged to be the negation of the corresponding OP_values. For the
4893 back references, the values are ESC_REF plus the reference number. Only
4894 back references and those types that consume a character may be repeated.
4895 We can test for values between ESC_b and ESC_Z for the latter; this may
4896 have to change if any new ones are ever created. */
4897
4898 case '\\':
4899 tempptr = ptr;
4900 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
4901 if (*errorcodeptr != 0) goto FAILED;
4902
4903 if (c < 0)
4904 {
4905 if (-c == ESC_Q) /* Handle start of quoted string */
4906 {
4907 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
4908 else inescq = TRUE;
4909 continue;
4910 }
4911
4912 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
4913
4914 /* For metasequences that actually match a character, we disable the
4915 setting of a first character if it hasn't already been set. */
4916
4917 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
4918 firstbyte = REQ_NONE;
4919
4920 /* Set values to reset to if this is followed by a zero repeat. */
4921
4922 zerofirstbyte = firstbyte;
4923 zeroreqbyte = reqbyte;
4924
4925 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
4926 We also support \k{name} (.NET syntax) */
4927
4928 if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
4929 {
4930 is_recurse = FALSE;
4931 terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
4932 goto NAMED_REF_OR_RECURSE;
4933 }
4934
4935 /* Back references are handled specially; must disable firstbyte if
4936 not set to cope with cases like (?=(\w+))\1: which would otherwise set
4937 ':' later. */
4938
4939 if (-c >= ESC_REF)
4940 {
4941 recno = -c - ESC_REF;
4942
4943 HANDLE_REFERENCE: /* Come here from named backref handling */
4944 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4945 previous = code;
4946 *code++ = OP_REF;
4947 PUT2INC(code, 0, recno);
4948 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
4949 if (recno > cd->top_backref) cd->top_backref = recno;
4950 }
4951
4952 /* So are Unicode property matches, if supported. */
4953
4954 #ifdef SUPPORT_UCP
4955 else if (-c == ESC_P || -c == ESC_p)
4956 {
4957 BOOL negated;
4958 int pdata;
4959 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4960 if (ptype < 0) goto FAILED;
4961 previous = code;
4962 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
4963 *code++ = ptype;
4964 *code++ = pdata;
4965 }
4966 #else
4967
4968 /* If Unicode properties are not supported, \X, \P, and \p are not
4969 allowed. */
4970
4971 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
4972 {
4973 *errorcodeptr = ERR45;
4974 goto FAILED;
4975 }
4976 #endif
4977
4978 /* For the rest (including \X when Unicode properties are supported), we
4979 can obtain the OP value by negating the escape value. */
4980
4981 else
4982 {
4983 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
4984 *code++ = -c;
4985 }
4986 continue;
4987 }
4988
4989 /* We have a data character whose value is in c. In UTF-8 mode it may have
4990 a value > 127. We set its representation in the length/buffer, and then
4991 handle it as a data character. */
4992
4993 #ifdef SUPPORT_UTF8
4994 if (utf8 && c > 127)
4995 mclength = _pcre_ord2utf8(c, mcbuffer);
4996 else
4997 #endif
4998
4999 {
5000 mcbuffer[0] = c;
5001 mclength = 1;
5002 }
5003 goto ONE_CHAR;
5004
5005
5006 /* ===================================================================*/
5007 /* Handle a literal character. It is guaranteed not to be whitespace or #
5008 when the extended flag is set. If we are in UTF-8 mode, it may be a
5009 multi-byte literal character. */
5010
5011 default:
5012 NORMAL_CHAR:
5013 mclength = 1;
5014 mcbuffer[0] = c;
5015
5016 #ifdef SUPPORT_UTF8
5017 if (utf8 && c >= 0xc0)
5018 {
5019 while ((ptr[1] & 0xc0) == 0x80)
5020 mcbuffer[mclength++] = *(++ptr);
5021 }
5022 #endif
5023
5024 /* At this point we have the character's bytes in mcbuffer, and the length
5025 in mclength. When not in UTF-8 mode, the length is always 1. */
5026
5027 ONE_CHAR:
5028 previous = code;
5029 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
5030 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
5031
5032 /* Set the first and required bytes appropriately. If no previous first
5033 byte, set it from this character, but revert to none on a zero repeat.
5034 Otherwise, leave the firstbyte value alone, and don't change it on a zero
5035 repeat. */
5036
5037 if (firstbyte == REQ_UNSET)
5038 {
5039 zerofirstbyte = REQ_NONE;
5040 zeroreqbyte = reqbyte;
5041
5042 /* If the character is more than one byte long, we can set firstbyte
5043 only if it is not to be matched caselessly. */
5044
5045 if (mclength == 1 || req_caseopt == 0)
5046 {
5047 firstbyte = mcbuffer[0] | req_caseopt;
5048 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
5049 }
5050 else firstbyte = reqbyte = REQ_NONE;
5051 }
5052
5053 /* firstbyte was previously set; we can set reqbyte only if the length is
5054 1 or the matching is caseful. */
5055
5056 else
5057 {
5058 zerofirstbyte = firstbyte;
5059 zeroreqbyte = reqbyte;
5060 if (mclength == 1 || req_caseopt == 0)
5061 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
5062 }
5063
5064 break; /* End of literal character handling */
5065 }
5066 } /* end of big loop */
5067
5068
5069 /* Control never reaches here by falling through, only by a goto for all the
5070 error states. Pass back the position in the pattern so that it can be displayed
5071 to the user for diagnosing the error. */
5072
5073 FAILED:
5074 *ptrptr = ptr;
5075 return FALSE;
5076 }
5077
5078
5079
5080
5081 /*************************************************
5082 * Compile sequence of alternatives *
5083 *************************************************/
5084
5085 /* On entry, ptr is pointing past the bracket character, but on return it
5086 points to the closing bracket, or vertical bar, or end of string. The code
5087 variable is pointing at the byte into which the BRA operator has been stored.
5088 If the ims options are changed at the start (for a (?ims: group) or during any
5089 branch, we need to insert an OP_OPT item at the start of every following branch
5090 to ensure they get set correctly at run time, and also pass the new options
5091 into every subsequent branch compile.
5092
5093 This function is used during the pre-compile phase when we are trying to find
5094 out the amount of memory needed, as well as during the real compile phase. The
5095 value of lengthptr distinguishes the two phases.
5096
5097 Arguments:
5098 options option bits, including any changes for this subpattern
5099 oldims previous settings of ims option bits
5100 codeptr -> the address of the current code pointer
5101 ptrptr -> the address of the current pattern pointer
5102 errorcodeptr -> pointer to error code variable
5103 lookbehind TRUE if this is a lookbehind assertion
5104 reset_bracount TRUE to reset the count for each branch
5105 skipbytes skip this many bytes at start (for brackets and OP_COND)
5106 firstbyteptr place to put the first required character, or a negative number
5107 reqbyteptr place to put the last required character, or a negative number
5108 bcptr pointer to the chain of currently open branches
5109 cd points to the data block with tables pointers etc.
5110 lengthptr NULL during the real compile phase
5111 points to length accumulator during pre-compile phase
5112
5113 Returns: TRUE on success
5114 */
5115
5116 static BOOL
5117 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5118 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5119 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5120 int *lengthptr)
5121 {
5122 const uschar *ptr = *ptrptr;
5123 uschar *code = *codeptr;
5124 uschar *last_branch = code;
5125 uschar *start_bracket = code;
5126 uschar *reverse_count = NULL;
5127 int firstbyte, reqbyte;
5128 int branchfirstbyte, branchreqbyte;
5129 int length;
5130 int orig_bracount;
5131 int max_bracount;
5132 branch_chain bc;
5133
5134 bc.outer = bcptr;
5135 bc.current = code;
5136
5137 firstbyte = reqbyte = REQ_UNSET;
5138
5139 /* Accumulate the length for use in the pre-compile phase. Start with the
5140 length of the BRA and KET and any extra bytes that are required at the
5141 beginning. We accumulate in a local variable to save frequent testing of
5142 lengthptr for NULL. We cannot do this by looking at the value of code at the
5143 start and end of each alternative, because compiled items are discarded during
5144 the pre-compile phase so that the work space is not exceeded. */
5145
5146 length = 2 + 2*LINK_SIZE + skipbytes;
5147
5148 /* WARNING: If the above line is changed for any reason, you must also change
5149 the code that abstracts option settings at the start of the pattern and makes
5150 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5151 pre-compile phase to find out whether anything has yet been compiled or not. */
5152
5153 /* Offset is set zero to mark that this bracket is still open */
5154
5155 PUT(code, 1, 0);
5156 code += 1 + LINK_SIZE + skipbytes;
5157
5158 /* Loop for each alternative branch */
5159
5160 orig_bracount = max_bracount = cd->bracount;
5161 for (;;)
5162 {
5163 /* For a (?| group, reset the capturing bracket count so that each branch
5164 uses the same numbers. */
5165
5166 if (reset_bracount) cd->bracount = orig_bracount;
5167
5168 /* Handle a change of ims options at the start of the branch */
5169
5170 if ((options & PCRE_IMS) != oldims)
5171 {
5172 *code++ = OP_OPT;
5173 *code++ = options & PCRE_IMS;
5174 length += 2;
5175 }
5176
5177 /* Set up dummy OP_REVERSE if lookbehind assertion */
5178
5179 if (lookbehind)
5180 {
5181 *code++ = OP_REVERSE;
5182 reverse_count = code;
5183 PUTINC(code, 0, 0);
5184 length += 1 + LINK_SIZE;
5185 }
5186
5187 /* Now compile the branch; in the pre-compile phase its length gets added
5188 into the length. */
5189
5190 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5191 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5192 {
5193 *ptrptr = ptr;
5194 return FALSE;
5195 }
5196
5197 /* Keep the highest bracket count in case (?| was used and some branch
5198 has fewer than the rest. */
5199
5200 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5201
5202 /* In the real compile phase, there is some post-processing to be done. */
5203
5204 if (lengthptr == NULL)
5205 {
5206 /* If this is the first branch, the firstbyte and reqbyte values for the
5207 branch become the values for the regex. */
5208
5209 if (*last_branch != OP_ALT)
5210 {
5211 firstbyte = branchfirstbyte;
5212 reqbyte = branchreqbyte;
5213 }
5214
5215 /* If this is not the first branch, the first char and reqbyte have to
5216 match the values from all the previous branches, except that if the
5217 previous value for reqbyte didn't have REQ_VARY set, it can still match,
5218 and we set REQ_VARY for the regex. */
5219
5220 else
5221 {
5222 /* If we previously had a firstbyte, but it doesn't match the new branch,
5223 we have to abandon the firstbyte for the regex, but if there was
5224 previously no reqbyte, it takes on the value of the old firstbyte. */
5225
5226 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5227 {
5228 if (reqbyte < 0) reqbyte = firstbyte;
5229 firstbyte = REQ_NONE;
5230 }
5231
5232 /* If we (now or from before) have no firstbyte, a firstbyte from the
5233 branch becomes a reqbyte if there isn't a branch reqbyte. */
5234
5235 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5236 branchreqbyte = branchfirstbyte;
5237
5238 /* Now ensure that the reqbytes match */
5239
5240 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5241 reqbyte = REQ_NONE;
5242 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
5243 }
5244
5245 /* If lookbehind, check that this branch matches a fixed-length string, and
5246 put the length into the OP_REVERSE item. Temporarily mark the end of the
5247 branch with OP_END. */
5248
5249 if (lookbehind)
5250 {
5251 int fixed_length;
5252 *code = OP_END;
5253 fixed_length = find_fixedlength(last_branch, options);
5254 DPRINTF(("fixed length = %d\n", fixed_length));
5255 if (fixed_length < 0)
5256 {
5257 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5258 *ptrptr = ptr;
5259 return FALSE;
5260 }
5261 PUT(reverse_count, 0, fixed_length);
5262 }
5263 }
5264
5265 /* Reached end of expression, either ')' or end of pattern. In the real
5266 compile phase, go back through the alternative branches and reverse the chain
5267 of offsets, with the field in the BRA item now becoming an offset to the
5268 first alternative. If there are no alternatives, it points to the end of the
5269 group. The length in the terminating ket is always the length of the whole
5270 bracketed item. If any of the ims options were changed inside the group,
5271 compile a resetting op-code following, except at the very end of the pattern.
5272 Return leaving the pointer at the terminating char. */
5273
5274 if (*ptr != '|')
5275 {
5276 if (lengthptr == NULL)
5277 {
5278 int branch_length = code - last_branch;
5279 do
5280 {
5281 int prev_length = GET(last_branch, 1);
5282 PUT(last_branch, 1, branch_length);
5283 branch_length = prev_length;
5284 last_branch -= branch_length;
5285 }
5286 while (branch_length > 0);
5287 }
5288
5289 /* Fill in the ket */
5290
5291 *code = OP_KET;
5292 PUT(code, 1, code - start_bracket);
5293 code += 1 + LINK_SIZE;
5294
5295 /* Resetting option if needed */
5296
5297 if ((options & PCRE_IMS) != oldims && *ptr == ')')
5298 {
5299 *code++ = OP_OPT;
5300 *code++ = oldims;
5301 length += 2;
5302 }
5303
5304 /* Retain the highest bracket number, in case resetting was used. */
5305
5306 cd->bracount = max_bracount;
5307
5308 /* Set values to pass back */
5309
5310 *codeptr = code;
5311 *ptrptr = ptr;
5312 *firstbyteptr = firstbyte;
5313 *reqbyteptr = reqbyte;
5314 if (lengthptr != NULL)
5315 {
5316 if (OFLOW_MAX - *lengthptr < length)
5317 {
5318 *errorcodeptr = ERR20;
5319 return FALSE;
5320 }
5321 *lengthptr += length;
5322 }
5323 return TRUE;
5324 }
5325
5326 /* Another branch follows. In the pre-compile phase, we can move the code
5327 pointer back to where it was for the start of the first branch. (That is,
5328 pretend that each branch is the only one.)
5329
5330 In the real compile phase, insert an ALT node. Its length field points back
5331 to the previous branch while the bracket remains open. At the end the chain
5332 is reversed. It's done like this so that the start of the bracket has a
5333 zero offset until it is closed, making it possible to detect recursion. */
5334
5335 if (lengthptr != NULL)
5336 {
5337 code = *codeptr + 1 + LINK_SIZE + skipbytes;
5338 length += 1 + LINK_SIZE;
5339 }
5340 else
5341 {
5342 *code = OP_ALT;
5343 PUT(code, 1, code - last_branch);
5344 bc.current = last_branch = code;
5345 code += 1 + LINK_SIZE;
5346 }
5347
5348 ptr++;
5349 }
5350 /* Control never reaches here */
5351 }
5352
5353
5354
5355
5356 /*************************************************
5357 * Check for anchored expression *
5358 *************************************************/
5359
5360 /* Try to find out if this is an anchored regular expression. Consider each
5361 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5362 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5363 it's anchored. However, if this is a multiline pattern, then only OP_SOD
5364 counts, since OP_CIRC can match in the middle.
5365
5366 We can also consider a regex to be anchored if OP_SOM starts all its branches.
5367 This is the code for \G, which means "match at start of match position, taking
5368 into account the match offset".
5369
5370 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5371 because that will try the rest of the pattern at all possible matching points,
5372 so there is no point trying again.... er ....
5373
5374 .... except when the .* appears inside capturing parentheses, and there is a
5375 subsequent back reference to those parentheses. We haven't enough information
5376 to catch that case precisely.
5377
5378 At first, the best we could do was to detect when .* was in capturing brackets
5379 and the highest back reference was greater than or equal to that level.
5380 However, by keeping a bitmap of the first 31 back references, we can catch some
5381 of the more common cases more precisely.
5382
5383 Arguments:
5384 code points to start of expression (the bracket)
5385 options points to the options setting
5386 bracket_map a bitmap of which brackets we are inside while testing; this
5387 handles up to substring 31; after that we just have to take
5388 the less precise approach
5389 backref_map the back reference bitmap
5390
5391 Returns: TRUE or FALSE
5392 */
5393
5394 static BOOL
5395 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
5396 unsigned int backref_map)
5397 {
5398 do {
5399 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5400 options, PCRE_MULTILINE, FALSE);
5401 register int op = *scode;
5402
5403 /* Non-capturing brackets */
5404
5405 if (op == OP_BRA)
5406 {
5407 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5408 }
5409
5410 /* Capturing brackets */
5411
5412 else if (op == OP_CBRA)
5413 {
5414 int n = GET2(scode, 1+LINK_SIZE);
5415 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5416 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
5417 }
5418
5419 /* Other brackets */
5420
5421 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5422 {
5423 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5424 }
5425
5426 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
5427 are or may be referenced. */
5428
5429 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5430 op == OP_TYPEPOSSTAR) &&
5431 (*options & PCRE_DOTALL) != 0)
5432 {
5433 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5434 }
5435
5436 /* Check for explicit anchoring */
5437
5438 else if (op != OP_SOD && op != OP_SOM &&
5439 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
5440 return FALSE;
5441 code += GET(code, 1);
5442 }
5443 while (*code == OP_ALT); /* Loop for each alternative */
5444 return TRUE;
5445 }
5446
5447
5448
5449 /*************************************************
5450 * Check for starting with ^ or .* *
5451 *************************************************/
5452
5453 /* This is called to find out if every branch starts with ^ or .* so that
5454 "first char" processing can be done to speed things up in multiline
5455 matching and for non-DOTALL patterns that start with .* (which must start at
5456 the beginning or after \n). As in the case of is_anchored() (see above), we
5457 have to take account of back references to capturing brackets that contain .*
5458 because in that case we can't make the assumption.
5459
5460 Arguments:
5461 code points to start of expression (the bracket)
5462 bracket_map a bitmap of which brackets we are inside while testing; this
5463 handles up to substring 31; after that we just have to take
5464 the less precise approach
5465 backref_map the back reference bitmap
5466
5467 Returns: TRUE or FALSE
5468 */
5469
5470 static BOOL
5471 is_startline(const uschar *code, unsigned int bracket_map,
5472 unsigned int backref_map)
5473 {
5474 do {
5475 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5476 NULL, 0, FALSE);
5477 register int op = *scode;
5478
5479 /* Non-capturing brackets */
5480
5481 if (op == OP_BRA)
5482 {
5483 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5484 }
5485
5486 /* Capturing brackets */
5487
5488 else if (op == OP_CBRA)
5489 {
5490 int n = GET2(scode, 1+LINK_SIZE);
5491 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5492 if (!is_startline(scode, new_map, backref_map)) return FALSE;
5493 }
5494
5495 /* Other brackets */
5496
5497 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5498 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5499
5500 /* .* means "start at start or after \n" if it isn't in brackets that
5501 may be referenced. */
5502
5503 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5504 {
5505 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5506 }
5507
5508 /* Check for explicit circumflex */
5509
5510 else if (op != OP_CIRC) return FALSE;
5511
5512 /* Move on to the next alternative */
5513
5514 code += GET(code, 1);
5515 }
5516 while (*code == OP_ALT); /* Loop for each alternative */
5517 return TRUE;
5518 }
5519
5520
5521
5522 /*************************************************
5523 * Check for asserted fixed first char *
5524 *************************************************/
5525
5526 /* During compilation, the "first char" settings from forward assertions are
5527 discarded, because they can cause conflicts with actual literals that follow.
5528 However, if we end up without a first char setting for an unanchored pattern,
5529 it is worth scanning the regex to see if there is an initial asserted first
5530 char. If all branches start with the same asserted char, or with a bracket all
5531 of whose alternatives start with the same asserted char (recurse ad lib), then
5532 we return that char, otherwise -1.
5533
5534 Arguments:
5535 code points to start of expression (the bracket)
5536 options pointer to the options (used to check casing changes)
5537 inassert TRUE if in an assertion
5538
5539 Returns: -1 or the fixed first char
5540 */
5541
5542 static int
5543 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
5544 {
5545 register int c = -1;
5546 do {
5547 int d;
5548 const uschar *scode =
5549 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5550 register int op = *scode;
5551
5552 switch(op)
5553 {
5554 default:
5555 return -1;
5556
5557 case OP_BRA:
5558 case OP_CBRA:
5559 case OP_ASSERT:
5560 case OP_ONCE:
5561 case OP_COND:
5562 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
5563 return -1;
5564 if (c < 0) c = d; else if (c != d) return -1;
5565 break;
5566
5567 case OP_EXACT: /* Fall through */
5568 scode += 2;
5569
5570 case OP_CHAR:
5571 case OP_CHARNC:
5572 case OP_PLUS:
5573 case OP_MINPLUS:
5574 case OP_POSPLUS:
5575 if (!inassert) return -1;
5576 if (c < 0)
5577 {
5578 c = scode[1];
5579 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
5580 }
5581 else if (c != scode[1]) return -1;
5582 break;
5583 }
5584
5585 code += GET(code, 1);
5586 }
5587 while (*code == OP_ALT);
5588 return c;
5589 }
5590
5591
5592
5593 /*************************************************
5594 * Compile a Regular Expression *
5595 *************************************************/
5596
5597 /* This function takes a string and returns a pointer to a block of store
5598 holding a compiled version of the expression. The original API for this
5599 function had no error code return variable; it is retained for backwards
5600 compatibility. The new function is given a new name.
5601
5602 Arguments:
5603 pattern the regular expression
5604 options various option bits
5605 errorcodeptr pointer to error code variable (pcre_compile2() only)
5606 can be NULL if you don't want a code value
5607 errorptr pointer to pointer to error text
5608 erroroffset ptr offset in pattern where error was detected
5609 tables pointer to character tables or NULL
5610
5611 Returns: pointer to compiled data block, or NULL on error,
5612 with errorptr and erroroffset set
5613 */
5614
5615 PCRE_EXP_DEFN pcre *
5616 pcre_compile(const char *pattern, int options, const char **errorptr,
5617 int *erroroffset, const unsigned char *tables)
5618 {
5619 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5620 }
5621
5622
/* pcre_compile2(): compile "pattern" in two passes. The first pass runs
compile_regex() over an on-stack workspace purely to accumulate the length of
code required. The data block is then obtained with pcre_malloc and the second
pass compiles into it. Afterwards any forward references are filled in, and the
anchoring, first-byte and required-byte information is worked out and stored in
the block. On failure NULL is returned, with the error text in *errorptr, the
pattern offset in *erroroffset (when that pointer is not NULL), and the error
code in *errorcodeptr (when that pointer is not NULL). If errorptr itself is
NULL, only the error code can be set. */

5623 PCRE_EXP_DEFN pcre *
5624 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5625 const char **errorptr, int *erroroffset, const unsigned char *tables)
5626 {
5627 real_pcre *re;
5628 int length = 1; /* For final END opcode */
5629 int firstbyte, reqbyte, newline;
5630 int errorcode = 0;
5631 #ifdef SUPPORT_UTF8
5632 BOOL utf8;
5633 #endif
5634 size_t size;
5635 uschar *code;
5636 const uschar *codestart;
5637 const uschar *ptr;
5638 compile_data compile_block;
5639 compile_data *cd = &compile_block;
5640
5641 /* This space is used for "compiling" into during the first phase, when we are
5642 computing the amount of memory that is needed. Compiled items are thrown away
5643 as soon as possible, so that a fairly large buffer should be sufficient for
5644 this purpose. The same space is used in the second phase for remembering where
5645 to fill in forward references to subpatterns. */
5646
5647 uschar cworkspace[COMPILE_WORK_SIZE];
5648
5649
5650 /* Set this early so that early errors get offset 0. */
5651
5652 ptr = (const uschar *)pattern;
5653
5654 /* We can't pass back an error message if errorptr is NULL; I guess the best we
5655 can do is just return NULL, but we can set a code value if there is a code
5656 pointer. */
5657
5658 if (errorptr == NULL)
5659 {
5660 if (errorcodeptr != NULL) *errorcodeptr = 99; /* A bare literal, not an ERRnn name */
5661 return NULL;
5662 }
5663
5664 *errorptr = NULL;
5665 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5666
5667 /* However, we can give a message for this error */
5668
5669 if (erroroffset == NULL)
5670 {
5671 errorcode = ERR16;
5672 goto PCRE_EARLY_ERROR_RETURN2;
5673 }
5674
5675 *erroroffset = 0;
5676
5677 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
/* When UTF-8 is supported and checking has not been disabled, the result of
_pcre_valid_utf8() is stored in *erroroffset by the assignment inside the
condition, and a non-negative result is treated as an error. The jump goes to
PCRE_EARLY_ERROR_RETURN2 so that the stored value is not overwritten. */
5678
5679 #ifdef SUPPORT_UTF8
5680 utf8 = (options & PCRE_UTF8) != 0;
5681 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5682 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5683 {
5684 errorcode = ERR44;
5685 goto PCRE_EARLY_ERROR_RETURN2;
5686 }
5687 #else
5688 if ((options & PCRE_UTF8) != 0)
5689 {
5690 errorcode = ERR32;
5691 goto PCRE_EARLY_ERROR_RETURN;
5692 }
5693 #endif
5694
/* Reject any option bits that are not in PUBLIC_OPTIONS. */
5695 if ((options & ~PUBLIC_OPTIONS) != 0)
5696 {
5697 errorcode = ERR17;
5698 goto PCRE_EARLY_ERROR_RETURN;
5699 }
5700
5701 /* Set up pointers to the individual character tables */
5702
5703 if (tables == NULL) tables = _pcre_default_tables;
5704 cd->lcc = tables + lcc_offset;
5705 cd->fcc = tables + fcc_offset;
5706 cd->cbits = tables + cbits_offset;
5707 cd->ctypes = tables + ctypes_offset;
5708
5709 /* Handle different types of newline. The three bits give seven cases. The
5710 current code allows for fixed one- or two-byte sequences, plus "any" and
5711 "anycrlf". */
/* In the local "newline" variable, -1 stands for "any", -2 for "anycrlf", a
value up to 255 for a single byte, and a larger value for a two-byte sequence
with the first byte in bits 8 and above. */
5712
5713 switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))
5714 {
5715 case 0: newline = NEWLINE; break; /* Compile-time default */
5716 case PCRE_NEWLINE_CR: newline = '\r'; break;
5717 case PCRE_NEWLINE_LF: newline = '\n'; break;
5718 case PCRE_NEWLINE_CR+
5719 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5720 case PCRE_NEWLINE_ANY: newline = -1; break;
5721 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5722 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5723 }
5724
5725 if (newline == -2)
5726 {
5727 cd->nltype = NLTYPE_ANYCRLF;
5728 }
5729 else if (newline < 0)
5730 {
5731 cd->nltype = NLTYPE_ANY;
5732 }
5733 else
5734 {
5735 cd->nltype = NLTYPE_FIXED;
5736 if (newline > 255)
5737 {
5738 cd->nllen = 2;
5739 cd->nl[0] = (newline >> 8) & 255;
5740 cd->nl[1] = newline & 255;
5741 }
5742 else
5743 {
5744 cd->nllen = 1;
5745 cd->nl[0] = newline;
5746 }
5747 }
5748
5749 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
5750 references to help in deciding whether (.*) can be treated as anchored or not.
5751 */
5752
5753 cd->top_backref = 0;
5754 cd->backref_map = 0;
5755
5756 /* Reflect pattern for debugging output */
5757
5758 DPRINTF(("------------------------------------------------------------------\n"));
5759 DPRINTF(("%s\n", pattern));
5760
5761 /* Pretend to compile the pattern while actually just accumulating the length
5762 of memory required. This behaviour is triggered by passing a non-NULL final
5763 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
5764 to compile parts of the pattern into; the compiled code is discarded when it is
5765 no longer needed, so hopefully this workspace will never overflow, though there
5766 is a test for its doing so. */
5767
5768 cd->bracount = 0;
5769 cd->names_found = 0;
5770 cd->name_entry_size = 0;
5771 cd->name_table = NULL;
5772 cd->start_workspace = cworkspace;
5773 cd->start_code = cworkspace;
5774 cd->hwm = cworkspace;
5775 cd->start_pattern = (const uschar *)pattern;
5776 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
5777 cd->req_varyopt = 0;
5778 cd->nopartial = FALSE;
5779 cd->external_options = options;
5780
5781 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
5782 don't need to look at the result of the function here. The initial options have
5783 been put into the cd block so that they can be changed if an option setting is
5784 found within the regex right at the beginning. Bringing initial option settings
5785 outside can help speed up starting point checks. */
5786
5787 code = cworkspace;
5788 *code = OP_BRA;
5789 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
5790 &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
5791 &length);
5792 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
5793
5794 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
5795 cd->hwm - cworkspace));
5796
5797 if (length > MAX_PATTERN_SIZE)
5798 {
5799 errorcode = ERR20;
5800 goto PCRE_EARLY_ERROR_RETURN;
5801 }
5802
5803 /* Compute the size of data block needed and get it, either from malloc or
5804 externally provided function. Integer overflow should no longer be possible
5805 because nowadays we limit the maximum value of cd->names_found and
5806 cd->name_entry_size. */
/* NOTE(review): the reason for the extra 3 bytes per name is not visible in
this function - confirm against the code that builds the name table. */
5807
5808 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
5809 re = (real_pcre *)(pcre_malloc)(size);
5810
5811 if (re == NULL)
5812 {
5813 errorcode = ERR21;
5814 goto PCRE_EARLY_ERROR_RETURN;
5815 }
5816
5817 /* Put in the magic number, and save the sizes, initial options, and character
5818 table pointer. NULL is used for the default character tables. The nullpad field
5819 is at the end; it's there to help in the case when a regex compiled on a system
5820 with 4-byte pointers is run on another with 8-byte pointers. */
5821
5822 re->magic_number = MAGIC_NUMBER;
5823 re->size = size;
5824 re->options = cd->external_options;
5825 re->dummy1 = 0;
5826 re->first_byte = 0;
5827 re->req_byte = 0;
5828 re->name_table_offset = sizeof(real_pcre);
5829 re->name_entry_size = cd->name_entry_size;
5830 re->name_count = cd->names_found;
5831 re->ref_count = 0;
5832 re->tables = (tables == _pcre_default_tables)? NULL : tables;
5833 re->nullpad = NULL;
5834
5835 /* The starting points of the name/number translation table and of the code are
5836 passed around in the compile data block. The start/end pattern and initial
5837 options are already set from the pre-compile phase, as is the name_entry_size
5838 field. Reset the bracket count and the names_found field. Also reset the hwm
5839 field; this time it's used for remembering forward references to subpatterns.
5840 */
/* The name table is placed immediately after the real_pcre header, and the
compiled code immediately after the name table. */
5841
5842 cd->bracount = 0;
5843 cd->names_found = 0;
5844 cd->name_table = (uschar *)re + re->name_table_offset;
5845 codestart = cd->name_table + re->name_entry_size * re->name_count;
5846 cd->start_code = codestart;
5847 cd->hwm = cworkspace;
5848 cd->req_varyopt = 0;
5849 cd->nopartial = FALSE;
5850 cd->had_accept = FALSE;
5851
5852 /* Set up a starting, non-extracting bracket, then compile the expression. On
5853 error, errorcode will be set non-zero, so we don't need to look at the result
5854 of the function here. */
/* The final argument is NULL this time, which is what distinguishes the real
compile from the pre-compile above. */
5855
5856 ptr = (const uschar *)pattern;
5857 code = (uschar *)codestart;
5858 *code = OP_BRA;
5859 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
5860 &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
5861 re->top_bracket = cd->bracount;
5862 re->top_backref = cd->top_backref;
5863
5864 if (cd->nopartial) re->options |= PCRE_NOPARTIAL;
5865 if (cd->had_accept) reqbyte = -1; /* Must disable after (*ACCEPT) */
5866
5867 /* If not reached end of pattern on success, there's an excess bracket. */
5868
5869 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
5870
5871 /* Fill in the terminating state and check for disastrous overflow, but
5872 if debugging, leave the test till after things are printed out. */
/* Note that OP_END is stored even when errorcode is already non-zero, and that
the overflow test replaces any earlier error code with ERR23. */
5873
5874 *code++ = OP_END;
5875
5876 #ifndef DEBUG
5877 if (code - codestart > length) errorcode = ERR23;
5878 #endif
5879
5880 /* Fill in any forward references that are required. */
/* Each workspace entry, taken from the top down, is an offset into the
compiled code. The value found at that offset is the group number to look up
with find_bracket(), and it is overwritten with the group's offset from
codestart. An unknown group gives ERR53 and ends the loop. */
5881
5882 while (errorcode == 0 && cd->hwm > cworkspace)
5883 {
5884 int offset, recno;
5885 const uschar *groupptr;
5886 cd->hwm -= LINK_SIZE;
5887 offset = GET(cd->hwm, 0);
5888 recno = GET(codestart, offset);
5889 groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
5890 if (groupptr == NULL) errorcode = ERR53;
5891 else PUT(((uschar *)codestart), offset, groupptr - codestart);
5892 }
5893
5894 /* Give an error if there's back reference to a non-existent capturing
5895 subpattern. */
5896
5897 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
5898
5899 /* Failed to compile, or error while post-processing */
/* The two labels sit inside this block on purpose. Entering at the top frees
the data block first. Jumping to PCRE_EARLY_ERROR_RETURN skips the free and is
used by the error exits taken before the block has been obtained. Jumping to
PCRE_EARLY_ERROR_RETURN2 also skips the store into *erroroffset and is used
when erroroffset is NULL or has already been set by the UTF-8 check. */
5900
5901 if (errorcode != 0)
5902 {
5903 (pcre_free)(re);
5904 PCRE_EARLY_ERROR_RETURN:
5905 *erroroffset = ptr - (const uschar *)pattern;
5906 PCRE_EARLY_ERROR_RETURN2:
5907 *errorptr = error_texts[errorcode];
5908 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
5909 return NULL;
5910 }
5911
5912 /* If the anchored option was not passed, set the flag if we can determine that
5913 the pattern is anchored by virtue of ^ characters or \A or anything else (such
5914 as starting with .* when DOTALL is set).
5915
5916 Otherwise, if we know what the first byte has to be, save it, because that
5917 speeds up unanchored matches no end. If not, see if we can set the
5918 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
5919 start with ^, and also when all branches start with .* for non-DOTALL matches.
5920 */
5921
5922 if ((re->options & PCRE_ANCHORED) == 0)
5923 {
5924 int temp_options = re->options; /* May get changed during these scans */
5925 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
5926 re->options |= PCRE_ANCHORED;
5927 else
5928 {
5929 if (firstbyte < 0)
5930 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
5931 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
5932 {
5933 int ch = firstbyte & 255;
5934 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
5935 cd->fcc[ch] == ch)? ch : firstbyte;
5936 re->options |= PCRE_FIRSTSET;
5937 }
5938 else if (is_startline(codestart, 0, cd->backref_map))
5939 re->options |= PCRE_STARTLINE;
5940 }
5941 }
5942
5943 /* For an anchored pattern, we use the "required byte" only if it follows a
5944 variable length item in the regex. Remove the caseless flag for non-caseable
5945 bytes. */
5946
5947 if (reqbyte >= 0 &&
5948 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
5949 {
5950 int ch = reqbyte & 255;
5951 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
5952 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5953 re->options |= PCRE_REQCHSET;
5954 }
5955
5956 /* Print out the compiled data if debugging is enabled. This is never the
5957 case when building a production library. */
5958
5959 #ifdef DEBUG
5960
5961 printf("Length = %d top_bracket = %d top_backref = %d\n",
5962 length, re->top_bracket, re->top_backref);
5963
5964 if (re->options != 0)
5965 {
5966 printf("%s%s%s%s%s%s%s%s%s\n",
5967 ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5968 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5969 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5970 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5971 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5972 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5973 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5974 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5975 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5976 }
5977
5978 if ((re->options & PCRE_FIRSTSET) != 0)
5979 {
5980 int ch = re->first_byte & 255;
5981 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
5982 "" : " (caseless)";
5983 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5984 else printf("First char = \\x%02x%s\n", ch, caseless);
5985 }
5986
5987 if ((re->options & PCRE_REQCHSET) != 0)
5988 {
5989 int ch = re->req_byte & 255;
5990 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
5991 "" : " (caseless)";
5992 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5993 else printf("Req char = \\x%02x%s\n", ch, caseless);
5994 }
5995
5996 pcre_printint(re, stdout, TRUE);
5997
5998 /* This check is done here in the debugging case so that the code that
5999 was compiled can be seen. */
6000
6001 if (code - codestart > length)
6002 {
6003 (pcre_free)(re);
6004 *errorptr = error_texts[ERR23];
6005 *erroroffset = ptr - (uschar *)pattern;
6006 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
6007 return NULL;
6008 }
6009 #endif /* DEBUG */
6010
6011 return (pcre *)re;
6012 }
6013
6014 /* End of pcre_compile.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12