/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 216 - (show annotations) (download)
Wed Aug 15 14:35:57 2007 UTC (7 years, 2 months ago) by ph10
File MIME type: text/plain
File size: 187780 byte(s)
Fixed compile-time loop for patterns like (?:[\PPa*]*){8,} (extended class 
inside group with unlimited repeat).

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2007 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include <config.h>
47 #endif
48
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55
56 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57 used by pcretest. DEBUG is not defined when building a production library. */
58
59 #ifdef DEBUG
60 #include "pcre_printint.src"
61 #endif
62
63
/* Macro for setting individual bits in class bitmaps. "a" is the bitmap (an
array of bytes) and "b" is the bit number. Both arguments and the whole
expansion are parenthesized so that expression arguments such as
SETBIT(map, c + 1) expand correctly. Note that "b" is evaluated twice, so it
must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67
68 /* Maximum length value to check against when making sure that the integer that
69 holds the compiled pattern length does not overflow. We make it a bit less than
70 INT_MAX to allow for adding in group terminating bytes, so that we don't have
71 to check them every time. */
72
73 #define OFLOW_MAX (INT_MAX - 20)
74
75
76 /*************************************************
77 * Code parameters and static tables *
78 *************************************************/
79
80 /* This value specifies the size of stack workspace that is used during the
81 first pre-compile phase that determines how much memory is required. The regex
82 is partly compiled into this space, but the compiled parts are discarded as
83 soon as they can be, so that hopefully there will never be an overrun. The code
84 does, however, check for an overrun. The largest amount I've seen used is 218,
85 so this number is very generous.
86
87 The same workspace is used during the second, actual compile phase for
88 remembering forward references to groups so that they can be filled in at the
89 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90 is 4 there is plenty of room. */
91
92 #define COMPILE_WORK_SIZE (4096)
93
94
95 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96 are simple data values; negative values are for special things like \d and so
97 on. Zero means further processing is needed (for things like \x), or the escape
98 is invalid. */
99
100 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
101 static const short int escapes[] = {
102 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
103 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
104 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
105 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
106 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
107 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
108 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
109 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
110 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
111 0, 0, -ESC_z /* x - z */
112 };
113
114 #else /* This is the "abnormal" table for EBCDIC systems */
115 static const short int escapes[] = {
116 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
117 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
118 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
119 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
120 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
121 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
122 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
123 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
124 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
125 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
126 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
127 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
128 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
129 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
131 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
132 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
133 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
134 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
135 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
136 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
137 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
138 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
139 };
140 #endif
141
142
143 /* Table of special "verbs" like (*PRUNE) */
144
145 typedef struct verbitem {
146 const char *name;
147 int len;
148 int op;
149 } verbitem;
150
151 static verbitem verbs[] = {
152 { "ACCEPT", 6, OP_ACCEPT },
153 { "COMMIT", 6, OP_COMMIT },
154 { "F", 1, OP_FAIL },
155 { "FAIL", 4, OP_FAIL },
156 { "PRUNE", 5, OP_PRUNE },
157 { "SKIP", 4, OP_SKIP },
158 { "THEN", 4, OP_THEN }
159 };
160
161 static int verbcount = sizeof(verbs)/sizeof(verbitem);
162
163
164 /* Tables of names of POSIX character classes and their lengths. The list is
165 terminated by a zero length entry. The first three must be alpha, lower, upper,
166 as this is assumed for handling case independence. */
167
168 static const char *const posix_names[] = {
169 "alpha", "lower", "upper",
170 "alnum", "ascii", "blank", "cntrl", "digit", "graph",
171 "print", "punct", "space", "word", "xdigit" };
172
173 static const uschar posix_name_lengths[] = {
174 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
175
176 /* Table of class bit maps for each POSIX class. Each class is formed from a
177 base map, with an optional addition or removal of another map. Then, for some
178 classes, there is some additional tweaking: for [:blank:] the vertical space
179 characters are removed, and for [:alpha:] and [:alnum:] the underscore
180 character is removed. The triples in the table consist of the base map offset,
181 second map offset or -1 if no second map, and a non-negative value for map
182 addition or a negative value for map subtraction (if there are two maps). The
183 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
184 remove vertical space characters, 2 => remove underscore. */
185
186 static const int posix_class_maps[] = {
187 cbit_word, cbit_digit, -2, /* alpha */
188 cbit_lower, -1, 0, /* lower */
189 cbit_upper, -1, 0, /* upper */
190 cbit_word, -1, 2, /* alnum - word without underscore */
191 cbit_print, cbit_cntrl, 0, /* ascii */
192 cbit_space, -1, 1, /* blank - a GNU extension */
193 cbit_cntrl, -1, 0, /* cntrl */
194 cbit_digit, -1, 0, /* digit */
195 cbit_graph, -1, 0, /* graph */
196 cbit_print, -1, 0, /* print */
197 cbit_punct, -1, 0, /* punct */
198 cbit_space, -1, 0, /* space */
199 cbit_word, -1, 0, /* word - a Perl extension */
200 cbit_xdigit,-1, 0 /* xdigit */
201 };
202
203
204 #define STRING(a) # a
205 #define XSTRING(s) STRING(s)
206
207 /* The texts of compile-time error messages. These are "char *" because they
208 are passed to the outside world. Do not ever re-use any error number, because
209 they are documented. Always add a new error instead. Messages marked DEAD below
210 are no longer used. */
211
212 static const char *error_texts[] = {
213 "no error",
214 "\\ at end of pattern",
215 "\\c at end of pattern",
216 "unrecognized character follows \\",
217 "numbers out of order in {} quantifier",
218 /* 5 */
219 "number too big in {} quantifier",
220 "missing terminating ] for character class",
221 "invalid escape sequence in character class",
222 "range out of order in character class",
223 "nothing to repeat",
224 /* 10 */
225 "operand of unlimited repeat could match the empty string", /** DEAD **/
226 "internal error: unexpected repeat",
227 "unrecognized character after (?",
228 "POSIX named classes are supported only within a class",
229 "missing )",
230 /* 15 */
231 "reference to non-existent subpattern",
232 "erroffset passed as NULL",
233 "unknown option bit(s) set",
234 "missing ) after comment",
235 "parentheses nested too deeply", /** DEAD **/
236 /* 20 */
237 "regular expression is too large",
238 "failed to get memory",
239 "unmatched parentheses",
240 "internal error: code overflow",
241 "unrecognized character after (?<",
242 /* 25 */
243 "lookbehind assertion is not fixed length",
244 "malformed number or name after (?(",
245 "conditional group contains more than two branches",
246 "assertion expected after (?(",
247 "(?R or (?[+-]digits must be followed by )",
248 /* 30 */
249 "unknown POSIX class name",
250 "POSIX collating elements are not supported",
251 "this version of PCRE is not compiled with PCRE_UTF8 support",
252 "spare error", /** DEAD **/
253 "character value in \\x{...} sequence is too large",
254 /* 35 */
255 "invalid condition (?(0)",
256 "\\C not allowed in lookbehind assertion",
257 "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
258 "number after (?C is > 255",
259 "closing ) for (?C expected",
260 /* 40 */
261 "recursive call could loop indefinitely",
262 "unrecognized character after (?P",
263 "syntax error in subpattern name (missing terminator)",
264 "two named subpatterns have the same name",
265 "invalid UTF-8 string",
266 /* 45 */
267 "support for \\P, \\p, and \\X has not been compiled",
268 "malformed \\P or \\p sequence",
269 "unknown property name after \\P or \\p",
270 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
271 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
272 /* 50 */
273 "repeated subpattern is too long", /** DEAD **/
274 "octal value is greater than \\377 (not in UTF-8 mode)",
275 "internal error: overran compiling workspace",
276 "internal error: previously-checked referenced subpattern not found",
277 "DEFINE group contains more than one branch",
278 /* 55 */
279 "repeating a DEFINE group is not allowed",
280 "inconsistent NEWLINE options",
281 "\\g is not followed by a braced name or an optionally braced non-zero number",
282 "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number",
283 "(*VERB) with an argument is not supported",
284 /* 60 */
285 "(*VERB) not recognized",
286 "number is too big"
287 };
288
289
290 /* Table to identify digits and hex digits. This is used when compiling
291 patterns. Note that the tables in chartables are dependent on the locale, and
292 may mark arbitrary characters as digits - but the PCRE compiling code expects
293 to handle only 0-9, a-f, and A-F as digits when compiling. That is why we have
294 a private table here. It costs 256 bytes, but it is a lot faster than doing
295 character value tests (at least in some simple cases I timed), and in some
296 applications one wants PCRE to compile efficiently as well as match
297 efficiently.
298
299 For convenience, we use the same bit definitions as in chartables:
300
301 0x04 decimal digit
302 0x08 hexadecimal digit
303
304 Then we can use ctype_digit and ctype_xdigit in the code. */
305
306 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
307 static const unsigned char digitab[] =
308 {
309 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
310 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
311 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
312 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
313 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
314 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
315 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
316 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
317 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
318 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
319 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
320 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
321 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
322 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
323 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
324 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
325 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
326 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
327 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
328 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
329 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
330 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
331 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
332 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
333 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
334 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
335 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
336 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
337 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
338 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
339 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
340 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
341
342 #else /* This is the "abnormal" case, for EBCDIC systems */
343 static const unsigned char digitab[] =
344 {
345 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
346 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
347 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
348 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
349 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
350 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
351 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
352 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
353 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
354 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
355 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
356 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
357 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
358 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
359 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
360 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
361 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
362 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
363 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
364 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
365 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
366 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
367 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
368 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
369 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
370 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
371 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
372 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
373 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
374 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
375 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
376 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
377
378 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
379 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
380 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
381 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
382 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
383 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
384 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
385 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
386 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
387 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
388 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
389 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
390 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
391 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
392 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
393 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
394 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
395 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
396 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
397 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
398 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
399 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
400 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
401 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
402 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
403 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
404 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
405 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
406 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
407 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
408 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
409 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
410 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
411 #endif
412
413
414 /* Definition to allow mutual recursion */
415
416 static BOOL
417 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
418 int *, int *, branch_chain *, compile_data *, int *);
419
420
421
422 /*************************************************
423 * Handle escapes *
424 *************************************************/
425
426 /* This function is called when a \ has been encountered. It either returns a
427 positive value for a simple escape such as \n, or a negative value which
428 encodes one of the more complicated things such as \d. A backreference to group
429 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
430 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
431 ptr is pointing at the \. On exit, it is on the final character of the escape
432 sequence.
433
434 Arguments:
435 ptrptr points to the pattern position pointer
436 errorcodeptr points to the errorcode variable
437 bracount number of previous extracting brackets
438 options the options bits
439 isclass TRUE if inside a character class
440
441 Returns: zero or positive => a data character
442 negative => a special escape sequence
443 on error, errorcodeptr is set
444 */
445
446 static int
447 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
448 int options, BOOL isclass)
449 {
450 BOOL utf8 = (options & PCRE_UTF8) != 0;
451 const uschar *ptr = *ptrptr + 1;
452 int c, i;
453
454 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
455 ptr--; /* Set pointer back to the last byte */
456
457 /* If backslash is at the end of the pattern, it's an error. */
458
459 if (c == 0) *errorcodeptr = ERR1;
460
461 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
462 a table. A non-zero result is something that can be returned immediately.
463 Otherwise further processing may be required. */
464
465 #ifndef EBCDIC /* ASCII coding */
466 else if (c < '0' || c > 'z') {} /* Not alphameric */
467 else if ((i = escapes[c - '0']) != 0) c = i;
468
469 #else /* EBCDIC coding */
470 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
471 else if ((i = escapes[c - 0x48]) != 0) c = i;
472 #endif
473
474 /* Escapes that need further processing, or are illegal. */
475
476 else
477 {
478 const uschar *oldptr;
479 BOOL braced, negated;
480
481 switch (c)
482 {
483 /* A number of Perl escapes are not handled by PCRE. We give an explicit
484 error. */
485
486 case 'l':
487 case 'L':
488 case 'N':
489 case 'u':
490 case 'U':
491 *errorcodeptr = ERR37;
492 break;
493
494 /* \g must be followed by a number, either plain or braced. If positive, it
495 is an absolute backreference. If negative, it is a relative backreference.
496 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
497 reference to a named group. This is part of Perl's movement towards a
498 unified syntax for back references. As this is synonymous with \k{name}, we
499 fudge it up by pretending it really was \k. */
500
501 case 'g':
502 if (ptr[1] == '{')
503 {
504 const uschar *p;
505 for (p = ptr+2; *p != 0 && *p != '}'; p++)
506 if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
507 if (*p != 0 && *p != '}')
508 {
509 c = -ESC_k;
510 break;
511 }
512 braced = TRUE;
513 ptr++;
514 }
515 else braced = FALSE;
516
517 if (ptr[1] == '-')
518 {
519 negated = TRUE;
520 ptr++;
521 }
522 else negated = FALSE;
523
524 c = 0;
525 while ((digitab[ptr[1]] & ctype_digit) != 0)
526 c = c * 10 + *(++ptr) - '0';
527
528 if (c < 0)
529 {
530 *errorcodeptr = ERR61;
531 break;
532 }
533
534 if (c == 0 || (braced && *(++ptr) != '}'))
535 {
536 *errorcodeptr = ERR57;
537 break;
538 }
539
540 if (negated)
541 {
542 if (c > bracount)
543 {
544 *errorcodeptr = ERR15;
545 break;
546 }
547 c = bracount - (c - 1);
548 }
549
550 c = -(ESC_REF + c);
551 break;
552
553 /* The handling of escape sequences consisting of a string of digits
554 starting with one that is not zero is not straightforward. By experiment,
555 the way Perl works seems to be as follows:
556
557 Outside a character class, the digits are read as a decimal number. If the
558 number is less than 10, or if there are that many previous extracting
559 left brackets, then it is a back reference. Otherwise, up to three octal
560 digits are read to form an escaped byte. Thus \123 is likely to be octal
561 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
562 value is greater than 377, the least significant 8 bits are taken. Inside a
563 character class, \ followed by a digit is always an octal number. */
564
565 case '1': case '2': case '3': case '4': case '5':
566 case '6': case '7': case '8': case '9':
567
568 if (!isclass)
569 {
570 oldptr = ptr;
571 c -= '0';
572 while ((digitab[ptr[1]] & ctype_digit) != 0)
573 c = c * 10 + *(++ptr) - '0';
574 if (c < 0)
575 {
576 *errorcodeptr = ERR61;
577 break;
578 }
579 if (c < 10 || c <= bracount)
580 {
581 c = -(ESC_REF + c);
582 break;
583 }
584 ptr = oldptr; /* Put the pointer back and fall through */
585 }
586
587 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
588 generates a binary zero byte and treats the digit as a following literal.
589 Thus we have to pull back the pointer by one. */
590
591 if ((c = *ptr) >= '8')
592 {
593 ptr--;
594 c = 0;
595 break;
596 }
597
598 /* \0 always starts an octal number, but we may drop through to here with a
599 larger first octal digit. The original code used just to take the least
600 significant 8 bits of octal numbers (I think this is what early Perls used
601 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
602 than 3 octal digits. */
603
604 case '0':
605 c -= '0';
606 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
607 c = c * 8 + *(++ptr) - '0';
608 if (!utf8 && c > 255) *errorcodeptr = ERR51;
609 break;
610
611 /* \x is complicated. \x{ddd} is a character number which can be greater
612 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
613 treated as a data character. */
614
615 case 'x':
616 if (ptr[1] == '{')
617 {
618 const uschar *pt = ptr + 2;
619 int count = 0;
620
621 c = 0;
622 while ((digitab[*pt] & ctype_xdigit) != 0)
623 {
624 register int cc = *pt++;
625 if (c == 0 && cc == '0') continue; /* Leading zeroes */
626 count++;
627
628 #ifndef EBCDIC /* ASCII coding */
629 if (cc >= 'a') cc -= 32; /* Convert to upper case */
630 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
631 #else /* EBCDIC coding */
632 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
633 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
634 #endif
635 }
636
637 if (*pt == '}')
638 {
639 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
640 ptr = pt;
641 break;
642 }
643
644 /* If the sequence of hex digits does not end with '}', then we don't
645 recognize this construct; fall through to the normal \x handling. */
646 }
647
648 /* Read just a single-byte hex-defined char */
649
650 c = 0;
651 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
652 {
653 int cc; /* Some compilers don't like ++ */
654 cc = *(++ptr); /* in initializers */
655 #ifndef EBCDIC /* ASCII coding */
656 if (cc >= 'a') cc -= 32; /* Convert to upper case */
657 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
658 #else /* EBCDIC coding */
659 if (cc <= 'z') cc += 64; /* Convert to upper case */
660 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
661 #endif
662 }
663 break;
664
665 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
666 This coding is ASCII-specific, but then the whole concept of \cx is
667 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
668
669 case 'c':
670 c = *(++ptr);
671 if (c == 0)
672 {
673 *errorcodeptr = ERR2;
674 break;
675 }
676
677 #ifndef EBCDIC /* ASCII coding */
678 if (c >= 'a' && c <= 'z') c -= 32;
679 c ^= 0x40;
680 #else /* EBCDIC coding */
681 if (c >= 'a' && c <= 'z') c += 64;
682 c ^= 0xC0;
683 #endif
684 break;
685
686 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
687 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
688 for Perl compatibility, it is a literal. This code looks a bit odd, but
689 there used to be some cases other than the default, and there may be again
690 in future, so I haven't "optimized" it. */
691
692 default:
693 if ((options & PCRE_EXTRA) != 0) switch(c)
694 {
695 default:
696 *errorcodeptr = ERR3;
697 break;
698 }
699 break;
700 }
701 }
702
703 *ptrptr = ptr;
704 return c;
705 }
706
707
708
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* Parse the property name that follows \P or \p and look it up. This is only
compiled when Unicode property support is enabled. On entry, *ptrptr points at
the P or p; on exit it points at the last character of the escape sequence (or
at the point where parsing stopped, if there was an error).

The name is either a single character, or a string in braces that may start
with ^ to request negation. A braced name must fit, with its terminating zero,
in a 32-byte buffer; a longer name, or one with no closing brace, is treated
as malformed.

Arguments:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
  dptr           points to an int that is set to the detailed property value
  errorcodeptr   points to the error code variable

Returns:         the type field of the matching _pcre_utt entry, or -1 on
                   error, with errorcodeptr set to ERR46 (malformed sequence)
                   or ERR47 (unknown property name)
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
const uschar *cursor = *ptrptr;
char propname[32];
int ch, len, lo, hi;

ch = *(++cursor);
if (ch == 0) goto ERROR_RETURN;

*negptr = FALSE;

/* The simple case: just one following character is the whole name. */

if (ch != '{')
  {
  propname[0] = ch;
  propname[1] = 0;
  }

/* Otherwise collect a name in {}, optionally preceded by ^ for negation. */

else
  {
  if (cursor[1] == '^')
    {
    *negptr = TRUE;
    cursor++;
    }

  len = 0;
  while (len < (int)sizeof(propname) - 1)
    {
    ch = *(++cursor);
    if (ch == 0) goto ERROR_RETURN;
    if (ch == '}') break;
    propname[len++] = ch;
    }

  /* If the buffer filled up before the closing brace was seen, give up. */

  if (ch != '}') goto ERROR_RETURN;
  propname[len] = 0;
  }

*ptrptr = cursor;

/* Binary chop through the table of recognized property names. */

lo = 0;
hi = _pcre_utt_size;

while (lo < hi)
  {
  int mid = (lo + hi) >> 1;
  int cmp = strcmp(propname, _pcre_utt[mid].name);

  if (cmp > 0) lo = mid + 1;
  else if (cmp < 0) hi = mid;
  else
    {
    *dptr = _pcre_utt[mid].value;
    return _pcre_utt[mid].type;
    }
  }

/* The name was well-formed but is not in the table. */

*errorcodeptr = ERR47;
*ptrptr = cursor;
return -1;

ERROR_RETURN:
*errorcodeptr = ERR46;
*ptrptr = cursor;
return -1;
}
#endif
798
799
800
801
802 /*************************************************
803 * Check for counted repeat *
804 *************************************************/
805
806 /* This function is called when a '{' is encountered in a place where it might
807 start a quantifier. It looks ahead to see if it really is a quantifier or not.
808 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
809 where the ddds are digits.
810
811 Arguments:
812 p pointer to the first char after '{'
813
814 Returns: TRUE or FALSE
815 */
816
817 static BOOL
818 is_counted_repeat(const uschar *p)
819 {
820 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
821 while ((digitab[*p] & ctype_digit) != 0) p++;
822 if (*p == '}') return TRUE;
823
824 if (*p++ != ',') return FALSE;
825 if (*p == '}') return TRUE;
826
827 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
828 while ((digitab[*p] & ctype_digit) != 0) p++;
829
830 return (*p == '}');
831 }
832
833
834
835 /*************************************************
836 * Read repeat counts *
837 *************************************************/
838
839 /* Read an item of the form {n,m} and return the values. This is called only
840 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
841 so the syntax is guaranteed to be correct, but we need to check the values.
842
843 Arguments:
844 p pointer to first char after '{'
845 minp pointer to int for min
846 maxp pointer to int for max
847 returned as -1 if no max
848 errorcodeptr points to error code variable
849
850 Returns: pointer to '}' on success;
851 current ptr on error, with errorcodeptr set non-zero
852 */
853
854 static const uschar *
855 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
856 {
857 int min = 0;
858 int max = -1;
859
860 /* Read the minimum value and do a paranoid check: a negative value indicates
861 an integer overflow. */
862
863 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
864 if (min < 0 || min > 65535)
865 {
866 *errorcodeptr = ERR5;
867 return p;
868 }
869
870 /* Read the maximum value if there is one, and again do a paranoid on its size.
871 Also, max must not be less than min. */
872
873 if (*p == '}') max = min; else
874 {
875 if (*(++p) != '}')
876 {
877 max = 0;
878 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
879 if (max < 0 || max > 65535)
880 {
881 *errorcodeptr = ERR5;
882 return p;
883 }
884 if (max < min)
885 {
886 *errorcodeptr = ERR4;
887 return p;
888 }
889 }
890 }
891
892 /* Fill in the required variables, and pass back the pointer to the terminating
893 '}'. */
894
895 *minp = min;
896 *maxp = max;
897 return p;
898 }
899
900
901
902 /*************************************************
903 * Find forward referenced subpattern *
904 *************************************************/
905
906 /* This function scans along a pattern's text looking for capturing
907 subpatterns, and counting them. If it finds a named pattern that matches the
908 name it is given, it returns its number. Alternatively, if the name is NULL, it
909 returns when it reaches a given numbered subpattern. This is used for forward
910 references to subpatterns. We know that if (?P< is encountered, the name will
911 be terminated by '>' because that is checked in the first pass.
912
913 Arguments:
914 ptr current position in the pattern
915 count current count of capturing parens so far encountered
916 name name to seek, or NULL if seeking a numbered subpattern
917 lorn name length, or subpattern number if name is NULL
918 xmode TRUE if we are in /x mode
919
920 Returns: the number of the named subpattern, or -1 if not found
921 */
922
923 static int
924 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
925 BOOL xmode)
926 {
927 const uschar *thisname;
928
929 for (; *ptr != 0; ptr++)
930 {
931 int term;
932
933 /* Skip over backslashed characters and also entire \Q...\E */
934
935 if (*ptr == '\\')
936 {
937 if (*(++ptr) == 0) return -1;
938 if (*ptr == 'Q') for (;;)
939 {
940 while (*(++ptr) != 0 && *ptr != '\\');
941 if (*ptr == 0) return -1;
942 if (*(++ptr) == 'E') break;
943 }
944 continue;
945 }
946
947 /* Skip over character classes */
948
949 if (*ptr == '[')
950 {
951 while (*(++ptr) != ']')
952 {
953 if (*ptr == 0) return -1;
954 if (*ptr == '\\')
955 {
956 if (*(++ptr) == 0) return -1;
957 if (*ptr == 'Q') for (;;)
958 {
959 while (*(++ptr) != 0 && *ptr != '\\');
960 if (*ptr == 0) return -1;
961 if (*(++ptr) == 'E') break;
962 }
963 continue;
964 }
965 }
966 continue;
967 }
968
969 /* Skip comments in /x mode */
970
971 if (xmode && *ptr == '#')
972 {
973 while (*(++ptr) != 0 && *ptr != '\n');
974 if (*ptr == 0) return -1;
975 continue;
976 }
977
978 /* An opening parens must now be a real metacharacter */
979
980 if (*ptr != '(') continue;
981 if (ptr[1] != '?' && ptr[1] != '*')
982 {
983 count++;
984 if (name == NULL && count == lorn) return count;
985 continue;
986 }
987
988 ptr += 2;
989 if (*ptr == 'P') ptr++; /* Allow optional P */
990
991 /* We have to disambiguate (?<! and (?<= from (?<name> */
992
993 if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
994 *ptr != '\'')
995 continue;
996
997 count++;
998
999 if (name == NULL && count == lorn) return count;
1000 term = *ptr++;
1001 if (term == '<') term = '>';
1002 thisname = ptr;
1003 while (*ptr != term) ptr++;
1004 if (name != NULL && lorn == ptr - thisname &&
1005 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1006 return count;
1007 }
1008
1009 return -1;
1010 }
1011
1012
1013
1014 /*************************************************
1015 * Find first significant op code *
1016 *************************************************/
1017
1018 /* This is called by several functions that scan a compiled expression looking
1019 for a fixed first character, or an anchoring op code etc. It skips over things
1020 that do not influence this. For some calls, a change of option is important.
1021 For some calls, it makes sense to skip negative forward and all backward
1022 assertions, and also the \b assertion; for others it does not.
1023
1024 Arguments:
1025 code pointer to the start of the group
1026 options pointer to external options
1027 optbit the option bit whose changing is significant, or
1028 zero if none are
1029 skipassert TRUE if certain assertions are to be skipped
1030
1031 Returns: pointer to the first significant opcode
1032 */
1033
1034 static const uschar*
1035 first_significant_code(const uschar *code, int *options, int optbit,
1036 BOOL skipassert)
1037 {
1038 for (;;)
1039 {
1040 switch ((int)*code)
1041 {
1042 case OP_OPT:
1043 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1044 *options = (int)code[1];
1045 code += 2;
1046 break;
1047
1048 case OP_ASSERT_NOT:
1049 case OP_ASSERTBACK:
1050 case OP_ASSERTBACK_NOT:
1051 if (!skipassert) return code;
1052 do code += GET(code, 1); while (*code == OP_ALT);
1053 code += _pcre_OP_lengths[*code];
1054 break;
1055
1056 case OP_WORD_BOUNDARY:
1057 case OP_NOT_WORD_BOUNDARY:
1058 if (!skipassert) return code;
1059 /* Fall through */
1060
1061 case OP_CALLOUT:
1062 case OP_CREF:
1063 case OP_RREF:
1064 case OP_DEF:
1065 code += _pcre_OP_lengths[*code];
1066 break;
1067
1068 default:
1069 return code;
1070 }
1071 }
1072 /* Control never reaches here */
1073 }
1074
1075
1076
1077
1078 /*************************************************
1079 * Find the fixed length of a pattern *
1080 *************************************************/
1081
1082 /* Scan a pattern and compute the fixed length of subject that will match it,
1083 if the length is fixed. This is needed for dealing with backward assertions.
1084 In UTF8 mode, the result is in characters rather than bytes.
1085
1086 Arguments:
1087 code points to the start of the pattern (the bracket)
1088 options the compiling options
1089
1090 Returns: the fixed length, or -1 if there is no fixed length,
1091 or -2 if \C was encountered
1092 */
1093
1094 static int
1095 find_fixedlength(uschar *code, int options)
1096 {
1097 int length = -1;
1098
1099 register int branchlength = 0;
1100 register uschar *cc = code + 1 + LINK_SIZE;
1101
1102 /* Scan along the opcodes for this branch. If we get to the end of the
1103 branch, check the length against that of the other branches. */
1104
1105 for (;;)
1106 {
1107 int d;
1108 register int op = *cc;
1109
1110 switch (op)
1111 {
1112 case OP_CBRA:
1113 case OP_BRA:
1114 case OP_ONCE:
1115 case OP_COND:
1116 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1117 if (d < 0) return d;
1118 branchlength += d;
1119 do cc += GET(cc, 1); while (*cc == OP_ALT);
1120 cc += 1 + LINK_SIZE;
1121 break;
1122
1123 /* Reached end of a branch; if it's a ket it is the end of a nested
1124 call. If it's ALT it is an alternation in a nested call. If it is
1125 END it's the end of the outer call. All can be handled by the same code. */
1126
1127 case OP_ALT:
1128 case OP_KET:
1129 case OP_KETRMAX:
1130 case OP_KETRMIN:
1131 case OP_END:
1132 if (length < 0) length = branchlength;
1133 else if (length != branchlength) return -1;
1134 if (*cc != OP_ALT) return length;
1135 cc += 1 + LINK_SIZE;
1136 branchlength = 0;
1137 break;
1138
1139 /* Skip over assertive subpatterns */
1140
1141 case OP_ASSERT:
1142 case OP_ASSERT_NOT:
1143 case OP_ASSERTBACK:
1144 case OP_ASSERTBACK_NOT:
1145 do cc += GET(cc, 1); while (*cc == OP_ALT);
1146 /* Fall through */
1147
1148 /* Skip over things that don't match chars */
1149
1150 case OP_REVERSE:
1151 case OP_CREF:
1152 case OP_RREF:
1153 case OP_DEF:
1154 case OP_OPT:
1155 case OP_CALLOUT:
1156 case OP_SOD:
1157 case OP_SOM:
1158 case OP_EOD:
1159 case OP_EODN:
1160 case OP_CIRC:
1161 case OP_DOLL:
1162 case OP_NOT_WORD_BOUNDARY:
1163 case OP_WORD_BOUNDARY:
1164 cc += _pcre_OP_lengths[*cc];
1165 break;
1166
1167 /* Handle literal characters */
1168
1169 case OP_CHAR:
1170 case OP_CHARNC:
1171 case OP_NOT:
1172 branchlength++;
1173 cc += 2;
1174 #ifdef SUPPORT_UTF8
1175 if ((options & PCRE_UTF8) != 0)
1176 {
1177 while ((*cc & 0xc0) == 0x80) cc++;
1178 }
1179 #endif
1180 break;
1181
1182 /* Handle exact repetitions. The count is already in characters, but we
1183 need to skip over a multibyte character in UTF8 mode. */
1184
1185 case OP_EXACT:
1186 branchlength += GET2(cc,1);
1187 cc += 4;
1188 #ifdef SUPPORT_UTF8
1189 if ((options & PCRE_UTF8) != 0)
1190 {
1191 while((*cc & 0x80) == 0x80) cc++;
1192 }
1193 #endif
1194 break;
1195
1196 case OP_TYPEEXACT:
1197 branchlength += GET2(cc,1);
1198 cc += 4;
1199 break;
1200
1201 /* Handle single-char matchers */
1202
1203 case OP_PROP:
1204 case OP_NOTPROP:
1205 cc += 2;
1206 /* Fall through */
1207
1208 case OP_NOT_DIGIT:
1209 case OP_DIGIT:
1210 case OP_NOT_WHITESPACE:
1211 case OP_WHITESPACE:
1212 case OP_NOT_WORDCHAR:
1213 case OP_WORDCHAR:
1214 case OP_ANY:
1215 branchlength++;
1216 cc++;
1217 break;
1218
1219 /* The single-byte matcher isn't allowed */
1220
1221 case OP_ANYBYTE:
1222 return -2;
1223
1224 /* Check a class for variable quantification */
1225
1226 #ifdef SUPPORT_UTF8
1227 case OP_XCLASS:
1228 cc += GET(cc, 1) - 33;
1229 /* Fall through */
1230 #endif
1231
1232 case OP_CLASS:
1233 case OP_NCLASS:
1234 cc += 33;
1235
1236 switch (*cc)
1237 {
1238 case OP_CRSTAR:
1239 case OP_CRMINSTAR:
1240 case OP_CRQUERY:
1241 case OP_CRMINQUERY:
1242 return -1;
1243
1244 case OP_CRRANGE:
1245 case OP_CRMINRANGE:
1246 if (GET2(cc,1) != GET2(cc,3)) return -1;
1247 branchlength += GET2(cc,1);
1248 cc += 5;
1249 break;
1250
1251 default:
1252 branchlength++;
1253 }
1254 break;
1255
1256 /* Anything else is variable length */
1257
1258 default:
1259 return -1;
1260 }
1261 }
1262 /* Control never gets here */
1263 }
1264
1265
1266
1267
1268 /*************************************************
1269 * Scan compiled regex for numbered bracket *
1270 *************************************************/
1271
1272 /* This little function scans through a compiled pattern until it finds a
1273 capturing bracket with the given number.
1274
1275 Arguments:
1276 code points to start of expression
1277 utf8 TRUE in UTF-8 mode
1278 number the required bracket number
1279
1280 Returns: pointer to the opcode for the bracket, or NULL if not found
1281 */
1282
1283 static const uschar *
1284 find_bracket(const uschar *code, BOOL utf8, int number)
1285 {
1286 for (;;)
1287 {
1288 register int c = *code;
1289 if (c == OP_END) return NULL;
1290
1291 /* XCLASS is used for classes that cannot be represented just by a bit
1292 map. This includes negated single high-valued characters. The length in
1293 the table is zero; the actual length is stored in the compiled code. */
1294
1295 if (c == OP_XCLASS) code += GET(code, 1);
1296
1297 /* Handle capturing bracket */
1298
1299 else if (c == OP_CBRA)
1300 {
1301 int n = GET2(code, 1+LINK_SIZE);
1302 if (n == number) return (uschar *)code;
1303 code += _pcre_OP_lengths[c];
1304 }
1305
1306 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1307 a multi-byte character. The length in the table is a minimum, so we have to
1308 arrange to skip the extra bytes. */
1309
1310 else
1311 {
1312 code += _pcre_OP_lengths[c];
1313 #ifdef SUPPORT_UTF8
1314 if (utf8) switch(c)
1315 {
1316 case OP_CHAR:
1317 case OP_CHARNC:
1318 case OP_EXACT:
1319 case OP_UPTO:
1320 case OP_MINUPTO:
1321 case OP_POSUPTO:
1322 case OP_STAR:
1323 case OP_MINSTAR:
1324 case OP_POSSTAR:
1325 case OP_PLUS:
1326 case OP_MINPLUS:
1327 case OP_POSPLUS:
1328 case OP_QUERY:
1329 case OP_MINQUERY:
1330 case OP_POSQUERY:
1331 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1332 break;
1333 }
1334 #endif
1335 }
1336 }
1337 }
1338
1339
1340
1341 /*************************************************
1342 * Scan compiled regex for recursion reference *
1343 *************************************************/
1344
1345 /* This little function scans through a compiled pattern until it finds an
1346 instance of OP_RECURSE.
1347
1348 Arguments:
1349 code points to start of expression
1350 utf8 TRUE in UTF-8 mode
1351
1352 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1353 */
1354
1355 static const uschar *
1356 find_recurse(const uschar *code, BOOL utf8)
1357 {
1358 for (;;)
1359 {
1360 register int c = *code;
1361 if (c == OP_END) return NULL;
1362 if (c == OP_RECURSE) return code;
1363
1364 /* XCLASS is used for classes that cannot be represented just by a bit
1365 map. This includes negated single high-valued characters. The length in
1366 the table is zero; the actual length is stored in the compiled code. */
1367
1368 if (c == OP_XCLASS) code += GET(code, 1);
1369
1370 /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1371 that are followed by a character may be followed by a multi-byte character.
1372 The length in the table is a minimum, so we have to arrange to skip the extra
1373 bytes. */
1374
1375 else
1376 {
1377 code += _pcre_OP_lengths[c];
1378 #ifdef SUPPORT_UTF8
1379 if (utf8) switch(c)
1380 {
1381 case OP_CHAR:
1382 case OP_CHARNC:
1383 case OP_EXACT:
1384 case OP_UPTO:
1385 case OP_MINUPTO:
1386 case OP_POSUPTO:
1387 case OP_STAR:
1388 case OP_MINSTAR:
1389 case OP_POSSTAR:
1390 case OP_PLUS:
1391 case OP_MINPLUS:
1392 case OP_POSPLUS:
1393 case OP_QUERY:
1394 case OP_MINQUERY:
1395 case OP_POSQUERY:
1396 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1397 break;
1398 }
1399 #endif
1400 }
1401 }
1402 }
1403
1404
1405
1406 /*************************************************
1407 * Scan compiled branch for non-emptiness *
1408 *************************************************/
1409
1410 /* This function scans through a branch of a compiled pattern to see whether it
1411 can match the empty string or not. It is called from could_be_empty()
1412 below and from compile_branch() when checking for an unlimited repeat of a
1413 group that can match nothing. Note that first_significant_code() skips over
1414 assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1415 struck an inner bracket whose current branch will already have been scanned.
1416
1417 Arguments:
1418 code points to start of search
1419 endcode points to where to stop
1420 utf8 TRUE if in UTF8 mode
1421
1422 Returns: TRUE if what is matched could be empty
1423 */
1424
1425 static BOOL
1426 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1427 {
1428 register int c;
1429 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1430 code < endcode;
1431 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1432 {
1433 const uschar *ccode;
1434
1435 c = *code;
1436
1437 /* Groups with zero repeats can of course be empty; skip them. */
1438
1439 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1440 {
1441 code += _pcre_OP_lengths[c];
1442 do code += GET(code, 1); while (*code == OP_ALT);
1443 c = *code;
1444 continue;
1445 }
1446
1447 /* For other groups, scan the branches. */
1448
1449 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1450 {
1451 BOOL empty_branch;
1452 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1453
1454 /* Scan a closed bracket */
1455
1456 empty_branch = FALSE;
1457 do
1458 {
1459 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1460 empty_branch = TRUE;
1461 code += GET(code, 1);
1462 }
1463 while (*code == OP_ALT);
1464 if (!empty_branch) return FALSE; /* All branches are non-empty */
1465 c = *code;
1466 continue;
1467 }
1468
1469 /* Handle the other opcodes */
1470
1471 switch (c)
1472 {
1473 /* Check for quantifiers after a class. XCLASS is used for classes that
1474 cannot be represented just by a bit map. This includes negated single
1475 high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1476 actual length is stored in the compiled code, so we must update "code"
1477 here. */
1478
1479 #ifdef SUPPORT_UTF8
1480 case OP_XCLASS:
1481 ccode = code += GET(code, 1);
1482 goto CHECK_CLASS_REPEAT;
1483 #endif
1484
1485 case OP_CLASS:
1486 case OP_NCLASS:
1487 ccode = code + 33;
1488
1489 #ifdef SUPPORT_UTF8
1490 CHECK_CLASS_REPEAT:
1491 #endif
1492
1493 switch (*ccode)
1494 {
1495 case OP_CRSTAR: /* These could be empty; continue */
1496 case OP_CRMINSTAR:
1497 case OP_CRQUERY:
1498 case OP_CRMINQUERY:
1499 break;
1500
1501 default: /* Non-repeat => class must match */
1502 case OP_CRPLUS: /* These repeats aren't empty */
1503 case OP_CRMINPLUS:
1504 return FALSE;
1505
1506 case OP_CRRANGE:
1507 case OP_CRMINRANGE:
1508 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1509 break;
1510 }
1511 break;
1512
1513 /* Opcodes that must match a character */
1514
1515 case OP_PROP:
1516 case OP_NOTPROP:
1517 case OP_EXTUNI:
1518 case OP_NOT_DIGIT:
1519 case OP_DIGIT:
1520 case OP_NOT_WHITESPACE:
1521 case OP_WHITESPACE:
1522 case OP_NOT_WORDCHAR:
1523 case OP_WORDCHAR:
1524 case OP_ANY:
1525 case OP_ANYBYTE:
1526 case OP_CHAR:
1527 case OP_CHARNC:
1528 case OP_NOT:
1529 case OP_PLUS:
1530 case OP_MINPLUS:
1531 case OP_POSPLUS:
1532 case OP_EXACT:
1533 case OP_NOTPLUS:
1534 case OP_NOTMINPLUS:
1535 case OP_NOTPOSPLUS:
1536 case OP_NOTEXACT:
1537 case OP_TYPEPLUS:
1538 case OP_TYPEMINPLUS:
1539 case OP_TYPEPOSPLUS:
1540 case OP_TYPEEXACT:
1541 return FALSE;
1542
1543 /* End of branch */
1544
1545 case OP_KET:
1546 case OP_KETRMAX:
1547 case OP_KETRMIN:
1548 case OP_ALT:
1549 return TRUE;
1550
1551 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1552 MINUPTO, and POSUPTO may be followed by a multibyte character */
1553
1554 #ifdef SUPPORT_UTF8
1555 case OP_STAR:
1556 case OP_MINSTAR:
1557 case OP_POSSTAR:
1558 case OP_QUERY:
1559 case OP_MINQUERY:
1560 case OP_POSQUERY:
1561 case OP_UPTO:
1562 case OP_MINUPTO:
1563 case OP_POSUPTO:
1564 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1565 break;
1566 #endif
1567 }
1568 }
1569
1570 return TRUE;
1571 }
1572
1573
1574
1575 /*************************************************
1576 * Scan compiled regex for non-emptiness *
1577 *************************************************/
1578
1579 /* This function is called to check for left recursive calls. We want to check
1580 the current branch of the current pattern to see if it could match the empty
1581 string. If it could, we must look outwards for branches at other levels,
1582 stopping when we pass beyond the bracket which is the subject of the recursion.
1583
1584 Arguments:
1585 code points to start of the recursion
1586 endcode points to where to stop (current RECURSE item)
1587 bcptr points to the chain of current (unclosed) branch starts
1588 utf8 TRUE if in UTF-8 mode
1589
1590 Returns: TRUE if what is matched could be empty
1591 */
1592
1593 static BOOL
1594 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1595 BOOL utf8)
1596 {
1597 while (bcptr != NULL && bcptr->current >= code)
1598 {
1599 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1600 bcptr = bcptr->outer;
1601 }
1602 return TRUE;
1603 }
1604
1605
1606
1607 /*************************************************
1608 * Check for POSIX class syntax *
1609 *************************************************/
1610
1611 /* This function is called when the sequence "[:" or "[." or "[=" is
1612 encountered in a character class. It checks whether this is followed by an
1613 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1614 ".]" or "=]".
1615
1616 Argument:
1617 ptr pointer to the initial [
1618 endptr where to return the end pointer
1619 cd pointer to compile data
1620
1621 Returns: TRUE or FALSE
1622 */
1623
1624 static BOOL
1625 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1626 {
1627 int terminator; /* Don't combine these lines; the Solaris cc */
1628 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1629 if (*(++ptr) == '^') ptr++;
1630 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1631 if (*ptr == terminator && ptr[1] == ']')
1632 {
1633 *endptr = ptr;
1634 return TRUE;
1635 }
1636 return FALSE;
1637 }
1638
1639
1640
1641
1642 /*************************************************
1643 * Check POSIX class name *
1644 *************************************************/
1645
1646 /* This function is called to check the name given in a POSIX-style class entry
1647 such as [:alnum:].
1648
1649 Arguments:
1650 ptr points to the first letter
1651 len the length of the name
1652
1653 Returns: a value representing the name, or -1 if unknown
1654 */
1655
1656 static int
1657 check_posix_name(const uschar *ptr, int len)
1658 {
1659 register int yield = 0;
1660 while (posix_name_lengths[yield] != 0)
1661 {
1662 if (len == posix_name_lengths[yield] &&
1663 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1664 yield++;
1665 }
1666 return -1;
1667 }
1668
1669
1670 /*************************************************
1671 * Adjust OP_RECURSE items in repeated group *
1672 *************************************************/
1673
1674 /* OP_RECURSE items contain an offset from the start of the regex to the group
1675 that is referenced. This means that groups can be replicated for fixed
1676 repetition simply by copying (because the recursion is allowed to refer to
1677 earlier groups that are outside the current group). However, when a group is
1678 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1679 it, after it has been compiled. This means that any OP_RECURSE items within it
1680 that refer to the group itself or any contained groups have to have their
1681 offsets adjusted. That is one of the jobs of this function. Before it is called,
1682 the partially compiled regex must be temporarily terminated with OP_END.
1683
1684 This function has been extended with the possibility of forward references for
1685 recursions and subroutine calls. It must also check the list of such references
1686 for the group we are dealing with. If it finds that one of the recursions in
1687 the current group is on this list, it adjusts the offset in the list, not the
1688 value in the reference (which is a group number).
1689
1690 Arguments:
1691 group points to the start of the group
1692 adjust the amount by which the group is to be moved
1693 utf8 TRUE in UTF-8 mode
1694 cd contains pointers to tables etc.
1695 save_hwm the hwm forward reference pointer at the start of the group
1696
1697 Returns: nothing
1698 */
1699
1700 static void
1701 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1702 uschar *save_hwm)
1703 {
1704 uschar *ptr = group;
1705 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1706 {
1707 int offset;
1708 uschar *hc;
1709
1710 /* See if this recursion is on the forward reference list. If so, adjust the
1711 reference. */
1712
1713 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1714 {
1715 offset = GET(hc, 0);
1716 if (cd->start_code + offset == ptr + 1)
1717 {
1718 PUT(hc, 0, offset + adjust);
1719 break;
1720 }
1721 }
1722
1723 /* Otherwise, adjust the recursion offset if it's after the start of this
1724 group. */
1725
1726 if (hc >= cd->hwm)
1727 {
1728 offset = GET(ptr, 1);
1729 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1730 }
1731
1732 ptr += 1 + LINK_SIZE;
1733 }
1734 }
1735
1736
1737
1738 /*************************************************
1739 * Insert an automatic callout point *
1740 *************************************************/
1741
1742 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1743 callout points before each pattern item.
1744
1745 Arguments:
1746 code current code pointer
1747 ptr current pattern pointer
1748 cd pointers to tables etc
1749
1750 Returns: new code pointer
1751 */
1752
1753 static uschar *
1754 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1755 {
1756 *code++ = OP_CALLOUT;
1757 *code++ = 255;
1758 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1759 PUT(code, LINK_SIZE, 0); /* Default length */
1760 return code + 2*LINK_SIZE;
1761 }
1762
1763
1764
1765 /*************************************************
1766 * Complete a callout item *
1767 *************************************************/
1768
1769 /* A callout item contains the length of the next item in the pattern, which
1770 we can't fill in till after we have reached the relevant point. This is used
1771 for both automatic and manual callouts.
1772
1773 Arguments:
1774 previous_callout points to previous callout item
1775 ptr current pattern pointer
1776 cd pointers to tables etc
1777
1778 Returns: nothing
1779 */
1780
1781 static void
1782 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1783 {
1784 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1785 PUT(previous_callout, 2 + LINK_SIZE, length);
1786 }
1787
1788
1789
1790 #ifdef SUPPORT_UCP
1791 /*************************************************
1792 * Get othercase range *
1793 *************************************************/
1794
1795 /* This function is passed the start and end of a class range, in UTF-8 mode
1796 with UCP support. It searches up the characters, looking for internal ranges of
1797 characters in the "other" case. Each call returns the next one, updating the
1798 start address.
1799
1800 Arguments:
1801 cptr points to starting character value; updated
1802 d end value
1803 ocptr where to put start of othercase range
1804 odptr where to put end of othercase range
1805
1806 Yield: TRUE when range returned; FALSE when no more
1807 */
1808
1809 static BOOL
1810 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1811 unsigned int *odptr)
1812 {
1813 unsigned int c, othercase, next;
1814
1815 for (c = *cptr; c <= d; c++)
1816 { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1817
1818 if (c > d) return FALSE;
1819
1820 *ocptr = othercase;
1821 next = othercase + 1;
1822
1823 for (++c; c <= d; c++)
1824 {
1825 if (_pcre_ucp_othercase(c) != next) break;
1826 next++;
1827 }
1828
1829 *odptr = next - 1;
1830 *cptr = c;
1831
1832 return TRUE;
1833 }
1834 #endif /* SUPPORT_UCP */
1835
1836
1837
1838 /*************************************************
1839 * Check if auto-possessifying is possible *
1840 *************************************************/
1841
1842 /* This function is called for unlimited repeats of certain items, to see
1843 whether the next thing could possibly match the repeated item. If not, it makes
1844 sense to automatically possessify the repeated item.
1845
1846 Arguments:
1847 op_code the repeated op code
1848 this data for this item, depends on the opcode
1849 utf8 TRUE in UTF-8 mode
1850 utf8_char used for utf8 character bytes, NULL if not relevant
1851 ptr next character in pattern
1852 options options bits
1853 cd contains pointers to tables etc.
1854
1855 Returns: TRUE if possessifying is wanted
1856 */
1857
1858 static BOOL
1859 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1860 const uschar *ptr, int options, compile_data *cd)
1861 {
1862 int next;
1863
1864 /* Skip whitespace and comments in extended mode */
1865
1866 if ((options & PCRE_EXTENDED) != 0)
1867 {
1868 for (;;)
1869 {
1870 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1871 if (*ptr == '#')
1872 {
1873 while (*(++ptr) != 0)
1874 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1875 }
1876 else break;
1877 }
1878 }
1879
1880 /* If the next item is one that we can handle, get its value. A non-negative
1881 value is a character, a negative value is an escape value. */
1882
1883 if (*ptr == '\\')
1884 {
1885 int temperrorcode = 0;
1886 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1887 if (temperrorcode != 0) return FALSE;
1888 ptr++; /* Point after the escape sequence */
1889 }
1890
1891 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1892 {
1893 #ifdef SUPPORT_UTF8
1894 if (utf8) { GETCHARINC(next, ptr); } else
1895 #endif
1896 next = *ptr++;
1897 }
1898
1899 else return FALSE;
1900
1901 /* Skip whitespace and comments in extended mode */
1902
1903 if ((options & PCRE_EXTENDED) != 0)
1904 {
1905 for (;;)
1906 {
1907 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1908 if (*ptr == '#')
1909 {
1910 while (*(++ptr) != 0)
1911 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1912 }
1913 else break;
1914 }
1915 }
1916
1917 /* If the next thing is itself optional, we have to give up. */
1918
1919 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1920 return FALSE;
1921
1922 /* Now compare the next item with the previous opcode. If the previous is a
1923 positive single character match, "item" either contains the character or, if
1924 "item" is greater than 127 in utf8 mode, the character's bytes are in
1925 utf8_char. */
1926
1927
1928 /* Handle cases when the next item is a character. */
1929
1930 if (next >= 0) switch(op_code)
1931 {
1932 case OP_CHAR:
1933 #ifdef SUPPORT_UTF8
1934 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1935 #endif
1936 return item != next;
1937
1938 /* For CHARNC (caseless character) we must check the other case. If we have
1939 Unicode property support, we can use it to test the other case of
1940 high-valued characters. */
1941
1942 case OP_CHARNC:
1943 #ifdef SUPPORT_UTF8
1944 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1945 #endif
1946 if (item == next) return FALSE;
1947 #ifdef SUPPORT_UTF8
1948 if (utf8)
1949 {
1950 unsigned int othercase;
1951 if (next < 128) othercase = cd->fcc[next]; else
1952 #ifdef SUPPORT_UCP
1953 othercase = _pcre_ucp_othercase((unsigned int)next);
1954 #else
1955 othercase = NOTACHAR;
1956 #endif
1957 return (unsigned int)item != othercase;
1958 }
1959 else
1960 #endif /* SUPPORT_UTF8 */
1961 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
1962
1963 /* For OP_NOT, "item" must be a single-byte character. */
1964
1965 case OP_NOT:
1966 if (next < 0) return FALSE; /* Not a character */
1967 if (item == next) return TRUE;
1968 if ((options & PCRE_CASELESS) == 0) return FALSE;
1969 #ifdef SUPPORT_UTF8
1970 if (utf8)
1971 {
1972 unsigned int othercase;
1973 if (next < 128) othercase = cd->fcc[next]; else
1974 #ifdef SUPPORT_UCP
1975 othercase = _pcre_ucp_othercase(next);
1976 #else
1977 othercase = NOTACHAR;
1978 #endif
1979 return (unsigned int)item == othercase;
1980 }
1981 else
1982 #endif /* SUPPORT_UTF8 */
1983 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
1984
1985 case OP_DIGIT:
1986 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1987
1988 case OP_NOT_DIGIT:
1989 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1990
1991 case OP_WHITESPACE:
1992 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1993
1994 case OP_NOT_WHITESPACE:
1995 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1996
1997 case OP_WORDCHAR:
1998 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1999
2000 case OP_NOT_WORDCHAR:
2001 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2002
2003 case OP_HSPACE:
2004 case OP_NOT_HSPACE:
2005 switch(next)
2006 {
2007 case 0x09:
2008 case 0x20:
2009 case 0xa0:
2010 case 0x1680:
2011 case 0x180e:
2012 case 0x2000:
2013 case 0x2001:
2014 case 0x2002:
2015 case 0x2003:
2016 case 0x2004:
2017 case 0x2005:
2018 case 0x2006:
2019 case 0x2007:
2020 case 0x2008:
2021 case 0x2009:
2022 case 0x200A:
2023 case 0x202f:
2024 case 0x205f:
2025 case 0x3000:
2026 return op_code != OP_HSPACE;
2027 default:
2028 return op_code == OP_HSPACE;
2029 }
2030
2031 case OP_VSPACE:
2032 case OP_NOT_VSPACE:
2033 switch(next)
2034 {
2035 case 0x0a:
2036 case 0x0b:
2037 case 0x0c:
2038 case 0x0d:
2039 case 0x85:
2040 case 0x2028:
2041 case 0x2029:
2042 return op_code != OP_VSPACE;
2043 default:
2044 return op_code == OP_VSPACE;
2045 }
2046
2047 default:
2048 return FALSE;
2049 }
2050
2051
2052 /* Handle the case when the next item is \d, \s, etc. */
2053
2054 switch(op_code)
2055 {
2056 case OP_CHAR:
2057 case OP_CHARNC:
2058 #ifdef SUPPORT_UTF8
2059 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2060 #endif
2061 switch(-next)
2062 {
2063 case ESC_d:
2064 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2065
2066 case ESC_D:
2067 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2068
2069 case ESC_s:
2070 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2071
2072 case ESC_S:
2073 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2074
2075 case ESC_w:
2076 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2077
2078 case ESC_W:
2079 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2080
2081 case ESC_h:
2082 case ESC_H:
2083 switch(item)
2084 {
2085 case 0x09:
2086 case 0x20:
2087 case 0xa0:
2088 case 0x1680:
2089 case 0x180e:
2090 case 0x2000:
2091 case 0x2001:
2092 case 0x2002:
2093 case 0x2003:
2094 case 0x2004:
2095 case 0x2005:
2096 case 0x2006:
2097 case 0x2007:
2098 case 0x2008:
2099 case 0x2009:
2100 case 0x200A:
2101 case 0x202f:
2102 case 0x205f:
2103 case 0x3000:
2104 return -next != ESC_h;
2105 default:
2106 return -next == ESC_h;
2107 }
2108
2109 case ESC_v:
2110 case ESC_V:
2111 switch(item)
2112 {
2113 case 0x0a:
2114 case 0x0b:
2115 case 0x0c:
2116 case 0x0d:
2117 case 0x85:
2118 case 0x2028:
2119 case 0x2029:
2120 return -next != ESC_v;
2121 default:
2122 return -next == ESC_v;
2123 }
2124
2125 default:
2126 return FALSE;
2127 }
2128
2129 case OP_DIGIT:
2130 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2131 next == -ESC_h || next == -ESC_v;
2132
2133 case OP_NOT_DIGIT:
2134 return next == -ESC_d;
2135
2136 case OP_WHITESPACE:
2137 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2138
2139 case OP_NOT_WHITESPACE:
2140 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2141
2142 case OP_HSPACE:
2143 return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2144
2145 case OP_NOT_HSPACE:
2146 return next == -ESC_h;
2147
2148 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2149 case OP_VSPACE:
2150 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2151
2152 case OP_NOT_VSPACE:
2153 return next == -ESC_v;
2154
2155 case OP_WORDCHAR:
2156 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2157
2158 case OP_NOT_WORDCHAR:
2159 return next == -ESC_w || next == -ESC_d;
2160
2161 default:
2162 return FALSE;
2163 }
2164
2165 /* Control does not reach here */
2166 }
2167
2168
2169
2170 /*************************************************
2171 * Compile one branch *
2172 *************************************************/
2173
2174 /* Scan the pattern, compiling it into a vector. If the options are
2175 changed during the branch, the pointer is used to change the external options
2176 bits. This function is used during the pre-compile phase when we are trying
2177 to find out the amount of memory needed, as well as during the real compile
2178 phase. The value of lengthptr distinguishes the two phases.
2179
2180 Arguments:
2181 optionsptr pointer to the option bits
2182 codeptr points to the pointer to the current code point
2183 ptrptr points to the current pattern pointer
2184 errorcodeptr points to error code variable
2185 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2186 reqbyteptr set to the last literal character required, else < 0
2187 bcptr points to current branch chain
2188 cd contains pointers to tables etc.
2189 lengthptr NULL during the real compile phase
2190 points to length accumulator during pre-compile phase
2191
2192 Returns: TRUE on success
2193 FALSE, with *errorcodeptr set non-zero on error
2194 */
2195
2196 static BOOL
2197 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2198 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2199 compile_data *cd, int *lengthptr)
2200 {
2201 int repeat_type, op_type;
2202 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2203 int bravalue = 0;
2204 int greedy_default, greedy_non_default;
2205 int firstbyte, reqbyte;
2206 int zeroreqbyte, zerofirstbyte;
2207 int req_caseopt, reqvary, tempreqvary;
2208 int options = *optionsptr;
2209 int after_manual_callout = 0;
2210 int length_prevgroup = 0;
2211 register int c;
2212 register uschar *code = *codeptr;
2213 uschar *last_code = code;
2214 uschar *orig_code = code;
2215 uschar *tempcode;
2216 BOOL inescq = FALSE;
2217 BOOL groupsetfirstbyte = FALSE;
2218 const uschar *ptr = *ptrptr;
2219 const uschar *tempptr;
2220 uschar *previous = NULL;
2221 uschar *previous_callout = NULL;
2222 uschar *save_hwm = NULL;
2223 uschar classbits[32];
2224
2225 #ifdef SUPPORT_UTF8
2226 BOOL class_utf8;
2227 BOOL utf8 = (options & PCRE_UTF8) != 0;
2228 uschar *class_utf8data;
2229 uschar utf8_char[6];
2230 #else
2231 BOOL utf8 = FALSE;
2232 uschar *utf8_char = NULL;
2233 #endif
2234
2235 #ifdef DEBUG
2236 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2237 #endif
2238
2239 /* Set up the default and non-default settings for greediness */
2240
2241 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2242 greedy_non_default = greedy_default ^ 1;
2243
2244 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2245 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2246 matches a non-fixed first char; reqbyte just remains unset if we never
2247 find one.
2248
2249 When we hit a repeat whose minimum is zero, we may have to adjust these values
2250 to take the zero repeat into account. This is implemented by setting them to
2251 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2252 item types that can be repeated set these backoff variables appropriately. */
2253
2254 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2255
2256 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2257 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2258 value > 255. It is added into the firstbyte or reqbyte variables to record the
2259 case status of the value. This is used only for ASCII characters. */
2260
2261 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2262
2263 /* Switch on next character until the end of the branch */
2264
2265 for (;; ptr++)
2266 {
2267 BOOL negate_class;
2268 BOOL possessive_quantifier;
2269 BOOL is_quantifier;
2270 BOOL is_recurse;
2271 BOOL reset_bracount;
2272 int class_charcount;
2273 int class_lastchar;
2274 int newoptions;
2275 int recno;
2276 int refsign;
2277 int skipbytes;
2278 int subreqbyte;
2279 int subfirstbyte;
2280 int terminator;
2281 int mclength;
2282 uschar mcbuffer[8];
2283
2284 /* Get next byte in the pattern */
2285
2286 c = *ptr;
2287
2288 /* If we are in the pre-compile phase, accumulate the length used for the
2289 previous cycle of this loop. */
2290
2291 if (lengthptr != NULL)
2292 {
2293 #ifdef DEBUG
2294 if (code > cd->hwm) cd->hwm = code; /* High water info */
2295 #endif
2296 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2297 {
2298 *errorcodeptr = ERR52;
2299 goto FAILED;
2300 }
2301
2302 /* There is at least one situation where code goes backwards: this is the
2303 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2304 the class is simply eliminated. However, it is created first, so we have to
2305 allow memory for it. Therefore, don't ever reduce the length at this point.
2306 */
2307
2308 if (code < last_code) code = last_code;
2309
2310 /* Paranoid check for integer overflow */
2311
2312 if (OFLOW_MAX - *lengthptr < code - last_code)
2313 {
2314 *errorcodeptr = ERR20;
2315 goto FAILED;
2316 }
2317
2318 *lengthptr += code - last_code;
2319 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2320
2321 /* If "previous" is set and it is not at the start of the work space, move
2322 it back to there, in order to avoid filling up the work space. Otherwise,
2323 if "previous" is NULL, reset the current code pointer to the start. */
2324
2325 if (previous != NULL)
2326 {
2327 if (previous > orig_code)
2328 {
2329 memmove(orig_code, previous, code - previous);
2330 code -= previous - orig_code;
2331 previous = orig_code;
2332 }
2333 }
2334 else code = orig_code;
2335
2336 /* Remember where this code item starts so we can pick up the length
2337 next time round. */
2338
2339 last_code = code;
2340 }
2341
2342 /* In the real compile phase, just check the workspace used by the forward
2343 reference list. */
2344
2345 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2346 {
2347 *errorcodeptr = ERR52;
2348 goto FAILED;
2349 }
2350
2351 /* If in \Q...\E, check for the end; if not, we have a literal */
2352
2353 if (inescq && c != 0)
2354 {
2355 if (c == '\\' && ptr[1] == 'E')
2356 {
2357 inescq = FALSE;
2358 ptr++;
2359 continue;
2360 }
2361 else
2362 {
2363 if (previous_callout != NULL)
2364 {
2365 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2366 complete_callout(previous_callout, ptr, cd);
2367 previous_callout = NULL;
2368 }
2369 if ((options & PCRE_AUTO_CALLOUT) != 0)
2370 {
2371 previous_callout = code;
2372 code = auto_callout(code, ptr, cd);
2373 }
2374 goto NORMAL_CHAR;
2375 }
2376 }
2377
2378 /* Fill in length of a previous callout, except when the next thing is
2379 a quantifier. */
2380
2381 is_quantifier = c == '*' || c == '+' || c == '?' ||
2382 (c == '{' && is_counted_repeat(ptr+1));
2383
2384 if (!is_quantifier && previous_callout != NULL &&
2385 after_manual_callout-- <= 0)
2386 {
2387 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2388 complete_callout(previous_callout, ptr, cd);
2389 previous_callout = NULL;
2390 }
2391
2392 /* In extended mode, skip white space and comments */
2393
2394 if ((options & PCRE_EXTENDED) != 0)
2395 {
2396 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2397 if (c == '#')
2398 {
2399 while (*(++ptr) != 0)
2400 {
2401 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2402 }
2403 if (*ptr != 0) continue;
2404
2405 /* Else fall through to handle end of string */
2406 c = 0;
2407 }
2408 }
2409
2410 /* No auto callout for quantifiers. */
2411
2412 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2413 {
2414 previous_callout = code;
2415 code = auto_callout(code, ptr, cd);
2416 }
2417
2418 switch(c)
2419 {
2420 /* ===================================================================*/
2421 case 0: /* The branch terminates at string end */
2422 case '|': /* or | or ) */
2423 case ')':
2424 *firstbyteptr = firstbyte;
2425 *reqbyteptr = reqbyte;
2426 *codeptr = code;
2427 *ptrptr = ptr;
2428 if (lengthptr != NULL)
2429 {
2430 if (OFLOW_MAX - *lengthptr < code - last_code)
2431 {
2432 *errorcodeptr = ERR20;
2433 goto FAILED;
2434 }
2435 *lengthptr += code - last_code; /* To include callout length */
2436 DPRINTF((">> end branch\n"));
2437 }
2438 return TRUE;
2439
2440
2441 /* ===================================================================*/
2442 /* Handle single-character metacharacters. In multiline mode, ^ disables
2443 the setting of any following char as a first character. */
2444
2445 case '^':
2446 if ((options & PCRE_MULTILINE) != 0)
2447 {
2448 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2449 }
2450 previous = NULL;
2451 *code++ = OP_CIRC;
2452 break;
2453
2454 case '$':
2455 previous = NULL;
2456 *code++ = OP_DOLL;
2457 break;
2458
2459 /* There can never be a first char if '.' is first, whatever happens about
2460 repeats. The value of reqbyte doesn't change either. */
2461
2462 case '.':
2463 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2464 zerofirstbyte = firstbyte;
2465 zeroreqbyte = reqbyte;
2466 previous = code;
2467 *code++ = OP_ANY;
2468 break;
2469
2470
2471 /* ===================================================================*/
2472 /* Character classes. If the included characters are all < 256, we build a
2473 32-byte bitmap of the permitted characters, except in the special case
2474 where there is only one such character. For negated classes, we build the
2475 map as usual, then invert it at the end. However, we use a different opcode
2476 so that data characters > 255 can be handled correctly.
2477
2478 If the class contains characters outside the 0-255 range, a different
2479 opcode is compiled. It may optionally have a bit map for characters < 256,
2480 but those above are explicitly listed afterwards. A flag byte tells
2481 whether the bitmap is present, and whether this is a negated class or not.
2482 */
2483
2484 case '[':
2485 previous = code;
2486
2487 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2488 they are encountered at the top level, so we'll do that too. */
2489
2490 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2491 check_posix_syntax(ptr, &tempptr, cd))
2492 {
2493 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2494 goto FAILED;
2495 }
2496
2497 /* If the first character is '^', set the negation flag and skip it. Also,
2498 if the first few characters (either before or after ^) are \Q\E or \E we
2499 skip them too. This makes for compatibility with Perl. */
2500
2501 negate_class = FALSE;
2502 for (;;)
2503 {
2504 c = *(++ptr);
2505 if (c == '\\')
2506 {
2507 if (ptr[1] == 'E') ptr++;
2508 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2509 else break;
2510 }
2511 else if (!negate_class && c == '^')
2512 negate_class = TRUE;
2513 else break;
2514 }
2515
2516 /* Keep a count of chars with values < 256 so that we can optimize the case
2517 of just a single character (as long as it's < 256). However, for higher
2518 valued UTF-8 characters, we don't yet do any optimization. */
2519
2520 class_charcount = 0;
2521 class_lastchar = -1;
2522
2523 /* Initialize the 32-char bit map to all zeros. We build the map in a
2524 temporary bit of memory, in case the class contains only 1 character (less
2525 than 256), because in that case the compiled code doesn't use the bit map.
2526 */
2527
2528 memset(classbits, 0, 32 * sizeof(uschar));
2529
2530 #ifdef SUPPORT_UTF8
2531 class_utf8 = FALSE; /* No chars >= 256 */
2532 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2533 #endif
2534
2535 /* Process characters until ] is reached. By writing this as a "do" it
2536 means that an initial ] is taken as a data character. At the start of the
2537 loop, c contains the first byte of the character. */
2538
2539 if (c != 0) do
2540 {
2541 const uschar *oldptr;
2542
2543 #ifdef SUPPORT_UTF8
2544 if (utf8 && c > 127)
2545 { /* Braces are required because the */
2546 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2547 }
2548 #endif
2549
2550 /* Inside \Q...\E everything is literal except \E */
2551
2552 if (inescq)
2553 {
2554 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2555 {
2556 inescq = FALSE; /* Reset literal state */
2557 ptr++; /* Skip the 'E' */
2558 continue; /* Carry on with next */
2559 }
2560 goto CHECK_RANGE; /* Could be range if \E follows */
2561 }
2562
2563 /* Handle POSIX class names. Perl allows a negation extension of the
2564 form [:^name:]. A square bracket that doesn't match the syntax is
2565 treated as a literal. We also recognize the POSIX constructions
2566 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2567 5.6 and 5.8 do. */
2568
2569 if (c == '[' &&
2570 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2571 check_posix_syntax(ptr, &tempptr, cd))
2572 {
2573 BOOL local_negate = FALSE;
2574 int posix_class, taboffset, tabopt;
2575 register const uschar *cbits = cd->cbits;
2576 uschar pbits[32];
2577
2578 if (ptr[1] != ':')
2579 {
2580 *errorcodeptr = ERR31;
2581 goto FAILED;
2582 }
2583
2584 ptr += 2;
2585 if (*ptr == '^')
2586 {
2587 local_negate = TRUE;
2588 ptr++;
2589 }
2590
2591 posix_class = check_posix_name(ptr, tempptr - ptr);
2592 if (posix_class < 0)
2593 {
2594 *errorcodeptr = ERR30;
2595 goto FAILED;
2596 }
2597
2598 /* If matching is caseless, upper and lower are converted to
2599 alpha. This relies on the fact that the class table starts with
2600 alpha, lower, upper as the first 3 entries. */
2601
2602 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2603 posix_class = 0;
2604
2605 /* We build the bit map for the POSIX class in a chunk of local store
2606 because we may be adding and subtracting from it, and we don't want to
2607 subtract bits that may be in the main map already. At the end we or the
2608 result into the bit map that is being built. */
2609
2610 posix_class *= 3;
2611
2612 /* Copy in the first table (always present) */
2613
2614 memcpy(pbits, cbits + posix_class_maps[posix_class],
2615 32 * sizeof(uschar));
2616
2617 /* If there is a second table, add or remove it as required. */
2618
2619 taboffset = posix_class_maps[posix_class + 1];
2620 tabopt = posix_class_maps[posix_class + 2];
2621
2622 if (taboffset >= 0)
2623 {
2624 if (tabopt >= 0)
2625 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2626 else
2627 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2628 }
2629
2630 /* Now see if we need to remove any special characters. An option
2631 value of 1 removes vertical space and 2 removes underscore. */
2632
2633 if (tabopt < 0) tabopt = -tabopt;
2634 if (tabopt == 1) pbits[1] &= ~0x3c;
2635 else if (tabopt == 2) pbits[11] &= 0x7f;
2636
2637 /* Add the POSIX table or its complement into the main table that is
2638 being built and we are done. */
2639
2640 if (local_negate)
2641 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2642 else
2643 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2644
2645 ptr = tempptr + 1;
2646 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2647 continue; /* End of POSIX syntax handling */
2648 }
2649
2650 /* Backslash may introduce a single character, or it may introduce one
2651 of the specials, which just set a flag. The sequence \b is a special
2652 case. Inside a class (and only there) it is treated as backspace.
2653 Elsewhere it marks a word boundary. Other escapes have preset maps ready
2654 to 'or' into the one we are building. We assume they have more than one
2655 character in them, so set class_charcount bigger than one. */
2656
2657 if (c == '\\')
2658 {
2659 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2660 if (*errorcodeptr != 0) goto FAILED;
2661
2662 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2663 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2664 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2665 else if (-c == ESC_Q) /* Handle start of quoted string */
2666 {
2667 if (ptr[1] == '\\' && ptr[2] == 'E')
2668 {
2669 ptr += 2; /* avoid empty string */
2670 }
2671 else inescq = TRUE;
2672 continue;
2673 }
2674
2675 if (c < 0)
2676 {
2677 register const uschar *cbits = cd->cbits;
2678 class_charcount += 2; /* Greater than 1 is what matters */
2679
2680 /* Save time by not doing this in the pre-compile phase. */
2681
2682 if (lengthptr == NULL) switch (-c)
2683 {
2684 case ESC_d:
2685 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2686 continue;
2687
2688 case ESC_D:
2689 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2690 continue;
2691
2692 case ESC_w:
2693 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2694 continue;
2695
2696 case ESC_W:
2697 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2698 continue;
2699
2700 case ESC_s:
2701 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2702 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2703 continue;
2704
2705 case ESC_S:
2706 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2707 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2708 continue;
2709
2710 case ESC_E: /* Perl ignores an orphan \E */
2711 continue;
2712
2713 default: /* Not recognized; fall through */
2714 break; /* Need "default" setting to stop compiler warning. */
2715 }
2716
2717 /* In the pre-compile phase, just do the recognition. */
2718
2719 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2720 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2721
2722 /* We need to deal with \H, \h, \V, and \v in both phases because
2723 they use extra memory. */
2724
2725 if (-c == ESC_h)
2726 {
2727 SETBIT(classbits, 0x09); /* VT */
2728 SETBIT(classbits, 0x20); /* SPACE */
2729 SETBIT(classbits, 0xa0); /* NSBP */
2730 #ifdef SUPPORT_UTF8
2731 if (utf8)
2732 {
2733 class_utf8 = TRUE;
2734 *class_utf8data++ = XCL_SINGLE;
2735 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2736 *class_utf8data++ = XCL_SINGLE;
2737 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2738 *class_utf8data++ = XCL_RANGE;
2739 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2740 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2741 *class_utf8data++ = XCL_SINGLE;
2742 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2743 *class_utf8data++ = XCL_SINGLE;
2744 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2745 *class_utf8data++ = XCL_SINGLE;
2746 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2747 }
2748 #endif
2749 continue;
2750 }
2751
2752 if (-c == ESC_H)
2753 {
2754 for (c = 0; c < 32; c++)
2755 {
2756 int x = 0xff;
2757 switch (c)
2758 {
2759 case 0x09/8: x ^= 1 << (0x09%8); break;
2760 case 0x20/8: x ^= 1 << (0x20%8); break;
2761 case 0xa0/8: x ^= 1 << (0xa0%8); break;
2762 default: break;
2763 }
2764 classbits[c] |= x;
2765 }
2766
2767 #ifdef SUPPORT_UTF8
2768 if (utf8)
2769 {
2770 class_utf8 = TRUE;
2771 *class_utf8data++ = XCL_RANGE;
2772 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2773 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2774 *class_utf8data++ = XCL_RANGE;
2775 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2776 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2777 *class_utf8data++ = XCL_RANGE;
2778 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2779 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2780 *class_utf8data++ = XCL_RANGE;
2781 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2782 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2783 *class_utf8data++ = XCL_RANGE;
2784 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2785 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2786 *class_utf8data++ = XCL_RANGE;
2787 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2788 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2789 *class_utf8data++ = XCL_RANGE;
2790 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2791 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2792 }
2793 #endif
2794 continue;
2795 }
2796
2797 if (-c == ESC_v)
2798 {
2799 SETBIT(classbits, 0x0a); /* LF */
2800 SETBIT(classbits, 0x0b); /* VT */
2801 SETBIT(classbits, 0x0c); /* FF */
2802 SETBIT(classbits, 0x0d); /* CR */
2803 SETBIT(classbits, 0x85); /* NEL */
2804 #ifdef SUPPORT_UTF8
2805 if (utf8)
2806 {
2807 class_utf8 = TRUE;
2808 *class_utf8data++ = XCL_RANGE;
2809 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2810 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2811 }
2812 #endif
2813 continue;
2814 }
2815
2816 if (-c == ESC_V)
2817 {
2818 for (c = 0; c < 32; c++)
2819 {
2820 int x = 0xff;
2821 switch (c)
2822 {
2823 case 0x0a/8: x ^= 1 << (0x0a%8);
2824 x ^= 1 << (0x0b%8);
2825 x ^= 1 << (0x0c%8);
2826 x ^= 1 << (0x0d%8);
2827 break;
2828 case 0x85/8: x ^= 1 << (0x85%8); break;
2829 default: break;
2830 }
2831 classbits[c] |= x;
2832 }
2833
2834 #ifdef SUPPORT_UTF8
2835 if (utf8)
2836 {
2837 class_utf8 = TRUE;
2838 *class_utf8data++ = XCL_RANGE;
2839 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2840 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2841 *class_utf8data++ = XCL_RANGE;
2842 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2843 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2844 }
2845 #endif
2846 continue;
2847 }
2848
2849 /* We need to deal with \P and \p in both phases. */
2850
2851 #ifdef SUPPORT_UCP
2852 if (-c == ESC_p || -c == ESC_P)
2853 {
2854 BOOL negated;
2855 int pdata;
2856 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2857 if (ptype < 0) goto FAILED;
2858 class_utf8 = TRUE;
2859 *class_utf8data++ = ((-c == ESC_p) != negated)?
2860 XCL_PROP : XCL_NOTPROP;
2861 *class_utf8data++ = ptype;
2862 *class_utf8data++ = pdata;
2863 class_charcount -= 2; /* Not a < 256 character */
2864 continue;
2865 }
2866 #endif
2867 /* Unrecognized escapes are faulted if PCRE is running in its
2868 strict mode. By default, for compatibility with Perl, they are
2869 treated as literals. */
2870
2871 if ((options & PCRE_EXTRA) != 0)
2872 {
2873 *errorcodeptr = ERR7;
2874 goto FAILED;
2875 }
2876
2877 class_charcount -= 2; /* Undo the default count from above */
2878 c = *ptr; /* Get the final character and fall through */
2879 }
2880
2881 /* Fall through if we have a single character (c >= 0). This may be
2882 greater than 256 in UTF-8 mode. */
2883
2884 } /* End of backslash handling */
2885
2886 /* A single character may be followed by '-' to form a range. However,
2887 Perl does not permit ']' to be the end of the range. A '-' character
2888 at the end is treated as a literal. Perl ignores orphaned \E sequences
2889 entirely. The code for handling \Q and \E is messy. */
2890
2891 CHECK_RANGE:
2892 while (ptr[1] == '\\' && ptr[2] == 'E')
2893 {
2894 inescq = FALSE;
2895 ptr += 2;
2896 }
2897
2898 oldptr = ptr;
2899
2900 if (!inescq && ptr[1] == '-')
2901 {
2902 int d;
2903 ptr += 2;
2904 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2905
2906 /* If we hit \Q (not followed by \E) at this point, go into escaped
2907 mode. */
2908
2909 while (*ptr == '\\' && ptr[1] == 'Q')
2910 {
2911 ptr += 2;
2912 if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2913 inescq = TRUE;
2914 break;
2915 }
2916
2917 if (*ptr == 0 || (!inescq && *ptr == ']'))
2918 {
2919 ptr = oldptr;
2920 goto LONE_SINGLE_CHARACTER;
2921 }
2922
2923 #ifdef SUPPORT_UTF8
2924 if (utf8)
2925 { /* Braces are required because the */
2926 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2927 }
2928 else
2929 #endif
2930 d = *ptr; /* Not UTF-8 mode */
2931
2932 /* The second part of a range can be a single-character escape, but
2933 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2934 in such circumstances. */
2935
2936 if (!inescq && d == '\\')
2937 {
2938 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2939 if (*errorcodeptr != 0) goto FAILED;
2940
2941 /* \b is backspace; \X is literal X; \R is literal R; any other
2942 special means the '-' was literal */
2943
2944 if (d < 0)
2945 {
2946 if (d == -ESC_b) d = '\b';
2947 else if (d == -ESC_X) d = 'X';
2948 else if (d == -ESC_R) d = 'R'; else
2949 {
2950 ptr = oldptr;
2951 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2952 }
2953 }
2954 }
2955
2956 /* Check that the two values are in the correct order. Optimize
2957 one-character ranges */
2958
2959 if (d < c)
2960 {
2961 *errorcodeptr = ERR8;
2962 goto FAILED;
2963 }
2964
2965 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2966
2967 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2968 matching, we have to use an XCLASS with extra data items. Caseless
2969 matching for characters > 127 is available only if UCP support is
2970 available. */
2971
2972 #ifdef SUPPORT_UTF8
2973 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2974 {
2975 class_utf8 = TRUE;
2976
2977 /* With UCP support, we can find the other case equivalents of
2978 the relevant characters. There may be several ranges. Optimize how
2979 they fit with the basic range. */
2980
2981 #ifdef SUPPORT_UCP
2982 if ((options & PCRE_CASELESS) != 0)
2983 {
2984 unsigned int occ, ocd;
2985 unsigned int cc = c;
2986 unsigned int origd = d;
2987 while (get_othercase_range(&cc, origd, &occ, &ocd))
2988 {
2989 if (occ >= (unsigned int)c &&
2990 ocd <= (unsigned int)d)
2991 continue; /* Skip embedded ranges */
2992
2993 if (occ < (unsigned int)c &&
2994 ocd >= (unsigned int)c - 1) /* Extend the basic range */
2995 { /* if there is overlap, */
2996 c = occ; /* noting that if occ < c */
2997 continue; /* we can't have ocd > d */
2998 } /* because a subrange is */
2999 if (ocd > (unsigned int)d &&
3000 occ <= (unsigned int)d + 1) /* always shorter than */
3001 { /* the basic range. */
3002 d = ocd;
3003 continue;
3004 }
3005
3006 if (occ == ocd)
3007 {
3008 *class_utf8data++ = XCL_SINGLE;
3009 }
3010 else
3011 {
3012 *class_utf8data++ = XCL_RANGE;
3013 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3014 }
3015 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3016 }
3017 }
3018 #endif /* SUPPORT_UCP */
3019
3020 /* Now record the original range, possibly modified for UCP caseless
3021 overlapping ranges. */
3022
3023 *class_utf8data++ = XCL_RANGE;
3024 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3025 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3026
3027 /* With UCP support, we are done. Without UCP support, there is no
3028 caseless matching for UTF-8 characters > 127; we can use the bit map
3029 for the smaller ones. */
3030
3031 #ifdef SUPPORT_UCP
3032 continue; /* With next character in the class */
3033 #else
3034 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3035
3036 /* Adjust upper limit and fall through to set up the map */
3037
3038 d = 127;
3039
3040 #endif /* SUPPORT_UCP */
3041 }
3042 #endif /* SUPPORT_UTF8 */
3043
3044 /* We use the bit map for all cases when not in UTF-8 mode; else
3045 ranges that lie entirely within 0-127 when there is UCP support; else
3046 for partial ranges without UCP support. */
3047
3048 class_charcount += d - c + 1;
3049 class_lastchar = d;
3050
3051 /* We can save a bit of time by skipping this in the pre-compile. */
3052
3053 if (lengthptr == NULL) for (; c <= d; c++)
3054 {
3055 classbits[c/8] |= (1 << (c&7));
3056 if ((options & PCRE_CASELESS) != 0)
3057 {
3058 int uc = cd->fcc[c]; /* flip case */
3059 classbits[uc/8] |= (1 << (uc&7));
3060 }
3061 }
3062
3063 continue; /* Go get the next char in the class */
3064 }
3065
3066 /* Handle a lone single character - we can get here for a normal
3067 non-escape char, or after \ that introduces a single character or for an
3068 apparent range that isn't. */
3069
3070 LONE_SINGLE_CHARACTER:
3071
3072 /* Handle a character that cannot go in the bit map */
3073
3074 #ifdef SUPPORT_UTF8
3075 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3076 {
3077 class_utf8 = TRUE;
3078 *class_utf8data++ = XCL_SINGLE;
3079 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3080
3081 #ifdef SUPPORT_UCP
3082 if ((options & PCRE_CASELESS) != 0)
3083 {
3084 unsigned int othercase;
3085 if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3086 {
3087 *class_utf8data++ = XCL_SINGLE;
3088 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3089 }
3090 }
3091 #endif /* SUPPORT_UCP */
3092
3093 }
3094 else
3095 #endif /* SUPPORT_UTF8 */
3096
3097 /* Handle a single-byte character */
3098 {
3099 classbits[c/8] |= (1 << (c&7));
3100 if ((options & PCRE_CASELESS) != 0)
3101 {
3102 c = cd->fcc[c]; /* flip case */
3103 classbits[c/8] |= (1 << (c&7));
3104 }
3105 class_charcount++;
3106 class_lastchar = c;
3107 }
3108 }
3109
3110 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3111
3112 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3113
3114 if (c == 0) /* Missing terminating ']' */
3115 {
3116 *errorcodeptr = ERR6;
3117 goto FAILED;
3118 }
3119
3120 /* If class_charcount is 1, we saw precisely one character whose value is
3121 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
3122 can optimize the negative case only if there were no characters >= 128
3123 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
3124 single-bytes only. This is an historical hangover. Maybe one day we can
3125 tidy these opcodes to handle multi-byte characters.
3126
3127 The optimization throws away the bit map. We turn the item into a
3128 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3129 that OP_NOT does not support multibyte characters. In the positive case, it
3130 can cause firstbyte to be set. Otherwise, there can be no first char if
3131 this item is first, whatever repeat count may follow. In the case of
3132 reqbyte, save the previous value for reinstating. */
3133
3134 #ifdef SUPPORT_UTF8
3135 if (class_charcount == 1 &&
3136 (!utf8 ||
3137 (!class_utf8 && (!negate_class || class_lastchar < 128))))
3138
3139 #else
3140 if (class_charcount == 1)
3141 #endif
3142 {
3143 zeroreqbyte = reqbyte;
3144
3145 /* The OP_NOT opcode works on one-byte characters only. */
3146
3147 if (negate_class)
3148 {
3149 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3150 zerofirstbyte = firstbyte;
3151 *code++ = OP_NOT;
3152 *code++ = class_lastchar;
3153 break;
3154 }
3155
3156 /* For a single, positive character, get the value into mcbuffer, and
3157 then we can handle this with the normal one-character code. */
3158
3159 #ifdef SUPPORT_UTF8
3160 if (utf8 && class_lastchar > 127)
3161 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3162 else
3163 #endif
3164 {
3165 mcbuffer[0] = class_lastchar;
3166 mclength = 1;
3167 }
3168 goto ONE_CHAR;
3169 } /* End of 1-char optimization */
3170
3171 /* The general case - not the one-char optimization. If this is the first
3172 thing in the branch, there can be no first char setting, whatever the
3173 repeat count. Any reqbyte setting must remain unchanged after any kind of
3174 repeat. */
3175
3176 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3177 zerofirstbyte = firstbyte;
3178 zeroreqbyte = reqbyte;
3179
3180 /* If there are characters with values > 255, we have to compile an
3181 extended class, with its own opcode. If there are no characters < 256,
3182 we can omit the bitmap in the actual compiled code. */
3183
3184 #ifdef SUPPORT_UTF8
3185 if (class_utf8)
3186 {
3187 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3188 *code++ = OP_XCLASS;
3189 code += LINK_SIZE;
3190 *code = negate_class? XCL_NOT : 0;
3191
3192 /* If the map is required, move up the extra data to make room for it;
3193 otherwise just move the code pointer to the end of the extra data. */
3194
3195 if (class_charcount > 0)
3196 {
3197 *code++ |= XCL_MAP;
3198 memmove(code + 32, code, class_utf8data - code);
3199 memcpy(code, classbits, 32);
3200 code = class_utf8data + 32;
3201 }
3202 else code = class_utf8data;
3203
3204 /* Now fill in the complete length of the item */
3205
3206 PUT(previous, 1, code - previous);
3207 break; /* End of class handling */
3208 }
3209 #endif
3210
3211 /* If there are no characters > 255, negate the 32-byte map if necessary,
3212 and copy it into the code vector. If this is the first thing in the branch,
3213 there can be no first char setting, whatever the repeat count. Any reqbyte
3214 setting must remain unchanged after any kind of repeat. */
3215
3216 if (negate_class)
3217 {
3218 *code++ = OP_NCLASS;
3219 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3220 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3221 }
3222 else
3223 {
3224 *code++ = OP_CLASS;
3225 memcpy(code, classbits, 32);
3226 }
3227 code += 32;
3228 break;
3229
3230
3231 /* ===================================================================*/
3232 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3233 has been tested above. */
3234
3235 case '{':
3236 if (!is_quantifier) goto NORMAL_CHAR;
3237 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3238 if (*errorcodeptr != 0) goto FAILED;
3239 goto REPEAT;
3240
3241 case '*':
3242 repeat_min = 0;
3243 repeat_max = -1;
3244 goto REPEAT;
3245
3246 case '+':
3247 repeat_min = 1;
3248 repeat_max = -1;
3249 goto REPEAT;
3250
3251 case '?':
3252 repeat_min = 0;
3253 repeat_max = 1;
3254
3255 REPEAT:
3256 if (previous == NULL)
3257 {
3258 *errorcodeptr = ERR9;
3259 goto FAILED;
3260 }
3261
3262 if (repeat_min == 0)
3263 {
3264 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3265 reqbyte = zeroreqbyte; /* Ditto */
3266 }
3267
3268 /* Remember whether this is a variable length repeat */
3269
3270 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3271
3272 op_type = 0; /* Default single-char op codes */
3273 possessive_quantifier = FALSE; /* Default not possessive quantifier */
3274
3275 /* Save start of previous item, in case we have to move it up to make space
3276 for an inserted OP_ONCE for the additional '+' extension. */
3277
3278 tempcode = previous;
3279
3280 /* If the next character is '+', we have a possessive quantifier. This
3281 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3282 If the next character is '?' this is a minimizing repeat, by default,
3283 but if PCRE_UNGREEDY is set, it works the other way round. We change the
3284 repeat type to the non-default. */
3285
3286 if (ptr[1] == '+')
3287 {
3288 repeat_type = 0; /* Force greedy */
3289 possessive_quantifier = TRUE;
3290 ptr++;
3291 }
3292 else if (ptr[1] == '?')
3293 {
3294 repeat_type = greedy_non_default;
3295 ptr++;
3296 }
3297 else repeat_type = greedy_default;
3298
3299 /* If previous was a character match, abolish the item and generate a
3300 repeat item instead. If a char item has a minimum of more than one, ensure
3301 that it is set in reqbyte - it might not be if a sequence such as x{3} is
3302 the first thing in a branch because the x will have gone into firstbyte
3303 instead. */
3304
3305 if (*previous == OP_CHAR || *previous == OP_CHARNC)
3306 {
3307 /* Deal with UTF-8 characters that take up more than one byte. It's
3308 easier to write this out separately than try to macrify it. Use c to
3309 hold the length of the character in bytes, plus 0x80 to flag that it's a
3310 length rather than a small character. */
3311
3312 #ifdef SUPPORT_UTF8
3313 if (utf8 && (code[-1] & 0x80) != 0)
3314 {
3315 uschar *lastchar = code - 1;
3316 while((*lastchar & 0xc0) == 0x80) lastchar--;
3317 c = code - lastchar; /* Length of UTF-8 character */
3318 memcpy(utf8_char, lastchar, c); /* Save the char */
3319 c |= 0x80; /* Flag c as a length */
3320 }
3321 else
3322 #endif
3323
3324 /* Handle the case of a single byte - either with no UTF8 support, or
3325 with UTF-8 disabled, or for a UTF-8 character < 128. */
3326
3327 {
3328 c = code[-1];
3329 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3330 }
3331
3332 /* If the repetition is unlimited, it pays to see if the next thing on
3333 the line is something that cannot possibly match this character. If so,
3334 automatically possessifying this item gains some performance in the case
3335 where the match fails. */
3336
3337 if (!possessive_quantifier &&
3338 repeat_max < 0 &&
3339 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3340 options, cd))
3341 {
3342 repeat_type = 0; /* Force greedy */
3343 possessive_quantifier = TRUE;
3344 }
3345
3346 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3347 }
3348
3349 /* If previous was a single negated character ([^a] or similar), we use
3350 one of the special opcodes, replacing it. The code is shared with single-
3351 character repeats by setting op_type to add a suitable offset into
3352 repeat_type. We can also test for auto-possessification. OP_NOT is
3353 currently used only for single-byte chars. */
3354
3355 else if (*previous == OP_NOT)
3356 {
3357 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3358 c = previous[1];
3359 if (!possessive_quantifier &&
3360 repeat_max < 0 &&
3361 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3362 {
3363 repeat_type = 0; /* Force greedy */
3364 possessive_quantifier = TRUE;
3365 }
3366 goto OUTPUT_SINGLE_REPEAT;
3367 }
3368
3369 /* If previous was a character type match (\d or similar), abolish it and
3370 create a suitable repeat item. The code is shared with single-character
3371 repeats by setting op_type to add a suitable offset into repeat_type. Note
3372 that the Unicode property types will be present only when SUPPORT_UCP is
3373 defined, but we don't wrap the little bits of code here because it just
3374 makes it horribly messy. */
3375
3376 else if (*previous < OP_EODN)
3377 {
3378 uschar *oldcode;
3379 int prop_type, prop_value;
3380 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3381 c = *previous;
3382
3383 if (!possessive_quantifier &&
3384 repeat_max < 0 &&
3385 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3386 {
3387 repeat_type = 0; /* Force greedy */
3388 possessive_quantifier = TRUE;
3389 }
3390
3391 OUTPUT_SINGLE_REPEAT:
3392 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3393 {
3394 prop_type = previous[1];
3395 prop_value = previous[2];
3396 }
3397 else prop_type = prop_value = -1;
3398
3399 oldcode = code;
3400 code = previous; /* Usually overwrite previous item */
3401
3402 /* If the maximum is zero then the minimum must also be zero; Perl allows
3403 this case, so we do too - by simply omitting the item altogether. */
3404
3405 if (repeat_max == 0) goto END_REPEAT;
3406
3407 /* All real repeats make it impossible to handle partial matching (maybe
3408 one day we will be able to remove this restriction). */
3409
3410 if (repeat_max != 1) cd->nopartial = TRUE;
3411
3412 /* Combine the op_type with the repeat_type */
3413
3414 repeat_type += op_type;
3415
3416 /* A minimum of zero is handled either as the special case * or ?, or as
3417 an UPTO, with the maximum given. */
3418
3419 if (repeat_min == 0)
3420 {
3421 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3422 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3423 else
3424 {
3425 *code++ = OP_UPTO + repeat_type;
3426 PUT2INC(code, 0, repeat_max);
3427 }
3428 }
3429
3430 /* A repeat minimum of 1 is optimized into some special cases. If the
3431 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3432 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3433 one less than the maximum. */
3434
3435 else if (repeat_min == 1)
3436 {
3437 if (repeat_max == -1)
3438 *code++ = OP_PLUS + repeat_type;
3439 else
3440 {
3441 code = oldcode; /* leave previous item in place */
3442 if (repeat_max == 1) goto END_REPEAT;
3443 *code++ = OP_UPTO + repeat_type;
3444 PUT2INC(code, 0, repeat_max - 1);
3445 }
3446 }
3447
3448 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3449 handled as an EXACT followed by an UPTO. */
3450
3451 else
3452 {
3453 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3454 PUT2INC(code, 0, repeat_min);
3455
3456 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3457 we have to insert the character for the previous code. For a repeated
3458 Unicode property match, there are two extra bytes that define the
3459 required property. In UTF-8 mode, long characters have their length in
3460 c, with the 0x80 bit as a flag. */
3461
3462 if (repeat_max < 0)
3463 {
3464 #ifdef SUPPORT_UTF8
3465 if (utf8 && c >= 128)
3466 {
3467 memcpy(code, utf8_char, c & 7);
3468 code += c & 7;
3469 }
3470 else
3471 #endif
3472 {
3473 *code++ = c;
3474 if (prop_type >= 0)
3475 {
3476 *code++ = prop_type;
3477 *code++ = prop_value;
3478 }
3479 }
3480 *code++ = OP_STAR + repeat_type;
3481 }
3482
3483 /* Else insert an UPTO if the max is greater than the min, again
3484 preceded by the character, for the previously inserted code. If the
3485 UPTO is just for 1 instance, we can use QUERY instead. */
3486
3487 else if (repeat_max != repeat_min)
3488 {
3489 #ifdef SUPPORT_UTF8
3490 if (utf8 && c >= 128)
3491 {
3492 memcpy(code, utf8_char, c & 7);
3493 code += c & 7;
3494 }
3495 else
3496 #endif
3497 *code++ = c;
3498 if (prop_type >= 0)
3499 {
3500 *code++ = prop_type;
3501 *code++ = prop_value;
3502 }
3503 repeat_max -= repeat_min;
3504
3505 if (repeat_max == 1)
3506 {
3507 *code++ = OP_QUERY + repeat_type;
3508 }
3509 else
3510 {
3511 *code++ = OP_UPTO + repeat_type;
3512 PUT2INC(code, 0, repeat_max);
3513 }
3514 }
3515 }
3516
3517 /* The character or character type itself comes last in all cases. */
3518
3519 #ifdef SUPPORT_UTF8
3520 if (utf8 && c >= 128)
3521 {
3522 memcpy(code, utf8_char, c & 7);
3523 code += c & 7;
3524 }
3525 else
3526 #endif
3527 *code++ = c;
3528
3529 /* For a repeated Unicode property match, there are two extra bytes that
3530 define the required property. */
3531
3532 #ifdef SUPPORT_UCP
3533 if (prop_type >= 0)
3534 {
3535 *code++ = prop_type;
3536 *code++ = prop_value;
3537 }
3538 #endif
3539 }
3540
3541 /* If previous was a character class or a back reference, we put the repeat
3542 stuff after it, but just skip the item if the repeat was {0,0}. */
3543
3544 else if (*previous == OP_CLASS ||
3545 *previous == OP_NCLASS ||
3546 #ifdef SUPPORT_UTF8
3547 *previous == OP_XCLASS ||
3548 #endif
3549 *previous == OP_REF)
3550 {
3551 if (repeat_max == 0)
3552 {
3553 code = previous;
3554 goto END_REPEAT;
3555 }
3556
3557 /* All real repeats make it impossible to handle partial matching (maybe
3558 one day we will be able to remove this restriction). */
3559
3560 if (repeat_max != 1) cd->nopartial = TRUE;
3561
3562 if (repeat_min == 0 && repeat_max == -1)
3563 *code++ = OP_CRSTAR + repeat_type;
3564 else if (repeat_min == 1 && repeat_max == -1)
3565 *code++ = OP_CRPLUS + repeat_type;
3566 else if (repeat_min == 0 && repeat_max == 1)
3567 *code++ = OP_CRQUERY + repeat_type;
3568 else
3569 {
3570 *code++ = OP_CRRANGE + repeat_type;
3571 PUT2INC(code, 0, repeat_min);
3572 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3573 PUT2INC(code, 0, repeat_max);
3574 }
3575 }
3576
3577 /* If previous was a bracket group, we may have to replicate it in certain
3578 cases. */
3579
3580 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3581 *previous == OP_ONCE || *previous == OP_COND)
3582 {
3583 register int i;
3584 int ketoffset = 0;
3585 int len = code - previous;
3586 uschar *bralink = NULL;
3587
3588 /* Repeating a DEFINE group is pointless */
3589
3590 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3591 {
3592 *errorcodeptr = ERR55;
3593 goto FAILED;
3594 }
3595
3596 /* If the maximum repeat count is unlimited, find the end of the bracket
3597 by scanning through from the start, and compute the offset back to it
3598 from the current code pointer. There may be an OP_OPT setting following
3599 the final KET, so we can't find the end just by going back from the code
3600 pointer. */
3601
3602 if (repeat_max == -1)
3603 {
3604 register uschar *ket = previous;
3605 do ket += GET(ket, 1); while (*ket != OP_KET);
3606 ketoffset = code - ket;
3607 }
3608
3609 /* The case of a zero minimum is special because of the need to stick
3610 OP_BRAZERO in front of it, and because the group appears once in the
3611 data, whereas in other cases it appears the minimum number of times. For
3612 this reason, it is simplest to treat this case separately, as otherwise
3613 the code gets far too messy. There are several special subcases when the
3614 minimum is zero. */
3615
3616 if (repeat_min == 0)
3617 {
3618 /* If the maximum is also zero, we just omit the group from the output
3619 altogether. */
3620
3621 if (repeat_max == 0)
3622 {
3623 code = previous;
3624 goto END_REPEAT;
3625 }
3626
3627 /* If the maximum is 1 or unlimited, we just have to stick in the
3628 BRAZERO and do no more at this point. However, we do need to adjust
3629 any OP_RECURSE calls inside the group that refer to the group itself or
3630 any internal or forward referenced group, because the offset is from
3631 the start of the whole regex. Temporarily terminate the pattern while
3632 doing this. */
3633
3634 if (repeat_max <= 1)
3635 {
3636 *code = OP_END;
3637 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3638 memmove(previous+1, previous, len);
3639 code++;
3640 *previous++ = OP_BRAZERO + repeat_type;
3641 }
3642
3643 /* If the maximum is greater than 1 and limited, we have to replicate
3644 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3645 The first one has to be handled carefully because it's the original
3646 copy, which has to be moved up. The remainder can be handled by code
3647 that is common with the non-zero minimum case below. We have to
3648 adjust the value of repeat_max, since one less copy is required. Once
3649 again, we may have to adjust any OP_RECURSE calls inside the group. */
3650
3651 else
3652 {
3653 int offset;
3654 *code = OP_END;
3655 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3656 memmove(previous + 2 + LINK_SIZE, previous, len);
3657 code += 2 + LINK_SIZE;
3658 *previous++ = OP_BRAZERO + repeat_type;
3659 *previous++ = OP_BRA;
3660
3661 /* We chain together the bracket offset fields that have to be
3662 filled in later when the ends of the brackets are reached. */
3663
3664 offset = (bralink == NULL)? 0 : previous - bralink;
3665 bralink = previous;
3666 PUTINC(previous, 0, offset);
3667 }
3668
3669 repeat_max--;
3670 }
3671
3672 /* If the minimum is greater than zero, replicate the group as many
3673 times as necessary, and adjust the maximum to the number of subsequent
3674 copies that we need. If we set a first char from the group, and didn't
3675 set a required char, copy the latter from the former. If there are any
3676 forward reference subroutine calls in the group, there will be entries on
3677 the workspace list; replicate these with an appropriate increment. */
3678
3679 else
3680 {
3681 if (repeat_min > 1)
3682 {
3683 /* In the pre-compile phase, we don't actually do the replication. We
3684 just adjust the length as if we had. Do some paranoid checks for
3685 potential integer overflow. */
3686
3687 if (lengthptr != NULL)
3688 {
3689 int delta = (repeat_min - 1)*length_prevgroup;
3690 if ((double)(repeat_min - 1)*(double)length_prevgroup >
3691 (double)INT_MAX ||
3692 OFLOW_MAX - *lengthptr < delta)
3693 {
3694 *errorcodeptr = ERR20;
3695 goto FAILED;
3696 }
3697 *lengthptr += delta;
3698 }
3699
3700 /* This is compiling for real */
3701
3702 else
3703 {
3704 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3705 for (i = 1; i < repeat_min; i++)
3706 {
3707 uschar *hc;
3708 uschar *this_hwm = cd->hwm;
3709 memcpy(code, previous, len);
3710 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3711 {
3712 PUT(cd->hwm, 0, GET(hc, 0) + len);
3713 cd->hwm += LINK_SIZE;
3714 }
3715 save_hwm = this_hwm;
3716 code += len;
3717 }
3718 }
3719 }
3720
3721 if (repeat_max > 0) repeat_max -= repeat_min;
3722 }
3723
3724 /* This code is common to both the zero and non-zero minimum cases. If
3725 the maximum is limited, it replicates the group in a nested fashion,
3726 remembering the bracket starts on a stack. In the case of a zero minimum,
3727 the first one was set up above. In all cases the repeat_max now specifies
3728 the number of additional copies needed. Again, we must remember to
3729 replicate entries on the forward reference list. */
3730
3731 if (repeat_max >= 0)
3732 {
3733 /* In the pre-compile phase, we don't actually do the replication. We
3734 just adjust the length as if we had. For each repetition we must add 1
3735 to the length for BRAZERO and for all but the last repetition we must
3736 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3737 paranoid checks to avoid integer overflow. */
3738
3739 if (lengthptr != NULL && repeat_max > 0)
3740 {
3741 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3742 2 - 2*LINK_SIZE; /* Last one doesn't nest */
3743 if ((double)repeat_max *
3744 (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3745 > (double)INT_MAX ||
3746 OFLOW_MAX - *lengthptr < delta)
3747 {
3748 *errorcodeptr = ERR20;
3749 goto FAILED;
3750 }
3751 *lengthptr += delta;
3752 }
3753
3754 /* This is compiling for real */
3755
3756 else for (i = repeat_max - 1; i >= 0; i--)
3757 {
3758 uschar *hc;
3759 uschar *this_hwm = cd->hwm;
3760
3761 *code++ = OP_BRAZERO + repeat_type;
3762
3763 /* All but the final copy start a new nesting, maintaining the
3764 chain of brackets outstanding. */
3765
3766 if (i != 0)
3767 {
3768 int offset;
3769 *code++ = OP_BRA;
3770 offset = (bralink == NULL)? 0 : code - bralink;
3771 bralink = code;
3772 PUTINC(code, 0, offset);
3773 }
3774
3775 memcpy(code, previous, len);
3776 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3777 {
3778 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3779 cd->hwm += LINK_SIZE;
3780 }
3781 save_hwm = this_hwm;
3782 code += len;
3783 }
3784
3785 /* Now chain through the pending brackets, and fill in their length
3786 fields (which are holding the chain links pro tem). */
3787
3788 while (bralink != NULL)
3789 {
3790 int oldlinkoffset;
3791 int offset = code - bralink + 1;
3792 uschar *bra = code - offset;
3793 oldlinkoffset = GET(bra, 1);
3794 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3795 *code++ = OP_KET;
3796 PUTINC(code, 0, offset);
3797 PUT(bra, 1, offset);
3798 }
3799 }
3800
3801 /* If the maximum is unlimited, set a repeater in the final copy. We
3802 can't just offset backwards from the current code point, because we
3803 don't know if there's been an options resetting after the ket. The
3804 correct offset was computed above.
3805
3806 Then, when we are doing the actual compile phase, check to see whether
3807 this group is a non-atomic one that could match an empty string. If so,
3808 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3809 that runtime checking can be done. [This check is also applied to
3810 atomic groups at runtime, but in a different way.] */
3811
3812 else
3813 {
3814 uschar *ketcode = code - ketoffset;
3815 uschar *bracode = ketcode - GET(ketcode, 1);
3816 *ketcode = OP_KETRMAX + repeat_type;
3817 if (lengthptr == NULL && *bracode != OP_ONCE)
3818 {
3819 uschar *scode = bracode;
3820 do
3821 {
3822 if (could_be_empty_branch(scode, ketcode, utf8))
3823 {
3824 *bracode += OP_SBRA - OP_BRA;
3825 break;
3826 }
3827 scode += GET(scode, 1);
3828 }
3829 while (*scode == OP_ALT);
3830 }
3831 }
3832 }
3833
3834 /* Else there's some kind of shambles */
3835
3836 else
3837 {
3838 *errorcodeptr = ERR11;
3839 goto FAILED;
3840 }
3841
3842 /* If the character following a repeat is '+', or if certain optimization
3843 tests above succeeded, possessive_quantifier is TRUE. For some of the
3844 simpler opcodes, there is a special alternative opcode for this. For
3845 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3846 The '+' notation is just syntactic sugar, taken from Sun's Java package,
3847 but the special opcodes can optimize it a bit. The repeated item starts at
3848 tempcode, not at previous, which might be the first part of a string whose
3849 (former) last char we repeated.
3850
3851 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3852 an 'upto' may follow. We skip over an 'exact' item, and then test the
3853 length of what remains before proceeding. */
3854
3855 if (possessive_quantifier)
3856 {
3857 int len;
3858 if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3859 *tempcode == OP_NOTEXACT)
3860 tempcode += _pcre_OP_lengths[*tempcode];
3861 len = code - tempcode;
3862 if (len > 0) switch (*tempcode)
3863 {
3864 case OP_STAR: *tempcode = OP_POSSTAR; break;
3865 case OP_PLUS: *tempcode = OP_POSPLUS; break;
3866 case OP_QUERY: *tempcode = OP_POSQUERY; break;
3867 case OP_UPTO: *tempcode = OP_POSUPTO; break;
3868
3869 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
3870 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
3871 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3872 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
3873
3874 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
3875 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
3876 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3877 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
3878
3879 default:
3880 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3881 code += 1 + LINK_SIZE;
3882 len += 1 + LINK_SIZE;
3883 tempcode[0] = OP_ONCE;
3884 *code++ = OP_KET;
3885 PUTINC(code, 0, len);
3886 PUT(tempcode, 1, len);
3887 break;
3888 }
3889 }
3890
3891 /* In all cases we no longer have a previous item. We also set the
3892 "follows varying string" flag for subsequently encountered reqbytes if
3893 it isn't already set and we have just passed a varying length item. */
3894
3895 END_REPEAT:
3896 previous = NULL;
3897 cd->req_varyopt |= reqvary;
3898 break;
3899
3900
3901 /* ===================================================================*/
3902 /* Start of nested parenthesized sub-expression, or comment or lookahead or
3903 lookbehind or option setting or condition or all the other extended
3904 parenthesis forms. */
3905
3906 case '(':
3907 newoptions = options;
3908 skipbytes = 0;
3909 bravalue = OP_CBRA;
3910 save_hwm = cd->hwm;
3911 reset_bracount = FALSE;
3912
3913 /* First deal with various "verbs" that can be introduced by '*'. */
3914
3915 if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
3916 {
3917 int i, namelen;
3918 const uschar *name = ++ptr;
3919 previous = NULL;
3920 while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
3921 if (*ptr == ':')
3922 {
3923 *errorcodeptr = ERR59; /* Not supported */
3924 goto FAILED;
3925 }
3926 if (*ptr != ')')
3927 {
3928 *errorcodeptr = ERR60;
3929 goto FAILED;
3930 }
3931 namelen = ptr - name;
3932 for (i = 0; i < verbcount; i++)
3933 {
3934 if (namelen == verbs[i].len &&
3935 strncmp((char *)name, verbs[i].name, namelen) == 0)
3936 {
3937 *code = verbs[i].op;
3938 if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
3939 break;
3940 }
3941 }
3942 if (i < verbcount) continue;
3943 *errorcodeptr = ERR60;
3944 goto FAILED;
3945 }
3946
3947 /* Deal with the extended parentheses; all are introduced by '?', and the
3948 appearance of any of them means that this is not a capturing group. */
3949
3950 else if (*ptr == '?')
3951 {
3952 int i, set, unset, namelen;
3953 int *optset;
3954 const uschar *name;
3955 uschar *slot;
3956
3957 switch (*(++ptr))
3958 {
3959 case '#': /* Comment; skip to ket */
3960 ptr++;
3961 while (*ptr != 0 && *ptr != ')') ptr++;
3962 if (*ptr == 0)
3963 {
3964 *errorcodeptr = ERR18;
3965 goto FAILED;
3966 }
3967 continue;
3968
3969
3970 /* ------------------------------------------------------------ */
3971 case '|': /* Reset capture count for each branch */
3972 reset_bracount = TRUE;
3973 /* Fall through */
3974
3975 /* ------------------------------------------------------------ */
3976 case ':': /* Non-capturing bracket */
3977 bravalue = OP_BRA;
3978 ptr++;
3979 break;
3980
3981
3982 /* ------------------------------------------------------------ */
3983 case '(':
3984 bravalue = OP_COND; /* Conditional group */
3985
3986 /* A condition can be an assertion, a number (referring to a numbered
3987 group), a name (referring to a named group), or 'R', referring to
3988 recursion. R<digits> and R&name are also permitted for recursion tests.
3989
3990 There are several syntaxes for testing a named group: (?(name)) is used
3991 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3992
3993 There are two unfortunate ambiguities, caused by history. (a) 'R' can
3994 be the recursive thing or the name 'R' (and similarly for 'R' followed
3995 by digits), and (b) a number could be a name that consists of digits.
3996 In both cases, we look for a name first; if not found, we try the other
3997 cases. */
3998
3999 /* For conditions that are assertions, check the syntax, and then exit
4000 the switch. This will take control down to where bracketed groups,
4001 including assertions, are processed. */
4002
4003 if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4004 break;
4005
4006 /* Most other conditions use OP_CREF (a couple change to OP_RREF
4007 below), and all need to skip 3 bytes at the start of the group. */
4008
4009 code[1+LINK_SIZE] = OP_CREF;
4010 skipbytes = 3;
4011 refsign = -1;
4012
4013 /* Check for a test for recursion in a named group. */
4014
4015 if (ptr[1] == 'R' && ptr[2] == '&')
4016 {
4017 terminator = -1;
4018 ptr += 2;
4019 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4020 }
4021
4022 /* Check for a test for a named group's having been set, using the Perl
4023 syntax (?(<name>) or (?('name') */
4024
4025 else if (ptr[1] == '<')
4026 {
4027 terminator = '>';
4028 ptr++;
4029 }
4030 else if (ptr[1] == '\'')
4031 {
4032 terminator = '\'';
4033 ptr++;
4034 }
4035 else
4036 {
4037 terminator = 0;
4038 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4039 }
4040
4041 /* We now expect to read a name; anything else is an error */
4042
4043 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4044 {
4045 ptr += 1; /* To get the right offset */
4046 *errorcodeptr = ERR28;
4047 goto FAILED;
4048 }
4049
4050 /* Read the name, but also get it as a number if it's all digits */
4051
4052 recno = 0;
4053 name = ++ptr;
4054 while ((cd->ctypes[*ptr] & ctype_word) != 0)
4055 {
4056 if (recno >= 0)
4057 recno = ((digitab[*ptr] & ctype_digit) != 0)?
4058 recno * 10 + *ptr - '0' : -1;
4059 ptr++;
4060 }
4061 namelen = ptr - name;
4062
4063 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4064 {
4065 ptr--; /* Error offset */
4066 *errorcodeptr = ERR26;
4067 goto FAILED;
4068 }
4069
4070 /* Do no further checking in the pre-compile phase. */
4071
4072 if (lengthptr != NULL) break;
4073
4074 /* In the real compile we do the work of looking for the actual
4075 reference. If the string started with "+" or "-" we require the rest to
4076 be digits, in which case recno will be set. */
4077
4078 if (refsign > 0)
4079 {
4080 if (recno <= 0)
4081 {
4082 *errorcodeptr = ERR58;
4083 goto FAILED;
4084 }
4085 if (refsign == '-')
4086 {
4087 recno = cd->bracount - recno + 1;
4088 if (recno <= 0)
4089 {
4090 *errorcodeptr = ERR15;
4091 goto FAILED;
4092 }
4093 }
4094 else recno += cd->bracount;
4095 PUT2(code, 2+LINK_SIZE, recno);
4096 break;
4097 }
4098
4099 /* Otherwise (did not start with "+" or "-"), start by looking for the
4100 name. */
4101
4102 slot = cd->name_table;
4103 for (i = 0; i < cd->names_found; i++)
4104 {
4105 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4106 slot += cd->name_entry_size;
4107 }
4108
4109 /* Found a previous named subpattern */
4110
4111 if (i < cd->names_found)
4112 {
4113 recno = GET2(slot, 0);
4114 PUT2(code, 2+LINK_SIZE, recno);
4115 }
4116
4117 /* Search the pattern for a forward reference */
4118
4119 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4120 (options & PCRE_EXTENDED) != 0)) > 0)
4121 {
4122 PUT2(code, 2+LINK_SIZE, i);
4123 }
4124
4125 /* If terminator == 0 it means that the name followed directly after
4126 the opening parenthesis [e.g. (?(abc)...] and in this case there are
4127 some further alternatives to try. For the cases where terminator != 0
4128 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4129 now checked all the possibilities, so give an error. */
4130
4131 else if (terminator != 0)
4132 {
4133 *errorcodeptr = ERR15;
4134 goto FAILED;
4135 }
4136
4137 /* Check for (?(R) for recursion. Allow digits after R to specify a
4138 specific group number. */
4139
4140 else if (*name == 'R')
4141 {
4142 recno = 0;
4143 for (i = 1; i < namelen; i++)
4144 {
4145 if ((digitab[name[i]] & ctype_digit) == 0)
4146 {
4147 *errorcodeptr = ERR15;
4148 goto FAILED;
4149 }
4150 recno = recno * 10 + name[i] - '0';
4151 }
4152 if (recno == 0) recno = RREF_ANY;
4153 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4154 PUT2(code, 2+LINK_SIZE, recno);
4155 }
4156
4157 /* Similarly, check for the (?(DEFINE) "condition", which is always
4158 false. */
4159
4160 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4161 {
4162 code[1+LINK_SIZE] = OP_DEF;
4163 skipbytes = 1;
4164 }
4165
4166 /* Check for the "name" actually being a subpattern number. */
4167
4168 else if (recno > 0)
4169 {
4170 PUT2(code, 2+LINK_SIZE, recno);
4171 }
4172
4173 /* Either an unidentified subpattern, or a reference to (?(0) */
4174
4175 else
4176 {
4177 *errorcodeptr = (recno == 0)? ERR35: ERR15;
4178 goto FAILED;
4179 }
4180 break;
4181
4182
4183 /* ------------------------------------------------------------ */
4184 case '=': /* Positive lookahead */
4185 bravalue = OP_ASSERT;
4186 ptr++;
4187 break;
4188
4189
4190 /* ------------------------------------------------------------ */
4191 case '!': /* Negative lookahead */
4192 ptr++;
4193 if (*ptr == ')') /* Optimize (?!) */
4194 {
4195 *code++ = OP_FAIL;
4196 previous = NULL;
4197 continue;
4198 }
4199 bravalue = OP_ASSERT_NOT;
4200 break;
4201
4202
4203 /* ------------------------------------------------------------ */
4204 case '<': /* Lookbehind or named define */
4205 switch (ptr[1])
4206 {
4207 case '=': /* Positive lookbehind */
4208 bravalue = OP_ASSERTBACK;
4209 ptr += 2;
4210 break;
4211
4212 case '!': /* Negative lookbehind */
4213 bravalue = OP_ASSERTBACK_NOT;
4214 ptr += 2;
4215 break;
4216
4217 default: /* Could be name define, else bad */
4218 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4219 ptr++; /* Correct offset for error */
4220 *errorcodeptr = ERR24;
4221 goto FAILED;
4222 }
4223 break;
4224
4225
4226 /* ------------------------------------------------------------ */
4227 case '>': /* One-time brackets */
4228 bravalue = OP_ONCE;
4229 ptr++;
4230 break;
4231
4232
4233 /* ------------------------------------------------------------ */
4234 case 'C': /* Callout - may be followed by digits; */
4235 previous_callout = code; /* Save for later completion */
4236 after_manual_callout = 1; /* Skip one item before completing */
4237 *code++ = OP_CALLOUT;
4238 {
4239 int n = 0;
4240 while ((digitab[*(++ptr)] & ctype_digit) != 0)
4241 n = n * 10 + *ptr - '0';
4242 if (*ptr != ')')
4243 {
4244 *errorcodeptr = ERR39;
4245 goto FAILED;
4246 }
4247 if (n > 255)
4248 {
4249 *errorcodeptr = ERR38;
4250 goto FAILED;
4251 }
4252 *code++ = n;
4253 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4254 PUT(code, LINK_SIZE, 0); /* Default length */
4255 code += 2 * LINK_SIZE;
4256 }
4257 previous = NULL;
4258 continue;
4259
4260
4261 /* ------------------------------------------------------------ */
4262 case 'P': /* Python-style named subpattern handling */
4263 if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
4264 {
4265 is_recurse = *ptr == '>';
4266 terminator = ')';
4267 goto NAMED_REF_OR_RECURSE;
4268 }
4269 else if (*ptr != '<') /* Test for Python-style definition */
4270 {
4271 *errorcodeptr = ERR41;
4272 goto FAILED;
4273 }
4274 /* Fall through to handle (?P< as (?< is handled */
4275
4276
4277 /* ------------------------------------------------------------ */
4278 DEFINE_NAME: /* Come here from (?< handling */
4279 case '\'':
4280 {
4281 terminator = (*ptr == '<')? '>' : '\'';
4282 name = ++ptr;
4283
4284 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4285 namelen = ptr - name;
4286
4287 /* In the pre-compile phase, just do a syntax check. */
4288
4289 if (lengthptr != NULL)
4290 {
4291 if (*ptr != terminator)
4292 {
4293 *errorcodeptr = ERR42;
4294 goto FAILED;
4295 }
4296 if (cd->names_found >= MAX_NAME_COUNT)
4297 {
4298 *errorcodeptr = ERR49;
4299 goto FAILED;
4300 }
4301 if (namelen + 3 > cd->name_entry_size)
4302 {
4303 cd->name_entry_size = namelen + 3;
4304 if (namelen > MAX_NAME_SIZE)
4305 {
4306 *errorcodeptr = ERR48;
4307 goto FAILED;
4308 }
4309 }
4310 }
4311
4312 /* In the real compile, create the entry in the table */
4313
4314 else
4315 {
4316 slot = cd->name_table;
4317 for (i = 0; i < cd->names_found; i++)
4318 {
4319 int crc = memcmp(name, slot+2, namelen);
4320 if (crc == 0)
4321 {
4322 if (slot[2+namelen] == 0)
4323 {
4324 if ((options & PCRE_DUPNAMES) == 0)
4325 {
4326 *errorcodeptr = ERR43;
4327 goto FAILED;
4328 }
4329 }
4330 else crc = -1; /* Current name is substring */
4331 }
4332 if (crc < 0)
4333 {
4334 memmove(slot + cd->name_entry_size, slot,
4335 (cd->names_found - i) * cd->name_entry_size);
4336 break;
4337 }
4338 slot += cd->name_entry_size;
4339 }
4340
4341 PUT2(slot, 0, cd->bracount + 1);
4342 memcpy(slot + 2, name, namelen);
4343 slot[2+namelen] = 0;
4344 }
4345 }
4346
4347 /* In both cases, count the number of names we've encountered. */
4348
4349 ptr++; /* Move past > or ' */
4350 cd->names_found++;
4351 goto NUMBERED_GROUP;
4352
4353
4354 /* ------------------------------------------------------------ */
4355 case '&': /* Perl recursion/subroutine syntax */
4356 terminator = ')';
4357 is_recurse = TRUE;
4358 /* Fall through */
4359
4360 /* We come here from the Python syntax above that handles both
4361 references (?P=name) and recursion (?P>name), as well as falling
4362 through from the Perl recursion syntax (?&name). */
4363
4364 NAMED_REF_OR_RECURSE:
4365 name = ++ptr;
4366 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4367 namelen = ptr - name;
4368
4369 /* In the pre-compile phase, do a syntax check and set a dummy
4370 reference number. */
4371
4372 if (lengthptr != NULL)
4373 {
4374 if (*ptr != terminator)
4375 {
4376 *errorcodeptr = ERR42;
4377 goto FAILED;
4378 }
4379 if (namelen > MAX_NAME_SIZE)
4380 {
4381 *errorcodeptr = ERR48;
4382 goto FAILED;
4383 }
4384 recno = 0;
4385 }
4386
4387 /* In the real compile, seek the name in the table */
4388
4389 else
4390 {
4391 slot = cd->name_table;
4392 for (i = 0; i < cd->names_found; i++)
4393 {
4394 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4395 slot += cd->name_entry_size;
4396 }
4397
4398 if (i < cd->names_found) /* Back reference */
4399 {
4400 recno = GET2(slot, 0);
4401 }
4402 else if ((recno = /* Forward back reference */
4403 find_parens(ptr, cd->bracount, name, namelen,
4404 (options & PCRE_EXTENDED) != 0)) <= 0)
4405 {
4406 *errorcodeptr = ERR15;
4407 goto FAILED;
4408 }
4409 }
4410
4411 /* In both phases, we can now go to the code that handles numerical
4412 recursion or backreferences. */
4413
4414 if (is_recurse) goto HANDLE_RECURSION;
4415 else goto HANDLE_REFERENCE;
4416
4417
4418 /* ------------------------------------------------------------ */
4419 case 'R': /* Recursion */
4420 ptr++; /* Same as (?0) */
4421 /* Fall through */
4422
4423
4424 /* ------------------------------------------------------------ */
4425 case '-': case '+':
4426 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4427 case '5': case '6': case '7': case '8': case '9': /* subroutine */
4428 {
4429 const uschar *called;
4430
4431 if ((refsign = *ptr) == '+') ptr++;
4432 else if (refsign == '-')
4433 {
4434 if ((digitab[ptr[1]] & ctype_digit) == 0)
4435 goto OTHER_CHAR_AFTER_QUERY;
4436 ptr++;
4437 }
4438
4439 recno = 0;
4440 while((digitab[*ptr] & ctype_digit) != 0)
4441 recno = recno * 10 + *ptr++ - '0';
4442
4443 if (*ptr != ')')
4444 {
4445 *errorcodeptr = ERR29;
4446 goto FAILED;
4447 }
4448
4449 if (refsign == '-')
4450 {
4451 if (recno == 0)
4452 {
4453 *errorcodeptr = ERR58;
4454 goto FAILED;
4455 }
4456 recno = cd->bracount - recno + 1;
4457 if (recno <= 0)
4458 {
4459 *errorcodeptr = ERR15;
4460 goto FAILED;
4461 }
4462 }
4463 else if (refsign == '+')
4464 {
4465 if (recno == 0)
4466 {
4467 *errorcodeptr = ERR58;
4468 goto FAILED;
4469 }
4470 recno += cd->bracount;
4471 }
4472
4473 /* Come here from code above that handles a named recursion */
4474
4475 HANDLE_RECURSION:
4476
4477 previous = code;
4478 called = cd->start_code;
4479
4480 /* When we are actually compiling, find the bracket that is being
4481 referenced. Temporarily end the regex in case it doesn't exist before
4482 this point. If we end up with a forward reference, first check that
4483 the bracket does occur later so we can give the error (and position)
4484 now. Then remember this forward reference in the workspace so it can
4485 be filled in at the end. */
4486
4487 if (lengthptr == NULL)
4488 {
4489 *code = OP_END;
4490 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4491
4492 /* Forward reference */
4493
4494 if (called == NULL)
4495 {
4496 if (find_parens(ptr, cd->bracount, NULL, recno,
4497 (options & PCRE_EXTENDED) != 0) < 0)
4498 {
4499 *errorcodeptr = ERR15;
4500 goto FAILED;
4501 }
4502 called = cd->start_code + recno;
4503 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4504 }
4505
4506 /* If not a forward reference, and the subpattern is still open,
4507 this is a recursive call. We check to see if this is a left
4508 recursion that could loop for ever, and diagnose that case. */
4509
4510 else if (GET(called, 1) == 0 &&
4511 could_be_empty(called, code, bcptr, utf8))
4512 {
4513 *errorcodeptr = ERR40;
4514 goto FAILED;
4515 }
4516 }
4517
4518 /* Insert the recursion/subroutine item, automatically wrapped inside
4519 "once" brackets. Set up a "previous group" length so that a
4520 subsequent quantifier will work. */
4521
4522 *code = OP_ONCE;
4523 PUT(code, 1, 2 + 2*LINK_SIZE);
4524 code += 1 + LINK_SIZE;
4525
4526 *code = OP_RECURSE;
4527 PUT(code, 1, called - cd->start_code);
4528 code += 1 + LINK_SIZE;
4529
4530 *code = OP_KET;
4531 PUT(code, 1, 2 + 2*LINK_SIZE);
4532 code += 1 + LINK_SIZE;
4533
4534 length_prevgroup = 3 + 3*LINK_SIZE;
4535 }
4536
4537 /* Can't determine a first byte now */
4538
4539 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4540 continue;
4541
4542
4543 /* ------------------------------------------------------------ */
4544 default: /* Other characters: check option setting */
4545 OTHER_CHAR_AFTER_QUERY:
4546 set = unset = 0;
4547 optset = &set;
4548
4549 while (*ptr != ')' && *ptr != ':')
4550 {
4551 switch (*ptr++)
4552 {
4553 case '-': optset = &unset; break;
4554
4555 case 'J': /* Record that it changed in the external options */
4556 *optset |= PCRE_DUPNAMES;
4557 cd->external_options |= PCRE_JCHANGED;
4558 break;
4559
4560 case 'i': *optset |= PCRE_CASELESS; break;
4561 case 'm': *optset |= PCRE_MULTILINE; break;
4562 case 's': *optset |= PCRE_DOTALL; break;
4563 case 'x': *optset |= PCRE_EXTENDED; break;
4564 case 'U': *optset |= PCRE_UNGREEDY; break;
4565 case 'X': *optset |= PCRE_EXTRA; break;
4566
4567 default: *errorcodeptr = ERR12;
4568 ptr--; /* Correct the offset */
4569 goto FAILED;
4570 }
4571 }
4572
4573 /* Set up the changed option bits, but don't change anything yet. */
4574
4575 newoptions = (options | set) & (~unset);
4576
4577 /* If the options ended with ')' this is not the start of a nested
4578 group with option changes, so the options change at this level. If this
4579 item is right at the start of the pattern, the options can be
4580 abstracted and made external in the pre-compile phase, and ignored in
4581 the compile phase. This can be helpful when matching -- for instance in
4582 caseless checking of required bytes.
4583
4584 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4585 definitely *not* at the start of the pattern because something has been
4586 compiled. In the pre-compile phase, however, the code pointer can have
4587 that value after the start, because it gets reset as code is discarded
4588 during the pre-compile. However, this can happen only at top level - if
4589 we are within parentheses, the starting BRA will still be present. At
4590 any parenthesis level, the length value can be used to test if anything
4591 has been compiled at that level. Thus, a test for both these conditions
4592 is necessary to ensure we correctly detect the start of the pattern in
4593 both phases.
4594
4595 If we are not at the pattern start, compile code to change the ims
4596 options if this setting actually changes any of them. We also pass the
4597 new setting back so that it can be put at the start of any following
4598 branches, and when this group ends (if we are in a group), a resetting
4599 item can be compiled. */
4600
4601 if (*ptr == ')')
4602 {
4603 if (code == cd->start_code + 1 + LINK_SIZE &&
4604 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4605 {
4606 cd->external_options = newoptions;
4607 options = newoptions;
4608 }
4609 else
4610 {
4611 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4612 {
4613 *code++ = OP_OPT;
4614 *code++ = newoptions & PCRE_IMS;
4615 }
4616
4617 /* Change options at this level, and pass them back for use
4618 in subsequent branches. Reset the greedy defaults and the case
4619 value for firstbyte and reqbyte. */
4620
4621 *optionsptr = options = newoptions;
4622 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4623 greedy_non_default = greedy_default ^ 1;
4624 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4625 }
4626
4627 previous = NULL; /* This item can't be repeated */
4628 continue; /* It is complete */
4629 }
4630
4631 /* If the options ended with ':' we are heading into a nested group
4632 with possible change of options. Such groups are non-capturing and are
4633 not assertions of any kind. All we need to do is skip over the ':';
4634 the newoptions value is handled below. */
4635
4636 bravalue = OP_BRA;
4637 ptr++;
4638 } /* End of switch for character following (? */
4639 } /* End of (? handling */
4640
4641 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4642 all unadorned brackets become non-capturing and behave like (?:...)
4643 brackets. */
4644
4645 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4646 {
4647 bravalue = OP_BRA;
4648 }
4649
4650 /* Else we have a capturing group. */
4651
4652 else
4653 {
4654 NUMBERED_GROUP:
4655 cd->bracount += 1;
4656 PUT2(code, 1+LINK_SIZE, cd->bracount);
4657 skipbytes = 2;
4658 }
4659
4660 /* Process nested bracketed regex. Assertions may not be repeated, but
4661 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4662 non-register variable in order to be able to pass its address because some
4663 compilers complain otherwise. Pass in a new setting for the ims options if
4664 they have changed. */
4665
4666 previous = (bravalue >= OP_ONCE)? code : NULL;
4667 *code = bravalue;
4668 tempcode = code;
4669 tempreqvary = cd->req_varyopt; /* Save value before bracket */
4670 length_prevgroup = 0; /* Initialize for pre-compile phase */
4671
4672 if (!compile_regex(
4673 newoptions, /* The complete new option state */
4674 options & PCRE_IMS, /* The previous ims option state */
4675 &tempcode, /* Where to put code (updated) */
4676 &ptr, /* Input pointer (updated) */
4677 errorcodeptr, /* Where to put an error message */
4678 (bravalue == OP_ASSERTBACK ||
4679 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4680 reset_bracount, /* True if (?| group */
4681 skipbytes, /* Skip over bracket number */
4682 &subfirstbyte, /* For possible first char */
4683 &subreqbyte, /* For possible last char */
4684 bcptr, /* Current branch chain */
4685 cd, /* Tables block */
4686 (lengthptr == NULL)? NULL : /* Actual compile phase */
4687 &length_prevgroup /* Pre-compile phase */
4688 ))
4689 goto FAILED;
4690
4691 /* At the end of compiling, code is still pointing to the start of the
4692 group, while tempcode has been updated to point past the end of the group
4693 and any option resetting that may follow it. The pattern pointer (ptr)
4694 is on the bracket. */
4695
4696 /* If this is a conditional bracket, check that there are no more than
4697 two branches in the group, or just one if it's a DEFINE group. We do this
4698 in the real compile phase, not in the pre-pass, where the whole group may
4699 not be available. */
4700
4701 if (bravalue == OP_COND && lengthptr == NULL)
4702 {
4703 uschar *tc = code;
4704 int condcount = 0;
4705
4706 do {
4707 condcount++;
4708 tc += GET(tc,1);
4709 }
4710 while (*tc != OP_KET);
4711
4712 /* A DEFINE group is never obeyed inline (the "condition" is always
4713 false). It must have only one branch. */
4714
4715 if (code[LINK_SIZE+1] == OP_DEF)
4716 {
4717 if (condcount > 1)
4718 {
4719 *errorcodeptr = ERR54;
4720 goto FAILED;
4721 }
4722 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
4723 }
4724
4725 /* A "normal" conditional group. If there is just one branch, we must not
4726 make use of its firstbyte or reqbyte, because this is equivalent to an
4727 empty second branch. */
4728
4729 else
4730 {
4731 if (condcount > 2)
4732 {
4733 *errorcodeptr = ERR27;
4734 goto FAILED;
4735 }
4736 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4737 }
4738 }
4739
4740 /* Error if hit end of pattern */
4741
4742 if (*ptr != ')')
4743 {
4744 *errorcodeptr = ERR14;
4745 goto FAILED;
4746 }
4747
4748 /* In the pre-compile phase, update the length by the length of the nested
4749 group, less the brackets at either end. Then reduce the compiled code to
4750 just the brackets so that it doesn't use much memory if it is duplicated by
4751 a quantifier. */
4752
4753 if (lengthptr != NULL)
4754 {
4755 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
4756 {
4757 *errorcodeptr = ERR20;
4758 goto FAILED;
4759 }
4760 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4761 code++;
4762 PUTINC(code, 0, 1 + LINK_SIZE);
4763 *code++ = OP_KET;
4764 PUTINC(code, 0, 1 + LINK_SIZE);
4765 }
4766
4767 /* Otherwise update the main code pointer to the end of the group. */
4768
4769 else code = tempcode;
4770
4771 /* For a DEFINE group, required and first character settings are not
4772 relevant. */
4773
4774 if (bravalue == OP_DEF) break;
4775
4776 /* Handle updating of the required and first characters for other types of
4777 group. Update for normal brackets of all kinds, and conditions with two
4778 branches (see code above). If the bracket is followed by a quantifier with
4779 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4780 zerofirstbyte outside the main loop so that they can be accessed for the
4781 back off. */
4782
4783 zeroreqbyte = reqbyte;
4784 zerofirstbyte = firstbyte;
4785 groupsetfirstbyte = FALSE;
4786
4787 if (bravalue >= OP_ONCE)
4788 {
4789 /* If we have not yet set a firstbyte in this branch, take it from the
4790 subpattern, remembering that it was set here so that a repeat of more
4791 than one can replicate it as reqbyte if necessary. If the subpattern has
4792 no firstbyte, set "none" for the whole branch. In both cases, a zero
4793 repeat forces firstbyte to "none". */
4794
4795 if (firstbyte == REQ_UNSET)
4796 {
4797 if (subfirstbyte >= 0)
4798 {
4799 firstbyte = subfirstbyte;
4800 groupsetfirstbyte = TRUE;
4801 }
4802 else firstbyte = REQ_NONE;
4803 zerofirstbyte = REQ_NONE;
4804 }
4805
4806 /* If firstbyte was previously set, convert the subpattern's firstbyte
4807 into reqbyte if there wasn't one, using the vary flag that was in
4808 existence beforehand. */
4809
4810 else if (subfirstbyte >= 0 && subreqbyte < 0)
4811 subreqbyte = subfirstbyte | tempreqvary;
4812
4813 /* If the subpattern set a required byte (or set a first byte that isn't
4814 really the first byte - see above), set it. */
4815
4816 if (subreqbyte >= 0) reqbyte = subreqbyte;
4817 }
4818
4819 /* For a forward assertion, we take the reqbyte, if set. This can be
4820 helpful if the pattern that follows the assertion doesn't set a different
4821 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
4822 for an assertion, however, because it leads to an incorrect effect for patterns
4823 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
4824 of a firstbyte. This is overcome by a scan at the end if there's no
4825 firstbyte, looking for an asserted first char. */
4826
4827 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4828 break; /* End of processing '(' */
4829
4830
4831 /* ===================================================================*/
4832 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
4833 are arranged to be the negation of the corresponding OP_values. For the
4834 back references, the values are ESC_REF plus the reference number. Only
4835 back references and those types that consume a character may be repeated.
4836 We can test for values between ESC_b and ESC_Z for the latter; this may
4837 have to change if any new ones are ever created. */
4838
4839 case '\\':
4840 tempptr = ptr;
4841 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
4842 if (*errorcodeptr != 0) goto FAILED;
4843
4844 if (c < 0)
4845 {
4846 if (-c == ESC_Q) /* Handle start of quoted string */
4847 {
4848 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
4849 else inescq = TRUE;
4850 continue;
4851 }
4852
4853 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
4854
4855 /* For metasequences that actually match a character, we disable the
4856 setting of a first character if it hasn't already been set. */
4857
4858 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
4859 firstbyte = REQ_NONE;
4860
4861 /* Set values to reset to if this is followed by a zero repeat. */
4862
4863 zerofirstbyte = firstbyte;
4864 zeroreqbyte = reqbyte;
4865
4866 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
4867 We also support \k{name} (.NET syntax) */
4868
4869 if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
4870 {
4871 is_recurse = FALSE;
4872 terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
4873 goto NAMED_REF_OR_RECURSE;
4874 }
4875
4876 /* Back references are handled specially; must disable firstbyte if
4877 not set to cope with cases like (?=(\w+))\1: which would otherwise set
4878 ':' later. */
4879
4880 if (-c >= ESC_REF)
4881 {
4882 recno = -c - ESC_REF;
4883
4884 HANDLE_REFERENCE: /* Come here from named backref handling */
4885 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4886 previous = code;
4887 *code++ = OP_REF;
4888 PUT2INC(code, 0, recno);
4889 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
4890 if (recno > cd->top_backref) cd->top_backref = recno;
4891 }
4892
4893 /* So are Unicode property matches, if supported. */
4894
4895 #ifdef SUPPORT_UCP
4896 else if (-c == ESC_P || -c == ESC_p)
4897 {
4898 BOOL negated;
4899 int pdata;
4900 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4901 if (ptype < 0) goto FAILED;
4902 previous = code;
4903 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
4904 *code++ = ptype;
4905 *code++ = pdata;
4906 }
4907 #else
4908
4909 /* If Unicode properties are not supported, \X, \P, and \p are not
4910 allowed. */
4911
4912 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
4913 {
4914 *errorcodeptr = ERR45;
4915 goto FAILED;
4916 }
4917 #endif
4918
4919 /* For the rest (including \X when Unicode properties are supported), we
4920 can obtain the OP value by negating the escape value. */
4921
4922 else
4923 {
4924 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
4925 *code++ = -c;
4926 }
4927 continue;
4928 }
4929
4930 /* We have a data character whose value is in c. In UTF-8 mode it may have
4931 a value > 127. We set its representation in the length/buffer, and then
4932 handle it as a data character. */
4933
4934 #ifdef SUPPORT_UTF8
4935 if (utf8 && c > 127)
4936 mclength = _pcre_ord2utf8(c, mcbuffer);
4937 else
4938 #endif
4939
4940 {
4941 mcbuffer[0] = c;
4942 mclength = 1;
4943 }
4944 goto ONE_CHAR;
4945
4946
4947 /* ===================================================================*/
4948 /* Handle a literal character. It is guaranteed not to be whitespace or #
4949 when the extended flag is set. If we are in UTF-8 mode, it may be a
4950 multi-byte literal character. */
4951
4952 default:
4953 NORMAL_CHAR:
4954 mclength = 1;
4955 mcbuffer[0] = c;
4956
4957 #ifdef SUPPORT_UTF8
4958 if (utf8 && c >= 0xc0)
4959 {
4960 while ((ptr[1] & 0xc0) == 0x80)
4961 mcbuffer[mclength++] = *(++ptr);
4962 }
4963 #endif
4964
4965 /* At this point we have the character's bytes in mcbuffer, and the length
4966 in mclength. When not in UTF-8 mode, the length is always 1. */
4967
4968 ONE_CHAR:
4969 previous = code;
4970 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
4971 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
4972
4973 /* Set the first and required bytes appropriately. If no previous first
4974 byte, set it from this character, but revert to none on a zero repeat.
4975 Otherwise, leave the firstbyte value alone, and don't change it on a zero
4976 repeat. */
4977
4978 if (firstbyte == REQ_UNSET)
4979 {
4980 zerofirstbyte = REQ_NONE;
4981 zeroreqbyte = reqbyte;
4982
4983 /* If the character is more than one byte long, we can set firstbyte
4984 only if it is not to be matched caselessly. */
4985
4986 if (mclength == 1 || req_caseopt == 0)
4987 {
4988 firstbyte = mcbuffer[0] | req_caseopt;
4989 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
4990 }
4991 else firstbyte = reqbyte = REQ_NONE;
4992 }
4993
4994 /* firstbyte was previously set; we can set reqbyte only if the length is
4995 1 or the matching is caseful. */
4996
4997 else
4998 {
4999 zerofirstbyte = firstbyte;
5000 zeroreqbyte = reqbyte;
5001 if (mclength == 1 || req_caseopt == 0)
5002 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
5003 }
5004
5005 break; /* End of literal character handling */
5006 }
5007 } /* end of big loop */
5008
5009
5010 /* Control never reaches here by falling through, only by a goto for all the
5011 error states. Pass back the position in the pattern so that it can be displayed
5012 to the user for diagnosing the error. */
5013
5014 FAILED:
5015 *ptrptr = ptr;
5016 return FALSE;
5017 }
5018
5019
5020
5021
5022 /*************************************************
5023 * Compile sequence of alternatives *
5024 *************************************************/
5025
5026 /* On entry, ptr is pointing past the bracket character, but on return it
5027 points to the closing bracket, or vertical bar, or end of string. The code
5028 variable is pointing at the byte into which the BRA operator has been stored.
5029 If the ims options are changed at the start (for a (?ims: group) or during any
5030 branch, we need to insert an OP_OPT item at the start of every following branch
5031 to ensure they get set correctly at run time, and also pass the new options
5032 into every subsequent branch compile.
5033
5034 This function is used during the pre-compile phase when we are trying to find
5035 out the amount of memory needed, as well as during the real compile phase. The
5036 value of lengthptr distinguishes the two phases.
5037
5038 Arguments:
5039 options option bits, including any changes for this subpattern
5040 oldims previous settings of ims option bits
5041 codeptr -> the address of the current code pointer
5042 ptrptr -> the address of the current pattern pointer
5043 errorcodeptr -> pointer to error code variable
5044 lookbehind TRUE if this is a lookbehind assertion
5045 reset_bracount TRUE to reset the count for each branch
5046 skipbytes skip this many bytes at start (for brackets and OP_COND)
5047 firstbyteptr place to put the first required character, or a negative number
5048 reqbyteptr place to put the last required character, or a negative number
5049 bcptr pointer to the chain of currently open branches
5050 cd points to the data block with tables pointers etc.
5051 lengthptr NULL during the real compile phase
5052 points to length accumulator during pre-compile phase
5053
5054 Returns: TRUE on success
5055 */
5056
5057 static BOOL
5058 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5059 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5060 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5061 int *lengthptr)
5062 {
5063 const uschar *ptr = *ptrptr;
5064 uschar *code = *codeptr;
5065 uschar *last_branch = code;
5066 uschar *start_bracket = code;
5067 uschar *reverse_count = NULL;
5068 int firstbyte, reqbyte;
5069 int branchfirstbyte, branchreqbyte;
5070 int length;
5071 int orig_bracount;
5072 int max_bracount;
5073 branch_chain bc;
5074
5075 bc.outer = bcptr;
5076 bc.current = code;
5077
5078 firstbyte = reqbyte = REQ_UNSET;
5079
5080 /* Accumulate the length for use in the pre-compile phase. Start with the
5081 length of the BRA and KET and any extra bytes that are required at the
5082 beginning. We accumulate in a local variable to save frequent testing of
5083 lengthptr for NULL. We cannot do this by looking at the value of code at the
5084 start and end of each alternative, because compiled items are discarded during
5085 the pre-compile phase so that the work space is not exceeded. */
5086
5087 length = 2 + 2*LINK_SIZE + skipbytes;
5088
5089 /* WARNING: If the above line is changed for any reason, you must also change
5090 the code that abstracts option settings at the start of the pattern and makes
5091 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5092 pre-compile phase to find out whether anything has yet been compiled or not. */
5093
5094 /* Offset is set zero to mark that this bracket is still open */
5095
5096 PUT(code, 1, 0);
5097 code += 1 + LINK_SIZE + skipbytes;
5098
5099 /* Loop for each alternative branch */
5100
5101 orig_bracount = max_bracount = cd->bracount;
5102 for (;;)
5103 {
5104 /* For a (?| group, reset the capturing bracket count so that each branch
5105 uses the same numbers. */
5106
5107 if (reset_bracount) cd->bracount = orig_bracount;
5108
5109 /* Handle a change of ims options at the start of the branch */
5110
5111 if ((options & PCRE_IMS) != oldims)
5112 {
5113 *code++ = OP_OPT;
5114 *code++ = options & PCRE_IMS;
5115 length += 2;
5116 }
5117
5118 /* Set up dummy OP_REVERSE if lookbehind assertion */
5119
5120 if (lookbehind)
5121 {
5122 *code++ = OP_REVERSE;
5123 reverse_count = code;
5124 PUTINC(code, 0, 0);
5125 length += 1 + LINK_SIZE;
5126 }
5127
5128 /* Now compile the branch; in the pre-compile phase its length gets added
5129 into the length. */
5130
5131 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5132 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5133 {
5134 *ptrptr = ptr;
5135 return FALSE;
5136 }
5137
5138 /* Keep the highest bracket count in case (?| was used and some branch
5139 has fewer than the rest. */
5140
5141 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5142
5143 /* In the real compile phase, there is some post-processing to be done. */
5144
5145 if (lengthptr == NULL)
5146 {
5147 /* If this is the first branch, the firstbyte and reqbyte values for the
5148 branch become the values for the regex. */
5149
5150 if (*last_branch != OP_ALT)
5151 {
5152 firstbyte = branchfirstbyte;
5153 reqbyte = branchreqbyte;
5154 }
5155
5156 /* If this is not the first branch, the first char and reqbyte have to
5157 match the values from all the previous branches, except that if the
5158 previous value for reqbyte didn't have REQ_VARY set, it can still match,
5159 and we set REQ_VARY for the regex. */
5160
5161 else
5162 {
5163 /* If we previously had a firstbyte, but it doesn't match the new branch,
5164 we have to abandon the firstbyte for the regex, but if there was
5165 previously no reqbyte, it takes on the value of the old firstbyte. */
5166
5167 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5168 {
5169 if (reqbyte < 0) reqbyte = firstbyte;
5170 firstbyte = REQ_NONE;
5171 }
5172
5173 /* If we (now or from before) have no firstbyte, a firstbyte from the
5174 branch becomes a reqbyte if there isn't a branch reqbyte. */
5175
5176 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5177 branchreqbyte = branchfirstbyte;
5178
5179 /* Now ensure that the reqbytes match */
5180
5181 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5182 reqbyte = REQ_NONE;
5183 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
5184 }
5185
5186 /* If lookbehind, check that this branch matches a fixed-length string, and
5187 put the length into the OP_REVERSE item. Temporarily mark the end of the
5188 branch with OP_END. */
5189
5190 if (lookbehind)
5191 {
5192 int fixed_length;
5193 *code = OP_END;
5194 fixed_length = find_fixedlength(last_branch, options);
5195 DPRINTF(("fixed length = %d\n", fixed_length));
5196 if (fixed_length < 0)
5197 {
5198 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5199 *ptrptr = ptr;
5200 return FALSE;
5201 }
5202 PUT(reverse_count, 0, fixed_length);
5203 }
5204 }
5205
5206 /* Reached end of expression, either ')' or end of pattern. In the real
5207 compile phase, go back through the alternative branches and reverse the chain
5208 of offsets, with the field in the BRA item now becoming an offset to the
5209 first alternative. If there are no alternatives, it points to the end of the
5210 group. The length in the terminating ket is always the length of the whole
5211 bracketed item. If any of the ims options were changed inside the group,
5212 compile a resetting op-code following, except at the very end of the pattern.
5213 Return leaving the pointer at the terminating char. */
5214
5215 if (*ptr != '|')
5216 {
5217 if (lengthptr == NULL)
5218 {
5219 int branch_length = code - last_branch;
5220 do
5221 {
5222 int prev_length = GET(last_branch, 1);
5223 PUT(last_branch, 1, branch_length);
5224 branch_length = prev_length;
5225 last_branch -= branch_length;
5226 }
5227 while (branch_length > 0);
5228 }
5229
5230 /* Fill in the ket */
5231
5232 *code = OP_KET;
5233 PUT(code, 1, code - start_bracket);
5234 code += 1 + LINK_SIZE;
5235
5236 /* Resetting option if needed */
5237
5238 if ((options & PCRE_IMS) != oldims && *ptr == ')')
5239 {
5240 *code++ = OP_OPT;
5241 *code++ = oldims;
5242 length += 2;
5243 }
5244
5245 /* Retain the highest bracket number, in case resetting was used. */
5246
5247 cd->bracount = max_bracount;
5248
5249 /* Set values to pass back */
5250
5251 *codeptr = code;
5252 *ptrptr = ptr;
5253 *firstbyteptr = firstbyte;
5254 *reqbyteptr = reqbyte;
5255 if (lengthptr != NULL)
5256 {
5257 if (OFLOW_MAX - *lengthptr < length)
5258 {
5259 *errorcodeptr = ERR20;
5260 return FALSE;
5261 }
5262 *lengthptr += length;
5263 }
5264 return TRUE;
5265 }
5266
5267 /* Another branch follows. In the pre-compile phase, we can move the code
5268 pointer back to where it was for the start of the first branch. (That is,
5269 pretend that each branch is the only one.)
5270
5271 In the real compile phase, insert an ALT node. Its length field points back
5272 to the previous branch while the bracket remains open. At the end the chain
5273 is reversed. It's done like this so that the start of the bracket has a
5274 zero offset until it is closed, making it possible to detect recursion. */
5275
5276 if (lengthptr != NULL)
5277 {
5278 code = *codeptr + 1 + LINK_SIZE + skipbytes;
5279 length += 1 + LINK_SIZE;
5280 }
5281 else
5282 {
5283 *code = OP_ALT;
5284 PUT(code, 1, code - last_branch);
5285 bc.current = last_branch = code;
5286 code += 1 + LINK_SIZE;
5287 }
5288
5289 ptr++;
5290 }
5291 /* Control never reaches here */
5292 }
5293
5294
5295
5296
5297 /*************************************************
5298 * Check for anchored expression *
5299 *************************************************/
5300
5301 /* Try to find out if this is an anchored regular expression. Consider each
5302 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5303 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5304 it's anchored. However, if this is a multiline pattern, then only OP_SOD
5305 counts, since OP_CIRC can match in the middle.
5306
5307 We can also consider a regex to be anchored if OP_SOM starts all its branches.
5308 This is the code for \G, which means "match at start of match position, taking
5309 into account the match offset".
5310
5311 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5312 because that will try the rest of the pattern at all possible matching points,
5313 so there is no point trying again.... er ....
5314
5315 .... except when the .* appears inside capturing parentheses, and there is a
5316 subsequent back reference to those parentheses. We haven't enough information
5317 to catch that case precisely.
5318
5319 At first, the best we could do was to detect when .* was in capturing brackets
5320 and the highest back reference was greater than or equal to that level.
5321 However, by keeping a bitmap of the first 31 back references, we can catch some
5322 of the more common cases more precisely.
5323
5324 Arguments:
5325 code points to start of expression (the bracket)
5326 options points to the options setting
5327 bracket_map a bitmap of which brackets we are inside while testing; this
5328 handles up to substring 31; after that we just have to take
5329 the less precise approach
5330 backref_map the back reference bitmap
5331
5332 Returns: TRUE or FALSE
5333 */
5334
5335 static BOOL
5336 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
5337 unsigned int backref_map)
5338 {
5339 do {
5340 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5341 options, PCRE_MULTILINE, FALSE);
5342 register int op = *scode;
5343
5344 /* Non-capturing brackets */
5345
5346 if (op == OP_BRA)
5347 {
5348 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5349 }
5350
5351 /* Capturing brackets */
5352
5353 else if (op == OP_CBRA)
5354 {
5355 int n = GET2(scode, 1+LINK_SIZE);
5356 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5357 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
5358 }
5359
5360 /* Other brackets */
5361
5362 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5363 {
5364 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5365 }
5366
5367 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
5368 are or may be referenced. */
5369
5370 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5371 op == OP_TYPEPOSSTAR) &&
5372 (*options & PCRE_DOTALL) != 0)
5373 {
5374 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5375 }
5376
5377 /* Check for explicit anchoring */
5378
5379 else if (op != OP_SOD && op != OP_SOM &&
5380 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
5381 return FALSE;
5382 code += GET(code, 1);
5383 }
5384 while (*code == OP_ALT); /* Loop for each alternative */
5385 return TRUE;
5386 }
5387
5388
5389
5390 /*************************************************
5391 * Check for starting with ^ or .* *
5392 *************************************************/
5393
5394 /* This is called to find out if every branch starts with ^ or .* so that
5395 "first char" processing can be done to speed things up in multiline
5396 matching and for non-DOTALL patterns that start with .* (which must start at
5397 the beginning or after \n). As in the case of is_anchored() (see above), we
5398 have to take account of back references to capturing brackets that contain .*
5399 because in that case we can't make the assumption.
5400
5401 Arguments:
5402 code points to start of expression (the bracket)
5403 bracket_map a bitmap of which brackets we are inside while testing; this
5404 handles up to substring 31; after that we just have to take
5405 the less precise approach
5406 backref_map the back reference bitmap
5407
5408 Returns: TRUE or FALSE
5409 */
5410
5411 static BOOL
5412 is_startline(const uschar *code, unsigned int bracket_map,
5413 unsigned int backref_map)
5414 {
5415 do {
5416 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5417 NULL, 0, FALSE);
5418 register int op = *scode;
5419
5420 /* Non-capturing brackets */
5421
5422 if (op == OP_BRA)
5423 {
5424 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5425 }
5426
5427 /* Capturing brackets */
5428
5429 else if (op == OP_CBRA)
5430 {
5431 int n = GET2(scode, 1+LINK_SIZE);
5432 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5433 if (!is_startline(scode, new_map, backref_map)) return FALSE;
5434 }
5435
5436 /* Other brackets */
5437
5438 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5439 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5440
5441 /* .* means "start at start or after \n" if it isn't in brackets that
5442 may be referenced. */
5443
5444 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5445 {
5446 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5447 }
5448
5449 /* Check for explicit circumflex */
5450
5451 else if (op != OP_CIRC) return FALSE;
5452
5453 /* Move on to the next alternative */
5454
5455 code += GET(code, 1);
5456 }
5457 while (*code == OP_ALT); /* Loop for each alternative */
5458 return TRUE;
5459 }
5460
5461
5462
5463 /*************************************************
5464 * Check for asserted fixed first char *
5465 *************************************************/
5466
5467 /* During compilation, the "first char" settings from forward assertions are
5468 discarded, because they can cause conflicts with actual literals that follow.
5469 However, if we end up without a first char setting for an unanchored pattern,
5470 it is worth scanning the regex to see if there is an initial asserted first
5471 char. If all branches start with the same asserted char, or with a bracket all
5472 of whose alternatives start with the same asserted char (recurse ad lib), then
5473 we return that char, otherwise -1.
5474
5475 Arguments:
5476 code points to start of expression (the bracket)
5477 options pointer to the options (used to check casing changes)
5478 inassert TRUE if in an assertion
5479
5480 Returns: -1 or the fixed first char
5481 */
5482
5483 static int
5484 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
5485 {
5486 register int c = -1;
5487 do {
5488 int d;
5489 const uschar *scode =
5490 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5491 register int op = *scode;
5492
5493 switch(op)
5494 {
5495 default:
5496 return -1;
5497
5498 case OP_BRA:
5499 case OP_CBRA:
5500 case OP_ASSERT:
5501 case OP_ONCE:
5502 case OP_COND:
5503 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
5504 return -1;
5505 if (c < 0) c = d; else if (c != d) return -1;
5506 break;
5507
5508 case OP_EXACT: /* Fall through */
5509 scode += 2;
5510
5511 case OP_CHAR:
5512 case OP_CHARNC:
5513 case OP_PLUS:
5514 case OP_MINPLUS:
5515 case OP_POSPLUS:
5516 if (!inassert) return -1;
5517 if (c < 0)
5518 {
5519 c = scode[1];
5520 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
5521 }
5522 else if (c != scode[1]) return -1;
5523 break;
5524 }
5525
5526 code += GET(code, 1);
5527 }
5528 while (*code == OP_ALT);
5529 return c;
5530 }
5531
5532
5533
5534 /*************************************************
5535 * Compile a Regular Expression *
5536 *************************************************/
5537
5538 /* This function takes a string and returns a pointer to a block of store
5539 holding a compiled version of the expression. The original API for this
5540 function had no error code return variable; it is retained for backwards
5541 compatibility. The new function is given a new name.
5542
5543 Arguments:
5544 pattern the regular expression
5545 options various option bits
5546 errorcodeptr pointer to error code variable (pcre_compile2() only)
5547 can be NULL if you don't want a code value
5548 errorptr pointer to pointer to error text
5549 erroroffset ptr offset in pattern where error was detected
5550 tables pointer to character tables or NULL
5551
5552 Returns: pointer to compiled data block, or NULL on error,
5553 with errorptr and erroroffset set
5554 */
5555
5556 PCRE_EXP_DEFN pcre *
5557 pcre_compile(const char *pattern, int options, const char **errorptr,
5558 int *erroroffset, const unsigned char *tables)
5559 {
5560 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5561 }
5562
5563
5564 PCRE_EXP_DEFN pcre *
5565 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5566 const char **errorptr, int *erroroffset, const unsigned char *tables)
5567 {
5568 real_pcre *re;
5569 int length = 1; /* For final END opcode */
5570 int firstbyte, reqbyte, newline;
5571 int errorcode = 0;
5572 #ifdef SUPPORT_UTF8
5573 BOOL utf8;
5574 #endif
5575 size_t size;
5576 uschar *code;
5577 const uschar *codestart;
5578 const uschar *ptr;
5579 compile_data compile_block;
5580 compile_data *cd = &compile_block;
5581
5582 /* This space is used for "compiling" into during the first phase, when we are
5583 computing the amount of memory that is needed. Compiled items are thrown away
5584 as soon as possible, so that a fairly large buffer should be sufficient for
5585 this purpose. The same space is used in the second phase for remembering where
5586 to fill in forward references to subpatterns. */
5587
5588 uschar cworkspace[COMPILE_WORK_SIZE];
5589
5590
5591 /* Set this early so that early errors get offset 0. */
5592
5593 ptr = (const uschar *)pattern;
5594
5595 /* We can't pass back an error message if errorptr is NULL; I guess the best we
5596 can do is just return NULL, but we can set a code value if there is a code
5597 pointer. */
5598
5599 if (errorptr == NULL)
5600 {
5601 if (errorcodeptr != NULL) *errorcodeptr = 99;
5602 return NULL;
5603 }
5604
5605 *errorptr = NULL;
5606 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5607
5608 /* However, we can give a message for this error */
5609
5610 if (erroroffset == NULL)
5611 {
5612 errorcode = ERR16;
5613 goto PCRE_EARLY_ERROR_RETURN2;
5614 }
5615
5616 *erroroffset = 0;
5617
5618 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
5619
5620 #ifdef SUPPORT_UTF8
5621 utf8 = (options & PCRE_UTF8) != 0;
5622 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5623 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5624 {
5625 errorcode = ERR44;
5626 goto PCRE_EARLY_ERROR_RETURN2;
5627 }
5628 #else
5629 if ((options & PCRE_UTF8) != 0)
5630 {
5631 errorcode = ERR32;
5632 goto PCRE_EARLY_ERROR_RETURN;
5633 }
5634 #endif
5635
5636 if ((options & ~PUBLIC_OPTIONS) != 0)
5637 {
5638 errorcode = ERR17;
5639 goto PCRE_EARLY_ERROR_RETURN;
5640 }
5641
5642 /* Set up pointers to the individual character tables */
5643
5644 if (tables == NULL) tables = _pcre_default_tables;
5645 cd->lcc = tables + lcc_offset;
5646 cd->fcc = tables + fcc_offset;
5647 cd->cbits = tables + cbits_offset;
5648 cd->ctypes = tables + ctypes_offset;
5649
5650 /* Handle different types of newline. The three bits give seven cases. The
5651 current code allows for fixed one- or two-byte sequences, plus "any" and
5652 "anycrlf". */
5653
5654 switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))
5655 {
5656 case 0: newline = NEWLINE; break; /* Compile-time default */
5657 case PCRE_NEWLINE_CR: newline = '\r'; break;
5658 case PCRE_NEWLINE_LF: newline = '\n'; break;
5659 case PCRE_NEWLINE_CR+
5660 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5661 case PCRE_NEWLINE_ANY: newline = -1; break;
5662 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5663 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5664 }
5665
5666 if (newline == -2)
5667 {
5668 cd->nltype = NLTYPE_ANYCRLF;
5669 }
5670 else if (newline < 0)
5671 {
5672 cd->nltype = NLTYPE_ANY;
5673 }
5674 else
5675 {
5676 cd->nltype = NLTYPE_FIXED;
5677 if (newline > 255)
5678 {
5679 cd->nllen = 2;
5680 cd->nl[0] = (newline >> 8) & 255;
5681 cd->nl[1] = newline & 255;
5682 }
5683 else
5684 {
5685 cd->nllen = 1;
5686 cd->nl[0] = newline;
5687 }
5688 }
5689
5690 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
5691 references to help in deciding whether (.*) can be treated as anchored or not.
5692 */
5693
5694 cd->top_backref = 0;
5695 cd->backref_map = 0;
5696
5697 /* Reflect pattern for debugging output */
5698
5699 DPRINTF(("------------------------------------------------------------------\n"));
5700 DPRINTF(("%s\n", pattern));
5701
5702 /* Pretend to compile the pattern while actually just accumulating the length
5703 of memory required. This behaviour is triggered by passing a non-NULL final
5704 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
5705 to compile parts of the pattern into; the compiled code is discarded when it is
5706 no longer needed, so hopefully this workspace will never overflow, though there
5707 is a test for its doing so. */
5708
5709 cd->bracount = 0;
5710 cd->names_found = 0;
5711 cd->name_entry_size = 0;
5712 cd->name_table = NULL;
5713 cd->start_workspace = cworkspace;
5714 cd->start_code = cworkspace;
5715 cd->hwm = cworkspace;
5716 cd->start_pattern = (const uschar *)pattern;
5717 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
5718 cd->req_varyopt = 0;
5719 cd->nopartial = FALSE;
5720 cd->external_options = options;
5721
5722 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
5723 don't need to look at the result of the function here. The initial options have
5724 been put into the cd block so that they can be changed if an option setting is
5725 found within the regex right at the beginning. Bringing initial option settings
5726 outside can help speed up starting point checks. */
5727
5728 code = cworkspace;
5729 *code = OP_BRA;
5730 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
5731 &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
5732 &length);
5733 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
5734
5735 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
5736 cd->hwm - cworkspace));
5737
5738 if (length > MAX_PATTERN_SIZE)
5739 {
5740 errorcode = ERR20;
5741 goto PCRE_EARLY_ERROR_RETURN;
5742 }
5743
5744 /* Compute the size of data block needed and get it, either from malloc or
5745 externally provided function. Integer overflow should no longer be possible
5746 because nowadays we limit the maximum value of cd->names_found and
5747 cd->name_entry_size. */
5748
5749 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
5750 re = (real_pcre *)(pcre_malloc)(size);
5751
5752 if (re == NULL)
5753 {
5754 errorcode = ERR21;
5755 goto PCRE_EARLY_ERROR_RETURN;
5756 }
5757
5758 /* Put in the magic number, and save the sizes, initial options, and character
5759 table pointer. NULL is used for the default character tables. The nullpad field
5760 is at the end; it's there to help in the case when a regex compiled on a system
5761 with 4-byte pointers is run on another with 8-byte pointers. */
5762
5763 re->magic_number = MAGIC_NUMBER;
5764 re->size = size;
5765 re->options = cd->external_options;
5766 re->dummy1 = 0;
5767 re->first_byte = 0;
5768 re->req_byte = 0;
5769 re->name_table_offset = sizeof(real_pcre);
5770 re->name_entry_size = cd->name_entry_size;
5771 re->name_count = cd->names_found;
5772 re->ref_count = 0;
5773 re->tables = (tables == _pcre_default_tables)? NULL : tables;
5774 re->nullpad = NULL;
5775
5776 /* The starting points of the name/number translation table and of the code are
5777 passed around in the compile data block. The start/end pattern and initial
5778 options are already set from the pre-compile phase, as is the name_entry_size
5779 field. Reset the bracket count and the names_found field. Also reset the hwm
5780 field; this time it's used for remembering forward references to subpatterns.
5781 */
5782
5783 cd->bracount = 0;
5784 cd->names_found = 0;
5785 cd->name_table = (uschar *)re + re->name_table_offset;
5786 codestart = cd->name_table + re->name_entry_size * re->name_count;
5787 cd->start_code = codestart;
5788 cd->hwm = cworkspace;
5789 cd->req_varyopt = 0;
5790 cd->nopartial = FALSE;
5791 cd->had_accept = FALSE;
5792
5793 /* Set up a starting, non-extracting bracket, then compile the expression. On
5794 error, errorcode will be set non-zero, so we don't need to look at the result
5795 of the function here. */
5796
5797 ptr = (const uschar *)pattern;
5798 code = (uschar *)codestart;
5799 *code = OP_BRA;
5800 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
5801 &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
5802 re->top_bracket = cd->bracount;
5803 re->top_backref = cd->top_backref;
5804
5805 if (cd->nopartial) re->options |= PCRE_NOPARTIAL;
5806 if (cd->had_accept) reqbyte = -1; /* Must disable after (*ACCEPT) */
5807
5808 /* If not reached end of pattern on success, there's an excess bracket. */
5809
5810 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
5811
5812 /* Fill in the terminating state and check for disastrous overflow, but
5813 if debugging, leave the test till after things are printed out. */
5814
5815 *code++ = OP_END;
5816
5817 #ifndef DEBUG
5818 if (code - codestart > length) errorcode = ERR23;
5819 #endif
5820
5821 /* Fill in any forward references that are required. */
5822
5823 while (errorcode == 0 && cd->hwm > cworkspace)
5824 {
5825 int offset, recno;
5826 const uschar *groupptr;
5827 cd->hwm -= LINK_SIZE;
5828 offset = GET(cd->hwm, 0);
5829 recno = GET(codestart, offset);
5830 groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
5831 if (groupptr == NULL) errorcode = ERR53;
5832 else PUT(((uschar *)codestart), offset, groupptr - codestart);
5833 }
5834
5835 /* Give an error if there's back reference to a non-existent capturing
5836 subpattern. */
5837
5838 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
5839
5840 /* Failed to compile, or error while post-processing */
5841
5842 if (errorcode != 0)
5843 {
5844 (pcre_free)(re);
5845 PCRE_EARLY_ERROR_RETURN:
5846 *erroroffset = ptr - (const uschar *)pattern;
5847 PCRE_EARLY_ERROR_RETURN2:
5848 *errorptr = error_texts[errorcode];
5849 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
5850 return NULL;
5851 }
5852
5853 /* If the anchored option was not passed, set the flag if we can determine that
5854 the pattern is anchored by virtue of ^ characters or \A or anything else (such
5855 as starting with .* when DOTALL is set).
5856
5857 Otherwise, if we know what the first byte has to be, save it, because that
5858 speeds up unanchored matches no end. If not, see if we can set the
5859 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
5860 start with ^. and also when all branches start with .* for non-DOTALL matches.
5861 */
5862
5863 if ((re->options & PCRE_ANCHORED) == 0)
5864 {
5865 int temp_options = re->options; /* May get changed during these scans */
5866 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
5867 re->options |= PCRE_ANCHORED;
5868 else
5869 {
5870 if (firstbyte < 0)
5871 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
5872 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
5873 {
5874 int ch = firstbyte & 255;
5875 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
5876 cd->fcc[ch] == ch)? ch : firstbyte;
5877 re->options |= PCRE_FIRSTSET;
5878 }
5879 else if (is_startline(codestart, 0, cd->backref_map))
5880 re->options |= PCRE_STARTLINE;
5881 }
5882 }
5883
5884 /* For an anchored pattern, we use the "required byte" only if it follows a
5885 variable length item in the regex. Remove the caseless flag for non-caseable
5886 bytes. */
5887
5888 if (reqbyte >= 0 &&
5889 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
5890 {
5891 int ch = reqbyte & 255;
5892 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
5893 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5894 re->options |= PCRE_REQCHSET;
5895 }
5896
5897 /* Print out the compiled data if debugging is enabled. This is never the
5898 case when building a production library. */
5899
5900 #ifdef DEBUG
5901
5902 printf("Length = %d top_bracket = %d top_backref = %d\n",
5903 length, re->top_bracket, re->top_backref);
5904
5905 if (re->options != 0)
5906 {
5907 printf("%s%s%s%s%s%s%s%s%s\n",
5908 ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5909 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5910 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5911 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5912 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5913 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5914 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5915 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5916 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5917 }
5918
5919 if ((re->options & PCRE_FIRSTSET) != 0)
5920 {
5921 int ch = re->first_byte & 255;
5922 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
5923 "" : " (caseless)";
5924 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5925 else printf("First char = \\x%02x%s\n", ch, caseless);
5926 }
5927
5928 if ((re->options & PCRE_REQCHSET) != 0)
5929 {
5930 int ch = re->req_byte & 255;
5931 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
5932 "" : " (caseless)";
5933 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5934 else printf("Req char = \\x%02x%s\n", ch, caseless);
5935 }
5936
5937 pcre_printint(re, stdout, TRUE);
5938
5939 /* This check is done here in the debugging case so that the code that
5940 was compiled can be seen. */
5941
5942 if (code - codestart > length)
5943 {
5944 (pcre_free)(re);
5945 *errorptr = error_texts[ERR23];
5946 *erroroffset = ptr - (uschar *)pattern;
5947 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
5948 return NULL;
5949 }
5950 #endif /* DEBUG */
5951
5952 return (pcre *)re;
5953 }
5954
5955 /* End of pcre_compile.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12